mercury/library/char.m

%---------------------------------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%---------------------------------------------------------------------------%
% Copyright (C) 1994-2008, 2011 The University of Melbourne.
% Copyright (C) 2013-2015, 2017-2022, 2024-2026 The Mercury team.
% This file is distributed under the terms specified in COPYING.LIB.
%---------------------------------------------------------------------------%
%
% File: char.m.
% Main author: fjh.
% Stability: high.
%
% This module defines some predicates that manipulate characters.
%
% Originally we used `character' rather than `char' for the type name
% because `char' was used by NU-Prolog to mean something different.
% But now we use `char' and the use of `character' is discouraged.
%
% All predicates and functions exported by this module that deal with
% Unicode conform to version 13 of the Unicode standard.
%
%---------------------------------------------------------------------------%
%---------------------------------------------------------------------------%

:- module char.
:- interface.

:- import_module enum.
:- import_module list.
:- import_module pretty_printer.

%---------------------------------------------------------------------------%

    % A Unicode code point.
    %
:- type char == character.

:- instance enum(character).
:- instance uenum(character).

    % `to_int'/1 and `to_int(in, out)' convert a character to its
    % corresponding numerical code (integer value).
    %
    % `to_int(out, in)' converts an integer value to a character value.
    % It fails for integer values outside of the Unicode range.
    %
    % Be aware that there is no guarantee that characters can be written to
    % files or to the standard output or standard error streams. Files using an
    % 8-bit national character set would only be able to represent a subset of
    % all possible code points. Currently, the Mercury standard library can
    % only read and write UTF-8 text files, so the entire range is supported
    % (excluding surrogate and noncharacter code points).
    %
    % Note that '\0' is not accepted as a Mercury null character literal.
    % Instead, a null character can be created using `det_from_int(0)'.
    % Null characters are not allowed in Mercury strings in C grades.
    %
:- func to_int(char) = int.
:- pred to_int(char, int).
:- mode to_int(in, out) is det.
:- mode to_int(in, in) is semidet.    % implied
:- mode to_int(out, in) is semidet.

    % Converts an integer to its corresponding character, if any.
    % A more expressive name for the reverse mode of to_int.
    %
:- pred from_int(int::in, char::out) is semidet.

    % Converts an integer to its corresponding character.
    % Throws an exception if there isn't one.
    %
:- func det_from_int(int) = char.
:- pred det_from_int(int::in, char::out) is det.

    % Converts a character to its numerical character code (unsigned integer).
    %
:- func to_uint(char) = uint.

    % Converts an unsigned integer to its corresponding character, if any.
    %
:- pred from_uint(uint::in, char::out) is semidet.

    % Converts an unsigned integer to its corresponding character.
    % Throws an exception if there isn't one.
    %
:- func det_from_uint(uint) = char.

    % Returns the minimum numerical character code.
    %
:- func min_char_value = int.
:- pred min_char_value(int::out) is det.

    % Returns the maximum numerical character code.
    %
:- func max_char_value = int.
:- pred max_char_value(int::out) is det.

%---------------------------------------------------------------------------%

    % True if-and-only-if the character is a lowercase letter (a-z)
    % in the ASCII range.
    %
:- pred is_lower(char::in) is semidet.

    % True if-and-only-if the character is an uppercase letter (A-Z)
    % in the ASCII range.
    %
:- pred is_upper(char::in) is semidet.

    % Convert a character to lowercase.
    % Note that this only converts letters (A-Z) in the ASCII range.
    %
:- func to_lower(char) = char.
:- pred to_lower(char::in, char::out) is det.

    % Convert a character to uppercase.
    % Note that this only converts letters (a-z) in the ASCII range.
    %
:- func to_upper(char) = char.
:- pred to_upper(char::in, char::out) is det.

    % lower_upper(Lower, Upper) is true if-and-only-if
    % Lower is a lowercase letter (a-z) and Upper is the corresponding
    % uppercase letter (A-Z) in the ASCII range.
    %
:- pred lower_upper(char, char).
:- mode lower_upper(in, out) is semidet.
:- mode lower_upper(out, in) is semidet.

%---------------------------------------------------------------------------%

    % True if-and-only-if the character is in the ASCII range (0-127).
    %
:- pred is_ascii(char::in) is semidet.

    % True if-and-only-if the character is a whitespace character
    % in the ASCII range:
    %
    %   U+0020  space
    %   U+0009  character tabulation (horizontal tab)
    %   U+000A  line feed
    %   U+000B  line tabulation (vertical tab)
    %   U+000C  form feed
    %   U+000D  carriage return
    %
:- pred is_whitespace(char::in) is semidet.

    % True if-and-only-if the character is a letter (A-Z, a-z)
    % in the ASCII range.
    %
:- pred is_alpha(char::in) is semidet.

    % True if-and-only-if the character is a letter (A-Z, a-z) or digit (0-9)
    % in the ASCII range.
    %
:- pred is_alnum(char::in) is semidet.

    % True if-and-only-if the character is a letter (A-Z, a-z)
    % or an underscore (_) in the ASCII range.
    %
:- pred is_alpha_or_underscore(char::in) is semidet.

    % True if-and-only-if the character is a letter (A-Z, a-z),
    % a digit (0-9) or an underscore (_) in the ASCII range.
    %
:- pred is_alnum_or_underscore(char::in) is semidet.

%---------------------------------------------------------------------------%

    % True if-and-only-if the character is a decimal digit (0-9)
    % in the ASCII range.
    %
:- pred is_digit(char::in) is semidet.

    % True if-and-only-if the character is a binary digit (0 or 1)
    % in the ASCII range.
    %
:- pred is_binary_digit(char::in) is semidet.

    % True if-and-only-if the character is an octal digit (0-7)
    % in the ASCII range.
    %
:- pred is_octal_digit(char::in) is semidet.

    % True if-and-only-if the character is a decimal digit (0-9)
    % in the ASCII range. Synonym for is_digit/1.
    %
:- pred is_decimal_digit(char::in) is semidet.

    % True if-and-only-if the character is a hexadecimal digit (0-9, a-f, A-F)
    % in the ASCII range.
    %
:- pred is_hex_digit(char::in) is semidet.

    % is_base_digit(Base, Digit):
    % True if-and-only-if Digit is a digit in the given Base (0-9, a-z, A-Z).
    % Throws an exception if Base < 2 or Base > 36.
    %
:- pred is_base_digit(int::in, char::in) is semidet.

%---------------------%

    % binary_digit_to_int(Char, Int):
    % True if-and-only-if Char is a binary digit (0 or 1) representing
    % the value Int.
    %
:- pred binary_digit_to_int(char::in, int::out) is semidet.

    % As above, but throws an exception instead of failing.
    %
:- func det_binary_digit_to_int(char) = int.

    % octal_digit_to_int(Char, Int):
    % True if-and-only-if Char is an octal digit (0-7) representing
    % the value Int.
    %
:- pred octal_digit_to_int(char::in, int::out) is semidet.

    % As above, but throws an exception instead of failing.
    %
:- func det_octal_digit_to_int(char) = int.

    % decimal_digit_to_int(Char, Int):
    % True if-and-only-if Char is a decimal digit (0-9) representing
    % the value Int.
    %
:- pred decimal_digit_to_int(char::in, int::out) is semidet.

    % As above, but throws an exception instead of failing.
    %
:- func det_decimal_digit_to_int(char) = int.

    % hex_digit_to_int(Char, Int):
    % True if-and-only-if Char is a hexadecimal digit (0-9, a-f or A-F)
    % representing the value Int.
    %
:- pred hex_digit_to_int(char::in, int::out) is semidet.

    % As above, but throws an exception instead of failing.
    %
:- func det_hex_digit_to_int(char) = int.

    % base_digit_to_int(Base, Char, Int):
    % True if-and-only-if Char is a decimal digit (0-9) or a letter (a-z, A-Z)
    % representing the value Int (0-35) in the given base.
    % Throws an exception if Base < 2 or Base > 36.
    %
:- pred base_digit_to_int(int::in, char::in, int::out) is semidet.

    % As above, but throws an exception instead of failing.
    %
:- func det_base_digit_to_int(int, char) = int.

    % A version of base_digit_to_int that does not check whether
    % Base is in the range 2 to 36. If it is not, the behavior is undefined.
    %
:- pred unsafe_base_digit_to_int(int::in, char::in, int::out) is semidet.

%---------------------%

    % Convert an integer in the range 0-1 to a binary digit (0 or 1) in the
    % ASCII range.
    %
:- pred int_to_binary_digit(int::in, char::out) is semidet.

    % As above, but throw an exception instead of failing.
    %
:- func det_int_to_binary_digit(int) = char.

    % Convert an integer 0-7 to an octal digit (0-7) in the ASCII range.
    %
:- pred int_to_octal_digit(int::in, char::out) is semidet.

    % As above, but throw an exception instead of failing.
    %
:- func det_int_to_octal_digit(int) = char.

    % Convert an integer 0-9 to a decimal digit (0-9) in the ASCII range.
    %
:- pred int_to_decimal_digit(int::in, char::out) is semidet.

    % As above, but throw an exception instead of failing.
    %
:- func det_int_to_decimal_digit(int) = char.

    % Convert an integer 0-15 to an uppercase hexadecimal digit (0-9, A-F) in
    % the ASCII range.
    %
:- pred int_to_hex_digit(int::in, char::out) is semidet.

    % As above, but throw an exception instead of failing.
    %
:- func det_int_to_hex_digit(int) = char.

    % base_int_to_digit(Base, Int, Char):
    % True if-and-only-if Char is a decimal digit (0-9) or an uppercase letter
    % (A-Z) representing the value Int (0-35) in the given base.
    % Throws an exception if Base < 2 or Base > 36.
    %
:- pred base_int_to_digit(int::in, int::in, char::out) is semidet.

    % As above, but throw an exception instead of failing.
    %
:- func det_base_int_to_digit(int, int) = char.

%---------------------------------------------------------------------------%

    % Encode a Unicode code point in UTF-8.
    % Fails for surrogate code points.
    %
:- pred to_utf8(char::in, list(int)::out) is semidet.

    % As above, but represent UTF-8 code units using uint8s.
    %
:- pred to_utf8_uint8(char::in, list(uint8)::out) is semidet.

    % Encode a Unicode code point in UTF-16 (native endianness).
    % Fails for surrogate code points.
    %
:- pred to_utf16(char::in, list(int)::out) is semidet.

    % As above, but represent UTF-16 code units using uint16s.
    %
:- pred to_utf16_uint16(char::in, list(uint16)::out) is semidet.

    % True if-and-only-if the character is a Unicode Surrogate code point,
    % that is a code point in General Category `Other,surrogate' (`Cs').
    % In UTF-16, a code point with a scalar value greater than 0xffff is
    % encoded with a pair of surrogate code points.
    %
:- pred is_surrogate(char::in) is semidet.

    % True if-and-only-if the character is a Unicode leading surrogate
    % code point. A leading surrogate code point is in the inclusive range
    % from 0xd800 to 0xdbff.
    %
:- pred is_leading_surrogate(char::in) is semidet.

    % True if-and-only-if the character is a Unicode trailing surrogate
    % code point. A trailing surrogate code point is in the inclusive range
    % from 0xdc00 to 0xdfff.
    %
:- pred is_trailing_surrogate(char::in) is semidet.

    % True if-and-only-if the character is a Unicode Noncharacter code point.
    % Sixty-six code points are not used to encode characters.
    % These code points should not be used for interchange, but may be used
    % internally.
    %
:- pred is_noncharacter(char::in) is semidet.

    % True if-and-only-if the character is a Unicode Control code point,
    % that is a code point in General Category `Other,control' (`Cc').
    %
:- pred is_control(char::in) is semidet.

    % True if-and-only-if the character is a Unicode Space Separator
    % code point, that is a code point in General Category
    % `Separator,space' (`Zs').
    %
:- pred is_space_separator(char::in) is semidet.

    % True if-and-only-if the character is a Unicode Line Separator code point,
    % that is a code point in General Category `Separator,line' (`Zl').
    %
:- pred is_line_separator(char::in) is semidet.

    % True if-and-only-if the character is a Unicode Paragraph Separator
    % code point, that is a code point in General Category
    % `Separator,paragraph' (`Zp').
    %
:- pred is_paragraph_separator(char::in) is semidet.

    % True if-and-only-if the character is a Unicode Private-use code point,
    % that is a code point in General Category `Other,private use' (`Co').
    %
:- pred is_private_use(char::in) is semidet.

%---------------------------------------------------------------------------%

    % Convert a char to a pretty_printer.doc for formatting.
    %
:- func char_to_doc(char) = pretty_printer.doc.
:- pragma obsolete(func(char_to_doc/1), [pretty_printer.char_to_doc/1]).

%---------------------------------------------------------------------------%

% The following have all been deprecated.

    % Use hex_digit_to_int/2 instead.
    %
:- pred is_hex_digit(char, int).
:- mode is_hex_digit(in, out) is semidet.

    % Convert an integer 0-15 to a hexadecimal digit (0-9, A-F) in the ASCII
    % range.
    %
    % Use int_to_hex_digit/2 instead.
    %
:- pred int_to_hex_char(int, char).
:- mode int_to_hex_char(in, out) is semidet.

%---------------------------------------------------------------------------%
%
% Computing hashes of chars.
%

    % Compute a hash value for a char.
    %
:- func hash(char) = int.
:- pred hash(char::in, int::out) is det.

%---------------------------------------------------------------------------%
%---------------------------------------------------------------------------%

:- implementation.

:- interface.

    % A version of is_surrogate that takes the character in its integer form.
    % Exported for use by string.m.
    %
:- pred char_int_is_surrogate(int::in) is semidet.

%---------------------------------------------------------------------------%

:- implementation.

:- import_module int.
:- import_module require.
:- import_module uint.
:- import_module uint16.
:- import_module uint8.

% Characters are an instance of the enum typeclass: the class methods
% are implemented by char.to_int/1 and char.from_int/2.
:- instance enum(character) where [
    func(to_int/1) is char.to_int,
    pred(from_int/2) is char.from_int
].

% Characters are also an instance of the uenum typeclass: the class methods
% are implemented by the unsigned conversions char.to_uint/1 and
% char.from_uint/2.
:- instance uenum(character) where [
    func(to_uint/1) is char.to_uint,
    pred(from_uint/2) is char.from_uint
].

:- pragma foreign_decl("C", "#include <limits.h>").

%---------------------------------------------------------------------------%
%
% All of
%
%   - func to_int/1
%   - pred to_int/2
%   - pred from_int/2
%   - func det_from_int/1
%   - pred det_from_int/2
%
% are implemented in terms of pred to_int/2. For the *from_int operations,
% this is possible *only* because that predicate has a reverse mode
% as well as its usual forward mode.
%

% Function form of to_int/2; it uses the <in, out> mode of the predicate.
to_int(Char) = Code :-
    char.to_int(Char, Code).

%---------------------%
%
% The <in, out> mode of to_int.
%

% Ask the compiler to inline calls to to_int/2.
:- pragma inline(pred(to_int/2)).

% Forward mode: return the numerical code of Character.
%
% In C, the character is cast to MR_UnsignedChar before being stored in Int.
:- pragma foreign_proc("C",
    to_int(Character::in, Int::out),
    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
        does_not_affect_liveness],
"
    Int = (MR_UnsignedChar) Character;
").
% In C# and Java, the character is assigned to Int directly.
:- pragma foreign_proc("C#",
    to_int(Character::in, Int::out),
    [will_not_call_mercury, promise_pure, thread_safe],
"
    Int = Character;
").
:- pragma foreign_proc("Java",
    to_int(Character::in, Int::out),
    [will_not_call_mercury, promise_pure, thread_safe],
"
    Int = Character;
").

%---------------------%
%
% The <in, in> mode of to_int.
%

% Test mode: succeed if-and-only-if Int is the numerical code of Character.
%
% In C, the character is cast to MR_UnsignedChar before the comparison,
% mirroring the cast used by the <in, out> mode.
:- pragma foreign_proc("C",
    to_int(Character::in, Int::in),
    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
        does_not_affect_liveness],
"
    SUCCESS_INDICATOR = ((MR_UnsignedChar) Character == Int);
").
% In C# and Java, the two values are compared directly.
:- pragma foreign_proc("C#",
    to_int(Character::in, Int::in),
    [will_not_call_mercury, promise_pure, thread_safe],
"
    SUCCESS_INDICATOR = (Character == Int);
").
:- pragma foreign_proc("Java",
    to_int(Character::in, Int::in),
    [will_not_call_mercury, promise_pure, thread_safe],
"
    SUCCESS_INDICATOR = (Character == Int);
").

%---------------------%
%
% The <out, in> mode of to_int.
%

% Reverse mode: convert the numerical code Int to a character.
% Fails if Int is outside the Unicode range 0 .. 0x10ffff.
%
% In every target language the range check is made on the *input* Int,
% never on the value stored in Character. This matters in C, where
% MR_Integer can be wider than MR_Char: testing the narrowed Character
% would let an out-of-range Int (e.g. 0x100000041) wrap around to
% a valid-looking code and wrongly succeed.
:- pragma foreign_proc("C",
    to_int(Character::out, Int::in),
    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
        does_not_affect_liveness],
"
    Character = Int;
    /* Check Int, not Character: the assignment above may have narrowed. */
    SUCCESS_INDICATOR = (Int >= 0 && Int <= 0x10ffff);
").
:- pragma foreign_proc("C#",
    to_int(Character::out, Int::in),
    [will_not_call_mercury, promise_pure, thread_safe],
"
    Character = Int;
    SUCCESS_INDICATOR = (Int >= 0 && Int <= 0x10ffff);
").
:- pragma foreign_proc("Java",
    to_int(Character::out, Int::in),
    [will_not_call_mercury, promise_pure, thread_safe],
"
    Character = Int;
    SUCCESS_INDICATOR = (Int >= 0 && Int <= 0x10ffff);
").

%---------------------%

% from_int(Code, Char) is the <out, in> mode of to_int/2 with the arguments
% in the opposite order.
from_int(Code, Char) :-
    char.to_int(Char, Code).

%---------------------%

% Function form: delegates to the predicate form below.
det_from_int(Code) = Char :-
    char.det_from_int(Code, Char).

% Predicate form: attempt the reverse mode of to_int/2, and throw
% an exception if Code does not correspond to any character.
det_from_int(Code, Char) :-
    ( if char.to_int(Char0, Code) then
        Char = Char0
    else
        unexpected($pred, "conversion failed")
    ).

%---------------------------------------------------------------------------%
%
% The to_uint/from_uint operations are implemented quite differently from
% their int versions. The reason for this is that while to_int has both
% a forward mode and a reverse mode, to_uint has only the forward mode.
% (By the time we added unsigned integers to the language, experience had
% taught us that more modes are not necessarily better.)
%

% The unsigned code is the signed code obtained from to_int/2,
% cast to uint.
to_uint(Char) = uint.cast_from_int(Code) :-
    char.to_int(Char, Code).

% Ask the compiler to inline calls to from_uint/2.
:- pragma inline(pred(from_uint/2)).

% Convert the unsigned code UInt to a character.
% In every target language, success is decided by comparing the input UInt
% against 0x10ffff.
:- pragma foreign_proc("C",
    from_uint(UInt::in, Character::out),
    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
        does_not_affect_liveness],
"
    Character = (MR_UnsignedChar) UInt;
    SUCCESS_INDICATOR = (UInt <= 0x10ffff);
").
:- pragma foreign_proc("C#",
    from_uint(UInt::in, Character::out),
    [will_not_call_mercury, promise_pure, thread_safe],
"
    Character = (int) UInt;
    SUCCESS_INDICATOR = (UInt <= 0x10ffff);
").
% The Java version masks both operands with 0xffffffffL before comparing
% them, so that the comparison is done on non-negative long values.
:- pragma foreign_proc("Java",
    from_uint(UInt::in, Character::out),
    [will_not_call_mercury, promise_pure, thread_safe],
"
    Character = UInt;
    SUCCESS_INDICATOR = ((UInt & 0xffffffffL) <= (0x10ffff & 0xffffffffL));
").

% Throwing version of from_uint/2: an unsigned code that does not
% correspond to any character causes an exception.
det_from_uint(Code) = Char :-
    ( if char.from_uint(Code, Char0) then
        Char = Char0
    else
        unexpected($pred, "conversion failed")
    ).

%---------------------------------------------------------------------------%

% We use unsigned character codes, so the minimum character code
% is always zero.
min_char_value = 0.

% Predicate form of the function above.
min_char_value(Min) :-
    Min = char.min_char_value.

% Function form: delegates to the predicate form below.
max_char_value = Max :-
    char.max_char_value(Max).

% The largest character code is the last Unicode code point, U+10FFFF,
% regardless of the target language. It is therefore stated once as
% an ordinary Mercury fact, in the same way as min_char_value/1, instead of
% being repeated in a separate foreign_proc for each backend.
max_char_value(0x10ffff).

%---------------------------------------------------------------------------%

% A character is a lowercase ASCII letter if-and-only-if it appears
% in the first column of the lower_upper/2 table.
is_lower(Char) :-
    lower_upper(Char, _Upper).

% A character is an uppercase ASCII letter if-and-only-if it appears
% in the second column of the lower_upper/2 table.
is_upper(Char) :-
    lower_upper(_Lower, Char).

% Function form: delegates to the predicate form below.
to_lower(Char) = Lower :-
    char.to_lower(Char, Lower).

% Map an uppercase ASCII letter to its lowercase partner;
% every other character is returned unchanged.
to_lower(Char, Lower) :-
    Lower = ( if lower_upper(Partner, Char) then Partner else Char ).

% Function form: delegates to the predicate form below.
to_upper(Char) = Upper :-
    char.to_upper(Char, Upper).

% Map a lowercase ASCII letter to its uppercase partner;
% every other character is returned unchanged.
to_upper(Char, Upper) :-
    Upper = ( if lower_upper(Char, Partner) then Partner else Char ).

% The ASCII case-mapping table: each fact pairs a lowercase letter (a-z)
% with the corresponding uppercase letter (A-Z).
% is_lower, is_upper, to_lower and to_upper are all defined
% in terms of this table.
lower_upper('a', 'A').
lower_upper('b', 'B').
lower_upper('c', 'C').
lower_upper('d', 'D').
lower_upper('e', 'E').
lower_upper('f', 'F').
lower_upper('g', 'G').
lower_upper('h', 'H').
lower_upper('i', 'I').
lower_upper('j', 'J').
lower_upper('k', 'K').
lower_upper('l', 'L').
lower_upper('m', 'M').
lower_upper('n', 'N').
lower_upper('o', 'O').
lower_upper('p', 'P').
lower_upper('q', 'Q').
lower_upper('r', 'R').
lower_upper('s', 'S').
lower_upper('t', 'T').
lower_upper('u', 'U').
lower_upper('v', 'V').
lower_upper('w', 'W').
lower_upper('x', 'X').
lower_upper('y', 'Y').
lower_upper('z', 'Z').

%---------------------------------------------------------------------------%

% A character is in the ASCII range if-and-only-if its code lies
% between 0x00 and 0x7f inclusive.
is_ascii(Char) :-
    char.to_int(Char, Code),
    0x00 =< Code,
    0x7f >= Code.

% The information here is duplicated in lookup_token_action in
% mercury_term_lexer.m. If you update this, you will also need to update that.
is_whitespace(Char) :-
    ( Char = ' '
    ; Char = '\t'
    ; Char = '\n'
    ; Char = '\v'
    ; Char = '\f'
    ; Char = '\r'
    ).

% An ASCII letter is either a lowercase letter or an uppercase letter.
is_alpha(Char) :-
    ( if is_lower(Char) then
        true
    else
        is_upper(Char)
    ).

% An ASCII alphanumeric character is either a letter or a decimal digit.
is_alnum(Char) :-
    ( if is_alpha(Char) then
        true
    else
        is_digit(Char)
    ).

% Succeeds for the ASCII letters and for the underscore character.
is_alpha_or_underscore(Char) :-
    ( if is_alpha(Char) then
        true
    else
        Char = '_'
    ).

% Succeeds for the ASCII decimal digits (0-9), the ASCII letters
% (a-z, A-Z), and the underscore character.
is_alnum_or_underscore(Char) :-
    % We explicitly enumerate here for efficiency.
    % (The information here and in some of the following predicates,
    % e.g. lower_upper, is duplicated in lookup_token_action
    % in mercury_term_lexer.m.)
    %
    % A more concise implementation would be:
    %
    %   ( if is_digit(Char) then
    %       true
    %   else
    %       is_alpha_or_underscore(Char)
    %   ).

    ( Char = '0' ; Char = '1' ; Char = '2' ; Char = '3' ; Char = '4'
    ; Char = '5' ; Char = '6' ; Char = '7' ; Char = '8' ; Char = '9'
    ; Char = 'a' ; Char = 'b' ; Char = 'c' ; Char = 'd' ; Char = 'e'
    ; Char = 'f' ; Char = 'g' ; Char = 'h' ; Char = 'i' ; Char = 'j'
    ; Char = 'k' ; Char = 'l' ; Char = 'm' ; Char = 'n' ; Char = 'o'
    ; Char = 'p' ; Char = 'q' ; Char = 'r' ; Char = 's' ; Char = 't'
    ; Char = 'u' ; Char = 'v' ; Char = 'w' ; Char = 'x' ; Char = 'y'
    ; Char = 'z'
    ; Char = 'A' ; Char = 'B' ; Char = 'C' ; Char = 'D' ; Char = 'E'
    ; Char = 'F' ; Char = 'G' ; Char = 'H' ; Char = 'I' ; Char = 'J'
    ; Char = 'K' ; Char = 'L' ; Char = 'M' ; Char = 'N' ; Char = 'O'
    ; Char = 'P' ; Char = 'Q' ; Char = 'R' ; Char = 'S' ; Char = 'T'
    ; Char = 'U' ; Char = 'V' ; Char = 'W' ; Char = 'X' ; Char = 'Y'
    ; Char = 'Z'
    ; Char = '_'
    ).

%---------------------------------------------------------------------------%

% Lots of big tables.
%
% It is conceivable that there are more efficient implementations,
% but these versions are very portable.

%---------------------------------------------------------------------------%
%
% Digit classification.
%

% is_digit/1 is a synonym for is_decimal_digit/1.
is_digit(Char) :-
    char.is_decimal_digit(Char).

% The binary digits are '0' and '1'.
is_binary_digit(Char) :-
    ( Char = '0'
    ; Char = '1'
    ).

% The octal digits are '0' to '7'.
is_octal_digit(Char) :-
    ( Char = '0' ; Char = '1' ; Char = '2' ; Char = '3'
    ; Char = '4' ; Char = '5' ; Char = '6' ; Char = '7'
    ).

% The decimal digits are '0' to '9'.
is_decimal_digit(Char) :-
    ( Char = '0' ; Char = '1' ; Char = '2' ; Char = '3' ; Char = '4'
    ; Char = '5' ; Char = '6' ; Char = '7' ; Char = '8' ; Char = '9'
    ).

% The hexadecimal digits are '0' to '9', 'a' to 'f' and 'A' to 'F'.
is_hex_digit(Char) :-
    ( Char = '0' ; Char = '1' ; Char = '2' ; Char = '3' ; Char = '4'
    ; Char = '5' ; Char = '6' ; Char = '7' ; Char = '8' ; Char = '9'
    ; Char = 'a' ; Char = 'b' ; Char = 'c'
    ; Char = 'd' ; Char = 'e' ; Char = 'f'
    ; Char = 'A' ; Char = 'B' ; Char = 'C'
    ; Char = 'D' ; Char = 'E' ; Char = 'F'
    ).

% Validate Base here, then go straight to the unchecked conversion:
% for a base in the range 2 .. 36, that is exactly what base_digit_to_int/3
% would do after repeating the same range check.
is_base_digit(Base, Char) :-
    ( if 2 =< Base, Base =< 36 then
        unsafe_base_digit_to_int(Base, Char, _Value)
    else
        error($pred, "invalid base")
    ).

%---------------------------------------------------------------------------%
%
% Digit to integer conversion.
%

% Map each binary digit to the value it represents.
binary_digit_to_int(Char, Value) :-
    ( Char = '0', Value = 0
    ; Char = '1', Value = 1
    ).

% Throwing version of binary_digit_to_int/2.
det_binary_digit_to_int(Char) = Value :-
    ( if binary_digit_to_int(Char, Value0) then
        Value = Value0
    else
        error($pred, "char.binary_digit_to_int failed")
    ).

% Map each octal digit to the value it represents.
octal_digit_to_int(Char, Value) :-
    ( Char = '0', Value = 0
    ; Char = '1', Value = 1
    ; Char = '2', Value = 2
    ; Char = '3', Value = 3
    ; Char = '4', Value = 4
    ; Char = '5', Value = 5
    ; Char = '6', Value = 6
    ; Char = '7', Value = 7
    ).

% Throwing version of octal_digit_to_int/2.
det_octal_digit_to_int(Char) = Value :-
    ( if octal_digit_to_int(Char, Value0) then
        Value = Value0
    else
        error($pred, "char.octal_digit_to_int failed")
    ).

% Map each decimal digit to the value it represents.
decimal_digit_to_int(Char, Value) :-
    ( Char = '0', Value = 0
    ; Char = '1', Value = 1
    ; Char = '2', Value = 2
    ; Char = '3', Value = 3
    ; Char = '4', Value = 4
    ; Char = '5', Value = 5
    ; Char = '6', Value = 6
    ; Char = '7', Value = 7
    ; Char = '8', Value = 8
    ; Char = '9', Value = 9
    ).

% Throwing version of decimal_digit_to_int/2.
det_decimal_digit_to_int(Char) = Value :-
    ( if decimal_digit_to_int(Char, Value0) then
        Value = Value0
    else
        error($pred, "char.decimal_digit_to_int failed")
    ).

% Map each hexadecimal digit to the value it represents.
% The letters are accepted in both lowercase and uppercase.
hex_digit_to_int(Char, Value) :-
    ( Char = '0', Value = 0
    ; Char = '1', Value = 1
    ; Char = '2', Value = 2
    ; Char = '3', Value = 3
    ; Char = '4', Value = 4
    ; Char = '5', Value = 5
    ; Char = '6', Value = 6
    ; Char = '7', Value = 7
    ; Char = '8', Value = 8
    ; Char = '9', Value = 9
    ; Char = 'a', Value = 10
    ; Char = 'A', Value = 10
    ; Char = 'b', Value = 11
    ; Char = 'B', Value = 11
    ; Char = 'c', Value = 12
    ; Char = 'C', Value = 12
    ; Char = 'd', Value = 13
    ; Char = 'D', Value = 13
    ; Char = 'e', Value = 14
    ; Char = 'E', Value = 14
    ; Char = 'f', Value = 15
    ; Char = 'F', Value = 15
    ).

% Throwing version of hex_digit_to_int/2.
det_hex_digit_to_int(Char) = Value :-
    ( if hex_digit_to_int(Char, Value0) then
        Value = Value0
    else
        error($pred, "char.hex_digit_to_int failed")
    ).

% Check that Base is in the supported range, then hand over to
% the unchecked conversion. An unsupported base is reported by throwing
% an exception, not by failing.
base_digit_to_int(Base, Char, Value) :-
    ( if 2 =< Base, Base =< 36 then
        unsafe_base_digit_to_int(Base, Char, Value)
    else
        error($pred, "base is not in the range 2 .. 36")
    ).

% Throwing version of base_digit_to_int/3.
det_base_digit_to_int(Base, Char) = Value :-
    ( if base_digit_to_int(Base, Char, Value0) then
        Value = Value0
    else
        error($pred, "char.base_digit_to_int failed")
    ).

% Fold a lowercase letter to uppercase first, then look the character up
% by calling int_to_extended_digit/2 with the digit as the input.
% The value found must be smaller than Base for the digit to be valid.
unsafe_base_digit_to_int(Base, Char0, Value) :-
    Char = ( if lower_upper(Char0, Upper) then Upper else Char0 ),
    int_to_extended_digit(Value, Char),
    Value < Base.

%---------------------------------------------------------------------------%
%
% Integer to digit conversion.
%

% Map the values 0 and 1 to the corresponding binary digit.
int_to_binary_digit(Value, Char) :-
    ( Value = 0, Char = '0'
    ; Value = 1, Char = '1'
    ).

% Throwing version of int_to_binary_digit/2.
det_int_to_binary_digit(Value) = Char :-
    ( if int_to_binary_digit(Value, Char0) then
        Char = Char0
    else
        error($pred, "char.int_to_binary_digit failed")
    ).

% Map the values 0 to 7 to the corresponding octal digit.
int_to_octal_digit(Value, Char) :-
    ( Value = 0, Char = '0'
    ; Value = 1, Char = '1'
    ; Value = 2, Char = '2'
    ; Value = 3, Char = '3'
    ; Value = 4, Char = '4'
    ; Value = 5, Char = '5'
    ; Value = 6, Char = '6'
    ; Value = 7, Char = '7'
    ).

% Throwing version of int_to_octal_digit/2.
det_int_to_octal_digit(Value) = Char :-
    ( if int_to_octal_digit(Value, Char0) then
        Char = Char0
    else
        error($pred, "char.int_to_octal_digit failed")
    ).

% Map the values 0 to 9 to the corresponding decimal digit.
int_to_decimal_digit(Value, Char) :-
    ( Value = 0, Char = '0'
    ; Value = 1, Char = '1'
    ; Value = 2, Char = '2'
    ; Value = 3, Char = '3'
    ; Value = 4, Char = '4'
    ; Value = 5, Char = '5'
    ; Value = 6, Char = '6'
    ; Value = 7, Char = '7'
    ; Value = 8, Char = '8'
    ; Value = 9, Char = '9'
    ).

% Throwing version of int_to_decimal_digit/2.
det_int_to_decimal_digit(Value) = Char :-
    ( if int_to_decimal_digit(Value, Char0) then
        Char = Char0
    else
        error($pred, "char.int_to_decimal_digit failed")
    ).

% Map the values 0 to 15 to the corresponding hexadecimal digit.
% The values 10 to 15 are mapped to the uppercase letters A to F.
int_to_hex_digit(Value, Char) :-
    ( Value = 0, Char = '0'
    ; Value = 1, Char = '1'
    ; Value = 2, Char = '2'
    ; Value = 3, Char = '3'
    ; Value = 4, Char = '4'
    ; Value = 5, Char = '5'
    ; Value = 6, Char = '6'
    ; Value = 7, Char = '7'
    ; Value = 8, Char = '8'
    ; Value = 9, Char = '9'
    ; Value = 10, Char = 'A'
    ; Value = 11, Char = 'B'
    ; Value = 12, Char = 'C'
    ; Value = 13, Char = 'D'
    ; Value = 14, Char = 'E'
    ; Value = 15, Char = 'F'
    ).

% Throwing version of int_to_hex_digit/2.
det_int_to_hex_digit(Value) = Char :-
    ( if int_to_hex_digit(Value, Char0) then
        Char = Char0
    else
        error($pred, "char.int_to_hex_digit failed")
    ).

% An unsupported base is reported by throwing an exception. For a supported
% base, the conversion fails unless Value is smaller than Base and
% int_to_extended_digit/2 has a digit for it.
base_int_to_digit(Base, Value, Char) :-
    ( if 2 =< Base, Base =< 36 then
        Value < Base,
        int_to_extended_digit(Value, Char)
    else
        error($pred, "invalid base")
    ).

% Throwing version of base_int_to_digit/3.
det_base_int_to_digit(Base, Value) = Char :-
    ( if base_int_to_digit(Base, Value, Char0) then
        Char = Char0
    else
        error($pred, "char.base_int_to_digit failed")
    ).

%---------------------------------------------------------------------------%
%
% Conversion to UTF-8 code units.
%

% The int version of the encoding is the uint8 version with each
% code unit converted to an int.
to_utf8(Char, CodeUnits) :-
    to_utf8_uint8(Char, Bytes),
    CodeUnits = list.map(uint8.to_int, Bytes).

% to_utf8_code_units/6 always returns four code units together with
% the number of them that belong to the encoding; keep just that prefix.
to_utf8_uint8(Char, CodeUnits) :-
    to_utf8_code_units(Char, Length, Byte1, Byte2, Byte3, Byte4),
    (
        Length = 4,
        CodeUnits = [Byte1, Byte2, Byte3, Byte4]
    ;
        Length = 3,
        CodeUnits = [Byte1, Byte2, Byte3]
    ;
        Length = 2,
        CodeUnits = [Byte1, Byte2]
    ;
        Length = 1,
        CodeUnits = [Byte1]
    ).

% to_utf8_code_units(Char, NumCodeUnits, A, B, C, D):
%
% Compute the UTF-8 encoding of Char. NumCodeUnits (1 to 4) is the number
% of code units in the encoding. The code units are A, B, C and D,
% in that order; the ones beyond NumCodeUnits are set to zero.
%
% Fails if the code of Char is in the range 0x800 .. 0xffff and
% char_int_is_surrogate succeeds for it.
% Throws an exception if the code of Char is above 0x10ffff.
:- pred to_utf8_code_units(char::in, int::out(bound(1 ; 2 ; 3 ; 4)),
    uint8::out, uint8::out, uint8::out, uint8::out) is semidet.

to_utf8_code_units(Char, NumCodeUnits, A, B, C, D) :-
    Int = char.to_int(Char),
    ( if Int =< 0x7f then
        % One code unit: the code itself.
        A = uint8.cast_from_int(Int),
        B = 0u8,
        C = 0u8,
        D = 0u8,
        NumCodeUnits = 1
    else if Int =< 0x7ff then
        % Two code units: a lead unit tagged 0xc0 carrying the top
        % five bits, then a unit tagged 0x80 carrying the low six bits.
        A = uint8.cast_from_int(0xc0 \/ ((Int >> 6) /\ 0x1f)),
        B = uint8.cast_from_int(0x80 \/  (Int       /\ 0x3f)),
        C = 0u8,
        D = 0u8,
        NumCodeUnits = 2
    else if Int =< 0xffff then
        % Three code units: a lead unit tagged 0xe0 carrying four bits,
        % then two units tagged 0x80 carrying six bits each.
        % This is the only range in which surrogates are rejected.
        not char_int_is_surrogate(Int),
        A = uint8.cast_from_int(0xe0 \/ ((Int >> 12) /\ 0x0f)),
        B = uint8.cast_from_int(0x80 \/ ((Int >>  6) /\ 0x3f)),
        C = uint8.cast_from_int(0x80 \/  (Int        /\ 0x3f)),
        D = 0u8,
        NumCodeUnits = 3
    else if Int =< 0x10ffff then
        % Four code units: a lead unit tagged 0xf0 carrying three bits,
        % then three units tagged 0x80 carrying six bits each.
        A = uint8.cast_from_int(0xf0 \/ ((Int >> 18) /\ 0x07)),
        B = uint8.cast_from_int(0x80 \/ ((Int >> 12) /\ 0x3f)),
        C = uint8.cast_from_int(0x80 \/ ((Int >>  6) /\ 0x3f)),
        D = uint8.cast_from_int(0x80 \/  (Int        /\ 0x3f)),
        NumCodeUnits = 4
    else
        error($pred, "illegal code point")
    ).

%---------------------------------------------------------------------------%
%
% Conversion to UTF-16 code units.
%

% The int version of the encoding is the uint16 version with each
% code unit converted to an int.
to_utf16(Char, CodeUnits) :-
    to_utf16_uint16(Char, Units),
    CodeUnits = list.map(uint16.to_int, Units).

% to_utf16_code_units/4 always returns two code units together with
% the number of them that belong to the encoding; keep just that prefix.
to_utf16_uint16(Char, CodeUnits) :-
    to_utf16_code_units(Char, Length, Unit1, Unit2),
    (
        Length = 2,
        CodeUnits = [Unit1, Unit2]
    ;
        Length = 1,
        CodeUnits = [Unit1]
    ).

:- pred to_utf16_code_units(char::in, int::out(bound(1 ; 2)),
    uint16::out, uint16::out) is semidet.

to_utf16_code_units(Char, NumCodeUnits, A, B) :-
    % Encode Char as one or two UTF-16 code units. NumCodeUnits is the
    % number of significant units; when it is 1, B is set to zero.
    % Fails if Char is a surrogate code point.
    CodePoint = char.to_int(Char),
    ( if CodePoint < 0xd800 then
        % Common case: below the surrogate range, one unit.
        NumCodeUnits = 1,
        A = uint16.cast_from_int(CodePoint),
        B = 0u16
    else if CodePoint =< 0xdfff then
        % Surrogate.
        fail
    else if CodePoint =< 0xffff then
        % Above the surrogate range but still a single unit.
        NumCodeUnits = 1,
        A = uint16.cast_from_int(CodePoint),
        B = 0u16
    else if CodePoint =< 0x10ffff then
        % Two units: the offset from 0x10000 is split into its high bits
        % (shifted down by 10) and its low 10 bits.
        NumCodeUnits = 2,
        Offset = CodePoint - 0x10000,
        HighBits = Offset >> 10,
        LowBits = Offset /\ 0x3ff,
        A = uint16.cast_from_int(0xd800 \/ HighBits),
        B = uint16.cast_from_int(0xdc00 \/ LowBits)
    else
        % Anything above 0x10ffff is reported through error/2.
        error($pred, "illegal code point")
    ).

%---------------------------------------------------------------------------%

is_surrogate(Char) :-
    % The range test itself is done by char_int_is_surrogate.
    char_int_is_surrogate(char.to_int(Char)).

is_leading_surrogate(Char) :-
    % Succeeds for code points in the range 0xd800 .. 0xdbff.
    CodePoint = char.to_int(Char),
    0xd800 =< CodePoint,
    CodePoint =< 0xdbff.

is_trailing_surrogate(Char) :-
    % Succeeds for code points in the range 0xdc00 .. 0xdfff.
    CodePoint = char.to_int(Char),
    0xdc00 =< CodePoint,
    CodePoint =< 0xdfff.

is_noncharacter(Char) :-
    CodePoint = char.to_int(Char),
    (
        % The contiguous block 0xfdd0 .. 0xfdef.
        0xfdd0 =< CodePoint,
        CodePoint =< 0xfdef
    ;
        % Any code point whose low 16 bits are 0xfffe or 0xffff:
        % masking with 0xfffe ignores the lowest bit.
        Masked = CodePoint /\ 0xfffe,
        Masked = 0xfffe
    ).

is_control(Char) :-
    CodePoint = char.to_int(Char),
    (
        % 0x0000 .. 0x001f.
        0x0000 =< CodePoint,
        CodePoint =< 0x001f
    ;
        % 0x007f .. 0x009f.
        0x007f =< CodePoint,
        CodePoint =< 0x009f
    ).

is_space_separator(Char) :-
    % Succeeds for six individual code points and for the contiguous
    % range 0x2000 .. 0x200a.
    CodePoint = char.to_int(Char),
    (
        CodePoint = 0x0020
    ;
        CodePoint = 0x00a0
    ;
        CodePoint = 0x1680
    ;
        0x2000 =< CodePoint,
        CodePoint =< 0x200a
    ;
        CodePoint = 0x202f
    ;
        CodePoint = 0x205f
    ;
        CodePoint = 0x3000
    ).

is_line_separator(Char) :-
    % Only the single code point 0x2028 qualifies.
    CodePoint = char.to_int(Char),
    CodePoint = 0x2028.

is_paragraph_separator(Char) :-
    % Only the single code point 0x2029 qualifies.
    CodePoint = char.to_int(Char),
    CodePoint = 0x2029.

is_private_use(Char) :-
    CodePoint = char.to_int(Char),
    (
        % Private Use Area.
        0xe000 =< CodePoint,
        CodePoint =< 0xf8ff
    ;
        % Supplemental Private Use Area-A.
        0xf0000 =< CodePoint,
        CodePoint =< 0xffffd
    ;
        % Supplemental Private Use Area-B.
        0x100000 =< CodePoint,
        CodePoint =< 0x10fffd
    ).

char_to_doc(Char) = Doc :-
    % Formatting is delegated to the pretty_printer module.
    Doc = pretty_printer.char_to_doc(Char).

%---------------------------------------------------------------------------%

is_hex_digit(HexChar, Value) :-
    % Thin wrapper: behaves exactly like hex_digit_to_int.
    hex_digit_to_int(HexChar, Value).

int_to_hex_char(Value, HexChar) :-
    % Thin wrapper: behaves exactly like int_to_hex_digit.
    int_to_hex_digit(Value, HexChar).

%---------------------------------------------------------------------------%

% int_to_extended_digit(Int, Char):
%
% Fact table relating the integers 0 .. 35 to the characters '0' .. '9'
% followed by 'A' .. 'Z'. Only upper case letters appear in the table.
% It can be used in either direction; both modes are semidet, failing
% when the input has no entry in the table.
%
:- pred int_to_extended_digit(int, char).
:- mode int_to_extended_digit(in, out) is semidet.
:- mode int_to_extended_digit(out, in) is semidet.

% Decimal digits: 0 .. 9.
int_to_extended_digit(0, '0').
int_to_extended_digit(1, '1').
int_to_extended_digit(2, '2').
int_to_extended_digit(3, '3').
int_to_extended_digit(4, '4').
int_to_extended_digit(5, '5').
int_to_extended_digit(6, '6').
int_to_extended_digit(7, '7').
int_to_extended_digit(8, '8').
int_to_extended_digit(9, '9').
% Upper case letters: 10 .. 35.
int_to_extended_digit(10, 'A').
int_to_extended_digit(11, 'B').
int_to_extended_digit(12, 'C').
int_to_extended_digit(13, 'D').
int_to_extended_digit(14, 'E').
int_to_extended_digit(15, 'F').
int_to_extended_digit(16, 'G').
int_to_extended_digit(17, 'H').
int_to_extended_digit(18, 'I').
int_to_extended_digit(19, 'J').
int_to_extended_digit(20, 'K').
int_to_extended_digit(21, 'L').
int_to_extended_digit(22, 'M').
int_to_extended_digit(23, 'N').
int_to_extended_digit(24, 'O').
int_to_extended_digit(25, 'P').
int_to_extended_digit(26, 'Q').
int_to_extended_digit(27, 'R').
int_to_extended_digit(28, 'S').
int_to_extended_digit(29, 'T').
int_to_extended_digit(30, 'U').
int_to_extended_digit(31, 'V').
int_to_extended_digit(32, 'W').
int_to_extended_digit(33, 'X').
int_to_extended_digit(34, 'Y').
int_to_extended_digit(35, 'Z').

%---------------------------------------------------------------------------%

hash(C) = H :-
    % Hash the character's code point, cast to a uint, using uint.hash.
    CodePoint = char.to_int(C),
    UnsignedCodePoint = uint.cast_from_int(CodePoint),
    uint.hash(UnsignedCodePoint, H).

hash(Char, HashValue) :-
    % Predicate version of the hash/1 function.
    hash(Char) = HashValue.

%---------------------------------------------------------------------------%

char_int_is_surrogate(Int) :-
    % This code is sort-of duplicated, in C, in runtime/mercury_string.h,
    % in the macro MR_is_surrogate.
    %
    % Succeeds iff Int lies in the range 0xd800 .. 0xdfff, inclusive.
    0xd800 =< Int,
    Int =< 0xdfff.

%---------------------------------------------------------------------------%
:- end_module char.
%---------------------------------------------------------------------------%