Files
mercury/library/char.m
Julien Fischer b85f11b41d Fix library documentation errors.
library/*.m:
    As above.

library/getopt.m:
    Regenerate this file.
2026-01-22 21:22:38 +11:00

1275 lines
36 KiB
Mathematica

%---------------------------------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%---------------------------------------------------------------------------%
% Copyright (C) 1994-2008, 2011 The University of Melbourne.
% Copyright (C) 2013-2015, 2017-2022, 2024-2026 The Mercury team.
% This file is distributed under the terms specified in COPYING.LIB.
%---------------------------------------------------------------------------%
%
% File: char.m.
% Main author: fjh.
% Stability: high.
%
% This module defines some predicates that manipulate characters.
%
% Originally we used `character' rather than `char' for the type name
% because `char' was used by NU-Prolog to mean something different.
% But now we use `char' and the use of `character' is discouraged.
%
% All predicates and functions exported by this module that deal with
% Unicode conform to version 13 of the Unicode standard.
%
%---------------------------------------------------------------------------%
%---------------------------------------------------------------------------%
:- module char.
:- interface.
:- import_module enum.
:- import_module list.
:- import_module pretty_printer.
%---------------------------------------------------------------------------%
% A Unicode code point.
%
:- type char == character.
:- instance enum(character).
:- instance uenum(character).
% `to_int'/1 and `to_int(in, out)' convert a character to its
% corresponding numerical code (integer value).
%
% `to_int(out, in)' converts an integer value to a character value.
% It fails for integer values outside of the Unicode range.
%
% Be aware that there is no guarantee that characters can be written to
% files or to the standard output or standard error streams. Files using an
% 8-bit national character set would only be able to represent a subset of
% all possible code points. Currently, the Mercury standard library can
% only read and write UTF-8 text files, so the entire range is supported
% (excluding surrogate and noncharacter code points).
%
% Note that '\0' is not accepted as a Mercury null character literal.
% Instead, a null character can be created using `det_from_int(0)'.
% Null characters are not allowed in Mercury strings in C grades.
%
:- func to_int(char) = int.
:- pred to_int(char, int).
:- mode to_int(in, out) is det.
:- mode to_int(in, in) is semidet. % implied
:- mode to_int(out, in) is semidet.
% Converts an integer to its corresponding character, if any.
% A more expressive name for the reverse mode of to_int.
%
:- pred from_int(int::in, char::out) is semidet.
% Converts an integer to its corresponding character.
% Throws an exception if there isn't one.
%
:- func det_from_int(int) = char.
:- pred det_from_int(int::in, char::out) is det.
% Converts a character to its numerical character code (unsigned integer).
%
:- func to_uint(char) = uint.
% Converts an unsigned integer to its corresponding character, if any.
%
:- pred from_uint(uint::in, char::out) is semidet.
% Converts an unsigned integer to its corresponding character.
% Throws an exception if there isn't one.
%
:- func det_from_uint(uint) = char.
% Returns the minimum numerical character code.
%
:- func min_char_value = int.
:- pred min_char_value(int::out) is det.
% Returns the maximum numerical character code.
%
:- func max_char_value = int.
:- pred max_char_value(int::out) is det.
%---------------------------------------------------------------------------%
% True if-and-only-if the character is a lowercase letter (a-z)
% in the ASCII range.
%
:- pred is_lower(char::in) is semidet.
% True if-and-only-if the character is an uppercase letter (A-Z)
% in the ASCII range.
%
:- pred is_upper(char::in) is semidet.
% Convert a character to lowercase.
% Note that this only converts letters (A-Z) in the ASCII range.
%
:- func to_lower(char) = char.
:- pred to_lower(char::in, char::out) is det.
% Convert a character to uppercase.
% Note that this only converts letters (a-z) in the ASCII range.
%
:- func to_upper(char) = char.
:- pred to_upper(char::in, char::out) is det.
% lower_upper(Lower, Upper) is true if-and-only-if
% Lower is a lowercase letter (a-z) and Upper is the corresponding
% uppercase letter (A-Z) in the ASCII range.
%
:- pred lower_upper(char, char).
:- mode lower_upper(in, out) is semidet.
:- mode lower_upper(out, in) is semidet.
%---------------------------------------------------------------------------%
% True if-and-only-if the character is in the ASCII range (0-127).
%
:- pred is_ascii(char::in) is semidet.
% True if-and-only-if the character is a whitespace character
% in the ASCII range:
%
% U+0020 space
% U+0009 character tabulation (horizontal tab)
% U+000A line feed
% U+000B line tabulation (vertical tab)
% U+000C form feed
% U+000D carriage return
%
:- pred is_whitespace(char::in) is semidet.
% True if-and-only-if the character is a letter (A-Z, a-z)
% in the ASCII range.
%
:- pred is_alpha(char::in) is semidet.
% True if-and-only-if the character is a letter (A-Z, a-z) or digit (0-9)
% in the ASCII range.
%
:- pred is_alnum(char::in) is semidet.
% True if-and-only-if the character is a letter (A-Z, a-z)
% or an underscore (_) in the ASCII range.
%
:- pred is_alpha_or_underscore(char::in) is semidet.
% True if-and-only-if the character is a letter (A-Z, a-z),
% a digit (0-9) or an underscore (_) in the ASCII range.
%
:- pred is_alnum_or_underscore(char::in) is semidet.
%---------------------------------------------------------------------------%
% True if-and-only-if the character is a decimal digit (0-9)
% in the ASCII range.
%
:- pred is_digit(char::in) is semidet.
% True if-and-only-if the character is a binary digit (0 or 1)
% in the ASCII range.
%
:- pred is_binary_digit(char::in) is semidet.
% True if-and-only-if the character is an octal digit (0-7)
% in the ASCII range.
%
:- pred is_octal_digit(char::in) is semidet.
% True if-and-only-if the character is a decimal digit (0-9)
% in the ASCII range. Synonym for is_digit/1.
%
:- pred is_decimal_digit(char::in) is semidet.
% True if-and-only-if the character is a hexadecimal digit (0-9, a-f, A-F)
% in the ASCII range.
%
:- pred is_hex_digit(char::in) is semidet.
% is_base_digit(Base, Digit):
% True if-and-only-if Digit is a digit in the given Base (0-9, a-z, A-Z).
% Throws an exception if Base < 2 or Base > 36.
%
:- pred is_base_digit(int::in, char::in) is semidet.
%---------------------%
% binary_digit_to_int(Char, Int):
% True if-and-only-if Char is a binary digit (0-1) representing
% the value Int.
%
:- pred binary_digit_to_int(char::in, int::out) is semidet.
% As above, but throws an exception instead of failing.
%
:- func det_binary_digit_to_int(char) = int.
% octal_digit_to_int(Char, Int):
% True if-and-only-if Char is an octal digit (0-7) representing
% the value Int.
%
:- pred octal_digit_to_int(char::in, int::out) is semidet.
% As above, but throws an exception instead of failing.
%
:- func det_octal_digit_to_int(char) = int.
% decimal_digit_to_int(Char, Int):
% True if-and-only-if Char is a decimal digit (0-9) representing
% the value Int.
%
:- pred decimal_digit_to_int(char::in, int::out) is semidet.
% As above, but throws an exception instead of failing.
%
:- func det_decimal_digit_to_int(char) = int.
% hex_digit_to_int(Char, Int):
% True if-and-only-if Char is a hexadecimal digit (0-9, a-f or A-F)
% representing the value Int.
%
:- pred hex_digit_to_int(char::in, int::out) is semidet.
% As above, but throws an exception instead of failing.
%
:- func det_hex_digit_to_int(char) = int.
% base_digit_to_int(Base, Char, Int):
% True if-and-only-if Char is a decimal digit (0-9) or a letter (a-z, A-Z)
% representing the value Int (0-35) in the given base.
% Throws an exception if Base < 2 or Base > 36.
%
:- pred base_digit_to_int(int::in, char::in, int::out) is semidet.
% As above, but throws an exception instead of failing.
%
:- func det_base_digit_to_int(int, char) = int.
% A version of base_digit_to_int that does not check whether
% Base is in the range 2 to 36. If it is not, the behavior is undefined.
%
:- pred unsafe_base_digit_to_int(int::in, char::in, int::out) is semidet.
%---------------------%
% Convert an integer in the range 0-1 to a binary digit (0 or 1) in the
% ASCII range.
%
:- pred int_to_binary_digit(int::in, char::out) is semidet.
% As above, but throw an exception instead of failing.
%
:- func det_int_to_binary_digit(int) = char.
% Convert an integer 0-7 to an octal digit (0-7) in the ASCII range.
%
:- pred int_to_octal_digit(int::in, char::out) is semidet.
% As above, but throw an exception instead of failing.
%
:- func det_int_to_octal_digit(int) = char.
% Convert an integer 0-9 to a decimal digit (0-9) in the ASCII range.
%
:- pred int_to_decimal_digit(int::in, char::out) is semidet.
% As above, but throw an exception instead of failing.
%
:- func det_int_to_decimal_digit(int) = char.
% Convert an integer 0-15 to an uppercase hexadecimal digit (0-9, A-F) in
% the ASCII range.
%
:- pred int_to_hex_digit(int::in, char::out) is semidet.
% As above, but throw an exception instead of failing.
%
:- func det_int_to_hex_digit(int) = char.
% base_int_to_digit(Base, Int, Char):
% True if-and-only-if Char is a decimal digit (0-9) or an uppercase letter
% (A-Z) representing the value Int (0-35) in the given base.
% Throws an exception if Base < 2 or Base > 36.
%
:- pred base_int_to_digit(int::in, int::in, char::out) is semidet.
% As above, but throw an exception instead of failing.
%
:- func det_base_int_to_digit(int, int) = char.
%---------------------------------------------------------------------------%
% Encode a Unicode code point in UTF-8.
% Fails for surrogate code points.
%
:- pred to_utf8(char::in, list(int)::out) is semidet.
% As above, but represent UTF-8 code units using uint8s.
%
:- pred to_utf8_uint8(char::in, list(uint8)::out) is semidet.
% Encode a Unicode code point in UTF-16 (native endianness).
% Fails for surrogate code points.
%
:- pred to_utf16(char::in, list(int)::out) is semidet.
% As above, but represent UTF-16 code units using uint16s.
%
:- pred to_utf16_uint16(char::in, list(uint16)::out) is semidet.
% True if-and-only-if the character is a Unicode Surrogate code point,
% that is a code point in General Category `Other,surrogate' (`Cs').
% In UTF-16, a code point with a scalar value greater than 0xffff is
% encoded with a pair of surrogate code points.
%
:- pred is_surrogate(char::in) is semidet.
% True if-and-only-if the character is a Unicode leading surrogate
% code point. A leading surrogate code point is in the inclusive range
% from 0xd800 to 0xdbff.
%
:- pred is_leading_surrogate(char::in) is semidet.
% True if-and-only-if the character is a Unicode trailing surrogate
% code point. A trailing surrogate code point is in the inclusive range
% from 0xdc00 to 0xdfff.
%
:- pred is_trailing_surrogate(char::in) is semidet.
% True if-and-only-if the character is a Unicode Noncharacter code point.
% Sixty-six code points are not used to encode characters.
% These code points should not be used for interchange, but may be used
% internally.
%
:- pred is_noncharacter(char::in) is semidet.
% True if-and-only-if the character is a Unicode Control code point,
% that is a code point in General Category `Other,control' (`Cc').
%
:- pred is_control(char::in) is semidet.
% True if-and-only-if the character is a Unicode Space Separator
% code point, that is a code point in General Category
% `Separator,space' (`Zs').
%
:- pred is_space_separator(char::in) is semidet.
% True if-and-only-if the character is a Unicode Line Separator code point,
% that is a code point in General Category `Separator,line' (`Zl').
%
:- pred is_line_separator(char::in) is semidet.
% True if-and-only-if the character is a Unicode Paragraph Separator
% code point, that is a code point in General Category
% `Separator,paragraph' (`Zp').
%
:- pred is_paragraph_separator(char::in) is semidet.
% True if-and-only-if the character is a Unicode Private-use code point,
% that is a code point in General Category `Other,private use' (`Co').
%
:- pred is_private_use(char::in) is semidet.
%---------------------------------------------------------------------------%
% Convert a char to a pretty_printer.doc for formatting.
%
:- func char_to_doc(char) = pretty_printer.doc.
:- pragma obsolete(func(char_to_doc/1), [pretty_printer.char_to_doc/1]).
%---------------------------------------------------------------------------%
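% The following have all been deprecated.
%
% is_hex_digit(Char, Int):
% True if-and-only-if Char is a hexadecimal digit (0-9, a-f or A-F)
% representing the value Int.
%
% Use hex_digit_to_int/2 instead.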
%
:- pred is_hex_digit(char, int).
:- mode is_hex_digit(in, out) is semidet.
% Convert an integer 0-15 to a hexadecimal digit (0-9, A-F) in the ASCII
% range.
%
% Use int_to_hex_digit/2 instead.
%
:- pred int_to_hex_char(int, char).
:- mode int_to_hex_char(in, out) is semidet.
%---------------------------------------------------------------------------%
%
% Computing hashes of chars.
%
% Compute a hash value for a char.
%
:- func hash(char) = int.
:- pred hash(char::in, int::out) is det.
%---------------------------------------------------------------------------%
%---------------------------------------------------------------------------%
:- implementation.
:- interface.
% A version of is_surrogate that takes the character in its integer form.
% Exported for use by string.m.
%
:- pred char_int_is_surrogate(int::in) is semidet.
%---------------------------------------------------------------------------%
:- implementation.
:- import_module int.
:- import_module require.
:- import_module uint.
:- import_module uint16.
:- import_module uint8.
% Characters are an instance of the enum typeclass; both methods are
% supplied by the to_int/1 function and from_int/2 predicate of this module.
:- instance enum(character) where [
func(to_int/1) is char.to_int,
pred(from_int/2) is char.from_int
].
% Characters are also an instance of the uenum typeclass; both methods are
% supplied by the to_uint/1 function and from_uint/2 predicate of this module.
:- instance uenum(character) where [
func(to_uint/1) is char.to_uint,
pred(from_uint/2) is char.from_uint
].
:- pragma foreign_decl("C", "#include <limits.h>").
%---------------------------------------------------------------------------%
%
% All of
%
% - func to_int/1
% - pred to_int/2
% - pred from_int/2
% - func det_from_int/1
% - pred det_from_int/2
%
% are implemented in terms of pred to_int/2. For the *from_int operations,
% this is possible *only* because that predicate has a reverse mode as
% well as its usual forward mode.
%
% Function form of to_int: delegates to the <in, out> mode of the predicate.
to_int(Char) = Code :-
    char.to_int(Char, Code).
%---------------------%
%
% The <in, out> mode of to_int.
%
% Ask the compiler to inline calls to to_int/2.
:- pragma inline(pred(to_int/2)).
% C grade: the character is converted through an unsigned character type
% before being stored in Int.
:- pragma foreign_proc("C",
to_int(Character::in, Int::out),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness],
"
Int = (MR_UnsignedChar) Character;
").
% C# grade: a plain assignment.
:- pragma foreign_proc("C#",
to_int(Character::in, Int::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
Int = Character;
").
% Java grade: a plain assignment.
:- pragma foreign_proc("Java",
to_int(Character::in, Int::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
Int = Character;
").
%---------------------%
%
% The <in, in> mode of to_int.
%
% With both arguments input, to_int is an equality test between the
% character's code and Int. Each grade sets SUCCESS_INDICATOR to the result
% of that comparison.
%
% C grade: the character is cast to an unsigned character type first,
% matching the <in, out> mode.
:- pragma foreign_proc("C",
to_int(Character::in, Int::in),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness],
"
SUCCESS_INDICATOR = ((MR_UnsignedChar) Character == Int);
").
% C# grade: direct comparison.
:- pragma foreign_proc("C#",
to_int(Character::in, Int::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
SUCCESS_INDICATOR = (Character == Int);
").
% Java grade: direct comparison.
:- pragma foreign_proc("Java",
to_int(Character::in, Int::in),
[will_not_call_mercury, promise_pure, thread_safe],
"
SUCCESS_INDICATOR = (Character == Int);
").
%---------------------%
%
% The <out, in> mode of to_int.
%
% Convert an integer to the character with that code. Succeeds only if
% Int is in the inclusive range 0 .. 0x10ffff; when it fails, the value
% stored in Character is meaningless.
%
% In every grade the range test is applied to Int itself, not to the value
% stored in Character. In the C grade, if the C type of Character is
% narrower than that of Int, the assignment can wrap an out-of-range Int
% into the valid range, and testing Character would then wrongly succeed.
:- pragma foreign_proc("C",
    to_int(Character::out, Int::in),
    [will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
        does_not_affect_liveness],
"
    Character = Int;
    SUCCESS_INDICATOR = (Int >= 0 && Int <= 0x10ffff);
").
:- pragma foreign_proc("C#",
    to_int(Character::out, Int::in),
    [will_not_call_mercury, promise_pure, thread_safe],
"
    Character = Int;
    SUCCESS_INDICATOR = (Int >= 0 && Int <= 0x10ffff);
").
:- pragma foreign_proc("Java",
    to_int(Character::out, Int::in),
    [will_not_call_mercury, promise_pure, thread_safe],
"
    Character = Int;
    SUCCESS_INDICATOR = (Int >= 0 && Int <= 0x10ffff);
").
%---------------------%
% from_int is the <out, in> mode of to_int/2 under a more expressive name;
% it fails exactly when that mode fails.
from_int(Code, Char) :-
    char.to_int(Char, Code).
%---------------------%
% The function form simply calls the predicate form.
det_from_int(Code) = Char :-
    char.det_from_int(Code, Char).

% Convert Code with from_int/2, turning failure into an exception.
det_from_int(Code, Char) :-
    ( if char.from_int(Code, Char0) then
        Char = Char0
    else
        unexpected($pred, "conversion failed")
    ).
%---------------------------------------------------------------------------%
%
% The to_uint/from_uint operations are implemented quite differently from
% their int versions. The reason for this is that while to_int has both
% a forward mode and a reverse mode, to_uint has only the forward mode.
% (By the time we added unsigned integers to the language, experience had
% taught us that more modes are not necessarily better.)
%
% The unsigned code of a character is its int code reinterpreted as a uint.
to_uint(Char) = uint.cast_from_int(Code) :-
    Code = char.to_int(Char).
% from_uint(UInt, Character): convert an unsigned integer to the character
% with that code. Every grade stores the converted value in Character and
% succeeds only if UInt is at most 0x10ffff.
%
% Ask the compiler to inline calls to from_uint/2.
:- pragma inline(pred(from_uint/2)).
% C grade.
:- pragma foreign_proc("C",
from_uint(UInt::in, Character::out),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness],
"
Character = (MR_UnsignedChar) UInt;
SUCCESS_INDICATOR = (UInt <= 0x10ffff);
").
% C# grade.
:- pragma foreign_proc("C#",
from_uint(UInt::in, Character::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
Character = (int) UInt;
SUCCESS_INDICATOR = (UInt <= 0x10ffff);
").
% Java grade: both sides of the comparison are masked to 32 bits and
% compared as longs.
% NOTE(review): presumably this gives an unsigned comparison because UInt
% is held in a signed Java int -- confirm.
:- pragma foreign_proc("Java",
from_uint(UInt::in, Character::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
Character = UInt;
SUCCESS_INDICATOR = ((UInt & 0xffffffffL) <= (0x10ffff & 0xffffffffL));
").
% Convert Code with from_uint/2, turning failure into an exception.
det_from_uint(Code) = Char :-
    ( if char.from_uint(Code, Char0) then
        Char = Char0
    else
        unexpected($pred, "conversion failed")
    ).
%---------------------------------------------------------------------------%
% Character codes are unsigned, so the smallest character code
% is always zero.
min_char_value = 0.

min_char_value(0).
% The function form calls the predicate form.
max_char_value = N :-
max_char_value(N).
% The predicate form is written once per backend; every grade returns
% the same constant, 0x10ffff.
:- pragma foreign_proc("C",
max_char_value(Max::out),
[will_not_call_mercury, promise_pure, thread_safe, will_not_modify_trail,
does_not_affect_liveness],
"
Max = 0x10ffff;
").
:- pragma foreign_proc("C#",
max_char_value(Max::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
Max = 0x10ffff;
").
:- pragma foreign_proc("Java",
max_char_value(Max::out),
[will_not_call_mercury, promise_pure, thread_safe],
"
Max = 0x10ffff;
").
%---------------------------------------------------------------------------%
% A character is lowercase exactly when it is the first argument
% of some lower_upper/2 pair.
is_lower(Char) :-
    lower_upper(Char, _Upper).
% A character is uppercase exactly when it is the second argument
% of some lower_upper/2 pair.
is_upper(Char) :-
    lower_upper(_Lower, Char).
% The function form calls the predicate form.
to_lower(Char) = Lower :-
    char.to_lower(Char, Lower).

% Map an uppercase letter from the lower_upper/2 table to its lowercase
% partner; every other character is returned unchanged.
to_lower(Char, Lower) :-
    Lower = ( if lower_upper(Lower0, Char) then Lower0 else Char ).
% The function form calls the predicate form.
to_upper(Char) = Upper :-
    char.to_upper(Char, Upper).

% Map a lowercase letter from the lower_upper/2 table to its uppercase
% partner; every other character is returned unchanged.
to_upper(Char, Upper) :-
    Upper = ( if lower_upper(Char, Upper0) then Upper0 else Char ).
% Table pairing each lowercase letter a-z with its uppercase counterpart.
% is_lower, is_upper, to_lower and to_upper are all defined in terms of
% this table.
lower_upper('a', 'A').
lower_upper('b', 'B').
lower_upper('c', 'C').
lower_upper('d', 'D').
lower_upper('e', 'E').
lower_upper('f', 'F').
lower_upper('g', 'G').
lower_upper('h', 'H').
lower_upper('i', 'I').
lower_upper('j', 'J').
lower_upper('k', 'K').
lower_upper('l', 'L').
lower_upper('m', 'M').
lower_upper('n', 'N').
lower_upper('o', 'O').
lower_upper('p', 'P').
lower_upper('q', 'Q').
lower_upper('r', 'R').
lower_upper('s', 'S').
lower_upper('t', 'T').
lower_upper('u', 'U').
lower_upper('v', 'V').
lower_upper('w', 'W').
lower_upper('x', 'X').
lower_upper('y', 'Y').
lower_upper('z', 'Z').
%---------------------------------------------------------------------------%
% ASCII characters are those whose code lies in 0x00 .. 0x7f inclusive.
is_ascii(Char) :-
    char.to_int(Char, Code),
    0x00 =< Code,
    Code =< 0x7f.
% NOTE: lookup_token_action in mercury_term_lexer.m duplicates this
% information. Any change made here must be made there as well.
is_whitespace(Char) :-
    ( Char = ' '
    ; Char = '\t'
    ; Char = '\n'
    ; Char = '\v'
    ; Char = '\f'
    ; Char = '\r'
    ).
% A letter is anything that is_lower/1 or is_upper/1 accepts.
is_alpha(Char) :-
    ( is_lower(Char)
    ; is_upper(Char)
    ).
% An alphanumeric character is anything that is_alpha/1 or is_digit/1
% accepts.
is_alnum(Char) :-
    ( is_alpha(Char)
    ; is_digit(Char)
    ).
% Accepts the underscore and anything that is_alpha/1 accepts.
is_alpha_or_underscore(Char) :-
    ( Char = '_'
    ; is_alpha(Char)
    ).
% Succeeds for the digits 0-9, the letters a-z and A-Z, and the underscore.
% Every accepted character is listed explicitly in a single disjunction.
is_alnum_or_underscore(Char) :-
% We explicitly enumerate here for efficiency.
% (The information here and in some of the following predicates,
% e.g. lower_upper, is duplicated in lookup_token_action
% in mercury_term_lexer.m.)
%
% A more concise implementation would be:
%
% ( if is_digit(Char) then
% true
% else
% is_alpha_or_underscore(Char)
% ).
( Char = '0' ; Char = '1' ; Char = '2' ; Char = '3' ; Char = '4'
; Char = '5' ; Char = '6' ; Char = '7' ; Char = '8' ; Char = '9'
; Char = 'a' ; Char = 'b' ; Char = 'c' ; Char = 'd' ; Char = 'e'
; Char = 'f' ; Char = 'g' ; Char = 'h' ; Char = 'i' ; Char = 'j'
; Char = 'k' ; Char = 'l' ; Char = 'm' ; Char = 'n' ; Char = 'o'
; Char = 'p' ; Char = 'q' ; Char = 'r' ; Char = 's' ; Char = 't'
; Char = 'u' ; Char = 'v' ; Char = 'w' ; Char = 'x' ; Char = 'y'
; Char = 'z'
; Char = 'A' ; Char = 'B' ; Char = 'C' ; Char = 'D' ; Char = 'E'
; Char = 'F' ; Char = 'G' ; Char = 'H' ; Char = 'I' ; Char = 'J'
; Char = 'K' ; Char = 'L' ; Char = 'M' ; Char = 'N' ; Char = 'O'
; Char = 'P' ; Char = 'Q' ; Char = 'R' ; Char = 'S' ; Char = 'T'
; Char = 'U' ; Char = 'V' ; Char = 'W' ; Char = 'X' ; Char = 'Y'
; Char = 'Z'
; Char = '_'
).
%---------------------------------------------------------------------------%
% Lots of big tables.
%
% It is conceivable that there are more efficient implementations,
% but these versions are very portable.
%---------------------------------------------------------------------------%
%
% Digit classification.
%
% is_digit/1 is a synonym for is_decimal_digit/1.
is_digit(Char) :-
    is_decimal_digit(Char).
% The binary digits are '0' and '1'.
is_binary_digit(Char) :-
    ( Char = '0'
    ; Char = '1'
    ).
% The octal digits are '0' through '7'.
is_octal_digit(Char) :-
    ( Char = '0' ; Char = '1' ; Char = '2' ; Char = '3'
    ; Char = '4' ; Char = '5' ; Char = '6' ; Char = '7'
    ).
% The decimal digits are '0' through '9'.
is_decimal_digit(Char) :-
    ( Char = '0' ; Char = '1' ; Char = '2' ; Char = '3' ; Char = '4'
    ; Char = '5' ; Char = '6' ; Char = '7' ; Char = '8' ; Char = '9'
    ).
% Table of the hexadecimal digits: 0-9, a-f and A-F.
is_hex_digit('0').
is_hex_digit('1').
is_hex_digit('2').
is_hex_digit('3').
is_hex_digit('4').
is_hex_digit('5').
is_hex_digit('6').
is_hex_digit('7').
is_hex_digit('8').
is_hex_digit('9').
is_hex_digit('a').
is_hex_digit('b').
is_hex_digit('c').
is_hex_digit('d').
is_hex_digit('e').
is_hex_digit('f').
is_hex_digit('A').
is_hex_digit('B').
is_hex_digit('C').
is_hex_digit('D').
is_hex_digit('E').
is_hex_digit('F').
% Digit is a digit of the given Base if it can be converted to an integer
% in that base. A Base outside 2 .. 36 raises an exception. Since the
% range is validated here, the unchecked conversion is called directly.
is_base_digit(Base, Digit) :-
    ( if Base >= 2, Base =< 36 then
        unsafe_base_digit_to_int(Base, Digit, _Value)
    else
        error($pred, "invalid base")
    ).
%---------------------------------------------------------------------------%
%
% Digit to integer conversion.
%
% Map a binary digit character to its value; fails for any other character.
binary_digit_to_int(Char, Value) :-
    ( Char = '0', Value = 0
    ; Char = '1', Value = 1
    ).
% As binary_digit_to_int/2, but a failed conversion raises an exception.
det_binary_digit_to_int(Char) = Value :-
    ( if binary_digit_to_int(Char, Value0) then
        Value = Value0
    else
        error($pred, "char.binary_digit_to_int failed")
    ).
% Map an octal digit character to its value; fails for any other character.
octal_digit_to_int(Char, Value) :-
    ( Char = '0', Value = 0
    ; Char = '1', Value = 1
    ; Char = '2', Value = 2
    ; Char = '3', Value = 3
    ; Char = '4', Value = 4
    ; Char = '5', Value = 5
    ; Char = '6', Value = 6
    ; Char = '7', Value = 7
    ).
% As octal_digit_to_int/2, but a failed conversion raises an exception.
det_octal_digit_to_int(Char) = Value :-
    ( if octal_digit_to_int(Char, Value0) then
        Value = Value0
    else
        error($pred, "char.octal_digit_to_int failed")
    ).
% Map a decimal digit character to its value; fails for any other character.
decimal_digit_to_int(Char, Value) :-
    ( Char = '0', Value = 0
    ; Char = '1', Value = 1
    ; Char = '2', Value = 2
    ; Char = '3', Value = 3
    ; Char = '4', Value = 4
    ; Char = '5', Value = 5
    ; Char = '6', Value = 6
    ; Char = '7', Value = 7
    ; Char = '8', Value = 8
    ; Char = '9', Value = 9
    ).
% As decimal_digit_to_int/2, but a failed conversion raises an exception.
det_decimal_digit_to_int(Char) = Value :-
    ( if decimal_digit_to_int(Char, Value0) then
        Value = Value0
    else
        error($pred, "char.decimal_digit_to_int failed")
    ).
% Table mapping each hexadecimal digit to its value. The letters are
% accepted in both cases: a-f and A-F both map to 10-15.
hex_digit_to_int('0', 0).
hex_digit_to_int('1', 1).
hex_digit_to_int('2', 2).
hex_digit_to_int('3', 3).
hex_digit_to_int('4', 4).
hex_digit_to_int('5', 5).
hex_digit_to_int('6', 6).
hex_digit_to_int('7', 7).
hex_digit_to_int('8', 8).
hex_digit_to_int('9', 9).
hex_digit_to_int('a', 10).
hex_digit_to_int('b', 11).
hex_digit_to_int('c', 12).
hex_digit_to_int('d', 13).
hex_digit_to_int('e', 14).
hex_digit_to_int('f', 15).
hex_digit_to_int('A', 10).
hex_digit_to_int('B', 11).
hex_digit_to_int('C', 12).
hex_digit_to_int('D', 13).
hex_digit_to_int('E', 14).
hex_digit_to_int('F', 15).
% As hex_digit_to_int/2, but a failed conversion raises an exception.
det_hex_digit_to_int(Char) = Value :-
    ( if hex_digit_to_int(Char, Value0) then
        Value = Value0
    else
        error($pred, "char.hex_digit_to_int failed")
    ).
% Validate that Base lies in 2 .. 36, then defer to the unchecked version.
% An out-of-range Base raises an exception rather than failing.
base_digit_to_int(Base, Char, Value) :-
    ( if Base >= 2, Base =< 36 then
        unsafe_base_digit_to_int(Base, Char, Value)
    else
        error($pred, "base is not in the range 2 .. 36")
    ).
% As base_digit_to_int/3, but a failed conversion raises an exception.
det_base_digit_to_int(Base, Char) = Value :-
    ( if base_digit_to_int(Base, Char, Value0) then
        Value = Value0
    else
        error($pred, "char.base_digit_to_int failed")
    ).
% Convert Char to its digit value without validating Base.
% The int_to_extended_digit/2 table only lists uppercase letters, so
% lowercase letters are folded to uppercase before the lookup. The
% conversion fails if Char is not in the table or its value is not
% below Base.
unsafe_base_digit_to_int(Base, Char, Value) :-
    UpperChar = char.to_upper(Char),
    int_to_extended_digit(Value, UpperChar),
    Value < Base.
%---------------------------------------------------------------------------%
%
% Integer to digit conversion.
%
% Map 0 or 1 to the corresponding digit character; fails for other integers.
int_to_binary_digit(Value, Digit) :-
    ( Value = 0, Digit = '0'
    ; Value = 1, Digit = '1'
    ).
% As int_to_binary_digit/2, but a failed conversion raises an exception.
det_int_to_binary_digit(Value) = Digit :-
    ( if int_to_binary_digit(Value, Digit0) then
        Digit = Digit0
    else
        error($pred, "char.int_to_binary_digit failed")
    ).
% Map an integer in 0 .. 7 to the corresponding digit character;
% fails for other integers.
int_to_octal_digit(Value, Digit) :-
    ( Value = 0, Digit = '0'
    ; Value = 1, Digit = '1'
    ; Value = 2, Digit = '2'
    ; Value = 3, Digit = '3'
    ; Value = 4, Digit = '4'
    ; Value = 5, Digit = '5'
    ; Value = 6, Digit = '6'
    ; Value = 7, Digit = '7'
    ).
% As int_to_octal_digit/2, but a failed conversion raises an exception.
det_int_to_octal_digit(Value) = Digit :-
    ( if int_to_octal_digit(Value, Digit0) then
        Digit = Digit0
    else
        error($pred, "char.int_to_octal_digit failed")
    ).
% Map an integer in 0 .. 9 to the corresponding digit character;
% fails for other integers.
int_to_decimal_digit(Value, Digit) :-
    ( Value = 0, Digit = '0'
    ; Value = 1, Digit = '1'
    ; Value = 2, Digit = '2'
    ; Value = 3, Digit = '3'
    ; Value = 4, Digit = '4'
    ; Value = 5, Digit = '5'
    ; Value = 6, Digit = '6'
    ; Value = 7, Digit = '7'
    ; Value = 8, Digit = '8'
    ; Value = 9, Digit = '9'
    ).
% As int_to_decimal_digit/2, but a failed conversion raises an exception.
det_int_to_decimal_digit(Value) = Digit :-
    ( if int_to_decimal_digit(Value, Digit0) then
        Digit = Digit0
    else
        error($pred, "char.int_to_decimal_digit failed")
    ).
% Table mapping each integer in 0 .. 15 to its hexadecimal digit.
% The letter digits produced are uppercase (A-F).
int_to_hex_digit(0, '0').
int_to_hex_digit(1, '1').
int_to_hex_digit(2, '2').
int_to_hex_digit(3, '3').
int_to_hex_digit(4, '4').
int_to_hex_digit(5, '5').
int_to_hex_digit(6, '6').
int_to_hex_digit(7, '7').
int_to_hex_digit(8, '8').
int_to_hex_digit(9, '9').
int_to_hex_digit(10, 'A').
int_to_hex_digit(11, 'B').
int_to_hex_digit(12, 'C').
int_to_hex_digit(13, 'D').
int_to_hex_digit(14, 'E').
int_to_hex_digit(15, 'F').
% As int_to_hex_digit/2, but a failed conversion raises an exception.
det_int_to_hex_digit(Value) = Digit :-
    ( if int_to_hex_digit(Value, Digit0) then
        Digit = Digit0
    else
        error($pred, "char.int_to_hex_digit failed")
    ).
% Convert Int to the digit character representing it in the given Base.
% Fails if Int has no entry in the int_to_extended_digit/2 table or is
% not below Base; raises an exception if Base is outside 2 .. 36.
base_int_to_digit(Base, Int, Digit) :-
    ( if Base >= 2, Base =< 36 then
        int_to_extended_digit(Int, Digit),
        Int < Base
    else
        error($pred, "invalid base")
    ).
% As base_int_to_digit/3, but a failed conversion raises an exception.
det_base_int_to_digit(Base, Value) = Digit :-
    ( if base_int_to_digit(Base, Value, Digit0) then
        Digit = Digit0
    else
        error($pred, "char.base_int_to_digit failed")
    ).
%---------------------------------------------------------------------------%
%
% Conversion to UTF-8 code units.
%
% Encode Char in UTF-8, returning the code units as ints.
% This is the uint8 encoding with every code unit converted to an int.
to_utf8(Char, CodeUnits) :-
    to_utf8_uint8(Char, UInt8CodeUnits),
    CodeUnits = list.map(uint8.to_int, UInt8CodeUnits).
% Encode Char in UTF-8, returning the code units as uint8s.
% to_utf8_code_units/6 always binds four units; only the first Len of them
% are meaningful, so the list is built from exactly that many.
to_utf8_uint8(Char, CodeUnits) :-
    to_utf8_code_units(Char, Len, Unit1, Unit2, Unit3, Unit4),
    (
        Len = 1,
        CodeUnits = [Unit1]
    ;
        Len = 2,
        CodeUnits = [Unit1, Unit2]
    ;
        Len = 3,
        CodeUnits = [Unit1, Unit2, Unit3]
    ;
        Len = 4,
        CodeUnits = [Unit1, Unit2, Unit3, Unit4]
    ).
% to_utf8_code_units(Char, NumCodeUnits, A, B, C, D):
%
% Compute the UTF-8 encoding of Char. NumCodeUnits is the length of the
% encoding (1 to 4) and A, B, C, D are its code units in order; units
% beyond NumCodeUnits are set to zero. Fails for surrogate code points.
% Throws an exception if the code is above 0x10ffff.
%
:- pred to_utf8_code_units(char::in, int::out(bound(1 ; 2 ; 3 ; 4)),
    uint8::out, uint8::out, uint8::out, uint8::out) is semidet.

to_utf8_code_units(Char, NumCodeUnits, A, B, C, D) :-
    Code = char.to_int(Char),
    ( if Code =< 0x7f then
        % One unit: the code itself.
        NumCodeUnits = 1,
        A = uint8.cast_from_int(Code),
        B = 0u8,
        C = 0u8,
        D = 0u8
    else if Code =< 0x7ff then
        % Two units: 110xxxxx 10xxxxxx.
        NumCodeUnits = 2,
        A = uint8.cast_from_int(0xc0 \/ ((Code >> 6) /\ 0x1f)),
        B = uint8.cast_from_int(0x80 \/ (Code /\ 0x3f)),
        C = 0u8,
        D = 0u8
    else if Code =< 0xffff then
        % Three units: 1110xxxx 10xxxxxx 10xxxxxx.
        % Surrogates fall in this range and cannot be encoded.
        not char_int_is_surrogate(Code),
        NumCodeUnits = 3,
        A = uint8.cast_from_int(0xe0 \/ ((Code >> 12) /\ 0x0f)),
        B = uint8.cast_from_int(0x80 \/ ((Code >> 6) /\ 0x3f)),
        C = uint8.cast_from_int(0x80 \/ (Code /\ 0x3f)),
        D = 0u8
    else if Code =< 0x10ffff then
        % Four units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
        NumCodeUnits = 4,
        A = uint8.cast_from_int(0xf0 \/ ((Code >> 18) /\ 0x07)),
        B = uint8.cast_from_int(0x80 \/ ((Code >> 12) /\ 0x3f)),
        C = uint8.cast_from_int(0x80 \/ ((Code >> 6) /\ 0x3f)),
        D = uint8.cast_from_int(0x80 \/ (Code /\ 0x3f))
    else
        error($pred, "illegal code point")
    ).
%---------------------------------------------------------------------------%
%
% Conversion to UTF-16 code units.
%
% Encode Char in UTF-16, returning the code units as ints.
% This is the uint16 encoding with every code unit converted to an int.
to_utf16(Char, CodeUnits) :-
    to_utf16_uint16(Char, UInt16CodeUnits),
    CodeUnits = list.map(uint16.to_int, UInt16CodeUnits).
% Encode Char in UTF-16, returning the code units as uint16s.
% to_utf16_code_units/4 always binds two units; the second is meaningful
% only when Len is 2.
to_utf16_uint16(Char, CodeUnits) :-
    to_utf16_code_units(Char, Len, Unit1, Unit2),
    (
        Len = 1,
        CodeUnits = [Unit1]
    ;
        Len = 2,
        CodeUnits = [Unit1, Unit2]
    ).
% to_utf16_code_units(Char, NumCodeUnits, A, B):
%
% Compute the UTF-16 encoding of Char. NumCodeUnits is the length of the
% encoding (1 or 2) and A, B are its code units in order; B is zero when
% only one unit is needed. Fails for surrogate code points.
% Throws an exception if the code is above 0x10ffff.
%
:- pred to_utf16_code_units(char::in, int::out(bound(1 ; 2)),
    uint16::out, uint16::out) is semidet.

to_utf16_code_units(Char, NumCodeUnits, A, B) :-
    Code = char.to_int(Char),
    ( if Code =< 0xffff then
        % A single unit holding the code itself, unless the code is
        % a surrogate (0xd800 .. 0xdfff), which cannot be encoded.
        not char_int_is_surrogate(Code),
        NumCodeUnits = 1,
        A = uint16.cast_from_int(Code),
        B = 0u16
    else if Code =< 0x10ffff then
        % A surrogate pair: the top ten bits of the offset from 0x10000
        % go in the leading unit, the bottom ten in the trailing unit.
        Offset = Code - 0x10000,
        NumCodeUnits = 2,
        A = uint16.cast_from_int(0xd800 \/ (Offset >> 10)),
        B = uint16.cast_from_int(0xdc00 \/ (Offset /\ 0x3ff))
    else
        error($pred, "illegal code point")
    ).
%---------------------------------------------------------------------------%
% A character is a surrogate if its code is one.
is_surrogate(Char) :-
    char_int_is_surrogate(char.to_int(Char)).
% Leading surrogates occupy 0xd800 .. 0xdbff inclusive.
is_leading_surrogate(Char) :-
    char.to_int(Char, Code),
    0xd800 =< Code,
    Code =< 0xdbff.
% Trailing surrogates occupy 0xdc00 .. 0xdfff inclusive.
is_trailing_surrogate(Char) :-
    char.to_int(Char, Code),
    0xdc00 =< Code,
    Code =< 0xdfff.
% The noncharacters are the block 0xfdd0 .. 0xfdef plus every code whose
% low sixteen bits are 0xfffe or 0xffff.
is_noncharacter(Char) :-
    char.to_int(Char, Code),
    ( Code /\ 0xfffe = 0xfffe
    ; Code >= 0xfdd0, Code =< 0xfdef
    ).
% Control characters occupy 0x0000 .. 0x001f and 0x007f .. 0x009f.
is_control(Char) :-
    char.to_int(Char, Code),
    ( Code >= 0x0000, Code =< 0x001f
    ; Code >= 0x007f, Code =< 0x009f
    ).
% The code points accepted as space separators, listed individually
% apart from the contiguous run 0x2000 .. 0x200a.
is_space_separator(Char) :-
    char.to_int(Char, Code),
    ( Code = 0x0020
    ; Code = 0x00a0
    ; Code = 0x1680
    ; Code >= 0x2000, Code =< 0x200a
    ; Code = 0x202f
    ; Code = 0x205f
    ; Code = 0x3000
    ).
% The only line separator is U+2028.
is_line_separator(Char) :-
    char.to_int(Char, 0x2028).
% The only paragraph separator is U+2029.
is_paragraph_separator(Char) :-
    char.to_int(Char, 0x2029).
% Private-use code points fall into three ranges.
is_private_use(Char) :-
    char.to_int(Char, Code),
    (
        % Private Use Area.
        Code >= 0xe000, Code =< 0xf8ff
    ;
        % Supplemental Private Use Area-A.
        Code >= 0xf0000, Code =< 0xffffd
    ;
        % Supplemental Private Use Area-B.
        Code >= 0x100000, Code =< 0x10fffd
    ).
% Obsolete forwarding function: the work is done by the pretty_printer
% module's function of the same name.
char_to_doc(Char) = Doc :-
    Doc = pretty_printer.char_to_doc(Char).
%---------------------------------------------------------------------------%
% Deprecated alias for hex_digit_to_int/2.
is_hex_digit(Char, Value) :-
    hex_digit_to_int(Char, Value).
% Deprecated alias for int_to_hex_digit/2.
int_to_hex_char(Value, Digit) :-
    int_to_hex_digit(Value, Digit).
%---------------------------------------------------------------------------%
% int_to_extended_digit(Int, Char):
%
% Table relating each integer in 0 .. 35 to the digit character that
% represents it: '0'-'9' for 0-9, then uppercase 'A'-'Z' for 10-35.
% It can be used in either direction. Only uppercase letters appear,
% so a caller with a lowercase letter must convert it first.
%
:- pred int_to_extended_digit(int, char).
:- mode int_to_extended_digit(in, out) is semidet.
:- mode int_to_extended_digit(out, in) is semidet.
int_to_extended_digit(0, '0').
int_to_extended_digit(1, '1').
int_to_extended_digit(2, '2').
int_to_extended_digit(3, '3').
int_to_extended_digit(4, '4').
int_to_extended_digit(5, '5').
int_to_extended_digit(6, '6').
int_to_extended_digit(7, '7').
int_to_extended_digit(8, '8').
int_to_extended_digit(9, '9').
int_to_extended_digit(10, 'A').
int_to_extended_digit(11, 'B').
int_to_extended_digit(12, 'C').
int_to_extended_digit(13, 'D').
int_to_extended_digit(14, 'E').
int_to_extended_digit(15, 'F').
int_to_extended_digit(16, 'G').
int_to_extended_digit(17, 'H').
int_to_extended_digit(18, 'I').
int_to_extended_digit(19, 'J').
int_to_extended_digit(20, 'K').
int_to_extended_digit(21, 'L').
int_to_extended_digit(22, 'M').
int_to_extended_digit(23, 'N').
int_to_extended_digit(24, 'O').
int_to_extended_digit(25, 'P').
int_to_extended_digit(26, 'Q').
int_to_extended_digit(27, 'R').
int_to_extended_digit(28, 'S').
int_to_extended_digit(29, 'T').
int_to_extended_digit(30, 'U').
int_to_extended_digit(31, 'V').
int_to_extended_digit(32, 'W').
int_to_extended_digit(33, 'X').
int_to_extended_digit(34, 'Y').
int_to_extended_digit(35, 'Z').
%---------------------------------------------------------------------------%
% The hash of a character is the uint hash of its unsigned code.
hash(Char) = Hash :-
    uint.hash(char.to_uint(Char), Hash).

% The predicate form calls the function form.
hash(Char, Hash) :-
    Hash = char.hash(Char).
%---------------------------------------------------------------------------%
% Surrogate codes occupy 0xd800 .. 0xdfff inclusive.
char_int_is_surrogate(Int) :-
    % The macro MR_is_surrogate in runtime/mercury_string.h is a C version
    % of (more or less) this same test.
    0xd800 =< Int,
    Int =< 0xdfff.
%---------------------------------------------------------------------------%
:- end_module char.
%---------------------------------------------------------------------------%