mercury/extras/xml/xml.encoding.m

%---------------------------------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%---------------------------------------------------------------------------%
% Copyright (C) 2000-2001 The University of Melbourne.
% Copyright (C) 2014, 2018, 2022 The Mercury team.
% This file is distributed under the terms specified in COPYING.LIB.
%---------------------------------------------------------------------------%
%
% Main author: conway@cs.mu.oz.au.
%
%---------------------------------------------------------------------------%

:- module xml.encoding.
:- interface.

:- import_module parsing.

%---------------------------------------------------------------------------%

% Encoding selector types. Each type has exactly one constant constructor,
% so a value carries no data; the predicates in this module ignore the value
% and use the type only to choose an `encoding' instance.
:- type ascii7
    --->    ascii7.
:- type latin1
    --->    latin1.
:- type utf8
    --->    utf8.

% Each selector type is an instance of the `encoding' type class. The class
% is not declared in this module; the method bindings for these instances
% are given in the implementation section.
:- instance encoding(ascii7).
:- instance encoding(latin1).
:- instance encoding(utf8).

%---------------------------------------------------------------------------%
%---------------------------------------------------------------------------%

:- implementation.

:- import_module unicode.

:- import_module char.
:- import_module int.
:- import_module list.
:- import_module require.
:- import_module string.

%---------------------------------------------------------------------------%

% Method bindings for the ascii7 encoding: decode/4 is implemented by
% decode_ascii7 and encode/3 by encode_ascii7.
:- instance encoding(ascii7) where [
    pred(decode/4) is decode_ascii7,
    pred(encode/3) is encode_ascii7
].

% decode_ascii7(Encoding, U, Entity0, Entity):
%
% Take the next item U off Entity0 by means of the `[|]' function that this
% module defines on entities, leaving the rest in Entity. The value is
% passed through unchanged; no range check is applied to it here.
% Fails whenever that function fails.
:- pred decode_ascii7(ascii7::in, unicode::out, entity::in, entity::out)
    is semidet.

decode_ascii7(_Encoding, U, Entity0, Entity) :-
    Entity0 = [U | Entity].

% encode_ascii7(Encoding, Unicodes, Str):
%
% Str is the string made from the characters that unicodes_to_ascii7
% produces for Unicodes. The Encoding argument is ignored.
:- pred encode_ascii7(ascii7::in, list(unicode)::in, string::out) is det.

encode_ascii7(_Encoding, Unicodes, Str) :-
    unicodes_to_ascii7(Unicodes, Chars, []),
    Str = string.from_char_list(Chars).

% unicodes_to_ascii7(Unicodes, Chars0, Chars):
%
% Difference-list conversion: Chars0 is the characters for Unicodes followed
% by Chars. Every value must be strictly between 0x00 and 0x80 and must be
% convertible by char.to_int/2; otherwise an exception is thrown via error/1.
:- pred unicodes_to_ascii7(list(unicode)::in, list(char)::out, list(char)::in)
    is det.

unicodes_to_ascii7([], Chars, Chars).
unicodes_to_ascii7([U | Us], Chars0, Chars) :-
    ( if
        U > 0x00,
        U < 0x80,
        char.to_int(C, U)
    then
        Chars0 = [C | Chars1],
        unicodes_to_ascii7(Us, Chars1, Chars)
    else
        string.format(
            "unicodes_to_ascii7: couldn't convert U-%x to 7bit ascii",
            [i(U)], Msg),
        error(Msg)
    ).

% Method bindings for the latin1 encoding: decode/4 is implemented by
% decode_latin1 and encode/3 by encode_latin1.
:- instance encoding(latin1) where [
    pred(decode/4) is decode_latin1,
    pred(encode/3) is encode_latin1
].

% decode_latin1(Encoding, U, Entity0, Entity):
%
% Take the next item U off Entity0 by means of the `[|]' function that this
% module defines on entities, leaving the rest in Entity. The value is
% returned as read, with no further conversion. Fails whenever that
% function fails.
:- pred decode_latin1(latin1::in, unicode::out, entity::in, entity::out)
    is semidet.

decode_latin1(_Encoding, U, Entity0, Entity) :-
    Entity0 = [U | Entity].

% encode_latin1(Encoding, Unicodes, Str):
%
% Str is the string made from the characters that unicodes_to_latin1
% produces for Unicodes. The Encoding argument is ignored.
:- pred encode_latin1(latin1::in, list(unicode)::in, string::out) is det.

encode_latin1(_Encoding, Unicodes, Str) :-
    unicodes_to_latin1(Unicodes, Chars, []),
    Str = string.from_char_list(Chars).

% unicodes_to_latin1(Unicodes, Chars0, Chars):
%
% Difference-list conversion: Chars0 is the characters for Unicodes followed
% by Chars. Throws an exception via error/1 for any value that is above
% 0xFF or that char.to_int/2 cannot turn into a character.
:- pred unicodes_to_latin1(list(unicode)::in, list(char)::out, list(char)::in)
    is det.

unicodes_to_latin1([]) --> [].
unicodes_to_latin1([U | Us]) -->
    % Latin-1 only covers code points up to 0xFF. The reverse mode of
    % char.to_int/2 alone is not a sufficient guard, because a char can hold
    % code points well beyond that range, so the bound is checked explicitly.
    ( if { U =< 0xFF, char.to_int(C, U) } then
        [C],
        unicodes_to_latin1(Us)
    else
        { string.format("unicodes_to_latin1: couldn't convert U-%x to Latin-1",
            [i(U)], Msg) },
        { error(Msg) }
    ).

% Method bindings for the utf8 encoding: decode/4 is implemented by
% decode_utf8 and encode/3 by encode_utf8.
:- instance encoding(utf8) where [
    pred(decode/4) is decode_utf8,
    pred(encode/3) is encode_utf8
].

% decode_utf8(Encoding, U, Entity0, Entity):
%
% Decode one value U from a sequence of one to six items read from the
% entity. The first item, U0, selects the sequence length: the bits 0x80,
% 0x20, 0x10, 0x08, 0x04 and 0x02 are tested in that order and the first one
% found clear decides how many further items are read. U is assembled from
% the low bits of U0 followed by the low six bits of each further item, most
% significant first.
%
% Fails if the entity runs out of items part-way through, or if none of the
% tested bits of U0 is clear.
%
% NOTE(review): bit 0x40 of U0 is never tested, and the further items are
% not checked to have the form 10xxxxxx, so malformed sequences are decoded
% rather than rejected -- confirm whether this leniency is intended.
:- pred decode_utf8(utf8::in, unicode::out, entity::in, entity::out)
    is semidet.

decode_utf8(_, U) -->
    [U0],
    ( if { U0 /\ 0x80  = 0 } then
        % Single item: the value is U0 itself.
        { U = U0 }
    else if { U0 /\ 0x20 = 0 } then
        % Two items: 5 + 6 payload bits.
        [U1],
        { U = ((U0 /\ 0x1F) << 6) \/ (U1 /\ 0x3F) }
    else if { U0 /\ 0x10 = 0 } then
        % Three items: 4 + 6 + 6 payload bits.
        [U1], [U2],
        { U = ((U0 /\ 0x0F) << 12) \/ ((U1 /\ 0x3F) << 6) \/ (U2 /\ 0x3F) }
    else if { U0 /\ 0x08 = 0 } then
        % Four items: 3 + 3 * 6 payload bits.
        [U1], [U2], [U3],
        { U = ((U0 /\ 0x07) << 18) \/ ((U1 /\ 0x3F) << 12) \/
            ((U2 /\ 0x3F) << 6) \/ (U3 /\ 0x3F) }
    else if { U0 /\ 0x04  = 0 } then
        % Five items: 2 + 4 * 6 payload bits.
        [U1], [U2], [U3], [U4],
        { U = ((U0 /\ 0x03) << 24) \/ ((U1 /\ 0x3F) << 18) \/
            ((U2 /\ 0x3F) << 12) \/ ((U3 /\ 0x3F) << 6) \/ (U4 /\ 0x3F) }
    else if { U0 /\ 0x02  = 0 } then
        % Six items: 1 + 5 * 6 payload bits.
        [U1], [U2], [U3], [U4], [U5],
        { U = ((U0 /\ 0x01) << 30) \/ ((U1 /\ 0x3F) << 24) \/
            ((U2 /\ 0x3F) << 18) \/ ((U3 /\ 0x3F) << 12) \/
            ((U4 /\ 0x3F) << 6) \/ (U5 /\ 0x3F) }
    else
        % None of the tested bits is clear: reject the item by failing
        % rather than throwing an exception.
        %{ error("decode_utf8: bad value!") }
        { fail }
    ).

% encode_utf8(Encoding, Unicodes, Str):
%
% Str is the string made from the characters that unicodes_to_utf8
% produces for Unicodes. The Encoding argument is ignored.
:- pred encode_utf8(utf8::in, list(unicode)::in, string::out) is det.

encode_utf8(_Encoding, Unicodes, Str) :-
    unicodes_to_utf8(Unicodes, Chars, []),
    Str = string.from_char_list(Chars).

% unicodes_to_utf8(Unicodes, Chars0, Chars):
%
% Difference-list conversion: Chars0 is the characters for Unicodes followed
% by Chars. Each value is turned into between one and six characters, one
% per byte of the sequence, depending on which range it falls in:
%
%   0x00000001 - 0x0000007F     one character
%   0x00000080 - 0x000007FF     two characters
%   0x00000800 - 0x0000FFFF     three characters
%   0x00010000 - 0x001FFFFF     four characters
%   0x00200000 - 0x03FFFFFF     five characters
%   0x04000000 - 0x7FFFFFFF     six characters
%
% A value outside all of these ranges (including zero), or one for which a
% character conversion fails, causes an exception to be thrown via error/1.
:- pred unicodes_to_utf8(list(unicode)::in, list(char)::out, list(char)::in)
    is det.

unicodes_to_utf8([], Chars, Chars).
unicodes_to_utf8([U | Us], Chars0, Chars) :-
    ( if
        U > 0x00, U =< 0x7F,
        char.to_int(C, U)
    then
        Chars0 = [C | Chars1]
    else if
        U >= 0x80, U =< 0x07FF,
        utf8_byte_char(0xC0, 0x1F, 6, U, C0),
        utf8_byte_char(0x80, 0x3F, 0, U, C1)
    then
        Chars0 = [C0, C1 | Chars1]
    else if
        U >= 0x0800, U =< 0xFFFF,
        utf8_byte_char(0xE0, 0x0F, 12, U, C0),
        utf8_byte_char(0x80, 0x3F, 6, U, C1),
        utf8_byte_char(0x80, 0x3F, 0, U, C2)
    then
        Chars0 = [C0, C1, C2 | Chars1]
    else if
        U >= 0x010000, U =< 0x1FFFFF,
        utf8_byte_char(0xF0, 0x07, 18, U, C0),
        utf8_byte_char(0x80, 0x3F, 12, U, C1),
        utf8_byte_char(0x80, 0x3F, 6, U, C2),
        utf8_byte_char(0x80, 0x3F, 0, U, C3)
    then
        Chars0 = [C0, C1, C2, C3 | Chars1]
    else if
        U >= 0x200000, U =< 0x03FFFFFF,
        utf8_byte_char(0xF8, 0x03, 24, U, C0),
        utf8_byte_char(0x80, 0x3F, 18, U, C1),
        utf8_byte_char(0x80, 0x3F, 12, U, C2),
        utf8_byte_char(0x80, 0x3F, 6, U, C3),
        utf8_byte_char(0x80, 0x3F, 0, U, C4)
    then
        Chars0 = [C0, C1, C2, C3, C4 | Chars1]
    else if
        U >= 0x04000000, U =< 0x7FFFFFFF,
        utf8_byte_char(0xFC, 0x01, 30, U, C0),
        utf8_byte_char(0x80, 0x3F, 24, U, C1),
        utf8_byte_char(0x80, 0x3F, 18, U, C2),
        utf8_byte_char(0x80, 0x3F, 12, U, C3),
        utf8_byte_char(0x80, 0x3F, 6, U, C4),
        utf8_byte_char(0x80, 0x3F, 0, U, C5)
    then
        Chars0 = [C0, C1, C2, C3, C4, C5 | Chars1]
    else
        string.format("unicodes_to_utf8: couldn't convert U-%x to UTF-8",
            [i(U)], Msg),
        error(Msg)
    ),
    unicodes_to_utf8(Us, Chars1, Chars).

% utf8_byte_char(Tag, Mask, Shift, U, C):
%
% C is the character whose integer value is Tag \/ (Mask /\ (U >> Shift)),
% i.e. the selected bits of U with the marker bits Tag set on top.
% Fails if char.to_int/2 cannot produce a character for that value.
:- pred utf8_byte_char(int::in, int::in, int::in, unicode::in, char::out)
    is semidet.

utf8_byte_char(Tag, Mask, Shift, U, C) :-
    char.to_int(C, Tag \/ (Mask /\ (U >> Shift))).

% [U | E] = E0, used only in the mode where E0 is the input:
%
% Succeeds only while the `curr' field of E0 is less than its `size' field.
% U is the integer value of the character that string.unsafe_index yields at
% position `curr' of the `text' field, and E is E0 with `curr' increased by
% one. Defining `[|]' on entities in this way is what allows DCG terminals
% such as [U] in the decode predicates of this module to read from an entity.
:- func [unicode | entity] = entity.
:- mode [out | out] = in is semidet.

[U | E] = E0 :-
    Pos = E0 ^ curr,
    Pos < E0 ^ size,
    string.unsafe_index(E0 ^ text, Pos, C),
    char.to_int(C, U),
    E = E0 ^ curr := (Pos + 1).