Files
mercury/extras/xml/xml.encoding.m
Zoltan Somogyi ce14206488 Clean up several aspects of extras/xml.
extras/xml/parsing.m:
extras/xml/unicode.m:
extras/xml/xml.cat.m:
extras/xml/xml.doc.m:
extras/xml/xml.dtd.m:
extras/xml/xml.encoding.m:
extras/xml/xml.m:
extras/xml/xml.ns.m:
extras/xml/xml.parse.chars.m:
extras/xml/xml.parse.m:
    Use predmode declarations when possible.

    Flatten camelCase.

    Add prefixes to the names of function symbols and fields
    if this avoids ambiguity, either with other parts of this code,
    or with standard Mercury function symbols or predicates.

    Replace uses of graphic characters such as ',' and '-' as
    general purpose function symbols with normal, alphanumeric
    function symbols. (Uses of graphic characters to stand for themselves
    in xml.parse.chars.m are unaffected.)

    Replace uses of graphic symbols such as ',' '->' and '[|]' as type names.

    Improve some other names as well.

    Convert (C->T;E) to (if C then T else E).

    Replace tabs with spaces.

    Delete unused imports.

    Make the order of definitions match the order of declarations.

extras/xml/tryit.m:
    Put the code for handling a single command line argument into its own
    predicate.

    Replace see/seen with read_named_file_as_string.

    Avoid using !IO to pass around the parser state.

    Instead of writing out the parse tree as a single very long line,
    convert both the DTD and the HTML code to a prettyprinter doc,
    and print that. This makes the output actually readable, and
    also makes it usefully diffable as well.

extras/xml/Mmakefile:
    Replace the old do-nothing check action with one that actually does
    check whether the code in this directory can do at least one simple task,
    the one mentioned in samples/README. It would be nice if we had
    more tests, more extensive tests, or (preferably) both, but that would
    require someone who knows the code significantly better than I do.

    Add a rule for making the tags file.

    Compile the modules in this directory with the same default mmc flags
    as we use in the compiler directory.

extras/xml/XML_FLAGS:
    The default flags for modules in this directory.

extras/xml/Mercury.options:
    The non-default flags for modules in this directory.

extras/xml/samples/newsarticles.exp:
    The expected output of running the updated tryit program on the
    (just one) sample input in this directory.

extras/xml/README:
extras/xml/samples/README:
    Replace some obsolete references, and improve formatting.
2022-06-24 07:26:55 +10:00

236 lines
6.8 KiB
Mathematica

%---------------------------------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%---------------------------------------------------------------------------%
% Copyright (C) 2000-2001 The University of Melbourne.
% Copyright (C) 2014, 2018, 2022 The Mercury team.
% This file is distributed under the terms specified in COPYING.LIB.
%---------------------------------------------------------------------------%
%
% Main author: conway@cs.mu.oz.au.
%
%---------------------------------------------------------------------------%
:- module xml.encoding.
:- interface.
:- import_module parsing.
%---------------------------------------------------------------------------%
% One marker type per supported character encoding. Each type has a single
% value with the same name as the type; the value carries no data and is
% only used to select the matching instance of the encoding typeclass.
% NOTE(review): the encoding typeclass itself is not declared in this
% module; presumably it comes from the imported parsing module -- confirm.
%
% 7-bit ASCII.
:- type ascii7
---> ascii7.
% ISO Latin-1.
:- type latin1
---> latin1.
% UTF-8.
:- type utf8
---> utf8.
% The instance bodies (decode/4 and encode/3 for each encoding) are given
% in the implementation section.
:- instance encoding(ascii7).
:- instance encoding(latin1).
:- instance encoding(utf8).
%---------------------------------------------------------------------------%
%---------------------------------------------------------------------------%
:- implementation.
:- import_module unicode.
:- import_module char.
:- import_module int.
:- import_module list.
:- import_module require.
:- import_module string.
%---------------------------------------------------------------------------%
% Instance of the encoding typeclass for 7-bit ASCII: the decode/4 method
% is implemented by decode_ascii7 and the encode/3 method by encode_ascii7.
:- instance encoding(ascii7) where [
pred(decode/4) is decode_ascii7,
pred(encode/3) is encode_ascii7
].
% decode_ascii7(Encoding, CodePoint, Entity0, Entity):
%
% Take the next item off the front of Entity0, using the `[|]' function
% that this module defines on entities, and return it unchanged as
% CodePoint; Entity is the remainder. Fails when that function fails,
% i.e. when nothing is left to read. The value is not range checked here.
%
:- pred decode_ascii7(ascii7::in, unicode::out, entity::in, entity::out)
    is semidet.

decode_ascii7(_Encoding, CodePoint, Entity0, Entity) :-
    % This is exactly what the DCG terminal `[CodePoint]' expands to.
    Entity0 = [CodePoint | Entity].
% encode_ascii7(Encoding, CodePoints, String):
%
% Convert CodePoints to characters with unicodes_to_ascii7 (which throws
% an exception for any value it cannot represent) and pack the resulting
% characters into String.
%
:- pred encode_ascii7(ascii7::in, list(unicode)::in, string::out) is det.

encode_ascii7(_Encoding, CodePoints, String) :-
    unicodes_to_ascii7(CodePoints, Chars, []),
    String = string.from_char_list(Chars).
% unicodes_to_ascii7(CodePoints, Chars0, Chars):
%
% Chars0 is the characters corresponding to CodePoints, followed by Chars
% (the second and third arguments form a difference list, so callers pass
% [] as Chars to get just the converted characters).
%
% Every code point must be strictly between 0x00 and 0x80 and must be
% convertible to a char; otherwise an exception is thrown via error/1.
%
:- pred unicodes_to_ascii7(list(unicode)::in, list(char)::out, list(char)::in)
    is det.

unicodes_to_ascii7([], Chars, Chars).
unicodes_to_ascii7([CodePoint | CodePoints], Chars0, Chars) :-
    ( if
        CodePoint > 0x00,
        CodePoint < 0x80,
        char.from_int(CodePoint, Char)
    then
        Chars0 = [Char | Chars1],
        unicodes_to_ascii7(CodePoints, Chars1, Chars)
    else
        Msg = string.format(
            "unicodes_to_ascii7: couldn't convert U-%x to 7bit ascii",
            [i(CodePoint)]),
        error(Msg)
    ).
% Instance of the encoding typeclass for Latin-1: the decode/4 method
% is implemented by decode_latin1 and the encode/3 method by encode_latin1.
:- instance encoding(latin1) where [
pred(decode/4) is decode_latin1,
pred(encode/3) is encode_latin1
].
% decode_latin1(Encoding, CodePoint, Entity0, Entity):
%
% Take the next item off the front of Entity0, using the `[|]' function
% that this module defines on entities, and return it unchanged as
% CodePoint; Entity is the remainder. Fails when that function fails,
% i.e. when nothing is left to read.
%
:- pred decode_latin1(latin1::in, unicode::out, entity::in, entity::out)
    is semidet.

decode_latin1(_Encoding, CodePoint, Entity0, Entity) :-
    % This is exactly what the DCG terminal `[CodePoint]' expands to.
    Entity0 = [CodePoint | Entity].
% encode_latin1(Encoding, CodePoints, String):
%
% Convert CodePoints to characters with unicodes_to_latin1 (which throws
% an exception for any value it cannot convert) and pack the resulting
% characters into String.
%
:- pred encode_latin1(latin1::in, list(unicode)::in, string::out) is det.

encode_latin1(_Encoding, CodePoints, String) :-
    unicodes_to_latin1(CodePoints, Chars, []),
    String = string.from_char_list(Chars).
% unicodes_to_latin1(Us, Cs0, Cs):
%
% Cs0 is the characters corresponding to the code points Us, followed by
% Cs (the second and third arguments form a difference list, so callers
% pass [] as Cs to get just the converted characters).
%
% Every code point must lie in the range 0x01 .. 0xFF; otherwise an
% exception is thrown via error/1.
%
% The range is tested explicitly because the reverse mode of char.to_int
% succeeds for every Unicode code point, not just for those below 0x100,
% so on its own it would let values that have no Latin-1 representation
% through. Zero is excluded for consistency with unicodes_to_ascii7 and
% unicodes_to_utf8, which also reject it.
%
:- pred unicodes_to_latin1(list(unicode)::in, list(char)::out, list(char)::in)
    is det.

unicodes_to_latin1([]) --> [].
unicodes_to_latin1([U | Us]) -->
    ( if { U > 0x00, U =< 0xFF, char.to_int(C, U) } then
        [C],
        unicodes_to_latin1(Us)
    else
        { string.format("unicodes_to_latin1: couldn't convert U-%x to Latin-1",
            [i(U)], Msg) },
        { error(Msg) }
    ).
% Instance of the encoding typeclass for UTF-8: the decode/4 method
% is implemented by decode_utf8 and the encode/3 method by encode_utf8.
:- instance encoding(utf8) where [
pred(decode/4) is decode_utf8,
pred(encode/3) is encode_utf8
].
% decode_utf8(Encoding, U, Entity0, Entity):
%
% Read one UTF-8 encoded value U from the front of Entity0; Entity is what
% remains after the bytes that made up U. Sequences of one to six bytes
% are accepted, mirroring what unicodes_to_utf8 can produce.
%
% Fails if
% - the entity runs out before the sequence is complete,
% - the first byte is not a valid lead byte (i.e. it is a continuation
%   byte 10xxxxxx, or 0xFE / 0xFF), or
% - any following byte is not a continuation byte 10xxxxxx.
%
% The lead byte is matched against its complete bit pattern. Testing only
% for the first zero bit below 0x80 would also accept a stray continuation
% byte as the start of a sequence.
%
% Overlong encodings and surrogate code points are not rejected.
%
:- pred decode_utf8(utf8::in, unicode::out, entity::in, entity::out)
    is semidet.

decode_utf8(_, U) -->
    [B0],
    ( if { B0 /\ 0x80 = 0x00 } then
        % 0xxxxxxx
        { U = B0 }
    else if { B0 /\ 0xE0 = 0xC0 } then
        % 110xxxxx + 1 continuation byte
        utf8_continuation(B1),
        { U = ((B0 /\ 0x1F) << 6) \/ B1 }
    else if { B0 /\ 0xF0 = 0xE0 } then
        % 1110xxxx + 2 continuation bytes
        utf8_continuation(B1),
        utf8_continuation(B2),
        { U = ((B0 /\ 0x0F) << 12) \/ (B1 << 6) \/ B2 }
    else if { B0 /\ 0xF8 = 0xF0 } then
        % 11110xxx + 3 continuation bytes
        utf8_continuation(B1),
        utf8_continuation(B2),
        utf8_continuation(B3),
        { U = ((B0 /\ 0x07) << 18) \/ (B1 << 12) \/ (B2 << 6) \/ B3 }
    else if { B0 /\ 0xFC = 0xF8 } then
        % 111110xx + 4 continuation bytes
        utf8_continuation(B1),
        utf8_continuation(B2),
        utf8_continuation(B3),
        utf8_continuation(B4),
        { U = ((B0 /\ 0x03) << 24) \/ (B1 << 18) \/ (B2 << 12) \/
            (B3 << 6) \/ B4 }
    else if { B0 /\ 0xFE = 0xFC } then
        % 1111110x + 5 continuation bytes
        utf8_continuation(B1),
        utf8_continuation(B2),
        utf8_continuation(B3),
        utf8_continuation(B4),
        utf8_continuation(B5),
        { U = ((B0 /\ 0x01) << 30) \/ (B1 << 24) \/ (B2 << 18) \/
            (B3 << 12) \/ (B4 << 6) \/ B5 }
    else
        % Not a lead byte: a stray continuation byte, 0xFE or 0xFF.
        { fail }
    ).

% utf8_continuation(Bits, Entity0, Entity):
%
% Read one byte from the entity, check that it has the continuation form
% 10xxxxxx, and return its six payload bits. Fails if no byte is left
% or if the byte is not a continuation byte.
%
:- pred utf8_continuation(unicode::out, entity::in, entity::out) is semidet.

utf8_continuation(Bits) -->
    [B],
    { B /\ 0xC0 = 0x80 },
    { Bits = B /\ 0x3F }.
% encode_utf8(Encoding, CodePoints, String):
%
% Convert CodePoints to the characters of their UTF-8 byte sequences with
% unicodes_to_utf8 (which throws an exception for any value it cannot
% encode) and pack the resulting characters into String.
%
:- pred encode_utf8(utf8::in, list(unicode)::in, string::out) is det.

encode_utf8(_Encoding, CodePoints, String) :-
    unicodes_to_utf8(CodePoints, Chars, []),
    String = string.from_char_list(Chars).
% unicodes_to_utf8(CodePoints, Chars0, Chars):
%
% Chars0 is the UTF-8 byte sequences of CodePoints, with each byte turned
% into a char, followed by Chars (the second and third arguments form a
% difference list, so callers pass [] as Chars).
%
% Throws an exception via error/1 for a code point that is outside
% 0x01 .. 0x7FFFFFFF, or whose bytes cannot be converted to chars.
%
:- pred unicodes_to_utf8(list(unicode)::in, list(char)::out, list(char)::in)
    is det.

unicodes_to_utf8([], Chars, Chars).
unicodes_to_utf8([CodePoint | CodePoints], Chars0, Chars) :-
    ( if
        utf8_code_units(CodePoint, Units),
        list.map(char.from_int, Units, UnitChars)
    then
        Chars0 = UnitChars ++ Chars1
    else
        Msg = string.format(
            "unicodes_to_utf8: couldn't convert U-%x to UTF-8",
            [i(CodePoint)]),
        error(Msg)
    ),
    unicodes_to_utf8(CodePoints, Chars1, Chars).

% utf8_code_units(U, Units):
%
% Units is the list of byte values (lead byte first) of the one to six
% byte UTF-8 style encoding of U. Fails if U is not in 0x01 .. 0x7FFFFFFF.
%
:- pred utf8_code_units(unicode::in, list(int)::out) is semidet.

utf8_code_units(U, Units) :-
    ( if U > 0x00, U =< 0x7F then
        Units = [U]
    else if U >= 0x80, U =< 0x07FF then
        Units = [0xC0 \/ (0x1F /\ (U >> 6)),
            utf8_trail(U, 0)]
    else if U >= 0x0800, U =< 0xFFFF then
        Units = [0xE0 \/ (0x0F /\ (U >> 12)),
            utf8_trail(U, 6), utf8_trail(U, 0)]
    else if U >= 0x010000, U =< 0x1FFFFF then
        Units = [0xF0 \/ (0x07 /\ (U >> 18)),
            utf8_trail(U, 12), utf8_trail(U, 6), utf8_trail(U, 0)]
    else if U >= 0x200000, U =< 0x03FFFFFF then
        Units = [0xF8 \/ (0x03 /\ (U >> 24)),
            utf8_trail(U, 18), utf8_trail(U, 12), utf8_trail(U, 6),
            utf8_trail(U, 0)]
    else
        U >= 0x04000000, U =< 0x7FFFFFFF,
        Units = [0xFC \/ (0x01 /\ (U >> 30)),
            utf8_trail(U, 24), utf8_trail(U, 18), utf8_trail(U, 12),
            utf8_trail(U, 6), utf8_trail(U, 0)]
    ).

% utf8_trail(U, Shift):
%
% The continuation byte 10xxxxxx that carries the six bits of U starting
% at bit position Shift.
%
:- func utf8_trail(unicode, int) = int.

utf8_trail(U, Shift) = 0x80 \/ (0x3F /\ (U >> Shift)).
% [U | E] = E0:
%
% "List cons" on entities, used in its reverse mode so that a DCG terminal
% such as `[U]' can read from an entity: U is the code unit of E0's text
% at position `curr', and E is E0 with `curr' advanced by one. Fails when
% `curr' has reached `size'.
%
% The text is read with string.unsafe_index_code_unit rather than
% string.unsafe_index. The latter returns a whole code point starting at
% the given code unit offset, which disagrees with both the one-step
% advance of `curr' below and with decode_utf8, which assembles code
% points from individual bytes itself.
%
% NOTE(review): this assumes that `size' is the length of `text' in code
% units (as returned by string.length), which is what makes the unsafe
% indexing safe -- confirm against the definition of entity.
%
:- func [unicode | entity] = entity.
:- mode [out | out] = in is semidet.

[U | E] = E0 :-
    Pos = E0 ^ curr,
    Pos < E0 ^ size,
    string.unsafe_index_code_unit(E0 ^ text, Pos, U),
    E = E0 ^ curr := Pos + 1.