%---------------------------------------------------------------------------% % vim: ft=mercury ts=4 sw=4 et %---------------------------------------------------------------------------% % Copyright (C) 2000, 2005-2006, 2011 The University of Melbourne. % Copyright (C) 2014, 2016-2018, 2021-2022 The Mercury team. % This file is distributed under the terms specified in COPYING.LIB. %---------------------------------------------------------------------------% % % Main author: conway@cs.mu.oz.au. % % This module provides a bunch of parsing combinators directed towards % parsing text (in some encoding or bunch of encodings). The parsing state % that gets threaded through is polymorphic in the type of the result % stored in it. This can cause problems if you construct a big combinator % expression (particularly using the "or" combinator) where the type % of this result in the initial parsing state is unbound and is inherited % from its context. In this case, the combinator expression cannot be made % into a static ground term (the typeinfo arguments which must come first % are not known until runtime), so it gets constructed every time through. % (See e.g. xml.parse.chars.m for some examples.) % % A useful way to avoid this problem, at least in some cases, is to bind % the type variable by setting a dummy result value. e.g. instead of % % parseChar --> % a or b or c or d or e or .... % % you can write % % :- type dummy ---> dummy. % parseChar --> % return(dummy), % a or b or c or d or e or .... % % This does have a slight runtime cost (doing the return), but it has % the benefit that it makes that great big combinator expression a % constant - a big win. % %---------------------------------------------------------------------------% :- module parsing. :- interface. :- import_module unicode. :- import_module io. :- import_module list. :- import_module map. :- import_module unit. :- import_module univ. %---------------------------------------------------------------------------% :- mode pdi == in. :- mode puo == out. :- type entity_name ---> entity_anon ; entity_internal(string) ; entity_external(string). :- type entity ---> entity( name :: entity_name, text :: string, size :: int, curr :: int ). :- type encoding ---> some [Enc] (enc(Enc) => encoding(Enc)). :- func make_entity(string) = entity. :- func make_entity(entity_name, string) = entity. :- typeclass encoding(Enc) where [ (pred decode(Enc::in, unicode::out, entity::in, entity::out) is semidet), (pred encode(Enc::in, list(unicode)::in, string::out) is det) ]. :- func (make_encoding(Enc) = encoding) <= encoding(Enc). :- typeclass global(K, V) where []. :- type globals == map(univ, univ). :- type pstate(T). :- type parse(T) ---> ok(T) ; error(string). :- pred pstate(entity::in, encoding::in, globals::in, io::di, pstate(unit)::puo) is det. :- pred finish(parse(T1)::out, pstate(T1)::pdi, io::uo) is det. :- pred try_parse(parser(T1, T2)::in(parser), pred(T2, pstate(T2), pstate(T3))::in(pred(in, pdi, puo) is det), pred(string, pstate(T1), pstate(T3))::in(pred(in, pdi, puo) is det), pred(string, pstate(T1), pstate(T3))::in(pred(in, pdi, puo) is det), pstate(T1)::pdi, pstate(T3)::puo) is det. :- pred parse(parser(T1, T2)::in(parser), parse(T2)::out, pstate(T1)::pdi, pstate(T2)::puo) is det. :- pred parse_entity(parser(T1, T2)::in(parser), entity::in, pstate(T1)::pdi, pstate(T2)::puo) is det. :- pred tok(pstate(T1)::pdi, pstate(unicode)::puo) is det. :- pred return(T2::in, pstate(T1)::pdi, pstate(T2)::puo) is det. :- pred return_unit(pstate(T1)::pdi, pstate(unit)::puo) is det. :- pred record_failure(string::in, pstate(T1)::pdi, pstate(T2)::puo) is det. :- pred record_error(string::in, pstate(T1)::pdi, pstate(T2)::puo) is det. :- pred set_encoding(encoding::in, pstate(T1)::pdi, pstate(T1)::puo) is det. :- pred get_encoding(encoding::out, pstate(T1)::pdi, pstate(T1)::puo) is det. % Match a string. % :- pred mstr(string::in, pstate(T1)::pdi, pstate(string)::puo) is det. :- pred mstr_return(string::in, T2::in, pstate(T1)::pdi, pstate(T2)::puo) is det. % Match a character. % :- pred mchr(unicode::in, pstate(T1)::pdi, pstate(unicode)::puo) is det. :- pred mchr_return(unicode::in, T2::in, pstate(T1)::pdi, pstate(T2)::puo) is det. :- pred quote(pstate(T1)::pdi, pstate(unicode)::puo) is det. :- pred io(pred(T1, io, io)::in(pred(out, di, uo) is det), T1::out, pstate(T2)::pdi, pstate(T2)::puo) is det. :- pred io(pred(io, io)::in(pred(di, uo) is det), pstate(T2)::pdi, pstate(T2)::puo) is det. :- pred make_string(list(unicode)::in, string::out, pstate(T2)::pdi, pstate(T2)::puo) is det. :- type next(A, B) ---> next(A, B). :- type opt(T) ---> no ; yes(T). :- type parser(T1, T2) == pred(pstate(T1), pstate(T2)). :- inst parser == (pred(pdi, puo) is det). :- pred and(parser(T1, T2)::in(parser), parser(T2, T3)::in(parser), pstate(T1)::pdi, pstate(next(T2, T3))::puo) is det. :- pred or(parser(T1, T2)::in(parser), parser(T1, T2)::in(parser), pstate(T1)::pdi, pstate(T2)::puo) is det. :- pred next(parser(W, T)::in(parser), pred(T, pstate(T), pstate(U))::in(pred(in, pdi, puo) is det), pstate(W)::pdi, pstate(U)::puo) is det. :- pred star(parser(T1, T2)::in(parser), pstate(T1)::pdi, pstate(list(T2))::puo) is det. :- pred plus(parser(T1, T2)::in(parser), pstate(T1)::pdi, pstate(list(T2))::puo) is det. :- pred call_opt(opt(T0)::in, pred(T0, pstate(T1), pstate(T2))::in(pred(in, pdi, puo) is det), parser(T1, T2)::in(parser), pstate(T1)::pdi, pstate(T2)::puo) is det. :- pred opt_default(parser(T1, T2)::in(parser), T2::in, pstate(T1)::pdi, pstate(T2)::puo) is det. :- pred opt(parser(T1, T2)::in(parser), pstate(T1)::pdi, pstate(opt(T2))::puo) is det. :- pred upto(parser(T1, T2)::in(parser), parser(T1, T3)::in(parser), pstate(T1)::pdi, pstate(next(list(T2), T3))::puo) is det. :- pred range(unicode::in, unicode::in, pstate(T1)::pdi, pstate(unicode)::puo) is det. :- pred '-'(unicode::in, unicode::in, pstate(T1)::pdi, pstate(unicode)::puo) is det. :- pred wrap(parser(T1, T2)::in(parser), pred(T2, T3)::in(pred(in, out) is det), pstate(T1)::pdi, pstate(T3)::puo) is det. :- pred is_a(parser(T1, T2)::in(parser), pstate(T1)::pdi, pstate(unit)::puo) is det. :- pred first(parser(S, next(T, U))::in(parser), pstate(S)::pdi, pstate(T)::puo) is det. :- pred second(parser(S, next(T, U))::in(parser), pstate(S)::pdi, pstate(U)::puo) is det. :- pred except(list(unicode)::in, pstate(T1)::pdi, pstate(unicode)::puo) is det. :- pred no(parser(T1, T2)::in(parser), pstate(T1)::pdi, pstate(opt(T3))::puo) is det. :- pred yes(parser(T1, T2)::in(parser), pstate(T1)::pdi, pstate(opt(T2))::puo) is det. :- pred filter(parser(T1, list(opt(T2)))::in(parser), pstate(T1)::pdi, pstate(list(T2))::puo) is det. :- pred return_no(T1::in, opt(T2)::out) is det. :- pred return_yes(T::in, opt(T)::out) is det. :- pred list(parser(T1, T2)::in(parser), pstate(T1)::pdi, pstate(list(T2))::puo) is det. :- pred get_global(K::in, V::out, pstate(T)::pdi, pstate(T)::puo) is det <= global(K, V). :- pred set_global(K::in, V::in, pstate(T)::pdi, pstate(T)::puo) is det <= global(K, V). %---------------------------------------------------------------------------% %---------------------------------------------------------------------------% :- implementation. :- import_module char. :- import_module int. :- import_module string. %---------------------------------------------------------------------------% % Most of the time, what we are parsing is the text stored as % one of the arguments of the entity field. We step over the characters % of that text by updating the current pointer in the entity, without % doing any I/O. % % We however *do* need access to the I/O state, in two circumstances. % % - When processing external entities. % - When reporting warnings, e.g. about duplicate declarations. % % When this code was originally written, state variable notation % did not yet exist, which is why putting the I/O state inside % the parser state was the obvious way to handle this. The code % of the parser in xml.parse.m now relies heavily on all components % of the parser state being part of pstate(...), and the pervasive use % of higher order constructs that take a pair of pstates makes it % effectively impossible to break up the pstate into pieces without % a complete rewrite. % % There are arguments in favor of such a rewrite. % % - Putting the I/O state into the pstate requires unsafe code, in the % form of unsafe_promise_unique operations every time the I/O state % is taken out of the pstate. % % - Despite the convention that the pstate arguments used by DCG notation % have pdi/puo modes (which are aliases for in/out respectively), % we do have code that backtracks over updates to the pstate. The only % reason why this works is because the I/O state type is a dummy type. % % - Taking the status out of the parser state would allow us to write code % that *has* no data to store inside current status. This would avoid % the compiler warnings about unresolved polymorphism for the handful % of predicates in xml.parser.m that only % % - match keywords, and % - update the global field, % % neither of which constrain the T inside the status(T). % % There are also arguments against such a rewrite, besides the amount of % of work required. % % - The fact that even with state variable notation, the explicit passing % around of the (pieces of the) parse state would create clutter in % the code. This is because most parser predicates, even those that % do not themselves do I/O or touch the globals or the encoding field % would have to pass them around in read/write pairs of arguments, % in order to allow let them reach the predicates that need them % near the leaves of the call tree> % :- type pstate(T) ---> pstate( count :: int, entity :: entity, encoding :: encoding, status :: status(T), globals :: globals, io :: io ). :- type status(T) ---> ps_ok(T) ; ps_fail(string) ; ps_error(string). make_entity(Str) = entity(entity_anon, Str, Size, 0) :- string.length(Str, Size). make_entity(Name, Str) = entity(Name, Str, Size, 0) :- string.length(Str, Size). make_encoding(Enc) = 'new enc'(Enc). pstate(Entity, Enc, Globs, IO, PS) :- PS = pstate(0, Entity, Enc, ps_ok(unit), Globs, IO). finish(Res, PS0, IO) :- get_status(Status, PS0, PS), ( Status = ps_ok(Stuff), Res = ok(Stuff) ; Status = ps_fail(Msg), Res = error(Msg) ; Status = ps_error(Msg), Res = error(Msg) ), unsafe_promise_unique(PS ^ io, IO). try_parse(P, S, F, E) --> mark(M, Ent), get_status(Status0), actuate(P), get_status(Status), ( { Status = ps_ok(X) }, call(S, X) ; { Status = ps_fail(Msg) }, set_status(Status0), reset(M, Ent), call(F, Msg) ; { Status = ps_error(Msg) }, set_status(Status0), call(E, Msg) ). parse(P, Res) --> call(P), get_status(Status), { Status = ps_ok(Stuff), Res = ok(Stuff) ; Status = ps_fail(Msg), Res = error(Msg) ; Status = ps_error(Msg), Res = error(Msg) }. parse_entity(Parser, Entity, !PS) :- E0 = !.PS ^ entity, !PS ^ entity := Entity, call(Parser, !PS), E1 = !.PS ^ entity, ( if E1 ^ curr = E1 ^ size then !PS ^ entity := E0 else record_error("parse finished before the end of the entity", !PS) ). :- pred actuate(parser(T1, T2)::in(parser), pstate(T1)::pdi, pstate(T2)::puo) is det. actuate(P) --> get_status(Status), ( { Status = ps_ok(_) }, call(P) ; { Status = ps_fail(Msg) }, record_failure(Msg) ; { Status = ps_error(Msg) }, record_error(Msg) ). :- pred mark(int::out, entity::out, pstate(T)::pdi, pstate(T)::puo) is det. mark(!.PS ^ count, !.PS ^ entity, !PS). :- pred reset(int::in, entity::in, pstate(T)::pdi, pstate(T)::puo) is det. reset(Count, Entity, !PS) :- !PS ^ count := Count, !PS ^ entity := Entity. tok(!PS) :- enc(Enc) = !.PS ^ encoding, Entity0 = !.PS ^ entity, ( if decode(Enc, Uni, Entity0, Entity) then !PS ^ status := ps_ok(Uni), !PS ^ entity := Entity, Count0 = !.PS ^ count, !PS ^ count := Count0 + 1 else !PS ^ status := ps_fail("eof") ). return(X, !PS) :- !PS ^ status := ps_ok(X). return_unit --> return(unit). record_failure(Msg, !PS) :- !PS ^ status := ps_fail(Msg). record_error(Msg, !PS) :- !PS ^ status := ps_error(Msg). set_encoding(Enc, !PS) :- !PS ^ encoding := Enc. get_encoding(PS ^ encoding, PS, PS). :- pred get_status(status(T)::out, pstate(T)::pdi, pstate(T)::puo) is det. get_status(PS ^ status, PS, PS). :- pred set_status(status(T1)::in, pstate(T2)::pdi, pstate(T1)::puo) is det. set_status(S, PS, PS ^ status := S). mstr(Str) --> { string.to_char_list(Str, Chars) }, (mchrs(Chars) `next` (pred(_::in, pdi, puo) is det --> return(Str) )). mstr_return(Str, Return) --> { string.to_char_list(Str, Chars) }, (mchrs(Chars) `next` (pred(_::in, pdi, puo) is det --> return(Return) )). mchr(U) --> tok `next` (pred(C::in, pdi, puo) is det --> ( if { U = C } then return(U) else record_failure("character didn't match") )). mchr_return(U, Return) --> tok `next` (pred(C::in, pdi, puo) is det --> ( if { U = C } then return(Return) else record_failure("character didn't match") )). % Match the given list of characters. % :- pred mchrs(list(char)::in, pstate(_)::pdi, pstate(unit)::puo) is det. mchrs([]) --> return(unit). mchrs([C | Is]) --> { char.to_int(C, I) }, (tok `next` (pred(I0::in, pdi, puo) is det --> ( if { I = I0 } then mchrs(Is) else record_failure("literal failed to match") ))). quote --> tok `next` (pred(Q::in, pdi, puo) is det --> ( if { Q = ('''') ; Q = ('"') } then return(Q) else record_failure("expected a quote") )). io(Pred, Res, !PS) :- unsafe_promise_unique(!.PS ^ io, IO0), call(Pred, Res, IO0, IO), !PS ^ io := IO. io(Pred, !PS) :- unsafe_promise_unique(!.PS ^ io, IO0), call(Pred, IO0, IO), !PS ^ io := IO. make_string(UniCodes, String, PS, PS) :- enc(Enc) = PS ^ encoding, encode(Enc, UniCodes, String). (A and B) --> actuate(A) `next` (pred(X::in, pdi, puo) is det --> actuate(B) `next` (pred(Y::in, pdi, puo) is det --> return(next(X, Y)) )). (A or B) --> try_parse(A, return, (pred(_::in, pdi, puo) is det --> call(B)), record_error). next(P, T) --> actuate(P), get_status(Status1), ( { Status1 = ps_ok(X) }, call(T, X) ; { Status1 = ps_fail(Msg) }, set_status(ps_fail(Msg)) ; { Status1 = ps_error(Msg) }, set_status(ps_error(Msg)) ). star(P) --> star(P, []). :- pred star(parser(T1, T2)::in(parser), list(T2)::in, pstate(T1)::pdi, pstate(list(T2))::puo) is det. star(P, Xs0) --> get_status(Status0), mark(Start, _Ent), try_parse(P, ( pred(X::in, pdi, puo) is det --> mark(End, _EEnt), ( if { Start \= End } then set_status(Status0), star(P, [X | Xs0]) else record_failure("star(null)") ) ), ( pred(_::in, pdi, puo) is det --> { list.reverse(Xs0, Xs) }, return(Xs) ), record_error ). plus(P) --> get_status(Status0), (actuate(P) `next` (pred(X::in, pdi, puo) is det --> set_status(Status0), star(P, [X]) )). call_opt(no, _Yes, No) --> call(No). call_opt(yes(Thing), Yes, _No) --> call(Yes, Thing). opt_default(P, Def) --> try_parse(P, return, ( pred(_::in, pdi, puo) is det --> return(Def) ), record_error ). opt(P) --> try_parse(P, ( pred(X::in, pdi, puo) is det --> return(yes(X)) ), ( pred(_::in, pdi, puo) is det --> return(no) ), record_error ). upto(Rep, Fin) --> upto(Rep, Fin, []). :- pred upto(parser(T1, T2)::in(parser), parser(T1, T3)::in(parser), list(T2)::in, pstate(T1)::pdi, pstate(next(list(T2), T3))::puo) is det. upto(Rep, Fin, Rs0) --> get_status(Status0), try_parse(Fin, ( pred(F::in, pdi, puo) is det --> { list.reverse(Rs0, Rs) }, return(next(Rs, F)) ), ( pred(_::in, pdi, puo) is det --> set_status(Status0), (Rep `next` (pred(R::in, pdi, puo) is det --> set_status(Status0), upto(Rep, Fin, [R | Rs0]) ))), record_error ). range(F, L) --> tok `next` (pred(C::in, pdi, puo) is det --> ( if { F =< C, C =< L } then return(C) else record_failure("not in range") )). (F - L) --> range(F, L). wrap(P, Q) --> P `next` (pred(X::in, pdi, puo) is det --> { call(Q, X, W) }, return(W) ). is_a(P) --> P `next` (pred(_::in, pdi, puo) is det --> return(unit) ). first(P) --> P `next` (pred(next(T, _)::in, pdi, puo) is det --> return(T) ). second(P) --> P `next` (pred(next(_, T)::in, pdi, puo) is det --> return(T) ). except(Exclusions) --> tok `next` (pred(C::in, pdi, puo) is det --> ( if { list.member(C, Exclusions) } then record_failure("excluded character") else return(C) )). no(Parser) --> Parser `next` (pred(_::in, pdi, puo) is det --> return(no) ). yes(Parser) --> Parser `next` (pred(X::in, pdi, puo) is det --> return(yes(X)) ). filter(Parser) --> Parser `next` (pred(Xs0::in, pdi, puo) is det --> { filter1(Xs0, Xs) }, return(Xs) ). :- pred filter1(list(opt(T))::in, list(T)::out) is det. filter1([], []). filter1([X0 | Xs0], Xs) :- ( X0 = yes(X), filter1(Xs0, Xs1), Xs = [X | Xs1] ; X0 = no, filter1(Xs0, Xs) ). return_no(_, no). return_yes(T, yes(T)). list(P) --> P `next` (pred(X::in, pdi, puo) is det --> return([X]) ). get_global(Key, Val, !PS) :- map.lookup(!.PS ^ globals, univ(Key), Val0), det_univ_to_type(Val0, Val). set_global(Key, Val, !PS) :- map.set(univ(Key), univ(Val), !.PS ^ globals, Globals), !PS ^ globals := Globals.