mirror of
https://github.com/Mercury-Language/mercury.git
synced 2025-12-18 07:15:19 +00:00
parsers. NEWS: Report the addition of parsing_utils.m to the library. library/library.m: Include parsing_utils.m. library/parsing_utils.m: Added. library/string.m: Make string.to_int fail on overflow. Amend comments to reflect this. tests/general/Mmakefile: tests/general/test_parsing_utils.exp: tests/general/test_parsing_utils.m: Test case for parsing_utils.m.
484 lines
16 KiB
Mathematica
484 lines
16 KiB
Mathematica
%---------------------------------------------------------------------------%
|
|
% vim: ft=mercury ts=4 sw=4 et wm=0 tw=0
|
|
%---------------------------------------------------------------------------%
|
|
% Copyright (C) 2009 The University of Melbourne.
|
|
% This file may only be copied under the terms of the GNU Library General
|
|
% Public License - see the file COPYING.LIB in the Mercury distribution.
|
|
%---------------------------------------------------------------------------%
|
|
%
|
|
% File: parsing_utils.m
|
|
% Author: Ralph Becket <rafe@csse.unimelb.edu.au>
|
|
% Stability: low
|
|
%
|
|
% Utilities for recursive descent parsers. Parsers take at least three
|
|
% arguments: a source (src) containing the input string and a parser
|
|
% state (ps) input/output pair tracking the current offset into the input.
|
|
%
|
|
% A new src and ps can be constructed by calling
|
|
% new_src_and_ps(InputString, Src, !:PS). Parsing predicates are semidet
|
|
% and typically take the form p(...input arguments..., Src, Result, !PS).
|
|
% A parser matching variable assignments of the form `x = 42' might be
|
|
% defined like this:
|
|
%
|
|
% var_assignment(Src, {Var, Value}, !PS) :-
|
|
% var(Src, Var, !PS),
|
|
% punct("=", Src, _, !PS),
|
|
% expr(Src, Value, !PS).
|
|
%
|
|
% where var/4 and expr/4 are parsers for variables and expressions
|
|
% respectively and punct/5 is provided by this module for matching
|
|
% punctuation.
|
|
%
|
|
%-----------------------------------------------------------------------------%
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
:- module parsing_utils.
|
|
|
|
:- interface.
|
|
|
|
:- import_module char.
|
|
:- import_module float.
|
|
:- import_module int.
|
|
:- import_module list.
|
|
:- import_module maybe.
|
|
:- import_module string.
|
|
:- import_module unit.
|
|
|
|
|
|
|
|
% The parser source (input string).
|
|
%
|
|
:- type src.
|
|
|
|
% The parser "state", passed around in DCG arguments.
|
|
%
|
|
:- type ps.
|
|
|
|
% This type and inst are useful for specifying "standard" parser
|
|
% signatures.
|
|
%
|
|
:- type parser(T) == pred(src, T, ps, ps).
|
|
:- inst parser == ( pred(in, out, in, out) is semidet ).
|
|
|
|
% Construct a new parser source and state from a string.
|
|
%
|
|
:- pred new_src_and_ps(string::in, src::out, ps::out) is det.
|
|
|
|
% Obtain the current offset from the start of the input string
|
|
% (the first character in the input has offset 0).
|
|
%
|
|
:- pred current_offset(src::in, int::out,
|
|
ps::in, ps::out) is det.
|
|
|
|
% input_substring(Src, StartOffset, EndOffsetPlusOne, Substring)
|
|
% Copy the substring from the input occupying the offsets
|
|
% [StartOffset, EndOffsetPlusOne).
|
|
%
|
|
:- pred input_substring(src::in, int::in, int::in, string::out) is semidet.
|
|
|
|
% Read the next char.
|
|
%
|
|
:- pred next_char(src::in, char::out,
|
|
ps::in, ps::out) is semidet.
|
|
|
|
% Match a char from the given string.
|
|
%
|
|
:- pred char_in_class(string::in, src::in, char::out,
|
|
ps::in, ps::out) is semidet.
|
|
|
|
% Match a string exactly and any subsequent whitespace.
|
|
%
|
|
:- pred punct(string::in, src::in, unit::out,
|
|
ps::in, ps::out) is semidet.
|
|
|
|
% keyword(IdChars, Keyword, Src, _, !PS) matches Keyword exactly (i.e., it
|
|
% must not be followed by any character in IdChars) and any subsequent
|
|
% whitespace.
|
|
%
|
|
:- pred keyword(string::in, string::in, src::in, unit::out,
|
|
ps::in, ps::out) is semidet.
|
|
|
|
% identifier(InitIdChars, IdChars, Src, Identifier, !PS) matches the next
|
|
% identifier (result in Identifier) comprising a char from InitIdChars
|
|
% followed by zero or more chars from IdChars. Any subsequent whitespace
|
|
% is consumed.
|
|
%
|
|
:- pred identifier(string::in, string::in, src::in, string::out,
|
|
ps::in, ps::out) is semidet.
|
|
|
|
% Consume any whitespace.
|
|
%
|
|
:- pred whitespace(src::in, unit::out,
|
|
ps::in, ps::out) is semidet.
|
|
|
|
% Consume any input up to, and including, the next newline character
|
|
% marking the end of the current line.
|
|
%
|
|
:- pred skip_to_eol(src::in, unit::out,
|
|
ps::in, ps::out) is semidet.
|
|
|
|
% Succeed if we have reached the end of the input.
|
|
%
|
|
:- pred eof(src::in, unit::out,
|
|
ps::in, ps::out) is semidet.
|
|
|
|
% Parse a float literal matching [-]?[0-9]+[.][0-9]+([Ee][-]?[0-9]+)?
|
|
% followed by any whitespace. The float_literal_as_string version simply
|
|
% returns the matched string. The float_literal version uses
|
|
% string.to_float to convert the output of float_literal_as_string; this
|
|
% may return an approximate answer since not all floating point numbers
|
|
% can be perfectly represented as Mercury floats.
|
|
%
|
|
:- pred float_literal_as_string(src::in, string::out,
|
|
ps::in, ps::out) is semidet.
|
|
:- pred float_literal(src::in, float::out,
|
|
ps::in, ps::out) is semidet.
|
|
|
|
% Parse an int literal matching [-]?[0-9]+, not followed by [.][0-9]+,
|
|
% followed by any whitespace. The int_literal_as_string version simply
|
|
% returns the matched string. The int_literal version uses string.to_int
|
|
% to convert the output of int_literal_as_string; this may fail if the
|
|
% number in question cannot be represented as a Mercury int.
|
|
%
|
|
:- pred int_literal_as_string(src::in, string::out,
|
|
ps::in, ps::out) is semidet.
|
|
:- pred int_literal(src::in, int::out,
|
|
ps::in, ps::out) is semidet.
|
|
|
|
% Parse a string literal. The char argument is the quote character.
|
|
% A backslash (\) character in the string makes the next character
|
|
% literal (e.g., for embedding quotes). These 'escaped' characters
|
|
% are included as-is in the result, along with the preceding backslash.
|
|
% Any following whitespace is also consumed.
|
|
%
|
|
:- pred string_literal(char::in, src::in, string::out,
|
|
ps::in, ps::out) is semidet.
|
|
|
|
% optional(P, Src, Result, !PS) returns Result = yes(X), if P(Src, X, !PS),
|
|
% or Result = no if P does not succeed.
|
|
%
|
|
:- pred optional(parser(T)::in(parser), src::in, maybe(T)::out,
|
|
ps::in, ps::out) is semidet.
|
|
|
|
% zero_or_more(P, Src, Xs, !PS) returns the list of results Xs obtained
|
|
% by repeatedly applying P until P fails. The nth item in Xs is
|
|
% the result from the nth application of P.
|
|
%
|
|
:- pred zero_or_more(parser(T)::in(parser), src::in, list(T)::out,
|
|
ps::in, ps::out) is semidet.
|
|
|
|
% one_or_more(P, Src, Xs, !PS) returns the list of results Xs obtained
|
|
% by repeatedly applying P until P fails. The nth item in Xs is
|
|
% the result from the nth application of P. P must succeed at
|
|
% least once.
|
|
%
|
|
:- pred one_or_more(parser(T)::in(parser), src::in, list(T)::out,
|
|
ps::in, ps::out) is semidet.
|
|
|
|
% brackets(L, R, P, Src, X, !PS) is equivalent to
|
|
% punct(L, Src, _, !PS), P(Src, X, !PS), punct(R, Src, _, !PS).
|
|
%
|
|
:- pred brackets(string::in, string::in, parser(T)::in(parser), src::in,
|
|
T::out, ps::in, ps::out) is semidet.
|
|
|
|
% separated_list(Separator, P, Src, Xs, !PS) is like
|
|
% zero_or_more(P, Src, Xs, !PS) except that successive applications of
|
|
% P must be separated by punct(Separator, Src, _, !PS).
|
|
%
|
|
:- pred separated_list(string::in, parser(T)::in(parser), src::in,
|
|
list(T)::out, ps::in, ps::out) is semidet.
|
|
|
|
% comma_separated_list(P, Src, Xs, !PS) is the same as
|
|
% separated_list(",", P, Src, Xs, !PS).
|
|
%
|
|
:- pred comma_separated_list(parser(T)::in(parser), src::in, list(T)::out,
|
|
ps::in, ps::out) is semidet.
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
:- implementation.
|
|
|
|
|
|
|
|
% The parser state is nothing more than the current offset into the
% input string; the interface exports the type abstractly.
%
:- type ps == int.
|
|
|
|
% The parser source: the input string paired with its length.
% new_src_and_ps/3 fills in the length once so that the bounds checks
% in this module can read it from the src.
%
:- type src
    --->    src(
                input_length    :: int,
                input_string    :: string
            ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Build the src for InputString (recording its length alongside it) and
% start the parser state at offset zero.
%
new_src_and_ps(InputString, src(Length, InputString), 0) :-
    string.length(InputString, Length).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
%-----------------------------------------------------------------------------%
|
|
% Low-level predicates.
|
|
%-----------------------------------------------------------------------------%
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% The parser state is itself the offset, so it is handed back as the
% result and passed through unchanged as the new state.
%
current_offset(_Src, Offset, PS, PS) :-
    Offset = PS.
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Succeeds, leaving the state untouched, exactly when the current offset
% equals the length of the input.
%
eof(Src, unit, PS, PS) :-
    PS = Src ^ input_length.
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Fetch the char at the current offset and advance the offset by one.
% Fails if the offset is already at (or beyond) the end of the input.
% The explicit length test is what guards the unchecked string access.
%
next_char(Src, Char, PS0, PS) :-
    PS0 < Src ^ input_length,
    Char = string.unsafe_elem(PS0, Src ^ input_string),
    PS = PS0 + 1.
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Consume one char and succeed only if it occurs in CharClass.
%
char_in_class(CharClass, Src, Char, PS0, PS) :-
    next_char(Src, Char, PS0, PS),
    string.contains_char(CharClass, Char).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% input_substring(Src, Start, EndPlusOne, Substring):
%
% Substring is the part of the input occupying offsets
% [Start, EndPlusOne). Fails unless
%
%   0 =< Start =< EndPlusOne =< length of the input.
%
% All three bounds must be verified here because unsafe_substring
% performs no range checking of its own: a negative Start or a Start
% beyond EndPlusOne would otherwise lead to an out-of-range read or a
% negative character count.
%
input_substring(Src, Start, EndPlusOne, Substring) :-
    0 =< Start,
    Start =< EndPlusOne,
    EndPlusOne =< Src ^ input_length,
    Substring =
        unsafe_substring(Src ^ input_string, Start, EndPlusOne - Start).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% match_string(MatchStr, Src, !PS):
%
% Succeeds iff MatchStr occurs in the input at the current offset,
% in which case the offset is advanced past it. The length test comes
% first so that match_string_2 never indexes beyond the end of the input.
%
:- pred match_string(string::in, src::in, ps::in, ps::out) is semidet.

match_string(MatchStr, Src, PS0, PS) :-
    Len = string.length(MatchStr),
    PS = PS0 + Len,
    PS =< Src ^ input_length,
    match_string_2(Len, 0, MatchStr, PS0, Src ^ input_string).
|
|
|
|
|
|
% match_string_2(N, I, MatchStr, Offset, Str):
%
% Compare chars I .. N - 1 of MatchStr against the chars of Str starting
% at Offset + I, failing at the first mismatch. No bounds checks are
% made here; the caller must ensure all the indices are in range.
%
:- pred match_string_2(int::in, int::in, string::in, int::in, string::in)
    is semidet.

match_string_2(N, I, MatchStr, Offset, Str) :-
    ( if I >= N then
        true
    else
        MatchChar = string.unsafe_elem(I, MatchStr),
        InputChar = string.unsafe_elem(Offset + I, Str),
        MatchChar = InputChar,
        match_string_2(N, I + 1, MatchStr, Offset, Str)
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
%-----------------------------------------------------------------------------%
|
|
% Utility predicates.
|
|
%-----------------------------------------------------------------------------%
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Try P once. On success return yes(X) with the state P produced;
% otherwise return no and leave the state where it was.
% NOTE(review): the semidet_true call presumably exists only so that
% the body agrees with the semidet declaration -- confirm.
%
optional(P, Src, Result, PS0, PS) :-
    ( if P(Src, X, PS0, PS1) then
        Result = yes(X),
        PS = PS1
    else
        Result = no,
        PS = PS0,
        semidet_true
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Apply P repeatedly, collecting the results in order, until it fails.
% The state returned is the one left by the last successful application
% of P (or the initial state if P never succeeded).
%
zero_or_more(P, Src, Result, PS0, PS) :-
    ( if
        P(Src, X, PS0, PS1),
        zero_or_more(P, Src, Xs, PS1, PS2)
    then
        Result = [X | Xs],
        PS = PS2
    else
        Result = [],
        PS = PS0,
        semidet_true
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% As zero_or_more/5, except that the first application of P is mandatory.
%
one_or_more(P, Src, [X | Xs], PS0, PS) :-
    P(Src, X, PS0, PS1),
    zero_or_more(P, Src, Xs, PS1, PS).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Match the punctuation L, then whatever P matches, then the
% punctuation R; the result is the result of P.
%
brackets(L, R, P, Src, Result, PS0, PS) :-
    punct(L, Src, _, PS0, PS1),
    P(Src, Result, PS1, PS2),
    punct(R, Src, _, PS2, PS).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% separated_list(Separator, P, Src, Xs, !PS):
%
% Parse zero or more items matched by P, with successive items separated
% by punct(Separator, ...). Xs holds the results in order.
%
% If P does not match at the current offset the result is the empty list
% and the state is left unchanged, mirroring zero_or_more/5. (Previously
% this case made the whole predicate fail, contradicting the documented
% "like zero_or_more" contract.)
%
% A separator that is not followed by a match for P is not consumed:
% the Separator-then-P step fails as a unit and zero_or_more/5 stops
% with the state as it was before that separator.
%
separated_list(Separator, P, Src, Result, !PS) :-
    SepThenP = ( pred(SepSrc::in, SepX::out, !.PS::in, !:PS::out)
            is semidet :-
        punct(Separator, SepSrc, _, !PS),
        P(SepSrc, SepX, !PS)
    ),
    ( if P(Src, X, !PS) then
        zero_or_more(SepThenP, Src, Xs, !PS),
        Result = [X | Xs]
    else
        Result = []
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% separated_list/6 specialised to "," as the separator.
%
comma_separated_list(P, Src, Result, PS0, PS) :-
    separated_list(",", P, Src, Result, PS0, PS).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Skip over chars for which char.is_whitespace succeeds. Stops, without
% consuming anything further, at the first other char or at the end of
% the input.
%
whitespace(Src, unit, PS0, PS) :-
    ( if
        next_char(Src, C, PS0, PS1),
        char.is_whitespace(C)
    then
        whitespace(Src, _, PS1, PS)
    else
        PS = PS0
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Consume chars up to and including the next newline. Fails if the
% input runs out before a newline is seen, since next_char/4 then fails.
%
skip_to_eol(Src, unit, PS0, PS) :-
    next_char(Src, C, PS0, PS1),
    ( if C = ('\n') then
        PS = PS1
    else
        skip_to_eol(Src, _, PS1, PS)
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Match Punct exactly at the current offset, then skip any whitespace
% that follows it.
%
punct(Punct, Src, unit, PS0, PS) :-
    match_string(Punct, Src, PS0, PS1),
    whitespace(Src, _, PS1, PS).
|
|
|
|
%---------------------------------------------------------------------------%
|
|
|
|
% Match Keyword at the current offset, provided the char immediately
% after it (if any) is not in IdChars, then skip trailing whitespace.
% The lookahead runs on a throwaway copy of the state, so it consumes
% nothing.
%
keyword(IdChars, Keyword, Src, unit, PS0, PS) :-
    match_string(Keyword, Src, PS0, PS1),
    not char_in_class(IdChars, Src, _, PS1, _),
    whitespace(Src, _, PS1, PS).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Match an optional '-', one or more decimal digits, a '.', one or more
% decimal digits, and optionally an exponent ('e' or 'E', an optional
% '-', one or more decimal digits). FloatStr is the matched text;
% whitespace after the literal is consumed but not included in it.
%
% Once an 'e' or 'E' has been seen the exponent digits are mandatory:
% the parse fails rather than falling back to the exponent-free prefix.
%
float_literal_as_string(Src, FloatStr, !PS) :-
    current_offset(Src, Start, !PS),
    optional(char_in_class("-"), Src, _, !PS),
    digits(10, Src, _, !PS),
    next_char(Src, ('.'), !PS),
    digits(10, Src, _, !PS),
    ( if char_in_class("eE", Src, _, !PS) then
        optional(char_in_class("-"), Src, _, !PS),
        digits(10, Src, _, !PS)
    else
        true
    ),
    current_offset(Src, EndPlusOne, !PS),
    input_substring(Src, Start, EndPlusOne, FloatStr),
    whitespace(Src, _, !PS).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Parse a float literal with float_literal_as_string/4 and convert the
% matched text with string.to_float; fails if either step fails.
%
float_literal(Src, Float, PS0, PS) :-
    float_literal_as_string(Src, FloatStr, PS0, PS),
    string.to_float(FloatStr, Float).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Match an optional '-' followed by one or more decimal digits, provided
% that what follows is not a '.' and at least one digit (which would
% make it a float literal). IntStr is the matched text; whitespace after
% the literal is consumed but not included in it.
%
% The states bound inside the negation are local to it, so the lookahead
% consumes no input.
%
int_literal_as_string(Src, IntStr, PS0, PS) :-
    optional(char_in_class("-"), Src, _, PS0, PS1),
    digits(10, Src, _, PS1, PS2),
    not (
        next_char(Src, ('.'), PS2, PS3),
        digits(10, Src, _, PS3, _)
    ),
    input_substring(Src, PS0, PS2, IntStr),
    whitespace(Src, _, PS2, PS).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Parse an int literal with int_literal_as_string/4 and convert the
% matched text with string.to_int; fails if either step fails.
%
int_literal(Src, Int, PS0, PS) :-
    int_literal_as_string(Src, IntStr, PS0, PS),
    string.to_int(IntStr, Int).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% digits(Base, Src, _, !PS):
%
% Consume one or more chars whose char.digit_to_int value is less than
% Base. Fails if the first char is not such a digit.
%
:- pred digits(int::in, src::in, unit::out, ps::in, ps::out) is semidet.

digits(Base, Src, unit, PS0, PS) :-
    next_char(Src, C, PS0, PS1),
    char.digit_to_int(C, D),
    D < Base,
    digits_2(Base, Src, _, PS1, PS).
|
|
|
|
|
|
% digits_2(Base, Src, _, !PS):
%
% Consume zero or more further digits of the given Base, stopping
% (without consuming it) at the first char that is not one.
%
:- pred digits_2(int::in, src::in, unit::out, ps::in, ps::out) is semidet.

digits_2(Base, Src, unit, PS0, PS) :-
    ( if
        next_char(Src, C, PS0, PS1),
        char.digit_to_int(C, D),
        D < Base
    then
        digits_2(Base, Src, _, PS1, PS)
    else
        PS = PS0
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Match QuoteChar, then everything up to and including the next
% unescaped QuoteChar, then any trailing whitespace. String is the text
% strictly between the two quote chars, hence the + 1 and - 1 applied
% to the offsets of the opening quote and of the position just past the
% closing quote.
%
string_literal(QuoteChar, Src, String, PS0, PS) :-
    next_char(Src, QuoteChar, PS0, PS1),
    string_literal_2(Src, QuoteChar, _, PS1, PS2),
    input_substring(Src, PS0 + 1, PS2 - 1, String),
    whitespace(Src, _, PS2, PS).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% string_literal_2(Src, QuoteChar, _, !PS):
%
% Consume chars up to and including the closing QuoteChar. A backslash
% causes the char after it to be skipped without being examined, so an
% escaped quote does not end the literal. Fails if the input ends first.
%
:- pred string_literal_2(src::in, char::in, unit::out, ps::in, ps::out)
    is semidet.

string_literal_2(Src, QuoteChar, unit, PS0, PS) :-
    next_char(Src, C, PS0, PS1),
    ( if C = QuoteChar then
        PS = PS1
    else
        ( if C = ('\\') then
            next_char(Src, _, PS1, PS2)
        else
            PS2 = PS1
        ),
        string_literal_2(Src, QuoteChar, _, PS2, PS)
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Match one char from InitIdChars followed by zero or more chars from
% IdChars. Identifier is the matched text; whitespace after it is
% consumed but not included.
%
identifier(InitIdChars, IdChars, Src, Identifier, PS0, PS) :-
    char_in_class(InitIdChars, Src, _, PS0, PS1),
    identifier_2(IdChars, Src, _, PS1, PS2),
    input_substring(Src, PS0, PS2, Identifier),
    whitespace(Src, _, PS2, PS).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% identifier_2(IdChars, Src, _, !PS):
%
% Consume zero or more chars belonging to IdChars, stopping (without
% consuming it) at the first char that does not.
%
:- pred identifier_2(string::in, src::in, unit::out, ps::in, ps::out)
    is semidet.

identifier_2(IdChars, Src, unit, PS0, PS) :-
    ( if char_in_class(IdChars, Src, _, PS0, PS1) then
        identifier_2(IdChars, Src, _, PS1, PS)
    else
        PS = PS0
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
%-----------------------------------------------------------------------------%
|