Files
mercury/extras/lex/lex.m
Julien Fischer 4b28829101 Use typed insts in extras/lex.
extras/lex/lex.automata.m:
extras/lex/lex.buf.m:
extras/lex/lex.lexeme.m:
extras/lex/lex.m:
    As above.
2023-02-14 17:04:26 +11:00

863 lines
29 KiB
Mercury

%-----------------------------------------------------------------------------%
% vim: ts=4 sw=4 et tw=0 wm=0 ff=unix ft=mercury
%-----------------------------------------------------------------------------%
%
% lex.m
% Copyright (C) 2001-2002 Ralph Becket <rbeck@microsoft.com>
% Sun Aug 20 09:08:46 BST 2000
% Copyright (C) 2001-2002 The Rationalizer Intelligent Software AG
% The changes made by Rationalizer are contributed under the terms
% of the GNU Lesser General Public License, see the file COPYING.LGPL
% in this directory.
% Copyright (C) 2002, 2006, 2010-2011 The University of Melbourne
% Copyright (C) 2014, 2017-2023 The Mercury team.
% This file is distributed under the terms specified in COPYING.LIB.
%
% This module puts everything together, compiling a list of lexemes into state
% machines and turning the input stream into a token stream.
%
% Note that the astral characters (those above U+FFFF) are not included in
% the set of characters matched by anybut/1 (and hence by dot), as the
% astral planes are very sparsely assigned.
%
%-----------------------------------------------------------------------------%
:- module lex.
:- interface.
:- import_module char.
:- import_module io.
:- import_module list.
:- import_module pair.
:- import_module string.
:- import_module sparse_bitset.
:- import_module enum.
%-----------------------------------------------------------------------------%
:- type token_creator(Token) == (func(string) = Token).
:- inst token_creator == (func(in) = out is det).
:- type lexeme(Token) == pair(regexp, token_creator(Token)).
:- inst lexeme(Inst) for pair/2
---> (ground - Inst).
:- type lexer(Token, Source).
:- type lexer_state(Token, Source).
% Byte offset into the source data.
:- type offset == int.
% Any errors should be reported by raising an exception.
%
:- type read_result
---> ok(char)
; eof.
% read_pred(Offset, Result, SrcIn, SrcOut) reads the char at
% Offset from SrcIn and returns SrcOut.
%
:- type read_pred(T) == pred(offset, read_result, T, T).
:- inst read_pred == (pred(in, out, di, uo) is det).
% ignore_pred(Token): if it does not fail, Token must be ignored
%
:- type ignore_pred(Tok) == pred(Tok).
:- inst ignore_pred == (pred(in) is semidet).
% Represents a set of Unicode characters
%
:- type charset == sparse_bitset(char).
% The type of regular expressions.
%
:- type regexp.
% The typeclass for types having a natural converter to regexp's
%
:- typeclass regexp(T) where [
func re(T) = regexp
].
% Handling regexp's based on the typeclass regexp(T)
%
:- func null = regexp.
:- func T1 ++ T2 = regexp <= (regexp(T1), regexp(T2)).
:- func *(T) = regexp <= (regexp(T)).
% One of the following two functions may be deprecated in future,
% depending upon whether there's a consensus concerning
% which is preferable. Both express alternation.
%
:- func T1 \/ T2 = regexp <= (regexp(T1), regexp(T2)).
:- func (T1 or T2) = regexp <= (regexp(T1), regexp(T2)).
% Some instances of typeclass regexp(T).
%
:- instance regexp(regexp).
:- instance regexp(char).
:- instance regexp(string).
:- instance regexp(sparse_bitset(T)) <= (regexp(T),uenum(T)).
% Some basic non-primitive regexps.
%
:- func any(string) = regexp. % any("abc") = ('a') or ('b') or ('c')
:- func anybut(string) = regexp. % anybut("abc") is complement of any("abc")
:- func ?(T) = regexp <= regexp(T). % ?(R) = R or null
:- func +(T) = regexp <= regexp(T). % +(R) = R ++ *(R)
:- func range(char, char) = regexp. % range('a', 'z') = any("ab...xyz")
:- func (T * int) = regexp <= regexp(T). % R * N = R ++ ... ++ R
% Some useful single-char regexps.
%
:- func digit = regexp. % digit = any("0123456789")
:- func lower = regexp. % lower = any("abc...z")
:- func upper = regexp. % upper = any("ABC...Z")
:- func alpha = regexp. % alpha = lower or upper
:- func alphanum = regexp. % alphanum = alpha or digit
:- func identstart = regexp. % identstart = alpha or "_"
:- func ident = regexp. % ident = alphanum or "_"
:- func tab = regexp. % tab = re("\t")
:- func spc = regexp. % spc = re(" ")
:- func wspc = regexp. % wspc = any(" \t\n\r\f\v")
:- func dot = regexp. % dot = anybut("\r\n")
% Some useful compound regexps.
%
:- func nl = regexp. % nl = ?("\r") ++ re("\n")
:- func nat = regexp. % nat = +(digit)
:- func signed_int = regexp. % signed_int = ?("+" or "-") ++ nat
:- func real = regexp. % real = \d+((.\d+([eE]int)?)|[eE]int)
:- func identifier = regexp. % identifier = identstart ++ *(ident)
:- func whitespace = regexp. % whitespace = *(wspc)
:- func junk = regexp. % junk = *(dot)
% A range of characters, inclusive of both the first and last values.
%
:- type char_range
---> char_range(
cr_first :: int,
cr_last :: int
).
% charset(Start, End) = charset(Start `..` End)
%
% Throws an exception if Start > End.
%
:- func charset(int, int) = charset.
% Function to create a sparse bitset from a range of Unicode codepoints.
% These codepoints are checked for validity; any invalid codepoints
% are ignored. Throws an exception if the cr_first value is greater than
% the cr_last value.
%
:- func charset(char_range) = charset.
% Creates a union of all char ranges in the list. Returns the empty set
% if the list is empty. Any invalid codepoints are ignored.
%
:- func charset_from_ranges(list(char_range)) = charset.
% Latin is comprised of the following Unicode blocks:
% * Basic Latin
% * Latin1 Supplement
% * Latin Extended-A
% * Latin Extended-B
%
:- func latin_chars = charset is det.
% Utility predicate to create ignore_pred's.
% Use it in the form `ignore(my_token)' to ignore just `my_token'.
%
:- pred ignore(Token::in, Token::in) is semidet.
% Utility function to return noval tokens.
% Use it in the form `return(my_token)' inside a lexeme definition.
%
:- func return(T, string) = T.
% Utility operator to create lexemes.
%
:- func (T1 -> token_creator(Tok)) = pair(regexp, token_creator(Tok))
<= regexp(T1).
% Construct a lexer from which we can generate running
% instances.
%
% NOTE: If several lexemes match the same string only
% the token generated by the one closest to the start
% of the list of lexemes is returned.
%
:- func init(list(lexeme(Tok))::in, read_pred(Src)::in(read_pred))
= (lexer(Tok, Src)::out) is det.
% Construct a lexer from which we can generate running
% instances. If we construct a lexer with init/4, we
% can additionally ignore specific tokens.
%
% NOTE: If several lexemes match the same string only
% the token generated by the one closest to the start
% of the list of lexemes is returned.
%
:- func init(list(lexeme(Tok))::in, read_pred(Src)::in(read_pred),
ignore_pred(Tok)::in(ignore_pred)) = (lexer(Tok, Src)::out) is det.
% Handy read predicates.
%
:- pred read_from_stdin(offset::in, read_result::out, io::di, io::uo) is det.
:- pred read_from_string(offset::in, read_result::out,
string::di, string::uo) is det.
% Generate a running instance of a lexer on some input source.
% If you want to lex strings, you must ensure they are unique
% by calling either copy/1 or unsafe_promise_unique/1 on the
% source string argument.
%
% Note that you can't get the input source back until you stop lexing.
%
:- func start(lexer(Tok, Src)::in, Src::di) = (lexer_state(Tok, Src)::uo)
is det.
% Read the next token from the input stream.
%
% CAVEAT: if the token returned happened to match the empty string,
% then you must use read_char/3 (below) to consume the next char
% in the input stream before calling read/3 again, since matching
% the empty string does not consume any chars from the input stream
% and will otherwise mean you simply get the same match ad infinitum.
%
% An alternative solution is to always include a "catch all" lexeme
% that matches any unexpected char at the end of the list of lexemes.
%
:- pred read(io.read_result(Tok)::out,
lexer_state(Tok, Src)::di, lexer_state(Tok, Src)::uo) is det.
% Calling offset_from_start/3 immediately prior to calling read/3
% will give the offset in chars from the start of the input stream
% for the result returned by the read/3 operation.
%
:- pred offset_from_start(offset::out,
lexer_state(Tok, Src)::di, lexer_state(Tok, Src)::uo) is det.
% Stop a running instance of a lexer and retrieve the input source.
%
:- func stop(lexer_state(_Tok, Src)::di) = (Src::uo) is det.
% Sometimes (e.g. when lexing the io.io) you want access to the
% input stream without interrupting the lexing process.
% This pred provides that sort of access.
%
:- pred manipulate_source(pred(Src, Src)::in(pred(di, uo) is det),
lexer_state(Tok, Src)::di, lexer_state(Tok, Src)::uo) is det.
% This is occasionally useful. It reads the next char from the
% input stream, without attempting to match it against a lexeme.
%
:- pred read_char(read_result::out,
lexer_state(Tok, Src)::di, lexer_state(Tok, Src)::uo) is det.
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
:- implementation.
:- include_module lex.automata.
:- include_module lex.buf.
:- include_module lex.convert_NFA_to_DFA.
:- include_module lex.lexeme.
:- include_module lex.regexp.
:- import_module array.
:- import_module bool.
:- import_module exception.
:- import_module require.
:- import_module int.
:- import_module map.
:- import_module maybe.
:- import_module lex.automata.
:- import_module lex.buf.
:- import_module lex.convert_NFA_to_DFA.
:- import_module lex.lexeme.
:- import_module lex.regexp.
%-----------------------------------------------------------------------------%
% The static part of a lexer, as constructed by init/2 or init/3.
%
:- type lexer(Token, Source)
    --->    lexer(
                % The lexemes, compiled into state-machine form.
                lex_compiled_lexemes    :: list(live_lexeme(Token)),
                % Tokens for which this predicate succeeds are not
                % reported to the caller of read/3.
                lex_ignore_pred         :: ignore_pred(Token),
                % Supplies the char at a given offset of the source.
                lex_buf_read_pred       :: read_pred(Source)
            ).

:- inst lexer for lexer/2
    --->    lexer(ground, ignore_pred, read_pred).

% The dynamic state of a running lexer.
%
:- type lexer_instance(Token, Source)
    --->    lexer_instance(
                % All compiled lexemes; used to restart matching after
                % each token is produced.
                init_lexemes        :: list(live_lexeme(Token)),
                % Computes the initial winner (if any lexeme accepts the
                % empty string) for a given start offset.
                init_winner_func    :: init_winner_func(Token),
                % The lexemes still in the running for the current match.
                live_lexemes        :: list(live_lexeme(Token)),
                % The best match found so far for the current token.
                current_winner      :: winner(Token),
                % The state of the input buffer.
                buf_state           :: buf_state(Source),
                % Tokens matched by this predicate are skipped by read/3.
                ignore_pred         :: ignore_pred(Token)
            ).

:- inst lexer_instance for lexer_instance/2
    --->    lexer_instance(
                live_lexeme_list,
                init_winner_func,
                live_lexeme_list,
                winner,
                buf.buf_state,
                ignore_pred
            ).

:- type live_lexeme(Token) == compiled_lexeme(Token).
:- inst live_lexeme == compiled_lexeme.
:- inst live_lexeme_list == list.list_skel(live_lexeme).

% Maps a start offset to the initial winner: yes(_) if some lexeme
% accepts the empty string, no otherwise.
%
:- type init_winner_func(Token)
    ==   ( func(offset) = winner(Token) ).
:- inst init_winner_func
    ==   ( func(in) = out is det ).

% The token creator of the best match so far, paired with the offset
% just past the end of the matched string; no if nothing has matched.
%
:- type winner(Token) == maybe(pair(token_creator(Token), offset)).
:- inst winner for maybe/1
    --->    yes(pair(token_creator, ground))
    ;       no.
%-----------------------------------------------------------------------------%
% ignore(IgnoredToken, Token) succeeds iff Token = IgnoredToken.
% Partially apply it, as in `ignore(my_token)', to obtain an ignore_pred.
%
ignore(Tok, Tok).

%-----------------------------------------------------------------------------%

% return(Token, _MatchedString) = Token: a token_creator that discards
% the matched string.
%
return(Token, _) = Token.

%-----------------------------------------------------------------------------%

% Pair the regexp form of R1 with its token creator, forming a lexeme.
%
(R1 -> TC) = (re(R1) - TC).
%-----------------------------------------------------------------------------%
init(Lexemes, BufReadPred) = Lexer :-
    % With no ignore predicate supplied, ignore nothing: the predicate
    % below fails for every token.
    DontIgnoreAnything =
        ( pred(_::in) is semidet :-
            semidet_fail
        ),
    Lexer = init(Lexemes, BufReadPred, DontIgnoreAnything).

init(Lexemes, BufReadPred, IgnorePred) = Lexer :-
    % Compile each lexeme's regexp into a state machine up front,
    % so that running the lexer does no regexp processing.
    CompiledLexemes = list.map(compile_lexeme, Lexemes),
    Lexer = lexer(CompiledLexemes, IgnorePred, BufReadPred).
%-----------------------------------------------------------------------------%
start(Lexer0, Src) = State :-
    % Re-establish the richer `lexer' inst before building the instance.
    Lexer = lexer_inst_cast(Lexer0),
    init_lexer_instance(Lexer, Instance, Buf),
    State = args_lexer_state(Instance, Buf, Src).

% Casts a ground lexer to one with the more informative inst `lexer'.
% Each foreign body is a plain identity assignment: the cast exists only
% to satisfy the mode system, and performs no computation.
%
:- func lexer_inst_cast(lexer(Tok, Src)::in) = (lexer(Tok, Src)::out(lexer))
    is det.

:- pragma foreign_proc("C",
    lexer_inst_cast(Lexer0::in) = (Lexer::out(lexer)),
    [will_not_call_mercury, promise_pure, thread_safe],
"
    Lexer = Lexer0;
").

:- pragma foreign_proc("Java",
    lexer_inst_cast(Lexer0::in) = (Lexer::out(lexer)),
    [will_not_call_mercury, promise_pure, thread_safe],
"
    Lexer = Lexer0;
").

:- pragma foreign_proc("C#",
    lexer_inst_cast(Lexer0::in) = (Lexer::out(lexer)),
    [will_not_call_mercury, promise_pure, thread_safe],
"
    Lexer = Lexer0;
").
%-----------------------------------------------------------------------------%
% Sets up a fresh instance and buffer from a lexer: all lexemes are
% live, and the initial winner is whichever lexeme (if any) accepts
% the empty string at the start offset.
%
:- pred init_lexer_instance(lexer(Tok, Src)::in(lexer),
    lexer_instance(Tok, Src)::out(lexer_instance), buf::array_uo) is det.

init_lexer_instance(Lexer, Instance, Buf) :-
    buf.init(Lexer ^ lex_buf_read_pred, BufState, Buf),
    Start = BufState ^ start_offset,
    InitWinnerFunc = initial_winner_func(InitLexemes),
    InitLexemes = Lexer ^ lex_compiled_lexemes,
    InitWinner = InitWinnerFunc(Start),
    IgnorePred = Lexer ^ lex_ignore_pred,
    Instance = lexer_instance(InitLexemes, InitWinnerFunc, InitLexemes,
        InitWinner, BufState, IgnorePred).
%-----------------------------------------------------------------------------%
% Lexing may *start* with a candidate winner if one of the lexemes
% accepts the empty string. We pick the first such, if any, since
% that lexeme has priority.
%
:- func initial_winner_func(list(live_lexeme(Token))::in(live_lexeme_list))
    = (init_winner_func(Token)::out(init_winner_func)) is det.

initial_winner_func([]) =
    % No lexeme accepts the empty string: no initial winner.
    ( func(_) = no ).
initial_winner_func([L | Ls]) =
    ( if in_accepting_state(L) then
        % L accepts the empty string, so it wins at any start offset.
        ( func(Offset) = yes(L ^ token - Offset) )
    else
        initial_winner_func(Ls)
    ).
%----------------------------------------------------------------------------%
offset_from_start(Offset, !State) :-
    % Read the cursor offset out of the buffer state. The state itself
    % is unchanged, but its uniqueness must be re-promised after the
    % inspection.
    Offset = !.State ^ run ^ buf_state ^ buf_cursor,
    !:State = unsafe_promise_unique(!.State).

%-----------------------------------------------------------------------------%

stop(State) = Src :-
    % Deconstructing the state yields the source; the instance and the
    % buffer are discarded.
    lexer_state_args(State, _Instance, _Buf, Src).
%-----------------------------------------------------------------------------%
read(Result, State0, State) :-
    lexer_state_args(State0, Instance0, Buf0, Src0),
    BufState0 = Instance0 ^ buf_state,
    Start = BufState0 ^ start_offset,
    % Reset the candidate winner to whatever (if anything) matches the
    % empty string at the current start offset, before consuming chars.
    InitWinner = ( Instance0 ^ init_winner_func )(Start),
    Instance1 = ( Instance0 ^ current_winner := InitWinner ),
    read_2(Result, Instance1, Instance, Buf0, Buf, Src0, Src),
    State = args_lexer_state(Instance, Buf, Src).
% Basically, just read chars from the buf and advance the live lexemes
% until we have a winner or hit an error (no parse).
%
:- pred read_2(io.read_result(Tok)::out,
    lexer_instance(Tok, Src)::in(lexer_instance),
    lexer_instance(Tok, Src)::out(lexer_instance),
    buf::array_di, buf::array_uo, Src::di, Src::uo) is det.

read_2(Result, !Instance, !Buf, !Src) :-
    BufState0 = !.Instance ^ buf_state,
    buf.read(BufReadResult, BufState0, BufState, !Buf, !Src),
    (
        BufReadResult = ok(Char),
        process_char(Result, Char, !Instance, BufState, !Buf, !Src)
    ;
        BufReadResult = eof,
        % End of input: report the current winner, if any.
        process_eof(Result, !Instance, BufState, !.Buf)
    ).
%-----------------------------------------------------------------------------%
% Advances all live lexemes over Char. If none remain live, report the
% best match found so far (or an error); otherwise record the new
% lexeme states and keep reading.
%
:- pred process_char(io.read_result(Tok)::out, char::in,
    lexer_instance(Tok, Src)::in(lexer_instance),
    lexer_instance(Tok, Src)::out(lexer_instance),
    buf_state(Src)::in(buf_state),
    buf::array_di, buf::array_uo, Src::di, Src::uo) is det.

process_char(Result, Char, !Instance, BufState, !Buf, !Src) :-
    LiveLexemes0 = !.Instance ^ live_lexemes,
    Winner0 = !.Instance ^ current_winner,
    advance_live_lexemes(Char, BufState ^ cursor_offset,
        LiveLexemes0, LiveLexemes, Winner0, Winner),
    (
        LiveLexemes = [],
        % Nothing left to consider.
        process_any_winner(Result, Winner, !Instance, BufState, !Buf, !Src)
    ;
        LiveLexemes = [_ | _],
        % Still some open possibilities.
        !:Instance = (((!.Instance
                ^ live_lexemes := LiveLexemes )
                ^ current_winner := Winner )
                ^ buf_state := BufState ),
        read_2(Result, !Instance, !Buf, !Src)
    ).
%-----------------------------------------------------------------------------%
% Called when no lexemes remain live. If we have a winner, rewind the
% cursor to the end of its match, build the token, and either return it
% or (if it is ignored) resume matching after the ignored match. With
% no winner, the input does not match any lexeme: report an error and
% skip one char so that lexing can resume.
%
:- pred process_any_winner(io.read_result(Tok)::out, winner(Tok)::in(winner),
    lexer_instance(Tok, Src)::in(lexer_instance),
    lexer_instance(Tok, Src)::out(lexer_instance),
    buf_state(Src)::in(buf_state),
    buf::array_di, buf::array_uo, Src::di, Src::uo) is det.

process_any_winner(Result, yes(TokenCreator - Offset), Instance0, Instance,
        BufState0, Buf0, Buf, Src0, Src) :-
    BufState1 = rewind_cursor(Offset, BufState0),
    String = string_to_cursor(BufState1, Buf0),
    Token = TokenCreator(String),
    IgnorePred = Instance0 ^ ignore_pred,
    InitWinner = ( Instance0 ^ init_winner_func )(Offset),
    % Reset the instance ready to match the next token.
    Instance1 = ((( Instance0
            ^ live_lexemes := Instance0 ^ init_lexemes )
            ^ current_winner := InitWinner )
            ^ buf_state := commit(BufState1) ),
    ( if IgnorePred(Token) then
        % We have to be careful to avoid an infinite loop here.
        % If the longest match was the empty string, then the next char
        % in the input stream cannot start a match, so it must be reported
        % as an error.
        ( if String = "" then
            buf.read(BufResult, BufState1, BufState, Buf0, Buf, Src0, Src),
            (
                BufResult = ok(_),
                Result = error("input not matched by any regexp", Offset)
            ;
                BufResult = eof,
                Result = eof
            ),
            Instance = ( Instance1 ^ buf_state := commit(BufState) )
        else
            % The ignored match consumed some input; go back around for
            % the next token.
            read_2(Result, Instance1, Instance, Buf0, Buf, Src0, Src)
        )
    else
        Result = ok(Token),
        Instance = Instance1,
        Buf = Buf0,
        Src = Src0
    ).

process_any_winner(Result, no, !Instance, BufState0, !Buf, !Src) :-
    Start = BufState0 ^ start_offset,
    % Move one char past the offending input so lexing can continue.
    BufState = rewind_cursor(Start + 1, BufState0),
    Result = error("input not matched by any regexp", Start),
    InitWinner = ( !.Instance ^ init_winner_func )(Start),
    !:Instance = ((( !.Instance
            ^ live_lexemes := !.Instance ^ init_lexemes )
            ^ current_winner := InitWinner )
            ^ buf_state := commit(BufState) ).
%-----------------------------------------------------------------------------%
% Called at end of input. If there is a current winner and its token is
% not ignored, return it; otherwise report eof. The instance is reset
% ready for the next call to read/3.
%
:- pred process_eof(io.read_result(Tok)::out,
    lexer_instance(Tok, Src)::in(lexer_instance),
    lexer_instance(Tok, Src)::out(lexer_instance),
    buf_state(Src)::in(buf_state), buf::array_ui) is det.

process_eof(Result, !Instance, !.BufState, !.Buf) :-
    CurrentWinner = !.Instance ^ current_winner,
    (
        CurrentWinner = no,
        Offset = !.BufState ^ cursor_offset,
        Result = eof
    ;
        CurrentWinner = yes(TokenCreator - Offset),
        String = string_to_cursor(!.BufState, !.Buf),
        Token = TokenCreator(String),
        IgnorePred = !.Instance ^ ignore_pred,
        % An ignored final token is reported as plain eof.
        Result = ( if IgnorePred(Token) then eof else ok(Token) )
    ),
    InitWinner = ( !.Instance ^ init_winner_func )(Offset),
    !:Instance = ((( !.Instance
            ^ live_lexemes := !.Instance ^ init_lexemes )
            ^ current_winner := InitWinner )
            ^ buf_state := commit(!.BufState) ).
%-----------------------------------------------------------------------------%
% Note that in the case where two or more lexemes match the same
% string, the win is given to the earliest such lexeme in the list.
% This matches the behaviour of standard C lex.
%
:- pred advance_live_lexemes(char::in, offset::in,
    list(live_lexeme(Token))::in(live_lexeme_list),
    list(live_lexeme(Token))::out(live_lexeme_list),
    winner(Token)::in(winner), winner(Token)::out(winner)) is det.

advance_live_lexemes(_Char, _Offset, [], [], !Winner).
advance_live_lexemes(Char, Offset, [L | Ls0], Ls, !Winner) :-
    State0 = L ^ state,
    ( if next_state(L, State0, Char, State, IsAccepting) then
        (
            IsAccepting = no
        ;
            IsAccepting = yes,
            % Only record a new winner if there is not already one at
            % this same offset: an earlier lexeme in the list takes
            % priority at equal length, while a winner recorded at an
            % earlier offset (a shorter match) is superseded.
            ( if !.Winner = yes(_ - Offset) then
                true
            else
                !:Winner = yes(L ^ token - Offset)
            )
        ),
        advance_live_lexemes(Char, Offset, Ls0, Ls1, !Winner),
        Ls = [(L ^ state := State) | Ls1]
    else
        % This lexeme has no transition on Char: it cannot match any
        % extension of the current string, so drop it.
        advance_live_lexemes(Char, Offset, Ls0, Ls, !Winner)
    ).
%-----------------------------------------------------------------------------%
% Succeeds with the token creator of the first lexeme in the list that
% is in an accepting state; fails if there is none (including for the
% empty list, which has no clause).
%
:- pred live_lexeme_in_accepting_state(
    list(live_lexeme(Tok))::in(live_lexeme_list),
    token_creator(Tok)::out(token_creator)) is semidet.

live_lexeme_in_accepting_state([L | Ls], Token) :-
    ( if in_accepting_state(L) then
        Token = L ^ token
    else
        live_lexeme_in_accepting_state(Ls, Token)
    ).
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
% It is much more convenient (especially for integration with parsers
% such as moose) to package up the lexer_instance, buf and Src
% in a single object.
:- type lexer_state(Tok, Src)
    --->    lexer_state(
                % The running lexer instance.
                run     :: lexer_instance(Tok, Src),
                % The input buffer.
                buf     :: buf,
                % The input source being lexed.
                src     :: Src
            ).

%-----------------------------------------------------------------------------%

% Packages up an instance, buffer and source as a unique lexer_state.
%
:- func args_lexer_state(lexer_instance(Tok, Src)::in(lexer_instance),
    buf::array_di, Src::di) = (lexer_state(Tok, Src)::uo) is det.

args_lexer_state(Instance, Buf, Src) = LexerState :-
    unsafe_promise_unique(lexer_state(Instance, Buf, Src), LexerState).
%-----------------------------------------------------------------------------%
% Deconstructs a lexer_state into its components, re-promising the
% uniqueness of the buffer and of the source.
%
:- pred lexer_state_args(lexer_state(Tok, Src)::di,
    lexer_instance(Tok, Src)::out(lexer_instance),
    buf::array_uo, Src::uo) is det.

lexer_state_args(lexer_state(Instance, Buf0, Src0), Instance, Buf, Src) :-
    unsafe_promise_unique(Buf0, Buf),
    unsafe_promise_unique(Src0, Src).

%-----------------------------------------------------------------------------%

manipulate_source(P, !State) :-
    % Unpack the state, apply P to the source alone, and repack.
    lexer_state_args(!.State, Instance, Buf, Src0),
    P(Src0, Src),
    !:State = args_lexer_state(Instance, Buf, Src).
%----------------------------------------------------------------------------%
read_char(Result, !State) :-
    % Read one char directly from the buffer, bypassing the lexemes,
    % and commit the resulting buffer state.
    lexer_state_args(!.State, Instance0, Buf0, Src0),
    BufState0 = Instance0 ^ buf_state,
    buf.read(Result, BufState0, BufState, Buf0, Buf, Src0, Src),
    Instance = ( Instance0 ^ buf_state := commit(BufState) ),
    !:State = args_lexer_state(Instance, Buf, Src).
%-----------------------------------------------------------------------------%
read_from_stdin(_Offset, Result, !IO) :-
    % The offset is ignored: stdin can only be read sequentially.
    io.read_char(IOResult, !IO),
    (
        IOResult = ok(Char),
        Result = ok(Char)
    ;
        IOResult = eof,
        Result = eof
    ;
        IOResult = error(_E),
        % Errors are reported by raising an exception, as documented in
        % the interface comment on read_pred.
        throw(IOResult)
    ).
%-----------------------------------------------------------------------------%
% XXX This is bad for long strings! We should cache the string length
% somewhere rather than recomputing it each time we read a char.
%
read_from_string(Offset, Result, String, unsafe_promise_unique(String)) :-
    % The output source is the input string itself, with its uniqueness
    % re-promised in the clause head.
    ( if Offset < string.length(String) then
        Result = ok(string.unsafe_index(String, Offset))
    else
        Result = eof
    ).
%-----------------------------------------------------------------------------%
%
% The type of regular expressions.
%

% The abstract syntax of regexps, built by the constructor functions
% declared in the interface.
%
:- type regexp
    --->    eps                     % The empty regexp
    ;       atom(char)              % Match a single char
    ;       conc(regexp, regexp)    % Concatenation
    ;       alt(regexp, regexp)     % Alternation
    ;       star(regexp)            % Kleene closure
    ;       charset(charset).       % Matches any char in the set

%-----------------------------------------------------------------------------%

:- instance regexp(regexp) where [
    re(RE) = RE
].

:- instance regexp(char) where [
    re(C) = atom(C)
].

    % A string is the concatenation of its chars; the empty string is
    % the empty regexp.
:- instance regexp(string) where [
    re(S) = R :-
        ( if S = "" then
            R = null
        else
            R = string.foldl(func(Char, R0) = R1 :-
                ( if R0 = eps then R1 = re(Char) else R1 = R0 ++ re(Char) ),
                S,
                eps)
        )
].

    % A sparse bitset of enumerable values becomes a charset regexp,
    % converting each element to a char via its uenum instance.
:- instance regexp(sparse_bitset(T)) <= (regexp(T),uenum(T)) where [
    re(SparseBitset) = charset(Charset) :-
        Charset = sparse_bitset.foldl(
            func(Enum, Set0) = insert(Set0, char.det_from_uint(to_uint(Enum))),
            SparseBitset,
            sparse_bitset.init)
].
%-----------------------------------------------------------------------------%
%
% Basic primitive regexps.
%
% These map directly onto the regexp constructors.
%

null = eps.
R1 ++ R2 = conc(re(R1), re(R2)).
R1 \/ R2 = alt(re(R1), re(R2)).
(R1 or R2) = alt(re(R1), re(R2)).
*(R1) = star(re(R1)).
%-----------------------------------------------------------------------------%
%
% Some basic non-primitive regexps.
%

% int_is_valid_char(Int) = Char.
%
% True iff Int is Char and is in [0x0..0x10ffff] and not a surrogate
% character.
%
:- func int_is_valid_char(int) = char is semidet.

int_is_valid_char(Value) = Char :-
    char.from_int(Value, Char),
    not char.is_surrogate(Char).

% Builds the set of valid chars with codepoints in [Start, End].
% Throws an exception (via expect/4) if Start > End.
%
charset(Start, End) = build_charset(Start, End, sparse_bitset.init) :-
    expect(Start =< End, $file, $pred,
        "Start must be less than or equal to End").

charset(char_range(First, Last)) = charset(First, Last).

% Accumulates the chars for codepoints First..Last into Charset0,
% silently skipping invalid codepoints (including surrogates).
%
:- func build_charset(int, int, charset) = charset.

build_charset(First, Last, Charset0) = Charset :-
    ( if First =< Last then
        ( if int_is_valid_char(First) = Char then
            Charset1 = sparse_bitset.insert(Charset0, Char)
        else
            Charset1 = Charset0
        ),
        Charset = build_charset(First + 1, Last, Charset1)
    else
        Charset = Charset0
    ).
% The union of the charsets of all the given ranges; empty for the
% empty list (union_list of no sets).
%
charset_from_ranges(ListOfRanges) =
    union_list(map(charset, ListOfRanges)).

% NOTE(review): 0x40-0x7d covers only '@' .. '}', which is not the whole
% Basic Latin block claimed in the interface comment (it omits digits,
% most punctuation and '~') — confirm whether the intended range was the
% full block or just the letter region.
latin_chars =
    charset_from_ranges([
        char_range(0x40, 0x7d),
        char_range(0xc0, 0xff),
        char_range(0x100, 0x2ff)
    ]).

% All valid chars with codepoints 0x01-0xffff (surrogates are excluded
% by charset/1); astral chars are deliberately omitted. This is the
% universe of chars used by anybut/1.
%
:- func valid_unicode_chars = charset.

valid_unicode_chars = charset(char_range(0x01, 0xffff)).
any(S) = R :-
    % The set of chars occurring in S; the empty string yields null.
    ( if S = "" then
        R = null
    else
        R = re(sparse_bitset.list_to_set(string.to_char_list(S)))
    ).

anybut(S) = R :-
    % Everything in valid_unicode_chars except the chars of S.
    ExcludedChars = sparse_bitset.list_to_set(string.to_char_list(S)),
    R = re(sparse_bitset.difference(valid_unicode_chars, ExcludedChars)).

?(R) = (R or null).

+(R) = (R ++ *(R)).

range(Start, End) = re(charset(char.to_int(Start), char.to_int(End))).

% R * N is the concatenation of N copies of R; R * 0 is the empty
% regexp, and a negative N is an error.
%
R * N = Result :-
    ( if N < 0 then
        unexpected($pred, "N must be a non-negative number")
    else if N = 0 then
        Result = null
    else if N = 1 then
        Result = re(R)
    else
        Result = conc(re(R), (R * (N - 1)))
    ).
%-----------------------------------------------------------------------------%
%
% Some useful single-char regexps.
%
% We invite the compiler to memo the values of these constants that
% (a) are likely to be quite common in practice and (b) take *some*
% time to compute.
%
:- pragma memo(func(digit/0)).
:- pragma memo(func(lower/0)).
:- pragma memo(func(upper/0)).
:- pragma memo(func(wspc/0)).
:- pragma memo(func(dot/0)).

digit = any("0123456789").
lower = any("abcdefghijklmnopqrstuvwxyz").
upper = any("ABCDEFGHIJKLMNOPQRSTUVWXYZ").
wspc = any(" \t\n\r\f\v").
dot = anybut("\r\n").
alpha = (lower or upper).
alphanum = (alpha or digit).
identstart = (alpha or ('_')).
ident = (alphanum or ('_')).
tab = re('\t').
spc = re(' ').
%-----------------------------------------------------------------------------%
% Some useful compound regexps.

nl = (?('\r') ++ '\n').     % Matches both Posix and Windows newline.
nat = +(digit).
signed_int = ?("+" or "-") ++ nat.
% real = \d+((.\d+([eE]int)?)|[eE]int): at least one digit is required
% before the decimal point, and either a fraction or an exponent after
% the integer part.
real = signed_int ++ (
    ("." ++ nat ++ ?(("e" or "E") ++ signed_int)) or
    ( ("e" or "E") ++ signed_int)
).
identifier = (identstart ++ *(ident)).
whitespace = *(wspc).
junk = *(dot).
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%