%-----------------------------------------------------------------------------% % vim: ts=4 sw=4 et tw=0 wm=0 ff=unix ft=mercury %-----------------------------------------------------------------------------% % % lex.m % Copyright (C) 2001-2002 Ralph Becket % Sun Aug 20 09:08:46 BST 2000 % Copyright (C) 2001-2002 The Rationalizer Intelligent Software AG % The changes made by Rationalizer are contributed under the terms % of the GNU Lesser General Public License, see the file COPYING.LGPL % in this directory. % Copyright (C) 2002, 2006, 2010-2011 The University of Melbourne % Copyright (C) 2014, 2017-2023 The Mercury team. % This file is distributed under the terms specified in COPYING.LIB. % % This module puts everything together, compiling a list of lexemes into state % machines and turning the input stream into a token stream. % % Note that the astral characters (in Unicode) are not included in the range of % Unicode characters, as the astral planes are very sparsely assigned. % %-----------------------------------------------------------------------------% :- module lex. :- interface. :- import_module char. :- import_module io. :- import_module list. :- import_module pair. :- import_module string. :- import_module sparse_bitset. :- import_module enum. %-----------------------------------------------------------------------------% :- type token_creator(Token) == (func(string) = Token). :- inst token_creator == (func(in) = out is det). :- type lexeme(Token) == pair(regexp, token_creator(Token)). :- inst lexeme(Inst) for pair/2 ---> (ground - Inst). :- type lexer(Token, Source). :- type lexer_state(Token, Source). % Byte offset into the source data. :- type offset == int. % Any errors should be reported by raising an exception. % :- type read_result ---> ok(char) ; eof. % read_pred(Offset, Result, SrcIn, SrcOut) reads the char at % Offset from SrcIn and returns SrcOut. % :- type read_pred(T) == pred(offset, read_result, T, T). :- inst read_pred == (pred(in, out, di, uo) is det). % ignore_pred(Token): if it does not fail, Token must be ignored % :- type ignore_pred(Tok) == pred(Tok). :- inst ignore_pred == (pred(in) is semidet). % Represents a set of Unicode characters % :- type charset == sparse_bitset(char). % The type of regular expressions. % :- type regexp. % The typeclass for types having a natural converter to regexp's % :- typeclass regexp(T) where [ func re(T) = regexp ]. % Handling regexp's based on the typeclass regexp(T) % :- func null = regexp. :- func T1 ++ T2 = regexp <= (regexp(T1), regexp(T2)). :- func *(T) = regexp <= (regexp(T)). % One of the following two functions may be deprecated in future, % depending upon whether there's a concensus concerning % which is preferable. Both express alternation. % :- func T1 \/ T2 = regexp <= (regexp(T1), regexp(T2)). :- func (T1 or T2) = regexp <= (regexp(T1), regexp(T2)). % Some instances of typeclass regexp(T). % :- instance regexp(regexp). :- instance regexp(char). :- instance regexp(string). :- instance regexp(sparse_bitset(T)) <= (regexp(T),uenum(T)). % Some basic non-primitive regexps. % :- func any(string) = regexp. % any("abc") = ('a') or ('b') or ('c') :- func anybut(string) = regexp. % anybut("abc") is complement of any("abc") :- func ?(T) = regexp <= regexp(T). % ?(R) = R or null :- func +(T) = regexp <= regexp(T). % +(R) = R ++ *(R) :- func range(char, char) = regexp. % range('a', 'z') = any("ab...xyz") :- func (T * int) = regexp <= regexp(T). % R * N = R ++ ... ++ R % Some useful single-char regexps. % :- func digit = regexp. % digit = any("0123456789") :- func lower = regexp. % lower = any("abc...z") :- func upper = regexp. % upper = any("ABC...Z") :- func alpha = regexp. % alpha = lower or upper :- func alphanum = regexp. % alphanum = alpha or digit :- func identstart = regexp. % identstart = alpha or "_" :- func ident = regexp. % ident = alphanum or "_" :- func tab = regexp. % tab = re("\t") :- func spc = regexp. % spc = re(" ") :- func wspc = regexp. % wspc = any(" \t\n\r\f\v") :- func dot = regexp. % dot = anybut("\r\n") % Some useful compound regexps. % :- func nl = regexp. % nl = ?("\r") ++ re("\n") :- func nat = regexp. % nat = +(digit) :- func signed_int = regexp. % signed_int = ?("+" or "-") ++ nat :- func real = regexp. % real = \d+((.\d+([eE]int)?)|[eE]int) :- func identifier = regexp. % identifier = identstart ++ *(ident) :- func whitespace = regexp. % whitespace = *(wspc) :- func junk = regexp. % junk = *(dot) % A range of characters, inclusive of both the first and last values. % :- type char_range ---> char_range( cr_first :: int, cr_last :: int ). % charset(Start, End) = charset(Start `..` End) % % Throws an exception if Start > End. % :- func charset(int, int) = charset. % Function to create a sparse bitset from a range of Unicode codepoints. % These codepoints are checked for validity, any invalid codepoints % are ignored. Throws an exception if cr_first value is less than cr_last. % :- func charset(char_range) = charset. % Creates a union of all char ranges in the list. Returns the empty set % if the list is empty. Any invalid codepoints are ignored. % :- func charset_from_ranges(list(char_range)) = charset. % Latin is comprised of the following Unicode blocks: % * Basic Latin % * Latin1 Supplement % * Latin Extended-A % * Latin Extended-B % :- func latin_chars = charset is det. % Utility predicate to create ignore_pred's. % Use it in the form `ignore(my_token)' to ignore just `my_token'. % :- pred ignore(Token::in, Token::in) is semidet. % Utility function to return noval tokens. % Use it in the form `return(my_token) inside a lexeme definition. % :- func return(T, string) = T. % Utility operator to create lexemes. % :- func (T1 -> token_creator(Tok)) = pair(regexp, token_creator(Tok)) <= regexp(T1). % Construct a lexer from which we can generate running % instances. % % NOTE: If several lexemes match the same string only % the token generated by the one closest to the start % of the list of lexemes is returned. % :- func init(list(lexeme(Tok))::in, read_pred(Src)::in(read_pred)) = (lexer(Tok, Src)::out) is det. % Construct a lexer from which we can generate running % instances. If we construct a lexer with init/4, we % can additionally ignore specific tokens. % % NOTE: If several lexemes match the same string only % the token generated by the one closest to the start % of the list of lexemes is returned. % :- func init(list(lexeme(Tok))::in, read_pred(Src)::in(read_pred), ignore_pred(Tok)::in(ignore_pred)) = (lexer(Tok, Src)::out) is det. % Handy read predicates. % :- pred read_from_stdin(offset::in, read_result::out, io::di, io::uo) is det. :- pred read_from_string(offset::in, read_result::out, string::di, string::uo) is det. % Generate a running instance of a lexer on some input source. % If you want to lex strings, you must ensure they are unique % by calling either copy/1 or unsafe_promise_unique/1 on the % source string argument. % % Note that you can't get the input source back until you stop lexing. % :- func start(lexer(Tok, Src)::in, Src::di) = (lexer_state(Tok, Src)::uo) is det. % Read the next token from the input stream. % % CAVEAT: if the token returned happened to match the empty string, % then you must use read_char/3 (below) to consume the next char % in the input stream before calling read/3 again, since matching % the empty string does not consume any chars from the input stream % and will otherwise mean you simply get the same match ad infinitum. % % An alternative solution is to always include a "catch all" lexeme % that matches any unexpected char at the end of the list of lexemes. % :- pred read(io.read_result(Tok)::out, lexer_state(Tok, Src)::di, lexer_state(Tok, Src)::uo) is det. % Calling offset_from_start/3 immediately prior to calling read/3 % will give the offset in chars from the start of the input stream % for the result returned by the read/3 operation. % :- pred offset_from_start(offset::out, lexer_state(Tok, Src)::di, lexer_state(Tok, Src)::uo) is det. % Stop a running instance of a lexer and retrieve the input source. % :- func stop(lexer_state(_Tok, Src)::di) = (Src::uo) is det. % Sometimes (e.g. when lexing the io.io) you want access to the % input stream without interrupting the lexing process. % This pred provides that sort of access. % :- pred manipulate_source(pred(Src, Src)::in(pred(di, uo) is det), lexer_state(Tok, Src)::di, lexer_state(Tok, Src)::uo) is det. % This is occasionally useful. It reads the next char from the % input stream, without attempting to match it against a lexeme. % :- pred read_char(read_result::out, lexer_state(Tok, Src)::di, lexer_state(Tok, Src)::uo) is det. %-----------------------------------------------------------------------------% %-----------------------------------------------------------------------------% :- implementation. :- include_module lex.automata. :- include_module lex.buf. :- include_module lex.convert_NFA_to_DFA. :- include_module lex.lexeme. :- include_module lex.regexp. :- import_module array. :- import_module bool. :- import_module exception. :- import_module require. :- import_module int. :- import_module map. :- import_module maybe. :- import_module lex.automata. :- import_module lex.buf. :- import_module lex.convert_NFA_to_DFA. :- import_module lex.lexeme. :- import_module lex.regexp. %-----------------------------------------------------------------------------% :- type lexer(Token, Source) ---> lexer( lex_compiled_lexemes :: list(live_lexeme(Token)), lex_ignore_pred :: ignore_pred(Token), lex_buf_read_pred :: read_pred(Source) ). :- inst lexer for lexer/2 ---> lexer(ground, ignore_pred, read_pred). :- type lexer_instance(Token, Source) ---> lexer_instance( init_lexemes :: list(live_lexeme(Token)), init_winner_func :: init_winner_func(Token), live_lexemes :: list(live_lexeme(Token)), current_winner :: winner(Token), buf_state :: buf_state(Source), ignore_pred :: ignore_pred(Token) ). :- inst lexer_instance for lexer_instance/2 ---> lexer_instance( live_lexeme_list, init_winner_func, live_lexeme_list, winner, buf.buf_state, ignore_pred ). :- type live_lexeme(Token) == compiled_lexeme(Token). :- inst live_lexeme == compiled_lexeme. :- inst live_lexeme_list == list.list_skel(live_lexeme). :- type init_winner_func(Token) == ( func(offset) = winner(Token) ). :- inst init_winner_func == ( func(in) = out is det ). :- type winner(Token) == maybe(pair(token_creator(Token), offset)). :- inst winner for maybe/1 ---> yes(pair(token_creator, ground)) ; no. %-----------------------------------------------------------------------------% ignore(Tok, Tok). %-----------------------------------------------------------------------------% return(Token, _) = Token. %-----------------------------------------------------------------------------% (R1 -> TC) = (re(R1) - TC). %-----------------------------------------------------------------------------% init(Lexemes, BufReadPred) = Lexer :- DontIgnoreAnything = ( pred(_::in) is semidet :- semidet_fail ), Lexer = init(Lexemes, BufReadPred, DontIgnoreAnything). init(Lexemes, BufReadPred, IgnorePred) = Lexer :- CompiledLexemes = list.map(compile_lexeme, Lexemes), Lexer = lexer(CompiledLexemes, IgnorePred, BufReadPred). %-----------------------------------------------------------------------------% start(Lexer0, Src) = State :- Lexer = lexer_inst_cast(Lexer0), init_lexer_instance(Lexer, Instance, Buf), State = args_lexer_state(Instance, Buf, Src). :- func lexer_inst_cast(lexer(Tok, Src)::in) = (lexer(Tok, Src)::out(lexer)) is det. :- pragma foreign_proc("C", lexer_inst_cast(Lexer0::in) = (Lexer::out(lexer)), [will_not_call_mercury, promise_pure, thread_safe], " Lexer = Lexer0; "). :- pragma foreign_proc("Java", lexer_inst_cast(Lexer0::in) = (Lexer::out(lexer)), [will_not_call_mercury, promise_pure, thread_safe], " Lexer = Lexer0; "). :- pragma foreign_proc("C#", lexer_inst_cast(Lexer0::in) = (Lexer::out(lexer)), [will_not_call_mercury, promise_pure, thread_safe], " Lexer = Lexer0; "). %-----------------------------------------------------------------------------% :- pred init_lexer_instance(lexer(Tok, Src)::in(lexer), lexer_instance(Tok, Src)::out(lexer_instance), buf::array_uo) is det. init_lexer_instance(Lexer, Instance, Buf) :- buf.init(Lexer ^ lex_buf_read_pred, BufState, Buf), Start = BufState ^ start_offset, InitWinnerFunc = initial_winner_func(InitLexemes), InitLexemes = Lexer ^ lex_compiled_lexemes, InitWinner = InitWinnerFunc(Start), IgnorePred = Lexer ^ lex_ignore_pred, Instance = lexer_instance(InitLexemes, InitWinnerFunc, InitLexemes, InitWinner, BufState, IgnorePred). %-----------------------------------------------------------------------------% % Lexing may *start* with a candidate winner if one of the lexemes % accepts the empty string. We pick the first such, if any, since % that lexeme has priority. % :- func initial_winner_func(list(live_lexeme(Token))::in(live_lexeme_list)) = (init_winner_func(Token)::out(init_winner_func)) is det. initial_winner_func([]) = ( func(_) = no ). initial_winner_func([L | Ls]) = ( if in_accepting_state(L) then ( func(Offset) = yes(L ^ token - Offset) ) else initial_winner_func(Ls) ). %----------------------------------------------------------------------------% offset_from_start(Offset, !State) :- Offset = !.State ^ run ^ buf_state ^ buf_cursor, !:State = unsafe_promise_unique(!.State). %-----------------------------------------------------------------------------% stop(State) = Src :- lexer_state_args(State, _Instance, _Buf, Src). %-----------------------------------------------------------------------------% read(Result, State0, State) :- lexer_state_args(State0, Instance0, Buf0, Src0), BufState0 = Instance0 ^ buf_state, Start = BufState0 ^ start_offset, InitWinner = ( Instance0 ^ init_winner_func )(Start), Instance1 = ( Instance0 ^ current_winner := InitWinner ), read_2(Result, Instance1, Instance, Buf0, Buf, Src0, Src), State = args_lexer_state(Instance, Buf, Src). % Basically, just read chars from the buf and advance the live lexemes % until we have a winner or hit an error (no parse). % :- pred read_2(io.read_result(Tok)::out, lexer_instance(Tok, Src)::in(lexer_instance), lexer_instance(Tok, Src)::out(lexer_instance), buf::array_di, buf::array_uo, Src::di, Src::uo) is det. read_2(Result, !Instance, !Buf, !Src) :- BufState0 = !.Instance ^ buf_state, buf.read(BufReadResult, BufState0, BufState, !Buf, !Src), ( BufReadResult = ok(Char), process_char(Result, Char, !Instance, BufState, !Buf, !Src) ; BufReadResult = eof, process_eof(Result, !Instance, BufState, !.Buf) ). %-----------------------------------------------------------------------------% :- pred process_char(io.read_result(Tok)::out, char::in, lexer_instance(Tok, Src)::in(lexer_instance), lexer_instance(Tok, Src)::out(lexer_instance), buf_state(Src)::in(buf_state), buf::array_di, buf::array_uo, Src::di, Src::uo) is det. process_char(Result, Char, !Instance, BufState, !Buf, !Src) :- LiveLexemes0 = !.Instance ^ live_lexemes, Winner0 = !.Instance ^ current_winner, advance_live_lexemes(Char, BufState ^ cursor_offset, LiveLexemes0, LiveLexemes, Winner0, Winner), ( LiveLexemes = [], % Nothing left to consider. process_any_winner(Result, Winner, !Instance, BufState, !Buf, !Src) ; LiveLexemes = [_ | _], % Still some open possibilities. !:Instance = (((!.Instance ^ live_lexemes := LiveLexemes ) ^ current_winner := Winner ) ^ buf_state := BufState ), read_2(Result, !Instance, !Buf, !Src) ). %-----------------------------------------------------------------------------% :- pred process_any_winner(io.read_result(Tok)::out, winner(Tok)::in(winner), lexer_instance(Tok, Src)::in(lexer_instance), lexer_instance(Tok, Src)::out(lexer_instance), buf_state(Src)::in(buf_state), buf::array_di, buf::array_uo, Src::di, Src::uo) is det. process_any_winner(Result, yes(TokenCreator - Offset), Instance0, Instance, BufState0, Buf0, Buf, Src0, Src) :- BufState1 = rewind_cursor(Offset, BufState0), String = string_to_cursor(BufState1, Buf0), Token = TokenCreator(String), IgnorePred = Instance0 ^ ignore_pred, InitWinner = ( Instance0 ^ init_winner_func )(Offset), Instance1 = ((( Instance0 ^ live_lexemes := Instance0 ^ init_lexemes ) ^ current_winner := InitWinner ) ^ buf_state := commit(BufState1) ), ( if IgnorePred(Token) then % We have to be careful to avoid an infinite loop here. % If the longest match was the empty string, then the next char % in the input stream cannot start a match, so it must be reported % as an error. ( if String = "" then buf.read(BufResult, BufState1, BufState, Buf0, Buf, Src0, Src), ( BufResult = ok(_), Result = error("input not matched by any regexp", Offset) ; BufResult = eof, Result = eof ), Instance = ( Instance1 ^ buf_state := commit(BufState) ) else read_2(Result, Instance1, Instance, Buf0, Buf, Src0, Src) ) else Result = ok(Token), Instance = Instance1, Buf = Buf0, Src = Src0 ). process_any_winner(Result, no, !Instance, BufState0, !Buf, !Src) :- Start = BufState0 ^ start_offset, BufState = rewind_cursor(Start + 1, BufState0), Result = error("input not matched by any regexp", Start), InitWinner = ( !.Instance ^ init_winner_func )(Start), !:Instance = ((( !.Instance ^ live_lexemes := !.Instance ^ init_lexemes ) ^ current_winner := InitWinner ) ^ buf_state := commit(BufState) ). %-----------------------------------------------------------------------------% :- pred process_eof(io.read_result(Tok)::out, lexer_instance(Tok, Src)::in(lexer_instance), lexer_instance(Tok, Src)::out(lexer_instance), buf_state(Src)::in(buf_state), buf::array_ui) is det. process_eof(Result, !Instance, !.BufState, !.Buf) :- CurrentWinner = !.Instance ^ current_winner, ( CurrentWinner = no, Offset = !.BufState ^ cursor_offset, Result = eof ; CurrentWinner = yes(TokenCreator - Offset), String = string_to_cursor(!.BufState, !.Buf), Token = TokenCreator(String), IgnorePred = !.Instance ^ ignore_pred, Result = ( if IgnorePred(Token) then eof else ok(Token) ) ), InitWinner = ( !.Instance ^ init_winner_func )(Offset), !:Instance = ((( !.Instance ^ live_lexemes := !.Instance ^ init_lexemes ) ^ current_winner := InitWinner ) ^ buf_state := commit(!.BufState) ). %-----------------------------------------------------------------------------% % Note that in the case where two or more lexemes match the same % string, the win is given to the earliest such lexeme in the list. % This matches the behaviour of standard C lex. % :- pred advance_live_lexemes(char::in, offset::in, list(live_lexeme(Token))::in(live_lexeme_list), list(live_lexeme(Token))::out(live_lexeme_list), winner(Token)::in(winner), winner(Token)::out(winner)) is det. advance_live_lexemes(_Char, _Offset, [], [], !Winner). advance_live_lexemes(Char, Offset, [L | Ls0], Ls, !Winner) :- State0 = L ^ state, ( if next_state(L, State0, Char, State, IsAccepting) then ( IsAccepting = no ; IsAccepting = yes, ( if !.Winner = yes(_ - Offset) then true else !:Winner = yes(L ^ token - Offset) ) ), advance_live_lexemes(Char, Offset, Ls0, Ls1, !Winner), Ls = [(L ^ state := State) | Ls1] else advance_live_lexemes(Char, Offset, Ls0, Ls, !Winner) ). %-----------------------------------------------------------------------------% :- pred live_lexeme_in_accepting_state( list(live_lexeme(Tok))::in(live_lexeme_list), token_creator(Tok)::out(token_creator)) is semidet. live_lexeme_in_accepting_state([L | Ls], Token) :- ( if in_accepting_state(L) then Token = L ^ token else live_lexeme_in_accepting_state(Ls, Token) ). %-----------------------------------------------------------------------------% %-----------------------------------------------------------------------------% % It is much more convenient (especially for integration with parsers % such as moose) to package up the lexer_instance, buf and Src % in a single object. :- type lexer_state(Tok, Src) ---> lexer_state( run :: lexer_instance(Tok, Src), buf :: buf, src :: Src ). %-----------------------------------------------------------------------------% :- func args_lexer_state(lexer_instance(Tok, Src)::in(lexer_instance), buf::array_di, Src::di) = (lexer_state(Tok, Src)::uo) is det. args_lexer_state(Instance, Buf, Src) = LexerState :- unsafe_promise_unique(lexer_state(Instance, Buf, Src), LexerState). %-----------------------------------------------------------------------------% :- pred lexer_state_args(lexer_state(Tok, Src)::di, lexer_instance(Tok, Src)::out(lexer_instance), buf::array_uo, Src::uo) is det. lexer_state_args(lexer_state(Instance, Buf0, Src0), Instance, Buf, Src) :- unsafe_promise_unique(Buf0, Buf), unsafe_promise_unique(Src0, Src). %-----------------------------------------------------------------------------% manipulate_source(P, !State) :- lexer_state_args(!.State, Instance, Buf, Src0), P(Src0, Src), !:State = args_lexer_state(Instance, Buf, Src). %----------------------------------------------------------------------------% read_char(Result, !State) :- lexer_state_args(!.State, Instance0, Buf0, Src0), BufState0 = Instance0 ^ buf_state, buf.read(Result, BufState0, BufState, Buf0, Buf, Src0, Src), Instance = ( Instance0 ^ buf_state := commit(BufState) ), !:State = args_lexer_state(Instance, Buf, Src). %-----------------------------------------------------------------------------% read_from_stdin(_Offset, Result, !IO) :- io.read_char(IOResult, !IO), ( IOResult = ok(Char), Result = ok(Char) ; IOResult = eof, Result = eof ; IOResult = error(_E), throw(IOResult) ). %-----------------------------------------------------------------------------% % XXX This is bad for long strings! We should cache the string length % somewhere rather than recomputing it each time we read a char. % read_from_string(Offset, Result, String, unsafe_promise_unique(String)) :- ( if Offset < string.length(String) then Result = ok(string.unsafe_index(String, Offset)) else Result = eof ). %-----------------------------------------------------------------------------% % % The type of regular expressions. % :- type regexp ---> eps % The empty regexp ; atom(char) % Match a single char ; conc(regexp, regexp) % Concatenation ; alt(regexp, regexp) % Alternation ; star(regexp) % Kleene closure ; charset(charset). % Matches any char in the set %-----------------------------------------------------------------------------% :- instance regexp(regexp) where [ re(RE) = RE ]. :- instance regexp(char) where [ re(C) = atom(C) ]. :- instance regexp(string) where [ re(S) = R :- ( if S = "" then R = null else R = string.foldl(func(Char, R0) = R1 :- ( if R0 = eps then R1 = re(Char) else R1 = R0 ++ re(Char) ), S, eps) ) ]. :- instance regexp(sparse_bitset(T)) <= (regexp(T),uenum(T)) where [ re(SparseBitset) = charset(Charset) :- Charset = sparse_bitset.foldl( func(Enum, Set0) = insert(Set0, char.det_from_uint(to_uint(Enum))), SparseBitset, sparse_bitset.init) ]. %-----------------------------------------------------------------------------% % % Basic primitive regexps. % null = eps. R1 ++ R2 = conc(re(R1), re(R2)). R1 \/ R2 = alt(re(R1), re(R2)). (R1 or R2) = alt(re(R1), re(R2)). *(R1) = star(re(R1)). %-----------------------------------------------------------------------------% % % Some basic non-primitive regexps. % % int_is_valid_char(Int) = Char. % % True iff Int is Char and is in [0x0..0x10ffff] and not a surrogate % character. % :- func int_is_valid_char(int) = char is semidet. int_is_valid_char(Value) = Char :- char.from_int(Value, Char), not char.is_surrogate(Char). charset(Start, End) = build_charset(Start, End, sparse_bitset.init) :- expect(Start =< End, $file, $pred, "Start must be less than or equal to End"). charset(char_range(First, Last)) = charset(First, Last). :- func build_charset(int, int, charset) = charset. build_charset(First, Last, Charset0) = Charset :- ( if First =< Last then ( if int_is_valid_char(First) = Char then Charset1 = sparse_bitset.insert(Charset0, Char) else Charset1 = Charset0 ), Charset = build_charset(First + 1, Last, Charset1) else Charset = Charset0 ). charset_from_ranges(ListOfRanges) = union_list(map(charset, ListOfRanges)). latin_chars = charset_from_ranges([ char_range(0x40, 0x7d), char_range(0xc0, 0xff), char_range(0x100, 0x2ff) ]). :- func valid_unicode_chars = charset. valid_unicode_chars = charset(char_range(0x01, 0xffff)). any(S) = R :- ( if S = "" then R = null else R = re(sparse_bitset.list_to_set(string.to_char_list(S))) ). anybut(S) = R :- ExcludedChars = sparse_bitset.list_to_set(string.to_char_list(S)), R = re(sparse_bitset.difference(valid_unicode_chars, ExcludedChars)). ?(R) = (R or null). +(R) = (R ++ *(R)). range(Start, End) = re(charset(char.to_int(Start), char.to_int(End))). R * N = Result :- ( if N < 0 then unexpected($pred, "N must be a non-negative number") else if N = 0 then Result = null else if N = 1 then Result = re(R) else Result = conc(re(R), (R * (N - 1))) ). %-----------------------------------------------------------------------------% % % Some useful single-char regexps. % % We invite the compiler to memo the values of these constants that % (a) are likely to be quite common in practice and (b) take *some* % time to compute. % :- pragma memo(func(digit/0)). :- pragma memo(func(lower/0)). :- pragma memo(func(upper/0)). :- pragma memo(func(wspc/0)). :- pragma memo(func(dot/0)). digit = any("0123456789"). lower = any("abcdefghijklmnopqrstuvwxyz"). upper = any("ABCDEFGHIJKLMNOPQRSTUVWXYZ"). wspc = any(" \t\n\r\f\v"). dot = anybut("\r\n"). alpha = (lower or upper). alphanum = (alpha or digit). identstart = (alpha or ('_')). ident = (alphanum or ('_')). tab = re('\t'). spc = re(' '). %-----------------------------------------------------------------------------% % Some useful compound regexps. nl = (?('\r') ++ '\n'). % matches both Posix and Windows newline. nat = +(digit). signed_int = ?("+" or "-") ++ nat. real = signed_int ++ ( ("." ++ nat ++ ?(("e" or "E") ++ signed_int)) or ( ("e" or "E") ++ signed_int) ). identifier = (identstart ++ *(ident)). whitespace = *(wspc). junk = *(dot). %-----------------------------------------------------------------------------% %-----------------------------------------------------------------------------%