mirror of
https://github.com/Mercury-Language/mercury.git
synced 2026-04-19 03:13:40 +00:00
extras/lex/lex.automata.m:
extras/lex/lex.buf.m:
extras/lex/lex.lexeme.m:
extras/lex/lex.m:
As above.
863 lines
29 KiB
Mathematica
863 lines
29 KiB
Mathematica
%-----------------------------------------------------------------------------%
|
|
% vim: ts=4 sw=4 et tw=0 wm=0 ff=unix ft=mercury
|
|
%-----------------------------------------------------------------------------%
|
|
%
|
|
% lex.m
|
|
% Copyright (C) 2001-2002 Ralph Becket <rbeck@microsoft.com>
|
|
% Sun Aug 20 09:08:46 BST 2000
|
|
% Copyright (C) 2001-2002 The Rationalizer Intelligent Software AG
|
|
% The changes made by Rationalizer are contributed under the terms
|
|
% of the GNU Lesser General Public License, see the file COPYING.LGPL
|
|
% in this directory.
|
|
% Copyright (C) 2002, 2006, 2010-2011 The University of Melbourne
|
|
% Copyright (C) 2014, 2017-2023 The Mercury team.
|
|
% This file is distributed under the terms specified in COPYING.LIB.
|
|
%
|
|
% This module puts everything together, compiling a list of lexemes into state
|
|
% machines and turning the input stream into a token stream.
|
|
%
|
|
% Note that the astral characters (in Unicode) are not included in the range of
|
|
% Unicode characters, as the astral planes are very sparsely assigned.
|
|
%
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
:- module lex.
|
|
:- interface.
|
|
|
|
:- import_module char.
|
|
:- import_module io.
|
|
:- import_module list.
|
|
:- import_module pair.
|
|
:- import_module string.
|
|
:- import_module sparse_bitset.
|
|
:- import_module enum.
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
:- type token_creator(Token) == (func(string) = Token).
|
|
:- inst token_creator == (func(in) = out is det).
|
|
|
|
:- type lexeme(Token) == pair(regexp, token_creator(Token)).
|
|
|
|
:- inst lexeme(Inst) for pair/2
|
|
---> (ground - Inst).
|
|
|
|
:- type lexer(Token, Source).
|
|
|
|
:- type lexer_state(Token, Source).
|
|
|
|
% Byte offset into the source data.
|
|
:- type offset == int.
|
|
|
|
% Any errors should be reported by raising an exception.
|
|
%
|
|
:- type read_result
|
|
---> ok(char)
|
|
; eof.
|
|
|
|
% read_pred(Offset, Result, SrcIn, SrcOut) reads the char at
|
|
% Offset from SrcIn and returns SrcOut.
|
|
%
|
|
:- type read_pred(T) == pred(offset, read_result, T, T).
|
|
:- inst read_pred == (pred(in, out, di, uo) is det).
|
|
|
|
% ignore_pred(Token): if it does not fail, Token must be ignored
|
|
%
|
|
:- type ignore_pred(Tok) == pred(Tok).
|
|
:- inst ignore_pred == (pred(in) is semidet).
|
|
|
|
% Represents a set of Unicode characters
|
|
%
|
|
:- type charset == sparse_bitset(char).
|
|
|
|
% The type of regular expressions.
|
|
%
|
|
:- type regexp.
|
|
|
|
% The typeclass for types having a natural converter to regexp's
|
|
%
|
|
:- typeclass regexp(T) where [
|
|
func re(T) = regexp
|
|
].
|
|
|
|
% Handling regexp's based on the typeclass regexp(T)
|
|
%
|
|
:- func null = regexp.
|
|
:- func T1 ++ T2 = regexp <= (regexp(T1), regexp(T2)).
|
|
:- func *(T) = regexp <= (regexp(T)).
|
|
|
|
% One of the following two functions may be deprecated in future,
|
|
% depending upon whether there's a consensus concerning
|
|
% which is preferable. Both express alternation.
|
|
%
|
|
:- func T1 \/ T2 = regexp <= (regexp(T1), regexp(T2)).
|
|
:- func (T1 or T2) = regexp <= (regexp(T1), regexp(T2)).
|
|
|
|
% Some instances of typeclass regexp(T).
|
|
%
|
|
:- instance regexp(regexp).
|
|
:- instance regexp(char).
|
|
:- instance regexp(string).
|
|
:- instance regexp(sparse_bitset(T)) <= (regexp(T),uenum(T)).
|
|
|
|
% Some basic non-primitive regexps.
|
|
%
|
|
:- func any(string) = regexp. % any("abc") = ('a') or ('b') or ('c')
|
|
:- func anybut(string) = regexp. % anybut("abc") is complement of any("abc")
|
|
:- func ?(T) = regexp <= regexp(T). % ?(R) = R or null
|
|
:- func +(T) = regexp <= regexp(T). % +(R) = R ++ *(R)
|
|
:- func range(char, char) = regexp. % range('a', 'z') = any("ab...xyz")
|
|
:- func (T * int) = regexp <= regexp(T). % R * N = R ++ ... ++ R
|
|
|
|
% Some useful single-char regexps.
|
|
%
|
|
:- func digit = regexp. % digit = any("0123456789")
|
|
:- func lower = regexp. % lower = any("abc...z")
|
|
:- func upper = regexp. % upper = any("ABC...Z")
|
|
:- func alpha = regexp. % alpha = lower or upper
|
|
:- func alphanum = regexp. % alphanum = alpha or digit
|
|
:- func identstart = regexp. % identstart = alpha or "_"
|
|
:- func ident = regexp. % ident = alphanum or "_"
|
|
:- func tab = regexp. % tab = re("\t")
|
|
:- func spc = regexp. % spc = re(" ")
|
|
:- func wspc = regexp. % wspc = any(" \t\n\r\f\v")
|
|
:- func dot = regexp. % dot = anybut("\r\n")
|
|
|
|
% Some useful compound regexps.
|
|
%
|
|
:- func nl = regexp. % nl = ?("\r") ++ re("\n")
|
|
:- func nat = regexp. % nat = +(digit)
|
|
:- func signed_int = regexp. % signed_int = ?("+" or "-") ++ nat
|
|
:- func real = regexp. % real = \d+((.\d+([eE]int)?)|[eE]int)
|
|
:- func identifier = regexp. % identifier = identstart ++ *(ident)
|
|
:- func whitespace = regexp. % whitespace = *(wspc)
|
|
:- func junk = regexp. % junk = *(dot)
|
|
|
|
% A range of characters, inclusive of both the first and last values.
|
|
%
|
|
:- type char_range
|
|
---> char_range(
|
|
cr_first :: int,
|
|
cr_last :: int
|
|
).
|
|
|
|
% charset(Start, End) = charset(Start `..` End)
|
|
%
|
|
% Throws an exception if Start > End.
|
|
%
|
|
:- func charset(int, int) = charset.
|
|
|
|
% Function to create a sparse bitset from a range of Unicode codepoints.
|
|
% These codepoints are checked for validity, any invalid codepoints
|
|
% are ignored. Throws an exception if the cr_first value is greater than
% the cr_last value.
|
|
%
|
|
:- func charset(char_range) = charset.
|
|
|
|
% Creates a union of all char ranges in the list. Returns the empty set
|
|
% if the list is empty. Any invalid codepoints are ignored.
|
|
%
|
|
:- func charset_from_ranges(list(char_range)) = charset.
|
|
|
|
% Latin is comprised of the following Unicode blocks:
|
|
% * Basic Latin
|
|
% * Latin1 Supplement
|
|
% * Latin Extended-A
|
|
% * Latin Extended-B
|
|
%
|
|
:- func latin_chars = charset is det.
|
|
|
|
% Utility predicate to create ignore_pred's.
|
|
% Use it in the form `ignore(my_token)' to ignore just `my_token'.
|
|
%
|
|
:- pred ignore(Token::in, Token::in) is semidet.
|
|
|
|
% Utility function to return noval tokens.
|
|
% Use it in the form `return(my_token)' inside a lexeme definition.
|
|
%
|
|
:- func return(T, string) = T.
|
|
|
|
% Utility operator to create lexemes.
|
|
%
|
|
:- func (T1 -> token_creator(Tok)) = pair(regexp, token_creator(Tok))
|
|
<= regexp(T1).
|
|
|
|
% Construct a lexer from which we can generate running
|
|
% instances.
|
|
%
|
|
% NOTE: If several lexemes match the same string only
|
|
% the token generated by the one closest to the start
|
|
% of the list of lexemes is returned.
|
|
%
|
|
:- func init(list(lexeme(Tok))::in, read_pred(Src)::in(read_pred))
|
|
= (lexer(Tok, Src)::out) is det.
|
|
|
|
% Construct a lexer from which we can generate running
|
|
% instances. If we construct a lexer with init/4, we
|
|
% can additionally ignore specific tokens.
|
|
%
|
|
% NOTE: If several lexemes match the same string only
|
|
% the token generated by the one closest to the start
|
|
% of the list of lexemes is returned.
|
|
%
|
|
:- func init(list(lexeme(Tok))::in, read_pred(Src)::in(read_pred),
|
|
ignore_pred(Tok)::in(ignore_pred)) = (lexer(Tok, Src)::out) is det.
|
|
|
|
% Handy read predicates.
|
|
%
|
|
:- pred read_from_stdin(offset::in, read_result::out, io::di, io::uo) is det.
|
|
:- pred read_from_string(offset::in, read_result::out,
|
|
string::di, string::uo) is det.
|
|
|
|
% Generate a running instance of a lexer on some input source.
|
|
% If you want to lex strings, you must ensure they are unique
|
|
% by calling either copy/1 or unsafe_promise_unique/1 on the
|
|
% source string argument.
|
|
%
|
|
% Note that you can't get the input source back until you stop lexing.
|
|
%
|
|
:- func start(lexer(Tok, Src)::in, Src::di) = (lexer_state(Tok, Src)::uo)
|
|
is det.
|
|
|
|
% Read the next token from the input stream.
|
|
%
|
|
% CAVEAT: if the token returned happened to match the empty string,
|
|
% then you must use read_char/3 (below) to consume the next char
|
|
% in the input stream before calling read/3 again, since matching
|
|
% the empty string does not consume any chars from the input stream
|
|
% and will otherwise mean you simply get the same match ad infinitum.
|
|
%
|
|
% An alternative solution is to always include a "catch all" lexeme
|
|
% that matches any unexpected char at the end of the list of lexemes.
|
|
%
|
|
:- pred read(io.read_result(Tok)::out,
|
|
lexer_state(Tok, Src)::di, lexer_state(Tok, Src)::uo) is det.
|
|
|
|
% Calling offset_from_start/3 immediately prior to calling read/3
|
|
% will give the offset in chars from the start of the input stream
|
|
% for the result returned by the read/3 operation.
|
|
%
|
|
:- pred offset_from_start(offset::out,
|
|
lexer_state(Tok, Src)::di, lexer_state(Tok, Src)::uo) is det.
|
|
|
|
% Stop a running instance of a lexer and retrieve the input source.
|
|
%
|
|
:- func stop(lexer_state(_Tok, Src)::di) = (Src::uo) is det.
|
|
|
|
% Sometimes (e.g. when lexing the io.io) you want access to the
|
|
% input stream without interrupting the lexing process.
|
|
% This pred provides that sort of access.
|
|
%
|
|
:- pred manipulate_source(pred(Src, Src)::in(pred(di, uo) is det),
|
|
lexer_state(Tok, Src)::di, lexer_state(Tok, Src)::uo) is det.
|
|
|
|
% This is occasionally useful. It reads the next char from the
|
|
% input stream, without attempting to match it against a lexeme.
|
|
%
|
|
:- pred read_char(read_result::out,
|
|
lexer_state(Tok, Src)::di, lexer_state(Tok, Src)::uo) is det.
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
:- implementation.
|
|
|
|
:- include_module lex.automata.
|
|
:- include_module lex.buf.
|
|
:- include_module lex.convert_NFA_to_DFA.
|
|
:- include_module lex.lexeme.
|
|
:- include_module lex.regexp.
|
|
|
|
:- import_module array.
|
|
:- import_module bool.
|
|
:- import_module exception.
|
|
:- import_module require.
|
|
:- import_module int.
|
|
:- import_module map.
|
|
:- import_module maybe.
|
|
|
|
:- import_module lex.automata.
|
|
:- import_module lex.buf.
|
|
:- import_module lex.convert_NFA_to_DFA.
|
|
:- import_module lex.lexeme.
|
|
:- import_module lex.regexp.
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
    % The lexer proper: the compiled lexemes together with the two
    % predicates supplied by the user (see init/2 and init/3).
    %
:- type lexer(Token, Source)
    --->    lexer(
                lex_compiled_lexemes    :: list(live_lexeme(Token)),
                                            % Lexemes in priority order.
                lex_ignore_pred         :: ignore_pred(Token),
                                            % Succeeds on tokens to drop.
                lex_buf_read_pred       :: read_pred(Source)
                                            % Reads one char from Source.
            ).

    % Records the higher-order insts of the two pred fields; recovered
    % from a ground lexer via lexer_inst_cast below.
    %
:- inst lexer for lexer/2
    --->    lexer(ground, ignore_pred, read_pred).
|
|
|
|
    % The state of a running lexer between calls to read/3.
    %
:- type lexer_instance(Token, Source)
    --->    lexer_instance(
                init_lexemes        :: list(live_lexeme(Token)),
                                        % Full lexeme list; live_lexemes is
                                        % reset to this after each match.
                init_winner_func    :: init_winner_func(Token),
                                        % Computes the winner a fresh match
                                        % starts with (non-no iff a lexeme
                                        % accepts the empty string).
                live_lexemes        :: list(live_lexeme(Token)),
                                        % Lexemes still able to extend the
                                        % current match.
                current_winner      :: winner(Token),
                                        % Best (longest) match so far.
                buf_state           :: buf_state(Source),
                                        % Position within the input buffer.
                ignore_pred         :: ignore_pred(Token)
                                        % Succeeds on tokens to suppress.
            ).

:- inst lexer_instance for lexer_instance/2
    --->    lexer_instance(
                live_lexeme_list,
                init_winner_func,
                live_lexeme_list,
                winner,
                buf.buf_state,
                ignore_pred
            ).
|
|
|
|
    % A lexeme being advanced over the input by the DFA machinery in
    % lex.lexeme.
    %
:- type live_lexeme(Token) == compiled_lexeme(Token).
:- inst live_lexeme == compiled_lexeme.
:- inst live_lexeme_list == list.list_skel(live_lexeme).

    % Given the offset a match starts at, produce the initial winner for
    % that match (see initial_winner_func below).
    %
:- type init_winner_func(Token)
    ==  ( func(offset) = winner(Token) ).
:- inst init_winner_func
    ==  ( func(in) = out is det ).

    % The best match so far: the winning lexeme's token creator paired
    % with the offset at which the match ends (the cursor is rewound to
    % that offset when the winner is acted upon).
    %
:- type winner(Token) == maybe(pair(token_creator(Token), offset)).
:- inst winner for maybe/1
    --->    yes(pair(token_creator, ground))
    ;       no.
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
ignore(Tok, Tok).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
return(Token, _) = Token.
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
(R1 -> TC) = (re(R1) - TC).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
init(Lexemes, BufReadPred) = Lexer :-
|
|
DontIgnoreAnything =
|
|
( pred(_::in) is semidet :-
|
|
semidet_fail
|
|
),
|
|
Lexer = init(Lexemes, BufReadPred, DontIgnoreAnything).
|
|
|
|
init(Lexemes, BufReadPred, IgnorePred) = Lexer :-
|
|
CompiledLexemes = list.map(compile_lexeme, Lexemes),
|
|
Lexer = lexer(CompiledLexemes, IgnorePred, BufReadPred).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
start(Lexer0, Src) = State :-
    % Recover the higher-order insts on the lexer's pred fields, then
    % build a fresh instance and buffer around the input source.
    Lexer = lexer_inst_cast(Lexer0),
    init_lexer_instance(Lexer, Instance, Buf),
    State = args_lexer_state(Instance, Buf, Src).
|
|
|
|
    % lexer_inst_cast(Lexer0) = Lexer.
    %
    % An unchecked inst cast: each foreign clause below is a plain copy
    % of the argument, but declaring the result as out(lexer) restores
    % the higher-order insts of the ignore_pred and read_pred fields,
    % which cannot otherwise be recovered from a ground lexer term.
    %
:- func lexer_inst_cast(lexer(Tok, Src)::in) = (lexer(Tok, Src)::out(lexer))
    is det.

:- pragma foreign_proc("C",
    lexer_inst_cast(Lexer0::in) = (Lexer::out(lexer)),
    [will_not_call_mercury, promise_pure, thread_safe],
"
    Lexer = Lexer0;
").

:- pragma foreign_proc("Java",
    lexer_inst_cast(Lexer0::in) = (Lexer::out(lexer)),
    [will_not_call_mercury, promise_pure, thread_safe],
"
    Lexer = Lexer0;
").

:- pragma foreign_proc("C#",
    lexer_inst_cast(Lexer0::in) = (Lexer::out(lexer)),
    [will_not_call_mercury, promise_pure, thread_safe],
"
    Lexer = Lexer0;
").
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
    % Set up a fresh buffer and lexer_instance for the given lexer.
    %
:- pred init_lexer_instance(lexer(Tok, Src)::in(lexer),
    lexer_instance(Tok, Src)::out(lexer_instance), buf::array_uo) is det.

init_lexer_instance(Lexer, Instance, Buf) :-
    buf.init(Lexer ^ lex_buf_read_pred, BufState, Buf),
    Start = BufState ^ start_offset,
    % A lexeme may accept the empty string, so even before reading any
    % chars there may already be a winner at the start offset.
    InitWinnerFunc = initial_winner_func(InitLexemes),
    InitLexemes = Lexer ^ lex_compiled_lexemes,
    InitWinner = InitWinnerFunc(Start),
    IgnorePred = Lexer ^ lex_ignore_pred,
    Instance = lexer_instance(InitLexemes, InitWinnerFunc, InitLexemes,
        InitWinner, BufState, IgnorePred).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
    % Lexing may *start* with a candidate winner if one of the lexemes
    % accepts the empty string. We pick the first such, if any, since
    % that lexeme has priority.
    %
:- func initial_winner_func(list(live_lexeme(Token))::in(live_lexeme_list))
    = (init_winner_func(Token)::out(init_winner_func)) is det.

initial_winner_func([]) =
    ( func(_) = no ).
initial_winner_func([L | Ls]) =
    ( if in_accepting_state(L) then
        % L matches the empty string: record its token creator at
        % whatever offset the returned function is later applied to.
        ( func(Offset) = yes(L ^ token - Offset) )
    else
        initial_winner_func(Ls)
    ).
|
|
|
|
%----------------------------------------------------------------------------%
|
|
|
|
offset_from_start(Offset, !State) :-
    % Reading a field out of the unique state loses its uniqueness, so
    % we must re-promise it before handing the state back.
    Offset = !.State ^ run ^ buf_state ^ buf_cursor,
    !:State = unsafe_promise_unique(!.State).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
stop(State) = Src :-
|
|
lexer_state_args(State, _Instance, _Buf, Src).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
read(Result, State0, State) :-
    % Unpack the running state, reset the current winner for a match
    % starting at the current start offset, then run the match loop.
    lexer_state_args(State0, Instance0, Buf0, Src0),
    BufState0 = Instance0 ^ buf_state,
    Start = BufState0 ^ start_offset,
    InitWinner = ( Instance0 ^ init_winner_func )(Start),
    Instance1 = ( Instance0 ^ current_winner := InitWinner ),
    read_2(Result, Instance1, Instance, Buf0, Buf, Src0, Src),
    State = args_lexer_state(Instance, Buf, Src).
|
|
|
|
    % Basically, just read chars from the buf and advance the live lexemes
    % until we have a winner or hit an error (no parse).
    %
:- pred read_2(io.read_result(Tok)::out,
    lexer_instance(Tok, Src)::in(lexer_instance),
    lexer_instance(Tok, Src)::out(lexer_instance),
    buf::array_di, buf::array_uo, Src::di, Src::uo) is det.

read_2(Result, !Instance, !Buf, !Src) :-
    BufState0 = !.Instance ^ buf_state,
    buf.read(BufReadResult, BufState0, BufState, !Buf, !Src),
    (
        BufReadResult = ok(Char),
        process_char(Result, Char, !Instance, BufState, !Buf, !Src)
    ;
        BufReadResult = eof,
        % End of input: settle for whatever winner we have so far.
        process_eof(Result, !Instance, BufState, !.Buf)
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
    % Advance every live lexeme over Char. If none remain live, act on
    % the best winner seen so far; otherwise record the new state and
    % keep reading.
    %
:- pred process_char(io.read_result(Tok)::out, char::in,
    lexer_instance(Tok, Src)::in(lexer_instance),
    lexer_instance(Tok, Src)::out(lexer_instance),
    buf_state(Src)::in(buf_state),
    buf::array_di, buf::array_uo, Src::di, Src::uo) is det.

process_char(Result, Char, !Instance, BufState, !Buf, !Src) :-
    LiveLexemes0 = !.Instance ^ live_lexemes,
    Winner0 = !.Instance ^ current_winner,
    advance_live_lexemes(Char, BufState ^ cursor_offset,
        LiveLexemes0, LiveLexemes, Winner0, Winner),
    (
        LiveLexemes = [],
        % Nothing left to consider.
        process_any_winner(Result, Winner, !Instance, BufState, !Buf, !Src)
    ;
        LiveLexemes = [_ | _],
        % Still some open possibilities.
        !:Instance = (((!.Instance
            ^ live_lexemes := LiveLexemes )
            ^ current_winner := Winner )
            ^ buf_state := BufState ),
        read_2(Result, !Instance, !Buf, !Src)
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
    % No lexeme can advance any further: act on the best match found so
    % far (if any), constructing the token, handling ignored tokens, and
    % resetting the instance ready for the next call to read/3.
    %
:- pred process_any_winner(io.read_result(Tok)::out, winner(Tok)::in(winner),
    lexer_instance(Tok, Src)::in(lexer_instance),
    lexer_instance(Tok, Src)::out(lexer_instance),
    buf_state(Src)::in(buf_state),
    buf::array_di, buf::array_uo, Src::di, Src::uo) is det.

process_any_winner(Result, yes(TokenCreator - Offset), Instance0, Instance,
        BufState0, Buf0, Buf, Src0, Src) :-
    % Rewind to the end of the winning match and build the token from
    % the matched substring.
    BufState1 = rewind_cursor(Offset, BufState0),
    String = string_to_cursor(BufState1, Buf0),
    Token = TokenCreator(String),
    IgnorePred = Instance0 ^ ignore_pred,
    % Reset the instance for the next match, which starts at Offset.
    InitWinner = ( Instance0 ^ init_winner_func )(Offset),
    Instance1 = ((( Instance0
        ^ live_lexemes := Instance0 ^ init_lexemes )
        ^ current_winner := InitWinner )
        ^ buf_state := commit(BufState1) ),

    ( if IgnorePred(Token) then
        % We have to be careful to avoid an infinite loop here.
        % If the longest match was the empty string, then the next char
        % in the input stream cannot start a match, so it must be reported
        % as an error.
        ( if String = "" then
            buf.read(BufResult, BufState1, BufState, Buf0, Buf, Src0, Src),
            (
                BufResult = ok(_),
                Result = error("input not matched by any regexp", Offset)
            ;
                BufResult = eof,
                Result = eof
            ),
            Instance = ( Instance1 ^ buf_state := commit(BufState) )
        else
            % The ignored match was non-empty: just lex the next token.
            read_2(Result, Instance1, Instance, Buf0, Buf, Src0, Src)
        )
    else
        Result = ok(Token),
        Instance = Instance1,
        Buf = Buf0,
        Src = Src0
    ).
process_any_winner(Result, no, !Instance, BufState0, !Buf, !Src) :-
    % Nothing matched at all: report an error at the start position and
    % advance past one char so that a retry can make progress.
    Start = BufState0 ^ start_offset,
    BufState = rewind_cursor(Start + 1, BufState0),
    Result = error("input not matched by any regexp", Start),

    InitWinner = ( !.Instance ^ init_winner_func )(Start),
    !:Instance = ((( !.Instance
        ^ live_lexemes := !.Instance ^ init_lexemes )
        ^ current_winner := InitWinner )
        ^ buf_state := commit(BufState) ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
    % The input is exhausted: turn the current winner (if any) into the
    % result, then reset the instance and commit the buffer.
    %
:- pred process_eof(io.read_result(Tok)::out,
    lexer_instance(Tok, Src)::in(lexer_instance),
    lexer_instance(Tok, Src)::out(lexer_instance),
    buf_state(Src)::in(buf_state), buf::array_ui) is det.

process_eof(Result, !Instance, !.BufState, !.Buf) :-
    CurrentWinner = !.Instance ^ current_winner,
    (
        CurrentWinner = no,
        Offset = !.BufState ^ cursor_offset,
        Result = eof
    ;
        CurrentWinner = yes(TokenCreator - Offset),
        String = string_to_cursor(!.BufState, !.Buf),
        Token = TokenCreator(String),
        IgnorePred = !.Instance ^ ignore_pred,
        % An ignored final token is reported as plain eof.
        Result = ( if IgnorePred(Token) then eof else ok(Token) )
    ),
    InitWinner = ( !.Instance ^ init_winner_func )(Offset),
    !:Instance = ((( !.Instance
        ^ live_lexemes := !.Instance ^ init_lexemes )
        ^ current_winner := InitWinner )
        ^ buf_state := commit(!.BufState) ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
    % Note that in the case where two or more lexemes match the same
    % string, the win is given to the earliest such lexeme in the list.
    % This matches the behaviour of standard C lex.
    %
:- pred advance_live_lexemes(char::in, offset::in,
    list(live_lexeme(Token))::in(live_lexeme_list),
    list(live_lexeme(Token))::out(live_lexeme_list),
    winner(Token)::in(winner), winner(Token)::out(winner)) is det.

advance_live_lexemes(_Char, _Offset, [], [], !Winner).
advance_live_lexemes(Char, Offset, [L | Ls0], Ls, !Winner) :-
    State0 = L ^ state,
    ( if next_state(L, State0, Char, State, IsAccepting) then
        (
            IsAccepting = no
        ;
            IsAccepting = yes,
            % If the existing winner already reaches this Offset, it came
            % from an earlier lexeme in the list and keeps priority;
            % otherwise this is a strictly longer match, so take it.
            ( if !.Winner = yes(_ - Offset) then
                true
            else
                !:Winner = yes(L ^ token - Offset)
            )
        ),
        advance_live_lexemes(Char, Offset, Ls0, Ls1, !Winner),
        Ls = [(L ^ state := State) | Ls1]
    else
        % L cannot accept Char: it drops out of the live set.
        advance_live_lexemes(Char, Offset, Ls0, Ls, !Winner)
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
    % Succeeds with the token creator of the first lexeme in the list
    % that is in an accepting state; fails on the empty list or if no
    % lexeme accepts.
    % NOTE(review): not referenced anywhere in this chunk of the module —
    % possibly kept for external or historical use; verify before removal.
    %
:- pred live_lexeme_in_accepting_state(
    list(live_lexeme(Tok))::in(live_lexeme_list),
    token_creator(Tok)::out(token_creator)) is semidet.

live_lexeme_in_accepting_state([L | Ls], Token) :-
    ( if in_accepting_state(L) then
        Token = L ^ token
    else
        live_lexeme_in_accepting_state(Ls, Token)
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
    % It is much more convenient (especially for integration with parsers
    % such as moose) to package up the lexer_instance, buf and Src
    % in a single object.
:- type lexer_state(Tok, Src)
    --->    lexer_state(
                run     :: lexer_instance(Tok, Src),
                                % The matching state.
                buf     :: buf,
                                % The character buffer.
                src     :: Src
                                % The input source.
            ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
    % Repackage the three components as a unique lexer_state.
    %
:- func args_lexer_state(lexer_instance(Tok, Src)::in(lexer_instance),
    buf::array_di, Src::di) = (lexer_state(Tok, Src)::uo) is det.

args_lexer_state(Instance, Buf, Src) = LexerState :-
    % Constructing the term loses the components' uniqueness, so it has
    % to be re-promised on the result.
    unsafe_promise_unique(lexer_state(Instance, Buf, Src), LexerState).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
    % Take a unique lexer_state apart; deconstruction loses uniqueness,
    % so it is re-promised on the buffer and source.
    %
:- pred lexer_state_args(lexer_state(Tok, Src)::di,
    lexer_instance(Tok, Src)::out(lexer_instance),
    buf::array_uo, Src::uo) is det.

lexer_state_args(lexer_state(Instance, Buf0, Src0), Instance, Buf, Src) :-
    unsafe_promise_unique(Buf0, Buf),
    unsafe_promise_unique(Src0, Src).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
manipulate_source(P, !State) :-
    % Unpack, let P transform the source destructively, then repack.
    lexer_state_args(!.State, Instance, Buf, Src0),
    P(Src0, Src),
    !:State = args_lexer_state(Instance, Buf, Src).
|
|
|
|
%----------------------------------------------------------------------------%
|
|
|
|
read_char(Result, !State) :-
    lexer_state_args(!.State, Instance0, Buf0, Src0),

    BufState0 = Instance0 ^ buf_state,
    buf.read(Result, BufState0, BufState, Buf0, Buf, Src0, Src),
    % Commit: the char is consumed outright and will not take part in
    % any subsequent lexeme match.
    Instance = ( Instance0 ^ buf_state := commit(BufState) ),

    !:State = args_lexer_state(Instance, Buf, Src).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
read_from_stdin(_Offset, Result, !IO) :-
    % The offset is ignored: stdin can only be read sequentially.
    io.read_char(IOResult, !IO),
    (
        IOResult = ok(Char),
        Result = ok(Char)
    ;
        IOResult = eof,
        Result = eof
    ;
        IOResult = error(_E),
        % read_result has no error alternative, so I/O errors are
        % raised as exceptions (as documented in the interface).
        throw(IOResult)
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
    % XXX This is bad for long strings! We should cache the string length
    % somewhere rather than recomputing it each time we read a char.
    %
read_from_string(Offset, Result, String, unsafe_promise_unique(String)) :-
    % The source string is passed through unchanged; the output side is
    % re-promised unique to satisfy the di/uo modes of read_pred.
    ( if Offset < string.length(String) then
        Result = ok(string.unsafe_index(String, Offset))
    else
        Result = eof
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
%
|
|
% The type of regular expressions.
|
|
%
|
|
|
|
    % The abstract syntax of regular expressions; built by re/1 and the
    % operators below, consumed by the lexeme compiler.
    %
:- type regexp
    --->    eps                     % The empty regexp
    ;       atom(char)              % Match a single char
    ;       conc(regexp, regexp)    % Concatenation
    ;       alt(regexp, regexp)     % Alternation
    ;       star(regexp)            % Kleene closure
    ;       charset(charset).       % Matches any char in the set
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
    % A regexp converts to itself.
:- instance regexp(regexp) where [
    re(RE) = RE
].

    % A char denotes the regexp matching exactly that char.
:- instance regexp(char) where [
    re(C) = atom(C)
].

    % A string denotes the concatenation of its chars; the empty string
    % denotes the empty regexp.
:- instance regexp(string) where [
    re(S) = R :-
        ( if S = "" then
            R = null
        else
            % Fold left over the chars, seeding with eps so the first
            % char is not wrapped in a redundant concatenation.
            R = string.foldl(func(Char, R0) = R1 :-
                ( if R0 = eps then R1 = re(Char) else R1 = R0 ++ re(Char) ),
                S,
                eps)
        )
].

    % A sparse bitset denotes a charset regexp; elements are mapped to
    % chars via their uint enum values.
:- instance regexp(sparse_bitset(T)) <= (regexp(T),uenum(T)) where [
    re(SparseBitset) = charset(Charset) :-
        Charset = sparse_bitset.foldl(
            func(Enum, Set0) = insert(Set0, char.det_from_uint(to_uint(Enum))),
            SparseBitset,
            sparse_bitset.init)
].
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
%
|
|
% Basic primitive regexps.
|
|
%
|
|
|
|
    % Each operator first coerces its operands to regexps via re/1.
    %
null = eps.
R1 ++ R2 = conc(re(R1), re(R2)).
R1 \/ R2 = alt(re(R1), re(R2)).
(R1 or R2) = alt(re(R1), re(R2)).
*(R1) = star(re(R1)).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
%
|
|
% Some basic non-primitive regexps.
|
|
%
|
|
|
|
% int_is_valid_char(Int) = Char.
|
|
%
|
|
% True iff Int is Char and is in [0x0..0x10ffff] and not a surrogate
|
|
% character.
|
|
%
|
|
:- func int_is_valid_char(int) = char is semidet.

int_is_valid_char(Value) = Char :-
    % char.from_int fails for values outside [0x0, 0x10ffff].
    char.from_int(Value, Char),
    % Surrogate code points are not valid characters on their own.
    not char.is_surrogate(Char).
|
|
|
|
charset(Start, End) = build_charset(Start, End, sparse_bitset.init) :-
    % expect/4 aborts (throws) if the range is inverted.
    expect(Start =< End, $file, $pred,
        "Start must be less than or equal to End").

    % charset/1 inherits charset/2's range check and its silent dropping
    % of invalid codepoints.
charset(char_range(First, Last)) = charset(First, Last).
|
|
|
|
:- func build_charset(int, int, charset) = charset.
|
|
|
|
build_charset(First, Last, Charset0) = Charset :-
|
|
( if First =< Last then
|
|
( if int_is_valid_char(First) = Char then
|
|
Charset1 = sparse_bitset.insert(Charset0, Char)
|
|
else
|
|
Charset1 = Charset0
|
|
),
|
|
Charset = build_charset(First + 1, Last, Charset1)
|
|
else
|
|
Charset = Charset0
|
|
).
|
|
|
|
charset_from_ranges(ListOfRanges) =
|
|
union_list(map(charset, ListOfRanges)).
|
|
|
|
latin_chars =
    charset_from_ranges([
        % NOTE(review): these ranges do not line up exactly with the
        % Unicode blocks named in the interface comment (Basic Latin is
        % 0x00-0x7f; Latin-1 Supplement starts at 0x80). Confirm the
        % intended coverage before relying on the exact boundaries.
        char_range(0x40, 0x7d),
        char_range(0xc0, 0xff),
        char_range(0x100, 0x2ff)
    ]).
|
|
|
|
    % The universe that anybut/1 complements against: the BMP only
    % (astral chars are deliberately excluded — see the module header);
    % surrogates are filtered out by charset/1.
    %
:- func valid_unicode_chars = charset.

valid_unicode_chars = charset(char_range(0x01, 0xffff)).
|
|
|
|
any(S) = R :-
    % any("") is the empty-string regexp; otherwise a charset regexp
    % over exactly the chars of S.
    ( if S = "" then
        R = null
    else
        R = re(sparse_bitset.list_to_set(string.to_char_list(S)))
    ).

anybut(S) = R :-
    % The complement of any(S), restricted to valid_unicode_chars.
    ExcludedChars = sparse_bitset.list_to_set(string.to_char_list(S)),
    R = re(sparse_bitset.difference(valid_unicode_chars, ExcludedChars)).

?(R) = (R or null).

+(R) = (R ++ *(R)).
|
|
|
|
range(Start, End) = re(charset(char.to_int(Start), char.to_int(End))).

    % R * N: N copies of R concatenated; N = 0 yields the empty regexp,
    % and negative N aborts.
R * N = Result :-
    ( if N < 0 then
        unexpected($pred, "N must be a non-negative number")
    else if N = 0 then
        Result = null
    else if N = 1 then
        Result = re(R)
    else
        Result = conc(re(R), (R * (N - 1)))
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
%
|
|
% Some useful single-char regexps.
|
|
%
|
|
|
|
% We invite the compiler to memo the values of these constants that
|
|
% (a) are likely to be quite common in practice and (b) take *some*
|
|
% time to compute.
|
|
%
|
|
:- pragma memo(func(digit/0)).
:- pragma memo(func(lower/0)).
:- pragma memo(func(upper/0)).
:- pragma memo(func(wspc/0)).
:- pragma memo(func(dot/0)).

digit = any("0123456789").
lower = any("abcdefghijklmnopqrstuvwxyz").
upper = any("ABCDEFGHIJKLMNOPQRSTUVWXYZ").
wspc = any(" \t\n\r\f\v").
dot = anybut("\r\n").

    % The composites below are built from the memoised constants above.
alpha = (lower or upper).
alphanum = (alpha or digit).
identstart = (alpha or ('_')).
ident = (alphanum or ('_')).
tab = re('\t').
spc = re(' ').
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
% Some useful compound regexps.
|
|
|
|
nl = (?('\r') ++ '\n').             % Matches both Posix and Windows newline.
nat = +(digit).
signed_int = ?("+" or "-") ++ nat.
    % real = int (("." nat [exp]) | exp), where exp = [eE] signed_int;
    % i.e. a fraction part, an exponent part, or both, is required.
real = signed_int ++ (
    ("." ++ nat ++ ?(("e" or "E") ++ signed_int)) or
    ( ("e" or "E") ++ signed_int)
).
identifier = (identstart ++ *(ident)).
whitespace = *(wspc).
junk = *(dot).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
%-----------------------------------------------------------------------------%
|