Estimated hours taken: 60
Branches: main

Added a new module, regex, as a companion to lex.  The new module provides
functionality for converting conventional Unix-style regular expressions
into regexps for use with lex and a number of search and search-and-replace
functions for strings.

The new functionality has been tested fairly thoroughly (and in the process
several bugs in lex were identified and fixed).

NEWS:
	Reported new additions.

extras/lex/README:
	Now just points the reader to README.lex and README.regex.

extras/lex/README.lex:
extras/lex/README.regex:
	Added.  Brief introductions to the two libraries.

extras/lex/lex.automata.m:
extras/lex/lex.buf.m:
extras/lex/lex.convert_NFA_to_DFA.m:
extras/lex/lex.regexp.m:
	Trivial formatting changes.

extras/lex/lex.lexeme.m:
	Removed the parameter on inst compiled_lexeme.

extras/lex/lex.m:
	Various formatting changes.

	Added pred offset_from_start/3 which can be used to identify
	the `current' point in the input stream with respect to lexing.

	Added pred read_char/3 which can be used to read the `next'
	char from the input stream without doing any lexing.

	Added a field init_winner_func to the lexer_instance type.  This
	is used to resolve a bug whereby regular expressions that match
	the empty string were not being spotted at the start of the input
	stream.

	Fixed some bugs whereby an exception was incorrectly thrown in
	some circumstances when the end of the input stream was reached.

extras/lex/regex.m:
	Added.  This file defines functions for converting Unix-style
	regular expression strings into regexps for use with lex, and into
	regexes for use with the string search and search-and-replace
	predicates defined in this module.
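
	A rough sketch of the intended usage (the names regex/1 and
	replace_all/3 here are assumptions for illustration; README.regex
	documents the actual interface):

	    R = regex("ab*c"),
	    S = replace_all(R, "X", "abbbc abc")    % S = "X X"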

extras/lex/Mmakefile:
	Improved the installation instructions and included a check target.

extras/lex/tests:
extras/lex/tests/Mmakefile:
extras/lex/tests/test_regex:
extras/lex/tests/test_regex.in:
extras/lex/tests/test_regex.exp:
	Added a test suite.

extras/lex/tests/cmp_regex_gawk:
	This program looks for differences in behaviour between gawk and
	regex.

extras/lex/samples/demo.m:
	Moved to lex_demo.m.

extras/lex/samples/lex_demo.m:
	Was demo.m; slightly changed to include a match for unexpected
	characters.

extras/lex/samples/regex_demo.m:
	Added.

extras/lex/samples/Mmakefile:
	Updated.

%-----------------------------------------------------------------------------%
% vim: ts=4 sw=4 et tw=0 wm=0 ff=unix
%
% lex.regexp.m
% Fri Aug 18 06:43:09 BST 2000
% Copyright (C) 2001 Ralph Becket <rbeck@microsoft.com>
% Copyright (C) 2001 The Rationalizer Intelligent Software AG
% The changes made by Rationalizer are contributed under the terms
% of the GNU Lesser General Public License, see the file COPYING.LGPL
% in this directory.
% Copyright (C) 2002 The University of Melbourne
%
% This file may only be copied under the terms of the GNU Library General
% Public License - see the file COPYING.LIB in the Mercury distribution.
%
% Thu Jul 26 07:45:47 UTC 2001
%
% Converts basic regular expressions into non-deterministic finite
% automata (NFAs).
%
%-----------------------------------------------------------------------------%
:- module lex__regexp.

:- interface.

:- import_module lex__automata.

    % Turn a regexp into an NFA.
    %
:- func regexp_to_NFA(regexp) = state_mc.

    % Turn an NFA into a null transition-free NFA.
    %
:- func remove_null_transitions(state_mc) = state_mc.
:- mode remove_null_transitions(in) = out(null_transition_free_state_mc)
            is det.
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%
:- implementation.

:- import_module counter, map, assoc_list, std_util, list, set, string.
%-----------------------------------------------------------------------------%
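    % A small worked example (assuming atom/1 carries a plain char, as in
    % this version of lex): regexp_to_NFA(conc(atom('a'), atom('b')))
    % allocates Start = 0, Stop = 1 and an intermediate state 2, giving
    %
    %     state_mc(0, {1}, [trans(0, 'a', 2), trans(2, 'b', 1)])
    %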
regexp_to_NFA(R) = NFA :-
    C0 = counter__init(0),
    counter__allocate(Start, C0, C1),
    counter__allocate(Stop,  C1, C),
    compile(Start, R, Stop, Transitions, C, _),
    NFA = state_mc(Start, set__make_singleton_set(Stop), Transitions).
%-----------------------------------------------------------------------------%
:- pred compile(state_no, regexp, state_no, transitions, counter, counter).
:- mode compile(in, in, in, out, in, out) is det.

    % The primitive regexps.
    %
compile(X, eps,          Y, [null(X, Y)])     --> [].
compile(X, atom(C),      Y, [trans(X, C, Y)]) --> [].
compile(X, conc(RA, RB), Y, TsA ++ TsB) -->
    counter__allocate(Z),
    compile(X, RA, Z, TsA),
    compile(Z, RB, Y, TsB).
compile(X, alt(RA, RB),  Y, TsA ++ TsB) -->
    compile(X, RA, Y, TsA),
    compile(X, RB, Y, TsB).
compile(X, star(R),      Y, TsA ++ TsB) -->
    compile(X, eps, Y, TsA),
    compile(X, R,   X, TsB).
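
    % For instance, compiling star(atom('a')) between states 0 and 1
    % produces [null(0, 1), trans(0, 'a', 0)]: the null edge admits the
    % empty match and the atom transition loops back to state 0.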
%-----------------------------------------------------------------------------%
    % If we have a non-looping null transition from X to Y then we
    % need to copy all of the transitions leaving Y onto X as well.
    %
    % We do this by first finding the transitive closure of the
    % null transition graph and then, for each edge X -> Y in that
    % graph, adding X -C-> Z for all C and Z s.t. Y -C-> Z.
    %
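    % For example, given transitions [null(0, 1), trans(1, 'a', 2)] and
    % the stop-state set {2}, the closure gives Outs = {0 -> {1}}, so a
    % copy trans(0, 'a', 2) is added alongside trans(1, 'a', 2); had 1
    % been a stop state, 0 would have become a stop state as well.
    %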
remove_null_transitions(NFA0) = NFA :-
    Ts = NFA0 ^ smc_state_transitions,
    split_transitions(Ts, NullTs, CharTs),
    trans_closure(NullTs, map__init, _Ins, map__init, Outs),
    NullFreeTs = add_atom_transitions(Outs, CharTs),
    StopStates0 = NFA0 ^ smc_stop_states,
    StopStates1 =
        set__list_to_set(
            list__filter_map(
                nulls_to_stop_state(Outs, NFA0 ^ smc_stop_states),
                NullTs
            )
        ),
    StopStates = StopStates0 `set__union` StopStates1,
    NFA = (( NFA0
                ^ smc_state_transitions := NullFreeTs )
                ^ smc_stop_states       := StopStates ).
%-----------------------------------------------------------------------------%
:- pred split_transitions(transitions, transitions, transitions).
:- mode split_transitions(in, out(null_transitions), out(atom_transitions)).

split_transitions([], [], []).
split_transitions([null(X, Y) | Ts], [null(X, Y) | NTs], CTs) :-
    split_transitions(Ts, NTs, CTs).
split_transitions([trans(X, C, Y) | Ts], NTs, [trans(X, C, Y) | CTs]) :-
    split_transitions(Ts, NTs, CTs).
%-----------------------------------------------------------------------------%
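    % Ins maps each state to the set of states that can reach it via
    % null transitions; Outs maps each state to the set of states it
    % can reach via null transitions.
    %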
:- type null_map == map(state_no, set(state_no)).

:- pred trans_closure(transitions, null_map, null_map, null_map, null_map).
:- mode trans_closure(in(null_transitions), in, out, in, out) is det.

trans_closure(Ts, Ins0, Ins, Outs0, Outs) :-
    list__foldl2(add_edge, Ts, Ins0, Ins, Outs0, Outs).
%-----------------------------------------------------------------------------%
:- pred add_edge(transition, null_map, null_map, null_map, null_map).
:- mode add_edge(in(null_transition), in, out, in, out) is det.

add_edge(null(X, Y), Ins0, Ins, Outs0, Outs) :-
    XInAndX  = set__insert(null_map_lookup(X, Ins0), X),
    YOutAndY = set__insert(null_map_lookup(Y, Outs0), Y),
    Xs = set__to_sorted_list(XInAndX),
    Ys = set__to_sorted_list(YOutAndY),
    Outs = list__foldl(add_to_null_mapping(YOutAndY), Xs, Outs0),
    Ins  = list__foldl(add_to_null_mapping(XInAndX), Ys, Ins0).
%-----------------------------------------------------------------------------%
:- func null_map_lookup(state_no, null_map) = set(state_no).

null_map_lookup(X, Map) =
    ( if   map__search(Map, X, Ys)
      then Ys
      else set__init
    ).
%-----------------------------------------------------------------------------%
:- func add_to_null_mapping(set(state_no), state_no, null_map) = null_map.

add_to_null_mapping(Xs, Y, Map) =
    map__set(Map, Y, Xs `set__union` null_map_lookup(Y, Map)).
%-----------------------------------------------------------------------------%
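    % For each state X and each state Y in X's null-reachable set, copy
    % every atom transition leaving Y so that it also leaves X.
    %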
:- func add_atom_transitions(null_map, transitions) = transitions.
:- mode add_atom_transitions(in, in(atom_transitions)) =
            out(atom_transitions) is det.

add_atom_transitions(Outs, CTs) =
    list__sort_and_remove_dups(
        list__condense(
            [ CTs
            | list__map(
                add_atom_transitions_0(CTs),
                map__to_assoc_list(Outs)
              )
            ]
        )
    ).
%-----------------------------------------------------------------------------%
:- func add_atom_transitions_0(transitions, pair(state_no, set(state_no))) =
            transitions.
:- mode add_atom_transitions_0(in(atom_transitions), in) =
            out(atom_transitions) is det.

add_atom_transitions_0(CTs, X - Ys) =
    list__condense(
        list__map(add_atom_transitions_1(CTs, X), set__to_sorted_list(Ys))
    ).
%-----------------------------------------------------------------------------%
:- func add_atom_transitions_1(transitions, state_no, state_no) = transitions.
:- mode add_atom_transitions_1(in(atom_transitions), in, in) =
            out(atom_transitions) is det.

add_atom_transitions_1(CTs0, X, Y) = CTs :-
    list__filter_map(maybe_copy_transition(X, Y), CTs0, CTs).
%-----------------------------------------------------------------------------%
:- pred maybe_copy_transition(state_no, state_no, transition, transition).
:- mode maybe_copy_transition(in, in, in(atom_transition), out(atom_transition))
            is semidet.

maybe_copy_transition(X, Y, trans(Y, C, Z), trans(X, C, Z)).
%-----------------------------------------------------------------------------%
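    % Succeeds with X for a null transition leaving X whenever X can
    % reach a stop state via null transitions: such an X must itself
    % become a stop state.
    %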
:- func nulls_to_stop_state(null_map, set(state_no), transition) = state_no.
:- mode nulls_to_stop_state(in, in, in) = out is semidet.

nulls_to_stop_state(Outs, StopStates, null(X, _Y)) = X :-
    some [Z] (
        set__member(Z, map__lookup(Outs, X)),
        set__member(Z, StopStates)
    ).
%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%