mirror of
https://github.com/Mercury-Language/mercury.git
synced 2025-12-18 07:15:19 +00:00
Estimated hours taken: 0.5 Branches: main Fix compilation problems in the extras distribution caused by recent changes. extras/stream/stream.m: Provide a definition for the type stream/1. extras/*/*.m: Conform to the recent changes to the standard library.
609 lines
21 KiB
Mathematica
609 lines
21 KiB
Mathematica
%-----------------------------------------------------------------------------%
|
|
% regex.m
|
|
% Ralph Becket <rafe@cs.mu.oz.au>
|
|
% Copyright (C) 2002, 2006 The University of Melbourne
|
|
% vim: ft=mercury ts=4 sw=4 et wm=0 tw=0
|
|
%
|
|
% This module provides basic string matching and search and replace
|
|
% functionality using regular expressions defined as strings of the
|
|
% form recognised by tools such as sed and grep.
|
|
%
|
|
% The regular expression language matched is a subset of POSIX 1003.2
|
|
% with a few minor differences:
|
|
% - bounds {[n][,[m]]} are not recognised;
|
|
% - collating elements [.ab.] in character sets are not recognised;
|
|
% - equivalence classes [=x=] in character sets are not recognised;
|
|
% - character classes [:space:] in character sets are not recognised;
|
|
% - special inter-character patterns such as ^, $, \<, \> are not recognised;
|
|
% - to include a literal `-' in a character set, use the range `---' or
|
|
% include `-' as one end-point of a range (e.g. [!--]);
|
|
% - regex will complain if `-' appears other than as an end-point
|
|
% of a range or the range delimiter in a character set;
|
|
% - regex is a little more sensible about including `[' and `]' in character
|
|
% sets - either character can appear as an end-point of a range;
|
|
% - literal `)' must be backslash escaped, even in the absence of an `(';
|
|
% - `.' matches any character except `\n';
|
|
% - [^...] matches any character not in ... or `\n'.
|
|
% NOTE: these minor differences may go away in a future revision of this
|
|
% module.
|
|
%
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
:- module regex.
|
|
|
|
:- interface.
|
|
|
|
:- import_module string, list.
|
|
:- import_module lex.
|
|
|
|
% The type of (compiled) regular expressions.
|
|
%
|
|
:- type regex.
|
|
:- inst regex == lexer.
|
|
|
|
% A function for converting a POSIX style regex string to a regex
|
|
% suitable for the string matching operations provided in this
|
|
% module.
|
|
%
|
|
% An exception is thrown if the regex string is malformed. We
|
|
% memoize this function for efficiency (it is cheaper to look up a
|
|
% string in a hash table than to parse it and recompute the regex.)
|
|
%
|
|
% A regex string obeys the following grammar. Note that alternation
|
|
% has lowest priority, followed by concatenation, followed by
|
|
% *, + and ?. Hence "ab*" is equivalent to "a(b*)" and not "(ab)*"
|
|
% while "ab|cd" is equivalent to "(ab)|(cd)" and not "a(b|c)d".
|
|
%
|
|
% <regex> ::= <char> % Single char
|
|
% | <regex><regex> % Concatenation
|
|
% | . % Any char but \n
|
|
% | <set> % Any char in set
|
|
% | <regex>|<regex> % Alternation
|
|
% | <regex>* % Kleene closure (zero or more)
|
|
% | <regex>+ % One or more occurrences
|
|
% | <regex>? % Zero or one occurrence
|
|
% | (<regex>)
|
|
%
|
|
% (Note the need to use double-backslashes in Mercury strings.
|
|
% The following chars must be escaped if they are intended
|
|
% literally: .|*+?()[]\. Escapes should not be used in sets.)
|
|
%
|
|
% <char> ::= \<any char> % Literal char used in regexes
|
|
% | <ordinary char> % All others
|
|
%
|
|
% <escaped char> ::= . | | | * | + | ? | ( | ) | [ | ] | \
|
|
%
|
|
% (If the first char in a <set'> is ] then it is taken as part of
|
|
% the char set and not the closing bracket. Similarly, ] may appear
|
|
% as the end char in a range and it will not be taken as the closing
|
|
% bracket.)
|
|
%
|
|
% <set> ::= [^<set'>] % Any char not in <set'> or '\n'
|
|
% | [<set'>] % Any char in <set'>
|
|
%
|
|
% <set'> ::= <any char>-<any char>% Any char in range
|
|
% | <any char> % Literal char
|
|
% | <set'><set'> % Set union
|
|
%
|
|
:- func regex(string) = regex.
|
|
:- mode regex(in ) = out(regex) is det.
|
|
|
|
% This is a utility function for lex - it compiles a string into a
|
|
% regexp (not a regex, which is for use with this module) suitable
|
|
% for use in lexemes.
|
|
%
|
|
% We memoize this function for efficiency (it is cheaper to look up a
|
|
% string in a hash table than to parse it and recompute the regexp.)
|
|
%
|
|
:- func regexp(string) = regexp.
|
|
|
|
% left_match(Regex, String, Substring, Start, Count)
|
|
% succeeds iff Regex maximally matches the first Count characters
|
|
% of String.
|
|
%
|
|
% This is equivalent to the goal
|
|
%
|
|
% {Substring, Start, Count} = head(matches(Regex, String)),
|
|
% Start = 0
|
|
%
|
|
:- pred left_match(regex, string, string, int, int).
|
|
:- mode left_match(in(regex), in, out, out, out) is semidet.
|
|
|
|
% right_match(Regex, String, Substring, Start, Count)
|
|
% succeeds iff Regex maximally matches the last Count characters
|
|
% of String.
|
|
%
|
|
% This is equivalent to the goal
|
|
%
|
|
% {Substring, Start, Count} = last(matches(Regex, String)),
|
|
% Start + Count = length(String)
|
|
%
|
|
:- pred right_match(regex, string, string, int, int).
|
|
:- mode right_match(in(regex), in, out, out, out) is semidet.
|
|
|
|
% first_match(Regex, String, Substring, Start, Count)
|
|
% succeeds iff Regex matches some Substring of String,
|
|
% setting Substring, Start and Count to the maximal first
|
|
% such occurrence.
|
|
%
|
|
% This is equivalent to the goal
|
|
%
|
|
% {Substring, Start, Count} = head(matches(Regex, String))
|
|
%
|
|
:- pred first_match(regex, string, string, int, int).
|
|
:- mode first_match(in(regex), in, out, out, out) is semidet.
|
|
|
|
% exact_match(Regex, String)
|
|
% succeeds iff Regex exactly matches String.
|
|
%
|
|
:- pred exact_match(regex, string).
|
|
:- mode exact_match(in(regex), in ) is semidet.
|
|
|
|
% matches(Regex, String) = [{Substring, Start, Count}, ...]
|
|
% Regex exactly matches Substring = substring(String, Start, Count).
|
|
% None of the {Start, Count} regions will overlap and are in
|
|
% ascending order with respect to Start.
|
|
%
|
|
:- func matches(regex, string) = list({string, int, int}).
|
|
:- mode matches(in(regex), in ) = out is det.
|
|
|
|
% replace_first(Regex, Replacement, String)
|
|
% computes the string formed by replacing the maximal first match
|
|
% of Regex (if any) in String with Replacement.
|
|
%
|
|
:- func replace_first(regex, string, string) = string.
|
|
:- mode replace_first(in(regex), in, in ) = out is det.
|
|
|
|
% replace_all(Regex, Replacement, String)
|
|
% computes the string formed by replacing the maximal non-overlapping
|
|
% matches of Regex in String with Replacement.
|
|
%
|
|
:- func replace_all(regex, string, string) = string.
|
|
:- mode replace_all(in(regex), in, in ) = out is det.
|
|
|
|
% change_first(Regex, ChangeFn, String)
|
|
% computes the string formed by replacing the maximal first match
|
|
% of Regex (Substring, if any) in String with ChangeFn(Substring).
|
|
%
|
|
:- func change_first(regex, func(string) = string, string) = string.
|
|
:- mode change_first(in(regex), func(in ) = out is det, in ) = out is det.
|
|
|
|
% change_all(Regex, ChangeFn, String)
|
|
% computes the string formed by replacing the maximal non-overlapping
|
|
% matches of Regex, Substring, in String with ChangeFn(Substring).
|
|
%
|
|
:- func change_all(regex, func(string) = string, string) = string.
|
|
:- mode change_all(in(regex), func(in ) = out is det, in ) = out is det.
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
:- implementation.
|
|
|
|
:- import_module int, char, bool, require, std_util, pair, io.
|
|
|
|
|
|
|
|
% A compiled regex is represented as a lex lexer; both of the lexer
% type's parameters are instantiated to string here.
:- type regex == lexer(string, string).
|
|
|
|
% Local shorthand for lex's lexer_state with both type parameters
% instantiated to string.  This is the state that the matching
% predicates in this module pass to lex__read, lex__read_char and
% lex__offset_from_start.
:- type lexer_state == lexer_state(string, string).
|
|
|
|
|
|
|
|
% The states of the regex parser.
|
|
%
|
|
:- type parser_state
    --->    res(list(re))
                % Ordinary parsing: the next char is either an operator
                % or a literal.  The list is the stack of semi-parsed
                % regexes, most recent first.

    ;       esc(list(re))
                % The previous char was a backslash; the next char is
                % taken literally.

    ;       set1(list(re))
                % The opening `[' of a char set has just been seen.

    ;       set2(bool, list(re))
                % A `^' followed the opening `['; the next char is the
                % first member of the set.  The bool is `yes' iff the
                % set is complemented.

    ;       set3(bool, char, chars, list(re))
                % Inside a set with one pending char (the char argument)
                % that may yet turn out to start a range.  The chars
                % argument holds the members collected so far.

    ;       set4(bool, char, chars, list(re))
                % A `-' has just been seen after the pending char; the
                % next char is the other end-point of the range.

    ;       set5(bool, chars, list(re)).
                % A range has just been completed; the next char either
                % closes the set or becomes the new pending char.
|
|
|
|
% The possible semi-parsed regexes.
|
|
%
|
|
:- type re
    --->    re(regexp)
                % A regexp that has already been built (for example a
                % char set, `.', or the result of applying *, + or ?).

    ;       char(regexp)
                % A regexp built from a single literal char.

    ;       lpar
                % Marker pushed when `(' is seen; consumed by rpar.

    ;       alt(regexp).
                % Marker pushed when `|' is seen, carrying the regexp
                % for the left-hand side of the alternation.
|
|
|
|
% Shorthand for a list of chars; used for the members of a char set
% while the set is being parsed.
:- type chars == list(char).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
:- pragma memo(regex/1).

% Build a lexer with a single lexeme: the regexp compiled from Pattern,
% paired with the identity function.  The lexer reads its input with
% read_from_string.
regex(Pattern) = Lexer :-
    Lexeme = regexp(Pattern) - id,
    Lexer = init([Lexeme], read_from_string).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
:- pragma memo(regexp/1).

% Fold compile_regex over the chars of Pattern, starting from the
% ordinary parsing state with an empty stack, then turn the final
% parser state into a regexp (or an error) with finish_regex.
regexp(Pattern) = Regexp :-
    InitialState = res([]),
    FinalState = foldl(compile_regex(Pattern), Pattern, InitialState),
    Regexp = finish_regex(Pattern, FinalState).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
:- func compile_regex(string, char, parser_state) = parser_state.

% compile_regex(Pattern, Char, State0) = State
%
% One step of the regex parser: consume Char in parser state State0,
% giving State.  Pattern is the complete regex string and is only used
% when constructing error messages.

% res: ordinary parsing.  Operators are dispatched to their handlers;
% anything else is pushed as a single-char regexp.
%
compile_regex(Pattern, Char, res(Stack)) = State :-
    ( if Char = ('.') then
        State = res([re(dot) | Stack])
    else if Char = ('|') then
        State = res(alt(Pattern, Stack))
    else if Char = ('*') then
        State = res(star(Pattern, Stack))
    else if Char = ('+') then
        State = res(plus(Pattern, Stack))
    else if Char = ('?') then
        State = res(opt(Pattern, Stack))
    else if Char = ('(') then
        State = res([lpar | Stack])
    else if Char = (')') then
        State = res(rpar(Pattern, Stack))
    else if Char = ('[') then
        State = set1(Stack)
    else if Char = (']') then
        State = regex_error("`]' without opening `['", Pattern)
    else if Char = ('\\') then
        State = esc(Stack)
    else
        State = res([char(re(Char)) | Stack])
    ).

% esc: the previous char was a backslash, so Char is taken literally
% whatever it is.
%
compile_regex(_, Char, esc(Stack)) = State :-
    Literal = char(re(Char)),
    State = res([Literal | Stack]).

% set1: the opening `[' has just been seen.  A `^' marks the set as
% complemented; any other char (including `]') is the first member.
%
compile_regex(_, Char, set1(Stack)) = State :-
    ( if Char = ('^') then
        State = set2(yes, Stack)
    else
        State = set3(no, Char, [], Stack)
    ).

% set2: Char is the first member of the set (it may be `]').
%
compile_regex(_, Char, set2(Complement, Stack)) = State :-
    State = set3(Complement, Char, [], Stack).

% set3: Pending is a member that has not been committed yet.
% `]' closes the set, `-' starts a range from Pending, and any other
% char commits Pending and becomes the new pending char.
%
compile_regex(_, Char, set3(Complement, Pending, Members, Stack)) = State :-
    ( if Char = (']') then
        Set = char_set(Complement, [Pending | Members]),
        State = res([Set | Stack])
    else if Char = ('-') then
        State = set4(Complement, Pending, Members, Stack)
    else
        State = set3(Complement, Char, [Pending | Members], Stack)
    ).

% set4: a `-' followed Pending, so Char is the other end of the range.
%
compile_regex(_, Char, set4(Complement, Pending, Members, Stack)) = State :-
    State = set5(Complement, push_range(Pending, Char, Members), Stack).

% set5: a range has just been completed.  `]' closes the set; any
% other char becomes the new pending char.
%
compile_regex(_, Char, set5(Complement, Members, Stack)) = State :-
    ( if Char = (']') then
        State = res([char_set(Complement, Members) | Stack])
    else
        State = set3(Complement, Char, Members, Stack)
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Turn a list of chars into an any or anybut.
|
|
%
|
|
:- func char_set(bool, chars) = re.

% char_set(Complement, Chars): if Complement is `no' the result matches
% any char in Chars; if it is `yes' the result matches any char that is
% neither in Chars nor a newline.
char_set(Complement, Chars) = RE :-
    (
        Complement = no,
        RE = re(any(from_char_list(Chars)))
    ;
        Complement = yes,
        Excluded = [('\n') | Chars],
        RE = re(anybut(from_char_list(Excluded)))
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Push a range of chars onto a char list.
|
|
%
|
|
:- func push_range(char, char, chars) = chars.

% push_range(A, B, Chars): prepend to Chars every char whose code lies
% between the codes of A and B inclusive.  The end-points may be given
% in either order.
push_range(A, B, Chars) = RangeChars ++ Chars :-
    CodeA = to_int(A),
    CodeB = to_int(B),
    ( if CodeA =< CodeB then
        Lo = CodeA,
        Hi = CodeB
    else
        Lo = CodeB,
        Hi = CodeA
    ),
    Codes = Lo `..` Hi,
    RangeChars = map(int_to_char, Codes).
|
|
|
|
|
|
:- func int_to_char(int) = char.

% int_to_char(Code): the char whose code is Code.  Calls func_error
% if no char has that code.
int_to_char(Code) = Char :-
    ( if char__to_int(Char0, Code) then
        Char = Char0
    else
        Char = func_error("regex__int_to_char")
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
:- func finish_regex(string, parser_state) = regexp.

% finish_regex(Pattern, FinalState)
%
% Called once every char of Pattern has been consumed.  Turns the final
% parser state into the finished regexp, or reports an error (via
% regex_error) if the pattern ended part-way through a construct.

% The pattern ended straight after a backslash.
%
finish_regex(S, esc(_)) =
    regex_error("expected char after `\\'", S).

% The pattern ended inside a char set, i.e. in any of the set states.
%
finish_regex(S, set1(_)) =
    regex_error("`[' without closing `]'", S).

finish_regex(S, set2(_, _)) =
    regex_error("`[' without closing `]'", S).

finish_regex(S, set3(_, _, _, _)) =
    regex_error("`[' without closing `]'", S).

finish_regex(S, set4(_, _, _, _)) =
    regex_error("`[' without closing `]'", S).

finish_regex(S, set5(_, _, _)) =
    regex_error("`[' without closing `]'", S).

% The pattern ended in the ordinary state.  We treat the whole pattern
% as though it had been wrapped in one extra pair of parentheses: an
% lpar is added at the bottom of the stack and rpar is asked to close
% it.  If exactly one item is left, that is the finished regex; if more
% are left then some `(' in the pattern was never closed.
%
% A pattern ending in `|' is checked for first.  Without that check
% rpar would report "`)' immediately following `|'" even though the
% pattern contains no `)' at that point.
%
finish_regex(S, res(REs)) =
    ( if REs = [alt(_) | _] then
        regex_error("`|' without following regex", S)
    else if rpar(S, REs ++ [lpar]) = [RE] then
        extract_regex(RE)
    else
        regex_error("`(' without closing `)'", S)
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% The *, + and ? regexes.
|
|
%
|
|
:- func star(string, list(re)) = list(re).

% star(Pattern, Stack): replace the regexp on top of Stack with its
% Kleene closure.  It is an error if the top of the stack is not a
% regexp (the stack is empty, or its top is an lpar or alt marker).
star(Pattern, Stack) = NewStack :-
    ( if
        Stack = [Top | Below],
        ( Top = re(Operand) ; Top = char(Operand) )
    then
        NewStack = [re(*(Operand)) | Below]
    else
        NewStack = regex_error("`*' without preceding regex", Pattern)
    ).
|
|
|
|
:- func plus(string, list(re)) = list(re).

% plus(Pattern, Stack): replace the regexp on top of Stack with the
% regexp matching one or more occurrences of it.  It is an error if the
% top of the stack is not a regexp (the stack is empty, or its top is
% an lpar or alt marker).
plus(Pattern, Stack) = NewStack :-
    ( if
        Stack = [Top | Below],
        ( Top = re(Operand) ; Top = char(Operand) )
    then
        NewStack = [re(+(Operand)) | Below]
    else
        NewStack = regex_error("`+' without preceding regex", Pattern)
    ).
|
|
|
|
:- func opt(string, list(re)) = list(re).

% opt(Pattern, Stack): replace the regexp on top of Stack with the
% regexp matching zero or one occurrence of it.  It is an error if the
% top of the stack is not a regexp (the stack is empty, or its top is
% an lpar or alt marker).
opt(Pattern, Stack) = NewStack :-
    ( if
        Stack = [Top | Below],
        ( Top = re(Operand) ; Top = char(Operand) )
    then
        NewStack = [re(?(Operand)) | Below]
    else
        NewStack = regex_error("`?' without preceding regex", Pattern)
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Handle an alternation sign.
|
|
%
|
|
:- func alt(string, list(re)) = list(re).

% alt(Pattern, Stack): process a `|'.
%
% Everything on the stack back to the nearest lpar or alt marker (or
% the bottom of the stack) is concatenated into a single regexp and
% replaced by an alt marker carrying it.  If an alt marker is found,
% the new marker carries the alternation of the old marker's regexp
% and the concatenated one.  The cases are tried in the order given.
alt(Pattern, Stack) = NewStack :-
    ( if Stack = [alt(_) | _] then
        NewStack = regex_error("`|' immediately following `|'", Pattern)

    else if Stack = [lpar | _] then
        NewStack = regex_error("`|' immediately following `('", Pattern)

    else if Stack = [Right, alt(Left) | Below] then
        % Fold the right-hand side into the pending alternation.
        NewStack = [alt(Left or extract_regex(Right)) | Below]

    else if Stack = [Only, lpar | Below] then
        % A single regexp since the last `(': keep the lpar below the
        % new alt marker.
        NewStack = [alt(extract_regex(Only)), lpar | Below]

    else if Stack = [Later, Earlier | Below] then
        % Concatenate the top two regexps and try again.
        Joined = re(extract_regex(Earlier) ++ extract_regex(Later)),
        NewStack = alt(Pattern, [Joined | Below])

    else if Stack = [Single] then
        NewStack = [alt(extract_regex(Single))]

    else
        NewStack = regex_error("`|' without preceding regex", Pattern)
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Handle a closing parenthesis.
|
|
%
|
|
:- func rpar(string, list(re)) = list(re).

% rpar(Pattern, Stack): process a `)'.
%
% Everything on the stack back to the nearest lpar is reduced to a
% single item, which replaces the lpar and all that was above it.
% Pending alt markers are resolved into alternations on the way.
% An empty pair of parentheses gives the empty regex.  The cases are
% tried in the order given.
rpar(Pattern, Stack) = NewStack :-
    ( if Stack = [alt(_) | _] then
        NewStack = regex_error("`)' immediately following `|'", Pattern)

    else if Stack = [Right, alt(Left) | Below] then
        % Resolve the pending alternation and carry on.
        Either = re(Left or extract_regex(Right)),
        NewStack = rpar(Pattern, [Either | Below])

    else if Stack = [lpar | Below] then
        % "()" - nothing between the parentheses.
        NewStack = [nil | Below]

    else if Stack = [Inner, lpar | Below] then
        % Exactly one item since the `(': drop the lpar.
        NewStack = [Inner | Below]

    else if Stack = [Later, Earlier | Below] then
        % Concatenate the top two regexps and try again.
        Joined = re(extract_regex(Earlier) ++ extract_regex(Later)),
        NewStack = rpar(Pattern, [Joined | Below])

    else
        NewStack = regex_error("`)' without opening `('", Pattern)
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
:- func extract_regex(re) = regexp.

% extract_regex(RE): the regexp carried by an re(_) or char(_) item.
% Calls func_error for the alt and lpar markers.
extract_regex(RE) = Regexp :-
    (
        RE = re(Regexp)
    ;
        RE = char(Regexp)
    ;
        RE = alt(_),
        Regexp = func_error("regex__extract_regex")
    ;
        RE = lpar,
        Regexp = func_error("regex__extract_regex")
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Throw a wobbly.
|
|
%
|
|
:- func regex_error(string, string) = _.
:- mode regex_error(in, in) = out is erroneous.

% regex_error(Msg, Pattern): never returns.  Calls func_error with a
% message of the form
%
%   regex: <Msg> in "<Pattern>"
regex_error(Msg, Pattern) = Result :-
    Text = append_list(["regex: ", Msg, " in \"", Pattern, "\""]),
    Result = func_error(Text).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% The empty regex.
|
|
%
|
|
:- func nil = re.

% The regexp built from the empty string, wrapped as an ordinary
% stack item.
nil = re(EmptyRegexp) :-
    EmptyRegexp = re("").
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Start a lexer state on String and read once.  The match succeeds
% only if that first read returns ok(Substring); the match then starts
% at offset 0 and its length is the length of Substring.
left_match(Regex, String, Substring, Start, Count) :-
    State0 = start(Regex, unsafe_promise_unique(String)),
    lex__read(Result, State0, _State),
    Result = ok(Substring),
    Start = 0,
    Count = length(Substring).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% We have to keep trying successive suffixes of String until
|
|
% we find a complete match.
|
|
%
|
|
% Search the suffixes of String starting from offset 0 (see
% right_match_2); the length of the match is the length of the suffix
% that was found.
right_match(Regex, String, Substring, Start, Count) :-
    Length = length(String),
    right_match_2(Regex, String, 0, Length, Substring, Start),
    Count = length(Substring).
|
|
|
|
|
|
:- pred right_match_2(regex, string, int, int, string, int).
:- mode right_match_2(in(regex), in, in, in, out, out) is semidet.

% right_match_2(Regex, String, Offset, Length, Substring, Start)
%
% Try the suffix of String beginning at Offset, then at Offset + 1, and
% so on, stopping at the first suffix that Regex matches exactly.
% Fails once Offset exceeds Length.
right_match_2(Regex, String, Offset, Length, Substring, Start) :-
    Offset =< Length,
    Suffix = substring(String, Offset, max_int),
    ( if not exact_match(Regex, Suffix) then
        right_match_2(Regex, String, Offset + 1, Length, Substring, Start)
    else
        Substring = Suffix,
        Start = Offset
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Start a lexer state on String and let first_match_2 find the first
% successful read; the length of the match is the length of Substring.
first_match(Regex, String, Substring, Start, Count) :-
    State0 = start(Regex, unsafe_promise_unique(String)),
    first_match_2(Substring, Start, State0),
    Count = length(Substring).
|
|
|
|
|
|
:- pred first_match_2(string, int, lexer_state).
:- mode first_match_2(out, out, di ) is semidet.

% first_match_2(Substring, Start, State0)
%
% Note the current offset, then read once.  If the read returns ok,
% that is the match and it starts at the offset noted.  If the read
% returns an error, try again from the state the read left behind.
% Any other result (such as eof) is not covered by the switch, so the
% predicate fails.
first_match_2(Substring, Start, State0) :-
    lex__offset_from_start(Offset, State0, State1),
    lex__read(Result, State1, State2),
    (
        Result = ok(Substring),
        Start = Offset
    ;
        Result = error(_, _),
        first_match_2(Substring, Start, State2)
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Start a lexer state on String and read once; succeed only if that
% read returns ok with the whole of String as the matched text.
exact_match(Regex, String) :-
    State0 = start(Regex, unsafe_promise_unique(String)),
    lex__read(Result, State0, _State),
    Result = ok(String).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Start a lexer state on String and collect the matches with matches_2.
% The initial "end of last match" is -1, which no real offset can equal,
% so an empty match at offset 0 is not discarded.
matches(Regex, String) = matches_2(Length, NoLastEnd, State0) :-
    Length = length(String),
    NoLastEnd = -1,
    State0 = start(Regex, unsafe_promise_unique(String)).
|
|
|
|
|
|
:- func matches_2(int, offset, lexer_state) = list({string, int, int}).
:- mode matches_2(in, in, di) = out is det.

% matches_2(Length, LastEnd, State0)
%
% Length is the length of the whole input string and LastEnd is the
% offset recorded after the previous read (-1 on the first call).
%
% Each step notes the current offset (Start), reads once, and notes the
% offset again (End):
%
%   - eof: there are no more matches;
%   - error: carry on from the state the read left behind;
%   - ok: see below.
%
% For an ok result with Count = End - Start:
%
%   - an empty match (Count = 0) at the point where the last read ended
%     is discarded; we skip one char and carry on, or finish if no char
%     can be read;
%   - otherwise the match is reported.  If End is the end of the input
%     we then stop (this avoids ".*" giving two matches for "foo": "foo"
%     and the notional null string at the end).  If the match was empty
%     we must skip one char before carrying on.  Otherwise we carry on
%     directly.
matches_2(Length, LastEnd, State0) = Matches :-
    lex__offset_from_start(Start, State0, State1),
    lex__read(Result, State1, State2),
    lex__offset_from_start(End, State2, State3),
    (
        Result = eof,
        Matches = []
    ;
        Result = error(_, _),
        Matches = matches_2(Length, End, State3)
    ;
        Result = ok(Substring),
        Count = End - Start,
        ( if Count = 0, Start = LastEnd then
            Matches = matches_after_skip(Length, End, State3)
        else
            ( if End = Length then
                Rest = []
            else if Count = 0 then
                Rest = matches_after_skip(Length, End, State3)
            else
                Rest = matches_2(Length, End, State3)
            ),
            Matches = [{Substring, Start, Count} | Rest]
        )
    ).

% matches_after_skip(Length, End, State0)
%
% Read one char and carry on matching from there with End as the
% "last end" offset; if no char can be read, there are no more matches.
:- func matches_after_skip(int, offset, lexer_state) =
    list({string, int, int}).
:- mode matches_after_skip(in, in, di) = out is det.

matches_after_skip(Length, End, State0) =
    ( if lex__read_char(ok(_), State0, State1) then
        matches_2(Length, End, State1)
    else
        []
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Implemented with change_first, using a change function that ignores
% the matched text and always returns Replacement.
replace_first(Regex, Replacement, String) = Result :-
    AlwaysReplacement = ( func(_) = Replacement ),
    Result = change_first(Regex, AlwaysReplacement, String).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Implemented with change_all, using a change function that ignores
% the matched text and always returns Replacement.
replace_all(Regex, Replacement, String) = Result :-
    AlwaysReplacement = ( func(_) = Replacement ),
    Result = change_all(Regex, AlwaysReplacement, String).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% If Regex has a first match in String, the result is the text before
% the match, then ChangeFn applied to the matched text, then the text
% after the match.  If there is no match, String is returned unchanged.
change_first(Regex, ChangeFn, String) = Result :-
    ( if first_match(Regex, String, Substring, Start, Count) then
        Before = substring(String, 0, Start),
        After = substring(String, Start + Count, max_int),
        Result = Before ++ ChangeFn(Substring) ++ After
    else
        Result = String
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Find all the matches of Regex in String, let change_all_2 build the
% list of unchanged and changed pieces, and join the pieces together.
change_all(Regex, ChangeFn, String) = Result :-
    Matches = matches(Regex, String),
    Pieces = change_all_2(String, ChangeFn, 0, Matches),
    Result = append_list(Pieces).
|
|
|
|
|
|
:- func change_all_2(string, func(string) = string, int,
    list({string, int, int})) = list(string).

% change_all_2(String, ChangeFn, I, Matches)
%
% I is the offset in String up to which the output has already been
% produced.  For each match the result holds the unchanged text between
% I and the start of the match, followed by ChangeFn applied to the
% matched text.  When the matches run out, the rest of String from I
% onwards is the final piece.
change_all_2(String, ChangeFn, I, Matches) = Pieces :-
    (
        Matches = [],
        Pieces = [substring(String, I, max_int)]
    ;
        Matches = [{Substring, Start, Count} | LaterMatches],
        Unchanged = substring(String, I, Start - I),
        Changed = ChangeFn(Substring),
        LaterPieces =
            change_all_2(String, ChangeFn, Start + Count, LaterMatches),
        Pieces = [Unchanged, Changed | LaterPieces]
    ).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
%-----------------------------------------------------------------------------%
|