Make string.replace_all with empty pattern preserve ill-formed sequences.

library/string.m:
    Define behaviour of string.replace_all on ill-formed code unit
    sequences when the pattern is empty.

    Implement that behaviour.

    Use better variable names in documentation of string.replace and
    string.replace_all.

tests/general/string_replace.exp:
tests/general/string_replace.exp2:
tests/general/string_replace.m:
    Extend test case.

    Update code style.
This commit is contained in:
Peter Wang
2019-11-06 17:10:56 +11:00
parent 8a3404d59c
commit d2c3ede17d
4 changed files with 161 additions and 28 deletions

View File

@@ -1069,19 +1069,26 @@
:- func rstrip_pred(pred(char)::in(pred(in) is semidet), string::in)
= (string::out) is det.
% replace(String0, Search, Replace, String):
% replace(String0, Pattern, Subst, String):
%
% Replace replaces the first occurrence of Search in String0
% with Replace to give String. It fails if Search does not occur
% in String0.
% Replaces the first occurrence of Pattern in String0 with Subst to give
% String. Fails if Pattern does not occur in String0.
%
:- pred replace(string::in, string::in, string::in, string::uo) is semidet.
% replace_all(String0, Search, Replace, String):
% replace_all(String0, Pattern, Subst, String):
%
% Replaces any occurrences of Search in String0 with Replace to give
% Replaces any occurrences of Pattern in String0 with Subst to give
% String.
%
% If Pattern is the empty string then Subst is inserted at every point
% in String0 except between two code units in an encoding of a code point.
% For example, these are true:
%
% replace_all("", "", "|", "|")
% replace_all("a", "", "|", "|a|")
% replace_all("ab", "", "|", "|a|b|")
%
:- func replace_all(string::in, string::in, string::in) = (string::uo) is det.
:- pred replace_all(string::in, string::in, string::in, string::uo) is det.
@@ -4718,10 +4725,7 @@ replace_all(S1, S2, S3) = S4 :-
replace_all(Str, Pat, Subst, Result) :-
( if Pat = "" then
% XXX ILSEQ foldl cannot handle ill-formed sequences.
F = (func(C, L) = [char_to_string(C) ++ Subst | L]),
Foldl = foldl(F, Str, []),
Result = append_list([Subst | list.reverse(Foldl)])
replace_all_empty_pat(Str, Subst, Result)
else
PatLength = length(Pat),
replace_all_loop(Str, Pat, Subst, PatLength, 0, [], ReversedChunks),
@@ -4729,6 +4733,50 @@ replace_all(Str, Pat, Subst, Result) :-
Result = append_list(Chunks)
).
% replace_all_empty_pat(Str, Subst, Result):
%
% Implements replace_all for the case where the pattern is the empty string.
% The result is assembled as a list of code units and only converted back to
% a string at the very end, using the conversion that allows ill-formed
% sequences. Simplicity is favoured over speed here, as this case is not
% expected to be common.
%
:- pred replace_all_empty_pat(string::in, string::in, string::uo) is det.

replace_all_empty_pat(Str, Subst, Result) :-
    to_code_unit_list(Subst, SubstUnits),
    % The accumulator is seeded with one copy of Subst. The loop starts at
    % the end of Str and only ever prepends, so this seed ends up as the
    % trailing copy of Subst in the result.
    EndIndex = length(Str),
    replace_all_empty_pat_loop(Str, SubstUnits, EndIndex, SubstUnits,
        ResultUnits),
    ( if from_code_unit_list_allow_ill_formed(ResultUnits, Result0) then
        Result = Result0
    else
        unexpected($pred,
            "string.from_code_unit_list_allow_ill_formed failed")
    ).
% replace_all_empty_pat_loop(Str, Subst, Index, Codes0, Codes):
%
% Steps backwards through Str from code unit offset Index using
% unsafe_prev_index. Each step prepends to the accumulator the code units
% lying between the previous offset and Index, and then prepends Subst.
% When unsafe_prev_index fails, Codes is the accumulator built so far.
%
:- pred replace_all_empty_pat_loop(string::in, list(int)::in, int::in,
    list(int)::in, list(int)::out) is det.

replace_all_empty_pat_loop(Str, Subst, Index, Codes0, Codes) :-
    ( if unsafe_prev_index(Str, Index, PrevIndex, Char) then
        CharCode = char.to_int(Char),
        ( if CharCode > 0x7f then
            % Copy the code units PrevIndex .. Index - 1 straight out of Str
            % instead of re-encoding Char, so that whatever was stored in
            % the string is reproduced unchanged.
            prepend_code_units(Str, PrevIndex, Index - 1, Codes0, WithUnits)
        else
            % Shortcut: a code point up to 0x7f is taken to be a single
            % code unit whose value equals the code point.
            WithUnits = [CharCode | Codes0]
        ),
        WithSubst = list.append(Subst, WithUnits),
        replace_all_empty_pat_loop(Str, Subst, PrevIndex, WithSubst, Codes)
    else
        Codes = Codes0
    ).
% prepend_code_units(Str, FirstIndex, Index, Codes0, Codes):
%
% Codes is Codes0 with the code units of Str at offsets FirstIndex to Index
% (both inclusive) added at the front, in the order they occur in Str.
% The recursion counts Index down and stops only when it equals FirstIndex,
% so callers must pass FirstIndex =< Index.
%
:- pred prepend_code_units(string::in, int::in, int::in,
    list(int)::in, list(int)::out) is det.

prepend_code_units(Str, FirstIndex, Index, Codes0, Codes) :-
    unsafe_index_code_unit(Str, Index, Unit),
    ( if Index = FirstIndex then
        Codes = [Unit | Codes0]
    else
        prepend_code_units(Str, FirstIndex, Index - 1, [Unit | Codes0],
            Codes)
    ).
:- pred replace_all_loop(string::in, string::in, string::in,
int::in, int::in, list(string)::in, list(string)::out) is det.

View File

@@ -8,6 +8,8 @@ string__replace("aaa bbbb ccccc aaa", "", "**", Result)
"**aaa bbbb ccccc aaa"
string__replace("aßξ啕ßξ啕𐀀.", "", "**", Result)
"**aßξ啕ßξ啕𐀀."
string__replace("[0x9f][0x98][0x80]😀", "", "**", Result)
"**[0x9f][0x98][0x80]😀"
string__replace("aaa bbbb ccccc aaa", "aaa", "", Result)
" bbbb ccccc aaa"
string__replace("aaa bbbb ccccc aaa", "cc", "**", Result)
@@ -28,6 +30,8 @@ string__replace_all("aaa bbbb ccccc aaa", "", "**", Result)
"**a**a**a** **b**b**b**b** **c**c**c**c**c** **a**a**a**"
string__replace_all("aßξ啕ßξ啕𐀀.", "", "**", Result)
"**a**ß**ξ**啕**ß**ξ**啕**𐀀**.**"
string__replace_all("[0x9f][0x98][0x80]😀", "", "**", Result)
"**[0x9f]**[0x98]**[0x80]**😀**"
string__replace_all("aaa bbbb ccccc aaa", "aaa", "", Result)
" bbbb ccccc "
string__replace_all("aaa bbbb ccccc aaa", "cc", "**", Result)

View File

@@ -0,0 +1,44 @@
string__replace("", "a", "bc", Result)
FAIL!
string__replace("aaa bbbb ccccc aaa", "aab", "**", Result)
FAIL!
string__replace("aaa bbbb ccccc aaa", "aaaa", "**", Result)
FAIL!
string__replace("aaa bbbb ccccc aaa", "", "**", Result)
"**aaa bbbb ccccc aaa"
string__replace("aßξ啕ßξ啕𐀀.", "", "**", Result)
"**aßξ啕ßξ啕𐀀."
string__replace("[0xde00]😀", "", "**", Result)
"**[0xde00]😀"
string__replace("aaa bbbb ccccc aaa", "aaa", "", Result)
" bbbb ccccc aaa"
string__replace("aaa bbbb ccccc aaa", "cc", "**", Result)
"aaa bbbb **ccc aaa"
string__replace("aßξ啕ßξ啕𐀀.", "ßξ", "**", Result)
"a**啕ßξ啕𐀀."
string__replace("aßξ啕ßξ啕𐀀.", "ßξ", "★★", Result)
"a★★啕ßξ啕𐀀."
string__replace("aßξ啕ßξ啕𐀀.", "啕ßξ", "***", Result)
"aßξ***啕𐀀."
string__replace_all("", "a", "bc", Result)
""
string__replace_all("aaa bbbb ccccc aaa", "aab", "**", Result)
"aaa bbbb ccccc aaa"
string__replace_all("aaa bbbb ccccc aaa", "aaaa", "**", Result)
"aaa bbbb ccccc aaa"
string__replace_all("aaa bbbb ccccc aaa", "", "**", Result)
"**a**a**a** **b**b**b**b** **c**c**c**c**c** **a**a**a**"
string__replace_all("aßξ啕ßξ啕𐀀.", "", "**", Result)
"**a**ß**ξ**啕**ß**ξ**啕**𐀀**.**"
string__replace_all("[0xde00]😀", "", "**", Result)
"**[0xde00]**😀**"
string__replace_all("aaa bbbb ccccc aaa", "aaa", "", Result)
" bbbb ccccc "
string__replace_all("aaa bbbb ccccc aaa", "cc", "**", Result)
"aaa bbbb ****c aaa"
string__replace_all("aßξ啕ßξ啕𐀀.", "ßξ", "**", Result)
"a**啕**啕𐀀."
string__replace_all("aßξ啕ßξ啕𐀀.", "ßξ", "★★", Result)
"a★★啕★★啕𐀀."
string__replace_all("aßξ啕ßξ啕𐀀.", "啕ßξ", "***", Result)
"aßξ***啕𐀀."

View File

@@ -1,6 +1,11 @@
%---------------------------------------------------------------------------%
% vim: ts=4 sw=4 et ft=mercury
%---------------------------------------------------------------------------%
%
% The .exp file is for backends using UTF-8 string encoding.
% The .exp2 file is for backends using UTF-16 string encoding.
%
%---------------------------------------------------------------------------%
:- module string_replace.
@@ -14,6 +19,8 @@
:- implementation.
:- import_module char.
:- import_module int.
:- import_module list.
:- import_module string.
@@ -22,6 +29,8 @@
main(!IO) :-
Str = "aaa bbbb ccccc aaa",
Str2 = "aßξ啕ßξ啕𐀀.",
Smiley = "😀",
Str3 = between(Smiley, 1, length(Smiley)) ++ Smiley,
Tests = [
% pattern not in string
{"", "a", "bc"},
@@ -31,6 +40,7 @@ main(!IO) :-
% pattern is empty string
{Str, "", "**"},
{Str2, "", "**"},
{Str3, "", "**"},
% pattern in string
{Str, "aaa", ""},
@@ -39,31 +49,58 @@ main(!IO) :-
{Str2, "ßξ", "★★"},
{Str2, "啕ßξ", "***"}
],
list__foldl(test_replace, Tests, !IO),
list__foldl(test_replace_all, Tests, !IO).
list.foldl(test_replace, Tests, !IO),
list.foldl(test_replace_all, Tests, !IO).
% test_replace({Str, Pat, Subst}, !IO):
%
% Prints the string__replace call being tested for one {Str, Pat, Subst}
% tuple, then either the resulting string or "FAIL!" if the call fails.
% NOTE(review): this span of the diff view contains both the removed body
% (io__ calls, `->` / `;`) and the added body (io. calls, if-then-else,
% write_string_debug); they are alternatives, not one clause.
%
:- pred test_replace({string, string, string}::in, io::di, io::uo) is det.
test_replace({Str, Pat, Subst}, !IO) :-
io__write_string("string__replace(\"" ++ Str ++
"\", \"" ++ Pat ++
"\", \"" ++ Subst ++ "\", Result) \n\t", !IO),
( string__replace(Str, Pat, Subst, Result) ->
io__write(Result, !IO),
io__nl(!IO)
;
io__write_string("FAIL!\n", !IO)
io.write_string("string__replace(\"", !IO),
write_string_debug(Str, !IO),
io.write_string("\", \"", !IO),
io.write_string(Pat, !IO),
io.write_string("\", \"", !IO),
io.write_string(Subst, !IO),
io.write_string("\", Result) \n\t", !IO),
( if string.replace(Str, Pat, Subst, Result) then
io.write_string("\"", !IO),
write_string_debug(Result, !IO),
io.write_string("\"\n", !IO)
else
io.write_string("FAIL!\n", !IO)
).
% test_replace_all({Str, Pat, Subst}, !IO):
%
% Prints the string__replace_all call being tested for one
% {Str, Pat, Subst} tuple, followed by the resulting string.
% NOTE(review): this span of the diff view contains both the removed body
% (io__ calls) and the added body (io. calls with write_string_debug);
% they are alternatives, not one clause.
%
:- pred test_replace_all({string, string, string}::in, io::di, io::uo) is det.
test_replace_all({Str, Pat, Subst}, !IO) :-
io__write_string("string__replace_all(\"" ++ Str ++
"\", \"" ++ Pat ++
"\", \"" ++ Subst ++ "\", Result) \n\t", !IO),
string__replace_all(Str, Pat, Subst, Result),
io__write(Result, !IO),
io__nl(!IO).
io.write_string("string__replace_all(\"", !IO),
write_string_debug(Str, !IO),
io.write_string("\", \"", !IO),
io.write_string(Pat, !IO),
io.write_string("\", \"", !IO),
io.write_string(Subst, !IO),
io.write_string("\", Result) \n\t", !IO),
io.write_string("\"", !IO),
string.replace_all(Str, Pat, Subst, Result),
write_string_debug(Result, !IO),
io.write_string("\"\n", !IO).
%---------------------------------------------------------------------------%
%---------------------------------------------------------------------------%
% write_string_debug(S, !IO):
%
% Writes S to the current output stream, starting from offset 0, using
% write_string_debug_loop so that surrogates and U+FFFD appear as
% bracketed hexadecimal code units instead of being written as characters.
%
:- pred write_string_debug(string::in, io::di, io::uo) is det.

write_string_debug(S, !IO) :-
    StartIndex = 0,
    write_string_debug_loop(S, StartIndex, !IO).
% write_string_debug_loop(S, Index, !IO):
%
% Writes S from offset Index onwards, one string.index_next step at a time.
% Ordinary characters are written as they are. When the character returned
% is a surrogate or U+FFFD, the code unit stored at Index is written
% instead, formatted as "[0x..]".
% NOTE(review): presumably these are the values index_next reports for
% code units that are not part of a well-formed sequence; confirm against
% the string module documentation.
%
:- pred write_string_debug_loop(string::in, int::in, io::di, io::uo) is det.

write_string_debug_loop(S, Index, !IO) :-
    ( if string.index_next(S, Index, NextIndex, Char) then
        ( if
            not char.is_surrogate(Char),
            Char \= '\ufffd'
        then
            io.write_char(Char, !IO)
        else
            string.unsafe_index_code_unit(S, Index, Code),
            io.format("[%#x]", [i(Code)], !IO)
        ),
        write_string_debug_loop(S, NextIndex, !IO)
    else
        true
    ).