diff --git a/library/string.m b/library/string.m index c65e857f8..12bb29177 100644 --- a/library/string.m +++ b/library/string.m @@ -1069,19 +1069,26 @@ :- func rstrip_pred(pred(char)::in(pred(in) is semidet), string::in) = (string::out) is det. - % replace(String0, Search, Replace, String): + % replace(String0, Pattern, Subst, String): % - % Replace replaces the first occurrence of Search in String0 - % with Replace to give String. It fails if Search does not occur - % in String0. + % Replaces the first occurrence of Pattern in String0 with Subst to give + % String. Fails if Pattern does not occur in String0. % :- pred replace(string::in, string::in, string::in, string::uo) is semidet. - % replace_all(String0, Search, Replace, String): + % replace_all(String0, Pattern, Subst, String): % - % Replaces any occurrences of Search in String0 with Replace to give + % Replaces any occurrences of Pattern in String0 with Subst to give % String. % + % If Pattern is the empty string then Subst is inserted at every point + % in String0 except between two code units in an encoding of a code point. + % For example, these are true: + % + % replace_all("", "", "|", "|") + % replace_all("a", "", "|", "|a|") + % replace_all("ab", "", "|", "|a|b|") + % :- func replace_all(string::in, string::in, string::in) = (string::uo) is det. :- pred replace_all(string::in, string::in, string::in, string::uo) is det. @@ -4718,10 +4725,7 @@ replace_all(S1, S2, S3) = S4 :- replace_all(Str, Pat, Subst, Result) :- ( if Pat = "" then - % XXX ILSEQ foldl cannot handle ill-formed sequences. - F = (func(C, L) = [char_to_string(C) ++ Subst | L]), - Foldl = foldl(F, Str, []), - Result = append_list([Subst | list.reverse(Foldl)]) + replace_all_empty_pat(Str, Subst, Result) else PatLength = length(Pat), replace_all_loop(Str, Pat, Subst, PatLength, 0, [], ReversedChunks), @@ -4729,6 +4733,50 @@ replace_all(Str, Pat, Subst, Result) :- Result = append_list(Chunks) ). +:- pred replace_all_empty_pat(string::in, string::in, string::uo) is det. + +replace_all_empty_pat(Str, Subst, Result) :- + % This implementation is not the most efficient, but it is not expected + % to be used much in practice. + to_code_unit_list(Subst, SubstCodes), + Codes0 = SubstCodes, + replace_all_empty_pat_loop(Str, SubstCodes, length(Str), Codes0, Codes), + ( if from_code_unit_list_allow_ill_formed(Codes, ResultPrime) then + Result = ResultPrime + else + unexpected($pred, "string.from_code_unit_list_allow_ill_formed failed") + ). + +:- pred replace_all_empty_pat_loop(string::in, list(int)::in, int::in, + list(int)::in, list(int)::out) is det. + +replace_all_empty_pat_loop(Str, Subst, Index, Codes0, Codes) :- + ( if unsafe_prev_index(Str, Index, PrevIndex, Char) then + char.to_int(Char, CharInt), + ( if CharInt =< 0x7f then + % Fast path for single code unit code points. + Codes1 = [CharInt | Codes0] + else + prepend_code_units(Str, PrevIndex, Index - 1, Codes0, Codes1) + ), + Codes2 = Subst ++ Codes1, + replace_all_empty_pat_loop(Str, Subst, PrevIndex, Codes2, Codes) + else + Codes = Codes0 + ). + +:- pred prepend_code_units(string::in, int::in, int::in, + list(int)::in, list(int)::out) is det. + +prepend_code_units(Str, FirstIndex, Index, Codes0, Codes) :- + unsafe_index_code_unit(Str, Index, Code), + Codes1 = [Code | Codes0], + ( if Index = FirstIndex then + Codes = Codes1 + else + prepend_code_units(Str, FirstIndex, Index - 1, Codes1, Codes) + ). + :- pred replace_all_loop(string::in, string::in, string::in, int::in, int::in, list(string)::in, list(string)::out) is det. diff --git a/tests/general/string_replace.exp b/tests/general/string_replace.exp index eab49079f..d633a7822 100644 --- a/tests/general/string_replace.exp +++ b/tests/general/string_replace.exp @@ -8,6 +8,8 @@ string__replace("aaa bbbb ccccc aaa", "", "**", Result) "**aaa bbbb ccccc aaa" string__replace("aßξ啕ßξ啕𐀀.", "", "**", Result) "**aßξ啕ßξ啕𐀀." +string__replace("[0x9f][0x98][0x80]😀", "", "**", Result) + "**[0x9f][0x98][0x80]😀" string__replace("aaa bbbb ccccc aaa", "aaa", "", Result) " bbbb ccccc aaa" string__replace("aaa bbbb ccccc aaa", "cc", "**", Result) @@ -28,6 +30,8 @@ string__replace_all("aaa bbbb ccccc aaa", "", "**", Result) "**a**a**a** **b**b**b**b** **c**c**c**c**c** **a**a**a**" string__replace_all("aßξ啕ßξ啕𐀀.", "", "**", Result) "**a**ß**ξ**啕**ß**ξ**啕**𐀀**.**" +string__replace_all("[0x9f][0x98][0x80]😀", "", "**", Result) + "**[0x9f]**[0x98]**[0x80]**😀**" string__replace_all("aaa bbbb ccccc aaa", "aaa", "", Result) " bbbb ccccc " string__replace_all("aaa bbbb ccccc aaa", "cc", "**", Result) diff --git a/tests/general/string_replace.exp2 b/tests/general/string_replace.exp2 new file mode 100644 index 000000000..082aa9237 --- /dev/null +++ b/tests/general/string_replace.exp2 @@ -0,0 +1,44 @@ +string__replace("", "a", "bc", Result) + FAIL! +string__replace("aaa bbbb ccccc aaa", "aab", "**", Result) + FAIL! +string__replace("aaa bbbb ccccc aaa", "aaaa", "**", Result) + FAIL! +string__replace("aaa bbbb ccccc aaa", "", "**", Result) + "**aaa bbbb ccccc aaa" +string__replace("aßξ啕ßξ啕𐀀.", "", "**", Result) + "**aßξ啕ßξ啕𐀀." +string__replace("[0xde00]😀", "", "**", Result) + "**[0xde00]😀" +string__replace("aaa bbbb ccccc aaa", "aaa", "", Result) + " bbbb ccccc aaa" +string__replace("aaa bbbb ccccc aaa", "cc", "**", Result) + "aaa bbbb **ccc aaa" +string__replace("aßξ啕ßξ啕𐀀.", "ßξ", "**", Result) + "a**啕ßξ啕𐀀." +string__replace("aßξ啕ßξ啕𐀀.", "ßξ", "★★", Result) + "a★★啕ßξ啕𐀀." +string__replace("aßξ啕ßξ啕𐀀.", "啕ßξ", "***", Result) + "aßξ***啕𐀀." +string__replace_all("", "a", "bc", Result) + "" +string__replace_all("aaa bbbb ccccc aaa", "aab", "**", Result) + "aaa bbbb ccccc aaa" +string__replace_all("aaa bbbb ccccc aaa", "aaaa", "**", Result) + "aaa bbbb ccccc aaa" +string__replace_all("aaa bbbb ccccc aaa", "", "**", Result) + "**a**a**a** **b**b**b**b** **c**c**c**c**c** **a**a**a**" +string__replace_all("aßξ啕ßξ啕𐀀.", "", "**", Result) + "**a**ß**ξ**啕**ß**ξ**啕**𐀀**.**" +string__replace_all("[0xde00]😀", "", "**", Result) + "**[0xde00]**😀**" +string__replace_all("aaa bbbb ccccc aaa", "aaa", "", Result) + " bbbb ccccc " +string__replace_all("aaa bbbb ccccc aaa", "cc", "**", Result) + "aaa bbbb ****c aaa" +string__replace_all("aßξ啕ßξ啕𐀀.", "ßξ", "**", Result) + "a**啕**啕𐀀." +string__replace_all("aßξ啕ßξ啕𐀀.", "ßξ", "★★", Result) + "a★★啕★★啕𐀀." +string__replace_all("aßξ啕ßξ啕𐀀.", "啕ßξ", "***", Result) + "aßξ***啕𐀀." diff --git a/tests/general/string_replace.m b/tests/general/string_replace.m index 2b905fe90..3dcfa2ae8 100644 --- a/tests/general/string_replace.m +++ b/tests/general/string_replace.m @@ -1,6 +1,11 @@ %---------------------------------------------------------------------------% % vim: ts=4 sw=4 et ft=mercury %---------------------------------------------------------------------------% +% +% The .exp file is for backends using UTF-8 string encoding. +% The .exp2 file is for backends using UTF-16 string encoding. +% +%---------------------------------------------------------------------------% :- module string_replace. @@ -14,6 +19,8 @@ :- implementation. +:- import_module char. +:- import_module int. :- import_module list. :- import_module string. @@ -22,6 +29,8 @@ main(!IO) :- Str = "aaa bbbb ccccc aaa", Str2 = "aßξ啕ßξ啕𐀀.", + Smiley = "😀", + Str3 = between(Smiley, 1, length(Smiley)) ++ Smiley, Tests = [ % pattern not in string {"", "a", "bc"}, @@ -31,6 +40,7 @@ main(!IO) :- % pattern is empty string {Str, "", "**"}, {Str2, "", "**"}, + {Str3, "", "**"}, % pattern in string {Str, "aaa", ""}, @@ -39,31 +49,58 @@ main(!IO) :- {Str2, "ßξ", "★★"}, {Str2, "啕ßξ", "***"} ], - list__foldl(test_replace, Tests, !IO), - list__foldl(test_replace_all, Tests, !IO). + list.foldl(test_replace, Tests, !IO), + list.foldl(test_replace_all, Tests, !IO). :- pred test_replace({string, string, string}::in, io::di, io::uo) is det. test_replace({Str, Pat, Subst}, !IO) :- - io__write_string("string__replace(\"" ++ Str ++ - "\", \"" ++ Pat ++ - "\", \"" ++ Subst ++ "\", Result) \n\t", !IO), - ( string__replace(Str, Pat, Subst, Result) -> - io__write(Result, !IO), - io__nl(!IO) - ; - io__write_string("FAIL!\n", !IO) + io.write_string("string__replace(\"", !IO), + write_string_debug(Str, !IO), + io.write_string("\", \"", !IO), + io.write_string(Pat, !IO), + io.write_string("\", \"", !IO), + io.write_string(Subst, !IO), + io.write_string("\", Result) \n\t", !IO), + ( if string.replace(Str, Pat, Subst, Result) then + io.write_string("\"", !IO), + write_string_debug(Result, !IO), + io.write_string("\"\n", !IO) + else + io.write_string("FAIL!\n", !IO) ). :- pred test_replace_all({string, string, string}::in, io::di, io::uo) is det. test_replace_all({Str, Pat, Subst}, !IO) :- - io__write_string("string__replace_all(\"" ++ Str ++ - "\", \"" ++ Pat ++ - "\", \"" ++ Subst ++ "\", Result) \n\t", !IO), - string__replace_all(Str, Pat, Subst, Result), - io__write(Result, !IO), - io__nl(!IO). + io.write_string("string__replace_all(\"", !IO), + write_string_debug(Str, !IO), + io.write_string("\", \"", !IO), + io.write_string(Pat, !IO), + io.write_string("\", \"", !IO), + io.write_string(Subst, !IO), + io.write_string("\", Result) \n\t", !IO), + io.write_string("\"", !IO), + string.replace_all(Str, Pat, Subst, Result), + write_string_debug(Result, !IO), + io.write_string("\"\n", !IO). -%---------------------------------------------------------------------------% -%---------------------------------------------------------------------------% +:- pred write_string_debug(string::in, io::di, io::uo) is det. + +write_string_debug(S, !IO) :- + write_string_debug_loop(S, 0, !IO). + +:- pred write_string_debug_loop(string::in, int::in, io::di, io::uo) is det. + +write_string_debug_loop(S, Index, !IO) :- + ( if string.index_next(S, Index, NextIndex, Char) then + ( if char.is_surrogate(Char) ; Char = '\ufffd' then + unsafe_index_code_unit(S, Index, Code), + io.format("[%#x]", [i(Code)], !IO) + else + io.write_char(Char, !IO) + ), + write_string_debug_loop(S, NextIndex, !IO) + else + true + ).