mirror of
https://github.com/Mercury-Language/mercury.git
synced 2026-04-15 01:13:30 +00:00
633 lines
26 KiB
Mathematica
633 lines
26 KiB
Mathematica
%---------------------------------------------------------------------------%
|
|
% vim: ft=mercury ts=4 sw=4 et
|
|
%---------------------------------------------------------------------------%
|
|
% Copyright (C) 2023-2026 The Mercury team.
|
|
% This file is distributed under the terms specified in COPYING.LIB.
|
|
%---------------------------------------------------------------------------%
|
|
%
|
|
% File: edit_distance.m.
|
|
% Stability: high.
|
|
%
|
|
% This module computes the edit distance between two sequences of items.
|
|
% Its job is both similar to, and distinct from, the job of edit_seq.m.
|
|
% It is similar in that both modules work out the simplest, cheapest way
|
|
% to transform one sequence into another. It is distinct because the two
|
|
% modules aim to solve different problems.
|
|
%
|
|
% - edit_seq.m aims to solve the problem of displaying the difference
|
|
% between two given sequences in a way that makes their differences
|
|
% as easy to understand as possible.
|
|
%
|
|
% - edit_distance.m aims to solve the problem of finding, in a pool of
|
|
% candidate sequences, the candidate that is closest to a given query
|
|
% sequence.
|
|
%
|
|
% Doing a good job with the second problem requires a mechanism that
|
|
% allows callers to specify that a transposition of two elements (such as
|
|
% replacing "bc" with "cb", thus transforming e.g. "abcd" into "acbd")
|
|
% has a different cost than deleting one element in one place in the sequence
|
|
% and inserting it back at another place. This mechanism does not help
|
|
% with the first problem at all (since the simplest way to display
|
|
% a transposition *is* as a delete/insert pair), and in fact its presence
|
|
% in the system would unnecessarily complicate the algorithm.
|
|
%
|
|
% Technically, this module computes Damerau-Levenshtein distances,
|
|
% while edit_seq.m computes Levenshtein distances. (The difference between
|
|
% the two is that only the former considers transpositions to be single
|
|
% operations.)
|
|
%
|
|
%---------------------------------------------------------------------------%
|
|
%---------------------------------------------------------------------------%
|
|
|
|
:- module edit_distance.
|
|
:- interface.
|
|
|
|
:- import_module char.
|
|
:- import_module list.
|
|
|
|
%---------------------------------------------------------------------------%
|
|
%---------------------------------------------------------------------------%
|
|
|
|
:- type edit_params(T)
|
|
---> edit_params(
|
|
% The cost of delete, insert, replace and transpose operations
|
|
% respectively. Only the *relative* values of the costs matter;
|
|
% if these are fixed, their *absolute* values are irrelevant
|
|
% (unless they are so high that they cause arithmetic
|
|
% overflows).
|
|
%
|
|
% For replacement operations, the cost may depend on what
|
|
% is being replaced by what. The intended use case is
|
|
% specifying that replacements that change a letter into
|
|
% the same letter in a different case (as char.to_lower
|
|
% or char.to_upper would do) are cheaper than other kinds of
|
|
% replacements.
|
|
cost_of_delete :: uint,
|
|
cost_of_insert :: uint,
|
|
cost_of_replace :: (func(T, T) = uint),
|
|
cost_of_transpose :: uint
|
|
).
|
|
|
|
% find_edit_distance(Params, SeqA, SeqB, Distance):
|
|
%
|
|
% Compute Distance as the sum of the costs of the operations
|
|
% needed to transform SeqA into SeqB, where the cost of each kind of
|
|
% edit operation is specified by Params.
|
|
%
|
|
:- pred find_edit_distance(edit_params(T)::in, list(T)::in, list(T)::in,
|
|
uint::out) is det.
|
|
|
|
%---------------------------------------------------------------------------%
|
|
|
|
% find_closest_seqs(Params, SourceSeq, TargetSeqs,
|
|
% BestEditDistance, HeadBestCloseSeq, TailBestCloseSeqs):
|
|
%
|
|
% Given a source sequence SourceSeq, find the sequence in TargetSeqs
|
|
% that has the best (meaning smallest) edit distance from SourceSeq.
|
|
% Return that target sequence as HeadBestCloseSeq, and its edit distance
|
|
% in BestEditDistance. If there are any other sequences in TargetSeqs
|
|
% that also have the same edit distance from SourceSeq, return them
|
|
% in TailBestCloseSeqs. [HeadBestCloseSeq | TailBestCloseSeqs] will contain
|
|
% target sequences in the same order as TargetSeqs.
|
|
%
|
|
% Note that TargetSeqs must be a nonempty list, i.e. it cannot be [].
|
|
% However, it is ok for one of its elements to be an empty sequence.
|
|
%
|
|
:- pred find_closest_seqs(edit_params(T)::in, list(T)::in, list(list(T))::in,
|
|
uint::out, list(T)::out, list(list(T))::out) is det.
|
|
|
|
% find_best_close_enough_seqs(Params, SourceSeq, TargetSeqs,
|
|
% MaxEditDistance, BestEditDistance,
|
|
% HeadBestCloseSeq, TailBestCloseSeqs):
|
|
%
|
|
% This is a version of find_closest_seqs that takes MaxEditDistance
|
|
% as an input, and which ignores any target sequence whose edit distance
|
|
% from SourceSeq is known to exceed MaxEditDistance.
|
|
%
|
|
% If all elements of TargetSeqs have edit distances from SourceSeq
|
|
% that are greater than MaxEditDistance, then this predicate will fail.
|
|
% If this is not the case, i.e. if there are some target sequences
|
|
% whose distance is less than or equal to MaxEditDistance, this predicate
|
|
% will succeed, and will do the same job as find_closest_seqs, but it is
|
|
% intended to do so more quickly, the efficiency gain coming from stopping
|
|
% the computation of exact edit distances as soon as it becomes known
|
|
% that the exact distance would exceed MaxEditDistance.
|
|
%
|
|
:- pred find_best_close_enough_seqs(edit_params(T)::in, list(T)::in,
|
|
list(list(T))::in, uint::in, uint::out,
|
|
list(T)::out, list(list(T))::out) is semidet.
|
|
|
|
%---------------------%
|
|
|
|
% find_closest_strings(Params, SourceStr, TargetStrs,
|
|
% BestEditDistance, HeadBestCloseStr, TailBestCloseStrs):
|
|
%
|
|
% This is an instance of find_closest_seqs that takes care of the
|
|
% necessary conversions between strings and sequences of characters.
|
|
%
|
|
:- pred find_closest_strings(edit_params(char)::in, string::in,
|
|
list(string)::in, uint::out, string::out, list(string)::out) is det.
|
|
|
|
% find_best_close_enough_strings(Params, SourceStr, TargetStrs,
|
|
% MaxEditDistance,
|
|
% BestEditDistance, HeadBestCloseStr, TailBestCloseStrs):
|
|
%
|
|
% This is an instance of find_best_close_enough_seqs that takes care of the
|
|
% necessary conversions between strings and sequences of characters.
|
|
%
|
|
:- pred find_best_close_enough_strings(edit_params(char)::in, string::in,
|
|
list(string)::in, uint::in, uint::out, string::out, list(string)::out)
|
|
is semidet.
|
|
|
|
%---------------------------------------------------------------------------%
|
|
%---------------------------------------------------------------------------%
|
|
|
|
:- implementation.
|
|
|
|
:- import_module array.
|
|
:- import_module cord.
|
|
:- import_module io.
|
|
:- import_module maybe.
|
|
:- import_module require.
|
|
:- import_module string.
|
|
:- import_module uint.
|
|
|
|
%---------------------------------------------------------------------------%
|
|
%---------------------------------------------------------------------------%
|
|
%
|
|
% The code in this module is based on spellcheck.cc in gcc.
|
|
%
|
|
|
|
find_edit_distance(Params, SeqA, SeqB, Cost) :-
|
|
find_edit_distance_up_to_ceiling(Params, SeqA, SeqB, no, Cost).
|
|
|
|
%---------------------------------------------------------------------------%
|
|
|
|
find_closest_seqs(Params, SourceSeq, TargetSeqs,
|
|
BestCost, HeadBestSeq, TailBestSeqs) :-
|
|
(
|
|
TargetSeqs = [],
|
|
unexpected($pred,
|
|
"Calling find_closest_seqs on an empty list of target sequences" ++
|
|
" does not make sense")
|
|
;
|
|
TargetSeqs = [HeadTargetSeq | TailTargetSeqs],
|
|
find_edit_distance(Params, SourceSeq, HeadTargetSeq, HeadCost),
|
|
find_closest_seqs_loop(Params, SourceSeq, TailTargetSeqs,
|
|
HeadCost, BestCost, cord.singleton(HeadTargetSeq), BestSeqCord),
|
|
BestSeqList = cord.list(BestSeqCord),
|
|
list.det_head_tail(BestSeqList, HeadBestSeq, TailBestSeqs)
|
|
).
|
|
|
|
find_best_close_enough_seqs(Params, SourceSeq, TargetSeqs,
|
|
MaxCost, BestCost, HeadBestSeq, TailBestSeqs) :-
|
|
(
|
|
TargetSeqs = [],
|
|
unexpected($pred,
|
|
"Calling find_best_close_enough_seqs on an empty list" ++
|
|
" of target sequences does not make sense")
|
|
;
|
|
TargetSeqs = [_ | _],
|
|
find_first_close_enough_seq(Params, SourceSeq, TargetSeqs,
|
|
MaxCost, FirstCost, FirstTargetSeq, RestTargetSeqs),
|
|
find_closest_seqs_loop(Params, SourceSeq, RestTargetSeqs,
|
|
FirstCost, BestCost, cord.singleton(FirstTargetSeq), BestSeqCord),
|
|
BestSeqList = cord.list(BestSeqCord),
|
|
list.det_head_tail(BestSeqList, HeadBestSeq, TailBestSeqs)
|
|
).
|
|
|
|
%---------------------------------------------------------------------------%
|
|
|
|
find_closest_strings(Params, SourceStr, TargetStrs,
|
|
BestCost, HeadBestStr, TailBestStrs) :-
|
|
string.to_char_list(SourceStr, SourceCharSeq),
|
|
list.map(string.to_char_list, TargetStrs, TargetCharSeqs),
|
|
find_closest_seqs(Params, SourceCharSeq, TargetCharSeqs,
|
|
BestCost, HeadBestCharSeq, TailBestCharSeqs),
|
|
string.from_char_list(HeadBestCharSeq, HeadBestStr),
|
|
list.map(string.from_char_list, TailBestCharSeqs, TailBestStrs).
|
|
|
|
find_best_close_enough_strings(Params, SourceStr, TargetStrs, MaxCost,
|
|
BestCost, HeadBestStr, TailBestStrs) :-
|
|
string.to_char_list(SourceStr, SourceCharSeq),
|
|
list.map(string.to_char_list, TargetStrs, TargetCharSeqs),
|
|
find_best_close_enough_seqs(Params, SourceCharSeq, TargetCharSeqs, MaxCost,
|
|
BestCost, HeadBestCharSeq, TailBestCharSeqs),
|
|
string.from_char_list(HeadBestCharSeq, HeadBestStr),
|
|
list.map(string.from_char_list, TailBestCharSeqs, TailBestStrs).
|
|
|
|
%---------------------------------------------------------------------------%
|
|
|
|
:- pred find_first_close_enough_seq(edit_params(T)::in, list(T)::in,
|
|
list(list(T))::in, uint::in, uint::out, list(T)::out, list(list(T))::out)
|
|
is semidet.
|
|
|
|
find_first_close_enough_seq(Params, SourceSeq,
|
|
[HeadTargetSeq | TailTargetSeqs], MaxCost,
|
|
FirstCost, FirstTargetSeq, RestTargetSeqs) :-
|
|
find_edit_distance_up_to_ceiling(Params, SourceSeq, HeadTargetSeq,
|
|
yes(MaxCost), HeadCost),
|
|
( if HeadCost =< MaxCost then
|
|
FirstCost = HeadCost,
|
|
FirstTargetSeq = HeadTargetSeq,
|
|
RestTargetSeqs = TailTargetSeqs
|
|
else
|
|
find_first_close_enough_seq(Params, SourceSeq, TailTargetSeqs, MaxCost,
|
|
FirstCost, FirstTargetSeq, RestTargetSeqs)
|
|
).
|
|
|
|
:- pred find_closest_seqs_loop(edit_params(T)::in, list(T)::in,
|
|
list(list(T))::in, uint::in, uint::out,
|
|
cord(list(T))::in, cord(list(T))::out) is det.
|
|
|
|
find_closest_seqs_loop(_, _, [], !Cost, !CostSeqCord).
|
|
find_closest_seqs_loop(Params, SourceSeq, [HeadTargetSeq | TailTargetSeqs],
|
|
!Cost, !CostSeqCord) :-
|
|
find_edit_distance_up_to_ceiling(Params, SourceSeq, HeadTargetSeq,
|
|
yes(!.Cost), HeadCost),
|
|
( if HeadCost < !.Cost then
|
|
% Update the best known cost, ...
|
|
!:Cost = HeadCost,
|
|
% ... and throw away all the target sequences
|
|
% with the *old* best known cost.
|
|
!:CostSeqCord = cord.singleton(HeadTargetSeq),
|
|
find_closest_seqs_loop(Params, SourceSeq, TailTargetSeqs,
|
|
!Cost, !CostSeqCord)
|
|
else if HeadCost = !.Cost then
|
|
% Keep the best known cost the same, and add this target sequence
|
|
% to the list of target sequences at this cost.
|
|
cord.snoc(HeadTargetSeq, !CostSeqCord),
|
|
find_closest_seqs_loop(Params, SourceSeq, TailTargetSeqs,
|
|
!Cost, !CostSeqCord)
|
|
else
|
|
find_closest_seqs_loop(Params, SourceSeq, TailTargetSeqs,
|
|
!Cost, !CostSeqCord)
|
|
).
|
|
|
|
%---------------------------------------------------------------------------%
|
|
%---------------------------------------------------------------------------%
|
|
|
|
% This version of find_edit_distance allows the caller to specify
|
|
% a cost ceiling, beyond which the cost does not matter (because the
|
|
% caller intends to throw away any edits whose cost is beyond the ceiling).
|
|
% Once the minimum cost in a row exceeds the ceiling (if there is one),
|
|
% we stop computing further rows, because the cost can only increase
|
|
% from then on. In such cases, the cost we return will be inaccurate,
|
|
% but this is ok; since it will be beyond the ceiling, our caller
|
|
% will throw the edit away.
|
|
%
|
|
:- pred find_edit_distance_up_to_ceiling(edit_params(T)::in,
|
|
list(T)::in, list(T)::in, maybe(uint)::in, uint::out) is det.
|
|
|
|
find_edit_distance_up_to_ceiling(Params, SeqA, SeqB, MaybeCeiling, Cost) :-
|
|
% The gcc code on which the implementation of this predicate is based
|
|
% is included at the bottom of this module in a big comment. It has been
|
|
% reformatted to follow *our* code style for C (though the code is in C++).
|
|
trace [compile_time(flag("debug_edit_distance")), io(!IO)] (
|
|
io.output_stream(CurStream, !IO),
|
|
io.format(CurStream, "\nfind_edit_distance_up_to_ceiling(%s, %s)\n",
|
|
[s(string.string(SeqA)), s(string.string(SeqB))], !IO)
|
|
),
|
|
|
|
list.ulength(SeqA, LenA),
|
|
list.ulength(SeqB, LenB),
|
|
InsertCost = Params ^ cost_of_insert,
|
|
DeleteCost = Params ^ cost_of_delete,
|
|
( if LenA = 0u then
|
|
Cost = LenB * InsertCost
|
|
else if LenB = 0u then
|
|
Cost = LenA * DeleteCost
|
|
else
|
|
% The original comment from the gcc code was:
|
|
%
|
|
% We effectively build a matrix where each (i, j) contains
|
|
% the distance between the prefix strings s[0:j] and t[0:i].
|
|
% Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
|
|
% we simply keep track of the last two rows, v_one_ago and v_two_ago,
|
|
% and a new row, v_next.
|
|
%
|
|
% In this Mercury version, we use
|
|
%
|
|
% - ItemMapA to represent s
|
|
% - ItemMapB to represent t
|
|
% - LenA to represent len_s
|
|
% - LenB to represent len_t
|
|
% - RowTwoAgo to represent v_two_ago
|
|
% - RowOneAgo to represent v_one_ago
|
|
% - RowNext to represent v_next
|
|
%
|
|
% Just as in the gcc code i iterates from 0 up to len_t-1,
|
|
% RowNum here iterates from 0u up to LenB - 1u.
|
|
% And just as in the gcc code j iterates from 0 up to len_s-1,
|
|
% J here iterates from 0u up to LenA - 1u.
|
|
%
|
|
% The Mercury compiler uses this predicate mostly when constructing
|
|
% "did you mean" addendums for diagnostic messages reporting
|
|
% failed name lookups. In modules that are large themselves, or
|
|
% (more likely) in modules that import many other modules, the set of
|
|
% names to which we want to compare the name whose lookup failed
|
|
% can be quite large. In such situations, the code constructing
|
|
% the error message will invoke find_edit_distance thousands of times.
|
|
%
|
|
% The original Mercury version of this code used maps to implement
|
|
% both ItemMap{A,B} and Row{OneAgo,TwoAgo,Next}. In some cases,
|
|
% this caused the construction of "did you mean" addendums to take
|
|
% several seconds (much more time than was needed by the *rest*
|
|
% of the compiler), which was annoying.
|
|
%
|
|
% The two main reasons of this slowdown were almost certainly
|
|
%
|
|
% - the logN factor in the complexity of pretty much all operations
|
|
% on maps, and
|
|
%
|
|
% - the allocation of logN heap cells on every update of every element
|
|
% of every map.
|
|
%
|
|
% Our fix for the first cause is to switch the representation of
|
|
% ItemMap{A,B} and Row{OneAgo,TwoAgo,Next} to arrays.
|
|
% This also helps with the second cause, but we can do better still
|
|
% by only ever using the *same* three arrays, meaning that we don't
|
|
% allocate any memory in build_rows at all.
|
|
ItemMapA = array.from_list(SeqA),
|
|
ItemMapB = array.from_list(SeqB),
|
|
RowTwoAgo = array.uinit(LenA + 1u, 0u),
|
|
init_delete_cost_row(DeleteCost, 0u, LenA, RowOneAgoList),
|
|
RowOneAgo = array.from_list(RowOneAgoList),
|
|
RowSpare = array.uinit(LenA + 1u, 0u),
|
|
|
|
build_rows(Params, LenA, LenB, ItemMapA, ItemMapB, MaybeCeiling,
|
|
0u, RowTwoAgo, RowOneAgo, RowSpare, FinalRow),
|
|
array.ulookup(FinalRow, LenA, Cost)
|
|
).
|
|
|
|
%---------------------%
|
|
|
|
% The first row is for the case of an empty target string,
|
|
% which we can reach by deleting every character in the source string.
|
|
%
|
|
:- pred init_delete_cost_row(uint::in, uint::in, uint::in, list(uint)::out)
|
|
is det.
|
|
|
|
init_delete_cost_row(DeleteCost, ColNum, MaxColNum, Row) :-
|
|
( if ColNum =< MaxColNum then
|
|
init_delete_cost_row(DeleteCost, ColNum + 1u, MaxColNum, RowTail),
|
|
Row = [ColNum * DeleteCost | RowTail]
|
|
else
|
|
Row = []
|
|
).
|
|
|
|
%---------------------%
|
|
|
|
:- pred build_rows(edit_params(T)::in, uint::in, uint::in,
|
|
array(T)::in, array(T)::in, maybe(uint)::in, uint::in,
|
|
array(uint)::array_di, array(uint)::array_di,
|
|
array(uint)::array_di, array(uint)::array_uo) is det.
|
|
|
|
build_rows(Params, LenA, LenB, ItemMapA, ItemMapB, MaybeCeiling,
|
|
RowNum, RowTwoAgo0, RowOneAgo0, RowSpare, FinalRow) :-
|
|
trace [compile_time(flag("debug_edit_distance")), io(!IO)] (
|
|
io.output_stream(CurStream, !IO),
|
|
dump_row(CurStream, RowNum, RowOneAgo0, !IO)
|
|
),
|
|
( if RowNum < LenB then
|
|
RowNext0 = RowSpare,
|
|
% The initial column is for the case of an empty source string;
|
|
% we can reach prefixes of the target string of length i (RowNum)
|
|
% by inserting i characters.
|
|
InsertCost = Params ^ cost_of_insert,
|
|
InsertCostInRow = (RowNum + 1u) * InsertCost,
|
|
array.set(0, InsertCostInRow, RowNext0, RowNext1),
|
|
|
|
MinCostInRow0 = InsertCostInRow,
|
|
build_columns(Params, LenA, LenB, ItemMapA, ItemMapB, RowNum, 0u,
|
|
RowTwoAgo0, RowTwoAgo, RowOneAgo0, RowOneAgo, RowNext1, RowNext,
|
|
MinCostInRow0, MinCostInRow),
|
|
( if
|
|
MaybeCeiling = yes(Ceiling),
|
|
MinCostInRow > Ceiling
|
|
then
|
|
FinalRow = RowNext
|
|
else
|
|
build_rows(Params, LenA, LenB, ItemMapA, ItemMapB, MaybeCeiling,
|
|
RowNum + 1u, RowOneAgo, RowNext, RowTwoAgo, FinalRow)
|
|
)
|
|
else
|
|
FinalRow = RowOneAgo0
|
|
).
|
|
|
|
% build_columns(Params, LenA, LenB, ItemMapA, ItemMapB, RowNum, J,
|
|
% !RowTwoAgo, !RowOneAgo, !RowNext):
|
|
%
|
|
% Compute the slots of the !:RowNext starting at ColNum (J+1)
|
|
% by considering their neighbors to the north, west and northwest.
|
|
%
|
|
% Obviously, this means that this predicate updates !RowNext.
|
|
% It also has array_{di,uo} pairs of arguments for !RowTwoAgo
|
|
% and !RowOneAgo, but this predicate leaves those two state variables
|
|
% unchanged. We need to return the "new" values of !:RowTwoAgo
|
|
% and !.RowOneAgo to our caller to allow the mode checker to see
|
|
% that these arrays are still unique, which allows our caller
|
|
% to reuse the same three arrays over and over, thus avoiding
|
|
% ever allocating a fourth array.
|
|
%
|
|
:- pred build_columns(edit_params(T)::in, uint::in, uint::in,
|
|
array(T)::in, array(T)::in, uint::in, uint::in,
|
|
array(uint)::array_di, array(uint)::array_uo,
|
|
array(uint)::array_di, array(uint)::array_uo,
|
|
array(uint)::array_di, array(uint)::array_uo,
|
|
uint::in, uint::out) is det.
|
|
|
|
build_columns(Params, LenA, LenB, ItemMapA, ItemMapB, RowNum, J,
|
|
!RowTwoAgo, !RowOneAgo, !RowNext, !MinCostInRow) :-
|
|
( if J < LenA then
|
|
ColNum = J + 1u,
|
|
% Note that "here", the position from which we are looking,
|
|
% is RowNext(ColNum). This means that
|
|
%
|
|
% - left is RowNext(J)
|
|
% - up is RowOneAgo(J + 1u), i.e. RowOneAgo(ColNum)
|
|
% - diag is RowOneAgo(J)
|
|
% - trans_diag is RowTwoAgo(J - 1u)
|
|
|
|
array.ulookup(!.RowOneAgo, J, DiagCost),
|
|
I = RowNum,
|
|
array.ulookup(ItemMapA, J, CurAJ),
|
|
array.ulookup(ItemMapB, I, CurBI),
|
|
( if CurAJ = CurBI then
|
|
MinCost = DiagCost,
|
|
trace [compile_time(flag("debug_edit_distance")), io(!IO)] (
|
|
io.output_stream(CurStream, !IO),
|
|
io.format(CurStream, "nc = %u; ", [u(MinCost)], !IO)
|
|
)
|
|
else
|
|
ReplacementCostFunc = Params ^ cost_of_replace,
|
|
ReplacementCost = DiagCost + ReplacementCostFunc(CurAJ, CurBI),
|
|
|
|
array.ulookup(!.RowNext, J, LeftCost),
|
|
DeleteCost = LeftCost + Params ^ cost_of_delete,
|
|
|
|
array.ulookup(!.RowOneAgo, ColNum, UpCost),
|
|
InsertCost = UpCost + Params ^ cost_of_insert,
|
|
|
|
MinCost0 = min(InsertCost, min(DeleteCost, ReplacementCost)),
|
|
|
|
( if
|
|
I > 0u,
|
|
J > 0u,
|
|
array.ulookup(ItemMapB, I - 1u, CurAJ),
|
|
array.ulookup(ItemMapA, J - 1u, CurBI)
|
|
then
|
|
array.ulookup(!.RowTwoAgo, J - 1u, TransDiagCost),
|
|
TransposeCost = TransDiagCost + Params ^ cost_of_transpose,
|
|
MinCost = min(MinCost0, TransposeCost),
|
|
trace [compile_time(flag("debug_edit_distance")), io(!IO)] (
|
|
io.output_stream(CurStream, !IO),
|
|
io.format(CurStream, "i%u d%u r%u t%u = %u; ",
|
|
[u(InsertCost), u(DeleteCost), u(ReplacementCost),
|
|
u(TransposeCost), u(MinCost)], !IO)
|
|
)
|
|
else
|
|
MinCost = MinCost0,
|
|
trace [compile_time(flag("debug_edit_distance")), io(!IO)] (
|
|
io.output_stream(CurStream, !IO),
|
|
io.format(CurStream, "i%u d%u r%u = %u; ",
|
|
[u(InsertCost), u(DeleteCost), u(ReplacementCost),
|
|
u(MinCost)], !IO)
|
|
)
|
|
)
|
|
),
|
|
array.uset(ColNum, MinCost, !RowNext),
|
|
( if MinCost < !.MinCostInRow then
|
|
!:MinCostInRow = MinCost
|
|
else
|
|
true
|
|
),
|
|
build_columns(Params, LenA, LenB, ItemMapA, ItemMapB,
|
|
RowNum, J + 1u, !RowTwoAgo, !RowOneAgo, !RowNext, !MinCostInRow)
|
|
else
|
|
trace [compile_time(flag("debug_edit_distance")), io(!IO)] (
|
|
io.output_stream(CurStream, !IO),
|
|
io.nl(CurStream, !IO)
|
|
)
|
|
).
|
|
|
|
%---------------------%
|
|
|
|
:- pred dump_row(io.text_output_stream::in, uint::in, array(uint)::in,
|
|
io::di, io::uo) is det.
|
|
|
|
dump_row(OutStream, RowNum, RowMap, !IO) :-
|
|
io.format(OutStream, "row #%2u: [", [u(RowNum)], !IO),
|
|
array.to_list(RowMap, RowAL),
|
|
dump_columns(OutStream, 0u, RowAL, !IO),
|
|
io.write_string(OutStream, "]\n", !IO),
|
|
io.flush_output(OutStream, !IO).
|
|
|
|
:- pred dump_columns(io.text_output_stream::in, uint::in, list(uint)::in,
|
|
io::di, io::uo) is det.
|
|
|
|
dump_columns(_, _, [], !IO).
|
|
dump_columns(OutStream, ColNum, [Value | Values], !IO) :-
|
|
io.format(OutStream, "%2u: %3u, ", [u(ColNum), u(Value)], !IO),
|
|
dump_columns(OutStream, ColNum + 1u, Values, !IO).
|
|
|
|
%---------------------------------------------------------------------------%
|
|
%
|
|
% edit_distance_t
|
|
% get_edit_distance (const char *s, int len_s, const char *t, int len_t)
|
|
% {
|
|
% const bool debug = false;
|
|
%
|
|
% if (debug) {
|
|
% printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
|
|
% printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
|
|
% }
|
|
%
|
|
% if (len_s == 0) {
|
|
% return BASE_COST * len_t;
|
|
% }
|
|
% if (len_t == 0) {
|
|
% return BASE_COST * len_s;
|
|
% }
|
|
%
|
|
% // We effectively build a matrix where each (i, j) contains
|
|
% // the distance between the prefix strings s[0:j] and t[0:i].
|
|
% // Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
|
|
% // we simply keep track of the last two rows, v_one_ago and v_two_ago,
|
|
% // and a new row, v_next.
|
|
% edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1];
|
|
% edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1];
|
|
% edit_distance_t *v_next = new edit_distance_t[len_s + 1];
|
|
%
|
|
% // The first row is for the case of an empty target string,
|
|
% // which we can reach by deleting every character in the source string.
|
|
% for (int i = 0; i < len_s + 1; i++) {
|
|
% v_one_ago[i] = i * BASE_COST;
|
|
% }
|
|
%
|
|
% // Build successive rows.
|
|
% for (int i = 0; i < len_t; i++) {
|
|
% if (debug) {
|
|
% printf ("i:%i v_one_ago = ", i);
|
|
% for (int j = 0; j < len_s + 1; j++) {
|
|
% printf ("%i ", v_one_ago[j]);
|
|
% }
|
|
% printf ("\n");
|
|
% }
|
|
%
|
|
% // The initial column is for the case of an empty source string;
|
|
% // we can reach prefixes of the target string of length i
|
|
% // by inserting i characters.
|
|
% v_next[0] = (i + 1) * BASE_COST;
|
|
%
|
|
% // Build the rest of the row by considering neighbors
|
|
% // to the north, west and northwest.
|
|
% for (int j = 0; j < len_s; j++) {
|
|
% edit_distance_t cost;
|
|
%
|
|
% if (s[j] == t[i]) {
|
|
% cost = 0;
|
|
% } else if (TOLOWER (s[j]) == TOLOWER (t[i])) {
|
|
% cost = CASE_COST;
|
|
% } else {
|
|
% cost = BASE_COST;
|
|
% }
|
|
%
|
|
% edit_distance_t deletion = v_next[j] + BASE_COST;
|
|
% edit_distance_t insertion = v_one_ago[j + 1] + BASE_COST;
|
|
% edit_distance_t substitution = v_one_ago[j] + cost;
|
|
% edit_distance_t cheapest = MIN (deletion, insertion);
|
|
% cheapest = MIN (cheapest, substitution);
|
|
% if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i]) {
|
|
% edit_distance_t transposition = v_two_ago[j - 1] + BASE_COST;
|
|
% cheapest = MIN (cheapest, transposition);
|
|
% }
|
|
% v_next[j + 1] = cheapest;
|
|
% }
|
|
%
|
|
% // Prepare to move on to next row.
|
|
% for (int j = 0; j < len_s + 1; j++) {
|
|
% v_two_ago[j] = v_one_ago[j];
|
|
% v_one_ago[j] = v_next[j];
|
|
% }
|
|
% }
|
|
%
|
|
% if (debug) {
|
|
% printf ("final v_next = ");
|
|
% for (int j = 0; j < len_s + 1; j++) {
|
|
% printf ("%i ", v_next[j]);
|
|
% }
|
|
% printf ("\n");
|
|
% }
|
|
%
|
|
% edit_distance_t result = v_next[len_s];
|
|
% delete[] v_two_ago;
|
|
% delete[] v_one_ago;
|
|
% delete[] v_next;
|
|
% return result;
|
|
% }
|
|
%
|
|
%---------------------------------------------------------------------------%
|
|
:- end_module edit_distance.
|
|
%---------------------------------------------------------------------------%
|