%-----------------------------------------------------------------------------%
% Copyright (C) 1995-1997 The University of Melbourne.
% This file may only be copied under the terms of the GNU General
% Public License - see the file COPYING in the Mercury distribution.
%-----------------------------------------------------------------------------%

% Main author: bromage
% Simplified by Marnix Klooster

% The only predicate exported from this module is given two lists, and
% it generates a 'longest common subsequence.'  A 'common subsequence'
% of two lists List1 and List2 is a list of pairs of integers I-J,
% where I and J refer to items I and J (numbering from 0) in List1 and
% List2, respectively.  For a pair I-J we always have
% 0 =< I < length(List1), 0 =< J < length(List2) and List1[I] = List2[J].
%
% NOTE(review): this file had been collapsed onto a few physical lines
% and every "<...>" span had been stripped from it.  In this part of the
% file that removed the end of the sentence above, the module header
% (module/interface/import declarations, the lcss type, the pred and
% mode declarations) and the head of the first lcss__show_lcss clause.
% Those pieces are reconstructed below from what the remaining code
% uses; please compare them against a pristine copy of the file.
% In particular it is not known whether lcss__show_lcss was exported;
% it is exported here so that any existing caller keeps working.

%-----------------------------------------------------------------------------%

:- module lcss.

:- interface.
:- import_module io, list, std_util.

    % A common subsequence: pairs I-J of matching positions, as
    % described at the top of this file.
:- type lcss == list(pair(int, int)).

    % lcss__show_lcss(Lcss, IO0, IO):
    % Write every pair as "I-J," and finish with "[]".
:- pred lcss__show_lcss(lcss, io__state, io__state).
:- mode lcss__show_lcss(in, di, uo) is det.

    % lcss__find_lcss(List1, List2, L1, L2, Lcss):
    % Lcss is a longest common subsequence of List1 and List2.
    % L1 and L2 are used as the lengths of List1 and List2 when
    % sizing the work arrays and choosing 'infinity'.
    % NOTE(review): presumably the caller must pass the exact lengths;
    % nothing in this module checks that.
:- pred lcss__find_lcss(list(A), list(A), int, int, lcss).
:- mode lcss__find_lcss(in, in, in, in, out) is det.

%-----------------------------------------------------------------------------%

:- implementation.
:- import_module array, int, map, require.

%-----------------------------------------------------------------------------%

lcss__show_lcss([]) -->
    io__write_string("[]").
lcss__show_lcss([X - Y | Lcss]) -->
    io__write_int(X),
    io__write_char('-'),
    io__write_int(Y),
    io__write_char(','),
    lcss__show_lcss(Lcss).

%-----------------------------------------------------------------------------%

    % Find a longest common subsequence.  The algorithm
    % used is very similar to that in:
    %
    %     Hunt & Szymanski, "A fast algorithm for computing
    %     longest common subsequences", CACM 20:5, pp 350--353,
    %     1977.

    % The essence of the algorithm is simple.  A 'match' is pair
    % I-J so that List1[I]=List2[J].  For every length (>=0) it
    % keeps a lcss of that length that has been found, under the
    % matches considered so far.  Initially, only length 0 has a
    % lcss: the empty one.  All matches between the two lists are
    % gone through row by row: in increasing order of I and, for
    % one value of I, in the order in which build_matchlist lists
    % the Js (decreasing J; see the comment on the matchlist below).
    % For every match the shortest stored lcss is found to which it
    % could be appended.  If appending the match results in a lcss
    % that 'ends before' the current lcss of that length, it is
    % replaced.  Here, the 'end' of a lcss is the J component of its
    % last pair I-J (or 'minus infinity' for the empty lcss).

    % Note that always Thresh is increasing, and strictly
    % increasing for all K such that 0 =< Thresh[K] < 'infinity'.
    % Note also that if Link[I] = [P-Q | _], then Thresh[I] = Q.

    % If a lcss of length I has been found, Link[I] contains it in
    % reverse, and Thresh[I] contains its 'end' (see above).
    % Otherwise, Thresh[I] contains 'infinity.'

lcss__find_lcss(List1, List2, L1, L2, Lcss) :-
        % The original version swapped List1 and List2, so that the
        % first was the largest.  Is this swapping really worthwhile?
        % It doesn't seem to be faster, nor does it consume less
        % memory.  Therefore I removed it.

        % The consequence is that build_thresh and build_lcss need a
        % value representing 'infinity'; previously we could use the
        % length of the first list for this.  Now we use length of
        % the longest list plus one.  The reason for the plus one is
        % that it should be greater than any other threshold, and
        % on identical files, thresholds may get as large as L1=L2.
    int__max(L1, L2, Inf0),
    Inf is Inf0 + 1,

        % The original version uses arrays of the same length as the
        % longest list.  But it is sufficient to use the length of the
        % shortest list, since a lcss cannot be longer than that.
    int__min(L1, L2, N),

        % Calculate the LCSS.  To run through all matches row by
        % row, build_matchlist, build_thresh and build_thresh2 are
        % used; the processing of a match I-J is done in
        % build_thresh3.  When all matches have been processed,
        % build_lcss extracts the longest lcss found.
    lcss__build_matchlist(List1, List2, MatchList),
    lcss__build_thresh(N, MatchList, Inf, Thresh, Link),
    lcss__build_lcss(N, Inf, Thresh, Link, L1, L2, Lcss).

%-----------------------------------------------------------------------------%

    % The matchlist represents the set of all matchings I-J
    % between F1 and F2.  It is stored as a list of lists of
    % integers where the Ith element of the list is the list of
    % all J such that F1[I]=F2[J].  As the code stands, every list
    % in the matchlist is in decreasing order of J:
    % lcss__build_match_map collects the positions in increasing
    % order and lcss__match_map_to_matchlist reverses them.
    % Processing the matches of one row from the largest J down is
    % what keeps build_thresh from chaining two matches that share
    % the same I onto each other.
:- pred lcss__build_matchlist(list(A), list(A), list(list(int))). :- mode lcss__build_matchlist(in, in, out) is det. lcss__build_matchlist(List1, List2, MatchList) :- % First, invert List2. The inverted list is a % mapping from strings to lists of integers where % a given string maps to the list of strings in List2 % which match that string, in reverse order. (The % reversal is for efficiency reasons.) lcss__build_match_map(0, List2, Map), % Now match each line in List1 with those in List2, % reversing the matches as we go. lcss__match_map_to_matchlist(List1, Map, MatchList). :- pred lcss__build_match_map(int, list(A), map(A,list(int))). :- mode lcss__build_match_map(in, in, out) is det. lcss__build_match_map(_, [], Map) :- map__init(Map). lcss__build_match_map(N, [S | Ss], MapOut) :- N1 is N + 1, lcss__build_match_map(N1, Ss, MapIn), ( map__search(MapIn, S, Ns0) -> Ns1 = [N | Ns0] ; Ns1 = [ N ] ), map__set(MapIn, S, Ns1, MapOut). :- pred lcss__match_map_to_matchlist(list(A), map(A,list(int)), list(list(int))). :- mode lcss__match_map_to_matchlist(in, in, out) is det. lcss__match_map_to_matchlist([], _, []). lcss__match_map_to_matchlist([S | Ss], Map, [M | Ms]) :- lcss__match_map_to_matchlist(Ss, Map, Ms), ( map__search(Map, S, Ns0) -> list__reverse(Ns0, M) ; M = [] ). %-----------------------------------------------------------------------------% % This is the heart of the lcss procedure. The inputs are the % length of the shortest list, and the matchlist of both lists % (together with a value to be used as 'infinity'). % The algorithm maintains and outputs the arrays Thresh and % Link (see above). The values -1 and Inf are used for 'minus % infinity' and 'infinity,' respectively. :- pred lcss__build_thresh(int, list(list(int)), int, array(int), array(lcss)). :- mode lcss__build_thresh(in, in, in, array_uo, array_uo) is det. lcss__build_thresh(N, MatchList, Inf, Thresh, Link) :- % Initialize Thresh and Link. N1 is N + 1, % Why this size? 
Suppose we have two identical % files of length N. Then the links will be % [], [0-0], [0-0,1-1], ... [0-0..N-N], which % makes N+1 links in total. array__init(N1, Inf, Thresh0), % Thresh[0..N] := Inf array__set(Thresh0, 0, -1, Thresh1), % Thresh[0] := -1 array__init(N1, [], Link1), % Link[0..N] := [] % Process all matches in Matchlist in lexicographical order. lcss__build_thresh2(N, 0, MatchList, Thresh1, Link1, Thresh, Link). :- pred lcss__build_thresh2(int, int, list(list(int)), array(int), array(lcss), array(int), array(lcss)). :- mode lcss__build_thresh2(in, in, in, array_di, array_di, array_uo, array_uo) is det. lcss__build_thresh2(_N, _I, [], Thresh0, Link0, Thresh0, Link0). lcss__build_thresh2(N, I, [Matches | MatchRest], Thresh0, Link0, Thresh1, Link1) :- I1 is I + 1, lcss__build_thresh3(N, I, Matches, Thresh0, Link0, Thresh2, Link2), lcss__build_thresh2(N, I1, MatchRest, Thresh2, Link2, Thresh1, Link1). :- pred lcss__build_thresh3(int, int, list(int), array(int), array(lcss), array(int), array(lcss)). :- mode lcss__build_thresh3(in, in, in, array_di, array_di, array_uo, array_uo) is det. lcss__build_thresh3(_, _, [], Thresh, Link, Thresh, Link). lcss__build_thresh3(N, I, [ J | Js ], Thresh0, Link0, Thresh1, Link1) :- % Process the match I-J, and find which Thresh entry we should % attach this match to. Do this by finding the longest % subsequence in Link that 'ends before' J (see above), and % set K to its length plus one. lcss__build_thresh4(0, N, J, K, Thresh0), % Does (reversed) common subsequence [I-J | Link[K-1]] 'end % before' Link[K]? In other words, is J % Yes, so make this match part of a new entry, by % doing Link[K] := [I-J | Link[K-1]] K1 is K - 1, array__set(Thresh0, K, J, Thresh2), array__lookup(Link0, K1, LinkK1), array__set(Link0, K, [I - J | LinkK1], Link2) ; % Otherwise forget it. Link0 = Link2, Thresh0 = Thresh2 ), % Process the remaining matches that have I as their first % element. 
lcss__build_thresh3(N, I, Js, Thresh2, Link2, Thresh1, Link1). % lcss__build_thresh4 performs a binary search % through Thresh to find the value of K such % that Thresh[K-1] < J =< Thresh[K]. :- pred lcss__build_thresh4(int, int, int, int, array(int)). :- mode lcss__build_thresh4(in, in, in, out, in) is det. lcss__build_thresh4(Lo, Hi, J, K, Thresh) :- Width is Hi - Lo, ( Width < 1 -> error("lcss__build_thresh4") ; Width = 1 -> K = Hi ; % Use the middle element of the range. Mid is (Lo + Hi) // 2, array__lookup(Thresh, Mid, ThreshMid), ( ThreshMid < J -> lcss__build_thresh4(Mid, Hi, J, K, Thresh) ; lcss__build_thresh4(Lo, Mid, J, K, Thresh) ) ). %-----------------------------------------------------------------------------% % Now that we have the array Thresh, it is a simple exercise % to recover the Lcss: Simply find the largest value of K such % that Thresh[K] N1 is N - 1, lcss__build_lcss2(N1, Inf, Thresh, K) ; K = N ). %-----------------------------------------------------------------------------% %-----------------------------------------------------------------------------%