mirror of
https://github.com/Mercury-Language/mercury.git
synced 2025-12-18 15:26:31 +00:00
427 lines
14 KiB
Mathematica
427 lines
14 KiB
Mathematica
%-----------------------------------------------------------------------------%
|
|
% Copyright (C) 1995 University of Melbourne.
|
|
% This file may only be copied under the terms of the GNU General
|
|
% Public License - see the file COPYING in the Mercury distribution.
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
:- module lcss.
|
|
% Main author: bromage
|
|
|
|
:- interface.
|
|
:- import_module io, file.
|
|
|
|
% lcss__show_diff takes two files and displays their
|
|
% differences on the output stream.
|
|
:- pred lcss__show_diff(file, file, io__state, io__state).
|
|
:- mode lcss__show_diff(in, in, di, out) is det.
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
:- implementation.
|
|
:- import_module map, require, std_util, int, list, char, array.
|
|
|
|
% The longest common substring of two files can be
|
|
% represented as an ordered list of "matches". A
|
|
% match is a pair of the form I-J where I is the
|
|
% number of a line in file 1 and J is the number of
|
|
% a line in file 2.
|
|
:- type lcss == list(pair(int,int)).
|
|
|
|
% A single diff entry is an addition, a deletion or
|
|
% a change.
|
|
:- type single_diff --->
|
|
add(int, pair(int,int))
|
|
; delete(pair(int,int), int)
|
|
; change(pair(int,int), pair(int,int)).
|
|
|
|
% The complete diff file is a list of single
|
|
% differences.
|
|
:- type diff == list(single_diff).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% The process to "diff" two files is:
|
|
%
|
|
% - Identify the longest common subsequence
|
|
% in the files.
|
|
%
|
|
% - Use this information to determine the
|
|
% set of operations required to convert
|
|
% one file to the other.
|
|
%
|
|
% - Write this information to the output stream.
|
|
lcss__show_diff(File1, File2) -->
|
|
{ lcss__find_lcss(File1, File2, Lcss) },
|
|
{ lcss__to_diff(File1, File2, Lcss, Diff) },
|
|
lcss__display_diff(File1, File2, Diff).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% For debugging only. Will be removed in the
|
|
% final version.
|
|
:- pred lcss__show_lcss(lcss :: in, io__state :: di, io__state :: uo) is det.
|
|
lcss__show_lcss([]) -->
|
|
io__write_string("[]").
|
|
lcss__show_lcss([ X-Y | Lcss ]) -->
|
|
io__write_int(X),
|
|
io__write_char('-'),
|
|
io__write_int(Y),
|
|
io__write_char(','),
|
|
lcss__show_lcss(Lcss).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Find the longest common substring
|
|
:- pred lcss__find_lcss(file :: in, file :: in, lcss :: out) is det.
|
|
lcss__find_lcss(File1, File2, Lcss) :-
|
|
file__get_numlines(File1, L1),
|
|
file__get_numlines(File2, L2),
|
|
|
|
% Set N to the size of the larger of File1 or File2,
|
|
% let F1 be the larger file and F2 be the smaller file.
|
|
|
|
( L1 >= L2 ->
|
|
N = L1, F1 = File1, F2 = File2
|
|
;
|
|
N = L2, F2 = File1, F1 = File2
|
|
),
|
|
|
|
% Calculate the LCSS
|
|
|
|
lcss__build_matchlist(F1, F2, MatchList),
|
|
lcss__build_thresh(N, MatchList, Thresh, Link),
|
|
lcss__build_lcss(N, Thresh, Link, Lcss1),
|
|
|
|
% If we mapped File1 |-> F2, File2 |-> F1 then we
|
|
% need to swap all the entries in the Lcss.
|
|
|
|
( L1 >= L2 ->
|
|
Lcss = Lcss1
|
|
;
|
|
lcss__swap(Lcss1, Lcss)
|
|
).
|
|
|
|
:- pred lcss__swap(lcss :: in, lcss :: out) is det.
|
|
lcss__swap([], []).
|
|
lcss__swap([I-J | RestIn], [J-I | RestOut]) :-
|
|
lcss__swap(RestIn, RestOut).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% The matchlist is the set of all matchings (I,J)
|
|
% such that F1[I] = F2[J]. It is stored as a list
|
|
% of list of integers where the Ith element of
|
|
% the list is the list of all J such that F1[I]=F2[J].
|
|
% This list should be increasing order.
|
|
:- pred lcss__build_matchlist(file, file, list(list(int))).
|
|
:- mode lcss__build_matchlist(in, in, out) is det.
|
|
lcss__build_matchlist(File1, File2, MatchList) :-
|
|
|
|
% First, invert File2. The inverted file is a
|
|
% mapping from strings to lists of integers where
|
|
% a given string maps to the list of lines in File2
|
|
% which match that string.
|
|
|
|
file__to_list(File2, File2list),
|
|
lcss__build_match_map(1, File2list, Map),
|
|
|
|
% Now match each line in File1 with those in File2.
|
|
|
|
file__to_list(File1, File1list),
|
|
lcss__match_map_to_matchlist(File1list, Map, MatchList).
|
|
|
|
:- pred lcss__build_match_map(int, list(string), map(string,list(int))).
|
|
:- mode lcss__build_match_map(in, in, out) is det.
|
|
lcss__build_match_map(_, [], Map) :-
|
|
map__init(Map).
|
|
lcss__build_match_map(N, [S | Ss], MapOut) :-
|
|
N1 is N+1,
|
|
lcss__build_match_map(N1, Ss, MapIn),
|
|
( map__search(MapIn, S, Ns0) ->
|
|
list__append(Ns0, [N], Ns1)
|
|
;
|
|
Ns1 = [ N ]
|
|
),
|
|
map__set(MapIn, S, Ns1, MapOut).
|
|
|
|
:- pred lcss__match_map_to_matchlist(list(string), map(string,list(int)),
|
|
list(list(int))).
|
|
:- mode lcss__match_map_to_matchlist(in, in, out) is det.
|
|
lcss__match_map_to_matchlist([], _, []).
|
|
lcss__match_map_to_matchlist([S | Ss], Map, [M | Ms]) :-
|
|
lcss__match_map_to_matchlist(Ss, Map, Ms),
|
|
( map__search(Map, S, Ns0) ->
|
|
M = Ns0
|
|
;
|
|
M = []
|
|
).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% This is the heart of the lcss procedure. The
|
|
% algorithm maintains two arrays, Thresh and Link.
|
|
% Thresh[I] is defined as the length of the longest
|
|
% common substring found so far which terminates at
|
|
% position I in File1. Link[I] is this actual
|
|
% substring stored in reverse.
|
|
%
|
|
% The special value N+1 is used to denote that no
|
|
% substring has been found that terminates at that
|
|
% position.
|
|
:- pred lcss__build_thresh(int, list(list(int)), array(int),
|
|
array(lcss)).
|
|
:- mode lcss__build_thresh(in, in, out, out) is det.
|
|
lcss__build_thresh(N, MatchList, Thresh, Link) :-
|
|
N1 is N+1,
|
|
array__init(0, N, N1, Thresh0),
|
|
array__set(Thresh0, 0, 0, Thresh1),
|
|
array__init(0, N, [], Link1),
|
|
lcss__build_thresh2(1, N, MatchList, Thresh1, Link1, Thresh, Link).
|
|
|
|
:- pred lcss__build_thresh2(int, int, list(list(int)),
|
|
array(int), array(lcss),
|
|
array(int), array(lcss)).
|
|
:- mode lcss__build_thresh2(in, in, in, in, in, out, out) is det.
|
|
lcss__build_thresh2(I, N, MatchList, Thresh0, Link0, Thresh1, Link1) :-
|
|
( I > N ->
|
|
% If we have stepped past the end of the file, we
|
|
% have finished.
|
|
Thresh0=Thresh1, Link0=Link1
|
|
; MatchList = [ Matches | MatchRest ] ->
|
|
% Otherwise step through each match in this MatchList
|
|
% entry, and arrach
|
|
|
|
I1 is I+1,
|
|
lcss__build_thresh3(N, I, Matches, Thresh0, Link0, Thresh2, Link2),
|
|
lcss__build_thresh2(I1, N, MatchRest, Thresh2, Link2,
|
|
Thresh1, Link1)
|
|
;
|
|
error("lcss__build_thresh2")
|
|
).
|
|
|
|
:- pred lcss__build_thresh3(int, int, list(int),
|
|
array(int), array(lcss),
|
|
array(int), array(lcss)).
|
|
:- mode lcss__build_thresh3(in, in, in, in, in, out, out) is det.
|
|
lcss__build_thresh3(_, _, [], Thresh, Link, Thresh, Link).
|
|
lcss__build_thresh3(N, I, [ J | Js ], Thresh0, Link0, Thresh1, Link1) :-
|
|
% Find which Thresh entry we should attach this match to.
|
|
lcss__build_thresh4(0, N, J, K, Thresh0),
|
|
array__lookup(Thresh0, K, ThreshK),
|
|
% Is the found Thresh entry shorter than that which would
|
|
% be obtained by using this?
|
|
( J < ThreshK ->
|
|
% Yes, so make this match part of a new entry.
|
|
K1 is K-1,
|
|
array__set(Thresh0, K, J, Thresh2),
|
|
array__lookup(Link0, K1, LinkK1),
|
|
array__set(Link0, K, [ I-J | LinkK1 ], Link2)
|
|
;
|
|
% Otherwise forget it.
|
|
Link0 = Link2, Thresh0 = Thresh2
|
|
),
|
|
lcss__build_thresh3(N, I, Js, Thresh2, Link2, Thresh1, Link1).
|
|
|
|
% lcss__build_thresh4 performs a binary search
|
|
% through Thresh to find the value of K such
|
|
% that Thresh[K-1] < J =< Thresh[K].
|
|
:- pred lcss__build_thresh4(int, int, int, int, array(int)).
|
|
:- mode lcss__build_thresh4(in, in, in, out, in) is det.
|
|
lcss__build_thresh4(Lo, Hi, J, K, Thresh) :-
|
|
Width is Hi-Lo,
|
|
( Width < 1 ->
|
|
error("lcss__build_thresh4")
|
|
; Width = 1 ->
|
|
K = Hi
|
|
;
|
|
% Use the middle element of the range.
|
|
Mid is (Lo + Hi) // 2,
|
|
array__lookup(Thresh, Mid, ThreshMid),
|
|
( ThreshMid < J ->
|
|
lcss__build_thresh4(Mid, Hi, J, K, Thresh)
|
|
;
|
|
lcss__build_thresh4(Lo, Mid, J, K, Thresh)
|
|
)
|
|
).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% Now that we have the array Thresh, it is a simple
|
|
% exercise to recover the Lcss: Simply find the
|
|
% largesst value of K such that Thresh[K] < N+1
|
|
% and Link[K] should be the Lcss in reverse.
|
|
:- pred lcss__build_lcss(int, array(int), array(lcss), lcss).
|
|
:- mode lcss__build_lcss(in, in, in, out) is det.
|
|
lcss__build_lcss(N, Thresh, Link, Lcss) :-
|
|
N1 is N+1,
|
|
lcss__build_lcss2(N, N1, Thresh, K),
|
|
( array__semidet_lookup(Link, K, Lcss1) ->
|
|
list__reverse(Lcss1, Lcss)
|
|
;
|
|
Lcss = []
|
|
).
|
|
|
|
% A simple linear search should be sufficient. On
|
|
% "normal" input, the number of changes is expected
|
|
% to be small compared to the size of the file, so
|
|
% a full-blown binary search is not necessary.
|
|
% (Even if this were not the case, this predicate
|
|
% is not the bottleneck at the moment.)
|
|
:- pred lcss__build_lcss2(int, int, array(int), int).
|
|
:- mode lcss__build_lcss2(in, in, in, out) is det.
|
|
lcss__build_lcss2(N, Max, Thresh, K) :-
|
|
( array__lookup(Thresh, N, Max) ->
|
|
N1 is N-1,
|
|
lcss__build_lcss2(N1, Max, Thresh, K)
|
|
;
|
|
K = N
|
|
).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% lcss__to_diff turns the longest common substring
|
|
% of two files into a list of single_diffs.
|
|
:- pred lcss__to_diff(file :: in, file :: in, lcss :: in, diff :: out) is det.
|
|
lcss__to_diff(File1, File2, Lcss, Diff) :-
|
|
file__get_numlines(File1, Length1),
|
|
file__get_numlines(File2, Length2),
|
|
lcss__to_diff2(1, 1, Length1, Length2, Lcss, Diff).
|
|
|
|
:- pred lcss__to_diff2(int, int, int, int, lcss, diff).
|
|
:- mode lcss__to_diff2(in, in, in, in, in, out) is det.
|
|
|
|
% This case is taken if we have run out of matches.
|
|
% If we have not reached the end of both files, then
|
|
% some changes have to be added to reflect this.
|
|
lcss__to_diff2(X, Y, L1, L2, [], Diff) :-
|
|
XLoc is X-1, YLoc is Y-1,
|
|
( X > L1 ->
|
|
( Y > L2 ->
|
|
Diff = []
|
|
;
|
|
Diff = [add(XLoc,Y-L2)]
|
|
)
|
|
;
|
|
( Y > L2 ->
|
|
Diff = [delete(X-L1,YLoc)]
|
|
;
|
|
Diff = [change(X-L1,Y-L2)]
|
|
)
|
|
).
|
|
|
|
% Otherwise we only have to consider the amount
|
|
% of file between the current position and the
|
|
% next match.
|
|
lcss__to_diff2(X, Y, L1, L2, [X2-Y2 | Lcss], Diff) :-
|
|
XLoc is X-1, YLoc is Y-1,
|
|
XEnd is X2-1, YEnd is Y2-1,
|
|
XNext is X2+1, YNext is Y2+1,
|
|
( X = X2 ->
|
|
( Y = Y2 ->
|
|
lcss__to_diff2(XNext, YNext, L1, L2, Lcss, Diff)
|
|
;
|
|
lcss__to_diff2(XNext, YNext, L1, L2, Lcss, Diff1),
|
|
Diff = [ add(XLoc,Y-YEnd) | Diff1 ]
|
|
)
|
|
;
|
|
( Y = Y2 ->
|
|
lcss__to_diff2(XNext, YNext, L1, L2, Lcss, Diff1),
|
|
Diff = [ delete(X-XEnd,YLoc) | Diff1 ]
|
|
;
|
|
lcss__to_diff2(XNext, YNext, L1, L2, Lcss, Diff1),
|
|
Diff = [ change(X-XEnd,Y-YEnd) | Diff1 ]
|
|
)
|
|
).
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
|
|
% This is a quick 'n' dirty version until deep
|
|
% indexing is implemented in the determinism
|
|
% checker.
|
|
:- pred lcss__display_diff(file, file, diff, io__state, io__state).
|
|
:- mode lcss__display_diff(in, in, in, di, uo) is det.
|
|
lcss__display_diff(_, _, []) --> { true }.
|
|
lcss__display_diff(File1, File2, [ SingDiff | Diff ]) -->
|
|
( { SingDiff = add(X, Y1-Y2) },
|
|
lcss__write_command(X, X, 'a', Y1, Y2),
|
|
lcss__show_file(File2, '>', Y1, Y2)
|
|
; { SingDiff = delete(X1-X2, Y) },
|
|
lcss__write_command(X1, X2, 'd', Y, Y),
|
|
lcss__show_file(File1, '<', X1, X2)
|
|
; { SingDiff = change(X1-X2, Y1-Y2) },
|
|
lcss__write_command(X1, X2, 'c', Y1, Y2),
|
|
lcss__show_file(File1, '<', X1, X2),
|
|
io__write_string("---\n"),
|
|
lcss__show_file(File2, '>', Y1, Y2)
|
|
),
|
|
lcss__display_diff(File1, File2, Diff).
|
|
|
|
% lcss__display_diff takes a list of diffs and
|
|
% displays it in the standard diff(1) format.
|
|
% lcss__display_diff(_, _, []) --> { true }.
|
|
% lcss__display_diff(File1, File2, [ add(X, Y1-Y2) | Diff ]) -->
|
|
% lcss__write_command(X, X, 'a', Y1, Y2),
|
|
% lcss__show_file(File2, '>', Y1, Y2),
|
|
% lcss__display_diff(File1, File2, Diff),
|
|
% lcss__display_diff(File1, File2, [ delete(X1-X2, Y) | Diff ]) -->
|
|
% lcss__write_command(X1, X2, 'd', Y, Y),
|
|
% lcss__show_file(File1, '<', X1, X2),
|
|
% lcss__display_diff(File1, File2, Diff).
|
|
% lcss__display_diff(File1, File2, [ change(X1-X2, Y1-Y2) | Diff ]) -->
|
|
% lcss__write_command(X1, X2, 'c', Y1, Y2),
|
|
% lcss__show_file(File1, '<', X1, X2),
|
|
% io__write_string("---\n"),
|
|
% lcss__show_file(File2, '>', Y1, Y2),
|
|
% lcss__display_diff(File1, File2, Diff).
|
|
|
|
:- pred lcss__show_file(file, char, int, int, io__state, io__state).
|
|
|
|
% lcss__show_file shows the portion of the file
|
|
% from Low to High, with each line preceeded by
|
|
% the Prefix characher and a space. The diff(1)
|
|
% format specifies that the lines effected in the
|
|
% first file should be flagged by '<' and the
|
|
% lines effected in the second file should be
|
|
% flagged by '>'.
|
|
:- mode lcss__show_file(in, in, in, in, di, uo) is det.
|
|
lcss__show_file(File, Prefix, Low, High) -->
|
|
( { Low > High } ->
|
|
{ true }
|
|
;
|
|
( { file__get_line(File, Low, Line) } ->
|
|
{ Low1 is Low+1 },
|
|
io__write_char(Prefix),
|
|
io__write_char(' '),
|
|
io__write_string(Line),
|
|
lcss__show_file(File, Prefix, Low1, High)
|
|
;
|
|
{ true }
|
|
)
|
|
).
|
|
|
|
% lcss__write_command displays a diff(1) command.
|
|
% Like ed(1), a pair of numbers which are identical
|
|
% are abbreviated by a single number.
|
|
:- pred lcss__write_command(int, int, char, int, int, io__state, io__state).
|
|
:- mode lcss__write_command(in, in, in, in, in, di, uo) is det.
|
|
lcss__write_command(X1, X2, C, Y1, Y2) -->
|
|
( { X1 = X2 } ->
|
|
io__write_int(X1)
|
|
;
|
|
io__write_int(X1),
|
|
io__write_char(','),
|
|
io__write_int(X2)
|
|
),
|
|
io__write_char(C),
|
|
( { Y1 = Y2 } ->
|
|
io__write_int(Y1)
|
|
;
|
|
io__write_int(Y1),
|
|
io__write_char(','),
|
|
io__write_int(Y2)
|
|
),
|
|
io__write_char('\n').
|
|
|
|
%-----------------------------------------------------------------------------%
|
|
%-----------------------------------------------------------------------------%
|