mercury/samples/diff/lcss.m

%-----------------------------------------------------------------------------%
% Copyright (C) 1995 University of Melbourne.
% This file may only be copied under the terms of the GNU General
% Public License - see the file COPYING in the Mercury distribution.
%-----------------------------------------------------------------------------%

:- module lcss.
% Main author: bromage

:- interface.
:- import_module io, file.

	% lcss__show_diff takes two files and displays their
	% differences on the output stream.
:- pred lcss__show_diff(file, file, io__state, io__state).
:- mode lcss__show_diff(in, in, di, out) is det.

%-----------------------------------------------------------------------------%

:- implementation.
:- import_module map, require, std_util, int, list, char, array.

	% The longest common substring of two files can be
	% represented as an ordered list of "matches".  A
	% match is a pair of the form I-J where I is the
	% number of a line in file 1 and J is the number of
	% a line in file 2.
:- type lcss == list(pair(int,int)).

	% A single diff entry is an addition, a deletion or
	% a change.
:- type single_diff --->
		add(int, pair(int,int))
	;	delete(pair(int,int), int)
	;	change(pair(int,int), pair(int,int)).

	% The complete diff file is a list of single
	% differences.
:- type diff == list(single_diff).

%-----------------------------------------------------------------------------%

	% The process to "diff" two files is:
	%
	%	- Identify the longest common subsequence
	%	  in the files.
	%
	%	- Use this information to determine the
	%	  set of operations required to convert
	%	  one file to the other.
	%
	%	- Write this information to the output stream.
lcss__show_diff(File1, File2) -->
	{ lcss__find_lcss(File1, File2, Lcss) },
	{ lcss__to_diff(File1, File2, Lcss, Diff) },
	lcss__display_diff(File1, File2, Diff).

%-----------------------------------------------------------------------------%

	% For debugging only.  Will be removed in the
	% final version.
:- pred lcss__show_lcss(lcss :: in, io__state :: di, io__state :: uo) is det.
lcss__show_lcss([]) -->
	io__write_string("[]").
lcss__show_lcss([ X-Y | Lcss ]) -->
	io__write_int(X),
	io__write_char('-'),
	io__write_int(Y),
	io__write_char(','),
	lcss__show_lcss(Lcss).

%-----------------------------------------------------------------------------%

	% Find the longest common substring
:- pred lcss__find_lcss(file :: in, file :: in, lcss :: out) is det.
lcss__find_lcss(File1, File2, Lcss) :-
	file__get_numlines(File1, L1),
	file__get_numlines(File2, L2),

	% Set N to the size of the larger of File1 or File2,
	% let F1 be the larger file and F2 be the smaller file.

	( L1 >= L2 ->
	    N = L1, F1 = File1, F2 = File2
	;
	    N = L2, F2 = File1, F1 = File2
	),

	% Calculate the LCSS

	lcss__build_matchlist(F1, F2, MatchList),
	lcss__build_thresh(N, MatchList, Thresh, Link),
	lcss__build_lcss(N, Thresh, Link, Lcss1),

	% If we mapped File1 |-> F2, File2 |-> F1 then we
	% need to swap all the entries in the Lcss.

	( L1 >= L2 ->
	    Lcss = Lcss1
	;
	    lcss__swap(Lcss1, Lcss)
	).

:- pred lcss__swap(lcss :: in, lcss :: out) is det.
lcss__swap([], []).
lcss__swap([I-J | RestIn], [J-I | RestOut]) :-
	lcss__swap(RestIn, RestOut).

%-----------------------------------------------------------------------------%

	% The matchlist is the set of all matchings (I,J)
	% such that F1[I] = F2[J].  It is stored as a list
	% of list of integers where the Ith element of
	% the list is the list of all J such that F1[I]=F2[J].
	% This list should be increasing order.
:- pred lcss__build_matchlist(file, file, list(list(int))).
:- mode lcss__build_matchlist(in, in, out) is det.
lcss__build_matchlist(File1, File2, MatchList) :-

	% First, invert File2.  The inverted file is a
	% mapping from strings to lists of integers where
	% a given string maps to the list of lines in File2
	% which match that string.

	file__to_list(File2, File2list),
	lcss__build_match_map(1, File2list, Map),

	% Now match each line in File1 with those in File2.

	file__to_list(File1, File1list),
	lcss__match_map_to_matchlist(File1list, Map, MatchList).

:- pred lcss__build_match_map(int, list(string), map(string,list(int))).
:- mode lcss__build_match_map(in, in, out) is det.
lcss__build_match_map(_, [], Map) :-
	map__init(Map).
lcss__build_match_map(N, [S | Ss], MapOut) :-
	N1 is N+1,
	lcss__build_match_map(N1, Ss, MapIn),
	( map__search(MapIn, S, Ns0) ->
	    list__append(Ns0, [N], Ns1)
	;
	    Ns1 = [ N ]
	),
	map__set(MapIn, S, Ns1, MapOut).

:- pred lcss__match_map_to_matchlist(list(string), map(string,list(int)),
		list(list(int))).
:- mode lcss__match_map_to_matchlist(in, in, out) is det.
lcss__match_map_to_matchlist([], _, []).
lcss__match_map_to_matchlist([S | Ss], Map, [M | Ms]) :-
	lcss__match_map_to_matchlist(Ss, Map, Ms),
	( map__search(Map, S, Ns0) ->
	    M = Ns0
	;
	    M = []
	).

%-----------------------------------------------------------------------------%

	% This is the heart of the lcss procedure.  The
	% algorithm maintains two arrays, Thresh and Link.
	% Thresh[I] is defined as the length of the longest
	% common substring found so far which terminates at
	% position I in File1.  Link[I] is this actual
	% substring stored in reverse.
	%
	% The special value N+1 is used to denote that no
	% substring has been found that terminates at that
	% position.
:- pred lcss__build_thresh(int, list(list(int)), array(int),
		array(lcss)).
:- mode lcss__build_thresh(in, in, out, out) is det.
lcss__build_thresh(N, MatchList, Thresh, Link) :-
	N1 is N+1,
	array__init(0, N, N1, Thresh0),
	array__set(Thresh0, 0, 0, Thresh1),
	array__init(0, N, [], Link1),
	lcss__build_thresh2(1, N, MatchList, Thresh1, Link1, Thresh, Link).

:- pred lcss__build_thresh2(int, int, list(list(int)),
		array(int), array(lcss),
		array(int), array(lcss)).
:- mode lcss__build_thresh2(in, in, in, in, in, out, out) is det.
lcss__build_thresh2(I, N, MatchList, Thresh0, Link0, Thresh1, Link1) :-
	( I > N ->
	    % If we have stepped past the end of the file, we
	    % have finished.
	    Thresh0=Thresh1, Link0=Link1
	; MatchList = [ Matches | MatchRest ] ->
	    % Otherwise step through each match in this MatchList
	    % entry, and arrach

	    I1 is I+1,
	    lcss__build_thresh3(N, I, Matches, Thresh0, Link0, Thresh2, Link2),
	    lcss__build_thresh2(I1, N, MatchRest, Thresh2, Link2,
			Thresh1, Link1)
	;
	    error("lcss__build_thresh2")
	).

:- pred lcss__build_thresh3(int, int, list(int),
		array(int), array(lcss),
	 	array(int), array(lcss)).
:- mode lcss__build_thresh3(in, in, in, in, in, out, out) is det.
lcss__build_thresh3(_, _, [], Thresh, Link, Thresh, Link).
lcss__build_thresh3(N, I, [ J | Js ], Thresh0, Link0, Thresh1, Link1) :-
	% Find which Thresh entry we should attach this match to.
	lcss__build_thresh4(0, N, J, K, Thresh0),
	array__lookup(Thresh0, K, ThreshK),
	% Is the found Thresh entry shorter than that which would
	% be obtained by using this?
	( J < ThreshK ->
	    % Yes, so make this match part of a new entry.
	    K1 is K-1,
	    array__set(Thresh0, K, J, Thresh2),
	    array__lookup(Link0, K1, LinkK1),
	    array__set(Link0, K, [ I-J | LinkK1 ], Link2)
	;
	    % Otherwise forget it.
	    Link0 = Link2, Thresh0 = Thresh2
	),
	lcss__build_thresh3(N, I, Js, Thresh2, Link2, Thresh1, Link1).

	% lcss__build_thresh4 performs a binary search
	% through Thresh to find the value of K such
	% that Thresh[K-1] < J =< Thresh[K].
:- pred lcss__build_thresh4(int, int, int, int, array(int)).
:- mode lcss__build_thresh4(in, in, in, out, in) is det.
lcss__build_thresh4(Lo, Hi, J, K, Thresh) :-
	Width is Hi-Lo,
	( Width < 1 ->
	    error("lcss__build_thresh4")
	; Width = 1 ->
	    K = Hi
	;
	    % Use the middle element of the range.
	    Mid is (Lo + Hi) // 2,
	    array__lookup(Thresh, Mid, ThreshMid),
	    ( ThreshMid < J ->
		lcss__build_thresh4(Mid, Hi, J, K, Thresh)
	    ;
		lcss__build_thresh4(Lo, Mid, J, K, Thresh)
	    )
	).

%-----------------------------------------------------------------------------%

	% Now that we have the array Thresh, it is a simple
	% exercise to recover the Lcss: Simply find the
	% largesst value of K such that Thresh[K] < N+1
	% and Link[K] should be the Lcss in reverse.
:- pred lcss__build_lcss(int, array(int), array(lcss), lcss).
:- mode lcss__build_lcss(in, in, in, out) is det.
lcss__build_lcss(N, Thresh, Link, Lcss) :-
	N1 is N+1,
	lcss__build_lcss2(N, N1, Thresh, K),
	( array__semidet_lookup(Link, K, Lcss1) ->
	    list__reverse(Lcss1, Lcss)
	;
	    Lcss = []
	).

	% A simple linear search should be sufficient.  On
	% "normal" input, the number of changes is expected
	% to be small compared to the size of the file, so
	% a full-blown binary search is not necessary.
	% (Even if this were not the case, this predicate
	% is not the bottleneck at the moment.)
:- pred lcss__build_lcss2(int, int, array(int), int).
:- mode lcss__build_lcss2(in, in, in, out) is det.
lcss__build_lcss2(N, Max, Thresh, K) :-
	( array__lookup(Thresh, N, Max) ->
	    N1 is N-1,
	    lcss__build_lcss2(N1, Max, Thresh, K)
	;
	    K = N
	).

%-----------------------------------------------------------------------------%

	% lcss__to_diff turns the longest common substring
	% of two files into a list of single_diffs.
:- pred lcss__to_diff(file :: in, file :: in, lcss :: in, diff :: out) is det.
lcss__to_diff(File1, File2, Lcss, Diff) :-
	file__get_numlines(File1, Length1),
	file__get_numlines(File2, Length2),
	lcss__to_diff2(1, 1, Length1, Length2, Lcss, Diff).

:- pred lcss__to_diff2(int, int, int, int, lcss, diff).
:- mode lcss__to_diff2(in, in, in, in, in, out) is det.

	% This case is taken if we have run out of matches.
	% If we have not reached the end of both files, then
	% some changes have to be added to reflect this.
lcss__to_diff2(X, Y, L1, L2, [], Diff) :-
	XLoc is X-1, YLoc is Y-1,
	( X > L1 ->
	    ( Y > L2 ->
		Diff = []
	    ;
		Diff = [add(XLoc,Y-L2)]
	    )
	;
	    ( Y > L2 ->
		Diff = [delete(X-L1,YLoc)]
	    ;
		Diff = [change(X-L1,Y-L2)]
	    )
	).

	% Otherwise we only have to consider the amount
	% of file between the current position and the
	% next match.
lcss__to_diff2(X, Y, L1, L2, [X2-Y2 | Lcss], Diff) :-
	XLoc is X-1,   YLoc is Y-1,
	XEnd is X2-1,  YEnd is Y2-1,
	XNext is X2+1, YNext is Y2+1,
	( X = X2 ->
	    ( Y = Y2 ->
		lcss__to_diff2(XNext, YNext, L1, L2, Lcss, Diff)
	    ;
		lcss__to_diff2(XNext, YNext, L1, L2, Lcss, Diff1),
		Diff = [ add(XLoc,Y-YEnd) | Diff1 ]
	    )
	;
	    ( Y = Y2 ->
		lcss__to_diff2(XNext, YNext, L1, L2, Lcss, Diff1),
		Diff = [ delete(X-XEnd,YLoc) | Diff1 ]
	    ;
		lcss__to_diff2(XNext, YNext, L1, L2, Lcss, Diff1),
		Diff = [ change(X-XEnd,Y-YEnd) | Diff1 ]
	    )
	).

%-----------------------------------------------------------------------------%

	% This is a quick 'n' dirty version until deep
	% indexing is implemented in the determinism
	% checker.
:- pred lcss__display_diff(file, file, diff, io__state, io__state).
:- mode lcss__display_diff(in, in, in, di, uo) is det.
lcss__display_diff(_, _, []) --> { true }.
lcss__display_diff(File1, File2, [ SingDiff | Diff ]) -->
	( { SingDiff = add(X, Y1-Y2) },
	    lcss__write_command(X, X, 'a', Y1, Y2),
	    lcss__show_file(File2, '>', Y1, Y2)
	; { SingDiff = delete(X1-X2, Y) },
	    lcss__write_command(X1, X2, 'd', Y, Y),
	    lcss__show_file(File1, '<', X1, X2)
	; { SingDiff = change(X1-X2, Y1-Y2) },
	    lcss__write_command(X1, X2, 'c', Y1, Y2),
	    lcss__show_file(File1, '<', X1, X2),
	    io__write_string("---\n"),
	    lcss__show_file(File2, '>', Y1, Y2)
	),
	lcss__display_diff(File1, File2, Diff).

	% lcss__display_diff takes a list of diffs and
	% displays it in the standard diff(1) format.
% lcss__display_diff(_, _, []) --> { true }.
% lcss__display_diff(File1, File2, [ add(X, Y1-Y2) | Diff ]) -->
% 	lcss__write_command(X, X, 'a', Y1, Y2),
%	lcss__show_file(File2, '>', Y1, Y2),
% 	lcss__display_diff(File1, File2, Diff),
% lcss__display_diff(File1, File2, [ delete(X1-X2, Y) | Diff ]) -->
% 	lcss__write_command(X1, X2, 'd', Y, Y),
%	lcss__show_file(File1, '<', X1, X2),
% 	lcss__display_diff(File1, File2, Diff).
% lcss__display_diff(File1, File2, [ change(X1-X2, Y1-Y2) | Diff ]) -->
% 	lcss__write_command(X1, X2, 'c', Y1, Y2),
%	lcss__show_file(File1, '<', X1, X2),
%	io__write_string("---\n"),
%	lcss__show_file(File2, '>', Y1, Y2),
% 	lcss__display_diff(File1, File2, Diff).

:- pred lcss__show_file(file, char, int, int, io__state, io__state).

	% lcss__show_file shows the portion of the file
	% from Low to High, with each line preceeded by
	% the Prefix characher and a space.  The diff(1)
	% format specifies that the lines effected in the
	% first file should be flagged by '<' and the
	% lines effected in the second file should be
	% flagged by '>'.
:- mode lcss__show_file(in, in, in, in, di, uo) is det.
lcss__show_file(File, Prefix, Low, High) -->
	( { Low > High } ->
	    { true }
	;
	    ( { file__get_line(File, Low, Line) } ->
	    	{ Low1 is Low+1 },
	    	io__write_char(Prefix),
	    	io__write_char(' '),
		io__write_string(Line),
	    	lcss__show_file(File, Prefix, Low1, High)
	    ;
		{ true }
	    )
	).

	% lcss__write_command displays a diff(1) command.
	% Like ed(1), a pair of numbers which are identical
	% are abbreviated by a single number.
:- pred lcss__write_command(int, int, char, int, int, io__state, io__state).
:- mode lcss__write_command(in, in, in, in, in, di, uo) is det.
lcss__write_command(X1, X2, C, Y1, Y2) -->
	( { X1 = X2 } ->
	    io__write_int(X1)
	;
	    io__write_int(X1),
	    io__write_char(','),
	    io__write_int(X2)
	),
	io__write_char(C),
	( { Y1 = Y2 } ->
	    io__write_int(Y1)
	;
	    io__write_int(Y1),
	    io__write_char(','),
	    io__write_int(Y2)
	),
	io__write_char('\n').

%-----------------------------------------------------------------------------%
%-----------------------------------------------------------------------------%