@@ -20,228 +20,492 @@
** http://www.hwaci.com/drh/
**
*******************************************************************************
**
-** This file contains code used to implement "diff" operators.
+** This file contains code used to compute a "diff" between two
+** text files.
*/
#include "config.h"
#include "diff.h"
#include <assert.h>
+
+#if 0
+#define DEBUG(X) X
+#else
+#define DEBUG(X)
+#endif
+
/*
** Information about each line of a file being diffed.
+**
+** The lower 20 bits of the hash are the length of the
+** line. If any line is longer than 1048575 characters,
+** the file is considered binary.
*/
typedef struct DLine DLine;
struct DLine {
const char *z; /* The text of the line */
unsigned int h; /* Hash of the line */
};
+#define LENGTH_MASK 0x000fffff
+
/*
-** Break a blob into lines by converting each \n into a \000 and
-** creating pointers to the beginning of each line.
+** Return an array of DLine objects containing a pointer to the
+** start of each line and a hash of that line. The lower
+** bits of the hash store the length of each line.
+**
+** Trailing whitespace is removed from each line.
+**
+** Return 0 if the file is binary or contains a line that is
+** longer than 1048575 bytes.
*/
static DLine *break_into_lines(char *z, int *pnLine){
- int nLine, i, j;
+ int nLine, i, j, k, x;
unsigned int h;
DLine *a;
- for(i=0, nLine=1; z[i]; i++){
- if( z[i]=='\n' ) nLine++;
+
+ /* Count the number of lines. Allocate space to hold
+ ** the returned array.
+ */
+ for(i=j=0, nLine=1; z[i]; i++, j++){
+ int c = z[i];
+ if( c==0 ){
+ return 0;
+ }
+ if( c=='\n' && z[i+1]!=0 ){
+ nLine++;
+ if( j>1048575 ){
+ return 0;
+ }
+ j = 0;
+ }
+ }
+ if( j>1048575 ){
+ return 0;
}
a = malloc( nLine*sizeof(a[0]) );
if( a==0 ) fossil_panic("out of memory");
- a[0].z = z;
- for(i=0, j=0, h=0; z[i]; i++){
- if( z[i]=='\n' ){
- a[j].h = h;
- j++;
- a[j].z = &z[i+1];
- z[i] = 0;
- h = 0;
- }else{
- h = h ^ (h<<2) ^ z[i];
+
+ /* Fill in the array */
+ for(i=0; i<nLine; i++){
+ a[i].z = z;
+ for(j=0; z[j] && z[j]!='\n'; j++){}
+ for(k=j; k>0 && isspace(z[k-1]); k--){}
+ for(h=0, x=0; x<k; x++){
+ h = h ^ (h<<2) ^ z[x];
}
+ a[i].h = (h<<20) | k;;
+ z += j+1;
}
- a[j].h = h;
- *pnLine = j+1;
+
+ /* Return results */
+ *pnLine = nLine;
return a;
}
/*
** Return true if two DLine elements are identical.
*/
static int same_dline(DLine *pA, DLine *pB){
- return pA->h==pB->h && strcmp(pA->z,pB->z)==0;
+ return pA->h==pB->h && memcmp(pA->z,pB->z,pA->h & LENGTH_MASK)==0;
+}
+
+/*
+** Append a single line of "diff" output to pOut.
+*/
+static void appendDiffLine(Blob *pOut, char *zPrefix, DLine *pLine){
+ blob_append(pOut, zPrefix, 1);
+ blob_append(pOut, pLine->z, pLine->h & LENGTH_MASK);
+ blob_append(pOut, "\n", 1);
}
+
/*
-** Generate a unified diff of two blobs. The text of the original
-** two blobs is destroyed by the diffing process.
+** Generate a report of the differences between files pA and pB.
+** If pOut is not NULL then a unified diff is appended there. It
+** is assumed that pOut has already been initialized. If pOut is
+** NULL, then a pointer to an array of integers is returned.
+** The integers come in triples. For each triple,
+** the elements are the number of lines copied, the number of
+** lines deleted, and the number of lines inserted. The vector
+** is terminated by a triple of all zeros.
+**
+** This diff utility does not work on binary files. If a binary
+** file is encountered, 0 is returned and pOut is written with
+** text "cannot compute difference between binary files".
+**
+****************************************************************************
+**
+** The core algorithm is a variation on the classic Wagner
+** minimum edit distance with enhancements to reduce the runtime
+** to be almost linear in the common case where the two files
+** have a lot in common. For additional information see
+** Eugene W. Myers, "An O(ND) Difference Algorithm And Its
+** Variations"
+**
+** Consider comparing strings A and B. A=abcabba and B=cbabac
+** We construct a "Wagner" matrix W with A along the X axis and
+** B along the Y axis:
+**
+** c 6 *
+** a 5 *
+** b 4 * *
+** a 3 *
+** b 2 *
+** B c 1 *
+** 0 * * *
+** 0 1 2 3 4 5 6 7
+** a b c a b b a
+** A
+**
+** (Note: we draw this Wagner matrix with the origin at the lower
+** left whereas Myers uses the origin at the upper left. Otherwise,
+** they are the same.)
+**
+** Let Y be the maximum y value or the number of characters in B.
+** 6 in this example. X is the maximum x value or the number of
+** elements in A. Here 7.
+**
+** Our goal is to find a path from (0,0) to (X,Y). The path can
+** use horizontal, vertical, or diagonal steps. A diagonal from
+** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y]. A vertical
+** steps corresponds to an insertion. A horizontal step corresponds
+** to a deletion. We want to find the path with the fewest
+** horizontal and vertical steps.
+**
+** Diagonal k consists of all points such that x-y==k. Diagonal
+** zero begins at the origin. Diagonal 1 begins at (1,0).
+** Diagonal -1 begins at (0,1). All diagonals move up and to the
+** right at 45 degrees. Diagonal number increase from upper left
+** to lower right.
+**
+** Myers matrix M is a lower right triangular matrix with indices d
+** along the bottom and i vertically:
+**
+**
+** i=4 | +4 \
+** 3 | +3 +2 |
+** 2 | +2 +1 0 |- k values. k = 2*i-d
+** 1 | +1 0 -1 -2 |
+** 0 | 0 -1 -2 -3 -4 /
+** ---------------
+** 0 1 2 3 4 = d
+**
+** Each element of the Myers matrix corresponds to a diagonal.
+** The diagonal is k=2*i-d. The diagonal values are shown
+** in the template above.
+**
+** Each entry in M represents the end-point on a path from (0,0).
+** The end-point is on diagonal k. The value stored in M is
+** q=x+y where (x,y) is the terminus of the path. A path
+** to M[d][i] will come through either M[d-1][i-1] or
+** though M[d-1][i], whichever holds the largest value of q.
+** If both elements hold the same value, the path comes
+** through M[d-1][i-1].
+**
+** The value of d is the number of insertions and deletions
+** made so far on the path. M grows progressively. So the
+** size of the M matrix is proportional to d*d. For the
+** common case where A and B are similar, d will be small
+** compared to X and Y so little memory is required. The
+** original Wagner algorithm requires X*Y memory, which for
+** larger files (100K lines) is more memory than we have at
+** hand.
*/
-void unified_diff(Blob *pA, Blob *pB, int nContext, Blob *pOut){
- DLine *pDA, *pDB, *A, *B;
- int nA, nB, nAp1;
- int x, y;
- int cnt;
- int i, iStart;
- int *m;
+int *text_diff(
+ Blob *pA_Blob, /* FROM file */
+ Blob *pB_Blob, /* TO file */
+ Blob *pOut, /* Write unified diff here if not NULL */
+ int nContext /* Amount of context to unified diff */
+){
+ DLine *A, *B; /* Files being compared */
+ int X, Y; /* Number of elements in A and B */
+ int x, y; /* Indices: A[x] and B[y] */
+ int szM = 0; /* Number of rows and columns in M */
+ int **M = 0; /* Myers matrix */
+ int i, d; /* Indices on M. M[d][i] */
+ int k, q; /* Diagonal number and distinct from (0,0) */
+ int K, D; /* The diagonal and d for the final solution */
+ int *R = 0; /* Result vector */
+ int r; /* Loop variables */
+ int go = 1; /* Outer loop control */
+ int MAX; /* Largest of X and Y */
/* Break the two files being diffed into individual lines.
** Compute hashes on each line for fast comparison.
*/
- pDA = break_into_lines(blob_str(pA), &nA);
- pDB = break_into_lines(blob_str(pB), &nB);
-
- /* Remove common prefix and suffix to help reduce the value
- ** of N in the O(N^2) minimum edit distance algorithm.
- */
- for(i=0; i<nA && i<nB && same_dline(&pDA[i],&pDB[i]); i++){}
- i -= nContext;
- if( i<0 ) i = 0;
- iStart = i;
- A = &pDA[iStart];
- B = &pDB[iStart];
- nA -= iStart;
- nB -= iStart;
- for(i=1; i<nA && i<nB && same_dline(&A[nA-i],&B[nB-i]); i++){}
- i -= nContext;
- if( i<1 ) i = 1;
- i--;
- nA -= i;
- nB -= i;
-
- /* Create the matrix used for the minimum edit distance
- ** calculation.
- */
- nAp1 = nA + 1;
- m = malloc( sizeof(m[0])*(nB+1)*nAp1 );
-# define M(X,Y) m[(Y)*nAp1+(X)]
-
-
- /* Find the minimum edit distance using Wagner's algorithm.
- */
- for(x=0; x<=nA; x++){
- M(x,0) = x;
+ A = break_into_lines(blob_str(pA_Blob), &X);
+ B = break_into_lines(blob_str(pB_Blob), &Y);
+
+ if( A==0 || B==0 ){
+ free(A);
+ free(B);
+ if( pOut ){
+ blob_appendf(pOut, "cannot compute difference between binary files\n");
+ }
+ return 0;
+ }
+
+ szM = 0;
+ MAX = X>Y ? X : Y;
+ if( MAX>2000 ) MAX = 2000;
+ for(d=0; go && d<=MAX; d++){
+ if( szM<d+1 ){
+ szM += szM + 10;
+ M = realloc(M, sizeof(M[0])*szM);
+ if( M==0 ){
+ fossil_panic("out of memory");
+ }
+ }
+ M[d] = malloc( sizeof(M[d][0])*(d+1) );
+ if( M[d]==0 ){
+ fossil_panic("out of memory");
+ }
+ for(i=0; i<=d; i++){
+ k = 2*i - d;
+ if( d==0 ){
+ q = 0;
+ }else if( i==0 ){
+ q = M[d-1][0];
+ }else if( i<d-1 && M[d-1][i-1] < M[d-1][i] ){
+ q = M[d-1][i];
+ }else{
+ q = M[d-1][i-1];
+ }
+ x = (k + q + 1)/2;
+ y = x - k;
+ if( x<0 || x>X || y<0 || y>Y ){
+ x = y = 0;
+ }else{
+ while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; }
+ }
+ M[d][i] = x + y;
+ DEBUG( printf("M[%d][%i] = %d k=%d (%d,%d)\n", d, i, x+y, k, x, y); )
+ if( x==X && y==Y ){
+ go = 0;
+ break;
+ }
+ }
}
- for(y=0; y<=nB; y++){
- M(0,y) = y;
- }
- for(x=1; x<=nA; x++){
- for(y=1; y<=nB; y++){
- int e = M(x-1,y) + 1;
- if( e>M(x,y-1)+1 ){
- e = M(x,y-1)+1;
+ if( d>MAX ){
+ R = malloc( sizeof(R[0])*7 );
+ R[0] = 0;
+ R[1] = X;
+ R[2] = Y;
+ R[3] = 0;
+ R[4] = 0;
+ R[5] = 0;
+ R[6] = 0;
+ }else{
+ /* Reuse M[] as follows:
+ **
+ ** M[d][1] = 1 if a line is inserted or 0 if a line is deleted.
+ ** M[d][0] = number of lines copied after the ins or del above.
+ **
+ */
+ D = d - 1;
+ K = X - Y;
+ for(d=D, i=(K+D)/2; d>0; d--){
+ DEBUG( printf("d=%d i=%d\n", d, i); )
+ if( i==d || (i>0 && M[d-1][i-1] > M[d-1][i]) ){
+ M[d][0] = M[d][i] - M[d-1][i-1] - 1;
+ M[d][1] = 0;
+ i--;
+ }else{
+ M[d][0] = M[d][i] - M[d-1][i] - 1;
+ M[d][1] = 1;
+ }
+ }
+
+ DEBUG(
+ printf("---------------\nM[0][0] = %5d\n", M[0][0]);
+ for(d=1; d<=D; d++){
+ printf("M[%d][0] = %5d M[%d][1] = %d\n",d,M[d][0],d,M[d][1]);
+ }
+ )
+
+ /* Allocate the output vector
+ */
+ R = malloc( sizeof(R[0])*(D+2)*3 );
+ if( R==0 ){
+ fossil_fatal("out of memory");
+ }
+
+ /* Populate the output vector
+ */
+ d = r = 0;
+ while( d<=D ){
+ int n;
+ R[r++] = M[d++][0]/2; /* COPY */
+ if( d>D ){
+ R[r++] = 0;
+ R[r++] = 0;
+ break;
}
- if( e<=M(x-1,y-1) ){
- M(x,y) = e;
- }else if( same_dline(&A[x-1], &B[y-1]) ){
- M(x,y) = M(x-1,y-1);
+ if( M[d][1]==0 ){
+ n = 1;
+ while( M[d][0]==0 && d<D && M[d+1][1]==0 ){
+ d++;
+ n++;
+ }
+ R[r++] = n; /* DELETE */
+ if( d==D || M[d][0]>0 ){
+ R[r++] = 0; /* INSERT */
+ continue;
+ }
+ d++;
}else{
- M(x,y) = e;
+ R[r++] = 0; /* DELETE */
+ }
+ assert( M[d][1]==1 );
+ n = 1;
+ while( M[d][0]==0 && d<D && M[d+1][1]==1 ){
+ d++;
+ n++;
}
+ R[r++] = n; /* INSERT */
}
+ R[r++] = 0;
+ R[r++] = 0;
+ R[r++] = 0;
+ }
+
+ /* Free the Myers matrix */
+ for(d=0; d<=D; d++){
+ free(M[d]);
}
+ free(M);
- /* Walk backwards through the Wagner algorithm matrix to determine
- ** the specific edits that give the minimum edit distance. Mark our
- ** path through the matrix with -1.
+ /* If pOut is defined, construct a unified diff into pOut and
+ ** delete R
*/
- x = nA;
- y = nB;
- while( x>0 || y>0 ){
- int v = M(x,y);
- M(x,y) = -1;
- if( x==0 ){
- y--;
- }else if( y==0 ){
- x--;
- }else if( M(x,y-1)+1==v ){
- y--;
- }else if( M(x-1,y)+1==v ){
- x--;
- }else{
- x--;
- y--;
- }
- }
+ if( pOut ){
+ int a = 0; /* Index of next line in A[] */
+ int b = 0; /* Index of next line in B[] */
+ int nr; /* Number of COPY/DELETE/INSERT triples to process */
+ int mxr; /* Maximum value for r */
+ int na, nb; /* Number of lines shown from A and B */
+ int i, j; /* Loop counters */
+ int m; /* Number of lines to output */
+ int skip; /* Number of lines to skip */
+
+ for(mxr=0; R[mxr+1] || R[mxr+2] || R[mxr+3]; mxr += 3){}
+ for(r=0; r<mxr; r += 3*nr){
+ /* Figure out how many triples to show in a single block */
+ for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){}
+ DEBUG( printf("r=%d nr=%d\n", r, nr); )
+
+ /* For the current block comprising nr triples, figure out
+ ** how many lines of A and B are to be displayed
+ */
+ if( R[r]>nContext ){
+ na = nb = nContext;
+ skip = R[r] - nContext;
+ }else{
+ na = nb = R[r];
+ skip = 0;
+ }
+ for(i=0; i<nr; i++){
+ na += R[r+i*3+1];
+ nb += R[r+i*3+2];
+ }
+ if( R[r+nr*3]>nContext ){
+ na += nContext;
+ nb += nContext;
+ }else{
+ na += R[r+nr*3];
+ nb += R[r+nr*3];
+ }
+ for(i=1; i<nr; i++){
+ na += R[r+i*3];
+ nb += R[r+i*3];
+ }
+ blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb);
-#if 0
-for(y=0; y<=nB; y++){
- for(x=0; x<=nA; x++){
- printf(" %2d", M(x,y));
- }
- printf("\n");
-}
-#endif
+ /* Show the initial common area */
+ a += skip;
+ b += skip;
+ m = R[r] - skip;
+ for(j=0; j<m; j++){
+ appendDiffLine(pOut, " ", &A[a+j]);
+ }
+ a += m;
+ b += m;
- x = y = 0;
- cnt = nContext;
- while( x<nA || y<nB ){
- int t1, t2;
- if( (t1 = M(x+1,y))<0 || (t2 = M(x,y+1))<0 ){
- if( cnt>=nContext ){
- blob_appendf(pOut, "@@ -%d +%d @@\n",
- x-nContext+iStart+2, y-nContext+iStart+2);
- for(i=x-nContext+1; i<x; i++){
- if( i<0 ) continue;
- blob_appendf(pOut, " %s\n", A[i].z);
+ /* Show the differences */
+ for(i=0; i<nr; i++){
+ m = R[r+i*3+1];
+ for(j=0; j<m; j++){
+ appendDiffLine(pOut, "-", &A[a+j]);
+ }
+ a += m;
+ m = R[r+i*3+2];
+ for(j=0; j<m; j++){
+ appendDiffLine(pOut, "+", &B[b+j]);
+ }
+ b += m;
+ if( i<nr-1 ){
+ m = R[r+i*3+3];
+ for(j=0; j<m; j++){
+ appendDiffLine(pOut, " ", &B[b+j]);
+ }
+ b += m;
+ a += m;
}
}
- }
- if( t1<0 ){
- blob_appendf(pOut, "-%s\n", A[x].z);
- x++;
- cnt = 0;
- }else if( t2<0 ){
- blob_appendf(pOut, "+%s\n", B[y].z);
- y++;
- cnt = 0;
- }else{
- if( M(x+1,y+1)==(-1) && cnt<nContext ){
- blob_appendf(pOut, " %s\n", A[x].z);
+
+ /* Show the final common area */
+ assert( nr==i );
+ m = R[r+nr*3];
+ if( m>nContext ) m = nContext;
+ for(j=0; j<m; j++){
+ appendDiffLine(pOut, " ", &B[b+j]);
}
- cnt++;
- x++;
- y++;
}
+ free(R);
+ R = 0;
}
- /* Cleanup allocationed memory */
- free(m);
- free(pDA);
- free(pDB);
+ /* We no longer need the A[] and B[] vectors */
+ free(A);
+ free(B);
+
+ /* Return the result */
+ return R;
+}
+
+/*
+** COMMAND: test-rawdiff
+*/
+void test_rawdiff_cmd(void){
+ Blob a, b;
+ int r;
+ int i;
+ int *R;
+ if( g.argc<4 ) usage("FILE1 FILE2 ...");
+ blob_read_from_file(&a, g.argv[2]);
+ for(i=3; i<g.argc; i++){
+ if( i>3 ) printf("-------------------------------\n");
+ blob_read_from_file(&b, g.argv[i]);
+ R = text_diff(&a, &b, 0, 0);
+ for(r=0; R[r] || R[r+1] || R[r+2]; r += 3){
+ printf(" copy %4d delete %4d insert %4d\n", R[r], R[r+1], R[r+2]);
+ }
+ /* free(R); */
+ blob_reset(&b);
+ }
}
/*
-** COMMAND: test-diff
+** COMMAND: test-udiff
*/
-void test_diff_cmd(void){
+void test_udiff_cmd(void){
Blob a, b, out;
if( g.argc!=4 ) usage("FILE1 FILE2");
blob_read_from_file(&a, g.argv[2]);
blob_read_from_file(&b, g.argv[3]);
blob_zero(&out);
- unified_diff(&a, &b, 4, &out);
- blob_reset(&a);
- blob_reset(&b);
- printf("%s", blob_str(&out));
- blob_reset(&out);
-}
-/*
-** COMMAND: test-uuid-diff
-*/
-void test_uuiddiff_cmd(void){
- Blob a, b, out;
- int ridA, ridB;
- if( g.argc!=4 ) usage("UUID2 UUID1");
- db_must_be_within_tree();
- ridA = name_to_rid(g.argv[2]);
- content_get(ridA, &a);
- ridB = name_to_rid(g.argv[3]);
- content_get(ridB, &b);
- blob_zero(&out);
- unified_diff(&a, &b, 4, &out);
- blob_reset(&a);
- blob_reset(&b);
- printf("%s", blob_str(&out));
- blob_reset(&out);
+ text_diff(&a, &b, &out, 3);
+ blob_write_to_file(&out, "-");
}