@@ -300,13 +300,13 @@
** of lines in these two blocks that are exactly the same. Return
** the bounds of the matching sequence.
*/
static void longestCommonSequence(
- DContext *p,
- int iS1, int iE1,
- int iS2, int iE2,
- int *piSX, int *piEX,
- int *piSY, int *piEY
+ DContext *p, /* Two files being compared */
+ int iS1, int iE1, /* Range of lines in p->aFrom[] */
+ int iS2, int iE2, /* Range of lines in p->aTo[] */
+ int *piSX, int *piEX, /* Write p->aFrom[] common segment here */
+ int *piSY, int *piEY /* Write p->aTo[] common segment here */
){
double bestScore = -1e30; /* Best score seen so far */
int i, j; /* Loop counters */
int iSX, iSY, iEX, iEY; /* Current match */
@@ -380,19 +380,27 @@
** Do a single step in the difference. Compute a sequence of
** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
** the input into lines iS2 through iE2-1 of the output and write
** that sequence into the difference context.
+**
+** The algorithm is to find a block of common text near the middle
+** of the two segments being diffed. Then recursively compute
+** differences on the blocks before and after that common segment.
+** Special cases apply if either input segment is empty or if the
+** two segments have no text in common.
*/
static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){
int iSX, iEX, iSY, iEY;
if( iE1<=iS1 ){
+ /* The first segment is empty */
if( iE2>iS2 ){
appendTriple(p, 0, 0, iE2-iS2);
}
return;
}
if( iE2<=iS2 ){
+ /* The second segment is empty */
appendTriple(p, 0, iE1-iS1, 0);
return;
}
@@ -399,22 +407,36 @@
/* Find the longest matching segment between the two sequences */
longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
if( iEX>iSX ){
- /* Recursively diff either side of the matching segment */
+ /* A common segement has been found.
+ ** Recursively diff either side of the matching segment */
diff_step(p, iS1, iSX, iS2, iSY);
if( iEX>iSX ){
appendTriple(p, iEX - iSX, 0, 0);
}
diff_step(p, iEX, iE1, iEY, iE2);
}else{
+ /* The two segments have nothing in common. Delete the first then
+ ** insert the second. */
appendTriple(p, 0, iE1-iS1, iE2-iS2);
}
}
/*
** Compute the differences between two files already loaded into
** the DContext structure.
+**
+** A divide and conquer technique is used. We look for a large
+** block of common text that is in the middle of both files. Then
+** compute the difference on those parts of the file before and
+** after the common block. This technique is fast, but it does
+** not necessarily generate the minimum difference set. On the
+** other hand, we do not need a minimum difference set, only one
+** that makes sense to human readers, which this algorithm does.
+**
+** Any common text at the beginning and end of the two files is
+** removed before starting the divide-and-conquer algorithm.
*/
static void diff_all(DContext *p){
int mnE, iS, iE1, iE2;
@@ -499,365 +521,8 @@
free(c.aTo);
return c.aEdit;
}
}
-
-#if 0 /********** Disabled and replaced by code above ************/
-
-/*
-** Generate a report of the differences between files pA and pB.
-** If pOut is not NULL then a unified diff is appended there. It
-** is assumed that pOut has already been initialized. If pOut is
-** NULL, then a pointer to an array of integers is returned.
-** The integers come in triples. For each triple,
-** the elements are the number of lines copied, the number of
-** lines deleted, and the number of lines inserted. The vector
-** is terminated by a triple of all zeros.
-**
-** This diff utility does not work on binary files. If a binary
-** file is encountered, 0 is returned and pOut is written with
-** text "cannot compute difference between binary files".
-**
-****************************************************************************
-**
-** The core algorithm is a variation on the classic Wagner
-** minimum edit distance with enhancements to reduce the runtime
-** to be almost linear in the common case where the two files
-** have a lot in common. For additional information see
-** Eugene W. Myers, "An O(ND) Difference Algorithm And Its
-** Variations"
-**
-** Consider comparing strings A and B. A=abcabba and B=cbabac
-** We construct a "Wagner" matrix W with A along the X axis and
-** B along the Y axis:
-**
-** c 6 *
-** a 5 *
-** b 4 * *
-** a 3 *
-** b 2 *
-** B c 1 *
-** 0 * * *
-** 0 1 2 3 4 5 6 7
-** a b c a b b a
-** A
-**
-** (Note: we draw this Wagner matrix with the origin at the lower
-** left whereas Myers uses the origin at the upper left. Otherwise,
-** they are the same.)
-**
-** Let Y be the maximum y value or the number of characters in B.
-** 6 in this example. X is the maximum x value or the number of
-** elements in A. Here 7.
-**
-** Our goal is to find a path from (0,0) to (X,Y). The path can
-** use horizontal, vertical, or diagonal steps. A diagonal from
-** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y]. A vertical
-** steps corresponds to an insertion. A horizontal step corresponds
-** to a deletion. We want to find the path with the fewest
-** horizontal and vertical steps.
-**
-** Diagonal k consists of all points such that x-y==k. Diagonal
-** zero begins at the origin. Diagonal 1 begins at (1,0).
-** Diagonal -1 begins at (0,1). All diagonals move up and to the
-** right at 45 degrees. Diagonal number increase from upper left
-** to lower right.
-**
-** Myers matrix M is a lower right triangular matrix with indices d
-** along the bottom and i vertically:
-**
-**
-** i=4 | +4 \
-** 3 | +3 +2 |
-** 2 | +2 +1 0 |- k values. k = 2*i-d
-** 1 | +1 0 -1 -2 |
-** 0 | 0 -1 -2 -3 -4 /
-** ---------------
-** 0 1 2 3 4 = d
-**
-** Each element of the Myers matrix corresponds to a diagonal.
-** The diagonal is k=2*i-d. The diagonal values are shown
-** in the template above.
-**
-** Each entry in M represents the end-point on a path from (0,0).
-** The end-point is on diagonal k. The value stored in M is
-** q=x+y where (x,y) is the terminus of the path. A path
-** to M[d][i] will come through either M[d-1][i-1] or
-** though M[d-1][i], whichever holds the largest value of q.
-** If both elements hold the same value, the path comes
-** through M[d-1][i-1].
-**
-** The value of d is the number of insertions and deletions
-** made so far on the path. M grows progressively. So the
-** size of the M matrix is proportional to d*d. For the
-** common case where A and B are similar, d will be small
-** compared to X and Y so little memory is required. The
-** original Wagner algorithm requires X*Y memory, which for
-** larger files (100K lines) is more memory than we have at
-** hand.
-*/
-int *text_diff(
- Blob *pA_Blob, /* FROM file */
- Blob *pB_Blob, /* TO file */
- Blob *pOut, /* Write unified diff here if not NULL */
- int nContext /* Amount of context to unified diff */
-){
- DLine *A, *B; /* Files being compared */
- int X, Y; /* Number of elements in A and B */
- int x, y; /* Indices: A[x] and B[y] */
- int szM = 0; /* Number of rows and columns in M */
- int **M = 0; /* Myers matrix */
- int i, d; /* Indices on M. M[d][i] */
- int k, q; /* Diagonal number and distinct from (0,0) */
- int K, D; /* The diagonal and d for the final solution */
- int *R = 0; /* Result vector */
- int r; /* Loop variables */
- int go = 1; /* Outer loop control */
- int MAX; /* Largest of X and Y */
-
- /* Break the two files being diffed into individual lines.
- ** Compute hashes on each line for fast comparison.
- */
- A = break_into_lines(blob_str(pA_Blob), &X);
- B = break_into_lines(blob_str(pB_Blob), &Y);
-
- if( A==0 || B==0 ){
- free(A);
- free(B);
- if( pOut ){
- blob_appendf(pOut, "cannot compute difference between binary files\n");
- }
- return 0;
- }
-
- szM = 0;
- MAX = X>Y ? X : Y;
- if( MAX>2000 ) MAX = 2000;
- for(d=0; go && d<=MAX; d++){
- if( szM<d+1 ){
- szM += szM + 10;
- M = realloc(M, sizeof(M[0])*szM);
- if( M==0 ){
- fossil_panic("out of memory");
- }
- }
- M[d] = malloc( sizeof(M[d][0])*(d+1) );
- if( M[d]==0 ){
- fossil_panic("out of memory");
- }
- for(i=0; i<=d; i++){
- k = 2*i - d;
- if( d==0 ){
- q = 0;
- }else if( i==0 ){
- q = M[d-1][0];
- }else if( i<d-1 && M[d-1][i-1] < M[d-1][i] ){
- q = M[d-1][i];
- }else{
- q = M[d-1][i-1];
- }
- x = (k + q + 1)/2;
- y = x - k;
- if( x<0 || x>X || y<0 || y>Y ){
- x = y = 0;
- }else{
- while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; }
- }
- M[d][i] = x + y;
- DEBUG( printf("M[%d][%i] = %d k=%d (%d,%d)\n", d, i, x+y, k, x, y); )
- if( x==X && y==Y ){
- go = 0;
- break;
- }
- }
- }
- if( d>MAX ){
- R = malloc( sizeof(R[0])*7 );
- R[0] = 0;
- R[1] = X;
- R[2] = Y;
- R[3] = 0;
- R[4] = 0;
- R[5] = 0;
- R[6] = 0;
- }else{
- /* Reuse M[] as follows:
- **
- ** M[d][1] = 1 if a line is inserted or 0 if a line is deleted.
- ** M[d][0] = number of lines copied after the ins or del above.
- **
- */
- D = d - 1;
- K = X - Y;
- for(d=D, i=(K+D)/2; d>0; d--){
- DEBUG( printf("d=%d i=%d\n", d, i); )
- if( i==d || (i>0 && M[d-1][i-1] > M[d-1][i]) ){
- M[d][0] = M[d][i] - M[d-1][i-1] - 1;
- M[d][1] = 0;
- i--;
- }else{
- M[d][0] = M[d][i] - M[d-1][i] - 1;
- M[d][1] = 1;
- }
- }
-
- DEBUG(
- printf("---------------\nM[0][0] = %5d\n", M[0][0]);
- for(d=1; d<=D; d++){
- printf("M[%d][0] = %5d M[%d][1] = %d\n",d,M[d][0],d,M[d][1]);
- }
- )
-
- /* Allocate the output vector
- */
- R = malloc( sizeof(R[0])*(D+2)*3 );
- if( R==0 ){
- fossil_fatal("out of memory");
- }
-
- /* Populate the output vector
- */
- d = r = 0;
- while( d<=D ){
- int n;
- R[r++] = M[d++][0]/2; /* COPY */
- if( d>D ){
- R[r++] = 0;
- R[r++] = 0;
- break;
- }
- if( M[d][1]==0 ){
- n = 1;
- while( M[d][0]==0 && d<D && M[d+1][1]==0 ){
- d++;
- n++;
- }
- R[r++] = n; /* DELETE */
- if( d==D || M[d][0]>0 ){
- R[r++] = 0; /* INSERT */
- continue;
- }
- d++;
- }else{
- R[r++] = 0; /* DELETE */
- }
- assert( M[d][1]==1 );
- n = 1;
- while( M[d][0]==0 && d<D && M[d+1][1]==1 ){
- d++;
- n++;
- }
- R[r++] = n; /* INSERT */
- }
- R[r++] = 0;
- R[r++] = 0;
- R[r++] = 0;
- }
-
- /* Free the Myers matrix */
- for(d=0; d<=D; d++){
- free(M[d]);
- }
- free(M);
-
- /* If pOut is defined, construct a unified diff into pOut and
- ** delete R
- */
- if( pOut ){
- int a = 0; /* Index of next line in A[] */
- int b = 0; /* Index of next line in B[] */
- int nr; /* Number of COPY/DELETE/INSERT triples to process */
- int mxr; /* Maximum value for r */
- int na, nb; /* Number of lines shown from A and B */
- int i, j; /* Loop counters */
- int m; /* Number of lines to output */
- int skip; /* Number of lines to skip */
-
- for(mxr=0; R[mxr+1] || R[mxr+2] || R[mxr+3]; mxr += 3){}
- for(r=0; r<mxr; r += 3*nr){
- /* Figure out how many triples to show in a single block */
- for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){}
- DEBUG( printf("r=%d nr=%d\n", r, nr); )
-
- /* For the current block comprising nr triples, figure out
- ** how many lines of A and B are to be displayed
- */
- if( R[r]>nContext ){
- na = nb = nContext;
- skip = R[r] - nContext;
- }else{
- na = nb = R[r];
- skip = 0;
- }
- for(i=0; i<nr; i++){
- na += R[r+i*3+1];
- nb += R[r+i*3+2];
- }
- if( R[r+nr*3]>nContext ){
- na += nContext;
- nb += nContext;
- }else{
- na += R[r+nr*3];
- nb += R[r+nr*3];
- }
- for(i=1; i<nr; i++){
- na += R[r+i*3];
- nb += R[r+i*3];
- }
- blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb);
-
- /* Show the initial common area */
- a += skip;
- b += skip;
- m = R[r] - skip;
- for(j=0; j<m; j++){
- appendDiffLine(pOut, " ", &A[a+j]);
- }
- a += m;
- b += m;
-
- /* Show the differences */
- for(i=0; i<nr; i++){
- m = R[r+i*3+1];
- for(j=0; j<m; j++){
- appendDiffLine(pOut, "-", &A[a+j]);
- }
- a += m;
- m = R[r+i*3+2];
- for(j=0; j<m; j++){
- appendDiffLine(pOut, "+", &B[b+j]);
- }
- b += m;
- if( i<nr-1 ){
- m = R[r+i*3+3];
- for(j=0; j<m; j++){
- appendDiffLine(pOut, " ", &B[b+j]);
- }
- b += m;
- a += m;
- }
- }
-
- /* Show the final common area */
- assert( nr==i );
- m = R[r+nr*3];
- if( m>nContext ) m = nContext;
- for(j=0; j<m; j++){
- appendDiffLine(pOut, " ", &B[b+j]);
- }
- }
- free(R);
- R = 0;
- }
-
- /* We no longer need the A[] and B[] vectors */
- free(A);
- free(B);
-
- /* Return the result */
- return R;
-}
-#endif /***************** End of the Wagner/Myers algorithm ************/
/*
** COMMAND: test-rawdiff
*/