Overview
SHA1 Hash: | eeea77f340918d8255ff87c84a9e48699709bf96 |
---|---|
Date: | 2008-02-04 14:05:03 |
User: | drh |
Comment: | Improvements to comments on the diff algorithm code. Completely remove the older Wagner/Myers algorithm which had been commented out. |
Timelines: | ancestors | descendants | both | trunk |
Other Links: | files | ZIP archive | manifest |
Tags And Properties
- branch=trunk inherited from [a28c83647d]
- sym-trunk inherited from [a28c83647d]
Changes
[hide diffs]Modified src/diff.c from [50ab9d9846] to [7a0eb09931].
@@ -299,15 +299,15 @@ ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence ** of lines in these two blocks that are exactly the same. Return ** the bounds of the matching sequence. */ static void longestCommonSequence( - DContext *p, - int iS1, int iE1, - int iS2, int iE2, - int *piSX, int *piEX, - int *piSY, int *piEY + DContext *p, /* Two files being compared */ + int iS1, int iE1, /* Range of lines in p->aFrom[] */ + int iS2, int iE2, /* Range of lines in p->aTo[] */ + int *piSX, int *piEX, /* Write p->aFrom[] common segment here */ + int *piSY, int *piEY /* Write p->aTo[] common segment here */ ){ double bestScore = -1e30; /* Best score seen so far */ int i, j; /* Loop counters */ int iSX, iSY, iEX, iEY; /* Current match */ double score; /* Current score */ @@ -379,43 +379,65 @@ /* ** Do a single step in the difference. Compute a sequence of ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of ** the input into lines iS2 through iE2-1 of the output and write ** that sequence into the difference context. +** +** The algorithm is to find a block of common text near the middle +** of the two segments being diffed. Then recursively compute +** differences on the blocks before and after that common segment. +** Special cases apply if either input segment is empty or if the +** two segments have no text in common. */ static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){ int iSX, iEX, iSY, iEY; if( iE1<=iS1 ){ + /* The first segment is empty */ if( iE2>iS2 ){ appendTriple(p, 0, 0, iE2-iS2); } return; } if( iE2<=iS2 ){ + /* The second segment is empty */ appendTriple(p, 0, iE1-iS1, 0); return; } /* Find the longest matching segment between the two sequences */ longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY); if( iEX>iSX ){ - /* Recursively diff either side of the matching segment */ + /* A common segement has been found. + ** Recursively diff either side of the matching segment */ diff_step(p, iS1, iSX, iS2, iSY); if( iEX>iSX ){ appendTriple(p, iEX - iSX, 0, 0); } diff_step(p, iEX, iE1, iEY, iE2); }else{ + /* The two segments have nothing in common. Delete the first then + ** insert the second. */ appendTriple(p, 0, iE1-iS1, iE2-iS2); } } /* ** Compute the differences between two files already loaded into ** the DContext structure. +** +** A divide and conquer technique is used. We look for a large +** block of common text that is in the middle of both files. Then +** compute the difference on those parts of the file before and +** after the common block. This technique is fast, but it does +** not necessarily generate the minimum difference set. On the +** other hand, we do not need a minimum difference set, only one +** that makes sense to human readers, which this algorithm does. +** +** Any common text at the beginning and end of the two files is +** removed before starting the divide-and-conquer algorithm. */ static void diff_all(DContext *p){ int mnE, iS, iE1, iE2; /* Carve off the common header and footer */ @@ -498,367 +520,10 @@ free(c.aFrom); free(c.aTo); return c.aEdit; } } - -#if 0 /********** Disabled and replaced by code above ************/ - -/* -** Generate a report of the differences between files pA and pB. -** If pOut is not NULL then a unified diff is appended there. It -** is assumed that pOut has already been initialized. If pOut is -** NULL, then a pointer to an array of integers is returned. -** The integers come in triples. For each triple, -** the elements are the number of lines copied, the number of -** lines deleted, and the number of lines inserted. The vector -** is terminated by a triple of all zeros. -** -** This diff utility does not work on binary files. If a binary -** file is encountered, 0 is returned and pOut is written with -** text "cannot compute difference between binary files". -** -**************************************************************************** -** -** The core algorithm is a variation on the classic Wagner -** minimum edit distance with enhancements to reduce the runtime -** to be almost linear in the common case where the two files -** have a lot in common. For additional information see -** Eugene W. Myers, "An O(ND) Difference Algorithm And Its -** Variations" -** -** Consider comparing strings A and B. A=abcabba and B=cbabac -** We construct a "Wagner" matrix W with A along the X axis and -** B along the Y axis: -** -** c 6 * -** a 5 * -** b 4 * * -** a 3 * -** b 2 * -** B c 1 * -** 0 * * * -** 0 1 2 3 4 5 6 7 -** a b c a b b a -** A -** -** (Note: we draw this Wagner matrix with the origin at the lower -** left whereas Myers uses the origin at the upper left. Otherwise, -** they are the same.) -** -** Let Y be the maximum y value or the number of characters in B. -** 6 in this example. X is the maximum x value or the number of -** elements in A. Here 7. -** -** Our goal is to find a path from (0,0) to (X,Y). The path can -** use horizontal, vertical, or diagonal steps. A diagonal from -** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y]. A vertical -** steps corresponds to an insertion. A horizontal step corresponds -** to a deletion. We want to find the path with the fewest -** horizontal and vertical steps. -** -** Diagonal k consists of all points such that x-y==k. Diagonal -** zero begins at the origin. Diagonal 1 begins at (1,0). -** Diagonal -1 begins at (0,1). All diagonals move up and to the -** right at 45 degrees. Diagonal number increase from upper left -** to lower right. -** -** Myers matrix M is a lower right triangular matrix with indices d -** along the bottom and i vertically: -** -** -** i=4 | +4 \ -** 3 | +3 +2 | -** 2 | +2 +1 0 |- k values. k = 2*i-d -** 1 | +1 0 -1 -2 | -** 0 | 0 -1 -2 -3 -4 / -** --------------- -** 0 1 2 3 4 = d -** -** Each element of the Myers matrix corresponds to a diagonal. -** The diagonal is k=2*i-d. The diagonal values are shown -** in the template above. -** -** Each entry in M represents the end-point on a path from (0,0). -** The end-point is on diagonal k. The value stored in M is -** q=x+y where (x,y) is the terminus of the path. A path -** to M[d][i] will come through either M[d-1][i-1] or -** though M[d-1][i], whichever holds the largest value of q. -** If both elements hold the same value, the path comes -** through M[d-1][i-1]. -** -** The value of d is the number of insertions and deletions -** made so far on the path. M grows progressively. So the -** size of the M matrix is proportional to d*d. For the -** common case where A and B are similar, d will be small -** compared to X and Y so little memory is required. The -** original Wagner algorithm requires X*Y memory, which for -** larger files (100K lines) is more memory than we have at -** hand. -*/ -int *text_diff( - Blob *pA_Blob, /* FROM file */ - Blob *pB_Blob, /* TO file */ - Blob *pOut, /* Write unified diff here if not NULL */ - int nContext /* Amount of context to unified diff */ -){ - DLine *A, *B; /* Files being compared */ - int X, Y; /* Number of elements in A and B */ - int x, y; /* Indices: A[x] and B[y] */ - int szM = 0; /* Number of rows and columns in M */ - int **M = 0; /* Myers matrix */ - int i, d; /* Indices on M. M[d][i] */ - int k, q; /* Diagonal number and distinct from (0,0) */ - int K, D; /* The diagonal and d for the final solution */ - int *R = 0; /* Result vector */ - int r; /* Loop variables */ - int go = 1; /* Outer loop control */ - int MAX; /* Largest of X and Y */ - - /* Break the two files being diffed into individual lines. - ** Compute hashes on each line for fast comparison. - */ - A = break_into_lines(blob_str(pA_Blob), &X); - B = break_into_lines(blob_str(pB_Blob), &Y); - - if( A==0 || B==0 ){ - free(A); - free(B); - if( pOut ){ - blob_appendf(pOut, "cannot compute difference between binary files\n"); - } - return 0; - } - - szM = 0; - MAX = X>Y ? X : Y; - if( MAX>2000 ) MAX = 2000; - for(d=0; go && d<=MAX; d++){ - if( szM<d+1 ){ - szM += szM + 10; - M = realloc(M, sizeof(M[0])*szM); - if( M==0 ){ - fossil_panic("out of memory"); - } - } - M[d] = malloc( sizeof(M[d][0])*(d+1) ); - if( M[d]==0 ){ - fossil_panic("out of memory"); - } - for(i=0; i<=d; i++){ - k = 2*i - d; - if( d==0 ){ - q = 0; - }else if( i==0 ){ - q = M[d-1][0]; - }else if( i<d-1 && M[d-1][i-1] < M[d-1][i] ){ - q = M[d-1][i]; - }else{ - q = M[d-1][i-1]; - } - x = (k + q + 1)/2; - y = x - k; - if( x<0 || x>X || y<0 || y>Y ){ - x = y = 0; - }else{ - while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; } - } - M[d][i] = x + y; - DEBUG( printf("M[%d][%i] = %d k=%d (%d,%d)\n", d, i, x+y, k, x, y); ) - if( x==X && y==Y ){ - go = 0; - break; - } - } - } - if( d>MAX ){ - R = malloc( sizeof(R[0])*7 ); - R[0] = 0; - R[1] = X; - R[2] = Y; - R[3] = 0; - R[4] = 0; - R[5] = 0; - R[6] = 0; - }else{ - /* Reuse M[] as follows: - ** - ** M[d][1] = 1 if a line is inserted or 0 if a line is deleted. - ** M[d][0] = number of lines copied after the ins or del above. - ** - */ - D = d - 1; - K = X - Y; - for(d=D, i=(K+D)/2; d>0; d--){ - DEBUG( printf("d=%d i=%d\n", d, i); ) - if( i==d || (i>0 && M[d-1][i-1] > M[d-1][i]) ){ - M[d][0] = M[d][i] - M[d-1][i-1] - 1; - M[d][1] = 0; - i--; - }else{ - M[d][0] = M[d][i] - M[d-1][i] - 1; - M[d][1] = 1; - } - } - - DEBUG( - printf("---------------\nM[0][0] = %5d\n", M[0][0]); - for(d=1; d<=D; d++){ - printf("M[%d][0] = %5d M[%d][1] = %d\n",d,M[d][0],d,M[d][1]); - } - ) - - /* Allocate the output vector - */ - R = malloc( sizeof(R[0])*(D+2)*3 ); - if( R==0 ){ - fossil_fatal("out of memory"); - } - - /* Populate the output vector - */ - d = r = 0; - while( d<=D ){ - int n; - R[r++] = M[d++][0]/2; /* COPY */ - if( d>D ){ - R[r++] = 0; - R[r++] = 0; - break; - } - if( M[d][1]==0 ){ - n = 1; - while( M[d][0]==0 && d<D && M[d+1][1]==0 ){ - d++; - n++; - } - R[r++] = n; /* DELETE */ - if( d==D || M[d][0]>0 ){ - R[r++] = 0; /* INSERT */ - continue; - } - d++; - }else{ - R[r++] = 0; /* DELETE */ - } - assert( M[d][1]==1 ); - n = 1; - while( M[d][0]==0 && d<D && M[d+1][1]==1 ){ - d++; - n++; - } - R[r++] = n; /* INSERT */ - } - R[r++] = 0; - R[r++] = 0; - R[r++] = 0; - } - - /* Free the Myers matrix */ - for(d=0; d<=D; d++){ - free(M[d]); - } - free(M); - - /* If pOut is defined, construct a unified diff into pOut and - ** delete R - */ - if( pOut ){ - int a = 0; /* Index of next line in A[] */ - int b = 0; /* Index of next line in B[] */ - int nr; /* Number of COPY/DELETE/INSERT triples to process */ - int mxr; /* Maximum value for r */ - int na, nb; /* Number of lines shown from A and B */ - int i, j; /* Loop counters */ - int m; /* Number of lines to output */ - int skip; /* Number of lines to skip */ - - for(mxr=0; R[mxr+1] || R[mxr+2] || R[mxr+3]; mxr += 3){} - for(r=0; r<mxr; r += 3*nr){ - /* Figure out how many triples to show in a single block */ - for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){} - DEBUG( printf("r=%d nr=%d\n", r, nr); ) - - /* For the current block comprising nr triples, figure out - ** how many lines of A and B are to be displayed - */ - if( R[r]>nContext ){ - na = nb = nContext; - skip = R[r] - nContext; - }else{ - na = nb = R[r]; - skip = 0; - } - for(i=0; i<nr; i++){ - na += R[r+i*3+1]; - nb += R[r+i*3+2]; - } - if( R[r+nr*3]>nContext ){ - na += nContext; - nb += nContext; - }else{ - na += R[r+nr*3]; - nb += R[r+nr*3]; - } - for(i=1; i<nr; i++){ - na += R[r+i*3]; - nb += R[r+i*3]; - } - blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb); - - /* Show the initial common area */ - a += skip; - b += skip; - m = R[r] - skip; - for(j=0; j<m; j++){ - appendDiffLine(pOut, " ", &A[a+j]); - } - a += m; - b += m; - - /* Show the differences */ - for(i=0; i<nr; i++){ - m = R[r+i*3+1]; - for(j=0; j<m; j++){ - appendDiffLine(pOut, "-", &A[a+j]); - } - a += m; - m = R[r+i*3+2]; - for(j=0; j<m; j++){ - appendDiffLine(pOut, "+", &B[b+j]); - } - b += m; - if( i<nr-1 ){ - m = R[r+i*3+3]; - for(j=0; j<m; j++){ - appendDiffLine(pOut, " ", &B[b+j]); - } - b += m; - a += m; - } - } - - /* Show the final common area */ - assert( nr==i ); - m = R[r+nr*3]; - if( m>nContext ) m = nContext; - for(j=0; j<m; j++){ - appendDiffLine(pOut, " ", &B[b+j]); - } - } - free(R); - R = 0; - } - - /* We no longer need the A[] and B[] vectors */ - free(A); - free(B); - - /* Return the result */ - return R; -} -#endif /***************** End of the Wagner/Myers algorithm ************/ /* ** COMMAND: test-rawdiff */ void test_rawdiff_cmd(void){