Overview
SHA1 Hash: | 95c07a503396eeda446351c51c48d7189db141d6 |
---|---|
Date: | 2008-02-02 23:39:22 |
User: | drh |
Comment: | A completely new diff algorithm. It is not guaranteed to find the minimum difference between files, but it seems to do a good job and runs much faster on larger files. But command-line diff is still faster for really large files. More work needed. |
Timelines: | ancestors | descendants | both | trunk |
Other Links: | files | ZIP archive | manifest |
Tags And Properties
- branch=trunk inherited from [a28c83647d]
- sym-trunk inherited from [a28c83647d]
Changes
[hide diffs]Modified src/diff.c from [ec5d799613] to [62a5d65904].
@@ -42,29 +42,53 @@ ** line. If any line is longer than 1048575 characters, ** the file is considered binary. */ typedef struct DLine DLine; struct DLine { - const char *z; /* The text of the line */ - unsigned int h; /* Hash of the line */ + const char *z; /* The text of the line */ + unsigned int h; /* Hash of the line */ + unsigned int iNext; /* Index+1 of next line with same the same hash */ + + /* an array of DLine elements services two purposes. The fields + ** above are one per line of input text. But each entry is also + ** a bucket in a hash table. */ + unsigned int iHash; /* First entry+1 in the hash array */ }; -#define LENGTH_MASK 0x000fffff +/* +** Maximum length of a line in a text file. (8192) +*/ +#define LENGTH_MASK_SZ 13 +#define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1) + +/* +** A context for running a diff. +*/ +typedef struct DContext DContext; +struct DContext { + int *aEdit; /* Array of copy/delete/insert triples */ + int nEdit; /* Number of integers (3x num of triples) in aEdit[] */ + int nEditAlloc; /* Space allocated for aEdit[] */ + DLine *aFrom; /* File on left side of the diff */ + int nFrom; /* Number of lines in aFrom[] */ + DLine *aTo; /* File on right side of the diff */ + int nTo; /* Number of lines in aTo[] */ +}; /* ** Return an array of DLine objects containing a pointer to the ** start of each line and a hash of that line. The lower ** bits of the hash store the length of each line. ** ** Trailing whitespace is removed from each line. ** ** Return 0 if the file is binary or contains a line that is -** longer than 1048575 bytes. +** too long. */ static DLine *break_into_lines(char *z, int *pnLine){ int nLine, i, j, k, x; - unsigned int h; + unsigned int h, h2; DLine *a; /* Count the number of lines. Allocate space to hold ** the returned array. */ @@ -73,31 +97,35 @@ if( c==0 ){ return 0; } if( c=='\n' && z[i+1]!=0 ){ nLine++; - if( j>1048575 ){ + if( j>LENGTH_MASK ){ return 0; } j = 0; } } - if( j>1048575 ){ + if( j>LENGTH_MASK ){ return 0; } a = malloc( nLine*sizeof(a[0]) ); if( a==0 ) fossil_panic("out of memory"); + memset(a, 0, nLine*sizeof(a[0]) ); /* Fill in the array */ for(i=0; i<nLine; i++){ a[i].z = z; for(j=0; z[j] && z[j]!='\n'; j++){} for(k=j; k>0 && isspace(z[k-1]); k--){} for(h=0, x=0; x<k; x++){ h = h ^ (h<<2) ^ z[x]; } - a[i].h = (h<<20) | k;; + a[i].h = h = (h<<LENGTH_MASK_SZ) | k;; + h2 = h % nLine; + a[i].iNext = a[h2].iHash; + a[h2].iHash = i+1; z += j+1; } /* Return results */ *pnLine = nLine; @@ -118,10 +146,336 @@ blob_append(pOut, zPrefix, 1); blob_append(pOut, pLine->z, pLine->h & LENGTH_MASK); blob_append(pOut, "\n", 1); } +/* +** Expand the size of aEdit[] array to hold nEdit elements. +*/ +static void expandEdit(DContext *p, int nEdit){ + int *a; + a = realloc(p->aEdit, nEdit*sizeof(int)); + if( a==0 ){ + free( p->aEdit ); + p->nEdit = 0; + nEdit = 0; + } + p->aEdit = a; + p->nEditAlloc = nEdit; +} + +/* +** Append a new COPY/DELETE/INSERT triple. +*/ +static void appendTriple(DContext *p, int nCopy, int nDel, int nIns){ + /* printf("APPEND %d/%d/%d\n", nCopy, nDel, nIns); */ + if( p->nEdit>=3 ){ + if( p->aEdit[p->nEdit-1]==0 ){ + if( p->aEdit[p->nEdit-2]==0 ){ + p->aEdit[p->nEdit-3] += nCopy; + p->aEdit[p->nEdit-2] += nDel; + p->aEdit[p->nEdit-1] += nIns; + return; + } + if( nCopy==0 ){ + p->aEdit[p->nEdit-2] += nDel; + p->aEdit[p->nEdit-1] += nIns; + return; + } + } + if( nCopy==0 && nDel==0 ){ + p->aEdit[p->nEdit-1] += nIns; + return; + } + } + if( p->nEdit+3>p->nEditAlloc ){ + expandEdit(p, p->nEdit*2 + 15); + if( p->aEdit==0 ) return; + } + p->aEdit[p->nEdit++] = nCopy; + p->aEdit[p->nEdit++] = nDel; + p->aEdit[p->nEdit++] = nIns; +} + + +/* +** Given a diff context in which the aEdit[] array has been filled +** in, compute a context diff into pOut. +*/ +static void contextDiff(DContext *p, Blob *pOut, int nContext){ + DLine *A; /* Left side of the diff */ + DLine *B; /* Right side of the diff */ + int a = 0; /* Index of next line in A[] */ + int b = 0; /* Index of next line in B[] */ + int *R; /* Array of COPY/DELETE/INSERT triples */ + int r; /* Index into R[] */ + int nr; /* Number of COPY/DELETE/INSERT triples to process */ + int mxr; /* Maximum value for r */ + int na, nb; /* Number of lines shown from A and B */ + int i, j; /* Loop counters */ + int m; /* Number of lines to output */ + int skip; /* Number of lines to skip */ + + A = p->aFrom; + B = p->aTo; + R = p->aEdit; + mxr = p->nEdit; + if( mxr>2 && R[mxr-1]==0 && R[mxr-2]==0 ){ mxr -= 3; } + for(r=0; r<mxr; r += 3*nr){ + /* Figure out how many triples to show in a single block */ + for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){} + DEBUG( printf("r=%d nr=%d\n", r, nr); ) + + /* For the current block comprising nr triples, figure out + ** how many lines of A and B are to be displayed + */ + if( R[r]>nContext ){ + na = nb = nContext; + skip = R[r] - nContext; + }else{ + na = nb = R[r]; + skip = 0; + } + for(i=0; i<nr; i++){ + na += R[r+i*3+1]; + nb += R[r+i*3+2]; + } + if( R[r+nr*3]>nContext ){ + na += nContext; + nb += nContext; + }else{ + na += R[r+nr*3]; + nb += R[r+nr*3]; + } + for(i=1; i<nr; i++){ + na += R[r+i*3]; + nb += R[r+i*3]; + } + blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb); + + /* Show the initial common area */ + a += skip; + b += skip; + m = R[r] - skip; + for(j=0; j<m; j++){ + appendDiffLine(pOut, " ", &A[a+j]); + } + a += m; + b += m; + + /* Show the differences */ + for(i=0; i<nr; i++){ + m = R[r+i*3+1]; + for(j=0; j<m; j++){ + appendDiffLine(pOut, "-", &A[a+j]); + } + a += m; + m = R[r+i*3+2]; + for(j=0; j<m; j++){ + appendDiffLine(pOut, "+", &B[b+j]); + } + b += m; + if( i<nr-1 ){ + m = R[r+i*3+3]; + for(j=0; j<m; j++){ + appendDiffLine(pOut, " ", &B[b+j]); + } + b += m; + a += m; + } + } + + /* Show the final common area */ + assert( nr==i ); + m = R[r+nr*3]; + if( m>nContext ) m = nContext; + for(j=0; j<m; j++){ + appendDiffLine(pOut, " ", &B[b+j]); + } + } +} + +/* +** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[] +** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence +** of lines in these two blocks that are exactly the same. Return +** the bounds of the matching sequence. +*/ +static void longestCommonSequence( + DContext *p, + int iS1, int iE1, + int iS2, int iE2, + int *piSX, int *piEX, + int *piSY, int *piEY +){ + int bestScore = -1000000000; + int i, j; + int iSX, iSY, iEX, iEY; + int score, skew, dist, mid; + + *piSX = iS1; + *piEX = iS1; + *piSY = iS2; + *piEY = iS2; + mid = (iE1 + iS1)/2; + for(i=iS1; i<iE1; i++){ + int limit = 0; + j = p->aTo[p->aFrom[i].h % p->nTo].iHash; + while( j>0 + && (j-1<iS2 || j>=iE2 || !same_dline(&p->aFrom[i], &p->aTo[j-1])) + ){ + if( limit++ > 10 ){ + j = 0; + break; + } + j = p->aTo[j-1].iNext; + } + if( j==0 ) continue; + iSX = i; + iSY = j-1; + while( iSX>iS1 && iSY>iS2 && same_dline(&p->aFrom[iSX-1],&p->aTo[iSY-1]) ){ + iSX--; + iSY--; + } + iEX = i+1; + iEY = j; + while( iEX<iE1 && iEY<iE2 && same_dline(&p->aFrom[iEX],&p->aTo[iEY]) ){ + iEX++; + iEY++; + } + skew = (iSX-iS1) - (iSY-iS2); + if( skew<0 ) skew = -skew; + dist = (iSX+iEX)/2 - mid; + if( dist<0 ) dist = -dist; + score = (iEX - iSX) - 0.05*skew - 0.05*dist; + if( score>bestScore ){ + bestScore = score; + *piSX = iSX; + *piSY = iSY; + *piEX = iEX; + *piEY = iEY; + } + } + /* printf("LCS(%d..%d/%d..%d) = %d..%d/%d..%d\n", + iS1, iE1, iS2, iE2, *piSX, *piEX, *piSY, *piEY); */ +} + +/* +** Do a single step in the difference. Compute a sequence of +** copy/delete/insert steps that will convert lines iS1 through iE1-1 of +** the input into lines iS2 through iE2-1 of the output and write +** that sequence into the difference context. +*/ +static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){ + int iSX, iEX, iSY, iEY; + + if( iE1<=iS1 ){ + if( iE2>iS2 ){ + appendTriple(p, 0, 0, iE2-iS2); + } + return; + } + if( iE2<=iS2 ){ + appendTriple(p, 0, iE1-iS1, 0); + return; + } + + /* Find the longest matching segment between the two sequences */ + longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY); + + if( iEX>iSX ){ + /* Recursively diff either side of the matching segment */ + diff_step(p, iS1, iSX, iS2, iSY); + if( iEX>iSX ){ + appendTriple(p, iEX - iSX, 0, 0); + } + diff_step(p, iEX, iE1, iEY, iE2); + }else{ + appendTriple(p, 0, iE1-iS1, iE2-iS2); + } +} + +/* +** Generate a report of the differences between files pA and pB. +** If pOut is not NULL then a unified diff is appended there. It +** is assumed that pOut has already been initialized. If pOut is +** NULL, then a pointer to an array of integers is returned. +** The integers come in triples. For each triple, +** the elements are the number of lines copied, the number of +** lines deleted, and the number of lines inserted. The vector +** is terminated by a triple of all zeros. +** +** This diff utility does not work on binary files. If a binary +** file is encountered, 0 is returned and pOut is written with +** text "cannot compute difference between binary files". +*/ +int *text_diff( + Blob *pA_Blob, /* FROM file */ + Blob *pB_Blob, /* TO file */ + Blob *pOut, /* Write unified diff here if not NULL */ + int nContext /* Amount of context to unified diff */ +){ + DContext c; + int mnE, iS, iE1, iE2; + + memset(&c, 0, sizeof(c)); + c.aFrom = break_into_lines(blob_str(pA_Blob), &c.nFrom); + c.aTo = break_into_lines(blob_str(pB_Blob), &c.nTo); + if( c.aFrom==0 || c.aTo==0 ){ + free(c.aFrom); + free(c.aTo); + if( pOut ){ + blob_appendf(pOut, "cannot compute difference between binary files\n"); + } + return 0; + } + + /* Carve off the common header and footer */ + iE1 = c.nFrom; + iE2 = c.nTo; + while( iE1>0 && iE2>0 && same_dline(&c.aFrom[iE1-1], &c.aTo[iE2-1]) ){ + iE1--; + iE2--; + } + mnE = iE1<iE2 ? iE1 : iE2; + for(iS=0; iS<mnE && same_dline(&c.aFrom[iS],&c.aTo[iS]); iS++){} + + /* do the difference */ + if( iS>0 ){ + appendTriple(&c, iS, 0, 0); + } + diff_step(&c, iS, iE1, iS, iE2); + if( iE1<c.nFrom ){ + appendTriple(&c, c.nFrom - iE1, 0, 0); + } + + expandEdit(&c, c.nEdit+3); + if( c.aEdit ){ + c.aEdit[c.nEdit++] = 0; + c.aEdit[c.nEdit++] = 0; + c.aEdit[c.nEdit++] = 0; + } + + if( pOut ){ + /* Compute a context diff if requested */ + contextDiff(&c, pOut, nContext); + free(c.aFrom); + free(c.aTo); + free(c.aEdit); + return 0; + }else{ + /* If a context diff is not requested, then return the + ** array of COPY/DELETE/INSERT triples after terminating the + ** array with a triple of all zeros. + */ + free(c.aFrom); + free(c.aTo); + return c.aEdit; + } +} + +#if 0 /********** Disabled and replaced by code above ************/ /* ** Generate a report of the differences between files pA and pB. ** If pOut is not NULL then a unified diff is appended there. It ** is assumed that pOut has already been initialized. If pOut is @@ -472,10 +826,11 @@ free(B); /* Return the result */ return R; } +#endif /***************** End of the Wagner/Myers algorithm ************/ /* ** COMMAND: test-rawdiff */ void test_rawdiff_cmd(void){