@@ -43,13 +43,37 @@
** the file is considered binary.
*/
typedef struct DLine DLine;
struct DLine {
- const char *z; /* The text of the line */
- unsigned int h; /* Hash of the line */
+ const char *z; /* The text of the line */
+ unsigned int h; /* Hash of the line */
+ unsigned int iNext; /* Index+1 of next line with same the same hash */
+
+ /* an array of DLine elements services two purposes. The fields
+ ** above are one per line of input text. But each entry is also
+ ** a bucket in a hash table. */
+ unsigned int iHash; /* First entry+1 in the hash array */
};
-#define LENGTH_MASK 0x000fffff
+/*
+** Maximum length of a line in a text file. (8192)
+*/
+#define LENGTH_MASK_SZ 13
+#define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1)
+
+/*
+** A context for running a diff.
+*/
+typedef struct DContext DContext;
+struct DContext {
+ int *aEdit; /* Array of copy/delete/insert triples */
+ int nEdit; /* Number of integers (3x num of triples) in aEdit[] */
+ int nEditAlloc; /* Space allocated for aEdit[] */
+ DLine *aFrom; /* File on left side of the diff */
+ int nFrom; /* Number of lines in aFrom[] */
+ DLine *aTo; /* File on right side of the diff */
+ int nTo; /* Number of lines in aTo[] */
+};
/*
** Return an array of DLine objects containing a pointer to the
** start of each line and a hash of that line. The lower
@@ -57,13 +81,13 @@
**
** Trailing whitespace is removed from each line.
**
** Return 0 if the file is binary or contains a line that is
-** longer than 1048575 bytes.
+** too long.
*/
static DLine *break_into_lines(char *z, int *pnLine){
int nLine, i, j, k, x;
- unsigned int h;
+ unsigned int h, h2;
DLine *a;
/* Count the number of lines. Allocate space to hold
** the returned array.
@@ -74,19 +98,20 @@
return 0;
}
if( c=='\n' && z[i+1]!=0 ){
nLine++;
- if( j>1048575 ){
+ if( j>LENGTH_MASK ){
return 0;
}
j = 0;
}
}
- if( j>1048575 ){
+ if( j>LENGTH_MASK ){
return 0;
}
a = malloc( nLine*sizeof(a[0]) );
if( a==0 ) fossil_panic("out of memory");
+ memset(a, 0, nLine*sizeof(a[0]) );
/* Fill in the array */
for(i=0; i<nLine; i++){
a[i].z = z;
@@ -94,9 +119,12 @@
for(k=j; k>0 && isspace(z[k-1]); k--){}
for(h=0, x=0; x<k; x++){
h = h ^ (h<<2) ^ z[x];
}
- a[i].h = (h<<20) | k;;
+ a[i].h = h = (h<<LENGTH_MASK_SZ) | k;;
+ h2 = h % nLine;
+ a[i].iNext = a[h2].iHash;
+ a[h2].iHash = i+1;
z += j+1;
}
/* Return results */
@@ -119,8 +147,334 @@
blob_append(pOut, pLine->z, pLine->h & LENGTH_MASK);
blob_append(pOut, "\n", 1);
}
+/*
+** Expand the size of aEdit[] array to hold nEdit elements.
+*/
+static void expandEdit(DContext *p, int nEdit){
+ int *a;
+ a = realloc(p->aEdit, nEdit*sizeof(int));
+ if( a==0 ){
+ free( p->aEdit );
+ p->nEdit = 0;
+ nEdit = 0;
+ }
+ p->aEdit = a;
+ p->nEditAlloc = nEdit;
+}
+
+/*
+** Append a new COPY/DELETE/INSERT triple.
+*/
+static void appendTriple(DContext *p, int nCopy, int nDel, int nIns){
+ /* printf("APPEND %d/%d/%d\n", nCopy, nDel, nIns); */
+ if( p->nEdit>=3 ){
+ if( p->aEdit[p->nEdit-1]==0 ){
+ if( p->aEdit[p->nEdit-2]==0 ){
+ p->aEdit[p->nEdit-3] += nCopy;
+ p->aEdit[p->nEdit-2] += nDel;
+ p->aEdit[p->nEdit-1] += nIns;
+ return;
+ }
+ if( nCopy==0 ){
+ p->aEdit[p->nEdit-2] += nDel;
+ p->aEdit[p->nEdit-1] += nIns;
+ return;
+ }
+ }
+ if( nCopy==0 && nDel==0 ){
+ p->aEdit[p->nEdit-1] += nIns;
+ return;
+ }
+ }
+ if( p->nEdit+3>p->nEditAlloc ){
+ expandEdit(p, p->nEdit*2 + 15);
+ if( p->aEdit==0 ) return;
+ }
+ p->aEdit[p->nEdit++] = nCopy;
+ p->aEdit[p->nEdit++] = nDel;
+ p->aEdit[p->nEdit++] = nIns;
+}
+
+
+/*
+** Given a diff context in which the aEdit[] array has been filled
+** in, compute a context diff into pOut.
+*/
+static void contextDiff(DContext *p, Blob *pOut, int nContext){
+ DLine *A; /* Left side of the diff */
+ DLine *B; /* Right side of the diff */
+ int a = 0; /* Index of next line in A[] */
+ int b = 0; /* Index of next line in B[] */
+ int *R; /* Array of COPY/DELETE/INSERT triples */
+ int r; /* Index into R[] */
+ int nr; /* Number of COPY/DELETE/INSERT triples to process */
+ int mxr; /* Maximum value for r */
+ int na, nb; /* Number of lines shown from A and B */
+ int i, j; /* Loop counters */
+ int m; /* Number of lines to output */
+ int skip; /* Number of lines to skip */
+
+ A = p->aFrom;
+ B = p->aTo;
+ R = p->aEdit;
+ mxr = p->nEdit;
+ if( mxr>2 && R[mxr-1]==0 && R[mxr-2]==0 ){ mxr -= 3; }
+ for(r=0; r<mxr; r += 3*nr){
+ /* Figure out how many triples to show in a single block */
+ for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){}
+ DEBUG( printf("r=%d nr=%d\n", r, nr); )
+
+ /* For the current block comprising nr triples, figure out
+ ** how many lines of A and B are to be displayed
+ */
+ if( R[r]>nContext ){
+ na = nb = nContext;
+ skip = R[r] - nContext;
+ }else{
+ na = nb = R[r];
+ skip = 0;
+ }
+ for(i=0; i<nr; i++){
+ na += R[r+i*3+1];
+ nb += R[r+i*3+2];
+ }
+ if( R[r+nr*3]>nContext ){
+ na += nContext;
+ nb += nContext;
+ }else{
+ na += R[r+nr*3];
+ nb += R[r+nr*3];
+ }
+ for(i=1; i<nr; i++){
+ na += R[r+i*3];
+ nb += R[r+i*3];
+ }
+ blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb);
+
+ /* Show the initial common area */
+ a += skip;
+ b += skip;
+ m = R[r] - skip;
+ for(j=0; j<m; j++){
+ appendDiffLine(pOut, " ", &A[a+j]);
+ }
+ a += m;
+ b += m;
+
+ /* Show the differences */
+ for(i=0; i<nr; i++){
+ m = R[r+i*3+1];
+ for(j=0; j<m; j++){
+ appendDiffLine(pOut, "-", &A[a+j]);
+ }
+ a += m;
+ m = R[r+i*3+2];
+ for(j=0; j<m; j++){
+ appendDiffLine(pOut, "+", &B[b+j]);
+ }
+ b += m;
+ if( i<nr-1 ){
+ m = R[r+i*3+3];
+ for(j=0; j<m; j++){
+ appendDiffLine(pOut, " ", &B[b+j]);
+ }
+ b += m;
+ a += m;
+ }
+ }
+
+ /* Show the final common area */
+ assert( nr==i );
+ m = R[r+nr*3];
+ if( m>nContext ) m = nContext;
+ for(j=0; j<m; j++){
+ appendDiffLine(pOut, " ", &B[b+j]);
+ }
+ }
+}
+
+/*
+** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
+** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence
+** of lines in these two blocks that are exactly the same. Return
+** the bounds of the matching sequence.
+*/
+static void longestCommonSequence(
+ DContext *p,
+ int iS1, int iE1,
+ int iS2, int iE2,
+ int *piSX, int *piEX,
+ int *piSY, int *piEY
+){
+ int bestScore = -1000000000;
+ int i, j;
+ int iSX, iSY, iEX, iEY;
+ int score, skew, dist, mid;
+
+ *piSX = iS1;
+ *piEX = iS1;
+ *piSY = iS2;
+ *piEY = iS2;
+ mid = (iE1 + iS1)/2;
+ for(i=iS1; i<iE1; i++){
+ int limit = 0;
+ j = p->aTo[p->aFrom[i].h % p->nTo].iHash;
+ while( j>0
+ && (j-1<iS2 || j>=iE2 || !same_dline(&p->aFrom[i], &p->aTo[j-1]))
+ ){
+ if( limit++ > 10 ){
+ j = 0;
+ break;
+ }
+ j = p->aTo[j-1].iNext;
+ }
+ if( j==0 ) continue;
+ iSX = i;
+ iSY = j-1;
+ while( iSX>iS1 && iSY>iS2 && same_dline(&p->aFrom[iSX-1],&p->aTo[iSY-1]) ){
+ iSX--;
+ iSY--;
+ }
+ iEX = i+1;
+ iEY = j;
+ while( iEX<iE1 && iEY<iE2 && same_dline(&p->aFrom[iEX],&p->aTo[iEY]) ){
+ iEX++;
+ iEY++;
+ }
+ skew = (iSX-iS1) - (iSY-iS2);
+ if( skew<0 ) skew = -skew;
+ dist = (iSX+iEX)/2 - mid;
+ if( dist<0 ) dist = -dist;
+ score = (iEX - iSX) - 0.05*skew - 0.05*dist;
+ if( score>bestScore ){
+ bestScore = score;
+ *piSX = iSX;
+ *piSY = iSY;
+ *piEX = iEX;
+ *piEY = iEY;
+ }
+ }
+ /* printf("LCS(%d..%d/%d..%d) = %d..%d/%d..%d\n",
+ iS1, iE1, iS2, iE2, *piSX, *piEX, *piSY, *piEY); */
+}
+
+/*
+** Do a single step in the difference. Compute a sequence of
+** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
+** the input into lines iS2 through iE2-1 of the output and write
+** that sequence into the difference context.
+*/
+static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){
+ int iSX, iEX, iSY, iEY;
+
+ if( iE1<=iS1 ){
+ if( iE2>iS2 ){
+ appendTriple(p, 0, 0, iE2-iS2);
+ }
+ return;
+ }
+ if( iE2<=iS2 ){
+ appendTriple(p, 0, iE1-iS1, 0);
+ return;
+ }
+
+ /* Find the longest matching segment between the two sequences */
+ longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
+
+ if( iEX>iSX ){
+ /* Recursively diff either side of the matching segment */
+ diff_step(p, iS1, iSX, iS2, iSY);
+ if( iEX>iSX ){
+ appendTriple(p, iEX - iSX, 0, 0);
+ }
+ diff_step(p, iEX, iE1, iEY, iE2);
+ }else{
+ appendTriple(p, 0, iE1-iS1, iE2-iS2);
+ }
+}
+
+/*
+** Generate a report of the differences between files pA and pB.
+** If pOut is not NULL then a unified diff is appended there. It
+** is assumed that pOut has already been initialized. If pOut is
+** NULL, then a pointer to an array of integers is returned.
+** The integers come in triples. For each triple,
+** the elements are the number of lines copied, the number of
+** lines deleted, and the number of lines inserted. The vector
+** is terminated by a triple of all zeros.
+**
+** This diff utility does not work on binary files. If a binary
+** file is encountered, 0 is returned and pOut is written with
+** text "cannot compute difference between binary files".
+*/
+int *text_diff(
+ Blob *pA_Blob, /* FROM file */
+ Blob *pB_Blob, /* TO file */
+ Blob *pOut, /* Write unified diff here if not NULL */
+ int nContext /* Amount of context to unified diff */
+){
+ DContext c;
+ int mnE, iS, iE1, iE2;
+
+ memset(&c, 0, sizeof(c));
+ c.aFrom = break_into_lines(blob_str(pA_Blob), &c.nFrom);
+ c.aTo = break_into_lines(blob_str(pB_Blob), &c.nTo);
+ if( c.aFrom==0 || c.aTo==0 ){
+ free(c.aFrom);
+ free(c.aTo);
+ if( pOut ){
+ blob_appendf(pOut, "cannot compute difference between binary files\n");
+ }
+ return 0;
+ }
+
+ /* Carve off the common header and footer */
+ iE1 = c.nFrom;
+ iE2 = c.nTo;
+ while( iE1>0 && iE2>0 && same_dline(&c.aFrom[iE1-1], &c.aTo[iE2-1]) ){
+ iE1--;
+ iE2--;
+ }
+ mnE = iE1<iE2 ? iE1 : iE2;
+ for(iS=0; iS<mnE && same_dline(&c.aFrom[iS],&c.aTo[iS]); iS++){}
+
+ /* do the difference */
+ if( iS>0 ){
+ appendTriple(&c, iS, 0, 0);
+ }
+ diff_step(&c, iS, iE1, iS, iE2);
+ if( iE1<c.nFrom ){
+ appendTriple(&c, c.nFrom - iE1, 0, 0);
+ }
+
+ expandEdit(&c, c.nEdit+3);
+ if( c.aEdit ){
+ c.aEdit[c.nEdit++] = 0;
+ c.aEdit[c.nEdit++] = 0;
+ c.aEdit[c.nEdit++] = 0;
+ }
+
+ if( pOut ){
+ /* Compute a context diff if requested */
+ contextDiff(&c, pOut, nContext);
+ free(c.aFrom);
+ free(c.aTo);
+ free(c.aEdit);
+ return 0;
+ }else{
+ /* If a context diff is not requested, then return the
+ ** array of COPY/DELETE/INSERT triples after terminating the
+ ** array with a triple of all zeros.
+ */
+ free(c.aFrom);
+ free(c.aTo);
+ return c.aEdit;
+ }
+}
+
+#if 0 /********** Disabled and replaced by code above ************/
/*
** Generate a report of the differences between files pA and pB.
** If pOut is not NULL then a unified diff is appended there. It
@@ -473,8 +827,9 @@
/* Return the result */
return R;
}
+#endif /***************** End of the Wagner/Myers algorithm ************/
/*
** COMMAND: test-rawdiff
*/