Diff
Not logged in

Differences From:

File src/diff.c part of check-in [7e9e5fea77] - Fix a bug in the unified diff generator. by drh on 2007-11-27 03:30:50. Also file src/diff.c part of check-in [d0305b305a] - Merged mainline into my branch to get the newest application. by aku on 2007-12-05 08:07:46. [view]

To:

File src/diff.c part of check-in [95c07a5033] - A completely new diff algorithm. It is not guaranteed to find the minimum difference between files, but it seems to do a good job and runs much faster on larger files. But command-line diff is still faster for really large files. More work needed. by drh on 2008-02-02 23:39:22. Also file src/diff.c part of check-in [0523983440b] - Merged importer to mainline. by aku on 2008-02-03 01:36:14. [view]

@@ -43,13 +43,37 @@
 ** the file is considered binary.
 */
 typedef struct DLine DLine;
 struct DLine {
-  const char *z;    /* The text of the line */
-  unsigned int h;   /* Hash of the line */
+  const char *z;        /* The text of the line */
+  unsigned int h;       /* Hash of the line */
+  unsigned int iNext;   /* Index+1 of next line with same the same hash */
+
+  /* an array of DLine elements services two purposes.  The fields
+  ** above are one per line of input text.  But each entry is also
+  ** a bucket in a hash table. */
+  unsigned int iHash;   /* First entry+1 in the hash array */
 };
 
-#define LENGTH_MASK  0x000fffff
+/*
+** Maximum length of a line in a text file.  (8192)
+*/
+#define LENGTH_MASK_SZ  13
+#define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)
+
+/*
+** A context for running a diff.
+*/
+typedef struct DContext DContext;
+struct DContext {
+  int *aEdit;        /* Array of copy/delete/insert triples */
+  int nEdit;         /* Number of integers (3x num of triples) in aEdit[] */
+  int nEditAlloc;    /* Space allocated for aEdit[] */
+  DLine *aFrom;      /* File on left side of the diff */
+  int nFrom;         /* Number of lines in aFrom[] */
+  DLine *aTo;        /* File on right side of the diff */
+  int nTo;           /* Number of lines in aTo[] */
+};
 
 /*
 ** Return an array of DLine objects containing a pointer to the
 ** start of each line and a hash of that line.  The lower
@@ -57,13 +81,13 @@
 **
 ** Trailing whitespace is removed from each line.
 **
 ** Return 0 if the file is binary or contains a line that is
-** longer than 1048575 bytes.
+** too long.
 */
 static DLine *break_into_lines(char *z, int *pnLine){
   int nLine, i, j, k, x;
-  unsigned int h;
+  unsigned int h, h2;
   DLine *a;
 
   /* Count the number of lines.  Allocate space to hold
   ** the returned array.
@@ -74,19 +98,20 @@
       return 0;
     }
     if( c=='\n' && z[i+1]!=0 ){
       nLine++;
-      if( j>1048575 ){
+      if( j>LENGTH_MASK ){
         return 0;
       }
       j = 0;
     }
   }
-  if( j>1048575 ){
+  if( j>LENGTH_MASK ){
     return 0;
   }
   a = malloc( nLine*sizeof(a[0]) );
   if( a==0 ) fossil_panic("out of memory");
+  memset(a, 0, nLine*sizeof(a[0]) );
 
   /* Fill in the array */
   for(i=0; i<nLine; i++){
     a[i].z = z;
@@ -94,9 +119,12 @@
     for(k=j; k>0 && isspace(z[k-1]); k--){}
     for(h=0, x=0; x<k; x++){
       h = h ^ (h<<2) ^ z[x];
     }
-    a[i].h = (h<<20) | k;;
+    a[i].h = h = (h<<LENGTH_MASK_SZ) | k;;
+    h2 = h % nLine;
+    a[i].iNext = a[h2].iHash;
+    a[h2].iHash = i+1;
     z += j+1;
   }
 
   /* Return results */
@@ -119,8 +147,334 @@
   blob_append(pOut, pLine->z, pLine->h & LENGTH_MASK);
   blob_append(pOut, "\n", 1);
 }
 
+/*
+** Expand the size of aEdit[] array to hold nEdit elements.
+*/
+static void expandEdit(DContext *p, int nEdit){
+  int *a;
+  a = realloc(p->aEdit, nEdit*sizeof(int));
+  if( a==0 ){
+    free( p->aEdit );
+    p->nEdit = 0;
+    nEdit = 0;
+  }
+  p->aEdit = a;
+  p->nEditAlloc = nEdit;
+}
+
+/*
+** Append a new COPY/DELETE/INSERT triple.
+*/
+static void appendTriple(DContext *p, int nCopy, int nDel, int nIns){
+  /* printf("APPEND %d/%d/%d\n", nCopy, nDel, nIns); */
+  if( p->nEdit>=3 ){
+    if( p->aEdit[p->nEdit-1]==0 ){
+      if( p->aEdit[p->nEdit-2]==0 ){
+        p->aEdit[p->nEdit-3] += nCopy;
+        p->aEdit[p->nEdit-2] += nDel;
+        p->aEdit[p->nEdit-1] += nIns;
+        return;
+      }
+      if( nCopy==0 ){
+        p->aEdit[p->nEdit-2] += nDel;
+        p->aEdit[p->nEdit-1] += nIns;
+        return;
+      }
+    }
+    if( nCopy==0 && nDel==0 ){
+      p->aEdit[p->nEdit-1] += nIns;
+      return;
+    }
+  }
+  if( p->nEdit+3>p->nEditAlloc ){
+    expandEdit(p, p->nEdit*2 + 15);
+    if( p->aEdit==0 ) return;
+  }
+  p->aEdit[p->nEdit++] = nCopy;
+  p->aEdit[p->nEdit++] = nDel;
+  p->aEdit[p->nEdit++] = nIns;
+}
+
+
+/*
+** Given a diff context in which the aEdit[] array has been filled
+** in, compute a context diff into pOut.
+*/
+static void contextDiff(DContext *p, Blob *pOut, int nContext){
+  DLine *A;     /* Left side of the diff */
+  DLine *B;     /* Right side of the diff */
+  int a = 0;    /* Index of next line in A[] */
+  int b = 0;    /* Index of next line in B[] */
+  int *R;       /* Array of COPY/DELETE/INSERT triples */
+  int r;        /* Index into R[] */
+  int nr;       /* Number of COPY/DELETE/INSERT triples to process */
+  int mxr;      /* Maximum value for r */
+  int na, nb;   /* Number of lines shown from A and B */
+  int i, j;     /* Loop counters */
+  int m;        /* Number of lines to output */
+  int skip;     /* Number of lines to skip */
+
+  A = p->aFrom;
+  B = p->aTo;
+  R = p->aEdit;
+  mxr = p->nEdit;
+  if( mxr>2 && R[mxr-1]==0 && R[mxr-2]==0 ){ mxr -= 3; }
+  for(r=0; r<mxr; r += 3*nr){
+    /* Figure out how many triples to show in a single block */
+    for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){}
+    DEBUG( printf("r=%d nr=%d\n", r, nr); )
+
+    /* For the current block comprising nr triples, figure out
+    ** how many lines of A and B are to be displayed
+    */
+    if( R[r]>nContext ){
+      na = nb = nContext;
+      skip = R[r] - nContext;
+    }else{
+      na = nb = R[r];
+      skip = 0;
+    }
+    for(i=0; i<nr; i++){
+      na += R[r+i*3+1];
+      nb += R[r+i*3+2];
+    }
+    if( R[r+nr*3]>nContext ){
+      na += nContext;
+      nb += nContext;
+    }else{
+      na += R[r+nr*3];
+      nb += R[r+nr*3];
+    }
+    for(i=1; i<nr; i++){
+      na += R[r+i*3];
+      nb += R[r+i*3];
+    }
+    blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb);
+
+    /* Show the initial common area */
+    a += skip;
+    b += skip;
+    m = R[r] - skip;
+    for(j=0; j<m; j++){
+      appendDiffLine(pOut, " ", &A[a+j]);
+    }
+    a += m;
+    b += m;
+
+    /* Show the differences */
+    for(i=0; i<nr; i++){
+      m = R[r+i*3+1];
+      for(j=0; j<m; j++){
+        appendDiffLine(pOut, "-", &A[a+j]);
+      }
+      a += m;
+      m = R[r+i*3+2];
+      for(j=0; j<m; j++){
+        appendDiffLine(pOut, "+", &B[b+j]);
+      }
+      b += m;
+      if( i<nr-1 ){
+        m = R[r+i*3+3];
+        for(j=0; j<m; j++){
+          appendDiffLine(pOut, " ", &B[b+j]);
+        }
+        b += m;
+        a += m;
+      }
+    }
+
+    /* Show the final common area */
+    assert( nr==i );
+    m = R[r+nr*3];
+    if( m>nContext ) m = nContext;
+    for(j=0; j<m; j++){
+      appendDiffLine(pOut, " ", &B[b+j]);
+    }
+  }
+}
+
+/*
+** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
+** file and lines iS2 through iE2-1 of the aTo[] file.  Locate a sequence
+** of lines in these two blocks that are exactly the same.  Return
+** the bounds of the matching sequence.
+*/
+static void longestCommonSequence(
+  DContext *p,
+  int iS1, int iE1,
+  int iS2, int iE2,
+  int *piSX, int *piEX,
+  int *piSY, int *piEY
+){
+  int bestScore = -1000000000;
+  int i, j;
+  int iSX, iSY, iEX, iEY;
+  int score, skew, dist, mid;
+
+  *piSX = iS1;
+  *piEX = iS1;
+  *piSY = iS2;
+  *piEY = iS2;
+  mid = (iE1 + iS1)/2;
+  for(i=iS1; i<iE1; i++){
+    int limit = 0;
+    j = p->aTo[p->aFrom[i].h % p->nTo].iHash;
+    while( j>0
+      && (j-1<iS2 || j>=iE2 || !same_dline(&p->aFrom[i], &p->aTo[j-1]))
+    ){
+      if( limit++ > 10 ){
+        j = 0;
+        break;
+      }
+      j = p->aTo[j-1].iNext;
+    }
+    if( j==0 ) continue;
+    iSX = i;
+    iSY = j-1;
+    while( iSX>iS1 && iSY>iS2 && same_dline(&p->aFrom[iSX-1],&p->aTo[iSY-1]) ){
+      iSX--;
+      iSY--;
+    }
+    iEX = i+1;
+    iEY = j;
+    while( iEX<iE1 && iEY<iE2 && same_dline(&p->aFrom[iEX],&p->aTo[iEY]) ){
+      iEX++;
+      iEY++;
+    }
+    skew = (iSX-iS1) - (iSY-iS2);
+    if( skew<0 ) skew = -skew;
+    dist = (iSX+iEX)/2 - mid;
+    if( dist<0 ) dist = -dist;
+    score = (iEX - iSX) - 0.05*skew - 0.05*dist;
+    if( score>bestScore ){
+      bestScore = score;
+      *piSX = iSX;
+      *piSY = iSY;
+      *piEX = iEX;
+      *piEY = iEY;
+    }
+  }
+  /* printf("LCS(%d..%d/%d..%d) = %d..%d/%d..%d\n",
+     iS1, iE1, iS2, iE2, *piSX, *piEX, *piSY, *piEY);  */
+}
+
+/*
+** Do a single step in the difference.  Compute a sequence of
+** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
+** the input into lines iS2 through iE2-1 of the output and write
+** that sequence into the difference context.
+*/
+static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){
+  int iSX, iEX, iSY, iEY;
+
+  if( iE1<=iS1 ){
+    if( iE2>iS2 ){
+      appendTriple(p, 0, 0, iE2-iS2);
+    }
+    return;
+  }
+  if( iE2<=iS2 ){
+    appendTriple(p, 0, iE1-iS1, 0);
+    return;
+  }
+
+  /* Find the longest matching segment between the two sequences */
+  longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
+
+  if( iEX>iSX ){
+    /* Recursively diff either side of the matching segment */
+    diff_step(p, iS1, iSX, iS2, iSY);
+    if( iEX>iSX ){
+      appendTriple(p, iEX - iSX, 0, 0);
+    }
+    diff_step(p, iEX, iE1, iEY, iE2);
+  }else{
+    appendTriple(p, 0, iE1-iS1, iE2-iS2);
+  }
+}
+
+/*
+** Generate a report of the differences between files pA and pB.
+** If pOut is not NULL then a unified diff is appended there.  It
+** is assumed that pOut has already been initialized.  If pOut is
+** NULL, then a pointer to an array of integers is returned.
+** The integers come in triples.  For each triple,
+** the elements are the number of lines copied, the number of
+** lines deleted, and the number of lines inserted.  The vector
+** is terminated by a triple of all zeros.
+**
+** This diff utility does not work on binary files.  If a binary
+** file is encountered, 0 is returned and pOut is written with
+** text "cannot compute difference between binary files".
+*/
+int *text_diff(
+  Blob *pA_Blob,   /* FROM file */
+  Blob *pB_Blob,   /* TO file */
+  Blob *pOut,      /* Write unified diff here if not NULL */
+  int nContext     /* Amount of context to unified diff */
+){
+  DContext c;
+  int mnE, iS, iE1, iE2;
+
+  memset(&c, 0, sizeof(c));
+  c.aFrom = break_into_lines(blob_str(pA_Blob), &c.nFrom);
+  c.aTo = break_into_lines(blob_str(pB_Blob), &c.nTo);
+  if( c.aFrom==0 || c.aTo==0 ){
+    free(c.aFrom);
+    free(c.aTo);
+    if( pOut ){
+      blob_appendf(pOut, "cannot compute difference between binary files\n");
+    }
+    return 0;
+  }
+
+  /* Carve off the common header and footer */
+  iE1 = c.nFrom;
+  iE2 = c.nTo;
+  while( iE1>0 && iE2>0 && same_dline(&c.aFrom[iE1-1], &c.aTo[iE2-1]) ){
+    iE1--;
+    iE2--;
+  }
+  mnE = iE1<iE2 ? iE1 : iE2;
+  for(iS=0; iS<mnE && same_dline(&c.aFrom[iS],&c.aTo[iS]); iS++){}
+
+  /* do the difference */
+  if( iS>0 ){
+    appendTriple(&c, iS, 0, 0);
+  }
+  diff_step(&c, iS, iE1, iS, iE2);
+  if( iE1<c.nFrom ){
+    appendTriple(&c, c.nFrom - iE1, 0, 0);
+  }
+
+  expandEdit(&c, c.nEdit+3);
+  if( c.aEdit ){
+    c.aEdit[c.nEdit++] = 0;
+    c.aEdit[c.nEdit++] = 0;
+    c.aEdit[c.nEdit++] = 0;
+  }
+
+  if( pOut ){
+    /* Compute a context diff if requested */
+    contextDiff(&c, pOut, nContext);
+    free(c.aFrom);
+    free(c.aTo);
+    free(c.aEdit);
+    return 0;
+  }else{
+    /* If a context diff is not requested, then return the
+    ** array of COPY/DELETE/INSERT triples after terminating the
+    ** array with a triple of all zeros.
+    */
+    free(c.aFrom);
+    free(c.aTo);
+    return c.aEdit;
+  }
+}
+
+#if 0  /********** Disabled and replaced by code above ************/
 
 /*
 ** Generate a report of the differences between files pA and pB.
 ** If pOut is not NULL then a unified diff is appended there.  It
@@ -473,8 +827,9 @@
 
   /* Return the result */
   return R;
 }
+#endif /***************** End of the Wagner/Myers algorithm ************/
 
 /*
 ** COMMAND: test-rawdiff
 */