Fossil: Diff

Differences From:

File src/diff.c part of check-in [dbda8d6ce9] - Initial check-in of m1 sources. by drh on 2007-07-21 14:10:57. [view]
To:

File src/diff.c part of check-in [7e9e5fea77] - Fix a bug in the unified diff generator. by drh on 2007-11-27 03:30:50. Also file src/diff.c part of check-in [d0305b305a] - Merged mainline into my branch to get the newest application. by aku on 2007-12-05 08:07:46. [view]
@@ -20,228 +20,492 @@
 **   http://www.hwaci.com/drh/
 **
 *******************************************************************************
 **
-** This file contains code used to implement "diff" operators.
+** This file contains code used to compute a "diff" between two
+** text files.
 */
 #include "config.h"
 #include "diff.h"
 #include <assert.h>
 
+
+#if 0
+#define DEBUG(X) X
+#else
+#define DEBUG(X)
+#endif
+
 /*
 ** Information about each line of a file being diffed.
+**
+** The lower 20 bits of the hash are the length of the
+** line.  If any line is longer than 1048575 characters,
+** the file is considered binary.
 */
 typedef struct DLine DLine;
 struct DLine {
   const char *z;    /* The text of the line */
   unsigned int h;   /* Hash of the line */
 };
 
+#define LENGTH_MASK  0x000fffff
+
 /*
-** Break a blob into lines by converting each \n into a \000 and
-** creating pointers to the beginning of each line.
+** Return an array of DLine objects containing a pointer to the
+** start of each line and a hash of that line.  The lower
+** bits of the hash store the length of each line.
+**
+** Trailing whitespace is removed from each line.
+**
+** Return 0 if the file is binary or contains a line that is
+** longer than 1048575 bytes.
 */
 static DLine *break_into_lines(char *z, int *pnLine){
-  int nLine, i, j;
+  int nLine, i, j, k, x;
   unsigned int h;
   DLine *a;
-  for(i=0, nLine=1; z[i]; i++){
-    if( z[i]=='\n' ) nLine++;
+
+  /* Count the number of lines.  Allocate space to hold
+  ** the returned array.
+  */
+  for(i=j=0, nLine=1; z[i]; i++, j++){
+    int c = z[i];
+    if( c==0 ){
+      return 0;
+    }
+    if( c=='\n' && z[i+1]!=0 ){
+      nLine++;
+      if( j>1048575 ){
+        return 0;
+      }
+      j = 0;
+    }
+  }
+  if( j>1048575 ){
+    return 0;
   }
   a = malloc( nLine*sizeof(a[0]) );
   if( a==0 ) fossil_panic("out of memory");
-  a[0].z = z;
-  for(i=0, j=0, h=0; z[i]; i++){
-    if( z[i]=='\n' ){
-      a[j].h = h;
-      j++;
-      a[j].z = &z[i+1];
-      z[i] = 0;
-      h = 0;
-    }else{
-      h = h ^ (h<<2) ^ z[i];
+
+  /* Fill in the array */
+  for(i=0; i<nLine; i++){
+    a[i].z = z;
+    for(j=0; z[j] && z[j]!='\n'; j++){}
+    for(k=j; k>0 && isspace(z[k-1]); k--){}
+    for(h=0, x=0; x<k; x++){
+      h = h ^ (h<<2) ^ z[x];
     }
+    a[i].h = (h<<20) | k;;
+    z += j+1;
   }
-  a[j].h = h;
-  *pnLine = j+1;
+
+  /* Return results */
+  *pnLine = nLine;
   return a;
 }
 
 /*
 ** Return true if two DLine elements are identical.
 */
 static int same_dline(DLine *pA, DLine *pB){
-  return pA->h==pB->h && strcmp(pA->z,pB->z)==0;
+  return pA->h==pB->h && memcmp(pA->z,pB->z,pA->h & LENGTH_MASK)==0;
+}
+
+/*
+** Append a single line of "diff" output to pOut.
+*/
+static void appendDiffLine(Blob *pOut, char *zPrefix, DLine *pLine){
+  blob_append(pOut, zPrefix, 1);
+  blob_append(pOut, pLine->z, pLine->h & LENGTH_MASK);
+  blob_append(pOut, "\n", 1);
 }
 
+
 /*
-** Generate a unified diff of two blobs.  The text of the original
-** two blobs is destroyed by the diffing process.
+** Generate a report of the differences between files pA and pB.
+** If pOut is not NULL then a unified diff is appended there.  It
+** is assumed that pOut has already been initialized.  If pOut is
+** NULL, then a pointer to an array of integers is returned.
+** The integers come in triples.  For each triple,
+** the elements are the number of lines copied, the number of
+** lines deleted, and the number of lines inserted.  The vector
+** is terminated by a triple of all zeros.
+**
+** This diff utility does not work on binary files.  If a binary
+** file is encountered, 0 is returned and pOut is written with
+** text "cannot compute difference between binary files".
+**
+****************************************************************************
+**
+** The core algorithm is a variation on the classic Wagner
+** minimum edit distance with enhancements to reduce the runtime
+** to be almost linear in the common case where the two files
+** have a lot in common.  For additional information see
+** Eugene W. Myers, "An O(ND) Difference Algorithm And Its
+** Variations"
+**
+** Consider comparing strings A and B.  A=abcabba and B=cbabac
+** We construct a "Wagner" matrix W with A along the X axis and
+** B along the Y axis:
+**
+**     c 6               *
+**     a 5               *
+**     b 4           * *
+**     a 3         *
+**     b 2       *
+**   B c 1       *
+**       0 * * *
+**         0 1 2 3 4 5 6 7
+**           a b c a b b a
+**           A
+**
+** (Note: we draw this Wagner matrix with the origin at the lower
+** left whereas Myers uses the origin at the upper left.  Otherwise,
+** they are the same.)
+**
+** Let Y be the maximum y value or the number of characters in B.
+** 6 in this example.  X is the maximum x value or the number of
+** elements in A.  Here 7.
+**
+** Our goal is to find a path from (0,0) to (X,Y).  The path can
+** use horizontal, vertical, or diagonal steps.  A diagonal from
+** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y].  A vertical
+** steps corresponds to an insertion.  A horizontal step corresponds
+** to a deletion.  We want to find the path with the fewest
+** horizontal and vertical steps.
+**
+** Diagonal k consists of all points such that x-y==k.  Diagonal
+** zero begins at the origin.  Diagonal 1 begins at (1,0).
+** Diagonal -1 begins at (0,1).  All diagonals move up and to the
+** right at 45 degrees.  Diagonal number increase from upper left
+** to lower right.
+**
+** Myers matrix M is a lower right triangular matrix with indices d
+** along the bottom and i vertically:
+**
+**
+**   i=4 |            +4  \
+**     3 |         +3 +2   |
+**     2 |      +2 +1  0   |- k values.   k = 2*i-d
+**     1 |   +1  0 -1 -2   |
+**     0 | 0 -1 -2 -3 -4  /
+**       ---------------
+**         0  1  2  3  4 = d
+**
+** Each element of the Myers matrix corresponds to a diagonal.
+** The diagonal is k=2*i-d.  The diagonal values are shown
+** in the template above.
+**
+** Each entry in M represents the end-point on a path from (0,0).
+** The end-point is on diagonal k.  The value stored in M is
+** q=x+y where (x,y) is the terminus of the path.  A path
+** to M[d][i] will come through either M[d-1][i-1] or
+** though M[d-1][i], whichever holds the largest value of q.
+** If both elements hold the same value, the path comes
+** through M[d-1][i-1].
+**
+** The value of d is the number of insertions and deletions
+** made so far on the path.  M grows progressively.  So the
+** size of the M matrix is proportional to d*d.  For the
+** common case where A and B are similar, d will be small
+** compared to X and Y so little memory is required.  The
+** original Wagner algorithm requires X*Y memory, which for
+** larger files (100K lines) is more memory than we have at
+** hand.
 */
-void unified_diff(Blob *pA, Blob *pB, int nContext, Blob *pOut){
-  DLine *pDA, *pDB, *A, *B;
-  int nA, nB, nAp1;
-  int x, y;
-  int cnt;
-  int i, iStart;
-  int *m;
+int *text_diff(
+  Blob *pA_Blob,   /* FROM file */
+  Blob *pB_Blob,   /* TO file */
+  Blob *pOut,      /* Write unified diff here if not NULL */
+  int nContext     /* Amount of context to unified diff */
+){
+  DLine *A, *B;    /* Files being compared */
+  int X, Y;        /* Number of elements in A and B */
+  int x, y;        /* Indices:  A[x] and B[y] */
+  int szM = 0;     /* Number of rows and columns in M */
+  int **M = 0;     /* Myers matrix */
+  int i, d;        /* Indices on M.  M[d][i] */
+  int k, q;        /* Diagonal number and distinct from (0,0) */
+  int K, D;        /* The diagonal and d for the final solution */
+  int *R = 0;      /* Result vector */
+  int r;           /* Loop variables */
+  int go = 1;      /* Outer loop control */
+  int MAX;         /* Largest of X and Y */
 
   /* Break the two files being diffed into individual lines.
   ** Compute hashes on each line for fast comparison.
   */
-  pDA = break_into_lines(blob_str(pA), &nA);
-  pDB = break_into_lines(blob_str(pB), &nB);
-
-  /* Remove common prefix and suffix to help reduce the value
-  ** of N in the O(N^2) minimum edit distance algorithm.
-  */
-  for(i=0; i<nA && i<nB && same_dline(&pDA[i],&pDB[i]); i++){}
-  i -= nContext;
-  if( i<0 ) i = 0;
-  iStart = i;
-  A = &pDA[iStart];
-  B = &pDB[iStart];
-  nA -= iStart;
-  nB -= iStart;
-  for(i=1; i<nA && i<nB && same_dline(&A[nA-i],&B[nB-i]); i++){}
-  i -= nContext;
-  if( i<1 ) i = 1;
-  i--;
-  nA -= i;
-  nB -= i;
-
-  /* Create the matrix used for the minimum edit distance
-  ** calculation.
-  */
-  nAp1 = nA + 1;
-  m = malloc( sizeof(m[0])*(nB+1)*nAp1 );
-# define M(X,Y) m[(Y)*nAp1+(X)]
-
-
-  /* Find the minimum edit distance using Wagner's algorithm.
-  */
-  for(x=0; x<=nA; x++){
-    M(x,0) = x;
+  A = break_into_lines(blob_str(pA_Blob), &X);
+  B = break_into_lines(blob_str(pB_Blob), &Y);
+
+  if( A==0 || B==0 ){
+    free(A);
+    free(B);
+    if( pOut ){
+      blob_appendf(pOut, "cannot compute difference between binary files\n");
+    }
+    return 0;
+  }
+
+  szM = 0;
+  MAX = X>Y ? X : Y;
+  if( MAX>2000 ) MAX = 2000;
+  for(d=0; go && d<=MAX; d++){
+    if( szM<d+1 ){
+      szM += szM + 10;
+      M = realloc(M, sizeof(M[0])*szM);
+      if( M==0 ){
+        fossil_panic("out of memory");
+      }
+    }
+    M[d] = malloc( sizeof(M[d][0])*(d+1) );
+    if( M[d]==0 ){
+      fossil_panic("out of memory");
+    }
+    for(i=0; i<=d; i++){
+      k = 2*i - d;
+      if( d==0 ){
+        q = 0;
+      }else if( i==0 ){
+        q = M[d-1][0];
+      }else if( i<d-1 && M[d-1][i-1] < M[d-1][i] ){
+        q = M[d-1][i];
+      }else{
+        q = M[d-1][i-1];
+      }
+      x = (k + q + 1)/2;
+      y = x - k;
+      if( x<0 || x>X || y<0 || y>Y ){
+        x = y = 0;
+      }else{
+        while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; }
+      }
+      M[d][i] = x + y;
+      DEBUG( printf("M[%d][%i] = %d  k=%d (%d,%d)\n", d, i, x+y, k, x, y); )
+      if( x==X && y==Y ){
+        go = 0;
+        break;
+      }
+    }
   }
-  for(y=0; y<=nB; y++){
-    M(0,y) = y;
-  }
-  for(x=1; x<=nA; x++){
-    for(y=1; y<=nB; y++){
-      int e = M(x-1,y) + 1;
-      if( e>M(x,y-1)+1 ){
-        e = M(x,y-1)+1;
+  if( d>MAX ){
+    R = malloc( sizeof(R[0])*7 );
+    R[0] = 0;
+    R[1] = X;
+    R[2] = Y;
+    R[3] = 0;
+    R[4] = 0;
+    R[5] = 0;
+    R[6] = 0;
+  }else{
+    /* Reuse M[] as follows:
+    **
+    **     M[d][1] = 1 if a line is inserted or 0 if a line is deleted.
+    **     M[d][0] = number of lines copied after the ins or del above.
+    **
+    */
+    D = d - 1;
+    K = X - Y;
+    for(d=D, i=(K+D)/2; d>0; d--){
+      DEBUG( printf("d=%d i=%d\n", d, i); )
+      if( i==d || (i>0 && M[d-1][i-1] > M[d-1][i]) ){
+        M[d][0] = M[d][i] - M[d-1][i-1] - 1;
+        M[d][1] = 0;
+        i--;
+      }else{
+        M[d][0] = M[d][i] - M[d-1][i] - 1;
+        M[d][1] = 1;
+      }
+    }
+
+    DEBUG(
+      printf("---------------\nM[0][0] = %5d\n", M[0][0]);
+      for(d=1; d<=D; d++){
+        printf("M[%d][0] = %5d    M[%d][1] = %d\n",d,M[d][0],d,M[d][1]);
+      }
+    )
+
+    /* Allocate the output vector
+    */
+    R = malloc( sizeof(R[0])*(D+2)*3 );
+    if( R==0 ){
+      fossil_fatal("out of memory");
+    }
+
+    /* Populate the output vector
+    */
+    d = r = 0;
+    while( d<=D ){
+      int n;
+      R[r++] = M[d++][0]/2;   /* COPY */
+      if( d>D ){
+        R[r++] = 0;
+        R[r++] = 0;
+        break;
       }
-      if( e<=M(x-1,y-1) ){
-        M(x,y) = e;
-      }else if( same_dline(&A[x-1], &B[y-1]) ){
-        M(x,y) = M(x-1,y-1);
+      if( M[d][1]==0 ){
+        n = 1;
+        while( M[d][0]==0 && d<D && M[d+1][1]==0 ){
+          d++;
+          n++;
+        }
+        R[r++] = n;           /* DELETE */
+        if( d==D || M[d][0]>0 ){
+          R[r++] = 0;         /* INSERT */
+          continue;
+        }
+        d++;
       }else{
-        M(x,y) = e;
+        R[r++] = 0;           /* DELETE */
+      }
+      assert( M[d][1]==1 );
+      n = 1;
+      while( M[d][0]==0 && d<D && M[d+1][1]==1 ){
+        d++;
+        n++;
       }
+      R[r++] = n;            /* INSERT */
     }
+    R[r++] = 0;
+    R[r++] = 0;
+    R[r++] = 0;
+  }
+
+  /* Free the Myers matrix */
+  for(d=0; d<=D; d++){
+    free(M[d]);
   }
+  free(M);
 
-  /* Walk backwards through the Wagner algorithm matrix to determine
-  ** the specific edits that give the minimum edit distance.  Mark our
-  ** path through the matrix with -1.
+  /* If pOut is defined, construct a unified diff into pOut and
+  ** delete R
   */
-  x = nA;
-  y = nB;
-  while( x>0 || y>0 ){
-    int v = M(x,y);
-    M(x,y) = -1;
-    if( x==0 ){
-      y--;
-    }else if( y==0 ){
-      x--;
-    }else if( M(x,y-1)+1==v ){
-      y--;
-    }else if( M(x-1,y)+1==v ){
-      x--;
-    }else{
-      x--;
-      y--;
-    }
-  }
+  if( pOut ){
+    int a = 0;    /* Index of next line in A[] */
+    int b = 0;    /* Index of next line in B[] */
+    int nr;       /* Number of COPY/DELETE/INSERT triples to process */
+    int mxr;      /* Maximum value for r */
+    int na, nb;   /* Number of lines shown from A and B */
+    int i, j;     /* Loop counters */
+    int m;        /* Number of lines to output */
+    int skip;     /* Number of lines to skip */
+
+    for(mxr=0; R[mxr+1] || R[mxr+2] || R[mxr+3]; mxr += 3){}
+    for(r=0; r<mxr; r += 3*nr){
+      /* Figure out how many triples to show in a single block */
+      for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){}
+      DEBUG( printf("r=%d nr=%d\n", r, nr); )
+
+      /* For the current block comprising nr triples, figure out
+      ** how many lines of A and B are to be displayed
+      */
+      if( R[r]>nContext ){
+        na = nb = nContext;
+        skip = R[r] - nContext;
+      }else{
+        na = nb = R[r];
+        skip = 0;
+      }
+      for(i=0; i<nr; i++){
+        na += R[r+i*3+1];
+        nb += R[r+i*3+2];
+      }
+      if( R[r+nr*3]>nContext ){
+        na += nContext;
+        nb += nContext;
+      }else{
+        na += R[r+nr*3];
+        nb += R[r+nr*3];
+      }
+      for(i=1; i<nr; i++){
+        na += R[r+i*3];
+        nb += R[r+i*3];
+      }
+      blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb);
 
-#if 0
-for(y=0; y<=nB; y++){
-  for(x=0; x<=nA; x++){
-    printf(" %2d", M(x,y));
-  }
-  printf("\n");
-}
-#endif
+      /* Show the initial common area */
+      a += skip;
+      b += skip;
+      m = R[r] - skip;
+      for(j=0; j<m; j++){
+        appendDiffLine(pOut, " ", &A[a+j]);
+      }
+      a += m;
+      b += m;
 
-  x = y = 0;
-  cnt = nContext;
-  while( x<nA || y<nB ){
-    int t1, t2;
-    if( (t1 = M(x+1,y))<0 || (t2 = M(x,y+1))<0 ){
-      if( cnt>=nContext ){
-        blob_appendf(pOut, "@@ -%d +%d @@\n",
-            x-nContext+iStart+2, y-nContext+iStart+2);
-        for(i=x-nContext+1; i<x; i++){
-          if( i<0 ) continue;
-          blob_appendf(pOut, " %s\n", A[i].z);
+      /* Show the differences */
+      for(i=0; i<nr; i++){
+        m = R[r+i*3+1];
+        for(j=0; j<m; j++){
+          appendDiffLine(pOut, "-", &A[a+j]);
+        }
+        a += m;
+        m = R[r+i*3+2];
+        for(j=0; j<m; j++){
+          appendDiffLine(pOut, "+", &B[b+j]);
+        }
+        b += m;
+        if( i<nr-1 ){
+          m = R[r+i*3+3];
+          for(j=0; j<m; j++){
+            appendDiffLine(pOut, " ", &B[b+j]);
+          }
+          b += m;
+          a += m;
         }
       }
-    }
-    if( t1<0 ){
-      blob_appendf(pOut, "-%s\n", A[x].z);
-      x++;
-      cnt = 0;
-    }else if( t2<0 ){
-      blob_appendf(pOut, "+%s\n", B[y].z);
-      y++;
-      cnt = 0;
-    }else{
-      if( M(x+1,y+1)==(-1) && cnt<nContext ){
-        blob_appendf(pOut, " %s\n", A[x].z);
+
+      /* Show the final common area */
+      assert( nr==i );
+      m = R[r+nr*3];
+      if( m>nContext ) m = nContext;
+      for(j=0; j<m; j++){
+        appendDiffLine(pOut, " ", &B[b+j]);
       }
-      cnt++;
-      x++;
-      y++;
     }
+    free(R);
+    R = 0;
   }
 
-  /* Cleanup allocationed memory */
-  free(m);
-  free(pDA);
-  free(pDB);
+  /* We no longer need the A[] and B[] vectors */
+  free(A);
+  free(B);
+
+  /* Return the result */
+  return R;
+}
+
+/*
+** COMMAND: test-rawdiff
+*/
+void test_rawdiff_cmd(void){
+  Blob a, b;
+  int r;
+  int i;
+  int *R;
+  if( g.argc<4 ) usage("FILE1 FILE2 ...");
+  blob_read_from_file(&a, g.argv[2]);
+  for(i=3; i<g.argc; i++){
+    if( i>3 ) printf("-------------------------------\n");
+    blob_read_from_file(&b, g.argv[i]);
+    R = text_diff(&a, &b, 0, 0);
+    for(r=0; R[r] || R[r+1] || R[r+2]; r += 3){
+      printf(" copy %4d  delete %4d  insert %4d\n", R[r], R[r+1], R[r+2]);
+    }
+    /* free(R); */
+    blob_reset(&b);
+  }
 }
 
 /*
-** COMMAND: test-diff
+** COMMAND: test-udiff
 */
-void test_diff_cmd(void){
+void test_udiff_cmd(void){
   Blob a, b, out;
   if( g.argc!=4 ) usage("FILE1 FILE2");
   blob_read_from_file(&a, g.argv[2]);
   blob_read_from_file(&b, g.argv[3]);
   blob_zero(&out);
-  unified_diff(&a, &b, 4, &out);
-  blob_reset(&a);
-  blob_reset(&b);
-  printf("%s", blob_str(&out));
-  blob_reset(&out);
-}
-/*
-** COMMAND: test-uuid-diff
-*/
-void test_uuiddiff_cmd(void){
-  Blob a, b, out;
-  int ridA, ridB;
-  if( g.argc!=4 ) usage("UUID2 UUID1");
-  db_must_be_within_tree();
-  ridA = name_to_rid(g.argv[2]);
-  content_get(ridA, &a);
-  ridB = name_to_rid(g.argv[3]);
-  content_get(ridB, &b);
-  blob_zero(&out);
-  unified_diff(&a, &b, 4, &out);
-  blob_reset(&a);
-  blob_reset(&b);
-  printf("%s", blob_str(&out));
-  blob_reset(&out);
+  text_diff(&a, &b, &out, 3);
+  blob_write_to_file(&out, "-");
 }