Fossil: Check-in [57b2735ebd]

Overview

SHA1 Hash: 57b2735ebd12f83d9c98a441ffe946ce1fe9ab56

Date: 2007-11-15 21:49:14

User: drh

Comment: Enhanced text diff subroutine uses Myers enhancements to Wagners minimum edit distance algorithm. White space at the end of lines is ignored.

Timelines: ancestors | descendants | both | trunk

Other Links: files | ZIP archive | manifest

SHA1 Hash:	57b2735ebd12f83d9c98a441ffe946ce1fe9ab56
Date:	2007-11-15 21:49:14
User:	drh
Comment:	Enhanced text diff subroutine uses Myers enhancements to Wagners minimum edit distance algorithm. White space at the end of lines is ignored.
Timelines:	ancestors \| descendants \| both \| trunk
Other Links:	files \| ZIP archive \| manifest

Tags And Properties

branch=trunk inherited from [a28c83647d]
sym-trunk inherited from [a28c83647d]

Changes

[hide diffs]

Modified src/diff.c from [a87d303450] to [cb5702628e].

@@ -19,14 +19,15 @@
 **   drh@hwaci.com
 **   http://www.hwaci.com/drh/
 **
 *******************************************************************************
 **
-** This file contains code used to implement "diff" operators.
+** This file contains code used to compute a "diff" between two
+** text files.
 */
 #include "config.h"
-#include "diff.h"
+#include "diff2.h"
 #include <assert.h>
 
 /*
 ** Information about each line of a file being diffed.
 */
@@ -35,36 +36,45 @@
   const char *z;    /* The text of the line */
   unsigned int h;   /* Hash of the line */
 };
 
 /*
-** Break a blob into lines by converting each \n into a \000 and
-** creating pointers to the beginning of each line.
+** Break a blob into lines by converting inserting \000 characters.
+** Return an array of DLine objects containing a pointer to the
+** start of each line and a hash of that line.
+**
+** Trailing whitespace is removed from each line.
 */
 static DLine *break_into_lines(char *z, int *pnLine){
-  int nLine, i, j;
+  int nLine, i, j, k, x;
   unsigned int h;
   DLine *a;
+
+  /* Count the number of lines.  Allocate space to hold
+  ** the returned array.
+  */
   for(i=0, nLine=1; z[i]; i++){
     if( z[i]=='\n' ) nLine++;
   }
   a = malloc( nLine*sizeof(a[0]) );
   if( a==0 ) fossil_panic("out of memory");
-  a[0].z = z;
-  for(i=0, j=0, h=0; z[i]; i++){
-    if( z[i]=='\n' ){
-      a[j].h = h;
-      j++;
-      a[j].z = &z[i+1];
-      z[i] = 0;
-      h = 0;
-    }else{
-      h = h ^ (h<<2) ^ z[i];
+
+  /* Fill in the array */
+  for(i=0; i<nLine; i++){
+    a[i].z = z;
+    for(j=0; z[j] && z[j]!='\n'; j++){}
+    for(k=j; k>0 && isspace(z[k-1]); k--){}
+    z[k] = 0;
+    for(h=0, x=0; x<k; x++){
+      h = h ^ (h<<2) ^ z[x];
     }
+    a[i].h = h;
+    z += j+1;
   }
-  a[j].h = h;
-  *pnLine = j+1;
+
+  /* Return results */
+  *pnLine = nLine;
   return a;
 }
 
 /*
 ** Return true if two DLine elements are identical.
@@ -72,176 +82,342 @@
 static int same_dline(DLine *pA, DLine *pB){
   return pA->h==pB->h && strcmp(pA->z,pB->z)==0;
 }
 
 /*
-** Generate a unified diff of two blobs.  The text of the original
-** two blobs is destroyed by the diffing process.
+** Generate a report of the differences between files pA and pB.
+** The line ending (\r\n versus \n) is ignored - the two line
+** endings are considered to be equivalent.
+**
+** The return is a pointer to an array of integers that describes
+** the difference.  Integers come in triples.  For each triple,
+** the elements are the number of lines copied, the number of
+** lines delete, and the number of lines inserted.  The vector
+** is terminated by a triple of all zeros.
+**
+** The two blobs is destroyed ('\000' values are inserted)
+** by the diffing process.
+**
+** The core algorithm is a variation on the classic Wagner
+** minimum edit distance with enhancements to reduce the runtime
+** to be almost linear in the common case where the two files
+** have a lot in common.  For additional information see
+** Eugene W. Myers, "An O(ND) Difference Algorithm And Its
+** Variations"
+**
+** Consider comparing strings A and B.  A=abcabba and B=cbabac
+** We construct a "Wagner" matrix W with A along the X axis and
+** B along the Y axis:
+**
+**     c 6               *
+**     a 5               *
+**     b 4           * *
+**     a 3         *
+**     b 2       *
+**   B c 1       *
+**       0 * * *
+**         0 1 2 3 4 5 6 7
+**           a b c a b b a
+**           A
+**
+** (Note: we draw this Wagner matrix with the origin at the lower
+** left whereas Myers uses the origin at the upper left.  Otherwise,
+** they are the same.)
+**
+** Let Y be the maximum y value or the number of characters in B.
+** 6 in this example.  X is the maximum x value or the number of
+** elements in A.  Here 7.
+**
+** Our goal is to find a path from (0,0) to (X,Y).  The path can
+** use horizontal, vertical, or diagonal steps.  A diagonal from
+** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y].  A vertical
+** steps corresponds to an insertion.  A horizontal step corresponds
+** to a deletion.  We want to find the path with the fewest
+** horizontal and vertical steps.
+**
+** Diagonal k consists of all points such that x-y==k.  Diagonal
+** zero begins at the origin.  Diagonal 1 begins at (1,0).
+** Diagonal -1 begins at (0,1).  All diagonals move up and to the
+** right at 45 degrees.  Diagonal number increase from upper left
+** to lower right.
+**
+** Myers matrix M is a lower right triangular matrix with indices d
+** along the bottom and i vertically:
+**
+**
+**   i=4 |            +4  \
+**     3 |         +3 +2   |
+**     2 |      +2 +1  0   |- k values.   k = 2*i-d
+**     1 |   +1  0 -1 -2   |
+**     0 | 0 -1 -2 -3 -4  /
+**       ---------------
+**         0  1  2  3  4 = d
+**
+** Each element of the Myers matrix corresponds to a diagonal.
+** The diagonal is k=2*i-d.  The diagonal values are shown
+** in the template above.
+**
+** Each entry in M represents the end-point on a path from (0,0).
+** The end-point is on diagonal k.  The value stored in M is
+** q=x+y where (x,y) is the terminus of the path.  A path
+** to M[d][i] will come through either M[d-1][i-1] or
+** though M[d-1][i], whichever holds the largest value of q.
+** If both elements hold the same value, the path comes
+** through M[d-1][i-1].
+**
+** The value of d is the number of insertions and deletions
+** made so far on the path.  M grows progressively.  So the
+** size of the M matrix is proportional to d*d.  For the
+** common case where A and B are similar, d will be small
+** compared to X and Y so little memory is required.  The
+** original Wagner algorithm requires X*Y memory, which for
+** larger files (100K lines) is more memory than we have at
+** hand.
 */
-void unified_diff(Blob *pA, Blob *pB, int nContext, Blob *pOut){
-  DLine *pDA, *pDB, *A, *B;
-  int nA, nB, nAp1;
-  int x, y;
-  int cnt;
-  int i, iStart;
-  int *m;
+int *text_diff(Blob *pA_Blob, Blob *pB_Blob, Blob *pOut, int nContext){
+  DLine *A, *B;    /* Files being compared */
+  int X, Y;        /* Number of elements in A and B */
+  int x, y;        /* Indices:  A[x] and B[y] */
+  int szM = 0;     /* Number of rows and columns in M */
+  int **M = 0;     /* Myers matrix */
+  int i, d;        /* Indices on M.  M[d][i] */
+  int k, q;        /* Diagonal number and distinct from (0,0) */
+  int K, D;        /* The diagonal and d for the final solution */
+  int *R;          /* Result vector */
+  int r;           /* Loop variables */
+  int go = 1;      /* Outer loop control */
 
   /* Break the two files being diffed into individual lines.
   ** Compute hashes on each line for fast comparison.
   */
-  pDA = break_into_lines(blob_str(pA), &nA);
-  pDB = break_into_lines(blob_str(pB), &nB);
-
-  /* Remove common prefix and suffix to help reduce the value
-  ** of N in the O(N^2) minimum edit distance algorithm.
-  */
-  for(i=0; i<nA && i<nB && same_dline(&pDA[i],&pDB[i]); i++){}
-  i -= nContext;
-  if( i<0 ) i = 0;
-  iStart = i;
-  A = &pDA[iStart];
-  B = &pDB[iStart];
-  nA -= iStart;
-  nB -= iStart;
-  for(i=1; i<nA && i<nB && same_dline(&A[nA-i],&B[nB-i]); i++){}
-  i -= nContext;
-  if( i<1 ) i = 1;
-  i--;
-  nA -= i;
-  nB -= i;
-
-  /* Create the matrix used for the minimum edit distance
-  ** calculation.
-  */
-  nAp1 = nA + 1;
-  m = malloc( sizeof(m[0])*(nB+1)*nAp1 );
-# define M(X,Y) m[(Y)*nAp1+(X)]
-
-
-  /* Find the minimum edit distance using Wagner's algorithm.
-  */
-  for(x=0; x<=nA; x++){
-    M(x,0) = x;
+  A = break_into_lines(blob_str(pA_Blob), &X);
+  B = break_into_lines(blob_str(pB_Blob), &Y);
+
+  szM = 0;
+  for(d=0; go; d++){
+    if( szM<d+1 ){
+      szM += szM + 10;
+      M = realloc(M, sizeof(M[0])*szM);
+      if( M==0 ){
+        fossil_panic("out of memory");
+      }
+    }
+    M[d] = malloc( sizeof(M[d][0])*(d+1) );
+    if( M[d]==0 ){
+      fossil_panic("out of memory");
+    }
+    for(i=0; i<=d; i++){
+      k = 2*i - d;
+      if( d==0 ){
+        q = 0;
+      }else if( i==0 ){
+        q = M[d-1][0];
+      }else if( M[d-1][i-1] < M[d-1][i] ){
+        q = M[d-1][i];
+      }else{
+        q = M[d-1][i-1];
+      }
+      x = (k + q + 1)/2;
+      y = x - k;
+      while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; }
+      M[d][i] = x + y;
+      if( x==X && y==Y ){
+        go = 0;
+        break;
+      }
+    }
   }
-  for(y=0; y<=nB; y++){
-    M(0,y) = y;
-  }
-  for(x=1; x<=nA; x++){
-    for(y=1; y<=nB; y++){
-      int e = M(x-1,y) + 1;
-      if( e>M(x,y-1)+1 ){
-        e = M(x,y-1)+1;
-      }
-      if( e<=M(x-1,y-1) ){
-        M(x,y) = e;
-      }else if( same_dline(&A[x-1], &B[y-1]) ){
-        M(x,y) = M(x-1,y-1);
-      }else{
-        M(x,y) = e;
-      }
+
+  /* Reuse M[] as follows:
+  **
+  **     M[d][1] = 1 if a line is inserted or 1 if a line is deleted.
+  **     M[d][0] = number of lines copied at this step.
+  **
+  */
+  D = d - 1;
+  K = X - Y;
+  for(d=D, i=(K+D)/2; d>0; d--){
+    if( i==d || M[d-1][i-1] > M[d-1][i] ){
+      M[d][0] = M[d][i] - M[d-1][i-1] - 1;
+      M[d][1] = 0;
+      i--;
+    }else{
+      M[d][0] = M[d][i] - M[d-1][i] - 1;
+      M[d][1] = 1;
     }
+  }
+
+  /* Allocate the output vector
+  */
+  R = malloc( sizeof(R[0])*(D+2)*3 );
+  if( R==0 ){
+    fossil_fatal("out of memory");
   }
 
-  /* Walk backwards through the Wagner algorithm matrix to determine
-  ** the specific edits that give the minimum edit distance.  Mark our
-  ** path through the matrix with -1.
+  /* Populate the output vector
   */
-  x = nA;
-  y = nB;
-  while( x>0 || y>0 ){
-    int v = M(x,y);
-    M(x,y) = -1;
-    if( x==0 ){
-      y--;
-    }else if( y==0 ){
-      x--;
-    }else if( M(x,y-1)+1==v ){
-      y--;
-    }else if( M(x-1,y)+1==v ){
-      x--;
+  d = r = 0;
+  while( d<=D ){
+    int n;
+    R[r++] = M[d++][0]/2;   /* COPY */
+    if( d>D ){
+      R[r++] = 0;
+      R[r++] = 0;
+      break;
+    }
+    if( M[d][1]==0 ){
+      n = 1;
+      while( M[d][0]==0 && d<D && M[d+1][1]==0 ){
+        d++;
+        n++;
+      }
+      R[r++] = n;           /* DELETE */
+      if( d==D || M[d][0]>0 ){
+        R[r++] = 0;         /* INSERT */
+        continue;
+      }
+      d++;
     }else{
-      x--;
-      y--;
+      R[r++] = 0;           /* DELETE */
+    }
+    assert( M[d][1]==1 );
+    n = 1;
+    while( M[d][0]==0 && d<D && M[d+1][1]==1 ){
+      d++;
+      n++;
     }
+    R[r++] = n;            /* INSERT */
   }
+  R[r++] = 0;
+  R[r++] = 0;
+  R[r++] = 0;
 
-#if 0
-for(y=0; y<=nB; y++){
-  for(x=0; x<=nA; x++){
-    printf(" %2d", M(x,y));
+  /* Free the Myers matrix */
+  for(d=0; d<=D; d++){
+    free(M[d]);
   }
-  printf("\n");
-}
-#endif
+  free(M);
+
+  /* If pOut is defined, construct a unified diff into pOut and
+  ** delete R
+  */
+  if( pOut ){
+    int a = 0;    /* Index of next line in A[] */
+    int b = 0;    /* Index of next line in B[] */
+    int nr;       /* Number of COPY/DELETE/INSERT triples to process */
+    int na, nb;   /* Number of lines shown from A and B */
+    int i, j;     /* Loop counters */
+    int m;        /* Number of lines to output */
+    int skip;     /* Number of lines to skip */
+
+    for(r=0; R[r+3]; r += 3*nr){
+      /* Figure out how many triples to show in a single block */
+      for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){}
+
+      /* For the current block comprising nr triples, figure out
+      ** how many lines of A and B are to be displayed
+      */
+      if( R[r]>nContext ){
+        na = nb = nContext;
+        skip = R[r] - nContext;
+      }else{
+        na = nb = R[r];
+        skip = 0;
+      }
+      for(i=0; i<nr; i++){
+        na += R[r+i*3+1];
+        nb += R[r+i*3+2];
+      }
+      if( R[r+i*3]>nContext ){
+        na += nContext;
+        nb += nContext;
+      }else{
+        na += R[r+i*3];
+        nb += R[r+i*3];
+      }
+      for(i=1; i<nr; i++){
+        na += R[r+i*3];
+        nb += R[r+i*3];
+      }
+      blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb);
+
+      /* Show the initial common area */
+      a += skip;
+      b += skip;
+      m = R[r] - skip;
+      for(j=0; j<m; j++){
+        blob_appendf(pOut," %s\n", A[a+j].z);
+      }
+      a += m;
+      b += m;
 
-  x = y = 0;
-  cnt = nContext;
-  while( x<nA || y<nB ){
-    int t1=0, t2=0;
-    if( (t1 = M(x+1,y))<0 || (t2 = M(x,y+1))<0 ){
-      if( cnt>=nContext ){
-        blob_appendf(pOut, "@@ -%d +%d @@\n",
-            x-nContext+iStart+2, y-nContext+iStart+2);
-        for(i=x-nContext+1; i<x; i++){
-          if( i<0 ) continue;
-          blob_appendf(pOut, " %s\n", A[i].z);
+      /* Show the differences */
+      for(i=0; i<nr; i++){
+        m = R[r+i*3+1];
+        for(j=0; j<m; j++){
+          blob_appendf(pOut,"-%s\n", A[a+j].z);
+        }
+        a += m;
+        m = R[r+i*3+2];
+        for(j=0; j<m; j++){
+          blob_appendf(pOut,"+%s\n", B[b+j].z);
+        }
+        b += m;
+        if( i<nr-1 ){
+          m = R[r+i*3+3];
+          for(j=0; j<m; j++){
+            blob_appendf(pOut," %s\n", B[b+j].z);
+          }
+          b += m;
+          a += m;
         }
       }
-    }
-    if( t1<0 ){
-      blob_appendf(pOut, "-%s\n", A[x].z);
-      x++;
-      cnt = 0;
-    }else if( t2<0 ){
-      blob_appendf(pOut, "+%s\n", B[y].z);
-      y++;
-      cnt = 0;
-    }else{
-      if( M(x+1,y+1)==(-1) && cnt<nContext ){
-        blob_appendf(pOut, " %s\n", A[x].z);
+
+      /* Show the final common area */
+      assert( nr==i );
+      m = R[r+nr*3];
+      if( m>nContext ) m = nContext;
+      for(j=0; j<m; j++){
+        blob_appendf(pOut," %s\n", B[b+j].z);
       }
-      cnt++;
-      x++;
-      y++;
     }
+    free(R);
+    R = 0;
   }
+
+  /* We no longer need the A[] and B[] vectors */
+  free(A);
+  free(B);
+
+  /* Return the result */
+  return R;
+}
 
-  /* Cleanup allocationed memory */
-  free(m);
-  free(pDA);
-  free(pDB);
+/*
+** COMMAND: test-rawdiff
+*/
+void test_rawdiff_cmd(void){
+  Blob a, b;
+  int r;
+  int *R;
+  if( g.argc!=4 ) usage("FILE1 FILE2");
+  blob_read_from_file(&a, g.argv[2]);
+  blob_read_from_file(&b, g.argv[3]);
+  R = text_diff(&a, &b, 0, 0);
+  for(r=0; R[r] || R[r+1] || R[r+2]; r += 3){
+    printf(" copy %4d  delete %4d  insert %4d\n", R[r], R[r+1], R[r+2]);
+  }
+  free(R);
 }
 
 /*
-** COMMAND: test-diff
+** COMMAND: test-udiff
 */
-void test_diff_cmd(void){
+void test_udiff_cmd(void){
   Blob a, b, out;
   if( g.argc!=4 ) usage("FILE1 FILE2");
   blob_read_from_file(&a, g.argv[2]);
   blob_read_from_file(&b, g.argv[3]);
   blob_zero(&out);
-  unified_diff(&a, &b, 4, &out);
-  blob_reset(&a);
-  blob_reset(&b);
-  printf("%s", blob_str(&out));
-  blob_reset(&out);
-}
-/*
-** COMMAND: test-uuid-diff
-*/
-void test_uuiddiff_cmd(void){
-  Blob a, b, out;
-  int ridA, ridB;
-  if( g.argc!=4 ) usage("UUID2 UUID1");
-  db_must_be_within_tree();
-  ridA = name_to_rid(g.argv[2]);
-  content_get(ridA, &a);
-  ridB = name_to_rid(g.argv[3]);
-  content_get(ridB, &b);
-  blob_zero(&out);
-  unified_diff(&a, &b, 4, &out);
-  blob_reset(&a);
-  blob_reset(&b);
-  printf("%s", blob_str(&out));
-  blob_reset(&out);
+  text_diff(&a, &b, &out, 3);
+  blob_write_to_file(&out, "-");
 }

Modified src/diffcmd.c from [637be09787] to [dfc5dfd350].

@@ -124,11 +124,11 @@
     Blob out;
     Blob current;
     blob_zero(&current);
     blob_read_from_file(&current, zFile);
     blob_zero(&out);
-    unified_diff(&record, &current, 5, &out);
+    text_diff(&record, &current, &out, 5);
     printf("%s\n", blob_str(&out));
     blob_reset(&current);
     blob_reset(&out);
   }else{
     blob_write_to_file(&record, blob_str(&vname));

Modified src/info.c from [2ca8ead9e1] to [d4602a7e16].

@@ -510,11 +510,11 @@
 static void append_diff(int fromid, int toid){
   Blob from, to, out;
   content_get(fromid, &from);
   content_get(toid, &to);
   blob_zero(&out);
-  unified_diff(&from, &to, 5, &out);
+  text_diff(&from, &to, &out, 5);
   @ %h(blob_str(&out))
   blob_reset(&from);
   blob_reset(&to);
   blob_reset(&out);
 }
@@ -686,11 +686,11 @@
   @ <hr>
   @ <blockquote><pre>
   content_get(v1, &c1);
   content_get(v2, &c2);
   blob_zero(&diff);
-  unified_diff(&c1, &c2, 4, &diff);
+  text_diff(&c1, &c2, &diff, 4);
   blob_reset(&c1);
   blob_reset(&c2);
   @ %h(blob_str(&diff))
   @ </pre></blockquote>
   blob_reset(&diff);

Modified src/merge3.c from [73c60b9f66] to [2a2187581a].

@@ -24,11 +24,11 @@
 ** This module implements a 3-way merge
 */
 #include "config.h"
 #include "merge3.h"
 
-#if 0
+#if 1
 # define DEBUG1(X) X
 #else
 # define DEBUG1(X)
 #endif
 #if 0