Fossil: Diff

Differences From:

File src/diff.c part of check-in [e8cf0061cc] - Tweaks to the diff algorithm give a 4x performance increase. Now comparable to command-line diff. by drh on 2008-02-04 13:53:55. [view]
To:

File src/diff.c part of check-in [eeea77f340] - Improvements to comments on the diff algorithm code. Completely remove the older Wagner/Myers algorithm which had been commented out. by drh on 2008-02-04 14:05:03. [view]
@@ -300,13 +300,13 @@
 ** of lines in these two blocks that are exactly the same.  Return
 ** the bounds of the matching sequence.
 */
 static void longestCommonSequence(
-  DContext *p,
-  int iS1, int iE1,
-  int iS2, int iE2,
-  int *piSX, int *piEX,
-  int *piSY, int *piEY
+  DContext *p,               /* Two files being compared */
+  int iS1, int iE1,          /* Range of lines in p->aFrom[] */
+  int iS2, int iE2,          /* Range of lines in p->aTo[] */
+  int *piSX, int *piEX,      /* Write p->aFrom[] common segment here */
+  int *piSY, int *piEY       /* Write p->aTo[] common segment here */
 ){
   double bestScore = -1e30;  /* Best score seen so far */
   int i, j;                  /* Loop counters */
   int iSX, iSY, iEX, iEY;    /* Current match */
@@ -380,19 +380,27 @@
 ** Do a single step in the difference.  Compute a sequence of
 ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
 ** the input into lines iS2 through iE2-1 of the output and write
 ** that sequence into the difference context.
+**
+** The algorithm is to find a block of common text near the middle
+** of the two segments being diffed.  Then recursively compute
+** differences on the blocks before and after that common segment.
+** Special cases apply if either input segment is empty or if the
+** two segments have no text in common.
 */
 static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){
   int iSX, iEX, iSY, iEY;
 
   if( iE1<=iS1 ){
+    /* The first segment is empty */
     if( iE2>iS2 ){
       appendTriple(p, 0, 0, iE2-iS2);
     }
     return;
   }
   if( iE2<=iS2 ){
+    /* The second segment is empty */
     appendTriple(p, 0, iE1-iS1, 0);
     return;
   }
 
@@ -399,22 +407,36 @@
   /* Find the longest matching segment between the two sequences */
   longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
 
   if( iEX>iSX ){
-    /* Recursively diff either side of the matching segment */
+    /* A common segement has been found.
+    ** Recursively diff either side of the matching segment */
     diff_step(p, iS1, iSX, iS2, iSY);
     if( iEX>iSX ){
       appendTriple(p, iEX - iSX, 0, 0);
     }
     diff_step(p, iEX, iE1, iEY, iE2);
   }else{
+    /* The two segments have nothing in common.  Delete the first then
+    ** insert the second. */
     appendTriple(p, 0, iE1-iS1, iE2-iS2);
   }
 }
 
 /*
 ** Compute the differences between two files already loaded into
 ** the DContext structure.
+**
+** A divide and conquer technique is used.  We look for a large
+** block of common text that is in the middle of both files.  Then
+** compute the difference on those parts of the file before and
+** after the common block.  This technique is fast, but it does
+** not necessarily generate the minimum difference set.  On the
+** other hand, we do not need a minimum difference set, only one
+** that makes sense to human readers, which this algorithm does.
+**
+** Any common text at the beginning and end of the two files is
+** removed before starting the divide-and-conquer algorithm.
 */
 static void diff_all(DContext *p){
   int mnE, iS, iE1, iE2;
 
@@ -499,365 +521,8 @@
     free(c.aTo);
     return c.aEdit;
   }
 }
-
-#if 0  /********** Disabled and replaced by code above ************/
-
-/*
-** Generate a report of the differences between files pA and pB.
-** If pOut is not NULL then a unified diff is appended there.  It
-** is assumed that pOut has already been initialized.  If pOut is
-** NULL, then a pointer to an array of integers is returned.
-** The integers come in triples.  For each triple,
-** the elements are the number of lines copied, the number of
-** lines deleted, and the number of lines inserted.  The vector
-** is terminated by a triple of all zeros.
-**
-** This diff utility does not work on binary files.  If a binary
-** file is encountered, 0 is returned and pOut is written with
-** text "cannot compute difference between binary files".
-**
-****************************************************************************
-**
-** The core algorithm is a variation on the classic Wagner
-** minimum edit distance with enhancements to reduce the runtime
-** to be almost linear in the common case where the two files
-** have a lot in common.  For additional information see
-** Eugene W. Myers, "An O(ND) Difference Algorithm And Its
-** Variations"
-**
-** Consider comparing strings A and B.  A=abcabba and B=cbabac
-** We construct a "Wagner" matrix W with A along the X axis and
-** B along the Y axis:
-**
-**     c 6               *
-**     a 5               *
-**     b 4           * *
-**     a 3         *
-**     b 2       *
-**   B c 1       *
-**       0 * * *
-**         0 1 2 3 4 5 6 7
-**           a b c a b b a
-**           A
-**
-** (Note: we draw this Wagner matrix with the origin at the lower
-** left whereas Myers uses the origin at the upper left.  Otherwise,
-** they are the same.)
-**
-** Let Y be the maximum y value or the number of characters in B.
-** 6 in this example.  X is the maximum x value or the number of
-** elements in A.  Here 7.
-**
-** Our goal is to find a path from (0,0) to (X,Y).  The path can
-** use horizontal, vertical, or diagonal steps.  A diagonal from
-** (x-1,y-1) to (x,y) is only allowed if A[x]==B[y].  A vertical
-** steps corresponds to an insertion.  A horizontal step corresponds
-** to a deletion.  We want to find the path with the fewest
-** horizontal and vertical steps.
-**
-** Diagonal k consists of all points such that x-y==k.  Diagonal
-** zero begins at the origin.  Diagonal 1 begins at (1,0).
-** Diagonal -1 begins at (0,1).  All diagonals move up and to the
-** right at 45 degrees.  Diagonal number increase from upper left
-** to lower right.
-**
-** Myers matrix M is a lower right triangular matrix with indices d
-** along the bottom and i vertically:
-**
-**
-**   i=4 |            +4  \
-**     3 |         +3 +2   |
-**     2 |      +2 +1  0   |- k values.   k = 2*i-d
-**     1 |   +1  0 -1 -2   |
-**     0 | 0 -1 -2 -3 -4  /
-**       ---------------
-**         0  1  2  3  4 = d
-**
-** Each element of the Myers matrix corresponds to a diagonal.
-** The diagonal is k=2*i-d.  The diagonal values are shown
-** in the template above.
-**
-** Each entry in M represents the end-point on a path from (0,0).
-** The end-point is on diagonal k.  The value stored in M is
-** q=x+y where (x,y) is the terminus of the path.  A path
-** to M[d][i] will come through either M[d-1][i-1] or
-** though M[d-1][i], whichever holds the largest value of q.
-** If both elements hold the same value, the path comes
-** through M[d-1][i-1].
-**
-** The value of d is the number of insertions and deletions
-** made so far on the path.  M grows progressively.  So the
-** size of the M matrix is proportional to d*d.  For the
-** common case where A and B are similar, d will be small
-** compared to X and Y so little memory is required.  The
-** original Wagner algorithm requires X*Y memory, which for
-** larger files (100K lines) is more memory than we have at
-** hand.
-*/
-int *text_diff(
-  Blob *pA_Blob,   /* FROM file */
-  Blob *pB_Blob,   /* TO file */
-  Blob *pOut,      /* Write unified diff here if not NULL */
-  int nContext     /* Amount of context to unified diff */
-){
-  DLine *A, *B;    /* Files being compared */
-  int X, Y;        /* Number of elements in A and B */
-  int x, y;        /* Indices:  A[x] and B[y] */
-  int szM = 0;     /* Number of rows and columns in M */
-  int **M = 0;     /* Myers matrix */
-  int i, d;        /* Indices on M.  M[d][i] */
-  int k, q;        /* Diagonal number and distinct from (0,0) */
-  int K, D;        /* The diagonal and d for the final solution */
-  int *R = 0;      /* Result vector */
-  int r;           /* Loop variables */
-  int go = 1;      /* Outer loop control */
-  int MAX;         /* Largest of X and Y */
-
-  /* Break the two files being diffed into individual lines.
-  ** Compute hashes on each line for fast comparison.
-  */
-  A = break_into_lines(blob_str(pA_Blob), &X);
-  B = break_into_lines(blob_str(pB_Blob), &Y);
-
-  if( A==0 || B==0 ){
-    free(A);
-    free(B);
-    if( pOut ){
-      blob_appendf(pOut, "cannot compute difference between binary files\n");
-    }
-    return 0;
-  }
-
-  szM = 0;
-  MAX = X>Y ? X : Y;
-  if( MAX>2000 ) MAX = 2000;
-  for(d=0; go && d<=MAX; d++){
-    if( szM<d+1 ){
-      szM += szM + 10;
-      M = realloc(M, sizeof(M[0])*szM);
-      if( M==0 ){
-        fossil_panic("out of memory");
-      }
-    }
-    M[d] = malloc( sizeof(M[d][0])*(d+1) );
-    if( M[d]==0 ){
-      fossil_panic("out of memory");
-    }
-    for(i=0; i<=d; i++){
-      k = 2*i - d;
-      if( d==0 ){
-        q = 0;
-      }else if( i==0 ){
-        q = M[d-1][0];
-      }else if( i<d-1 && M[d-1][i-1] < M[d-1][i] ){
-        q = M[d-1][i];
-      }else{
-        q = M[d-1][i-1];
-      }
-      x = (k + q + 1)/2;
-      y = x - k;
-      if( x<0 || x>X || y<0 || y>Y ){
-        x = y = 0;
-      }else{
-        while( x<X && y<Y && same_dline(&A[x],&B[y]) ){ x++; y++; }
-      }
-      M[d][i] = x + y;
-      DEBUG( printf("M[%d][%i] = %d  k=%d (%d,%d)\n", d, i, x+y, k, x, y); )
-      if( x==X && y==Y ){
-        go = 0;
-        break;
-      }
-    }
-  }
-  if( d>MAX ){
-    R = malloc( sizeof(R[0])*7 );
-    R[0] = 0;
-    R[1] = X;
-    R[2] = Y;
-    R[3] = 0;
-    R[4] = 0;
-    R[5] = 0;
-    R[6] = 0;
-  }else{
-    /* Reuse M[] as follows:
-    **
-    **     M[d][1] = 1 if a line is inserted or 0 if a line is deleted.
-    **     M[d][0] = number of lines copied after the ins or del above.
-    **
-    */
-    D = d - 1;
-    K = X - Y;
-    for(d=D, i=(K+D)/2; d>0; d--){
-      DEBUG( printf("d=%d i=%d\n", d, i); )
-      if( i==d || (i>0 && M[d-1][i-1] > M[d-1][i]) ){
-        M[d][0] = M[d][i] - M[d-1][i-1] - 1;
-        M[d][1] = 0;
-        i--;
-      }else{
-        M[d][0] = M[d][i] - M[d-1][i] - 1;
-        M[d][1] = 1;
-      }
-    }
-
-    DEBUG(
-      printf("---------------\nM[0][0] = %5d\n", M[0][0]);
-      for(d=1; d<=D; d++){
-        printf("M[%d][0] = %5d    M[%d][1] = %d\n",d,M[d][0],d,M[d][1]);
-      }
-    )
-
-    /* Allocate the output vector
-    */
-    R = malloc( sizeof(R[0])*(D+2)*3 );
-    if( R==0 ){
-      fossil_fatal("out of memory");
-    }
-
-    /* Populate the output vector
-    */
-    d = r = 0;
-    while( d<=D ){
-      int n;
-      R[r++] = M[d++][0]/2;   /* COPY */
-      if( d>D ){
-        R[r++] = 0;
-        R[r++] = 0;
-        break;
-      }
-      if( M[d][1]==0 ){
-        n = 1;
-        while( M[d][0]==0 && d<D && M[d+1][1]==0 ){
-          d++;
-          n++;
-        }
-        R[r++] = n;           /* DELETE */
-        if( d==D || M[d][0]>0 ){
-          R[r++] = 0;         /* INSERT */
-          continue;
-        }
-        d++;
-      }else{
-        R[r++] = 0;           /* DELETE */
-      }
-      assert( M[d][1]==1 );
-      n = 1;
-      while( M[d][0]==0 && d<D && M[d+1][1]==1 ){
-        d++;
-        n++;
-      }
-      R[r++] = n;            /* INSERT */
-    }
-    R[r++] = 0;
-    R[r++] = 0;
-    R[r++] = 0;
-  }
-
-  /* Free the Myers matrix */
-  for(d=0; d<=D; d++){
-    free(M[d]);
-  }
-  free(M);
-
-  /* If pOut is defined, construct a unified diff into pOut and
-  ** delete R
-  */
-  if( pOut ){
-    int a = 0;    /* Index of next line in A[] */
-    int b = 0;    /* Index of next line in B[] */
-    int nr;       /* Number of COPY/DELETE/INSERT triples to process */
-    int mxr;      /* Maximum value for r */
-    int na, nb;   /* Number of lines shown from A and B */
-    int i, j;     /* Loop counters */
-    int m;        /* Number of lines to output */
-    int skip;     /* Number of lines to skip */
-
-    for(mxr=0; R[mxr+1] || R[mxr+2] || R[mxr+3]; mxr += 3){}
-    for(r=0; r<mxr; r += 3*nr){
-      /* Figure out how many triples to show in a single block */
-      for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){}
-      DEBUG( printf("r=%d nr=%d\n", r, nr); )
-
-      /* For the current block comprising nr triples, figure out
-      ** how many lines of A and B are to be displayed
-      */
-      if( R[r]>nContext ){
-        na = nb = nContext;
-        skip = R[r] - nContext;
-      }else{
-        na = nb = R[r];
-        skip = 0;
-      }
-      for(i=0; i<nr; i++){
-        na += R[r+i*3+1];
-        nb += R[r+i*3+2];
-      }
-      if( R[r+nr*3]>nContext ){
-        na += nContext;
-        nb += nContext;
-      }else{
-        na += R[r+nr*3];
-        nb += R[r+nr*3];
-      }
-      for(i=1; i<nr; i++){
-        na += R[r+i*3];
-        nb += R[r+i*3];
-      }
-      blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb);
-
-      /* Show the initial common area */
-      a += skip;
-      b += skip;
-      m = R[r] - skip;
-      for(j=0; j<m; j++){
-        appendDiffLine(pOut, " ", &A[a+j]);
-      }
-      a += m;
-      b += m;
-
-      /* Show the differences */
-      for(i=0; i<nr; i++){
-        m = R[r+i*3+1];
-        for(j=0; j<m; j++){
-          appendDiffLine(pOut, "-", &A[a+j]);
-        }
-        a += m;
-        m = R[r+i*3+2];
-        for(j=0; j<m; j++){
-          appendDiffLine(pOut, "+", &B[b+j]);
-        }
-        b += m;
-        if( i<nr-1 ){
-          m = R[r+i*3+3];
-          for(j=0; j<m; j++){
-            appendDiffLine(pOut, " ", &B[b+j]);
-          }
-          b += m;
-          a += m;
-        }
-      }
-
-      /* Show the final common area */
-      assert( nr==i );
-      m = R[r+nr*3];
-      if( m>nContext ) m = nContext;
-      for(j=0; j<m; j++){
-        appendDiffLine(pOut, " ", &B[b+j]);
-      }
-    }
-    free(R);
-    R = 0;
-  }
-
-  /* We no longer need the A[] and B[] vectors */
-  free(A);
-  free(B);
-
-  /* Return the result */
-  return R;
-}
-#endif /***************** End of the Wagner/Myers algorithm ************/
 
 /*
 ** COMMAND: test-rawdiff
 */