Artifact Content
Not logged in

Artifact 5d18e08162e0656a84213840d2d938a5c6fb3690

File src/merge3.c part of check-in [dbda8d6ce9] - Initial check-in of m1 sources. by drh on 2007-07-21 14:10:57.

/*
** Copyright (c) 2007 D. Richard Hipp
**
** This program is free software; you can redistribute it and/or
** modify it under the terms of the GNU General Public
** License version 2 as published by the Free Software Foundation.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
** General Public License for more details.
** 
** You should have received a copy of the GNU General Public
** License along with this library; if not, write to the
** Free Software Foundation, Inc., 59 Temple Place - Suite 330,
** Boston, MA  02111-1307, USA.
**
** Author contact information:
**   drh@hwaci.com
**   http://www.hwaci.com/drh/
**
*******************************************************************************
**
** This module implements a 3-way merge
*/
#include "config.h"
#include "merge3.h"

#if 0
# define DEBUG1(X) X
#else
# define DEBUG1(X)
#endif
#if 0
#define DEBUG2(X) X
/*
** For debugging:
** Print 16 characters of text from zBuf
*/
static const char *print16(const char *z){
  int i;
  static char zBuf[20];
  for(i=0; i<16; i++){
    if( z[i]>=0x20 && z[i]<=0x7e ){
      zBuf[i] = z[i];
    }else{
      zBuf[i] = '.';
    }
  }
  zBuf[i] = 0;
  return zBuf;
}
#else
# define DEBUG2(X)
#endif

/*
** Must be a 32-bit integer
*/
typedef unsigned int u32;

/*
** Must be a 16-bit value 
*/
typedef unsigned short int u16;

/*
** The width of a hash window in bytes.  The algorithm only works if this
** is a power of 2.
*/
#define NHASH 16

/*
** The current state of the rolling hash.
**
** z[] holds the values that have been hashed.  z[] is a circular buffer.
** z[i] is the first entry and z[(i+NHASH-1)%NHASH] is the last entry of 
** the window.
**
** Hash.a is the sum of all elements of hash.z[].  Hash.b is a weighted
** sum.  Hash.b is z[i]*NHASH + z[i+1]*(NHASH-1) + ... + z[i+NHASH-1]*1.
** (Each index for z[] should be module NHASH, of course.  The %NHASH operator
** is omitted in the prior expression for brevity.)
*/
typedef struct hash hash;
struct hash {
  u16 a, b;         /* Hash values */
  u16 i;            /* Start of the hash window */
  char z[NHASH];    /* The values that have been hashed */
};

/*
** Initialize the rolling hash using the first NHASH characters of z[]
*/
static void hash_init(hash *pHash, const char *z){
  u16 a, b, i;
  a = b = 0;
  for(i=0; i<NHASH; i++){
    a += z[i];
    b += (NHASH-i)*z[i];
    pHash->z[i] = z[i];
  }
  pHash->a = a & 0xffff;
  pHash->b = b & 0xffff;
  pHash->i = 0;
}

/*
** Advance the rolling hash by a single character "c"
*/
static void hash_next(hash *pHash, int c){
  u16 old = pHash->z[pHash->i];
  pHash->z[pHash->i] = c;
  pHash->i = (pHash->i+1)&(NHASH-1);
  pHash->a = pHash->a - old + c;
  pHash->b = pHash->b - NHASH*old + pHash->a;
}

/*
** Return a 32-bit hash value
*/
static u32 hash_32bit(hash *pHash){
  return (pHash->a & 0xffff) | (((u32)(pHash->b & 0xffff))<<16);
}

/*
** Maximum number of landmarks to set in the source file.
*/
#define MX_LANDMARK (1024*128)

/*
** A mapping structure is used to record which parts of two
** files contain the same text.  There are zero or more mapping
** entries in a mapping.  Each entry maps a segment of text in
** the source file into a segment of the output file.
**
**     fromFirst...fromLast ->  toFirst...toLast
**
** Extra text might be inserted in the output file after a 
** mapping.  The nExtra parameter records the number of bytes
** of extra text to insert.
*/
typedef struct Mapping Mapping;
struct Mapping {
  int nMap;
  int nUsed;
  struct Mapping_entry {
    int fromFirst, fromLast;
    int toFirst, toLast;
    int nExtra;
  } *aMap;
};

/*
** Free malloced memory associated with a mapping.
*/
static void MappingClear(Mapping *p){
  free(p->aMap);
  memset(p, 0, sizeof(*p));
}

/*
** Add an entry to a mapping structure.  The mapping is:
**
**     a...b  ->   c...d
**
** The nExtra parameter is initially zero.  It will be changed
** later if necessary.
*/
static void MappingInsert(Mapping *p, int a, int b, int c, int d){
  struct Mapping_entry *pEntry;
  int i;
  for(i=0, pEntry=p->aMap; i<p->nUsed; i++, pEntry++){
    if( pEntry->fromFirst==a && pEntry->fromLast==b && pEntry->toFirst==c ){
      DEBUG2( printf("DUPLICATE: %6d..%-6d %6d..%d\n", a, b, c, d); )
      return;
    }
  }
  if( p->nUsed>=p->nMap ){
    p->nMap = p->nMap * 2 + 10;
    p->aMap = realloc(p->aMap, p->nMap*sizeof(p->aMap[0]) );
    if( p->aMap==0 ) exit(1);
  }
  pEntry = &p->aMap[p->nUsed];
  pEntry->fromFirst = a;
  pEntry->fromLast = b;
  pEntry->toFirst = c;
  pEntry->toLast = d;
  pEntry->nExtra = 0;
  p->nUsed++;
}

DEBUG1(
/*
** For debugging purposes:
** Print the content of a mapping.
*/
static void MappingPrint(Mapping *pMap){
  int i;
  struct Mapping_entry *p;
  for(i=0, p=pMap->aMap; i<pMap->nUsed; i++, p++){
    printf("%6d..%-6d %6d..%-6d  %d\n",
       p->fromFirst, p->fromLast,
       p->toFirst, p->toLast, p->nExtra);
  }
}
)

/*
** Remove deleted entries from a mapping.  Deleted enties have 
** an fromFirst of less than 0.
*/
static void MappingPurgeDeletedEntries(Mapping *p){
  int i, j;
  for(i=j=0; i<p->nUsed; i++){
    if( p->aMap[i].fromFirst<0 ) continue;
    if( j<i ){
      p->aMap[j] = p->aMap[i];
    }
    j++;
  }
  p->nUsed = j;
}

/*
** Comparisons functions used for sorting elements of a Mapping
*/
static int intAbs(int x){ return x<0 ? -x : x; }
static int compareSize(const void *a, const void *b){
  const struct Mapping_entry *A = (const struct Mapping_entry*)a;
  const struct Mapping_entry *B = (const struct Mapping_entry*)b;
  int rc;
  rc = (B->fromLast - B->fromFirst) - (A->fromLast - A->fromFirst);
  if( rc==0 ){
    rc = intAbs(A->toFirst - A->fromFirst) -
              intAbs(B->toFirst - B->fromFirst);
  }
  return rc;
}
static int compareFrom(const void *a, const void *b){
  const struct Mapping_entry *A = (const struct Mapping_entry*)a;
  const struct Mapping_entry *B = (const struct Mapping_entry*)b;
  return A->fromFirst - B->fromFirst;
}
static int compareTo(const void *a, const void *b){
  const struct Mapping_entry *A = (const struct Mapping_entry*)a;
  const struct Mapping_entry *B = (const struct Mapping_entry*)b;
  return A->toFirst - B->toFirst;
}

/*
** Routines for sorting the entries of a mapping.  SortSize sorts
** the entries in order of decreasing size (largest first.)  
** SortFrom and SortTo sort the entries in order of increasing
** fromFirst and toFirst.
*/
static void MappingSortSize(Mapping *p){
  qsort(p->aMap, p->nUsed, sizeof(p->aMap[0]), compareSize);
}
static void MappingSortFrom(Mapping *p){
  qsort(p->aMap, p->nUsed, sizeof(p->aMap[0]), compareFrom);
}
static void MappingSortTo(Mapping *p){
  qsort(p->aMap, p->nUsed, sizeof(p->aMap[0]), compareTo);
}

/*
** Initialize pMap to contain a set of similarities between two files.
*/
static void MappingInit(
  const char *zSrc,      /* The source or pattern file */
  int lenSrc,            /* Length of the source file */
  const char *zOut,      /* The target file */
  int lenOut,            /* Length of the target file */
  Mapping *pMap          /* Write the map of similaries here */
){
  int i, j, base, prefix;
  hash h;
  int *collide;
  int origLenOut = lenOut;
  struct Mapping_entry *aMap;
  int landmark[MX_LANDMARK];

  /*
  ** Initialize the map
  */
  memset(pMap, 0, sizeof(*pMap));

  /*
  ** Find common prefix and suffix
  */
  if( lenSrc<=NHASH || lenOut<=NHASH ){
    MappingInsert(pMap, 0, 0, 0, 0);
    goto add_nextra;
  }
  for(i=0; i<lenSrc && i<lenOut && zSrc[i]==zOut[i]; i++){}
  if( i>=NHASH ){
    MappingInsert(pMap, 0, i-1, 0, i-1);
    lenSrc -= i;
    zSrc += i;
    lenOut -= i;
    zOut += i;
    if( lenSrc<=0 || lenOut<=0 ) goto add_nextra;
    prefix = i;
  }else{
    prefix = 0;
  }
  for(i=1; i<=lenSrc && i<=lenOut && zSrc[lenSrc-i]==zOut[lenOut-i]; i++){}
  if( i>NHASH ){
    MappingInsert(pMap, prefix+lenSrc-i+1, prefix+lenSrc-1,
                        prefix+lenOut-i+1, prefix+lenOut-1);
    lenSrc -= i;
    lenOut -= i;
  }

  /* If the source file is very small, it means that we have no
  ** chance of ever finding any matches.  We can leave early.
  */
  if( lenSrc<=NHASH ) goto add_nextra;

  /* Compute the hash table used to locate matching sections in the
  ** source file.
  */
  collide = malloc( lenSrc*sizeof(int)/NHASH );
  if( collide==0 ) return;
  memset(landmark, -1, sizeof(landmark));
  memset(collide, -1, lenSrc*sizeof(int)/NHASH );
  for(i=0; i<lenSrc-NHASH; i+=NHASH){
    int hv;
    hash_init(&h, &zSrc[i]);
    hv = hash_32bit(&h) & (MX_LANDMARK-1);
    collide[i/NHASH] = landmark[hv];
    landmark[hv] = i/NHASH;
  }

  /* Begin scanning the target file and generating mappings.  In this
  ** step, we generate as many mapping entries is we can.  Many of these
  ** entries might overlap.  The overlapping entries are removed by
  ** the loop the follows.
  */
  base = 0;    /* We have already checked everything before zOut[base] */
  while( base<lenOut-NHASH ){
    int iSrc, iBlock, nextBase, nextBaseI;
    hash_init(&h, &zOut[base]);
    i = 0;     /* Trying to match a landmark against zOut[base+i] */
    nextBaseI = NHASH;
    nextBase = base;
    while(1){
      int hv;

      hv = hash_32bit(&h) & (MX_LANDMARK-1);
      DEBUG2( printf("LOOKING: %d+%d+%d=%d [%s]\n",
              prefix,base,i,prefix+base+i, print16(&zOut[base+i])); )
      iBlock = landmark[hv];
      while( iBlock>=0 ){
        /*
        ** The hash window has identified a potential match against 
        ** landmark block iBlock.  But we need to investigate further.
        ** 
        ** Look for a region in zOut that matches zSrc. Anchor the search
        ** at zSrc[iSrc] and zOut[base+i].
        **
        ** Set cnt equal to the length of the match and set ofst so that
        ** zSrc[ofst] is the first element of the match. 
        */
        int cnt, ofstSrc;
        int j, k, x, y;

        /* Beginning at iSrc, match forwards as far as we can.  j counts
        ** the number of characters that match */
        iSrc = iBlock*NHASH;
        for(j=0, x=iSrc, y=base+i; x<lenSrc && y<lenOut; j++, x++, y++){
          if( zSrc[x]!=zOut[y] ) break;
        }
        j--;

        /* Beginning at iSrc-1, match backwards as far as we can.  k counts
        ** the number of characters that match */
        for(k=1; k<iSrc && k<=base+i; k++){
          if( zSrc[iSrc-k]!=zOut[base+i-k] ) break;
        }
        k--;

        /* Compute the offset and size of the matching region zSrc */
        ofstSrc = iSrc-k;
        cnt = j+k+1;
        DEBUG2( printf("MATCH %d bytes at SRC[%d..%d]: [%s]\n",
                 cnt, ofstSrc, ofstSrc+cnt-1, print16(&zSrc[ofstSrc])); )
        if( cnt>NHASH ){
          int ofstOut = base+i-k;
          DEBUG2( printf("COPY %6d..%-6d %6d..%d\n",
            prefix+ofstSrc, prefix+ofstSrc+cnt-1,
            prefix+ofstOut, prefix+ofstOut+cnt-1); )
          MappingInsert(pMap,
            prefix+ofstSrc, prefix+ofstSrc+cnt-1,
            prefix+ofstOut, prefix+ofstOut+cnt-1);
          if( nextBase < ofstOut+cnt-1 ){
            nextBase = ofstOut+cnt-1;
            nextBaseI = i+NHASH;
          }
        }

        /* Check the next matching block */
        iBlock = collide[iBlock];
      }

      /* If we found a match, then jump out to the outer loop and begin
      ** a new cycle.
      */
      if( nextBase>base && i>=nextBaseI ){
        base = nextBase;
        break;
      }

      /* Advance the hash by one character.  Keep looking for a match */
      if( base+i+NHASH>=lenOut ){
        base = lenOut;
        break;
      }
      hash_next(&h, zOut[base+i+NHASH]);
      i++;
    }
  }
  free(collide);
  DEBUG1(
   printf("after creation:\n");
   MappingPrint(pMap);
  )

  /* In this step, we will remove overlapping entries from the mapping.
  **
  ** We use a greedy algorithm.  Select the largest mapping first and
  ** remove all overlapping mappings.  Then take the next largest
  ** mapping and remove others that overlap with it.  Keep going until
  ** all mappings have been processed.
  */
  MappingSortSize(pMap);
  for(i=0; i<pMap->nUsed; i++){
    int sortNeeded = 0;
    int purgeNeeded = 0;
    struct Mapping_entry *pA;
    pA = &pMap->aMap[i];
    for(j=i+1; j<pMap->nUsed; j++){
      int diff;
      struct Mapping_entry *pB;
      pB = &pMap->aMap[j];
      if( pB->fromLast<pA->fromFirst || pB->fromFirst>pA->fromLast ){
        /* No overlap.  Do nothing */
      }else if( pB->fromFirst>=pA->fromFirst && pB->fromLast<=pA->fromLast ){
        /* B is contained entirely within A.  Drop B */
        pB->fromFirst = -1;
        purgeNeeded = 1;
        continue;
      }else if( pB->fromFirst<pA->fromFirst ){
        /* The tail B overlaps the head of A */
        assert( pB->fromLast>=pA->fromFirst && pB->fromLast<=pA->fromLast );
        diff = pB->fromLast + 1 - pA->fromFirst;
        pB->fromLast -= diff;
        pB->toLast -= diff;
        sortNeeded = 1;
      }else{
        /* The head of B overlaps the tail of A */
        assert( pB->fromFirst<=pA->fromLast && pB->fromLast>pA->fromLast );
        diff = pA->fromLast + 1 - pB->fromFirst;
        pB->fromFirst += diff;
        pB->toFirst += diff;
        sortNeeded = 1;
      }
      if( pB->toLast<pA->toFirst || pB->toFirst>pA->toLast ){
        /* No overlap.  Do nothing */
      }else if( pB->toFirst>=pA->toFirst && pB->toLast<=pA->toLast ){
        /* B is contained entirely within A.  Drop B */
        pB->fromFirst = -1;
        purgeNeeded = 1;
      }else if( pB->toFirst<pA->toFirst ){
        /* The tail of B overlaps the head of A */
        assert( pB->toLast>=pA->toFirst && pB->toLast<=pA->toLast );
        diff = pB->toLast + 1 - pA->toFirst;
        pB->fromLast -= diff;
        pB->toLast -= diff;
        sortNeeded = 1;
      }else{
        /* The head of B overlaps the tail of A */
        assert( pB->toFirst<=pA->toLast && pB->toLast>pA->toLast );
        diff = pA->toLast + 1 - pB->toFirst;
        pB->fromFirst += diff;
        pB->toFirst += diff;
        sortNeeded = 1;
      }
    }
    if( purgeNeeded ){
      MappingPurgeDeletedEntries(pMap);
    }
    if( sortNeeded && i<pMap->nUsed-2 ){
      MappingSortSize(pMap);
    }
  }

  /* Final step:  Arrange the mapping entires so that they are in the
  ** order of the output file.  Then fill in the nExtra values.
  */
add_nextra:
  MappingSortTo(pMap);
  aMap = pMap->aMap;
  for(i=0; i<pMap->nUsed-1; i++){
    aMap[i].nExtra = aMap[i+1].toFirst - aMap[i].toLast - 1;
  }
  if( pMap->nUsed>0 && origLenOut > aMap[i].toLast+1 ){
    aMap[i].nExtra = origLenOut - aMap[i].toLast - 1;
  }
}

/*
** Translate an index into a file using a mapping.
**
** The mapping "p" shows how blocks in the input file map into blocks
** of the output file.  The index iFrom is an index into the input file.
** This routine returns the index into the output file of the corresponding
** character.
**
** If pInserted!=0 and iFrom points to the last character before a
** insert in the output file, then the return value is adjusted forward
** so that it points to the end of the insertion and the number of
** bytes inserted is written into *pInserted.  If pInserted==0 then
** iFrom always maps directly in the corresponding output file
** index regardless of whether or not it points to the last character
** before an insertion.
*/
static int MappingTranslate(Mapping *p, int iFrom, int *pInserted){
  int i;
  for(i=0; i<p->nUsed; i++){
    if( iFrom>p->aMap[i].fromLast ) continue;
    if( iFrom<=p->aMap[i].fromFirst ){
      return p->aMap[i].toFirst;
    }
    if( pInserted && iFrom==p->aMap[i].fromLast ){
      int n = p->aMap[i].nExtra;
      *pInserted = n;
      return p->aMap[i].toLast + n;
    }else{
      return p->aMap[i].toFirst + iFrom - p->aMap[i].fromFirst;
    }
  }
  i--;
  return p->aMap[i].toLast + p->aMap[i].nExtra;
}

/*
** Do a three-way merge.  Initialize pOut to contain the result.
*/
void blob_merge(Blob *pPivot, Blob *pV1, Blob *pV2, Blob *pOut){
  Mapping map1, map2;
  int i;
  const char *zV1, *zV2;
  blob_zero(pOut);
  DEBUG1( printf("map1:\n"); )
  MappingInit(
    blob_buffer(pPivot), blob_size(pPivot),
    blob_buffer(pV1), blob_size(pV1),
    &map1);
  MappingSortFrom(&map1);
  DEBUG1( 
    printf("map1-final:\n");
    MappingPrint(&map1);
    printf("map2:\n");
  )
  MappingInit(
    blob_buffer(pPivot), blob_size(pPivot),
    blob_buffer(pV2), blob_size(pV2),
    &map2);
  DEBUG1(
    printf("map2-final:\n");
    MappingPrint(&map2);
  )
  zV1 = blob_buffer(pV1);
  zV2 = blob_buffer(pV2);
  if( map2.nUsed==0 ) return;
  if( map1.aMap[0].toFirst>0 ){
    blob_append(pOut, zV1, map1.aMap[0].toFirst);
    DEBUG1( printf("INSERT %d bytes from V1[0..%d]\n",
            map1.aMap[0].toFirst, map1.aMap[0].toFirst-1); )
  }
  if( map2.aMap[0].toFirst>0 ){
    blob_append(pOut, zV2, map2.aMap[0].toFirst);
    DEBUG1( printf("INSERT %d bytes from V2[0..%d]\n",
            map2.aMap[0].toFirst, map2.aMap[0].toFirst-1); )
  }
  for(i=0; i<map2.nUsed; i++){
    int iFirst, iLast, nInsert;
    struct Mapping_entry *p = &map2.aMap[i];
    iFirst = MappingTranslate(&map1, p->fromFirst, 0);
    iLast = MappingTranslate(&map1, p->fromLast, &nInsert);
    blob_append(pOut, &zV1[iFirst], iLast - iFirst + 1);
    DEBUG1(
      printf("COPY %d bytes from V1[%d..%d]\n", iLast-iFirst+1, iFirst, iLast);
    )
    if( p->nExtra>0 ){
      if( p->nExtra==nInsert
          && memcmp(&zV2[p->toLast+1], &zV1[iLast-nInsert+1], nInsert)==0 ){
        /* Omit a duplicate insert */
        DEBUG1( printf("OMIT duplicate insert\n"); )
      }else{
        blob_append(pOut, &zV2[p->toLast+1], p->nExtra);
        DEBUG1(
          printf("INSERT %d bytes from V2[%d..%d]\n",
                  p->nExtra, p->toLast+1, p->toLast+p->nExtra);
        )
      }
    }
  }
  MappingClear(&map1);
  MappingClear(&map2);
}

/*
** COMMAND:  test-3-way-merge
**
** Combine change in going from PIVOT->VERSION1 with the change going
** from PIVOT->VERSION2 and write the combined changes into MERGED.
*/
void delta_3waymerge_cmd(void){
  Blob pivot, v1, v2, merged;
  if( g.argc!=6 ){
    fprintf(stderr,"Usage: %s %s PIVOT V1 V2 MERGED\n", g.argv[0], g.argv[1]);
    exit(1);
  }
  if( blob_read_from_file(&pivot, g.argv[2])<0 ){
    fprintf(stderr,"cannot read %s\n", g.argv[2]);
    exit(1);
  }
  if( blob_read_from_file(&v1, g.argv[3])<0 ){
    fprintf(stderr,"cannot read %s\n", g.argv[3]);
    exit(1);
  }
  if( blob_read_from_file(&v2, g.argv[4])<0 ){
    fprintf(stderr,"cannot read %s\n", g.argv[4]);
    exit(1);
  }
  blob_merge(&pivot, &v1, &v2, &merged);
  if( blob_write_to_file(&merged, g.argv[5])<blob_size(&merged) ){
    fprintf(stderr,"cannot write %s\n", g.argv[4]);
    exit(1);
  }
  blob_reset(&pivot);
  blob_reset(&v1);
  blob_reset(&v2);
  blob_reset(&merged);
}


/*
** COMMAND:  test-mapping
*/
void mapping_test(void){
  int i;
  const char *z;
  Blob a, b;
  Mapping map;
  if( g.argc!=4 ){
    usage("FILE1 FILE2");
  }
  blob_read_from_file(&a, g.argv[2]);
  blob_read_from_file(&b, g.argv[3]);
  memset(&map, 0, sizeof(map));
  MappingInit(blob_buffer(&a), blob_size(&a),
              blob_buffer(&b), blob_size(&b),
              &map);
  z = blob_buffer(&a);
  for(i=0; i<map.nUsed; i++){
    printf("======= %6d..%-6d %6d..%-6d %d\n", 
         map.aMap[i].fromFirst, map.aMap[i].fromLast,
         map.aMap[i].toFirst, map.aMap[i].toLast,
         map.aMap[i].nExtra);
    printf("%.*s\n", map.aMap[i].fromLast - map.aMap[i].fromFirst + 1, 
                     &z[map.aMap[i].fromFirst]);
  }
}