File Annotation
Not logged in
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** Copyright (c) 2006 D. Richard Hipp
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** This program is free software; you can redistribute it and/or
dbda8d6ce9 2007-07-21       drh: ** modify it under the terms of the GNU General Public
dbda8d6ce9 2007-07-21       drh: ** License version 2 as published by the Free Software Foundation.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** This program is distributed in the hope that it will be useful,
dbda8d6ce9 2007-07-21       drh: ** but WITHOUT ANY WARRANTY; without even the implied warranty of
dbda8d6ce9 2007-07-21       drh: ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dbda8d6ce9 2007-07-21       drh: ** General Public License for more details.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** You should have received a copy of the GNU General Public
dbda8d6ce9 2007-07-21       drh: ** License along with this library; if not, write to the
dbda8d6ce9 2007-07-21       drh: ** Free Software Foundation, Inc., 59 Temple Place - Suite 330,
dbda8d6ce9 2007-07-21       drh: ** Boston, MA  02111-1307, USA.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** Author contact information:
dbda8d6ce9 2007-07-21       drh: **   drh@hwaci.com
dbda8d6ce9 2007-07-21       drh: **   http://www.hwaci.com/drh/
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: *******************************************************************************
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** This module implements the delta compress algorithm.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** Though developed specifically for fossil, the code in this file
dbda8d6ce9 2007-07-21       drh: ** is generally appliable and is thus easily separated from the
dbda8d6ce9 2007-07-21       drh: ** fossil source code base.  Nothing in this file depends on anything
dbda8d6ce9 2007-07-21       drh: ** else in fossil.
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: #include <stdio.h>
dbda8d6ce9 2007-07-21       drh: #include <assert.h>
dbda8d6ce9 2007-07-21       drh: #include <stdlib.h>
dbda8d6ce9 2007-07-21       drh: #include <string.h>
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** Macros for turning debugging printfs on and off
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: #if 0
dbda8d6ce9 2007-07-21       drh: # define DEBUG1(X) X
dbda8d6ce9 2007-07-21       drh: #else
dbda8d6ce9 2007-07-21       drh: # define DEBUG1(X)
dbda8d6ce9 2007-07-21       drh: #endif
dbda8d6ce9 2007-07-21       drh: #if 0
dbda8d6ce9 2007-07-21       drh: #define DEBUG2(X) X
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** For debugging:
dbda8d6ce9 2007-07-21       drh: ** Print 16 characters of text from zBuf
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: static const char *print16(const char *z){
dbda8d6ce9 2007-07-21       drh:   int i;
dbda8d6ce9 2007-07-21       drh:   static char zBuf[20];
dbda8d6ce9 2007-07-21       drh:   for(i=0; i<16; i++){
dbda8d6ce9 2007-07-21       drh:     if( z[i]>=0x20 && z[i]<=0x7e ){
dbda8d6ce9 2007-07-21       drh:       zBuf[i] = z[i];
dbda8d6ce9 2007-07-21       drh:     }else{
dbda8d6ce9 2007-07-21       drh:       zBuf[i] = '.';
dbda8d6ce9 2007-07-21       drh:     }
dbda8d6ce9 2007-07-21       drh:   }
dbda8d6ce9 2007-07-21       drh:   zBuf[i] = 0;
dbda8d6ce9 2007-07-21       drh:   return zBuf;
dbda8d6ce9 2007-07-21       drh: }
dbda8d6ce9 2007-07-21       drh: #else
dbda8d6ce9 2007-07-21       drh: # define DEBUG2(X)
dbda8d6ce9 2007-07-21       drh: #endif
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** The "u32" type must be an unsigned 32-bit integer.  Adjust this
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: typedef unsigned int u32;
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** Must be a 16-bit value
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: typedef short int s16;
dbda8d6ce9 2007-07-21       drh: typedef unsigned short int u16;
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** The width of a hash window in bytes.  The algorithm only works if this
dbda8d6ce9 2007-07-21       drh: ** is a power of 2.
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: #define NHASH 16
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** The current state of the rolling hash.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** z[] holds the values that have been hashed.  z[] is a circular buffer.
dbda8d6ce9 2007-07-21       drh: ** z[i] is the first entry and z[(i+NHASH-1)%NHASH] is the last entry of
dbda8d6ce9 2007-07-21       drh: ** the window.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** Hash.a is the sum of all elements of hash.z[].  Hash.b is a weighted
dbda8d6ce9 2007-07-21       drh: ** sum.  Hash.b is z[i]*NHASH + z[i+1]*(NHASH-1) + ... + z[i+NHASH-1]*1.
dbda8d6ce9 2007-07-21       drh: ** (Each index for z[] should be module NHASH, of course.  The %NHASH operator
dbda8d6ce9 2007-07-21       drh: ** is omitted in the prior expression for brevity.)
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: typedef struct hash hash;
dbda8d6ce9 2007-07-21       drh: struct hash {
dbda8d6ce9 2007-07-21       drh:   u16 a, b;         /* Hash values */
dbda8d6ce9 2007-07-21       drh:   u16 i;            /* Start of the hash window */
dbda8d6ce9 2007-07-21       drh:   char z[NHASH];    /* The values that have been hashed */
dbda8d6ce9 2007-07-21       drh: };
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** Initialize the rolling hash using the first NHASH characters of z[]
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: static void hash_init(hash *pHash, const char *z){
dbda8d6ce9 2007-07-21       drh:   u16 a, b, i;
dbda8d6ce9 2007-07-21       drh:   a = b = 0;
dbda8d6ce9 2007-07-21       drh:   for(i=0; i<NHASH; i++){
dbda8d6ce9 2007-07-21       drh:     a += z[i];
dbda8d6ce9 2007-07-21       drh:     b += (NHASH-i)*z[i];
dbda8d6ce9 2007-07-21       drh:     pHash->z[i] = z[i];
dbda8d6ce9 2007-07-21       drh:   }
dbda8d6ce9 2007-07-21       drh:   pHash->a = a & 0xffff;
dbda8d6ce9 2007-07-21       drh:   pHash->b = b & 0xffff;
dbda8d6ce9 2007-07-21       drh:   pHash->i = 0;
dbda8d6ce9 2007-07-21       drh: }
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** Advance the rolling hash by a single character "c"
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: static void hash_next(hash *pHash, int c){
dbda8d6ce9 2007-07-21       drh:   u16 old = pHash->z[pHash->i];
dbda8d6ce9 2007-07-21       drh:   pHash->z[pHash->i] = c;
dbda8d6ce9 2007-07-21       drh:   pHash->i = (pHash->i+1)&(NHASH-1);
dbda8d6ce9 2007-07-21       drh:   pHash->a = pHash->a - old + c;
dbda8d6ce9 2007-07-21       drh:   pHash->b = pHash->b - NHASH*old + pHash->a;
dbda8d6ce9 2007-07-21       drh: }
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** Return a 32-bit hash value
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: static u32 hash_32bit(hash *pHash){
dbda8d6ce9 2007-07-21       drh:   return (pHash->a & 0xffff) | (((u32)(pHash->b & 0xffff))<<16);
dbda8d6ce9 2007-07-21       drh: }
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** Write an base-64 integer into the given buffer.
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: static void putInt(unsigned int v, char **pz){
dbda8d6ce9 2007-07-21       drh:   static const char zDigits[] =
dbda8d6ce9 2007-07-21       drh:     "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~";
dbda8d6ce9 2007-07-21       drh:   /*  123456789 123456789 123456789 123456789 123456789 123456789 123 */
dbda8d6ce9 2007-07-21       drh:   int i, j;
dbda8d6ce9 2007-07-21       drh:   char zBuf[20];
dbda8d6ce9 2007-07-21       drh:   if( v==0 ){
dbda8d6ce9 2007-07-21       drh:     *(*pz)++ = '0';
dbda8d6ce9 2007-07-21       drh:     return;
dbda8d6ce9 2007-07-21       drh:   }
dbda8d6ce9 2007-07-21       drh:   for(i=0; v>0; i++, v>>=6){
dbda8d6ce9 2007-07-21       drh:     zBuf[i] = zDigits[v&0x3f];
dbda8d6ce9 2007-07-21       drh:   }
dbda8d6ce9 2007-07-21       drh:   for(j=i-1; j>=0; j--){
dbda8d6ce9 2007-07-21       drh:     *(*pz)++ = zBuf[j];
dbda8d6ce9 2007-07-21       drh:   }
dbda8d6ce9 2007-07-21       drh: }
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** Read bytes from *pz and convert them into a positive integer.  When
dbda8d6ce9 2007-07-21       drh: ** finished, leave *pz pointing to the first character past the end of
dbda8d6ce9 2007-07-21       drh: ** the integer.  The *pLen parameter holds the length of the string
dbda8d6ce9 2007-07-21       drh: ** in *pz and is decremented once for each character in the integer.
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: static unsigned int getInt(const char **pz, int *pLen){
dbda8d6ce9 2007-07-21       drh:   static const signed char zValue[] = {
dbda8d6ce9 2007-07-21       drh:     -1, -1, -1, -1, -1, -1, -1, -1,   -1, -1, -1, -1, -1, -1, -1, -1,
dbda8d6ce9 2007-07-21       drh:     -1, -1, -1, -1, -1, -1, -1, -1,   -1, -1, -1, -1, -1, -1, -1, -1,
dbda8d6ce9 2007-07-21       drh:     -1, -1, -1, -1, -1, -1, -1, -1,   -1, -1, -1, -1, -1, -1, -1, -1,
dbda8d6ce9 2007-07-21       drh:      0,  1,  2,  3,  4,  5,  6,  7,    8,  9, -1, -1, -1, -1, -1, -1,
dbda8d6ce9 2007-07-21       drh:     -1, 10, 11, 12, 13, 14, 15, 16,   17, 18, 19, 20, 21, 22, 23, 24,
dbda8d6ce9 2007-07-21       drh:     25, 26, 27, 28, 29, 30, 31, 32,   33, 34, 35, -1, -1, -1, -1, 36,
dbda8d6ce9 2007-07-21       drh:     -1, 37, 38, 39, 40, 41, 42, 43,   44, 45, 46, 47, 48, 49, 50, 51,
dbda8d6ce9 2007-07-21       drh:     52, 53, 54, 55, 56, 57, 58, 59,   60, 61, 62, -1, -1, -1, 63, -1,
dbda8d6ce9 2007-07-21       drh:   };
dbda8d6ce9 2007-07-21       drh:   unsigned int v = 0;
dbda8d6ce9 2007-07-21       drh:   int c;
dbda8d6ce9 2007-07-21       drh:   unsigned char *z = (unsigned char*)*pz;
dbda8d6ce9 2007-07-21       drh:   unsigned char *zStart = z;
dbda8d6ce9 2007-07-21       drh:   while( (c = zValue[0x7f&*(z++)])>=0 ){
dbda8d6ce9 2007-07-21       drh:      v = (v<<6) + c;
dbda8d6ce9 2007-07-21       drh:   }
dbda8d6ce9 2007-07-21       drh:   z--;
dbda8d6ce9 2007-07-21       drh:   *pLen -= z - zStart;
dbda8d6ce9 2007-07-21       drh:   *pz = (char*)z;
dbda8d6ce9 2007-07-21       drh:   return v;
dbda8d6ce9 2007-07-21       drh: }
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: /*
609e4caf29 2007-08-25       aku: ** Return the number digits in the base-64 representation of a positive integer
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: static int digit_count(int v){
dbda8d6ce9 2007-07-21       drh:   unsigned int i, x;
dbda8d6ce9 2007-07-21       drh:   for(i=1, x=64; v>=x; i++, x <<= 6){}
dbda8d6ce9 2007-07-21       drh:   return i;
dbda8d6ce9 2007-07-21       drh: }
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** Compute a 32-bit checksum on the N-byte buffer.  Return the result.
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: static unsigned int checksum(const char *zIn, int N){
dbda8d6ce9 2007-07-21       drh:   const unsigned char *z = (const unsigned char*)zIn;
dbda8d6ce9 2007-07-21       drh:   unsigned int sum = 0;
dbda8d6ce9 2007-07-21       drh:   while( N>=4 ){
dbda8d6ce9 2007-07-21       drh:     sum += (z[0]<<24) | (z[1]<<16) | (z[2]<<8) | z[3];
dbda8d6ce9 2007-07-21       drh:     z += 4;
dbda8d6ce9 2007-07-21       drh:     N -= 4;
dbda8d6ce9 2007-07-21       drh:   }
dbda8d6ce9 2007-07-21       drh:   if( N>0 ){
dbda8d6ce9 2007-07-21       drh:     unsigned char zBuf[4];
dbda8d6ce9 2007-07-21       drh:     memset(zBuf, 0, sizeof(zBuf));
dbda8d6ce9 2007-07-21       drh:     memcpy(zBuf, z, N);
dbda8d6ce9 2007-07-21       drh:     z = zBuf;
dbda8d6ce9 2007-07-21       drh:     sum += (z[0]<<24) | (z[1]<<16) | (z[2]<<8) | z[3];
dbda8d6ce9 2007-07-21       drh:   }
dbda8d6ce9 2007-07-21       drh:   return sum;
dbda8d6ce9 2007-07-21       drh: }
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** Maximum number of landmarks to set in the source file.
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: #define MX_LANDMARK (1024*128)
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** Create a new delta.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** The delta is written into a preallocated buffer, zDelta, which
dbda8d6ce9 2007-07-21       drh: ** should be at least 60 bytes longer than the target file, zOut.
dbda8d6ce9 2007-07-21       drh: ** The delta string will be NUL-terminated, but it might also contain
dbda8d6ce9 2007-07-21       drh: ** embedded NUL characters if either the zSrc or zOut files are
dbda8d6ce9 2007-07-21       drh: ** binary.  This function returns the length of the delta string
dbda8d6ce9 2007-07-21       drh: ** in bytes, excluding the final NUL terminator character.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** Output Format:
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** The delta begins with a base64 number followed by a newline.  This
dbda8d6ce9 2007-07-21       drh: ** number is the number of bytes in the TARGET file.  Thus, given a
dbda8d6ce9 2007-07-21       drh: ** delta file z, a program can compute the size of the output file
609e4caf29 2007-08-25       aku: ** simply by reading the first line and decoding the base-64 number
609e4caf29 2007-08-25       aku: ** found there.  The delta_output_size() routine does exactly this.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** After the initial size number, the delta consists of a series of
dbda8d6ce9 2007-07-21       drh: ** literal text segments and commands to copy from the SOURCE file.
dbda8d6ce9 2007-07-21       drh: ** A copy command looks like this:
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: **     NNN@MMM,
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** where NNN is the number of bytes to be copied and MMM is the offset
dbda8d6ce9 2007-07-21       drh: ** into the source file of the first byte (both base-64).   If NNN is 0
dbda8d6ce9 2007-07-21       drh: ** it means copy the rest of the input file.  Literal text is like this:
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: **     NNN:TTTTT
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** where NNN is the number of bytes of text (base-64) and TTTTT is the text.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** The last term is of the form
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: **     NNN;
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** In this case, NNN is a 32-bit bigendian checksum of the output file
dbda8d6ce9 2007-07-21       drh: ** that can be used to verify that the delta applied correctly.  All
dbda8d6ce9 2007-07-21       drh: ** numbers are in base-64.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** Pure text files generate a pure text delta.  Binary files generate a
dbda8d6ce9 2007-07-21       drh: ** delta that may contain some binary data.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** Algorithm:
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** The encoder first builds a hash table to help it find matching
6f1af23ebe 2007-08-26       aku: ** patterns in the source file.  16-byte chunks of the source file
dbda8d6ce9 2007-07-21       drh: ** sampled at evenly spaced intervals are used to populate the hash
dbda8d6ce9 2007-07-21       drh: ** table.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** Next we begin scanning the target file using a sliding 16-byte
dbda8d6ce9 2007-07-21       drh: ** window.  The hash of the 16-byte window in the target is used to
dbda8d6ce9 2007-07-21       drh: ** search for a matching section in the source file.  When a match
dbda8d6ce9 2007-07-21       drh: ** is found, a copy command is added to the delta.  An effort is
dbda8d6ce9 2007-07-21       drh: ** made to extend the matching section to regions that come before
dbda8d6ce9 2007-07-21       drh: ** and after the 16-byte hash window.  A copy command is only issued
dbda8d6ce9 2007-07-21       drh: ** if the result would use less space that just quoting the text
dbda8d6ce9 2007-07-21       drh: ** literally. Literal text is added to the delta for sections that
dbda8d6ce9 2007-07-21       drh: ** do not match or which can not be encoded efficiently using copy
dbda8d6ce9 2007-07-21       drh: ** commands.
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: int delta_create(
dbda8d6ce9 2007-07-21       drh:   const char *zSrc,      /* The source or pattern file */
dbda8d6ce9 2007-07-21       drh:   unsigned int lenSrc,   /* Length of the source file */
dbda8d6ce9 2007-07-21       drh:   const char *zOut,      /* The target file */
dbda8d6ce9 2007-07-21       drh:   unsigned int lenOut,   /* Length of the target file */
dbda8d6ce9 2007-07-21       drh:   char *zDelta           /* Write the delta into this buffer */
dbda8d6ce9 2007-07-21       drh: ){
dbda8d6ce9 2007-07-21       drh:   int i, base;
dbda8d6ce9 2007-07-21       drh:   char *zOrigDelta = zDelta;
dbda8d6ce9 2007-07-21       drh:   hash h;
dbda8d6ce9 2007-07-21       drh:   int *collide;
dbda8d6ce9 2007-07-21       drh:   int lastRead = -1;         /* Last byte of zSrc read by a COPY command */
dbda8d6ce9 2007-07-21       drh:   int landmark[MX_LANDMARK];
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh:   /* Add the target file size to the beginning of the delta
dbda8d6ce9 2007-07-21       drh:   */
dbda8d6ce9 2007-07-21       drh:   putInt(lenOut, &zDelta);
dbda8d6ce9 2007-07-21       drh:   *(zDelta++) = '\n';
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh:   /* If the source file is very small, it means that we have no
dbda8d6ce9 2007-07-21       drh:   ** chance of ever doing a copy command.  Just output a single
dbda8d6ce9 2007-07-21       drh:   ** literal segment for the entire target and exit.
dbda8d6ce9 2007-07-21       drh:   */
dbda8d6ce9 2007-07-21       drh:   if( lenSrc<=NHASH ){
dbda8d6ce9 2007-07-21       drh:     putInt(lenOut, &zDelta);
dbda8d6ce9 2007-07-21       drh:     *(zDelta++) = ':';
dbda8d6ce9 2007-07-21       drh:     memcpy(zDelta, zOut, lenOut);
dbda8d6ce9 2007-07-21       drh:     zDelta += lenOut;
dbda8d6ce9 2007-07-21       drh:     putInt(checksum(zOut, lenOut), &zDelta);
dbda8d6ce9 2007-07-21       drh:     *(zDelta++) = ';';
dbda8d6ce9 2007-07-21       drh:     return zDelta - zOrigDelta;
dbda8d6ce9 2007-07-21       drh:   }
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh:   /* Compute the hash table used to locate matching sections in the
dbda8d6ce9 2007-07-21       drh:   ** source file.
dbda8d6ce9 2007-07-21       drh:   */
dbda8d6ce9 2007-07-21       drh:   collide = malloc( lenSrc*sizeof(int)/NHASH );
dbda8d6ce9 2007-07-21       drh:   if( collide==0 ) return -1;
dbda8d6ce9 2007-07-21       drh:   memset(landmark, -1, sizeof(landmark));
dbda8d6ce9 2007-07-21       drh:   memset(collide, -1, lenSrc*sizeof(int)/NHASH );
dbda8d6ce9 2007-07-21       drh:   for(i=0; i<lenSrc-NHASH; i+=NHASH){
dbda8d6ce9 2007-07-21       drh:     int hv;
dbda8d6ce9 2007-07-21       drh:     hash_init(&h, &zSrc[i]);
dbda8d6ce9 2007-07-21       drh:     hv = hash_32bit(&h) & (MX_LANDMARK-1);
dbda8d6ce9 2007-07-21       drh:     collide[i/NHASH] = landmark[hv];
dbda8d6ce9 2007-07-21       drh:     landmark[hv] = i/NHASH;
dbda8d6ce9 2007-07-21       drh:   }
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh:   /* Begin scanning the target file and generating copy commands and
dbda8d6ce9 2007-07-21       drh:   ** literal sections of the delta.
dbda8d6ce9 2007-07-21       drh:   */
dbda8d6ce9 2007-07-21       drh:   base = 0;    /* We have already generated everything before zOut[base] */
73bddaebb9 2007-08-09       drh:   while( base+NHASH<lenOut ){
dbda8d6ce9 2007-07-21       drh:     int iSrc, iBlock;
e63a9fd9d0 2007-09-25       jnc:     unsigned int bestCnt, bestOfst=0, bestLitsz=0;
dbda8d6ce9 2007-07-21       drh:     hash_init(&h, &zOut[base]);
dbda8d6ce9 2007-07-21       drh:     i = 0;     /* Trying to match a landmark against zOut[base+i] */
dbda8d6ce9 2007-07-21       drh:     bestCnt = 0;
dbda8d6ce9 2007-07-21       drh:     while( 1 ){
dbda8d6ce9 2007-07-21       drh:       int hv;
f6b4c6458b 2007-09-06       drh:       int limit = 250;
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh:       hv = hash_32bit(&h) & (MX_LANDMARK-1);
dbda8d6ce9 2007-07-21       drh:       DEBUG2( printf("LOOKING: %4d [%s]\n", base+i, print16(&zOut[base+i])); )
dbda8d6ce9 2007-07-21       drh:       iBlock = landmark[hv];
b816fadfc7 2007-09-05       drh:       while( iBlock>=0 && (limit--)>0 ){
dbda8d6ce9 2007-07-21       drh:         /*
dbda8d6ce9 2007-07-21       drh:         ** The hash window has identified a potential match against
dbda8d6ce9 2007-07-21       drh:         ** landmark block iBlock.  But we need to investigate further.
dbda8d6ce9 2007-07-21       drh:         **
dbda8d6ce9 2007-07-21       drh:         ** Look for a region in zOut that matches zSrc. Anchor the search
dbda8d6ce9 2007-07-21       drh:         ** at zSrc[iSrc] and zOut[base+i].  Do not include anything prior to
dbda8d6ce9 2007-07-21       drh:         ** zOut[base] or after zOut[outLen] nor anything after zSrc[srcLen].
dbda8d6ce9 2007-07-21       drh:         **
dbda8d6ce9 2007-07-21       drh:         ** Set cnt equal to the length of the match and set ofst so that
dbda8d6ce9 2007-07-21       drh:         ** zSrc[ofst] is the first element of the match.  litsz is the number
dbda8d6ce9 2007-07-21       drh:         ** of characters between zOut[base] and the beginning of the match.
dbda8d6ce9 2007-07-21       drh:         ** sz will be the overhead (in bytes) needed to encode the copy
dbda8d6ce9 2007-07-21       drh:         ** command.  Only generate copy command if the overhead of the
dbda8d6ce9 2007-07-21       drh:         ** copy command is less than the amount of literal text to be copied.
dbda8d6ce9 2007-07-21       drh:         */
dbda8d6ce9 2007-07-21       drh:         int cnt, ofst, litsz;
dbda8d6ce9 2007-07-21       drh:         int j, k, x, y;
dbda8d6ce9 2007-07-21       drh:         int sz;
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh:         /* Beginning at iSrc, match forwards as far as we can.  j counts
dbda8d6ce9 2007-07-21       drh:         ** the number of characters that match */
dbda8d6ce9 2007-07-21       drh:         iSrc = iBlock*NHASH;
dbda8d6ce9 2007-07-21       drh:         for(j=0, x=iSrc, y=base+i; x<lenSrc && y<lenOut; j++, x++, y++){
dbda8d6ce9 2007-07-21       drh:           if( zSrc[x]!=zOut[y] ) break;
dbda8d6ce9 2007-07-21       drh:         }
dbda8d6ce9 2007-07-21       drh:         j--;
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh:         /* Beginning at iSrc-1, match backwards as far as we can.  k counts
dbda8d6ce9 2007-07-21       drh:         ** the number of characters that match */
dbda8d6ce9 2007-07-21       drh:         for(k=1; k<iSrc && k<=i; k++){
dbda8d6ce9 2007-07-21       drh:           if( zSrc[iSrc-k]!=zOut[base+i-k] ) break;
dbda8d6ce9 2007-07-21       drh:         }
dbda8d6ce9 2007-07-21       drh:         k--;
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh:         /* Compute the offset and size of the matching region */
dbda8d6ce9 2007-07-21       drh:         ofst = iSrc-k;
dbda8d6ce9 2007-07-21       drh:         cnt = j+k+1;
dbda8d6ce9 2007-07-21       drh:         litsz = i-k;  /* Number of bytes of literal text before the copy */
dbda8d6ce9 2007-07-21       drh:         DEBUG2( printf("MATCH %d bytes at %d: [%s] litsz=%d\n",
dbda8d6ce9 2007-07-21       drh:                         cnt, ofst, print16(&zSrc[ofst]), litsz); )
dbda8d6ce9 2007-07-21       drh:         /* sz will hold the number of bytes needed to encode the "insert"
dbda8d6ce9 2007-07-21       drh:         ** command and the copy command, not counting the "insert" text */
dbda8d6ce9 2007-07-21       drh:         sz = digit_count(i-k)+digit_count(cnt)+digit_count(ofst)+3;
dbda8d6ce9 2007-07-21       drh:         if( cnt>=sz && cnt>bestCnt ){
dbda8d6ce9 2007-07-21       drh:           /* Remember this match only if it is the best so far and it
dbda8d6ce9 2007-07-21       drh:           ** does not increase the file size */
dbda8d6ce9 2007-07-21       drh:           bestCnt = cnt;
dbda8d6ce9 2007-07-21       drh:           bestOfst = iSrc-k;
dbda8d6ce9 2007-07-21       drh:           bestLitsz = litsz;
dbda8d6ce9 2007-07-21       drh:           DEBUG2( printf("... BEST SO FAR\n"); )
dbda8d6ce9 2007-07-21       drh:         }
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh:         /* Check the next matching block */
dbda8d6ce9 2007-07-21       drh:         iBlock = collide[iBlock];
dbda8d6ce9 2007-07-21       drh:       }
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh:       /* We have a copy command that does not cause the delta to be larger
dbda8d6ce9 2007-07-21       drh:       ** than a literal insert.  So add the copy command to the delta.
dbda8d6ce9 2007-07-21       drh:       */
b816fadfc7 2007-09-05       drh:       if( bestCnt>0 ){
dbda8d6ce9 2007-07-21       drh:         if( bestLitsz>0 ){
dbda8d6ce9 2007-07-21       drh:           /* Add an insert command before the copy */
dbda8d6ce9 2007-07-21       drh:           putInt(bestLitsz,&zDelta);
dbda8d6ce9 2007-07-21       drh:           *(zDelta++) = ':';
dbda8d6ce9 2007-07-21       drh:           memcpy(zDelta, &zOut[base], bestLitsz);
dbda8d6ce9 2007-07-21       drh:           zDelta += bestLitsz;
dbda8d6ce9 2007-07-21       drh:           base += bestLitsz;
dbda8d6ce9 2007-07-21       drh:           DEBUG2( printf("insert %d\n", bestLitsz); )
dbda8d6ce9 2007-07-21       drh:         }
dbda8d6ce9 2007-07-21       drh:         base += bestCnt;
dbda8d6ce9 2007-07-21       drh:         putInt(bestCnt, &zDelta);
dbda8d6ce9 2007-07-21       drh:         *(zDelta++) = '@';
dbda8d6ce9 2007-07-21       drh:         putInt(bestOfst, &zDelta);
dbda8d6ce9 2007-07-21       drh:         DEBUG2( printf("copy %d bytes from %d\n", bestCnt, bestOfst); )
dbda8d6ce9 2007-07-21       drh:         *(zDelta++) = ',';
dbda8d6ce9 2007-07-21       drh:         if( bestOfst + bestCnt -1 > lastRead ){
dbda8d6ce9 2007-07-21       drh:           lastRead = bestOfst + bestCnt - 1;
dbda8d6ce9 2007-07-21       drh:           DEBUG2( printf("lastRead becomes %d\n", lastRead); )
dbda8d6ce9 2007-07-21       drh:         }
dbda8d6ce9 2007-07-21       drh:         bestCnt = 0;
dbda8d6ce9 2007-07-21       drh:         break;
dbda8d6ce9 2007-07-21       drh:       }
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh:       /* If we reach this point, it means no match is found so far */
dbda8d6ce9 2007-07-21       drh:       if( base+i+NHASH>lenOut ){
dbda8d6ce9 2007-07-21       drh:         /* We have reached the end of the file and have not found any
dbda8d6ce9 2007-07-21       drh:         ** matches.  Do an "insert" for everything that does not match */
dbda8d6ce9 2007-07-21       drh:         putInt(lenOut-base, &zDelta);
dbda8d6ce9 2007-07-21       drh:         *(zDelta++) = ':';
dbda8d6ce9 2007-07-21       drh:         memcpy(zDelta, &zOut[base], lenOut-base);
dbda8d6ce9 2007-07-21       drh:         zDelta += lenOut-base;
dbda8d6ce9 2007-07-21       drh:         base = lenOut;
dbda8d6ce9 2007-07-21       drh:         break;
dbda8d6ce9 2007-07-21       drh:       }
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh:       /* Advance the hash by one character.  Keep looking for a match */
dbda8d6ce9 2007-07-21       drh:       hash_next(&h, zOut[base+i+NHASH]);
dbda8d6ce9 2007-07-21       drh:       i++;
dbda8d6ce9 2007-07-21       drh:     }
dbda8d6ce9 2007-07-21       drh:   }
dbda8d6ce9 2007-07-21       drh:   /* Output a final "insert" record to get all the text at the end of
dbda8d6ce9 2007-07-21       drh:   ** the file that does not match anything in the source file.
dbda8d6ce9 2007-07-21       drh:   */
dbda8d6ce9 2007-07-21       drh:   if( base<lenOut ){
dbda8d6ce9 2007-07-21       drh:     putInt(lenOut-base, &zDelta);
dbda8d6ce9 2007-07-21       drh:     *(zDelta++) = ':';
dbda8d6ce9 2007-07-21       drh:     memcpy(zDelta, &zOut[base], lenOut-base);
dbda8d6ce9 2007-07-21       drh:     zDelta += lenOut-base;
dbda8d6ce9 2007-07-21       drh:   }
dbda8d6ce9 2007-07-21       drh:   /* Output the final checksum record. */
dbda8d6ce9 2007-07-21       drh:   putInt(checksum(zOut, lenOut), &zDelta);
dbda8d6ce9 2007-07-21       drh:   *(zDelta++) = ';';
dbda8d6ce9 2007-07-21       drh:   free(collide);
dbda8d6ce9 2007-07-21       drh:   return zDelta - zOrigDelta;
dbda8d6ce9 2007-07-21       drh: }
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** Return the size (in bytes) of the output from applying
dbda8d6ce9 2007-07-21       drh: ** a delta.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** This routine is provided so that an procedure that is able
dbda8d6ce9 2007-07-21       drh: ** to call delta_apply() can learn how much space is required
dbda8d6ce9 2007-07-21       drh: ** for the output and hence allocate nor more space that is really
dbda8d6ce9 2007-07-21       drh: ** needed.
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: int delta_output_size(const char *zDelta, int lenDelta){
dbda8d6ce9 2007-07-21       drh:   int size;
dbda8d6ce9 2007-07-21       drh:   size = getInt(&zDelta, &lenDelta);
dbda8d6ce9 2007-07-21       drh:   if( *zDelta!='\n' ){
dbda8d6ce9 2007-07-21       drh:     /* ERROR: size integer not terminated by "\n" */
dbda8d6ce9 2007-07-21       drh:     return -1;
dbda8d6ce9 2007-07-21       drh:   }
dbda8d6ce9 2007-07-21       drh:   return size;
dbda8d6ce9 2007-07-21       drh: }
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh: /*
dbda8d6ce9 2007-07-21       drh: ** Apply a delta.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** The output buffer should be big enough to hold the whole output
dbda8d6ce9 2007-07-21       drh: ** file and a NUL terminator at the end.  The delta_output_size()
dbda8d6ce9 2007-07-21       drh: ** routine will determine this size for you.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** The delta string should be null-terminated.  But the delta string
dbda8d6ce9 2007-07-21       drh: ** may contain embedded NUL characters (if the input and output are
dbda8d6ce9 2007-07-21       drh: ** binary files) so we also have to pass in the length of the delta in
dbda8d6ce9 2007-07-21       drh: ** the lenDelta parameter.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** This function returns the size of the output file in bytes (excluding
dbda8d6ce9 2007-07-21       drh: ** the final NUL terminator character).  Except, if the delta string is
dbda8d6ce9 2007-07-21       drh: ** malformed or intended for use with a source file other than zSrc,
dbda8d6ce9 2007-07-21       drh: ** then this routine returns -1.
dbda8d6ce9 2007-07-21       drh: **
dbda8d6ce9 2007-07-21       drh: ** Refer to the delta_create() documentation above for a description
dbda8d6ce9 2007-07-21       drh: ** of the delta file format.
dbda8d6ce9 2007-07-21       drh: */
dbda8d6ce9 2007-07-21       drh: int delta_apply(
dbda8d6ce9 2007-07-21       drh:   const char *zSrc,      /* The source or pattern file */
dbda8d6ce9 2007-07-21       drh:   int lenSrc,            /* Length of the source file */
dbda8d6ce9 2007-07-21       drh:   const char *zDelta,    /* Delta to apply to the pattern */
dbda8d6ce9 2007-07-21       drh:   int lenDelta,          /* Length of the delta */
dbda8d6ce9 2007-07-21       drh:   char *zOut             /* Write the output into this preallocated buffer */
dbda8d6ce9 2007-07-21       drh: ){
dbda8d6ce9 2007-07-21       drh:   unsigned int limit;
dbda8d6ce9 2007-07-21       drh:   unsigned int total = 0;
dbda8d6ce9 2007-07-21       drh:   char *zOrigOut = zOut;
dbda8d6ce9 2007-07-21       drh: 
dbda8d6ce9 2007-07-21       drh:   limit = getInt(&zDelta, &lenDelta);
dbda8d6ce9 2007-07-21       drh:   if( *zDelta!='\n' ){
dbda8d6ce9 2007-07-21       drh:     /* ERROR: size integer not terminated by "\n" */
dbda8d6ce9 2007-07-21       drh:     return -1;
dbda8d6ce9 2007-07-21       drh:   }
dbda8d6ce9 2007-07-21       drh:   zDelta++; lenDelta--;
dbda8d6ce9 2007-07-21       drh:   while( *zDelta && lenDelta>0 ){
dbda8d6ce9 2007-07-21       drh:     unsigned int cnt, ofst;
dbda8d6ce9 2007-07-21       drh:     cnt = getInt(&zDelta, &lenDelta);
dbda8d6ce9 2007-07-21       drh:     switch( zDelta[0] ){
dbda8d6ce9 2007-07-21       drh:       case '@': {
dbda8d6ce9 2007-07-21       drh:         zDelta++; lenDelta--;
dbda8d6ce9 2007-07-21       drh:         ofst = getInt(&zDelta, &lenDelta);
dbda8d6ce9 2007-07-21       drh:         if( zDelta[0]!=',' ){
dbda8d6ce9 2007-07-21       drh:           /* ERROR: copy command not terminated by ',' */
dbda8d6ce9 2007-07-21       drh:           return -1;
dbda8d6ce9 2007-07-21       drh:         }
dbda8d6ce9 2007-07-21       drh:         zDelta++; lenDelta--;
dbda8d6ce9 2007-07-21       drh:         DEBUG1( printf("COPY %d from %d\n", cnt, ofst); )
dbda8d6ce9 2007-07-21       drh:         total += cnt;
dbda8d6ce9 2007-07-21       drh:         if( total>limit ){
dbda8d6ce9 2007-07-21       drh:           /* ERROR: copy exceeds output file size */
dbda8d6ce9 2007-07-21       drh:           return -1;
dbda8d6ce9 2007-07-21       drh:         }
dbda8d6ce9 2007-07-21       drh:         if( ofst+cnt > lenSrc ){
dbda8d6ce9 2007-07-21       drh:           /* ERROR: copy extends past end of input */
dbda8d6ce9 2007-07-21       drh:           return -1;
dbda8d6ce9 2007-07-21       drh:         }
dbda8d6ce9 2007-07-21       drh:         memcpy(zOut, &zSrc[ofst], cnt);
dbda8d6ce9 2007-07-21       drh:         zOut += cnt;
dbda8d6ce9 2007-07-21       drh:         break;
dbda8d6ce9 2007-07-21       drh:       }
dbda8d6ce9 2007-07-21       drh:       case ':': {
dbda8d6ce9 2007-07-21       drh:         zDelta++; lenDelta--;
dbda8d6ce9 2007-07-21       drh:         total += cnt;
dbda8d6ce9 2007-07-21       drh:         if( total>limit ){
dbda8d6ce9 2007-07-21       drh:           /* ERROR:  insert command gives an output larger than predicted */
dbda8d6ce9 2007-07-21       drh:           return -1;
dbda8d6ce9 2007-07-21       drh:         }
dbda8d6ce9 2007-07-21       drh:         DEBUG1( printf("INSERT %d\n", cnt); )
dbda8d6ce9 2007-07-21       drh:         if( cnt>lenDelta ){
dbda8d6ce9 2007-07-21       drh:           /* ERROR: insert count exceeds size of delta */
dbda8d6ce9 2007-07-21       drh:           return -1;
dbda8d6ce9 2007-07-21       drh:         }
dbda8d6ce9 2007-07-21       drh:         memcpy(zOut, zDelta, cnt);
dbda8d6ce9 2007-07-21       drh:         zOut += cnt;
dbda8d6ce9 2007-07-21       drh:         zDelta += cnt;
dbda8d6ce9 2007-07-21       drh:         lenDelta -= cnt;
dbda8d6ce9 2007-07-21       drh:         break;
dbda8d6ce9 2007-07-21       drh:       }
dbda8d6ce9 2007-07-21       drh:       case ';': {
dbda8d6ce9 2007-07-21       drh:         zDelta++; lenDelta--;
dbda8d6ce9 2007-07-21       drh:         zOut[0] = 0;
dbda8d6ce9 2007-07-21       drh:         if( cnt!=checksum(zOrigOut, total) ){
dbda8d6ce9 2007-07-21       drh:           /* ERROR:  bad checksum */
dbda8d6ce9 2007-07-21       drh:           return -1;
dbda8d6ce9 2007-07-21       drh:         }
dbda8d6ce9 2007-07-21       drh:         if( total!=limit ){
dbda8d6ce9 2007-07-21       drh:           /* ERROR: generated size does not match predicted size */
dbda8d6ce9 2007-07-21       drh:           return -1;
dbda8d6ce9 2007-07-21       drh:         }
dbda8d6ce9 2007-07-21       drh:         return total;
dbda8d6ce9 2007-07-21       drh:       }
dbda8d6ce9 2007-07-21       drh:       default: {
dbda8d6ce9 2007-07-21       drh:         /* ERROR: unknown delta operator */
dbda8d6ce9 2007-07-21       drh:         return -1;
dbda8d6ce9 2007-07-21       drh:       }
dbda8d6ce9 2007-07-21       drh:     }
dbda8d6ce9 2007-07-21       drh:   }
dbda8d6ce9 2007-07-21       drh:   /* ERROR: unterminated delta */
dbda8d6ce9 2007-07-21       drh:   return -1;
dbda8d6ce9 2007-07-21       drh: }