dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** Copyright (c) 2006 D. Richard Hipp dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** This program is free software; you can redistribute it and/or dbda8d6ce9 2007-07-21 drh: ** modify it under the terms of the GNU General Public dbda8d6ce9 2007-07-21 drh: ** License version 2 as published by the Free Software Foundation. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** This program is distributed in the hope that it will be useful, dbda8d6ce9 2007-07-21 drh: ** but WITHOUT ANY WARRANTY; without even the implied warranty of dbda8d6ce9 2007-07-21 drh: ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dbda8d6ce9 2007-07-21 drh: ** General Public License for more details. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** You should have received a copy of the GNU General Public dbda8d6ce9 2007-07-21 drh: ** License along with this library; if not, write to the dbda8d6ce9 2007-07-21 drh: ** Free Software Foundation, Inc., 59 Temple Place - Suite 330, dbda8d6ce9 2007-07-21 drh: ** Boston, MA 02111-1307, USA. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** Author contact information: dbda8d6ce9 2007-07-21 drh: ** drh@hwaci.com dbda8d6ce9 2007-07-21 drh: ** http://www.hwaci.com/drh/ dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ******************************************************************************* dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** This module implements the delta compress algorithm. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** Though developed specifically for fossil, the code in this file dbda8d6ce9 2007-07-21 drh: ** is generally appliable and is thus easily separated from the dbda8d6ce9 2007-07-21 drh: ** fossil source code base. Nothing in this file depends on anything dbda8d6ce9 2007-07-21 drh: ** else in fossil. dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: #include <stdio.h> dbda8d6ce9 2007-07-21 drh: #include <assert.h> dbda8d6ce9 2007-07-21 drh: #include <stdlib.h> dbda8d6ce9 2007-07-21 drh: #include <string.h> dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** Macros for turning debugging printfs on and off dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: #if 0 dbda8d6ce9 2007-07-21 drh: # define DEBUG1(X) X dbda8d6ce9 2007-07-21 drh: #else dbda8d6ce9 2007-07-21 drh: # define DEBUG1(X) dbda8d6ce9 2007-07-21 drh: #endif dbda8d6ce9 2007-07-21 drh: #if 0 dbda8d6ce9 2007-07-21 drh: #define DEBUG2(X) X dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** For debugging: dbda8d6ce9 2007-07-21 drh: ** Print 16 characters of text from zBuf dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: static const char *print16(const char *z){ dbda8d6ce9 2007-07-21 drh: int i; dbda8d6ce9 2007-07-21 drh: static char zBuf[20]; dbda8d6ce9 2007-07-21 drh: for(i=0; i<16; i++){ dbda8d6ce9 2007-07-21 drh: if( z[i]>=0x20 && z[i]<=0x7e ){ dbda8d6ce9 2007-07-21 drh: zBuf[i] = z[i]; dbda8d6ce9 2007-07-21 drh: }else{ dbda8d6ce9 2007-07-21 drh: zBuf[i] = '.'; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: zBuf[i] = 0; dbda8d6ce9 2007-07-21 drh: return zBuf; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: #else dbda8d6ce9 2007-07-21 drh: # define DEBUG2(X) dbda8d6ce9 2007-07-21 drh: #endif dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** The "u32" type must be an unsigned 32-bit integer. Adjust this dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: typedef unsigned int u32; dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** Must be a 16-bit value dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: typedef short int s16; dbda8d6ce9 2007-07-21 drh: typedef unsigned short int u16; dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** The width of a hash window in bytes. The algorithm only works if this dbda8d6ce9 2007-07-21 drh: ** is a power of 2. dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: #define NHASH 16 dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** The current state of the rolling hash. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** z[] holds the values that have been hashed. z[] is a circular buffer. dbda8d6ce9 2007-07-21 drh: ** z[i] is the first entry and z[(i+NHASH-1)%NHASH] is the last entry of dbda8d6ce9 2007-07-21 drh: ** the window. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** Hash.a is the sum of all elements of hash.z[]. Hash.b is a weighted dbda8d6ce9 2007-07-21 drh: ** sum. Hash.b is z[i]*NHASH + z[i+1]*(NHASH-1) + ... + z[i+NHASH-1]*1. dbda8d6ce9 2007-07-21 drh: ** (Each index for z[] should be module NHASH, of course. The %NHASH operator dbda8d6ce9 2007-07-21 drh: ** is omitted in the prior expression for brevity.) dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: typedef struct hash hash; dbda8d6ce9 2007-07-21 drh: struct hash { dbda8d6ce9 2007-07-21 drh: u16 a, b; /* Hash values */ dbda8d6ce9 2007-07-21 drh: u16 i; /* Start of the hash window */ dbda8d6ce9 2007-07-21 drh: char z[NHASH]; /* The values that have been hashed */ dbda8d6ce9 2007-07-21 drh: }; dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** Initialize the rolling hash using the first NHASH characters of z[] dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: static void hash_init(hash *pHash, const char *z){ dbda8d6ce9 2007-07-21 drh: u16 a, b, i; dbda8d6ce9 2007-07-21 drh: a = b = 0; dbda8d6ce9 2007-07-21 drh: for(i=0; i<NHASH; i++){ dbda8d6ce9 2007-07-21 drh: a += z[i]; dbda8d6ce9 2007-07-21 drh: b += (NHASH-i)*z[i]; dbda8d6ce9 2007-07-21 drh: pHash->z[i] = z[i]; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: pHash->a = a & 0xffff; dbda8d6ce9 2007-07-21 drh: pHash->b = b & 0xffff; dbda8d6ce9 2007-07-21 drh: pHash->i = 0; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** Advance the rolling hash by a single character "c" dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: static void hash_next(hash *pHash, int c){ dbda8d6ce9 2007-07-21 drh: u16 old = pHash->z[pHash->i]; dbda8d6ce9 2007-07-21 drh: pHash->z[pHash->i] = c; dbda8d6ce9 2007-07-21 drh: pHash->i = (pHash->i+1)&(NHASH-1); dbda8d6ce9 2007-07-21 drh: pHash->a = pHash->a - old + c; dbda8d6ce9 2007-07-21 drh: pHash->b = pHash->b - NHASH*old + pHash->a; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** Return a 32-bit hash value dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: static u32 hash_32bit(hash *pHash){ dbda8d6ce9 2007-07-21 drh: return (pHash->a & 0xffff) | (((u32)(pHash->b & 0xffff))<<16); dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** Write an base-64 integer into the given buffer. dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: static void putInt(unsigned int v, char **pz){ dbda8d6ce9 2007-07-21 drh: static const char zDigits[] = dbda8d6ce9 2007-07-21 drh: "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~"; dbda8d6ce9 2007-07-21 drh: /* 123456789 123456789 123456789 123456789 123456789 123456789 123 */ dbda8d6ce9 2007-07-21 drh: int i, j; dbda8d6ce9 2007-07-21 drh: char zBuf[20]; dbda8d6ce9 2007-07-21 drh: if( v==0 ){ dbda8d6ce9 2007-07-21 drh: *(*pz)++ = '0'; dbda8d6ce9 2007-07-21 drh: return; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: for(i=0; v>0; i++, v>>=6){ dbda8d6ce9 2007-07-21 drh: zBuf[i] = zDigits[v&0x3f]; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: for(j=i-1; j>=0; j--){ dbda8d6ce9 2007-07-21 drh: *(*pz)++ = zBuf[j]; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** Read bytes from *pz and convert them into a positive integer. When dbda8d6ce9 2007-07-21 drh: ** finished, leave *pz pointing to the first character past the end of dbda8d6ce9 2007-07-21 drh: ** the integer. The *pLen parameter holds the length of the string dbda8d6ce9 2007-07-21 drh: ** in *pz and is decremented once for each character in the integer. dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: static unsigned int getInt(const char **pz, int *pLen){ dbda8d6ce9 2007-07-21 drh: static const signed char zValue[] = { dbda8d6ce9 2007-07-21 drh: -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, dbda8d6ce9 2007-07-21 drh: -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, dbda8d6ce9 2007-07-21 drh: -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, dbda8d6ce9 2007-07-21 drh: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, dbda8d6ce9 2007-07-21 drh: -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, dbda8d6ce9 2007-07-21 drh: 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, 36, dbda8d6ce9 2007-07-21 drh: -1, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, dbda8d6ce9 2007-07-21 drh: 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, -1, -1, -1, 63, -1, dbda8d6ce9 2007-07-21 drh: }; dbda8d6ce9 2007-07-21 drh: unsigned int v = 0; dbda8d6ce9 2007-07-21 drh: int c; dbda8d6ce9 2007-07-21 drh: unsigned char *z = (unsigned char*)*pz; dbda8d6ce9 2007-07-21 drh: unsigned char *zStart = z; dbda8d6ce9 2007-07-21 drh: while( (c = zValue[0x7f&*(z++)])>=0 ){ dbda8d6ce9 2007-07-21 drh: v = (v<<6) + c; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: z--; dbda8d6ce9 2007-07-21 drh: *pLen -= z - zStart; dbda8d6ce9 2007-07-21 drh: *pz = (char*)z; dbda8d6ce9 2007-07-21 drh: return v; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* 609e4caf29 2007-08-25 aku: ** Return the number digits in the base-64 representation of a positive integer dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: static int digit_count(int v){ dbda8d6ce9 2007-07-21 drh: unsigned int i, x; dbda8d6ce9 2007-07-21 drh: for(i=1, x=64; v>=x; i++, x <<= 6){} dbda8d6ce9 2007-07-21 drh: return i; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** Compute a 32-bit checksum on the N-byte buffer. Return the result. dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: static unsigned int checksum(const char *zIn, int N){ dbda8d6ce9 2007-07-21 drh: const unsigned char *z = (const unsigned char*)zIn; dbda8d6ce9 2007-07-21 drh: unsigned int sum = 0; dbda8d6ce9 2007-07-21 drh: while( N>=4 ){ dbda8d6ce9 2007-07-21 drh: sum += (z[0]<<24) | (z[1]<<16) | (z[2]<<8) | z[3]; dbda8d6ce9 2007-07-21 drh: z += 4; dbda8d6ce9 2007-07-21 drh: N -= 4; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: if( N>0 ){ dbda8d6ce9 2007-07-21 drh: unsigned char zBuf[4]; dbda8d6ce9 2007-07-21 drh: memset(zBuf, 0, sizeof(zBuf)); dbda8d6ce9 2007-07-21 drh: memcpy(zBuf, z, N); dbda8d6ce9 2007-07-21 drh: z = zBuf; dbda8d6ce9 2007-07-21 drh: sum += (z[0]<<24) | (z[1]<<16) | (z[2]<<8) | z[3]; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: return sum; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** Maximum number of landmarks to set in the source file. dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: #define MX_LANDMARK (1024*128) dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** Create a new delta. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** The delta is written into a preallocated buffer, zDelta, which dbda8d6ce9 2007-07-21 drh: ** should be at least 60 bytes longer than the target file, zOut. dbda8d6ce9 2007-07-21 drh: ** The delta string will be NUL-terminated, but it might also contain dbda8d6ce9 2007-07-21 drh: ** embedded NUL characters if either the zSrc or zOut files are dbda8d6ce9 2007-07-21 drh: ** binary. This function returns the length of the delta string dbda8d6ce9 2007-07-21 drh: ** in bytes, excluding the final NUL terminator character. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** Output Format: dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** The delta begins with a base64 number followed by a newline. This dbda8d6ce9 2007-07-21 drh: ** number is the number of bytes in the TARGET file. Thus, given a dbda8d6ce9 2007-07-21 drh: ** delta file z, a program can compute the size of the output file 609e4caf29 2007-08-25 aku: ** simply by reading the first line and decoding the base-64 number 609e4caf29 2007-08-25 aku: ** found there. The delta_output_size() routine does exactly this. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** After the initial size number, the delta consists of a series of dbda8d6ce9 2007-07-21 drh: ** literal text segments and commands to copy from the SOURCE file. dbda8d6ce9 2007-07-21 drh: ** A copy command looks like this: dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** NNN@MMM, dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** where NNN is the number of bytes to be copied and MMM is the offset dbda8d6ce9 2007-07-21 drh: ** into the source file of the first byte (both base-64). If NNN is 0 dbda8d6ce9 2007-07-21 drh: ** it means copy the rest of the input file. Literal text is like this: dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** NNN:TTTTT dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** where NNN is the number of bytes of text (base-64) and TTTTT is the text. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** The last term is of the form dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** NNN; dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** In this case, NNN is a 32-bit bigendian checksum of the output file dbda8d6ce9 2007-07-21 drh: ** that can be used to verify that the delta applied correctly. All dbda8d6ce9 2007-07-21 drh: ** numbers are in base-64. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** Pure text files generate a pure text delta. Binary files generate a dbda8d6ce9 2007-07-21 drh: ** delta that may contain some binary data. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** Algorithm: dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** The encoder first builds a hash table to help it find matching 6f1af23ebe 2007-08-26 aku: ** patterns in the source file. 16-byte chunks of the source file dbda8d6ce9 2007-07-21 drh: ** sampled at evenly spaced intervals are used to populate the hash dbda8d6ce9 2007-07-21 drh: ** table. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** Next we begin scanning the target file using a sliding 16-byte dbda8d6ce9 2007-07-21 drh: ** window. The hash of the 16-byte window in the target is used to dbda8d6ce9 2007-07-21 drh: ** search for a matching section in the source file. When a match dbda8d6ce9 2007-07-21 drh: ** is found, a copy command is added to the delta. An effort is dbda8d6ce9 2007-07-21 drh: ** made to extend the matching section to regions that come before dbda8d6ce9 2007-07-21 drh: ** and after the 16-byte hash window. A copy command is only issued dbda8d6ce9 2007-07-21 drh: ** if the result would use less space that just quoting the text dbda8d6ce9 2007-07-21 drh: ** literally. Literal text is added to the delta for sections that dbda8d6ce9 2007-07-21 drh: ** do not match or which can not be encoded efficiently using copy dbda8d6ce9 2007-07-21 drh: ** commands. dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: int delta_create( dbda8d6ce9 2007-07-21 drh: const char *zSrc, /* The source or pattern file */ dbda8d6ce9 2007-07-21 drh: unsigned int lenSrc, /* Length of the source file */ dbda8d6ce9 2007-07-21 drh: const char *zOut, /* The target file */ dbda8d6ce9 2007-07-21 drh: unsigned int lenOut, /* Length of the target file */ dbda8d6ce9 2007-07-21 drh: char *zDelta /* Write the delta into this buffer */ dbda8d6ce9 2007-07-21 drh: ){ dbda8d6ce9 2007-07-21 drh: int i, base; dbda8d6ce9 2007-07-21 drh: char *zOrigDelta = zDelta; dbda8d6ce9 2007-07-21 drh: hash h; dbda8d6ce9 2007-07-21 drh: int *collide; dbda8d6ce9 2007-07-21 drh: int lastRead = -1; /* Last byte of zSrc read by a COPY command */ dbda8d6ce9 2007-07-21 drh: int landmark[MX_LANDMARK]; dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* Add the target file size to the beginning of the delta dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: putInt(lenOut, &zDelta); dbda8d6ce9 2007-07-21 drh: *(zDelta++) = '\n'; dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* If the source file is very small, it means that we have no dbda8d6ce9 2007-07-21 drh: ** chance of ever doing a copy command. Just output a single dbda8d6ce9 2007-07-21 drh: ** literal segment for the entire target and exit. dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: if( lenSrc<=NHASH ){ dbda8d6ce9 2007-07-21 drh: putInt(lenOut, &zDelta); dbda8d6ce9 2007-07-21 drh: *(zDelta++) = ':'; dbda8d6ce9 2007-07-21 drh: memcpy(zDelta, zOut, lenOut); dbda8d6ce9 2007-07-21 drh: zDelta += lenOut; dbda8d6ce9 2007-07-21 drh: putInt(checksum(zOut, lenOut), &zDelta); dbda8d6ce9 2007-07-21 drh: *(zDelta++) = ';'; dbda8d6ce9 2007-07-21 drh: return zDelta - zOrigDelta; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* Compute the hash table used to locate matching sections in the dbda8d6ce9 2007-07-21 drh: ** source file. dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: collide = malloc( lenSrc*sizeof(int)/NHASH ); dbda8d6ce9 2007-07-21 drh: if( collide==0 ) return -1; dbda8d6ce9 2007-07-21 drh: memset(landmark, -1, sizeof(landmark)); dbda8d6ce9 2007-07-21 drh: memset(collide, -1, lenSrc*sizeof(int)/NHASH ); dbda8d6ce9 2007-07-21 drh: for(i=0; i<lenSrc-NHASH; i+=NHASH){ dbda8d6ce9 2007-07-21 drh: int hv; dbda8d6ce9 2007-07-21 drh: hash_init(&h, &zSrc[i]); dbda8d6ce9 2007-07-21 drh: hv = hash_32bit(&h) & (MX_LANDMARK-1); dbda8d6ce9 2007-07-21 drh: collide[i/NHASH] = landmark[hv]; dbda8d6ce9 2007-07-21 drh: landmark[hv] = i/NHASH; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* Begin scanning the target file and generating copy commands and dbda8d6ce9 2007-07-21 drh: ** literal sections of the delta. dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: base = 0; /* We have already generated everything before zOut[base] */ 73bddaebb9 2007-08-09 drh: while( base+NHASH<lenOut ){ dbda8d6ce9 2007-07-21 drh: int iSrc, iBlock; e63a9fd9d0 2007-09-25 jnc: unsigned int bestCnt, bestOfst=0, bestLitsz=0; dbda8d6ce9 2007-07-21 drh: hash_init(&h, &zOut[base]); dbda8d6ce9 2007-07-21 drh: i = 0; /* Trying to match a landmark against zOut[base+i] */ dbda8d6ce9 2007-07-21 drh: bestCnt = 0; dbda8d6ce9 2007-07-21 drh: while( 1 ){ dbda8d6ce9 2007-07-21 drh: int hv; f6b4c6458b 2007-09-06 drh: int limit = 250; dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: hv = hash_32bit(&h) & (MX_LANDMARK-1); dbda8d6ce9 2007-07-21 drh: DEBUG2( printf("LOOKING: %4d [%s]\n", base+i, print16(&zOut[base+i])); ) dbda8d6ce9 2007-07-21 drh: iBlock = landmark[hv]; b816fadfc7 2007-09-05 drh: while( iBlock>=0 && (limit--)>0 ){ dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** The hash window has identified a potential match against dbda8d6ce9 2007-07-21 drh: ** landmark block iBlock. But we need to investigate further. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** Look for a region in zOut that matches zSrc. Anchor the search dbda8d6ce9 2007-07-21 drh: ** at zSrc[iSrc] and zOut[base+i]. Do not include anything prior to dbda8d6ce9 2007-07-21 drh: ** zOut[base] or after zOut[outLen] nor anything after zSrc[srcLen]. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** Set cnt equal to the length of the match and set ofst so that dbda8d6ce9 2007-07-21 drh: ** zSrc[ofst] is the first element of the match. litsz is the number dbda8d6ce9 2007-07-21 drh: ** of characters between zOut[base] and the beginning of the match. dbda8d6ce9 2007-07-21 drh: ** sz will be the overhead (in bytes) needed to encode the copy dbda8d6ce9 2007-07-21 drh: ** command. Only generate copy command if the overhead of the dbda8d6ce9 2007-07-21 drh: ** copy command is less than the amount of literal text to be copied. dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: int cnt, ofst, litsz; dbda8d6ce9 2007-07-21 drh: int j, k, x, y; dbda8d6ce9 2007-07-21 drh: int sz; dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* Beginning at iSrc, match forwards as far as we can. j counts dbda8d6ce9 2007-07-21 drh: ** the number of characters that match */ dbda8d6ce9 2007-07-21 drh: iSrc = iBlock*NHASH; dbda8d6ce9 2007-07-21 drh: for(j=0, x=iSrc, y=base+i; x<lenSrc && y<lenOut; j++, x++, y++){ dbda8d6ce9 2007-07-21 drh: if( zSrc[x]!=zOut[y] ) break; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: j--; dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* Beginning at iSrc-1, match backwards as far as we can. k counts dbda8d6ce9 2007-07-21 drh: ** the number of characters that match */ dbda8d6ce9 2007-07-21 drh: for(k=1; k<iSrc && k<=i; k++){ dbda8d6ce9 2007-07-21 drh: if( zSrc[iSrc-k]!=zOut[base+i-k] ) break; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: k--; dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* Compute the offset and size of the matching region */ dbda8d6ce9 2007-07-21 drh: ofst = iSrc-k; dbda8d6ce9 2007-07-21 drh: cnt = j+k+1; dbda8d6ce9 2007-07-21 drh: litsz = i-k; /* Number of bytes of literal text before the copy */ dbda8d6ce9 2007-07-21 drh: DEBUG2( printf("MATCH %d bytes at %d: [%s] litsz=%d\n", dbda8d6ce9 2007-07-21 drh: cnt, ofst, print16(&zSrc[ofst]), litsz); ) dbda8d6ce9 2007-07-21 drh: /* sz will hold the number of bytes needed to encode the "insert" dbda8d6ce9 2007-07-21 drh: ** command and the copy command, not counting the "insert" text */ dbda8d6ce9 2007-07-21 drh: sz = digit_count(i-k)+digit_count(cnt)+digit_count(ofst)+3; dbda8d6ce9 2007-07-21 drh: if( cnt>=sz && cnt>bestCnt ){ dbda8d6ce9 2007-07-21 drh: /* Remember this match only if it is the best so far and it dbda8d6ce9 2007-07-21 drh: ** does not increase the file size */ dbda8d6ce9 2007-07-21 drh: bestCnt = cnt; dbda8d6ce9 2007-07-21 drh: bestOfst = iSrc-k; dbda8d6ce9 2007-07-21 drh: bestLitsz = litsz; dbda8d6ce9 2007-07-21 drh: DEBUG2( printf("... BEST SO FAR\n"); ) dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* Check the next matching block */ dbda8d6ce9 2007-07-21 drh: iBlock = collide[iBlock]; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* We have a copy command that does not cause the delta to be larger dbda8d6ce9 2007-07-21 drh: ** than a literal insert. So add the copy command to the delta. dbda8d6ce9 2007-07-21 drh: */ b816fadfc7 2007-09-05 drh: if( bestCnt>0 ){ dbda8d6ce9 2007-07-21 drh: if( bestLitsz>0 ){ dbda8d6ce9 2007-07-21 drh: /* Add an insert command before the copy */ dbda8d6ce9 2007-07-21 drh: putInt(bestLitsz,&zDelta); dbda8d6ce9 2007-07-21 drh: *(zDelta++) = ':'; dbda8d6ce9 2007-07-21 drh: memcpy(zDelta, &zOut[base], bestLitsz); dbda8d6ce9 2007-07-21 drh: zDelta += bestLitsz; dbda8d6ce9 2007-07-21 drh: base += bestLitsz; dbda8d6ce9 2007-07-21 drh: DEBUG2( printf("insert %d\n", bestLitsz); ) dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: base += bestCnt; dbda8d6ce9 2007-07-21 drh: putInt(bestCnt, &zDelta); dbda8d6ce9 2007-07-21 drh: *(zDelta++) = '@'; dbda8d6ce9 2007-07-21 drh: putInt(bestOfst, &zDelta); dbda8d6ce9 2007-07-21 drh: DEBUG2( printf("copy %d bytes from %d\n", bestCnt, bestOfst); ) dbda8d6ce9 2007-07-21 drh: *(zDelta++) = ','; dbda8d6ce9 2007-07-21 drh: if( bestOfst + bestCnt -1 > lastRead ){ dbda8d6ce9 2007-07-21 drh: lastRead = bestOfst + bestCnt - 1; dbda8d6ce9 2007-07-21 drh: DEBUG2( printf("lastRead becomes %d\n", lastRead); ) dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: bestCnt = 0; dbda8d6ce9 2007-07-21 drh: break; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* If we reach this point, it means no match is found so far */ dbda8d6ce9 2007-07-21 drh: if( base+i+NHASH>lenOut ){ dbda8d6ce9 2007-07-21 drh: /* We have reached the end of the file and have not found any dbda8d6ce9 2007-07-21 drh: ** matches. Do an "insert" for everything that does not match */ dbda8d6ce9 2007-07-21 drh: putInt(lenOut-base, &zDelta); dbda8d6ce9 2007-07-21 drh: *(zDelta++) = ':'; dbda8d6ce9 2007-07-21 drh: memcpy(zDelta, &zOut[base], lenOut-base); dbda8d6ce9 2007-07-21 drh: zDelta += lenOut-base; dbda8d6ce9 2007-07-21 drh: base = lenOut; dbda8d6ce9 2007-07-21 drh: break; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* Advance the hash by one character. Keep looking for a match */ dbda8d6ce9 2007-07-21 drh: hash_next(&h, zOut[base+i+NHASH]); dbda8d6ce9 2007-07-21 drh: i++; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: /* Output a final "insert" record to get all the text at the end of dbda8d6ce9 2007-07-21 drh: ** the file that does not match anything in the source file. dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: if( base<lenOut ){ dbda8d6ce9 2007-07-21 drh: putInt(lenOut-base, &zDelta); dbda8d6ce9 2007-07-21 drh: *(zDelta++) = ':'; dbda8d6ce9 2007-07-21 drh: memcpy(zDelta, &zOut[base], lenOut-base); dbda8d6ce9 2007-07-21 drh: zDelta += lenOut-base; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: /* Output the final checksum record. */ dbda8d6ce9 2007-07-21 drh: putInt(checksum(zOut, lenOut), &zDelta); dbda8d6ce9 2007-07-21 drh: *(zDelta++) = ';'; dbda8d6ce9 2007-07-21 drh: free(collide); dbda8d6ce9 2007-07-21 drh: return zDelta - zOrigDelta; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** Return the size (in bytes) of the output from applying dbda8d6ce9 2007-07-21 drh: ** a delta. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** This routine is provided so that an procedure that is able dbda8d6ce9 2007-07-21 drh: ** to call delta_apply() can learn how much space is required dbda8d6ce9 2007-07-21 drh: ** for the output and hence allocate nor more space that is really dbda8d6ce9 2007-07-21 drh: ** needed. dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: int delta_output_size(const char *zDelta, int lenDelta){ dbda8d6ce9 2007-07-21 drh: int size; dbda8d6ce9 2007-07-21 drh: size = getInt(&zDelta, &lenDelta); dbda8d6ce9 2007-07-21 drh: if( *zDelta!='\n' ){ dbda8d6ce9 2007-07-21 drh: /* ERROR: size integer not terminated by "\n" */ dbda8d6ce9 2007-07-21 drh: return -1; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: return size; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: /* dbda8d6ce9 2007-07-21 drh: ** Apply a delta. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** The output buffer should be big enough to hold the whole output dbda8d6ce9 2007-07-21 drh: ** file and a NUL terminator at the end. The delta_output_size() dbda8d6ce9 2007-07-21 drh: ** routine will determine this size for you. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** The delta string should be null-terminated. But the delta string dbda8d6ce9 2007-07-21 drh: ** may contain embedded NUL characters (if the input and output are dbda8d6ce9 2007-07-21 drh: ** binary files) so we also have to pass in the length of the delta in dbda8d6ce9 2007-07-21 drh: ** the lenDelta parameter. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** This function returns the size of the output file in bytes (excluding dbda8d6ce9 2007-07-21 drh: ** the final NUL terminator character). Except, if the delta string is dbda8d6ce9 2007-07-21 drh: ** malformed or intended for use with a source file other than zSrc, dbda8d6ce9 2007-07-21 drh: ** then this routine returns -1. dbda8d6ce9 2007-07-21 drh: ** dbda8d6ce9 2007-07-21 drh: ** Refer to the delta_create() documentation above for a description dbda8d6ce9 2007-07-21 drh: ** of the delta file format. dbda8d6ce9 2007-07-21 drh: */ dbda8d6ce9 2007-07-21 drh: int delta_apply( dbda8d6ce9 2007-07-21 drh: const char *zSrc, /* The source or pattern file */ dbda8d6ce9 2007-07-21 drh: int lenSrc, /* Length of the source file */ dbda8d6ce9 2007-07-21 drh: const char *zDelta, /* Delta to apply to the pattern */ dbda8d6ce9 2007-07-21 drh: int lenDelta, /* Length of the delta */ dbda8d6ce9 2007-07-21 drh: char *zOut /* Write the output into this preallocated buffer */ dbda8d6ce9 2007-07-21 drh: ){ dbda8d6ce9 2007-07-21 drh: unsigned int limit; dbda8d6ce9 2007-07-21 drh: unsigned int total = 0; dbda8d6ce9 2007-07-21 drh: char *zOrigOut = zOut; dbda8d6ce9 2007-07-21 drh: dbda8d6ce9 2007-07-21 drh: limit = getInt(&zDelta, &lenDelta); dbda8d6ce9 2007-07-21 drh: if( *zDelta!='\n' ){ dbda8d6ce9 2007-07-21 drh: /* ERROR: size integer not terminated by "\n" */ dbda8d6ce9 2007-07-21 drh: return -1; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: zDelta++; lenDelta--; dbda8d6ce9 2007-07-21 drh: while( *zDelta && lenDelta>0 ){ dbda8d6ce9 2007-07-21 drh: unsigned int cnt, ofst; dbda8d6ce9 2007-07-21 drh: cnt = getInt(&zDelta, &lenDelta); dbda8d6ce9 2007-07-21 drh: switch( zDelta[0] ){ dbda8d6ce9 2007-07-21 drh: case '@': { dbda8d6ce9 2007-07-21 drh: zDelta++; lenDelta--; dbda8d6ce9 2007-07-21 drh: ofst = getInt(&zDelta, &lenDelta); dbda8d6ce9 2007-07-21 drh: if( zDelta[0]!=',' ){ dbda8d6ce9 2007-07-21 drh: /* ERROR: copy command not terminated by ',' */ dbda8d6ce9 2007-07-21 drh: return -1; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: zDelta++; lenDelta--; dbda8d6ce9 2007-07-21 drh: DEBUG1( printf("COPY %d from %d\n", cnt, ofst); ) dbda8d6ce9 2007-07-21 drh: total += cnt; dbda8d6ce9 2007-07-21 drh: if( total>limit ){ dbda8d6ce9 2007-07-21 drh: /* ERROR: copy exceeds output file size */ dbda8d6ce9 2007-07-21 drh: return -1; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: if( ofst+cnt > lenSrc ){ dbda8d6ce9 2007-07-21 drh: /* ERROR: copy extends past end of input */ dbda8d6ce9 2007-07-21 drh: return -1; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: memcpy(zOut, &zSrc[ofst], cnt); dbda8d6ce9 2007-07-21 drh: zOut += cnt; dbda8d6ce9 2007-07-21 drh: break; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: case ':': { dbda8d6ce9 2007-07-21 drh: zDelta++; lenDelta--; dbda8d6ce9 2007-07-21 drh: total += cnt; dbda8d6ce9 2007-07-21 drh: if( total>limit ){ dbda8d6ce9 2007-07-21 drh: /* ERROR: insert command gives an output larger than predicted */ dbda8d6ce9 2007-07-21 drh: return -1; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: DEBUG1( printf("INSERT %d\n", cnt); ) dbda8d6ce9 2007-07-21 drh: if( cnt>lenDelta ){ dbda8d6ce9 2007-07-21 drh: /* ERROR: insert count exceeds size of delta */ dbda8d6ce9 2007-07-21 drh: return -1; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: memcpy(zOut, zDelta, cnt); dbda8d6ce9 2007-07-21 drh: zOut += cnt; dbda8d6ce9 2007-07-21 drh: zDelta += cnt; dbda8d6ce9 2007-07-21 drh: lenDelta -= cnt; dbda8d6ce9 2007-07-21 drh: break; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: case ';': { dbda8d6ce9 2007-07-21 drh: zDelta++; lenDelta--; dbda8d6ce9 2007-07-21 drh: zOut[0] = 0; dbda8d6ce9 2007-07-21 drh: if( cnt!=checksum(zOrigOut, total) ){ dbda8d6ce9 2007-07-21 drh: /* ERROR: bad checksum */ dbda8d6ce9 2007-07-21 drh: return -1; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: if( total!=limit ){ dbda8d6ce9 2007-07-21 drh: /* ERROR: generated size does not match predicted size */ dbda8d6ce9 2007-07-21 drh: return -1; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: return total; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: default: { dbda8d6ce9 2007-07-21 drh: /* ERROR: unknown delta operator */ dbda8d6ce9 2007-07-21 drh: return -1; dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: } dbda8d6ce9 2007-07-21 drh: /* ERROR: unterminated delta */ dbda8d6ce9 2007-07-21 drh: return -1; dbda8d6ce9 2007-07-21 drh: }