Check-in [61ddd63b72]
Not logged in
Overview

SHA1 Hash:61ddd63b72f1436c9ad18b6bba9a754276ac50fc
Date: 2008-03-06 22:58:48
User: drh
Comment:Work toward making fossil work better on large repositories. This version implements a cache in the content manager. It is not clear yet if this is necessarily a good idea - this check-in might end up on an abandoned branch at some point.
Timelines: ancestors | descendants | both | trunk
Other Links: files | ZIP archive | manifest

Tags And Properties
Changes
[hide diffs]

Modified src/blob.c from [2269ea6655] to [abc48788ed].

@@ -206,11 +206,10 @@
 /*
 ** Copy a blob
 */
 void blob_copy(Blob *pTo, Blob *pFrom){
   blob_is_init(pFrom);
-  blob_is_init(pTo);
   blob_zero(pTo);
   blob_append(pTo, blob_buffer(pFrom), blob_size(pFrom));
 }
 
 /*

Modified src/content.c from [1ea5bdca20] to [6b8e09987a].

@@ -26,16 +26,123 @@
 #include "config.h"
 #include "content.h"
 #include <assert.h>
 
 /*
+** Macros for debugging
+*/
+#if 0
+# define CONTENT_TRACE(X)  printf X;
+#else
+# define CONTENT_TRACE(X)
+#endif
+
+/*
+** The artifact retrival cache
+*/
+#define MX_CACHE_CNT  50    /* Maximum number of positive cache entries */
+#define EXPELL_INTERVAL 5   /* How often to expell from a full cache */
+static struct {
+  int n;               /* Current number of positive cache entries */
+  int nextAge;         /* Age counter for implementing LRU */
+  int skipCnt;         /* Used to limit entries expelled from cache */
+  struct {             /* One instance of this for each cache entry */
+    int rid;                  /* Artifact id */
+    int age;                  /* Age.  Newer is larger */
+    Blob content;             /* Content of the artifact */
+  } a[MX_CACHE_CNT];   /* The positive cache */
+
+  /*
+  ** The missing artifact cache.
+  **
+  ** Artifacts whose record ID are in missingCache cannot be retrieved
+  ** either because they are phantoms or because they are a delta that
+  ** depends on a phantom.  Artifacts whose content we are certain is
+  ** available are in availableCache.  If an artifact is in neither cache
+  ** then its current availablity is unknown.
+  */
+  Bag missing;         /* Cache of artifacts that are incomplete */
+  Bag available;       /* Cache of artifacts that are complete */
+} contentCache;
+
+
+/*
+** Clear the content cache.
+*/
+void content_clear_cache(void){
+  int i;
+  for(i=0; i<contentCache.n; i++){
+    blob_reset(&contentCache.a[i].content);
+  }
+  bag_clear(&contentCache.missing);
+  bag_clear(&contentCache.available);
+  contentCache.n = 0;
+}
+
+/*
 ** Return the srcid associated with rid.  Or return 0 if rid is
 ** original content and not a delta.
 */
 static int findSrcid(int rid){
   int srcid = db_int(0, "SELECT srcid FROM delta WHERE rid=%d", rid);
   return srcid;
+}
+
+/*
+** Check to see if content is available for artifact "rid".  Return
+** true if it is.  Return false if rid is a phantom or depends on
+** a phantom.
+*/
+int content_is_available(int rid){
+  int srcid;
+  if( bag_find(&contentCache.missing, rid) ){
+    return 0;
+  }
+  if( bag_find(&contentCache.available, rid) ){
+    return 1;
+  }
+  if( db_int(-1, "SELECT size FROM blob WHERE rid=%d", rid)<0 ){
+    bag_insert(&contentCache.missing, rid);
+    return 0;
+  }
+  srcid = findSrcid(rid);
+  if( srcid==0 ){
+    bag_insert(&contentCache.available, rid);
+    return 1;
+  }
+  if( content_is_available(srcid) ){
+    bag_insert(&contentCache.available, rid);
+    return 1;
+  }else{
+    bag_insert(&contentCache.missing, rid);
+    return 0;
+  }
+}
+
+/*
+** Mark artifact rid as being available now.  Update the cache to
+** show that everything that was formerly unavailable because rid
+** was missing is now available.
+*/
+static void content_mark_available(int rid){
+  Bag pending;
+  Stmt q;
+  if( bag_find(&contentCache.available, rid) ) return;
+  bag_init(&pending);
+  bag_insert(&pending, rid);
+  while( (rid = bag_first(&pending))!=0 ){
+    bag_remove(&pending, rid);
+    bag_remove(&contentCache.missing, rid);
+    bag_insert(&contentCache.available, rid);
+    db_prepare(&q, "SELECT rid FROM delta WHERE srcid=%d", rid);
+    while( db_step(&q)==SQLITE_ROW ){
+      int nx = db_column_int(&q, 0);
+      bag_insert(&pending, nx);
+    }
+    db_finalize(&q);
+  }
+  bag_clear(&pending);
 }
 
 /*
 ** Extract the content for ID rid and put it into the
 ** uninitialized blob.  Return 1 on success.  If the record
@@ -44,16 +151,46 @@
 int content_get(int rid, Blob *pBlob){
   Stmt q;
   Blob src;
   int srcid;
   int rc = 0;
+  int i;
   static Bag inProcess;
 
   assert( g.repositoryOpen );
-  srcid = findSrcid(rid);
   blob_zero(pBlob);
+
+  /* Early out if we know the content is not available */
+  if( bag_find(&contentCache.missing, rid) ){
+    CONTENT_TRACE(("%*smiss from cache: %d\n",
+                    bag_count(&inProcess), "", rid))
+    return 0;
+  }
+
+  /* Look for the artifact in the cache first */
+  for(i=0; i<contentCache.n; i++){
+    if( contentCache.a[i].rid==rid ){
+      *pBlob = contentCache.a[i].content;
+      blob_zero(&contentCache.a[i].content);
+      contentCache.n--;
+      if( i<contentCache.n ){
+        contentCache.a[i] = contentCache.a[contentCache.n];
+      }
+      CONTENT_TRACE(("%*shit cache: %d\n",
+                    bag_count(&inProcess), "", rid))
+      return 1;
+    }
+  }
+
+  /* See if we need to apply a delta to find this artifact */
+  srcid = findSrcid(rid);
+  CONTENT_TRACE(("%*ssearching for %d.  Need %d.\n",
+                 bag_count(&inProcess), "", rid, srcid))
+
+
   if( srcid ){
+    /* Yes, a delta is required */
     if( bag_find(&inProcess, srcid) ){
       db_multi_exec(
         "UPDATE blob SET content=NULL, size=-1 WHERE rid=%d;"
         "DELETE FROM delta WHERE rid=%d;"
         "INSERT OR IGNORE INTO phantom VALUES(%d);",
@@ -61,10 +198,11 @@
       );
       blob_zero(pBlob);
       return 0;
     }
     bag_insert(&inProcess, srcid);
+
     if( content_get(srcid, &src) ){
       db_prepare(&q, "SELECT content FROM blob WHERE rid=%d AND size>=0", rid);
       if( db_step(&q)==SQLITE_ROW ){
         Blob delta;
         db_ephemeral_blob(&q, 0, &delta);
@@ -73,29 +211,67 @@
         blob_delta_apply(&src, &delta, pBlob);
         blob_reset(&delta);
         rc = 1;
       }
       db_finalize(&q);
-      blob_reset(&src);
+
+      /* Save the srcid artifact in the cache */
+      if( contentCache.n<MX_CACHE_CNT ){
+        i = contentCache.n++;
+      }else if( ((contentCache.skipCnt++)%EXPELL_INTERVAL)!=0 ){
+        i = -1;
+      }else{
+        int j, best;
+        best = contentCache.nextAge+1;
+        i = -1;
+        for(j=0; j<contentCache.n; j++){
+          if( contentCache.a[j].age<best ){
+            i = j;
+            best = contentCache.a[j].age;
+          }
+        }
+        CONTENT_TRACE(("%*sexpell %d from cache\n",
+                       bag_count(&inProcess), "", contentCache.a[i].rid))
+        blob_reset(&contentCache.a[i].content);
+      }
+      if( i>=0 ){
+        contentCache.a[i].content = src;
+        contentCache.a[i].age = contentCache.nextAge++;
+        contentCache.a[i].rid = srcid;
+        CONTENT_TRACE(("%*sadd %d to cache\n",
+                       bag_count(&inProcess), "", srcid))
+      }else{
+        blob_reset(&src);
+      }
     }
     bag_remove(&inProcess, srcid);
   }else{
+    /* No delta required.  Read content directly from the database */
     db_prepare(&q, "SELECT content FROM blob WHERE rid=%d AND size>=0", rid);
     if( db_step(&q)==SQLITE_ROW ){
       db_ephemeral_blob(&q, 0, pBlob);
       blob_uncompress(pBlob, pBlob);
       rc = 1;
     }
     db_finalize(&q);
   }
+  if( rc==0 ){
+    bag_insert(&contentCache.missing, rid);
+  }else{
+    bag_insert(&contentCache.available, rid);
+  }
   return rc;
 }
 
 /*
-** Get the contents of a file within a given revision.
+** Get the contents of a file within a given baseline.
 */
-int content_get_historical_file(const char *revision, const char *file, Blob *content){
+int content_get_historical_file(
+  const char *revision,    /* Name of the baseline containing the file */
+  const char *file,        /* Name of the file */
+  Blob *content            /* Write file content here */
+){
   Blob mfile;
   Manifest m;
   int i, rid=0;
 
   rid = name_to_rid(revision);
@@ -194,10 +370,11 @@
   int rid;
   Stmt s1;
   Blob cmpr;
   Blob hash;
   int markAsUnclustered = 0;
+  int isDephantomize = 0;
 
   assert( g.repositoryOpen );
   if( pBlob && srcId==0 ){
     sha1sum_blob(pBlob, &hash);
   }else{
@@ -249,12 +426,13 @@
     );
     blob_compress(pBlob, &cmpr);
     db_bind_blob(&s1, ":data", &cmpr);
     db_exec(&s1);
     db_multi_exec("DELETE FROM phantom WHERE rid=%d", rid);
-    if( srcId==0 || db_int(0, "SELECT size FROM blob WHERE rid=%d", srcId)>0 ){
-      after_dephantomize(rid, 0);
+    if( srcId==0 || content_is_available(srcId) ){
+      isDephantomize = 1;
+      content_mark_available(rid);
     }
   }else{
     /* We are creating a new entry */
     db_prepare(&s1,
       "INSERT INTO blob(rcvid,size,uuid,content)"
@@ -275,10 +453,17 @@
   /* If the srcId is specified, then the data we just added is
   ** really a delta.  Record this fact in the delta table.
   */
   if( srcId ){
     db_multi_exec("REPLACE INTO delta(rid,srcid) VALUES(%d,%d)", rid, srcId);
+  }
+  if( !isDephantomize && bag_find(&contentCache.missing, rid) &&
+      (srcId==0 || content_is_available(srcId)) ){
+    content_mark_available(rid);
+  }
+  if( isDephantomize ){
+    after_dephantomize(rid, 0);
   }
 
   /* Add the element to the unclustered table if has never been
   ** previously seen.
   */

Modified src/delta.c from [c73001b47f] to [abb4c0444e].

@@ -196,24 +196,31 @@
 }
 
 /*
 ** Compute a 32-bit checksum on the N-byte buffer.  Return the result.
 */
-static unsigned int checksum(const char *zIn, int N){
-  const unsigned char *z = (const unsigned char*)zIn;
-  unsigned int sum = 0;
-  while( N>=4 ){
+static unsigned int checksum(const char *zIn, size_t N){
+  const unsigned char *z = (const unsigned char *)zIn;
+  unsigned sum = 0;
+  while(N >= 16){
+    sum += ((unsigned)z[0] + z[4] + z[8] + z[12]) << 24;
+    sum += ((unsigned)z[1] + z[5] + z[9] + z[13]) << 16;
+    sum += ((unsigned)z[2] + z[6] + z[10]+ z[14]) << 8;
+    sum += ((unsigned)z[3] + z[7] + z[11]+ z[15]);
+    z += 16;
+    N -= 16;
+  }
+  while(N >= 4){
     sum += (z[0]<<24) | (z[1]<<16) | (z[2]<<8) | z[3];
     z += 4;
     N -= 4;
   }
-  if( N>0 ){
-    unsigned char zBuf[4];
-    memset(zBuf, 0, sizeof(zBuf));
-    memcpy(zBuf, z, N);
-    z = zBuf;
-    sum += (z[0]<<24) | (z[1]<<16) | (z[2]<<8) | z[3];
+  switch(N){
+    case 3:   sum += (z[2] << 8);
+    case 2:   sum += (z[1] << 16);
+    case 1:   sum += (z[0] << 24);
+    default:  ;
   }
   return sum;
 }
 
 /*

Modified src/rebuild.c from [5e55de3bcd] to [b5c158aea9].

@@ -53,10 +53,87 @@
 @    sqlcode text             -- An SQL SELECT statement for this report
 @ );
 ;
 
 /*
+** Variables used for progress information
+*/
+static int totalSize;       /* Total number of artifacts to process */
+static int processCnt;      /* Number processed so far */
+static int ttyOutput;       /* Do progress output */
+
+/*
+** Called after each artifact is processed
+*/
+static void rebuild_step_done(void){
+  if( ttyOutput ){
+    processCnt++;
+    printf("%d (%d%%)...\r", processCnt, (processCnt*100/totalSize));
+    fflush(stdout);
+  }
+}
+
+/*
+** Rebuild cross-referencing information for the artifact
+** rid with content pBase and all of its descendents.  This
+** routine clears the content buffer before returning.
+*/
+static void rebuild_step(int rid, Blob *pBase){
+  Stmt q1;
+  Bag children;
+  Blob copy;
+  Blob *pUse;
+  int nChild, i, cid;
+
+  /* Find all children of artifact rid */
+  db_prepare(&q1, "SELECT rid FROM delta WHERE srcid=%d", rid);
+  bag_init(&children);
+  while( db_step(&q1)==SQLITE_ROW ){
+    bag_insert(&children, db_column_int(&q1, 0));
+  }
+  nChild = bag_count(&children);
+  db_finalize(&q1);
+
+  /* Crosslink the artifact */
+  if( nChild==0 ){
+    pUse = pBase;
+  }else{
+    blob_copy(&copy, pBase);
+    pUse = &copy;
+  }
+  manifest_crosslink(rid, pUse);
+  blob_reset(pUse);
+
+  /* Call all children recursively */
+  for(cid=bag_first(&children), i=1; cid; cid=bag_next(&children, cid), i++){
+    Stmt q2;
+    int sz;
+    if( nChild==i ){
+      pUse = pBase;
+    }else{
+      blob_copy(&copy, pBase);
+      pUse = &copy;
+    }
+    db_prepare(&q2, "SELECT content, size FROM blob WHERE rid=%d", cid);
+    if( db_step(&q2)==SQLITE_ROW && (sz = db_column_int(&q2,1))>=0 ){
+      Blob delta;
+      db_ephemeral_blob(&q2, 0, &delta);
+      blob_uncompress(&delta, &delta);
+      blob_delta_apply(pUse, &delta, pUse);
+      blob_reset(&delta);
+      db_finalize(&q2);
+      rebuild_step(cid, pUse);
+    }else{
+      db_finalize(&q2);
+      blob_reset(pUse);
+    }
+  }
+  bag_clear(&children);
+  rebuild_step_done();
+}
+
+/*
 ** Core function to rebuild the infomration in the derived tables of a
 ** fossil repository from the blobs. This function is shared between
 ** 'rebuild_database' ('rebuild') and 'reconstruct_cmd'
 ** ('reconstruct'), both of which have to regenerate this information
 ** from scratch.
@@ -64,16 +141,17 @@
 ** If the randomize parameter is true, then the BLOBs are deliberately
 ** extracted in a random order.  This feature is used to test the
 ** ability of fossil to accept records in any order and still
 ** construct a sane repository.
 */
-int rebuild_db(int randomize, int ttyOutput){
+int rebuild_db(int randomize, int doOut){
   Stmt s;
   int errCnt = 0;
   char *zTable;
-  int cnt = 0;
 
+  ttyOutput = doOut;
+  processCnt = 0;
   db_multi_exec(zSchemaUpdates);
   for(;;){
     zTable = db_text(0,
        "SELECT name FROM sqlite_master"
        " WHERE type='table'"
@@ -91,30 +169,26 @@
      " WHERE rid IN (SELECT rid FROM shun JOIN blob USING(uuid))"
   );
   db_multi_exec(
     "DELETE FROM config WHERE name IN ('remote-code', 'remote-maxid')"
   );
+  totalSize = db_int(0, "SELECT count(*) FROM blob");
   db_prepare(&s,
-     "SELECT rid, size FROM blob %s"
-     " WHERE NOT EXISTS(SELECT 1 FROM shun WHERE uuid=blob.uuid)",
-     randomize ? "ORDER BY random()" : ""
+     "SELECT rid, size FROM blob"
+     " WHERE NOT EXISTS(SELECT 1 FROM shun WHERE uuid=blob.uuid)"
+     "   AND NOT EXISTS(SELECT 1 FROM delta WHERE rid=blob.rid)"
   );
   while( db_step(&s)==SQLITE_ROW ){
     int rid = db_column_int(&s, 0);
     int size = db_column_int(&s, 1);
     if( size>=0 ){
       Blob content;
-      if( ttyOutput ){
-        cnt++;
-        printf("%d...\r", cnt);
-        fflush(stdout);
-      }
       content_get(rid, &content);
-      manifest_crosslink(rid, &content);
-      blob_reset(&content);
+      rebuild_step(rid, &content);
     }else{
       db_multi_exec("INSERT OR IGNORE INTO phantom VALUES(%d)", rid);
+      rebuild_step_done();
     }
   }
   db_finalize(&s);
   if( ttyOutput ){
     printf("\n");
@@ -141,14 +215,15 @@
   if( g.argc!=3 ){
     usage("REPOSITORY-FILENAME");
   }
   db_open_repository(g.argv[2]);
   db_begin_transaction();
+  ttyOutput = 1;
   errCnt = rebuild_db(randomizeFlag, 1);
   if( errCnt && !forceFlag ){
     printf("%d errors. Rolling back changes. Use --force to force a commit.\n",
             errCnt);
     db_end_transaction(1);
   }else{
     db_end_transaction(0);
   }
 }

Modified src/verify.c from [8d633e10bb] to [f05153c954].

@@ -73,10 +73,11 @@
 /*
 ** This routine is called just prior to each commit operation.
 */
 static int verify_at_commit(void){
   int rid;
+  content_clear_cache();
   inFinalVerify = 1;
   rid = bag_first(&toVerify);
   while( rid>0 ){
     verify_rid(rid);
     rid = bag_next(&toVerify, rid);