Fossil: Check-in [9c57055025]

Overview

SHA1 Hash: 9c5705502507e993fa2486d31093571139db2128

Date: 2007-12-02 05:49:00

User: aku

Comment: Performance bugfix. nextmap/premap can still be performance killers and memory hogs. Moved the computation of sucessor changesets down to the type-dependent code (new methods) and the SQL database, i.e. the C level. In the current setup it was possible that the DB would deliver us millions of file-level dependency pairs which the Tcl level would then reduce to tens of actual changeset dependencies. Tcl did not cope well with that amount of data. Now the reduction happens in the query itself. A concrete example was a branch in the Tcl CVS generating nearly 9 million pairs, which reduced to roughly 200 changeset dependencies. This blew the memory out of the water and the converter ground to a halt, busily swapping. Ok, causes behind us, also added another index on 'csitem(iid)' to speed the search for changesets from the revisions, tags, and branches.

Timelines: ancestors | descendants | both | trunk

Other Links: files | ZIP archive | manifest

SHA1 Hash:	9c5705502507e993fa2486d31093571139db2128
Date:	2007-12-02 05:49:00
User:	aku
Comment:	Performance bugfix. nextmap/premap can still be performance killers and memory hogs. Moved the computation of sucessor changesets down to the type-dependent code (new methods) and the SQL database, i.e. the C level. In the current setup it was possible that the DB would deliver us millions of file-level dependency pairs which the Tcl level would then reduce to tens of actual changeset dependencies. Tcl did not cope well with that amount of data. Now the reduction happens in the query itself. A concrete example was a branch in the Tcl CVS generating nearly 9 million pairs, which reduced to roughly 200 changeset dependencies. This blew the memory out of the water and the converter ground to a halt, busily swapping. Ok, causes behind us, also added another index on 'csitem(iid)' to speed the search for changesets from the revisions, tags, and branches.
Timelines:	ancestors \| descendants \| both \| trunk
Other Links:	files \| ZIP archive \| manifest

Tags And Properties

branch=trunk inherited from [a28c83647d]
sym-trunk inherited from [a28c83647d]

Changes

[hide diffs]

Modified tools/cvs2fossil/lib/c2f_pinitcsets.tcl from [39a67db2c5] to [f56c12b551].

@@ -93,11 +93,12 @@
 	    cid  INTEGER  NOT NULL  REFERENCES changeset,
 	    pos  INTEGER  NOT NULL,
 	    iid  INTEGER  NOT NULL, -- REFERENCES revision|tag|branch
 	    UNIQUE (cid, pos),
 	    UNIQUE (cid, iid)
-	}
+	} { iid }
+	# Index on: iid (successor/predecessor retrieval)
 
 	project::rev getcstypes
 	return
     }

Modified tools/cvs2fossil/lib/c2f_prev.tcl from [e427524ce6] to [a4cf35790a].

@@ -87,13 +87,25 @@
     delegate method istag      to mytypeobj
 
     method setpos {p} { set mypos $p ; return }
     method pos    {}  { return $mypos }
 
+    # result = list (changeset)
+    method successors {} {
+	return [struct::list map \
+		    [$mytypeobj cs_successors $myitems] \
+		    [mytypemethod of]]
+    }
+
     # result = dict (item -> list (changeset))
     method successormap {} {
-	# NOTE / FUTURE: Possible bottleneck.
+	# NOTE / FUTURE: Definitive bottleneck (can be millions of pairs).
+	#
+	# Only user is pass 9, computing the limits of backward
+	# branches per branch in the changeset. TODO: Fold that into
+	# the SQL query, i.e. move the crunching from Tcl to C.
+
 	array set tmp {}
 	foreach {rev children} [$self nextmap] {
 	    foreach child $children {
 		lappend tmp($rev) $myitemmap($child)
 	    }
@@ -100,25 +112,18 @@
 	    set tmp($rev) [lsort -unique $tmp($rev)]
 	}
 	return [array get tmp]
     }
 
-    # result = list (changeset)
-    method successors {} {
-	# NOTE / FUTURE: Possible bottleneck.
-	set csets {}
-	foreach {_ children} [$self nextmap] {
-	    foreach child $children {
-		lappend csets $myitemmap($child)
-	    }
-	}
-	return [lsort -unique $csets]
-    }
-
     # result = dict (item -> list (changeset))
     method predecessormap {} {
-	# NOTE / FUTURE: Possible bottleneck.
+	# NOTE / FUTURE: Definitive bottleneck (can be millions of pairs).
+	#
+	# Only user is pass 9, computing the limits of backward
+	# branches per branch in the changeset. TODO: Fold that into
+	# the SQL query, i.e. move the crunching from Tcl to C.
+
 	array set tmp {}
 	foreach {rev children} [$self premap] {
 	    foreach child $children {
 		lappend tmp($rev) $myitemmap($child)
 	    }
@@ -1057,10 +1062,68 @@
 	"] {
 	    lappend dependencies([list rev $rid]) [list sym::branch $parent]
 	}
 	return
     }
+
+    # result = list (changeset-id)
+    typemethod cs_successors {revisions} {
+        # This is a variant of 'successors' which maps the low-level
+        # data directly to the associated changesets. I.e. instead
+        # millions of dependency pairs (in extreme cases (Example: Tcl
+        # CVS)) we return a very short and much more manageable list
+        # of changesets.
+
+	set theset ('[join $revisions {','}]')
+	return [state run "
+	    SELECT C.cid
+	    FROM   revision R, csitem CI, changeset C
+	    WHERE  R.rid   IN $theset     -- Restrict to revisions of interest
+	    AND    R.child IS NOT NULL    -- Has primary child
+            AND    CI.iid = R.rid
+            AND    C.cid = CI.cid
+            AND    C.type = 0
+    UNION
+	    SELECT C.cid
+	    FROM   revision R, revisionbranchchildren B, csitem CI, changeset C
+	    WHERE  R.rid   IN $theset     -- Restrict to revisions of interest
+	    AND    R.rid = B.rid          -- Select subset of branch children
+            AND    CI.iid = R.rid
+            AND    C.cid = CI.cid
+            AND    C.type = 0
+    UNION
+	    SELECT C.cid
+	    FROM   revision R, revision RA, csitem CI, changeset C
+	    WHERE  R.rid   IN $theset      -- Restrict to revisions of interest
+	    AND    R.isdefault             -- Restrict to NTDB
+	    AND    R.dbchild IS NOT NULL   -- and last NTDB belonging to trunk
+	    AND    RA.rid = R.dbchild      -- Go directly to trunk root
+	    AND    RA.child IS NOT NULL    -- Has primary child.
+            AND    CI.iid = R.rid
+            AND    C.cid = CI.cid
+            AND    C.type = 0
+            AND    CI.iid = R.rid
+            AND    C.cid = CI.cid
+            AND    C.type = 0
+    UNION
+	    SELECT C.cid
+	    FROM   revision R, tag T, csitem CI, changeset C
+	    WHERE  R.rid in $theset
+	    AND    T.rev = R.rid
+            AND    CI.iid = T.tid
+            AND    C.cid = CI.cid
+            AND    C.type = 1
+    UNION
+	    SELECT C.cid
+	    FROM   revision R, branch B, csitem CI, changeset C
+	    WHERE  R.rid in $theset
+	    AND    B.root = R.rid
+            AND    CI.iid = B.bid
+            AND    C.cid = CI.cid
+            AND    C.type = 2
+	"]
+    }
 }
 
 # # ## ### ##### ######## ############# #####################
 ## Helper singleton. Commands for tag symbol changesets.
 
@@ -1136,10 +1199,16 @@
 	    AND    T.sid = P.sid
 	    AND    P.pid = TX.sid
 	"] {
 	    lappend dependencies([list sym::tag $tid]) [list sym::tag $parent]
 	}
+	return
+    }
+
+    # result = list (changeset-id)
+    typemethod cs_successors {tags} {
+	# Tags have no successors.
 	return
     }
 }
 
 # # ## ### ##### ######## ############# #####################
@@ -1250,10 +1319,49 @@
 	    AND    B.sid = P.sid
 	    AND    P.pid = T.sid
 	"] {
 	    lappend dependencies([list sym::branch $bid]) [list sym::tag $parent]
 	}
+	return
+    }
+
+    # result = list (changeset-id)
+    typemethod cs_successors {branches} {
+        # This is a variant of 'successors' which maps the low-level
+        # data directly to the associated changesets. I.e. instead
+        # millions of dependency pairs (in extreme cases (Example: Tcl
+        # CVS)) we return a very short and much more manageable list
+        # of changesets.
+
+	set theset ('[join $branches {','}]')
+        return [state run "
+	    SELECT C.cid
+	    FROM   branch B, revision R, csitem CI, changeset C
+	    WHERE  B.bid IN $theset
+	    AND    B.first = R.rid
+            AND    CI.iid = R.rid
+            AND    C.cid = CI.cid
+            AND    C.type = 0
+    UNION
+	    SELECT C.cid
+	    FROM   branch B, preferedparent P, branch BX, csitem CI, changeset C
+	    WHERE  B.bid IN $theset
+	    AND    B.sid = P.pid
+	    AND    BX.sid = P.sid
+            AND    CI.iid = BX.bid
+            AND    C.cid = CI.cid
+            AND    C.type = 2
+    UNION
+	    SELECT C.cid
+	    FROM   branch B, preferedparent P, tag T, csitem CI, changeset C
+	    WHERE  B.bid IN $theset
+	    AND    B.sid = P.pid
+	    AND    T.sid = P.sid
+            AND    CI.iid = T.tid
+            AND    C.cid = CI.cid
+            AND    C.type = 1
+	"]
 	return
     }
 
     # # ## ### ##### ######## #############
     ## Configuration