Check-in [27ed4f7dc3]
Not logged in
Overview

SHA1 Hash:27ed4f7dc3a0c032f255bc0d6a3734c00f68a65d
Date: 2008-02-16 06:46:41
User: aku
Comment:Extended pass InitCsets and underlying code with more log output geared towards memory introspection, and added markers for special locations. Extended my notes with general observations from the first test runs over my example CVS repositories.
Timelines: ancestors | descendants | both | trunk
Other Links: files | ZIP archive | manifest

Tags And Properties
Changes
[hide diffs]

Modified cvs2fossil.txt from [7755e1a714] to [0f9001c9f0].

@@ -8,10 +8,56 @@
 	multiple sub-projects in one go.
 
 *	We have to look into the pass 'InitCsets' and hunt for the
 	cause of the large amount of memory it is gobbling up.
 
+	Results from the first look using the new memory tracking
+	subsystem:
+
+	(1) The general architecture, workflow, is a bit wasteful. All
+	    changesets are generated and kept in memory before getting
+	    persisted. This means that allocated memory piles up over
+	    time, with later changesets pushing the boundaries. This
+	    is made worse by the fact that some preliminary changesets seem
+	    to require a lot of temporary memory as part of getting
+	    broken down into the actual ones. InitializeBreakState
+	    seems to be the culprit here. Its memory usage is possibly
+	    quadratic in the number of items in the changeset.
+
+	(2) A number of small inefficiencies. Like 'state eval' always
+	    pulling the whole result into memory before processing it
+	    with 'foreach'. These are potentially large lists.
+
+	(3) We maintain an in-memory map from tagged items to their
+	    changesets. While this is needed later in the sorting
+	    passes, during the creation this is wasted space. And also
+	    wasted time, to maintain it during the creation and
+	    breaking.
+
+	Changes:
+
+	(a) Re-architect to create, break, and persist changesets one
+	    by one, completely releasing all associated in-memory data
+	    before going to the next. Should be low-hanging fruit with
+	    high impact, as we have all the necessary operations
+	    already, just not in that order, and that alone should
+	    already keep the pile from forming, making the spikes of
+	    (2) more manageable.
+
+	(b) Look into the smaller problems described in (2), and
+	    especially (3). These should still be low-hanging fruit,
+	    although of lesser effect than (a). For (3) disable the
+	    map and its maintenance during construction, and put it
+	    into a separate command, to be used when loading the
+	    created changesets at the end.
+
+	(c) With larger effect, but more difficult to achieve, go into
+	    command 'InitializeBreakState' and the preceding
+	    'internalsuccessors', and rearchitect it. Definitely not a
+	    low-hanging fruit. Possibly also something we can skip if
+	    doing (a) had a large enough effect.
+
 *	Look at the dependencies on external packages and consider
 	which of them can be moved into the importer, either as a
 	simple utility command, or wholesale.
 
 	struct::list
@@ -37,6 +83,6 @@
 
 	snit
 		In toto
 
 	sqlite3
-		In tota
+		In toto

Modified tools/cvs2fossil/lib/c2f_pinitcsets.tcl from [eda30d7ee3] to [9999ef81cc].

@@ -19,10 +19,11 @@
 
 package require Tcl 8.4                               ; # Required runtime.
 package require snit                                  ; # OO system.
 package require vc::tools::misc                       ; # Text formatting.
 package require vc::tools::log                        ; # User feedback.
+package require vc::tools::mem                        ; # Memory tracking.
 package require vc::fossil::import::cvs::repository   ; # Repository management.
 package require vc::fossil::import::cvs::state        ; # State storage.
 package require vc::fossil::import::cvs::integrity    ; # State integrity checks.
 package require vc::fossil::import::cvs::project::rev ; # Project level changesets
 
@@ -179,21 +180,29 @@
 	#       early, extending them with all their revisions. This
 	#       however would mean lots of (slow) method invocations
 	#       on the csets. Doing it like this, late creation, means
 	#       less such calls. None, but the creation itself.
 
+	log write 14 initcsets meta_begin
+	mem::mark
 	foreach {mid rid pid} [state run {
 	    SELECT M.mid, R.rid, M.pid
 	    FROM   revision R, meta M   -- R ==> M, using PK index of M.
 	    WHERE  R.mid = M.mid
 	    ORDER  BY M.mid, R.date
 	}] {
+	    log write 14 initcsets meta_next
+
 	    if {$lastmeta != $mid} {
 		if {[llength $revisions]} {
 		    incr n
 		    set  p [repository projectof $lastproject]
+		    log write 14 initcsets meta_cset_begin
+		    mem::mark
 		    project::rev %AUTO% $p rev $lastmeta $revisions
+		    log write 14 initcsets meta_cset_done
+		    mem::mark
 		    set revisions {}
 		}
 		set lastmeta    $mid
 		set lastproject $pid
 	    }
@@ -201,19 +210,27 @@
 	}
 
 	if {[llength $revisions]} {
 	    incr n
 	    set  p [repository projectof $lastproject]
+	    log write 14 initcsets meta_cset_begin
+	    mem::mark
 	    project::rev %AUTO% $p rev $lastmeta $revisions
+	    log write 14 initcsets meta_cset_done
+	    mem::mark
 	}
+
+	log write 14 initcsets meta_done
+	mem::mark
 
 	log write 4 initcsets "Created [nsp $n {revision changeset}]"
 	return
     }
 
     proc CreateSymbolChangesets {} {
 	log write 3 initcsets {Create changesets based on symbols}
+	mem::mark
 
 	# Tags and branches induce changesets as well, containing the
 	# revisions they are attached to (tags), or spawned from
 	# (branches).
 
@@ -279,10 +296,11 @@
 	    set  p [repository projectof $lastproject]
 	    project::rev %AUTO% $p sym::branch $lastsymbol $branches
 	}
 
 	log write 4 initcsets "Created [nsp $n {symbol changeset}]"
+	mem::mark
 	return
     }
 
     proc BreakInternalDependencies {} {
 	# This code operates on the revision changesets created by
@@ -292,19 +310,21 @@
 	# by splitting the problematic changeset into multiple
 	# fragments. The results are changesets which have no internal
 	# dependencies, only external ones.
 
 	log write 3 initcsets {Break internal dependencies}
+	mem::mark
 	set old [llength [project::rev all]]
 
 	foreach cset [project::rev all] {
 	    $cset breakinternaldependencies
 	}
 
 	set n [expr {[llength [project::rev all]] - $old}]
 	log write 4 initcsets "Created [nsp $n {additional revision changeset}]"
 	log write 4 initcsets Ok.
+	mem::mark
 	return
     }
 
     proc PersistTheChangesets {} {
 	log write 3 initcsets "Saving [nsp [llength [project::rev all]] {initial changeset}] to the persistent state"
@@ -333,10 +353,13 @@
 	namespace import ::vc::fossil::import::cvs::repository
 	namespace import ::vc::fossil::import::cvs::state
 	namespace import ::vc::fossil::import::cvs::integrity
 	namespace eval project {
 	    namespace import ::vc::fossil::import::cvs::project::rev
+	}
+	namespace eval mem {
+	    namespace import ::vc::tools::mem::mark
 	}
 	namespace import ::vc::tools::misc::*
 	namespace import ::vc::tools::log
 	log register initcsets
     }

Modified tools/cvs2fossil/lib/c2f_prev.tcl from [d233796d95] to [433b16c0c0].

@@ -133,11 +133,12 @@
 	$mytypeobj successors tmp $myitems
 	return [array get tmp]
     }
 
     method breakinternaldependencies {} {
-
+	log write 14 csets {[$self str] BID}
+	vc::tools::mem::mark
 	##
 	## NOTE: This method, maybe in conjunction with its caller
 	##       seems to be a memory hog, especially for large
 	##       changesets, with 'large' meaning to have a 'long list
 	##       of items, several thousand'. Investigate where the
@@ -166,13 +167,16 @@
 	# Array of dependencies (parent -> child). This is pulled from
 	# the state, and limited to successors within the changeset.
 
 	array set dependencies {}
 	$mytypeobj internalsuccessors dependencies $myitems
-	if {![array size dependencies]} {return 0} ; # Nothing to break.
+	if {![array size dependencies]} {
+	    return 0
+	} ; # Nothing to break.
 
 	log write 5 csets ...[$self str].......................................................
+	vc::tools::mem::mark
 
 	# We have internal dependencies to break. We now iterate over
 	# all positions in the list (which is chronological, at least
 	# as far as the timestamps are correct and unique) and
 	# determine the best position for the break, by trying to
@@ -1060,10 +1064,12 @@
     # var(dv) = dict (revision -> list (revision))
     typemethod internalsuccessors {dv revisions} {
 	upvar 1 $dv dependencies
 	set theset ('[join $revisions {','}]')
 
+	log write 14 cset internalsuccessors
+
 	# See 'successors' below for the main explanation of
 	# the various cases. This piece is special in that it
 	# restricts the successors we look for to the same set of
 	# revisions we start from. Sensible as we are looking for
 	# changeset internal dependencies.
@@ -1121,10 +1127,12 @@
 
 	# We allow revisions to be far apart in time in the same
 	# changeset, but in turn need the pseudo-dependencies to
 	# handle this.
 
+	log write 14 cset pseudo-internalsuccessors
+
 	array set fids {}
 	foreach {rid fid} [state run [subst -nocommands -nobackslashes {
 	    SELECT R.rid, R.fid
             FROM   revision R
             WHERE  R.rid IN $theset
@@ -1141,10 +1149,12 @@
 		    set dep($a,$b) .
 		    set dep($b,$a) .
 		}
 	    }
 	}
+
+	log write 14 cset complete
 	return
     }
 
     # result = 4-list (itemtype itemid nextitemtype nextitemid ...)
     typemethod loops {revisions} {
@@ -1611,22 +1621,25 @@
 
 	# Set up the helper singletons
 	namespace eval rev {
 	    namespace import ::vc::fossil::import::cvs::state
 	    namespace import ::vc::fossil::import::cvs::integrity
+	    namespace import ::vc::tools::log
 	}
 	namespace eval sym::tag {
 	    namespace import ::vc::fossil::import::cvs::state
 	    namespace import ::vc::fossil::import::cvs::integrity
+	    namespace import ::vc::tools::log
 	}
 	namespace eval sym::branch {
 	    namespace import ::vc::fossil::import::cvs::state
 	    namespace import ::vc::fossil::import::cvs::integrity
+	    namespace import ::vc::tools::log
 	}
     }
 }
 
 # # ## ### ##### ######## ############# #####################
 ## Ready
 
 package provide vc::fossil::import::cvs::project::rev 1.0
 return