Overview
SHA1 Hash: | f637d4220695430e25040509f12f050d2f4cd42f |
---|---|
Date: | 2008-02-24 18:01:40 |
User: | aku |
Comment: | Updated my notes regarding memory usage. Converted more locations to incremental query processing via 'state foreachrow', which is now used throughout the importer. |
Timelines: | ancestors | descendants | both | trunk |
Other Links: | files | ZIP archive | manifest |
Tags And Properties
- branch=trunk inherited from [a28c83647d]
- sym-trunk inherited from [a28c83647d]
Changes
[hide diffs] Modified cvs2fossil.txt from [0f9001c9f0] to [3bce849e6e].
@@ -5,58 +5,36 @@ * Not yet able to handle the specification of multiple projects for one CVS repository. I.e. I can, for example, import all of tcllib, or a single subproject of tcllib, like tklib, but not multiple sub-projects in one go. -* We have to look into the pass 'InitCsets' and hunt for the - cause of the large amount of memory it is gobbling up. - - Results from the first look using the new memory tracking - subsystem: - - (1) The general architecture, workflow, is a bit wasteful. All - changesets are generated and kept in memory before getting - persisted. This means that allocated memory piles up over - time, with later changesets pushing the boundaries. This - is made worse that some of the preliminary changesets seem - to require a lot of temporary memory as part of getting - broken down into the actual ones. InititializeBreakState - seems to be the culprit here. Its memory usage is possibly - quadratic in the number of items in the changeset. - - (2) A number of small inefficiencies. Like 'state eval' always - pulling the whole result into memory before processing it - with 'foreach'. Here potentially large lists. - - (3) We maintain an in-memory map from tagged items to their - changesets. While this is needed later in the sorting - passes during the creation this is wasted space. And also - wasted time, to maintain it during the creation and - breaking. - - Changes: - - (a) Re-architect to create, break, and persist changesets one - by one, completely releasing all associated in-memory data - before going to the next. Should be low-hanging fruit with - high impact, as we have all the necessary operations - already, just not in that order, and that alone should - already keep the pile from forming, making the spikes of - (2) more manageable. - - (b) Look into the smaller problems described in (2), and - especially (3). These should still be low-hanging fruit, - although of lesser effect than (a). 
For (3) disable the - map and its maintenace during construction, and put it - into a separate command, to be used when loading the - created changesets at the end. - - (c) With larger effect, but more difficult to achieve, go into - command 'InitializeBreakState' and the preceding - 'internalsuccessors', and rearchitect it. Definitely not a - low-hanging fruit. Possibly also something we can skip if - doing (a) had a large enough effect. +* Consider to rework the breaker- and sort-passes so that they + do not need all changesets as objects in memory. + + Current memory consumption after all changesets are loaded: + + bwidget 6971627 6.6 + cvs-memchan 4634049 4.4 + cvs-sqlite 45674501 43.6 + cvs-trf 8781289 8.4 + faqs 2835116 2.7 + libtommath 4405066 4.2 + mclistbox 3350190 3.2 + newclock 5020460 4.8 + oocore 4064574 3.9 + sampleextension 4729932 4.5 + tclapps 8482135 8.1 + tclbench 4116887 3.9 + tcl_bignum 2545192 2.4 + tclconfig 4105042 3.9 + tcllib 31707688 30.2 + tcltutorial 3512048 3.3 + tcl 109926382 104.8 + thread 8953139 8.5 + tklib 13935220 13.3 + tk 66149870 63.1 + widget 2625609 2.5 * Look at the dependencies on external packages and consider which of them can be moved into the importer, either as a simple utility command, or wholesale.
Modified tools/cvs2fossil/lib/c2f_file.tcl from [a5aa8fb6dc] to [8d01091d5f].
@@ -314,32 +314,38 @@ set earcs {} ; # Arcs for expansion graph set zarcs {} ; # Arcs for zip graph set revmap {} ; # path -> rid map to later merge uuid information - foreach {rid revnr parent child coff clen cid cparent} [state run { - SELECT B.rid, R.rev, R.parent, R.child, B.coff, B.clen, B.bid, B.pid + state foreachrow { + SELECT B.rid AS xrid, + R.rev AS revnr, + R.child AS xchild, + B.coff AS xcoff, + B.clen AS xclen, + B.bid AS cid, + B.pid AS cparent FROM blob B LEFT OUTER JOIN revision R ON B.rid = R.rid WHERE B.fid = $myid - }] { + } { # Main data are blobs, most will have revisions, but not # all. The expansion graph is blob based, whereas the # recompression graph is revision based. if {$revnr ne ""} { # Blob has revision, extend recompression graph. - lappend revmap r$revnr $rid - - $zp node insert $rid - $zp node set $rid revnr $revnr - $zp node set $rid label <$revnr> - - if {$child ne ""} { - lappend zarcs $child $rid + lappend revmap r$revnr $xrid + + $zp node insert $xrid + $zp node set $xrid revnr $revnr + $zp node set $xrid label <$revnr> + + if {$xchild ne ""} { + lappend zarcs $xchild $xrid } } else { # We fake a revnr for the blobs which have no # revision, for use in the expansion graph. set revnr ghost$cid @@ -346,11 +352,11 @@ } # Now the expansion graph. $ex node insert $cid - $ex node set $cid text [list $coff $clen] + $ex node set $cid text [list $xcoff $xclen] $ex node set $cid revnr $revnr $ex node set $cid label <$revnr> if {$cparent ne ""} { # The expansion arcs go from baseline to delta
Modified tools/cvs2fossil/lib/c2f_frev.tcl from [f35156ab47] to [5f8c3fdd03].
@@ -513,13 +513,13 @@ {1 0} add {1 1} nothing } typemethod getopcodes {} { - foreach {id name} [state run { + state foreachrow { SELECT oid, name FROM optype; - }] { set myopcode($name) $id } + } { set myopcode($name) $oid } return } typevariable myopcode -array {}
Modified tools/cvs2fossil/lib/c2f_pbreakacycle.tcl from [3f3d02eb94] to [9de68fa960].
@@ -111,11 +111,13 @@ log write 2 breakacycle {Loading revision commit order} set n 0 state transaction { - foreach {cid pos} [state run { SELECT cid, pos FROM csorder }] { + state foreachrow { + SELECT cid, pos FROM csorder + } { log progress 2 breakacycle $n {} set cset [project::rev of $cid] $cset setpos $pos set mycset($pos) $cset lappend myrevisionchangesets $cset
Modified tools/cvs2fossil/lib/c2f_pcollsym.tcl from [7383c81d45] to [596822d4e2].
@@ -112,24 +112,24 @@ # # ## ### ##### ######## ############# ## Internal methods ## TODO: Move UnconvertedSymbols, BadSymbolTypes, BlockedIncludes, - ## InvalidTags to the integrity module? + ## TODO: InvalidTags to the integrity module? proc UnconvertedSymbols {} { # Paranoia - Have we left symbols without conversion # information (i.e. with type 'undefined') ? set undef [project::sym undef] - foreach {pname sname} [state run { - SELECT P.name, S.name + state foreachrow { + SELECT P.name AS pname, S.name AS sname FROM symbol S, project P WHERE S.type = $undef -- Restrict to undefined symbols AND P.pid = S.pid -- Get project for symbol - }] { + } { trouble fatal "$pname : The symbol '$sname' was left undefined" } return } @@ -136,16 +136,16 @@ proc BadSymbolTypes {} { # Paranoia - Have we left symbols with bogus conversion # information (type out of the valid range (excluded, branch, # tag)) ? - foreach {pname sname} [state run { - SELECT P.name, S.name + state foreachrow { + SELECT P.name AS pname, S.name AS sname FROM symbol S, project P WHERE S.type NOT IN (0,1,2) -- Restrict to symbols with bogus type codes AND P.pid = S.pid -- Get project of symbol - }] { + } { trouble fatal "$pname : The symbol '$sname' has no proper conversion type" } return } @@ -153,19 +153,19 @@ # Paranoia - Have we scheduled symbols for exclusion without # also excluding their dependent symbols ? set excl [project::sym excluded] - foreach {pname sname bname} [state run { - SELECT P.name, S.name, SB.name + state foreachrow { + SELECT P.name AS pname, S.name AS sname, SB.name AS bname FROM symbol S, blocker B, symbol SB, project P WHERE S.type = $excl -- Restrict to excluded symbols AND S.sid = B.sid -- Get symbols blocking them AND B.bid = SB.sid -- and AND SB.type != $excl -- which are not excluded themselves AND P.pid = S.pid -- Get project of symbol - }] { + } { trouble fatal "$pname : The symbol '$sname' cannot be excluded as the unexcluded symbol '$bname' depends on it." 
} return } @@ -179,17 +179,17 @@ # left to the heuristics, most specifically # 'project::sym.HasCommits()'. set tag [project::sym tag] - foreach {pname sname} [state run { - SELECT P.name, S.name + state foreachrow { + SELECT P.name AS pname, S.name AS sname FROM project P, symbol S WHERE S.type = $tag -- Restrict to tag symbols AND S.commit_count > 0 -- which have revisions committed to them AND P.pid = S.pid -- Get project of symbol - }] { + } { trouble fatal "$pname : The symbol '$sname' cannot be forced to be converted as tag because it has commits." } return } @@ -224,12 +224,17 @@ # each candidate overwriting all previous # selections. Note that we ignore excluded symbol, we # do not care about their prefered parents and do not # attempt to compute them. - foreach {s p sname pname prname votes} [state run { - SELECT S.sid, P.pid, S.name, SB.name, PR.name, P.n + state foreachrow { + SELECT S.sid AS xs, + P.pid AS xp, + S.name AS sname, + SB.name AS pname, + PR.name AS prname, + P.n AS votes FROM symbol S, parent P, symbol SB, project PR WHERE S.type != $excl -- Restrict to wanted symbols AND S.sid = P.sid -- Get possible parents of symbol AND P.pid = SB.sid -- and AND S.pid = PR.pid -- the project of the symbol @@ -238,14 +243,14 @@ -- Higher votes and smaller ids (= earlier branches) last -- We simply keep the last possible parent for each -- symbol. This parent will have the max number of votes -- for its symbol and will be the earliest created branch -- possible among all with many votes. - }] { + } { log write 9 pcollsym "Voting $votes for Parent($sname) = $pname" - set prefered($s) [list $p $sname $pname $prname] + set prefered($xs) [list $xp $sname $pname $prname] } # Phase II: Write the found preferences back into the table # this pass defined for it. @@ -262,20 +267,20 @@ # Phase III: Check the result that all symbols except for # trunks have a prefered parent. 
We also ignore # excluded symbols, as we intentionally did not # compute a prefered parent for them, see phase I. - foreach {pname sname} [state run { - SELECT PR.name, S.name + state foreachrow { + SELECT PR.name AS pname, S.name AS sname FROM symbol S LEFT OUTER JOIN preferedparent P ON S.sid = P.sid, -- From symbol to prefered parent project PR WHERE P.pid IS NULL -- restrict to symbols without a preference AND S.type != $excl -- which are not excluded AND S.name != ':trunk:' -- and are not a trunk AND S.pid = PR.pid -- get project of symbol - }] { + } { trouble fatal "$pname : '$sname' has no prefered parent." } # The reverse, having prefered parents for unknown symbols # cannot occur.
Modified tools/cvs2fossil/lib/c2f_pfiltersym.tcl from [b7f9a11eac] to [810af7f145].
@@ -147,20 +147,22 @@ # links in the database. array set ntdb {} array set link {} - foreach {id parent transfer} [state run { - SELECT R.rid, R.parent, R.dbchild + state foreachrow { + SELECT R.rid AS xid, + R.parent AS xparent, + R.dbchild AS transfer FROM revision R, symbol S WHERE R.lod = S.sid -- Get symbol of line-of-development of all revisions AND S.sid IN excludedsymbols -- Restrict to the excluded symbols AND R.isdefault -- Restrict to NTDB revisions - }] { - set ntdb($id) $parent + } { + set ntdb($xid) $xparent if {$transfer eq ""} continue - set link($id) $transfer + set link($xid) $transfer } foreach joint [array names link] { # The joints are the highest NTDB revisions which are # shared with their respective trunk. We disconnect from @@ -308,15 +310,15 @@ array set sn [state run { SELECT B.bid, S.name FROM branch B, symbol S WHERE B.sid = S.sid }] # Symbol names ... array set sx [state run { SELECT L.sid, L.name FROM symbol L }] # Files and projects. array set fpn {} - foreach {id fn pn} [state run { - SELECT F.fid, F.name, P.name + state foreachrow { + SELECT F.fid AS id, F.name AS fn, P.name AS pn FROM file F, project P WHERE F.pid = P.pid - }] { set fpn($id) [list $fn $pn] } + } { set fpn($id) [list $fn $pn] } set tagstoadjust [state run { SELECT T.tid, T.fid, T.lod, P.pid, S.name, R.rev, R.rid FROM tag T, preferedparent P, symbol S, revision R WHERE T.sid = P.sid -- For all tags, get left-hand of prefered parent via symbol @@ -531,27 +533,35 @@ array set sym {} set n 0 set t 0 set c 0 - foreach {s stype cc p ptype} [state run { - SELECT S.name, A.name, S.commit_count, P.name, B.name + state foreachrow { + SELECT S.name AS xs, + A.name AS stype, + S.commit_count AS cc, + P.name AS xp, + B.name AS ptype FROM tag T, symbol S, symbol P, symtype A, symtype B WHERE S.sid = T.sid AND P.sid = T.lod AND A.tid = S.type AND B.tid = P.type UNION - SELECT S.name, A.name, S.commit_count, P.name, B.name + SELECT S.name AS xs, + A.name AS stype, + S.commit_count 
AS cc, + P.name AS xp, + B.name AS ptype FROM branch B, symbol S, symbol P, symtype A, symtype B WHERE S.sid = B.sid AND P.sid = B.lod AND A.tid = S.type AND B.tid = P.type - }] { - lappend sym($s) $p $stype $ptype $cc - maxlen n $s + } { + lappend sym($xs) $xp $stype $ptype $cc + maxlen n $xs maxlen t $stype maxlen t $ptype maxlen c $cc }
Modified tools/cvs2fossil/lib/c2f_prev.tcl from [f8ed1ba0f8] to [dfc591f76c].
@@ -325,18 +325,21 @@ } proc Getrevisioninfo {revisions} { set theset ('[join $revisions {','}]') set revisions {} - #foreachrow - foreach {frid path fname revnr rop} [state run [subst -nocommands -nobackslashes { - SELECT U.uuid, F.visible, F.name, R.rev, R.op + state foreachrow [subst -nocommands -nobackslashes { + SELECT U.uuid AS frid, + F.visible AS path, + F.name AS fname, + R.rev AS revnr, + R.op AS rop FROM revision R, revuuid U, file F WHERE R.rid IN $theset -- All specified revisions AND U.rid = R.rid -- get fossil uuid of revision AND F.fid = R.fid -- get file of revision - }]] { + }] { lappend revisions $frid $path $fname/$revnr $rop } return $revisions } @@ -584,20 +587,19 @@ # the order given to them by the sort passes. Both the # filtering by project and sorting make use of 'project::rev # rev' impossible. set res {} - #foreachrow - foreach {cid cdate} [state run { - SELECT C.cid, T.date + state foreachrow { + SELECT C.cid AS xcid, T.date AS cdate FROM changeset C, cstimestamp T WHERE C.type = 0 -- limit to revision changesets AND C.pid = $projectid -- limit to changesets in project AND T.cid = C.cid -- get ordering information ORDER BY T.date -- sort into commit order - }] { - lappend res $myidmap($cid) $cdate + } { + lappend res $myidmap($xcid) $cdate } return $res } typemethod getcstypes {} { @@ -899,17 +901,16 @@ array set delta {} array set stamp {} set theset ('[join $revisions {','}]') - #foreachrow - foreach {rid time} [state run [subst -nocommands -nobackslashes { - SELECT R.rid, R.date + state foreachrow [subst -nocommands -nobackslashes { + SELECT R.rid AS xrid, R.date AS time FROM revision R WHERE R.rid IN $theset - }]] { - set stamp($rid) $time + }] { + set stamp($xrid) $time } log write 14 csets {IBS: stamp [array size stamp]} set n 0 @@ -1326,54 +1327,51 @@ # '1.2' revision) is a successor, if it exists. # Note that the branches spawned from the revisions, and the # tags associated with them are successors as well. 
- #foreachrow - foreach {rid child} [state run [subst -nocommands -nobackslashes { + state foreachrow [subst -nocommands -nobackslashes { -- (1) Primary child - SELECT R.rid, R.child + SELECT R.rid AS xrid, R.child AS xchild FROM revision R WHERE R.rid IN $theset -- Restrict to revisions of interest AND R.child IS NOT NULL -- Has primary child UNION -- (2) Secondary (branch) children - SELECT R.rid, B.brid + SELECT R.rid AS xrid, B.brid AS xchild FROM revision R, revisionbranchchildren B WHERE R.rid IN $theset -- Restrict to revisions of interest AND R.rid = B.rid -- Select subset of branch children UNION -- (4) Child of trunk root successor of last NTDB on trunk. - SELECT R.rid, RA.child + SELECT R.rid AS xrid, RA.child AS xchild FROM revision R, revision RA WHERE R.rid IN $theset -- Restrict to revisions of interest AND R.isdefault -- Restrict to NTDB AND R.dbchild IS NOT NULL -- and last NTDB belonging to trunk AND RA.rid = R.dbchild -- Go directly to trunk root AND RA.child IS NOT NULL -- Has primary child. - }]] { + }] { # Consider moving this to the integrity module. 
- integrity assert {$rid != $child} {Revision $rid depends on itself.} - lappend dependencies([list rev $rid]) [list rev $child] + integrity assert {$xrid != $xchild} {Revision $xrid depends on itself.} + lappend dependencies([list rev $xrid]) [list rev $xchild] } - #foreachrow - foreach {rid child} [state run [subst -nocommands -nobackslashes { - SELECT R.rid, T.tid + state foreachrow [subst -nocommands -nobackslashes { + SELECT R.rid AS xrid, T.tid AS xchild FROM revision R, tag T WHERE R.rid IN $theset -- Restrict to revisions of interest AND T.rev = R.rid -- Select tags attached to them - }]] { - lappend dependencies([list rev $rid]) [list sym::tag $child] + }] { + lappend dependencies([list rev $xrid]) [list sym::tag $xchild] } - #foreachrow - foreach {rid child} [state run [subst -nocommands -nobackslashes { - SELECT R.rid, B.bid + state foreachrow [subst -nocommands -nobackslashes { + SELECT R.rid AS xrid, B.bid AS xchild FROM revision R, branch B WHERE R.rid IN $theset -- Restrict to revisions of interest AND B.root = R.rid -- Select branches attached to them - }]] { - lappend dependencies([list rev $rid]) [list sym::branch $child] + }] { + lappend dependencies([list rev $xrid]) [list sym::branch $xchild] } return } # result = list (changeset-id) @@ -1581,38 +1579,35 @@ # The first revision committed on a branch, and all branches # and tags which have it as their prefered parent are the # successors of a branch. 
set theset ('[join $branches {','}]') - #foreachrow - foreach {bid child} [state run [subst -nocommands -nobackslashes { - SELECT B.bid, R.rid + state foreachrow [subst -nocommands -nobackslashes { + SELECT B.bid AS xbid, R.rid AS xchild FROM branch B, revision R WHERE B.bid IN $theset -- Restrict to branches of interest AND B.first = R.rid -- Get first revision on the branch - }]] { - lappend dependencies([list sym::branch $bid]) [list rev $child] + }] { + lappend dependencies([list sym::branch $xbid]) [list rev $xchild] } - #foreachrow - foreach {bid child} [state run [subst -nocommands -nobackslashes { - SELECT B.bid, BX.bid + state foreachrow [subst -nocommands -nobackslashes { + SELECT B.bid AS xbid, BX.bid AS xchild FROM branch B, preferedparent P, branch BX WHERE B.bid IN $theset -- Restrict to branches of interest AND B.sid = P.pid -- Get subordinate branches via the AND BX.sid = P.sid -- prefered parents of their symbols - }]] { - lappend dependencies([list sym::branch $bid]) [list sym::branch $child] + }] { + lappend dependencies([list sym::branch $xbid]) [list sym::branch $xchild] } - #foreachrow - foreach {bid child} [state run [subst -nocommands -nobackslashes { - SELECT B.bid, T.tid + state foreachrow [subst -nocommands -nobackslashes { + SELECT B.bid AS xbid, T.tid AS xchild FROM branch B, preferedparent P, tag T WHERE B.bid IN $theset -- Restrict to branches of interest AND B.sid = P.pid -- Get subordinate tags via the AND T.sid = P.sid -- prefered parents of their symbols - }]] { - lappend dependencies([list sym::branch $bid]) [list sym::tag $child] + }] { + lappend dependencies([list sym::branch $xbid]) [list sym::tag $xchild] } return } # result = list (changeset-id)
Modified tools/cvs2fossil/lib/c2f_psym.tcl from [cdd6e8c604] to [294eabb16a].
@@ -249,13 +249,13 @@ typevariable myforcepattern {} ; # List of patterns and types # specifying which symbols to # force to specific types. typemethod getsymtypes {} { - foreach {tid name} [state run { + state foreachrow { SELECT tid, name FROM symtype; - }] { set mysymtype($tid) $name } + } { set mysymtype($tid) $name } return } # Keep the codes below in sync with 'pass::collrev/setup('symtype'). typevariable myexcluded 0 ; # Code for symbols which are excluded. @@ -299,18 +299,19 @@ log write 2 symbol "Symbol type statistics:" set fmt %[string length $mynum]s set all 0 - foreach {stype splural n} [state run { - SELECT T.name, T.plural, COUNT (s.sid) + state foreachrow { + SELECT T.name AS stype, + T.plural AS splural, + COUNT (s.sid) AS n FROM symbol S, symtype T WHERE S.type = T.tid GROUP BY T.name ORDER BY T.name - ; - }] { + } { log write 2 symbol "* [format $fmt $n] [sp $n $stype $splural]" incr all $n } log write 2 symbol "= [format $fmt $all] total" @@ -391,13 +392,12 @@ incr myrulecount($label) # This is stored directly into the database. state run { UPDATE symbol - SET type = $chosen - WHERE sid = $myid - ; + SET type = $chosen + WHERE sid = $myid } return } # # ## ### ##### ######## #############
Modified tools/cvs2fossil/lib/c2f_repository.tcl from [1850a8da8e] to [e021cb0c0b].
@@ -152,23 +152,23 @@ return } typemethod load {} { state transaction { - foreach {pid name} [state run { + state foreachrow { SELECT pid, name FROM project ; - }] { + } { set project [project %AUTO% $name $type] lappend myprojpaths $name lappend myprojects $project set myprojmap($pid) $project $project setid $pid } - foreach {fid pid name visible exec} [state run { + state foreachrow { SELECT fid, pid, name, visible, exec FROM file ; - }] { + } { $myprojmap($pid) addfile $name $visible $exec $fid } } return } @@ -244,15 +244,14 @@ typemethod loadsymbols {} { state transaction { # We load the symbol ids at large to have the mapping # right from the beginning. - foreach {sid pid name tc bc cc} [state run { - SELECT sid, pid, name, tag_count, branch_count, commit_count + state foreachrow { + SELECT sid, pid, name, tag_count AS tc, branch_count AS bc, commit_count AS cc FROM symbol - ; - }] { + } { $mysymbol map $sid [list $pid $name] set project $myprojmap($pid) set force [$project hassymbol $name] set symbol [$project getsymbol $name] @@ -271,23 +270,23 @@ } # Beyond the symbols we also load the author, commit log, # and meta information. - foreach {aid aname} [state run { - SELECT aid, name FROM author - }] { + state foreachrow { + SELECT aid, name AS aname FROM author + } { $myauthor map $aid $aname } - foreach {cid text} [state run { + state foreachrow { SELECT cid, text FROM cmessage - }] { + } { $mycmsg map $cid $text } - foreach {mid pid bid aid cid} [state run { + state foreachrow { SELECT mid, pid, bid, aid, cid FROM meta - }] { + } { $mymeta map $mid [list $pid $bid $aid $cid] } } return }