Overview
SHA1 Hash: | f166b0a63c5e75950de23d297d46e03dafaf4e51 |
---|---|
Date: | 2007-08-31 04:57:33 |
User: | aku |
Comment: | Added first code regarding import from cvs, processing a CVSROOT/history file. Looks good, except that the history I have is incomplete, truncated at the beginning. Extended my notes with results from this experiment, thinking about a possible different method. |
Timelines: | ancestors | descendants | both | trunk |
Other Links: | files | ZIP archive | manifest |
Tags And Properties
- branch=trunk inherited from [a28c83647d]
- sym-trunk inherited from [a28c83647d]
Changes
[hide diffs]Modified ci_cvs.txt from [06c022ce6b] to [13f4f929b2].
@@ -1,5 +1,89 @@ +=============================================================================== + +First experimental codes ... + +toosl/import-cvs.tcl +tools/lib/rcsparser.tcl + +No actual import, right now only working on getting csets right. The +code uses CVSROOT/history as foundation, and augments that with data +from the individual RCS files (commit messages). + +Statistics of a run ... + 3516 csets. + + 1545 breaks on user change + 558 breaks on file duplicate + 13 breaks on branch/trunk change + 1402 breaks on commit message change + +Time statistics ... + 3297 were processed in <= 1 seconds (93.77%) + 217 were processed in between 2 seconds and 14 minutes. + 1 was processed in ~41 minutes + 1 was processed in ~22 hours + +Time fuzz - Differences between csets range from 0 seconds to 66 +days. Needs stats analysis to see if there is an obvious break. Even +so the times within csets and between csets overlap a great deal, +making time a bad criterium for cset separation, IMHO. + +Leaving that topic, back to the current cset separator ... + +It has a problem: + The history file is not starting at the root! + +Examples: + The first three changesets are + + =============================/user + M {Wed Nov 22 09:28:49 AM PST 2000} ericm 1.4 tcllib/modules/ftpd/ChangeLog + M {Wed Nov 22 09:28:49 AM PST 2000} ericm 1.7 tcllib/modules/ftpd/ftpd.tcl + files: 2 + delta: 0 + range: 0 seconds + =============================/cmsg + M {Wed Nov 29 02:14:33 PM PST 2000} ericm 1.3 tcllib/aclocal.m4 + files: 1 + delta: + range: 0 seconds + =============================/cmsg + M {Sun Feb 04 12:28:35 AM PST 2001} ericm 1.9 tcllib/modules/mime/ChangeLog + M {Sun Feb 04 12:28:35 AM PST 2001} ericm 1.12 tcllib/modules/mime/mime.tcl + files: 2 + delta: 0 + range: 0 seconds + +All csets modify files which already have several revisions. We have +no csets from before that in the history, but these csets are in the +RCS files. + +I wonder, is SF maybe removing old entries from the history when it +grows too large ? + +This also affects incremental import ... I cannot assume that the +history always grows. It may shrink ... I cannot keep an offset, will +have to record the time of the last entry, or even the full entry +processed last, to allow me to skip ahead to anything not known yet. + +I might have to try to implement the algorithm outlined below, +matching the revision trees of the individual RCS files to each other +to form the global tree of revisions. Maybe we can use the history to +help in the matchup, for the parts where we do have it. + +Wait. This might be easier ... Take the delta information from the RCS +files and generate a fake history ... Actually, this might even allow +us to create a total history ... No, not quite, the merge entries the +actual history may contain will be missing. These we can mix in from +the actual history, as much as we have. + +Still, lets try that, a fake history, and then run this script on it +to see if/where are differences. + +=============================================================================== + Notes about CVS import, regarding CVS. - Problem: CVS does not really track changesets, but only individual revisions of files. To recover changesets it is necessary to look at
Added tools/import-cvs.tcl version [8e70daebc0]
@@ -1,1 +1,370 @@ +#!/bin/sh +# -*- tcl -*- \ +exec tclsh "$0" ${1+"$@"} + +# ----------------------------------------------------------------------------- +# Make private packages accessible. + +lappend auto_path [file join [file dirname [info script]] lib] +package require rcsparser +package require fileutil + +# ----------------------------------------------------------------------------- +# Repository management (CVS) + +namespace eval ::cvs { + variable base ; set base {} ; # Repository toplevel directory. + variable npaths ; array set npaths {} ; # path -> actual path mapping. + variable rpaths ; array set rpaths {} ; # path -> rcs file mapping. + variable cmsg ; array set cmsg {} ; # Cache of commit messages. +} + +proc ::cvs::hextime {hex} { + set t 0 + foreach d [string map { + a 10 b 11 c 12 d 13 e 14 f 15 + A 10 B 11 C 12 D 13 E 14 F 15 + } [split $hex {}]] { + set t [expr {($t << 4) + $d}];#horner + } + return $t +} + +proc ::cvs::at {path} { + variable base $path + return +} + +proc ::cvs::cmsg {path rev} { + variable cmsg + set key [list $path $rev] + + if {![info exists cmsg($key)]} { + set rcs [cvs::rcsfile $path] + + #puts stderr "scan $path => $rcs" + + array set p [::rcsparser::process $rcs] + + foreach {r msg} $p(commit) { + set cmsg([list $path $r]) $msg + } + + if {![info exists cmsg($key)]} { + return -code error "Bogus revision $rev of file $path" + } + } + + return $cmsg($key) +} + +proc ::cvs::norm {path} { + variable npaths + if {![info exists npaths($path)]} { + set npaths($path) [NormFile $path] + } + return $npaths($path) +} + +proc ::cvs::NormFile {path} { + variable base + + set f $base/$path,v + if {[::file exists $f]} {return $path} + + set hd [::file dirname $path] + set tl [::file tail $path] + + set f $base/$hd/Attic/$tl,v + if {[::file exists $f]} {return $path} + + # Bad. The dir can be truncated, i.e. it may not be an exact + # subdirectory of base, but deeper inside, with parents between it + # and base left out. Example (from the tcllib history file): + # + # M3f1d1245|afaupell|<remote>|ipentry|1.2|ChangeLog + # The correct path is 'tklib/modules/ipentry'. + # This we have to resolve too. + + # normalize dance - old fileutil, modern fileutil (cvs head) doesn't do that. + set bx [file normalize $base] + foreach c [fileutil::findByPattern $bx -glob $hd] { + set cx [fileutil::stripPath $bx $c] + set c $base/$cx + + set f $c/$tl,v + if {[::file exists $f]} {return $cx/$tl} + set f $c/Attic/$tl,v + if {[::file exists $f]} {return $cx/$tl} + } + + puts stderr <$f> + return -code error "Unable to locate actual file for $path" +} + +proc ::cvs::rcsfile {path} { + variable rpaths + if {![info exists rpaths($path)]} { + set rpaths($path) [RcsFile $path] + } + return $rpaths($path) +} + +proc ::cvs::RcsFile {path} { + variable base + + set f $base/$path,v + if {[::file exists $f]} {return $f} + + set hd [::file dirname $path] + set tl [::file tail $path] + + set f $base/$hd/Attic/$tl,v + if {[::file exists $f]} {return $f} + + # We do not have truncated directories here, assuming that only + # norm paths are fed into this command. + + if 0 { + # Bad. The dir can be truncated, i.e. it may not be an exact + # subdirectory of base, but deeper inside, with parents + # between it and base left out. Example (from the tcllib + # history file): + # + # M3f1d1245|afaupell|<remote>|ipentry|1.2|ChangeLog The + # correct path is 'tklib/modules/ipentry'. This we have to + # resolve too. + + # normalize dance - old fileutil, modern fileutil (cvs head) + # doesn't do that. + set bx [file normalize $base] + foreach c [fileutil::findByPattern $bx -glob $hd] { + set c $base/[fileutil::stripPath $bx $c] + + set f $c/$tl,v + if {[::file exists $f]} {return $f} + set f $c/Attic/$tl,v + if {[::file exists $f]} {return $f} + } + } + + puts stderr <$f> + return -code error "Unable to locate rcs file for $path" +} + +proc ::cvs::history {} { + variable base + return $base/CVSROOT/history +} + +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- + +cvs::at [lindex $argv 0] + +#puts [::cvs::norm ipentry/ChangeLog] +#exit + +#changeset state +global cs csf +array set cs { + start {} end {} cm {} + usr {} dt {} +} +array set csf {} + +proc rh {} { + global argv cs csf repo + + set f [open [cvs::history] r] + + while {[gets $f line] >= 0} { + # Decode line + foreach {op usr _ dir rev file} [split [string trim $line] |] break + set ts [cvs::hextime [string range $op 1 end]] + set op [string index $op 0] + + # Filter out irrelevant parts + if {$op eq "O"} continue ; # checkout + if {$op eq "F"} continue ; # release + if {$op eq "T"} continue ; # rtag + if {$op eq "W"} continue ; # delete on update + if {$op eq "U"} continue ; # update + if {$op eq "P"} continue ; # update by patch + #if {$op eq "G"} continue ; # merge on update - FUTURE - identifies mergepoints. + if {$op eq "C"} continue ; # conflict on update - s.a. + if {$op eq "E"} continue ; # export + # left types + # M: commit + # A: addition + # R: removal + + set df $dir/$file + if {[newcs $op $usr $ts $rev df cause]} { + + # NOTE 1: ChangeSets containing CVSROOT => remove such files. + # NOTE 2: Empty changesets, ignore. + + #commit + csstats + + if {$cause eq "cmsg"} { +set msg + } else { +set msg "" + } + + if {$cs(end) ne ""} { + puts =============================/$cause\ delta\ [expr {$ts - $cs(end)}] + } else { + puts =============================/$cause + } + csclear + } + + # Note: newcs normalizes df, in case the log information is + # bogus. So the df here may be different from before newcs + csadd $op $usr $ts $rev $df + # apply modification to workspace + } +} + +proc newcs {op usr ts rev dfv rv} { + global cs csf + upvar 1 $rv reason $dfv df + + # Logic to detect when a new change set begins. A new change sets + # has started with the current entry when the entry + # + # 1. is for a different user than the last. + # 2. tries to add a file to the changeset which is already part of it. + # 3.is on the trunk, and the last on a branch, or vice versa. + # 4. the current entry has a different commit message than the last. + + set df [cvs::norm $df] + + # User changed + if {$usr ne $cs(usr)} { + set reason user + return 1 + } + + # File is already in the changeset + if {[info exists csf($df)]} { + set reason file + return 1 + } + + # last/current are different regarding trunk/branch + set depth [llength [split $rev .]] + if {($cs(lastd) == 2) != ($depth == 2)} { + set reason branch + return 1 + } + + # Commit message changed + if {[cvs::cmsg $cs(lastf) $cs(lastr)] ne [cvs::cmsg $df $rev]} { + set reason cmsg + return 1 + } + + # Same changeset + return 0 +} + +proc csclear {} { + global cs csf + array set cs {start {} usr {} end {} dt {}} + array unset csf * + return +} + +proc csadd {op usr ts rev df} { + global cs csf + + if {$cs(usr) eq ""} {set cs(usr) $usr} + if {$cs(start) eq ""} { + set cs(start) $ts + } else { + lappend cs(dt) [expr {$ts - $cs(end)}] + } + set cs(end) $ts + + set csf($df) [list $op $rev] + set cs(lastf) $df + set cs(lastr) $rev + set cs(lastd) [llength [split $rev .]] + + puts [list $op [clock format $ts] $usr $rev $df] + return +} + +proc csstats {} { + global cs csf + + if {$cs(start) eq ""} return + + puts "files: [array size csf]" + puts "delta: $cs(dt)" + puts "range: [expr {$cs(end) - $cs(start)}] seconds" + return +} + +rh + +exit + +========================================= +new fossil +new fossil workspace + +open history + +foreach line { + ignore unwanted lines + + accumulate changesets data + new change-set => commit and continue + + current branch and branch of new change different ? + => move fossil workspace to proper revision. + + apply change to workspace + uncommitted +} + +if uncommitted => commit +delete workspace +copy fossil repo to destination +========================================= + +Not dealt with in outline: branches, tags, merging + +========================================= + +complexities +- apply to workspace + - remove simple, direct translation + - add => requires extraction of indicated revision from ,v + - modify => see above (without add following) + +- ,v file => Can be the either dir/file,v, or dir/Attic/file,v + Both ? Priority ? + +- How to detect changes on branches ? + +- Have to keep knowledge of which branches went there. + => save change-sets information, + uuid in fossil + => need only the leaves of each branch, and of branch points. + => better keep all until complete. + => uuid can be gotten from 'manifest.uuid' in workspace. +- keep tag information ? (symbolics) + +========================================= + +CVSROOT=ORIGIN +cvs -d ORIGIN checkout -r REV FILE +Extract specific revision of a specific file. +-q, -Q quietness
Added tools/lib/pkgIndex.tcl version [71b6b857ad]
@@ -1,1 +1,2 @@ - +if {![package vsatisfies [package require Tcl] 8.4]} return +package ifneeded rcsparser 1.0 [list source [file join $dir rcsparser.tcl]]
Added tools/lib/rcsparser.tcl version [69db20ceda]
@@ -1,1 +1,265 @@ +# ----------------------------------------------------------------------------- +# Parse RCS files (,v) - ignore the deltas - we need only the commit messages +# Recursive Descent Parser + +# ----------------------------------------------------------------------------- +# Requirements + +package require Tcl 8.4 +package require fileutil ; # Tcllib (cat) + +namespace eval ::rcsparser {} + +# ----------------------------------------------------------------------------- +# API + +proc ::rcsparser::process {path} { + set data [fileutil::cat -encoding binary $path] + array set res {} + Admin + Deltas + Description + DeltaTexts + return [array get res] +} + +# ----------------------------------------------------------------------------- +# Internal helper commands + +proc ::rcsparser::Admin {} { + upvar 1 data data res res + Head ; Branch ; Access ; Symbols ; Locks ; Strict ; Comment ; Expand + return +} + +proc ::rcsparser::Deltas {} { + upvar 1 data data res res + while {[Num 0]} { Date ; Author ; State ; Branches ; NextRev } + return +} + +proc ::rcsparser::Description {} { + upvar 1 data data res res + Literal desc + String 1 + Def desc + return +} + +proc ::rcsparser::DeltaTexts {} { + upvar 1 data data res res + while {[Num 0]} { Log ; Text } + return +} + +proc ::rcsparser::Head {} { + upvar 1 data data res res + Literal head ; Num 1 ; Literal \; + Def head + return +} + +proc ::rcsparser::Branch {} { + upvar 1 data data res res + if {![Literal branch 0]} return ; Num 1 ; Literal \; + Def branch + return +} + +proc ::rcsparser::Access {} { + upvar 1 data data res res + Literal access ; Literal \; + return +} + +proc ::rcsparser::Symbols {} { + upvar 1 data data res res + Literal symbols + while {[Ident]} { Num 1 ; Map symbol } + Literal \; + return +} + +proc ::rcsparser::Locks {} { + upvar 1 data data res res + Literal locks + while {[Ident]} { Num 1 ; Map lock } + Literal \; + return +} + +proc ::rcsparser::Strict {} { + upvar 1 data data res res + if {![Literal strict 0]} return ; Literal \; + return +} + +proc ::rcsparser::Comment {} { + upvar 1 data data res res + if {![Literal comment 0]} return ; + if {![String 0]} return ; + Literal \; + Def comment + return +} + +proc ::rcsparser::Expand {} { + upvar 1 data data res res + if {![Literal expand 0]} return ; + if {![String 0]} return ; + Literal \; + Def expand + return +} + +proc ::rcsparser::Date {} { + upvar 1 data data res res + Literal date ; Num 1 ; Literal \; + return +} + +proc ::rcsparser::Author {} { + upvar 1 data data res res + Literal author ; Skip ; Literal \; + return +} + +proc ::rcsparser::State {} { + upvar 1 data data res res + Literal state ; Skip ; Literal \; + return +} + +proc ::rcsparser::Branches {} { + upvar 1 data data res res + Literal branches ; Skip ; Literal \; + return +} + +proc ::rcsparser::NextRev {} { + upvar 1 data data res res + Literal next ; Skip ; Literal \; + return +} + +proc ::rcsparser::Log {} { + upvar 1 data data res res + IsIdent ; Literal log ; String 1 ; Map commit + return +} + +proc ::rcsparser::Text {} { + upvar 1 data data res res + Literal text ; String 1 + return +} + +proc ::rcsparser::Ident {} { + upvar 1 data data res res + + #puts I@?<[string range $data 0 10]...> + + if {[regexp -indices -- {^\s*;\s*} $data]} { + return 0 + } elseif {![regexp -indices -- {^\s*([^:]*)\s*:\s*} $data match val]} { + return 0 + } + + Get $val ; IsIdent + Next + return 1 +} + +proc ::rcsparser::Literal {name {required 1}} { + upvar 1 data data res res + if {![regexp -indices -- "^\\s*$name\\s*" $data match]} { + if {$required} { + return -code error "Expected '$name' @ '[string range $data 0 30]...'" + } + return 0 + } + + Next + return 1 +} + +proc ::rcsparser::String {{required 1}} { + upvar 1 data data res res + + if {![regexp -indices -- {^\s*@(([^@]*(@@)*)*)@\s*} $data match val]} { + if {$required} { + return -code error "Expected string @ '[string range $data 0 30]...'" + } + return 0 + } + + Get $val + Next + return 1 +} + +proc ::rcsparser::Num {required} { + upvar 1 data data res res + if {![regexp -indices -- {^\s*((\d|\.)+)\s*} $data match val]} { + if {$required} { + return -code error "Expected id @ '[string range $data 0 30]...'" + } + return 0 + } + + Get $val + Next + return 1 +} + +proc ::rcsparser::Skip {} { + upvar 1 data data res res + regexp -indices -- {^\s*[^;]*\s*} $data match + Next + return +} + +proc ::rcsparser::Def {key} { + upvar 1 data data res res + set res($key) $res(lastval) + unset res(lastval) + return +} + +proc ::rcsparser::Map {key} { + upvar 1 data data res res + lappend res($key) $res(id) $res(lastval) + #puts Map($res(id))=($res(lastval)) + unset res(lastval) + unset res(id) + return +} + +proc ::rcsparser::IsIdent {} { + upvar 1 data data res res + set res(id) $res(lastval) + unset res(lastval) + return +} + +proc ::rcsparser::Get {val} { + upvar 1 data data res res + foreach {s e} $val break + set res(lastval) [string range $data $s $e] + #puts G|$res(lastval) + return +} + +proc ::rcsparser::Next {} { + upvar 1 match match data data + foreach {s e} $match break ; incr e + set data [string range $data $e end] + return +} + +# ----------------------------------------------------------------------------- +# Ready + +package provide rcsparser 1.0 +return