Check-in [f166b0a63c]
Not logged in
Overview

SHA1 Hash:f166b0a63c5e75950de23d297d46e03dafaf4e51
Date: 2007-08-31 04:57:33
User: aku
Comment:Added first code regarding import from cvs, processing a CVSROOT/history file. Looks good, except that the history I have is incomplete, truncated at the beginning. Extended my notes with results from this experiment, thinking about a possible different method.
Timelines: ancestors | descendants | both | trunk
Other Links: files | ZIP archive | manifest

Tags And Properties
Changes
[hide diffs]

Modified ci_cvs.txt from [06c022ce6b] to [13f4f929b2].

@@ -1,5 +1,89 @@
+===============================================================================
+
+First experimental codes ...
+
+toosl/import-cvs.tcl
+tools/lib/rcsparser.tcl
+
+No actual import, right now only working on getting csets right. The
+code uses CVSROOT/history as foundation, and augments that with data
+from the individual RCS files (commit messages).
+
+Statistics of a run ...
+	3516 csets.
+
+	1545 breaks on user change
+	 558 breaks on file duplicate
+	  13 breaks on branch/trunk change
+	1402 breaks on commit message change
+
+Time statistics ...
+	3297 were processed in <= 1 seconds (93.77%)
+	 217 were processed in between 2 seconds and 14 minutes.
+	   1 was  processed in ~41 minutes
+	   1 was  processed in ~22 hours
+
+Time fuzz - Differences between csets range from 0 seconds to 66
+days. Needs stats analysis to see if there is an obvious break. Even
+so the times within csets and between csets overlap a great deal,
+making time a bad criterium for cset separation, IMHO.
+
+Leaving that topic, back to the current cset separator ...
+
+It has a problem:
+	The history file is not starting at the root!
+
+Examples:
+	The first three changesets are
+
+	=============================/user
+	M {Wed Nov 22 09:28:49 AM PST 2000} ericm 1.4 tcllib/modules/ftpd/ChangeLog
+	M {Wed Nov 22 09:28:49 AM PST 2000} ericm 1.7 tcllib/modules/ftpd/ftpd.tcl
+	files: 2
+	delta: 0
+	range: 0 seconds
+	=============================/cmsg
+	M {Wed Nov 29 02:14:33 PM PST 2000} ericm 1.3 tcllib/aclocal.m4
+	files: 1
+	delta:
+	range: 0 seconds
+	=============================/cmsg
+	M {Sun Feb 04 12:28:35 AM PST 2001} ericm 1.9 tcllib/modules/mime/ChangeLog
+	M {Sun Feb 04 12:28:35 AM PST 2001} ericm 1.12 tcllib/modules/mime/mime.tcl
+	files: 2
+	delta: 0
+	range: 0 seconds
+
+All csets modify files which already have several revisions. We have
+no csets from before that in the history, but these csets are in the
+RCS files.
+
+I wonder, is SF maybe removing old entries from the history when it
+grows too large ?
+
+This also affects incremental import ... I cannot assume that the
+history always grows. It may shrink ... I cannot keep an offset, will
+have to record the time of the last entry, or even the full entry
+processed last, to allow me to skip ahead to anything not known yet.
+
+I might have to try to implement the algorithm outlined below,
+matching the revision trees of the individual RCS files to each other
+to form the global tree of revisions. Maybe we can use the history to
+help in the matchup, for the parts where we do have it.
+
+Wait. This might be easier ... Take the delta information from the RCS
+files and generate a fake history ... Actually, this might even allow
+us to create a total history ... No, not quite, the merge entries the
+actual history may contain will be missing. These we can mix in from
+the actual history, as much as we have.
+
+Still, lets try that, a fake history, and then run this script on it
+to see if/where are differences.
+
+===============================================================================
+
 
 Notes about CVS import, regarding CVS.
 
 - Problem: CVS does not really track changesets, but only individual
   revisions of files. To recover changesets it is necessary to look at

Added tools/import-cvs.tcl version [8e70daebc0]

@@ -1,1 +1,370 @@
+#!/bin/sh
+# -*- tcl -*- \
+exec tclsh "$0" ${1+"$@"}
+
+# -----------------------------------------------------------------------------
+# Make private packages accessible.
+
+lappend auto_path [file join [file dirname [info script]] lib]
+package require rcsparser
+package require fileutil
+
+# -----------------------------------------------------------------------------
+# Repository management (CVS)
+
+namespace eval ::cvs {
+    variable base   ; set       base   {} ; # Repository toplevel directory.
+    variable npaths ; array set npaths {} ; # path -> actual path mapping.
+    variable rpaths ; array set rpaths {} ; # path -> rcs file mapping.
+    variable cmsg   ; array set cmsg   {} ; # Cache of commit messages.
+}
+
+proc ::cvs::hextime {hex} {
+    set t 0
+    foreach d [string map {
+	a 10 b 11 c 12 d 13 e 14 f 15
+	A 10 B 11 C 12 D 13 E 14 F 15
+    } [split $hex {}]] {
+	set t [expr {($t << 4) + $d}];#horner
+    }
+    return $t
+}
+
+proc ::cvs::at {path} {
+    variable base $path
+    return
+}
+
+proc ::cvs::cmsg {path rev} {
+    variable cmsg
+    set key [list $path $rev]
+
+    if {![info exists cmsg($key)]} {
+	set rcs [cvs::rcsfile $path]
+
+	#puts stderr "scan $path => $rcs"
+
+	array set p [::rcsparser::process $rcs]
+
+	foreach {r msg} $p(commit) {
+	    set cmsg([list $path $r]) $msg
+	}
+
+	if {![info exists cmsg($key)]} {
+	    return -code error "Bogus revision $rev of file $path"
+	}
+    }
+
+    return $cmsg($key)
+}
+
+proc ::cvs::norm {path} {
+    variable npaths
+    if {![info exists npaths($path)]} {
+	set npaths($path) [NormFile $path]
+    }
+    return $npaths($path)
+}
+
+proc ::cvs::NormFile {path} {
+    variable base
+
+    set f $base/$path,v
+    if {[::file exists $f]} {return $path}
+
+    set hd [::file dirname $path]
+    set tl [::file tail    $path]
+
+    set f $base/$hd/Attic/$tl,v
+    if {[::file exists $f]} {return $path}
+
+    # Bad. The dir can be truncated, i.e. it may not be an exact
+    # subdirectory of base, but deeper inside, with parents between it
+    # and base left out. Example (from the tcllib history file):
+    #
+    # M3f1d1245|afaupell|<remote>|ipentry|1.2|ChangeLog
+    # The correct path is 'tklib/modules/ipentry'.
+    # This we have to resolve too.
+
+    # normalize dance - old fileutil, modern fileutil (cvs head) doesn't do that.
+    set bx [file normalize $base]
+    foreach c [fileutil::findByPattern $bx -glob $hd] {
+	set cx [fileutil::stripPath $bx $c]
+	set c  $base/$cx
+
+	set f $c/$tl,v
+	if {[::file exists $f]} {return $cx/$tl}
+	set f $c/Attic/$tl,v
+	if {[::file exists $f]} {return $cx/$tl}
+    }
+
+    puts stderr <$f>
+    return -code error "Unable to locate actual file for $path"
+}
+
+proc ::cvs::rcsfile {path} {
+    variable rpaths
+    if {![info exists rpaths($path)]} {
+	set rpaths($path) [RcsFile $path]
+    }
+    return $rpaths($path)
+}
+
+proc ::cvs::RcsFile {path} {
+    variable base
+
+    set f $base/$path,v
+    if {[::file exists $f]} {return $f}
+
+    set hd [::file dirname $path]
+    set tl [::file tail    $path]
+
+    set f $base/$hd/Attic/$tl,v
+    if {[::file exists $f]} {return $f}
+
+    # We do not have truncated directories here, assuming that only
+    # norm paths are fed into this command.
+
+    if 0 {
+	# Bad. The dir can be truncated, i.e. it may not be an exact
+	# subdirectory of base, but deeper inside, with parents
+	# between it and base left out. Example (from the tcllib
+	# history file):
+	#
+	# M3f1d1245|afaupell|<remote>|ipentry|1.2|ChangeLog The
+	# correct path is 'tklib/modules/ipentry'.  This we have to
+	# resolve too.
+
+	# normalize dance - old fileutil, modern fileutil (cvs head)
+	# doesn't do that.
+	set bx [file normalize $base]
+	foreach c [fileutil::findByPattern $bx -glob $hd] {
+	    set c $base/[fileutil::stripPath $bx $c]
+
+	    set f $c/$tl,v
+	    if {[::file exists $f]} {return $f}
+	    set f $c/Attic/$tl,v
+	    if {[::file exists $f]} {return $f}
+	}
+    }
+
+    puts stderr <$f>
+    return -code error "Unable to locate rcs file for $path"
+}
+
+proc ::cvs::history {} {
+    variable base
+    return $base/CVSROOT/history
+}
+
+# -----------------------------------------------------------------------------
+
+# -----------------------------------------------------------------------------
+
+cvs::at [lindex $argv 0]
+
+#puts [::cvs::norm ipentry/ChangeLog]
+#exit
+
+#changeset state
+global cs csf
+array set cs {
+    start {} end {} cm {}
+    usr   {} dt  {}
+}
+array set csf {}
+
+proc rh {} {
+    global argv cs csf repo
+
+    set f [open [cvs::history] r]
+
+    while {[gets $f line] >= 0} {
+	# Decode line
+	foreach {op usr _ dir rev file} [split [string trim $line] |] break
+	set ts [cvs::hextime [string range $op 1 end]]
+	set op [string index $op 0]
+
+	# Filter out irrelevant parts
+	if {$op eq "O"} continue ; # checkout
+	if {$op eq "F"} continue ; # release
+	if {$op eq "T"} continue ; # rtag
+	if {$op eq "W"} continue ; # delete on update
+	if {$op eq "U"} continue ; # update
+	if {$op eq "P"} continue ; # update by patch
+	#if {$op eq "G"} continue ; # merge on update - FUTURE - identifies mergepoints.
+	if {$op eq "C"} continue ; # conflict on update - s.a.
+	if {$op eq "E"} continue ; # export
+	# left types
+	# M: commit
+	# A: addition
+	# R: removal
+
+	set df $dir/$file
+	if {[newcs $op $usr $ts $rev df cause]} {
+
+	    # NOTE 1: ChangeSets containing CVSROOT => remove such files.
+	    # NOTE 2: Empty changesets, ignore.
+
+	    #commit
+	    csstats
+
+	    if {$cause eq "cmsg"} {
+set msg
+	    } else {
+set msg ""
+	    }
+
+	    if {$cs(end) ne ""} {
+		puts =============================/$cause\ delta\ [expr {$ts - $cs(end)}]
+	    } else {
+		puts =============================/$cause
+	    }
+	    csclear
+	}
+
+	# Note: newcs normalizes df, in case the log information is
+	# bogus. So the df here may be different from before newcs
+	csadd $op $usr $ts $rev $df
+	# apply modification to workspace
+    }
+}
+
+proc newcs {op usr ts rev dfv rv} {
+    global cs csf
+    upvar 1 $rv reason $dfv df
+
+    # Logic to detect when a new change set begins. A new change sets
+    # has started with the current entry when the entry
+    #
+    # 1. is for a different user than the last.
+    # 2. tries to add a file to the changeset which is already part of it.
+    # 3.is on the trunk, and the last on a branch, or vice versa.
+    # 4. the current entry has a different commit message than the last.
+
+    set df [cvs::norm $df]
+
+    # User changed
+    if {$usr ne $cs(usr)} {
+	set reason user
+	return 1
+    }
+
+    # File is already in the changeset
+    if {[info exists csf($df)]} {
+	set reason file
+	return 1
+    }
+
+    # last/current are different regarding trunk/branch
+    set depth [llength [split $rev .]]
+    if {($cs(lastd) == 2) != ($depth == 2)} {
+	set reason branch
+	return 1
+    }
+
+    # Commit message changed
+    if {[cvs::cmsg $cs(lastf) $cs(lastr)] ne [cvs::cmsg $df $rev]} {
+	set reason cmsg
+	return 1
+    }
+
+    # Same changeset
+    return 0
+}
+
+proc csclear {} {
+    global cs csf
+    array set cs {start {} usr {} end {} dt {}}
+    array unset csf *
+    return
+}
+
+proc csadd {op usr ts rev df} {
+    global cs csf
+
+    if {$cs(usr) eq ""} {set cs(usr) $usr}
+    if {$cs(start) eq ""} {
+	set cs(start) $ts
+    } else {
+	lappend cs(dt) [expr {$ts - $cs(end)}]
+    }
+    set cs(end) $ts
+
+    set csf($df) [list $op $rev]
+    set cs(lastf) $df
+    set cs(lastr) $rev
+    set cs(lastd) [llength [split $rev .]]
+
+    puts [list $op [clock format $ts] $usr $rev $df]
+    return
+}
+
+proc csstats {} {
+    global cs csf
+
+    if {$cs(start) eq ""} return
+
+    puts "files: [array size csf]"
+    puts "delta: $cs(dt)"
+    puts "range: [expr {$cs(end) - $cs(start)}] seconds"
+    return
+}
+
+rh
+
+exit
+
+=========================================
+new fossil
+new fossil workspace
+
+open history
+
+foreach line {
+    ignore unwanted lines
+
+    accumulate changesets data
+    new change-set => commit and continue
+
+    current branch and branch of new change different ?
+    => move fossil workspace to proper revision.
+
+    apply change to workspace
+    uncommitted
+}
+
+if uncommitted => commit
+delete workspace
+copy fossil repo to destination
+=========================================
+
+Not dealt with in outline: branches, tags, merging
+
+=========================================
+
+complexities
+- apply to workspace
+  - remove simple, direct translation
+  - add => requires extraction of indicated revision from ,v
+  - modify => see above (without add following)
+
+- ,v file => Can be the either dir/file,v, or dir/Attic/file,v
+  Both ? Priority ?
+
+- How to detect changes on branches ?
+
+- Have to keep knowledge of which branches went there.
+ => save change-sets information, + uuid in fossil
+ => need only the leaves of each branch, and of branch points.
+ => better keep all until complete.
+ => uuid can be gotten from 'manifest.uuid' in workspace.
+- keep tag information ? (symbolics)
+
+=========================================
+
+CVSROOT=ORIGIN
 
+cvs -d ORIGIN checkout -r REV FILE
+Extract specific revision of a specific file.
+-q, -Q quietness

Added tools/lib/pkgIndex.tcl version [71b6b857ad]

@@ -1,1 +1,2 @@
-
+if {![package vsatisfies [package require Tcl] 8.4]} return
+package ifneeded rcsparser 1.0 [list source [file join $dir rcsparser.tcl]]

Added tools/lib/rcsparser.tcl version [69db20ceda]

@@ -1,1 +1,265 @@
 
+# -----------------------------------------------------------------------------
+# Parse RCS files (,v) - ignore the deltas - we need only the commit messages
+# Recursive Descent Parser
+
+# -----------------------------------------------------------------------------
+# Requirements
+
+package require Tcl 8.4
+package require fileutil ; # Tcllib (cat)
+
+namespace eval ::rcsparser {}
+
+# -----------------------------------------------------------------------------
+# API
+
+proc ::rcsparser::process {path} {
+    set data [fileutil::cat -encoding binary $path]
+    array set res {}
+    Admin
+    Deltas
+    Description
+    DeltaTexts
+    return [array get res]
+}
+
+# -----------------------------------------------------------------------------
+# Internal helper commands
+
+proc ::rcsparser::Admin {} {
+    upvar 1 data data res res
+    Head ; Branch ; Access ; Symbols ; Locks ; Strict ; Comment ; Expand
+    return
+}
+
+proc ::rcsparser::Deltas {} {
+    upvar 1 data data res res
+    while {[Num 0]} { Date ; Author ; State ; Branches ; NextRev }
+    return
+}
+
+proc ::rcsparser::Description {} {
+    upvar 1 data data res res
+    Literal desc
+    String 1
+    Def desc
+    return
+}
+
+proc ::rcsparser::DeltaTexts {} {
+    upvar 1 data data res res
+    while {[Num 0]} { Log ; Text }
+    return
+}
+
+proc ::rcsparser::Head {} {
+    upvar 1 data data res res
+    Literal head ; Num 1 ; Literal \;
+    Def head
+    return
+}
+
+proc ::rcsparser::Branch {} {
+    upvar 1 data data res res
+    if {![Literal branch 0]} return ; Num 1 ; Literal \;
+    Def branch
+    return
+}
+
+proc ::rcsparser::Access {} {
+    upvar 1 data data res res
+    Literal access ; Literal \;
+    return
+}
+
+proc ::rcsparser::Symbols {} {
+    upvar 1 data data res res
+    Literal symbols
+    while {[Ident]} { Num 1 ; Map symbol }
+    Literal \;
+    return
+}
+
+proc ::rcsparser::Locks {} {
+    upvar 1 data data res res
+    Literal locks
+    while {[Ident]} { Num 1 ; Map lock }
+    Literal \;
+    return
+}
+
+proc ::rcsparser::Strict {} {
+    upvar 1 data data res res
+    if {![Literal strict 0]} return ; Literal \;
+    return
+}
+
+proc ::rcsparser::Comment {} {
+    upvar 1 data data res res
+    if {![Literal comment 0]} return ;
+    if {![String 0]} return ;
+    Literal \;
+    Def comment
+    return
+}
+
+proc ::rcsparser::Expand {} {
+    upvar 1 data data res res
+    if {![Literal expand 0]} return ;
+    if {![String 0]} return ;
+    Literal \;
+    Def expand
+    return
+}
+
+proc ::rcsparser::Date {} {
+    upvar 1 data data res res
+    Literal date ; Num 1 ; Literal \;
+    return
+}
+
+proc ::rcsparser::Author {} {
+    upvar 1 data data res res
+    Literal author ; Skip ; Literal \;
+    return
+}
+
+proc ::rcsparser::State {} {
+    upvar 1 data data res res
+    Literal state ; Skip ; Literal \;
+    return
+}
+
+proc ::rcsparser::Branches {} {
+    upvar 1 data data res res
+    Literal branches ; Skip ; Literal \;
+    return
+}
+
+proc ::rcsparser::NextRev {} {
+    upvar 1 data data res res
+    Literal next ; Skip ; Literal \;
+    return
+}
+
+proc ::rcsparser::Log {} {
+    upvar 1 data data res res
+    IsIdent ; Literal log ; String 1 ; Map commit
+    return
+}
+
+proc ::rcsparser::Text {} {
+    upvar 1 data data res res
+    Literal text ; String 1
+    return
+}
+
+proc ::rcsparser::Ident {} {
+    upvar 1 data data res res
+
+    #puts I@?<[string range $data 0 10]...>
+
+    if {[regexp -indices -- {^\s*;\s*} $data]} {
+	return 0
+    } elseif {![regexp -indices -- {^\s*([^:]*)\s*:\s*} $data match val]} {
+	return 0
+    }
+
+    Get $val ; IsIdent
+    Next
+    return 1
+}
+
+proc ::rcsparser::Literal {name {required 1}} {
+    upvar 1 data data res res
+    if {![regexp -indices -- "^\\s*$name\\s*" $data match]} {
+	if {$required} {
+	    return -code error "Expected '$name' @ '[string range $data 0 30]...'"
+	}
+	return 0
+    }
+
+    Next
+    return 1
+}
+
+proc ::rcsparser::String {{required 1}} {
+    upvar 1 data data res res
+
+    if {![regexp -indices -- {^\s*@(([^@]*(@@)*)*)@\s*} $data match val]} {
+	if {$required} {
+	    return -code error "Expected string @ '[string range $data 0 30]...'"
+	}
+	return 0
+    }
+
+    Get $val
+    Next
+    return 1
+}
+
+proc ::rcsparser::Num {required} {
+    upvar 1 data data res res
+    if {![regexp -indices -- {^\s*((\d|\.)+)\s*} $data match val]} {
+	if {$required} {
+	    return -code error "Expected id @ '[string range $data 0 30]...'"
+	}
+	return 0
+    }
+
+    Get $val
+    Next
+    return 1
+}
+
+proc ::rcsparser::Skip {} {
+    upvar 1 data data res res
+    regexp -indices -- {^\s*[^;]*\s*} $data match
+    Next
+    return
+}
+
+proc ::rcsparser::Def {key} {
+    upvar 1 data data res res
+    set res($key) $res(lastval)
+    unset res(lastval)
+    return
+}
+
+proc ::rcsparser::Map {key} {
+    upvar 1 data data res res
+    lappend res($key) $res(id) $res(lastval)
+    #puts Map($res(id))=($res(lastval))
+    unset res(lastval)
+    unset res(id)
+    return
+}
+
+proc ::rcsparser::IsIdent {} {
+    upvar 1 data data res res
+    set res(id) $res(lastval)
+    unset res(lastval)
+    return
+}
+
+proc ::rcsparser::Get {val} {
+    upvar 1 data data res res
+    foreach {s e} $val break
+    set res(lastval) [string range $data $s $e]
+    #puts G|$res(lastval)
+    return
+}
+
+proc ::rcsparser::Next {} {
+    upvar 1 match match data data
+    foreach {s e} $match break ; incr e
+    set data [string range $data $e end]
+    return
+}
+
+# -----------------------------------------------------------------------------
+# Ready
+
+package provide rcsparser 1.0
+return