#!/usr/bin/env python
#
# A simple and fast Subversion to git conversion script.  It takes
# a svnadmin dump and uses it to build a git repository.  Subversion
# branches and tags are not intelligently converted at the moment.
#
#   2008-11-19 
#   Neil Schemenauer <nas@arctrix.com>
#
# Based on svn2bzr, with licensing terms as follows:
# Copyright (C) 2005-2007 by Canonical Ltd
#
# Written by Gustavo Niemeyer <gustavo@niemeyer.net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
import optparse
import tempfile
import logging
import marshal
import sys, os
import shutil
import anydbm
from subprocess import Popen, PIPE
try:
    import bz2
except ImportError:
    pass
import re

# Version string reported by the --version command line option.
VERSION = "0.1"
# Mail domain appended to committer names that carry no e-mail address.
# When empty, GitCreator.commit() uses the UUID of the dump instead.
DEFAULT_DOMAIN = ""

def get_logger():
    """Return the shared "svn2git" logger, configuring it on first use.

    The first call attaches a stderr handler and sets the level to INFO.
    INFO messages are printed bare; every other level is prefixed with its
    lower-cased level name (e.g. "warning: ...").  Later calls return the
    same logger without touching its configuration.
    """
    logger = logging.getLogger("svn2git")
    if hasattr(get_logger, "initialized"):
        return logger

    # Remember on the function object itself that setup already happened.
    get_logger.initialized = True

    class PrefixFormatter(logging.Formatter):
        """Formatter exposing a %(prefix)s field derived from the level."""

        def format(self, record):
            if record.levelno == logging.INFO:
                record.prefix = ""
            else:
                record.prefix = "%s: " % record.levelname.lower()
            return logging.Formatter.format(self, record)

    stderr_handler = logging.StreamHandler(sys.stderr)
    stderr_handler.setFormatter(PrefixFormatter("%(prefix)s%(message)s"))
    logger.addHandler(stderr_handler)
    logger.setLevel(logging.INFO)
    return logger


class Error(Exception):
    """Base class of the errors raised by this script."""


class FormatVersionError(Error):
    """The dump file uses a format version other than the supported one."""


class IncrementalDumpError(Error):
    """The dump refers to a path or revision that it does not contain."""


class DumpEntry(dict):
    """
    A single record of a dump file.

    The dictionary items are the record's header fields.  The slots hold
    the parsed properties (prop), where the text content lives in the dump
    file (content_pos, content_len) and the entries this one was copied
    from or is a change of (copy_from, change_from).
    """

    __slots__ = ["prop", "content_pos", "content_len",
                 "copy_from", "change_from"]

    def __init__(self):
        # Start out as an empty record with no content and no ancestry.
        self.copy_from = self.change_from = None
        self.content_pos = self.content_len = 0
        self.prop = {}

    def __repr__(self):
        fields = dict.__repr__(self)
        return "<DumpEntry %r>" % (fields,)


class Dump(object):
    """
    This class reads a dump file and stores information about it.

    Besides iterating through the entries, this class is also capable
    of providing the complete tree (a dictionary like {path: entry, ...})
    for any given revision. This is important in cases where a given
    path is not being considered, but some entry is copied from it into
    a path which is being considered.

    The mechanism used to build and store information about the whole dump
    tries to perform reasonably, without consuming an unacceptable amount
    of memory. Basically, there's an on-disk tree cache which saves a
    complete tree state (trees are path id -> dump index mappings) every
    100 revisions, or whenever the tree contains copies of previous trees.
    Then, whenever a tree has to be rebuilt for a given revision, the
    largest cached tree revision before the asked revision is taken,
    and the tree is incremented up to the asked revision. This ensures
    that the tree will never be incremented for more than 99 revisions, and
    will never "walk back" (since all trees that need copies are already
    cached).
    """

    def __init__(self, file=None, log=None):
        """Read and index the dump contained in the given file.

        @param file: Seekable file object holding the dump.  It is read
            completely here and kept for later content lookups done by
            get_entry_content().
        @param log: Logger to report progress to.  Defaults to the logger
            returned by get_logger().
        """
        self._dump = []           # [entry, ... ]

        # Index of the revision entry in the dump list.
        self._revision_index = {} # {revno: dump index, ...}

        # The start/stop indexes of entries with changes in a revision.
        self._revision_slice = {} # {revno: dump slice, ...}

        self._revision_order = [] # [revno, ... ]

        # The on-disk tree cache lives inside a private temporary
        # directory.  tempfile.mktemp() only picks a name, which leaves a
        # window for another process to create that path first; mkdtemp()
        # creates the directory itself, accessible to this user only.
        import atexit
        cache_dir = tempfile.mkdtemp('-saved-trees')
        # Some dbm backends append their own suffixes to the file name.
        # Everything they create ends up inside cache_dir, so removing the
        # directory at exit (ignoring errors) cleans up whatever is left.
        atexit.register(shutil.rmtree, cache_dir, True)
        self._tree_cache_filename = os.path.join(cache_dir, 'trees')
        self._tree_cache = anydbm.open(self._tree_cache_filename, "c")
        self._tree_cache_mem = {}        # {revno: tree, ...}
        self._tree_cache_mem_order = []  # [revno, ...], oldest first

        self._path_id = {} # {path: id, ...}
        self._id_path = {} # {id: path, ...}

        self._log = log or get_logger()

        self._file = file
        self._read()

    def __del__(self):
        os.unlink(self._tree_cache_filename)

    def _save_tree(self, revno, tree):
        self._log.debug("Saving revision %d in disk cache" % revno)
        self._tree_cache[str(revno)] = bz2.compress(marshal.dumps(tree, 2))

    def _load_tree(self, revno):
        self._log.debug("Loading revision %d from disk cache" % revno)
        return marshal.loads(bz2.decompress(self._tree_cache[str(revno)]))

    def _build_tree(self, revno):
        """Build tree state for the given revision number.

        The tree is taken from the memory or disk cache when available.
        Otherwise the newest cached tree older than revno (or an empty
        tree) is brought forward by replaying the entries of the
        revisions in between.

        @param revno: Revision number of the tree to be built.
        @return: The tree as a {path id: dump index, ...} dictionary.  The
            returned dictionary may be the very object held by the memory
            cache.
        """
        # Exact hits: memory cache first, then disk cache.
        if revno in self._tree_cache_mem:
            self._log.debug("Found revision %d in memory cache" % revno)
            return self._tree_cache_mem[revno]
        if str(revno) in self._tree_cache:
            return self._load_tree(revno)
        # Find the newest cached revision below revno; -1 means none.
        tree_revno = -1
        for cached_revno_s in self._tree_cache:
            cached_revno = int(cached_revno_s)
            if tree_revno < cached_revno < revno:
                tree_revno = cached_revno
        for cached_revno in self._tree_cache_mem:
            if tree_revno <= cached_revno < revno:
                tree_revno = cached_revno
        if tree_revno != -1:
            self._log.debug("Building revision %d based on %d" %
                            (revno, tree_revno))
            if tree_revno in self._tree_cache_mem:
                # Copy, so the replay below leaves the cached tree alone.
                tree = self._tree_cache_mem[tree_revno].copy()
            else:
                tree = self._load_tree(tree_revno)
        else:
            self._log.debug("Building revision %d from scratch" % revno)
            tree = {}
        # Replay every revision after the base tree, up to and including
        # revno.
        for current_revno in self._revision_order:
            if current_revno > revno:
                break
            elif tree_revno < current_revno:
                slice = self._revision_slice[current_revno]
                for entry_index in range(slice.start, slice.stop):
                    self._change_tree(tree, self._dump[entry_index],
                                      entry_index)
                if current_revno == revno:
                    break
        self._log.debug("Revision %d is ready" % revno)
        # Keep the memory cache small: drop the oldest tree once more
        # than three are held, then remember the one just built.
        if len(self._tree_cache_mem) > 3:
            del self._tree_cache_mem[self._tree_cache_mem_order[0]]
            del self._tree_cache_mem_order[0]
        self._tree_cache_mem[revno] = tree
        self._tree_cache_mem_order.append(revno)
        return tree

    def __iter__(self):
        return iter(self._dump)

    def get_revision(self, revno):
        return self._dump[self._revision_index[revno]]

    def get_revision_entries(self, revno, path=None, incremental=True):
        if incremental and not path:
            return self._dump[self._revision_slice[revno]]
        else:
            raise NotImplementedError

    def get_entry(self, revno, path):
        tree = self._build_tree(revno)
        return self._dump[tree[self._path_id[path]]]

    def get_entry_content(self, entry):
        if entry.content_len == 0:
            return ""
        self._file.seek(entry.content_pos)
        return self._file.read(entry.content_len)

    def get_tree(self, revno):
        tree = self._build_tree(revno)
        path_tree = {}
        id_path = self._id_path
        for tree_path_id in tree.keys():
            path_tree[id_path[tree_path_id]] = self._dump[tree[tree_path_id]]
        return path_tree

    def get_dir_tree(self, revno, path):
        tree = self._build_tree(revno)
        path_tree = {}
        id_path = self._id_path
        prefix = path+"/"
        for tree_path_id in tree.keys():
            tree_path = id_path[tree_path_id]
            if tree_path == path or tree_path.startswith(prefix):
                path_tree[tree_path] = self._dump[tree[tree_path_id]]
        return path_tree

    def _change_tree(self, tree, entry, entry_index, building=False):
        """Modify the tree with information in the given entry.

        @param tree: The tree to be modified.
        @param entry: The entry with changes to be applied to the tree.
        @param entry_index: Index of entry in self._dump.
        @param building: Whether this is building the tree for the first
            time, or is reconstructing the tree state from a cached version.
        """

        node_action = entry["node-action"]
        node_path = entry["node-path"]

        path_id = self._path_id
        id_path = self._id_path

        if node_path not in path_id:
            new_id = len(path_id)
            path_id[node_path] = new_id
            id_path[new_id] = node_path

        node_path_id = path_id[node_path]

        copied_something = False

        if node_action == "add":
            assert node_path not in tree
            tree[node_path_id] = entry_index

            node_kind = entry["node-kind"]

            if "node-copyfrom-path" in entry:

                copied_something = True

                copy_path = entry["node-copyfrom-path"]
                copy_revno = int(entry["node-copyfrom-rev"])

                copy_tree = self._build_tree(copy_revno)

                entry.copy_from = self._dump[copy_tree[path_id[copy_path]]]

                if building:
                    if "prop-content-length" not in entry:
                        entry.prop = entry.copy_from.prop
                    elif "text-content-length" not in entry:
                        entry.content_pos = entry.copy_from.content_pos
                        entry.content_len = entry.copy_from.content_len

                if node_kind == "dir":
                    # Add entries inside the directory to the tree.
                    prefix = copy_path+"/"
                    def relocate(path):
                        return os.path.join(node_path, path[len(prefix):])
                    for tree_path_id, tree_entry_index in copy_tree.items():
                        tree_path = id_path[tree_path_id]
                        if tree_path.startswith(prefix):
                            # Would we need a new entry with copy_from?
                            relocated_path = relocate(tree_path)
                            if relocated_path not in path_id:
                                new_id = len(path_id)
                                path_id[relocated_path] = new_id
                                id_path[new_id] = relocated_path
                            tree[path_id[relocated_path]] = tree_entry_index



        elif node_action == "change":

            if node_path_id not in tree:
                raise IncrementalDumpError("Dump references a missing "
                                           "path/revision")

            if building:
                entry.change_from = self._dump[tree[node_path_id]]

                if "prop-content-length" not in entry:
                    entry.prop = entry.change_from.prop
                elif "text-content-length" not in entry:
                    entry.content_pos = entry.change_from.content_pos
                    entry.content_len = entry.change_from.content_len

            tree[node_path_id] = entry_index

        elif node_action == "delete":

            if node_path_id not in tree:
                raise IncrementalDumpError("Dump references a missing "
                                           "path/revision")

            tree_entry = self._dump[tree[node_path_id]]

            if tree_entry["node-kind"] == "dir":
                prefix = node_path+"/"
                for tree_path_id in tree.keys():
                    if id_path[tree_path_id].startswith(prefix):
                        del tree[tree_path_id]

            del tree[node_path_id]

        return copied_something

    def _read(self):
        """Parse the whole dump file and index its entries.

        Every record becomes a DumpEntry appended to self._dump.  Node
        records are applied to a running tree; each time a revision is
        complete its index, entry slice and tree snapshot are recorded.

        @raise FormatVersionError: If the dump format version is not "2".
        """

        dump_file = self._file

        # Header fields and properties whose values are stored as int.
        convert_to_int = {}
        for name in ["revision-number",
                     "content-length",
                     "prop-content-length",
                     "text-content-length",
                     "node-copyfrom-rev"]:
            convert_to_int[intern(name)] = True

        revision = revision_index = None
        last_saved_len = 0
        copied_something = False

        tree = {}

        revno = -1

        while True:

            line = dump_file.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                continue

            entry = self._read_entry(dump_file, line, convert_to_int)

            # The entry was read. Now process it.

            current_index = len(self._dump)

            if "node-path" in entry:

                copied_something |= self._change_tree(tree, entry,
                                                      current_index,
                                                      building=True)

            elif "revision-number" in entry:

                # A new revision header closes the previous revision.
                if revision is not None:
                    last_saved_len = self._finish_revision(
                        revno, revision_index, current_index, tree,
                        copied_something, last_saved_len)

                revno = entry["revision-number"]
                revision = entry
                revision_index = current_index

                copied_something = False

            elif "svn-fs-dump-format-version" in entry:

                format_version = entry["svn-fs-dump-format-version"]
                if format_version != "2":
                    raise FormatVersionError(
                        "Invalid dump format version: %s" % format_version)

            self._dump.append(entry)

        # The last revision has no following header to close it, so it is
        # recorded here.  Otherwise it would be missing from the indexes.
        if revision is not None:
            self._finish_revision(revno, revision_index, len(self._dump),
                                  tree, copied_something, last_saved_len)

    def _read_entry(self, dump_file, line, convert_to_int):
        """Read one record whose first header line is the given line.

        The text content itself is not read; its position and length are
        stored in the entry and the file is advanced past it.
        """
        entry = DumpEntry()

        # Header: "Field: value" lines up to the first empty line.
        while line:

            field, value = line.split(': ', 1)
            field = intern(field.lower())

            if field in convert_to_int:
                entry[field] = int(value)
            else:
                entry[field] = value

            line = dump_file.readline().rstrip()

        # Properties: "K <len>" / key / "V <len>" / value groups closed by
        # a "PROPS-END" line.
        prop_content_length = int(entry.get("prop-content-length", 0))
        if prop_content_length:

            line = dump_file.readline().rstrip()

            while line != "PROPS-END":

                k, length = line.split(' ')
                assert k == "K"
                key = intern(dump_file.read(int(length)))

                # Skip the newline after the key, then read the "V" line.
                dump_file.readline()
                line = dump_file.readline().rstrip()

                v, length = line.split(' ')
                assert v == "V"
                v_len = int(length)
                if key in convert_to_int:
                    entry.prop[key] = int(dump_file.read(v_len))
                else:
                    entry.prop[key] = dump_file.read(v_len)

                # Skip the newline after the value, move to the next line.
                dump_file.readline()
                line = dump_file.readline().rstrip()

        entry.content_len = entry.get("text-content-length", 0)
        if entry.content_len:
            entry.content_pos = dump_file.tell()
            dump_file.seek(entry.content_len, 1)

        return entry

    def _finish_revision(self, revno, revision_index, end_index, tree,
                         copied_something, last_saved_len):
        """Record a completely read revision and cache its tree.

        @param revision_index: Dump index of the revision entry itself.
        @param end_index: Dump index one past the revision's last entry.
        @return: The updated count of indexed revisions at the time of the
            last save to the disk cache.
        """
        self._log.info("Revision %d read" % revno)
        self._log.debug("Tree has %d entries" % len(tree))

        self._revision_index[revno] = revision_index
        self._revision_slice[revno] = slice(revision_index+1, end_index)
        self._revision_order.append(revno)

        if (copied_something or
            (last_saved_len+100 <= len(self._revision_index))):
            # Save to disk after a copy, or every 100 revisions.
            last_saved_len = len(self._revision_index)
            self._save_tree(revno, tree)
        else:
            # Otherwise keep a copy in the small memory cache.
            if len(self._tree_cache_mem) > 3:
                top = self._tree_cache_mem_order[0]
                del self._tree_cache_mem[top]
                del self._tree_cache_mem_order[0]
            self._tree_cache_mem[revno] = tree.copy()
            self._tree_cache_mem_order.append(revno)

        return last_saved_len


def shellquote(s):
    """Quote a string so a POSIX shell reads it as one literal word.

    The string is wrapped in single quotes.  Each single quote inside it
    is written as: closing quote, backslash-escaped quote, opening quote.
    """
    escaped_quote = "'\\''"
    body = escaped_quote.join(s.split("'"))
    return "'%s'" % body

def git_init(root):
    """Create a git repository in root unless one already exists there.

    @param root: Directory that will hold the git work tree.
    @raise Error: If git cannot be started or "git init" fails.
    """
    if os.path.exists(os.path.join(root, '.git')):
        return
    # Run git with cwd=root instead of changing the working directory of
    # this process, which stayed changed whenever something failed between
    # the two chdir() calls.
    try:
        status = Popen(['git', 'init'], cwd=root).wait()
    except OSError:
        raise Error("could not run git init in %s: %s" %
                    (root, sys.exc_info()[1]))
    if status != 0:
        raise Error("git init failed with exit status %d" % status)

def git_commit(root, message, committer, date):
    """Stage everything under root and commit it.

    @param root: Directory of the git work tree.
    @param message: Commit message, passed to git on standard input.
    @param committer: Author in "Name <email>" form (git --author).
    @param date: Author date, passed through GIT_AUTHOR_DATE.

    A failing commit is reported on standard output, not raised.
    """
    # git is started with an argument list, an explicit environment and
    # cwd=root: no shell quoting is involved and the working directory of
    # this process is never changed.
    env = dict(os.environ)
    env['GIT_AUTHOR_DATE'] = date
    Popen(['git', 'add', '.'], cwd=root).wait()
    p = Popen(['git', 'commit', '-q', '-a', '--author', committer,
               '-F', '-'], cwd=root, env=env, stdin=PIPE)
    p.communicate(message)
    rv = p.returncode
    if rv:
        sys.stdout.write('git returned %s\n' % rv)


class GitCreator:

    def __init__(self, dump, root, authorfile):
        """Prepare the conversion of a dump into a git work tree.

        @param dump: The Dump instance to convert.
        @param root: Directory of the git work tree to be filled.
        @param authorfile: Optional name of a file mapping Subversion
            user names to git authors.  Ignored if it does not exist.
        """
        self._dump = dump
        self._root = root
        self._log = get_logger()
        self._authors = {}
        if authorfile and os.path.exists(authorfile):
            self._read_author_map(authorfile)

    def _read_author_map(self, authorfile):
        """Fill self._authors from the given file.

        NOTE(review): the file format is an assumption, as nothing in
        this script defines it.  The git-svn authors file layout is used:
        one "svnuser = Full Name <email>" per line.  Empty lines and lines
        starting with "#" are skipped.  Please confirm.
        """
        fp = open(authorfile)
        try:
            for index, line in enumerate(fp):
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                if '=' not in line:
                    self._log.warning("%s:%d: ignoring malformed author line"
                                      % (authorfile, index + 1))
                    continue
                username, author = line.split('=', 1)
                self._authors[username.strip()] = author.strip()
        finally:
            fp.close()

    def _apply_executable_bit(self, path, prop):
        executable = prop.get('svn:executable')
        if executable is None:
            return
        self._log.debug("Apply exec bit %s %s" % (path, executable))
        if executable == '*':
            os.chmod(path, os.stat(path).st_mode | 0111)
        else:
            os.chmod(path, os.stat(path).st_mode & ~0111)

    def remove(self, path):
        if os.path.isdir(path):
            shutil.rmtree(path)
        else:
            if not os.path.exists(path):
                print '**oops', path, 'does not exist'
            else:
                os.unlink(path)

    def copy(self, orig_path, orig_revno, dest_path):
        orig_entry = self._dump.get_entry(orig_revno, orig_path)
        if orig_entry["node-kind"] == "dir":
            self.copy_dir(orig_path, orig_revno, dest_path)
        else:
            self.copy_file(orig_path, orig_revno, dest_path)

    def move(self, orig_path, orig_revno, dest_path):
        self._log.debug("Moving %s %s" % (orig_path, dest_path))
        shutil.move(orig_path, dest_path)

    def add_file(self, path, content, prop={}):
        self._log.debug("Adding file: %s" % path)
        dirname = os.path.dirname(path)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        fp = open(path, "w")
        fp.write(content)
        fp.close()
        self._apply_executable_bit(path, prop)

    def copy_file(self, orig_path, orig_revno, dest_path, prop={}):
        orig_entry = self._dump.get_entry(orig_revno, orig_path)
        orig_content = self._dump.get_entry_content(orig_entry)
        self._log.debug("Copying file: %s at %d to %s" %
                        (orig_path, orig_revno, dest_path))
        fp = open(dest_path, "w")
        fp.write(orig_content)
        fp.close()
        self._apply_executable_bit(dest_path, prop)

    def add_dir(self, path):
        self._log.debug("Adding dir: %s" % path)
        if not os.path.exists(path):
            os.makedirs(path)

    def copy_dir(self, orig_path, orig_revno, dest_path):
        entries = self._dump.get_dir_tree(orig_revno, orig_path).items()
        entries.sort()
        changed = False
        for path, entry in entries:
            tail = path[len(orig_path)+1:]

            # We don't want the '/' suffix when copying the directory itself.
            if tail == "":
                copy_dest_path = dest_path
            else:
                copy_dest_path = os.path.join(dest_path, tail)

            node_kind = entry["node-kind"]
            if node_kind == "file":
                content = self._dump.get_entry_content(entry)
                self.add_file(copy_dest_path, content, prop=entry.prop)
            elif node_kind == "dir":
                self.add_dir(copy_dest_path)

    def change_file(self, path, content, prop={}):
        self._log.debug("Changing file: %s" % path)
        fp = open(path, "w")
        fp.write(content)
        fp.close()
        self._apply_executable_bit(path, prop)


    def run(self):
        """Replay the dump into the work tree, one commit per revision.

        Node entries are applied to the files below the root directory.
        When the next revision entry (or the end of the dump) is reached,
        the revision collected so far is committed.  Revision 0 is never
        committed.

        @raise Error: If an entry has an unknown node kind or action.
        """
        revision = None
        revno = None
        git_init(self._root)

        for entry in self._dump:

            if "revision-number" in entry:

                if revision is not None and revno != 0:
                    self._commit_revision(revno, revision)

                revision = entry
                self._revno = revno = revision["revision-number"]

            elif "node-path" in entry:
                self._apply_node(entry)

            elif "uuid" in entry:
                self._uuid = entry["uuid"]

        # Same guard as inside the loop: revision 0 is not committed.
        if revision is not None and revno != 0:
            self._commit_revision(revno, revision)

    def _commit_revision(self, revno, revision):
        """Commit the work tree using the properties of a revision entry."""
        self.commit(revno, revision.prop.get("svn:log", ""),
                    committer=revision.prop.get("svn:author"),
                    timestamp=revision.prop["svn:date"])

    def _apply_node(self, entry):
        """Apply one node entry (add/delete/change/replace) to the disk."""
        node_path = entry["node-path"]

        node_action = entry["node-action"]
        node_kind = entry.get("node-kind")

        if node_kind not in (None, "file", "dir"):
            raise Error("Unknown entry kind: %s" % node_kind)
        if node_action not in ("add", "delete", "change", "replace"):
            raise Error("Unknown action: %s" % node_action)

        assert not node_path.startswith('/'), node_path
        real_path = os.path.normpath(os.path.join(self._root, node_path))

        if node_action == "delete":
            self.remove(real_path)

        elif node_action == "add" or node_action == "replace":

            if node_action == "replace":
                self.remove(real_path)

            if "node-copyfrom-path" in entry:
                copy_path = entry["node-copyfrom-path"]
                copy_revno = entry["node-copyfrom-rev"]

                if node_kind == "file":
                    if "text-content-length" in entry:
                        # Copied and modified: the entry has the content.
                        content = self._dump.get_entry_content(entry)
                        self.add_file(real_path, content,
                                      prop=entry.prop)
                    else:
                        self.copy_file(copy_path, copy_revno,
                                       real_path, prop=entry.prop)
                else:
                    self.copy_dir(copy_path, copy_revno, real_path)

            elif node_kind == "file":
                content = self._dump.get_entry_content(entry)
                self.add_file(real_path, content, prop=entry.prop)

            elif node_kind == "dir":
                self.add_dir(real_path)

        elif node_action == "change":

            if node_kind != "file":
                return

            if "text-content-length" in entry:
                # The entry carries new text.  Testing for the header
                # instead of comparing content positions keeps a change
                # without any text from truncating the file.
                content = self._dump.get_entry_content(entry)
                self.change_file(real_path, content, prop=entry.prop)
            elif ("prop-content-length" in entry and
                  os.path.isfile(real_path)):
                # Property-only change: the content stays, but the
                # executable bit may have to follow svn:executable.
                self._apply_executable_bit(real_path, entry.prop)

    def commit(self, revno, message, committer, timestamp):
        """Commit the current state of the work tree as one revision.

        @param revno: Subversion revision number, used for logging.
        @param message: The commit message.
        @param committer: Subversion user name, or None.  It is mapped
            through the author map; a result without "@" is expanded to
            "name <name@domain>".
        @param timestamp: Author date handed on to git_commit().
        """
        self._log.info("Committing revision %d" % revno)
        committer = self._authors.get(committer, committer)
        committer = committer or 'nobody'
        if '@' not in committer:
            # self._uuid only exists once run() has seen a UUID record;
            # fall back to a fixed name rather than fail on a dump
            # without one.
            domain = (DEFAULT_DOMAIN or getattr(self, "_uuid", None)
                      or "localhost")
            committer = '%s <%s@%s>' % (committer, committer, domain)
        git_commit(self._root, message, committer, timestamp)


def svn2git(dump_file, output_dir, authorfile=None):
    """Convert a Subversion dump into a new git repository.

    @param dump_file: Seekable file object holding the dump.
    @param output_dir: Directory to create the repository in.  It must
        not exist yet.
    @param authorfile: Optional name of an author mapping file.
    @raise Error: If output_dir already exists.
    """
    if os.path.exists(output_dir):
        raise Error("%s already exists" % output_dir)

    # The dump is read completely before the directory is created, so a
    # dump that fails to parse leaves nothing behind.
    dump = Dump(dump_file)
    os.mkdir(output_dir)
    creator = GitCreator(dump, output_dir, authorfile)
    creator.run()


def append_filter(option, opt, value, parser):
    """optparse callback collecting include/exclude filters.

    Appends an (is_include, value) pair to the list stored under the
    option's destination, creating the list first when the current value
    is not a list.  is_include is True only for the "--include" option.
    """
    values = parser.values
    filters = getattr(values, option.dest)
    if type(filters) is not list:
        filters = []
        setattr(values, option.dest, filters)
    is_include = (opt == "--include")
    filters.append((is_include, value))


def parse_options():
    """Parse the command line.

    Prints the help text and exits with status 1 unless exactly two
    positional arguments (dump file and output directory) are given.

    @return: The optparse values object, with the positional arguments
        stored in its "args" attribute.
    """
    # The usage and help texts named svn2bzr and Bzr, left over from the
    # script this one is based on.
    parser = optparse.OptionParser("%prog [options] "
                                   "<dump file> <output dir>",
                                   version="%prog "+VERSION)
    parser.defaults["filter"] = []
    parser.add_option("--authors", dest="authorfile", metavar="FILENAME",
                      type="string",
                      help="mapping from Subversion usernames to git "
                           "author format")
    parser.add_option("--log", metavar="LEVEL",
                      help="set logging level to LEVEL (debug, info, "
                           "warning, error)", default="info")
    opts, args = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        sys.exit(1)
    opts.args = args
    return opts


def main():
    """Command line entry point: convert the given dump file.

    Dump files ending in ".gz" or ".bz2" are decompressed on the fly.
    Errors of this script and keyboard interrupts end the program with a
    short message instead of a traceback.
    """
    opts = parse_options()

    log = get_logger()
    # getLevelName() maps a known level name to its number; for an
    # unknown name it returns a string.
    level = logging.getLevelName(opts.log.upper())
    if not isinstance(level, int):
        sys.exit("error: unknown log level: %s" % opts.log)
    log.setLevel(level)

    dump_filename = opts.args[0]
    if dump_filename.endswith(".gz"):
        import gzip
        dump_file = gzip.GzipFile(dump_filename, 'rb')
    elif dump_filename.endswith(".bz2"):
        # The module level import of bz2 is allowed to fail, so import it
        # here and report its absence properly.
        try:
            import bz2
        except ImportError:
            sys.exit("error: the bz2 module is needed to read %s" %
                     dump_filename)
        dump_file = bz2.BZ2File(dump_filename, 'rb')
    else:
        dump_file = open(dump_filename, 'rb')

    try:
        try:
            svn2git(dump_file, opts.args[1], opts.authorfile)
        except Error:
            # sys.exc_info() avoids the "except Error, e" form, which
            # only Python 2 accepts.
            sys.exit("error: %s" % sys.exc_info()[1])
        except KeyboardInterrupt:
            sys.exit("Interrupted")
    finally:
        dump_file.close()

# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

