Patchwork contrib/synthwiderepo: generate random repos with arbitrarily many files

login
register
mail settings
Submitter adgar@google.com
Date Aug. 25, 2014, 9:27 p.m.
Message ID <4e11710e8d2fdaf35858.1409002038@adgar.nyc.corp.google.com>
Download mbox | patch
Permalink /patch/5599/
State Rejected
Headers show

Comments

adgar@google.com - Aug. 25, 2014, 9:27 p.m.
# HG changeset patch
# User Mike Edgar <adgar@google.com>
# Date 1408738363 14400
#      Fri Aug 22 16:12:43 2014 -0400
# Node ID 4e11710e8d2fdaf35858a02f310898294c73ef2a
# Parent  de783f2403c498ef1e20121acf178b32ec27199c
contrib/synthwiderepo: generate random repos with arbitrarily many files

Two new commands are introduced for the purpose of evaluating manifest
performance at a variety of scaling points.

The first command, analyze-wide, walks a directory tree and emits a model
of the directory structure (currently just file counts per directory).

The second command, synthesize-wide, constructs a single commit with N
files added in a directory structure statistically similar to the
distribution of files in a model created by analyze-wide.

NB: It seems possible to combine this with the existing analyze/
synthesize commands in contrib/synthrepo.py, but the options required
by the -wide variants are different enough that a merger felt kludgy.
Matt Mackall - Aug. 30, 2014, 12:17 p.m.
On Mon, 2014-08-25 at 17:27 -0400, adgar@google.com wrote:
> # HG changeset patch
> # User Mike Edgar <adgar@google.com>
> # Date 1408738363 14400
> #      Fri Aug 22 16:12:43 2014 -0400
> # Node ID 4e11710e8d2fdaf35858a02f310898294c73ef2a
> # Parent  de783f2403c498ef1e20121acf178b32ec27199c
> contrib/synthwiderepo: generate random repos with arbitrarily many files

I've asked Mike to feel free to hack synthrepo.

Patch

diff -r de783f2403c4 -r 4e11710e8d2f contrib/synthwiderepo.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/contrib/synthwiderepo.py	Fri Aug 22 16:12:43 2014 -0400
@@ -0,0 +1,184 @@ 
+# synthwiderepo.py - wide repo synthesis
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+'''synthesize a commit with many files in many directories'''
+
+import bisect, json, itertools, os, random, sys
+from mercurial import cmdutil, context, util, hg
+from mercurial.i18n import _
+from mercurial.node import hex, nullid, short
+
+cmdtable = {}
+# The @command decorator below registers each command into cmdtable,
+# which Mercurial's extension loader reads.
+command = cmdutil.command(cmdtable)
+
+@command('analyze-wide',
+         [('o', 'output', '', _('write output to given file'), _('FILE')),
+          ('r', 'root', '', _('alternate directory to analyze'), _('DIR'))],
+         _('hg analyze-wide'),
+         optionalrepo=True)
+def analyze_wide(ui, repo, root='', output='', **opts):
+    '''measure file count in each directory in a tree and write counts to a file
+
+    Both repositories and arbitrary directories may be analyzed. By default, the
+    current repository will be analyzed. To override the default (or no repository
+    is present) specify --root.
+
+    The file counts are written to a JSON file which can be used with
+    :hg:`synthesize-wide` to create a commit with synthesized files similar to the
+    analyzed directory.
+    '''
+    if repo and not root:
+        root = repo.root
+    if not output:
+        output = os.path.basename(root) + '.json'
+
+    if output == '-':
+        fp = sys.stdout
+    else:
+        fp = open(output, 'w')
+
+    def onerror(e):
+        ui.warn('os.walk error: %s\n' % e)
+
+    outjson = {'dirs': {}}
+    rootprefixlen = len(root)
+    try:
+        for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
+            dirpathfromroot = dirpath[rootprefixlen:]
+            outjson['dirs'][dirpathfromroot] = len(filenames)
+            if '.hg' in dirnames:
+                dirnames.remove('.hg')
+
+        json.dump(outjson, fp)
+    finally:
+        fp.close()
+
+@command('synthesize-wide',
+         [('c', 'count', 0, _('create given number of files'), _('COUNT')),
+          ('', 'dict', '', _('path to a dictionary of words'), _('FILE')),
+          ('', 'seed', 0, _('seed for random number generator'))],
+         _('hg synthesize-wide [OPTION].. DESCFILE'))
+def synthesize_wide(ui, repo, descpath, count=0, seed=0, **opts):
+    '''synthesize a commit adding files using a model of directory structure
+
+    The model must have been generated by :hg:`analyze-wide`. The directories in
+    the model will be renamed randomly, and files will be created in the renamed
+    directories according to the probabilities in the model.
+
+    Generated file path components are drawn randomly from a dictionary
+    containing one word per line. Use --dict to specify a specific dictionary.
+    '''
+    if not count:
+        raise util.Abort(_('the number of files to create is required'))
+    if seed:
+        random.seed(int(seed))
+
+    try:
+        fp = hg.openpath(ui, descpath)
+    except Exception, err:
+        raise util.Abort('%s: %s' % (descpath, err[0].strerror))
+    desc = json.load(fp)
+    fp.close()
+
+    newdirs = renamedirs(desc['dirs'], words)
+    dirs = cdf(newdirs)
+
+    dictfile = opts.get('dict') or '/usr/share/dict/words'
+    try:
+        fp = open(dictfile, 'rU')
+    except IOError, err:
+        raise util.Abort('%s: %s' % (dictfile, err.strerror))
+    words = [w.lower().replace("'", "") for w in fp.read().splitlines()]
+    fp.close()
+
+    random.shuffle(words)
+    # Size of dictionary is typically < 1M, so we pick filenames by iterating
+    # through each pair of words from a shuffled word list.
+    filenames = itertools.imap(lambda parts: '-'.join(reversed(parts)),
+                               itertools.product(words, words))
+
+    _synthesizing = _('synthesizing')
+    _files = _('files')
+
+    wlock = repo.wlock()
+    lock = repo.lock()
+
+    try:
+        pctx = repo['tip']
+
+        files = {}
+        for i in xrange(0, count):
+            ui.progress(_synthesizing, i, unit=_files, total=count)
+
+            # Start off with a flat repo.
+            path = pickpath(dirs, filenames)
+            while path in files:
+                path = pickpath(dirs, filenames)
+            data = '%s contents\n' % path
+            files[path] = context.memfilectx(repo, path, data)
+
+        def filectxfn(repo, memctx, path):
+            if path not in files:
+                raise IOError('Unexpected file added in synthesize-wide: %s' % path)
+            return files[path]
+
+        ui.progress(_synthesizing, None)
+        ui.status(_('committing %d synthesized files\n') % (len(files),))
+        message = 'synthesized wide repo with %d files' % (len(files),)
+        user = ui.username()
+        date = '%d %d' % util.makedate()
+        mc = context.memctx(repo, [pctx.node(), nullid], message,
+                            files.iterkeys(),
+                            filectxfn, user, date)
+        newnode = mc.commit()
+
+        hexfn = ui.debugflag and hex or short
+        newnodehex = hexfn(newnode)
+
+        ui.write(_('added commit %s\n') % (newnodehex,))
+    finally:
+        lock.release()
+        wlock.release()
+
+def renamedirs(dirs, words):
+    '''Randomly rename the directory names in the per-dir file count dict.'''
+    wordgen = itertools.cycle(words)
+    replacements = {'': ''}
+    result = {}
+    def rename(dirpath):
+        '''Recursively rename the directory and all path prefixes.
+
+        The mapping from path to renamed path is stored for all path prefixes
+        as in dynamic programming, ensuring linear runtime and consistent renaming
+        regardless of iteration order through the model.'''
+        if dirpath in replacements:
+            return replacements[dirpath]
+        head, _ = os.path.split(dirpath)
+        head = head and rename(head) or ''
+        renamed = os.path.join(head, next(wordgen))
+        replacements[dirpath] = renamed
+        return renamed
+    for dirpath, count in dirs.iteritems():
+        result[rename(dirpath.lstrip(os.sep))] = count
+    return result
+
+# TODO(adgar): Extract common synthesis module, used here and in synthrepo.py.
+def cdf(l):
+    '''Build a sampling table from a {value: count} dict.
+
+    Returns a pair (vals, cdfs): the values sorted by descending count, and
+    the running cumulative probability of each, for use with pickcdf().
+    Empty input yields ([], []).
+    '''
+    if not l:
+        return [], []
+    vals, probs = zip(*sorted(l.iteritems(), key=lambda x: x[1], reverse=True))
+    t = float(sum(probs, 0))
+    s, cdfs = 0, []
+    for v in probs:
+        s += v
+        cdfs.append(s / t)
+    return vals, cdfs
+
+def pickcdf(cdf):
+    '''Draw one value at random from a (vals, cdfs) table built by cdf(),
+    weighted by its cumulative probabilities.'''
+    return cdf[0][bisect.bisect_left(cdf[1], random.random())]
+
+def pickpath(dirs, filenames):
+    '''Return a synthesized file path: a weighted-random directory joined
+    with the next generated file name.'''
+    # json.load yields unicode; encode so the joined path is a bytestring
+    # (presumably because Mercurial paths are bytes -- verify).
+    dirname = pickcdf(dirs).encode('utf-8')
+    return os.path.join(dirname, next(filenames))