Patchwork [V2] contrib: add a tool to detect cyclic references

login
register
mail settings
Submitter Jun Wu
Date May 28, 2017, 5:02 a.m.
Message ID <869505c8d99d438f1c7d.1495947774@x1c>
Download mbox | patch
Permalink /patch/20965/
State Changes Requested
Headers show

Comments

Jun Wu - May 28, 2017, 5:02 a.m.
# HG changeset patch
# User Jun Wu <quark@fb.com>
# Date 1495946922 25200
#      Sat May 27 21:48:42 2017 -0700
# Node ID 869505c8d99d438f1c7d0775645d4ac28d70c15e
# Parent  ad37c569ec8121a99e4cd9da52365b114b82e744
# Available At https://bitbucket.org/quark-zju/hg-draft
#              hg pull https://bitbucket.org/quark-zju/hg-draft -r 869505c8d99d
contrib: add a tool to detect cyclic references

This patch adds a simple tool to find out all cyclic references for CPython.

The added test shows some common cases where cycles are created.
Gregory Szorc - May 28, 2017, 6:45 p.m.
On Sat, May 27, 2017 at 10:02 PM, Jun Wu <quark@fb.com> wrote:

> # HG changeset patch
> # User Jun Wu <quark@fb.com>
> # Date 1495946922 25200
> #      Sat May 27 21:48:42 2017 -0700
> # Node ID 869505c8d99d438f1c7d0775645d4ac28d70c15e
> # Parent  ad37c569ec8121a99e4cd9da52365b114b82e744
> # Available At https://bitbucket.org/quark-zju/hg-draft
> #              hg pull https://bitbucket.org/quark-zju/hg-draft -r
> 869505c8d99d
> contrib: add a tool to detect cyclic references
>
> This patch adds a simple tool to find out all cyclic references for
> CPython.
>
> The added test shows some common cases where cycles are created.
>

This seems reasonable. However, the tool as it stands isn't very usable
because you must include it in sys.path and import it. I'd favor adding
this as mercurial/leaks.py (or possibly adding to mercurial/profiling.py)
so it is trivial to import and use the context manager. That, or refactor
it as an extension and have the extension stash a reference to the context
manager under mercurial.util.leakdetect (or similar) - again so it is easy
to use.


>
> diff --git a/contrib/leakdetect.py b/contrib/leakdetect.py
> new file mode 100644
> --- /dev/null
> +++ b/contrib/leakdetect.py
> @@ -0,0 +1,147 @@
> +# leakdetect.py - CPython object cyclic references analysis
> +#
> +# Copyright 2017 Facebook, Inc.
> +#
> +# This software may be used and distributed according to the terms of the
> +# GNU General Public License version 2 or any later version.
> +
> +from __future__ import absolute_import
> +
> +import contextlib
> +import gc
> +import os
> +import platform
> +import sys
> +
> +def stronglyconnectedcomponents(vertices, edges):
> +    """yield strongly connected components in a directed graph"""
> +    ignored = set() # vertices outputted already
> +    stack = []      # vertice DFS stack
> +    index = {}      # stack[index[n]] == n; later visit: bigger index
> +    lowlink = {}    # see Tarjan's strongly connected components algorithm
> +    execstack = []  # enumerate execution stack to bypass Python stack
> limit
> +
> +    def visit(v):
> +        index[v] = lowlink[v] = len(stack)
> +        stack.append(v)
> +        execstack.append((checkroot, (v,)))
> +        for w in edges.get(v, []):
> +            execstack.append((updatelowlink, (v, w)))
> +            if w not in index:
> +                execstack.append((visit, (w,)))
> +
> +    def updatelowlink(v, w):
> +        lowlink[v] = min(lowlink[v], lowlink[w])
> +
> +    def checkroot(v):
> +        i = index[v]
> +        if lowlink[v] == i:
> +            component = stack[i:]
> +            del stack[i:]
> +            if component[0] not in ignored:
> +                ignored.update(component)
> +                return component
> +
> +    for v in vertices:
> +        if v not in ignored:
> +            index.clear()
> +            lowlink.clear()
> +            execstack.append((visit, (v,)))
> +            while execstack:
> +                func, args = execstack.pop()
> +                result = func(*args)
> +                if result is not None:
> +                    yield result
> +
> +def objectreferencegraph(objects):
> +    """return (vertices, edges), referenced relationship among objects
> +
> +    vertices contain object IDs.
> +    """
> +    idset = set(id(o) for o in objects)
> +    edges = {}
> +    for o in objects:
> +        edges[id(o)] = [id(r) for r in gc.get_referents(o) if id(r) in
> idset]
> +    return idset, edges
> +
> +def describe(obj):
> +    """short and easy to understand text to help identify an object
> +
> +    similar to repr(), but avoids huge outputs in all cases.
> +    """
> +    if isinstance(obj, tuple):
> +        return '<tuple (%s)>' % ', '.join(describe(v) for v in obj)
> +    elif isinstance(obj, type):
> +        return '<type %s>' % obj.__name__
> +    elif isinstance(obj, type(describe)): # function
> +        code = obj.func_code
> +        path = os.path.basename(code.co_filename)
> +        return ('<function %s@%s:%s>' % (obj.func_name, path,
> +                                         code.co_firstlineno))
> +    elif isinstance(obj, type(os.environ.get)): # instancemethod
> +        return repr(obj)
> +    else:
> +        if type(obj) in (list, set, dict):
> +            tryrepr = repr(obj)
> +            if len(tryrepr) < 70:
> +                return tryrepr
> +        cellcontent = getattr(obj, 'cell_contents', None)
> +        if cellcontent is not None:
> +            return '<cell %s>' % describe(cellcontent)
> +        typename = type(obj).__name__
> +        if typename == 'getset_descriptor':
> +            return repr(obj)
> +        return '<%s>' % typename
> +
> +def analysegarbage(garbage, out):
> +    """find cyclic references among garbage objects and report them"""
> +    out.write('--- leak detection report ---\n'
> +              '%d objects not freed by reference counting\n' %
> len(garbage))
> +    vertices, edges = objectreferencegraph(garbage)
> +    groups = []
> +    for component in stronglyconnectedcomponents(vertices, edges):
> +        if len(component) == 1:
> +            v = component[0]
> +            if v in edges[v]: # self-cycle
> +                groups.append((v,))
> +        else:
> +            groups.append(component)
> +
> +    objmap = {id(o): o for o in garbage}
> +    outputs = []
> +    for g in groups:
> +        output = 'cycle of %d objects:\n' % len(g)
> +        output += ''.join(sorted('  %s\n' % describe(objmap[i]) for i in
> g))
> +        outputs.append(output)
> +    out.write(''.join(sorted(outputs)))
> +
> +if platform.python_implementation() == 'CPython':
> +    # only verified to work in CPython
> +    @contextlib.contextmanager
> +    def leakdetect(out=sys.stderr):
> +        # collect garbage so previous garbage won't affect us
> +        gc.set_debug(gc.get_debug() & ~gc.DEBUG_SAVEALL)
> +        gc.collect()
> +        # make gc.collect put objects to gc.garbage instead of releasing
> them
> +        gc.set_debug(gc.get_debug() | gc.DEBUG_SAVEALL)
> +        gc.disable()
> +        try:
> +            yield
> +        finally:
> +            # populate gc.garbage
> +            gc.collect()
> +            garbage = gc.garbage
> +            # restore the default behavior
> +            gc.set_debug(gc.get_debug() & ~gc.DEBUG_SAVEALL)
>

Wouldn't this be semantically better if it restored the original settings
stashed away from a gc.get_debug()?


> +            if garbage:
> +                analysegarbage(garbage, out)
> +                del gc.garbage[:]
> +
> +    disabled = False
> +else:
> +    # fallback to a no-op gracefully
> +    @contextlib.contextmanager
> +    def leakdetect(out=sys.stderr):
> +        yield
> +
> +    disabled = True
> diff --git a/tests/test-leakdetect.py b/tests/test-leakdetect.py
> new file mode 100644
> --- /dev/null
> +++ b/tests/test-leakdetect.py
> @@ -0,0 +1,50 @@
> +from __future__ import absolute_import
> +
> +import inspect
> +import os
> +import sys
> +
> +dirname = os.path.dirname
> +sys.path.insert(0, os.path.join(dirname(dirname(__file__)), 'contrib'))
> +
> +import leakdetect
> +
> +if leakdetect.disabled:
> +    sys.stdout.write('skipped: missing feature: cpython\n')
> +    sys.exit(80)
> +
> +def leaklistdict():
> +    a = []
> +    b = {}
> +    b['a'] = a
> +    a.append(b)
> +    a.append([[[[['65537']]]]])
> +    c = [('long text' * 100,)]
> +    c.append(c)
> +
> +def leakframe():
> +    frame = inspect.currentframe()
> +    a = []
> +    a.append(a)
> +    b = [[['some text']]]
> +
> +def leakfunc():
> +    def func(x):
> +        return func(x + 1)
> +    f = lambda x: g(x)
> +    g = lambda x: f(x)
> +
> +def leakclass():
> +    # http://bugs.python.org/msg60985
> +    class A(object):
> +        pass
> +    B = type('B', (object,), {})
> +    a = A()
> +    b = B()
> +    class C(a.__class__):
> +        pass
> +    a.__class__ = C
> +
> +for leak in [leaklistdict, leakframe, leakfunc, leakclass]:
> +    with leakdetect.leakdetect():
> +        leak()
> diff --git a/tests/test-leakdetect.py.out b/tests/test-leakdetect.py.out
> new file mode 100644
> --- /dev/null
> +++ b/tests/test-leakdetect.py.out
> @@ -0,0 +1,43 @@
> +--- leak detection report ---
> +9 objects not freed by reference counting
> +cycle of 1 objects:
> +  <list>
> +cycle of 2 objects:
> +  [{'a': [...]}, [[[[['65537']]]]]]
> +  {'a': [{...}, [[[[['65537']]]]]]}
> +--- leak detection report ---
> +5 objects not freed by reference counting
> +cycle of 1 objects:
> +  <frame>
> +cycle of 1 objects:
> +  [[...]]
> +--- leak detection report ---
> +9 objects not freed by reference counting
> +cycle of 3 objects:
> +  <cell <function func@test-leakdetect.py:32>>
> +  <function func@test-leakdetect.py:32>
> +  <tuple (<cell <function func@test-leakdetect.py:32>>)>
> +cycle of 6 objects:
> +  <cell <function <lambda>@test-leakdetect.py:34>>
> +  <cell <function <lambda>@test-leakdetect.py:35>>
> +  <function <lambda>@test-leakdetect.py:34>
> +  <function <lambda>@test-leakdetect.py:35>
> +  <tuple (<cell <function <lambda>@test-leakdetect.py:34>>)>
> +  <tuple (<cell <function <lambda>@test-leakdetect.py:35>>)>
> +--- leak detection report ---
> +15 objects not freed by reference counting
> +cycle of 2 objects:
> +  <tuple (<type C>, <type A>, <type object>)>
> +  <type C>
> +cycle of 5 objects:
> +  <attribute '__dict__' of 'A' objects>
> +  <attribute '__weakref__' of 'A' objects>
> +  <dict>
> +  <tuple (<type A>, <type object>)>
> +  <type A>
> +cycle of 5 objects:
> +  <attribute '__dict__' of 'B' objects>
> +  <attribute '__weakref__' of 'B' objects>
> +  <dict>
> +  <tuple (<type B>, <type object>)>
> +  <type B>
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
>
Jun Wu - May 28, 2017, 7:44 p.m.
Excerpts from Gregory Szorc's message of 2017-05-28 11:45:04 -0700:
> This seems reasonable. However, the tool as it stands isn't very usable
> because you must include it in sys.path and import it. I'd favor adding
> this as mercurial/leaks.py (or possibly adding to mercurial/profiling.py)
> so it is trivial to import and use the context manager. That, or refactor
> it as an extension and have the extension stash a reference to the context
> manager under mercurial.util.leakdetect (or similar) - again so it is easy
> to use.

I like the profiling.py idea. It's convenient for users. The only downside is
the detection scope is narrowed down, so cycles in the ui object may be
missed. (In fact, OrderedDict was found to create cycles.) That said, I
think our main focus is repo and related complex objects so the
"maybeprofile" scope looks reasonable.

I'll move the code to profiling.py if there is no objection.

> [...] 
> Wouldn't this be semantically better if it restored the original settings
> stashed away from a gc.get_debug()?

Yes. This part could be cleaned up a bit.

> [...]

Patch

diff --git a/contrib/leakdetect.py b/contrib/leakdetect.py
new file mode 100644
--- /dev/null
+++ b/contrib/leakdetect.py
@@ -0,0 +1,147 @@ 
+# leakdetect.py - CPython object cyclic references analysis
+#
+# Copyright 2017 Facebook, Inc.
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+from __future__ import absolute_import
+
+import contextlib
+import gc
+import os
+import platform
+import sys
+
+def stronglyconnectedcomponents(vertices, edges):
+    """yield strongly connected components in a directed graph"""
+    ignored = set() # vertices outputted already
+    stack = []      # vertice DFS stack
+    index = {}      # stack[index[n]] == n; later visit: bigger index
+    lowlink = {}    # see Tarjan's strongly connected components algorithm
+    execstack = []  # enumerate execution stack to bypass Python stack limit
+
+    def visit(v):
+        index[v] = lowlink[v] = len(stack)
+        stack.append(v)
+        execstack.append((checkroot, (v,)))
+        for w in edges.get(v, []):
+            execstack.append((updatelowlink, (v, w)))
+            if w not in index:
+                execstack.append((visit, (w,)))
+
+    def updatelowlink(v, w):
+        lowlink[v] = min(lowlink[v], lowlink[w])
+
+    def checkroot(v):
+        i = index[v]
+        if lowlink[v] == i:
+            component = stack[i:]
+            del stack[i:]
+            if component[0] not in ignored:
+                ignored.update(component)
+                return component
+
+    for v in vertices:
+        if v not in ignored:
+            index.clear()
+            lowlink.clear()
+            execstack.append((visit, (v,)))
+            while execstack:
+                func, args = execstack.pop()
+                result = func(*args)
+                if result is not None:
+                    yield result
+
+def objectreferencegraph(objects):
+    """return (vertices, edges), referenced relationship among objects
+
+    vertices contain object IDs.
+    """
+    idset = set(id(o) for o in objects)
+    edges = {}
+    for o in objects:
+        edges[id(o)] = [id(r) for r in gc.get_referents(o) if id(r) in idset]
+    return idset, edges
+
+def describe(obj):
+    """short and easy to understand text to help identify an object
+
+    similar to repr(), but avoids huge outputs in all cases.
+    """
+    if isinstance(obj, tuple):
+        return '<tuple (%s)>' % ', '.join(describe(v) for v in obj)
+    elif isinstance(obj, type):
+        return '<type %s>' % obj.__name__
+    elif isinstance(obj, type(describe)): # function
+        code = obj.func_code
+        path = os.path.basename(code.co_filename)
+        return ('<function %s@%s:%s>' % (obj.func_name, path,
+                                         code.co_firstlineno))
+    elif isinstance(obj, type(os.environ.get)): # instancemethod
+        return repr(obj)
+    else:
+        if type(obj) in (list, set, dict):
+            tryrepr = repr(obj)
+            if len(tryrepr) < 70:
+                return tryrepr
+        cellcontent = getattr(obj, 'cell_contents', None)
+        if cellcontent is not None:
+            return '<cell %s>' % describe(cellcontent)
+        typename = type(obj).__name__
+        if typename == 'getset_descriptor':
+            return repr(obj)
+        return '<%s>' % typename
+
+def analysegarbage(garbage, out):
+    """find cyclic references among garbage objects and report them"""
+    out.write('--- leak detection report ---\n'
+              '%d objects not freed by reference counting\n' % len(garbage))
+    vertices, edges = objectreferencegraph(garbage)
+    groups = []
+    for component in stronglyconnectedcomponents(vertices, edges):
+        if len(component) == 1:
+            v = component[0]
+            if v in edges[v]: # self-cycle
+                groups.append((v,))
+        else:
+            groups.append(component)
+
+    objmap = {id(o): o for o in garbage}
+    outputs = []
+    for g in groups:
+        output = 'cycle of %d objects:\n' % len(g)
+        output += ''.join(sorted('  %s\n' % describe(objmap[i]) for i in g))
+        outputs.append(output)
+    out.write(''.join(sorted(outputs)))
+
+if platform.python_implementation() == 'CPython':
+    # only verified to work in CPython
+    @contextlib.contextmanager
+    def leakdetect(out=sys.stderr):
+        # collect garbage so previous garbage won't affect us
+        gc.set_debug(gc.get_debug() & ~gc.DEBUG_SAVEALL)
+        gc.collect()
+        # make gc.collect put objects to gc.garbage instead of releasing them
+        gc.set_debug(gc.get_debug() | gc.DEBUG_SAVEALL)
+        gc.disable()
+        try:
+            yield
+        finally:
+            # populate gc.garbage
+            gc.collect()
+            garbage = gc.garbage
+            # restore the default behavior
+            gc.set_debug(gc.get_debug() & ~gc.DEBUG_SAVEALL)
+            if garbage:
+                analysegarbage(garbage, out)
+                del gc.garbage[:]
+
+    disabled = False
+else:
+    # fallback to a no-op gracefully
+    @contextlib.contextmanager
+    def leakdetect(out=sys.stderr):
+        yield
+
+    disabled = True
diff --git a/tests/test-leakdetect.py b/tests/test-leakdetect.py
new file mode 100644
--- /dev/null
+++ b/tests/test-leakdetect.py
@@ -0,0 +1,50 @@ 
+from __future__ import absolute_import
+
+import inspect
+import os
+import sys
+
+dirname = os.path.dirname
+sys.path.insert(0, os.path.join(dirname(dirname(__file__)), 'contrib'))
+
+import leakdetect
+
+if leakdetect.disabled:
+    sys.stdout.write('skipped: missing feature: cpython\n')
+    sys.exit(80)
+
+def leaklistdict():
+    a = []
+    b = {}
+    b['a'] = a
+    a.append(b)
+    a.append([[[[['65537']]]]])
+    c = [('long text' * 100,)]
+    c.append(c)
+
+def leakframe():
+    frame = inspect.currentframe()
+    a = []
+    a.append(a)
+    b = [[['some text']]]
+
+def leakfunc():
+    def func(x):
+        return func(x + 1)
+    f = lambda x: g(x)
+    g = lambda x: f(x)
+
+def leakclass():
+    # http://bugs.python.org/msg60985
+    class A(object):
+        pass
+    B = type('B', (object,), {})
+    a = A()
+    b = B()
+    class C(a.__class__):
+        pass
+    a.__class__ = C
+
+for leak in [leaklistdict, leakframe, leakfunc, leakclass]:
+    with leakdetect.leakdetect():
+        leak()
diff --git a/tests/test-leakdetect.py.out b/tests/test-leakdetect.py.out
new file mode 100644
--- /dev/null
+++ b/tests/test-leakdetect.py.out
@@ -0,0 +1,43 @@ 
+--- leak detection report ---
+9 objects not freed by reference counting
+cycle of 1 objects:
+  <list>
+cycle of 2 objects:
+  [{'a': [...]}, [[[[['65537']]]]]]
+  {'a': [{...}, [[[[['65537']]]]]]}
+--- leak detection report ---
+5 objects not freed by reference counting
+cycle of 1 objects:
+  <frame>
+cycle of 1 objects:
+  [[...]]
+--- leak detection report ---
+9 objects not freed by reference counting
+cycle of 3 objects:
+  <cell <function func@test-leakdetect.py:32>>
+  <function func@test-leakdetect.py:32>
+  <tuple (<cell <function func@test-leakdetect.py:32>>)>
+cycle of 6 objects:
+  <cell <function <lambda>@test-leakdetect.py:34>>
+  <cell <function <lambda>@test-leakdetect.py:35>>
+  <function <lambda>@test-leakdetect.py:34>
+  <function <lambda>@test-leakdetect.py:35>
+  <tuple (<cell <function <lambda>@test-leakdetect.py:34>>)>
+  <tuple (<cell <function <lambda>@test-leakdetect.py:35>>)>
+--- leak detection report ---
+15 objects not freed by reference counting
+cycle of 2 objects:
+  <tuple (<type C>, <type A>, <type object>)>
+  <type C>
+cycle of 5 objects:
+  <attribute '__dict__' of 'A' objects>
+  <attribute '__weakref__' of 'A' objects>
+  <dict>
+  <tuple (<type A>, <type object>)>
+  <type A>
+cycle of 5 objects:
+  <attribute '__dict__' of 'B' objects>
+  <attribute '__weakref__' of 'B' objects>
+  <dict>
+  <tuple (<type B>, <type object>)>
+  <type B>