Patchwork [v2] osutil: add darwin-only version of os.listdir using cffi

login
register
mail settings
Submitter Maciej Fijalkowski
Date July 18, 2016, 12:19 p.m.
Message ID <4d330a2c13cf52d9e8f8.1468844373@brick.arcode.com>
Download mbox | patch
Permalink /patch/15935/
State Accepted
Headers show

Comments

Maciej Fijalkowski - July 18, 2016, 12:19 p.m.
# HG changeset patch
# User Maciej Fijalkowski <fijall@gmail.com>
# Date 1468227908 -7200
#      Mon Jul 11 11:05:08 2016 +0200
# Node ID 4d330a2c13cf52d9e8f80ad18b82a47a0e62eb55
# Parent  065fb34034a3849f07ca836ce2f4eb17c7f543db
osutil: add darwin-only version of os.listdir using cffi
Jun Wu - July 18, 2016, 12:30 p.m.
The code looks reasonable. I don't have OSX at hand to test now. Someone
with OSX may want to have a second pass.

Excerpts from Maciej Fijalkowski's message of 2016-07-18 14:19:33 +0200:
> # HG changeset patch
> # User Maciej Fijalkowski <fijall@gmail.com>
> # Date 1468227908 -7200
> #      Mon Jul 11 11:05:08 2016 +0200
> # Node ID 4d330a2c13cf52d9e8f80ad18b82a47a0e62eb55
> # Parent  065fb34034a3849f07ca836ce2f4eb17c7f543db
> osutil: add darwin-only version of os.listdir using cffi
> 
> diff --git a/mercurial/pure/osutil.py b/mercurial/pure/osutil.py
> --- a/mercurial/pure/osutil.py
> +++ b/mercurial/pure/osutil.py
> @@ -14,6 +14,10 @@
>  import stat as statmod
>  import sys
>  
> +from . import policy
> +modulepolicy = policy.policy
> +policynocffi = policy.policynocffi
> +
>  def _mode_to_kind(mode):
>      if statmod.S_ISREG(mode):
>          return statmod.S_IFREG
> @@ -31,7 +35,7 @@
>          return statmod.S_IFSOCK
>      return mode
>  
> -def listdir(path, stat=False, skip=None):
> +def listdirpure(path, stat=False, skip=None):
>      '''listdir(path, stat=False) -> list_of_tuples
>  
>      Return a sorted list containing information about the entries
> @@ -61,6 +65,95 @@
>              result.append((fn, _mode_to_kind(st.st_mode)))
>      return result
>  
> +ffi = None
> +if modulepolicy not in policynocffi and sys.platform == 'darwin':
> +    try:
> +        from _osutil_cffi import ffi, lib
> +    except ImportError:
> +        if modulepolicy == 'cffi': # strict cffi import
> +            raise
> +
> +if sys.platform == 'darwin' and ffi is not None:
> +    listdir_batch_size = 4096
> +    # tweakable number, only affects performance, which chunks
> +    # of bytes do we get back from getattrlistbulk
> +
> +    attrkinds = [None] * 20 # we need the max no for enum VXXX, 20 is plenty
> +
> +    attrkinds[lib.VREG] = statmod.S_IFREG
> +    attrkinds[lib.VDIR] = statmod.S_IFDIR
> +    attrkinds[lib.VLNK] = statmod.S_IFLNK
> +    attrkinds[lib.VBLK] = statmod.S_IFBLK
> +    attrkinds[lib.VCHR] = statmod.S_IFCHR
> +    attrkinds[lib.VFIFO] = statmod.S_IFIFO
> +    attrkinds[lib.VSOCK] = statmod.S_IFSOCK
> +
> +    class stat_res(object):
> +        def __init__(self, st_mode, st_mtime, st_size):
> +            self.st_mode = st_mode
> +            self.st_mtime = st_mtime
> +            self.st_size = st_size
> +
> +    tv_sec_ofs = ffi.offsetof("struct timespec", "tv_sec")
> +    buf = ffi.new("char[]", listdir_batch_size)
> +
> +    def listdirinternal(dfd, req, stat, skip):
> +        ret = []
> +        while True:
> +            r = lib.getattrlistbulk(dfd, req, buf, listdir_batch_size, 0)
> +            if r == 0:
> +                break
> +            if r == -1:
> +                raise OSError(ffi.errno, os.strerror(ffi.errno))
> +            cur = ffi.cast("val_attrs_t*", buf)
> +            for i in range(r):
> +                lgt = cur.length
> +                assert lgt == ffi.cast('uint32_t*', cur)[0]
> +                ofs = cur.name_info.attr_dataoffset
> +                str_lgt = cur.name_info.attr_length
> +                base_ofs = ffi.offsetof('val_attrs_t', 'name_info')
> +                name = str(ffi.buffer(ffi.cast("char*", cur) + base_ofs + ofs,
> +                           str_lgt - 1))
> +                tp = attrkinds[cur.obj_type]
> +                if name == "." or name == "..":
> +                    continue
> +                if skip == name and tp == statmod.S_ISDIR:
> +                    return []
> +                if stat:
> +                    mtime = cur.time.tv_sec
> +                    mode = (cur.accessmask & ~lib.S_IFMT)| tp
> +                    ret.append((name, tp, stat_res(st_mode=mode, st_mtime=mtime,
> +                                st_size=cur.datalength)))
> +                else:
> +                    ret.append((name, tp))
> +                cur += lgt
> +        return ret
> +
> +    def listdir(path, stat=False, skip=None):
> +        req = ffi.new("struct attrlist*")
> +        req.bitmapcount = lib.ATTR_BIT_MAP_COUNT
> +        req.commonattr = (lib.ATTR_CMN_RETURNED_ATTRS |
> +                          lib.ATTR_CMN_NAME |
> +                          lib.ATTR_CMN_OBJTYPE |
> +                          lib.ATTR_CMN_ACCESSMASK |
> +                          lib.ATTR_CMN_MODTIME)
> +        req.fileattr = lib.ATTR_FILE_DATALENGTH
> +        dfd = lib.open(path, lib.O_RDONLY, 0)
> +        if dfd == -1:
> +            raise OSError(ffi.errno, os.strerror(ffi.errno))
> +
> +        try:
> +            ret = listdirinternal(dfd, req, stat, skip)
> +        finally:
> +            try:
> +                lib.close(dfd)
> +            except BaseException:
> +                pass # we ignore all the errors from closing, not
> +                # much we can do about that
> +        return ret
> +else:
> +    listdir = listdirpure
> +
>  if os.name != 'nt':
>      posixfile = open
>  
> diff --git a/setup.py b/setup.py
> --- a/setup.py
> +++ b/setup.py
> @@ -320,6 +320,9 @@
>          elif self.distribution.cffi:
>              exts = []
>              # cffi modules go here
> +            if sys.platform == 'darwin':
> +                import setup_osutil_cffi
> +                exts.append(setup_osutil_cffi.ffi.distutils_extension())
>              self.distribution.ext_modules = exts
>          else:
>              h = os.path.join(get_python_inc(), 'Python.h')
> diff --git a/setup_osutil_cffi.py b/setup_osutil_cffi.py
> new file mode 100644
> --- /dev/null
> +++ b/setup_osutil_cffi.py
> @@ -0,0 +1,101 @@
> +from __future__ import absolute_import
> +
> +import cffi
> +
> +ffi = cffi.FFI()
> +ffi.set_source("_osutil_cffi", """
> +#include <sys/attr.h>
> +#include <sys/vnode.h>
> +#include <unistd.h>
> +#include <fcntl.h>
> +#include <time.h>
> +
> +typedef struct val_attrs {
> +    uint32_t          length;
> +    attribute_set_t   returned;
> +    attrreference_t   name_info;
> +    fsobj_type_t      obj_type;
> +    struct timespec   mtime;
> +    uint32_t          accessmask;
> +    off_t             datalength;
> +} __attribute__((aligned(4), packed)) val_attrs_t;
> +""", include_dirs=['mercurial'])
> +ffi.cdef('''
> +
> +typedef uint32_t attrgroup_t;
> +
> +typedef struct attrlist {
> +    uint16_t     bitmapcount; /* number of attr. bit sets in list */
> +    uint16_t   reserved;    /* (to maintain 4-byte alignment) */
> +    attrgroup_t commonattr;  /* common attribute group */
> +    attrgroup_t volattr;     /* volume attribute group */
> +    attrgroup_t dirattr;     /* directory attribute group */
> +    attrgroup_t fileattr;    /* file attribute group */
> +    attrgroup_t forkattr;    /* fork attribute group */
> +    ...;
> +};
> +
> +typedef struct attribute_set {
> +    ...;
> +} attribute_set_t;
> +
> +typedef struct attrreference {
> +    int attr_dataoffset;
> +    int attr_length;
> +    ...;
> +} attrreference_t;
> +
> +typedef struct val_attrs {
> +    uint32_t          length;
> +    attribute_set_t   returned;
> +    attrreference_t   name_info;
> +    uint32_t          obj_type;
> +    struct timespec   mtime;
> +    uint32_t          accessmask;
> +    int               datalength;
> +    ...;
> +} val_attrs_t;
> +
> +/* the exact layout of the above struct will be figured out during build time */
> +
> +typedef int ... time_t;
> +typedef int ... off_t;
> +
> +typedef struct timespec {
> +    time_t tv_sec;
> +    ...;
> +};
> +
> +int getattrlist(const char* path, struct attrlist * attrList, void * attrBuf,
> +                size_t attrBufSize, unsigned int options);
> +
> +int getattrlistbulk(int dirfd, struct attrlist * attrList, void * attrBuf,
> +                    size_t attrBufSize, uint64_t options);
> +
> +#define ATTR_BIT_MAP_COUNT ...
> +#define ATTR_CMN_NAME ...
> +#define ATTR_CMN_OBJTYPE ...
> +#define ATTR_CMN_MODTIME ...
> +#define ATTR_CMN_ACCESSMASK ...
> +#define ATTR_CMN_ERROR ...
> +#define ATTR_CMN_RETURNED_ATTRS ...
> +#define ATTR_FILE_DATALENGTH ...
> +
> +#define VREG ...
> +#define VDIR ...
> +#define VLNK ...
> +#define VBLK ...
> +#define VCHR ...
> +#define VFIFO ...
> +#define VSOCK ...
> +
> +#define S_IFMT ...
> +
> +int open(const char *path, int oflag, int perm);
> +int close(int);
> +
> +#define O_RDONLY ...
> +''')
> +
> +if __name__ == '__main__':
> +    ffi.compile()
Matt Mackall - July 19, 2016, 4:06 a.m.
On Mon, 2016-07-18 at 14:19 +0200, Maciej Fijalkowski wrote:
> # HG changeset patch
> # User Maciej Fijalkowski <fijall@gmail.com>
> # Date 1468227908 -7200
> #      Mon Jul 11 11:05:08 2016 +0200
> # Node ID 4d330a2c13cf52d9e8f80ad18b82a47a0e62eb55
> # Parent  065fb34034a3849f07ca836ce2f4eb17c7f543db
> osutil: add darwin-only version of os.listdir using cffi

I've gone ahead and queued this to make forward progress, but I think we need to
look harder at how some of the things here work.

It seems like we could have a fair amount of cffi definitions, and cluttering
the root directory (or mercurial/) with them is probably not a great long-term
solution. We could probably add a cffi/ directory under mercurial/ to parallel
pure/.

-- 
Mathematics is the supreme nostalgia of our time.
Maciej Fijalkowski - July 19, 2016, 6:46 a.m.
On Tue, Jul 19, 2016 at 6:06 AM, Matt Mackall <mpm@selenic.com> wrote:
> On Mon, 2016-07-18 at 14:19 +0200, Maciej Fijalkowski wrote:
>> # HG changeset patch
>> # User Maciej Fijalkowski <fijall@gmail.com>
>> # Date 1468227908 -7200
>> #      Mon Jul 11 11:05:08 2016 +0200
>> # Node ID 4d330a2c13cf52d9e8f80ad18b82a47a0e62eb55
>> # Parent  065fb34034a3849f07ca836ce2f4eb17c7f543db
>> osutil: add darwin-only version of os.listdir using cffi
>
> I've gone ahead and queued this to make forward progress, but I think we need to
> look harder at how some of the things here work.
>
> It seems like we could have a fair amount of cffi definitions, and cluttering
> the root directory (or mercurial/) with them is probably not a great long-term
> solution. We could probably add a cffi/ directory under mercurial/ to parallel
> pure/.
>
> --
> Mathematics is the supreme nostalgia of our time.
>

Thanks Matt!

I lack the authority to make calls where you want those things :-) I'm
very happy to do the actual technical work involved in moving things
around and making sure still works though. Please let me know about
the decision and I'll do it.

Patch

diff --git a/mercurial/pure/osutil.py b/mercurial/pure/osutil.py
--- a/mercurial/pure/osutil.py
+++ b/mercurial/pure/osutil.py
@@ -14,6 +14,10 @@ 
 import stat as statmod
 import sys
 
+from . import policy
+modulepolicy = policy.policy
+policynocffi = policy.policynocffi
+
 def _mode_to_kind(mode):
     if statmod.S_ISREG(mode):
         return statmod.S_IFREG
@@ -31,7 +35,7 @@ 
         return statmod.S_IFSOCK
     return mode
 
-def listdir(path, stat=False, skip=None):
+def listdirpure(path, stat=False, skip=None):
     '''listdir(path, stat=False) -> list_of_tuples
 
     Return a sorted list containing information about the entries
@@ -61,6 +65,95 @@ 
             result.append((fn, _mode_to_kind(st.st_mode)))
     return result
 
+ffi = None
+if modulepolicy not in policynocffi and sys.platform == 'darwin':
+    try:
+        from _osutil_cffi import ffi, lib
+    except ImportError:
+        if modulepolicy == 'cffi': # strict cffi import
+            raise
+
+if sys.platform == 'darwin' and ffi is not None:
+    listdir_batch_size = 4096
+    # tweakable number, only affects performance, which chunks
+    # of bytes do we get back from getattrlistbulk
+
+    attrkinds = [None] * 20 # we need the max no for enum VXXX, 20 is plenty
+
+    attrkinds[lib.VREG] = statmod.S_IFREG
+    attrkinds[lib.VDIR] = statmod.S_IFDIR
+    attrkinds[lib.VLNK] = statmod.S_IFLNK
+    attrkinds[lib.VBLK] = statmod.S_IFBLK
+    attrkinds[lib.VCHR] = statmod.S_IFCHR
+    attrkinds[lib.VFIFO] = statmod.S_IFIFO
+    attrkinds[lib.VSOCK] = statmod.S_IFSOCK
+
+    class stat_res(object):
+        def __init__(self, st_mode, st_mtime, st_size):
+            self.st_mode = st_mode
+            self.st_mtime = st_mtime
+            self.st_size = st_size
+
+    tv_sec_ofs = ffi.offsetof("struct timespec", "tv_sec")
+    buf = ffi.new("char[]", listdir_batch_size)
+
+    def listdirinternal(dfd, req, stat, skip):
+        ret = []
+        while True:
+            r = lib.getattrlistbulk(dfd, req, buf, listdir_batch_size, 0)
+            if r == 0:
+                break
+            if r == -1:
+                raise OSError(ffi.errno, os.strerror(ffi.errno))
+            cur = ffi.cast("val_attrs_t*", buf)
+            for i in range(r):
+                lgt = cur.length
+                assert lgt == ffi.cast('uint32_t*', cur)[0]
+                ofs = cur.name_info.attr_dataoffset
+                str_lgt = cur.name_info.attr_length
+                base_ofs = ffi.offsetof('val_attrs_t', 'name_info')
+                name = str(ffi.buffer(ffi.cast("char*", cur) + base_ofs + ofs,
+                           str_lgt - 1))
+                tp = attrkinds[cur.obj_type]
+                if name == "." or name == "..":
+                    continue
+                if skip == name and tp == statmod.S_ISDIR:
+                    return []
+                if stat:
+                    mtime = cur.time.tv_sec
+                    mode = (cur.accessmask & ~lib.S_IFMT)| tp
+                    ret.append((name, tp, stat_res(st_mode=mode, st_mtime=mtime,
+                                st_size=cur.datalength)))
+                else:
+                    ret.append((name, tp))
+                cur += lgt
+        return ret
+
+    def listdir(path, stat=False, skip=None):
+        req = ffi.new("struct attrlist*")
+        req.bitmapcount = lib.ATTR_BIT_MAP_COUNT
+        req.commonattr = (lib.ATTR_CMN_RETURNED_ATTRS |
+                          lib.ATTR_CMN_NAME |
+                          lib.ATTR_CMN_OBJTYPE |
+                          lib.ATTR_CMN_ACCESSMASK |
+                          lib.ATTR_CMN_MODTIME)
+        req.fileattr = lib.ATTR_FILE_DATALENGTH
+        dfd = lib.open(path, lib.O_RDONLY, 0)
+        if dfd == -1:
+            raise OSError(ffi.errno, os.strerror(ffi.errno))
+
+        try:
+            ret = listdirinternal(dfd, req, stat, skip)
+        finally:
+            try:
+                lib.close(dfd)
+            except BaseException:
+                pass # we ignore all the errors from closing, not
+                # much we can do about that
+        return ret
+else:
+    listdir = listdirpure
+
 if os.name != 'nt':
     posixfile = open
 
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -320,6 +320,9 @@ 
         elif self.distribution.cffi:
             exts = []
             # cffi modules go here
+            if sys.platform == 'darwin':
+                import setup_osutil_cffi
+                exts.append(setup_osutil_cffi.ffi.distutils_extension())
             self.distribution.ext_modules = exts
         else:
             h = os.path.join(get_python_inc(), 'Python.h')
diff --git a/setup_osutil_cffi.py b/setup_osutil_cffi.py
new file mode 100644
--- /dev/null
+++ b/setup_osutil_cffi.py
@@ -0,0 +1,101 @@ 
+from __future__ import absolute_import
+
+import cffi
+
+ffi = cffi.FFI()
+ffi.set_source("_osutil_cffi", """
+#include <sys/attr.h>
+#include <sys/vnode.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <time.h>
+
+typedef struct val_attrs {
+    uint32_t          length;
+    attribute_set_t   returned;
+    attrreference_t   name_info;
+    fsobj_type_t      obj_type;
+    struct timespec   mtime;
+    uint32_t          accessmask;
+    off_t             datalength;
+} __attribute__((aligned(4), packed)) val_attrs_t;
+""", include_dirs=['mercurial'])
+ffi.cdef('''
+
+typedef uint32_t attrgroup_t;
+
+typedef struct attrlist {
+    uint16_t     bitmapcount; /* number of attr. bit sets in list */
+    uint16_t   reserved;    /* (to maintain 4-byte alignment) */
+    attrgroup_t commonattr;  /* common attribute group */
+    attrgroup_t volattr;     /* volume attribute group */
+    attrgroup_t dirattr;     /* directory attribute group */
+    attrgroup_t fileattr;    /* file attribute group */
+    attrgroup_t forkattr;    /* fork attribute group */
+    ...;
+};
+
+typedef struct attribute_set {
+    ...;
+} attribute_set_t;
+
+typedef struct attrreference {
+    int attr_dataoffset;
+    int attr_length;
+    ...;
+} attrreference_t;
+
+typedef struct val_attrs {
+    uint32_t          length;
+    attribute_set_t   returned;
+    attrreference_t   name_info;
+    uint32_t          obj_type;
+    struct timespec   mtime;
+    uint32_t          accessmask;
+    int               datalength;
+    ...;
+} val_attrs_t;
+
+/* the exact layout of the above struct will be figured out during build time */
+
+typedef int ... time_t;
+typedef int ... off_t;
+
+typedef struct timespec {
+    time_t tv_sec;
+    ...;
+};
+
+int getattrlist(const char* path, struct attrlist * attrList, void * attrBuf,
+                size_t attrBufSize, unsigned int options);
+
+int getattrlistbulk(int dirfd, struct attrlist * attrList, void * attrBuf,
+                    size_t attrBufSize, uint64_t options);
+
+#define ATTR_BIT_MAP_COUNT ...
+#define ATTR_CMN_NAME ...
+#define ATTR_CMN_OBJTYPE ...
+#define ATTR_CMN_MODTIME ...
+#define ATTR_CMN_ACCESSMASK ...
+#define ATTR_CMN_ERROR ...
+#define ATTR_CMN_RETURNED_ATTRS ...
+#define ATTR_FILE_DATALENGTH ...
+
+#define VREG ...
+#define VDIR ...
+#define VLNK ...
+#define VBLK ...
+#define VCHR ...
+#define VFIFO ...
+#define VSOCK ...
+
+#define S_IFMT ...
+
+int open(const char *path, int oflag, int perm);
+int close(int);
+
+#define O_RDONLY ...
+''')
+
+if __name__ == '__main__':
+    ffi.compile()