From 4909408f1d492a74d987047e9cb8af781c4c4b6d Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 6 Mar 2017 11:06:01 +0200 Subject: [PATCH 1/2] bpo-25996: Added support of file descriptors in os.scandir() on Unix. os.fwalk() is sped up by 2 times by using os.scandir(). --- Doc/library/os.rst | 10 +++- Doc/whatsnew/3.7.rst | 10 ++++ Lib/os.py | 39 +++++++----- Lib/test/test_os.py | 42 +++++++++++-- Misc/NEWS | 3 + Modules/clinic/posixmodule.c.h | 4 +- Modules/posixmodule.c | 105 ++++++++++++++++++++++++++------- 7 files changed, 168 insertions(+), 45 deletions(-) diff --git a/Doc/library/os.rst b/Doc/library/os.rst index 974ab2d481e2107..5d45683d0634e4e 100644 --- a/Doc/library/os.rst +++ b/Doc/library/os.rst @@ -2022,6 +2022,9 @@ features: attributes of each :class:`os.DirEntry` will be ``bytes``; in all other circumstances, they will be of type ``str``. + This function can also support :ref:`specifying a file descriptor + `; the file descriptor must refer to a directory. + The :func:`scandir` iterator supports the :term:`context manager` protocol and has the following method: @@ -2068,6 +2071,9 @@ features: The function accepts a :term:`path-like object`. + .. versionchanged:: 3.7 + Added support for :ref:`file descriptors ` on Unix. + .. class:: DirEntry @@ -2107,7 +2113,9 @@ features: The entry's full path name: equivalent to ``os.path.join(scandir_path, entry.name)`` where *scandir_path* is the :func:`scandir` *path* argument. The path is only absolute if the :func:`scandir` *path* - argument was absolute. + argument was absolute. If the :func:`scandir` *path* + argument was a :ref:`file descriptor `, the :attr:`path` + attribute is the same as the :attr:`name` attribute. The :attr:`path` attribute will be ``bytes`` if the :func:`scandir` *path* argument is of type ``bytes`` and ``str`` otherwise. Use diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index 5c5ca147e3adf51..940148d9c9f03c3 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -97,6 +97,12 @@ New Modules Improved Modules ================ +os +-- + +Added support for :ref:`file descriptors ` in :func:`~os.scandir` +on Unix. (Contributed by Serhiy Storchaka in :issue:`25996`.) + unittest.mock ------------- @@ -127,6 +133,10 @@ Optimizations in method calls being faster up to 20%. (Contributed by Yury Selivanov and INADA Naoki in :issue:`26110`.) +* The :func:`os.fwalk` function has been sped up by 2 times. This was done + using the :func:`os.scandir` function. + (Contributed by Serhiy Storchaka in :issue:`25996`.) + Build and C API Changes ======================= diff --git a/Lib/os.py b/Lib/os.py index fa06f3937ba5051..0b62bc27ed8d933 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -129,6 +129,7 @@ def _add(str, fn): _add("HAVE_FCHMOD", "chmod") _add("HAVE_FCHOWN", "chown") _add("HAVE_FDOPENDIR", "listdir") + _add("HAVE_FDOPENDIR", "scandir") _add("HAVE_FEXECVE", "execve") _set.add(stat) # fstat always works _add("HAVE_FTRUNCATE", "truncate") @@ -416,7 +417,7 @@ def walk(top, topdown=True, onerror=None, followlinks=False): __all__.append("walk") -if {open, stat} <= supports_dir_fd and {listdir, stat} <= supports_fd: +if {open, stat} <= supports_dir_fd and {scandir, stat} <= supports_fd: def fwalk(top=".", topdown=True, onerror=None, *, follow_symlinks=False, dir_fd=None): """Directory tree generator. @@ -455,7 +456,8 @@ def fwalk(top=".", topdown=True, onerror=None, *, follow_symlinks=False, dir_fd= top = fspath(top) # Note: To guard against symlink races, we use the standard # lstat()/open()/fstat() trick. - orig_st = stat(top, follow_symlinks=False, dir_fd=dir_fd) + if not follow_symlinks: + orig_st = stat(top, follow_symlinks=False, dir_fd=dir_fd) topfd = open(top, O_RDONLY, dir_fd=dir_fd) try: if (follow_symlinks or (st.S_ISDIR(orig_st.st_mode) and @@ -469,33 +471,40 @@ def _fwalk(topfd, toppath, topdown, onerror, follow_symlinks): # necessary, it can be adapted to only require O(1) FDs, see issue # #13734. - names = listdir(topfd) - dirs, nondirs = [], [] - for name in names: + scandir_it = scandir(topfd) + walk_dirs = dirs = [] + nondirs = [] + if not (topdown or follow_symlinks): + walk_dirs = [] # list of entries + for entry in scandir_it: + name = entry.name try: - # Here, we don't use AT_SYMLINK_NOFOLLOW to be consistent with - # walk() which reports symlinks to directories as directories. - # We do however check for symlinks before recursing into - # a subdirectory. - if st.S_ISDIR(stat(name, dir_fd=topfd).st_mode): + if entry.is_dir(): dirs.append(name) + if walk_dirs is not dirs: + walk_dirs.append(entry) else: nondirs.append(name) except OSError: try: # Add dangling symlinks, ignore disappeared files - if st.S_ISLNK(stat(name, dir_fd=topfd, follow_symlinks=False) - .st_mode): + if entry.is_symlink(): nondirs.append(name) except OSError: - continue + pass if topdown: yield toppath, dirs, nondirs, topfd - for name in dirs: + for entry in walk_dirs: + name = entry try: - orig_st = stat(name, dir_fd=topfd, follow_symlinks=follow_symlinks) + if not follow_symlinks: + if topdown: + orig_st = stat(name, dir_fd=topfd, follow_symlinks=False) + else: + name = entry.name + orig_st = entry.stat(follow_symlinks=False) dirfd = open(name, O_RDONLY, dir_fd=topfd) except OSError as err: if onerror is not None: diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py index cb15234079fc0db..19da807b774067b 100644 --- a/Lib/test/test_os.py +++ b/Lib/test/test_os.py @@ -1010,9 +1010,12 @@ class FwalkTests(WalkTests): """Tests for os.fwalk().""" def walk(self, top, **kwargs): - for root, dirs, files, root_fd in os.fwalk(top, **kwargs): + for root, dirs, files, root_fd in self.fwalk(top, **kwargs): yield (root, dirs, files) + def fwalk(self, *args, **kwargs): + return os.fwalk(*args, **kwargs) + def _compare_to_walk(self, walk_kwargs, fwalk_kwargs): """ compare with walk() results. @@ -1027,7 +1030,7 @@ def _compare_to_walk(self, walk_kwargs, fwalk_kwargs): for root, dirs, files in os.walk(**walk_kwargs): expected[root] = (set(dirs), set(files)) - for root, dirs, files, rootfd in os.fwalk(**fwalk_kwargs): + for root, dirs, files, rootfd in self.fwalk(**fwalk_kwargs): self.assertIn(root, expected) self.assertEqual(expected[root], (set(dirs), set(files))) @@ -1049,7 +1052,7 @@ def test_yields_correct_dir_fd(self): # check returned file descriptors for topdown, follow_symlinks in itertools.product((True, False), repeat=2): args = support.TESTFN, topdown, None - for root, dirs, files, rootfd in os.fwalk(*args, follow_symlinks=follow_symlinks): + for root, dirs, files, rootfd in self.fwalk(*args, follow_symlinks=follow_symlinks): # check that the FD is valid os.fstat(rootfd) # redundant check @@ -1064,7 +1067,7 @@ def test_fd_leak(self): minfd = os.dup(1) os.close(minfd) for i in range(256): - for x in os.fwalk(support.TESTFN): + for x in self.fwalk(support.TESTFN): pass newfd = os.dup(1) self.addCleanup(os.close, newfd) @@ -3294,6 +3297,35 @@ def test_bytes(self): self.assertEqual(entry.path, os.fsencode(os.path.join(self.path, 'file.txt'))) + @unittest.skipUnless(os.listdir in os.supports_fd, + 'fd support for listdir required for this test.') + def test_fd(self): + self.assertIn(os.scandir, os.supports_fd) + self.create_file('file.txt') + expected_names = ['file.txt'] + if support.can_symlink(): + os.symlink('file.txt', os.path.join(self.path, 'link')) + expected_names.append('link') + + fd = os.open(self.path, os.O_RDONLY) + try: + with os.scandir(fd) as it: + entries = list(it) + names = [entry.name for entry in entries] + self.assertEqual(sorted(names), expected_names) + self.assertEqual(names, os.listdir(fd)) + for entry in entries: + self.assertEqual(entry.path, entry.name) + self.assertEqual(os.fspath(entry), entry.name) + self.assertEqual(entry.is_symlink(), entry.name == 'link') + if os.stat in os.supports_dir_fd: + st = os.stat(entry.name, dir_fd=fd) + self.assertEqual(entry.stat(), st) + st = os.stat(entry.name, dir_fd=fd, follow_symlinks=False) + self.assertEqual(entry.stat(follow_symlinks=False), st) + finally: + os.close(fd) + def test_empty_path(self): self.assertRaises(FileNotFoundError, os.scandir, '') @@ -3309,7 +3341,7 @@ def test_consume_iterator_twice(self): self.assertEqual(len(entries2), 0, entries2) def test_bad_path_type(self): - for obj in [1234, 1.234, {}, []]: + for obj in [1.234, {}, []]: self.assertRaises(TypeError, os.scandir, obj) def test_close(self): diff --git a/Misc/NEWS b/Misc/NEWS index d542cf1032e5c1b..35f365b0bef593c 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -263,6 +263,9 @@ Extension Modules Library ------- +- bpo-25996: Added support of file descriptors in os.scandir() on Unix. + os.fwalk() is sped up by 2 times by using os.scandir(). + - bpo-29623: Allow use of path-like object as a single argument in ConfigParser.read(). Patch by David Ellis. diff --git a/Modules/clinic/posixmodule.c.h b/Modules/clinic/posixmodule.c.h index 39ac7fd54fd90ed..6ef0293efd78ea5 100644 --- a/Modules/clinic/posixmodule.c.h +++ b/Modules/clinic/posixmodule.c.h @@ -5926,7 +5926,7 @@ os_scandir(PyObject *module, PyObject **args, Py_ssize_t nargs, PyObject *kwname PyObject *return_value = NULL; static const char * const _keywords[] = {"path", NULL}; static _PyArg_Parser _parser = {"|O&:scandir", _keywords, 0}; - path_t path = PATH_T_INITIALIZE("scandir", "path", 1, 0); + path_t path = PATH_T_INITIALIZE("scandir", "path", 1, PATH_HAVE_FDOPENDIR); if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, path_converter, &path)) { @@ -6493,4 +6493,4 @@ os_getrandom(PyObject *module, PyObject **args, Py_ssize_t nargs, PyObject *kwna #ifndef OS_GETRANDOM_METHODDEF #define OS_GETRANDOM_METHODDEF #endif /* !defined(OS_GETRANDOM_METHODDEF) */ -/*[clinic end generated code: output=5a0be969e3f71660 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=5529857101c08b49 input=a9049054013a1b77]*/ diff --git a/Modules/posixmodule.c b/Modules/posixmodule.c index ae03d06fe10d072..104c4d70ab05664 100644 --- a/Modules/posixmodule.c +++ b/Modules/posixmodule.c @@ -11159,6 +11159,7 @@ typedef struct { unsigned char d_type; #endif ino_t d_ino; + int dir_fd; #endif } DirEntry; @@ -11208,19 +11209,31 @@ DirEntry_fetch_stat(DirEntry *self, int follow_symlinks) PyObject *ub; #ifdef MS_WINDOWS - if (PyUnicode_FSDecoder(self->path, &ub)) { - const wchar_t *path = PyUnicode_AsUnicode(ub); + if (!PyUnicode_FSDecoder(self->path, &ub)) + return NULL; + const wchar_t *path = PyUnicode_AsUnicode(ub); #else /* POSIX */ - if (PyUnicode_FSConverter(self->path, &ub)) { - const char *path = PyBytes_AS_STRING(ub); + if (!PyUnicode_FSConverter(self->path, &ub)) + return NULL; + const char *path = PyBytes_AS_STRING(ub); + if (self->dir_fd != DEFAULT_DIR_FD) { +#ifdef HAVE_FSTATAT + result = fstatat(self->dir_fd, path, &st, + follow_symlinks ? 0 : AT_SYMLINK_NOFOLLOW); +#else + PyErr_SetString(PyExc_NotImplementedError, "can't fetch stat"); + return NULL; +#endif /* HAVE_FSTATAT */ + } + else #endif + { if (follow_symlinks) result = STAT(path, &st); else result = LSTAT(path, &st); - Py_DECREF(ub); - } else - return NULL; + } + Py_DECREF(ub); if (result != 0) return path_object_error(self->path); @@ -11630,20 +11643,36 @@ DirEntry_from_posix_info(path_t *path, const char *name, Py_ssize_t name_len, entry->stat = NULL; entry->lstat = NULL; - joined_path = join_path_filename(path->narrow, name, name_len); - if (!joined_path) - goto error; + if (path->fd != -1) { + entry->dir_fd = path->fd; + joined_path = NULL; + } + else { + entry->dir_fd = DEFAULT_DIR_FD; + joined_path = join_path_filename(path->narrow, name, name_len); + if (!joined_path) + goto error; + } if (!path->narrow || !PyBytes_Check(path->object)) { entry->name = PyUnicode_DecodeFSDefaultAndSize(name, name_len); - entry->path = PyUnicode_DecodeFSDefault(joined_path); + if (joined_path) + entry->path = PyUnicode_DecodeFSDefault(joined_path); } else { entry->name = PyBytes_FromStringAndSize(name, name_len); - entry->path = PyBytes_FromString(joined_path); + if (joined_path) + entry->path = PyBytes_FromString(joined_path); } PyMem_Free(joined_path); - if (!entry->name || !entry->path) + if (!entry->name) + goto error; + + if (path->fd != -1) { + entry->path = entry->name; + Py_INCREF(entry->path); + } + else if (!entry->path) goto error; #ifdef HAVE_DIRENT_D_TYPE @@ -11671,6 +11700,9 @@ typedef struct { #else /* POSIX */ DIR *dirp; #endif +#ifdef HAVE_FDOPENDIR + int fd; +#endif } ScandirIterator; #ifdef MS_WINDOWS @@ -11755,6 +11787,10 @@ ScandirIterator_closedir(ScandirIterator *iterator) iterator->dirp = NULL; Py_BEGIN_ALLOW_THREADS +#ifdef HAVE_FDOPENDIR + if (iterator->path.fd != -1) + rewinddir(dirp); +#endif closedir(dirp); Py_END_ALLOW_THREADS return; @@ -11930,7 +11966,7 @@ static PyTypeObject ScandirIteratorType = { /*[clinic input] os.scandir - path : path_t(nullable=True) = None + path : path_t(nullable=True, allow_fd='PATH_HAVE_FDOPENDIR') = None Return an iterator of DirEntry objects for given path. @@ -11943,13 +11979,16 @@ If path is None, uses the path='.'. static PyObject * os_scandir_impl(PyObject *module, path_t *path) -/*[clinic end generated code: output=6eb2668b675ca89e input=e62b08b3cd41f604]*/ +/*[clinic end generated code: output=6eb2668b675ca89e input=b139dc1c57f60846]*/ { ScandirIterator *iterator; #ifdef MS_WINDOWS wchar_t *path_strW; #else const char *path_str; +#ifdef HAVE_FDOPENDIR + int fd = -1; +#endif #endif iterator = PyObject_New(ScandirIterator, &ScandirIteratorType); @@ -11985,18 +12024,40 @@ os_scandir_impl(PyObject *module, path_t *path) goto error; } #else /* POSIX */ - if (iterator->path.narrow) - path_str = iterator->path.narrow; + errno = 0; +#ifdef HAVE_FDOPENDIR + if (path->fd != -1) { + /* closedir() closes the FD, so we duplicate it */ + fd = _Py_dup(path->fd); + if (fd == -1) + goto error; + + Py_BEGIN_ALLOW_THREADS + iterator->dirp = fdopendir(fd); + Py_END_ALLOW_THREADS + } else - path_str = "."; +#endif + { + if (iterator->path.narrow) + path_str = iterator->path.narrow; + else + path_str = "."; - errno = 0; - Py_BEGIN_ALLOW_THREADS - iterator->dirp = opendir(path_str); - Py_END_ALLOW_THREADS + Py_BEGIN_ALLOW_THREADS + iterator->dirp = opendir(path_str); + Py_END_ALLOW_THREADS + } if (!iterator->dirp) { path_error(&iterator->path); +#ifdef HAVE_FDOPENDIR + if (fd != -1) { + Py_BEGIN_ALLOW_THREADS + close(fd); + Py_END_ALLOW_THREADS + } +#endif goto error; } #endif From ecefa60982c4d253ef085aa4beea1b5c8315062b Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 10 Mar 2017 19:31:05 +0200 Subject: [PATCH 2/2] Refactoring. Collect entries rather of (name, entry) pairs. --- Lib/os.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Lib/os.py b/Lib/os.py index fdd93714070948c..e388d50c03ffe18 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -473,10 +473,9 @@ def _fwalk(topfd, toppath, isbytes, topdown, onerror, follow_symlinks): # #13734. scandir_it = scandir(topfd) - walk_dirs = dirs = [] + dirs = [] nondirs = [] - if not (topdown or follow_symlinks): - walk_dirs = [] # list of entries + entries = None if topdown or follow_symlinks else [] for entry in scandir_it: name = entry.name if isbytes: @@ -484,8 +483,8 @@ def _fwalk(topfd, toppath, isbytes, topdown, onerror, follow_symlinks): try: if entry.is_dir(): dirs.append(name) - if walk_dirs is not dirs: - walk_dirs.append((name, entry)) + if entries is not None: + entries.append(entry) else: nondirs.append(name) except OSError: @@ -499,13 +498,13 @@ def _fwalk(topfd, toppath, isbytes, topdown, onerror, follow_symlinks): if topdown: yield toppath, dirs, nondirs, topfd - for name in walk_dirs: + for name in dirs if entries is None else zip(dirs, entries): try: if not follow_symlinks: if topdown: orig_st = stat(name, dir_fd=topfd, follow_symlinks=False) else: - assert walk_dirs is not dirs + assert entries is not None name, entry = name orig_st = entry.stat(follow_symlinks=False) dirfd = open(name, O_RDONLY, dir_fd=topfd)