From 394162bf63995bae57a567f5694de26a051bed4f Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 5 Apr 2017 17:20:50 +0300 Subject: [PATCH 1/3] bpo-29995: re.escape() now escapes only special characters. --- Doc/library/re.rst | 6 +++++- Lib/re.py | 36 +++++++++--------------------------- Lib/test/test_re.py | 38 ++++++++++++++++++++------------------ Misc/NEWS | 2 ++ 4 files changed, 36 insertions(+), 46 deletions(-) diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 323854a2995ede..f582a2b87181ff 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -786,13 +786,17 @@ form. .. function:: escape(string) - Escape all the characters in pattern except ASCII letters, numbers and ``'_'``. + Escape special characters in a string. This is useful if you want to match an arbitrary literal string that may have regular expression metacharacters in it. .. versionchanged:: 3.3 The ``'_'`` character is no longer escaped. + .. versionchanged:: 3.7 + Only characters that can have special meaning in a regular expression + are escaped. + .. function:: purge() diff --git a/Lib/re.py b/Lib/re.py index d321cff92c9cb5..7053eddbe027e6 100644 --- a/Lib/re.py +++ b/Lib/re.py @@ -241,39 +241,21 @@ def template(pattern, flags=0): "Compile a template pattern, returning a pattern object" return _compile(pattern, flags|T) -_alphanum_str = frozenset( - "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890") -_alphanum_bytes = frozenset( - b"_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890") +# SPECIAL_CHARS +# closing ')', '}' and ']' +# '-' (a range in character set) +# '#' (comment) and WHITESPACE (ignored) in verbose mode +_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.# \t\n\r\v\f'} def escape(pattern): """ - Escape all the characters in pattern except ASCII letters, numbers and '_'. + Escape special characters in a string. """ if isinstance(pattern, str): - alphanum = _alphanum_str - s = list(pattern) - for i, c in enumerate(pattern): - if c not in alphanum: - if c == "\000": - s[i] = "\\000" - else: - s[i] = "\\" + c - return "".join(s) + return pattern.translate(_special_chars_map) else: - alphanum = _alphanum_bytes - s = [] - esc = ord(b"\\") - for c in pattern: - if c in alphanum: - s.append(c) - else: - if c == 0: - s.extend(b"\\000") - else: - s.append(esc) - s.append(c) - return bytes(s) + pattern = str(pattern, 'latin1') + return pattern.translate(_special_chars_map).encode('latin1') # -------------------------------------------------------------------- # internals diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index a1fddfb4b6b6a6..665e6f813ae400 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -904,7 +904,7 @@ def test_search_coverage(self): self.assertEqual(re.search(r"a\s", "a ").group(0), "a ") def assertMatch(self, pattern, text, match=None, span=None, - matcher=re.match): + matcher=re.fullmatch): if match is None and span is None: # the pattern matches the whole text match = text @@ -918,36 +918,38 @@ def assertMatch(self, pattern, text, match=None, span=None, self.assertEqual(m.span(), span) def test_re_escape(self): - alnum_chars = string.ascii_letters + string.digits + '_' + literal_chars = string.ascii_letters + string.digits + '!"%&\',/:;<=>@_`~' p = ''.join(chr(i) for i in range(256)) for c in p: - if c in alnum_chars: - self.assertEqual(re.escape(c), c) - elif c == '\x00': - self.assertEqual(re.escape(c), '\\000') - else: - self.assertEqual(re.escape(c), '\\' + c) self.assertMatch(re.escape(c), c) + self.assertMatch('[' + re.escape(c) + ']', c) + self.assertMatch('(?x)' + re.escape(c), c) self.assertMatch(re.escape(p), p) + for c in '-.]{}': + self.assertEqual(re.escape(c)[:1], '\\') + literal_chars = (string.ascii_letters + string.digits + + '!"%&\',/:;<=>@_`~') + self.assertEqual(re.escape(literal_chars), literal_chars) - def test_re_escape_byte(self): - alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii') + def test_re_escape_bytes(self): p = bytes(range(256)) for i in p: b = bytes([i]) - if b in alnum_chars: - self.assertEqual(re.escape(b), b) - elif i == 0: - self.assertEqual(re.escape(b), b'\\000') - else: - self.assertEqual(re.escape(b), b'\\' + b) self.assertMatch(re.escape(b), b) + self.assertMatch(b'[' + re.escape(b) + b']', b) + self.assertMatch(b'(?x)' + re.escape(b), b) self.assertMatch(re.escape(p), p) + for i in b'-.]{}': + b = bytes([i]) + self.assertEqual(re.escape(b)[:1], b'\\') + literal_chars = ((string.ascii_letters + string.digits).encode() + + b'!"%&\',/:;<=>@_`~') + self.assertEqual(re.escape(literal_chars), literal_chars) def test_re_escape_non_ascii(self): s = 'xxx\u2620\u2620\u2620xxx' s_escaped = re.escape(s) - self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx') + self.assertEqual(s_escaped, s) self.assertMatch(s_escaped, s) self.assertMatch('.%s+.' % re.escape('\u2620'), s, 'x\u2620\u2620\u2620x', (2, 7), re.search) @@ -955,7 +957,7 @@ def test_re_escape_non_ascii(self): def test_re_escape_non_ascii_bytes(self): b = 'y\u2620y\u2620y'.encode('utf-8') b_escaped = re.escape(b) - self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y') + self.assertEqual(b_escaped, b) self.assertMatch(b_escaped, b) res = re.findall(re.escape('\u2620'.encode('utf-8')), b) self.assertEqual(len(res), 2) diff --git a/Misc/NEWS b/Misc/NEWS index 876767ecd45ccb..9ee170f1470a41 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -303,6 +303,8 @@ Extension Modules Library ------- +- bpo-29995: re.escape() now escapes only special characters. + - bpo-29649: Improve struct.pack_into() exception messages for problems with the buffer size and offset. Patch by Andrew Nester. From 1c1699ab946463eee709abf336894e86409b65ee Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 7 Apr 2017 10:14:21 +0300 Subject: [PATCH 2/3] Fixed IDLE test. --- Lib/idlelib/idle_test/test_replace.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/idlelib/idle_test/test_replace.py b/Lib/idlelib/idle_test/test_replace.py index 9913ed2b7c81e1..2ecbd34168c54e 100644 --- a/Lib/idlelib/idle_test/test_replace.py +++ b/Lib/idlelib/idle_test/test_replace.py @@ -221,8 +221,8 @@ def test_replace_regex(self): self.assertIn('Invalid Replace Expression', showerror.message) # test access method - self.engine.setcookedpat("\'") - equal(pv.get(), "\\'") + self.engine.setcookedpat("?") + equal(pv.get(), "\\?") def test_replace_backwards(self): equal = self.assertEqual From 9524b569aa19b1c90882a5680d95ce7ae1c3d5ab Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 11 Apr 2017 23:34:52 +0300 Subject: [PATCH 3/3] Addressed review comments. --- Lib/test/test_re.py | 9 ++++----- Misc/NEWS | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 665e6f813ae400..b3b29f847e619e 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -917,8 +917,9 @@ def assertMatch(self, pattern, text, match=None, span=None, self.assertEqual(m.group(), match) self.assertEqual(m.span(), span) + LITERAL_CHARS = string.ascii_letters + string.digits + '!"%&\',/:;<=>@_`~' + def test_re_escape(self): - literal_chars = string.ascii_letters + string.digits + '!"%&\',/:;<=>@_`~' p = ''.join(chr(i) for i in range(256)) for c in p: self.assertMatch(re.escape(c), c) @@ -927,8 +928,7 @@ def test_re_escape(self): self.assertMatch(re.escape(p), p) for c in '-.]{}': self.assertEqual(re.escape(c)[:1], '\\') - literal_chars = (string.ascii_letters + string.digits + - '!"%&\',/:;<=>@_`~') + literal_chars = self.LITERAL_CHARS self.assertEqual(re.escape(literal_chars), literal_chars) def test_re_escape_bytes(self): @@ -942,8 +942,7 @@ def test_re_escape_bytes(self): for i in b'-.]{}': b = bytes([i]) self.assertEqual(re.escape(b)[:1], b'\\') - literal_chars = ((string.ascii_letters + string.digits).encode() + - b'!"%&\',/:;<=>@_`~') + literal_chars = self.LITERAL_CHARS.encode('ascii') self.assertEqual(re.escape(literal_chars), literal_chars) def test_re_escape_non_ascii(self): diff --git a/Misc/NEWS b/Misc/NEWS index 3ad88e10f09906..aba429b462072c 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -303,7 +303,7 @@ Extension Modules Library ------- -- bpo-29995: re.escape() now escapes only special characters. +- bpo-29995: re.escape() now escapes only regex special characters. - bpo-29962: Add math.remainder operation, implementing remainder as specified in IEEE 754.