From a4f8ac05d167c55f9ae9b7b0983e0ec823ccbb44 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 18 Mar 2022 20:06:08 +0200 Subject: [PATCH 1/8] bpo-433030: Add support of atomic grouping in regular expressions --- Doc/library/re.rst | 41 +++++++ Lib/sre_compile.py | 33 +++++- Lib/sre_constants.py | 6 +- Lib/sre_parse.py | 33 +++++- Lib/test/test_re.py | 238 ++++++++++++++++++++++++++++++---------- Modules/_sre.c | 32 ++++++ Modules/sre_constants.h | 31 +++--- Modules/sre_lib.h | 195 ++++++++++++++++++++++++++++++++ 8 files changed, 527 insertions(+), 82 deletions(-) diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 8d62e3bf4d8d834..b4af10a959d8d9b 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -157,6 +157,21 @@ The special characters are: .. index:: single: {} (curly brackets); in regular expressions +``*+``, ``++``, ``?+`` + Like the ``'*'``, ``'+'``, and ``'?'`` qualifiers, those where ``'+'`` is + appended also match as many times as possible. However, unlike the true greedy + qualifiers, these do not allow back-tracking when the expression following it + fails to match. These are known as :dfn:`Possessive` qualifiers. For example, + ``a*a`` will match ``'aaaa'`` because the ``a*`` will match all 4 ``'a'``s, but, + when the final ``'a'`` is encountered, the expression is backtracked so that in the + end the ``a*`` ends up matching 3 ``'a'``s total, and the fourth ``'a'`` is matched + by the final ``'a'``. However, when ``a*+a`` is used to match ``'aaaa'``, the + ``a*+`` will match all 4 ``'a'``, but when the final ``'a'`` fails to find any more + characters to match, the expression cannot be backtracked and will thus fail to + match. + + .. versionadded:: 3.11 + ``{m}`` Specifies that exactly *m* copies of the previous RE should be matched; fewer matches cause the entire RE not to match. 
For example, ``a{6}`` will match @@ -178,6 +193,18 @@ The special characters are: 6-character string ``'aaaaaa'``, ``a{3,5}`` will match 5 ``'a'`` characters, while ``a{3,5}?`` will only match 3 characters. +``{m,n}+`` + Causes the resulting RE to match from *m* to *n* repetitions of the preceding + RE, attempting to match as many repetitions as possible *without* establishing any + backtracking points. This is the possessive version of the qualifier above. For + example, on the 6-character string ``'aaaaaa'``, ``a{3,5}+aa`` attempt to match 5 + ``'a'`` characters, then, requiring 2 more ``'a'``s, will need more characters than + available and thus fail, while ``a{3,5}aa`` will match with ``a{3,5}`` capturing + 5, then 4 ``'a'``s by backtracking and then the final 2 ``'a'``s are matched by the + final ``aa`` in the pattern. + + .. versionadded:: 3.11 + .. index:: single: \ (backslash); in regular expressions ``\`` @@ -421,6 +448,20 @@ The special characters are: some fixed length. Patterns which start with negative lookbehind assertions may match at the beginning of the string being searched. +``(?>...)`` + Attempts to match ``...`` as if it was a separate Regular Expression, and if + successful, continues to match the rest of the pattern following it. If the + subsequent pattern fails to match, the stack can only be unwound to a point + *before* the ``(?>...)`` because once exited, the expression, known as an + :dfn:`Atomic Group`, has thrown away all stack points within itself. Thus, + ``(?>.*).`` would never match anything because first the ``.*`` would match all + characters possible, then, having nothing left to match, the final ``.`` would + fail to match. Since there are no stack points saved in the Atomic Group, and + there is no stack point before it, the entire expression would thus fail to + match. + + .. 
versionadded:: 3.11 + ``(?(id/name)yes-pattern|no-pattern)`` Will try to match with ``yes-pattern`` if the group with given *id* or *name* exists, and with ``no-pattern`` if it doesn't. ``no-pattern`` is diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index c6398bfb83a5763..04b3b3b2eba09f3 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -17,7 +17,7 @@ assert _sre.MAGIC == MAGIC, "SRE module mismatch" _LITERAL_CODES = {LITERAL, NOT_LITERAL} -_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT} +_REPEATING_CODES = {MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT} _SUCCESS_CODES = {SUCCESS, FAILURE} _ASSERT_CODES = {ASSERT, ASSERT_NOT} _UNIT_CODES = _LITERAL_CODES | {ANY, IN} @@ -140,6 +140,8 @@ def _compile(code, pattern, flags): if _simple(av[2]): if op is MAX_REPEAT: emit(REPEAT_ONE) + elif op is POSSESSIVE_REPEAT: + emit(POSSESSIVE_ONE) else: emit(MIN_REPEAT_ONE) skip = _len(code); emit(0) @@ -148,6 +150,14 @@ def _compile(code, pattern, flags): _compile(code, av[2], flags) emit(SUCCESS) code[skip] = _len(code) - skip + elif op is POSSESSIVE_REPEAT: + emit(POSSESSIVE_REPEAT) + skip = _len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + code[skip] = _len(code) - skip + emit(SUCCESS) else: emit(REPEAT) skip = _len(code); emit(0) @@ -155,6 +165,8 @@ def _compile(code, pattern, flags): emit(av[1]) _compile(code, av[2], flags) code[skip] = _len(code) - skip + # TODO: What if op is REPEAT, not MIN_REPEAT; + # Default of MIN_UNTIL may be wrong if op is MAX_REPEAT: emit(MAX_UNTIL) else: @@ -169,6 +181,17 @@ def _compile(code, pattern, flags): if group: emit(MARK) emit((group-1)*2+1) + elif op is ATOMIC_GROUP: + # Atomic Groups are handled by starting with an Atomic + # Group op code, then putting in the atomic group pattern + # and finally a success op code to tell any repeat + # operations within the Atomic Group to stop eating and + # pop their stack if they reach it + emit(ATOMIC_GROUP) + skip = _len(code); emit(0) + _compile(code, 
av, flags) + emit(SUCCESS) + code[skip] = _len(code) - skip elif op in SUCCESS_CODES: emit(op) elif op in ASSERT_CODES: @@ -709,7 +732,8 @@ def print_2(*args): else: print_(FAILURE) i += 1 - elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE): + elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE, + POSSESSIVE_REPEAT, POSSESSIVE_ONE): skip, min, max = code[i: i+3] if max == MAXREPEAT: max = 'MAXREPEAT' @@ -725,6 +749,11 @@ def print_2(*args): print_(op, skip, arg, to=i+skip) dis_(i+2, i+skip) i += skip + elif op is ATOMIC_GROUP: + skip = code[i] + print_(op, skip, to=i+skip) + dis_(i+1, i+skip) + i += skip elif op is INFO: skip, flags, min, max = code[i: i+4] if max == MAXREPEAT: diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 8e613cb3fa5dcba..254abaf0460642b 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20171005 +MAGIC = 20220318 from _sre import MAXREPEAT, MAXGROUPS @@ -97,6 +97,9 @@ def _makecodes(names): REPEAT_ONE SUBPATTERN MIN_REPEAT_ONE + ATOMIC_GROUP + POSSESSIVE_REPEAT + POSSESSIVE_ONE GROUPREF_IGNORE IN_IGNORE @@ -144,7 +147,6 @@ def _makecodes(names): CATEGORY_UNI_LINEBREAK CATEGORY_UNI_NOT_LINEBREAK """) - # replacement operations for "ignore case" mode OP_IGNORE = { LITERAL: LITERAL_IGNORE, diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 83119168e6376ee..458197f1926a1eb 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -25,7 +25,7 @@ WHITESPACE = frozenset(" \t\n\r\v\f") -_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT}) +_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT}) _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) ESCAPES = { @@ -190,6 +190,10 @@ def getwidth(self): i, j = av.getwidth() lo = lo + i hi = hi + j + elif op is ATOMIC_GROUP: + i, j = av.getwidth() + lo = lo + i + hi = hi + j elif op is SUBPATTERN: i, j = av[-1].getwidth() lo = lo + i @@ -675,8 +679,13 @@ def _parse(source, state, 
verbose, nested, first=False): if group is None and not add_flags and not del_flags: item = p if sourcematch("?"): + # Non-Greedy Match subpattern[-1] = (MIN_REPEAT, (min, max, item)) + elif sourcematch("+"): + # Possessive Match (Always Greedy) + subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item)) else: + # Greedy Match subpattern[-1] = (MAX_REPEAT, (min, max, item)) elif this == ".": @@ -684,7 +693,8 @@ def _parse(source, state, verbose, nested, first=False): elif this == "(": start = source.tell() - 1 - group = True + capture = True + atomic = False name = None add_flags = 0 del_flags = 0 @@ -726,7 +736,7 @@ def _parse(source, state, verbose, nested, first=False): len(char) + 2) elif char == ":": # non-capturing group - group = None + capture = False elif char == "#": # comment while True: @@ -800,6 +810,10 @@ def _parse(source, state, verbose, nested, first=False): subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) continue + elif char == ">": + # non-capturing, atomic group + capture = False + atomic = True elif char in FLAGS or char == "-": # flags flags = _parse_flags(source, state, char) @@ -818,17 +832,19 @@ def _parse(source, state, verbose, nested, first=False): continue add_flags, del_flags = flags - group = None + capture = False else: raise source.error("unknown extension ?" 
+ char, len(char) + 1) # parse group contents - if group is not None: + if capture: try: group = state.opengroup(name) except error as err: raise source.error(err.msg, len(name) + 1) from None + else: + group = None sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and not (del_flags & SRE_FLAG_VERBOSE)) p = _parse_sub(source, state, sub_verbose, nested + 1) @@ -837,7 +853,12 @@ def _parse(source, state, verbose, nested, first=False): source.tell() - start) if group is not None: state.closegroup(group, p) - subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) + if atomic: + # TODO: Assert that group is always None in this + # case + subpatternappend((ATOMIC_GROUP, p)) + else: + subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) elif this == "^": subpatternappend((AT, AT_BEGINNING)) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 59575962eb4f3d4..751180297ce3b11 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -83,6 +83,23 @@ def test_search_star_plus(self): self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3)) self.assertIsNone(re.match('a+', 'xxx')) + def test_branching(self): + """Test Branching + Test expressions using the OR ('|') operator.""" + self.assertEqual(re.match('(ab|ba)', 'ab').span(), (0, 2)) + self.assertEqual(re.match('(ab|ba)', 'ba').span(), (0, 2)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'abc').span(), + (0, 3)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'bac').span(), + (0, 3)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'ca').span(), + (0, 2)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'cb').span(), + (0, 2)) + self.assertEqual(re.match('((a)|(b)|(c))', 'a').span(), (0, 1)) + self.assertEqual(re.match('((a)|(b)|(c))', 'b').span(), (0, 1)) + self.assertEqual(re.match('((a)|(b)|(c))', 'c').span(), (0, 1)) + def bump_num(self, matchobj): int_value = int(matchobj.group(0)) return str(int_value + 1) @@ -1239,11 +1256,13 @@ def test_nothing_to_repeat(self): 'nothing to 
repeat', 3) def test_multiple_repeat(self): - for outer_reps in '*', '+', '{1,2}': - for outer_mod in '', '?': + for outer_reps in '*', '+', '?', '{1,2}': + for outer_mod in '', '?', '+': outer_op = outer_reps + outer_mod for inner_reps in '*', '+', '?', '{1,2}': - for inner_mod in '', '?': + for inner_mod in '', '?', '+': + if inner_mod + outer_reps in ('?', '+'): + continue inner_op = inner_reps + inner_mod self.checkPatternError(r'x%s%s' % (inner_op, outer_op), 'multiple repeat', 1 + len(inner_op)) @@ -1499,7 +1518,8 @@ def test_inline_flags(self): def test_dollar_matches_twice(self): - "$ matches the end of string, and just before the terminating \n" + """Test that $ does not include \\n + $ matches the end of string, and just before the terminating \n""" pattern = re.compile('$') self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') @@ -1815,60 +1835,6 @@ def test_bug_2537(self): self.assertEqual(m.group(1), "") self.assertEqual(m.group(2), "y") - @cpython_only - def test_debug_flag(self): - pat = r'(\.)(?:[ch]|py)(?(1)$|: )' - with captured_stdout() as out: - re.compile(pat, re.DEBUG) - self.maxDiff = None - dump = '''\ -SUBPATTERN 1 0 0 - LITERAL 46 -BRANCH - IN - LITERAL 99 - LITERAL 104 -OR - LITERAL 112 - LITERAL 121 -GROUPREF_EXISTS 1 - AT AT_END -ELSE - LITERAL 58 - LITERAL 32 - - 0. INFO 8 0b1 2 5 (to 9) - prefix_skip 0 - prefix [0x2e] ('.') - overlap [0] - 9: MARK 0 -11. LITERAL 0x2e ('.') -13. MARK 1 -15. BRANCH 10 (to 26) -17. IN 6 (to 24) -19. LITERAL 0x63 ('c') -21. LITERAL 0x68 ('h') -23. FAILURE -24: JUMP 9 (to 34) -26: branch 7 (to 33) -27. LITERAL 0x70 ('p') -29. LITERAL 0x79 ('y') -31. JUMP 2 (to 34) -33: FAILURE -34: GROUPREF_EXISTS 0 6 (to 41) -37. AT END -39. JUMP 5 (to 45) -41: LITERAL 0x3a (':') -43. LITERAL 0x20 (' ') -45: SUCCESS -''' - self.assertEqual(out.getvalue(), dump) - # Debug output is output again even a second time (bypassing - # the cache -- issue #20426). 
- with captured_stdout() as out: - re.compile(pat, re.DEBUG) - self.assertEqual(out.getvalue(), dump) - def test_keyword_parameters(self): # Issue #20283: Accepting the string keyword parameter. pat = re.compile(r'(ab)') @@ -2113,6 +2079,162 @@ def test_bug_40736(self): with self.assertRaisesRegex(TypeError, "got 'type'"): re.search("x*", type) + def test_possessive_qualifiers(self): + """Test Possessive Qualifiers + test qualifiers of the form @+ for some repetition operator @, + e.g. x{3,5}+ meaning match from 3 to 5 greadily and proceed + without creating a stack frame for rolling the stack back and + trying 1 or more fewer matches.""" + self.assertIsNone(re.match('e*+e', 'eeee')) + self.assertEqual(re.match('e++a', 'eeea').group(0), 'eeea') + self.assertEqual(re.match('e?+a', 'ea').group(0), 'ea') + self.assertEqual(re.match('e{2,4}+a', 'eeea').group(0), 'eeea') + self.assertIsNone(re.match('(.)++.', 'ee')) + self.assertEqual(re.match('(ae)*+a', 'aea').groups(), ('ae',)) + self.assertEqual(re.match('([ae][ae])?+a', 'aea').groups(), + ('ae',)) + self.assertEqual(re.match('(e?){2,4}+a', 'eeea').groups(), + ('',)) + self.assertEqual(re.match('()*+a', 'a').groups(), ('',)) + self.assertEqual(re.search('x*+', 'axx').span(0), (0, 0)) + self.assertEqual(re.search('x*+', 'axx').span(), (0, 0)) + self.assertEqual(re.search('x++', 'axx').span(0), (1, 3)) + self.assertEqual(re.search('x++', 'axx').span(), (1, 3)) + self.assertEqual(re.match('a*+', 'xxx').span(0), (0, 0)) + self.assertEqual(re.match('a*+', 'xxx').span(), (0, 0)) + self.assertEqual(re.match('x*+', 'xxxa').span(0), (0, 3)) + self.assertEqual(re.match('x*+', 'xxxa').span(), (0, 3)) + self.assertIsNone(re.match('a++', 'xxx')) + self.assertIsNone(re.match(r"^(\w){1}+$", "abc")) + self.assertIsNone(re.match(r"^(\w){1,2}+$", "abc")) + + self.assertEqual(re.match(r"^(\w){3}+$", "abc").group(1), "c") + self.assertEqual(re.match(r"^(\w){1,3}+$", "abc").group(1), "c") + self.assertEqual(re.match(r"^(\w){1,4}+$", 
"abc").group(1), "c") + + self.assertIsNone(re.match("^x{1}+$", "xxx")) + self.assertIsNone(re.match("^x{1,2}+$", "xxx")) + + self.assertTrue(re.match("^x{3}+$", "xxx")) + self.assertTrue(re.match("^x{1,3}+$", "xxx")) + self.assertTrue(re.match("^x{1,4}+$", "xxx")) + + self.assertIsNone(re.match("^x{}+$", "xxx")) + self.assertTrue(re.match("^x{}+$", "x{}")) + + def test_atomic_grouping(self): + """Test Atomic Grouping + test non-capturing groups of the form (?>...), which acts does + not maintain any stack point created within the group once the + group is finished being evaluated.""" + pattern1 = re.compile(r'a(?>bc|b)c') + self.assertIsNone(pattern1.match('abc')) + self.assertTrue(pattern1.match('abcc')) + self.assertIsNone(re.match(r'(?>.*).', 'abc')) + self.assertTrue(re.match(r'(?>x)++', 'xxx')) + self.assertTrue(re.match(r'(?>x++)', 'xxx')) + self.assertIsNone(re.match(r'(?>x)++x', 'xxx')) + self.assertIsNone(re.match(r'(?>x++)x', 'xxx')) + + +def get_debug_out(pat): + with captured_stdout() as out: + re.compile(pat, re.DEBUG) + return out.getvalue() + + +@cpython_only +class DebugTests(unittest.TestCase): + maxDiff = None + + def test_debug_flag(self): + pat = r'(\.)(?:[ch]|py)(?(1)$|: )' + dump = '''\ +SUBPATTERN 1 0 0 + LITERAL 46 +BRANCH + IN + LITERAL 99 + LITERAL 104 +OR + LITERAL 112 + LITERAL 121 +GROUPREF_EXISTS 1 + AT AT_END +ELSE + LITERAL 58 + LITERAL 32 + + 0. INFO 8 0b1 2 5 (to 9) + prefix_skip 0 + prefix [0x2e] ('.') + overlap [0] + 9: MARK 0 +11. LITERAL 0x2e ('.') +13. MARK 1 +15. BRANCH 10 (to 26) +17. IN 6 (to 24) +19. LITERAL 0x63 ('c') +21. LITERAL 0x68 ('h') +23. FAILURE +24: JUMP 9 (to 34) +26: branch 7 (to 33) +27. LITERAL 0x70 ('p') +29. LITERAL 0x79 ('y') +31. JUMP 2 (to 34) +33: FAILURE +34: GROUPREF_EXISTS 0 6 (to 41) +37. AT END +39. JUMP 5 (to 45) +41: LITERAL 0x3a (':') +43. 
LITERAL 0x20 (' ') +45: SUCCESS +''' + self.assertEqual(get_debug_out(pat), dump) + # Debug output is output again even a second time (bypassing + # the cache -- issue #20426). + self.assertEqual(get_debug_out(pat), dump) + + def test_atomic_group(self): + self.assertEqual(get_debug_out(r'(?>ab?)'), '''\ +ATOMIC_GROUP [(LITERAL, 97), (MAX_REPEAT, (0, 1, [(LITERAL, 98)]))] + + 0. INFO 4 0b0 1 2 (to 5) + 5: ATOMIC_GROUP 11 (to 17) + 7. LITERAL 0x61 ('a') + 9. REPEAT_ONE 6 0 1 (to 16) +13. LITERAL 0x62 ('b') +15. SUCCESS +16: SUCCESS +17: SUCCESS +''') + + def test_possesive_repeat_one(self): + self.assertEqual(get_debug_out(r'a?+'), '''\ +POSSESSIVE_REPEAT 0 1 + LITERAL 97 + + 0. INFO 4 0b0 0 1 (to 5) + 5: POSSESSIVE_ONE 6 0 1 (to 12) + 9. LITERAL 0x61 ('a') +11. SUCCESS +12: SUCCESS +''') + + def test_possesive_repeat(self): + self.assertEqual(get_debug_out(r'(?:ab)?+'), '''\ +POSSESSIVE_REPEAT 0 1 + LITERAL 97 + LITERAL 98 + + 0. INFO 4 0b0 0 2 (to 5) + 5: POSSESSIVE_REPEAT 7 0 1 (to 13) + 9. LITERAL 0x61 ('a') +11. LITERAL 0x62 ('b') +13: SUCCESS +14. 
SUCCESS +''') + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): diff --git a/Modules/_sre.c b/Modules/_sre.c index 213730860cfb534..9cf94b3f161fe1d 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -1807,6 +1807,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) case SRE_OP_REPEAT_ONE: case SRE_OP_MIN_REPEAT_ONE: + case SRE_OP_POSSESSIVE_ONE: { SRE_CODE min, max; GET_SKIP; @@ -1844,6 +1845,37 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) } break; + case SRE_OP_POSSESSIVE_REPEAT: + { + SRE_CODE min, max; + GET_SKIP; + GET_ARG; min = arg; + GET_ARG; max = arg; + if (min > max) + FAIL; + if (max > SRE_MAXREPEAT) + FAIL; + if (!_validate_inner(code, code+skip-3, groups)) + FAIL; + code += skip-3; + GET_OP; + if (op != SRE_OP_SUCCESS) + FAIL; + } + break; + + case SRE_OP_ATOMIC_GROUP: + { + GET_SKIP; + if (!_validate_inner(code, code+skip-2, groups)) + FAIL; + code += skip-2; + GET_OP; + if (op != SRE_OP_SUCCESS) + FAIL; + } + break; + case SRE_OP_GROUPREF: case SRE_OP_GROUPREF_IGNORE: case SRE_OP_GROUPREF_UNI_IGNORE: diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index c8ccb32d21de6ce..4d4b1e99380b4f6 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. 
*/ -#define SRE_MAGIC 20171005 +#define SRE_MAGIC 20220318 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 @@ -40,19 +40,22 @@ #define SRE_OP_REPEAT_ONE 25 #define SRE_OP_SUBPATTERN 26 #define SRE_OP_MIN_REPEAT_ONE 27 -#define SRE_OP_GROUPREF_IGNORE 28 -#define SRE_OP_IN_IGNORE 29 -#define SRE_OP_LITERAL_IGNORE 30 -#define SRE_OP_NOT_LITERAL_IGNORE 31 -#define SRE_OP_GROUPREF_LOC_IGNORE 32 -#define SRE_OP_IN_LOC_IGNORE 33 -#define SRE_OP_LITERAL_LOC_IGNORE 34 -#define SRE_OP_NOT_LITERAL_LOC_IGNORE 35 -#define SRE_OP_GROUPREF_UNI_IGNORE 36 -#define SRE_OP_IN_UNI_IGNORE 37 -#define SRE_OP_LITERAL_UNI_IGNORE 38 -#define SRE_OP_NOT_LITERAL_UNI_IGNORE 39 -#define SRE_OP_RANGE_UNI_IGNORE 40 +#define SRE_OP_ATOMIC_GROUP 28 +#define SRE_OP_POSSESSIVE_REPEAT 29 +#define SRE_OP_POSSESSIVE_ONE 30 +#define SRE_OP_GROUPREF_IGNORE 31 +#define SRE_OP_IN_IGNORE 32 +#define SRE_OP_LITERAL_IGNORE 33 +#define SRE_OP_NOT_LITERAL_IGNORE 34 +#define SRE_OP_GROUPREF_LOC_IGNORE 35 +#define SRE_OP_IN_LOC_IGNORE 36 +#define SRE_OP_LITERAL_LOC_IGNORE 37 +#define SRE_OP_NOT_LITERAL_LOC_IGNORE 38 +#define SRE_OP_GROUPREF_UNI_IGNORE 39 +#define SRE_OP_IN_UNI_IGNORE 40 +#define SRE_OP_LITERAL_UNI_IGNORE 41 +#define SRE_OP_NOT_LITERAL_UNI_IGNORE 42 +#define SRE_OP_RANGE_UNI_IGNORE 43 #define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_STRING 2 diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 32469cd161cf3b5..a403747b2e9feb0 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -480,6 +480,9 @@ do { \ #define JUMP_BRANCH 11 #define JUMP_ASSERT 12 #define JUMP_ASSERT_NOT 13 +#define JUMP_POSS_REPEAT_1 14 +#define JUMP_POSS_REPEAT_2 15 +#define JUMP_ATOMIC_GROUP 16 #define DO_JUMPX(jumpvalue, jumplabel, nextpattern, toplevel_) \ DATA_ALLOC(SRE(match_context), nextctx); \ @@ -950,6 +953,57 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) } RETURN_FAILURE; + case SRE_OP_POSSESSIVE_ONE: + /* match repeated sequence 
(maximizing regexp) without + backtracking */ + + /* this operator only works if the repeated item is + exactly one character wide, and we're not already + collecting backtracking points. for other cases, + use the MAX_REPEAT operator */ + + /* <1=min> <2=max> item + tail */ + + TRACE(("|%p|%p|POSSESSIVE_ONE %d %d\n", ctx->pattern, + ctx->ptr, ctx->pattern[1], ctx->pattern[2])); + + if (ctx->ptr + ctx->pattern[1] > end) { + RETURN_FAILURE; /* cannot match */ + } + + state->ptr = ctx->ptr; + + ret = SRE(count)(state, ctx->pattern + 3, ctx->pattern[2]); + RETURN_ON_ERROR(ret); + DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); + ctx->count = ret; + ctx->ptr += ctx->count; + + /* when we arrive here, count contains the number of + matches, and ctx->ptr points to the tail of the target + string. check if the rest of the pattern matches, + and fail if not. */ + + /* Test for not enough repetitions in match */ + if (ctx->count < (Py_ssize_t) ctx->pattern[1]) { + RETURN_FAILURE; + } + + /* Update the pattern to point to the next op code */ + ctx->pattern += ctx->pattern[0]; + + /* Let the tail be evaluated separately and consider this + match successful. */ + if (*ctx->pattern == SRE_OP_SUCCESS) { + /* tail is empty. we're finished */ + state->ptr = ctx->ptr; + RETURN_SUCCESS; + } + + /* Attempt to match the rest of the string */ + break; + case SRE_OP_REPEAT: /* create repeat context. all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ @@ -1110,6 +1164,138 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) state->ptr = ctx->ptr; RETURN_FAILURE; + case SRE_OP_POSSESSIVE_REPEAT: + /* create possessive repeat contexts. 
*/ + /* <1=min> <2=max> pattern + tail */ + TRACE(("|%p|%p|POSSESSIVE_REPEAT %d %d\n", ctx->pattern, + ctx->ptr, ctx->pattern[1], ctx->pattern[2])); + + /* Set the global Input pointer to this context's Input + pointer */ + state->ptr = ctx->ptr; + + /* Initialize Count to 0 */ + ctx->count = 0; + + /* Check for minimum required matches. */ + while (ctx->count < (Py_ssize_t)ctx->pattern[1]) { + /* not enough matches */ + DO_JUMP(JUMP_POSS_REPEAT_1, jump_poss_repeat_1, + &ctx->pattern[3]); + if (ret) { + RETURN_ON_ERROR(ret); + ctx->count++; + } + else { + state->ptr = ctx->ptr; + RETURN_FAILURE; + } + } + + /* Clear the context's Input stream pointer so that it + doesn't match the global state so that the while loop can + be entered. */ + ctx->ptr = NULL; + + /* Keep trying to parse the sub-pattern until the + end is reached, creating a new context each time. */ + while ((ctx->count < (Py_ssize_t)ctx->pattern[2] || + (Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT) && + state->ptr != ctx->ptr) { + /* Save the Capture Group Marker state into the current + Context and back up the current highest number + Capture Group marker. */ + LASTMARK_SAVE(); + MARK_PUSH(ctx->lastmark); + + /* zero-width match protection */ + /* Set the context's Input Stream pointer to be the + current Input Stream pointer from the global + state. When the loop reaches the next iteration, + the context will then store the last known good + position with the global state holding the + Input Stream position that has been updated with + the most recent match. Thus, if state's Input + stream remains the same as the one stored in the + current Context, we know we have successfully + matched an empty string and that all subsequent + matches will also be the empty string until the + maximum number of matches are counted, and because + of this, we could immediately stop at that point and + consider this match successful.
*/ + ctx->ptr = state->ptr; + + /* We have not reached the maximum matches, so try to + match once more. */ + DO_JUMP(JUMP_POSS_REPEAT_2, jump_poss_repeat_2, + &ctx->pattern[3]); + + /* Check to see if the last attempted match + succeeded. */ + if (ret) { + /* Drop the saved highest number Capture Group + marker saved above and use the newly updated + value. */ + MARK_POP_DISCARD(ctx->lastmark); + RETURN_ON_ERROR(ret); + + /* Success, increment the count. */ + ctx->count++; + } + /* Last attempted match failed. */ + else { + /* Restore the previously saved highest number + Capture Group marker since the last iteration + did not match, then restore that to the global + state. */ + MARK_POP(ctx->lastmark); + LASTMARK_RESTORE(); + + /* We have sufficient matches, so exit loop. */ + break; + } + } + + /* Evaluate Tail */ + /* Jump to end of pattern indicated by skip, and then skip + the SUCCESS op code that follows it. */ + ctx->pattern += ctx->pattern[0] + 1; + ctx->ptr = state->ptr; + break; + + case SRE_OP_ATOMIC_GROUP: + /* Atomic Group Sub Pattern */ + /* pattern tail */ + TRACE(("|%p|%p|ATOMIC_GROUP\n", ctx->pattern, ctx->ptr)); + + /* Set the global Input pointer to this context's Input + pointer */ + state->ptr = ctx->ptr; + + /* Evaluate the Atomic Group in a new context, terminating + when the end of the group, represented by a SUCCESS op + code, is reached. */ + /* Group Pattern begins at an offset of 1 code. */ + DO_JUMP(JUMP_ATOMIC_GROUP, jump_atomic_group, + &ctx->pattern[1]); + + /* Test Exit Condition */ + RETURN_ON_ERROR(ret); + + if (ret == 0) { + /* Atomic Group failed to Match. */ + state->ptr = ctx->ptr; + RETURN_FAILURE; + } + + /* Evaluate Tail */ + /* Jump to end of pattern indicated by skip, and then skip + the SUCCESS op code that follows it.
*/ + ctx->pattern += ctx->pattern[0]; + ctx->ptr = state->ptr; + break; + case SRE_OP_GROUPREF: /* match backreference */ TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern, @@ -1306,6 +1492,12 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case JUMP_MIN_UNTIL_1: TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr)); goto jump_min_until_1; + case JUMP_POSS_REPEAT_1: + TRACE(("|%p|%p|JUMP_POSS_REPEAT_1\n", ctx->pattern, ctx->ptr)); + goto jump_poss_repeat_1; + case JUMP_POSS_REPEAT_2: + TRACE(("|%p|%p|JUMP_POSS_REPEAT_2\n", ctx->pattern, ctx->ptr)); + goto jump_poss_repeat_2; case JUMP_REPEAT: TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr)); goto jump_repeat; @@ -1318,6 +1510,9 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case JUMP_MIN_REPEAT_ONE: TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr)); goto jump_min_repeat_one; + case JUMP_ATOMIC_GROUP: + TRACE(("|%p|%p|JUMP_ATOMIC_GROUP\n", ctx->pattern, ctx->ptr)); + goto jump_atomic_group; case JUMP_ASSERT: TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr)); goto jump_assert; From 5066bd1221a83e549a97f2ffc6dcd56600bff0ce Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 19 Mar 2022 08:22:26 +0200 Subject: [PATCH 2/8] Polishing. --- Doc/library/re.rst | 7 ++++++- Lib/sre_compile.py | 23 +++++++++-------------- Lib/sre_constants.py | 2 +- Lib/sre_parse.py | 3 +-- Lib/test/test_re.py | 10 +++------- Modules/_sre.c | 32 ++++++++++---------------------- Modules/sre_constants.h | 2 +- Modules/sre_lib.h | 6 +++--- 8 files changed, 34 insertions(+), 51 deletions(-) diff --git a/Doc/library/re.rst b/Doc/library/re.rst index b4af10a959d8d9b..ffdfc0d5c86659f 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -155,7 +155,9 @@ The special characters are: only ``''``. .. 
index:: - single: {} (curly brackets); in regular expressions + single: *+; in regular expressions + single: ++; in regular expressions + single: ?+; in regular expressions ``*+``, ``++``, ``?+`` Like the ``'*'``, ``'+'``, and ``'?'`` qualifiers, those where ``'+'`` is @@ -172,6 +174,9 @@ The special characters are: .. versionadded:: 3.11 +.. index:: + single: {} (curly brackets); in regular expressions + ``{m}`` Specifies that exactly *m* copies of the previous RE should be matched; fewer matches cause the entire RE not to match. For example, ``a{6}`` will match diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 04b3b3b2eba09f3..524a06db6cc2d29 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -141,7 +141,7 @@ def _compile(code, pattern, flags): if op is MAX_REPEAT: emit(REPEAT_ONE) elif op is POSSESSIVE_REPEAT: - emit(POSSESSIVE_ONE) + emit(POSSESSIVE_REPEAT_ONE) else: emit(MIN_REPEAT_ONE) skip = _len(code); emit(0) @@ -150,24 +150,19 @@ def _compile(code, pattern, flags): _compile(code, av[2], flags) emit(SUCCESS) code[skip] = _len(code) - skip - elif op is POSSESSIVE_REPEAT: - emit(POSSESSIVE_REPEAT) - skip = _len(code); emit(0) - emit(av[0]) - emit(av[1]) - _compile(code, av[2], flags) - code[skip] = _len(code) - skip - emit(SUCCESS) else: - emit(REPEAT) + if op is POSSESSIVE_REPEAT: + emit(POSSESSIVE_REPEAT) + else: + emit(REPEAT) skip = _len(code); emit(0) emit(av[0]) emit(av[1]) _compile(code, av[2], flags) code[skip] = _len(code) - skip - # TODO: What if op is REPEAT, not MIN_REPEAT; - # Default of MIN_UNTIL may be wrong - if op is MAX_REPEAT: + if op is POSSESSIVE_REPEAT: + emit(SUCCESS) + elif op is MAX_REPEAT: emit(MAX_UNTIL) else: emit(MIN_UNTIL) @@ -733,7 +728,7 @@ def print_2(*args): print_(FAILURE) i += 1 elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE, - POSSESSIVE_REPEAT, POSSESSIVE_ONE): + POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE): skip, min, max = code[i: i+3] if max == MAXREPEAT: max = 'MAXREPEAT' diff --git 
a/Lib/sre_constants.py b/Lib/sre_constants.py index 254abaf0460642b..359190af4e47118 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -99,7 +99,7 @@ def _makecodes(names): MIN_REPEAT_ONE ATOMIC_GROUP POSSESSIVE_REPEAT - POSSESSIVE_ONE + POSSESSIVE_REPEAT_ONE GROUPREF_IGNORE IN_IGNORE diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 458197f1926a1eb..fcf4c15d72d3384 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -854,8 +854,7 @@ def _parse(source, state, verbose, nested, first=False): if group is not None: state.closegroup(group, p) if atomic: - # TODO: Assert that group is always None in this - # case + assert group is None subpatternappend((ATOMIC_GROUP, p)) else: subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 751180297ce3b11..388a5e852e3248d 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1518,7 +1518,7 @@ def test_inline_flags(self): def test_dollar_matches_twice(self): - """Test that $ does not include \\n + r"""Test that $ does not include \n $ matches the end of string, and just before the terminating \n""" pattern = re.compile('$') self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') @@ -2096,13 +2096,9 @@ def test_possessive_qualifiers(self): self.assertEqual(re.match('(e?){2,4}+a', 'eeea').groups(), ('',)) self.assertEqual(re.match('()*+a', 'a').groups(), ('',)) - self.assertEqual(re.search('x*+', 'axx').span(0), (0, 0)) self.assertEqual(re.search('x*+', 'axx').span(), (0, 0)) - self.assertEqual(re.search('x++', 'axx').span(0), (1, 3)) self.assertEqual(re.search('x++', 'axx').span(), (1, 3)) - self.assertEqual(re.match('a*+', 'xxx').span(0), (0, 0)) self.assertEqual(re.match('a*+', 'xxx').span(), (0, 0)) - self.assertEqual(re.match('x*+', 'xxxa').span(0), (0, 3)) self.assertEqual(re.match('x*+', 'xxxa').span(), (0, 3)) self.assertIsNone(re.match('a++', 'xxx')) self.assertIsNone(re.match(r"^(\w){1}+$", "abc")) @@ -2124,7 +2120,7 @@ 
def test_possessive_qualifiers(self): def test_atomic_grouping(self): """Test Atomic Grouping - test non-capturing groups of the form (?>...), which acts does + test non-capturing groups of the form (?>...), which does not maintain any stack point created within the group once the group is finished being evaluated.""" pattern1 = re.compile(r'a(?>bc|b)c') @@ -2215,7 +2211,7 @@ def test_possesive_repeat_one(self): LITERAL 97 0. INFO 4 0b0 0 1 (to 5) - 5: POSSESSIVE_ONE 6 0 1 (to 12) + 5: POSSESSIVE_REPEAT_ONE 6 0 1 (to 12) 9. LITERAL 0x61 ('a') 11. SUCCESS 12: SUCCESS diff --git a/Modules/_sre.c b/Modules/_sre.c index 9cf94b3f161fe1d..a181df406aa9377 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -1807,7 +1807,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) case SRE_OP_REPEAT_ONE: case SRE_OP_MIN_REPEAT_ONE: - case SRE_OP_POSSESSIVE_ONE: + case SRE_OP_POSSESSIVE_REPEAT_ONE: { SRE_CODE min, max; GET_SKIP; @@ -1827,27 +1827,9 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) break; case SRE_OP_REPEAT: - { - SRE_CODE min, max; - GET_SKIP; - GET_ARG; min = arg; - GET_ARG; max = arg; - if (min > max) - FAIL; - if (max > SRE_MAXREPEAT) - FAIL; - if (!_validate_inner(code, code+skip-3, groups)) - FAIL; - code += skip-3; - GET_OP; - if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL) - FAIL; - } - break; - case SRE_OP_POSSESSIVE_REPEAT: { - SRE_CODE min, max; + SRE_CODE op1 = op, min, max; GET_SKIP; GET_ARG; min = arg; GET_ARG; max = arg; @@ -1859,8 +1841,14 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) FAIL; code += skip-3; GET_OP; - if (op != SRE_OP_SUCCESS) - FAIL; + if (op1 == SRE_OP_POSSESSIVE_REPEAT) { + if (op != SRE_OP_SUCCESS) + FAIL; + } + else { + if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL) + FAIL; + } } break; diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 4d4b1e99380b4f6..8b9125b75b4568e 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ 
-42,7 +42,7 @@ #define SRE_OP_MIN_REPEAT_ONE 27 #define SRE_OP_ATOMIC_GROUP 28 #define SRE_OP_POSSESSIVE_REPEAT 29 -#define SRE_OP_POSSESSIVE_ONE 30 +#define SRE_OP_POSSESSIVE_REPEAT_ONE 30 #define SRE_OP_GROUPREF_IGNORE 31 #define SRE_OP_IN_IGNORE 32 #define SRE_OP_LITERAL_IGNORE 33 diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index a403747b2e9feb0..20afdd595769954 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -953,7 +953,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) } RETURN_FAILURE; - case SRE_OP_POSSESSIVE_ONE: + case SRE_OP_POSSESSIVE_REPEAT_ONE: /* match repeated sequence (maximizing regexp) without backtracking */ @@ -962,10 +962,10 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) collecting backtracking points. for other cases, use the MAX_REPEAT operator */ - /* <1=min> <2=max> item + /* <1=min> <2=max> item tail */ - TRACE(("|%p|%p|POSSESSIVE_ONE %d %d\n", ctx->pattern, + TRACE(("|%p|%p|POSSESSIVE_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1], ctx->pattern[2])); if (ctx->ptr + ctx->pattern[1] > end) { From 8015968a88af83f3204fd55e05c12497cc8db6f3 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 19 Mar 2022 12:08:57 +0200 Subject: [PATCH 3/8] Update documentation. --- Doc/library/re.rst | 72 ++++++++++--------- Doc/whatsnew/3.11.rst | 7 ++ Lib/test/test_re.py | 4 +- .../2022-03-19-08-42-57.bpo-433030.UTwRX7.rst | 3 + 4 files changed, 52 insertions(+), 34 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst diff --git a/Doc/library/re.rst b/Doc/library/re.rst index ffdfc0d5c86659f..5ebf8cfa4e86a9a 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -161,16 +161,20 @@ The special characters are: ``*+``, ``++``, ``?+`` Like the ``'*'``, ``'+'``, and ``'?'`` qualifiers, those where ``'+'`` is - appended also match as many times as possible. 
However, unlike the true greedy - qualifiers, these do not allow back-tracking when the expression following it - fails to match. These are known as :dfn:`Possessive` qualifiers. For example, - ``a*a`` will match ``'aaaa'`` because the ``a*`` will match all 4 ``'a'``s, but, - when the final ``'a'`` is encountered, the expression is backtracked so that in the - end the ``a*`` ends up matching 3 ``'a'``s total, and the fourth ``'a'`` is matched - by the final ``'a'``. However, when ``a*+a`` is used to match ``'aaaa'``, the - ``a*+`` will match all 4 ``'a'``, but when the final ``'a'`` fails to find any more - characters to match, the expression cannot be backtracked and will thus fail to - match. + appended also match as many times as possible. + However, unlike the true greedy qualifiers, these do not allow + back-tracking when the expression following it fails to match. + These are known as :dfn:`possessive` qualifiers. + For example, ``a*a`` will match ``'aaaa'`` because the ``a*`` will match + all 4 ``'a'``\ s, but, when the final ``'a'`` is encountered, the + expression is backtracked so that in the end the ``a*`` ends up matching + 3 ``'a'``\ s total, and the fourth ``'a'`` is matched by the final ``'a'``. + However, when ``a*+a`` is used to match ``'aaaa'``, the ``a*+`` will + match all 4 ``'a'``, but when the final ``'a'`` fails to find any more + characters to match, the expression cannot be backtracked and will thus + fail to match. + ``x*+``, ``x++`` and ``x?+`` are equivalent to ``(?>x*)``, ``(?>x+)`` + and ``(?>x?)`` correspondingly. .. versionadded:: 3.11 @@ -199,14 +203,17 @@ The special characters are: while ``a{3,5}?`` will only match 3 characters. ``{m,n}+`` - Causes the resulting RE to match from *m* to *n* repetitions of the preceding - RE, attempting to match as many repetitions as possible *without* establishing any - backtracking points. This is the possessive version of the qualifier above.
For - example, on the 6-character string ``'aaaaaa'``, ``a{3,5}+aa`` attempt to match 5 - ``'a'`` characters, then, requiring 2 more ``'a'``s, will need more characters than - available and thus fail, while ``a{3,5}aa`` will match with ``a{3,5}`` capturing - 5, then 4 ``'a'``s by backtracking and then the final 2 ``'a'``s are matched by the - final ``aa`` in the pattern. + Causes the resulting RE to match from *m* to *n* repetitions of the + preceding RE, attempting to match as many repetitions as possible + *without* establishing any backtracking points. + This is the possessive version of the qualifier above. + For example, on the 6-character string ``'aaaaaa'``, ``a{3,5}+aa`` + attempts to match 5 ``'a'`` characters, then, requiring 2 more ``'a'``\ s, + will need more characters than available and thus fail, while + ``a{3,5}aa`` will match with ``a{3,5}`` capturing 5, then 4 ``'a'``\ s + by backtracking and then the final 2 ``'a'``\ s are matched by the final + ``aa`` in the pattern. + ``x{m,n}+`` is equivalent to ``(?>x{m,n})``. .. versionadded:: 3.11 @@ -365,6 +372,21 @@ The special characters are: .. versionchanged:: 3.7 The letters ``'a'``, ``'L'`` and ``'u'`` also can be used in a group. +``(?>...)`` + Attempts to match ``...`` as if it was a separate regular expression, and + if successful, continues to match the rest of the pattern following it. + If the subsequent pattern fails to match, the stack can only be unwound + to a point *before* the ``(?>...)`` because once exited, the expression, + known as an :dfn:`atomic group`, has thrown away all stack points within + itself. + Thus, ``(?>.*).`` would never match anything because first the ``.*`` + would match all characters possible, then, having nothing left to match, + the final ``.`` would fail to match. + Since there are no stack points saved in the Atomic Group, and there is + no stack point before it, the entire expression would thus fail to match. + + .. versionadded:: 3.11 + ..
index:: single: (?P<; in regular expressions ``(?P...)`` @@ -453,20 +475,6 @@ The special characters are: some fixed length. Patterns which start with negative lookbehind assertions may match at the beginning of the string being searched. -``(?>...)`` - Attempts to match ``...`` as if it was a separate Regular Expression, and if - successful, continues to match the rest of the pattern following it. If the - subsequent pattern fails to match, the stack can only be unwound to a point - *before* the ``(?>...)`` because once exited, the expression, known as an - :dfn:`Atomic Group`, has thrown away all stack points within itself. Thus, - ``(?>.*).`` would never match anything because first the ``.*`` would match all - characters possible, then, having nothing left to match, the final ``.`` would - fail to match. Since there are no stack points saved in the Atomic Group, and - there is no stack point before it, the entire expression would thus fail to - match. - - .. versionadded:: 3.11 - ``(?(id/name)yes-pattern|no-pattern)`` Will try to match with ``yes-pattern`` if the group with given *id* or *name* exists, and with ``no-pattern`` if it doesn't. ``no-pattern`` is diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index 2af663809a448b2..2e019c4fef5a079 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -295,6 +295,13 @@ os instead of ``CryptGenRandom()`` which is deprecated. (Contributed by Dong-hee Na in :issue:`44611`.) +re +-- + +* :term:`Atomic grouping ` (``(?>...)``) and :term:`possessive` + qualifiers (``*+``, ``++``, ``?+``, ``{m,n}+``) are now supported in regular + expressions. + (Contributed by Jeffrey C. Jacobs and Serhiy Storchaka in :issue:`433030`.) 
shutil ------ diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 388a5e852e3248d..7dce0b6ea824526 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2081,7 +2081,7 @@ def test_bug_40736(self): def test_possessive_qualifiers(self): """Test Possessive Qualifiers - test qualifiers of the form @+ for some repetition operator @, + Test qualifiers of the form @+ for some repetition operator @, e.g. x{3,5}+ meaning match from 3 to 5 greadily and proceed without creating a stack frame for rolling the stack back and trying 1 or more fewer matches.""" @@ -2120,7 +2120,7 @@ def test_possessive_qualifiers(self): def test_atomic_grouping(self): """Test Atomic Grouping - test non-capturing groups of the form (?>...), which does + Test non-capturing groups of the form (?>...), which does not maintain any stack point created within the group once the group is finished being evaluated.""" pattern1 = re.compile(r'a(?>bc|b)c') diff --git a/Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst b/Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst new file mode 100644 index 000000000000000..76ad082f65101f5 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst @@ -0,0 +1,3 @@ +Add support of :term:`atomic grouping ` (``(?>...)``) and +:term:`possessive` qualifiers (``*+``, ``++``, ``?+``, ``{m,n}+``) in +:mod:`regular expressions `. From 599109c57346d706d82dc02acac21d6311c949b8 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 19 Mar 2022 12:27:50 +0200 Subject: [PATCH 4/8] Fix incorrect use of the term role. 
--- Doc/whatsnew/3.11.rst | 5 ++--- .../next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index 2e019c4fef5a079..75bfb85a62e6fcb 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -298,9 +298,8 @@ os re -- -* :term:`Atomic grouping ` (``(?>...)``) and :term:`possessive` - qualifiers (``*+``, ``++``, ``?+``, ``{m,n}+``) are now supported in regular - expressions. +* Atomic grouping (``(?>...)``) and possessive qualifiers (``*+``, ``++``, + ``?+``, ``{m,n}+``) are now supported in regular expressions. (Contributed by Jeffrey C. Jacobs and Serhiy Storchaka in :issue:`433030`.) shutil diff --git a/Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst b/Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst index 76ad082f65101f5..1afed73ffeab235 100644 --- a/Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst +++ b/Misc/NEWS.d/next/Library/2022-03-19-08-42-57.bpo-433030.UTwRX7.rst @@ -1,3 +1,2 @@ -Add support of :term:`atomic grouping ` (``(?>...)``) and -:term:`possessive` qualifiers (``*+``, ``++``, ``?+``, ``{m,n}+``) in -:mod:`regular expressions `. +Add support of atomic grouping (``(?>...)``) and possessive qualifiers +(``*+``, ``++``, ``?+``, ``{m,n}+``) in :mod:`regular expressions `. From 711975d84430462217e6847036c949125f36b5fe Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 20 Mar 2022 17:08:14 +0200 Subject: [PATCH 5/8] Polishing. 
--- Lib/sre_constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 359190af4e47118..a00b0170607b591 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -147,6 +147,7 @@ def _makecodes(names): CATEGORY_UNI_LINEBREAK CATEGORY_UNI_NOT_LINEBREAK """) + # replacement operations for "ignore case" mode OP_IGNORE = { LITERAL: LITERAL_IGNORE, From 9d540594481838da577c466405f89ddf3314c84a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 20 Mar 2022 17:21:17 +0200 Subject: [PATCH 6/8] Simplify the compiler code. --- Lib/sre_compile.py | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 524a06db6cc2d29..0867200a59a2302 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -17,11 +17,16 @@ assert _sre.MAGIC == MAGIC, "SRE module mismatch" _LITERAL_CODES = {LITERAL, NOT_LITERAL} -_REPEATING_CODES = {MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT} _SUCCESS_CODES = {SUCCESS, FAILURE} _ASSERT_CODES = {ASSERT, ASSERT_NOT} _UNIT_CODES = _LITERAL_CODES | {ANY, IN} +_REPEATING_CODES = { + MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE), + MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE), + POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), +} + # Sets of lowercase characters which have the same uppercase.
_equivalences = ( # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I @@ -138,12 +143,7 @@ def _compile(code, pattern, flags): if flags & SRE_FLAG_TEMPLATE: raise error("internal: unsupported template operator %r" % (op,)) if _simple(av[2]): - if op is MAX_REPEAT: - emit(REPEAT_ONE) - elif op is POSSESSIVE_REPEAT: - emit(POSSESSIVE_REPEAT_ONE) - else: - emit(MIN_REPEAT_ONE) + emit(REPEATING_CODES[op][2]) skip = _len(code); emit(0) emit(av[0]) emit(av[1]) @@ -151,21 +151,13 @@ def _compile(code, pattern, flags): emit(SUCCESS) code[skip] = _len(code) - skip else: - if op is POSSESSIVE_REPEAT: - emit(POSSESSIVE_REPEAT) - else: - emit(REPEAT) + emit(REPEATING_CODES[op][0]) skip = _len(code); emit(0) emit(av[0]) emit(av[1]) _compile(code, av[2], flags) code[skip] = _len(code) - skip - if op is POSSESSIVE_REPEAT: - emit(SUCCESS) - elif op is MAX_REPEAT: - emit(MAX_UNTIL) - else: - emit(MIN_UNTIL) + emit(REPEATING_CODES[op][1]) elif op is SUBPATTERN: group, add_flags, del_flags, p = av if group: From 7e4f64aa33774fd57d30ae3003eb00736210b1a7 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 21 Mar 2022 10:55:44 +0200 Subject: [PATCH 7/8] Fix possessive repetition of 1-character pattern --- Lib/test/test_re.py | 60 +++++++++++++++++++++++++++++++++++++++++++++ Modules/sre_lib.h | 5 +++- 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index e8badc2c12e5f5b..bde7509f09729f5 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2077,6 +2077,36 @@ def test_possessive_qualifiers(self): self.assertIsNone(re.match("^x{}+$", "xxx")) self.assertTrue(re.match("^x{}+$", "x{}")) + def test_fullmatch_possessive_qualifiers(self): + self.assertTrue(re.fullmatch(r'a++', 'a')) + self.assertTrue(re.fullmatch(r'a*+', 'a')) + self.assertTrue(re.fullmatch(r'a?+', 'a')) + self.assertTrue(re.fullmatch(r'a{1,3}+', 'a')) + self.assertIsNone(re.fullmatch(r'a++', 'ab')) + self.assertIsNone(re.fullmatch(r'a*+', 'ab')) +
self.assertIsNone(re.fullmatch(r'a?+', 'ab')) + self.assertIsNone(re.fullmatch(r'a{1,3}+', 'ab')) + + self.assertTrue(re.fullmatch(r'(?:ab)++', 'ab')) + self.assertTrue(re.fullmatch(r'(?:ab)*+', 'ab')) + self.assertTrue(re.fullmatch(r'(?:ab)?+', 'ab')) + self.assertTrue(re.fullmatch(r'(?:ab){1,3}+', 'ab')) + self.assertIsNone(re.fullmatch(r'(?:ab)++', 'abc')) + self.assertIsNone(re.fullmatch(r'(?:ab)*+', 'abc')) + self.assertIsNone(re.fullmatch(r'(?:ab)?+', 'abc')) + self.assertIsNone(re.fullmatch(r'(?:ab){1,3}+', 'abc')) + + def test_findall_possessive_qualifiers(self): + self.assertEqual(re.findall(r'a++', 'aab'), ['aa']) + self.assertEqual(re.findall(r'a*+', 'aab'), ['aa', '', '']) + self.assertEqual(re.findall(r'a?+', 'aab'), ['a', 'a', '', '']) + self.assertEqual(re.findall(r'a{1,3}+', 'aab'), ['aa']) + + self.assertEqual(re.findall(r'(?:ab)++', 'ababc'), ['abab']) + self.assertEqual(re.findall(r'(?:ab)*+', 'ababc'), ['abab', '', '']) + self.assertEqual(re.findall(r'(?:ab)?+', 'ababc'), ['ab', 'ab', '', '']) + self.assertEqual(re.findall(r'(?:ab){1,3}+', 'ababc'), ['abab']) + def test_atomic_grouping(self): """Test Atomic Grouping Test non-capturing groups of the form (?>...), which does @@ -2091,6 +2121,36 @@ def test_atomic_grouping(self): self.assertIsNone(re.match(r'(?>x)++x', 'xxx')) self.assertIsNone(re.match(r'(?>x++)x', 'xxx')) + def test_fullmatch_atomic_grouping(self): + self.assertTrue(re.fullmatch(r'(?>a+)', 'a')) + self.assertTrue(re.fullmatch(r'(?>a*)', 'a')) + self.assertTrue(re.fullmatch(r'(?>a?)', 'a')) + self.assertTrue(re.fullmatch(r'(?>a{1,3})', 'a')) + self.assertIsNone(re.fullmatch(r'(?>a+)', 'ab')) + self.assertIsNone(re.fullmatch(r'(?>a*)', 'ab')) + self.assertIsNone(re.fullmatch(r'(?>a?)', 'ab')) + self.assertIsNone(re.fullmatch(r'(?>a{1,3})', 'ab')) + + self.assertTrue(re.fullmatch(r'(?>(?:ab)+)', 'ab')) + self.assertTrue(re.fullmatch(r'(?>(?:ab)*)', 'ab')) + self.assertTrue(re.fullmatch(r'(?>(?:ab)?)', 'ab')) + 
self.assertTrue(re.fullmatch(r'(?>(?:ab){1,3})', 'ab')) + self.assertIsNone(re.fullmatch(r'(?>(?:ab)+)', 'abc')) + self.assertIsNone(re.fullmatch(r'(?>(?:ab)*)', 'abc')) + self.assertIsNone(re.fullmatch(r'(?>(?:ab)?)', 'abc')) + self.assertIsNone(re.fullmatch(r'(?>(?:ab){1,3})', 'abc')) + + def test_findall_atomic_grouping(self): + self.assertEqual(re.findall(r'(?>a+)', 'aab'), ['aa']) + self.assertEqual(re.findall(r'(?>a*)', 'aab'), ['aa', '', '']) + self.assertEqual(re.findall(r'(?>a?)', 'aab'), ['a', 'a', '', '']) + self.assertEqual(re.findall(r'(?>a{1,3})', 'aab'), ['aa']) + + self.assertEqual(re.findall(r'(?>(?:ab)+)', 'ababc'), ['abab']) + self.assertEqual(re.findall(r'(?>(?:ab)*)', 'ababc'), ['abab', '', '']) + self.assertEqual(re.findall(r'(?>(?:ab)?)', 'ababc'), ['ab', 'ab', '', '']) + self.assertEqual(re.findall(r'(?>(?:ab){1,3})', 'ababc'), ['abab']) + def get_debug_out(pat): with captured_stdout() as out: diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 20afdd595769954..956fd3fad91649f 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -995,7 +995,10 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* Let the tail be evaluated separately and consider this match successful. */ - if (*ctx->pattern == SRE_OP_SUCCESS) { + if (*ctx->pattern == SRE_OP_SUCCESS && + ctx->ptr == state->end && + !(ctx->toplevel && state->must_advance && ctx->ptr == state->start)) + { /* tail is empty. we're finished */ state->ptr = ctx->ptr; RETURN_SUCCESS; From b9e20f07559e38ae8e34596c78113db9e91f4075 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 21 Mar 2022 12:25:44 +0200 Subject: [PATCH 8/8] Add the original author in Misc/ACKS. --- Misc/ACKS | 1 + 1 file changed, 1 insertion(+) diff --git a/Misc/ACKS b/Misc/ACKS index 895813e8fc208a7..cfc15be2c9748f7 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -824,6 +824,7 @@ Ben Jackson Paul Jackson Manuel Jacob David Jacobs +Jeffrey C. Jacobs Kevin Jacobs Kjetil Jacobsen Shantanu Jain