From fe8bcb254eac30d779454107a649c21e5ca3629d Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 2 Nov 2025 10:33:21 +0200 Subject: [PATCH 1/3] gh-140875: Fix handling of unclosed charrefs before EOF in HTMLParser --- Lib/html/parser.py | 29 +++-- Lib/test/test_htmlparser.py | 109 ++++++++++++++---- ...-11-02-10-44-23.gh-issue-140875.wt6B37.rst | 2 + 3 files changed, 107 insertions(+), 33 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-11-02-10-44-23.gh-issue-140875.wt6B37.rst diff --git a/Lib/html/parser.py b/Lib/html/parser.py index e50620de800d636..80fb8c3f929f6b6 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -24,6 +24,7 @@ entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') +incomplete_charref = re.compile('&#(?:[0-9]|[xX][0-9a-fA-F])') attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?') starttagopen = re.compile('<[a-zA-Z]') @@ -304,10 +305,20 @@ def goahead(self, end): k = k - 1 i = self.updatepos(i, k) continue + match = incomplete_charref.match(rawdata, i) + if match: + if end: + self.handle_charref(rawdata[i+2:]) + i = self.updatepos(i, n) + break + # incomplete + break + elif i + 3 < n: # larger than "&#x" + # not the end of the buffer, and can't be confused + # with some other construct + self.handle_data("&#") + i = self.updatepos(i, i + 2) else: - if ";" in rawdata[i:]: # bail by consuming &# - self.handle_data(rawdata[i:i+2]) - i = self.updatepos(i, i+2) break elif startswith('&', i): match = entityref.match(rawdata, i) @@ -321,15 +332,13 @@ def goahead(self, end): continue match = incomplete.match(rawdata, i) if match: - # match.group() will contain at least 2 chars - if end and match.group() == rawdata[i:]: - k = match.end() - if k <= i: - k = n - i = self.updatepos(i, i + 1) + if end: + self.handle_entityref(rawdata[i+1:]) + i = self.updatepos(i, n) + break # incomplete break - elif (i + 1) < n: + elif i + 1 < n: # not the end of the buffer, and can't be confused # with some other construct self.handle_data("&") diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 19dde9362a43b6c..424b6f5978504eb 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -109,12 +109,13 @@ def get_events(self): class TestCaseBase(unittest.TestCase): - def get_collector(self): - return EventCollector(convert_charrefs=False) + def get_collector(self, convert_charrefs=False): + return EventCollector(convert_charrefs=convert_charrefs) - def _run_check(self, source, expected_events, collector=None): + def _run_check(self, source, expected_events, + *, collector=None, convert_charrefs=False): if collector is None: - collector = self.get_collector() + collector = self.get_collector(convert_charrefs=convert_charrefs) parser = collector for s in source: parser.feed(s) @@ -128,7 +129,7 @@ def _run_check(self, source, expected_events, collector=None): def _run_check_extra(self, source, events): self._run_check(source, events, - EventCollectorExtra(convert_charrefs=False)) + collector=EventCollectorExtra(convert_charrefs=False)) class HTMLParserTestCase(TestCaseBase): @@ -187,10 +188,86 @@ def test_malformatted_charref(self): ]) def test_unclosed_entityref(self): - self._run_check("&entityref foo", [ - ("entityref", "entityref"), - ("data", " foo"), - ]) + self._run_check('> z', [('entityref', 'gt'), ('data', ' z')], + convert_charrefs=False) + self._run_check('> z', [('data', '> z')], convert_charrefs=True) + + self._run_check('&undefined z', + [('entityref', 'undefined'), ('data', ' z')], + convert_charrefs=False) + self._run_check('&undefined z', [('data', '&undefined z')], + convert_charrefs=True) + + self._run_check('>undefined z', + [('entityref', 'gtundefined'), ('data', ' z')], + convert_charrefs=False) + self._run_check('>undefined z', [('data', '>undefined z')], + convert_charrefs=True) + + self._run_check('& z', [('data', '& z')], convert_charrefs=False) + self._run_check('& z', [('data', '& z')], convert_charrefs=True) + + def test_eof_in_entityref(self): + self._run_check('>', [('entityref', 'gt')], convert_charrefs=False) + self._run_check('>', [('data', '>')], convert_charrefs=True) + + self._run_check('&g', [('entityref', 'g')], convert_charrefs=False) + self._run_check('&g', [('data', '&g')], convert_charrefs=True) + + self._run_check('&undefined', [('entityref', 'undefined')], + convert_charrefs=False) + self._run_check('&undefined', [('data', '&undefined')], + convert_charrefs=True) + + self._run_check('>undefined', [('entityref', 'gtundefined')], + convert_charrefs=False) + self._run_check('>undefined', [('data', '>undefined')], + convert_charrefs=True) + + self._run_check('&', [('data', '&')], convert_charrefs=False) + self._run_check('&', [('data', '&')], convert_charrefs=True) + + def test_unclosed_charref(self): + self._run_check('{ z', [('charref', '123'), ('data', ' z')], + convert_charrefs=False) + self._run_check('{ z', [('data', '{ z')], convert_charrefs=True) + self._run_check('« z', [('charref', 'xab'), ('data', ' z')], + convert_charrefs=False) + self._run_check('« z', [('data', '\xab z')], convert_charrefs=True) + + self._run_check('� z', + [('charref', '123456789'), ('data', ' z')], + convert_charrefs=False) + self._run_check('� z', [('data', '\ufffd z')], + convert_charrefs=True) + self._run_check('� z', + [('charref', 'x123456789'), ('data', ' z')], + convert_charrefs=False) + self._run_check('� z', [('data', '\ufffd z')], + convert_charrefs=True) + + self._run_check('&# z', [('data', '&# z')], convert_charrefs=False) + self._run_check('&# z', [('data', '&# z')], convert_charrefs=True) + self._run_check('&#x z', [('data', '&#x z')], convert_charrefs=False) + self._run_check('&#x z', [('data', '&#x z')], convert_charrefs=True) + + def test_eof_in_charref(self): + self._run_check('{', [('charref', '123')], convert_charrefs=False) + self._run_check('{', [('data', '{')], convert_charrefs=True) + self._run_check('«', [('charref', 'xab')], convert_charrefs=False) + self._run_check('«', [('data', '\xab')], convert_charrefs=True) + + self._run_check('�', [('charref', '123456789')], + convert_charrefs=False) + self._run_check('�', [('data', '\ufffd')], convert_charrefs=True) + self._run_check('�', [('charref', 'x123456789')], + convert_charrefs=False) + self._run_check('�', [('data', '\ufffd')], convert_charrefs=True) + + self._run_check('&#', [('data', '&#')], convert_charrefs=False) + self._run_check('&#', [('data', '&#')], convert_charrefs=True) + self._run_check('&#x', [('data', '&#x')], convert_charrefs=False) + self._run_check('&#x', [('data', '&#x')], convert_charrefs=True) def test_bad_nesting(self): # Strangely, this *is* supposed to test that overlapping @@ -762,20 +839,6 @@ def test_correct_detection_of_start_tags(self): ] self._run_check(html, expected) - def test_EOF_in_charref(self): - # see #17802 - # This test checks that the UnboundLocalError reported in the issue - # is not raised, however I'm not sure the returned values are correct. - # Maybe HTMLParser should use self.unescape for these - data = [ - ('a&', [('data', 'a&')]), - ('a&b', [('data', 'ab')]), - ('a&b ', [('data', 'a'), ('entityref', 'b'), ('data', ' ')]), - ('a&b;', [('data', 'a'), ('entityref', 'b')]), - ] - for html, expected in data: - self._run_check(html, expected) - def test_eof_in_comments(self): data = [ ('