[3.13] gh-153030: Fix quadratic complexity in incremental parsing in HTMLParser (GH-153031) (GH-153040)

miss-islington · serhiy-storchaka · claude · web-flow · commit 7933f4bf7131 · 2026-07-04T18:01:22.000Z
When an unterminated construct (e.g. a tag or comment) spanned many feed() calls, both rescanning the growing buffer and concatenating new data onto it took quadratic time. New data is now accumulated in a list and is joined and parsed only once enough has piled up: after a parse attempt that consumes nothing, the next attempt is deferred until the pending data is at least as large as the unparsed buffer, i.e. until the buffer has doubled.

(cherry picked from commit bcf98dd)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
@@ -157,6 +157,9 @@ def reset(self):
         self.cdata_elem = None
         self._support_cdata = True
         self._escapable = True
+        self._pending = []
+        self._pending_len = 0
+        self._parse_threshold = 1
         super().reset()
 
     def feed(self, data):
@@ -165,11 +168,36 @@ def feed(self, data):
         Call this as often as you want, with as little or as much text
         as you want (may include '\n').
         """
-        self.rawdata = self.rawdata + data
-        self.goahead(0)
+        # Accumulate new data in a list and only join and parse it once
+        # enough has piled up.  Rescanning an unparsed buffer (e.g. an
+        # unterminated tag) and concatenating onto it on every call would
+        # both be quadratic in the input size.
+        self._pending_len += len(data)
+        if self._pending_len < self._parse_threshold:
+            self._pending.append(data)
+        else:
+            if not self._pending:
+                self.rawdata += data
+            else:
+                self._pending.append(data)
+                self.rawdata += ''.join(self._pending)
+                self._pending.clear()
+            self._pending_len = 0
+            n = len(self.rawdata)
+            self.goahead(0)
+            if len(self.rawdata) < n:
+                # Some data was parsed; resume on the next call.
+                self._parse_threshold = 1
+            else:
+                # Nothing was parsed; wait until the buffer doubles.
+                self._parse_threshold = len(self.rawdata)
 
     def close(self):
         """Handle any buffered data."""
+        if self._pending:
+            self.rawdata += ''.join(self._pending)
+            self._pending.clear()
+            self._pending_len = 0
         self.goahead(1)
 
     __starttag_text = None
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
@@ -1041,6 +1041,26 @@ def check(source):
         check("<![CDATA[" * 9 * n)
         check("<!doctype" * 35 * n)
 
+    @support.requires_resource('cpu')
+    def test_incremental_no_quadratic_complexity(self):
+        # An unterminated construct fed in many small chunks used to take
+        # quadratic time, both to rescan and to concatenate the buffer.
+        # Now it takes a fraction of a second.
+        def check(prefix, chunk, suffix):
+            parser = html.parser.HTMLParser()
+            parser.feed(prefix)
+            for _ in range(200_000):
+                parser.feed(chunk)
+            parser.feed(suffix)
+            parser.close()
+        chunk = "a" * 64
+        check("<!--", chunk, "-->")       # comment
+        check("<?", chunk, ">")           # processing instruction
+        check("<!doctype ", chunk, ">")   # doctype
+        check("<![CDATA[", chunk, "]]>")  # CDATA section
+        check("<a href='", chunk, "'>")   # start tag
+        check("<script>", chunk, "</script>")  # RAWTEXT element
+
 
 class AttributesTestCase(TestCaseBase):
 
diff --git a/Misc/NEWS.d/next/Security/2026-07-04-17-00-00.gh-issue-153030.RovkP6.rst b/Misc/NEWS.d/next/Security/2026-07-04-17-00-00.gh-issue-153030.RovkP6.rst
@@ -0,0 +1,3 @@
+Fixed quadratic complexity in incremental parsing of long unterminated
+constructs (such as tags or comments) in :class:`html.parser.HTMLParser`,
+which could be exploited for a denial of service.

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+Fixed quadratic complexity in incremental parsing of long unterminated`
	`2`	``+constructs (such as tags or comments) in :class:`html.parser.HTMLParser`,``
	`3`	`+which could be exploited for a denial of service.`