Skip to content

Commit 7933f4b

Browse files
miss-islington, serhiy-storchaka, claude
authored
[3.13] gh-153030: Fix quadratic complexity in incremental parsing in HTMLParser (GH-153031) (GH-153040)
When an unterminated construct (e.g. a tag or comment) spanned many feed() calls, rescanning the growing buffer and concatenating new data onto it were both quadratic. New data is now accumulated in a list and only joined and parsed once enough has piled up.

(cherry picked from commit bcf98dd)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 8a28075 commit 7933f4b

3 files changed

Lines changed: 53 additions & 2 deletions

File tree

Lib/html/parser.py

Lines changed: 30 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,9 @@ def reset(self):
157157
self.cdata_elem = None
158158
self._support_cdata = True
159159
self._escapable = True
160+
self._pending = []
161+
self._pending_len = 0
162+
self._parse_threshold = 1
160163
super().reset()
161164

162165
def feed(self, data):
@@ -165,11 +168,36 @@ def feed(self, data):
165168
Call this as often as you want, with as little or as much text
166169
as you want (may include '\n').
167170
"""
168-
self.rawdata = self.rawdata + data
169-
self.goahead(0)
171+
# Accumulate new data in a list and only join and parse it once
172+
# enough has piled up. Rescanning an unparsed buffer (e.g. an
173+
# unterminated tag) and concatenating onto it on every call would
174+
# both be quadratic in the input size.
175+
self._pending_len += len(data)
176+
if self._pending_len < self._parse_threshold:
177+
self._pending.append(data)
178+
else:
179+
if not self._pending:
180+
self.rawdata += data
181+
else:
182+
self._pending.append(data)
183+
self.rawdata += ''.join(self._pending)
184+
self._pending.clear()
185+
self._pending_len = 0
186+
n = len(self.rawdata)
187+
self.goahead(0)
188+
if len(self.rawdata) < n:
189+
# Some data was parsed; resume on the next call.
190+
self._parse_threshold = 1
191+
else:
192+
# Nothing was parsed; wait until the buffer doubles.
193+
self._parse_threshold = len(self.rawdata)
170194

171195
def close(self):
172196
"""Handle any buffered data."""
197+
if self._pending:
198+
self.rawdata += ''.join(self._pending)
199+
self._pending.clear()
200+
self._pending_len = 0
173201
self.goahead(1)
174202

175203
__starttag_text = None

Lib/test/test_htmlparser.py

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1041,6 +1041,26 @@ def check(source):
10411041
check("<![CDATA[" * 9 * n)
10421042
check("<!doctype" * 35 * n)
10431043

1044+
@support.requires_resource('cpu')
1045+
def test_incremental_no_quadratic_complexity(self):
1046+
# An unterminated construct fed in many small chunks used to take
1047+
# quadratic time, both to rescan and to concatenate the buffer.
1048+
# Now it takes a fraction of a second.
1049+
def check(prefix, chunk, suffix):
1050+
parser = html.parser.HTMLParser()
1051+
parser.feed(prefix)
1052+
for _ in range(200_000):
1053+
parser.feed(chunk)
1054+
parser.feed(suffix)
1055+
parser.close()
1056+
chunk = "a" * 64
1057+
check("<!--", chunk, "-->") # comment
1058+
check("<?", chunk, ">") # processing instruction
1059+
check("<!doctype ", chunk, ">") # doctype
1060+
check("<![CDATA[", chunk, "]]>") # CDATA section
1061+
check("<a href='", chunk, "'>") # start tag
1062+
check("<script>", chunk, "</script>") # RAWTEXT element
1063+
10441064

10451065
class AttributesTestCase(TestCaseBase):
10461066

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
Fixed quadratic complexity in incremental parsing of long unterminated
2+
constructs (such as tags or comments) in :class:`html.parser.HTMLParser`,
3+
which could be exploited for a denial of service.

0 commit comments

Comments
 (0)