diff --git a/Lib/profiling/sampling/binary_collector.py b/Lib/profiling/sampling/binary_collector.py index afbbc829269067..cdbb76b878b1d1 100644 --- a/Lib/profiling/sampling/binary_collector.py +++ b/Lib/profiling/sampling/binary_collector.py @@ -1,5 +1,6 @@ """Thin Python wrapper around C binary writer for profiling data.""" +import sys import time import _remote_debugging @@ -61,6 +62,7 @@ def __init__(self, filename, sample_interval_usec, *, skip_idle=False, self.filename = filename self.sample_interval_usec = sample_interval_usec self.skip_idle = skip_idle + self.running = True compression_type = _resolve_compression(compression) start_time_us = int(time.monotonic() * 1_000_000) @@ -81,7 +83,13 @@ def collect(self, stack_frames, timestamp_us=None): """ if timestamp_us is None: timestamp_us = int(time.monotonic() * 1_000_000) - self._writer.write_sample(stack_frames, timestamp_us) + try: + self._writer.write_sample(stack_frames, timestamp_us) + except OverflowError as e: + self.running = False + print(f"Warning: {e}; stopping early and keeping the data " + "collected so far.", + file=sys.stderr) def collect_failed_sample(self): """Record a failed sample attempt (no-op for binary format).""" diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py index 5efc60a9211175..5731f6f146b423 100644 --- a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py +++ b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py @@ -9,6 +9,8 @@ import unittest from collections import defaultdict +from test.support import captured_stderr + try: import _remote_debugging from _remote_debugging import ( @@ -971,6 +973,83 @@ def test_writer_total_samples_after_close_returns_zero(self): w.close() self.assertEqual(w.total_samples, 0) + def test_binary_collector_stops_gracefully_on_overflow(self): + """OverflowError from the writer stops collection via the running + protocol instead of propagating and corrupting the file. + See gh-151292.""" + with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f: + filename = f.name + self.temp_files.append(filename) + + collector = BinaryCollector(filename, 1000, compression="none") + self.assertTrue(collector.running) + + sample = [ + make_interpreter(0, [make_thread(1, [make_frame("a.py", 1, "f")])]) + ] + + # Collect real samples first, then hit the limit. + for i in range(3): + collector.collect(sample, timestamp_us=(i + 1) * 1000) + self.assertTrue(collector.running) + + real_writer = collector._writer + + class _OverflowingWriter: + def write_sample(self, stack_frames, timestamp_us): + raise OverflowError("too many samples for binary format") + + collector._writer = _OverflowingWriter() + with captured_stderr() as stderr: + collector.collect(sample, timestamp_us=4000) + + self.assertFalse(collector.running) + self.assertIn("too many samples", stderr.getvalue()) + + # The real writer can still be finalized into a valid file that + # keeps the samples collected before the limit was hit. + collector._writer = real_writer + collector.export(None) + + with open(filename, "rb") as f: + header = f.read(32) + magic, version = struct.unpack_from("=II", header, 0) + self.assertEqual(magic, 0x54414348) # "TACH" + self.assertEqual(version, 1) + (sample_count,) = struct.unpack_from("=I", header, 28) + self.assertEqual(sample_count, 3) + + reader_collector = RawCollector() + with BinaryReader(filename) as reader: + self.assertEqual(reader.replay_samples(reader_collector), 3) + + def test_interpreter_id_overflow_rejected(self): + """An interpreter_id wider than u32 raises OverflowError before any + writer state is mutated: subsequent valid samples are still accepted + and finalize produces a readable file.""" + with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f: + filename = f.name + self.temp_files.append(filename) + + good = [ + make_interpreter(0, [make_thread(1, [make_frame("a.py", 1, "f")])]) + ] + bad = [ + make_interpreter(2**32, [make_thread(1, [make_frame("a.py", 1, "f")])]) + ] + + writer = _remote_debugging.BinaryWriter(filename, 1000, 0, compression=0) + writer.write_sample(good, 1000) + with self.assertRaises(OverflowError): + writer.write_sample(bad, 2000) + writer.write_sample(good, 3000) + writer.finalize() + self.assertEqual(writer.total_samples, 2) + + reader_collector = RawCollector() + with BinaryReader(filename) as reader: + self.assertEqual(reader.replay_samples(reader_collector), 2) + class TestBinaryFormatValidation(BinaryFormatTestBase): """Tests for malformed binary files.""" diff --git a/Misc/NEWS.d/next/Library/2026-07-02-15-26-51.gh-issue-151292.nmnQlp.rst b/Misc/NEWS.d/next/Library/2026-07-02-15-26-51.gh-issue-151292.nmnQlp.rst new file mode 100644 index 00000000000000..eb3bda128bb256 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-07-02-15-26-51.gh-issue-151292.nmnQlp.rst @@ -0,0 +1,3 @@ +Fix ``profiling.sampling --binary`` leaving unreadable profile files when +the ``_remote_debugging`` binary writer raises :exc:`OverflowError`. Patch +by Maurycy Pawłowski-Wieroński. diff --git a/Modules/_remote_debugging/binary_io_writer.c b/Modules/_remote_debugging/binary_io_writer.c index 341f9f7dc8ac45..9bcc7de94ca5db 100644 --- a/Modules/_remote_debugging/binary_io_writer.c +++ b/Modules/_remote_debugging/binary_io_writer.c @@ -629,11 +629,6 @@ flush_pending_rle(BinaryWriter *writer, ThreadEntry *entry) if (!entry->has_pending_rle || entry->pending_rle_count == 0) { return 0; } - if (entry->pending_rle_count > UINT32_MAX - writer->total_samples) { - PyErr_SetString(PyExc_OverflowError, - "too many samples for binary format"); - return -1; - } /* Write RLE record: * [thread_id: 8] [interpreter_id: 4] [STACK_REPEAT: 1] [count: varint] @@ -655,7 +650,6 @@ flush_pending_rle(BinaryWriter *writer, ThreadEntry *entry) if (writer_write_bytes(writer, &entry->pending_rle[i].status, 1) < 0) { return -1; } - writer->total_samples++; } writer->stats.repeat_records++; @@ -678,12 +672,6 @@ write_sample_with_encoding(BinaryWriter *writer, ThreadEntry *entry, const uint32_t *frame_indices, size_t stack_depth, size_t shared_count, size_t pop_count, size_t push_count) { - if (writer->total_samples == UINT32_MAX) { - PyErr_SetString(PyExc_OverflowError, - "too many samples for binary format"); - return -1; - } - /* Header: thread_id(8) + interpreter_id(4) + encoding(1) + delta(varint) + status(1) */ uint8_t header_buf[SAMPLE_HEADER_MAX_SIZE]; memcpy(header_buf + SMP_OFF_THREAD_ID, &entry->thread_id, SMP_SIZE_THREAD_ID); @@ -955,6 +943,12 @@ static int process_thread_sample(BinaryWriter *writer, PyObject *thread_info, uint32_t interpreter_id, uint64_t timestamp_us) { + if (writer->total_samples == UINT32_MAX) { + PyErr_SetString(PyExc_OverflowError, + "too many samples for binary format"); + return -1; + } + PyObject *thread_id_obj = PyStructSequence_GET_ITEM(thread_info, 0); PyObject *status_obj = PyStructSequence_GET_ITEM(thread_info, 1); PyObject *frame_list = PyStructSequence_GET_ITEM(thread_info, 2); @@ -1010,6 +1004,7 @@ process_thread_sample(BinaryWriter *writer, PyObject *thread_info, entry->pending_rle[entry->pending_rle_count].status = status; entry->pending_rle_count++; entry->has_pending_rle = 1; + writer->total_samples++; } else { /* Stack changed - flush any pending RLE first */ if (entry->has_pending_rle) {