From c266f0c375c2e60ea46046254fa7cd5fa2fe1ca2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maurycy=20Paw=C5=82owski-Wiero=C5=84ski?=
 <maurycy@maurycy.com>
Date: Tue, 5 May 2026 02:50:06 +0200
Subject: [PATCH] gh-149009: Validate `thread_count` in `profiling.sampling`
 binary reader (#149147)

---
 .../test_binary_format.py                     | 30 +++++++++++++++++++
 ...-04-29-13-08-46.gh-issue-149009.rek3Tw.rst |  3 ++
 Modules/_remote_debugging/binary_io_reader.c  |  8 +++++
 3 files changed, 41 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Library/2026-04-29-13-08-46.gh-issue-149009.rek3Tw.rst

diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py
index ca6cb6befae..9cf706aa2da 100644
--- a/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py
+++ b/Lib/test/test_profiling/test_sampling_profiler/test_binary_format.py
@@ -3,6 +3,7 @@
 import json
 import os
 import random
+import struct
 import tempfile
 import unittest
 from collections import defaultdict
@@ -941,6 +942,35 @@ class TestBinaryEdgeCases(BinaryFormatTestBase):
         self.assertEqual(w.total_samples, 0)
 
 
+class TestBinaryFormatValidation(BinaryFormatTestBase):
+    """Tests for malformed binary files."""
+
+    HDR_OFF_THREADS = 32  # byte offset at which the test patches the header's thread_count
+
+    def test_replay_rejects_more_threads_than_declared(self):
+        """Replay rejects files with more unique threads than the header declares."""
+        threads = [
+            make_thread(1, [make_frame("t1.py", 10, "t1")]),
+            make_thread(2, [make_frame("t2.py", 20, "t2")]),
+        ]
+        samples = [[make_interpreter(0, threads)]]  # one sample holding two distinct threads
+        filename = self.create_binary_file(samples, compression="none")
+
+        with open(filename, "r+b") as raw:  # patch the header in place, leave sample data intact
+            raw.seek(self.HDR_OFF_THREADS)
+            raw.write(struct.pack("=I", 1))  # declare 1 thread although the samples contain 2
+
+        with BinaryReader(filename) as reader:
+            self.assertEqual(reader.get_info()["thread_count"], 1)  # confirms the patch hit thread_count
+            with self.assertRaises(ValueError) as cm:
+                reader.replay_samples(RawCollector())
+            self.assertEqual(  # must match the message raised by the C reader exactly
+                str(cm.exception),
+                "Invalid thread count: sample data contains more unique "
+                "threads than declared in header (declared 1, found at least 2)",
+            )
+
+
 class TestBinaryEncodings(BinaryFormatTestBase):
     """Tests specifically targeting different stack encodings."""
 
diff --git a/Misc/NEWS.d/next/Library/2026-04-29-13-08-46.gh-issue-149009.rek3Tw.rst b/Misc/NEWS.d/next/Library/2026-04-29-13-08-46.gh-issue-149009.rek3Tw.rst
new file mode 100644
index 00000000000..e2f07874276
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-04-29-13-08-46.gh-issue-149009.rek3Tw.rst
@@ -0,0 +1,3 @@
+Validate that :mod:`profiling.sampling` binary profiles do not contain more
+unique (thread, interpreter) pairs than declared in the header. Patch by
+Maurycy Pawłowski-Wieroński.
diff --git a/Modules/_remote_debugging/binary_io_reader.c b/Modules/_remote_debugging/binary_io_reader.c
index da3e7d55309..551530b5199 100644
--- a/Modules/_remote_debugging/binary_io_reader.c
+++ b/Modules/_remote_debugging/binary_io_reader.c
@@ -559,6 +559,14 @@ reader_get_or_create_thread_state(BinaryReader *reader, uint64_t thread_id,
         }
     }
 
+    if (reader->thread_state_count >= reader->thread_count) {
+        PyErr_Format(PyExc_ValueError,
+            "Invalid thread count: sample data contains more unique threads than declared in header "
+            "(declared %u, found at least %zu)",
+            reader->thread_count, reader->thread_state_count + 1);
+        return NULL;
+    }
+
     if (!reader->thread_states) {
         reader->thread_state_capacity = 16;
         reader->thread_states = PyMem_Calloc(reader->thread_state_capacity, sizeof(ReaderThreadState));