More minor improvements to conformance.py (#23792)

Alex Waygood
2026-03-07 17:54:21 +00:00
committed by GitHub
parent edac24bbd1
commit 5cdd2fe435
+64 -32
@@ -76,13 +76,12 @@ CONFORMANCE_URL = CONFORMANCE_DIR_WITH_README + "tests/{filename}#L{line}"
 GITHUB_HEADER = [
     "<table>",
     "",
     "<tr>",
     "<th>Test case</th>",
     "<th>Diff</th>",
     "</tr>",
 ]
-GITHUB_FOOTER = ["", "</table>"]
+GITHUB_FOOTER = ["</table>"]
 SUMMARY_NOTE = """
 Each test case represents one expected error annotation or a group of annotations
 sharing a tag. Counts are per test case, not per diagnostic — multiple diagnostics
@@ -370,7 +369,6 @@ def render_html_diff_row(tc: TestCase, *, source: Source | None) -> list[str]:
     return [
         "",
         "<tr>",
-        "",
         "<td>",
         "",
         location,
@@ -384,7 +382,6 @@ def render_html_diff_row(tc: TestCase, *, source: Source | None) -> list[str]:
         "```",
         "",
         "</td>",
-        "",
         "</tr>",
     ]
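Together with the GITHUB_FOOTER change in the first hunk, these removals strip blank entries that only separated bare HTML tags. GitHub needs blank lines solely where markdown content inside the HTML table (the location link and the fenced diff) must be rendered, so the blanks around cell content survive. A sketch of one assembled row under the new constants; the cell content lines are placeholders, not real script output:

```python
# Hypothetical assembly sketch mirroring the list shapes in the diff.
GITHUB_HEADER = [
    "<table>",
    "",
    "<tr>",
    "<th>Test case</th>",
    "<th>Diff</th>",
    "</tr>",
]
GITHUB_FOOTER = ["</table>"]

row = [
    "",
    "<tr>",
    "<td>",
    "",
    "tests/example.py:12  (placeholder for the rendered location link)",
    "",
    "(fenced diff block goes here)",
    "",
    "</td>",
    "</tr>",
]
print("\n".join(GITHUB_HEADER + row + GITHUB_FOOTER))
```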
@@ -719,16 +716,28 @@ def render_test_cases(
         lines.append("```")
     else:
         lines.extend(GITHUB_FOOTER)
-    lines.extend(["", "</details>", ""])
+    lines.extend(["</details>", ""])
     return "\n".join(lines)


-def render_file_stats_table(test_cases: list[TestCase]) -> str:
-    """Render a per-file breakdown showing only files whose TP/FP/FN counts changed."""
+def collect_file_stats(test_cases: list[TestCase]) -> list[FileStats]:
+    """Compute per-file statistics from grouped test cases."""
+    path_to_cases: dict[Path, list[TestCase]] = {}
+    for tc in test_cases:
+        path_to_cases.setdefault(tc.path, []).append(tc)
+    return [
+        FileStats(
+            path=path,
+            old=compute_stats(cases, Source.OLD),
+            new=compute_stats(cases, Source.NEW),
+        )
+        for path, cases in path_to_cases.items()
+    ]
+
+
+def render_file_stats_table(file_stats: list[FileStats]) -> str:
+    """Render a per-file breakdown showing only files whose TP/FP/FN counts changed."""

     def fmt(old: int, new: int, *, greater_is_better: bool = True) -> str:
         if old == new:
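This is the core of the refactor: the path-to-cases bucketing that used to live inside render_file_stats_table is now its own function, so its result can be shared with render_summary. A runnable toy of the bucketing step, with TestCase reduced to a bare path and invented file names:

```python
from dataclasses import dataclass
from pathlib import Path


@dataclass
class TestCase:  # stand-in: the real class also carries the diagnostics
    path: Path


cases = [
    TestCase(Path("tests/generics_basic.py")),
    TestCase(Path("tests/generics_basic.py")),
    TestCase(Path("tests/protocols.py")),
]

# The same setdefault bucketing as collect_file_stats():
path_to_cases: dict[Path, list[TestCase]] = {}
for tc in cases:
    path_to_cases.setdefault(tc.path, []).append(tc)

# One FileStats would be built per distinct file:
assert len(path_to_cases) == 2
assert len(path_to_cases[Path("tests/generics_basic.py")]) == 2
```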
@@ -738,19 +747,13 @@ def render_file_stats_table(test_cases: list[TestCase]) -> str:
         indicator = "" if improved else ""
         return f"{new} ({diff:+}){indicator}"

-    # Collect per-file data; track totals across all files regardless of change.
-    file_stats: list[FileStats] = []
+    # Collect totals across all files regardless of change.
     old_totals = Statistics()
     new_totals = Statistics()
     passing = 0
-    total_files = 0
+    total_files = len(file_stats)

-    for path, cases in path_to_cases.items():
-        fs = FileStats(
-            path=path,
-            old=compute_stats(cases, Source.OLD),
-            new=compute_stats(cases, Source.NEW),
-        )
+    for fs in file_stats:
         old_totals.true_positives += fs.old.true_positives
         old_totals.false_positives += fs.old.false_positives
         old_totals.false_negatives += fs.old.false_negatives
@@ -758,8 +761,6 @@ def render_file_stats_table(test_cases: list[TestCase]) -> str:
         new_totals.false_positives += fs.new.false_positives
         new_totals.false_negatives += fs.new.false_negatives
         passing += fs.new_passes
-        total_files += 1
-        file_stats.append(fs)

     changed = [fs for fs in file_stats if fs.total_change > 0]
     if not changed:
@@ -772,9 +773,9 @@ def render_file_stats_table(test_cases: list[TestCase]) -> str:
         if fs.new_passes and not fs.old_passes:
             status = "✅ Newly Passing 🎉"
         elif fs.old_passes and not fs.new_passes:
-            status = "❌ Newly Failing"
+            status = "❌ Newly Failing ☹️"
         elif fs.new_passes:
-            status = ""
+            status = " Still Passing"
         else:
             old_errors = fs.old.false_positives + fs.old.false_negatives
             new_errors = fs.new.false_positives + fs.new.false_negatives
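Every row of the per-file table now carries a status label. The branch logic, sketched as a standalone helper (it is inline in the real script); the leading checkmark on "Still Passing" is an assumption, not confirmed by the diff:

```python
def file_status(old_passes: bool, new_passes: bool) -> str:
    """Label a file's transition between the old and new runs."""
    if new_passes and not old_passes:
        return "✅ Newly Passing 🎉"
    if old_passes and not new_passes:
        return "❌ Newly Failing ☹️"
    if new_passes:
        return "✅ Still Passing"  # leading emoji assumed
    return ""  # still failing: the real table shows error-count deltas instead


assert file_status(False, True) == "✅ Newly Passing 🎉"
assert file_status(True, True) == "✅ Still Passing"
```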
@@ -849,17 +850,34 @@ def diff_format(
     assert_never((greater_is_better, increased))  # ty: ignore[type-assertion-failure]


-def render_summary(test_cases: list[TestCase], *, force_summary_table: bool) -> str:
-    def format_metric(diff: float, old: float, new: float):
+def render_summary(
+    test_cases: list[TestCase],
+    file_stats: list[FileStats],
+    *,
+    force_summary_table: bool,
+) -> str:
+    def format_metric(diff: float, old: float, new: float) -> str:
         if diff > 0:
-            return f"increased from {old:.2%} to {new:.2%}"
+            return f"increased from <b>{old:.2%}</b> to <b>{new:.2%}</b>"
         if diff < 0:
-            return f"decreased from {old:.2%} to {new:.2%}"
-        return f"held steady at {old:.2%}"
+            return f"decreased from <b>{old:.2%}</b> to <b>{new:.2%}</b>"
+        return f"held steady at <b>{old:.2%}</b>"
+
+    def format_int_metric(diff: int, old: int, new: int, total: int) -> str:
+        if diff > 0:
+            return f"improved from <b>{old}/{total}</b> to <b>{new}/{total}</b>"
+        if diff < 0:
+            return f"regressed from <b>{old}/{total}</b> to <b>{new}/{total}</b>"
+        return f"held steady at <b>{old}/{total}</b>"

     old = compute_stats(test_cases, Source.OLD)
     new = compute_stats(test_cases, Source.NEW)
+    old_files_passing = sum(fs.old_passes for fs in file_stats)
+    new_files_passing = sum(fs.new_passes for fs in file_stats)
+    total_files = len(file_stats)
+    files_passing_change = new_files_passing - old_files_passing

     assert new.true_positives > 0, (
         "Expected ty to have at least one true positive.\n"
         f"Sample of grouped diagnostics: {test_cases[:5]}"
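The new format_int_metric mirrors format_metric for whole-file counts. Reproduced here outside render_summary, with invented numbers, to show its three output shapes:

```python
def format_int_metric(diff: int, old: int, new: int, total: int) -> str:
    if diff > 0:
        return f"improved from <b>{old}/{total}</b> to <b>{new}/{total}</b>"
    if diff < 0:
        return f"regressed from <b>{old}/{total}</b> to <b>{new}/{total}</b>"
    return f"held steady at <b>{old}/{total}</b>"


# Invented counts, purely illustrative:
print(format_int_metric(2, 98, 100, 143))   # improved from <b>98/143</b> to <b>100/143</b>
print(format_int_metric(-1, 98, 97, 143))   # regressed from <b>98/143</b> to <b>97/143</b>
print(format_int_metric(0, 98, 98, 143))    # held steady at <b>98/143</b>
```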
@@ -876,7 +894,9 @@ def render_summary(test_cases: list[TestCase], *, force_summary_table: bool) ->
         f"The percentage of diagnostics emitted that were expected errors "
         f"{format_metric(precision_change, old.precision, new.precision)}. "
         f"The percentage of expected errors that received a diagnostic "
-        f"{format_metric(recall_change, old.recall, new.recall)}."
+        f"{format_metric(recall_change, old.recall, new.recall)}. "
+        f"The number of fully passing files "
+        f"{format_int_metric(files_passing_change, old_files_passing, new_files_passing, total_files)}."
     )

     base_header = f"[Typing conformance results]({CONFORMANCE_DIR_WITH_README})"
@@ -906,13 +926,18 @@ def render_summary(test_cases: list[TestCase], *, force_summary_table: bool) ->
     precision_diff = diff_format(precision_change, greater_is_better=True)
     recall_diff = diff_format(recall_change, greater_is_better=True)
     total_diff = diff_format(total_change, neutral=True)
+    passing_diff = diff_format(files_passing_change, greater_is_better=True)

-    if (precision_change > 0 and recall_change >= 0) or (
-        recall_change > 0 and precision_change >= 0
+    if (
+        (precision_change > 0 and recall_change >= 0 and files_passing_change >= 0)
+        or (recall_change > 0 and precision_change >= 0 and files_passing_change >= 0)
+        or (files_passing_change > 0 and precision_change >= 0 and recall_change >= 0)
     ):
         header = f"{base_header} improved 🎉"
-    elif (precision_change < 0 and recall_change <= 0) or (
-        recall_change < 0 and precision_change <= 0
+    elif (
+        (precision_change < 0 and recall_change <= 0 and files_passing_change <= 0)
+        or (recall_change < 0 and precision_change <= 0 and files_passing_change <= 0)
+        or (files_passing_change < 0 and precision_change <= 0 and recall_change <= 0)
     ):
         header = f"{base_header} regressed ❌"
     else:
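The widened conditions preserve the old rule under a third metric: "improved" needs at least one strict gain and no losses across precision, recall, and passing files; "regressed" is the mirror image; anything else falls through to a mixed header. A compact standalone equivalent, for illustration only:

```python
def classify(precision_change: float, recall_change: float, files_passing_change: int) -> str:
    """A one-look restatement of the or-chains above."""
    deltas = (precision_change, recall_change, files_passing_change)
    if any(d > 0 for d in deltas) and all(d >= 0 for d in deltas):
        return "improved 🎉"
    if any(d < 0 for d in deltas) and all(d <= 0 for d in deltas):
        return "regressed ❌"
    return "mixed"


assert classify(0.01, 0.0, 2) == "improved 🎉"
assert classify(0.01, -0.02, 0) == "mixed"  # a gain plus a loss never counts as improved
```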
@@ -945,6 +970,7 @@ def render_summary(test_cases: list[TestCase], *, force_summary_table: bool) ->
 | Total Diagnostics | {old.total_diagnostics} | {new.total_diagnostics} | {total_change:+} | {total_diff} |
 | Precision | {old.precision:.2%} | {new.precision:.2%} | {precision_change:+.2%} | {precision_diff} |
 | Recall | {old.recall:.2%} | {new.recall:.2%} | {recall_change:+.2%} | {recall_diff} |
+| Passing Files | {old_files_passing}/{total_files} | {new_files_passing}/{total_files} | {files_passing_change:+} | {passing_diff} |
 """
     )
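With invented counts, the added "Passing Files" row renders as below; the final indicator cell is stubbed, since it comes from diff_format:

```python
old_files_passing, new_files_passing, total_files = 98, 100, 143  # invented
files_passing_change = new_files_passing - old_files_passing
passing_diff = "(indicator)"  # stub for diff_format(files_passing_change, ...)
print(
    f"| Passing Files | {old_files_passing}/{total_files} "
    f"| {new_files_passing}/{total_files} "
    f"| {files_passing_change:+} | {passing_diff} |"
)
# | Passing Files | 98/143 | 100/143 | +2 | (indicator) |
```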
@@ -1060,12 +1086,18 @@ def main():
         expected=expected,
     )

+    file_stats = collect_file_stats(grouped)
     rendered = "\n\n".join(
         filter(
             None,
             [
-                render_summary(grouped, force_summary_table=args.force_summary_table),
-                render_file_stats_table(grouped),
+                render_summary(
+                    grouped,
+                    file_stats,
+                    force_summary_table=args.force_summary_table,
+                ),
+                render_file_stats_table(file_stats),
                 render_test_cases(grouped, format=args.format),
             ],
         )
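End to end, main() now computes the per-file stats once and threads the same list through both renderers. A runnable toy of that wiring, with all four conformance.py functions stubbed:

```python
# Stubs standing in for the real conformance.py functions; only the data
# flow is faithful to the diff.
def collect_file_stats(grouped):
    return ["<FileStats>"]


def render_summary(grouped, file_stats, *, force_summary_table):
    return "summary"


def render_file_stats_table(file_stats):
    return ""  # a run with no per-file changes renders nothing here


def render_test_cases(grouped, *, format):
    return "test cases"


grouped = []
file_stats = collect_file_stats(grouped)  # computed once...
rendered = "\n\n".join(
    filter(
        None,  # ...and empty sections are dropped here
        [
            render_summary(grouped, file_stats, force_summary_table=False),
            render_file_stats_table(file_stats),  # ...then reused here
            render_test_cases(grouped, format="github"),
        ],
    )
)
print(rendered)  # prints "summary" and "test cases"; the empty table section is omitted
```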