"""
|
|
Run typing conformance tests and compare results between two ty versions.
|
|
|
|
By default, this script will use `uv` to run the latest version of ty
|
|
as the new version with `uvx ty@latest`. This requires `uv` to be installed
|
|
and available in the system PATH.
|
|
|
|
If CONFORMANCE_SUITE_COMMIT is set, the hash will be used to create
|
|
links to the corresponding line in the conformance repository for each
|
|
diagnostic. Otherwise, it will default to `main'.
|
|
|
|
Examples:
|
|
# Compare an older version of ty to latest
|
|
%(prog)s --old-ty uvx ty@0.0.1a35
|
|
|
|
# Compare two specific ty versions
|
|
%(prog)s --old-ty uvx ty@0.0.1a35 --new-ty uvx ty@0.0.7
|
|
|
|
# Use local ty builds
|
|
%(prog)s --old-ty ./target/debug/ty-old --new-ty ./target/debug/ty-new
|
|
|
|
# Custom test directory
|
|
%(prog)s --target-path custom/tests --old-ty uvx ty@0.0.1a35 --new-ty uvx ty@0.0.7
|
|
|
|
# Show a diff with local paths to the test directory instead of table of links
|
|
%(prog)s --old-ty uvx ty@0.0.1a35 --new-ty uvx ty@0.0.7 --format diff
|
|
"""
|
|
|
|
from __future__ import annotations

import argparse
import json
import os
import re
import subprocess
import sys
import tomllib
from collections import defaultdict
from collections.abc import Sequence, Set as AbstractSet
from dataclasses import dataclass
from enum import StrEnum, auto
from itertools import chain, groupby
from pathlib import Path
from textwrap import dedent
from typing import Any, Literal, Self, assert_never

# The conformance tests include 4 types of errors:
# 1. Required errors (E): The type checker must raise an error on this line
# 2. Optional errors (E?): The type checker may raise an error on this line
# 3. Tagged errors (E[tag]): The type checker must raise an error on exactly
#    one of a set of lines with a matching tag
# 4. Tagged multi-errors (E[tag+]): The type checker should raise one or
#    more errors on a set of lines with a matching tag
CONFORMANCE_ERROR_PATTERN = re.compile(
    r"""
    \#\s*E                # "# E" begins each error
    (?P<optional>\?)?     # Optional '?' (E?) indicates that an error is optional
    (?:                   # An optional tag grouping errors that may appear on one of several lines
        \[
        (?P<tag>[^+\]]+)  # identifier
        (?P<multi>\+)?    # '+' indicates that an error may occur more than once on tagged lines
        \]
    )?
    (?:
        \s*:\s*(?P<description>.*)  # optional description
    )?
    """,
    re.VERBOSE,
)

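# Illustrative annotations the pattern recognizes (a sketch; the code lines
# are hypothetical, not taken from the conformance suite):
#   x: int = ""   # E: incompatible assignment
#       -> required error, description="incompatible assignment"
#   y = f()       # E?
#       -> optional error, no description
#   g()           # E[call]
#       -> tagged: exactly one of the lines tagged "call" must be flagged
#   h()           # E[call+]: may fire on several lines
#       -> tagged multi: one or more of the tagged lines may be flagged
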
CONFORMANCE_SUITE_COMMIT = os.environ.get("CONFORMANCE_SUITE_COMMIT", "main")
CONFORMANCE_DIR_WITH_README = (
    f"https://github.com/python/typing/blob/{CONFORMANCE_SUITE_COMMIT}/conformance/"
)
CONFORMANCE_URL = CONFORMANCE_DIR_WITH_README + "tests/{filename}#L{line}"

GITHUB_HEADER = [
    "<table>",
    "<tr>",
    "<th>Test case</th>",
    "<th>Diff</th>",
    "</tr>",
]
GITHUB_FOOTER = ["</table>"]
SUMMARY_NOTE = """
Each test case represents one expected error annotation or a group of annotations
sharing a tag. Counts are per test case, not per diagnostic: multiple diagnostics
on the same line count as one. Required annotations (`E`) are true positives when
ty flags the expected location and false negatives when it does not. Optional
annotations (`E?`) are true positives when flagged but true negatives (not false
negatives) when not. Tagged annotations (`E[tag]`) require ty to flag exactly one
of the tagged lines; tagged multi-annotations (`E[tag+]`) allow one or more flagged
lines, up to the tag count. Flagging unexpected locations counts as a false positive.
"""
# Priority order for section headings: improvements first, regressions last.
TITLE_PRIORITY: dict[str, int] = {
    "True positives added": 0,
    "False positives removed": 1,
    "True positives changed": 2,
    "False positives changed": 3,
    "False positives added": 4,
    "True positives removed": 5,
    "Optional Diagnostics Added": 6,
    "Optional Diagnostics Removed": 7,
    "Optional Diagnostics Changed": 8,
}


class Source(StrEnum):
    OLD = auto()
    NEW = auto()


class Classification(StrEnum):
    TRUE_POSITIVE = auto()
    FALSE_POSITIVE = auto()
    TRUE_NEGATIVE = auto()
    FALSE_NEGATIVE = auto()

    def into_title(self, *, verb: Literal["added", "removed", "changed"]) -> str:
        match self:
            case Classification.TRUE_POSITIVE:
                return f"True positives {verb}"
            case Classification.FALSE_POSITIVE:
                return f"False positives {verb}"
            case Classification.TRUE_NEGATIVE:
                return f"True negatives {verb}"
            case Classification.FALSE_NEGATIVE:
                return f"False negatives {verb}"


class Change(StrEnum):
    ADDED = auto()
    REMOVED = auto()
    CHANGED = auto()
    UNCHANGED = auto()

    def into_title(self) -> str:
        match self:
            case Change.ADDED:
                return "Optional Diagnostics Added"
            case Change.REMOVED:
                return "Optional Diagnostics Removed"
            case Change.CHANGED:
                return "Optional Diagnostics Changed"
            case Change.UNCHANGED:
                return "Optional Diagnostics Unchanged"


@dataclass(kw_only=True, slots=True)
class Position:
    line: int
    column: int


@dataclass(kw_only=True, slots=True)
class Positions:
    begin: Position
    end: Position


@dataclass(kw_only=True, slots=True)
class Location:
    path: Path
    positions: Positions

    def as_link(self) -> str:
        file = self.path.name
        link = CONFORMANCE_URL.format(filename=file, line=self.positions.begin.line)
        return f"[{file}:{self.positions.begin.line}:{self.positions.begin.column}]({link})"

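# For example, with the default `main` commit, a diagnostic at line 42, column 5
# of a (hypothetical) test file `aliases_basic.py` renders roughly as:
#   [aliases_basic.py:42:5](https://github.com/python/typing/blob/main/conformance/tests/aliases_basic.py#L42)

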
@dataclass(kw_only=True, slots=True)
class TyDiagnostic:
    """A diagnostic emitted by a ty version (old or new) during a conformance run."""

    check_name: str
    description: str
    severity: str
    location: Location
    source: Source

    def __post_init__(self) -> None:
        # Remove the check name prefix from the description
        self.description = self.description.replace(f"{self.check_name}: ", "")

    def __str__(self) -> str:
        return (
            f"{self.location.path}:{self.location.positions.begin.line}:"
            f"{self.location.positions.begin.column}: "
            f"{self.severity_for_display}[{self.check_name}] {self.description}"
        )

    @classmethod
    def from_gitlab_output(
        cls,
        dct: dict[str, Any],
        source: Source,
    ) -> Self:
        return cls(
            check_name=dct["check_name"],
            description=dct["description"],
            severity=dct["severity"],
            location=Location(
                path=Path(dct["location"]["path"]).resolve(),
                positions=Positions(
                    begin=Position(
                        line=dct["location"]["positions"]["begin"]["line"],
                        column=dct["location"]["positions"]["begin"]["column"],
                    ),
                    end=Position(
                        line=dct["location"]["positions"]["end"]["line"],
                        column=dct["location"]["positions"]["end"]["column"],
                    ),
                ),
            ),
            source=source,
        )

    @property
    def severity_for_display(self) -> str:
        return {
            "major": "error",
            "minor": "warning",
        }.get(self.severity, "unknown")

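# A sketch of one record in ty's `--output-format=gitlab` JSON as consumed by
# `from_gitlab_output` (field values are hypothetical; only the keys read
# above are shown):
#   {
#       "check_name": "invalid-assignment",
#       "description": "invalid-assignment: Object of type `str` is not assignable ...",
#       "severity": "major",
#       "location": {
#           "path": "tests/aliases_basic.py",
#           "positions": {
#               "begin": {"line": 10, "column": 1},
#               "end": {"line": 10, "column": 5},
#           },
#       },
#   }

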
@dataclass(kw_only=True, slots=True)
class ExpectedError:
    """An error annotation parsed from a conformance test file (e.g. ``# E: ...``)."""

    description: str
    location: Location
    optional: bool
    # tag identifying an error that can occur on multiple lines
    tag: str | None
    # True if one or more errors can occur on lines with the same tag
    multi: bool

    @property
    def key(self) -> str:
        """Key to group expected errors by path and beginning line, or path and tag."""
        return (
            f"{self.location.path.as_posix()}:{self.location.positions.begin.line}"
            if self.tag is None
            else f"{self.location.path.as_posix()}:{self.tag}"
        )

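# A sketch of the grouping keys (paths hypothetical). Note that tags are stored
# as "<filename>:<tag>" by `collect_expected_diagnostics`, so a tagged key
# repeats the file name:
#   untagged annotation at tests/foo.py line 10 -> "tests/foo.py:10"
#   annotation tagged "call" in tests/foo.py    -> "tests/foo.py:foo.py:call"

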
def diagnostics_are_equivalent(a: list[TyDiagnostic], b: list[TyDiagnostic]) -> bool:
    """Compare two diagnostic lists for equality, ignoring the ``source`` field."""

    def fingerprint(d: TyDiagnostic) -> tuple:
        return (
            d.check_name,
            d.description,
            d.severity,
            str(d.location.path),
            d.location.positions.begin.line,
            d.location.positions.begin.column,
        )

    return sorted(map(fingerprint, a)) == sorted(map(fingerprint, b))


@dataclass(kw_only=True, slots=True)
class TestCase:
    key: str
    old: list[TyDiagnostic]
    new: list[TyDiagnostic]
    expected: list[ExpectedError]

    @property
    def change(self) -> Change:
        if self.new and not self.old:
            return Change.ADDED
        elif self.old and not self.new:
            return Change.REMOVED
        elif (
            self.old and self.new and not diagnostics_are_equivalent(self.old, self.new)
        ):
            return Change.CHANGED
        else:
            return Change.UNCHANGED

    @property
    def optional(self) -> bool:
        return bool(self.expected) and all(e.optional for e in self.expected)

    @property
    def multi(self) -> bool:
        return bool(self.expected) and all(e.multi for e in self.expected)

    @property
    def path(self) -> Path:
        """Return the source file path for this test case."""
        for diags in (self.new, self.old, self.expected):
            if diags:
                return diags[0].location.path
        raise ValueError(f"No diagnostics in test case {self.key}")

    def diagnostics_by_source(self, source: Source) -> list[TyDiagnostic]:
        return self.old if source == Source.OLD else self.new

    def classify(self, source: Source) -> Classification:
        diagnostics = self.diagnostics_by_source(source)

        if diagnostics:
            if self.optional:
                return Classification.TRUE_POSITIVE
            if self.expected:
                distinct_lines = len(
                    {d.location.positions.begin.line for d in diagnostics}
                )
                expected_max = len(self.expected) if self.multi else 1
                if 1 <= distinct_lines <= expected_max:
                    return Classification.TRUE_POSITIVE
                else:
                    return Classification.FALSE_POSITIVE
            else:
                return Classification.FALSE_POSITIVE

        elif self.expected:
            if self.optional:
                return Classification.TRUE_NEGATIVE
            return Classification.FALSE_NEGATIVE

        else:
            return Classification.TRUE_NEGATIVE

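# A worked example for `classify` (counts hypothetical): a non-multi `E[tag]`
# group spanning three lines has expected_max == 1, so exactly one distinct
# flagged line is a TRUE_POSITIVE and two or more are a FALSE_POSITIVE; the
# same three lines under `E[tag+]` accept one, two, or three flagged lines.

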
def render_html_diff_row(tc: TestCase, *, source: Source | None) -> list[str]:
    """Render a single HTML <tr> with a test-case link and a markdown diff block."""
    all_diags = tc.old + tc.new if source is None else tc.diagnostics_by_source(source)

    if all_diags:
        min_line = min(d.location.positions.begin.line for d in all_diags)
        max_line = max(d.location.positions.begin.line for d in all_diags)
        filename = all_diags[0].location.path.name
        if min_line == max_line:
            url = CONFORMANCE_URL.format(filename=filename, line=min_line)
            display = f"{filename}:{min_line}"
        else:
            url = (
                f"{CONFORMANCE_DIR_WITH_README}tests/{filename}#L{min_line}-L{max_line}"
            )
            display = f"{filename}:{min_line}:{max_line}"
        location = f"[{display}]({url})"
    else:
        location = tc.key

    diff_lines = []
    if source is None:
        for d in tc.old:
            diff_lines.append(
                f"-{d.severity_for_display}[{d.check_name}] {d.description}"
            )
        for d in tc.new:
            diff_lines.append(
                f"+{d.severity_for_display}[{d.check_name}] {d.description}"
            )
    else:
        sign = "-" if source == Source.OLD else "+"
        for d in tc.diagnostics_by_source(source):
            diff_lines.append(
                f"{sign}{d.severity_for_display}[{d.check_name}] {d.description}"
            )

    # The empty strings force blank lines around the fenced block so GitHub
    # renders the markdown diff inside the raw-HTML table cell.
    return [
        "",
        "<tr>",
        "<td>",
        "",
        location,
        "",
        "</td>",
        "",
        "<td>",
        "",
        "```diff",
        *diff_lines,
        "```",
        "",
        "</td>",
        "</tr>",
    ]


def render_diff_row(diagnostics: list[TyDiagnostic], *, removed: bool = False) -> str:
    sign = "-" if removed else "+"
    return "\n".join(f"{sign} {d}" for d in diagnostics)

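# For example, with removed=True one row renders roughly as (path and message
# hypothetical):
#   - /abs/tests/foo.py:10:3: error[invalid-assignment] Object of type `str` ...

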
@dataclass(kw_only=True, slots=True)
class Statistics:
    true_positives: int = 0
    false_positives: int = 0
    false_negatives: int = 0
    total_diagnostics: int = 0

    @property
    def precision(self) -> float:
        if self.true_positives + self.false_positives > 0:
            return self.true_positives / (self.true_positives + self.false_positives)
        return 0.0

    @property
    def recall(self) -> float:
        if self.true_positives + self.false_negatives > 0:
            return self.true_positives / (self.true_positives + self.false_negatives)
        return 0.0

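# Worked example (hypothetical counts): with TP=90, FP=10, FN=30,
#   precision = 90 / (90 + 10) = 0.90
#   recall    = 90 / (90 + 30) = 0.75

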
@dataclass(kw_only=True, slots=True)
class DiagnosticEntry:
    """A test case bound to the section title and source side it should be rendered under."""

    title: str
    test_case: TestCase
    # None means show both old (removed) and new (added) in a single "changed" section.
    source: Source | None


@dataclass(kw_only=True, slots=True)
class FileStats:
    path: Path
    old: Statistics
    new: Statistics

    @property
    def old_passes(self) -> bool:
        return self.old.false_positives == 0 and self.old.false_negatives == 0

    @property
    def new_passes(self) -> bool:
        return self.new.false_positives == 0 and self.new.false_negatives == 0

    @property
    def total_change(self) -> int:
        return (
            abs(self.new.true_positives - self.old.true_positives)
            + abs(self.new.false_positives - self.old.false_positives)
            + abs(self.new.false_negatives - self.old.false_negatives)
        )


def collect_expected_diagnostics(test_files: Sequence[Path]) -> list[ExpectedError]:
    errors: list[ExpectedError] = []
    for file in test_files:
        for idx, line in enumerate(file.read_text().splitlines(), 1):
            if match := re.search(CONFORMANCE_ERROR_PATTERN, line):
                errors.append(
                    ExpectedError(
                        description=(match.group("description") or "Missing"),
                        location=Location(
                            path=file,
                            positions=Positions(
                                begin=Position(
                                    line=idx,
                                    column=match.start(),
                                ),
                                end=Position(
                                    line=idx,
                                    column=match.end(),
                                ),
                            ),
                        ),
                        optional=match.group("optional") is not None,
                        tag=(
                            f"{file.name}:{match.group('tag')}"
                            if match.group("tag")
                            else None
                        ),
                        multi=match.group("multi") is not None,
                    )
                )

    assert errors, "Failed to discover any expected diagnostics!"
    return errors


def collect_ty_diagnostics(
    ty_path: list[str],
    source: Source,
    test_files: Sequence[Path],
    python_version: str = "3.12",
    extra_search_paths: Sequence[Path] = (),
) -> list[TyDiagnostic]:
    extra_search_path_args = [
        f"--extra-search-path={path}" for path in extra_search_paths
    ]
    process = subprocess.run(
        [
            *ty_path,
            "check",
            f"--python-version={python_version}",
            "--output-format=gitlab",
            "--ignore=assert-type-unspellable-subtype",
            "--error=invalid-enum-member-annotation",
            "--error=invalid-legacy-positional-parameter",
            "--error=deprecated",
            "--error=redundant-final-classvar",
            "--exit-zero",
            *extra_search_path_args,
            *map(str, test_files),
        ],
        capture_output=True,
        text=True,
        check=True,
        timeout=15,
    )

    return [
        TyDiagnostic.from_gitlab_output(dct, source=source)
        for dct in json.loads(process.stdout)
        if dct["severity"] == "major"
    ]

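# An illustrative call (paths hypothetical) collecting diagnostics from a
# locally built ty binary:
#   collect_ty_diagnostics(
#       ty_path=["./target/debug/ty"],
#       source=Source.OLD,
#       test_files=[Path("typing/conformance/tests/aliases_basic.py")],
#   )

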
def group_diagnostics_by_key(
    old: list[TyDiagnostic],
    new: list[TyDiagnostic],
    expected: list[ExpectedError],
) -> list[TestCase]:
    # Build a lookup from (filename, line) to tag so ty diagnostics on a tagged
    # line can be grouped with all other expected errors sharing that tag.
    tagged_lines: dict[tuple[str, int], str] = {
        (e.location.path.name, e.location.positions.begin.line): e.tag
        for e in expected
        if e.tag is not None
    }

    def ty_key(diag: TyDiagnostic) -> str:
        tag = tagged_lines.get(
            (diag.location.path.name, diag.location.positions.begin.line)
        )
        return (
            f"{diag.location.path.as_posix()}:{tag}"
            if tag is not None
            else f"{diag.location.path.as_posix()}:{diag.location.positions.begin.line}"
        )

    old_by_key: defaultdict[str, list[TyDiagnostic]] = defaultdict(list)
    new_by_key: defaultdict[str, list[TyDiagnostic]] = defaultdict(list)
    expected_by_key: defaultdict[str, list[ExpectedError]] = defaultdict(list)

    for diag in old:
        old_by_key[ty_key(diag)].append(diag)
    for diag in new:
        new_by_key[ty_key(diag)].append(diag)
    for err in expected:
        expected_by_key[err.key].append(err)

    all_keys = sorted(old_by_key.keys() | new_by_key.keys() | expected_by_key.keys())
    return [
        TestCase(
            key=key,
            old=old_by_key[key],
            new=new_by_key[key],
            expected=expected_by_key[key],
        )
        for key in all_keys
    ]


def compute_stats(test_cases: list[TestCase], source: Source) -> Statistics:
    stats = Statistics()
    for tc in test_cases:
        match tc.classify(source):
            case Classification.TRUE_POSITIVE:
                stats.true_positives += 1
            case Classification.FALSE_POSITIVE:
                stats.false_positives += 1
            case Classification.FALSE_NEGATIVE:
                stats.false_negatives += 1
            case Classification.TRUE_NEGATIVE:
                pass
        stats.total_diagnostics += len(tc.diagnostics_by_source(source))
    return stats


def collect_diagnostic_entries(test_cases: list[TestCase]) -> list[DiagnosticEntry]:
    """Classify each changed test case and assign it to a titled section."""
    entries: list[DiagnosticEntry] = []

    for tc in test_cases:
        change = tc.change
        if change == Change.UNCHANGED:
            continue

        if tc.optional:
            if change == Change.ADDED:
                entries.append(
                    DiagnosticEntry(
                        title=Change.ADDED.into_title(), test_case=tc, source=Source.NEW
                    )
                )
            elif change == Change.REMOVED:
                entries.append(
                    DiagnosticEntry(
                        title=Change.REMOVED.into_title(),
                        test_case=tc,
                        source=Source.OLD,
                    )
                )
            elif change == Change.CHANGED:
                entries.append(
                    DiagnosticEntry(
                        title=Change.CHANGED.into_title(), test_case=tc, source=None
                    )
                )
        else:
            if change == Change.ADDED:
                new_class = tc.classify(Source.NEW)
                entries.append(
                    DiagnosticEntry(
                        title=new_class.into_title(verb="added"),
                        test_case=tc,
                        source=Source.NEW,
                    )
                )
            elif change == Change.REMOVED:
                old_class = tc.classify(Source.OLD)
                entries.append(
                    DiagnosticEntry(
                        title=old_class.into_title(verb="removed"),
                        test_case=tc,
                        source=Source.OLD,
                    )
                )
            elif change == Change.CHANGED:
                old_class = tc.classify(Source.OLD)
                new_class = tc.classify(Source.NEW)
                if old_class == new_class:
                    # Same classification but different diagnostics: one "changed" section.
                    entries.append(
                        DiagnosticEntry(
                            title=new_class.into_title(verb="changed"),
                            test_case=tc,
                            source=None,
                        )
                    )
                else:
                    # Classification changed: split into separate removed/added sections.
                    entries.append(
                        DiagnosticEntry(
                            title=old_class.into_title(verb="removed"),
                            test_case=tc,
                            source=Source.OLD,
                        )
                    )
                    entries.append(
                        DiagnosticEntry(
                            title=new_class.into_title(verb="added"),
                            test_case=tc,
                            source=Source.NEW,
                        )
                    )

    entries.sort(
        key=lambda e: (TITLE_PRIORITY.get(e.title, 99), e.title, e.test_case.key)
    )
    return entries


def render_test_cases(
    test_cases: list[TestCase],
    *,
    format: Literal["diff", "github"] = "diff",
) -> str:
    entries = collect_diagnostic_entries(test_cases)
    if not entries:
        return ""

    lines = []
    for title, group in groupby(entries, key=lambda e: e.title):
        group_list = list(group)
        n = len(group_list)

        lines.append(f"### {title} ({n})")
        lines.extend(
            [
                "",
                "<details>",
                f"<summary>{n} {'diagnostic' if n == 1 else 'diagnostics'}</summary>",
                "",
            ]
        )

        if format == "diff":
            lines.append("```diff")
        else:
            lines.extend(GITHUB_HEADER)

        for entry in group_list:
            tc, source = entry.test_case, entry.source
            if source is None:
                if format == "diff":
                    lines.append(render_diff_row(tc.old, removed=True))
                    lines.append(render_diff_row(tc.new, removed=False))
                else:
                    lines.extend(render_html_diff_row(tc, source=None))
            else:
                if format == "diff":
                    lines.append(
                        render_diff_row(
                            tc.diagnostics_by_source(source),
                            removed=source == Source.OLD,
                        )
                    )
                else:
                    lines.extend(render_html_diff_row(tc, source=source))

        if format == "diff":
            lines.append("```")
        else:
            lines.extend(GITHUB_FOOTER)
        lines.extend(["</details>", ""])

    return "\n".join(lines)


def collect_file_stats(test_cases: list[TestCase]) -> list[FileStats]:
    """Compute per-file statistics from grouped test cases."""
    path_to_cases: dict[Path, list[TestCase]] = {}
    for tc in test_cases:
        path_to_cases.setdefault(tc.path, []).append(tc)
    return [
        FileStats(
            path=path,
            old=compute_stats(cases, Source.OLD),
            new=compute_stats(cases, Source.NEW),
        )
        for path, cases in path_to_cases.items()
    ]


def render_file_stats_table(file_stats: list[FileStats]) -> str:
    """Render a per-file breakdown showing only files whose TP/FP/FN counts changed."""

    def fmt(old: int, new: int, *, greater_is_better: bool = True) -> str:
        if old == new:
            return str(new)
        diff = new - old
        improved = diff > 0 if greater_is_better else diff < 0
        indicator = " ✅" if improved else " ❌"
        return f"{new} ({diff:+}){indicator}"

    # Collect totals across all files regardless of change.
    old_totals = Statistics()
    new_totals = Statistics()
    passing = 0
    total_files = len(file_stats)

    for fs in file_stats:
        old_totals.true_positives += fs.old.true_positives
        old_totals.false_positives += fs.old.false_positives
        old_totals.false_negatives += fs.old.false_negatives
        new_totals.true_positives += fs.new.true_positives
        new_totals.false_positives += fs.new.false_positives
        new_totals.false_negatives += fs.new.false_negatives
        passing += fs.new_passes

    changed = [fs for fs in file_stats if fs.total_change > 0]
    if not changed:
        return ""

    changed.sort(key=lambda fs: (-fs.total_change, fs.path.name))

    rows = []
    for fs in changed:
        if fs.new_passes and not fs.old_passes:
            status = "✅ Newly Passing 🎉"
        elif fs.old_passes and not fs.new_passes:
            status = "❌ Newly Failing ☹️"
        elif fs.new_passes:
            status = "✅ Still Passing"
        else:
            old_errors = fs.old.false_positives + fs.old.false_negatives
            new_errors = fs.new.false_positives + fs.new.false_negatives
            if new_errors < old_errors:
                status = "📈 Improving"
            elif new_errors > old_errors:
                status = "📉 Regressing"
            else:
                status = "➡️ Neutral"
        url = CONFORMANCE_DIR_WITH_README + f"tests/{fs.path.name}"
        rows.append(
            f"| [{fs.path.name}]({url})"
            f" | {fmt(fs.old.true_positives, fs.new.true_positives, greater_is_better=True)}"
            f" | {fmt(fs.old.false_positives, fs.new.false_positives, greater_is_better=False)}"
            f" | {fmt(fs.old.false_negatives, fs.new.false_negatives, greater_is_better=False)}"
            f" | {status} |"
        )

    totals_row = (
        f"| **Total (all files)**"
        f" | **{fmt(old_totals.true_positives, new_totals.true_positives, greater_is_better=True)}**"
        f" | **{fmt(old_totals.false_positives, new_totals.false_positives, greater_is_better=False)}**"
        f" | **{fmt(old_totals.false_negatives, new_totals.false_negatives, greater_is_better=False)}**"
        f" | {passing}/{total_files} |"
    )

    lines = [
        "### Test file breakdown",
        "",
        "<details>",
        f"<summary>{len(changed)} file{'s' if len(changed) != 1 else ''} altered</summary>",
        "",
        "| File | True Positives | False Positives | False Negatives | Status |",
        "|------|----|----|----|--------|",
        *rows,
        totals_row,
        "",
        "</details>",
        "",
    ]
    return "\n".join(lines)


def diff_format(
    diff: float,
    *,
    greater_is_better: bool = True,
    neutral: bool = False,
) -> str:
    if diff == 0:
        return ""

    increased = diff > 0
    good = " (✅)" if not neutral else ""
    bad = " (❌)" if not neutral else ""
    up = "⏫"
    down = "⏬"

    match (greater_is_better, increased):
        case (True, True):
            return f"{up}{good}"
        case (False, True):
            return f"{up}{bad}"
        case (True, False):
            return f"{down}{bad}"
        case (False, False):
            return f"{down}{good}"
        case _:
            # The ty false positive seems to be due to insufficient type narrowing for tuples;
            # possibly related to https://github.com/astral-sh/ty/issues/493 and/or
            # https://github.com/astral-sh/ty/issues/887
            assert_never((greater_is_better, increased))  # ty: ignore[type-assertion-failure]

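# For example: diff_format(3.0) -> "⏫ (✅)"; diff_format(-2, greater_is_better=False)
# -> "⏬ (✅)"; diff_format(5, neutral=True) -> "⏫"; diff_format(0) -> "".

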
def render_summary(
    test_cases: list[TestCase],
    file_stats: list[FileStats],
    *,
    force_summary_table: bool,
) -> str:
    def format_metric(diff: float, old: float, new: float) -> str:
        if diff > 0:
            return f"increased from <b>{old:.2%}</b> to <b>{new:.2%}</b>"
        if diff < 0:
            return f"decreased from <b>{old:.2%}</b> to <b>{new:.2%}</b>"
        return f"held steady at <b>{old:.2%}</b>"

    def format_int_metric(diff: int, old: int, new: int, total: int) -> str:
        if diff > 0:
            return f"improved from <b>{old}/{total}</b> to <b>{new}/{total}</b>"
        if diff < 0:
            return f"regressed from <b>{old}/{total}</b> to <b>{new}/{total}</b>"
        return f"held steady at <b>{old}/{total}</b>"

    old = compute_stats(test_cases, Source.OLD)
    new = compute_stats(test_cases, Source.NEW)

    old_files_passing = sum(fs.old_passes for fs in file_stats)
    new_files_passing = sum(fs.new_passes for fs in file_stats)
    total_files = len(file_stats)
    files_passing_change = new_files_passing - old_files_passing

    assert new.true_positives > 0, (
        "Expected ty to have at least one true positive.\n"
        f"Sample of grouped diagnostics: {test_cases[:5]}"
    )

    precision_change = new.precision - old.precision
    recall_change = new.recall - old.recall
    true_pos_change = new.true_positives - old.true_positives
    false_pos_change = new.false_positives - old.false_positives
    false_neg_change = new.false_negatives - old.false_negatives
    total_change = new.total_diagnostics - old.total_diagnostics

    summary_paragraph = (
        f"The percentage of diagnostics emitted that were expected errors "
        f"{format_metric(precision_change, old.precision, new.precision)}. "
        f"The percentage of expected errors that received a diagnostic "
        f"{format_metric(recall_change, old.recall, new.recall)}. "
        f"The number of fully passing files "
        f"{format_int_metric(files_passing_change, old_files_passing, new_files_passing, total_files)}."
    )

    base_header = f"[Typing conformance results]({CONFORMANCE_DIR_WITH_README})"

    if not force_summary_table and all(
        tc.change is Change.UNCHANGED for tc in test_cases
    ):
        return dedent(
            f"""
            ## {base_header}

            ### No changes detected ✅

            <details>
            <summary>Current numbers</summary>

            <br>
            {summary_paragraph}

            </details>
            """
        )

    true_pos_diff = diff_format(true_pos_change, greater_is_better=True)
    false_pos_diff = diff_format(false_pos_change, greater_is_better=False)
    false_neg_diff = diff_format(false_neg_change, greater_is_better=False)
    precision_diff = diff_format(precision_change, greater_is_better=True)
    recall_diff = diff_format(recall_change, greater_is_better=True)
    total_diff = diff_format(total_change, neutral=True)
    passing_diff = diff_format(files_passing_change, greater_is_better=True)

    if (
        (precision_change > 0 and recall_change >= 0 and files_passing_change >= 0)
        or (recall_change > 0 and precision_change >= 0 and files_passing_change >= 0)
        or (files_passing_change > 0 and precision_change >= 0 and recall_change >= 0)
    ):
        header = f"{base_header} improved 🎉"
    elif (
        (precision_change < 0 and recall_change <= 0 and files_passing_change <= 0)
        or (recall_change < 0 and precision_change <= 0 and files_passing_change <= 0)
        or (files_passing_change < 0 and precision_change <= 0 and recall_change <= 0)
    ):
        header = f"{base_header} regressed ❌"
    else:
        header = base_header

    summary_note = " ".join(SUMMARY_NOTE.split())

    return dedent(
        f"""
        ## {header}

        {summary_paragraph}

        ### Summary

        <details>
        <summary>How are test cases classified?</summary>

        <br>

        {summary_note}

        </details>

        | Metric | Old | New | Diff | Outcome |
        |--------|-----|-----|------|---------|
        | True Positives | {old.true_positives} | {new.true_positives} | {true_pos_change:+} | {true_pos_diff} |
        | False Positives | {old.false_positives} | {new.false_positives} | {false_pos_change:+} | {false_pos_diff} |
        | False Negatives | {old.false_negatives} | {new.false_negatives} | {false_neg_change:+} | {false_neg_diff} |
        | Total Diagnostics | {old.total_diagnostics} | {new.total_diagnostics} | {total_change:+} | {total_diff} |
        | Precision | {old.precision:.2%} | {new.precision:.2%} | {precision_change:+.2%} | {precision_diff} |
        | Recall | {old.recall:.2%} | {new.recall:.2%} | {recall_change:+.2%} | {recall_diff} |
        | Passing Files | {old_files_passing}/{total_files} | {new_files_passing}/{total_files} | {files_passing_change:+} | {passing_diff} |

        """
    )


def get_test_groups(root_dir: Path) -> AbstractSet[str]:
    """Adapted from typing/conformance/test_groups.py."""
    # Read the TOML file that defines the test groups. Each test
    # group has a name that associated test cases must start with.
    test_group_file = root_dir / "src" / "test_groups.toml"
    with open(test_group_file, "rb") as f:
        return tomllib.load(f).keys()


def get_test_cases(
    test_group_names: AbstractSet[str], tests_dir: Path
) -> Sequence[Path]:
    """Adapted from typing/conformance/test_groups.py."""
    # Filter test cases based on test group names. Files that do
    # not begin with a known test group name are assumed to be
    # files that support one or more tests.
    return [
        p
        for p in chain(tests_dir.glob("*.py"), tests_dir.glob("*.pyi"))
        if p.name.split("_")[0] in test_group_names
    ]

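# For example, with a group named "aliases" in test_groups.toml (name
# hypothetical), "aliases_basic.py" would be collected, while a support file
# like "_helpers.py" would be skipped.

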
def parse_args():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument(
        "--old-ty",
        nargs="+",
        help="Command to run the old version of ty",
        required=True,
    )

    parser.add_argument(
        "--new-ty",
        nargs="+",
        default=["uvx", "ty@latest"],
        help="Command to run the new version of ty (default: uvx ty@latest)",
    )

    parser.add_argument(
        "--tests-path",
        type=Path,
        default=Path("typing/conformance"),
        help="Path to the conformance tests directory (default: typing/conformance)",
    )

    parser.add_argument(
        "--python-version",
        type=str,
        default="3.12",
        help="Python version to assume when running ty (default: 3.12)",
    )

    parser.add_argument(
        "--format",
        type=str,
        choices=["diff", "github"],
        default="github",
        help="Output format for the report (default: github)",
    )

    parser.add_argument(
        "--output",
        type=Path,
        help="Write output to a file instead of stdout",
    )

    parser.add_argument(
        "--force-summary-table",
        action="store_true",
        help="Always print the summary table, even if no changes were detected",
    )

    return parser.parse_args()


def main():
    args = parse_args()
    tests_dir = args.tests_path.resolve().absolute()
    test_groups = get_test_groups(tests_dir)
    test_files = get_test_cases(test_groups, tests_dir / "tests")

    expected = collect_expected_diagnostics(test_files)

    extra_search_paths = [tests_dir / "tests"]

    old = collect_ty_diagnostics(
        ty_path=args.old_ty,
        test_files=test_files,
        source=Source.OLD,
        python_version=args.python_version,
        extra_search_paths=extra_search_paths,
    )

    new = collect_ty_diagnostics(
        ty_path=args.new_ty,
        test_files=test_files,
        source=Source.NEW,
        python_version=args.python_version,
        extra_search_paths=extra_search_paths,
    )

    grouped = group_diagnostics_by_key(
        old=old,
        new=new,
        expected=expected,
    )

    file_stats = collect_file_stats(grouped)

    rendered = "\n\n".join(
        filter(
            None,
            [
                render_summary(
                    grouped,
                    file_stats,
                    force_summary_table=args.force_summary_table,
                ),
                render_file_stats_table(file_stats),
                render_test_cases(grouped, format=args.format),
            ],
        )
    )

    if args.output:
        args.output.write_text(rendered, encoding="utf-8")
        print(f"Output written to {args.output}", file=sys.stderr)
        print(rendered, file=sys.stderr)
    else:
        print(rendered)


if __name__ == "__main__":
    main()