#!/usr/bin/env python3
"""Build a single-page HTML site from README.md for the awesome-python website."""

import json
import re
import shutil
import xml.etree.ElementTree as ET
from collections.abc import Sequence
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

from jinja2 import Environment, FileSystemLoader
from readme_parser import ParsedGroup, ParsedSection, parse_readme, parse_sponsors, slugify

GITHUB_REPO_URL_RE = re.compile(r"^https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$")
MARKDOWN_LINK_RE = re.compile(r"\[[^\]]+\]\(([^)\s]+)\)")
BULLET_LINE_RE = re.compile(r"^\s*-\s")
SITE_URL = "https://awesome-python.com/"
SITEMAP_URL = f"{SITE_URL}sitemap.xml"
SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"

BUILTIN_FILTER = "Built-in"
BUILTIN_SLUG = "built-in"
BUILTIN_PATH = f"/categories/{BUILTIN_SLUG}/"
BUILTIN_PUBLIC_URL = f"{SITE_URL}categories/{BUILTIN_SLUG}/"

SOURCE_TYPE_DOMAINS = {
    "docs.python.org": "Built-in",
    "gitlab.com": "GitLab",
    "bitbucket.org": "Bitbucket",
}


def detect_source_type(url: str) -> str | None:
    """Detect source type from URL domain. Returns None for GitHub URLs."""
    if GITHUB_REPO_URL_RE.match(url):
        return None
    for domain, source_type in SOURCE_TYPE_DOMAINS.items():
        if domain in url:
            return source_type
    if "github.com" not in url:
        return "External"
    return None
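
# Illustrative examples (assumed inputs; not exercised by the build):
#   detect_source_type("https://docs.python.org/3/library/json.html")  -> "Built-in"
#   detect_source_type("https://palletsprojects.com/p/flask/")         -> "External"
#   detect_source_type("https://github.com/psf/requests")              -> None
# Non-repo github.com URLs (e.g. https://github.com/features) also fall
# through to the final return None.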


def extract_github_repo(url: str) -> str | None:
    """Extract owner/repo from a GitHub repo URL. Returns None for non-GitHub URLs."""
    m = GITHUB_REPO_URL_RE.match(url)
    return m.group(1) if m else None
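
# Illustrative: a trailing ".git" or "/" is absorbed by the regex, e.g.
#   extract_github_repo("https://github.com/vinta/awesome-python.git") -> "vinta/awesome-python"
#   extract_github_repo("https://gitlab.com/owner/project")            -> None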


def load_stars(path: Path) -> dict[str, dict]:
    """Load star data from JSON. Returns empty dict if file doesn't exist or is corrupt."""
    if path.exists():
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            return {}
    return {}


def sort_entries(entries: list[dict]) -> list[dict]:
    """Sort entries by stars descending, then name ascending.

    Three tiers: starred entries first, stdlib second, other non-starred last.
    """

    def sort_key(entry: dict) -> tuple[int, int, int, str]:
        stars = entry["stars"]
        name = entry["name"].lower()
        if stars is not None:
            builtin = 1 if entry.get("source_type") == "Built-in" else 0
            return (0, -stars, builtin, name)
        if entry.get("source_type") == "Built-in":
            return (1, 0, 0, name)
        return (2, 0, 0, name)

    return sorted(entries, key=sort_key)
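
# Illustrative ordering (assumed minimal dicts):
#   [{"name": "b", "stars": 50, "source_type": None},
#    {"name": "a", "stars": None, "source_type": "Built-in"},
#    {"name": "c", "stars": None, "source_type": None}]
# sorts to b, a, c: the starred tier first (higher stars win, Built-in demoted
# on star ties), then unstarred stdlib, then everything else, each by name.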


def build_robots_txt() -> str:
    return (
        "User-agent: *\n"
        "Content-Signal: search=yes, ai-input=yes, ai-train=yes\n"
        "Allow: /\n"
        "\n"
        f"Sitemap: {SITEMAP_URL}\n"
    )


def category_path(category: ParsedSection) -> str:
    return f"/categories/{category['slug']}/"


def category_public_url(category: ParsedSection) -> str:
    return f"{SITE_URL}categories/{category['slug']}/"


def group_path(group_slug: str) -> str:
    return f"/categories/{group_slug}/"


def group_public_url(group_slug: str) -> str:
    return f"{SITE_URL}categories/{group_slug}/"


def subcategory_path(category_slug: str, subcategory_slug: str) -> str:
    return f"/categories/{category_slug}/{subcategory_slug}/"


def subcategory_public_url(category_slug: str, subcategory_slug: str) -> str:
    return f"{SITE_URL}categories/{category_slug}/{subcategory_slug}/"


def write_sitemap_xml(path: Path, urls: Sequence[tuple[str, str]]) -> None:
    ET.register_namespace("", SITEMAP_NS)
    urlset = ET.Element(f"{{{SITEMAP_NS}}}urlset")
    for url, lastmod in urls:
        url_el = ET.SubElement(urlset, f"{{{SITEMAP_NS}}}url")
        loc_el = ET.SubElement(url_el, f"{{{SITEMAP_NS}}}loc")
        loc_el.text = url
        lastmod_el = ET.SubElement(url_el, f"{{{SITEMAP_NS}}}lastmod")
        lastmod_el.text = lastmod

    ET.ElementTree(urlset).write(path, encoding="utf-8", xml_declaration=True)
    with path.open("ab") as f:
        f.write(b"\n")
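
# Illustrative output for urls=[("https://awesome-python.com/", "2025-01-01")]
# (the date is assumed), modulo ElementTree's unindented one-line layout:
#   <?xml version='1.0' encoding='utf-8'?>
#   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
#     <url><loc>https://awesome-python.com/</loc><lastmod>2025-01-01</lastmod></url>
#   </urlset>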


def top_level_heading_text(line: str) -> str | None:
    stripped = line.strip()
    if not stripped.startswith("# "):
        return None
    return stripped.removeprefix("#").strip().strip("#").strip().strip("*").strip()
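
# Illustrative: only H1 lines qualify, and closing hashes/bold markers are shed:
#   top_level_heading_text("# **Awesome Python** #") -> "Awesome Python"
#   top_level_heading_text("## Web Frameworks")      -> None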


LLMS_CATEGORIES_PLACEHOLDER = "{{ categories_md }}"


def extract_categories_body(markdown: str) -> str:
    """Return content under the `# Categories` heading, excluding the heading line itself."""
    lines = markdown.splitlines(keepends=True)
    start_idx = None
    end_idx = len(lines)
    for i, line in enumerate(lines):
        heading = top_level_heading_text(line)
        if heading is None:
            continue
        if start_idx is None and heading.lower() == "categories":
            start_idx = i + 1
            while start_idx < len(lines) and lines[start_idx].strip() == "":
                start_idx += 1
        elif start_idx is not None and i >= start_idx:
            end_idx = i
            break
    if start_idx is None:
        return ""
    return "".join(lines[start_idx:end_idx]).rstrip() + "\n"


def build_llms_txt(template_text: str, readme_text: str, stars_data: dict[str, dict]) -> str:
    """Render the llms.txt template by injecting the README's Categories body, then annotate stars."""
    body = extract_categories_body(readme_text).rstrip()
    rendered = template_text.replace(LLMS_CATEGORIES_PLACEHOLDER, body)
    return annotate_entries_with_stars(rendered, stars_data, format_stars=str)


def annotate_entries_with_stars(
    markdown: str,
    stars_data: dict[str, dict],
    *,
    format_stars=None,
) -> str:
    """Append the star count to bullet entry lines whose first GitHub link has known star data.

    `format_stars` controls the parenthesized text. Defaults to "{N} GitHub stars".
    Pass `str` for a bare number.
    """
    if format_stars is None:
        format_stars = lambda n: f"{n} GitHub stars"  # noqa: E731 lambda-assignment
    lines = markdown.splitlines(keepends=True)
    out: list[str] = []
    for line in lines:
        if not BULLET_LINE_RE.match(line):
            out.append(line)
            continue
        annotated = line
        for match in MARKDOWN_LINK_RE.finditer(line):
            repo_key = extract_github_repo(match.group(1))
            if not repo_key:
                continue
            entry = stars_data.get(repo_key)
            if not entry or "stars" not in entry:
                continue
            stripped = line.rstrip("\n")
            ending = line[len(stripped):]
            annotated = f"{stripped} ({format_stars(entry['stars'])}){ending}"
            break
        out.append(annotated)
    return "".join(out)


def remove_sponsors_section(markdown: str) -> str:
    lines = markdown.splitlines(keepends=True)
    start_idx = None
    for i, line in enumerate(lines):
        heading = top_level_heading_text(line)
        if heading and heading.lower() == "sponsors":
            start_idx = i
            break

    if start_idx is None:
        return markdown

    end_idx = len(lines)
    for i, line in enumerate(lines[start_idx + 1 :], start=start_idx + 1):
        if top_level_heading_text(line):
            end_idx = i
            break

    return "".join(lines[:start_idx] + lines[end_idx:])


def extract_entries(
    categories: list[ParsedSection],
    groups: list[ParsedGroup],
) -> list[dict]:
    """Flatten categories into individual library entries for table display.

    Entries appearing in multiple categories are merged into a single entry
    with lists of categories and groups.
    """
    cat_to_group = {cat["name"]: group["name"] for group in groups for cat in group["categories"]}

    seen: dict[tuple[str, str], dict[str, Any]] = {}  # (url, name) -> entry
    entries: list[dict[str, Any]] = []
    for cat in categories:
        group_name = cat_to_group.get(cat["name"], "Other")
        for entry in cat["entries"]:
            key = (entry["url"], entry["name"])
            existing: dict[str, Any] | None = seen.get(key)
            if existing is None:
                existing = {
                    "name": entry["name"],
                    "url": entry["url"],
                    "description": entry["description"],
                    "categories": [],
                    "groups": [],
                    "subcategories": [],
                    "stars": None,
                    "owner": None,
                    "last_commit_at": None,
                    "source_type": detect_source_type(entry["url"]),
                    "also_see": entry["also_see"],
                }
                seen[key] = existing
                entries.append(existing)
            if cat["name"] not in existing["categories"]:
                existing["categories"].append(cat["name"])
            if group_name not in existing["groups"]:
                existing["groups"].append(group_name)
            subcat = entry["subcategory"]
            if subcat:
                scoped = f"{cat['name']} > {subcat}"
                if not any(s["value"] == scoped for s in existing["subcategories"]):
                    sub_slug = slugify(subcat)
                    existing["subcategories"].append({
                        "name": subcat,
                        "value": scoped,
                        "slug": sub_slug,
                        "url": f"/categories/{cat['slug']}/{sub_slug}/",
                    })
    return entries
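
# Shape sketch (category names assumed): an entry listed under two categories
# is emitted once, roughly as
#   {"name": "requests", "url": "...", "categories": ["HTTP Clients", "Networking"],
#    "groups": [...], "subcategories": [...], "stars": None, "source_type": None, ...}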


def build(repo_root: Path) -> None:
    """Main build: parse README, render single-page HTML via Jinja2 templates."""
    website = repo_root / "website"
    readme_text = (repo_root / "README.md").read_text(encoding="utf-8")

    subtitle = ""
    for line in readme_text.split("\n"):
        stripped = line.strip()
        if stripped and not stripped.startswith("#"):
            subtitle = stripped
            break

    parsed_groups = parse_readme(readme_text)
    sponsors = parse_sponsors(readme_text)

    categories = [cat for g in parsed_groups for cat in g["categories"]]
    cat_slugs = [cat["slug"] for cat in categories]
    group_slugs = [g["slug"] for g in parsed_groups]
    all_top_level_slugs = cat_slugs + group_slugs + [BUILTIN_SLUG]
    duplicates = {s for s in all_top_level_slugs if all_top_level_slugs.count(s) > 1}
    if duplicates:
        raise ValueError(
            f"slug collision in /categories/ namespace: {sorted(duplicates)}. "
            "Rename a category or group so their slugs differ."
        )
    total_entries = sum(c["entry_count"] for c in categories)
    entries = extract_entries(categories, parsed_groups)
    build_date = datetime.now(UTC)

    stars_data = load_stars(website / "data" / "github_stars.json")

    repo_self = stars_data.get("vinta/awesome-python", {})
    repo_stars = None
    if "stars" in repo_self:
        stars_val = repo_self["stars"]
        repo_stars = f"{stars_val // 1000}k" if stars_val >= 1000 else str(stars_val)

    for entry in entries:
        repo_key = extract_github_repo(entry["url"])
        if not repo_key and entry.get("source_type") == "Built-in":
            repo_key = "python/cpython"
        if repo_key and repo_key in stars_data:
            sd = stars_data[repo_key]
            entry["stars"] = sd["stars"]
            entry["owner"] = sd["owner"]
            entry["last_commit_at"] = sd.get("last_commit_at", "")

    entries = sort_entries(entries)
    category_urls = {cat["name"]: category_path(cat) for cat in categories}

    filter_urls: dict[str, str] = dict(category_urls)
    for group in parsed_groups:
        filter_urls[group["name"]] = group_path(group["slug"])
    for entry in entries:
        for sub in entry.get("subcategories", []):
            filter_urls[sub["value"]] = sub["url"]
    builtin_entries = [e for e in entries if e.get("source_type") == BUILTIN_FILTER]
    if builtin_entries:
        filter_urls[BUILTIN_FILTER] = BUILTIN_PATH

    env = Environment(
        loader=FileSystemLoader(website / "templates"),
        autoescape=True,
        trim_blocks=True,
        lstrip_blocks=True,
    )
    env.filters["slugify"] = slugify

    site_dir = website / "output"
    if site_dir.exists():
        shutil.rmtree(site_dir)
    site_dir.mkdir(parents=True)

    # Escape "</" so the embedded JSON can't prematurely close an inline <script> tag.
    filter_urls_json = json.dumps(filter_urls, sort_keys=True, ensure_ascii=False).replace("</", "<\\/")

    tpl_index = env.get_template("index.html")
    (site_dir / "index.html").write_text(
        tpl_index.render(
            categories=categories,
            subtitle=subtitle,
            entries=entries,
            total_entries=total_entries,
            total_categories=len(categories),
            repo_stars=repo_stars,
            build_date=build_date.strftime("%B %d, %Y"),
            sponsors=sponsors,
            category_urls=category_urls,
            filter_urls_json=filter_urls_json,
        ),
        encoding="utf-8",
    )

    tpl_category = env.get_template("category.html")
    categories_dir = site_dir / "categories"
    for category in categories:
        category_entries = [entry for entry in entries if category["name"] in entry["categories"]]
        page_dir = categories_dir / category["slug"]
        page_dir.mkdir(parents=True, exist_ok=True)
        (page_dir / "index.html").write_text(
            tpl_category.render(
                category=category,
                category_url=category_public_url(category),
                entries=category_entries,
                total_categories=len(categories),
                page_kind="category",
                category_urls=category_urls,
                current_path=category_path(category),
                filter_urls_json=filter_urls_json,
            ),
            encoding="utf-8",
        )

    for group in parsed_groups:
        group_entries = [entry for entry in entries if group["name"] in entry["groups"]]
        page_dir = categories_dir / group["slug"]
        page_dir.mkdir(parents=True, exist_ok=True)
        synthetic = {
            "name": group["name"],
            "slug": group["slug"],
            "description": "",
            "description_html": "",
        }
        (page_dir / "index.html").write_text(
            tpl_category.render(
                category=synthetic,
                category_url=group_public_url(group["slug"]),
                entries=group_entries,
                total_categories=len(categories),
                page_kind="group",
                category_urls=category_urls,
                current_path=group_path(group["slug"]),
                filter_urls_json=filter_urls_json,
                group_categories=group["categories"],
            ),
            encoding="utf-8",
        )

    if builtin_entries:
        page_dir = categories_dir / BUILTIN_SLUG
        page_dir.mkdir(parents=True, exist_ok=True)
        synthetic = {
            "name": BUILTIN_FILTER,
            "slug": BUILTIN_SLUG,
            "description": "",
            "description_html": "",
        }
        (page_dir / "index.html").write_text(
            tpl_category.render(
                category=synthetic,
                category_url=BUILTIN_PUBLIC_URL,
                entries=builtin_entries,
                total_categories=len(categories),
                page_kind="built-in",
                category_urls=category_urls,
                current_path=BUILTIN_PATH,
                filter_urls_json=filter_urls_json,
            ),
            encoding="utf-8",
        )

    seen_subcats: set[tuple[str, str]] = set()
    for category in categories:
        cat_url_prefix = f"/categories/{category['slug']}/"
        for entry in entries:
            for sub in entry.get("subcategories", []):
                if not sub["url"].startswith(cat_url_prefix):
                    continue
                key = (category["slug"], sub["slug"])
                if key in seen_subcats:
                    continue
                seen_subcats.add(key)
                sub_entries = [
                    e for e in entries
                    if any(s["value"] == sub["value"] for s in e.get("subcategories", []))
                ]
                page_dir = categories_dir / category["slug"] / sub["slug"]
                page_dir.mkdir(parents=True, exist_ok=True)
                synthetic = {
                    "name": sub["name"],
                    "slug": sub["slug"],
                    "description": "",
                    "description_html": "",
                }
                (page_dir / "index.html").write_text(
                    tpl_category.render(
                        category=synthetic,
                        category_url=subcategory_public_url(category["slug"], sub["slug"]),
                        entries=sub_entries,
                        total_categories=len(categories),
                        page_kind="subcategory",
                        parent_category=category,
                        category_urls=category_urls,
                        current_path=subcategory_path(category["slug"], sub["slug"]),
                        filter_urls_json=filter_urls_json,
                    ),
                    encoding="utf-8",
                )

    static_src = website / "static"
    static_dst = site_dir / "static"
    if static_src.exists():
        shutil.copytree(static_src, static_dst, dirs_exist_ok=True)

    markdown_index = annotate_entries_with_stars(
        remove_sponsors_section(readme_text), stars_data
    )
    llms_template = (website / "templates" / "llms.txt").read_text(encoding="utf-8")
    llms_txt = build_llms_txt(llms_template, readme_text, stars_data)
    (site_dir / "robots.txt").write_text(build_robots_txt(), encoding="utf-8")
    sitemap_date = build_date.date().isoformat()
    sitemap_urls = [(SITE_URL, sitemap_date)]
    sitemap_urls.extend((category_public_url(c), sitemap_date) for c in categories)
    sitemap_urls.extend((group_public_url(g["slug"]), sitemap_date) for g in parsed_groups)
    if builtin_entries:
        sitemap_urls.append((BUILTIN_PUBLIC_URL, sitemap_date))
    for cat_slug, sub_slug in sorted(seen_subcats):
        sitemap_urls.append((subcategory_public_url(cat_slug, sub_slug), sitemap_date))
    write_sitemap_xml(site_dir / "sitemap.xml", sitemap_urls)
    (site_dir / "index.md").write_text(markdown_index, encoding="utf-8")
    (site_dir / "llms.txt").write_text(llms_txt, encoding="utf-8")

    print(f"Built site with {len(parsed_groups)} groups, {len(categories)} categories")
    print(f"Total entries: {total_entries}")
    print(f"Output: {site_dir}")


if __name__ == "__main__":
    build(Path(__file__).parent.parent)