mirror of
https://github.com/supabase/supabase.git
synced 2026-06-28 11:33:52 -04:00
cd52669f1f
## Summary
This brings docs `/guides/*` to full content negotiation for AI agents
(GROWTH-811):
RFC 9110 q-value parsing instead of a `.includes('text/markdown')`
substring match,
a 406 when the client rejects every type the route can produce, and
markdown rewrites
for known LLM user agents.
I implemented it by extracting the negotiation into a shared
`common/markdown-negotiation` module consumed by both
`apps/docs/middleware.ts` and `apps/www/middleware.ts`, rather than
duplicating the helpers into docs and keeping them in sync by hand with
www (#45394). This gives a single source of truth with no re-sync burden.
www is refactored onto the shared helper with no behavior change.
## Changes
### docs `/guides/*` content negotiation (GROWTH-811)
- Replace the `.includes('text/markdown')` substring match with RFC 9110
q-value parsing.
- Return 406 (`Cache-Control: no-store`, `Vary: Accept`) when Accept
excludes every type the
route serves. Bypassed for LLM user agents, the `.md` suffix, and
clients sending no Accept.
- Rewrite to `/api/guides-md/<slug>` for LLM user agents (Claude-User,
Claude-Web, ChatGPT-User,
PerplexityBot) regardless of Accept.
- Preserve the existing `.md` suffix routing and the entire
`/reference/*` block.
### Shared negotiation helper
- New `packages/common/markdown-negotiation.ts`:
`negotiateMarkdown(signals, route)` returns
`'markdown' | 'not-acceptable' | 'pass'`. Internalizes q-value parsing,
the LLM user-agent
match, the UA-length cap, and the markdown-vs-html preference.
- `apps/www/middleware.ts`: refactored to consume the shared helper; its
duplicated copy of the
negotiation helpers (added in #45394) is removed. `.md` early-return,
changelog routing, and
first-referrer cookie stamping are unchanged (no behavior change,
covered by its existing tests).
### Tests
- New `apps/docs/middleware.test.ts`: q-value priority, the 406 path,
`.md` suffix, LLM UA
override, browser default Accept, training-crawler and substring-embed
exclusion, and the
`/reference/*` exemption.
- New `packages/common/markdown-negotiation.test.ts`: the same decision
matrix at the unit level
(q-values, 406, LLM UAs, `.md`, `*/*`, training crawlers, OWS,
out-of-range q).
## Testing (Vercel preview)
After Vercel posts a preview URL, save it once, then run the probe set below.
```bash
echo 'PREVIEW_HOST' > /tmp/growth-811-host.txt
HOST=$(cat /tmp/growth-811-host.txt)
# 1) Browser-style Accept -> HTML 200
curl -sI -A "Mozilla/5.0" \
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' \
"https://$HOST/docs/guides/auth"
# 2) Accept: text/markdown -> markdown 200
curl -sI -H 'Accept: text/markdown' "https://$HOST/docs/guides/auth"
# 3) text/html;q=1.0, text/markdown;q=0.5 -> HTML 200
curl -sI -H 'Accept: text/html;q=1.0, text/markdown;q=0.5' "https://$HOST/docs/guides/auth"
# 4) unsupported Accept -> 406 + Cache-Control: no-store + Vary: Accept
curl -sI -H 'Accept: application/x-content-negotiation-probe' "https://$HOST/docs/guides/auth"
# 5) User-Agent: Claude-User/1.0 (any Accept) -> markdown 200
curl -sI -A 'Claude-User/1.0' "https://$HOST/docs/guides/auth"
```
### After merge
Run
[acceptmarkdown.com/readiness-check](https://acceptmarkdown.com/readiness-check)
against `https://supabase.com/docs/guides/auth`: expect 100/100.
## Linear
- fixes GROWTH-811
87 lines
3.2 KiB
TypeScript
87 lines
3.2 KiB
TypeScript
// Live-fetch agents only. Training crawlers (GPTBot, ClaudeBot, CCBot) are
// governed by robots.txt; serving them content that differs from the HTML
// page risks SEO and cloaking penalties.
const LLM_USER_AGENT = /\bClaude-User\b|\bClaude-Web\b|\bChatGPT-User\b|\bPerplexityBot\b/i

// Media ranges the negotiation recognises (RFC 9110 §12.5.1), ordered most to
// least specific. Any other media range in an Accept header is ignored.
const RANGES = ['text/markdown', 'text/html', 'text/*', '*/*'] as const
type Range = (typeof RANGES)[number]

// Matches a single `q=<number>` Accept parameter, tolerating optional
// whitespace around the name, the `=` and the value. Capture group 1 is the
// raw number text.
const Q_PARAM = /^\s*q\s*=\s*([\d.]+)\s*$/i

// Cap UA length before the regex test to bound CPU on the edge hot path.
const MAX_UA_LENGTH = 512

/**
 * Type guard: true when `s` is exactly one of the media ranges in `RANGES`.
 *
 * The comparison is an exact string match, so callers must trim and
 * lower-case the candidate first.
 */
function isRange(s: string): s is Range {
  // Widen the literal tuple so `includes` accepts an arbitrary string.
  return (RANGES as readonly string[]).includes(s)
}

/**
 * Extracts the quality weight from the parameters of one Accept entry.
 *
 * Returns the first `q=` value that parses to a number within [0, 1]. A `q`
 * that is missing, malformed, or out of range is ignored, so the entry falls
 * back to the default weight of 1.
 */
function parseQ(params: readonly string[]): number {
  for (const param of params) {
    const raw = param.match(Q_PARAM)?.[1]
    // Not a `q=` parameter (e.g. `level=1`): keep looking.
    if (raw === undefined) continue

    const q = Number.parseFloat(raw)
    if (Number.isFinite(q) && q >= 0 && q <= 1) return q
  }
  return 1
}

// Parses an Accept header into the weights that matter for a route serving
// HTML or markdown.
//
// Only the media ranges listed in RANGES are considered; other entries are
// skipped. A range listed more than once keeps its highest weight. `html` and
// `markdown` resolve to the weight of the most specific range covering them
// (the exact type, then text/*, then */*), or 0 when no listed range does.
//
// `markdownExplicit` lets callers avoid flipping a bare `Accept: */*` to
// markdown — generic clients sending */* aren't expressing a preference.
function parseAccept(header: string) {
  const seen = new Map<Range, number>()

  for (const entry of header.toLowerCase().split(',')) {
    // `split` always yields at least one element; the default only keeps the
    // destructuring well-typed under `noUncheckedIndexedAccess`.
    const [rawType = '', ...params] = entry.trim().split(';')
    const range = rawType.trim()
    if (!isRange(range)) continue
    seen.set(range, Math.max(seen.get(range) ?? -1, parseQ(params)))
  }

  // Shared fallback when the exact type is not listed: text/* outranks */*.
  const wildcard = seen.get('text/*') ?? seen.get('*/*') ?? 0

  return {
    html: seen.get('text/html') ?? wildcard,
    markdown: seen.get('text/markdown') ?? wildcard,
    markdownExplicit: seen.has('text/markdown') || seen.has('text/*'),
  }
}

/**
 * Decides whether markdown beats HTML for a set of parsed Accept weights.
 *
 * Markdown is served when it is acceptable (weight above 0) and either
 * strictly preferred over HTML, or tied with HTML while `markdownExplicit` is
 * set. A tie without `markdownExplicit` stays HTML.
 */
function shouldServeMarkdown(accept: ReturnType<typeof parseAccept>): boolean {
  const { html, markdown, markdownExplicit } = accept

  // A weight of 0 means the client rejected markdown outright.
  if (markdown === 0) return false
  if (markdown > html) return true
  return markdown === html && markdownExplicit
}

export type MarkdownDecision = 'markdown' | 'not-acceptable' | 'pass'

/**
 * Content negotiation for routes that can serve either HTML or markdown.
 *
 * `hasMarkdownVariant` is false for paths with no markdown representation (they
 * never negotiate). `isMarkdownSuffix` forces markdown for an explicit `.md`
 * request; callers that handle `.md` upstream can leave it false.
 *
 * The first argument carries the raw `Accept` and `User-Agent` header values;
 * pass an empty string for a header the request did not send.
 *
 * @returns `'markdown'` to serve the markdown variant, `'not-acceptable'` when
 * the Accept header rules out both HTML and markdown (a 406 for the caller to
 * send), or `'pass'` to leave the request on its default handling.
 */
export function negotiateMarkdown(
  { acceptHeader, userAgent }: { acceptHeader: string; userAgent: string },
  {
    hasMarkdownVariant,
    isMarkdownSuffix = false,
  }: { hasMarkdownVariant: boolean; isMarkdownSuffix?: boolean }
): MarkdownDecision {
  if (!hasMarkdownVariant) return 'pass'

  // LLM agents and an explicit `.md` request always get markdown. The cheap
  // flag is checked first so the regex only runs when it can change the result.
  if (isMarkdownSuffix || LLM_USER_AGENT.test(userAgent.slice(0, MAX_UA_LENGTH))) {
    return 'markdown'
  }

  // No Accept header (or a blank one, which lists no media ranges) =
  // browser/default client: serve HTML, never 406.
  if (!acceptHeader || acceptHeader.trim() === '') return 'pass'

  const accept = parseAccept(acceptHeader)

  // 406 when Accept rejects every type this route can produce. Only reached for
  // non-LLM, non-`.md` clients that sent an Accept header (guards above), so a
  // deliberate `Accept: application/json` gets a clean 406 instead of HTML.
  if (accept.markdown === 0 && accept.html === 0) return 'not-acceptable'

  return shouldServeMarkdown(accept) ? 'markdown' : 'pass'
}