Files
supabase/apps/studio/evals/trace-utils.ts
Matt Rossman 65fab30935 feat(ai): judge tool inputs, add storage guidance and permissive RLS evals (#46168)
Adding broad RLS policies to public buckets can cause users to expose
more than they expected, like the ability to list all profile pictures
on an app. This patches Assistant with knowledge to follow our latest
guidance on restrictive RLS policies for storage buckets:
https://github.com/supabase/supabase/pull/46172

**Changes**
- Adds Storage bucket evals for public website assets and avatar access
patterns to distinguish public vs private bucket use cases
- Adds eval for overly permissive table policies
- Adds `storage` knowledge so Assistant distinguishes public buckets,
private buckets, object reads, and object listing.
- Adds `includeToolCallInputs` option for scorer transcripts so LLM
judges can evaluate proposed SQL/tool actions.
- Bumps max step count to 10 since storage knowledge may incur another
tool call (also 10 is recommended
[here](https://vercel.com/academy/ai-sdk/multi-step-and-generative-ui#why-multi-step-is-required)
for complex multi-tool scenarios)

**References**
-
https://supabase.com/docs/guides/storage/buckets/fundamentals#public-buckets
- https://supabase.com/docs/guides/storage/security/access-control
- https://github.com/supabase/supabase/pull/46172

**Notes:**
- These prompt tweaks are not meant to be exhaustive fixes; they are
mainly hotfixes intended to hold us over until these cases can be
addressed more deeply in skills/docs and tracked in central evals

Closes AI-676
Closes AI-756

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
* Added Storage knowledge resource for the assistant covering Supabase
Storage access patterns and RLS guidance.
* Added three evaluation cases: two for Storage (marketing assets,
avatars) and one for RLS policy generation for user profiles.

* **Improvements**
  * Evaluators now include tool call inputs when judging conversations.
* Assistant prompts and generation enhanced with richer Storage/RLS
guidance and extended streaming limits.

* **Tests**
* Added test ensuring tool call inputs are included in serialized thread
context.

<!-- review_stack_entry_start -->

[![Review Change
Stack](https://storage.googleapis.com/coderabbit_public_assets/review-stack-in-coderabbit-ui.svg)](https://app.coderabbit.ai/change-stack/supabase/supabase/pull/46168?utm_source=github_walkthrough&utm_medium=github&utm_campaign=change_stack)

<!-- review_stack_entry_end -->
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2026-05-29 09:55:23 -04:00

226 lines
7.1 KiB
TypeScript

import type { SpanData, Trace } from 'braintrust'
import { z } from 'zod'
/**
 * Leading text that marks an assistant message as project context rather
 * than chat; isProjectContextMessage tests serialized content against it.
 */
const projectContextPrefix = "The user's current project is "
/**
 * Matches AI SDK tool spans as Braintrust records them: tool args first,
 * execution context second.
 *
 * The first tuple element (the tool args) is left as `unknown`. The second
 * element is an object whose `messages` and `toolCallId` keys are both
 * optional; `.passthrough()` keeps any other keys it carries.
 */
const aiSdkToolSpanInputSchema = z.tuple([
z.unknown(),
z
.object({
messages: z.unknown().optional(),
toolCallId: z.string().optional(),
})
.passthrough(),
])
/** Plain text block inside a thread message's content array. */
const threadTextBlockSchema = z.object({ type: z.literal('text'), text: z.string() })
/**
 * `{ type: 'valid', value }` envelope around tool-call arguments;
 * unwrapToolCallArguments returns `value` when arguments match this shape.
 */
const threadToolCallArgumentsSchema = z.object({ type: z.literal('valid'), value: z.unknown() })
/** Tool-call block: the called tool's name plus optional, unvalidated arguments. */
const threadToolCallBlockSchema = z.object({
type: z.literal('tool_call'),
tool_name: z.string(),
arguments: z.unknown().optional(),
})
/** Any content block this module can serialize: a text block or a tool-call block. */
const threadContentBlockSchema = z.union([threadTextBlockSchema, threadToolCallBlockSchema])
/**
 * Message content: either a plain string or an array of blocks.
 *
 * Array entries are parsed one by one against threadContentBlockSchema;
 * entries that do not match are dropped instead of failing the whole parse.
 */
const threadContentSchema = z.union([
z.string(),
z.array(z.unknown()).transform((blocks) =>
blocks.flatMap((block) => {
const result = threadContentBlockSchema.safeParse(block)
// Keep recognized blocks only; an empty array removes the entry from the flatMap result.
return result.success ? [result.data] : []
})
),
])
/** A thread message: one of four roles plus string or block-array content. */
const threadMessageSchema = z.object({
role: z.enum(['system', 'user', 'assistant', 'tool']),
content: threadContentSchema,
})
/** Parsed thread message as produced by threadMessageSchema. */
type ThreadMessage = z.infer<typeof threadMessageSchema>
/** Parsed content block: a text block or a tool-call block. */
type ThreadContentBlock = z.infer<typeof threadContentBlockSchema>
/** Options controlling how thread messages are rendered to text. */
export type ThreadSerializationOptions = {
// When truthy, a tool-call block that carries arguments is rendered as its
// `[called <tool>]` marker followed by the arguments as pretty-printed JSON.
includeToolCallInputs?: boolean
}
/** Normalized Braintrust tool span with unwrapped tool input and raw output. */
export type ToolSpan = {
// The span exactly as returned by the trace.
span: SpanData
// Result of getToolSpanInput: the tool args when the input matches the AI SDK tuple, else the raw input.
input: unknown
// The span's `output`, passed through unchanged.
output: unknown
}
/**
 * A thread split into the pieces built by getThreadPartsFromThread.
 * Each field is serialized text, or null when there is nothing to serialize.
 */
export type ThreadParts = {
// Content of the last assistant message that starts with the project-context prefix.
projectContext: string | null
// Chat messages before the last user message (all chat messages when there is no user message).
priorConversation: string | null
// Content of the last user message.
currentUserInput: string | null
// Assistant messages that follow the last user message.
lastAssistantTurn: string | null
}
/**
 * Optional schemas used to validate and type a tool span's input and output.
 * Either schema may be omitted, in which case that side is not validated.
 */
type ToolSpanSchemas<
TInputSchema extends z.ZodType | undefined,
TOutputSchema extends z.ZodType | undefined,
> = {
inputSchema?: TInputSchema
outputSchema?: TOutputSchema
}
/**
 * Tool span whose input/output types are inferred from provided schemas.
 * A side with no schema (type parameter `undefined`) is typed as `unknown`.
 */
type ParsedToolSpan<
TInputSchema extends z.ZodType | undefined,
TOutputSchema extends z.ZodType | undefined,
> = {
span: SpanData
input: TInputSchema extends z.ZodType ? z.infer<TInputSchema> : unknown
output: TOutputSchema extends z.ZodType ? z.infer<TOutputSchema> : unknown
}
/**
 * Unwraps a traced tool span's input. When the input matches the AI SDK
 * `[toolArgs, executionContext]` tuple only the tool args are returned;
 * any other shape is handed back untouched.
 */
function getToolSpanInput(span: SpanData): unknown {
  const parsed = aiSdkToolSpanInputSchema.safeParse(span.input)
  if (!parsed.success) return span.input
  const [toolArgs] = parsed.data
  return toolArgs
}
/**
 * Strips the `{ type: 'valid', value }` envelope from thread tool-call
 * arguments. Anything that does not match the envelope is returned as-is.
 */
function unwrapToolCallArguments(args: unknown): unknown {
  const envelope = threadToolCallArgumentsSchema.safeParse(args)
  if (envelope.success) return envelope.data.value
  return args
}
/**
 * Renders one thread content block as transcript text.
 *
 * Text blocks yield their text. Tool-call blocks yield a `[called <tool>]`
 * marker; when `options.includeToolCallInputs` is set and the call carries
 * arguments, the unwrapped arguments follow on the next line as
 * pretty-printed JSON.
 *
 * @param block - Parsed text or tool-call block.
 * @param options - Serialization options; only `includeToolCallInputs` is read.
 * @returns The block's textual representation.
 */
function serializeContentBlock(
  block: ThreadContentBlock,
  options: ThreadSerializationOptions
): string {
  if (block.type === 'text') return block.text
  const marker = `[called ${block.tool_name}]`
  if (!options.includeToolCallInputs || typeof block.arguments === 'undefined') return marker
  // JSON.stringify yields undefined (not a string) when the unwrapped value is
  // itself undefined, e.g. a `{ type: 'valid' }` envelope without `value`.
  // Fall back to the bare marker rather than emitting the text "undefined".
  const serializedArgs: string | undefined = JSON.stringify(
    unwrapToolCallArguments(block.arguments),
    null,
    2
  )
  if (typeof serializedArgs === 'undefined') return marker
  return `${marker}\n${serializedArgs}`
}
/**
 * Flattens a message's content into one string: string content is used
 * directly, block content is serialized block by block and joined with
 * newlines. Yields null for a missing message or empty resulting text.
 */
function serializeMessageContent(
  message: ThreadMessage | undefined,
  options: ThreadSerializationOptions = {}
): string | null {
  if (!message) return null
  const { content } = message
  const text =
    typeof content === 'string'
      ? content
      : content.map((block) => serializeContentBlock(block, options)).join('\n')
  return text === '' ? null : text
}
/**
 * Serializes a run of messages as `[role]`-headed sections separated by
 * blank lines. Messages with no serializable content are skipped; null is
 * returned when no section remains.
 */
function serializeMessages(
  messages: ThreadMessage[],
  options: ThreadSerializationOptions = {}
): string | null {
  const sections: string[] = []
  for (const message of messages) {
    const body = serializeMessageContent(message, options)
    if (body) sections.push(`[${message.role}]\n${body}`)
  }
  return sections.length === 0 ? null : sections.join('\n\n')
}
/**
 * True for assistant messages whose serialized content begins with the
 * project-context prefix.
 */
function isProjectContextMessage(message: ThreadMessage): boolean {
  if (message.role !== 'assistant') return false
  const text = serializeMessageContent(message)
  return text !== null && text.startsWith(projectContextPrefix)
}
/** Index of the last message with role `user`, or -1 when there is none. */
function findLastUserIndex(messages: ThreadMessage[]): number {
  const roles = messages.map((message) => message.role)
  return roles.lastIndexOf('user')
}
/**
 * Splits a raw thread into project context, prior conversation, the current
 * user input and the assistant turn that follows it.
 *
 * Entries that fail threadMessageSchema, and `system` / `tool` messages, are
 * ignored. Project-context messages are kept out of the chat; only the last
 * one is reported. `options` applies to the prior conversation and the last
 * assistant turn; project context and the user input are serialized with
 * default options.
 */
export function getThreadPartsFromThread(
  thread: unknown[],
  options: ThreadSerializationOptions = {}
): ThreadParts {
  let latestProjectContext: ThreadMessage | undefined
  const chatMessages: ThreadMessage[] = []

  for (const rawMessage of thread) {
    const parsed = threadMessageSchema.safeParse(rawMessage)
    if (!parsed.success) continue
    const message = parsed.data
    if (message.role === 'system' || message.role === 'tool') continue
    if (isProjectContextMessage(message)) {
      // Later project-context messages replace earlier ones.
      latestProjectContext = message
    } else {
      chatMessages.push(message)
    }
  }

  const projectContext = serializeMessageContent(latestProjectContext)
  const lastUserIdx = findLastUserIndex(chatMessages)

  // Without a user message the whole chat counts as prior conversation.
  if (lastUserIdx === -1) {
    return {
      projectContext,
      priorConversation: serializeMessages(chatMessages, options),
      currentUserInput: null,
      lastAssistantTurn: null,
    }
  }

  const earlierMessages = chatMessages.slice(0, lastUserIdx)
  const assistantReplies = chatMessages
    .slice(lastUserIdx + 1)
    .filter((message) => message.role === 'assistant')

  return {
    projectContext,
    priorConversation: serializeMessages(earlierMessages, options),
    currentUserInput: serializeMessageContent(chatMessages[lastUserIdx]),
    lastAssistantTurn: serializeMessages(assistantReplies, options),
  }
}
/** Fetches the trace's thread and splits it with getThreadPartsFromThread. */
export async function getThreadParts(
  trace: Trace,
  options: ThreadSerializationOptions = {}
): Promise<ThreadParts> {
  const thread = await trace.getThread()
  return getThreadPartsFromThread(thread, options)
}
/**
 * Collects the trace's `tool` spans in normalized form (unwrapped input, raw
 * output). When `toolName` is non-empty, only spans whose
 * `span_attributes.name` equals it are kept.
 */
export async function getToolSpans(trace: Trace, toolName?: string): Promise<ToolSpan[]> {
  const rawSpans = await trace.getSpans({ spanType: ['tool'] })
  const normalized: ToolSpan[] = []
  for (const span of rawSpans) {
    if (toolName && span.span_attributes?.name !== toolName) continue
    normalized.push({ span, input: getToolSpanInput(span), output: span.output })
  }
  return normalized
}
/**
 * Returns the named tool's spans whose normalized input and output satisfy
 * the supplied schemas. A span failing either schema is dropped; a side with
 * no schema is passed through unvalidated.
 */
export async function getParsedToolSpans<
  TInputSchema extends z.ZodType | undefined = undefined,
  TOutputSchema extends z.ZodType | undefined = undefined,
>(
  trace: Trace,
  toolName: string,
  schemas: ToolSpanSchemas<TInputSchema, TOutputSchema> = {}
): Promise<Array<ParsedToolSpan<TInputSchema, TOutputSchema>>> {
  const { inputSchema, outputSchema } = schemas
  const matches: Array<ParsedToolSpan<TInputSchema, TOutputSchema>> = []

  for (const { span, input, output } of await getToolSpans(trace, toolName)) {
    const inputResult = inputSchema?.safeParse(input)
    if (inputResult && !inputResult.success) continue

    const outputResult = outputSchema?.safeParse(output)
    if (outputResult && !outputResult.success) continue

    // The cast bridges the runtime "schema present?" branches to the conditional types.
    matches.push({
      span,
      input: inputResult ? inputResult.data : input,
      output: outputResult ? outputResult.data : output,
    } as ParsedToolSpan<TInputSchema, TOutputSchema>)
  }

  return matches
}