mirror of
https://github.com/clockworklabs/SpacetimeDB.git
synced 2026-05-13 11:17:50 -04:00
b5a7b37660
# Description of Changes
Adds TypeScript as a third language for LLM benchmark tests alongside
Rust and C#, and fixes table naming convention mismatches.
**TypeScript Support:**
- Added `Lang::TypeScript` variant with camelCase naming conventions
- Created TypeScript project template (`templates/typescript/server/`)
with package.json, tsconfig.json, and index.ts
- Added TypeScript publisher that uses `spacetime build` and `spacetime
publish`
- Created 22 TypeScript task prompts and golden answer files for all
benchmark tests
- Updated prompt discovery to find `tasks/typescript.txt` files
**Table Naming Fix:**
- Standardized on singular table names across all languages:
- Rust: `user` (snake_case singular)
- C#: `User` (PascalCase singular)
- TypeScript: `user` (camelCase singular)
- Updated `table_name()` helper to convert singular names to appropriate
case per language
- Updated all spec.rs files to use `table_name("user", lang)` instead of
hardcoded `"users"`
**CI/Hashing Improvements:**
- Added `compute_processed_context_hash()` for language-specific hash
computation after tab filtering
- Updated CI check to verify both `rustdoc_json` and `docs` modes for
Rust
- Fixed `--hash-only` mode to skip golden builds
# API and ABI breaking changes
None - these are internal benchmark tooling changes only.
# Expected complexity level and risk
**Complexity: 2**
The changes add a new language following existing patterns for Rust and
C#. The table naming fixes are straightforward find-and-replace style
updates. Low risk since this only affects the benchmark tooling, not the
core SpacetimeDB codebase.
# Testing
- [x] `cargo build -p xtask-llm-benchmark` compiles successfully
- [x] All 22 TypeScript golden modules build and publish successfully
- [x] Rust and C# benchmarks unaffected by changes
---------
Signed-off-by: Tyler Cloutier <cloutiertyler@users.noreply.github.com>
Co-authored-by: clockwork-labs-bot <clockwork-labs-bot@users.noreply.github.com>
Co-authored-by: John Detter <4099508+jdetter@users.noreply.github.com>
313 lines
13 KiB
YAML
name: Update LLM benchmarks

# Triggered either manually (workflow_dispatch with a PR number) or by a
# PR comment; the job-level `if` filters comments down to the
# `/update-llm-benchmark` command.
on:
  workflow_dispatch:
    inputs:
      pr_number:
        description: "Pull Request Number"
        required: true
  issue_comment:
    types: [created] # only run when the comment is first created

permissions:
  contents: read
  pull-requests: write
  issues: write

# One benchmark run per PR at a time; unrelated comments get their own
# group suffix so they never cancel a real benchmark run.
concurrency:
  group: >-
    llm-benchmark
    -${{ github.event_name == 'issue_comment' && github.event.issue.number || inputs.pr_number }}
    ${{ github.event_name == 'issue_comment' && !startsWith(github.event.comment.body, '/update-llm-benchmark') && '-unrelated-comment' }}
  cancel-in-progress: true
jobs:
  update-llm-benchmark:
    # Runnable either with a comment that starts with /update-llm-benchmark
    # or by manually dispatching
    if: |
      (github.event_name == 'issue_comment' && github.event.issue.pull_request && startsWith(github.event.comment.body, '/update-llm-benchmark')) ||
      (github.event_name == 'workflow_dispatch')
    runs-on: spacetimedb-new-runner
    container:
      image: localhost:5000/spacetimedb-ci:latest
      options: >-
        --privileged
    steps:
      # Here we install the spacetime CLI for faster execution of the tests
      # SpacetimeDB itself is not under test here, rather it's the docs.
      # If we want to change that it is possible to have the benchmark compile
      # SpacetimeDB from source.
      - name: Install spacetime CLI
        run: |
          curl -sSf https://install.spacetimedb.com | sh -s -- -y
          echo "$HOME/.local/bin" >> $GITHUB_PATH

      - name: Load PR info
        id: pr
        uses: actions/github-script@v7
        with:
          script: |
            let prNumber;
            if (context.eventName === 'issue_comment') {
              prNumber = context.payload.issue.number;
            } else if (context.eventName === 'workflow_dispatch') {
              const raw = context.payload.inputs?.pr_number;
              if (!raw || !/^\d+$/.test(raw)) {
                core.setFailed(`Invalid pr_number input: '${raw}'.`);
                return;
              }
              prNumber = Number(raw);
            } else {
              core.setFailed(`Unsupported event: ${context.eventName}`);
              return;
            }

            const { data: pr } = await github.rest.pulls.get({
              owner: context.repo.owner,
              repo: context.repo.repo,
              pull_number: prNumber,
            });

            core.setOutput('number', String(prNumber));
            core.setOutput('head_ref', pr.head.ref);
            core.setOutput('head_sha', pr.head.sha);
            core.setOutput('head_repo_full_name', pr.head.repo.full_name);
            core.setOutput('head_owner_type', pr.head.repo.owner.type); // "User"|"Organization"
            core.setOutput('maintainer_can_modify', String(pr.maintainer_can_modify));

      # If this was kicked off by a comment, ensure that the commenter is
      # a collaborator on the repo. We don't want unprivileged users to run benchmarks.
      # Note that the workflow that will be run will be the one that is on the `master`
      # branch, NOT the one from the PR. This is important so that the PR author can't
      # sneak in an exfiltration exploit.
      - name: Check commenter permission
        if: github.event_name == 'issue_comment'
        uses: actions/github-script@v7
        with:
          script: |
            const user = context.payload.comment.user.login;
            const { data } = await github.rest.repos.getCollaboratorPermissionLevel({
              owner: context.repo.owner,
              repo: context.repo.repo,
              username: user,
            });

            const allowed = new Set(['admin', 'maintain', 'write', 'triage']);
            if (!allowed.has(data.permission)) {
              core.setFailed(`User ${user} has permission '${data.permission}', not allowed to run benchmarks.`);
            }

      # If the PR is from a fork, we need to be able to have GitHub actions commit back
      # to the forked repo, so that we can update the benchmark results.
      # In order to do this we need to ensure that the PR is configured to allow the maintainers
      # of the SpacetimeDB repo to commit back to the fork.
      - name: Check fork pushability (and comment if not)
        if: steps.pr.outputs.head_repo_full_name != github.repository
        uses: actions/github-script@v7
        env:
          PR_NUMBER: ${{ steps.pr.outputs.number }}
          HEAD_OWNER_TYPE: ${{ steps.pr.outputs.head_owner_type }}
          MAINTAINER_CAN_MODIFY: ${{ steps.pr.outputs.maintainer_can_modify }}
        with:
          script: |
            const issue_number = Number(process.env.PR_NUMBER);
            const headOwnerType = process.env.HEAD_OWNER_TYPE;
            const canModify = process.env.MAINTAINER_CAN_MODIFY === 'true';

            if (headOwnerType === 'Organization') {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number,
                body: [
                  "I can’t push benchmark updates to this PR because it comes from an **organization-owned fork**.",
                  "GitHub doesn’t allow granting upstream maintainers push permissions to org-owned forks.",
                  "",
                  "Options:",
                  "- Reopen the PR from a **personal fork** with **Allow edits from maintainers** enabled, or",
                  "- A maintainer can apply the benchmark update on an internal branch."
                ].join("\n"),
              });
              core.setFailed("Org-owned fork PR is not pushable by maintainers.");
              return;
            }

            if (!canModify) {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number,
                body: [
                  "I can’t push benchmark updates to this PR branch until you enable **Allow edits from maintainers**.",
                  "Please check the box on the PR page, then re-comment `/update-llm-benchmark`.",
                  "See https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/allowing-changes-to-a-pull-request-branch-created-from-a-fork"
                ].join("\n"),
              });
              core.setFailed("maintainer_can_modify is false; author must enable 'Allow edits from maintainers'.");
            }

      # Run the benchmark that is already checked into master to prevent
      # an exfiltration attack whereby the PR author tries to sneak in an exploit
      # and get a maintainer to run the modified benchmark without looking at the
      # PR first. This ensures that we only ever execute code that is checked into
      # master.
      - name: Checkout master (build/install tool from trusted code)
        uses: actions/checkout@v4
        with:
          ref: master
          fetch-depth: 0
          persist-credentials: false

      - uses: dtolnay/rust-toolchain@stable
      - uses: Swatinem/rust-cache@v2

      # Ensure we use a user-writable .NET install (not /usr/share/dotnet),
      # so workload installs don't require sudo.
      - name: Setup .NET SDK
        uses: actions/setup-dotnet@v4
        with:
          dotnet-version: "8.0.x"

      - name: Install WASI workload (wasi-experimental)
        env:
          DOTNET_MULTILEVEL_LOOKUP: "0"
          DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home
          DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1"
        run: |
          dotnet --info
          dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel

      - name: Install llm-benchmark tool from master
        run: |
          cargo install --path tools/xtask-llm-benchmark --locked
          command -v llm_benchmark

      # Check out the repo on the branch, but ONLY use this code as data!
      # Never execute code that is on the PR branch.
      - name: Checkout PR head (branch)
        uses: actions/checkout@v4
        with:
          repository: ${{ steps.pr.outputs.head_repo_full_name }}
          ref: ${{ steps.pr.outputs.head_sha }}
          fetch-depth: 0
          persist-credentials: false

      # Run the benchmark against the PR using the installed tool from the
      # master branch.
      - name: Run benchmark (with provider keys)
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          # Prevent MSBuild node reuse issues that cause "Pipe is broken" errors
          # when running multiple dotnet publish commands in parallel.
          # See: https://github.com/dotnet/msbuild/issues/6657
          MSBUILDDISABLENODEREUSE: "1"
          DOTNET_CLI_USE_MSBUILD_SERVER: "0"
        run: |
          llm_benchmark ci-quickfix
          llm_benchmark ci-check

      # Generate failure analysis if there are any failures
      - name: Generate failure analysis
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          llm_benchmark analyze -o docs/llms/docs-benchmark-analysis.md || true

      # Generate PR comment markdown (compares against master baseline)
      - name: Generate PR comment markdown
        run: |
          llm_benchmark ci-comment

      - name: Ensure only docs/llms changed
        run: |
          set -euo pipefail
          CHANGED="$(git diff --name-only)"

          if [ -z "$CHANGED" ]; then
            echo "No changes."
            exit 0
          fi

          if echo "$CHANGED" | grep -qvE '^docs/llms/'; then
            echo "Benchmark produced changes outside docs/llms:"
            echo "$CHANGED" | grep -vE '^docs/llms/'
            exit 1
          fi

      # Comment the benchmark results on the PR
      - name: Comment benchmark results on PR
        uses: actions/github-script@v7
        env:
          PR_NUMBER: ${{ steps.pr.outputs.number }}
        with:
          github-token: ${{ secrets.CLOCKWORK_LABS_BOT_PAT }}
          script: |
            const fs = require('fs');

            // Read the pre-generated comment markdown
            const commentPath = 'docs/llms/docs-benchmark-comment.md';
            if (!fs.existsSync(commentPath)) {
              core.setFailed(`Comment file not found: ${commentPath}`);
              return;
            }
            let body = fs.readFileSync(commentPath, 'utf8');

            // Check if failure analysis exists and append it
            const analysisPath = 'docs/llms/docs-benchmark-analysis.md';
            if (fs.existsSync(analysisPath)) {
              const analysis = fs.readFileSync(analysisPath, 'utf8');
              // Only include if there's meaningful content (not just "no failures")
              if (!analysis.includes('No failures found')) {
                body += `\n<details>\n<summary>Failure Analysis (click to expand)</summary>\n\n${analysis}\n</details>`;
              }
            }

            const issue_number = Number(process.env.PR_NUMBER);

            // Always post a new comment
            console.log(`Posting new comment on PR #${issue_number}...`);
            try {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number,
                body,
              });
              console.log('Comment created successfully');
            } catch (err) {
              console.error('Failed to post comment:', err.message);
              console.error('Full error:', JSON.stringify(err, null, 2));
              throw err;
            }

      # The benchmarks only modify the docs/llms directory.
      # Commit the changes.
      - name: Commit changes
        run: |
          git config user.name "clockwork-labs-bot"
          git config user.email "clockwork-labs-bot@users.noreply.github.com"

          # Prefer staging only the benchmark output area (adjust as needed)
          git add docs/llms

          git diff --cached --quiet && exit 0
          git commit -m "Update LLM benchmark results"

      # Here we use the https://github.com/clockwork-labs-bot user's
      # personal access token to commit back to the PR branch. This is necessary
      # if we want to be able to push back to external contributor forks.
      - name: Push back to PR branch (same repo or fork)
        env:
          GH_TOKEN: ${{ secrets.CLOCKWORK_LABS_BOT_PAT }}
        run: |
          git remote set-url origin "https://x-access-token:${GH_TOKEN}@github.com/${{ steps.pr.outputs.head_repo_full_name }}.git"
          # Fetch and rebase in case branch moved since workflow started (e.g., previous benchmark run)
          git fetch origin "${{ steps.pr.outputs.head_ref }}"
          if ! git rebase "origin/${{ steps.pr.outputs.head_ref }}"; then
            git rebase --abort
            echo "::error::Rebase failed due to conflicts. The PR branch may have been updated during the benchmark run. Please re-run /update-llm-benchmark."
            exit 1
          fi
          git push origin "HEAD:${{ steps.pr.outputs.head_ref }}"