mirror of
https://github.com/clockworklabs/SpacetimeDB.git
synced 2026-05-06 07:26:43 -04:00
LLM Benchmarking (#3486)
# Description of Changes Introduce a new **LLM benchmarking app** and supporting code. * **CLI:** `llm` with subcommands `run`, `routes list`, `diff`, `ci-check`. * **Runner:** executes globally numbered tasks; filters by `--lang`, `--categories`, `--tasks`, `--providers`, `--models`. * **Providers/clients:** route layer (`provider:model`) with HTTP LLM Vendor clients; env-driven keys/base URLs. * **Evaluation:** deterministic scorers (hash/equality, JSON shape/count, light schema/reducer parity) with clear failure messages. * **Results:** stable JSON schema; single-file HTML viewer to inspect/filter/export CSV. * **Build & guards:** build script for compile-time setup; * **Docs:** `DEVELOP.md` includes `cargo llm …` usage. This PR is the initial addition of the app and its modules (runner, config, routes, prompt/segmentation, scorers, schema/types, defaults/constants/paths/hashing/combine, publishers, spacetime guard, HTML stats viewer). ### How it works 1. **Pick what to run** * Choose tasks (`--tasks 0,7,12`), or a language (`--lang rust|csharp`), or categories (`--categories basics,schema`). * Optionally limit vendors/models (`--providers …`, `--models …`). 2. **Resolve routes** * Read env (API keys + base URLs) and build the active set (e.g., `openai:gpt-5`). 3. **Build context** * Start Spacetime * Publish golden answer modules * Prepare prompts and send to LLM model * Attempt to publish LLM module 4. **Execute calls** * Run the selected tasks within each test against selected models and languages. 5. **Score outputs** * Apply deterministic scorers (hash/equality, JSON shape/count, simple schema/reducer checks). * Record the score and any short failure reason. 6. **Update results file** * Write/update the single results JSON with task/route outcomes, timings, and summaries. # API and ABI breaking changes None. New application and modules; no existing public APIs/ABIs altered. 
# Expected complexity level and risk **4/5.** New CLI, routing, evaluation, and artifact format. * External model APIs may rate-limit/timeout; concurrency tunable via `LLM_BENCH_CONCURRENCY` / `LLM_BENCH_ROUTE_CONCURRENCY`. # Testing I ran the full test matrix and generated results for every task against every vendor, model, and language (rust + C#). I also tested the CI check locally using [act](https://github.com/nektos/act). **Please verify** * [ ] `llm run --tasks 0,1,2` (explicit `run`) * [ ] `llm run --lang rust --categories basics` (filters) * [ ] `llm run --categories basics,schema` (multiple categories) * [ ] `llm run --lang csharp` (language switch) * [ ] `llm run --providers openai,anthropic --models "openai:gpt-5 anthropic:claude-sonnet-4-5"` (provider/model limits) * [ ] `llm run --hash-only` (dry integrity) * [ ] `llm run --goldens-only` (test goldens only) * [ ] `llm run --force` (skip hash check) * [ ] `llm ci-check` * [ ] Stats viewer loads the JSON; filtering and CSV export work * [ ] CI works as intended --------- Signed-off-by: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Signed-off-by: Tyler Cloutier <cloutiertyler@users.noreply.github.com> Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Co-authored-by: Tyler Cloutier <cloutiertyler@aol.com> Co-authored-by: Tyler Cloutier <cloutiertyler@users.noreply.github.com> Co-authored-by: spacetimedb-bot <spacetimedb-bot@users.noreply.github.com> Co-authored-by: John Detter <4099508+jdetter@users.noreply.github.com>
This commit is contained in:
@@ -3,6 +3,7 @@ rustflags = ["--cfg", "tokio_unstable"]
|
||||
|
||||
[alias]
|
||||
bump-versions = "run -p upgrade-version --"
|
||||
llm = "run --package xtask-llm-benchmark --bin llm_benchmark --"
|
||||
ci = "run -p ci --"
|
||||
|
||||
[target.x86_64-pc-windows-msvc]
|
||||
|
||||
+149
-5
@@ -7,9 +7,9 @@ on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
pr_number:
|
||||
description: 'Pull Request Number'
|
||||
description: "Pull Request Number"
|
||||
required: false
|
||||
default: ''
|
||||
default: ""
|
||||
|
||||
name: CI
|
||||
|
||||
@@ -19,6 +19,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
docker_smoketests:
|
||||
needs: [lints, llm_ci_check]
|
||||
name: Smoketests
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -108,7 +109,7 @@ jobs:
|
||||
dotnet workload config --update-mode manifests
|
||||
dotnet workload update
|
||||
- uses: actions/setup-python@v5
|
||||
with: { python-version: '3.12' }
|
||||
with: { python-version: "3.12" }
|
||||
if: runner.os == 'Windows'
|
||||
- name: Install python deps
|
||||
run: python -m pip install -r smoketests/requirements.txt
|
||||
@@ -120,6 +121,7 @@ jobs:
|
||||
run: docker compose -f .github/docker-compose.yml down
|
||||
|
||||
test:
|
||||
needs: [lints, llm_ci_check]
|
||||
name: Test Suite
|
||||
runs-on: spacetimedb-new-runner
|
||||
container:
|
||||
@@ -500,7 +502,22 @@ jobs:
|
||||
run: |
|
||||
cargo ci cli-docs
|
||||
|
||||
llm_ci_check:
|
||||
name: Verify LLM benchmark is up to date
|
||||
permissions:
|
||||
contents: read
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: dtolnay/rust-toolchain@stable
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
|
||||
- name: Run hash check (both langs)
|
||||
run: cargo llm ci-check
|
||||
|
||||
unity-testsuite:
|
||||
needs: [lints, llm_ci_check]
|
||||
# Skip if this is an external contribution.
|
||||
# The license secrets will be empty, so the step would fail anyway.
|
||||
if: ${{ github.event_name != 'pull_request' || !github.event.pull_request.head.repo.fork }}
|
||||
@@ -585,7 +602,7 @@ jobs:
|
||||
enable_pr_comment: ${{ github.event_name == 'pull_request' }}
|
||||
target_path: sdks/csharp
|
||||
env:
|
||||
GITHUB_TOKEN: '${{ secrets.GITHUB_TOKEN }}'
|
||||
GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
|
||||
|
||||
- name: Start SpacetimeDB
|
||||
run: |
|
||||
@@ -624,13 +641,14 @@ jobs:
|
||||
githubToken: ${{ secrets.GITHUB_TOKEN }}
|
||||
testMode: playmode
|
||||
useHostNetwork: true
|
||||
artifactsPath: ''
|
||||
artifactsPath: ""
|
||||
env:
|
||||
UNITY_EMAIL: ${{ secrets.UNITY_EMAIL }}
|
||||
UNITY_PASSWORD: ${{ secrets.UNITY_PASSWORD }}
|
||||
UNITY_SERIAL: ${{ secrets.UNITY_SERIAL }}
|
||||
|
||||
csharp-testsuite:
|
||||
needs: [lints, llm_ci_check]
|
||||
runs-on: spacetimedb-new-runner
|
||||
container:
|
||||
image: localhost:5000/spacetimedb-ci:latest
|
||||
@@ -737,3 +755,129 @@ jobs:
|
||||
echo 'Error: Bindings are dirty. Please run `sdks/csharp/tools~/gen-regression-tests.sh`.'
|
||||
exit 1
|
||||
}
|
||||
|
||||
internal-tests:
|
||||
name: Internal Tests
|
||||
needs: [lints, llm_ci_check]
|
||||
# Skip if not a PR or a push to master
|
||||
# Skip if this is an external contribution. GitHub secrets will be empty, so the step would fail anyway.
|
||||
if: ${{ (github.event_name == 'pull_request' || (github.event_name == 'push' && github.ref == 'refs/heads/master'))
|
||||
&& (github.event_name != 'pull_request' || !github.event.pull_request.head.repo.fork) }}
|
||||
permissions:
|
||||
contents: read
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
TARGET_OWNER: clockworklabs
|
||||
TARGET_REPO: SpacetimeDBPrivate
|
||||
steps:
|
||||
- id: dispatch
|
||||
name: Trigger tests
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
github-token: ${{ secrets.SPACETIMEDB_PRIVATE_TOKEN }}
|
||||
script: |
|
||||
const workflowId = 'ci.yml';
|
||||
const targetRef = 'master';
|
||||
const targetOwner = process.env.TARGET_OWNER;
|
||||
const targetRepo = process.env.TARGET_REPO;
|
||||
// Use the ref for pull requests because the head sha is brittle (github does some extra dance where it merges in master).
|
||||
const publicRef = (context.eventName === 'pull_request') ? context.payload.pull_request.head.ref : context.sha;
|
||||
const preDispatch = new Date().toISOString();
|
||||
|
||||
// Dispatch the workflow in the target repository
|
||||
await github.rest.actions.createWorkflowDispatch({
|
||||
owner: targetOwner,
|
||||
repo: targetRepo,
|
||||
workflow_id: workflowId,
|
||||
ref: targetRef,
|
||||
inputs: { public_ref: publicRef }
|
||||
});
|
||||
|
||||
const sleep = (ms) => new Promise(r => setTimeout(r, ms));
|
||||
|
||||
// Find the dispatched run by name
|
||||
let runId = null;
|
||||
for (let attempt = 0; attempt < 20 && !runId; attempt++) { // up to ~10 minutes to locate the run
|
||||
await sleep(5000);
|
||||
const runsResp = await github.rest.actions.listWorkflowRuns({
|
||||
owner: targetOwner,
|
||||
repo: targetRepo,
|
||||
workflow_id: workflowId,
|
||||
event: 'workflow_dispatch',
|
||||
branch: targetRef,
|
||||
per_page: 50,
|
||||
});
|
||||
|
||||
const expectedName = `CI [public_ref=${publicRef}]`;
|
||||
const candidates = runsResp.data.workflow_runs
|
||||
.filter(r => r.name === expectedName && new Date(r.created_at) >= new Date(preDispatch))
|
||||
.sort((a, b) => new Date(b.created_at) - new Date(a.created_at));
|
||||
|
||||
if (candidates.length > 0) {
|
||||
runId = candidates[0].id;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!runId) {
|
||||
core.setFailed('Failed to locate dispatched run in the private repository.');
|
||||
return;
|
||||
}
|
||||
|
||||
const runUrl = `https://github.com/${targetOwner}/${targetRepo}/actions/runs/${runId}`;
|
||||
core.info(`View run: ${runUrl}`);
|
||||
core.setOutput('run_id', String(runId));
|
||||
core.setOutput('run_url', runUrl);
|
||||
|
||||
- name: Wait for Internal Tests to complete
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
github-token: ${{ secrets.SPACETIMEDB_PRIVATE_TOKEN }}
|
||||
script: |
|
||||
const targetOwner = process.env.TARGET_OWNER;
|
||||
const targetRepo = process.env.TARGET_REPO;
|
||||
const runId = Number(`${{ steps.dispatch.outputs.run_id }}`);
|
||||
const runUrl = `${{ steps.dispatch.outputs.run_url }}`;
|
||||
const sleep = (ms) => new Promise(r => setTimeout(r, ms));
|
||||
|
||||
core.info(`Waiting for workflow result... ${runUrl}`);
|
||||
|
||||
let conclusion = null;
|
||||
for (let attempt = 0; attempt < 240; attempt++) { // up to ~2 hours
|
||||
const runResp = await github.rest.actions.getWorkflowRun({
|
||||
owner: targetOwner,
|
||||
repo: targetRepo,
|
||||
run_id: runId
|
||||
});
|
||||
const { status, conclusion: c } = runResp.data;
|
||||
if (status === 'completed') {
|
||||
conclusion = c || 'success';
|
||||
break;
|
||||
}
|
||||
await sleep(30000);
|
||||
}
|
||||
|
||||
if (!conclusion) {
|
||||
core.setFailed('Timed out waiting for private workflow to complete.');
|
||||
return;
|
||||
}
|
||||
|
||||
if (conclusion !== 'success') {
|
||||
core.setFailed(`Private workflow failed with conclusion: ${conclusion}`);
|
||||
}
|
||||
|
||||
- name: Cancel invoked run if workflow cancelled
|
||||
if: ${{ cancelled() }}
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
github-token: ${{ secrets.SPACETIMEDB_PRIVATE_TOKEN }}
|
||||
script: |
|
||||
const targetOwner = process.env.TARGET_OWNER;
|
||||
const targetRepo = process.env.TARGET_REPO;
|
||||
const runId = Number(`${{ steps.dispatch.outputs.run_id }}`);
|
||||
if (!runId) return;
|
||||
await github.rest.actions.cancelWorkflowRun({
|
||||
owner: targetOwner,
|
||||
repo: targetRepo,
|
||||
run_id: runId,
|
||||
});
|
||||
|
||||
@@ -1,148 +0,0 @@
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
name: Internal Tests
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || format('sha-{0}', github.sha) }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
run-tests:
|
||||
# Skip if this is an external contribution. GitHub secrets will be empty, so the step would fail anyway.
|
||||
if: ${{ github.event_name != 'pull_request' || !github.event.pull_request.head.repo.fork }}
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
TARGET_OWNER: clockworklabs
|
||||
TARGET_REPO: SpacetimeDBPrivate
|
||||
steps:
|
||||
- id: dispatch
|
||||
name: Trigger tests
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
github-token: ${{ secrets.SPACETIMEDB_PRIVATE_TOKEN }}
|
||||
script: |
|
||||
const workflowId = 'ci.yml';
|
||||
const targetRef = 'master';
|
||||
const targetOwner = process.env.TARGET_OWNER;
|
||||
const targetRepo = process.env.TARGET_REPO;
|
||||
// Use the ref for pull requests because the head sha is brittle (github does some extra dance where it merges in master).
|
||||
const publicRef = (context.eventName === 'pull_request') ? context.payload.pull_request.head.ref : context.sha;
|
||||
const preDispatch = new Date().toISOString();
|
||||
|
||||
console.log("context.eventName =", context.eventName);
|
||||
console.log("context.ref =", context.ref);
|
||||
console.log("context.sha =", context.sha);
|
||||
console.log("Dispatch workflow with publicRef =", publicRef);
|
||||
|
||||
// Dispatch the workflow in the target repository
|
||||
await github.rest.actions.createWorkflowDispatch({
|
||||
owner: targetOwner,
|
||||
repo: targetRepo,
|
||||
workflow_id: workflowId,
|
||||
ref: targetRef,
|
||||
inputs: { public_ref: publicRef }
|
||||
});
|
||||
|
||||
const sleep = (ms) => new Promise(r => setTimeout(r, ms));
|
||||
|
||||
// Find the dispatched run by name
|
||||
let runId = null;
|
||||
for (let attempt = 0; attempt < 20 && !runId; attempt++) { // up to ~10 minutes to locate the run
|
||||
await sleep(5000);
|
||||
const runsResp = await github.rest.actions.listWorkflowRuns({
|
||||
owner: targetOwner,
|
||||
repo: targetRepo,
|
||||
workflow_id: workflowId,
|
||||
event: 'workflow_dispatch',
|
||||
branch: targetRef,
|
||||
per_page: 50,
|
||||
});
|
||||
|
||||
const expectedName = `CI [public_ref=${publicRef}]`;
|
||||
const candidates = runsResp.data.workflow_runs
|
||||
.filter(r => r.name === expectedName && new Date(r.created_at) >= new Date(preDispatch))
|
||||
.sort((a, b) => new Date(b.created_at) - new Date(a.created_at));
|
||||
|
||||
if (candidates.length > 0) {
|
||||
runId = candidates[0].id;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!runId) {
|
||||
core.setFailed('Failed to locate dispatched run in the private repository.');
|
||||
return;
|
||||
}
|
||||
|
||||
// Provide direct link and context prior to waiting
|
||||
const runUrl = `https://github.com/${targetOwner}/${targetRepo}/actions/runs/${runId}`;
|
||||
core.info(`View run: ${runUrl}`);
|
||||
|
||||
core.setOutput('run_id', String(runId));
|
||||
core.setOutput('run_url', runUrl);
|
||||
|
||||
- name: Wait for Internal Tests to complete
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
github-token: ${{ secrets.SPACETIMEDB_PRIVATE_TOKEN }}
|
||||
script: |
|
||||
const targetOwner = process.env.TARGET_OWNER;
|
||||
const targetRepo = process.env.TARGET_REPO;
|
||||
const runId = Number(`${{ steps.dispatch.outputs.run_id }}`);
|
||||
const runUrl = `${{ steps.dispatch.outputs.run_url }}`;
|
||||
const sleep = (ms) => new Promise(r => setTimeout(r, ms));
|
||||
|
||||
core.info(`Waiting for workflow result... ${runUrl}`);
|
||||
|
||||
let conclusion = null;
|
||||
for (let attempt = 0; attempt < 240; attempt++) { // up to ~2 hours
|
||||
const runResp = await github.rest.actions.getWorkflowRun({
|
||||
owner: targetOwner,
|
||||
repo: targetRepo,
|
||||
run_id: runId
|
||||
});
|
||||
const { status, conclusion: c } = runResp.data;
|
||||
if (status === 'completed') {
|
||||
conclusion = c || 'success';
|
||||
break;
|
||||
}
|
||||
await sleep(30000);
|
||||
}
|
||||
|
||||
if (!conclusion) {
|
||||
core.setFailed('Timed out waiting for private workflow to complete.');
|
||||
return;
|
||||
}
|
||||
|
||||
core.info(`Private workflow conclusion: ${conclusion}`);
|
||||
if (conclusion !== 'success') {
|
||||
core.setFailed(`Private workflow failed with conclusion: ${conclusion}`);
|
||||
}
|
||||
|
||||
- name: Cancel invoked run if workflow cancelled
|
||||
if: ${{ cancelled() }}
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
github-token: ${{ secrets.SPACETIMEDB_PRIVATE_TOKEN }}
|
||||
script: |
|
||||
const targetOwner = process.env.TARGET_OWNER;
|
||||
const targetRepo = process.env.TARGET_REPO;
|
||||
const runId = Number(`${{ steps.dispatch.outputs.run_id }}`);
|
||||
if (!runId) {
|
||||
core.warning('No run_id available to cancel.');
|
||||
return;
|
||||
}
|
||||
core.info(`Cancelling private workflow run ${runId} in ${targetOwner}/${targetRepo}...`);
|
||||
await github.rest.actions.cancelWorkflowRun({
|
||||
owner: targetOwner,
|
||||
repo: targetRepo,
|
||||
run_id: runId,
|
||||
});
|
||||
core.info('Cancellation requested.');
|
||||
@@ -0,0 +1,320 @@
|
||||
name: Update LLM benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
pr_number:
|
||||
description: "Pull Request Number"
|
||||
required: true
|
||||
issue_comment:
|
||||
types: [created] # only run when the comment is first created
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
issues: write
|
||||
|
||||
concurrency:
|
||||
group: llm-benchmark-${{ github.event_name == 'issue_comment' && github.event.issue.number || inputs.pr_number }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
update-llm-benchmark:
|
||||
# Runnable either with a comment that starts with /update-llm-benchmark
|
||||
# or by manually dispatching
|
||||
if: |
|
||||
(github.event_name == 'issue_comment' && github.event.issue.pull_request && startsWith(github.event.comment.body, '/update-llm-benchmark')) ||
|
||||
(github.event_name == 'workflow_dispatch')
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
# Here we install the spacetime CLI for faster execution of the tests
|
||||
# SpacetimeDB itself is not under test here, rather it's the docs.
|
||||
# If we want to change that it is possible to have the benchmark compile
|
||||
# SpacetimeDB from source.
|
||||
- name: Install spacetime CLI
|
||||
run: curl -sSf https://install.spacetimedb.com | sh -s -- -y
|
||||
|
||||
- name: Load PR info
|
||||
id: pr
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
script: |
|
||||
let prNumber;
|
||||
if (context.eventName === 'issue_comment') {
|
||||
prNumber = context.payload.issue.number;
|
||||
} else if (context.eventName === 'workflow_dispatch') {
|
||||
const raw = context.payload.inputs?.pr_number;
|
||||
if (!raw || !/^\d+$/.test(raw)) {
|
||||
core.setFailed(`Invalid pr_number input: '${raw}'.`);
|
||||
return;
|
||||
}
|
||||
prNumber = Number(raw);
|
||||
} else {
|
||||
core.setFailed(`Unsupported event: ${context.eventName}`);
|
||||
return;
|
||||
}
|
||||
|
||||
const { data: pr } = await github.rest.pulls.get({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
pull_number: prNumber,
|
||||
});
|
||||
|
||||
core.setOutput('number', String(prNumber));
|
||||
core.setOutput('head_ref', pr.head.ref);
|
||||
core.setOutput('head_sha', pr.head.sha);
|
||||
core.setOutput('head_repo_full_name', pr.head.repo.full_name);
|
||||
core.setOutput('head_owner_type', pr.head.repo.owner.type); // "User"|"Organization"
|
||||
core.setOutput('maintainer_can_modify', String(pr.maintainer_can_modify));
|
||||
|
||||
# If this was kicked off by a comment, ensure that the commenter is
|
||||
# a collaborator on the repo. We don't want unprivileged users to run benchmarks.
|
||||
# Note that the workflow that will be run will be the one that is on the `master`
|
||||
# branch, NOT the one from the PR. This is important so that the PR author can't
|
||||
# sneak in an exfiltration exploit.
|
||||
- name: Check commenter permission
|
||||
if: github.event_name == 'issue_comment'
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
script: |
|
||||
const user = context.payload.comment.user.login;
|
||||
const { data } = await github.rest.repos.getCollaboratorPermissionLevel({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
username: user,
|
||||
});
|
||||
|
||||
const allowed = new Set(['admin', 'maintain', 'write', 'triage']);
|
||||
if (!allowed.has(data.permission)) {
|
||||
core.setFailed(`User ${user} has permission '${data.permission}', not allowed to run benchmarks.`);
|
||||
}
|
||||
|
||||
# If the PR is from a fork, we need to be able to have GitHub actions commit back
|
||||
# to the forked repo, so that we can update the benchmark results.
|
||||
# In order to do this we need to ensure that the PR is configured to allow the maintainers
|
||||
# of the SpacetimeDB repo to commit back ot the fork.
|
||||
- name: Check fork pushability (and comment if not)
|
||||
if: steps.pr.outputs.head_repo_full_name != github.repository
|
||||
uses: actions/github-script@v7
|
||||
env:
|
||||
PR_NUMBER: ${{ steps.pr.outputs.number }}
|
||||
HEAD_OWNER_TYPE: ${{ steps.pr.outputs.head_owner_type }}
|
||||
MAINTAINER_CAN_MODIFY: ${{ steps.pr.outputs.maintainer_can_modify }}
|
||||
with:
|
||||
script: |
|
||||
const issue_number = Number(process.env.PR_NUMBER);
|
||||
const headOwnerType = process.env.HEAD_OWNER_TYPE;
|
||||
const canModify = process.env.MAINTAINER_CAN_MODIFY === 'true';
|
||||
|
||||
if (headOwnerType === 'Organization') {
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number,
|
||||
body: [
|
||||
"I can’t push benchmark updates to this PR because it comes from an **organization-owned fork**.",
|
||||
"GitHub doesn’t allow granting upstream maintainers push permissions to org-owned forks.",
|
||||
"",
|
||||
"Options:",
|
||||
"- Reopen the PR from a **personal fork** with **Allow edits from maintainers** enabled, or",
|
||||
"- A maintainer can apply the benchmark update on an internal branch."
|
||||
].join("\n"),
|
||||
});
|
||||
core.setFailed("Org-owned fork PR is not pushable by maintainers.");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!canModify) {
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number,
|
||||
body: [
|
||||
"I can’t push benchmark updates to this PR branch until you enable **Allow edits from maintainers**.",
|
||||
"Please check the box on the PR page, then re-comment `/update-llm-benchmark`.",
|
||||
"See https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/allowing-changes-to-a-pull-request-branch-created-from-a-fork"
|
||||
].join("\n"),
|
||||
});
|
||||
core.setFailed("maintainer_can_modify is false; author must enable 'Allow edits from maintainers'.");
|
||||
}
|
||||
|
||||
# Run the benchmark that is already checked into master to prevent
|
||||
# an exfiltration attack whereby the PR author tries to sneak in an exploit
|
||||
# and get a maintainer to run the modified benchmark without looking at the
|
||||
# PR first. This ensure that we only ever execute code that is checked into
|
||||
# master.
|
||||
- name: Checkout master (build/install tool from trusted code)
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: master
|
||||
fetch-depth: 0
|
||||
persist-credentials: false
|
||||
|
||||
- uses: dtolnay/rust-toolchain@stable
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
|
||||
# Ensure we use a user-writable .NET install (not /usr/share/dotnet),
|
||||
# so workload installs don't require sudo.
|
||||
- name: Setup .NET SDK
|
||||
uses: actions/setup-dotnet@v4
|
||||
with:
|
||||
dotnet-version: "8.0.x"
|
||||
|
||||
- name: Install WASI workload (wasi-experimental)
|
||||
env:
|
||||
DOTNET_MULTILEVEL_LOOKUP: "0"
|
||||
DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home
|
||||
DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1"
|
||||
run: |
|
||||
dotnet --info
|
||||
dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel
|
||||
|
||||
- name: Install llm-benchmark tool from master
|
||||
run: |
|
||||
cargo install --path tools/xtask-llm-benchmark --locked
|
||||
command -v llm_benchmark
|
||||
|
||||
# Check out the repo on the branch, but ONLY use this code as data!
|
||||
# Never execute code that is on the PR branch.
|
||||
- name: Checkout PR head (branch)
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
repository: ${{ steps.pr.outputs.head_repo_full_name }}
|
||||
ref: ${{ steps.pr.outputs.head_sha }}
|
||||
fetch-depth: 0
|
||||
persist-credentials: false
|
||||
|
||||
# Run the benchmark against the PR using the installed tool from the
|
||||
# master branch.
|
||||
- name: Run benchmark (with provider keys)
|
||||
env:
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
# Prevent MSBuild node reuse issues that cause "Pipe is broken" errors
|
||||
# when running multiple dotnet publish commands in parallel.
|
||||
# See: https://github.com/dotnet/msbuild/issues/6657
|
||||
MSBUILDDISABLENODEREUSE: "1"
|
||||
DOTNET_CLI_USE_MSBUILD_SERVER: "0"
|
||||
run: |
|
||||
llm_benchmark ci-quickfix
|
||||
llm_benchmark ci-check
|
||||
|
||||
- name: Ensure only docs/llms changed
|
||||
run: |
|
||||
set -euo pipefail
|
||||
CHANGED="$(git diff --name-only)"
|
||||
|
||||
if [ -z "$CHANGED" ]; then
|
||||
echo "No changes."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if echo "$CHANGED" | grep -qvE '^docs/llms/'; then
|
||||
echo "Benchmark produced changes outside docs/llms:"
|
||||
echo "$CHANGED" | grep -vE '^docs/llms/'
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Comment the benchmark results on the PR
|
||||
- name: Comment benchmark results on PR
|
||||
uses: actions/github-script@v7
|
||||
env:
|
||||
PR_NUMBER: ${{ steps.pr.outputs.number }}
|
||||
with:
|
||||
github-token: ${{ secrets.CLOCKWORK_LABS_BOT_PAT }}
|
||||
script: |
|
||||
const fs = require('fs');
|
||||
// docs-benchmark files are used for CI (testing documentation quality)
|
||||
const summaryPath = 'docs/llms/docs-benchmark-summary.json';
|
||||
const summary = JSON.parse(fs.readFileSync(summaryPath, 'utf8'));
|
||||
|
||||
// Extract results for the modes checked by ci-check
|
||||
// Rust: rustdoc_json, C#: docs
|
||||
const rustResults = summary.by_language?.rust?.modes?.rustdoc_json?.models?.['GPT-5'];
|
||||
const csharpResults = summary.by_language?.csharp?.modes?.docs?.models?.['GPT-5'];
|
||||
|
||||
const formatPct = (val) => val !== undefined ? `${val.toFixed(1)}%` : 'N/A';
|
||||
|
||||
let table = `## LLM Benchmark Results (ci-quickfix)\n\n`;
|
||||
table += `| Language | Mode | Category | Tests Passed | Pass % | Task Pass % |\n`;
|
||||
table += `|----------|------|----------|--------------|--------|-------------|\n`;
|
||||
|
||||
if (rustResults) {
|
||||
const cats = rustResults.categories || {};
|
||||
if (cats.basics) {
|
||||
const c = cats.basics;
|
||||
table += `| Rust | rustdoc_json | basics | ${c.passed_tests}/${c.total_tests} | ${formatPct(c.pass_pct)} | ${formatPct(c.task_pass_pct)} |\n`;
|
||||
}
|
||||
if (cats.schema) {
|
||||
const c = cats.schema;
|
||||
table += `| Rust | rustdoc_json | schema | ${c.passed_tests}/${c.total_tests} | ${formatPct(c.pass_pct)} | ${formatPct(c.task_pass_pct)} |\n`;
|
||||
}
|
||||
const t = rustResults.totals;
|
||||
table += `| Rust | rustdoc_json | **total** | ${t.passed_tests}/${t.total_tests} | ${formatPct(t.pass_pct)} | ${formatPct(t.task_pass_pct)} |\n`;
|
||||
}
|
||||
|
||||
if (csharpResults) {
|
||||
const cats = csharpResults.categories || {};
|
||||
if (cats.basics) {
|
||||
const c = cats.basics;
|
||||
table += `| C# | docs | basics | ${c.passed_tests}/${c.total_tests} | ${formatPct(c.pass_pct)} | ${formatPct(c.task_pass_pct)} |\n`;
|
||||
}
|
||||
if (cats.schema) {
|
||||
const c = cats.schema;
|
||||
table += `| C# | docs | schema | ${c.passed_tests}/${c.total_tests} | ${formatPct(c.pass_pct)} | ${formatPct(c.task_pass_pct)} |\n`;
|
||||
}
|
||||
const t = csharpResults.totals;
|
||||
table += `| C# | docs | **total** | ${t.passed_tests}/${t.total_tests} | ${formatPct(t.pass_pct)} | ${formatPct(t.task_pass_pct)} |\n`;
|
||||
}
|
||||
|
||||
table += `\n<sub>Generated at: ${summary.generated_at}</sub>`;
|
||||
|
||||
const issue_number = Number(process.env.PR_NUMBER);
|
||||
|
||||
// Find and update existing comment or create new one
|
||||
const comments = await github.rest.issues.listComments({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number,
|
||||
});
|
||||
|
||||
const marker = '## LLM Benchmark Results (ci-quickfix)';
|
||||
const existingComment = comments.data.find(c => c.body.startsWith(marker));
|
||||
|
||||
if (existingComment) {
|
||||
await github.rest.issues.updateComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
comment_id: existingComment.id,
|
||||
body: table,
|
||||
});
|
||||
} else {
|
||||
await github.rest.issues.createComment({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
issue_number,
|
||||
body: table,
|
||||
});
|
||||
}
|
||||
|
||||
# The benchmarks only modify the docs/llms directory.
|
||||
# Commit the changes.
|
||||
- name: Commit changes
|
||||
run: |
|
||||
git config user.name "spacetimedb-bot"
|
||||
git config user.email "spacetimedb-bot@users.noreply.github.com"
|
||||
|
||||
# Prefer staging only the benchmark output area (adjust as needed)
|
||||
git add docs/llms
|
||||
|
||||
git diff --cached --quiet && exit 0
|
||||
git commit -m "Update LLM benchmark results"
|
||||
|
||||
# Here we use the https://github.com/clockwork-labs-bot user's
|
||||
# personal access token to commit back to the PR branch. This is necessary
|
||||
# if we want to be able to push back to external contributor forks.
|
||||
- name: Push back to PR branch (same repo or fork)
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CLOCKWORK_LABS_BOT_PAT }}
|
||||
run: |
|
||||
git remote set-url origin "https://x-access-token:${GH_TOKEN}@github.com/${{ steps.pr.outputs.head_repo_full_name }}.git"
|
||||
git push origin "HEAD:${{ steps.pr.outputs.head_ref }}"
|
||||
Generated
+33
-1
@@ -7401,7 +7401,6 @@ dependencies = [
|
||||
"names",
|
||||
"notify 7.0.0",
|
||||
"percent-encoding",
|
||||
"portpicker",
|
||||
"predicates",
|
||||
"pretty_assertions",
|
||||
"quick-xml 0.31.0",
|
||||
@@ -7419,6 +7418,7 @@ dependencies = [
|
||||
"spacetimedb-codegen",
|
||||
"spacetimedb-data-structures",
|
||||
"spacetimedb-fs-utils",
|
||||
"spacetimedb-guard",
|
||||
"spacetimedb-jsonwebtoken",
|
||||
"spacetimedb-lib 1.11.1",
|
||||
"spacetimedb-paths",
|
||||
@@ -7814,6 +7814,15 @@ dependencies = [
|
||||
"zstd-framed",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "spacetimedb-guard"
|
||||
version = "1.11.1"
|
||||
dependencies = [
|
||||
"portpicker",
|
||||
"reqwest 0.12.24",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "spacetimedb-jsonwebtoken"
|
||||
version = "9.3.0"
|
||||
@@ -10903,6 +10912,29 @@ dependencies = [
|
||||
"xml-rs",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xtask-llm-benchmark"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"blake3",
|
||||
"chrono",
|
||||
"clap 4.5.50",
|
||||
"fs2",
|
||||
"futures",
|
||||
"heck 0.5.0",
|
||||
"reqwest 0.12.24",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"spacetimedb 1.11.1",
|
||||
"spacetimedb-guard",
|
||||
"tempfile",
|
||||
"thiserror 2.0.17",
|
||||
"tokio",
|
||||
"urlencoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xxhash-rust"
|
||||
version = "0.8.15"
|
||||
|
||||
@@ -15,6 +15,7 @@ members = [
|
||||
"crates/durability",
|
||||
"crates/execution",
|
||||
"crates/expr",
|
||||
"crates/guard",
|
||||
"crates/fs-utils",
|
||||
"crates/lib",
|
||||
"crates/metrics",
|
||||
@@ -56,6 +57,7 @@ members = [
|
||||
"tools/replace-spacetimedb",
|
||||
"tools/generate-client-api",
|
||||
"tools/gen-bindings",
|
||||
"tools/xtask-llm-benchmark",
|
||||
"crates/bindings-typescript/test-app/server",
|
||||
"crates/bindings-typescript/test-react-router-app/server",
|
||||
]
|
||||
@@ -118,6 +120,7 @@ spacetimedb-datastore = { path = "crates/datastore", version = "=1.11.1" }
|
||||
spacetimedb-durability = { path = "crates/durability", version = "=1.11.1" }
|
||||
spacetimedb-execution = { path = "crates/execution", version = "=1.11.1" }
|
||||
spacetimedb-expr = { path = "crates/expr", version = "=1.11.1" }
|
||||
spacetimedb-guard = { path = "crates/guard", version = "=1.11.1" }
|
||||
spacetimedb-lib = { path = "crates/lib", default-features = false, version = "=1.11.1" }
|
||||
spacetimedb-memory-usage = { path = "crates/memory-usage", version = "=1.11.1", default-features = false }
|
||||
spacetimedb-metrics = { path = "crates/metrics", version = "=1.11.1" }
|
||||
|
||||
@@ -89,8 +89,7 @@ pretty_assertions.workspace = true
|
||||
fs_extra.workspace = true
|
||||
assert_cmd = "2"
|
||||
predicates = "3"
|
||||
portpicker = "0.1"
|
||||
reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||
spacetimedb-guard.workspace = true
|
||||
|
||||
[target.'cfg(not(target_env = "msvc"))'.dependencies]
|
||||
tikv-jemallocator = { workspace = true }
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
mod util;
|
||||
|
||||
use crate::util::SpacetimeDbGuard;
|
||||
use assert_cmd::cargo::cargo_bin_cmd;
|
||||
use spacetimedb_guard::SpacetimeDbGuard;
|
||||
|
||||
#[test]
|
||||
fn cli_can_publish_spacetimedb_on_disk() {
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
mod util;
|
||||
|
||||
use crate::util::SpacetimeDbGuard;
|
||||
use assert_cmd::cargo::cargo_bin_cmd;
|
||||
use spacetimedb_guard::SpacetimeDbGuard;
|
||||
|
||||
#[test]
|
||||
fn cli_can_ping_spacetimedb_on_disk() {
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
[package]
|
||||
name = "spacetimedb-guard"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
|
||||
[dependencies]
|
||||
portpicker = "0.1"
|
||||
reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||
tempfile.workspace = true
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#![allow(clippy::disallowed_macros)]
|
||||
|
||||
use std::{
|
||||
env,
|
||||
io::{BufRead, BufReader},
|
||||
@@ -30,10 +32,19 @@ impl SpacetimeDbGuard {
|
||||
let temp_dir = tempfile::tempdir().expect("failed to create temp dir");
|
||||
let data_dir = temp_dir.path().display().to_string();
|
||||
|
||||
Self::spawn_spacetime_start(&["start", "--data-dir", &data_dir])
|
||||
Self::spawn_spacetime_start(false, &["start", "--data-dir", &data_dir])
|
||||
}
|
||||
|
||||
fn spawn_spacetime_start(extra_args: &[&str]) -> Self {
|
||||
/// Start `spacetimedb` in a temporary data directory via:
|
||||
/// spacetime start --data-dir <temp-dir> --listen-addr <addr>
|
||||
pub fn spawn_in_temp_data_dir_use_cli() -> Self {
|
||||
let temp_dir = tempfile::tempdir().expect("failed to create temp dir");
|
||||
let data_dir = temp_dir.path().display().to_string();
|
||||
|
||||
Self::spawn_spacetime_start(true, &["start", "--data-dir", &data_dir])
|
||||
}
|
||||
|
||||
fn spawn_spacetime_start(use_installed_cli: bool, extra_args: &[&str]) -> Self {
|
||||
let port = find_free_port();
|
||||
let addr: SocketAddr = format!("127.0.0.1:{port}").parse().unwrap();
|
||||
let address = addr.to_string();
|
||||
@@ -42,13 +53,23 @@ impl SpacetimeDbGuard {
|
||||
// Workspace root for `cargo run -p ...`
|
||||
let workspace_dir = env!("CARGO_MANIFEST_DIR");
|
||||
|
||||
Self::build_prereqs(workspace_dir);
|
||||
let mut cargo_args = vec!["run", "-p", "spacetimedb-cli", "--"];
|
||||
let mut args = vec![];
|
||||
|
||||
cargo_args.extend(extra_args);
|
||||
cargo_args.extend(["--listen-addr", &address]);
|
||||
let (child, logs) = if use_installed_cli {
|
||||
args.extend_from_slice(extra_args);
|
||||
args.extend_from_slice(&["--listen-addr", &address]);
|
||||
|
||||
let (child, logs) = Self::spawn_child(workspace_dir, &cargo_args);
|
||||
let cmd = Command::new("spacetime");
|
||||
Self::spawn_child(cmd, env!("CARGO_MANIFEST_DIR"), &args)
|
||||
} else {
|
||||
Self::build_prereqs(workspace_dir);
|
||||
args.extend(vec!["run", "-p", "spacetimedb-cli", "--"]);
|
||||
args.extend(extra_args);
|
||||
args.extend(["--listen-addr", &address]);
|
||||
|
||||
let cmd = Command::new("cargo");
|
||||
Self::spawn_child(cmd, workspace_dir, &args)
|
||||
};
|
||||
|
||||
let guard = SpacetimeDbGuard { child, host_url, logs };
|
||||
guard.wait_until_http_ready(Duration::from_secs(10));
|
||||
@@ -72,8 +93,7 @@ impl SpacetimeDbGuard {
|
||||
}
|
||||
}
|
||||
|
||||
fn spawn_child(workspace_dir: &str, args: &[&str]) -> (Child, Arc<Mutex<String>>) {
|
||||
let mut cmd = Command::new("cargo");
|
||||
fn spawn_child(mut cmd: Command, workspace_dir: &str, args: &[&str]) -> (Child, Arc<Mutex<String>>) {
|
||||
let mut child = cmd
|
||||
.args(args)
|
||||
.current_dir(workspace_dir)
|
||||
+250
@@ -0,0 +1,250 @@
|
||||
# DEVELOP.md
|
||||
|
||||
This document explains how to configure the environment, run the LLM benchmark tool, and work with the benchmark suite.
|
||||
|
||||
---
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [Quick Checks & Fixes](#quick-checks--fixes)
|
||||
2. [Environment Variables](#environment-variables)
|
||||
3. [Benchmark Suite](#benchmark-suite)
|
||||
4. [Troubleshooting](#troubleshooting)
|
||||
---
|
||||
|
||||
## Quick Checks & Fixes
|
||||
|
||||
Use this single command to quickly unblock CI by regenerating hashes and running only GPT-5 for the minimal Rust + C# passes. This is not the full benchmark suite.
|
||||
|
||||
`cargo llm ci-quickfix`
|
||||
What this does:
|
||||
1. Runs Rust rustdoc_json pass for GPT-5 only.
|
||||
2. Runs C# docs pass for GPT-5 only.
|
||||
3. Writes updated results & summary.
|
||||
|
||||
---
|
||||
|
||||
> Model IDs passed to `--models` must match configured routes (see `model_routes.rs`), e.g. `"openai:gpt-5"`.
|
||||
|
||||
|
||||
### Spacetime CLI
|
||||
Publishing is performed via the `spacetime` CLI (`spacetime publish -c -y --server <name> <db>`). Ensure:
|
||||
- `spacetime` is on PATH
|
||||
- The target server is reachable/running
|
||||
|
||||
## Environment Variables
|
||||
|
||||
> These are the **defaults** and/or recommended dev values.
|
||||
|
||||
| Name | Purpose | Values / Example | Required |
|
||||
|---|---|---|---|
|
||||
| `SPACETIME_SERVER` | Target SpacetimeDB environment | `local` | ✅ |
|
||||
| `LLM_DEBUG` | Print short debug info while generating | `true` / `false` (default `true` in dev) | ✅ |
|
||||
| `LLM_DEBUG_VERBOSE` | Extra‑verbose logs (payloads, scoring detail) | `false` | ✅ |
|
||||
| `LLM_BENCH_CONCURRENCY` | Parallel task concurrency across the whole bench run | `20` | ✅ |
|
||||
| `LLM_BENCH_ROUTE_CONCURRENCY` | Per‑route concurrency (throttle per vendor/model) | `4` | ✅ |
|
||||
| `OPENAI_API_KEY` | OpenAI credential | `sk-...` | optional* |
|
||||
| `OPENAI_BASE_URL` | OpenAI-compatible base URL override | `https://api.openai.com/` | optional |
|
||||
| `ANTHROPIC_API_KEY` | Anthropic credential | `...` | optional* |
|
||||
| `ANTHROPIC_BASE_URL` | Anthropic base URL override | `https://api.anthropic.com` | optional |
|
||||
| `GOOGLE_API_KEY` | Gemini credential | `...` | optional* |
|
||||
| `GOOGLE_BASE_URL` | Gemini base URL override | `https://generativelanguage.googleapis.com` | optional |
|
||||
| `XAI_API_KEY` | xAI Grok credential | `...` | optional |
|
||||
| `DEEPSEEK_API_KEY` | DeepSeek credential | `...` | optional |
|
||||
| `META_API_KEY` | Meta Llama credential | `...` | optional* |
|
||||
|
||||
\*Required only if you plan to run that provider locally.
|
||||
|
||||
**Canonical dev block** (copy/paste into your shell profile):
|
||||
|
||||
```bash
|
||||
OPENAI_API_KEY=
|
||||
OPENAI_BASE_URL=https://api.openai.com/
|
||||
|
||||
ANTHROPIC_API_KEY=
|
||||
ANTHROPIC_BASE_URL=https://api.anthropic.com
|
||||
|
||||
GOOGLE_API_KEY=
|
||||
GOOGLE_BASE_URL=https://generativelanguage.googleapis.com
|
||||
|
||||
XAI_API_KEY=
|
||||
XAI_BASE_URL=https://api.x.ai
|
||||
|
||||
DEEPSEEK_API_KEY=
|
||||
DEEPSEEK_BASE_URL=https://api.deepseek.com
|
||||
|
||||
META_API_KEY=
|
||||
META_BASE_URL=https://openrouter.ai/api/v1
|
||||
|
||||
SPACETIME_SERVER="local"
|
||||
LLM_DEBUG=true
|
||||
LLM_DEBUG_VERBOSE=false
|
||||
LLM_BENCH_CONCURRENCY=20
|
||||
LLM_BENCH_ROUTE_CONCURRENCY=4
|
||||
```
|
||||
|
||||
Windows PowerShell:
|
||||
|
||||
```powershell
|
||||
$env:SPACETIME_SERVER="local"
|
||||
$env:LLM_DEBUG="true"
|
||||
$env:LLM_DEBUG_VERBOSE="false"
|
||||
$env:LLM_BENCH_CONCURRENCY="20"
|
||||
$env:LLM_BENCH_ROUTE_CONCURRENCY="4"
|
||||
```
|
||||
|
||||
|
||||
### LLM Providers — Keys & Base URLs
|
||||
|
||||
> Notes
|
||||
> - These match the providers wired in this repo (`OpenAiClient`, `AnthropicClient`, `GoogleGeminiClient`, `XaiGrokClient`, `DeepSeekClient`, `MetaLlamaClient`).
|
||||
|
||||
| Provider | API Key Env | Base URL Env (optional) | Default Base URL |
|
||||
|---------------|---------------------|-------------------------|---|
|
||||
| OpenAI | `OPENAI_API_KEY` | `OPENAI_BASE_URL` | `https://api.openai.com` |
|
||||
| Anthropic | `ANTHROPIC_API_KEY` | `ANTHROPIC_BASE_URL` | `https://api.anthropic.com` |
|
||||
| Google Gemini | `GOOGLE_API_KEY` | `GOOGLE_BASE_URL` | `https://generativelanguage.googleapis.com` |
|
||||
| xAI Grok | `XAI_API_KEY` | `XAI_BASE_URL` | `https://api.x.ai` |
|
||||
| DeepSeek | `DEEPSEEK_API_KEY` | `DEEPSEEK_BASE_URL` | `https://api.deepseek.com` |
|
||||
| META | `META_API_KEY` | `META_BASE_URL` | `https://openrouter.ai/api/v1` |
|
||||
|
||||
---
|
||||
|
||||
## Benchmark Suite
|
||||
|
||||
Results directory: `docs/llms`
|
||||
|
||||
### Result Files
|
||||
|
||||
There are two sets of result files, each serving a different purpose:
|
||||
|
||||
| Files | Purpose | Updated By |
|
||||
|-------|---------|------------|
|
||||
| `docs-benchmark-details.json`<br>`docs-benchmark-summary.json` | Test documentation quality with a single reference model (GPT-5) | `cargo llm ci-quickfix` |
|
||||
| `llm-comparison-details.json`<br>`llm-comparison-summary.json` | Compare all LLMs against the same documentation | `cargo llm run` |
|
||||
|
||||
- **docs-benchmark**: Used by CI to ensure documentation quality. Contains only GPT-5 results.
|
||||
- **llm-comparison**: Used for manual benchmark runs to compare LLM performance. Contains results from all configured models.
|
||||
|
||||
> Results writes are lock-safe and atomic. The tool takes an exclusive lock and writes via a temp file, then renames it, so concurrent runs won't corrupt results.
|
||||
|
||||
Open `llm_benchmark_stats_viewer.html` in a browser to inspect merged results locally.
|
||||
### Current Benchmarks
|
||||
|
||||
**basics**
|
||||
000. empty-reducers — tests whether it can create basic reducers with various arguments
|
||||
001. basic-tables — can it create tables with basic columns
|
||||
002. scheduled-table — can it create a scheduled table and reducer
|
||||
003. struct-in-table — can it put a struct in a table
|
||||
004. insert — can it insert a row
|
||||
005. update — can it update a row
|
||||
006. delete — can it delete a row
|
||||
007. crud — can it insert, update, and delete a row in the same reducer
|
||||
008. index-lookup — can it look up something from an index
|
||||
009. init — can it write the init reducer
|
||||
010. connect — can it write the client_connected/client_disconnected reducers
|
||||
011. helper-function — can it create a non-reducer helper function
|
||||
|
||||
**schema**
|
||||
012. spacetime-product-type — can it define a new spacetime product type
|
||||
013. spacetime-sum-type — can it define a new sum type
|
||||
014. elementary-columns — can it create columns with basic types
|
||||
015. product-type-columns — can it create columns with product types
|
||||
016. sum-type-columns — can it create columns with sum types
|
||||
017. scheduled — can it create scheduled columns
|
||||
018. constraints — can it add primary keys, unique constraints, and indexes
|
||||
019. many-to-many — can it create a many-to-many relationship
|
||||
020. ecs — can it create a basic ecs
|
||||
021. multi-column-index — can it create a multi-column index
|
||||
|
||||
Benchmarks live under `benchmarks/` with structure like:
|
||||
|
||||
```
|
||||
benchmarks/
|
||||
category/
|
||||
t_001_foo/
|
||||
tasks/
|
||||
rust.txt
|
||||
csharp.txt
|
||||
answers/
|
||||
rust.rs
|
||||
csharp.cs
|
||||
spec.rs # scoring config, reducer/schema checks, etc.
|
||||
```
|
||||
|
||||
### Creating a new benchmark
|
||||
|
||||
1. **Copy existing benchmark**
|
||||
- Duplicate any existing benchmark folder.
|
||||
- Bump the numeric prefix to a new, unused ID: `t_123_my_task`.
|
||||
|
||||
2. **Rename for the new task**
|
||||
- Rename the folder to your ID + short slug: `t_123_my_task`.
|
||||
|
||||
3. **Write the task prompt**
|
||||
- Create/update `tasks/rust.txt` and/or `tasks/csharp.txt`.
|
||||
- Be explicit (tables, reducers, helpers, constraints). Avoid ambiguity.
|
||||
|
||||
4. **Add golden answers**
|
||||
- Implement the canonical solution in `answers/rust.rs` and/or `answers/csharp.cs`.
|
||||
|
||||
5. **Define scoring**
|
||||
- Edit `spec.rs` to add scorers (e.g., schema/table/field checks, reducer/func exists).
|
||||
|
||||
6. **Quick validation**
|
||||
- Build goldens only:
|
||||
`cargo llm run --goldens-only --tasks t_123_my_task`
|
||||
|
||||
7. **Categorize**
|
||||
- Ensure the folder sits under the right category path.
|
||||
|
||||
|
||||
### Typical Commands
|
||||
|
||||
```bash
|
||||
# Run everything with current env (providers/models from your .env)
|
||||
cargo llm run
|
||||
|
||||
# Only Rust (or C#)
|
||||
cargo llm run --lang rust
|
||||
cargo llm run --lang csharp
|
||||
|
||||
# Only certain categories (use your actual category names)
|
||||
cargo llm run --categories basics,schema
|
||||
|
||||
# Only certain tasks by number (globally numbered)
|
||||
cargo llm run --tasks 0,7,12
|
||||
|
||||
# Limit providers/models explicitly
|
||||
cargo llm run \
|
||||
--providers openai,anthropic \
|
||||
--models "openai:gpt-5 anthropic:claude-sonnet-4-5"
|
||||
|
||||
# Dry runs
|
||||
cargo llm run --hash-only # build context only (no provider calls)
|
||||
cargo llm run --goldens-only # build/check goldens only
|
||||
|
||||
# Be aggressive (skip some safety checks)
|
||||
cargo llm run --force
|
||||
|
||||
# CI sanity check per language
|
||||
cargo llm ci-check --lang rust
|
||||
cargo llm ci-check --lang csharp
|
||||
|
||||
```
|
||||
|
||||
Outputs:
|
||||
- Logs to stdout/stderr (respecting `LLM_DEBUG`/`LLM_DEBUG_VERBOSE`).
|
||||
- JSON results in a per‑run folder (timestamped), merged into aggregate reports.
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**HTTP 400/404 from providers**
|
||||
- Check the model ID spelling and whether it’s available for your account/region.
|
||||
- Verify the correct base URL for non-default gateways.
|
||||
|
||||
**Timeouts / Rate-limits**
|
||||
- Lower `LLM_BENCH_CONCURRENCY` or `LLM_BENCH_ROUTE_CONCURRENCY`.
|
||||
- Some providers aggressively throttle bursts; use backoff/retry when supported.
|
||||
File diff suppressed because one or more lines are too long
@@ -0,0 +1,80 @@
|
||||
{
|
||||
"version": 1,
|
||||
"generated_at": "2026-01-06T00:39:43.087Z",
|
||||
"by_language": {
|
||||
"csharp": {
|
||||
"modes": {
|
||||
"docs": {
|
||||
"hash": "6f75b0c555fd7577df52872b447d9237496b7f004485c56464181b8bfb7834de",
|
||||
"models": {
|
||||
"GPT-5": {
|
||||
"categories": {
|
||||
"basics": {
|
||||
"tasks": 12,
|
||||
"total_tests": 27,
|
||||
"passed_tests": 27,
|
||||
"pass_pct": 100.0,
|
||||
"task_pass_equiv": 12.0,
|
||||
"task_pass_pct": 100.0
|
||||
},
|
||||
"schema": {
|
||||
"tasks": 10,
|
||||
"total_tests": 34,
|
||||
"passed_tests": 31,
|
||||
"pass_pct": 91.17647,
|
||||
"task_pass_equiv": 9.0,
|
||||
"task_pass_pct": 90.0
|
||||
}
|
||||
},
|
||||
"totals": {
|
||||
"tasks": 22,
|
||||
"total_tests": 61,
|
||||
"passed_tests": 58,
|
||||
"pass_pct": 95.08197,
|
||||
"task_pass_equiv": 21.0,
|
||||
"task_pass_pct": 95.454544
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"rust": {
|
||||
"modes": {
|
||||
"rustdoc_json": {
|
||||
"hash": "9bb229f6cfd63a9477451e127576b9bd378ec1087b9f320e5a8576b415e021b0",
|
||||
"models": {
|
||||
"GPT-5": {
|
||||
"categories": {
|
||||
"basics": {
|
||||
"tasks": 12,
|
||||
"total_tests": 27,
|
||||
"passed_tests": 20,
|
||||
"pass_pct": 74.07407,
|
||||
"task_pass_equiv": 9.0,
|
||||
"task_pass_pct": 75.0
|
||||
},
|
||||
"schema": {
|
||||
"tasks": 10,
|
||||
"total_tests": 34,
|
||||
"passed_tests": 23,
|
||||
"pass_pct": 67.64706,
|
||||
"task_pass_equiv": 6.0,
|
||||
"task_pass_pct": 60.000004
|
||||
}
|
||||
},
|
||||
"totals": {
|
||||
"tasks": 22,
|
||||
"total_tests": 61,
|
||||
"passed_tests": 43,
|
||||
"pass_pct": 70.491806,
|
||||
"task_pass_equiv": 15.000001,
|
||||
"task_pass_pct": 68.18182
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,36 @@
|
||||
[package]
|
||||
name = "xtask-llm-benchmark"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
rust-version.workspace = true
|
||||
default-run = "llm_benchmark"
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
[dependencies]
|
||||
spacetimedb.workspace = true
|
||||
spacetimedb-guard.workspace = true
|
||||
|
||||
anyhow.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
blake3.workspace = true
|
||||
clap.workspace = true
|
||||
chrono = { version = "0.4", features = ["clock", "serde"] }
|
||||
async-trait = "0.1.89"
|
||||
tokio = { version = "1", features = ["rt-multi-thread", "macros"] }
|
||||
urlencoding = "2.1.3"
|
||||
reqwest = { version = "0.12", features = ["json"] }
|
||||
futures = "0.3.31"
|
||||
tempfile = "3.23.0"
|
||||
fs2 = "0.4.3"
|
||||
heck = "0.5.0"
|
||||
thiserror = "2.0.17"
|
||||
|
||||
[lib]
|
||||
path = "src/lib.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "llm_benchmark"
|
||||
path = "src/bin/llm_benchmark.rs"
|
||||
@@ -0,0 +1,198 @@
|
||||
#![allow(clippy::disallowed_macros)]
|
||||
|
||||
use std::{
|
||||
env, fs, io,
|
||||
path::{Component, Path, PathBuf},
|
||||
process::Command,
|
||||
};
|
||||
|
||||
// Build script: scans `src/benchmarks/<category>/<task>/spec.rs` and generates
// `src/generated/registry.rs`, which exposes each spec via an `include!`-based
// inline submodule and a ("category", "task") -> spec() lookup function.
fn main() {
    // === Paths ===
    let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
    let benches_root = manifest_dir.join("src/benchmarks");

    let gen_dir = manifest_dir.join("src/generated");
    let registry_rs = gen_dir.join("registry.rs");

    fs::create_dir_all(&gen_dir).unwrap();

    // We'll gather generated module blocks + match arms
    let mut mods_src = String::new();
    let mut arms_src = String::new();

    // Track whether we actually saw anything. If we saw nothing,
    // that's almost always a wrong-path / didn't put build.rs in right crate problem.
    let mut found_any = false;

    // Walk: src/benchmarks/<category>/<task>/spec.rs
    for cat_entry in read_dir_sorted(&benches_root) {
        let cat_entry = cat_entry.unwrap();
        let cat_path = cat_entry.path();
        if !cat_path.is_dir() {
            continue;
        }
        let category = file_name_string(&cat_path);

        for task_entry in read_dir_sorted(&cat_path) {
            let task_entry = task_entry.unwrap();
            let task_path = task_entry.path();
            if !task_path.is_dir() {
                continue;
            }
            let task = file_name_string(&task_path);

            let spec_path = task_path.join("spec.rs");
            if !spec_path.is_file() {
                // Task folders without a spec.rs are simply not registered.
                continue;
            }

            found_any = true;

            // ex: basics_t_005_update
            let mod_ident = format_ident(&category, &task);

            // registry.rs (we are generating) → ../../benchmarks/.../spec.rs (relative include path)
            let rel_spec_path = relative_path(&registry_rs, &spec_path);

            // inline submodule
            mods_src.push_str(&format!(
                "#[allow(dead_code)]\n#[allow(clippy::all)]\nmod {mod_ident} {{\n include!(\"{rel_spec_path}\");\n}}\n\n"
            ));

            // map ("category","task") → that module's spec() fn
            arms_src.push_str(&format!(" (\"{category}\", \"{task}\") => {mod_ident}::spec,\n"));
        }
    }

    if !found_any {
        // Fail fast instead of silently letting the stub compile.
        panic!(
            "build.rs: did not find any benchmark specs under {:?}.
This usually means one of two things:
1) The benchmarks actually live somewhere else (path mismatch).
2) build.rs is not in the same crate root as the code you're compiling, \
so Cargo is not running this script for that crate.",
            benches_root
        );
    }

    // Build final file string
    let file_contents = format!(
        "use crate::eval::BenchmarkSpec;
use anyhow::{{anyhow, Result}};
use std::path::Path;

{mods_src}pub fn resolve_by_path(task_root: &Path) -> Result<fn() -> BenchmarkSpec> {{
let task = task_root
.file_name()
.and_then(|s| s.to_str())
.ok_or_else(|| anyhow!(\"missing task name\"))?;
let category = task_root
.parent()
.and_then(|p| p.file_name().and_then(|s| s.to_str()))
.ok_or_else(|| anyhow!(\"missing category name\"))?;

let ctor = match (category, task) {{
{arms_src} _ => return Err(anyhow!(
\"no spec registered for {{}}/{{}} (need spec.rs)\",
category,
task
)),
}};

Ok(ctor)
}}
"
    );

    // Write unformatted first
    fs::write(&registry_rs, file_contents).unwrap();

    // Best-effort: format it so CI/rustfmt is happy.
    // Failures here are ignored on purpose — the generated file is still valid Rust.
    let _ = Command::new("rustup").args(["component", "add", "rustfmt"]).status();

    let _ = Command::new("rustfmt")
        .arg("--edition")
        .arg("2021")
        .arg(registry_rs.to_string_lossy().to_string())
        .status();
}
|
||||
|
||||
/// Deterministic `read_dir`: entries are returned sorted by file name so the
/// generated registry is stable across platforms and runs.
fn read_dir_sorted(dir: &Path) -> Vec<io::Result<fs::DirEntry>> {
    // Errored entries (and non-UTF-8 names) sort with an empty key, i.e. first.
    fn sort_name(res: &io::Result<fs::DirEntry>) -> String {
        match res {
            Ok(entry) => entry.file_name().into_string().unwrap_or_default(),
            Err(_) => String::new(),
        }
    }

    let mut entries: Vec<_> = fs::read_dir(dir).unwrap().collect();
    entries.sort_by(|a, b| sort_name(a).cmp(&sort_name(b)));
    entries
}
|
||||
|
||||
/// Get the final path segment as an owned `String`.
///
/// Panics ("utf8 dir name") when the path has no final component or the
/// component is not valid UTF-8 — acceptable in a build script.
fn file_name_string(p: &Path) -> String {
    match p.file_name().and_then(std::ffi::OsStr::to_str) {
        Some(name) => name.to_owned(),
        None => panic!("utf8 dir name"),
    }
}
|
||||
|
||||
/// Turn ("basics","t_005_update") into "basics_t_005_update".
///
/// Rules:
/// - lowercase
/// - any char outside [a-z0-9_] becomes '_'
/// - a leading digit gets a '_' prefix so the result is a valid Rust ident
fn format_ident(category: &str, task: &str) -> String {
    let sanitize = |s: &str| -> String {
        s.chars()
            .map(|c| {
                if c.is_ascii_alphanumeric() || c == '_' {
                    c.to_ascii_lowercase()
                } else {
                    '_'
                }
            })
            .collect()
    };

    let mut ident = format!("{}_{}", sanitize(category), sanitize(task));
    if matches!(ident.chars().next(), Some(c) if c.is_ascii_digit()) {
        ident.insert(0, '_');
    }
    ident
}
|
||||
|
||||
/// Build a relative path string from `from` file to `to` file,
|
||||
/// normalized to `/` for portability so `include!` is valid.
|
||||
fn relative_path(from: &Path, to: &Path) -> String {
|
||||
let base_dir = from.parent().expect("registry.rs must have a parent dir");
|
||||
let rel = diff_paths(to, base_dir).unwrap_or_else(|| to.to_path_buf());
|
||||
rel.to_string_lossy().replace('\\', "/")
|
||||
}
|
||||
|
||||
/// Minimal `diff_paths` (no extra crate): express `path` relative to `base`.
///
/// Returns `None` when the tail of `path` still contains a root or prefix
/// component after stripping the shared prefix (i.e. no relative form exists).
fn diff_paths(path: &Path, base: &Path) -> Option<PathBuf> {
    let target: Vec<Component<'_>> = path.components().collect();
    let anchor: Vec<Component<'_>> = base.components().collect();

    // Length of the shared leading run of components.
    let mut shared = 0;
    while shared < target.len() && shared < anchor.len() && target[shared] == anchor[shared] {
        shared += 1;
    }

    // One ".." for each component of `base` beyond the shared prefix...
    let mut rel = PathBuf::new();
    for _ in &anchor[shared..] {
        rel.push("..");
    }

    // ...then descend into the remainder of `path`.
    for comp in &target[shared..] {
        match comp {
            Component::Normal(seg) => rel.push(seg),
            Component::CurDir => rel.push("."),
            Component::ParentDir => rel.push(".."),
            Component::RootDir | Component::Prefix(_) => return None,
        }
    }

    Some(rel)
}
|
||||
@@ -0,0 +1,11 @@
|
||||
//! Module wiring and re-exports for the bench subsystem.

pub mod publishers;
pub(crate) mod results_merge;
pub mod runner;
mod templates;
pub mod types;
pub(crate) mod utils;

// Flatten the most-used items so callers can write `bench::TaskRunner` etc.
pub use publishers::{DotnetPublisher, Publisher, SpacetimeRustPublisher};
pub use runner::TaskRunner;
pub use types::{RunOutcome, TaskPaths};
pub use utils::bench_route_concurrency;
||||
@@ -0,0 +1,193 @@
|
||||
use crate::bench::utils::sanitize_db_name;
|
||||
use anyhow::{bail, Result};
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
/* Shared */
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
/// A language-specific module publisher: builds the module found at `source`
/// and publishes it to the SpacetimeDB server at `host_url` under a database
/// name derived from `module_name` (sanitized by the implementations).
pub trait Publisher: Send + Sync {
    fn publish(&self, host_url: &str, source: &Path, module_name: &str) -> Result<()>;
}
|
||||
|
||||
/// Check if the process was killed by a signal (e.g., SIGSEGV = 11).
#[cfg(unix)]
fn was_signal_killed(status: &std::process::ExitStatus) -> bool {
    use std::os::unix::process::ExitStatusExt;
    matches!(status.signal(), Some(_))
}

/// Non-unix targets have no POSIX signals to inspect, so never report a kill.
#[cfg(not(unix))]
fn was_signal_killed(_status: &std::process::ExitStatus) -> bool {
    false
}
|
||||
|
||||
/// Check if the failure is a transient error that should be retried.
/// These are resource contention issues in the dotnet WASI SDK.
fn is_transient_build_error(stderr: &str, stdout: &str) -> bool {
    // Search the concatenation (stderr first, matching the log layout) so a
    // marker is found no matter which stream it landed in.
    let combined = [stderr, stdout].concat();
    [
        // "Pipe is broken" errors from WASI SDK parallel builds
        "Pipe is broken",
        "EmitBundleObjectFiles",
        // Other transient resource errors
        "Unable to read data from the transport connection",
    ]
    .iter()
    .any(|marker| combined.contains(*marker))
}
|
||||
|
||||
/// Run `cmd`, retrying up to 2 extra attempts on transient failures.
/// See `run_with_retry` for the retry policy.
fn run(cmd: &mut Command, label: &str) -> Result<()> {
    run_with_retry(cmd, label, 2)
}
|
||||
|
||||
/// Run `cmd` to completion, retrying up to `max_retries` extra attempts when
/// the failure looks transient: a spawn failure, a signal kill (e.g. SIGSEGV),
/// or a known flaky build error (see `is_transient_build_error`).
/// Non-transient failures bail immediately with the captured stderr/stdout.
fn run_with_retry(cmd: &mut Command, label: &str, max_retries: u32) -> Result<()> {
    let mut last_error = None;

    for attempt in 0..=max_retries {
        if attempt > 0 {
            eprintln!(
                "⚠️ {label}: retrying after transient failure (attempt {}/{})",
                attempt + 1,
                max_retries + 1
            );
            // Brief pause before retrying to let transient resource pressure clear.
            std::thread::sleep(std::time::Duration::from_secs(1));
        }

        eprintln!("==> {label}: {:?}", cmd);
        let out = match cmd.output() {
            Ok(o) => o,
            Err(e) => {
                // Spawn failures are always retried (they never reach the
                // transient-error classification below).
                last_error = Some(format!("{label}: spawn failed: {e}"));
                continue;
            }
        };

        if out.status.success() {
            return Ok(());
        }

        let code = out.status.code().unwrap_or(-1);
        let stderr = String::from_utf8_lossy(&out.stderr);
        let stdout = String::from_utf8_lossy(&out.stdout);

        // Retry on signal kills (like SIGSEGV) or transient build errors
        let should_retry = was_signal_killed(&out.status) || is_transient_build_error(&stderr, &stdout);
        if should_retry && attempt < max_retries {
            let reason = if was_signal_killed(&out.status) {
                "signal kill"
            } else {
                "transient build error"
            };
            eprintln!("⚠️ {label}: {reason} detected, will retry...");
            last_error = Some(format!(
                "{label} failed (exit={code})\n--- stderr ---\n{stderr}\n--- stdout ---\n{stdout}"
            ));
            continue;
        }

        // Non-transient failure, or retries exhausted on this attempt: give up now.
        bail!("{label} failed (exit={code})\n--- stderr ---\n{stderr}\n--- stdout ---\n{stdout}");
    }

    // Only reachable when every attempt ended in `continue` (e.g. spawn failures).
    bail!(last_error.unwrap_or_else(|| format!("{label}: unknown error after retries")))
}
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
/* C# Publisher */
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct DotnetPublisher;
|
||||
|
||||
impl DotnetPublisher {
|
||||
fn ensure_csproj(root: &Path) -> Result<()> {
|
||||
let mut has = false;
|
||||
for ent in fs::read_dir(root)? {
|
||||
let ent = ent?;
|
||||
if ent.path().extension().map(|e| e == "csproj").unwrap_or(false) {
|
||||
has = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if !has {
|
||||
bail!("expected a C# project in {}", root.display());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Publisher for DotnetPublisher {
    /// Build the C# module in `source` with `spacetime build`, then publish it
    /// to `host_url` under the sanitized `module_name`.
    fn publish(&self, host_url: &str, source: &Path, module_name: &str) -> Result<()> {
        if !source.exists() {
            bail!("no source: {}", source.display());
        }
        println!("publish csharp module {}", module_name);

        // Refuse directories that don't look like a C# project.
        Self::ensure_csproj(source)?;

        let db = sanitize_db_name(module_name);

        let mut cmd = Command::new("spacetime");
        cmd.arg("build")
            .current_dir(source)
            // Opt out of dotnet telemetry and the startup banner.
            .env("DOTNET_CLI_TELEMETRY_OPTOUT", "1")
            .env("DOTNET_NOLOGO", "1");
        run(&mut cmd, "spacetime build (csharp)")?;

        // `-c` clears existing data; `-y` skips the confirmation prompt.
        let mut pubcmd = Command::new("spacetime");
        pubcmd
            .arg("publish")
            .arg("-c")
            .arg("-y")
            .arg("--server")
            .arg(host_url)
            .arg(&db)
            .current_dir(source);
        run(&mut pubcmd, "spacetime publish (csharp)")?;

        Ok(())
    }
}
|
||||
/* -------------------------------------------------------------------------- */
|
||||
/* Rust Publisher */
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct SpacetimeRustPublisher;
|
||||
|
||||
impl SpacetimeRustPublisher {
|
||||
fn ensure_standalone_manifest(dst: &Path) -> Result<()> {
|
||||
if !dst.join("Cargo.toml").exists() {
|
||||
bail!("no Cargo.toml in {}", dst.display());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Publisher for SpacetimeRustPublisher {
    /// Publish the Rust module in `source` to `host_url` under the sanitized
    /// `module_name`, via a single `spacetime publish` invocation.
    fn publish(&self, host_url: &str, source: &Path, module_name: &str) -> Result<()> {
        if !source.exists() {
            bail!("no source: {}", source.display());
        }
        println!("publish rust module {}", module_name);

        // Build/publish directly from `source`
        Self::ensure_standalone_manifest(source)?;

        // sanitize db + server
        let db = sanitize_db_name(module_name);

        // 2) Publish (`-c` clears existing data, `-y` skips the confirmation prompt)
        run(
            Command::new("spacetime")
                .arg("publish")
                .arg("-c")
                .arg("-y")
                .arg("--server")
                .arg(host_url)
                .arg(&db)
                .current_dir(source),
            "spacetime publish",
        )?;

        Ok(())
    }
}
|
||||
@@ -0,0 +1,106 @@
|
||||
use anyhow::{Context, Result};
|
||||
use fs2::FileExt;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs;
|
||||
use std::io::{Read, Write};
|
||||
use std::path::Path;
|
||||
use tempfile::NamedTempFile;
|
||||
|
||||
use crate::bench::types::RunOutcome;
|
||||
use crate::results::schema::{LangEntry, ModeEntry, ModelEntry, Results};
|
||||
|
||||
fn load_results(path: &Path) -> Result<Results> {
|
||||
if !path.exists() {
|
||||
return Ok(Results::default());
|
||||
}
|
||||
let mut f = fs::File::open(path)?;
|
||||
let mut s = String::new();
|
||||
f.read_to_string(&mut s)?;
|
||||
let root: Results = serde_json::from_str(&s).with_context(|| format!("failed parsing {}", path.display()))?;
|
||||
Ok(root)
|
||||
}
|
||||
|
||||
fn save_atomic(path: &Path, root: &Results) -> Result<()> {
|
||||
let parent = path.parent().context("no parent dir for results path")?;
|
||||
fs::create_dir_all(parent)?;
|
||||
let mut tmp = NamedTempFile::new_in(parent)?;
|
||||
serde_json::to_writer_pretty(&mut tmp, root)?;
|
||||
tmp.flush()?;
|
||||
tmp.persist(path)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn ensure_lang<'a>(root: &'a mut Results, lang: &str) -> &'a mut LangEntry {
|
||||
if let Some(i) = root.languages.iter().position(|x| x.lang == lang) {
|
||||
return &mut root.languages[i];
|
||||
}
|
||||
root.languages.push(LangEntry {
|
||||
lang: lang.to_string(),
|
||||
modes: Vec::new(),
|
||||
golden_answers: BTreeMap::new(),
|
||||
});
|
||||
root.languages.last_mut().unwrap()
|
||||
}
|
||||
|
||||
fn ensure_mode<'a>(lang_v: &'a mut LangEntry, mode: &str, hash: Option<String>) -> &'a mut ModeEntry {
|
||||
if let Some(i) = lang_v.modes.iter().position(|m| m.mode == mode) {
|
||||
return &mut lang_v.modes[i];
|
||||
}
|
||||
lang_v.modes.push(ModeEntry {
|
||||
mode: mode.to_string(),
|
||||
hash,
|
||||
models: Vec::new(),
|
||||
});
|
||||
lang_v.modes.last_mut().unwrap()
|
||||
}
|
||||
|
||||
fn ensure_model<'a>(mode_v: &'a mut ModeEntry, name: &str) -> &'a mut ModelEntry {
|
||||
if let Some(i) = mode_v.models.iter().position(|m| m.name == name) {
|
||||
return &mut mode_v.models[i];
|
||||
}
|
||||
mode_v.models.push(ModelEntry {
|
||||
name: name.to_string(),
|
||||
route_api_model: None,
|
||||
tasks: Default::default(), // HashMap<String, RunOutcome>
|
||||
});
|
||||
mode_v.models.last_mut().unwrap()
|
||||
}
|
||||
|
||||
pub fn merge_task_runs(path: &Path, mode: &str, runs: &[RunOutcome]) -> Result<()> {
|
||||
if runs.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let lock_path = path.with_extension("lock");
|
||||
let lock = fs::OpenOptions::new()
|
||||
.create(true)
|
||||
.read(true)
|
||||
.write(true)
|
||||
.truncate(false)
|
||||
.open(&lock_path)?;
|
||||
lock.lock_exclusive()?;
|
||||
|
||||
let res = (|| -> Result<()> {
|
||||
let mut root = load_results(path)?;
|
||||
|
||||
for r in runs {
|
||||
let lang_v = ensure_lang(&mut root, &r.lang);
|
||||
|
||||
// Always bump the mode hash to the latest run's hash
|
||||
let mode_v = ensure_mode(lang_v, mode, Some(r.hash.clone()));
|
||||
|
||||
let model_v = ensure_model(mode_v, &r.model_name);
|
||||
|
||||
// Always replace with the latest value (even if None)
|
||||
model_v.route_api_model = r.route_api_model.clone();
|
||||
|
||||
// Always overwrite the task result
|
||||
model_v.tasks.insert(r.task.clone(), r.clone());
|
||||
}
|
||||
|
||||
save_atomic(path, &root)
|
||||
})();
|
||||
|
||||
let _ = lock.unlock();
|
||||
res
|
||||
}
|
||||
@@ -0,0 +1,680 @@
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use chrono::Utc;
|
||||
use futures::{stream, StreamExt};
|
||||
use serde_json::json;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::OnceLock;
|
||||
use std::time::Instant;
|
||||
use tokio::sync::Mutex;
|
||||
use tokio::task;
|
||||
|
||||
use crate::bench::publishers::{DotnetPublisher, SpacetimeRustPublisher};
|
||||
use crate::bench::results_merge::merge_task_runs;
|
||||
use crate::bench::templates::materialize_project;
|
||||
use crate::bench::types::{BenchRunContext, PublishParams, RunContext, RunOneError};
|
||||
pub(crate) use crate::bench::types::{RunOutcome, TaskPaths};
|
||||
use crate::bench::utils::{
|
||||
bench_concurrency, bench_csharp_concurrency, category_slug, debug_llm, fmt_dur, print_llm_output, sanitize_db_name,
|
||||
task_slug, work_server_dir_scoped,
|
||||
};
|
||||
use crate::bench::Publisher;
|
||||
use crate::eval::{Lang, ScoreDetails};
|
||||
use crate::generated::resolve_by_path;
|
||||
use crate::llm::model_routes::ModelRoute;
|
||||
|
||||
/// Executes benchmark tasks: publishes golden and LLM-generated modules and
/// scores the LLM output (see `run_one`).
pub struct TaskRunner {
    /// Root directory containing category/task folders.
    pub bench_root: PathBuf,
    /// Publisher used for Rust modules.
    pub rust_publisher: SpacetimeRustPublisher,
    /// Publisher used for C# modules.
    pub cs_publisher: DotnetPublisher,
}
|
||||
|
||||
static BUILT_KEYS: OnceLock<Mutex<HashSet<String>>> = OnceLock::new();
|
||||
|
||||
fn build_key(lang: Lang, selectors: Option<&[String]>) -> String {
|
||||
let v = match selectors {
|
||||
Some(s) if !s.is_empty() => {
|
||||
let mut t = s.to_vec();
|
||||
t.sort(); // stable key independent of order
|
||||
t
|
||||
}
|
||||
_ => vec!["ALL".to_string()],
|
||||
};
|
||||
let joined = v.join(",");
|
||||
format!("{lang:?}:{joined}")
|
||||
}
|
||||
|
||||
/// Build goldens **once per (lang, selector-set)** in this process.
|
||||
/// If selectors is None/empty, that means "ALL tasks".
|
||||
pub async fn ensure_goldens_built_once(
|
||||
host: Option<String>,
|
||||
bench_root: &Path,
|
||||
lang: Lang,
|
||||
selectors: Option<&[String]>,
|
||||
) -> Result<()> {
|
||||
let key = build_key(lang, selectors);
|
||||
let set = BUILT_KEYS.get_or_init(|| Mutex::new(HashSet::new()));
|
||||
{
|
||||
let set = set.lock();
|
||||
if set.await.contains(&key) {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
// single-flight for this key
|
||||
let set_guard = set.lock().await;
|
||||
if set_guard.contains(&key) {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// IMPORTANT: pass selectors through so we only build needed goldens
|
||||
build_goldens_only_for_lang(host, bench_root, lang, selectors).await?;
|
||||
|
||||
// mark as built
|
||||
drop(set_guard);
|
||||
let mut set = BUILT_KEYS.get().unwrap().lock().await;
|
||||
set.insert(key);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn publish_rust_async(
|
||||
publisher: SpacetimeRustPublisher,
|
||||
host_url: String,
|
||||
wdir: PathBuf,
|
||||
db: String,
|
||||
) -> Result<()> {
|
||||
task::spawn_blocking(move || publisher.publish(&host_url, &wdir, &db)).await??;
|
||||
Ok(())
|
||||
}
|
||||
async fn publish_cs_async(publisher: DotnetPublisher, host_url: String, wdir: PathBuf, db: String) -> Result<()> {
|
||||
task::spawn_blocking(move || publisher.publish(&host_url, &wdir, &db)).await??;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl TaskRunner {
    /// Bundle the bench root with the per-language publishers.
    pub fn new(bench_root: PathBuf, rust_publisher: SpacetimeRustPublisher, cs_publisher: DotnetPublisher) -> Self {
        Self {
            bench_root,
            rust_publisher,
            cs_publisher,
        }
    }

    /// Publish only the golden (reference) module for a task.
    ///
    /// Goldens are route-independent, so `route_tag` is empty.
    pub async fn publish_golden_only(
        &self,
        lang: Lang,
        category: &str,
        task_id: &str,
        golden_src_text: &str,
        golden_db: String,
        host: Option<String>,
    ) -> Result<()> {
        self.publish(
            PublishParams {
                lang,
                category,
                task_id,
                route_tag: "",
                source_text: golden_src_text,
                db_name: golden_db,
                host,
            },
            "golden",
        )
        .await
    }

    /// Publish an LLM-generated module (phase "llm").
    async fn publish_llm(&self, params: PublishParams<'_>) -> Result<()> {
        self.publish(params, "llm").await
    }

    /// Materialize a project from templates with `params.source_text`
    /// injected, then build and publish it via the language's publisher.
    async fn publish(&self, params: PublishParams<'_>, phase: &str) -> Result<()> {
        let lang_name = match params.lang {
            Lang::Rust => "rust",
            Lang::CSharp => "csharp",
        };

        // Start from a clean work dir so stale artifacts never leak between runs.
        let wdir = work_server_dir_scoped(params.category, params.task_id, lang_name, phase, params.route_tag);
        if wdir.exists() {
            let _ = fs::remove_dir_all(&wdir);
        }
        let _proj_root = materialize_project(
            lang_name,
            params.category,
            params.task_id,
            phase,
            params.route_tag,
            params.source_text,
        )?;

        // "local" is the fallback server alias when no host is configured.
        let host_url = params.host.unwrap_or_else(|| "local".to_owned());
        match params.lang {
            Lang::Rust => publish_rust_async(self.rust_publisher, host_url, wdir, params.db_name).await?,
            Lang::CSharp => publish_cs_async(self.cs_publisher, host_url, wdir, params.db_name).await?,
        }

        Ok(())
    }

    /// Run one task end-to-end for one model route:
    /// build prompt → call the LLM → publish the generated module → score.
    ///
    /// A publish failure does NOT error out: it is recorded as a
    /// `publish_error` pseudo-scorer entry and the task scores 0. Only
    /// prompt/LLM-level failures surface as `RunOneError`.
    pub async fn run_one(&self, task: &TaskPaths, cfg: &RunContext<'_>) -> Result<RunOutcome, RunOneError> {
        let wall = Instant::now();
        let started = Utc::now();

        // Stable identifiers and sanitized db names derived from the task path.
        let category = category_slug(&task.root);
        let task_id = task_slug(&task.root);
        let route_tag = sanitize_db_name(cfg.route.display_name);
        let golden_db = sanitize_db_name(&format!("{}-{}-golden", category, task_id));
        let llm_db = sanitize_db_name(&format!("{}-{}-{}-llm", category, task_id, route_tag));

        // Look up the generated task spec registered for this task directory.
        let ctor = resolve_by_path(&task.root)?;
        let spec = ctor();

        let scorers = spec.scorers_for(cfg.lang, &route_tag, cfg.host.as_deref().unwrap_or("local"));
        let total_tasks = scorers.len();

        let prompt_builder = (spec.make_prompt)(cfg.lang);
        println!("→ [{}] {}: building prompt", cfg.lang_name, cfg.route.display_name);
        let prompt = prompt_builder.build_segmented(cfg.context);

        println!("→ [{}] {}: calling provider", cfg.lang_name, cfg.route.display_name);
        // Hard 200s cap on a single provider call so a hung request can't
        // stall the whole batch.
        let llm_output = tokio::time::timeout(
            std::time::Duration::from_secs(200),
            cfg.llm.generate(cfg.route, &prompt),
        )
        .await
        .map_err(|_| RunOneError::Other(anyhow!("LLM call timed out")))?
        .map_err(RunOneError::Other)?;

        if debug_llm() {
            print_llm_output(cfg.route.display_name, &task_id, &llm_output);
        }

        // Try to publish the generated module; capture the error text instead
        // of failing so the outcome can record it.
        let publish_error: Option<String> = self
            .publish_llm(PublishParams {
                lang: cfg.lang,
                category: &category,
                task_id: &task_id,
                route_tag: &route_tag,
                source_text: &llm_output,
                db_name: llm_db.clone(),
                host: cfg.host.clone(),
            })
            .await
            .err()
            .map(|e| {
                eprintln!(
                    "⚠️ publish failed for {}/{}/{}: {e:#}",
                    category, task_id, cfg.route.display_name
                );
                format!("{:#}", e)
            });

        let mut passed = 0usize;
        let mut partial_sum = 0f32;
        let mut scorer_details: HashMap<String, ScoreDetails> = HashMap::new();

        if publish_error.is_none() {
            println!("→ [{}] {}: scoring", cfg.lang_name, cfg.route.display_name);
            for s in &scorers {
                let r = s.score(&llm_output);
                if r.pass {
                    passed += 1;
                }
                // Partial credit is clamped to [0, 1] per scorer.
                partial_sum += r.partial.clamp(0.0, 1.0);
                scorer_details.insert(s.id().to_string(), r);
            }
        } else {
            println!(
                "→ [{}] {}: publish failed — skipping scoring (0/{})",
                cfg.lang_name, cfg.route.display_name, total_tasks
            );
            // Record the publish failure as a pseudo-scorer entry.
            scorer_details.insert(
                "publish_error".into(),
                ScoreDetails {
                    pass: false,
                    partial: 0.0,
                    notes: json!({
                        "phase": "build_or_publish",
                        "error": publish_error.as_deref().unwrap_or("unknown"),
                    }),
                },
            );
        }

        // Mean partial credit across all scorers, as a percentage.
        let score_pct = if total_tasks == 0 {
            0.0
        } else {
            (partial_sum / total_tasks as f32) * 100.0
        };

        let finished = Utc::now();
        let took = wall.elapsed();
        println!(
            "→ [{}] {}/{}/{}: done (passed {}/{}, {:.1}%) — {}",
            cfg.lang_name,
            category,
            task_id,
            cfg.route.display_name,
            passed as u32,
            total_tasks,
            score_pct,
            fmt_dur(took)
        );

        Ok(RunOutcome {
            hash: cfg.hash.to_string(),
            task: task_id.clone(),
            lang: cfg.lang_name.to_string(),
            model_name: cfg.route.display_name.to_string(),
            vendor: cfg.route.vendor.slug().to_string(),
            // NOTE(review): this records whether the *LLM* module published;
            // the field name suggests the golden — confirm against schema users.
            golden_published: publish_error.is_none(),
            total_tests: total_tasks as u32,
            passed_tests: passed as u32,
            category: Some(category.clone()),
            llm_output: Some(llm_output),
            route_api_model: Some(cfg.route.api_model.to_string()),
            golden_db: Some(golden_db),
            llm_db: Some(llm_db),
            work_dir_golden: Some(
                work_server_dir_scoped(&category, &task_id, cfg.lang_name, "golden", "")
                    .to_string_lossy()
                    .into_owned(),
            ),
            work_dir_llm: Some(
                work_server_dir_scoped(&category, &task_id, cfg.lang_name, "llm", &route_tag)
                    .to_string_lossy()
                    .into_owned(),
            ),
            scorer_details: Some(scorer_details),
            started_at: Some(started),
            finished_at: Some(finished),
        })
    }
}
|
||||
|
||||
/// Run every discovered task against a single model route for one language,
/// then merge the outcomes into the results file at `cfg.details_path`.
///
/// Individual task failures do not abort the batch; they are converted into
/// failure outcomes via `build_fail_outcome` and still merged.
pub async fn run_all_for_model_async_for_lang(cfg: &BenchRunContext<'_>) -> Result<Vec<RunOutcome>> {
    let total_wall = Instant::now();

    // 1) run per-task LLM builds + scoring
    let tasks = discover_tasks(cfg.bench_root)?;
    let runner = TaskRunner::new(PathBuf::from(cfg.bench_root), SpacetimeRustPublisher, DotnetPublisher);
    let lang_name = cfg.lang.as_str();
    // C# gets its own concurrency limit (builds are heavier — see utils).
    let buf = match cfg.lang {
        Lang::CSharp => bench_csharp_concurrency(),
        _ => bench_concurrency(),
    };

    // Run tasks concurrently with at most `buf` in flight at a time.
    let results: Vec<(TaskPaths, Result<RunOutcome, RunOneError>)> =
        futures::stream::iter(tasks.into_iter().map(|task| {
            let runner = &runner;
            let route = cfg.route;
            let lang = cfg.lang;
            let lang_name = lang_name.to_string();
            let context = cfg.context;
            let hash = cfg.hash;
            let llm = cfg.llm;
            let host = cfg.host.clone();

            async move {
                let started = Utc::now();
                let run_cfg = RunContext {
                    lang_name: &lang_name,
                    lang,
                    route,
                    context,
                    hash,
                    llm,
                    host,
                };

                let res = runner.run_one(&task, &run_cfg).await;
                (
                    task,
                    res.map(|mut o| {
                        // Backfill the start time if run_one did not set one.
                        o.started_at.get_or_insert(started);
                        o
                    }),
                )
            }
        }))
        .buffer_unordered(buf)
        .collect()
        .await;

    let mut outcomes = Vec::new();
    let mut errs = 0usize;

    // Convert per-task errors into recorded failure outcomes.
    for (task, r) in results {
        match r {
            Ok(v) => outcomes.push(v),
            Err(RunOneError::WithOutput { msg, llm_output }) => {
                errs += 1;
                eprintln!("⚠️ task failed but continuing: {msg}");
                outcomes.push(build_fail_outcome(
                    &task,
                    lang_name,
                    cfg.route,
                    cfg.hash,
                    anyhow::anyhow!(msg),
                    Some(llm_output),
                ));
            }
            Err(RunOneError::Other(e)) => {
                errs += 1;
                eprintln!("⚠️ task failed but continuing: {e:?}");
                outcomes.push(build_fail_outcome(&task, lang_name, cfg.route, cfg.hash, e, None));
            }
        }
    }

    println!("[runner] completed batch: ok={} err={}", outcomes.len(), errs);

    if !outcomes.is_empty() {
        merge_task_runs(&cfg.details_path, cfg.mode, &outcomes)?;
    } else {
        eprintln!("[runner] no successful runs; not calling merge_task_runs");
    }

    println!(
        "✓ [{}] {}: total {}",
        lang_name,
        cfg.route.display_name,
        fmt_dur(total_wall.elapsed())
    );

    Ok(outcomes)
}
|
||||
|
||||
// run only selected tasks by selectors like 1/01/001 or t_001
|
||||
// run only selected tasks by selectors like 1/01/001 or t_001
/// Same flow as `run_all_for_model_async_for_lang`, restricted to tasks whose
/// directory name matches one of `cfg.selectors` (normalized to `t_NNN`).
/// Bails if no task matches.
pub async fn run_selected_for_model_async_for_lang(cfg: &BenchRunContext<'_>) -> Result<Vec<RunOutcome>> {
    let total_wall = Instant::now();

    // Normalize selectors ("1" | "01" | "001" | "t_001") to `t_NNN` prefixes.
    let wanted: HashSet<String> = cfg
        .selectors
        .iter()
        .flat_map(|s| s.iter())
        .map(|s| normalize_task_selector(s.as_str()))
        .collect::<Result<_>>()?;

    let tasks = discover_tasks(cfg.bench_root)?;
    let selected: Vec<TaskPaths> = tasks
        .into_iter()
        .filter(|t| {
            let name = t.root.file_name().and_then(|x| x.to_str()).unwrap_or("");
            wanted.iter().any(|w| name.starts_with(w))
        })
        .collect();

    if selected.is_empty() {
        bail!("no tasks matched {:?}", wanted);
    }

    let runner = TaskRunner::new(PathBuf::from(cfg.bench_root), SpacetimeRustPublisher, DotnetPublisher);
    let lang_name = cfg.lang.as_str();
    // C# gets its own concurrency limit (builds are heavier — see utils).
    let buf = match cfg.lang {
        Lang::CSharp => bench_csharp_concurrency(),
        _ => bench_concurrency(),
    };

    // Run the selected tasks concurrently, `buf` in flight at a time.
    let results: Vec<(TaskPaths, Result<RunOutcome, RunOneError>)> =
        futures::stream::iter(selected.into_iter().map(|task| {
            let runner = &runner;
            let route = cfg.route;
            let lang = cfg.lang;
            let lang_name = lang_name.to_string();
            let context = cfg.context;
            let hash = cfg.hash;
            let llm = cfg.llm;

            async move {
                let started = Utc::now();
                let run_cfg = RunContext {
                    lang_name: &lang_name,
                    lang,
                    route,
                    context,
                    hash,
                    llm,
                    host: cfg.host.clone(),
                };

                let res = runner.run_one(&task, &run_cfg).await;
                (
                    task,
                    res.map(|mut o| {
                        // Backfill the start time if run_one did not set one.
                        o.started_at.get_or_insert(started);
                        o
                    }),
                )
            }
        }))
        .buffer_unordered(buf)
        .collect()
        .await;

    let mut outcomes = Vec::with_capacity(results.len());
    let mut errs = 0usize;

    // Convert per-task errors into recorded failure outcomes.
    for (task, r) in results {
        match r {
            Ok(v) => outcomes.push(v),
            Err(RunOneError::WithOutput { msg, llm_output }) => {
                errs += 1;
                eprintln!("⚠️ task failed but continuing: {msg}");
                outcomes.push(build_fail_outcome(
                    &task,
                    lang_name,
                    cfg.route,
                    cfg.hash,
                    anyhow::anyhow!(msg),
                    Some(llm_output),
                ));
            }
            Err(RunOneError::Other(e)) => {
                errs += 1;
                eprintln!("⚠️ task failed but continuing: {e:?}");
                outcomes.push(build_fail_outcome(&task, lang_name, cfg.route, cfg.hash, e, None));
            }
        }
    }

    if !outcomes.is_empty() {
        merge_task_runs(&cfg.details_path, cfg.mode, &outcomes)?;
    }

    println!(
        "✓ [{}] {}: total {} (err={})",
        lang_name,
        cfg.route.display_name,
        fmt_dur(total_wall.elapsed()),
        errs
    );
    Ok(outcomes)
}
|
||||
|
||||
pub async fn run_selected_or_all_for_model_async_for_lang(ctx: &BenchRunContext<'_>) -> Result<Vec<RunOutcome>> {
|
||||
if let Some(sels) = ctx.selectors {
|
||||
if !sels.is_empty() {
|
||||
let sel_cfg = BenchRunContext {
|
||||
bench_root: ctx.bench_root,
|
||||
mode: ctx.mode,
|
||||
hash: ctx.hash,
|
||||
route: ctx.route,
|
||||
context: ctx.context,
|
||||
llm: ctx.llm,
|
||||
lang: ctx.lang,
|
||||
selectors: Option::from(sels),
|
||||
host: ctx.host.clone(),
|
||||
details_path: ctx.details_path.clone(),
|
||||
};
|
||||
return run_selected_for_model_async_for_lang(&sel_cfg).await;
|
||||
}
|
||||
}
|
||||
|
||||
run_all_for_model_async_for_lang(ctx).await
|
||||
}
|
||||
|
||||
/// Build and publish the golden (reference) modules for `lang`.
///
/// `selectors` (forms like "1", "07", "t_001") restrict the build to matching
/// task directories and bail when nothing matches; `None` builds goldens for
/// every discovered task.
pub async fn build_goldens_only_for_lang(
    host: Option<String>,
    bench_root: &Path,
    lang: Lang,
    selectors: Option<&[String]>,
) -> Result<()> {
    let tasks = if let Some(sels) = selectors {
        // Normalize selectors to the canonical `t_NNN` form before matching.
        let wanted: HashSet<String> = sels.iter().map(|s| normalize_task_selector(s)).collect::<Result<_>>()?;
        let all = discover_tasks(bench_root)?;
        let filtered: Vec<TaskPaths> = all
            .into_iter()
            .filter(|t| {
                let name = t.root.file_name().and_then(|x| x.to_str()).unwrap_or("");
                wanted.iter().any(|w| name.starts_with(w))
            })
            .collect();
        if filtered.is_empty() {
            bail!("no tasks matched {:?}", wanted);
        }
        filtered
    } else {
        discover_tasks(bench_root)?
    };

    let runner = TaskRunner::new(PathBuf::from(bench_root), SpacetimeRustPublisher, DotnetPublisher);
    let lang_name = lang.as_str();
    // C# gets its own concurrency limit (builds are heavier — see utils).
    let buf = match lang {
        Lang::CSharp => bench_csharp_concurrency(),
        _ => bench_concurrency(),
    };

    // Publish goldens concurrently; all tasks run to completion, then the
    // final collect::<Result<_>> surfaces the first error (if any).
    stream::iter(tasks.into_iter().map(|task| {
        let runner = &runner;
        let host_clone = host.clone();
        async move {
            let category = category_slug(&task.root);
            let task_id = task_slug(&task.root);
            let golden_db = sanitize_db_name(&format!("{}-{}-golden", category, task_id));
            let golden_src_text = load_golden_source(&task, lang)?;
            println!("→ [{}] build golden {} {}", lang_name, category, task_id);
            runner
                .publish_golden_only(lang, &category, &task_id, &golden_src_text, golden_db, host_clone)
                .await
        }
    }))
    .buffer_unordered(buf)
    .collect::<Vec<_>>()
    .await
    .into_iter()
    .collect::<Result<Vec<_>>>()?;

    println!("✓ [{}] goldens build/publish: complete", lang_name);
    Ok(())
}
|
||||
|
||||
fn discover_tasks(benchmarks_root: &Path) -> Result<Vec<TaskPaths>> {
|
||||
let mut out = Vec::new();
|
||||
for cat in read_dirs(benchmarks_root)? {
|
||||
for task in read_dirs(&cat)? {
|
||||
out.push(TaskPaths {
|
||||
root: task.clone(),
|
||||
answers_rust: task.join("answers/rust/server"),
|
||||
answers_csharp: task.join("answers/csharp/server"),
|
||||
});
|
||||
}
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
/// Construct a `RunOutcome` for a task that failed before scoring
/// (prompt/LLM/publish error). The error is stored as a `publish_error`
/// pseudo-scorer entry and the task counts as 0/1 passed.
fn build_fail_outcome(
    task: &TaskPaths,
    lang_name: &str,
    route: &ModelRoute,
    hash: &str,
    err: anyhow::Error,
    llm_output: Option<String>,
) -> RunOutcome {
    let category = category_slug(&task.root);
    let task_id = task_slug(&task.root);
    let now = Utc::now();
    let mut sd: HashMap<String, ScoreDetails> = HashMap::new();
    sd.insert(
        "publish_error".to_string(),
        ScoreDetails {
            pass: false,
            partial: 0.0,
            // `{:#}` renders the full anyhow context chain.
            notes: json!({
                "phase": "build_or_publish",
                "error": format!("{:#}", err),
            }),
        },
    );

    RunOutcome {
        hash: hash.to_string(),
        task: task_id.clone(),
        lang: lang_name.to_string(),
        golden_published: false,
        category: Some(category),

        model_name: route.display_name.to_string(),
        // One synthetic "test" (the failed build/publish).
        total_tests: 1,
        passed_tests: 0,

        llm_output,

        route_api_model: Some(route.api_model.to_string()),
        golden_db: None,
        llm_db: None,
        work_dir_golden: None,
        work_dir_llm: None,
        scorer_details: Some(sd),

        vendor: route.vendor.slug().to_string(),
        // Start == finish: no meaningful duration for a failed run.
        started_at: Some(now),
        finished_at: Some(now),
    }
}
|
||||
|
||||
fn read_dirs(p: &Path) -> Result<Vec<PathBuf>> {
|
||||
let mut v = Vec::new();
|
||||
for e in fs::read_dir(p).with_context(|| format!("read_dir {}", p.display()))? {
|
||||
let e = e?;
|
||||
let path = e.path();
|
||||
if path.is_dir() {
|
||||
v.push(path);
|
||||
}
|
||||
}
|
||||
Ok(v)
|
||||
}
|
||||
|
||||
// TEST_CASE/answers/csharp.cs and TEST_CASE/rust.rs
|
||||
fn load_golden_source(task: &TaskPaths, lang: Lang) -> Result<String> {
|
||||
match lang {
|
||||
Lang::Rust => {
|
||||
let p = task.root.join("answers").join("rust.rs");
|
||||
fs::read_to_string(&p).with_context(|| format!("read {}", p.display()))
|
||||
}
|
||||
Lang::CSharp => {
|
||||
let p = task.root.join("answers").join("csharp.cs");
|
||||
fs::read_to_string(&p).with_context(|| format!("read {}", p.display()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// "1" | "01" | "001" | "t_001" -> "t_001"
|
||||
fn normalize_task_selector(raw: &str) -> Result<String> {
|
||||
let s = raw.trim().to_ascii_lowercase();
|
||||
if s.is_empty() {
|
||||
bail!("empty task selector");
|
||||
}
|
||||
if let Some(rest) = s.strip_prefix("t_") {
|
||||
if rest.chars().all(|c| c.is_ascii_digit()) {
|
||||
let n: u32 = rest.parse()?;
|
||||
return Ok(format!("t_{:03}", n));
|
||||
}
|
||||
bail!("invalid task selector: {raw}");
|
||||
}
|
||||
if s.chars().all(|c| c.is_ascii_digit()) {
|
||||
let n: u32 = s.parse()?;
|
||||
return Ok(format!("t_{:03}", n));
|
||||
}
|
||||
bail!("invalid task selector: {raw}")
|
||||
}
|
||||
@@ -0,0 +1,152 @@
|
||||
use crate::bench::utils::work_server_dir_scoped;
|
||||
use anyhow::{bail, Context, Result};
|
||||
use std::{
|
||||
env, fs, io,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
pub fn materialize_project(
|
||||
lang: &str,
|
||||
category: &str,
|
||||
task: &str,
|
||||
phase: &str,
|
||||
route_tag: &str,
|
||||
llm_code: &str,
|
||||
) -> Result<PathBuf> {
|
||||
let out = work_server_dir_scoped(category, task, lang, phase, route_tag);
|
||||
let src = tmpl_root().join(match lang {
|
||||
"rust" => "rust/server",
|
||||
"csharp" => "csharp/server",
|
||||
_ => bail!("unsupported lang `{}`", lang),
|
||||
});
|
||||
|
||||
if out.exists() {
|
||||
let _ = fs::remove_dir_all(&out);
|
||||
}
|
||||
fs::create_dir_all(&out)?;
|
||||
copy_tree_with_templates(&src, &out)?;
|
||||
|
||||
match lang {
|
||||
"rust" => inject_rust(&out, llm_code)?,
|
||||
"csharp" => inject_csharp(&out, llm_code)?,
|
||||
_ => {}
|
||||
}
|
||||
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
/* helpers */
|
||||
|
||||
fn tmpl_root() -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src").join("templates")
|
||||
}
|
||||
|
||||
/// Recursively copy `src` into `dst`, rendering templates on the way:
/// a `*.tmpl` file has its placeholders substituted (see
/// `replace_placeholders`) and is written without the `.tmpl` extension;
/// every other file is copied verbatim. Bails if `src` does not exist.
fn copy_tree_with_templates(src: &Path, dst: &Path) -> Result<()> {
    fn recurse(from: &Path, to: &Path) -> Result<()> {
        fs::create_dir_all(to)?;
        for entry in fs::read_dir(from)? {
            let entry = entry?;
            let p = entry.path();
            // `rel` is the entry's name relative to the dir being read.
            let rel = p.strip_prefix(from)?;
            let out_path = to.join(rel);
            if entry.file_type()?.is_dir() {
                recurse(&p, &out_path)?;
            } else if out_path.extension().and_then(|e| e.to_str()) == Some("tmpl") {
                // Template file: render placeholders, drop the .tmpl suffix.
                let rendered_path = out_path.with_extension("");
                let s = fs::read_to_string(&p).with_context(|| format!("read {}", p.display()))?;
                let s = replace_placeholders(&s);
                if let Some(dir) = rendered_path.parent() {
                    fs::create_dir_all(dir)?;
                }
                fs::write(&rendered_path, s).with_context(|| format!("write {}", rendered_path.display()))?;
            } else {
                // Plain file: byte-for-byte copy.
                if let Some(dir) = out_path.parent() {
                    fs::create_dir_all(dir)?;
                }
                fs::copy(&p, &out_path)
                    .map(|_| ())
                    .with_context(|| format!("copy {} -> {}", p.display(), out_path.display()))?;
            }
        }
        Ok(())
    }
    if !src.exists() {
        bail!("missing template dir {}", src.display());
    }
    recurse(src, dst)
}
|
||||
|
||||
/// Substitute template placeholders. Currently only
/// `{SPACETIME_SDK_VERSION}`, taken from the environment variable of the
/// same name, defaulting to "1.5.0".
fn replace_placeholders(s: &str) -> String {
    let sdk_version = match env::var("SPACETIME_SDK_VERSION") {
        Ok(v) => v,
        Err(_) => String::from("1.5.0"),
    };
    s.replace("{SPACETIME_SDK_VERSION}", &sdk_version)
}
|
||||
|
||||
fn inject_rust(root: &Path, llm_code: &str) -> anyhow::Result<()> {
|
||||
let lib = root.join("src/lib.rs");
|
||||
ensure_parent(&lib)?;
|
||||
let mut contents = fs::read_to_string(&lib).unwrap_or_default();
|
||||
let marker = "/*__LLM_CODE__*/";
|
||||
let cleaned = normalize_source(llm_code);
|
||||
|
||||
if let Some(idx) = contents.find(marker) {
|
||||
contents.replace_range(idx..idx + marker.len(), &cleaned);
|
||||
} else {
|
||||
if !contents.ends_with('\n') {
|
||||
contents.push('\n');
|
||||
}
|
||||
contents.push_str(&cleaned);
|
||||
}
|
||||
fs::write(&lib, contents).with_context(|| format!("write {}", lib.display()))
|
||||
}
|
||||
|
||||
fn inject_csharp(root: &Path, llm_code: &str) -> anyhow::Result<()> {
|
||||
let prog = root.join("Lib.cs");
|
||||
ensure_parent(&prog)?;
|
||||
let mut contents = fs::read_to_string(&prog).unwrap_or_default();
|
||||
let marker = "//__LLM_CODE__";
|
||||
let cleaned = normalize_source(llm_code);
|
||||
|
||||
if let Some(idx) = contents.find(marker) {
|
||||
contents.replace_range(idx..idx + marker.len(), &cleaned);
|
||||
} else {
|
||||
if !contents.ends_with('\n') {
|
||||
contents.push('\n');
|
||||
}
|
||||
contents.push_str(&cleaned);
|
||||
}
|
||||
fs::write(&prog, contents).with_context(|| format!("write {}", prog.display()))
|
||||
}
|
||||
|
||||
/// Remove leading/trailing Markdown fences like ```rust ... ``` or ~~~
/// Keeps the inner text intact. Always returns an owned String.
fn strip_code_fences(input: &str) -> String {
    let trimmed_input = input.trim();
    let is_fenced = trimmed_input.starts_with("```") || trimmed_input.starts_with("~~~");
    if !is_fenced {
        return trimmed_input.to_owned();
    }
    // Drop the opening fence line (which may carry a language tag).
    let body = trimmed_input.lines().skip(1).collect::<Vec<_>>().join("\n");
    // Strip a trailing closing fence, if one is present.
    let tail = body.trim_end();
    let without_close = tail
        .strip_suffix("```")
        .or_else(|| tail.strip_suffix("~~~"))
        .unwrap_or(tail);
    without_close.trim().to_owned()
}
|
||||
|
||||
fn normalize_source(input: &str) -> String {
|
||||
let mut out = strip_code_fences(input).replace("\r\n", "\n");
|
||||
out = out.trim_end().to_string();
|
||||
out.push('\n');
|
||||
out
|
||||
}
|
||||
|
||||
/// Create `p`'s parent directory (and ancestors) if it has one.
fn ensure_parent(p: &Path) -> io::Result<()> {
    match p.parent() {
        Some(dir) => fs::create_dir_all(dir),
        None => Ok(()),
    }
}
|
||||
@@ -0,0 +1,125 @@
|
||||
use crate::eval::{Lang, ScoreDetails};
|
||||
use crate::llm::types::Vendor;
|
||||
use crate::llm::{LlmProvider, ModelRoute};
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::path::{Path, PathBuf};
|
||||
use thiserror::Error;
|
||||
|
||||
/// Parameters for publishing a module (golden or LLM-generated).
pub struct PublishParams<'a> {
    /// Target module language.
    pub lang: Lang,
    /// Category slug (parent directory name of the task).
    pub category: &'a str,
    /// Task slug, e.g. `t_001`.
    pub task_id: &'a str,
    /// Sanitized model-route tag; empty string for golden publishes.
    pub route_tag: &'a str,
    /// Module source code to inject into the project template.
    pub source_text: &'a str,
    /// Database name to publish under (already sanitized by the caller).
    pub db_name: String,
    /// Spacetime server; `None` falls back to the "local" alias.
    pub host: Option<String>,
}
|
||||
|
||||
/// Result of running one task against one model route; serialized into the
/// results JSON via `results_merge`.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RunOutcome {
    /// Run hash propagated from the run context; also used as the mode hash.
    pub hash: String,
    /// Task slug, e.g. `t_001`.
    pub task: String,
    /// Language name ("rust" / "csharp").
    pub lang: String,
    // NOTE(review): `run_one` sets this from the *LLM* module's publish
    // result, not the golden's — confirm intended semantics.
    pub golden_published: bool,
    /// Route display name of the model that was exercised.
    pub model_name: String,
    /// Number of scorers run for this task.
    pub total_tests: u32,
    /// Number of scorers that passed.
    pub passed_tests: u32,

    /// Raw module source returned by the LLM, when available.
    pub llm_output: Option<String>,
    /// Category slug of the task.
    pub category: Option<String>,
    /// Provider-facing model identifier for the route.
    pub route_api_model: Option<String>,
    /// Database name the golden module was published under.
    pub golden_db: Option<String>,
    /// Database name the LLM module was published under.
    pub llm_db: Option<String>,
    /// Work directory used for the golden build (stringified path).
    pub work_dir_golden: Option<String>,
    /// Work directory used for the LLM build (stringified path).
    pub work_dir_llm: Option<String>,
    /// Per-scorer results keyed by scorer id (plus a synthetic
    /// "publish_error" entry when the publish failed).
    pub scorer_details: Option<HashMap<String, ScoreDetails>>,

    // Defaults to "" when absent in older results files.
    #[serde(default)]
    pub vendor: String,

    /// Wall-clock start of the run.
    pub started_at: Option<DateTime<Utc>>,
    /// Wall-clock end of the run.
    pub finished_at: Option<DateTime<Utc>>,
}
|
||||
|
||||
/// Filesystem layout of a single benchmark task.
pub struct TaskPaths {
    /// Task directory (`<bench_root>/<category>/<task>`).
    pub root: PathBuf,
    /// Golden C# server project directory (`answers/csharp/server`).
    pub answers_csharp: PathBuf,
    /// Golden Rust server project directory (`answers/rust/server`).
    pub answers_rust: PathBuf,
}
|
||||
|
||||
/// All outcomes produced for a single model route.
pub struct RouteRun {
    /// Route display name.
    pub route_name: String,
    /// Provider-facing model identifier.
    pub api_model: String,
    /// Per-task results for this route.
    pub outcomes: Vec<RunOutcome>,
}
|
||||
|
||||
/// Error from `TaskRunner::run_one`.
#[derive(Debug, Error)]
pub enum RunOneError {
    /// Failure after the LLM produced output; carries that output so the
    /// failure outcome can still record it.
    #[error("{msg}")]
    WithOutput { msg: String, llm_output: String },
    /// Any other failure (prompt building, timeout, provider error, ...).
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}
|
||||
|
||||
/// Per-task execution context handed to `TaskRunner::run_one`.
pub struct RunContext<'a> {
    /// Language name ("rust" / "csharp").
    pub lang_name: &'a str,
    /// Target language.
    pub lang: Lang,
    /// Model route (vendor + model) being exercised.
    pub route: &'a ModelRoute,
    /// Context string used when building the segmented prompt.
    pub context: &'a str,
    /// Run hash recorded on every outcome.
    pub hash: &'a str,
    /// LLM provider used to generate module source.
    pub llm: &'a dyn LlmProvider,
    /// Spacetime server; `None` falls back to the "local" alias.
    pub host: Option<String>,
}
|
||||
|
||||
impl<'a> RunContext<'a> {
    /// Plain field-for-field constructor.
    pub fn new(
        lang_name: &'a str,
        lang: Lang,
        route: &'a ModelRoute,
        context: &'a str,
        hash: &'a str,
        llm: &'a dyn LlmProvider,
        host: Option<String>,
    ) -> Self {
        Self {
            lang_name,
            lang,
            route,
            context,
            hash,
            llm,
            host,
        }
    }
}
|
||||
|
||||
/// Batch-level context for running all (or selected) tasks of one language
/// against one model route.
pub struct BenchRunContext<'a> {
    /// Root directory containing category/task folders.
    pub bench_root: &'a Path,
    /// Mode name the outcomes are merged under in the results file.
    pub mode: &'a str,
    /// Run hash recorded on every outcome.
    pub hash: &'a str,
    /// Model route (vendor + model) being exercised.
    pub route: &'a ModelRoute,
    /// Context string used when building prompts.
    pub context: &'a str,
    /// LLM provider used to generate module source.
    pub llm: &'a dyn LlmProvider,
    /// Target language.
    pub lang: Lang,
    /// Optional task selectors ("1", "07", "t_001"); `None`/empty = all tasks.
    pub selectors: Option<&'a [String]>,
    /// Spacetime server; `None` falls back to the "local" alias.
    pub host: Option<String>,
    /// Results file the outcomes are merged into.
    pub details_path: PathBuf,
}
|
||||
|
||||
/// Top-level (CLI-derived) configuration for a benchmark run.
pub struct RunConfig {
    // Modes to run; presumably `None` means the default mode set — confirm
    // against the CLI layer.
    pub modes: Option<Vec<String>>,
    /// When true, compute/report hashes only.
    pub hash_only: bool,
    /// When true, build and publish goldens without running models.
    pub goldens_only: bool,
    /// Target language.
    pub lang: Lang,
    /// Restrict to these vendors; `None` = all configured vendors.
    pub providers_filter: Option<HashSet<Vendor>>,
    /// Optional task selectors ("1", "07", "t_001"); `None` = all tasks.
    pub selectors: Option<Vec<String>>,
    // Force re-running; exact semantics live in the runner CLI — confirm.
    pub force: bool,
    /// Restrict to these category slugs; `None` = all categories.
    pub categories: Option<HashSet<String>>,
    /// Per-vendor allow-list of model names; `None` = all models.
    pub model_filter: Option<HashMap<Vendor, HashSet<String>>>,
    /// Spacetime server; `None` falls back to the "local" alias.
    pub host: Option<String>,
    /// Path to the details.json file where results will be merged
    pub details_path: PathBuf,
}
|
||||
@@ -0,0 +1,130 @@
|
||||
use std::env;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::Duration;
|
||||
|
||||
/// Normalize an arbitrary string into a valid database name: lowercase
/// `[a-z0-9-]`, no repeated or boundary hyphens, and a guaranteed
/// alphanumeric first character ("db" is prefixed when needed).
pub fn sanitize_db_name(raw: &str) -> String {
    // Lowercase and map every character outside [a-z0-9] to '-'.
    let mapped: String = raw
        .to_ascii_lowercase()
        .chars()
        .map(|c| {
            if c.is_ascii_lowercase() || c.is_ascii_digit() {
                c
            } else {
                '-'
            }
        })
        .collect();

    // Collapse runs of '-' into a single one.
    let mut out = String::with_capacity(mapped.len());
    let mut prev_dash = false;
    for ch in mapped.chars() {
        if ch == '-' {
            if !prev_dash {
                out.push('-');
            }
            prev_dash = true;
        } else {
            out.push(ch);
            prev_dash = false;
        }
    }

    // Trim boundary hyphens in one pass; the original looped with
    // `out.remove(0)`, which is O(n) per removed character.
    let mut out = out.trim_matches('-').to_string();

    // Everything left is [a-z0-9] separated by single '-', so only an empty
    // result can lack a valid leading character; keep the defensive check.
    if out.is_empty() || !out.chars().next().unwrap().is_ascii_alphanumeric() {
        out.insert_str(0, "db");
    }

    out
}
|
||||
|
||||
/// Build the per-run server working directory:
/// `<target>/llm-runs/<category>/<task>/<lang>/server/<route_tag>/<phase>`.
/// Honors `CARGO_TARGET_DIR`, defaulting to `target`.
pub fn work_server_dir_scoped(category: &str, task: &str, lang: &str, phase: &str, route_tag: &str) -> PathBuf {
    let target_dir = env::var("CARGO_TARGET_DIR").unwrap_or_else(|_| String::from("target"));
    let mut dir = PathBuf::from(target_dir);
    for part in ["llm-runs", category, task, lang, "server", route_tag, phase] {
        dir.push(part);
    }
    dir
}
|
||||
|
||||
/// Maximum number of characters of LLM output to print, configurable via
/// `LLM_OUTPUT_MAX_CHARS`; unset or unparsable values fall back to 2000.
pub fn max_chars() -> usize {
    match env::var("LLM_OUTPUT_MAX_CHARS") {
        Ok(raw) => raw.parse().unwrap_or(2000),
        Err(_) => 2000,
    }
}
|
||||
|
||||
pub fn print_llm_output(model: &str, task: &str, s: &str) {
|
||||
let limit = max_chars();
|
||||
let mut end = s.len().min(limit);
|
||||
while !s.is_char_boundary(end) {
|
||||
end -= 1;
|
||||
}
|
||||
let s = &s[..end];
|
||||
println!("\n===== {} :: {} =====\n{}\n===== end =====\n", model, task, s);
|
||||
}
|
||||
|
||||
/// Final path component of `p` as a `String`; empty when the path has no
/// file name or it is not valid UTF-8.
pub fn task_slug(p: &Path) -> String {
    match p.file_name().and_then(|os| os.to_str()) {
        Some(name) => name.to_string(),
        None => String::new(),
    }
}
|
||||
/// Name of the parent directory of `p`; empty when it cannot be determined
/// (no parent, no file name, or non-UTF-8).
pub fn category_slug(p: &Path) -> String {
    let parent_name = p
        .parent()
        .and_then(|dir| dir.file_name())
        .and_then(|os| os.to_str());
    parent_name.unwrap_or_default().to_string()
}
|
||||
|
||||
/// True when `LLM_DEBUG` is set to "1", "true", or "yes".
pub fn debug_llm() -> bool {
    match env::var("LLM_DEBUG") {
        Ok(v) => v == "1" || v == "true" || v == "yes",
        Err(_) => false,
    }
}
|
||||
|
||||
/// True when `LLM_DEBUG_VERBOSE` is set to "1", "true", or "yes".
pub fn debug_llm_verbose() -> bool {
    match env::var("LLM_DEBUG_VERBOSE") {
        Ok(v) => v == "1" || v == "true" || v == "yes",
        Err(_) => false,
    }
}
|
||||
|
||||
/// Read a `usize` from environment variable `key`, falling back to
/// `default` when unset or unparsable. Shared by the three concurrency
/// knobs below, which previously duplicated this logic.
fn env_usize(key: &str, default: usize) -> usize {
    env::var(key).ok().and_then(|s| s.parse().ok()).unwrap_or(default)
}

/// Number of benchmark tasks executed concurrently
/// (`LLM_BENCH_CONCURRENCY`, default 8).
pub fn bench_concurrency() -> usize {
    env_usize("LLM_BENCH_CONCURRENCY", 8)
}

/// Concurrency for C# builds. Lower default than Rust due to dotnet/WASI SDK
/// instability under high parallelism (causes SIGSEGV and "Pipe is broken" errors).
/// Override with `LLM_BENCH_CSHARP_CONCURRENCY` (default 2).
pub fn bench_csharp_concurrency() -> usize {
    env_usize("LLM_BENCH_CSHARP_CONCURRENCY", 2)
}

/// Number of model routes run concurrently
/// (`LLM_BENCH_ROUTE_CONCURRENCY`, default 2).
pub fn bench_route_concurrency() -> usize {
    env_usize("LLM_BENCH_ROUTE_CONCURRENCY", 2)
}
|
||||
|
||||
/// Human-readable duration: "<n> ms" under a second, "<x.xx> s" under a
/// minute, otherwise "<m>m <s.s>s".
pub fn fmt_dur(d: Duration) -> String {
    let secs = d.as_secs_f64();
    if secs < 1.0 {
        return format!("{} ms", d.as_millis());
    }
    if secs < 60.0 {
        return format!("{:.2} s", secs);
    }
    let minutes = (secs / 60.0).floor() as u64;
    let leftover = secs - (minutes as f64) * 60.0;
    format!("{}m {:.1}s", minutes, leftover)
}
|
||||
+19
@@ -0,0 +1,19 @@
|
||||
using SpacetimeDB;

// Golden answer: five empty reducers covering zero through three arguments
// of mixed primitive types. Bodies are intentionally empty — the benchmark
// only checks the published reducer signatures.
public static partial class Module
{
    [Reducer]
    public static void EmptyReducer_NoArgs(ReducerContext ctx) { }

    [Reducer]
    public static void EmptyReducer_WithInt(ReducerContext ctx, int count) { }

    [Reducer]
    public static void EmptyReducer_WithString(ReducerContext ctx, string name) { }

    [Reducer]
    public static void EmptyReducer_WithTwoArgs(ReducerContext ctx, int count, string name) { }

    [Reducer]
    public static void EmptyReducer_WithThreeArgs(ReducerContext ctx, bool active, float ratio, string label) { }
}
|
||||
@@ -0,0 +1,31 @@
|
||||
use spacetimedb::{reducer, ReducerContext};

// Golden answer: five empty reducers covering zero through three arguments
// of mixed primitive types. Bodies are intentionally empty — the benchmark
// only checks the published reducer signatures.

#[reducer]
pub fn empty_reducer_no_args(ctx: &ReducerContext) -> Result<(), String> {
    Ok(())
}

#[reducer]
pub fn empty_reducer_with_int(ctx: &ReducerContext, count: i32) -> Result<(), String> {
    Ok(())
}

#[reducer]
pub fn empty_reducer_with_string(ctx: &ReducerContext, name: String) -> Result<(), String> {
    Ok(())
}

#[reducer]
pub fn empty_reducer_with_two_args(ctx: &ReducerContext, count: i32, name: String) -> Result<(), String> {
    Ok(())
}

#[reducer]
pub fn empty_reducer_with_three_args(
    ctx: &ReducerContext,
    active: bool,
    ratio: f32,
    label: String,
) -> Result<(), String> {
    Ok(())
}
|
||||
@@ -0,0 +1,8 @@
|
||||
use crate::eval::defaults::default_schema_parity_scorers;
|
||||
use crate::eval::BenchmarkSpec;
|
||||
|
||||
pub fn spec() -> BenchmarkSpec {
|
||||
BenchmarkSpec::from_tasks_auto(file!(), |_lang, route_tag, host_url| {
|
||||
default_schema_parity_scorers(host_url, file!(), route_tag)
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,8 @@
|
||||
Write a SpacetimeDB backend module in C# that defines only these five empty reducers.
|
||||
|
||||
REDUCERS
|
||||
- EmptyReducer_NoArgs: no arguments, returns void, empty body
|
||||
- EmptyReducer_WithInt: (int count), returns void, empty body
|
||||
- EmptyReducer_WithString: (string name), returns void, empty body
|
||||
- EmptyReducer_WithTwoArgs: (int count, string name), returns void, empty body
|
||||
- EmptyReducer_WithThreeArgs: (bool active, float ratio, string label), returns void, empty body
|
||||
@@ -0,0 +1,8 @@
|
||||
Write a SpacetimeDB backend module in Rust that defines only these five empty reducers.
|
||||
|
||||
REDUCERS
|
||||
- empty_reducer_no_args: no arguments, returns (), empty body
|
||||
- empty_reducer_with_int: (count: i32), returns (), empty body
|
||||
- empty_reducer_with_string: (name: String), returns (), empty body
|
||||
- empty_reducer_with_two_args: (count: i32, name: String), returns (), empty body
|
||||
- empty_reducer_with_three_args: (active: bool, ratio: f32, label: String), returns (), empty body
|
||||
@@ -0,0 +1,31 @@
|
||||
using SpacetimeDB;

// Golden answer: three tables with basic primitive columns and integer
// primary keys. No reducers — the benchmark compares published schemas.
public static partial class Module
{
    [Table(Name = "users")]
    public partial struct Users
    {
        [PrimaryKey] public int Id;
        public string Name;
        public int Age;
        public bool Active;
    }

    [Table(Name = "products")]
    public partial struct Products
    {
        [PrimaryKey] public int Id;
        public string Title;
        public float Price;
        public bool InStock;
    }

    [Table(Name = "notes")]
    public partial struct Notes
    {
        [PrimaryKey] public int Id;
        public string Body;
        public long Rating;
        public bool Pinned;
    }
}
|
||||
@@ -0,0 +1,28 @@
|
||||
use spacetimedb::table;

// Golden answer: three tables with basic primitive columns and integer
// primary keys. No reducers — the benchmark compares published schemas.

#[table(name = users)]
pub struct Users {
    #[primary_key]
    pub id: i32,
    pub name: String,
    pub age: i32,
    pub active: bool,
}

#[table(name = products)]
pub struct Products {
    #[primary_key]
    pub id: i32,
    pub title: String,
    pub price: f32,
    pub in_stock: bool,
}

#[table(name = notes)]
pub struct Notes {
    #[primary_key]
    pub id: i32,
    pub body: String,
    pub rating: i64,
    pub pinned: bool,
}
|
||||
@@ -0,0 +1,8 @@
|
||||
use crate::eval::defaults::default_schema_parity_scorers;
|
||||
use crate::eval::BenchmarkSpec;
|
||||
|
||||
pub fn spec() -> BenchmarkSpec {
|
||||
BenchmarkSpec::from_tasks_auto(file!(), |_lang, route_tag, host_url| {
|
||||
default_schema_parity_scorers(host_url, file!(), route_tag)
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
Write a SpacetimeDB backend module in C# that defines three tables with basic columns.
|
||||
|
||||
TABLES
|
||||
- users
|
||||
- Struct: Users
|
||||
- Fields:
|
||||
- Id: int (primary key)
|
||||
- Name: string
|
||||
- Age: int
|
||||
- Active: bool
|
||||
|
||||
- products
|
||||
- Struct: Products
|
||||
- Fields:
|
||||
- Id: int (primary key)
|
||||
- Title: string
|
||||
- Price: float
|
||||
- InStock: bool
|
||||
|
||||
- notes
|
||||
- Struct: Notes
|
||||
- Fields:
|
||||
- Id: int (primary key)
|
||||
- Body: string
|
||||
- Rating: long
|
||||
- Pinned: bool
|
||||
@@ -0,0 +1,26 @@
|
||||
Write a SpacetimeDB backend module in Rust that defines three tables with basic columns.
|
||||
|
||||
TABLES
|
||||
- users
|
||||
  - Struct: Users
|
||||
- Fields:
|
||||
- id: i32 (primary key)
|
||||
- name: String
|
||||
- age: i32
|
||||
- active: bool
|
||||
|
||||
- products
|
||||
  - Struct: Products
|
||||
- Fields:
|
||||
- id: i32 (primary key)
|
||||
- title: String
|
||||
- price: f32
|
||||
- in_stock: bool
|
||||
|
||||
- notes
|
||||
  - Struct: Notes
|
||||
- Fields:
|
||||
- id: i32 (primary key)
|
||||
- body: String
|
||||
- rating: i64
|
||||
- pinned: bool
|
||||
+24
@@ -0,0 +1,24 @@
|
||||
using SpacetimeDB;

// Golden answer: a scheduled table wired to the Tick reducer, seeded by Init
// with a single repeating 50ms interval row.
public static partial class Module
{
    [Table(Name = "tick_timer", Scheduled = nameof(Tick), ScheduledAt = nameof(TickTimer.ScheduledAt))]
    public partial struct TickTimer
    {
        [PrimaryKey, AutoInc] public ulong ScheduledId;
        public ScheduleAt ScheduledAt;
    }

    // Scheduled reducer invoked by the tick_timer rows; body intentionally empty.
    [Reducer]
    public static void Tick(ReducerContext ctx, TickTimer timer) { }

    [Reducer(ReducerKind.Init)]
    public static void Init(ReducerContext ctx)
    {
        // 50_000 microseconds = 50 ms repeating interval.
        var interval = new TimeDuration { Microseconds = 50_000 };
        ctx.Db.tick_timer.Insert(new TickTimer
        {
            ScheduledAt = new ScheduleAt.Interval(interval)
        });
    }
}
|
||||
@@ -0,0 +1,24 @@
|
||||
use spacetimedb::{reducer, table, ReducerContext, ScheduleAt, Table};
use std::time::Duration;

// Golden answer: a scheduled table wired to the `tick` reducer, seeded by
// `init` with a single repeating 50ms interval row.

#[table(name = tick_timer, scheduled(tick))]
pub struct TickTimer {
    #[primary_key]
    #[auto_inc]
    scheduled_id: u64,
    scheduled_at: ScheduleAt,
}

// Scheduled reducer invoked by tick_timer rows; body intentionally empty.
#[reducer]
pub fn tick(_ctx: &ReducerContext, _row: TickTimer) -> Result<(), String> {
    Ok(())
}

#[reducer(init)]
pub fn init(ctx: &ReducerContext) -> Result<(), String> {
    ctx.db.tick_timer().insert(TickTimer {
        // 0 defers to the auto_inc sequence for the actual id.
        scheduled_id: 0,
        // 50 ms repeating interval (matches the prompt's 50_000 microseconds).
        scheduled_at: ScheduleAt::Interval(Duration::from_millis(50).into()),
    });
    Ok(())
}
|
||||
@@ -0,0 +1,8 @@
|
||||
use crate::eval::defaults::default_schema_parity_scorers;
|
||||
use crate::eval::BenchmarkSpec;
|
||||
|
||||
pub fn spec() -> BenchmarkSpec {
|
||||
BenchmarkSpec::from_tasks_auto(file!(), |_lang, route_tag, host_url| {
|
||||
default_schema_parity_scorers(host_url, file!(), route_tag)
|
||||
})
|
||||
}
|
||||
+15
@@ -0,0 +1,15 @@
|
||||
Write a SpacetimeDB backend module in C# that defines a scheduled table and a scheduled reducer.
|
||||
|
||||
TABLE
|
||||
- tick_timer
|
||||
- Struct: TickTimer
|
||||
- Fields:
|
||||
- ScheduledId: ulong (primary key, auto-increment)
|
||||
- ScheduledAt: ScheduleAt
|
||||
- Scheduling:
|
||||
- Reducer: Tick
|
||||
- Column: ScheduledAt
|
||||
|
||||
REDUCERS
|
||||
- Tick: scheduled reducer triggered by tick_timer
|
||||
- Init: insert exactly one row into tick_timer that schedules a repeating interval of 50_000 microseconds
|
||||
@@ -0,0 +1,15 @@
|
||||
Write a SpacetimeDB backend module in Rust that defines a scheduled table and a scheduled reducer.
|
||||
|
||||
TABLE
|
||||
- tick_timer
|
||||
- Struct: TickTimer
|
||||
- Fields:
|
||||
- scheduled_id: u64 (primary key, auto-increment)
|
||||
- scheduled_at: ScheduleAt
|
||||
- Scheduling:
|
||||
- reducer: tick
|
||||
- column: scheduled_at
|
||||
|
||||
REDUCERS
|
||||
- tick: scheduled reducer triggered by tick_timer
|
||||
- init: insert exactly one row into tick_timer that schedules a repeating interval of 50_000 microseconds
|
||||
+18
@@ -0,0 +1,18 @@
|
||||
using SpacetimeDB;

// Golden answer: a custom [Type] struct embedded as a column in a table.
public static partial class Module
{
    [Type]
    public partial struct Position
    {
        public int X;
        public int Y;
    }

    [Table(Name = "entities")]
    public partial struct Entity
    {
        [PrimaryKey] public int Id;
        public Position Pos;
    }
}
|
||||
@@ -0,0 +1,15 @@
|
||||
use spacetimedb::{table, SpacetimeType};

// Golden answer: a custom SpacetimeType struct embedded as a column in a table.

#[derive(SpacetimeType, Clone, Debug)]
pub struct Position {
    pub x: i32,
    pub y: i32,
}

#[table(name = entities)]
pub struct Entity {
    #[primary_key]
    pub id: i32,
    pub pos: Position,
}
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
use crate::eval::defaults::default_schema_parity_scorers;
|
||||
use crate::eval::BenchmarkSpec;
|
||||
|
||||
pub fn spec() -> BenchmarkSpec {
|
||||
BenchmarkSpec::from_tasks_auto(file!(), |_lang, route_tag, host_url| {
|
||||
default_schema_parity_scorers(host_url, file!(), route_tag)
|
||||
})
|
||||
}
|
||||
+14
@@ -0,0 +1,14 @@
|
||||
Write a SpacetimeDB backend module in C# that defines a struct type and uses it in a table.
|
||||
|
||||
TYPES
|
||||
- Struct: Position
|
||||
- Fields:
|
||||
- X: int
|
||||
- Y: int
|
||||
|
||||
TABLE
|
||||
- entities
|
||||
- Struct: Entity
|
||||
- Fields:
|
||||
- Id: int (primary key)
|
||||
- Pos: Position
|
||||
@@ -0,0 +1,14 @@
|
||||
Write a SpacetimeDB backend module in Rust that defines a struct type and uses it in a table.
|
||||
|
||||
TYPES
|
||||
- Struct: Position
|
||||
- Fields:
|
||||
- x: i32
|
||||
- y: i32
|
||||
|
||||
TABLE
|
||||
- entities
|
||||
- Struct: Entity
|
||||
- Fields:
|
||||
- id: i32 (primary key)
|
||||
- pos: Position
|
||||
@@ -0,0 +1,19 @@
|
||||
using SpacetimeDB;

// Golden answer: one table plus a reducer that inserts a single row from its
// arguments.
public static partial class Module
{
    [Table(Name = "users")]
    public partial struct User
    {
        [PrimaryKey] public int Id;
        public string Name;
        public int Age;
        public bool Active;
    }

    [Reducer]
    public static void InsertUser(ReducerContext ctx, int id, string name, int age, bool active)
    {
        ctx.Db.users.Insert(new User { Id = id, Name = name, Age = age, Active = active });
    }
}
|
||||
@@ -0,0 +1,16 @@
|
||||
use spacetimedb::{reducer, table, ReducerContext, Table};

// Golden answer: one table plus a reducer that inserts a single row from its
// arguments.
// NOTE(review): the struct is named `Users` here while the Rust prompt asks
// for `User` (sibling tasks also use `User`) — confirm the schema-parity
// scorer ignores type names, or align the two.
#[table(name = users)]
pub struct Users {
    #[primary_key]
    pub id: i32,
    pub name: String,
    pub age: i32,
    pub active: bool,
}

#[reducer]
pub fn insert_user(ctx: &ReducerContext, id: i32, name: String, age: i32, active: bool) -> Result<(), String> {
    ctx.db.users().insert(Users { id, name, age, active });
    Ok(())
}
|
||||
@@ -0,0 +1,31 @@
|
||||
use crate::eval::defaults::{default_schema_parity_scorers, make_reducer_data_parity_scorer};
use crate::eval::{casing_for_lang, ident, BenchmarkSpec, ReducerDataParityConfig, SqlBuilder};
use serde_json::Value;
use std::time;

/// Spec for the insert-user task: schema parity plus a data-parity scorer
/// that calls the insert reducer with a fixed row and compares the SELECT
/// result between golden and LLM modules.
pub fn spec() -> BenchmarkSpec {
    BenchmarkSpec::from_tasks_auto(file!(), |lang, route_tag, host_url| {
        let mut v = default_schema_parity_scorers(host_url, file!(), route_tag);
        // Identifier casing differs per language (snake_case vs PascalCase);
        // SqlBuilder and ident() adapt the query/reducer names accordingly.
        let casing = casing_for_lang(lang);
        let sb = SqlBuilder::new(casing);
        let select = sb.select_by_id("users", &["id","name","age","active"], "id", 1);
        let reducer_name = ident("InsertUser", casing);

        v.push(make_reducer_data_parity_scorer(host_url, ReducerDataParityConfig {
            src_file: file!(),
            route_tag,
            reducer: reducer_name.into(),
            // Fixed row the reducer must insert: (1, "Alice", 30, true).
            args: vec![
                Value::from(1),
                Value::from("Alice"),
                Value::from(30),
                Value::from(true),
            ],
            select_query: select.clone(),
            id_str: "data_parity_insert_user",
            collapse_ws: true,
            timeout: time::Duration::from_secs(10),
        }));
        v
    })
}
|
||||
@@ -0,0 +1,14 @@
|
||||
Write a SpacetimeDB backend module in C# that defines one table and a reducer that inserts a row.
|
||||
|
||||
TABLE
|
||||
- users
|
||||
- Struct: User
|
||||
- Fields:
|
||||
- Id: int (primary key)
|
||||
- Name: string
|
||||
- Age: int
|
||||
- Active: bool
|
||||
|
||||
REDUCERS
|
||||
- InsertUser: given id:int, name:string, age:int, active:bool, insert exactly one row into users
|
||||
- (Id=id, Name=name, Age=age, Active=active)
|
||||
@@ -0,0 +1,14 @@
|
||||
Write a SpacetimeDB backend module in Rust that defines one table and a reducer that inserts a row.
|
||||
|
||||
TABLE
|
||||
- users
|
||||
- Struct: User
|
||||
- Fields:
|
||||
- id: i32 (primary key)
|
||||
- name: String
|
||||
- age: i32
|
||||
- active: bool
|
||||
|
||||
REDUCERS
|
||||
- insert_user: given id:i32, name:String, age:i32, active:bool, insert exactly one row into users
|
||||
- (id=id, name=name, age=age, active=active)
|
||||
@@ -0,0 +1,19 @@
|
||||
using SpacetimeDB;

// Golden answer: one table plus a reducer that overwrites the row with the
// matching primary key via the Id unique index.
public static partial class Module
{
    [Table(Name = "users")]
    public partial struct User
    {
        [PrimaryKey] public int Id;
        public string Name;
        public int Age;
        public bool Active;
    }

    [Reducer]
    public static void UpdateUser(ReducerContext ctx, int id, string name, int age, bool active)
    {
        ctx.Db.users.Id.Update(new User { Id = id, Name = name, Age = age, Active = active });
    }
}
|
||||
@@ -0,0 +1,15 @@
|
||||
use spacetimedb::{reducer, table, ReducerContext};

// Golden answer: one table plus a reducer that overwrites the row with the
// matching primary key via the id unique index.

#[table(name = users)]
pub struct User {
    #[primary_key]
    pub id: i32,
    pub name: String,
    pub age: i32,
    pub active: bool,
}

#[reducer]
pub fn update_user(ctx: &ReducerContext, id: i32, name: String, age: i32, active: bool) {
    ctx.db.users().id().update(User { id, name, age, active });
}
|
||||
@@ -0,0 +1,47 @@
|
||||
use crate::eval::defaults::{
    default_schema_parity_scorers,
    make_reducer_data_parity_scorer,
    make_sql_exec_both_scorer,
};
use crate::eval::{casing_for_lang, ident, BenchmarkSpec, ReducerDataParityConfig, SqlBuilder};
use serde_json::Value;
use std::time;

/// Spec for the update-user task: schema parity, then seed a row in both
/// modules, then call the update reducer and compare the updated row.
pub fn spec() -> BenchmarkSpec {
    BenchmarkSpec::from_tasks_auto(file!(), |lang, route_tag, host_url| {
        let mut v = default_schema_parity_scorers(host_url, file!(), route_tag);

        // Identifier casing differs per language (snake_case vs PascalCase).
        let casing = casing_for_lang(lang);
        let sb = SqlBuilder::new(casing);
        let seed = sb.insert_values("users", &["id","name","age","active"], &["1","'Alice'","30","true"]);
        let select = sb.select_by_id("users", &["id","name","age","active"], "id", 1);
        let reducer_name = ident("UpdateUser", casing);

        // Order matters: the seed scorer is pushed first so the update
        // reducer has an existing row (id=1) to modify.
        v.push(make_sql_exec_both_scorer(
            host_url,
            file!(),
            route_tag,
            &seed,
            "seed_users_row",
            time::Duration::from_secs(10),
        ));

        v.push(make_reducer_data_parity_scorer(host_url, ReducerDataParityConfig {
            src_file: file!(),
            route_tag,
            reducer: reducer_name.into(),
            // New values the row with id=1 must end up with.
            args: vec![
                Value::from(1),
                Value::from("Alice2"),
                Value::from(31),
                Value::from(false),
            ],
            select_query: select.clone(),
            id_str: "data_parity_update_user",
            collapse_ws: true,
            timeout: time::Duration::from_secs(10),
        }));

        v
    })
}
|
||||
@@ -0,0 +1,14 @@
|
||||
Write a SpacetimeDB backend module in C# that defines one table and a reducer that updates a row.
|
||||
|
||||
TABLE
|
||||
- users
|
||||
- Struct: User
|
||||
- Fields:
|
||||
- Id: int (primary key)
|
||||
- Name: string
|
||||
- Age: int
|
||||
- Active: bool
|
||||
|
||||
REDUCERS
|
||||
- UpdateUser: given id:int, name:string, age:int, active:bool, update the row in users with Id=id to exactly these values
|
||||
- (Id=id, Name=name, Age=age, Active=active)
|
||||
@@ -0,0 +1,14 @@
|
||||
Write a SpacetimeDB backend module in Rust that defines one table and a reducer that updates a row.
|
||||
|
||||
TABLE
|
||||
- users
|
||||
- Struct: User
|
||||
- Fields:
|
||||
- id: i32 (primary key)
|
||||
- name: String
|
||||
- age: i32
|
||||
- active: bool
|
||||
|
||||
REDUCERS
|
||||
- update_user: given id:i32, name:String, age:i32, active:bool, update the row in users with id=id to exactly these values
|
||||
- (id=id, name=name, age=age, active=active)
|
||||
@@ -0,0 +1,19 @@
|
||||
using SpacetimeDB;

// Golden answer: one table plus a reducer that deletes the row with the
// given primary key via the Id unique index.
public static partial class Module
{
    [Table(Name = "users")]
    public partial struct User
    {
        [PrimaryKey] public int Id;
        public string Name;
        public int Age;
        public bool Active;
    }

    [Reducer]
    public static void DeleteUser(ReducerContext ctx, int id)
    {
        ctx.Db.users.Id.Delete(id);
    }
}
|
||||
@@ -0,0 +1,15 @@
|
||||
use spacetimedb::{reducer, table, ReducerContext};

// Golden answer: one table plus a reducer that deletes the row with the
// given primary key via the id unique index.

#[table(name = users)]
pub struct User {
    #[primary_key]
    pub id: i32,
    pub name: String,
    pub age: i32,
    pub active: bool,
}

#[reducer]
pub fn delete_user(ctx: &ReducerContext, id: i32) {
    ctx.db.users().id().delete(id);
}
|
||||
@@ -0,0 +1,42 @@
|
||||
use crate::eval::defaults::{
    default_schema_parity_scorers,
    make_reducer_sql_count_scorer,
    make_sql_exec_both_scorer,
};
use crate::eval::{casing_for_lang, ident, BenchmarkSpec, ReducerSqlCountConfig, SqlBuilder};
use serde_json::Value;
use std::time;

/// Spec for the delete-user task: schema parity, then seed a row in both
/// modules, then call the delete reducer and assert the row count is zero.
pub fn spec() -> BenchmarkSpec {
    BenchmarkSpec::from_tasks_auto(file!(), |lang, route_tag, host_url| {
        let mut v = default_schema_parity_scorers(host_url, file!(), route_tag);

        // Identifier casing differs per language (snake_case vs PascalCase).
        let casing = casing_for_lang(lang);
        let sb = SqlBuilder::new(casing);
        let seed = sb.insert_values("users", &["id","name","age","active"], &["1","'Alice'","30","true"]);
        let count = sb.count_by_id("users", "id", 1);
        let reducer_name = ident("DeleteUser", casing);

        // Order matters: seed the row first so the delete has a target.
        v.push(make_sql_exec_both_scorer(
            host_url,
            file!(),
            route_tag,
            &seed,
            "seed_users_row",
            time::Duration::from_secs(10),
        ));

        // After delete_user(1), the count for id=1 must be exactly 0.
        v.push(make_reducer_sql_count_scorer(host_url, ReducerSqlCountConfig {
            src_file: file!(),
            route_tag,
            reducer: reducer_name.into(),
            args: vec![Value::from(1)],
            sql_count_query: count.clone(),
            expected_count: 0,
            id_str: "delete_user_count_zero",
            timeout: time::Duration::from_secs(10),
        }));

        v
    })
}
|
||||
@@ -0,0 +1,13 @@
|
||||
Write a SpacetimeDB backend module in C# that defines one table and a reducer that deletes a row.
|
||||
|
||||
TABLE
|
||||
- users
|
||||
- Struct: User
|
||||
- Fields:
|
||||
- Id: int (primary key)
|
||||
- Name: string
|
||||
- Age: int
|
||||
- Active: bool
|
||||
|
||||
REDUCERS
|
||||
- DeleteUser: given id:int, delete the row in users with that Id
|
||||
@@ -0,0 +1,13 @@
|
||||
Write a SpacetimeDB backend module in Rust that defines one table and a reducer that deletes a row.
|
||||
|
||||
TABLE
|
||||
- users
|
||||
- Struct: User
|
||||
- Fields:
|
||||
- id: i32 (primary key)
|
||||
- name: String
|
||||
- age: i32
|
||||
- active: bool
|
||||
|
||||
REDUCERS
|
||||
- delete_user: given id:i32, delete the row in users with that id
|
||||
@@ -0,0 +1,22 @@
|
||||
using SpacetimeDB;

// Golden answer: one table plus a single reducer that performs insert,
// insert, update, delete in a fixed order, leaving exactly one row (Id=1).
public static partial class Module
{
    [Table(Name = "users")]
    public partial struct User
    {
        [PrimaryKey] public int Id;
        public string Name;
        public int Age;
        public bool Active;
    }

    [Reducer]
    public static void Crud(ReducerContext ctx)
    {
        ctx.Db.users.Insert(new User { Id = 1, Name = "Alice", Age = 30, Active = true });
        ctx.Db.users.Insert(new User { Id = 2, Name = "Bob", Age = 22, Active = false });
        ctx.Db.users.Id.Update(new User { Id = 1, Name = "Alice2", Age = 31, Active = false });
        ctx.Db.users.Id.Delete(2);
    }
}
|
||||
@@ -0,0 +1,18 @@
|
||||
use spacetimedb::{reducer, table, ReducerContext, Table};

// Golden answer: one table plus a single reducer that performs insert,
// insert, update, delete in a fixed order, leaving exactly one row (id=1).

#[table(name = users)]
pub struct User {
    #[primary_key]
    pub id: i32,
    pub name: String,
    pub age: i32,
    pub active: bool,
}

#[reducer]
pub fn crud(ctx: &ReducerContext) {
    ctx.db.users().insert(User { id: 1, name: "Alice".into(), age: 30, active: true });
    ctx.db.users().insert(User { id: 2, name: "Bob".into(), age: 22, active: false });
    ctx.db.users().id().update(User { id: 1, name: "Alice2".into(), age: 31, active: false });
    ctx.db.users().id().delete(2);
}
|
||||
@@ -0,0 +1,36 @@
|
||||
use crate::eval::defaults::{default_schema_parity_scorers, make_reducer_data_parity_scorer, make_sql_count_only_scorer};
use crate::eval::{casing_for_lang, ident, BenchmarkSpec, ReducerDataParityConfig, SqlBuilder};
use std::time::Duration;

/// Spec for the crud task: schema parity, then call the Crud reducer (no
/// args) and verify the expected end state — row id=1 matches in both
/// modules, row id=2 was deleted, and the table holds exactly one row.
pub fn spec() -> BenchmarkSpec {
    BenchmarkSpec::from_tasks_auto(file!(), |lang, route_tag, host_url| {
        let mut v = default_schema_parity_scorers(host_url, file!(), route_tag);

        // Identifier casing differs per language (snake_case vs PascalCase).
        let casing = casing_for_lang(lang);
        let sb = SqlBuilder::new(casing);
        let reducer = ident("Crud", casing);

        let select_id1 = sb.select_by_id("users", &["id","name","age","active"], "id", 1);
        let count_id2 = sb.count_by_id("users", "id", 2);
        let count_all = "SELECT COUNT(*) AS n FROM users";

        // Runs the reducer, then compares the surviving id=1 row.
        v.push(make_reducer_data_parity_scorer(host_url, ReducerDataParityConfig {
            src_file: file!(),
            route_tag,
            reducer: reducer.into(),
            args: vec![],
            select_query: select_id1.clone(),
            id_str: "crud_row_id1_parity",
            collapse_ws: true,
            timeout: Duration::from_secs(10),
        }));
        // id=2 must have been deleted by the reducer.
        v.push(make_sql_count_only_scorer(
            host_url, file!(), route_tag, &count_id2, 0, "crud_row_id2_deleted", Duration::from_secs(10),
        ));
        // Exactly one row remains overall.
        v.push(make_sql_count_only_scorer(
            host_url, file!(), route_tag, count_all, 1, "crud_total_count_one", Duration::from_secs(10),
        ));

        v
    })
}
|
||||
@@ -0,0 +1,17 @@
|
||||
Write a SpacetimeDB backend module in C# that defines one table and a reducer that performs insert, update, and delete in one call.
|
||||
|
||||
TABLE
|
||||
- users
|
||||
- Struct: User
|
||||
- Fields:
|
||||
- Id: int (primary key)
|
||||
- Name: string
|
||||
- Age: int
|
||||
- Active: bool
|
||||
|
||||
REDUCERS
|
||||
- Crud: perform these steps in order
|
||||
- insert (Id=1, Name="Alice", Age=30, Active=true)
|
||||
- insert (Id=2, Name="Bob", Age=22, Active=false)
|
||||
- update (Id=1, Name="Alice2", Age=31, Active=false)
|
||||
- delete Id=2
|
||||
@@ -0,0 +1,17 @@
|
||||
Write a SpacetimeDB backend module in Rust that defines one table and a reducer that performs insert, update, and delete in one call.
|
||||
|
||||
TABLE
|
||||
- users
|
||||
- Struct: User
|
||||
- Fields:
|
||||
- id: i32 (primary key)
|
||||
- name: String
|
||||
- age: i32
|
||||
- active: bool
|
||||
|
||||
REDUCERS
|
||||
- crud: perform these steps in order
|
||||
- insert (id=1, name="Alice", age=30, active=true)
|
||||
- insert (id=2, name="Bob", age=22, active=false)
|
||||
- update (id=1, name="Alice2", age=31, active=false)
|
||||
- delete id=2
|
||||
@@ -0,0 +1,31 @@
|
||||
using SpacetimeDB;

// Golden answer: look up a users row via the primary-key index and, when
// found, write an (Id, Name) projection into the results table.
public static partial class Module
{
    [Table(Name = "users")]
    public partial struct User
    {
        [PrimaryKey] public int Id;
        public string Name;
        public int Age;
        public bool Active;
    }

    [Table(Name = "results")]
    public partial struct Result
    {
        [PrimaryKey] public int Id;
        public string Name;
    }

    [Reducer]
    public static void LookupUserName(ReducerContext ctx, int id)
    {
        // Find returns a nullable struct; only project when the row exists.
        var u = ctx.Db.users.Id.Find(id);
        if (u.HasValue)
        {
            var row = u.Value;
            ctx.Db.results.Insert(new Result { Id = row.Id, Name = row.Name });
        }
    }
}
|
||||
@@ -0,0 +1,24 @@
|
||||
use spacetimedb::{reducer, table, ReducerContext, Table};

// Golden answer: look up a users row via the primary-key index and, when
// found, write an (id, name) projection into the results table.

#[table(name = users)]
pub struct User {
    #[primary_key]
    pub id: i32,
    pub name: String,
    pub age: i32,
    pub active: bool,
}

// Named ResultRow (not Result) to avoid shadowing std::result::Result.
#[table(name = results)]
pub struct ResultRow {
    #[primary_key]
    pub id: i32,
    pub name: String,
}

#[reducer]
pub fn lookup_user_name(ctx: &ReducerContext, id: i32) {
    if let Some(u) = ctx.db.users().id().find(id) {
        ctx.db.results().insert(ResultRow { id: u.id, name: u.name });
    }
}
|
||||
@@ -0,0 +1,55 @@
|
||||
use crate::eval::defaults::{
    default_schema_parity_scorers,
    make_reducer_data_parity_scorer,
    make_sql_exec_both_scorer,
};
use crate::eval::{casing_for_lang, ident, BenchmarkSpec, ReducerDataParityConfig, SqlBuilder};
use serde_json::Value;
use std::time::Duration;

/// Spec for the lookup task: schema parity, then seed a users row in both
/// modules, then call the lookup reducer and compare the projected row
/// written into the results table.
pub fn spec() -> BenchmarkSpec {
    BenchmarkSpec::from_tasks_auto(file!(), |lang, route_tag, host_url| {
        let mut v = default_schema_parity_scorers(host_url, file!(), route_tag);

        // Identifier casing differs per language (snake_case vs PascalCase).
        let casing = casing_for_lang(lang);
        let sb = SqlBuilder::new(casing);
        let reducer_name = ident("LookupUserName", casing);

        // Seed a user row in both DBs so the lookup has something to find
        let seed_users = sb.insert_values(
            "users",
            &["id","name","age","active"],
            &["1","'Alice'","30","true"],
        );

        // Order matters: the seed scorer must run before the reducer scorer.
        v.push(make_sql_exec_both_scorer(
            host_url,
            file!(),
            route_tag,
            &seed_users,
            "seed_user_row",
            Duration::from_secs(10),
        ));

        // After calling the reducer, the projection should be present in results
        let select_result = sb.select_by_id(
            "results",
            &["id","name"],
            "id",
            1,
        );

        v.push(make_reducer_data_parity_scorer(host_url, ReducerDataParityConfig {
            src_file: file!(),
            route_tag,
            reducer: reducer_name.into(),
            args: vec![Value::from(1)],
            select_query: select_result.clone(),
            id_str: "index_lookup_projection_parity",
            collapse_ws: true,
            timeout: Duration::from_secs(10),
        }));

        v
    })
}
|
||||
@@ -0,0 +1,19 @@
|
||||
Write a SpacetimeDB backend module in C# that defines two tables and a reducer that looks up a row by primary-key index and writes a projection to another table.
|
||||
|
||||
TABLES
|
||||
- users
|
||||
- Struct: User
|
||||
- Fields:
|
||||
- Id: int (primary key)
|
||||
- Name: string
|
||||
- Age: int
|
||||
- Active: bool
|
||||
|
||||
- results
|
||||
- Struct: Result
|
||||
- Fields:
|
||||
- Id: int (primary key)
|
||||
- Name: string
|
||||
|
||||
REDUCERS
|
||||
- LookupUserName: given id:int, find the users row with that Id using the index and insert (Id, Name) into results.
|
||||
@@ -0,0 +1,19 @@
|
||||
Write a SpacetimeDB backend module in Rust that defines two tables and a reducer that looks up a row by primary-key index and writes a projection to another table.
|
||||
|
||||
TABLES
|
||||
- users
|
||||
- Struct: User
|
||||
- Fields:
|
||||
- id: i32 (primary key)
|
||||
- name: String
|
||||
- age: i32
|
||||
- active: bool
|
||||
|
||||
- results
|
||||
- Struct: Result
|
||||
- Fields:
|
||||
- id: i32 (primary key)
|
||||
- name: String
|
||||
|
||||
REDUCERS
|
||||
- lookup_user_name: given id:i32, find the users row with that id using the index and insert (id, name) into results.
|
||||
@@ -0,0 +1,20 @@
|
||||
using SpacetimeDB;
|
||||
|
||||
public static partial class Module
{
    /// <summary>Golden answer: one "users" table seeded by an Init reducer.</summary>
    [Table(Name = "users")]
    public partial struct User
    {
        [PrimaryKey] public int Id;
        public string Name;
        public int Age;
        public bool Active;
    }

    /// <summary>Runs once at database initialization; seeds exactly the two rows the benchmark checks for.</summary>
    [Reducer(ReducerKind.Init)]
    public static void Init(ReducerContext ctx)
    {
        ctx.Db.users.Insert(new User { Id = 1, Name = "Alice", Age = 30, Active = true });
        ctx.Db.users.Insert(new User { Id = 2, Name = "Bob", Age = 22, Active = false });
    }
}
|
||||
@@ -0,0 +1,16 @@
|
||||
use spacetimedb::{reducer, table, ReducerContext, Table};
|
||||
|
||||
/// Golden answer: one `users` table seeded by an init reducer.
#[table(name = users)]
pub struct User {
    #[primary_key]
    pub id: i32,
    pub name: String,
    pub age: i32,
    pub active: bool,
}

/// Runs once at database initialization; seeds exactly the two rows the
/// benchmark checks for.
#[reducer(init)]
pub fn init(ctx: &ReducerContext) {
    ctx.db.users().insert(User { id: 1, name: "Alice".into(), age: 30, active: true });
    ctx.db.users().insert(User { id: 2, name: "Bob".into(), age: 22, active: false });
}
|
||||
@@ -0,0 +1,25 @@
|
||||
use crate::eval::defaults::{default_schema_parity_scorers, make_sql_count_only_scorer};
|
||||
use crate::eval::{casing_for_lang, BenchmarkSpec, SqlBuilder};
|
||||
use std::time::Duration;
|
||||
|
||||
pub fn spec() -> BenchmarkSpec {
|
||||
BenchmarkSpec::from_tasks_auto(file!(), |lang, route_tag, host_url| {
|
||||
let mut v = default_schema_parity_scorers(host_url, file!(), route_tag);
|
||||
|
||||
let sb = SqlBuilder::new(casing_for_lang(lang));
|
||||
let id = sb.cols(&["id"])[0].clone();
|
||||
let name = sb.cols(&["name"])[0].clone();
|
||||
let age = sb.cols(&["age"])[0].clone();
|
||||
let act = sb.cols(&["active"])[0].clone();
|
||||
|
||||
let q_alice = format!("SELECT COUNT(*) AS n FROM users WHERE {id}=1 AND {name}='Alice' AND {age}=30 AND {act}=true");
|
||||
let q_bob = format!("SELECT COUNT(*) AS n FROM users WHERE {id}=2 AND {name}='Bob' AND {age}=22 AND {act}=false");
|
||||
let q_total = "SELECT COUNT(*) AS n FROM users";
|
||||
|
||||
v.push(make_sql_count_only_scorer(host_url, file!(), route_tag, q_alice, 1, "init_seed_alice", Duration::from_secs(10)));
|
||||
v.push(make_sql_count_only_scorer(host_url, file!(), route_tag, q_bob, 1, "init_seed_bob", Duration::from_secs(10)));
|
||||
v.push(make_sql_count_only_scorer(host_url, file!(), route_tag, q_total, 2, "init_total_two", Duration::from_secs(10)));
|
||||
|
||||
v
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
Write a SpacetimeDB backend module in C# that defines one table and an Init reducer that seeds rows on database initialization.
|
||||
|
||||
TABLE
|
||||
- users
|
||||
- Struct: User
|
||||
- Fields:
|
||||
- Id: int (primary key)
|
||||
- Name: string
|
||||
- Age: int
|
||||
- Active: bool
|
||||
|
||||
REDUCERS
|
||||
- Init: insert exactly these rows on initialization
|
||||
- (Id=1, Name="Alice", Age=30, Active=true)
|
||||
- (Id=2, Name="Bob", Age=22, Active=false)
|
||||
@@ -0,0 +1,15 @@
|
||||
Write a SpacetimeDB backend module in Rust that defines one table and an init reducer that seeds rows on database initialization.
|
||||
|
||||
TABLE
|
||||
- users
|
||||
- Struct: User
|
||||
- Fields:
|
||||
- id: i32 (primary key)
|
||||
- name: String
|
||||
- age: i32
|
||||
- active: bool
|
||||
|
||||
REDUCERS
|
||||
- init: insert exactly these rows on initialization
|
||||
- (id=1, name="Alice", age=30, active=true)
|
||||
- (id=2, name="Bob", age=22, active=false)
|
||||
@@ -0,0 +1,23 @@
|
||||
using SpacetimeDB;
|
||||
|
||||
public static partial class Module
{
    /// <summary>Event log: one row per client connect/disconnect.</summary>
    [Table(Name = "events")]
    public partial struct Event
    {
        // Id is assigned by auto-increment on insert.
        [PrimaryKey, AutoInc] public int Id;
        public string Kind;
    }

    /// <summary>Records a "connected" event whenever a client connects.</summary>
    [Reducer(ReducerKind.ClientConnected)]
    public static void ClientConnected(ReducerContext ctx)
    {
        // Id is left at default (0); AutoInc assigns the real value.
        ctx.Db.events.Insert(new Event { Kind = "connected" });
    }

    /// <summary>Records a "disconnected" event whenever a client disconnects.</summary>
    [Reducer(ReducerKind.ClientDisconnected)]
    public static void ClientDisconnected(ReducerContext ctx)
    {
        ctx.Db.events.Insert(new Event { Kind = "disconnected" });
    }
}
|
||||
@@ -0,0 +1,19 @@
|
||||
use spacetimedb::{reducer, table, ReducerContext, Table};
|
||||
|
||||
#[table(name = events)]
|
||||
pub struct Event {
|
||||
#[primary_key]
|
||||
#[auto_inc]
|
||||
pub id: u64,
|
||||
pub kind: String,
|
||||
}
|
||||
|
||||
#[reducer(client_connected)]
|
||||
pub fn client_connected(ctx: &ReducerContext) {
|
||||
ctx.db.events().insert(Event { id: 0, kind: "connected".into() });
|
||||
}
|
||||
|
||||
#[reducer(client_disconnected)]
|
||||
pub fn client_disconnected(ctx: &ReducerContext) {
|
||||
ctx.db.events().insert(Event { id: 0, kind: "disconnected".into() });
|
||||
}
|
||||
@@ -0,0 +1,8 @@
|
||||
use crate::eval::defaults::default_schema_parity_scorers;
|
||||
use crate::eval::BenchmarkSpec;
|
||||
|
||||
pub fn spec() -> BenchmarkSpec {
|
||||
BenchmarkSpec::from_tasks_auto(file!(), |_lang, route_tag, host_url| {
|
||||
default_schema_parity_scorers(host_url, file!(), route_tag)
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
Write a SpacetimeDB backend module in C# that defines one table and two reducers for client lifecycle events.
|
||||
|
||||
TABLE
|
||||
- events
|
||||
- Struct: Event
|
||||
- Fields:
|
||||
- Id: int (primary key, auto-increment)
|
||||
- Kind: string
|
||||
|
||||
REDUCERS
|
||||
- ClientConnected: when a client connects, insert exactly one row into events with Kind="connected"
|
||||
- ClientDisconnected: when a client disconnects, insert exactly one row into events with Kind="disconnected"
|
||||
@@ -0,0 +1,12 @@
|
||||
Write a SpacetimeDB backend module in Rust that defines one table and two reducers for client lifecycle events.
|
||||
|
||||
TABLE
|
||||
- events
|
||||
- Struct: Event
|
||||
- Fields:
|
||||
- id: i32 (primary key, auto-increment)
|
||||
- kind: String
|
||||
|
||||
REDUCERS
|
||||
- client_connected: when a client connects, insert exactly one row into events with kind="connected"
|
||||
- client_disconnected: when a client disconnects, insert exactly one row into events with kind="disconnected"
|
||||
+19
@@ -0,0 +1,19 @@
|
||||
using SpacetimeDB;
|
||||
|
||||
public static partial class Module
{
    /// <summary>Results table: one row per ComputeSum call, keyed by the caller-supplied Id.</summary>
    [Table(Name = "results")]
    public partial struct Result
    {
        [PrimaryKey] public int Id;
        public int Sum;
    }

    // Plain (non-reducer) helper exercised by the ComputeSum reducer.
    static int Add(int a, int b) => a + b;

    /// <summary>Inserts (Id=id, Sum=a+b) into results via the Add helper.</summary>
    [Reducer]
    public static void ComputeSum(ReducerContext ctx, int id, int a, int b)
    {
        ctx.Db.results.Insert(new Result { Id = id, Sum = Add(a, b) });
    }
}
|
||||
@@ -0,0 +1,15 @@
|
||||
use spacetimedb::{reducer, table, ReducerContext, Table};
|
||||
|
||||
/// Results table: one row per `compute_sum` call, keyed by the
/// caller-supplied id.
#[table(name = results)]
pub struct ResultRow {
    #[primary_key]
    pub id: i32,
    pub sum: i32,
}

// Plain (non-reducer) helper exercised by the `compute_sum` reducer.
fn add(a: i32, b: i32) -> i32 { a + b }

/// Inserts (id, sum = a + b) into `results` via the `add` helper.
#[reducer]
pub fn compute_sum(ctx: &ReducerContext, id: i32, a: i32, b: i32) {
    ctx.db.results().insert(ResultRow { id, sum: add(a, b) });
}
|
||||
@@ -0,0 +1,48 @@
|
||||
use crate::eval::defaults::{default_schema_parity_scorers, make_reducer_data_parity_scorer, make_sql_count_only_scorer};
|
||||
use crate::eval::{casing_for_lang, ident, BenchmarkSpec, ReducerDataParityConfig, SqlBuilder};
|
||||
use serde_json::Value;
|
||||
use std::time::Duration;
|
||||
|
||||
|
||||
pub fn spec() -> BenchmarkSpec {
|
||||
BenchmarkSpec::from_tasks_auto(file!(), |lang, route_tag, host_url| {
|
||||
let mut v = default_schema_parity_scorers(host_url, file!(), route_tag);
|
||||
|
||||
let casing = casing_for_lang(lang);
|
||||
let sb = SqlBuilder::new(casing);
|
||||
let reducer = ident("ComputeSum", casing);
|
||||
let select = sb.select_by_id("results", &["id","sum"], "id", 1);
|
||||
|
||||
v.push(make_reducer_data_parity_scorer(host_url, ReducerDataParityConfig {
|
||||
src_file: file!(),
|
||||
route_tag,
|
||||
reducer: reducer.into(),
|
||||
args: vec![
|
||||
Value::from(1),
|
||||
Value::from(2),
|
||||
Value::from(3),
|
||||
],
|
||||
select_query: select.clone(),
|
||||
id_str: "helper_func_sum_parity",
|
||||
collapse_ws: true,
|
||||
timeout: Duration::from_secs(10),
|
||||
}));
|
||||
|
||||
|
||||
let id = sb.cols(&["id"])[0].clone();
|
||||
let sum = sb.cols(&["sum"])[0].clone();
|
||||
let q = format!("SELECT COUNT(*) AS n FROM results WHERE {id}=1 AND {sum}=5");
|
||||
|
||||
v.push(make_sql_count_only_scorer(
|
||||
host_url,
|
||||
file!(),
|
||||
route_tag,
|
||||
q,
|
||||
1,
|
||||
"helper_func_sum_abs",
|
||||
Duration::from_secs(10),
|
||||
));
|
||||
|
||||
v
|
||||
})
|
||||
}
|
||||
+15
@@ -0,0 +1,15 @@
|
||||
Write a SpacetimeDB backend module in C# that defines a table, a non-reducer helper function, and a reducer that uses the helper.
|
||||
|
||||
TABLE
|
||||
- results
|
||||
- Struct: Result
|
||||
- Fields:
|
||||
- Id: int (primary key)
|
||||
- Sum: int
|
||||
|
||||
HELPERS
|
||||
- Add: given a:int and b:int, returns int
|
||||
|
||||
REDUCERS
|
||||
- ComputeSum: given id:int, a:int, b:int, insert exactly this row into results
|
||||
- (Id=id, Sum=Add(a, b))
|
||||
@@ -0,0 +1,15 @@
|
||||
Write a SpacetimeDB backend module in Rust that defines a table, a non-reducer helper function, and a reducer that uses the helper.
|
||||
|
||||
TABLE
|
||||
- results
|
||||
- Struct: Result
|
||||
- Fields:
|
||||
- id: i32 (primary key)
|
||||
- sum: i32
|
||||
|
||||
HELPERS
|
||||
- add: given a:i32 and b:i32, returns i32
|
||||
|
||||
REDUCERS
|
||||
- compute_sum: given id:i32, a:i32, b:i32, insert exactly this row into results
|
||||
- (id=id, sum=add(a, b))
|
||||
+24
@@ -0,0 +1,24 @@
|
||||
using SpacetimeDB;
|
||||
|
||||
public static partial class Module
{
    /// <summary>Product type stored as a single column value.</summary>
    [Type]
    public partial struct Score
    {
        public int Left;
        public int Right;
    }

    /// <summary>Results table with a product-typed Value column.</summary>
    [Table(Name = "results")]
    public partial struct Result
    {
        [PrimaryKey] public int Id;
        public Score Value;
    }

    /// <summary>Inserts (Id=id, Value=Score{Left=left, Right=right}) into results.</summary>
    [Reducer]
    public static void SetScore(ReducerContext ctx, int id, int left, int right)
    {
        ctx.Db.results.Insert(new Result { Id = id, Value = new Score { Left = left, Right = right } });
    }
}
|
||||
+19
@@ -0,0 +1,19 @@
|
||||
use spacetimedb::{reducer, table, ReducerContext, SpacetimeType, Table};
|
||||
|
||||
/// Product type stored as a single column value.
#[derive(SpacetimeType, Clone, Debug)]
pub struct Score {
    pub left: i32,
    pub right: i32,
}

/// Results table with a product-typed `value` column.
#[table(name = results)]
pub struct ResultRow {
    #[primary_key]
    pub id: i32,
    pub value: Score,
}

/// Inserts (id, value = Score { left, right }) into `results`.
#[reducer]
pub fn set_score(ctx: &ReducerContext, id: i32, left: i32, right: i32) {
    ctx.db.results().insert(ResultRow { id, value: Score { left, right } });
}
|
||||
@@ -0,0 +1,46 @@
|
||||
use crate::eval::defaults::{default_schema_parity_scorers, make_reducer_data_parity_scorer, make_sql_count_only_scorer};
|
||||
use crate::eval::{casing_for_lang, ident, BenchmarkSpec, ReducerDataParityConfig, SqlBuilder};
|
||||
use serde_json::Value;
|
||||
use std::time::Duration;
|
||||
|
||||
pub fn spec() -> BenchmarkSpec {
|
||||
BenchmarkSpec::from_tasks_auto(file!(), |lang, route_tag, host_url| {
|
||||
let mut v = default_schema_parity_scorers(host_url, file!(), route_tag);
|
||||
let casing = casing_for_lang(lang);
|
||||
let sb = SqlBuilder::new(casing_for_lang(lang));
|
||||
|
||||
let reducer = ident("SetScore", casing);
|
||||
|
||||
// Compare the full row (including the product-typed column) across golden/llm
|
||||
let select = sb.select_by_id("results", &["id","value"], "id", 1);
|
||||
|
||||
v.push(make_reducer_data_parity_scorer(host_url, ReducerDataParityConfig {
|
||||
src_file: file!(),
|
||||
route_tag,
|
||||
reducer: reducer.into(),
|
||||
args: vec![
|
||||
Value::from(1),
|
||||
Value::from(2),
|
||||
Value::from(3),
|
||||
],
|
||||
select_query: select.clone(),
|
||||
id_str: "product_type_row_parity",
|
||||
collapse_ws: true,
|
||||
timeout: Duration::from_secs(10),
|
||||
}));
|
||||
|
||||
// Absolute sanity: exactly one row with id=1 exists
|
||||
let count = sb.count_by_id("results", "id", 1);
|
||||
v.push(make_sql_count_only_scorer(
|
||||
host_url,
|
||||
file!(),
|
||||
route_tag,
|
||||
&count,
|
||||
1,
|
||||
"product_type_row_count",
|
||||
Duration::from_secs(10),
|
||||
));
|
||||
|
||||
v
|
||||
})
|
||||
}
|
||||
+18
@@ -0,0 +1,18 @@
|
||||
Write a SpacetimeDB backend module in C# that defines a product type and uses it in a table.
|
||||
|
||||
TYPES
|
||||
- Struct: Score
|
||||
- Fields:
|
||||
- Left: int
|
||||
- Right: int
|
||||
|
||||
TABLE
|
||||
- results
|
||||
- Struct: Result
|
||||
- Fields:
|
||||
- Id: int (primary key)
|
||||
- Value: Score
|
||||
|
||||
REDUCERS
|
||||
- SetScore: given id:int, left:int, right:int, insert exactly this row into results
|
||||
- (Id=id, Value=Score{Left=left, Right=right})
|
||||
+18
@@ -0,0 +1,18 @@
|
||||
Write a SpacetimeDB backend module in Rust that defines a product type and uses it in a table.
|
||||
|
||||
TYPES
|
||||
- Struct: Score
|
||||
- Fields:
|
||||
- left: i32
|
||||
- right: i32
|
||||
|
||||
TABLE
|
||||
- results
|
||||
- Struct: Result
|
||||
- Fields:
|
||||
- id: i32 (primary key)
|
||||
- value: Score
|
||||
|
||||
REDUCERS
|
||||
- set_score: given id:i32, left:i32, right:i32, insert exactly this row into results
|
||||
- (id=id, value=Score{left:left, right:right})
|
||||
+26
@@ -0,0 +1,26 @@
|
||||
using SpacetimeDB;
|
||||
|
||||
public static partial class Module
{
    [Type]
    public partial struct Circle { public int Radius; }

    [Type]
    public partial struct Rectangle { public int Width; public int Height; }

    /// <summary>Sum type: a Shape is either a Circle or a Rectangle.</summary>
    [Type]
    public partial record Shape : TaggedEnum<(Circle Circle, Rectangle Rectangle)> {}

    /// <summary>Results table with a sum-typed Value column.</summary>
    [Table(Name = "results")]
    public partial struct Result
    {
        [PrimaryKey] public int Id;
        public Shape Value;
    }

    /// <summary>Inserts (Id=id, Value=Circle{Radius=radius}) into results.</summary>
    [Reducer]
    public static void SetCircle(ReducerContext ctx, int id, int radius)
    {
        ctx.Db.results.Insert(new Result { Id = id, Value = new Shape.Circle(new Circle { Radius = radius }) });
    }
}
|
||||
+25
@@ -0,0 +1,25 @@
|
||||
use spacetimedb::{reducer, table, ReducerContext, SpacetimeType, Table};
|
||||
|
||||
/// Payload for the `Shape::Rectangle` variant.
#[derive(SpacetimeType, Clone, Debug)]
pub struct Rect {
    pub width: i32,
    pub height: i32,
}

/// Sum type: a shape is either a circle (by radius) or a rectangle.
#[derive(SpacetimeType, Clone, Debug)]
pub enum Shape {
    Circle(i32),
    Rectangle(Rect),
}

/// Results table with a sum-typed `value` column.
#[table(name = results)]
pub struct ResultRow {
    #[primary_key]
    pub id: i32,
    pub value: Shape,
}

/// Inserts (id, value = Shape::Circle(radius)) into `results`.
#[reducer]
pub fn set_circle(ctx: &ReducerContext, id: i32, radius: i32) {
    ctx.db.results().insert(ResultRow { id, value: Shape::Circle(radius) });
}
|
||||
@@ -0,0 +1,42 @@
|
||||
use crate::eval::defaults::{default_schema_parity_scorers, make_reducer_data_parity_scorer, make_sql_count_only_scorer};
|
||||
use crate::eval::{casing_for_lang, ident, BenchmarkSpec, ReducerDataParityConfig, SqlBuilder};
|
||||
use serde_json::Value;
|
||||
use std::time::Duration;
|
||||
|
||||
pub fn spec() -> BenchmarkSpec {
|
||||
BenchmarkSpec::from_tasks_auto(file!(), |lang, route_tag, host_url| {
|
||||
let mut v = default_schema_parity_scorers(host_url, file!(), route_tag);
|
||||
let casing = casing_for_lang(lang);
|
||||
let sb = SqlBuilder::new(casing_for_lang(lang));
|
||||
let reducer = ident("SetCircle", casing);
|
||||
|
||||
let select = sb.select_by_id("results", &["id","value"], "id", 1);
|
||||
v.push(make_reducer_data_parity_scorer(host_url, ReducerDataParityConfig {
|
||||
src_file: file!(),
|
||||
route_tag,
|
||||
reducer: reducer.into(),
|
||||
args: vec![
|
||||
Value::from(1),
|
||||
Value::from(10),
|
||||
],
|
||||
select_query: select.clone(),
|
||||
id_str: "sum_type_row_parity",
|
||||
collapse_ws: true,
|
||||
timeout: Duration::from_secs(10),
|
||||
}));
|
||||
|
||||
|
||||
let count = sb.count_by_id("results", "id", 1);
|
||||
v.push(make_sql_count_only_scorer(
|
||||
host_url,
|
||||
file!(),
|
||||
route_tag,
|
||||
&count,
|
||||
1,
|
||||
"sum_type_row_count",
|
||||
Duration::from_secs(10),
|
||||
));
|
||||
|
||||
v
|
||||
})
|
||||
}
|
||||
+22
@@ -0,0 +1,22 @@
|
||||
Write a SpacetimeDB backend module in C# that defines a new sum type and uses it in a table.
|
||||
|
||||
TYPES
|
||||
- Struct: Circle
|
||||
- Fields:
|
||||
- Radius: int
|
||||
- Struct: Rectangle
|
||||
- Fields:
|
||||
- Width: int
|
||||
- Height: int
|
||||
- Sum: Shape = Circle | Rectangle
|
||||
|
||||
TABLE
|
||||
- results
|
||||
- Struct: Result
|
||||
- Fields:
|
||||
- Id: int (primary key)
|
||||
- Value: Shape
|
||||
|
||||
REDUCERS
|
||||
- SetCircle: given id:int and radius:int, insert exactly one row into results
|
||||
- (Id=id, Value=Circle{Radius=radius})
|
||||
+22
@@ -0,0 +1,22 @@
|
||||
Write a SpacetimeDB backend module in Rust that defines a new sum type and uses it in a table.
|
||||
|
||||
TYPES
|
||||
- Struct: Rect
|
||||
- Fields:
|
||||
- width: i32
|
||||
- height: i32
|
||||
- Enum: Shape
|
||||
- Variants:
|
||||
- Circle(i32)
|
||||
- Rectangle(Rect)
|
||||
|
||||
TABLE
|
||||
- results
|
||||
- Struct: Result
|
||||
- Fields:
|
||||
- id: i32 (primary key)
|
||||
- value: Shape
|
||||
|
||||
REDUCERS
|
||||
- set_circle: given id:i32 and radius:i32, insert exactly one row into results
|
||||
- (id=id, value=Shape::Circle(radius))
|
||||
+30
@@ -0,0 +1,30 @@
|
||||
using SpacetimeDB;
|
||||
|
||||
public static partial class Module
{
    /// <summary>Covers the elementary column types (int/long/float/double/bool/string).</summary>
    [Table(Name = "primitives")]
    public partial struct Primitive
    {
        [PrimaryKey] public int Id;
        public int Count;
        public long Total;
        public float Price;
        public double Ratio;
        public bool Active;
        public string Name;
    }

    /// <summary>Inserts the single fixed row the benchmark checks against.</summary>
    [Reducer]
    public static void Seed(ReducerContext ctx)
    {
        ctx.Db.primitives.Insert(new Primitive {
            Id = 1,
            Count = 2,
            Total = 3000000000, // exceeds int range on purpose; requires long
            Price = 1.5f,
            Ratio = 2.25,
            Active = true,
            Name = "Alice"
        });
    }
}
|
||||
+26
@@ -0,0 +1,26 @@
|
||||
use spacetimedb::{reducer, table, ReducerContext, Table};
|
||||
|
||||
/// Covers the elementary column types (i32/i64/f32/f64/bool/String).
#[table(name = primitives)]
pub struct Primitive {
    #[primary_key]
    pub id: i32,
    pub count: i32,
    pub total: i64,
    pub price: f32,
    pub ratio: f64,
    pub active: bool,
    pub name: String,
}

/// Inserts the single fixed row the benchmark checks against.
#[reducer]
pub fn seed(ctx: &ReducerContext) {
    ctx.db.primitives().insert(Primitive {
        id: 1,
        count: 2,
        total: 3_000_000_000, // exceeds i32 range on purpose; requires i64
        price: 1.5,
        ratio: 2.25,
        active: true,
        name: "Alice".into(),
    });
}
|
||||
@@ -0,0 +1,44 @@
|
||||
use crate::eval::defaults::{default_schema_parity_scorers, make_reducer_data_parity_scorer, make_sql_count_only_scorer};
|
||||
use crate::eval::{casing_for_lang, ident, BenchmarkSpec, ReducerDataParityConfig, SqlBuilder};
|
||||
use std::time::Duration;
|
||||
|
||||
|
||||
pub fn spec() -> BenchmarkSpec {
|
||||
BenchmarkSpec::from_tasks_auto(file!(), |lang, route_tag, host_url| {
|
||||
let mut v = default_schema_parity_scorers(host_url, file!(), route_tag);
|
||||
let casing = casing_for_lang(lang);
|
||||
let sb = SqlBuilder::new(casing);
|
||||
let reducer = ident("Seed", casing);
|
||||
|
||||
let select = sb.select_by_id(
|
||||
"primitives",
|
||||
&["id","count","total","price","ratio","active","name"],
|
||||
"id",
|
||||
1
|
||||
);
|
||||
|
||||
v.push(make_reducer_data_parity_scorer(host_url, ReducerDataParityConfig {
|
||||
src_file: file!(),
|
||||
route_tag,
|
||||
reducer: reducer.into(),
|
||||
args: vec![], // no args
|
||||
select_query: select.clone(),
|
||||
id_str: "elementary_columns_row_parity",
|
||||
collapse_ws: true,
|
||||
timeout: Duration::from_secs(10),
|
||||
}));
|
||||
|
||||
let count = sb.count_by_id("primitives", "id", 1);
|
||||
v.push(make_sql_count_only_scorer(
|
||||
host_url,
|
||||
file!(),
|
||||
route_tag,
|
||||
&count,
|
||||
1,
|
||||
"elementary_columns_row_count",
|
||||
Duration::from_secs(10),
|
||||
));
|
||||
|
||||
v
|
||||
})
|
||||
}
|
||||
+17
@@ -0,0 +1,17 @@
|
||||
Write a SpacetimeDB backend module in C# that defines one table and seeds one row.
|
||||
|
||||
TABLE
|
||||
- primitives
|
||||
- Struct: Primitive
|
||||
- Fields:
|
||||
- Id: int (primary key)
|
||||
- Count: int
|
||||
- Total: long
|
||||
- Price: float
|
||||
- Ratio: double
|
||||
- Active: bool
|
||||
- Name: string
|
||||
|
||||
REDUCERS
|
||||
- Seed: insert exactly this row into primitives
|
||||
- (Id=1, Count=2, Total=3000000000, Price=1.5, Ratio=2.25, Active=true, Name="Alice")
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user