mirror of
https://github.com/clockworklabs/SpacetimeDB.git
synced 2026-05-06 07:26:43 -04:00
b75bf6decf
# Description of Changes Introduce a new **LLM benchmarking app** and supporting code. * **CLI:** `llm` with subcommands `run`, `routes list`, `diff`, `ci-check`. * **Runner:** executes globally numbered tasks; filters by `--lang`, `--categories`, `--tasks`, `--providers`, `--models`. * **Providers/clients:** route layer (`provider:model`) with HTTP LLM Vendor clients; env-driven keys/base URLs. * **Evaluation:** deterministic scorers (hash/equality, JSON shape/count, light schema/reducer parity) with clear failure messages. * **Results:** stable JSON schema; single-file HTML viewer to inspect/filter/export CSV. * **Build & guards:** build script for compile-time setup; * **Docs:** `DEVELOP.md` includes `cargo llm …` usage. This PR is the initial addition of the app and its modules (runner, config, routes, prompt/segmentation, scorers, schema/types, defaults/constants/paths/hashing/combine, publishers, spacetime guard, HTML stats viewer). ### How it works 1. **Pick what to run** * Choose tasks (`--tasks 0,7,12`), or a language (`--lang rust|csharp`), or categories (`--categories basics,schema`). * Optionally limit vendors/models (`--providers …`, `--models …`). 2. **Resolve routes** * Read env (API keys + base URLs) and build the active set (e.g., `openai:gpt-5`). 3. **Build context** * Start Spacetime * Publish golden answer modules * Prepare prompts and send to LLM model * Attempt to publish LLM module 4. **Execute calls** * Run the selected tasks within each test against selected models and languages. 5. **Score outputs** * Apply deterministic scorers (hash/equality, JSON shape/count, simple schema/reducer checks). * Record the score and any short failure reason. 6. **Update results file** * Write/update the single results JSON with task/route outcomes, timings, and summaries. # API and ABI breaking changes None. New application and modules; no existing public APIs/ABIs altered. 
# Expected complexity level and risk **4/5.** New CLI, routing, evaluation, and artifact format. * External model APIs may rate-limit/timeout; concurrency tunable via `LLM_BENCH_CONCURRENCY` / `LLM_BENCH_ROUTE_CONCURRENCY`. # Testing I ran the full test matrix and generated results for every task against every vendor, model, and language (rust + C#). I also tested the CI check locally using [act](https://github.com/nektos/act). **Please verify** * [ ] `llm run --tasks 0,1,2` (explicit `run`) * [ ] `llm run --lang rust --categories basics` (filters) * [ ] `llm run --categories basics,schema` (multiple categories) * [ ] `llm run --lang csharp` (language switch) * [ ] `llm run --providers openai,anthropic --models "openai:gpt-5 anthropic:claude-sonnet-4-5"` (provider/model limits) * [ ] `llm run --hash-only` (dry integrity) * [ ] `llm run --goldens-only` (test goldens only) * [ ] `llm run --force` (skip hash check) * [ ] `llm ci-check` * [ ] Stats viewer loads the JSON; filtering and CSV export work * [ ] CI works as intended --------- Signed-off-by: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Signed-off-by: Tyler Cloutier <cloutiertyler@users.noreply.github.com> Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Co-authored-by: Tyler Cloutier <cloutiertyler@aol.com> Co-authored-by: Tyler Cloutier <cloutiertyler@users.noreply.github.com> Co-authored-by: spacetimedb-bot <spacetimedb-bot@users.noreply.github.com> Co-authored-by: John Detter <4099508+jdetter@users.noreply.github.com>
199 lines
6.2 KiB
Rust
199 lines
6.2 KiB
Rust
#![allow(clippy::disallowed_macros)]
|
|
|
|
use std::{
|
|
env, fs, io,
|
|
path::{Component, Path, PathBuf},
|
|
process::Command,
|
|
};
|
|
|
|
fn main() {
|
|
// === Paths ===
|
|
let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
|
|
let benches_root = manifest_dir.join("src/benchmarks");
|
|
|
|
let gen_dir = manifest_dir.join("src/generated");
|
|
let registry_rs = gen_dir.join("registry.rs");
|
|
|
|
fs::create_dir_all(&gen_dir).unwrap();
|
|
|
|
// We'll gather generated module blocks + match arms
|
|
let mut mods_src = String::new();
|
|
let mut arms_src = String::new();
|
|
|
|
// Track whether we actually saw anything. If we saw nothing,
|
|
// that's almost always a wrong-path / didn't put build.rs in right crate problem.
|
|
let mut found_any = false;
|
|
|
|
// Walk: src/benchmarks/<category>/<task>/spec.rs
|
|
for cat_entry in read_dir_sorted(&benches_root) {
|
|
let cat_entry = cat_entry.unwrap();
|
|
let cat_path = cat_entry.path();
|
|
if !cat_path.is_dir() {
|
|
continue;
|
|
}
|
|
let category = file_name_string(&cat_path);
|
|
|
|
for task_entry in read_dir_sorted(&cat_path) {
|
|
let task_entry = task_entry.unwrap();
|
|
let task_path = task_entry.path();
|
|
if !task_path.is_dir() {
|
|
continue;
|
|
}
|
|
let task = file_name_string(&task_path);
|
|
|
|
let spec_path = task_path.join("spec.rs");
|
|
if !spec_path.is_file() {
|
|
continue;
|
|
}
|
|
|
|
found_any = true;
|
|
|
|
// ex: basics_t_005_update
|
|
let mod_ident = format_ident(&category, &task);
|
|
|
|
// registry.rs (we are generating) → ../../benchmarks/.../spec.rs (relative include path)
|
|
let rel_spec_path = relative_path(®istry_rs, &spec_path);
|
|
|
|
// inline submodule
|
|
mods_src.push_str(&format!(
|
|
"#[allow(dead_code)]\n#[allow(clippy::all)]\nmod {mod_ident} {{\n include!(\"{rel_spec_path}\");\n}}\n\n"
|
|
));
|
|
|
|
// map ("category","task") → that module's spec() fn
|
|
arms_src.push_str(&format!(" (\"{category}\", \"{task}\") => {mod_ident}::spec,\n"));
|
|
}
|
|
}
|
|
|
|
if !found_any {
|
|
// Fail fast instead of silently letting the stub compile.
|
|
panic!(
|
|
"build.rs: did not find any benchmark specs under {:?}.
|
|
This usually means one of two things:
|
|
1) The benchmarks actually live somewhere else (path mismatch).
|
|
2) build.rs is not in the same crate root as the code you're compiling, \
|
|
so Cargo is not running this script for that crate.",
|
|
benches_root
|
|
);
|
|
}
|
|
|
|
// Build final file string
|
|
let file_contents = format!(
|
|
"use crate::eval::BenchmarkSpec;
|
|
use anyhow::{{anyhow, Result}};
|
|
use std::path::Path;
|
|
|
|
{mods_src}pub fn resolve_by_path(task_root: &Path) -> Result<fn() -> BenchmarkSpec> {{
|
|
let task = task_root
|
|
.file_name()
|
|
.and_then(|s| s.to_str())
|
|
.ok_or_else(|| anyhow!(\"missing task name\"))?;
|
|
let category = task_root
|
|
.parent()
|
|
.and_then(|p| p.file_name().and_then(|s| s.to_str()))
|
|
.ok_or_else(|| anyhow!(\"missing category name\"))?;
|
|
|
|
let ctor = match (category, task) {{
|
|
{arms_src} _ => return Err(anyhow!(
|
|
\"no spec registered for {{}}/{{}} (need spec.rs)\",
|
|
category,
|
|
task
|
|
)),
|
|
}};
|
|
|
|
Ok(ctor)
|
|
}}
|
|
"
|
|
);
|
|
|
|
// Write unformatted first
|
|
fs::write(®istry_rs, file_contents).unwrap();
|
|
|
|
// Best-effort: format it so CI/rustfmt is happy
|
|
let _ = Command::new("rustup").args(["component", "add", "rustfmt"]).status();
|
|
|
|
let _ = Command::new("rustfmt")
|
|
.arg("--edition")
|
|
.arg("2021")
|
|
.arg(registry_rs.to_string_lossy().to_string())
|
|
.status();
|
|
}
|
|
|
|
/// Deterministic `read_dir` so generated output order is stable across
/// platforms and filesystems.
///
/// Sorts on the raw `OsString` file name rather than a lossy UTF-8
/// conversion: the previous key collapsed non-UTF-8 names (and `Err`
/// entries) to `""`, which could tie and leave their relative order up to
/// the filesystem. `Err` entries (`None` keys) still sort first.
///
/// Panics if `dir` cannot be read — appropriate for a build script, where
/// that is a fatal misconfiguration.
fn read_dir_sorted(dir: &Path) -> Vec<io::Result<fs::DirEntry>> {
    let mut entries: Vec<_> = fs::read_dir(dir)
        .unwrap_or_else(|e| panic!("build.rs: failed to read dir {dir:?}: {e}"))
        .collect();
    entries.sort_by_key(|res| res.as_ref().ok().map(|e| e.file_name()));
    entries
}
|
|
|
|
/// Get the final path segment as an owned `String`.
///
/// Panics if the path has no final segment or it is not valid UTF-8 —
/// benchmark directory names are expected to be plain ASCII identifiers.
fn file_name_string(p: &Path) -> String {
    let segment = p
        .file_name()
        .and_then(|name| name.to_str())
        .expect("utf8 dir name");
    segment.to_owned()
}
|
|
|
|
/// Turn ("basics","t_005_update") into "basics_t_005_update"
/// - lowercase
/// - non [a-z0-9_] → '_'
/// - if first char is digit, prefix '_'
fn format_ident(category: &str, task: &str) -> String {
    // Map every character to something legal in a Rust identifier.
    fn sanitize(s: &str) -> String {
        let mut clean = String::with_capacity(s.len());
        for c in s.chars() {
            let mapped = if c.is_ascii_alphanumeric() || c == '_' {
                c.to_ascii_lowercase()
            } else {
                '_'
            };
            clean.push(mapped);
        }
        clean
    }

    let mut ident = sanitize(category);
    ident.push('_');
    ident.push_str(&sanitize(task));

    // Identifiers may not start with a digit; pad with an underscore.
    if matches!(ident.chars().next(), Some(c) if c.is_ascii_digit()) {
        ident.insert(0, '_');
    }
    ident
}
|
|
|
|
/// Build a relative path string from `from` file to `to` file,
|
|
/// normalized to `/` for portability so `include!` is valid.
|
|
fn relative_path(from: &Path, to: &Path) -> String {
|
|
let base_dir = from.parent().expect("registry.rs must have a parent dir");
|
|
let rel = diff_paths(to, base_dir).unwrap_or_else(|| to.to_path_buf());
|
|
rel.to_string_lossy().replace('\\', "/")
|
|
}
|
|
|
|
/// Minimal diff_paths (no extra crate), mirroring the `pathdiff` crate's
/// behavior: the relative path from `base` to `path`, or `None` when no
/// sensible relative path exists.
///
/// Fix over the previous version: mixed absolute/relative inputs are now
/// handled explicitly. The old component-zip found no common prefix and
/// either emitted a bogus `../..` chain (relative `path`, absolute `base`)
/// or bailed with `None` (absolute `path`, relative `base`) — now an
/// absolute `path` is returned as-is and the reverse case is `None`.
fn diff_paths(path: &Path, base: &Path) -> Option<PathBuf> {
    // Absoluteness must match for a component-wise diff to mean anything.
    if path.is_absolute() != base.is_absolute() {
        return if path.is_absolute() {
            Some(path.to_path_buf())
        } else {
            None
        };
    }

    let path_comps: Vec<Component<'_>> = path.components().collect();
    let base_comps: Vec<Component<'_>> = base.components().collect();

    // find shared prefix — it contributes nothing to the diff
    let common_len = path_comps.iter().zip(&base_comps).take_while(|(a, b)| a == b).count();

    // walk back from base: one ".." per non-shared base component
    let mut out = PathBuf::new();
    for _ in base_comps.iter().skip(common_len) {
        out.push("..");
    }

    // then walk forward into path
    for comp in path_comps.iter().skip(common_len) {
        match comp {
            Component::Normal(os) => out.push(os),
            Component::CurDir => out.push("."),
            Component::ParentDir => out.push(".."),
            // A root or prefix past the shared part means the paths live
            // under different roots — no relative path exists.
            Component::RootDir | Component::Prefix(_) => return None,
        }
    }

    Some(out)
}
|