mirror of
https://github.com/clockworklabs/SpacetimeDB.git
synced 2026-05-06 07:26:43 -04:00
b75bf6decf
# Description of Changes Introduce a new **LLM benchmarking app** and supporting code. * **CLI:** `llm` with subcommands `run`, `routes list`, `diff`, `ci-check`. * **Runner:** executes globally numbered tasks; filters by `--lang`, `--categories`, `--tasks`, `--providers`, `--models`. * **Providers/clients:** route layer (`provider:model`) with HTTP LLM Vendor clients; env-driven keys/base URLs. * **Evaluation:** deterministic scorers (hash/equality, JSON shape/count, light schema/reducer parity) with clear failure messages. * **Results:** stable JSON schema; single-file HTML viewer to inspect/filter/export CSV. * **Build & guards:** build script for compile-time setup; * **Docs:** `DEVELOP.md` includes `cargo llm …` usage. This PR is the initial addition of the app and its modules (runner, config, routes, prompt/segmentation, scorers, schema/types, defaults/constants/paths/hashing/combine, publishers, spacetime guard, HTML stats viewer). ### How it works 1. **Pick what to run** * Choose tasks (`--tasks 0,7,12`), or a language (`--lang rust|csharp`), or categories (`--categories basics,schema`). * Optionally limit vendors/models (`--providers …`, `--models …`). 2. **Resolve routes** * Read env (API keys + base URLs) and build the active set (e.g., `openai:gpt-5`). 3. **Build context** * Start Spacetime * Publish golden answer modules * Prepare prompts and send to LLM model * Attempt to publish LLM module 4. **Execute calls** * Run the selected tasks within each test against selected models and languages. 5. **Score outputs** * Apply deterministic scorers (hash/equality, JSON shape/count, simple schema/reducer checks). * Record the score and any short failure reason. 6. **Update results file** * Write/update the single results JSON with task/route outcomes, timings, and summaries. # API and ABI breaking changes None. New application and modules; no existing public APIs/ABIs altered. 
# Expected complexity level and risk **4/5.** New CLI, routing, evaluation, and artifact format. * External model APIs may rate-limit/timeout; concurrency tunable via `LLM_BENCH_CONCURRENCY` / `LLM_BENCH_ROUTE_CONCURRENCY`. # Testing I ran the full test matrix and generated results for every task against every vendor, model, and language (rust + C#). I also tested the CI check locally using [act](https://github.com/nektos/act). **Please verify** * [ ] `llm run --tasks 0,1,2` (explicit `run`) * [ ] `llm run --lang rust --categories basics` (filters) * [ ] `llm run --categories basics,schema` (multiple categories) * [ ] `llm run --lang csharp` (language switch) * [ ] `llm run --providers openai,anthropic --models "openai:gpt-5 anthropic:claude-sonnet-4-5"` (provider/model limits) * [ ] `llm run --hash-only` (dry integrity) * [ ] `llm run --goldens-only` (test goldens only) * [ ] `llm run --force` (skip hash check) * [ ] `llm ci-check` * [ ] Stats viewer loads the JSON; filtering and CSV export work * [ ] CI works as intended --------- Signed-off-by: bradleyshep <148254416+bradleyshep@users.noreply.github.com> Signed-off-by: Tyler Cloutier <cloutiertyler@users.noreply.github.com> Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> Co-authored-by: Tyler Cloutier <cloutiertyler@aol.com> Co-authored-by: Tyler Cloutier <cloutiertyler@users.noreply.github.com> Co-authored-by: spacetimedb-bot <spacetimedb-bot@users.noreply.github.com> Co-authored-by: John Detter <4099508+jdetter@users.noreply.github.com>
1542 lines
48 KiB
JSON
{
|
|
"version": 1,
|
|
"generated_at": "2026-01-04T18:07:19.608Z",
|
|
"by_language": {
|
|
"rust": {
|
|
"modes": {
|
|
"rustdoc_json": {
|
|
"models": {
|
|
"Grok 4": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 13,
|
|
"pass_pct": 48.148148,
|
|
"task_pass_equiv": 6.0,
|
|
"task_pass_pct": 50.0
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 31,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 58,
|
|
"passed_tests": 13,
|
|
"pass_pct": 22.413794,
|
|
"task_pass_equiv": 6.0,
|
|
"task_pass_pct": 27.272728
|
|
}
|
|
},
|
|
"Grok 3 Mini (Beta)": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 8,
|
|
"pass_pct": 29.62963,
|
|
"task_pass_equiv": 4.0,
|
|
"task_pass_pct": 33.333336
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 8,
|
|
"pass_pct": 13.114754,
|
|
"task_pass_equiv": 4.0,
|
|
"task_pass_pct": 18.181818
|
|
}
|
|
},
|
|
"Gemini 2.5 Pro": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 15,
|
|
"pass_pct": 55.555557,
|
|
"task_pass_equiv": 6.0,
|
|
"task_pass_pct": 50.0
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 13,
|
|
"pass_pct": 38.235294,
|
|
"task_pass_equiv": 3.0,
|
|
"task_pass_pct": 30.000002
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 28,
|
|
"pass_pct": 45.901638,
|
|
"task_pass_equiv": 9.0,
|
|
"task_pass_pct": 40.909092
|
|
}
|
|
},
|
|
"Gemini 2.5 Flash": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 30,
|
|
"passed_tests": 7,
|
|
"pass_pct": 23.333334,
|
|
"task_pass_equiv": 2.0,
|
|
"task_pass_pct": 20.0
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 14,
|
|
"pass_pct": 51.851852,
|
|
"task_pass_equiv": 6.0,
|
|
"task_pass_pct": 50.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 57,
|
|
"passed_tests": 21,
|
|
"pass_pct": 36.842106,
|
|
"task_pass_equiv": 8.0,
|
|
"task_pass_pct": 36.363636
|
|
}
|
|
},
|
|
"DeepSeek V3": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 15,
|
|
"pass_pct": 44.117645,
|
|
"task_pass_equiv": 4.0,
|
|
"task_pass_pct": 40.0
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 12,
|
|
"pass_pct": 44.444443,
|
|
"task_pass_equiv": 5.0,
|
|
"task_pass_pct": 41.666664
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 27,
|
|
"pass_pct": 44.262295,
|
|
"task_pass_equiv": 9.0,
|
|
"task_pass_pct": 40.909092
|
|
}
|
|
},
|
|
"Claude 4.5 Sonnet": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 23,
|
|
"pass_pct": 67.64706,
|
|
"task_pass_equiv": 6.0,
|
|
"task_pass_pct": 60.000004
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 18,
|
|
"pass_pct": 66.666664,
|
|
"task_pass_equiv": 7.0,
|
|
"task_pass_pct": 58.333332
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 41,
|
|
"pass_pct": 67.21311,
|
|
"task_pass_equiv": 13.0,
|
|
"task_pass_pct": 59.090908
|
|
}
|
|
},
|
|
"Meta Llama 3.1 405B": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 4,
|
|
"pass_pct": 14.814815,
|
|
"task_pass_equiv": 2.0,
|
|
"task_pass_pct": 16.666668
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 4,
|
|
"pass_pct": 6.557377,
|
|
"task_pass_equiv": 2.0,
|
|
"task_pass_pct": 9.090909
|
|
}
|
|
},
|
|
"DeepSeek R1": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 7,
|
|
"pass_pct": 20.588236,
|
|
"task_pass_equiv": 2.0,
|
|
"task_pass_pct": 20.0
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 14,
|
|
"pass_pct": 51.851852,
|
|
"task_pass_equiv": 6.0,
|
|
"task_pass_pct": 50.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 21,
|
|
"pass_pct": 34.42623,
|
|
"task_pass_equiv": 8.0,
|
|
"task_pass_pct": 36.363636
|
|
}
|
|
},
|
|
"o4-mini": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 21,
|
|
"pass_pct": 61.764706,
|
|
"task_pass_equiv": 6.0,
|
|
"task_pass_pct": 60.000004
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 16,
|
|
"pass_pct": 59.25926,
|
|
"task_pass_equiv": 8.0,
|
|
"task_pass_pct": 66.66667
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 37,
|
|
"pass_pct": 60.65574,
|
|
"task_pass_equiv": 14.0,
|
|
"task_pass_pct": 63.636364
|
|
}
|
|
},
|
|
"Claude 4 Sonnet": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 20,
|
|
"pass_pct": 74.07407,
|
|
"task_pass_equiv": 9.0,
|
|
"task_pass_pct": 75.0
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 18,
|
|
"pass_pct": 52.941177,
|
|
"task_pass_equiv": 5.0,
|
|
"task_pass_pct": 50.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 38,
|
|
"pass_pct": 62.295082,
|
|
"task_pass_equiv": 14.0,
|
|
"task_pass_pct": 63.636364
|
|
}
|
|
},
|
|
"Claude 4.5 Haiku": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 19,
|
|
"pass_pct": 55.882355,
|
|
"task_pass_equiv": 4.8,
|
|
"task_pass_pct": 48.0
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 19,
|
|
"pass_pct": 70.37037,
|
|
"task_pass_equiv": 8.0,
|
|
"task_pass_pct": 66.66667
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 38,
|
|
"pass_pct": 62.295082,
|
|
"task_pass_equiv": 12.8,
|
|
"task_pass_pct": 58.181816
|
|
}
|
|
},
|
|
"GPT-5": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 11,
|
|
"pass_pct": 32.35294,
|
|
"task_pass_equiv": 2.35,
|
|
"task_pass_pct": 23.499998
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 5,
|
|
"pass_pct": 18.518518,
|
|
"task_pass_equiv": 1.3333334,
|
|
"task_pass_pct": 11.111112
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 16,
|
|
"pass_pct": 26.229507,
|
|
"task_pass_equiv": 3.6833334,
|
|
"task_pass_pct": 16.742424
|
|
}
|
|
},
|
|
"GPT-4o": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 19,
|
|
"pass_pct": 70.37037,
|
|
"task_pass_equiv": 8.0,
|
|
"task_pass_pct": 66.66667
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 15,
|
|
"pass_pct": 44.117645,
|
|
"task_pass_equiv": 4.0,
|
|
"task_pass_pct": 40.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 34,
|
|
"pass_pct": 55.737705,
|
|
"task_pass_equiv": 12.0,
|
|
"task_pass_pct": 54.545456
|
|
}
|
|
},
|
|
"GPT-4.1": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 17,
|
|
"pass_pct": 62.962963,
|
|
"task_pass_equiv": 7.0,
|
|
"task_pass_pct": 58.333332
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 5,
|
|
"pass_pct": 14.705882,
|
|
"task_pass_equiv": 1.4,
|
|
"task_pass_pct": 14.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 22,
|
|
"pass_pct": 36.065575,
|
|
"task_pass_equiv": 8.4,
|
|
"task_pass_pct": 38.181816
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"docs": {
|
|
"models": {
|
|
"GPT-4.1": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 4,
|
|
"pass_pct": 14.814815,
|
|
"task_pass_equiv": 4.0,
|
|
"task_pass_pct": 33.333336
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 4,
|
|
"pass_pct": 6.557377,
|
|
"task_pass_equiv": 4.0,
|
|
"task_pass_pct": 18.181818
|
|
}
|
|
},
|
|
"Grok 3 Mini (Beta)": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 17,
|
|
"pass_pct": 62.962963,
|
|
"task_pass_equiv": 7.0,
|
|
"task_pass_pct": 58.333332
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 17,
|
|
"pass_pct": 50.0,
|
|
"task_pass_equiv": 5.0,
|
|
"task_pass_pct": 50.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 34,
|
|
"pass_pct": 55.737705,
|
|
"task_pass_equiv": 12.0,
|
|
"task_pass_pct": 54.545456
|
|
}
|
|
},
|
|
"Claude 3.5 Sonnet": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 10,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 12,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 22,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"Claude 3.5 Haiku": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 11,
|
|
"total_tests": 18,
|
|
"passed_tests": 1,
|
|
"pass_pct": 5.5555553,
|
|
"task_pass_equiv": 1.0,
|
|
"task_pass_pct": 9.090909
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 11,
|
|
"total_tests": 18,
|
|
"passed_tests": 1,
|
|
"pass_pct": 5.5555553,
|
|
"task_pass_equiv": 1.0,
|
|
"task_pass_pct": 9.090909
|
|
}
|
|
},
|
|
"GPT-5": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 3,
|
|
"pass_pct": 8.823529,
|
|
"task_pass_equiv": 1.0,
|
|
"task_pass_pct": 10.0
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 13,
|
|
"pass_pct": 48.148148,
|
|
"task_pass_equiv": 6.0,
|
|
"task_pass_pct": 50.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 16,
|
|
"pass_pct": 26.229507,
|
|
"task_pass_equiv": 7.0,
|
|
"task_pass_pct": 31.818182
|
|
}
|
|
},
|
|
"Claude 3.7 Sonnet": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 10,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 16,
|
|
"passed_tests": 6,
|
|
"pass_pct": 37.5,
|
|
"task_pass_equiv": 2.0,
|
|
"task_pass_pct": 16.666668
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 26,
|
|
"passed_tests": 6,
|
|
"pass_pct": 23.076923,
|
|
"task_pass_equiv": 2.0,
|
|
"task_pass_pct": 9.090909
|
|
}
|
|
},
|
|
"DeepSeek R1": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 14,
|
|
"pass_pct": 51.851852,
|
|
"task_pass_equiv": 6.0,
|
|
"task_pass_pct": 50.0
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 14,
|
|
"pass_pct": 22.950819,
|
|
"task_pass_equiv": 6.0,
|
|
"task_pass_pct": 27.272728
|
|
}
|
|
},
|
|
"Claude 4.5 Sonnet": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 25,
|
|
"pass_pct": 92.59259,
|
|
"task_pass_equiv": 10.0,
|
|
"task_pass_pct": 83.33333
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 30,
|
|
"pass_pct": 88.23529,
|
|
"task_pass_equiv": 9.0,
|
|
"task_pass_pct": 90.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 55,
|
|
"pass_pct": 90.16393,
|
|
"task_pass_equiv": 19.0,
|
|
"task_pass_pct": 86.36364
|
|
}
|
|
},
|
|
"Grok 4": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 25,
|
|
"pass_pct": 92.59259,
|
|
"task_pass_equiv": 10.0,
|
|
"task_pass_pct": 83.33333
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 14,
|
|
"pass_pct": 41.17647,
|
|
"task_pass_equiv": 4.0,
|
|
"task_pass_pct": 40.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 39,
|
|
"pass_pct": 63.934425,
|
|
"task_pass_equiv": 14.0,
|
|
"task_pass_pct": 63.636364
|
|
}
|
|
},
|
|
"DeepSeek V3": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 12,
|
|
"pass_pct": 35.294117,
|
|
"task_pass_equiv": 3.0,
|
|
"task_pass_pct": 30.000002
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 19,
|
|
"pass_pct": 70.37037,
|
|
"task_pass_equiv": 8.0,
|
|
"task_pass_pct": 66.66667
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 31,
|
|
"pass_pct": 50.81967,
|
|
"task_pass_equiv": 11.0,
|
|
"task_pass_pct": 50.0
|
|
}
|
|
},
|
|
"Gemini 2.5 Flash": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 13,
|
|
"pass_pct": 48.148148,
|
|
"task_pass_equiv": 5.0,
|
|
"task_pass_pct": 41.666664
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 10,
|
|
"pass_pct": 29.411764,
|
|
"task_pass_equiv": 2.6666667,
|
|
"task_pass_pct": 26.666668
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 23,
|
|
"pass_pct": 37.704918,
|
|
"task_pass_equiv": 7.666667,
|
|
"task_pass_pct": 34.848488
|
|
}
|
|
},
|
|
"Claude 4 Sonnet": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 23,
|
|
"pass_pct": 67.64706,
|
|
"task_pass_equiv": 7.0,
|
|
"task_pass_pct": 70.0
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 22,
|
|
"pass_pct": 81.48148,
|
|
"task_pass_equiv": 10.0,
|
|
"task_pass_pct": 83.33333
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 45,
|
|
"pass_pct": 73.77049,
|
|
"task_pass_equiv": 17.0,
|
|
"task_pass_pct": 77.27273
|
|
}
|
|
},
|
|
"o4-mini": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"Gemini 2.5 Pro": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 17,
|
|
"pass_pct": 62.962963,
|
|
"task_pass_equiv": 8.0,
|
|
"task_pass_pct": 66.66667
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 17,
|
|
"pass_pct": 27.868853,
|
|
"task_pass_equiv": 8.0,
|
|
"task_pass_pct": 36.363636
|
|
}
|
|
},
|
|
"GPT-4o": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"llms.md": {
|
|
"models": {
|
|
"Gemini 2.5 Pro": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 1,
|
|
"total_tests": 4,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 1,
|
|
"total_tests": 4,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"GPT-5": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 34,
|
|
"pass_pct": 100.0,
|
|
"task_pass_equiv": 10.0,
|
|
"task_pass_pct": 100.0
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 27,
|
|
"pass_pct": 100.0,
|
|
"task_pass_equiv": 12.0,
|
|
"task_pass_pct": 100.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 61,
|
|
"pass_pct": 100.0,
|
|
"task_pass_equiv": 22.0,
|
|
"task_pass_pct": 100.0
|
|
}
|
|
},
|
|
"Gemini 2.5 Flash": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 1,
|
|
"total_tests": 4,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 1,
|
|
"total_tests": 4,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"Grok 3 Mini (Beta)": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 1,
|
|
"total_tests": 1,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 1,
|
|
"total_tests": 1,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"Claude 3.7 Sonnet": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 1,
|
|
"total_tests": 4,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 1,
|
|
"total_tests": 4,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"Claude 4.5 Sonnet": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 15,
|
|
"passed_tests": 3,
|
|
"pass_pct": 20.0,
|
|
"task_pass_equiv": 1.0,
|
|
"task_pass_pct": 10.0
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 12,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 27,
|
|
"passed_tests": 3,
|
|
"pass_pct": 11.111111,
|
|
"task_pass_equiv": 1.0,
|
|
"task_pass_pct": 4.5454545
|
|
}
|
|
},
|
|
"Claude 4 Sonnet": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 1,
|
|
"total_tests": 1,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 1,
|
|
"total_tests": 1,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"DeepSeek V3": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 1,
|
|
"total_tests": 4,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 1,
|
|
"total_tests": 4,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"Meta Llama 3.1 405B": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 1,
|
|
"total_tests": 1,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 1,
|
|
"total_tests": 1,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"Grok 4": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 1,
|
|
"total_tests": 1,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 1,
|
|
"total_tests": 1,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"Claude 3.5 Haiku": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 1,
|
|
"total_tests": 4,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 1,
|
|
"total_tests": 4,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"Claude 3.5 Sonnet": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 1,
|
|
"total_tests": 1,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 1,
|
|
"total_tests": 1,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"csharp": {
|
|
"modes": {
|
|
"llms.md": {
|
|
"models": {
|
|
"GPT-4.1": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 12,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 10,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 22,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"GPT-4o": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 12,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 10,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 22,
|
|
"passed_tests": 0,
|
|
"pass_pct": 0.0,
|
|
"task_pass_equiv": 0.0,
|
|
"task_pass_pct": 0.0
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"docs": {
|
|
"models": {
|
|
"o4-mini": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 18,
|
|
"pass_pct": 52.941177,
|
|
"task_pass_equiv": 6.0,
|
|
"task_pass_pct": 60.000004
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 17,
|
|
"pass_pct": 62.962963,
|
|
"task_pass_equiv": 7.8333335,
|
|
"task_pass_pct": 65.27778
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 35,
|
|
"pass_pct": 57.37705,
|
|
"task_pass_equiv": 13.833333,
|
|
"task_pass_pct": 62.878788
|
|
}
|
|
},
|
|
"Claude 4 Sonnet": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 22,
|
|
"pass_pct": 64.70588,
|
|
"task_pass_equiv": 7.0,
|
|
"task_pass_pct": 70.0
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 27,
|
|
"pass_pct": 100.0,
|
|
"task_pass_equiv": 12.0,
|
|
"task_pass_pct": 100.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 49,
|
|
"pass_pct": 80.327866,
|
|
"task_pass_equiv": 19.0,
|
|
"task_pass_pct": 86.36364
|
|
}
|
|
},
|
|
"Grok 4": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 20,
|
|
"pass_pct": 74.07407,
|
|
"task_pass_equiv": 10.0,
|
|
"task_pass_pct": 83.33333
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 31,
|
|
"pass_pct": 91.17647,
|
|
"task_pass_equiv": 9.0,
|
|
"task_pass_pct": 90.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 51,
|
|
"pass_pct": 83.60656,
|
|
"task_pass_equiv": 19.0,
|
|
"task_pass_pct": 86.36364
|
|
}
|
|
},
|
|
"DeepSeek R1": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 24,
|
|
"pass_pct": 88.888885,
|
|
"task_pass_equiv": 11.0,
|
|
"task_pass_pct": 91.66667
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 16,
|
|
"pass_pct": 47.058823,
|
|
"task_pass_equiv": 5.4,
|
|
"task_pass_pct": 54.000004
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 40,
|
|
"pass_pct": 65.57377,
|
|
"task_pass_equiv": 16.4,
|
|
"task_pass_pct": 74.545456
|
|
}
|
|
},
|
|
"Claude 4.5 Sonnet": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 26,
|
|
"pass_pct": 76.47059,
|
|
"task_pass_equiv": 7.6666665,
|
|
"task_pass_pct": 76.666664
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 27,
|
|
"pass_pct": 100.0,
|
|
"task_pass_equiv": 12.0,
|
|
"task_pass_pct": 100.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 53,
|
|
"pass_pct": 86.88525,
|
|
"task_pass_equiv": 19.666668,
|
|
"task_pass_pct": 89.39394
|
|
}
|
|
},
|
|
"Gemini 2.5 Pro": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 34,
|
|
"pass_pct": 100.0,
|
|
"task_pass_equiv": 10.0,
|
|
"task_pass_pct": 100.0
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 27,
|
|
"pass_pct": 100.0,
|
|
"task_pass_equiv": 12.0,
|
|
"task_pass_pct": 100.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 61,
|
|
"pass_pct": 100.0,
|
|
"task_pass_equiv": 22.0,
|
|
"task_pass_pct": 100.0
|
|
}
|
|
},
|
|
"Gemini 2.5 Flash": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 19,
|
|
"pass_pct": 55.882355,
|
|
"task_pass_equiv": 6.0666666,
|
|
"task_pass_pct": 60.666668
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 24,
|
|
"passed_tests": 18,
|
|
"pass_pct": 75.0,
|
|
"task_pass_equiv": 9.0,
|
|
"task_pass_pct": 75.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 58,
|
|
"passed_tests": 37,
|
|
"pass_pct": 63.793102,
|
|
"task_pass_equiv": 15.066667,
|
|
"task_pass_pct": 68.48485
|
|
}
|
|
},
|
|
"GPT-4o": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 19,
|
|
"pass_pct": 70.37037,
|
|
"task_pass_equiv": 8.833334,
|
|
"task_pass_pct": 73.611115
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 26,
|
|
"pass_pct": 76.47059,
|
|
"task_pass_equiv": 8.0,
|
|
"task_pass_pct": 80.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 45,
|
|
"pass_pct": 73.77049,
|
|
"task_pass_equiv": 16.833334,
|
|
"task_pass_pct": 76.51516
|
|
}
|
|
},
|
|
"DeepSeek V3": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 24,
|
|
"pass_pct": 88.888885,
|
|
"task_pass_equiv": 11.0,
|
|
"task_pass_pct": 91.66667
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 27,
|
|
"pass_pct": 79.411766,
|
|
"task_pass_equiv": 8.466667,
|
|
"task_pass_pct": 84.66667
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 51,
|
|
"pass_pct": 83.60656,
|
|
"task_pass_equiv": 19.466667,
|
|
"task_pass_pct": 88.484856
|
|
}
|
|
},
|
|
"Meta Llama 3.1 405B": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 26,
|
|
"pass_pct": 76.47059,
|
|
"task_pass_equiv": 8.25,
|
|
"task_pass_pct": 82.5
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 24,
|
|
"pass_pct": 88.888885,
|
|
"task_pass_equiv": 11.0,
|
|
"task_pass_pct": 91.66667
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 50,
|
|
"pass_pct": 81.96722,
|
|
"task_pass_equiv": 19.25,
|
|
"task_pass_pct": 87.5
|
|
}
|
|
},
|
|
"Grok 3 Mini (Beta)": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 17,
|
|
"pass_pct": 50.0,
|
|
"task_pass_equiv": 5.0,
|
|
"task_pass_pct": 50.0
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 9,
|
|
"pass_pct": 33.333332,
|
|
"task_pass_equiv": 5.0,
|
|
"task_pass_pct": 41.666664
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 26,
|
|
"pass_pct": 42.62295,
|
|
"task_pass_equiv": 10.0,
|
|
"task_pass_pct": 45.454548
|
|
}
|
|
},
|
|
"GPT-5": {
|
|
"categories": {
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 4,
|
|
"pass_pct": 11.764706,
|
|
"task_pass_equiv": 1.25,
|
|
"task_pass_pct": 12.5
|
|
},
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 5,
|
|
"pass_pct": 18.518518,
|
|
"task_pass_equiv": 1.3333334,
|
|
"task_pass_pct": 11.111112
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 9,
|
|
"pass_pct": 14.754098,
|
|
"task_pass_equiv": 2.5833335,
|
|
"task_pass_pct": 11.742425
|
|
}
|
|
},
|
|
"Claude 4.5 Haiku": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 27,
|
|
"pass_pct": 100.0,
|
|
"task_pass_equiv": 12.0,
|
|
"task_pass_pct": 100.0
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 21,
|
|
"pass_pct": 61.764706,
|
|
"task_pass_equiv": 5.8,
|
|
"task_pass_pct": 58.000004
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 48,
|
|
"pass_pct": 78.68852,
|
|
"task_pass_equiv": 17.8,
|
|
"task_pass_pct": 80.90909
|
|
}
|
|
},
|
|
"GPT-4.1": {
|
|
"categories": {
|
|
"basics": {
|
|
"tasks": 12,
|
|
"total_tests": 27,
|
|
"passed_tests": 23,
|
|
"pass_pct": 85.18519,
|
|
"task_pass_equiv": 10.0,
|
|
"task_pass_pct": 83.33333
|
|
},
|
|
"schema": {
|
|
"tasks": 10,
|
|
"total_tests": 34,
|
|
"passed_tests": 29,
|
|
"pass_pct": 85.29412,
|
|
"task_pass_equiv": 9.0,
|
|
"task_pass_pct": 90.0
|
|
}
|
|
},
|
|
"totals": {
|
|
"tasks": 22,
|
|
"total_tests": 61,
|
|
"passed_tests": 52,
|
|
"pass_pct": 85.2459,
|
|
"task_pass_equiv": 19.0,
|
|
"task_pass_pct": 86.36364
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} |