# SpacetimeDB/.github/workflows/llm-benchmark-update.yml

# Runs the LLM benchmark tool against a pull request, comments the results on
# the PR, and pushes the updated docs/llms files back to the PR branch.
name: Update LLM benchmarks

# Two triggers:
#   - workflow_dispatch: manual run; the target PR is given by `pr_number`.
#   - issue_comment: fires for every newly created comment; the job-level `if`
#     keeps only PR comments that start with `/update-llm-benchmark`.
on:
  workflow_dispatch:
    inputs:
      pr_number:
        description: "Pull Request Number"
        required: true
  issue_comment:
    types: [created] # only run when the comment is first created

# Scopes for the workflow's default token. `contents` stays read-only: the push
# back to the PR branch and the results comment both use the
# CLOCKWORK_LABS_BOT_PAT secret instead of this token.
# NOTE(review): the fork-pushability step posts its PR comment without an
# explicit `github-token`, which is presumably why `pull-requests` and `issues`
# need `write` here — confirm.
permissions:
  contents: read
  pull-requests: write
  issues: write

# One concurrency group per PR, so a newer benchmark request cancels the run
# that is still in progress. The folded scalar (`>-`) joins the three lines
# below with single spaces.
#
# The last expression appends `-unrelated-comment` for comments that do not
# start with `/update-llm-benchmark`. That puts ordinary PR chatter in its own
# group so it cannot cancel a running benchmark.
#
# NOTE(review): for matching comments and manual runs that last expression is
# falsy and presumably renders as the literal text `false` in the group name —
# harmless, but confirm. Also note the group is selected before the commenter
# permission check runs, so any `/update-llm-benchmark` comment on the PR can
# cancel an in-flight run.
concurrency:
  group: >-
    llm-benchmark
    -${{ github.event_name == 'issue_comment' && github.event.issue.number || inputs.pr_number }}
    ${{ github.event_name == 'issue_comment' && !startsWith(github.event.comment.body, '/update-llm-benchmark') && '-unrelated-comment' }}
  cancel-in-progress: true

jobs:
  # Builds the benchmark tool from master, runs it against the PR head,
  # comments the results on the PR and pushes updated docs/llms files to the
  # PR branch.
  update-llm-benchmark:
    # Runnable either with a comment that starts with /update-llm-benchmark
    # or by manually dispatching.
    # The `github.event.issue.pull_request` term restricts comment triggers to
    # comments made on pull requests, so comments on plain issues are skipped.
    if: |
      (github.event_name == 'issue_comment' && github.event.issue.pull_request && startsWith(github.event.comment.body, '/update-llm-benchmark')) ||
      (github.event_name == 'workflow_dispatch')
    # NOTE(review): this looks like the label of a self-hosted runner — confirm.
    runs-on: spacetimedb-new-runner
    container:
      # NOTE(review): the image comes from a registry at localhost:5000,
      # presumably one running on the runner host itself — confirm.
      image: localhost:5000/spacetimedb-ci:latest
      # The job container is started with --privileged.
      options: >-
        --privileged
    steps:
      # Here we install the spacetime CLI for faster execution of the tests
      # SpacetimeDB itself is not under test here, rather it's the docs.
      # If we want to change that it is possible to have the benchmark compile
      # SpacetimeDB from source.
      - name: Install spacetime CLI
        run: |
          set -e

          # Download the installer to a file before running it. With
          # `curl ... | sh` a failed download is masked: the pipeline's status
          # is that of `sh`, which reads an empty script and exits 0, so the
          # step would pass without installing anything.
          installer="$(mktemp)"
          curl -sSf https://install.spacetimedb.com -o "$installer"
          sh "$installer" -y
          rm -f "$installer"

          # Make the installed CLI available to all later steps.
          echo "$HOME/.local/bin" >> "$GITHUB_PATH"

      # Resolve which PR this run targets and expose its head branch details
      # as step outputs (`steps.pr.outputs.*`) for the later steps.
      - name: Load PR info
        id: pr
        uses: actions/github-script@v7
        with:
          script: |
            // Comment trigger: the PR number is the commented issue's number.
            // Manual trigger: it comes from the `pr_number` input, which must
            // be all digits.
            let prNumber;
            if (context.eventName === 'issue_comment') {
              prNumber = context.payload.issue.number;
            } else if (context.eventName === 'workflow_dispatch') {
              const raw = context.payload.inputs?.pr_number;
              if (!raw || !/^\d+$/.test(raw)) {
                core.setFailed(`Invalid pr_number input: '${raw}'.`);
                return;
              }
              prNumber = Number(raw);
            } else {
              core.setFailed(`Unsupported event: ${context.eventName}`);
              return;
            }

            const { data: pr } = await github.rest.pulls.get({
              owner: context.repo.owner,
              repo: context.repo.repo,
              pull_number: prNumber,
            });

            // The API reports `head.repo` as null once the source fork has
            // been deleted. There is then nothing to check out or push to, so
            // fail with a clear message instead of an opaque TypeError below.
            if (!pr.head.repo) {
              core.setFailed(`PR #${prNumber} has no head repository (the source fork may have been deleted).`);
              return;
            }

            core.setOutput('number', String(prNumber));
            core.setOutput('head_ref', pr.head.ref);
            core.setOutput('head_sha', pr.head.sha);
            core.setOutput('head_repo_full_name', pr.head.repo.full_name);
            core.setOutput('head_owner_type', pr.head.repo.owner.type); // "User"|"Organization"
            core.setOutput('maintainer_can_modify', String(pr.maintainer_can_modify));

      # Comment-triggered runs only: verify that the commenter holds one of the
      # allowed permission levels on this repo, so that unprivileged users
      # cannot start benchmark runs.
      # The workflow definition that executes is the one on the `master`
      # branch, NOT the one from the PR, so a PR author cannot weaken this
      # check or sneak in an exfiltration exploit by editing the workflow.
      - name: Check commenter permission
        if: github.event_name == 'issue_comment'
        uses: actions/github-script@v7
        with:
          script: |
            const commenter = context.payload.comment.user.login;
            const response = await github.rest.repos.getCollaboratorPermissionLevel({
              owner: context.repo.owner,
              repo: context.repo.repo,
              username: commenter,
            });
            const level = response.data.permission;

            // NOTE(review): per the REST docs the `permission` field carries
            // only the legacy roles (admin/write/read/none), so 'maintain' and
            // 'triage' presumably never match here — confirm whether
            // `role_name` was intended.
            const permitted = ['admin', 'maintain', 'write', 'triage'];
            if (permitted.includes(level)) {
              return;
            }
            core.setFailed(`User ${commenter} has permission '${level}', not allowed to run benchmarks.`);

      # Fork PRs only: the benchmark results are committed back to the PR
      # branch, which lives in the fork. That requires the PR to allow
      # maintainers of the SpacetimeDB repo to commit back to the fork.
      # When that is not possible, explain why in a PR comment and fail the job.
      - name: Check fork pushability (and comment if not)
        if: steps.pr.outputs.head_repo_full_name != github.repository
        uses: actions/github-script@v7
        env:
          PR_NUMBER: ${{ steps.pr.outputs.number }}
          HEAD_OWNER_TYPE: ${{ steps.pr.outputs.head_owner_type }}
          MAINTAINER_CAN_MODIFY: ${{ steps.pr.outputs.maintainer_can_modify }}
        with:
          script: |
            const prNumber = Number(process.env.PR_NUMBER);
            const ownerKind = process.env.HEAD_OWNER_TYPE;
            const editsAllowed = process.env.MAINTAINER_CAN_MODIFY === 'true';

            // Post `lines` as a single PR comment, then fail the step with `reason`.
            const refuse = async (lines, reason) => {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: prNumber,
                body: lines.join("\n"),
              });
              core.setFailed(reason);
            };

            if (ownerKind === 'Organization') {
              // Checked first: for org-owned forks the checkbox cannot help.
              await refuse(
                [
                  "I can’t push benchmark updates to this PR because it comes from an **organization-owned fork**.",
                  "GitHub doesn’t allow granting upstream maintainers push permissions to org-owned forks.",
                  "",
                  "Options:",
                  "- Reopen the PR from a **personal fork** with **Allow edits from maintainers** enabled, or",
                  "- A maintainer can apply the benchmark update on an internal branch."
                ],
                "Org-owned fork PR is not pushable by maintainers."
              );
            } else if (!editsAllowed) {
              await refuse(
                [
                  "I can’t push benchmark updates to this PR branch until you enable **Allow edits from maintainers**.",
                  "Please check the box on the PR page, then re-comment `/update-llm-benchmark`.",
                  "See https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/allowing-changes-to-a-pull-request-branch-created-from-a-fork"
                ],
                "maintainer_can_modify is false; author must enable 'Allow edits from maintainers'."
              );
            }

      # Run the benchmark that is already checked into master to prevent
      # an exfiltration attack whereby the PR author tries to sneak in an exploit
      # and get a maintainer to run the modified benchmark without looking at the
      # PR first. This ensures that we only ever execute code that is checked into
      # master.
      - name: Checkout master (build/install tool from trusted code)
        uses: actions/checkout@v4
        with:
          ref: master
          # Fetch full history rather than a shallow clone.
          fetch-depth: 0
          # Do not leave the workflow token in the checkout's git config.
          persist-credentials: false

      # Stable Rust toolchain plus a cargo build cache, used below to build the
      # benchmark tool with `cargo install`.
      - uses: dtolnay/rust-toolchain@stable
      - uses: Swatinem/rust-cache@v2

      # Ensure we use a user-writable .NET install (not /usr/share/dotnet),
      # so workload installs don't require sudo.
      - name: Setup .NET SDK
        uses: actions/setup-dotnet@v4
        with:
          # Quoted so the version pattern is passed through as a string.
          dotnet-version: "8.0.x"

      # Install the experimental WASI workload into the .NET SDK.
      # NOTE(review): presumably required by the benchmark's C# builds (the
      # benchmark step mentions `dotnet publish`) — confirm.
      - name: Install WASI workload (wasi-experimental)
        env:
          # NOTE(review): these appear intended to stop the dotnet CLI from
          # consulting machine-wide installs, to give it a writable home under
          # the runner temp directory, and to skip first-run setup — confirm.
          DOTNET_MULTILEVEL_LOOKUP: "0"
          DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home
          DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1"
        run: |
          # Print SDK/runtime details to the job log for debugging.
          dotnet --info
          dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel

      # Build and install the benchmark binary while the master checkout is
      # still in the workspace, so the code executed later is the trusted copy.
      - name: Install llm-benchmark tool from master
        run: |
          cargo install --path tools/xtask-llm-benchmark --locked
          # Fails the step if the binary did not end up on PATH.
          command -v llm_benchmark

      # Check out the repo on the branch, but ONLY use this code as data!
      # Never execute code that is on the PR branch.
      - name: Checkout PR head (branch)
        uses: actions/checkout@v4
        with:
          # May be a fork; this and `ref` come from the "Load PR info" step.
          repository: ${{ steps.pr.outputs.head_repo_full_name }}
          # Checked out by commit SHA rather than branch name, so the commit
          # benchmarked is the one recorded when the PR info was loaded. The
          # push step names the target branch explicitly (`HEAD:<branch>`).
          ref: ${{ steps.pr.outputs.head_sha }}
          # Full history rather than a shallow clone.
          fetch-depth: 0
          # Do not leave the workflow token in the checkout's git config.
          persist-credentials: false

      # Run the benchmark against the PR using the installed tool from the
      # master branch. The workspace now holds the PR checkout, which the tool
      # treats as input data.
      - name: Run benchmark (with provider keys)
        env:
          # Provider key is exposed to this step only through its environment.
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          # Prevent MSBuild node reuse issues that cause "Pipe is broken" errors
          # when running multiple dotnet publish commands in parallel.
          # See: https://github.com/dotnet/msbuild/issues/6657
          MSBUILDDISABLENODEREUSE: "1"
          DOTNET_CLI_USE_MSBUILD_SERVER: "0"
        run: |
          # NOTE(review): `ci-quickfix` presumably runs the benchmark and
          # rewrites the result files under docs/llms, and `ci-check` presumably
          # validates them — confirm against the tool's own documentation.
          llm_benchmark ci-quickfix
          llm_benchmark ci-check

      # Generate failure analysis if there are any failures.
      # This step has no `if:`, so it runs only when every earlier step
      # succeeded. NOTE(review): "failures" presumably means failing benchmark
      # tasks recorded in the results, not a failed workflow step — confirm.
      - name: Generate failure analysis
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          # Best effort: `|| true` keeps an analysis error from failing the job.
          llm_benchmark analyze -o docs/llms/docs-benchmark-analysis.md || true

      # Generate PR comment markdown (compares against master baseline)
      # NOTE(review): the PR comment step reads
      # docs/llms/docs-benchmark-comment.md, which is presumably the file this
      # command writes — confirm.
      - name: Generate PR comment markdown
        run: |
          llm_benchmark ci-comment

      # Guard: the benchmark is only expected to modify files under docs/llms.
      # Fail the job if `git diff` reports a modified path anywhere else.
      - name: Ensure only docs/llms changed
        run: |
          set -euo pipefail
          modified="$(git diff --name-only)"

          if [ -z "$modified" ]; then
            echo "No changes."
            exit 0
          fi

          # Collect the paths outside docs/llms. `|| true` is needed because
          # grep exits 1 when every path is inside docs/llms, which is the
          # success case here.
          stray="$(echo "$modified" | grep -vE '^docs/llms/' || true)"
          if [ -n "$stray" ]; then
            echo "Benchmark produced changes outside docs/llms:"
            echo "$stray"
            exit 1
          fi

      # Comment the benchmark results on the PR: the generated comment markdown,
      # plus the failure analysis (collapsed) when one exists. Posted with the
      # bot account's token; a new comment is created on every run.
      - name: Comment benchmark results on PR
        uses: actions/github-script@v7
        env:
          PR_NUMBER: ${{ steps.pr.outputs.number }}
        with:
          github-token: ${{ secrets.CLOCKWORK_LABS_BOT_PAT }}
          script: |
            const fs = require('fs');

            // GitHub rejects issue comments longer than 65536 characters.
            // Stay a little under that so an oversized analysis is trimmed
            // instead of failing the whole run at the final reporting step.
            const MAX_COMMENT_CHARS = 65000;
            const TRUNCATION_NOTE = '\n\n_Output truncated to fit the GitHub comment size limit._';
            const clip = (text, limit) =>
              text.length <= limit
                ? text
                : text.slice(0, Math.max(0, limit - TRUNCATION_NOTE.length)) + TRUNCATION_NOTE;

            // Read the pre-generated comment markdown
            const commentPath = 'docs/llms/docs-benchmark-comment.md';
            if (!fs.existsSync(commentPath)) {
              core.setFailed(`Comment file not found: ${commentPath}`);
              return;
            }
            let body = clip(fs.readFileSync(commentPath, 'utf8'), MAX_COMMENT_CHARS);

            // Check if failure analysis exists and append it
            const analysisPath = 'docs/llms/docs-benchmark-analysis.md';
            if (fs.existsSync(analysisPath)) {
              const analysis = fs.readFileSync(analysisPath, 'utf8');
              // Only include if there's meaningful content (not just "no failures")
              if (!analysis.includes('No failures found')) {
                const detailsOpen = `\n<details>\n<summary>Failure Analysis (click to expand)</summary>\n\n`;
                const detailsClose = `\n</details>`;
                // Trim the analysis itself so the closing tag always survives.
                const room = MAX_COMMENT_CHARS - body.length - detailsOpen.length - detailsClose.length;
                if (room > TRUNCATION_NOTE.length) {
                  body += detailsOpen + clip(analysis, room) + detailsClose;
                } else {
                  console.log('Skipping failure analysis: no room left in the comment.');
                }
              }
            }

            const issue_number = Number(process.env.PR_NUMBER);

            // Always post a new comment
            console.log(`Posting new comment on PR #${issue_number}...`);
            try {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number,
                body,
              });
              console.log('Comment created successfully');
            } catch (err) {
              // Log details for debugging, then rethrow so the step still fails.
              console.error('Failed to post comment:', err.message);
              console.error('Full error:', JSON.stringify(err, null, 2));
              throw err;
            }

      # Stage and commit whatever the benchmark wrote under docs/llms, as the
      # bot user. No commit is created when nothing ends up staged.
      - name: Commit changes
        run: |
          git config user.email "clockwork-labs-bot@users.noreply.github.com"
          git config user.name "clockwork-labs-bot"

          # Stage only the benchmark output area (adjust as needed)
          git add docs/llms

          # `git diff --cached --quiet` exits 0 when the index has no changes.
          if git diff --cached --quiet; then
            exit 0
          fi
          git commit -m "Update LLM benchmark results"

      # Here we use the https://github.com/clockwork-labs-bot user's
      # personal access token to commit back to the PR branch. This is necessary
      # if we want to be able to push back to external contributor forks.
      #
      # The PR's repository and branch names are handed to the script through
      # environment variables instead of `${{ }}` interpolation. The branch name
      # is chosen by the PR author; expanding it directly into the script text
      # would let a crafted name (e.g. one containing `$(...)`) run arbitrary
      # commands in a step that holds the bot token.
      - name: Push back to PR branch (same repo or fork)
        env:
          GH_TOKEN: ${{ secrets.CLOCKWORK_LABS_BOT_PAT }}
          HEAD_REPO: ${{ steps.pr.outputs.head_repo_full_name }}
          HEAD_REF: ${{ steps.pr.outputs.head_ref }}
        run: |
          git remote set-url origin "https://x-access-token:${GH_TOKEN}@github.com/${HEAD_REPO}.git"
          # Fetch and rebase in case branch moved since workflow started (e.g., previous benchmark run)
          git fetch origin "${HEAD_REF}"
          if ! git rebase "origin/${HEAD_REF}"; then
            git rebase --abort
            echo "::error::Rebase failed due to conflicts. The PR branch may have been updated during the benchmark run. Please re-run /update-llm-benchmark."
            exit 1
          fi
          # HEAD is pushed to the branch by explicit refspec.
          git push origin "HEAD:${HEAD_REF}"