# Advisory evaluation of skills with the `waza` CLI.
# Triggers: pull requests that touch skill/eval/config files, or a manual
# dispatch that may optionally name a single skill.
name: Waza skill evals

on:
  pull_request:
    # Only run when skill content, eval definitions, waza config, or this
    # workflow file change.
    paths:
      - '.github/skills/**/SKILL.md'
      - '.github/skills/**/eval.yaml'
      - '.github/skills/**/tasks/**'
      - '.github/skills/**/fixtures/**'
      - '.github/evals/**'
      - '.waza.yaml'
      - '.github/workflows/waza-evals.yml'
  workflow_dispatch:
    inputs:
      skill:
        description: 'Single skill name to run (default: all configured pilot evals)'
        required: false
        type: string

# Least privilege by default: jobs only need to read the repository.
# The `comment` job declares `pull-requests: write` at job level, so the
# write scope is not granted to jobs that run the eval tooling.
permissions:
  contents: read

# One in-flight run per PR (or per ref for manual runs); newer runs cancel
# older ones.
concurrency:
  group: waza-evals-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  # Pinned waza release tag read by the install steps.
  WAZA_VERSION: 'v0.33.0'
jobs:
preflight:
  # Gate job: every other job runs only when `enabled` is 'true'.
  name: Preflight (check secrets)
  runs-on: ubuntu-latest
  timeout-minutes: 2
  outputs:
    # 'true' only when the token is set and the releases probe returns 200.
    enabled: ${{ steps.check.outputs.enabled }}
  steps:
    - name: Check COPILOT_GITHUB_TOKEN availability
      id: check
      env:
        TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
      run: |
        # No secret configured: disable the downstream jobs without failing.
        if [ -z "${TOKEN:-}" ]; then
          echo "enabled=false" >> "$GITHUB_OUTPUT"
          echo "::notice::COPILOT_GITHUB_TOKEN secret is not set. Skipping all waza skill eval jobs. See repo README / PR #109 for setup."
          exit 0
        fi
        # Token is set — verify it can actually read the private microsoft/waza
        # repo (release downloads need access). Reject silently if 401/403/404.
        # Capture headers + body for diagnostics (no token is ever printed).
        hdr_file=$(mktemp)
        body_file=$(mktemp)
        # --max-time bounds a stalled request so it degrades to
        # enabled=false (non-200 code) instead of hitting the job timeout.
        # `|| true` keeps a curl transport failure from aborting the script;
        # the captured code is then simply not "200".
        http_code=$(curl -sS --max-time 30 -D "${hdr_file}" -o "${body_file}" -w "%{http_code}" \
          -H "Authorization: Bearer ${TOKEN}" \
          -H "Accept: application/vnd.github+json" \
          -H "X-GitHub-Api-Version: 2022-11-28" \
          https://api.github.com/repos/microsoft/waza/releases/latest || true)
        if [ "${http_code}" = "200" ]; then
          echo "enabled=true" >> "$GITHUB_OUTPUT"
          echo "COPILOT_GITHUB_TOKEN can read microsoft/waza — eval jobs will run."
        else
          echo "enabled=false" >> "$GITHUB_OUTPUT"
          echo "::notice::COPILOT_GITHUB_TOKEN cannot read microsoft/waza (HTTP ${http_code}). Skipping all waza skill eval jobs."
          echo "--- diagnostic: response headers (token not included) ---"
          # Only an allow-list of response headers is printed.
          grep -iE '^(http|x-oauth-scopes|x-accepted-oauth-scopes|x-github-sso|x-ratelimit-remaining|x-ratelimit-used|x-github-request-id):' "${hdr_file}" || true
          echo "--- diagnostic: response body (first 500 bytes) ---"
          head -c 500 "${body_file}" || true
          echo
          echo "--- diagnostic: token-user identity probe ---"
          user_code=$(curl -sS --max-time 30 -o "${body_file}.user" -w "%{http_code}" \
            -H "Authorization: Bearer ${TOKEN}" \
            -H "Accept: application/vnd.github+json" \
            https://api.github.com/user || true)
          echo "GET /user -> HTTP ${user_code}"
          if [ "${user_code}" = "200" ]; then
            # Print only the login + token type, never the token itself.
            jq -r '"token user: \(.login) (type: \(.type))"' "${body_file}.user" 2>/dev/null || head -c 200 "${body_file}.user"
          else
            head -c 300 "${body_file}.user" || true
          fi
          echo
        fi
        # Remove the captured API responses in both outcomes.
        rm -f "${hdr_file}" "${body_file}" "${body_file}.user"
prepare:
  # Computes which (skill, model) legs the `eval` matrix should run.
  name: Determine matrix
  runs-on: ubuntu-latest
  timeout-minutes: 5
  needs: preflight
  if: needs.preflight.outputs.enabled == 'true'
  outputs:
    # JSON array of selected skill names.
    skills: ${{ steps.select.outputs.skills }}
    # JSON array of {skill, model, baseline} objects (the eval matrix).
    legs: ${{ steps.select.outputs.legs }}
    # JSON array of model names flagged `baseline: true` in the manifest.
    baseline_models: ${{ steps.select.outputs.baseline_models }}
    # Human-readable explanation of the selection.
    reason: ${{ steps.select.outputs.reason }}
    # One of: single | full | subset | none.
    mode: ${{ steps.select.outputs.mode }}
  steps:
    - name: Checkout
      uses: actions/checkout@v6
      with:
        # Full history so the PR base/head commits can be diffed.
        fetch-depth: 0
    - name: Select skills
      id: select
      env:
        REQUESTED: ${{ inputs.skill }}
        EVENT: ${{ github.event_name }}
        BASE_SHA: ${{ github.event.pull_request.base.sha }}
        HEAD_SHA: ${{ github.event.pull_request.head.sha }}
      run: |
        set -euo pipefail
        manifest=".github/evals/manifest.yaml"
        if [ ! -f "$manifest" ]; then
          echo "::error::manifest not found: $manifest"
          exit 1
        fi
        # Convert the manifest to JSON once; every query below uses jq.
        manifest_json="$(yq -o=json '.' "$manifest")"
        ALL_SKILLS="$(echo "$manifest_json" | jq -c '[.skills[].name]')"
        BASELINE_MODELS="$(echo "$manifest_json" | jq -c '
          [ .tiers[].models[] | select(.baseline == true) | .name ] | unique
        ')"
        echo "ALL_SKILLS=$ALL_SKILLS"
        echo "BASELINE_MODELS=$BASELINE_MODELS"

        # emit <selected-json-array> <mode> <reason>
        # Expands the selected skills into matrix legs (one per model of the
        # skill's tier) and writes all step outputs.
        emit() {
          local selected="$1" mode="$2" reason="$3"
          local legs
          legs="$(echo "$manifest_json" | jq -c --argjson sel "$selected" '
            . as $root
            | [ $root.skills[]
                | .name as $sname
                | select($sel | index($sname))
                | .tier as $tier
                | $root.tiers[$tier].models[]
                | { skill: $sname, model: .name, baseline: (.baseline == true) }
              ]
          ')"
          {
            echo "skills=${selected}"
            echo "legs=${legs}"
            echo "baseline_models=${BASELINE_MODELS}"
            echo "mode=${mode}"
            echo "reason=${reason}"
          } >> "$GITHUB_OUTPUT"
          echo "Selected skills: ${selected}"
          echo "Legs: ${legs}"
          echo "Mode: ${mode}"
          echo "Reason: ${reason}"
        }

        # Manual run naming one skill: it must exist in the manifest.
        if [ "$EVENT" = "workflow_dispatch" ] && [ -n "${REQUESTED:-}" ]; then
          if echo "$ALL_SKILLS" | jq -e --arg s "$REQUESTED" '. | index($s)' > /dev/null; then
            # Build the one-element array with jq so the name is always
            # correctly JSON-escaped.
            emit "$(jq -nc --arg s "$REQUESTED" '[$s]')" "single" "workflow_dispatch input ($REQUESTED)"
            exit 0
          else
            echo "::error::Requested skill '$REQUESTED' is not in the manifest ($ALL_SKILLS)"
            exit 1
          fi
        fi

        # Manual run without input: everything.
        if [ "$EVENT" = "workflow_dispatch" ]; then
          emit "$ALL_SKILLS" "full" "workflow_dispatch (no input — full matrix)"
          exit 0
        fi

        # Pull request without usable SHAs: fail open to the full matrix.
        if [ -z "${BASE_SHA:-}" ] || [ -z "${HEAD_SHA:-}" ]; then
          emit "$ALL_SKILLS" "full" "pull_request: missing base/head SHA — full matrix"
          exit 0
        fi

        git fetch --no-tags origin "$BASE_SHA" 2>/dev/null || true
        # Three-dot form diffs HEAD against the merge base, so files changed
        # only on the base branch since the PR forked are not counted.
        # A diff failure must not look like "nothing changed" (that would
        # silently skip every eval): fail open to the full matrix instead.
        if ! changed="$(git diff --name-only "${BASE_SHA}...${HEAD_SHA}")"; then
          emit "$ALL_SKILLS" "full" "pull_request: could not diff base/head — full matrix"
          exit 0
        fi
        if [ -z "$changed" ]; then
          emit "[]" "none" "no files changed in PR"
          exit 0
        fi
        echo "--- changed files ---"
        echo "$changed"
        echo "---------------------"

        # Project-wide config touched: run everything. A here-string avoids a
        # SIGPIPE-induced false negative from `echo | grep -q` under pipefail.
        if grep -qE '^(\.waza\.yaml|\.github/workflows/waza-evals\.yml|\.github/evals/manifest\.yaml)$' <<< "$changed"; then
          emit "$ALL_SKILLS" "full" "project-wide config change (.waza.yaml, manifest, or workflow file) — full matrix"
          exit 0
        fi

        # The third path component under .github/skills/ or .github/evals/ is
        # taken as the skill name; NF >= 4 requires a file below that directory.
        changed_skills=$(
          echo "$changed" | awk -F/ '
            /^\.github\/skills\// && NF >= 4 {print $3}
            /^\.github\/evals\// && NF >= 4 {print $3}
          ' | sort -u
        )
        if [ -z "$changed_skills" ]; then
          emit "[]" "none" "no per-skill files changed"
          exit 0
        fi

        # Keep only changed skills that the manifest knows about.
        selected=$(
          printf '%s\n' "$changed_skills" \
            | jq -R -s -c --argjson all "$ALL_SKILLS" \
              '[ split("\n")[] | select(length > 0) | select(IN($all[])) ]'
        )
        if [ "$selected" = "[]" ]; then
          emit "[]" "none" "changed skill(s) not in the manifest: $(echo "$changed_skills" | tr '\n' ' ')"
          exit 0
        fi
        count=$(echo "$selected" | jq 'length')
        names=$(echo "$selected" | jq -r 'join(", ")')
        emit "$selected" "subset" "diff-scoped: ${count} changed skill(s) — ${names}"
tokens:
  # Advisory job: runs `waza tokens compare` against `main` and uploads the
  # output as an artifact. It is never allowed to fail the workflow.
  name: Token comparison vs main (advisory)
  needs: preflight
  if: needs.preflight.outputs.enabled == 'true'
  runs-on: ubuntu-latest
  timeout-minutes: 10
  continue-on-error: true
  env:
    GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
    WAZA_NO_UPDATE_CHECK: '1'
  steps:
    - name: Checkout
      uses: actions/checkout@v6
      with:
        fetch-depth: 0
    - name: Install waza (pinned release)
      run: |
        set -euo pipefail
        version="${WAZA_VERSION}"
        if [ -z "${version}" ]; then
          echo "::error::WAZA_VERSION env var is not set"
          exit 1
        fi
        echo "Installing waza ${version}"
        # Release assets are named waza-<os>-<arch>; normalise `uname -m`
        # spellings and leave any other value untouched.
        platform="$(uname -s | tr '[:upper:]' '[:lower:]')"
        cpu="$(uname -m)"
        if [ "${cpu}" = "x86_64" ] || [ "${cpu}" = "amd64" ]; then
          cpu=amd64
        elif [ "${cpu}" = "aarch64" ] || [ "${cpu}" = "arm64" ]; then
          cpu=arm64
        fi
        binary="waza-${platform}-${cpu}"
        release_url="https://github.com/microsoft/waza/releases/download/${version}"
        workdir="$(mktemp -d)"
        # Fetch the binary first, then the checksum list.
        for file in "${binary}" checksums.txt; do
          curl -fsSL -o "${workdir}/${file}" "${release_url}/${file}"
        done
        # Verify the binary against its checksums.txt entry before installing.
        ( cd "${workdir}" && grep " ${binary}$" checksums.txt | sha256sum -c - )
        sudo install -m 0755 "${workdir}/${binary}" /usr/local/bin/waza
        rm -rf "${workdir}"
        waza --version
    - name: Token comparison vs main (advisory)
      id: tokens-compare
      run: |
        set -uo pipefail
        mkdir -p .waza-results
        report=.waza-results/tokens-compare.json
        # No --strict, and `|| true`: a non-zero exit from waza is ignored.
        # stdout and stderr both land in the report file.
        waza tokens compare main --skills --threshold 10 --format json \
          > "${report}" 2>&1 || true
        echo "--- token comparison output ---"
        cat "${report}" || true
        # Advisory only: the step always succeeds.
        exit 0
    - name: Upload token comparison artifact
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: waza-tokens-compare
        path: .waza-results/tokens-compare.json
        retention-days: 14
        if-no-files-found: warn
        # .waza-results is a dot-directory; hidden files must be included.
        include-hidden-files: true
eval:
  # One leg per {skill, model, baseline} entry produced by `prepare`.
  # Legs are advisory: continue-on-error plus fail-fast: false means one
  # leg's failure neither fails the workflow nor cancels its siblings.
  name: "${{ matrix.skill || 'eval' }} / ${{ matrix.model || 'skipped (no skill changes)' }}"
  needs: [preflight, prepare]
  # Skip entirely when preflight disabled the run or no legs were selected.
  if: needs.preflight.outputs.enabled == 'true' && needs.prepare.outputs.legs != '[]' && needs.prepare.outputs.legs != ''
  runs-on: ubuntu-latest
  timeout-minutes: 25
  continue-on-error: true
  strategy:
    fail-fast: false
    matrix:
      include: ${{ fromJSON(needs.prepare.outputs.legs) }}
  env:
    GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
    WAZA_NO_UPDATE_CHECK: '1'
  steps:
    - name: Checkout
      uses: actions/checkout@v6
      with:
        fetch-depth: 0
    - name: Install waza (pinned release)
      run: |
        set -euo pipefail
        version="${WAZA_VERSION}"
        if [ -z "${version}" ]; then
          echo "::error::WAZA_VERSION env var is not set"
          exit 1
        fi
        echo "Installing waza ${version}"
        # Release assets are named waza-<os>-<arch>; normalise `uname -m`
        # spellings and leave any other value untouched.
        platform="$(uname -s | tr '[:upper:]' '[:lower:]')"
        cpu="$(uname -m)"
        if [ "${cpu}" = "x86_64" ] || [ "${cpu}" = "amd64" ]; then
          cpu=amd64
        elif [ "${cpu}" = "aarch64" ] || [ "${cpu}" = "arm64" ]; then
          cpu=arm64
        fi
        binary="waza-${platform}-${cpu}"
        release_url="https://github.com/microsoft/waza/releases/download/${version}"
        workdir="$(mktemp -d)"
        # Fetch the binary first, then the checksum list.
        for file in "${binary}" checksums.txt; do
          curl -fsSL -o "${workdir}/${file}" "${release_url}/${file}"
        done
        # Verify the binary against its checksums.txt entry before installing.
        ( cd "${workdir}" && grep " ${binary}$" checksums.txt | sha256sum -c - )
        sudo install -m 0755 "${workdir}/${binary}" /usr/local/bin/waza
        rm -rf "${workdir}"
        waza --version
# Runs `waza run` for this matrix leg, retrying up to three times when the
# run produces no JSON or the JSON records "Session not found" errors.
# The step always exits 0; waza's last exit code is exposed as the
# `exit_code` step output instead.
- name: Run waza eval (advisory)
id: run
run: |
# GitHub's default shell is `bash -e`. `set -uo pipefail` does NOT
# disable -e, so a non-zero exit from `waza run` (e.g. metric below
# threshold) kills the script before `rc=$?` runs. Explicitly
# disable errexit so we can capture the code and surface it in the
# PR comment instead of failing the leg silently.
set +e
set -uo pipefail
mkdir -p .waza-results
# NOTE(review): the matrix expressions in this script are substituted into
# the script text before bash runs; passing them through `env:` would avoid
# shell-quoting surprises — confirm manifest names are trusted.
# The eval spec is read from .github/evals/<skill>/eval.yaml.
spec=".github/evals/${{ matrix.skill }}/eval.yaml"
# Slug used for filenames + artifact suffix; harmless when the
# model has dots (gpt-5.4) since GH Actions allows them.
slug="${{ matrix.skill }}-${{ matrix.model }}"
# Legs flagged baseline in the matrix add the --baseline flag.
extra_flags=""
if [ "${{ matrix.baseline }}" = "true" ]; then
extra_flags="--baseline"
fi
# Retry loop state: `rc` holds the exit code of the most recent attempt.
max_attempts=3
attempt=0
rc=0
while [ $attempt -lt $max_attempts ]; do
attempt=$((attempt + 1))
echo "::group::waza run attempt ${attempt}/${max_attempts} for ${slug}"
rc=0
# stdout (the github-comment markdown) is captured in <slug>.md; the JSON
# result and the JUnit report are written by --output / --reporter.
waza run "${spec}" \
--model "${{ matrix.model }}" \
--judge-model "claude-opus-4.7" \
--suggest \
--recommend \
${extra_flags} \
--format "github-comment" \
--output ".waza-results/${slug}.json" \
--reporter "junit:.waza-results/${slug}.junit.xml" \
--parallel \
> ".waza-results/${slug}.md"
rc=$?
echo "::endgroup::"
# No JSON at all: retry after a short pause, or give up on the last attempt.
if [ ! -f ".waza-results/${slug}.json" ]; then
echo "::warning::attempt ${attempt}: no JSON produced (rc=${rc})"
if [ $attempt -lt $max_attempts ]; then sleep 5; continue; fi
break
fi
# Count runs whose error_msg contains "Session not found"; if jq itself
# fails, the count falls back to 0.
session_errs=$(jq -r '
[.tasks[]?.runs[]? | select(.error_msg // "" | contains("Session not found"))] | length
' ".waza-results/${slug}.json" 2>/dev/null || echo 0)
if [ "${session_errs}" = "0" ]; then
echo "::notice::${slug} attempt ${attempt} clean (no session-not-found errors)"
break
fi
echo "::warning::${slug} attempt ${attempt} hit ${session_errs} session-not-found error(s)"
# Discard this attempt's outputs before retrying; on the final attempt
# they are kept so the check below can inspect them.
if [ $attempt -lt $max_attempts ]; then
rm -f ".waza-results/${slug}.json" ".waza-results/${slug}.md" ".waza-results/${slug}.junit.xml"
sleep 5
fi
done
# Re-count session errors on whatever JSON survived the loop.
final_session_errs=0
if [ -f ".waza-results/${slug}.json" ]; then
final_session_errs=$(jq -r '
[.tasks[]?.runs[]? | select(.error_msg // "" | contains("Session not found"))] | length
' ".waza-results/${slug}.json" 2>/dev/null || echo 0)
fi
# Errors persisted through the last attempt: write a <slug>.infra-failed
# marker, delete the JSON/JUnit outputs, and replace <slug>.md with an
# explanation so no score is reported for this leg.
if [ "${final_session_errs}" != "0" ]; then
echo "::error::${slug} still has ${final_session_errs} session-not-found error(s) after ${max_attempts} attempts โ discarding corrupt artifact"
printf 'session_not_found_errors=%s\nattempts=%s\nlast_exit_code=%s\n' \
"${final_session_errs}" "${max_attempts}" "${rc}" \
> ".waza-results/${slug}.infra-failed"
rm -f ".waza-results/${slug}.json" ".waza-results/${slug}.junit.xml"
# NOTE(review): the first printf in the group below shows an unterminated
# format string in this view — confirm against the original file.
{
printf '
printf 'waza run hit `%s` `Session not found` JSON-RPC error(s) ' "${final_session_errs}"
printf 'from the Copilot SDK after **%s attempt(s)**. ' "${max_attempts}"
printf 'The session-resume path used by `prompt` graders with '
printf '`continue_session: true` is intermittently flaky in CI; '
printf 'retries did not recover. **No score is reported for this leg** '
printf 'โ treating a corrupted run as a low score would be misleading.\n'
} > ".waza-results/${slug}.md"
fi
# Expose waza's last exit code as a step output; the step itself succeeds.
echo "exit_code=${rc}" >> "$GITHUB_OUTPUT"
echo
echo "--- captured PR-comment markdown ---"
cat ".waza-results/${slug}.md" || true
exit 0
# The three advisory steps below receive the matrix values through `env:`
# rather than inline `${{ }}` expansion, so the values reach bash as data and
# are never parsed as part of the script text.
- name: Tokens profile (advisory)
  id: tokens-profile
  continue-on-error: true
  env:
    MATRIX_SKILL: ${{ matrix.skill }}
    MATRIX_MODEL: ${{ matrix.model }}
  run: |
    set -uo pipefail
    slug="${MATRIX_SKILL}-${MATRIX_MODEL}"
    mkdir -p .waza-results
    # stdout and stderr are captured together; a waza failure is ignored.
    waza tokens profile ".github/skills/${MATRIX_SKILL}" \
      > ".waza-results/${slug}-tokens-profile.txt" 2>&1 || true
    cat ".waza-results/${slug}-tokens-profile.txt" || true
    exit 0
- name: Quality signal (advisory)
  id: quality
  continue-on-error: true
  env:
    MATRIX_SKILL: ${{ matrix.skill }}
    MATRIX_MODEL: ${{ matrix.model }}
  run: |
    set -uo pipefail
    slug="${MATRIX_SKILL}-${MATRIX_MODEL}"
    mkdir -p .waza-results
    # --judge-model omitted: this step uses the project default judge model
    # (claude-sonnet-4.6 from .waza.yaml) for consistent quality scoring
    # regardless of which executor model is running in this matrix leg.
    waza quality ".github/skills/${MATRIX_SKILL}" --format table \
      > ".waza-results/${slug}-quality.txt" 2>&1 || true
    cat ".waza-results/${slug}-quality.txt" || true
    exit 0
- name: Compliance check (advisory)
  id: check
  continue-on-error: true
  env:
    MATRIX_SKILL: ${{ matrix.skill }}
    MATRIX_MODEL: ${{ matrix.model }}
  run: |
    set -uo pipefail
    slug="${MATRIX_SKILL}-${MATRIX_MODEL}"
    mkdir -p .waza-results
    waza check ".github/skills/${MATRIX_SKILL}" \
      > ".waza-results/${slug}-check.txt" 2>&1 || true
    cat ".waza-results/${slug}-check.txt" || true
    exit 0
# Upload everything under .waza-results/ for this leg, even if earlier steps
# failed. The artifact name carries the skill and model.
- name: Upload eval artifacts
  if: always()
  uses: actions/upload-artifact@v7
  with:
    name: waza-results-${{ matrix.skill }}-${{ matrix.model }}
    path: .waza-results/
    retention-days: 14
    if-no-files-found: warn
    # .waza-results is a dot-directory; hidden files must be included.
    include-hidden-files: true
comment:
# Aggregates the per-leg artifacts and the token-comparison artifact into a
# single PR comment, identified by an HTML marker and updated in place.
name: Post advisory comment on PR
needs: [preflight, prepare, eval, tokens]
# `always()` lets this job run even when needed jobs failed or were skipped;
# it still requires a pull_request event and an enabled preflight.
if: github.event_name == 'pull_request' && needs.preflight.outputs.enabled == 'true' && always()
runs-on: ubuntu-latest
# Job-level permissions: this job creates/updates an issue comment on the PR.
permissions:
contents: read
pull-requests: write
steps:
# With a pattern and merge-multiple: false, each artifact is read back from
# its own directory under artifacts/ (see `allDirs` in the script).
- name: Download all eval artifacts
uses: actions/download-artifact@v8
with:
path: artifacts
pattern: waza-results-*
merge-multiple: false
# The tokens job is advisory, so a missing artifact must not fail this job.
- name: Download token comparison artifact
uses: actions/download-artifact@v8
with:
name: waza-tokens-compare
path: artifacts/waza-tokens-compare
continue-on-error: true
- name: Aggregate and post comment
uses: actions/github-script@v9
# Outputs of the prepare job are handed to the script as environment strings.
env:
PREPARE_MODE: ${{ needs.prepare.outputs.mode }}
PREPARE_REASON: ${{ needs.prepare.outputs.reason }}
PREPARE_SKILLS: ${{ needs.prepare.outputs.skills }}
PREPARE_BASELINES: ${{ needs.prepare.outputs.baseline_models }}
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const fs = require('fs');
const path = require('path');
// Each matrix job uploads `waza-results-<skill>-<model>`
// containing per-leg files (slug.md, slug-tokens-profile.txt,
// slug-quality.txt, slug-check.txt).
//
// Skill ordering and baseline-model classification are sourced
// from .github/evals/manifest.yaml via the prepare job — no
// hardcoded lists in this workflow.
const skills = JSON.parse(process.env.PREPARE_SKILLS || '[]');
const baselineModels = new Set(
JSON.parse(process.env.PREPARE_BASELINES || '[]')
);
// Downloaded artifact directories, sorted by name; empty when the
// download step produced nothing.
const root = 'artifacts';
const allDirs = fs.existsSync(root)
? fs.readdirSync(root)
.filter((d) => d.startsWith('waza-results-'))
.sort()
: [];
// Helper: read a file, return trimmed content or fallback string.
// Logs a debug note when returning the fallback so missing artifacts
// are visible in the Actions log without failing the step.
function readArtifact(filePath, fallback) {
if (fs.existsSync(filePath)) {
const c = fs.readFileSync(filePath, 'utf8').trim();
if (c) return c;
core.debug(`readArtifact: file exists but is empty โ ${filePath}`);
} else {
core.debug(`readArtifact: file not found โ ${filePath}`);
}
return fallback;
}
// Helper: wrap content in a <details> block if it exceeds threshold.
// `threshold` is a line count and defaults to 50 when omitted (or falsy).
// Shorter content is returned under a bold summary line instead.
function maybeCollapse(summary, content, threshold) {
const limit = threshold || 50;
const lines = content.split('\n').length;
if (lines > limit) {
return `<details><summary>${summary} (${lines} lines โ click to expand)</summary>\n\n${content}\n\n</details>`;
}
return `**${summary}**\n\n${content}`;
}
// Group artifacts by skill.
// The directory name minus the `waza-results-` prefix is `<skill>-<model>`;
// the skill is recovered by prefix-matching against the known skill list.
// NOTE(review): `find` returns the first listed skill that is a prefix, so
// if one skill name is a `-`-delimited prefix of another (e.g. `foo` and
// `foo-bar`) a leg may be attributed to the shorter name — confirm that
// manifest skill names cannot collide this way.
const bySkill = new Map();
for (const d of allDirs) {
const rest = d.replace(/^waza-results-/, '');
const skill = skills.find((s) => rest === s || rest.startsWith(s + '-'));
if (!skill) continue;
const model = rest === skill ? '(default)' : rest.slice(skill.length + 1);
if (!bySkill.has(skill)) bySkill.set(skill, []);
bySkill.get(skill).push({ model, dir: d, slug: rest });
}
// Token comparison section (top-level, from tokens job).
// Left empty when the artifact is missing or blank.
let tokenCompareSection = '';
const tcPath = path.join(root, 'waza-tokens-compare', 'tokens-compare.json');
const tcRaw = readArtifact(tcPath, '');
if (tcRaw) {
const tcBlock = '```json\n' + tcRaw + '\n```';
tokenCompareSection = [
'<details><summary>๐ Token comparison vs <code>main</code> (advisory)</summary>',
'',
tcBlock,
'',
'</details>',
'',
].join('\n');
}
// Build per-skill sections.
// Skills are visited in PREPARE_SKILLS order; legs are sorted by model name.
const sections = [];
for (const skill of skills) {
if (!bySkill.has(skill)) continue;
const legs = bySkill.get(skill).sort((a, b) => a.model.localeCompare(b.model));
// Score (per model) + Suggestions/Recommendations
const scoreParts = [];
for (const leg of legs) {
const isBaseline = baselineModels.has(leg.model);
const modelLabel = isBaseline
? leg.model + ' *(baseline โ A/B mode)*'
: leg.model;
const mdPath = path.join(root, leg.dir, leg.slug + '.md');
const body = readArtifact(mdPath,
'_No output captured. See workflow logs and the `' + leg.dir + '` artifact._');
scoreParts.push('<details><summary>Model: <code>' + modelLabel +
'</code></summary>\n\n' + body + '\n\n</details>');
}
const scoreSection = '<details><summary>๐ Score (per model) + Suggestions/Recommendations</summary>\n\n' +
scoreParts.join('\n\n') + '\n\n</details>';
// Tokens (count + profile) — model-independent, use first available leg.
let tokenBody = '_Not available._';
for (const leg of legs) {
const tp = path.join(root, leg.dir, leg.slug + '-tokens-profile.txt');
const c = readArtifact(tp, '');
if (c) { tokenBody = '```\n' + c + '\n```'; break; }
}
const tokenSection = maybeCollapse('๐ข Tokens (count + profile)', tokenBody);
// Quality (5-dim table) — model-independent, use first available leg.
let qualityBody = '_Not available._';
for (const leg of legs) {
const qp = path.join(root, leg.dir, leg.slug + '-quality.txt');
const c = readArtifact(qp, '');
if (c) { qualityBody = '```\n' + c + '\n```'; break; }
}
const qualitySection = maybeCollapse('๐ฏ Quality (5-dim table)', qualityBody);
// Check (compliance summary) — model-independent, use first available leg.
let checkBody = '_Not available._';
for (const leg of legs) {
const cp = path.join(root, leg.dir, leg.slug + '-check.txt');
const c = readArtifact(cp, '');
if (c) { checkBody = '```\n' + c + '\n```'; break; }
}
// `waza check` expects `eval.yaml` colocated with `SKILL.md`. This
// repo separates them (`.github/skills/<name>/SKILL.md` vs
// `.github/evals/<name>/eval.yaml`), so the "Evaluation Suite:
// Not Found" line is a false negative — the eval actually ran
// (see the "Score" section above). Prepend a note so reviewers
// are not misled.
const checkNote =
'> โน๏ธ **`waza check` expects `eval.yaml` colocated with `SKILL.md`.** ' +
'This repo separates them into `.github/evals/' + skill + '/eval.yaml`, ' +
'so the "Evaluation Suite: Not Found" line below is a false negative โ ' +
'the eval actually ran (see the **Score** section above).\n\n';
const checkSection = maybeCollapse('โ
Check (compliance summary)', checkNote + checkBody);
// NOTE(review): the first array element below appears truncated in this
// view (a lone quote) — confirm against the original file.
sections.push([
'
'',
scoreSection,
'',
tokenSection,
'',
qualitySection,
'',
checkSection,
].join('\n'));
}
// Counts every downloaded `waza-results-*` directory, including any that
// did not match a known skill above.
const totalLegs = allDirs.length;
// Selection-mode banner from the prepare job.
// NOTE(review): the 'none' banner says the token comparison is "above", but
// `body` below places tokenCompareSection after the header that contains
// this banner — confirm the intended wording.
const prepareMode = (process.env.PREPARE_MODE || '').trim();
const prepareReason = (process.env.PREPARE_REASON || '').trim();
let scopeBanner = '';
if (prepareMode === 'none') {
scopeBanner =
'> โน๏ธ **No skills evaluated.** ' + (prepareReason || 'No relevant changes detected.') +
' The token comparison above (if any) is the only signal for this PR.';
} else if (prepareMode === 'subset') {
scopeBanner =
'> ๐ฏ **Diff-scoped run.** ' + (prepareReason || 'Only changed skills evaluated.') +
' Touch `.waza.yaml` or trigger `workflow_dispatch` to run the full matrix.';
} else if (prepareMode === 'single') {
scopeBanner =
'> ๐ฏ **Single-skill run.** ' + (prepareReason || 'workflow_dispatch input.');
} else if (prepareMode === 'full') {
scopeBanner =
'> ๐ **Full matrix run.** ' + (prepareReason || 'All configured skills evaluated.');
}
// Header lines; the `null` placeholder (used when there is no banner) is
// filtered out before joining. The first line is the marker used further
// down to find and update an existing comment.
const header = [
'<!-- waza-evals-comment -->',
'## ๐งช Waza skill evals (advisory)',
'',
scopeBanner,
scopeBanner ? '' : null,
'Ran ' + totalLegs + ' matrix leg' + (totalLegs === 1 ? '' : 's') +
' in parallel (skills ร models). Results are non-blocking โ investigate failures via the workflow logs and the per-leg `waza-results-*` artifacts.',
'',
'> **Legend:** Models flagged `baseline: true` in `.github/evals/manifest.yaml` (currently: `' +
(Array.from(baselineModels).join('`, `') || 'none') +
'`) run with `--baseline` (A/B mode) to cap quota. All other models run standard. Judge model is fixed at `claude-opus-4.7` across all legs.',
'',
].filter((line) => line !== null).join('\n');
// Assemble body. Each major block is separated by a blank line so
// that GitHub Flavored Markdown correctly recognizes the markup that
// starts each per-skill section (without a blank line after the
// preceding `</details>` it gets rendered as plain text).
const sectionsBlock = sections.length > 0
? sections.join('\n\n---\n\n')
: '_No artifacts produced. See workflow logs._';
// Trailing whitespace is trimmed from each part and empty parts are dropped
// so the joined body never contains runs of blank lines.
const body = [
header.replace(/\s+$/, ''),
tokenCompareSection.replace(/\s+$/, ''),
sectionsBlock,
].filter((s) => s.length > 0).join('\n\n') + '\n';
const { owner, repo } = context.repo;
const issue_number = context.payload.pull_request.number;
// Paginate to find our marker comment — listComments defaults to
// 30 per page and our comment may be beyond that on busy PRs.
let existing = null;
for await (const response of github.paginate.iterator(
github.rest.issues.listComments,
{ owner, repo, issue_number, per_page: 100 }
)) {
const found = response.data.find((c) => c.body && c.body.includes('<!-- waza-evals-comment -->'));
if (found) { existing = found; break; }
}
// Update the existing marker comment in place, or create it on first run.
if (existing) {
await github.rest.issues.updateComment({ owner, repo, comment_id: existing.id, body });
} else {
await github.rest.issues.createComment({ owner, repo, issue_number, body });
}