name: Waza agent evals
on:
pull_request:
paths:
- '.github/agents/**/*.agent.md'
- '.github/evals/agents/**'
- '.github/workflows/waza-agent-evals.yml'
workflow_dispatch:
inputs:
agent:
description: 'Single agent name to run (default: all agents with an eval directory)'
required: false
type: string
permissions:
contents: read
pull-requests: write
concurrency:
group: waza-agent-evals-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
env:
WAZA_VERSION: 'v0.33.0'
jobs:
preflight:
name: Preflight (check secrets)
runs-on: ubuntu-latest
timeout-minutes: 2
outputs:
enabled: ${{ steps.check.outputs.enabled }}
steps:
- name: Check COPILOT_GITHUB_TOKEN availability
id: check
env:
TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
run: |
if [ -z "${TOKEN:-}" ]; then
echo "enabled=false" >> "$GITHUB_OUTPUT"
echo "::notice::COPILOT_GITHUB_TOKEN secret is not set. Skipping all waza agent eval jobs. See repo README / PR #109 for setup."
exit 0
fi
# Token is set â verify it can actually read the private microsoft/waza
# repo (release downloads need access). Reject silently if 401/403/404.
# Capture headers + body for diagnostics (no token is ever printed).
hdr_file=$(mktemp)
body_file=$(mktemp)
http_code=$(curl -sS -D "${hdr_file}" -o "${body_file}" -w "%{http_code}" \
-H "Authorization: Bearer ${TOKEN}" \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
https://api.github.com/repos/microsoft/waza/releases/latest || true)
if [ "${http_code}" = "200" ]; then
echo "enabled=true" >> "$GITHUB_OUTPUT"
echo "COPILOT_GITHUB_TOKEN can read microsoft/waza â eval jobs will run."
else
echo "enabled=false" >> "$GITHUB_OUTPUT"
echo "::notice::COPILOT_GITHUB_TOKEN cannot read microsoft/waza (HTTP ${http_code}). Skipping all waza agent eval jobs."
echo "--- diagnostic: response headers (token not included) ---"
grep -iE '^(http|x-oauth-scopes|x-accepted-oauth-scopes|x-github-sso|x-ratelimit-remaining|x-ratelimit-used|x-github-request-id):' "${hdr_file}" || true
echo "--- diagnostic: response body (first 500 bytes) ---"
head -c 500 "${body_file}" || true
echo
echo "--- diagnostic: token-user identity probe ---"
user_code=$(curl -sS -o "${body_file}.user" -w "%{http_code}" \
-H "Authorization: Bearer ${TOKEN}" \
-H "Accept: application/vnd.github+json" \
https://api.github.com/user || true)
echo "GET /user -> HTTP ${user_code}"
if [ "${user_code}" = "200" ]; then
jq -r '"token user: \(.login) (type: \(.type))"' "${body_file}.user" 2>/dev/null || head -c 200 "${body_file}.user"
else
head -c 300 "${body_file}.user" || true
fi
echo
fi
rm -f "${hdr_file}" "${body_file}" "${body_file}.user"
prepare:
name: Determine matrix
runs-on: ubuntu-latest
timeout-minutes: 5
needs: preflight
if: needs.preflight.outputs.enabled == 'true'
outputs:
agents: ${{ steps.select.outputs.agents }}
legs: ${{ steps.select.outputs.legs }}
reason: ${{ steps.select.outputs.reason }}
mode: ${{ steps.select.outputs.mode }}
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Select agents
id: select
env:
REQUESTED: ${{ inputs.agent }}
EVENT: ${{ github.event_name }}
BASE_SHA: ${{ github.event.pull_request.base.sha }}
HEAD_SHA: ${{ github.event.pull_request.head.sha }}
run: |
set -euo pipefail
if [ -d .github/evals/agents ]; then
ALL_AGENTS="$(
find .github/evals/agents -mindepth 2 -maxdepth 2 -name eval.yaml \
| awk -F/ '{print $4}' \
| sort -u \
| jq -R -s -c 'split("\n") | map(select(length > 0))'
)"
else
ALL_AGENTS="[]"
fi
echo "ALL_AGENTS=$ALL_AGENTS"
emit() {
local selected="$1" mode="$2" reason="$3"
local legs
legs="$(echo "$selected" | jq -c '[ .[] | { agent: . } ]')"
{
echo "agents=${selected}"
echo "legs=${legs}"
echo "mode=${mode}"
echo "reason=${reason}"
} >> "$GITHUB_OUTPUT"
echo "Selected agents: ${selected}"
echo "Legs: ${legs}"
echo "Mode: ${mode}"
echo "Reason: ${reason}"
}
if [ "$EVENT" = "workflow_dispatch" ] && [ -n "${REQUESTED:-}" ]; then
if echo "$ALL_AGENTS" | jq -e --arg a "$REQUESTED" '. | index($a)' > /dev/null; then
emit "[\"$REQUESTED\"]" "single" "workflow_dispatch input ($REQUESTED)"
exit 0
else
echo "::error::Requested agent '$REQUESTED' has no eval directory under .github/evals/agents/ (available: $ALL_AGENTS)"
exit 1
fi
fi
if [ "$EVENT" = "workflow_dispatch" ]; then
emit "$ALL_AGENTS" "full" "workflow_dispatch (no input â full matrix)"
exit 0
fi
if [ -z "${BASE_SHA:-}" ] || [ -z "${HEAD_SHA:-}" ]; then
emit "$ALL_AGENTS" "full" "pull_request: missing base/head SHA â full matrix"
exit 0
fi
git fetch --no-tags origin "$BASE_SHA" 2>/dev/null || true
changed=$(git diff --name-only "$BASE_SHA" "$HEAD_SHA" || true)
if [ -z "$changed" ]; then
emit "[]" "none" "no files changed in PR"
exit 0
fi
echo "--- changed files ---"
echo "$changed"
echo "---------------------"
if echo "$changed" | grep -qE '^\.github/workflows/waza-agent-evals\.yml$'; then
emit "$ALL_AGENTS" "full" "workflow file changed â full matrix"
exit 0
fi
changed_agents=$(
echo "$changed" | awk -F/ '
/^\.github\/agents\/.+\.agent\.md$/ {
fname=$3
sub(/\.agent\.md$/, "", fname)
print fname
}
/^\.github\/evals\/agents\// && NF >= 5 {print $4}
' | sort -u
)
if [ -z "$changed_agents" ]; then
emit "[]" "none" "no per-agent files changed"
exit 0
fi
selected=$(
printf '%s\n' "$changed_agents" \
| jq -R -s -c --argjson all "$ALL_AGENTS" \
'[ split("\n")[] | select(length > 0) | select(IN($all[])) ]'
)
if [ "$selected" = "[]" ]; then
emit "[]" "none" "changed agent(s) have no eval directory: $(echo "$changed_agents" | tr '\n' ' ')"
exit 0
fi
count=$(echo "$selected" | jq 'length')
names=$(echo "$selected" | jq -r 'join(", ")')
emit "$selected" "subset" "diff-scoped: ${count} changed agent(s) â ${names}"
tokens:
name: Agent file token comparison vs main (advisory)
runs-on: ubuntu-latest
timeout-minutes: 10
needs: preflight
if: needs.preflight.outputs.enabled == 'true'
continue-on-error: true
env:
GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
WAZA_NO_UPDATE_CHECK: '1'
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Install waza (pinned release)
run: |
set -euo pipefail
waza_version="${WAZA_VERSION}"
if [ -z "${waza_version}" ]; then
echo "::error::WAZA_VERSION env var is not set"
exit 1
fi
os="$(uname -s | tr '[:upper:]' '[:lower:]')"
arch="$(uname -m)"
case "${arch}" in
x86_64|amd64) arch=amd64 ;;
aarch64|arm64) arch=arm64 ;;
esac
asset="waza-${os}-${arch}"
base="https://github.com/microsoft/waza/releases/download/${waza_version}"
tmp="$(mktemp -d)"
curl -fsSL -o "${tmp}/${asset}" "${base}/${asset}"
curl -fsSL -o "${tmp}/checksums.txt" "${base}/checksums.txt"
( cd "${tmp}" && grep " ${asset}$" checksums.txt | sha256sum -c - )
sudo install -m 0755 "${tmp}/${asset}" /usr/local/bin/waza
rm -rf "${tmp}"
waza --version
- name: Compare .agent.md token counts vs origin/main
shell: bash {0}
run: |
set -uo pipefail
mkdir -p .waza-results
# `waza tokens compare` without --skills walks every .md file
# in the repo. We post-filter to .github/agents/*.agent.md
# entries only. --threshold 0 keeps the exit code clean
# (advisory, never gates).
waza tokens compare origin/main --threshold 0 --format json \
> .waza-results/tokens-compare-raw.json 2>&1 || true
if jq -e 'type == "object" and has("files")' \
.waza-results/tokens-compare-raw.json > /dev/null 2>&1; then
jq '{
base: .base,
head: .head,
files: [ .files[]
| select((.path // "") | test("^\\.github/agents/.+\\.agent\\.md$")) ]
}' .waza-results/tokens-compare-raw.json \
> .waza-results/tokens-compare.json \
|| cp .waza-results/tokens-compare-raw.json .waza-results/tokens-compare.json
else
cp .waza-results/tokens-compare-raw.json .waza-results/tokens-compare.json
fi
echo "--- filtered agent token comparison ---"
cat .waza-results/tokens-compare.json || true
exit 0
- name: Upload token comparison artifact
if: always()
uses: actions/upload-artifact@v7
with:
name: waza-agent-tokens-compare
path: .waza-results/tokens-compare.json
retention-days: 14
if-no-files-found: warn
include-hidden-files: true
eval:
name: "${{ matrix.agent || 'eval (skipped â no agent changes)' }}"
needs: [preflight, prepare]
if: needs.preflight.outputs.enabled == 'true' && needs.prepare.outputs.legs != '[]' && needs.prepare.outputs.legs != ''
runs-on: ubuntu-latest
timeout-minutes: 20
continue-on-error: true
strategy:
fail-fast: false
max-parallel: 3
matrix:
include: ${{ fromJSON(needs.prepare.outputs.legs) }}
env:
GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
WAZA_NO_UPDATE_CHECK: '1'
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Install waza (pinned release)
run: |
set -euo pipefail
waza_version="${WAZA_VERSION}"
if [ -z "${waza_version}" ]; then
echo "::error::WAZA_VERSION env var is not set"
exit 1
fi
echo "Installing waza ${waza_version}"
os="$(uname -s | tr '[:upper:]' '[:lower:]')"
arch="$(uname -m)"
case "${arch}" in
x86_64|amd64) arch=amd64 ;;
aarch64|arm64) arch=arm64 ;;
esac
asset="waza-${os}-${arch}"
base="https://github.com/microsoft/waza/releases/download/${waza_version}"
tmp="$(mktemp -d)"
curl -fsSL -o "${tmp}/${asset}" "${base}/${asset}"
curl -fsSL -o "${tmp}/checksums.txt" "${base}/checksums.txt"
( cd "${tmp}" && grep " ${asset}$" checksums.txt | sha256sum -c - )
sudo install -m 0755 "${tmp}/${asset}" /usr/local/bin/waza
rm -rf "${tmp}"
waza --version
- name: Sync mirrored .agent.md from canonical .github/agents/
run: |
set -euo pipefail
agent="${{ matrix.agent }}"
src=".github/agents/${agent}.agent.md"
dst=".github/evals/agents/${agent}/${agent}.agent.md"
if [ -f "$src" ] && [ -d ".github/evals/agents/${agent}" ]; then
cp "$src" "$dst"
echo "Synced ${src} -> ${dst}"
else
echo "::warning::Missing canonical agent file or eval dir for ${agent}: src=${src}, dst-dir=.github/evals/agents/${agent}"
fi
- name: Run waza eval (advisory)
id: run
run: |
# GitHub's default shell is `bash -e`. `set -uo pipefail` does NOT
# disable -e, so a non-zero exit from `waza run` (e.g. metric below
# threshold) kills the script before `rc=$?` runs. Explicitly
# disable errexit so we can capture the code and surface it in the
# PR comment instead of failing the leg silently.
set +e
set -uo pipefail
mkdir -p .waza-results
agent="${{ matrix.agent }}"
spec=".github/evals/agents/${agent}/eval.yaml"
max_attempts=3
attempt=0
rc=0
while [ $attempt -lt $max_attempts ]; do
attempt=$((attempt + 1))
echo "::group::waza run attempt ${attempt}/${max_attempts} for ${agent}"
rc=0
waza run "${spec}" \
--model "claude-sonnet-4.6" \
--judge-model "claude-opus-4.7" \
--suggest \
--recommend \
--format "github-comment" \
--output ".waza-results/${agent}.json" \
--reporter "junit:.waza-results/${agent}.junit.xml" \
> ".waza-results/${agent}.md"
rc=$?
echo "::endgroup::"
if [ ! -f ".waza-results/${agent}.json" ]; then
echo "::warning::attempt ${attempt}: no JSON produced (rc=${rc})"
if [ $attempt -lt $max_attempts ]; then sleep 5; continue; fi
break
fi
infra_counts=$(jq -r '
[.tasks[]?.runs[]? | (.error_msg // "")] as $errs
| { session: ([$errs[] | select(contains("Session not found"))] | length),
grader: ([$errs[] | select(contains("failed to run grader"))] | length),
quota: ([$errs[] | select(contains("Failed to list models: 429"))] | length) }
| "\(.session) \(.grader) \(.quota)"
' ".waza-results/${agent}.json" 2>/dev/null || echo "0 0 0")
session_errs=$(echo "${infra_counts}" | awk '{print $1}')
grader_errs=$(echo "${infra_counts}" | awk '{print $2}')
quota_errs=$(echo "${infra_counts}" | awk '{print $3}')
total_infra=$((session_errs + grader_errs + quota_errs))
if [ "${total_infra}" = "0" ]; then
echo "::notice::${agent} attempt ${attempt} clean (no infra-failure errors)"
break
fi
echo "::warning::${agent} attempt ${attempt} hit ${session_errs} session-not-found + ${grader_errs} grader-infra + ${quota_errs} quota-429 error(s)"
if [ $attempt -lt $max_attempts ]; then
rm -f ".waza-results/${agent}.json" ".waza-results/${agent}.md" ".waza-results/${agent}.junit.xml"
if [ "${quota_errs}" != "0" ]; then sleep 30; else sleep 5; fi
fi
done
final_session=0
final_grader=0
final_quota=0
if [ -f ".waza-results/${agent}.json" ]; then
infra_counts=$(jq -r '
[.tasks[]?.runs[]? | (.error_msg // "")] as $errs
| { session: ([$errs[] | select(contains("Session not found"))] | length),
grader: ([$errs[] | select(contains("failed to run grader"))] | length),
quota: ([$errs[] | select(contains("Failed to list models: 429"))] | length) }
| "\(.session) \(.grader) \(.quota)"
' ".waza-results/${agent}.json" 2>/dev/null || echo "0 0 0")
final_session=$(echo "${infra_counts}" | awk '{print $1}')
final_grader=$(echo "${infra_counts}" | awk '{print $2}')
final_quota=$(echo "${infra_counts}" | awk '{print $3}')
fi
final_infra=$((final_session + final_grader + final_quota))
if [ "${final_infra}" != "0" ]; then
echo "::error::${agent} still has ${final_session} session-not-found + ${final_grader} grader-infra + ${final_quota} quota-429 error(s) after ${max_attempts} attempts â discarding corrupt artifact"
printf 'session_not_found_errors=%s\ngrader_failed_errors=%s\nquota_429_errors=%s\nattempts=%s\nlast_exit_code=%s\n' \
"${final_session}" "${final_grader}" "${final_quota}" "${max_attempts}" "${rc}" \
> ".waza-results/${agent}.infra-failed"
rm -f ".waza-results/${agent}.json" ".waza-results/${agent}.junit.xml"
{
printf '
printf 'waza run hit infra-level error(s) from the Copilot SDK '
printf 'after **%s attempt(s)**:\n\n' "${max_attempts}"
printf -- '- `Session not found` (JSON-RPC -32603): **%s**\n' "${final_session}"
printf -- '- `failed to run grader` (judge backend crash): **%s**\n' "${final_grader}"
printf -- '- `Failed to list models: 429` (Copilot quota): **%s**\n\n' "${final_quota}"
printf 'These error classes are transient infrastructure issues, '
printf 'not model-quality signal. **No score is reported for this leg** '
printf 'â treating a corrupted run as a low score would be misleading. '
printf 'See the workflow logs and the `waza-agent-results-%s` artifact for details.\n' "${agent}"
} > ".waza-results/${agent}.md"
fi
echo "exit_code=${rc}" >> "$GITHUB_OUTPUT"
echo
echo "--- captured PR-comment markdown ---"
cat ".waza-results/${agent}.md" || true
exit 0
- name: Agent signal â tokens profile + quality (advisory)
if: always()
run: |
set -uo pipefail
mkdir -p .waza-results
agent="${{ matrix.agent }}"
src=".github/agents/${agent}.agent.md"
if [ ! -f "$src" ]; then
echo "::warning::canonical agent file missing for ${agent}: ${src} â skipping signal steps"
exit 0
fi
stage_root="waza-agent-stage"
stage_dir="${stage_root}/${agent}"
rm -rf "$stage_dir"
mkdir -p "$stage_dir"
cp "$src" "${stage_dir}/SKILL.md"
echo "::group::waza tokens profile (${agent})"
waza tokens profile "$stage_dir" \
> ".waza-results/${agent}-tokens-profile.txt" 2>&1 || true
sed -i "s|${stage_root}/||g" ".waza-results/${agent}-tokens-profile.txt" || true
cat ".waza-results/${agent}-tokens-profile.txt" || true
echo "::endgroup::"
echo "::group::waza quality (${agent}) â LLM judge, ~1 premium req"
waza quality "$stage_dir" --format table \
> ".waza-results/${agent}-quality.txt" 2>&1 || true
sed -i "s|${stage_root}/||g" ".waza-results/${agent}-quality.txt" || true
cat ".waza-results/${agent}-quality.txt" || true
echo "::endgroup::"
rm -rf "$stage_root"
- name: Upload eval artifacts
if: always()
uses: actions/upload-artifact@v7
with:
name: waza-agent-results-${{ matrix.agent }}
path: .waza-results/
retention-days: 14
if-no-files-found: warn
include-hidden-files: true
comment:
name: Post advisory comment on PR
needs: [preflight, prepare, eval, tokens]
if: github.event_name == 'pull_request' && needs.preflight.outputs.enabled == 'true' && always()
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write
steps:
- name: Download all eval artifacts
uses: actions/download-artifact@v8
with:
path: artifacts
pattern: waza-agent-results-*
merge-multiple: false
- name: Download token comparison artifact
uses: actions/download-artifact@v8
with:
name: waza-agent-tokens-compare
path: artifacts/waza-agent-tokens-compare
continue-on-error: true
- name: Normalize artifact layout (handle v8 single-match flattening)
shell: bash
run: |
set -euo pipefail
if [ ! -d artifacts ]; then
echo "artifacts/ does not exist â nothing to normalize"
exit 0
fi
echo "--- artifact layout BEFORE normalization ---"
find artifacts -maxdepth 3 -mindepth 1 | sort
shopt -s nullglob
cd artifacts
for f in *.json *.md *.junit.xml *-quality.txt *-tokens-profile.txt *.infra-failed; do
[ -f "$f" ] || continue
agent="$f"
for suf in -quality.txt -tokens-profile.txt .junit.xml .json .md .infra-failed; do
agent="${agent%"$suf"}"
done
if [ -z "$agent" ] || [ "$agent" = "$f" ]; then
echo "::warning::Could not derive agent slug from filename '$f' â leaving in place"
continue
fi
mkdir -p "waza-agent-results-${agent}"
mv -- "$f" "waza-agent-results-${agent}/"
echo " moved: $f -> waza-agent-results-${agent}/"
done
cd -
echo "--- artifact layout AFTER normalization ---"
find artifacts -maxdepth 3 -mindepth 1 | sort
- name: Aggregate and post comment
uses: actions/github-script@v9
env:
PREPARE_MODE: ${{ needs.prepare.outputs.mode }}
PREPARE_REASON: ${{ needs.prepare.outputs.reason }}
PREPARE_AGENTS: ${{ needs.prepare.outputs.agents }}
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const fs = require('fs');
const path = require('path');
const agents = JSON.parse(process.env.PREPARE_AGENTS || '[]');
const root = 'artifacts';
const allDirs = fs.existsSync(root)
? fs.readdirSync(root)
.filter((d) => d.startsWith('waza-agent-results-'))
.sort()
: [];
// ---------------- helpers ----------------
function readFileOrNull(filePath) {
try {
if (!fs.existsSync(filePath)) return null;
const c = fs.readFileSync(filePath, 'utf8');
return c.length > 0 ? c : null;
} catch (e) {
core.debug(`readFileOrNull: ${filePath} -> ${e.message}`);
return null;
}
}
function readJsonOrNull(filePath) {
const raw = readFileOrNull(filePath);
if (!raw) return null;
try {
return JSON.parse(raw);
} catch (e) {
core.debug(`readJsonOrNull: parse failed for ${filePath}: ${e.message}`);
return null;
}
}
function fmtMs(ms) {
if (typeof ms !== 'number' || !isFinite(ms)) return 'â';
if (ms < 1000) return `${ms}ms`;
const s = ms / 1000;
if (s < 60) return `${s.toFixed(1)}s`;
const m = Math.floor(s / 60);
return `${m}m${(s - m * 60).toFixed(0)}s`;
}
function fmtTokens(n) {
if (typeof n !== 'number' || !isFinite(n) || n === 0) return 'â';
if (n < 1000) return String(n);
if (n < 1_000_000) return `${(n / 1000).toFixed(1)}K`;
return `${(n / 1_000_000).toFixed(2)}M`;
}
function scoreEmoji(score, succeeded, total) {
if (typeof score !== 'number') return 'â ī¸';
if (succeeded === total && total > 0) return 'â
';
if (succeeded > 0) return 'â ī¸';
return 'â';
}
function truncateText(text, maxLines) {
if (!text) return '';
const lines = text.split('\n');
if (lines.length <= maxLines) return text;
return lines.slice(0, maxLines).join('\n')
+ `\nâĻ (${lines.length - maxLines} more lines truncated)`;
}
// ---------------- top-level: agent token compare ----------------
const tcPath = path.join(root, 'waza-agent-tokens-compare', 'tokens-compare.json');
const tc = readJsonOrNull(tcPath);
let tokenCompareSection = '';
if (tc) {
const files = Array.isArray(tc.files) ? tc.files : [];
if (files.length > 0) {
const rows = files
.map((f) => {
const before = (f.base_tokens != null) ? f.base_tokens
: (f.before != null) ? f.before : 'â';
const after = (f.head_tokens != null) ? f.head_tokens
: (f.after != null) ? f.after
: (f.tokens != null) ? f.tokens : 'â';
const delta = (f.delta != null) ? f.delta
: (typeof before === 'number' && typeof after === 'number')
? (after - before) : 'â';
const pct = (f.percent_change != null) ? `${f.percent_change.toFixed(1)}%`
: (typeof before === 'number' && before > 0 && typeof delta === 'number')
? `${(delta * 100 / before).toFixed(1)}%`
: 'â';
const sign = (typeof delta === 'number' && delta > 0) ? '+' : '';
return `| \`${f.path}\` | ${before} | ${after} | ${sign}${delta} | ${pct} |`;
})
.join('\n');
tokenCompareSection = [
'<details><summary>đ Agent file token comparison vs <code>main</code> (advisory)</summary>',
'',
`| File | Base | Head | Î | % |`,
`|---|---|---|---|---|`,
rows,
'',
'</details>',
].join('\n');
} else {
tokenCompareSection = [
'<details><summary>đ Agent file token comparison vs <code>main</code> (advisory)</summary>',
'',
'_No `.agent.md` files changed vs `main` (or token-compare returned no entries)._',
'',
'</details>',
].join('\n');
}
}
// ---------------- per-agent sections ----------------
const byAgent = new Map();
for (const d of allDirs) {
const agent = d.replace(/^waza-agent-results-/, '');
byAgent.set(agent, d);
}
const sections = [];
for (const agent of agents) {
const dir = byAgent.get(agent);
if (!dir) {
sections.push([
`
'',
'_No artifact produced. See workflow logs._',
].join('\n'));
continue;
}
const jsonPath = path.join(root, dir, `${agent}.json`);
const rawMd = readFileOrNull(path.join(root, dir, `${agent}.md`));
const json = readJsonOrNull(jsonPath);
if (!json) {
// Fall back to raw github-comment markdown if JSON is unavailable.
sections.push([
`
'',
rawMd || '_No output captured. See workflow logs._',
].join('\n'));
continue;
}
const summary = json.summary || {};
const usage = summary.usage || {};
const total = summary.total_tests || 0;
const ok = summary.succeeded || 0;
const failed = summary.failed || 0;
const score = (typeof summary.aggregate_score === 'number')
? summary.aggregate_score.toFixed(2) : 'â';
const emoji = scoreEmoji(summary.aggregate_score, ok, total);
const headline =
`**Score:** ${emoji} ${score} (${ok}/${total} tasks) | ` +
`**Duration:** ${fmtMs(summary.duration_ms)} | ` +
`**Cost:** ${usage.premium_requests ?? 'â'} premium req${usage.premium_requests === 1 ? '' : 's'}, ` +
`${usage.turns ?? 'â'} turns | ` +
`**Tokens:** ${fmtTokens(usage.input_tokens)} in / ${fmtTokens(usage.output_tokens)} out` +
(usage.cache_read_tokens ? ` / ${fmtTokens(usage.cache_read_tokens)} cache-read` : '');
// Per-task table.
const tasks = Array.isArray(json.tasks) ? json.tasks : [];
const taskRows = tasks.map((t) => {
const run0 = (t.runs && t.runs[0]) || {};
const sd = run0.session_digest || {};
const taskScore = (typeof t.score === 'number') ? t.score.toFixed(2)
: (typeof run0.score === 'number') ? run0.score.toFixed(2)
: 'â';
const passed = run0.status === 'passed' || run0.status === 'pass';
const statusEmoji = passed ? 'â
' : (run0.status === 'error' ? 'â ī¸' : 'â');
const toolCalls = sd.tool_call_count ?? 'â';
const graders = (run0.validations)
? Object.keys(run0.validations).join(', ')
: 'â';
const name = (t.display_name || t.name || t.id || '(unnamed)')
.replace(/\|/g, '\\|');
return `| ${name} | ${taskScore} | ${statusEmoji} | ${toolCalls} | ${graders} |`;
}).join('\n');
const taskTable = tasks.length > 0
? [
'| Task | Score | Status | Tool calls | Graders |',
'|---|---|---|---|---|',
taskRows,
].join('\n')
: '_No task data in JSON output._';
// Per-agent signal sections (parity with skills workflow):
// model-independent, fed by per-leg `waza tokens profile`
// and `waza quality` output. Both files live alongside the
// eval JSON/markdown in the same artifact dir.
const tokensProfilePath = path.join(root, dir, `${agent}-tokens-profile.txt`);
const tokensProfileRaw = readFileOrNull(tokensProfilePath);
const tokensSection = tokensProfileRaw
? [
'<details><summary>đĸ Tokens (count + profile)</summary>',
'',
'```',
tokensProfileRaw.trim(),
'```',
'',
'</details>',
].join('\n')
: '';
const qualityPath = path.join(root, dir, `${agent}-quality.txt`);
const qualityRaw = readFileOrNull(qualityPath);
const qualitySection = qualityRaw
? [
'<details><summary>đ¯ Quality (5-dim table)</summary>',
'',
'```',
qualityRaw.trim(),
'```',
'',
'_Scored by `waza quality` with `claude-sonnet-4.6` as LLM judge. The agent\'s `.agent.md` is staged as `SKILL.md` for analysis; treat dimensions as advisory signal (the rubric was authored for skills)._',
'',
'</details>',
].join('\n')
: '';
// Failure details (only when something failed).
const failureDetails = [];
for (const t of tasks) {
const run0 = (t.runs && t.runs[0]) || {};
if (run0.status === 'passed' || run0.status === 'pass') continue;
const name = t.display_name || t.name || t.id || '(unnamed)';
const lines = [`
const validations = run0.validations || {};
for (const [gname, v] of Object.entries(validations)) {
if (v.passed) continue;
const fb = (v.feedback || '_no feedback_').replace(/\n/g, ' ').slice(0, 400);
lines.push(`- â **${gname}** (${(v.score ?? 0).toFixed(2)}): ${fb}`);
}
const out = run0.final_output;
if (out && typeof out === 'string') {
lines.push('', '<details><summary>Agent output (truncated)</summary>', '', '```', truncateText(out, 30), '```', '', '</details>');
}
failureDetails.push(lines.join('\n'));
}
const failurePanel = (failed > 0 && failureDetails.length > 0)
? [
'<details open><summary>đ Failure details</summary>',
'',
failureDetails.join('\n\n---\n\n'),
'',
'</details>',
].join('\n')
: '';
// Suggestion / recommendation report (--suggest --recommend).
const sug = (json.metadata && json.metadata.suggestion_report) || null;
const rec = (json.metadata && json.metadata.recommendation_report) || null;
const suggestionParts = [];
if (sug && typeof sug === 'string' && sug.trim().length > 0) {
suggestionParts.push(sug.trim());
}
if (rec && typeof rec === 'string' && rec.trim().length > 0) {
if (suggestionParts.length > 0) suggestionParts.push('\n\n---\n\n');
suggestionParts.push(rec.trim());
}
const suggestionPanel = suggestionParts.length > 0
? [
failed > 0
? '<details open><summary>đĄ Suggestions / root-cause analysis</summary>'
: '<details><summary>đĄ Suggestions / recommendations</summary>',
'',
suggestionParts.join(''),
'',
'</details>',
].join('\n')
: '';
// Raw eval output (closed by default â fallback / drill-down).
const rawPanel = rawMd
? [
'<details><summary>đ Full eval output (raw <code>--format github-comment</code> markdown)</summary>',
'',
rawMd.trim(),
'',
'</details>',
].join('\n')
: '';
const parts = [
`
'',
headline,
'',
taskTable,
];
if (tokensSection) { parts.push('', tokensSection); }
if (qualitySection) { parts.push('', qualitySection); }
if (failurePanel) { parts.push('', failurePanel); }
if (suggestionPanel) { parts.push('', suggestionPanel); }
if (rawPanel) { parts.push('', rawPanel); }
sections.push(parts.join('\n'));
}
const totalLegs = allDirs.length;
const prepareMode = (process.env.PREPARE_MODE || '').trim();
const prepareReason = (process.env.PREPARE_REASON || '').trim();
let scopeBanner = '';
if (prepareMode === 'none') {
scopeBanner =
'> âšī¸ **No agents evaluated.** ' + (prepareReason || 'No relevant changes detected.');
} else if (prepareMode === 'subset') {
scopeBanner =
'> đ¯ **Diff-scoped run.** ' + (prepareReason || 'Only changed agents evaluated.') +
' Touch `.github/workflows/waza-agent-evals.yml` or trigger `workflow_dispatch` to run all agents.';
} else if (prepareMode === 'single') {
scopeBanner =
'> đ¯ **Single-agent run.** ' + (prepareReason || 'workflow_dispatch input.');
} else if (prepareMode === 'full') {
scopeBanner =
'> đ **Full matrix run.** ' + (prepareReason || 'All configured agents evaluated.');
}
const header = [
'<!-- waza-agent-evals-comment -->',
'## đ¤ Waza agent evals (advisory)',
'',
scopeBanner,
scopeBanner ? '' : null,
'Ran ' + totalLegs + ' agent eval' + (totalLegs === 1 ? '' : 's') +
' against `claude-sonnet-4.6`. Each eval consumes ~5 premium Copilot requests; results are non-blocking â investigate failures via the workflow logs and the per-agent `waza-agent-results-*` artifacts.',
'',
'> **How this works:** This workflow auto-syncs the canonical `.github/agents/<name>.agent.md` into the sibling mirror inside `.github/evals/agents/<name>/` before each run, so the score below reflects the version of the agent in this PR â not whatever was committed when the eval was first wired up.',
'',
].filter((line) => line !== null).join('\n');
const sectionsBlock = sections.length > 0
? sections.join('\n\n---\n\n')
: '_No agents in scope for this PR._';
const body = [
header.replace(/\s+$/, ''),
tokenCompareSection.replace(/\s+$/, ''),
sectionsBlock,
].filter((s) => s.length > 0).join('\n\n') + '\n';
const { owner, repo } = context.repo;
const issue_number = context.payload.pull_request.number;
const { data: comments } = await github.rest.issues.listComments({ owner, repo, issue_number });
const existing = comments.find((c) => c.body && c.body.includes('<!-- waza-agent-evals-comment -->'));
if (existing) {
await github.rest.issues.updateComment({ owner, repo, comment_id: existing.id, body });
} else {
await github.rest.issues.createComment({ owner, repo, issue_number, body });
}