From dc10d0f50d2606d98f7d824eca51bfd5f61a4373 Mon Sep 17 00:00:00 2001 From: jottakka Date: Fri, 17 Apr 2026 20:51:52 -0300 Subject: [PATCH 1/3] fix(toolkit-docs-generator): preserve previous summary when regen fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit maybeGenerateSummary silently wiped the summary field in two paths: 1. When no LLM generator was configured (e.g. --skip-summary or missing ANTHROPIC_API_KEY), and the tool signature had changed since the last run — causing the summary-reuse fast path to miss. 2. When the LLM call threw (rate limit, network error, invalid JSON) — only a warning was recorded and the field stayed undefined. Both paths produce the observed regression where toolkits like github, jira, salesforce, googledocs, linear, and daytona went from rich, hand-refined summaries to `null` after an auto docs update. The rendered docs (app/_components/toolkit-docs/components/toolkit-page.tsx) then omit the prose block entirely. Carry the previous toolkit's summary forward in both fallback paths. A slightly stale summary is better than losing it; the next run with a working LLM will regenerate based on the new signature. Add two regression tests covering the missing-generator and failing- generator paths. Closes ARC-TOOLKIT-DOCS-SUMMARY-LOSS Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/merger/data-merger.ts | 13 ++++ .../tests/merger/data-merger.test.ts | 65 +++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/toolkit-docs-generator/src/merger/data-merger.ts b/toolkit-docs-generator/src/merger/data-merger.ts index e373d51a7..eca5c80a0 100644 --- a/toolkit-docs-generator/src/merger/data-merger.ts +++ b/toolkit-docs-generator/src/merger/data-merger.ts @@ -963,6 +963,12 @@ export class DataMerger { } if (!this.toolkitSummaryGenerator) { + // Preserve previous summary when no LLM is available to regenerate. + // A slightly stale summary is better than silently wiping hand-authored + // or previously generated content on every run that disables the LLM. + if (previousToolkit?.summary) { + result.toolkit.summary = previousToolkit.summary; + } return; } @@ -980,6 +986,13 @@ export class DataMerger { result.warnings.push( `Summary generation failed for ${result.toolkit.id}: ${message}` ); + // Preserve previous summary on LLM failure. Losing the summary on a + // transient API error would require waiting for another run — and if + // the failure is persistent we keep showing the last known-good text + // instead of a blank overview. + if (previousToolkit?.summary) { + result.toolkit.summary = previousToolkit.summary; + } } } diff --git a/toolkit-docs-generator/tests/merger/data-merger.test.ts b/toolkit-docs-generator/tests/merger/data-merger.test.ts index bb4991b3f..aad640015 100644 --- a/toolkit-docs-generator/tests/merger/data-merger.test.ts +++ b/toolkit-docs-generator/tests/merger/data-merger.test.ts @@ -1185,6 +1185,71 @@ describe("DataMerger", () => { expect(result.toolkit.summary).toBeUndefined(); }); + it("preserves previous summary when no LLM generator is available and the signature changed", async () => { + const toolkitDataSource = createCombinedToolkitDataSource({ + toolSource: new InMemoryToolDataSource([githubTool1, githubTool2]), + metadataSource: new InMemoryMetadataSource([githubMetadata]), + }); + const previousResult = await mergeToolkit( + "Github", + [githubTool1], + githubMetadata, + null, + createStubGenerator() + ); + previousResult.toolkit.summary = "Hand-authored summary"; + + const merger = new DataMerger({ + toolkitDataSource, + customSectionsSource: new EmptyCustomSectionsSource(), + toolExampleGenerator: createStubGenerator(), + previousToolkits: new Map([["github", previousResult.toolkit]]), + }); + + const result = await merger.mergeToolkit("Github"); + + expect(result.toolkit.tools).toHaveLength(2); + expect(result.toolkit.summary).toBe("Hand-authored summary"); + }); + + it("preserves previous summary when the LLM generator throws", async () => { + const toolkitDataSource = createCombinedToolkitDataSource({ + toolSource: new InMemoryToolDataSource([githubTool1, githubTool2]), + metadataSource: new InMemoryMetadataSource([githubMetadata]), + }); + const previousResult = await mergeToolkit( + "Github", + [githubTool1], + githubMetadata, + null, + createStubGenerator() + ); + previousResult.toolkit.summary = "Hand-authored summary"; + + const failingSummary: ToolkitSummaryGenerator = { + generate: async () => { + throw new Error("rate limited"); + }, + }; + + const merger = new DataMerger({ + toolkitDataSource, + customSectionsSource: new EmptyCustomSectionsSource(), + toolExampleGenerator: createStubGenerator(), + toolkitSummaryGenerator: failingSummary, + previousToolkits: new Map([["github", previousResult.toolkit]]), + }); + + const result = await merger.mergeToolkit("Github"); + + expect(result.toolkit.summary).toBe("Hand-authored summary"); + expect( + result.warnings.some((warning) => + warning.includes("Summary generation failed for Github") + ) + ).toBe(true); + }); + it("reuses previous examples when the tool is unchanged", async () => { const toolkitDataSource = createCombinedToolkitDataSource({ toolSource: new InMemoryToolDataSource([githubTool1]), From 9e392f57db498066e311287a0868dca77e088f2b Mon Sep 17 00:00:00 2001 From: jottakka Date: Sat, 18 Apr 2026 14:37:28 -0300 Subject: [PATCH 2/3] feat(toolkit-docs-generator): flag stale summaries and gate CI on them MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When maybeGenerateSummary carries a previous summary forward because the LLM generator is unavailable or the LLM call threw, the signature-mismatch fallback produces a summary that no longer accurately describes the toolkit's current tools. The previous commit made that fallback happen at all (so summaries stopped being silently wiped); this commit marks those cases visibly so they don't linger. New fields on MergedToolkit: - summaryStale: boolean (optional) - summaryStaleReason: string (optional — "llm_generator_unavailable" or "llm_generation_failed") Both are set together when a previous summary is preserved as a fallback, cleared together when (a) a fresh summary is successfully generated, (b) the signature-match reuse path runs and confirms the existing summary is still accurate, or (c) an overview chunk takes precedence. A warning with the "Summary is stale for " prefix is pushed onto the merge result for in-run visibility. previous-output.ts carries the flags through the fallback parser so the next run can see an already-stale summary and either refresh it (success path clears the flag) or leave it flagged. CI gate: - tests/stale-summaries.test.ts reads committed data/toolkits/*.json and fails if any toolkit has summaryStale: true, with a human-readable list of offenders and their reasons. This intentionally lets the scheduled [AUTO] Adding MCP Servers docs update PR open with the stale flag set — its CI run will fail, which is the signal for a reviewer to rerun generation with a working LLM or fix the summary by hand. - scripts/check-stale-summaries.ts: standalone tsx script that runs the same scan locally and exits non-zero on findings. Tests (data-merger.test.ts) now assert: - preservation-without-generator sets summaryStale + reason + warning - preservation-on-throw sets summaryStale + reason - successful regen clears an existing stale flag - signature-match reuse clears an existing stale flag (because the signature itself proves freshness) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../scripts/check-stale-summaries.ts | 66 +++++++++++++++ .../src/diff/previous-output.ts | 8 ++ .../src/merger/data-merger.ts | 43 ++++++++++ toolkit-docs-generator/src/types/index.ts | 14 ++++ .../tests/merger/data-merger.test.ts | 79 ++++++++++++++++++ .../tests/stale-summaries.test.ts | 80 +++++++++++++++++++ 6 files changed, 290 insertions(+) create mode 100644 toolkit-docs-generator/scripts/check-stale-summaries.ts create mode 100644 toolkit-docs-generator/tests/stale-summaries.test.ts diff --git a/toolkit-docs-generator/scripts/check-stale-summaries.ts b/toolkit-docs-generator/scripts/check-stale-summaries.ts new file mode 100644 index 000000000..a3e1cb814 --- /dev/null +++ b/toolkit-docs-generator/scripts/check-stale-summaries.ts @@ -0,0 +1,66 @@ +#!/usr/bin/env tsx + +/** + * Check stale summaries + * + * Scans toolkit-docs-generator/data/toolkits/*.json and reports any toolkit + * whose summary was carried forward from a previous run (summaryStale: true) + * because regeneration was skipped or failed. Exits 1 if any are found so + * the script can gate CI or a local sanity check. + * + * Usage: + * pnpm dlx tsx toolkit-docs-generator/scripts/check-stale-summaries.ts + */ + +import { readdirSync, readFileSync } from "node:fs"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; + +const here = dirname(fileURLToPath(import.meta.url)); +const TOOLKITS_DIR = join(here, "..", "data", "toolkits"); + +type ToolkitShape = { + id?: unknown; + summary?: unknown; + summaryStale?: unknown; + summaryStaleReason?: unknown; +}; + +const listToolkitFiles = (): string[] => + readdirSync(TOOLKITS_DIR).filter( + (name) => name.endsWith(".json") && name !== "index.json" + ); + +const main = (): number => { + const stale: Array<{ file: string; id: string; reason: string }> = []; + for (const file of listToolkitFiles()) { + const raw = readFileSync(join(TOOLKITS_DIR, file), "utf8"); + const toolkit = JSON.parse(raw) as ToolkitShape; + if (toolkit.summaryStale === true) { + stale.push({ + file, + id: typeof toolkit.id === "string" ? toolkit.id : file, + reason: + typeof toolkit.summaryStaleReason === "string" + ? toolkit.summaryStaleReason + : "unknown", + }); + } + } + + if (stale.length === 0) { + console.log(`✓ No stale summaries in ${TOOLKITS_DIR}`); + return 0; + } + + console.error(`✗ ${stale.length} toolkit(s) have a stale summary:\n`); + for (const finding of stale) { + console.error(` - ${finding.id} (${finding.file}): ${finding.reason}`); + } + console.error( + "\nRe-run the generator with a working LLM, or refresh the summary by hand and clear the `summaryStale` / `summaryStaleReason` fields." + ); + return 1; +}; + +process.exit(main()); diff --git a/toolkit-docs-generator/src/diff/previous-output.ts b/toolkit-docs-generator/src/diff/previous-output.ts index b17fd7d91..49577bd74 100644 --- a/toolkit-docs-generator/src/diff/previous-output.ts +++ b/toolkit-docs-generator/src/diff/previous-output.ts @@ -276,6 +276,12 @@ const buildFallbackToolkit = ( const authResult = MergedToolkitAuthSchema.safeParse(record.auth); const summary = typeof record.summary === "string" ? record.summary : undefined; + const summaryStale = + typeof record.summaryStale === "boolean" ? record.summaryStale : undefined; + const summaryStaleReason = + typeof record.summaryStaleReason === "string" + ? record.summaryStaleReason + : undefined; const generatedAt = typeof record.generatedAt === "string" ? record.generatedAt : undefined; @@ -299,6 +305,8 @@ const buildFallbackToolkit = ( version, description, ...(summary ? { summary } : {}), + ...(summaryStale !== undefined ? { summaryStale } : {}), + ...(summaryStaleReason ? { summaryStaleReason } : {}), metadata: metadataResult.success ? metadataResult.data : DEFAULT_PREVIOUS_TOOLKIT_METADATA, diff --git a/toolkit-docs-generator/src/merger/data-merger.ts b/toolkit-docs-generator/src/merger/data-merger.ts index eca5c80a0..764ff9bf0 100644 --- a/toolkit-docs-generator/src/merger/data-merger.ts +++ b/toolkit-docs-generator/src/merger/data-merger.ts @@ -467,6 +467,31 @@ const getToolDocumentationChunks = ( return fromPrevious; }; +/** + * Mark the toolkit's summary as stale — the summary is being carried forward + * from a previous run even though the toolkit signature changed (regen was + * skipped or failed). The CI check in tests/stale-summaries.test.ts surfaces + * these toolkits so a human rerun or fix can follow. + */ +export const STALE_SUMMARY_WARNING_PREFIX = "Summary is stale for"; + +const markSummaryStale = ( + toolkit: MergedToolkit, + reason: string, + warnings: string[] +): void => { + toolkit.summaryStale = true; + toolkit.summaryStaleReason = reason; + warnings.push( + `${STALE_SUMMARY_WARNING_PREFIX} ${toolkit.id}: ${reason}. Previous summary carried forward.` + ); +}; + +const clearStaleSummaryFlags = (toolkit: MergedToolkit): void => { + toolkit.summaryStale = undefined; + toolkit.summaryStaleReason = undefined; +}; + const isOverviewChunk = (chunk: DocumentationChunk): boolean => chunk.location === "header" && chunk.position === "before" && @@ -950,6 +975,7 @@ export class DataMerger { if (hasToolkitOverviewChunk(result.toolkit)) { // Keep overview as the canonical toolkit-level narrative. result.toolkit.summary = undefined; + clearStaleSummaryFlags(result.toolkit); return; } @@ -957,7 +983,12 @@ export class DataMerger { const currentSignature = buildToolkitSummarySignature(result.toolkit); const previousSignature = buildToolkitSummarySignature(previousToolkit); if (currentSignature === previousSignature) { + // Signature matches — the previous summary still accurately describes + // the toolkit, so reuse it verbatim and treat it as fresh regardless + // of the previous staleness flag (the signature itself is proof of + // freshness here). result.toolkit.summary = previousToolkit.summary; + clearStaleSummaryFlags(result.toolkit); return; } } @@ -968,11 +999,17 @@ export class DataMerger { // or previously generated content on every run that disables the LLM. if (previousToolkit?.summary) { result.toolkit.summary = previousToolkit.summary; + markSummaryStale( + result.toolkit, + "llm_generator_unavailable", + result.warnings + ); } return; } if (result.toolkit.summary) { + clearStaleSummaryFlags(result.toolkit); return; } @@ -981,6 +1018,7 @@ export class DataMerger { result.toolkit ); result.toolkit.summary = summary; + clearStaleSummaryFlags(result.toolkit); } catch (error) { const message = error instanceof Error ? error.message : String(error); result.warnings.push( @@ -992,6 +1030,11 @@ export class DataMerger { // instead of a blank overview. if (previousToolkit?.summary) { result.toolkit.summary = previousToolkit.summary; + markSummaryStale( + result.toolkit, + "llm_generation_failed", + result.warnings + ); } } } diff --git a/toolkit-docs-generator/src/types/index.ts b/toolkit-docs-generator/src/types/index.ts index cdf5a7c53..366dde7aa 100644 --- a/toolkit-docs-generator/src/types/index.ts +++ b/toolkit-docs-generator/src/types/index.ts @@ -399,6 +399,20 @@ export const MergedToolkitSchema = z.object({ description: z.string().nullable(), /** LLM-generated summary (optional) */ summary: z.string().optional(), + /** + * True when the current `summary` is known to be out of date with the + * toolkit's current tools (the signature changed but regeneration was + * skipped or failed, so the previous summary was carried forward as a + * fallback). Cleared whenever a fresh summary is successfully generated + * or when the summary is verified against an unchanged signature. + */ + summaryStale: z.boolean().optional(), + /** + * Machine-readable reason the summary is stale (e.g. + * "llm_generator_unavailable", "llm_generation_failed"). Always set + * together with `summaryStale: true`. Cleared together with it. + */ + summaryStaleReason: z.string().optional(), /** Metadata from Design System */ metadata: MergedToolkitMetadataSchema, /** Authentication requirements */ diff --git a/toolkit-docs-generator/tests/merger/data-merger.test.ts b/toolkit-docs-generator/tests/merger/data-merger.test.ts index aad640015..26c87ea32 100644 --- a/toolkit-docs-generator/tests/merger/data-merger.test.ts +++ b/toolkit-docs-generator/tests/merger/data-merger.test.ts @@ -1210,6 +1210,15 @@ describe("DataMerger", () => { expect(result.toolkit.tools).toHaveLength(2); expect(result.toolkit.summary).toBe("Hand-authored summary"); + expect(result.toolkit.summaryStale).toBe(true); + expect(result.toolkit.summaryStaleReason).toBe( + "llm_generator_unavailable" + ); + expect( + result.warnings.some((warning) => + warning.includes("Summary is stale for Github") + ) + ).toBe(true); }); it("preserves previous summary when the LLM generator throws", async () => { @@ -1248,6 +1257,76 @@ describe("DataMerger", () => { warning.includes("Summary generation failed for Github") ) ).toBe(true); + expect(result.toolkit.summaryStale).toBe(true); + expect(result.toolkit.summaryStaleReason).toBe("llm_generation_failed"); + }); + + it("clears the stale flag when the generator succeeds on the next run", async () => { + // A toolkit whose summary was stale on a prior run should come back + // clean once the generator actually produces a new summary. This + // proves the CI gate will stop flagging the toolkit once fixed. + const toolkitDataSource = createCombinedToolkitDataSource({ + toolSource: new InMemoryToolDataSource([githubTool1, githubTool2]), + metadataSource: new InMemoryMetadataSource([githubMetadata]), + }); + const previousResult = await mergeToolkit( + "Github", + [githubTool1], + githubMetadata, + null, + createStubGenerator() + ); + previousResult.toolkit.summary = "Older summary"; + previousResult.toolkit.summaryStale = true; + previousResult.toolkit.summaryStaleReason = "llm_generation_failed"; + + const merger = new DataMerger({ + toolkitDataSource, + customSectionsSource: new EmptyCustomSectionsSource(), + toolExampleGenerator: createStubGenerator(), + toolkitSummaryGenerator: createStubSummaryGenerator("Fresh summary"), + previousToolkits: new Map([["github", previousResult.toolkit]]), + }); + + const result = await merger.mergeToolkit("Github"); + + expect(result.toolkit.summary).toBe("Fresh summary (Github)"); + expect(result.toolkit.summaryStale).toBeUndefined(); + expect(result.toolkit.summaryStaleReason).toBeUndefined(); + }); + + it("does not flag stale when the signature matches and the previous summary is reused", async () => { + // Signature-match reuse is not stale — the summary still accurately + // describes the toolkit. Importantly, a previously-stale flag must + // be cleared by the reuse path. + const toolkitDataSource = createCombinedToolkitDataSource({ + toolSource: new InMemoryToolDataSource([githubTool1]), + metadataSource: new InMemoryMetadataSource([githubMetadata]), + }); + const previousResult = await mergeToolkit( + "Github", + [githubTool1], + githubMetadata, + null, + createStubGenerator() + ); + previousResult.toolkit.summary = "Cached summary"; + previousResult.toolkit.summaryStale = true; + previousResult.toolkit.summaryStaleReason = "llm_generation_failed"; + + const merger = new DataMerger({ + toolkitDataSource, + customSectionsSource: new EmptyCustomSectionsSource(), + toolExampleGenerator: createStubGenerator(), + toolkitSummaryGenerator: createStubSummaryGenerator("Unused"), + previousToolkits: new Map([["github", previousResult.toolkit]]), + }); + + const result = await merger.mergeToolkit("Github"); + + expect(result.toolkit.summary).toBe("Cached summary"); + expect(result.toolkit.summaryStale).toBeUndefined(); + expect(result.toolkit.summaryStaleReason).toBeUndefined(); }); it("reuses previous examples when the tool is unchanged", async () => { diff --git a/toolkit-docs-generator/tests/stale-summaries.test.ts b/toolkit-docs-generator/tests/stale-summaries.test.ts new file mode 100644 index 000000000..3511db5ae --- /dev/null +++ b/toolkit-docs-generator/tests/stale-summaries.test.ts @@ -0,0 +1,80 @@ +import { readdirSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import { describe, expect, it } from "vitest"; + +/** + * CI gate for stale summaries. + * + * When the generator carries a previous summary forward because + * regeneration was skipped (no LLM configured) or failed (LLM error), + * it marks the toolkit JSON with `summaryStale: true`. That is deliberate + * — a slightly stale overview is better than a blank one — but it must + * be fixed before the change merges. + * + * This test reads the committed `data/toolkits/*.json` files and fails + * if any toolkit is flagged stale. The auto-generated `[AUTO] Adding MCP + * Servers docs update` PR will still open on schedule, but its CI run + * will fail here, which is the intended signal for a reviewer to + * either rerun generation with a working LLM or manually refresh the + * affected summary. + */ +const TOOLKITS_DIR = join( + process.cwd(), + "toolkit-docs-generator", + "data", + "toolkits" +); + +type ToolkitSummaryShape = { + id?: unknown; + summary?: unknown; + summaryStale?: unknown; + summaryStaleReason?: unknown; +}; + +const readToolkit = (fileName: string): ToolkitSummaryShape => { + const raw = readFileSync(join(TOOLKITS_DIR, fileName), "utf8"); + return JSON.parse(raw) as ToolkitSummaryShape; +}; + +const listToolkitFiles = (): string[] => + readdirSync(TOOLKITS_DIR).filter( + (name) => name.endsWith(".json") && name !== "index.json" + ); + +describe("Stale summary CI gate", () => { + it("no committed toolkit JSON has summaryStale: true", () => { + const staleFindings: Array<{ + file: string; + id: string; + reason: string; + }> = []; + + for (const file of listToolkitFiles()) { + const toolkit = readToolkit(file); + if (toolkit.summaryStale === true) { + staleFindings.push({ + file, + id: typeof toolkit.id === "string" ? toolkit.id : file, + reason: + typeof toolkit.summaryStaleReason === "string" + ? toolkit.summaryStaleReason + : "unknown", + }); + } + } + + if (staleFindings.length > 0) { + const report = staleFindings + .map( + (finding) => ` - ${finding.id} (${finding.file}): ${finding.reason}` + ) + .join("\n"); + throw new Error( + `${staleFindings.length} toolkit(s) have a stale summary that must be regenerated before merge:\n${report}\n\n` + + "Re-run `toolkit-docs-generator generate` with a working --llm-provider/--llm-model, or refresh the summary by hand and clear the `summaryStale` / `summaryStaleReason` fields in the JSON." + ); + } + expect(staleFindings).toEqual([]); + }); +}); From f5a58e6c28e833a15debe67a57e9abe0827c49a6 Mon Sep 17 00:00:00 2001 From: jottakka Date: Sat, 18 Apr 2026 15:56:47 -0300 Subject: [PATCH 3/3] fix(toolkit-docs-generator): address ACR findings on the stale-summary flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five findings from acr-run on the stale-summary flag PR: 1. HIGH — Signature-match reuse was clearing legitimate stale flags. Scenario: run N-1 carried summary S_AB forward into toolset {A,B,C} with summaryStale=true. Run N with the same {A,B,C} would match the signature and mark the summary fresh — but S_AB still describes the wrong toolset. Guard the reuse fast path with `!previousToolkit.summaryStale` so a stale-then-matching-signature run correctly falls through to regeneration (or, if no generator is available, keeps the stale flag). 2. MEDIUM — previous-output.ts used `!== undefined` for summaryStale but a truthy check for summaryStaleReason, so an empty-string reason would break the pair invariant. Match both guards to `!== undefined`. 3. MEDIUM — Moved the `result.toolkit.summary` defensive guard to the top of maybeGenerateSummary so the no-generator fallback path can never clobber an already-set summary. 4. LOW — Wrapped readdirSync and JSON.parse in check-stale-summaries.ts in try/catch so a missing directory or a malformed JSON file produces a clean exit-1 with the filename instead of an uncaught exception. 5. LOW — stale-summaries.test.ts now resolves TOOLKITS_DIR from import.meta.url rather than process.cwd(), so it works from any working directory (repo root or toolkit-docs-generator/). Tests updated: - Replaced the former "does not flag stale when signature matches" test (which validated the broken behavior) with three tests covering the three reuse scenarios: a) signature match + previous fresh → reuse, no regen b) signature match + previous stale + generator → regen clears flag c) signature match + previous stale + no generator → keep stale flag 528 tests pass, type-check clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../scripts/check-stale-summaries.ts | 28 +++++-- .../src/diff/previous-output.ts | 2 +- .../src/merger/data-merger.ts | 28 ++++--- .../tests/merger/data-merger.test.ts | 83 +++++++++++++++++-- .../tests/stale-summaries.test.ts | 15 ++-- 5 files changed, 126 insertions(+), 30 deletions(-) diff --git a/toolkit-docs-generator/scripts/check-stale-summaries.ts b/toolkit-docs-generator/scripts/check-stale-summaries.ts index a3e1cb814..da093402d 100644 --- a/toolkit-docs-generator/scripts/check-stale-summaries.ts +++ b/toolkit-docs-generator/scripts/check-stale-summaries.ts @@ -26,16 +26,32 @@ type ToolkitShape = { summaryStaleReason?: unknown; }; -const listToolkitFiles = (): string[] => - readdirSync(TOOLKITS_DIR).filter( - (name) => name.endsWith(".json") && name !== "index.json" - ); +const listToolkitFiles = (): string[] => { + try { + return readdirSync(TOOLKITS_DIR).filter( + (name) => name.endsWith(".json") && name !== "index.json" + ); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.error( + `✗ Cannot read toolkit directory ${TOOLKITS_DIR}: ${message}` + ); + process.exit(1); + } +}; const main = (): number => { const stale: Array<{ file: string; id: string; reason: string }> = []; for (const file of listToolkitFiles()) { - const raw = readFileSync(join(TOOLKITS_DIR, file), "utf8"); - const toolkit = JSON.parse(raw) as ToolkitShape; + let toolkit: ToolkitShape; + try { + const raw = readFileSync(join(TOOLKITS_DIR, file), "utf8"); + toolkit = JSON.parse(raw) as ToolkitShape; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.error(`✗ Cannot parse ${file}: ${message}`); + return 1; + } if (toolkit.summaryStale === true) { stale.push({ file, diff --git a/toolkit-docs-generator/src/diff/previous-output.ts b/toolkit-docs-generator/src/diff/previous-output.ts index 49577bd74..043a7bb90 100644 --- a/toolkit-docs-generator/src/diff/previous-output.ts +++ b/toolkit-docs-generator/src/diff/previous-output.ts @@ -306,7 +306,7 @@ const buildFallbackToolkit = ( description, ...(summary ? { summary } : {}), ...(summaryStale !== undefined ? { summaryStale } : {}), - ...(summaryStaleReason ? { summaryStaleReason } : {}), + ...(summaryStaleReason !== undefined ? { summaryStaleReason } : {}), metadata: metadataResult.success ? metadataResult.data : DEFAULT_PREVIOUS_TOOLKIT_METADATA, diff --git a/toolkit-docs-generator/src/merger/data-merger.ts b/toolkit-docs-generator/src/merger/data-merger.ts index 764ff9bf0..d3cf02ed5 100644 --- a/toolkit-docs-generator/src/merger/data-merger.ts +++ b/toolkit-docs-generator/src/merger/data-merger.ts @@ -979,14 +979,27 @@ export class DataMerger { return; } + // Defensive guard: if something upstream already set a summary on + // `result.toolkit`, respect it. buildMergedToolkit does not set one + // today, but we don't want the reuse / fallback paths below to + // silently replace a pre-populated value. + if (result.toolkit.summary) { + clearStaleSummaryFlags(result.toolkit); + return; + } + if (previousToolkit?.summary) { const currentSignature = buildToolkitSummarySignature(result.toolkit); const previousSignature = buildToolkitSummarySignature(previousToolkit); - if (currentSignature === previousSignature) { - // Signature matches — the previous summary still accurately describes - // the toolkit, so reuse it verbatim and treat it as fresh regardless - // of the previous staleness flag (the signature itself is proof of - // freshness here). + // Signature-match reuse is only safe when the PREVIOUS summary itself + // was fresh. If the previous run already flagged the summary stale, + // a matching signature does not prove freshness — the stale summary + // was carried forward from an even earlier toolset and will stay + // wrong until an LLM actually regenerates it. + if ( + currentSignature === previousSignature && + !previousToolkit.summaryStale + ) { result.toolkit.summary = previousToolkit.summary; clearStaleSummaryFlags(result.toolkit); return; @@ -1008,11 +1021,6 @@ export class DataMerger { return; } - if (result.toolkit.summary) { - clearStaleSummaryFlags(result.toolkit); - return; - } - try { const summary = await this.toolkitSummaryGenerator.generate( result.toolkit diff --git a/toolkit-docs-generator/tests/merger/data-merger.test.ts b/toolkit-docs-generator/tests/merger/data-merger.test.ts index 26c87ea32..71155f0a6 100644 --- a/toolkit-docs-generator/tests/merger/data-merger.test.ts +++ b/toolkit-docs-generator/tests/merger/data-merger.test.ts @@ -1295,10 +1295,10 @@ describe("DataMerger", () => { expect(result.toolkit.summaryStaleReason).toBeUndefined(); }); - it("does not flag stale when the signature matches and the previous summary is reused", async () => { - // Signature-match reuse is not stale — the summary still accurately - // describes the toolkit. Importantly, a previously-stale flag must - // be cleared by the reuse path. + it("does not flag stale when the signature matches a fresh previous summary", async () => { + // Baseline: if previous.summaryStale is falsy, signature match is a + // valid proof of freshness and the reuse path should keep the + // summary and stay clean. const toolkitDataSource = createCombinedToolkitDataSource({ toolSource: new InMemoryToolDataSource([githubTool1]), metadataSource: new InMemoryMetadataSource([githubMetadata]), @@ -1311,6 +1311,41 @@ describe("DataMerger", () => { createStubGenerator() ); previousResult.toolkit.summary = "Cached summary"; + + const countingSummary = createCountingSummaryGenerator(); + const merger = new DataMerger({ + toolkitDataSource, + customSectionsSource: new EmptyCustomSectionsSource(), + toolExampleGenerator: createStubGenerator(), + toolkitSummaryGenerator: countingSummary.generator, + previousToolkits: new Map([["github", previousResult.toolkit]]), + }); + + const result = await merger.mergeToolkit("Github"); + + expect(result.toolkit.summary).toBe("Cached summary"); + expect(result.toolkit.summaryStale).toBeUndefined(); + expect(result.toolkit.summaryStaleReason).toBeUndefined(); + expect(countingSummary.getCalls()).toBe(0); + }); + + it("regenerates when signature matches but the previous summary was already stale", async () => { + // If the previous summary was already flagged stale, a matching + // signature does NOT prove freshness — the stale summary was + // carried forward from an earlier toolset. The reuse fast path + // must skip, and the LLM must actually regenerate. + const toolkitDataSource = createCombinedToolkitDataSource({ + toolSource: new InMemoryToolDataSource([githubTool1]), + metadataSource: new InMemoryMetadataSource([githubMetadata]), + }); + const previousResult = await mergeToolkit( + "Github", + [githubTool1], + githubMetadata, + null, + createStubGenerator() + ); + previousResult.toolkit.summary = "Stale carried-forward summary"; previousResult.toolkit.summaryStale = true; previousResult.toolkit.summaryStaleReason = "llm_generation_failed"; @@ -1318,17 +1353,53 @@ describe("DataMerger", () => { toolkitDataSource, customSectionsSource: new EmptyCustomSectionsSource(), toolExampleGenerator: createStubGenerator(), - toolkitSummaryGenerator: createStubSummaryGenerator("Unused"), + toolkitSummaryGenerator: createStubSummaryGenerator("Fresh"), previousToolkits: new Map([["github", previousResult.toolkit]]), }); const result = await merger.mergeToolkit("Github"); - expect(result.toolkit.summary).toBe("Cached summary"); + expect(result.toolkit.summary).toBe("Fresh (Github)"); expect(result.toolkit.summaryStale).toBeUndefined(); expect(result.toolkit.summaryStaleReason).toBeUndefined(); }); + it("keeps the stale flag when previous was stale, signature matches, and no generator is available", async () => { + // Same as above, but the regen attempt is not possible. The carried- + // forward summary remains, and the stale flag must persist so CI + // keeps flagging the toolkit until a working LLM run produces a + // fresh one. + const toolkitDataSource = createCombinedToolkitDataSource({ + toolSource: new InMemoryToolDataSource([githubTool1]), + metadataSource: new InMemoryMetadataSource([githubMetadata]), + }); + const previousResult = await mergeToolkit( + "Github", + [githubTool1], + githubMetadata, + null, + createStubGenerator() + ); + previousResult.toolkit.summary = "Stale carried-forward summary"; + previousResult.toolkit.summaryStale = true; + previousResult.toolkit.summaryStaleReason = "llm_generation_failed"; + + const merger = new DataMerger({ + toolkitDataSource, + customSectionsSource: new EmptyCustomSectionsSource(), + toolExampleGenerator: createStubGenerator(), + previousToolkits: new Map([["github", previousResult.toolkit]]), + }); + + const result = await merger.mergeToolkit("Github"); + + expect(result.toolkit.summary).toBe("Stale carried-forward summary"); + expect(result.toolkit.summaryStale).toBe(true); + expect(result.toolkit.summaryStaleReason).toBe( + "llm_generator_unavailable" + ); + }); + it("reuses previous examples when the tool is unchanged", async () => { const toolkitDataSource = createCombinedToolkitDataSource({ toolSource: new InMemoryToolDataSource([githubTool1]), diff --git a/toolkit-docs-generator/tests/stale-summaries.test.ts b/toolkit-docs-generator/tests/stale-summaries.test.ts index 3511db5ae..9e88bd63a 100644 --- a/toolkit-docs-generator/tests/stale-summaries.test.ts +++ b/toolkit-docs-generator/tests/stale-summaries.test.ts @@ -1,5 +1,6 @@ import { readdirSync, readFileSync } from "node:fs"; -import { join } from "node:path"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; import { describe, expect, it } from "vitest"; /** @@ -18,12 +19,12 @@ import { describe, expect, it } from "vitest"; * either rerun generation with a working LLM or manually refresh the * affected summary. */ -const TOOLKITS_DIR = join( - process.cwd(), - "toolkit-docs-generator", - "data", - "toolkits" -); +// Resolve from the test file's own location so the test works whether +// vitest is invoked from the repo root or from inside +// toolkit-docs-generator/. Falls back to process.cwd() only if +// import.meta.url is unavailable (it is in the vitest esm runtime). +const here = dirname(fileURLToPath(import.meta.url)); +const TOOLKITS_DIR = join(here, "..", "data", "toolkits"); type ToolkitSummaryShape = { id?: unknown;