From dc10d0f50d2606d98f7d824eca51bfd5f61a4373 Mon Sep 17 00:00:00 2001
From: jottakka <fjlimal@gmail.com>
Date: Fri, 17 Apr 2026 20:51:52 -0300
Subject: [PATCH 1/3] fix(toolkit-docs-generator): preserve previous summary
 when regen fails
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

maybeGenerateSummary silently wiped the summary field in two paths:

1. When no LLM generator was configured (e.g. --skip-summary or missing
   ANTHROPIC_API_KEY), and the tool signature had changed since the last
   run — causing the summary-reuse fast path to miss.
2. When the LLM call threw (rate limit, network error, invalid JSON) —
   only a warning was recorded and the field stayed undefined.

Both paths produce the observed regression where toolkits like github,
jira, salesforce, googledocs, linear, and daytona went from rich,
hand-refined summaries to `null` after an auto docs update. The
rendered docs (app/_components/toolkit-docs/components/toolkit-page.tsx)
then omit the prose block entirely.

Carry the previous toolkit's summary forward in both fallback paths.
A slightly stale summary is better than losing it; the next run with a
working LLM will regenerate based on the new signature.

Add two regression tests covering the missing-generator and failing-
generator paths.

Closes ARC-TOOLKIT-DOCS-SUMMARY-LOSS

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../src/merger/data-merger.ts                 | 13 ++++
 .../tests/merger/data-merger.test.ts          | 65 +++++++++++++++++++
 2 files changed, 78 insertions(+)

diff --git a/toolkit-docs-generator/src/merger/data-merger.ts b/toolkit-docs-generator/src/merger/data-merger.ts
index e373d51a7..eca5c80a0 100644
--- a/toolkit-docs-generator/src/merger/data-merger.ts
+++ b/toolkit-docs-generator/src/merger/data-merger.ts
@@ -963,6 +963,12 @@ export class DataMerger {
     }
 
     if (!this.toolkitSummaryGenerator) {
+      // Preserve previous summary when no LLM is available to regenerate.
+      // A slightly stale summary is better than silently wiping hand-authored
+      // or previously generated content on every run that disables the LLM.
+      if (previousToolkit?.summary) {
+        result.toolkit.summary = previousToolkit.summary;
+      }
       return;
     }
 
@@ -980,6 +986,13 @@ export class DataMerger {
       result.warnings.push(
         `Summary generation failed for ${result.toolkit.id}: ${message}`
       );
+      // Preserve previous summary on LLM failure. Losing the summary on a
+      // transient API error would require waiting for another run — and if
+      // the failure is persistent we keep showing the last known-good text
+      // instead of a blank overview.
+      if (previousToolkit?.summary) {
+        result.toolkit.summary = previousToolkit.summary;
+      }
     }
   }
 
diff --git a/toolkit-docs-generator/tests/merger/data-merger.test.ts b/toolkit-docs-generator/tests/merger/data-merger.test.ts
index bb4991b3f..aad640015 100644
--- a/toolkit-docs-generator/tests/merger/data-merger.test.ts
+++ b/toolkit-docs-generator/tests/merger/data-merger.test.ts
@@ -1185,6 +1185,71 @@ describe("DataMerger", () => {
       expect(result.toolkit.summary).toBeUndefined();
     });
 
+    it("preserves previous summary when no LLM generator is available and the signature changed", async () => {
+      const toolkitDataSource = createCombinedToolkitDataSource({
+        toolSource: new InMemoryToolDataSource([githubTool1, githubTool2]),
+        metadataSource: new InMemoryMetadataSource([githubMetadata]),
+      });
+      const previousResult = await mergeToolkit(
+        "Github",
+        [githubTool1],
+        githubMetadata,
+        null,
+        createStubGenerator()
+      );
+      previousResult.toolkit.summary = "Hand-authored summary";
+
+      const merger = new DataMerger({
+        toolkitDataSource,
+        customSectionsSource: new EmptyCustomSectionsSource(),
+        toolExampleGenerator: createStubGenerator(),
+        previousToolkits: new Map([["github", previousResult.toolkit]]),
+      });
+
+      const result = await merger.mergeToolkit("Github");
+
+      expect(result.toolkit.tools).toHaveLength(2);
+      expect(result.toolkit.summary).toBe("Hand-authored summary");
+    });
+
+    it("preserves previous summary when the LLM generator throws", async () => {
+      const toolkitDataSource = createCombinedToolkitDataSource({
+        toolSource: new InMemoryToolDataSource([githubTool1, githubTool2]),
+        metadataSource: new InMemoryMetadataSource([githubMetadata]),
+      });
+      const previousResult = await mergeToolkit(
+        "Github",
+        [githubTool1],
+        githubMetadata,
+        null,
+        createStubGenerator()
+      );
+      previousResult.toolkit.summary = "Hand-authored summary";
+
+      const failingSummary: ToolkitSummaryGenerator = {
+        generate: async () => {
+          throw new Error("rate limited");
+        },
+      };
+
+      const merger = new DataMerger({
+        toolkitDataSource,
+        customSectionsSource: new EmptyCustomSectionsSource(),
+        toolExampleGenerator: createStubGenerator(),
+        toolkitSummaryGenerator: failingSummary,
+        previousToolkits: new Map([["github", previousResult.toolkit]]),
+      });
+
+      const result = await merger.mergeToolkit("Github");
+
+      expect(result.toolkit.summary).toBe("Hand-authored summary");
+      expect(
+        result.warnings.some((warning) =>
+          warning.includes("Summary generation failed for Github")
+        )
+      ).toBe(true);
+    });
+
     it("reuses previous examples when the tool is unchanged", async () => {
       const toolkitDataSource = createCombinedToolkitDataSource({
         toolSource: new InMemoryToolDataSource([githubTool1]),

From 9e392f57db498066e311287a0868dca77e088f2b Mon Sep 17 00:00:00 2001
From: jottakka <fjlimal@gmail.com>
Date: Sat, 18 Apr 2026 14:37:28 -0300
Subject: [PATCH 2/3] feat(toolkit-docs-generator): flag stale summaries and
 gate CI on them
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When maybeGenerateSummary carries a previous summary forward because the
LLM generator is unavailable or the LLM call threw, the signature-mismatch
fallback produces a summary that no longer accurately describes the
toolkit's current tools. The previous commit made that fallback happen at
all (so summaries stopped being silently wiped); this commit marks those
cases visibly so they don't linger.

New fields on MergedToolkit:
- summaryStale: boolean (optional)
- summaryStaleReason: string (optional — "llm_generator_unavailable" or
  "llm_generation_failed")

Both are set together when a previous summary is preserved as a fallback,
cleared together when (a) a fresh summary is successfully generated,
(b) the signature-match reuse path runs and confirms the existing summary
is still accurate, or (c) an overview chunk takes precedence. A warning
with the "Summary is stale for <id>" prefix is pushed onto the merge
result for in-run visibility.

previous-output.ts carries the flags through the fallback parser so the
next run can see an already-stale summary and either refresh it (success
path clears the flag) or leave it flagged.

CI gate:
- tests/stale-summaries.test.ts reads committed data/toolkits/*.json and
  fails if any toolkit has summaryStale: true, with a human-readable
  list of offenders and their reasons. This intentionally lets the
  scheduled [AUTO] Adding MCP Servers docs update PR open with the stale
  flag set — its CI run will fail, which is the signal for a reviewer
  to rerun generation with a working LLM or fix the summary by hand.
- scripts/check-stale-summaries.ts: standalone tsx script that runs the
  same scan locally and exits non-zero on findings.

Tests (data-merger.test.ts) now assert:
- preservation-without-generator sets summaryStale + reason + warning
- preservation-on-throw sets summaryStale + reason
- successful regen clears an existing stale flag
- signature-match reuse clears an existing stale flag (because the
  signature itself proves freshness)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../scripts/check-stale-summaries.ts          | 66 +++++++++++++++
 .../src/diff/previous-output.ts               |  8 ++
 .../src/merger/data-merger.ts                 | 43 ++++++++++
 toolkit-docs-generator/src/types/index.ts     | 14 ++++
 .../tests/merger/data-merger.test.ts          | 79 ++++++++++++++++++
 .../tests/stale-summaries.test.ts             | 80 +++++++++++++++++++
 6 files changed, 290 insertions(+)
 create mode 100644 toolkit-docs-generator/scripts/check-stale-summaries.ts
 create mode 100644 toolkit-docs-generator/tests/stale-summaries.test.ts

diff --git a/toolkit-docs-generator/scripts/check-stale-summaries.ts b/toolkit-docs-generator/scripts/check-stale-summaries.ts
new file mode 100644
index 000000000..a3e1cb814
--- /dev/null
+++ b/toolkit-docs-generator/scripts/check-stale-summaries.ts
@@ -0,0 +1,66 @@
+#!/usr/bin/env tsx
+
+/**
+ * Check stale summaries
+ *
+ * Scans toolkit-docs-generator/data/toolkits/*.json and reports any toolkit
+ * whose summary was carried forward from a previous run (summaryStale: true)
+ * because regeneration was skipped or failed. Exits 1 if any are found so
+ * the script can gate CI or a local sanity check.
+ *
+ * Usage:
+ *   pnpm dlx tsx toolkit-docs-generator/scripts/check-stale-summaries.ts
+ */
+
+import { readdirSync, readFileSync } from "node:fs";
+import { dirname, join } from "node:path";
+import { fileURLToPath } from "node:url";
+
+const here = dirname(fileURLToPath(import.meta.url));
+const TOOLKITS_DIR = join(here, "..", "data", "toolkits");
+
+type ToolkitShape = {
+  id?: unknown;
+  summary?: unknown;
+  summaryStale?: unknown;
+  summaryStaleReason?: unknown;
+};
+
+const listToolkitFiles = (): string[] =>
+  readdirSync(TOOLKITS_DIR).filter(
+    (name) => name.endsWith(".json") && name !== "index.json"
+  );
+
+const main = (): number => {
+  const stale: Array<{ file: string; id: string; reason: string }> = [];
+  for (const file of listToolkitFiles()) {
+    const raw = readFileSync(join(TOOLKITS_DIR, file), "utf8");
+    const toolkit = JSON.parse(raw) as ToolkitShape;
+    if (toolkit.summaryStale === true) {
+      stale.push({
+        file,
+        id: typeof toolkit.id === "string" ? toolkit.id : file,
+        reason:
+          typeof toolkit.summaryStaleReason === "string"
+            ? toolkit.summaryStaleReason
+            : "unknown",
+      });
+    }
+  }
+
+  if (stale.length === 0) {
+    console.log(`✓ No stale summaries in ${TOOLKITS_DIR}`);
+    return 0;
+  }
+
+  console.error(`✗ ${stale.length} toolkit(s) have a stale summary:\n`);
+  for (const finding of stale) {
+    console.error(`  - ${finding.id} (${finding.file}): ${finding.reason}`);
+  }
+  console.error(
+    "\nRe-run the generator with a working LLM, or refresh the summary by hand and clear the `summaryStale` / `summaryStaleReason` fields."
+  );
+  return 1;
+};
+
+process.exit(main());
diff --git a/toolkit-docs-generator/src/diff/previous-output.ts b/toolkit-docs-generator/src/diff/previous-output.ts
index b17fd7d91..49577bd74 100644
--- a/toolkit-docs-generator/src/diff/previous-output.ts
+++ b/toolkit-docs-generator/src/diff/previous-output.ts
@@ -276,6 +276,12 @@ const buildFallbackToolkit = (
   const authResult = MergedToolkitAuthSchema.safeParse(record.auth);
   const summary =
     typeof record.summary === "string" ? record.summary : undefined;
+  const summaryStale =
+    typeof record.summaryStale === "boolean" ? record.summaryStale : undefined;
+  const summaryStaleReason =
+    typeof record.summaryStaleReason === "string"
+      ? record.summaryStaleReason
+      : undefined;
   const generatedAt =
     typeof record.generatedAt === "string" ? record.generatedAt : undefined;
 
@@ -299,6 +305,8 @@ const buildFallbackToolkit = (
       version,
       description,
       ...(summary ? { summary } : {}),
+      ...(summaryStale !== undefined ? { summaryStale } : {}),
+      ...(summaryStaleReason ? { summaryStaleReason } : {}),
       metadata: metadataResult.success
         ? metadataResult.data
         : DEFAULT_PREVIOUS_TOOLKIT_METADATA,
diff --git a/toolkit-docs-generator/src/merger/data-merger.ts b/toolkit-docs-generator/src/merger/data-merger.ts
index eca5c80a0..764ff9bf0 100644
--- a/toolkit-docs-generator/src/merger/data-merger.ts
+++ b/toolkit-docs-generator/src/merger/data-merger.ts
@@ -467,6 +467,31 @@ const getToolDocumentationChunks = (
   return fromPrevious;
 };
 
+/**
+ * Mark the toolkit's summary as stale — the summary is being carried forward
+ * from a previous run even though the toolkit signature changed (regen was
+ * skipped or failed). The CI check in tests/stale-summaries.test.ts surfaces
+ * these toolkits so a human rerun or fix can follow.
+ */
+export const STALE_SUMMARY_WARNING_PREFIX = "Summary is stale for";
+
+const markSummaryStale = (
+  toolkit: MergedToolkit,
+  reason: string,
+  warnings: string[]
+): void => {
+  toolkit.summaryStale = true;
+  toolkit.summaryStaleReason = reason;
+  warnings.push(
+    `${STALE_SUMMARY_WARNING_PREFIX} ${toolkit.id}: ${reason}. Previous summary carried forward.`
+  );
+};
+
+const clearStaleSummaryFlags = (toolkit: MergedToolkit): void => {
+  toolkit.summaryStale = undefined;
+  toolkit.summaryStaleReason = undefined;
+};
+
 const isOverviewChunk = (chunk: DocumentationChunk): boolean =>
   chunk.location === "header" &&
   chunk.position === "before" &&
@@ -950,6 +975,7 @@ export class DataMerger {
     if (hasToolkitOverviewChunk(result.toolkit)) {
       // Keep overview as the canonical toolkit-level narrative.
       result.toolkit.summary = undefined;
+      clearStaleSummaryFlags(result.toolkit);
       return;
     }
 
@@ -957,7 +983,12 @@ export class DataMerger {
       const currentSignature = buildToolkitSummarySignature(result.toolkit);
       const previousSignature = buildToolkitSummarySignature(previousToolkit);
       if (currentSignature === previousSignature) {
+        // Signature matches — the previous summary still accurately describes
+        // the toolkit, so reuse it verbatim and treat it as fresh regardless
+        // of the previous staleness flag (the signature itself is proof of
+        // freshness here).
         result.toolkit.summary = previousToolkit.summary;
+        clearStaleSummaryFlags(result.toolkit);
         return;
       }
     }
@@ -968,11 +999,17 @@ export class DataMerger {
       // or previously generated content on every run that disables the LLM.
       if (previousToolkit?.summary) {
         result.toolkit.summary = previousToolkit.summary;
+        markSummaryStale(
+          result.toolkit,
+          "llm_generator_unavailable",
+          result.warnings
+        );
       }
       return;
     }
 
     if (result.toolkit.summary) {
+      clearStaleSummaryFlags(result.toolkit);
       return;
     }
 
@@ -981,6 +1018,7 @@ export class DataMerger {
         result.toolkit
       );
       result.toolkit.summary = summary;
+      clearStaleSummaryFlags(result.toolkit);
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       result.warnings.push(
@@ -992,6 +1030,11 @@ export class DataMerger {
       // instead of a blank overview.
       if (previousToolkit?.summary) {
         result.toolkit.summary = previousToolkit.summary;
+        markSummaryStale(
+          result.toolkit,
+          "llm_generation_failed",
+          result.warnings
+        );
       }
     }
   }
diff --git a/toolkit-docs-generator/src/types/index.ts b/toolkit-docs-generator/src/types/index.ts
index cdf5a7c53..366dde7aa 100644
--- a/toolkit-docs-generator/src/types/index.ts
+++ b/toolkit-docs-generator/src/types/index.ts
@@ -399,6 +399,20 @@ export const MergedToolkitSchema = z.object({
   description: z.string().nullable(),
   /** LLM-generated summary (optional) */
   summary: z.string().optional(),
+  /**
+   * True when the current `summary` is known to be out of date with the
+   * toolkit's current tools (the signature changed but regeneration was
+   * skipped or failed, so the previous summary was carried forward as a
+   * fallback). Cleared whenever a fresh summary is successfully generated
+   * or when the summary is verified against an unchanged signature.
+   */
+  summaryStale: z.boolean().optional(),
+  /**
+   * Machine-readable reason the summary is stale (e.g.
+   * "llm_generator_unavailable", "llm_generation_failed"). Always set
+   * together with `summaryStale: true`. Cleared together with it.
+   */
+  summaryStaleReason: z.string().optional(),
   /** Metadata from Design System */
   metadata: MergedToolkitMetadataSchema,
   /** Authentication requirements */
diff --git a/toolkit-docs-generator/tests/merger/data-merger.test.ts b/toolkit-docs-generator/tests/merger/data-merger.test.ts
index aad640015..26c87ea32 100644
--- a/toolkit-docs-generator/tests/merger/data-merger.test.ts
+++ b/toolkit-docs-generator/tests/merger/data-merger.test.ts
@@ -1210,6 +1210,15 @@ describe("DataMerger", () => {
 
       expect(result.toolkit.tools).toHaveLength(2);
       expect(result.toolkit.summary).toBe("Hand-authored summary");
+      expect(result.toolkit.summaryStale).toBe(true);
+      expect(result.toolkit.summaryStaleReason).toBe(
+        "llm_generator_unavailable"
+      );
+      expect(
+        result.warnings.some((warning) =>
+          warning.includes("Summary is stale for Github")
+        )
+      ).toBe(true);
     });
 
     it("preserves previous summary when the LLM generator throws", async () => {
@@ -1248,6 +1257,76 @@ describe("DataMerger", () => {
           warning.includes("Summary generation failed for Github")
         )
       ).toBe(true);
+      expect(result.toolkit.summaryStale).toBe(true);
+      expect(result.toolkit.summaryStaleReason).toBe("llm_generation_failed");
+    });
+
+    it("clears the stale flag when the generator succeeds on the next run", async () => {
+      // A toolkit whose summary was stale on a prior run should come back
+      // clean once the generator actually produces a new summary. This
+      // proves the CI gate will stop flagging the toolkit once fixed.
+      const toolkitDataSource = createCombinedToolkitDataSource({
+        toolSource: new InMemoryToolDataSource([githubTool1, githubTool2]),
+        metadataSource: new InMemoryMetadataSource([githubMetadata]),
+      });
+      const previousResult = await mergeToolkit(
+        "Github",
+        [githubTool1],
+        githubMetadata,
+        null,
+        createStubGenerator()
+      );
+      previousResult.toolkit.summary = "Older summary";
+      previousResult.toolkit.summaryStale = true;
+      previousResult.toolkit.summaryStaleReason = "llm_generation_failed";
+
+      const merger = new DataMerger({
+        toolkitDataSource,
+        customSectionsSource: new EmptyCustomSectionsSource(),
+        toolExampleGenerator: createStubGenerator(),
+        toolkitSummaryGenerator: createStubSummaryGenerator("Fresh summary"),
+        previousToolkits: new Map([["github", previousResult.toolkit]]),
+      });
+
+      const result = await merger.mergeToolkit("Github");
+
+      expect(result.toolkit.summary).toBe("Fresh summary (Github)");
+      expect(result.toolkit.summaryStale).toBeUndefined();
+      expect(result.toolkit.summaryStaleReason).toBeUndefined();
+    });
+
+    it("does not flag stale when the signature matches and the previous summary is reused", async () => {
+      // Signature-match reuse is not stale — the summary still accurately
+      // describes the toolkit. Importantly, a previously-stale flag must
+      // be cleared by the reuse path.
+      const toolkitDataSource = createCombinedToolkitDataSource({
+        toolSource: new InMemoryToolDataSource([githubTool1]),
+        metadataSource: new InMemoryMetadataSource([githubMetadata]),
+      });
+      const previousResult = await mergeToolkit(
+        "Github",
+        [githubTool1],
+        githubMetadata,
+        null,
+        createStubGenerator()
+      );
+      previousResult.toolkit.summary = "Cached summary";
+      previousResult.toolkit.summaryStale = true;
+      previousResult.toolkit.summaryStaleReason = "llm_generation_failed";
+
+      const merger = new DataMerger({
+        toolkitDataSource,
+        customSectionsSource: new EmptyCustomSectionsSource(),
+        toolExampleGenerator: createStubGenerator(),
+        toolkitSummaryGenerator: createStubSummaryGenerator("Unused"),
+        previousToolkits: new Map([["github", previousResult.toolkit]]),
+      });
+
+      const result = await merger.mergeToolkit("Github");
+
+      expect(result.toolkit.summary).toBe("Cached summary");
+      expect(result.toolkit.summaryStale).toBeUndefined();
+      expect(result.toolkit.summaryStaleReason).toBeUndefined();
     });
 
     it("reuses previous examples when the tool is unchanged", async () => {
diff --git a/toolkit-docs-generator/tests/stale-summaries.test.ts b/toolkit-docs-generator/tests/stale-summaries.test.ts
new file mode 100644
index 000000000..3511db5ae
--- /dev/null
+++ b/toolkit-docs-generator/tests/stale-summaries.test.ts
@@ -0,0 +1,80 @@
+import { readdirSync, readFileSync } from "node:fs";
+import { join } from "node:path";
+import { describe, expect, it } from "vitest";
+
+/**
+ * CI gate for stale summaries.
+ *
+ * When the generator carries a previous summary forward because
+ * regeneration was skipped (no LLM configured) or failed (LLM error),
+ * it marks the toolkit JSON with `summaryStale: true`. That is deliberate
+ * — a slightly stale overview is better than a blank one — but it must
+ * be fixed before the change merges.
+ *
+ * This test reads the committed `data/toolkits/*.json` files and fails
+ * if any toolkit is flagged stale. The auto-generated `[AUTO] Adding MCP
+ * Servers docs update` PR will still open on schedule, but its CI run
+ * will fail here, which is the intended signal for a reviewer to
+ * either rerun generation with a working LLM or manually refresh the
+ * affected summary.
+ */
+const TOOLKITS_DIR = join(
+  process.cwd(),
+  "toolkit-docs-generator",
+  "data",
+  "toolkits"
+);
+
+type ToolkitSummaryShape = {
+  id?: unknown;
+  summary?: unknown;
+  summaryStale?: unknown;
+  summaryStaleReason?: unknown;
+};
+
+const readToolkit = (fileName: string): ToolkitSummaryShape => {
+  const raw = readFileSync(join(TOOLKITS_DIR, fileName), "utf8");
+  return JSON.parse(raw) as ToolkitSummaryShape;
+};
+
+const listToolkitFiles = (): string[] =>
+  readdirSync(TOOLKITS_DIR).filter(
+    (name) => name.endsWith(".json") && name !== "index.json"
+  );
+
+describe("Stale summary CI gate", () => {
+  it("no committed toolkit JSON has summaryStale: true", () => {
+    const staleFindings: Array<{
+      file: string;
+      id: string;
+      reason: string;
+    }> = [];
+
+    for (const file of listToolkitFiles()) {
+      const toolkit = readToolkit(file);
+      if (toolkit.summaryStale === true) {
+        staleFindings.push({
+          file,
+          id: typeof toolkit.id === "string" ? toolkit.id : file,
+          reason:
+            typeof toolkit.summaryStaleReason === "string"
+              ? toolkit.summaryStaleReason
+              : "unknown",
+        });
+      }
+    }
+
+    if (staleFindings.length > 0) {
+      const report = staleFindings
+        .map(
+          (finding) => `  - ${finding.id} (${finding.file}): ${finding.reason}`
+        )
+        .join("\n");
+      throw new Error(
+        `${staleFindings.length} toolkit(s) have a stale summary that must be regenerated before merge:\n${report}\n\n` +
+          "Re-run `toolkit-docs-generator generate` with a working --llm-provider/--llm-model, or refresh the summary by hand and clear the `summaryStale` / `summaryStaleReason` fields in the JSON."
+      );
+    }
+    expect(staleFindings).toEqual([]);
+  });
+});

From f5a58e6c28e833a15debe67a57e9abe0827c49a6 Mon Sep 17 00:00:00 2001
From: jottakka <fjlimal@gmail.com>
Date: Sat, 18 Apr 2026 15:56:47 -0300
Subject: [PATCH 3/3] fix(toolkit-docs-generator): address ACR findings on the
 stale-summary flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Five findings from acr-run on the stale-summary flag PR:

1. HIGH — Signature-match reuse was clearing legitimate stale flags.
   Scenario: run N-1 carried summary S_AB forward into toolset
   {A,B,C} with summaryStale=true. Run N with the same {A,B,C} would
   match the signature and mark the summary fresh — but S_AB still
   describes the wrong toolset. Guard the reuse fast path with
   `!previousToolkit.summaryStale` so a stale-then-matching-signature
   run correctly falls through to regeneration (or, if no generator is
   available, keeps the stale flag).

2. MEDIUM — previous-output.ts used `!== undefined` for summaryStale
   but a truthy check for summaryStaleReason, so an empty-string reason
   would break the pair invariant. Match both guards to `!== undefined`.

3. MEDIUM — Moved the `result.toolkit.summary` defensive guard to the
   top of maybeGenerateSummary so the no-generator fallback path can
   never clobber an already-set summary.

4. LOW — Wrapped readdirSync and JSON.parse in check-stale-summaries.ts
   in try/catch so a missing directory or a malformed JSON file produces
   a clean exit-1 with the filename instead of an uncaught exception.

5. LOW — stale-summaries.test.ts now resolves TOOLKITS_DIR from
   import.meta.url rather than process.cwd(), so it works from any
   working directory (repo root or toolkit-docs-generator/).

Tests updated:
- Replaced the former "does not flag stale when signature matches" test
  (which validated the broken behavior) with three tests covering the
  three reuse scenarios:
  a) signature match + previous fresh → reuse, no regen
  b) signature match + previous stale + generator → regen clears flag
  c) signature match + previous stale + no generator → keep stale flag

528 tests pass, type-check clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../scripts/check-stale-summaries.ts          | 28 +++++--
 .../src/diff/previous-output.ts               |  2 +-
 .../src/merger/data-merger.ts                 | 28 ++++---
 .../tests/merger/data-merger.test.ts          | 83 +++++++++++++++++--
 .../tests/stale-summaries.test.ts             | 15 ++--
 5 files changed, 126 insertions(+), 30 deletions(-)

diff --git a/toolkit-docs-generator/scripts/check-stale-summaries.ts b/toolkit-docs-generator/scripts/check-stale-summaries.ts
index a3e1cb814..da093402d 100644
--- a/toolkit-docs-generator/scripts/check-stale-summaries.ts
+++ b/toolkit-docs-generator/scripts/check-stale-summaries.ts
@@ -26,16 +26,32 @@ type ToolkitShape = {
   summaryStaleReason?: unknown;
 };
 
-const listToolkitFiles = (): string[] =>
-  readdirSync(TOOLKITS_DIR).filter(
-    (name) => name.endsWith(".json") && name !== "index.json"
-  );
+const listToolkitFiles = (): string[] => {
+  try {
+    return readdirSync(TOOLKITS_DIR).filter(
+      (name) => name.endsWith(".json") && name !== "index.json"
+    );
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error);
+    console.error(
+      `✗ Cannot read toolkit directory ${TOOLKITS_DIR}: ${message}`
+    );
+    process.exit(1);
+  }
+};
 
 const main = (): number => {
   const stale: Array<{ file: string; id: string; reason: string }> = [];
   for (const file of listToolkitFiles()) {
-    const raw = readFileSync(join(TOOLKITS_DIR, file), "utf8");
-    const toolkit = JSON.parse(raw) as ToolkitShape;
+    let toolkit: ToolkitShape;
+    try {
+      const raw = readFileSync(join(TOOLKITS_DIR, file), "utf8");
+      toolkit = JSON.parse(raw) as ToolkitShape;
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      console.error(`✗ Cannot parse ${file}: ${message}`);
+      return 1;
+    }
     if (toolkit.summaryStale === true) {
       stale.push({
         file,
diff --git a/toolkit-docs-generator/src/diff/previous-output.ts b/toolkit-docs-generator/src/diff/previous-output.ts
index 49577bd74..043a7bb90 100644
--- a/toolkit-docs-generator/src/diff/previous-output.ts
+++ b/toolkit-docs-generator/src/diff/previous-output.ts
@@ -306,7 +306,7 @@ const buildFallbackToolkit = (
       description,
       ...(summary ? { summary } : {}),
       ...(summaryStale !== undefined ? { summaryStale } : {}),
-      ...(summaryStaleReason ? { summaryStaleReason } : {}),
+      ...(summaryStaleReason !== undefined ? { summaryStaleReason } : {}),
       metadata: metadataResult.success
         ? metadataResult.data
         : DEFAULT_PREVIOUS_TOOLKIT_METADATA,
diff --git a/toolkit-docs-generator/src/merger/data-merger.ts b/toolkit-docs-generator/src/merger/data-merger.ts
index 764ff9bf0..d3cf02ed5 100644
--- a/toolkit-docs-generator/src/merger/data-merger.ts
+++ b/toolkit-docs-generator/src/merger/data-merger.ts
@@ -979,14 +979,27 @@ export class DataMerger {
       return;
     }
 
+    // Defensive guard: if something upstream already set a summary on
+    // `result.toolkit`, respect it. buildMergedToolkit does not set one
+    // today, but we don't want the reuse / fallback paths below to
+    // silently replace a pre-populated value.
+    if (result.toolkit.summary) {
+      clearStaleSummaryFlags(result.toolkit);
+      return;
+    }
+
     if (previousToolkit?.summary) {
       const currentSignature = buildToolkitSummarySignature(result.toolkit);
       const previousSignature = buildToolkitSummarySignature(previousToolkit);
-      if (currentSignature === previousSignature) {
-        // Signature matches — the previous summary still accurately describes
-        // the toolkit, so reuse it verbatim and treat it as fresh regardless
-        // of the previous staleness flag (the signature itself is proof of
-        // freshness here).
+      // Signature-match reuse is only safe when the PREVIOUS summary itself
+      // was fresh. If the previous run already flagged the summary stale,
+      // a matching signature does not prove freshness — the stale summary
+      // was carried forward from an even earlier toolset and will stay
+      // wrong until an LLM actually regenerates it.
+      if (
+        currentSignature === previousSignature &&
+        !previousToolkit.summaryStale
+      ) {
         result.toolkit.summary = previousToolkit.summary;
         clearStaleSummaryFlags(result.toolkit);
         return;
@@ -1008,11 +1021,6 @@ export class DataMerger {
       return;
     }
 
-    if (result.toolkit.summary) {
-      clearStaleSummaryFlags(result.toolkit);
-      return;
-    }
-
     try {
       const summary = await this.toolkitSummaryGenerator.generate(
         result.toolkit
diff --git a/toolkit-docs-generator/tests/merger/data-merger.test.ts b/toolkit-docs-generator/tests/merger/data-merger.test.ts
index 26c87ea32..71155f0a6 100644
--- a/toolkit-docs-generator/tests/merger/data-merger.test.ts
+++ b/toolkit-docs-generator/tests/merger/data-merger.test.ts
@@ -1295,10 +1295,10 @@ describe("DataMerger", () => {
       expect(result.toolkit.summaryStaleReason).toBeUndefined();
     });
 
-    it("does not flag stale when the signature matches and the previous summary is reused", async () => {
-      // Signature-match reuse is not stale — the summary still accurately
-      // describes the toolkit. Importantly, a previously-stale flag must
-      // be cleared by the reuse path.
+    it("does not flag stale when the signature matches a fresh previous summary", async () => {
+      // Baseline: if previous.summaryStale is falsy, signature match is a
+      // valid proof of freshness and the reuse path should keep the
+      // summary and stay clean.
       const toolkitDataSource = createCombinedToolkitDataSource({
         toolSource: new InMemoryToolDataSource([githubTool1]),
         metadataSource: new InMemoryMetadataSource([githubMetadata]),
@@ -1311,6 +1311,41 @@ describe("DataMerger", () => {
         createStubGenerator()
       );
       previousResult.toolkit.summary = "Cached summary";
+
+      const countingSummary = createCountingSummaryGenerator();
+      const merger = new DataMerger({
+        toolkitDataSource,
+        customSectionsSource: new EmptyCustomSectionsSource(),
+        toolExampleGenerator: createStubGenerator(),
+        toolkitSummaryGenerator: countingSummary.generator,
+        previousToolkits: new Map([["github", previousResult.toolkit]]),
+      });
+
+      const result = await merger.mergeToolkit("Github");
+
+      expect(result.toolkit.summary).toBe("Cached summary");
+      expect(result.toolkit.summaryStale).toBeUndefined();
+      expect(result.toolkit.summaryStaleReason).toBeUndefined();
+      expect(countingSummary.getCalls()).toBe(0);
+    });
+
+    it("regenerates when signature matches but the previous summary was already stale", async () => {
+      // If the previous summary was already flagged stale, a matching
+      // signature does NOT prove freshness — the stale summary was
+      // carried forward from an earlier toolset. The reuse fast path
+      // must skip, and the LLM must actually regenerate.
+      const toolkitDataSource = createCombinedToolkitDataSource({
+        toolSource: new InMemoryToolDataSource([githubTool1]),
+        metadataSource: new InMemoryMetadataSource([githubMetadata]),
+      });
+      const previousResult = await mergeToolkit(
+        "Github",
+        [githubTool1],
+        githubMetadata,
+        null,
+        createStubGenerator()
+      );
+      previousResult.toolkit.summary = "Stale carried-forward summary";
       previousResult.toolkit.summaryStale = true;
       previousResult.toolkit.summaryStaleReason = "llm_generation_failed";
 
@@ -1318,17 +1353,53 @@ describe("DataMerger", () => {
         toolkitDataSource,
         customSectionsSource: new EmptyCustomSectionsSource(),
         toolExampleGenerator: createStubGenerator(),
-        toolkitSummaryGenerator: createStubSummaryGenerator("Unused"),
+        toolkitSummaryGenerator: createStubSummaryGenerator("Fresh"),
         previousToolkits: new Map([["github", previousResult.toolkit]]),
       });
 
       const result = await merger.mergeToolkit("Github");
 
-      expect(result.toolkit.summary).toBe("Cached summary");
+      expect(result.toolkit.summary).toBe("Fresh (Github)");
       expect(result.toolkit.summaryStale).toBeUndefined();
       expect(result.toolkit.summaryStaleReason).toBeUndefined();
     });
 
+    it("keeps the stale flag when previous was stale, signature matches, and no generator is available", async () => {
+      // Same as above, but the regen attempt is not possible. The carried-
+      // forward summary remains, and the stale flag must persist so CI
+      // keeps flagging the toolkit until a working LLM run produces a
+      // fresh one.
+      const toolkitDataSource = createCombinedToolkitDataSource({
+        toolSource: new InMemoryToolDataSource([githubTool1]),
+        metadataSource: new InMemoryMetadataSource([githubMetadata]),
+      });
+      const previousResult = await mergeToolkit(
+        "Github",
+        [githubTool1],
+        githubMetadata,
+        null,
+        createStubGenerator()
+      );
+      previousResult.toolkit.summary = "Stale carried-forward summary";
+      previousResult.toolkit.summaryStale = true;
+      previousResult.toolkit.summaryStaleReason = "llm_generation_failed";
+
+      const merger = new DataMerger({
+        toolkitDataSource,
+        customSectionsSource: new EmptyCustomSectionsSource(),
+        toolExampleGenerator: createStubGenerator(),
+        previousToolkits: new Map([["github", previousResult.toolkit]]),
+      });
+
+      const result = await merger.mergeToolkit("Github");
+
+      expect(result.toolkit.summary).toBe("Stale carried-forward summary");
+      expect(result.toolkit.summaryStale).toBe(true);
+      expect(result.toolkit.summaryStaleReason).toBe(
+        "llm_generator_unavailable"
+      );
+    });
+
     it("reuses previous examples when the tool is unchanged", async () => {
       const toolkitDataSource = createCombinedToolkitDataSource({
         toolSource: new InMemoryToolDataSource([githubTool1]),
diff --git a/toolkit-docs-generator/tests/stale-summaries.test.ts b/toolkit-docs-generator/tests/stale-summaries.test.ts
index 3511db5ae..9e88bd63a 100644
--- a/toolkit-docs-generator/tests/stale-summaries.test.ts
+++ b/toolkit-docs-generator/tests/stale-summaries.test.ts
@@ -1,5 +1,6 @@
 import { readdirSync, readFileSync } from "node:fs";
-import { join } from "node:path";
+import { dirname, join } from "node:path";
+import { fileURLToPath } from "node:url";
 import { describe, expect, it } from "vitest";
 
 /**
@@ -18,12 +19,12 @@ import { describe, expect, it } from "vitest";
  * either rerun generation with a working LLM or manually refresh the
  * affected summary.
  */
-const TOOLKITS_DIR = join(
-  process.cwd(),
-  "toolkit-docs-generator",
-  "data",
-  "toolkits"
-);
+// Resolve from the test file's own location so the test works whether
+// vitest is invoked from the repo root or from inside
+// toolkit-docs-generator/. Falls back to process.cwd() only if
+// import.meta.url is unavailable (it is in the vitest esm runtime).
+const here = dirname(fileURLToPath(import.meta.url));
+const TOOLKITS_DIR = join(here, "..", "data", "toolkits");
 
 type ToolkitSummaryShape = {
   id?: unknown;