From d1aca6872cd3b3639b94866b6ac67d626f83e2a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=91=E5=B8=83=E6=9E=97?= <11641432+heiheiyouyou@user.noreply.gitee.com> Date: Wed, 27 May 2026 10:46:33 +0800 Subject: [PATCH 1/6] fix: Optimization of the skill accumulation chain for negative examples --- .../core/config/defaults.ts | 5 + apps/memos-local-plugin/core/config/schema.ts | 8 + .../core/experience/feedback-builder.ts | 91 ++++++++- .../core/pipeline/memory-core.ts | 22 +++ apps/memos-local-plugin/core/skill/index.ts | 7 + .../core/skill/lifecycle.ts | 8 +- .../core/skill/repair-candidate.ts | 185 ++++++++++++++++++ apps/memos-local-plugin/core/skill/skill.ts | 9 +- .../core/skill/subscriber.ts | 48 ++++- apps/memos-local-plugin/core/skill/types.ts | 6 + .../migrations/013-skill-repair-origin.sql | 7 + .../core/storage/migrator.ts | 20 ++ .../core/storage/repos/skills.ts | 8 + apps/memos-local-plugin/core/types.ts | 13 ++ .../unit/experience/feedback-builder.test.ts | 50 +++++ .../tests/unit/skill/lifecycle.test.ts | 37 ++++ .../tests/unit/skill/repair-candidate.test.ts | 150 ++++++++++++++ .../tests/unit/skill/subscriber.test.ts | 104 ++++++++++ 18 files changed, 767 insertions(+), 11 deletions(-) create mode 100644 apps/memos-local-plugin/core/skill/repair-candidate.ts create mode 100644 apps/memos-local-plugin/core/storage/migrations/013-skill-repair-origin.sql create mode 100644 apps/memos-local-plugin/tests/unit/skill/repair-candidate.test.ts diff --git a/apps/memos-local-plugin/core/config/defaults.ts b/apps/memos-local-plugin/core/config/defaults.ts index 1cf2d2cf6..3d59646b1 100644 --- a/apps/memos-local-plugin/core/config/defaults.ts +++ b/apps/memos-local-plugin/core/config/defaults.ts @@ -206,6 +206,11 @@ export const DEFAULT_CONFIG: ResolvedConfig = { etaDelta: 0.1, archiveEta: 0.1, minEtaForRetrieval: 0.1, + // Unproven repair candidates graduate at a higher η than normal + // candidates: with η born at the 0.1 floor, 0.5 means a candidate must + // 
genuinely (full-)pass its trial(s) — a single fail (η→0.05) archives + // it rather than letting a weak signal promote it. + repairCandidateMinEta: 0.5, }, feedback: { failureThreshold: 3, diff --git a/apps/memos-local-plugin/core/config/schema.ts b/apps/memos-local-plugin/core/config/schema.ts index 7c9ff193b..8d485a36f 100644 --- a/apps/memos-local-plugin/core/config/schema.ts +++ b/apps/memos-local-plugin/core/config/schema.ts @@ -288,6 +288,14 @@ const AlgorithmSchema = Type.Object({ archiveEta: NumberInRange(0.1, 0, 1), /** Hide Tier-1 skills whose η is below this. Mirrors retrieval.minSkillEta. */ minEtaForRetrieval: NumberInRange(0.1, 0, 1), + /** + * Graduation floor for *repair-origin* candidates (unproven fixes minted + * from a failure). Higher than `minEtaForRetrieval` on purpose: a repair + * has no success anchor, so promotion must require a majority of real + * trial passes, not a single lucky one. With η birth at the 0.1 floor and + * `candidateTrials` trials, 0.5 demands ~2-of-3 genuine passes. + */ + repairCandidateMinEta: NumberInRange(0.5, 0, 1), }, { default: {} }), feedback: Type.Object({ /** Raise a burst after this many failures of the same tool in-window. */ diff --git a/apps/memos-local-plugin/core/experience/feedback-builder.ts b/apps/memos-local-plugin/core/experience/feedback-builder.ts index f5a948d40..0eb87812d 100644 --- a/apps/memos-local-plugin/core/experience/feedback-builder.ts +++ b/apps/memos-local-plugin/core/experience/feedback-builder.ts @@ -206,6 +206,9 @@ async function buildDraft(args: { let procedure: string; let verification: string; let guidance: ReturnType; + // The LLM's "what to do" line, when refinement ran — used as the corrective + // fix text for a constructive negative (Q6). 
+ let llmProcedure: string | null = null; if (args.llm && (args.trace || args.episode)) { try { @@ -239,6 +242,7 @@ async function buildDraft(args: { preference: [], antiPattern: refined.caveats, }; + llmProcedure = refined.procedure; } catch (err) { // Fall back to rule-based extraction const fallback = buildDraftFallback(args, type, text); @@ -258,6 +262,19 @@ async function buildDraft(args: { guidance = fallback.guidance; } + // Q6: a constructive negative carries BOTH faces in one record — the + // avoidance ("don't do X", already in antiPattern) and the suggested fix + // ("do Y") as a preference. Only when the feedback actually names a + // corrective direction; a bare verdict ("wrong", reward 0) stays a pure + // warning and mints no fix (Q5: 没建设性就不沉淀修法). + const fix = fail ? constructiveFix(args.classified, llmProcedure) : null; + if (fix) { + guidance = { + preference: dedupeLines([...guidance.preference, fix]), + antiPattern: guidance.antiPattern, + }; + } + const boundary = [ "Use only for similar task shape, evaluator expectation, or user preference.", args.episode?.id ? `Source episode: ${args.episode.id}` : null, @@ -342,12 +359,74 @@ function guidanceOf( } if (type === "failure_avoidance") antiPattern.push(text); if (type === "repair_instruction" || type === "success_pattern") preference.push(text); + // Drop punctuation-only / empty captures (the classifier can extract "." from + // a soft preference match) so guidance never stores garbage. return { - preference: dedupeLines(preference), - antiPattern: dedupeLines(antiPattern), + preference: dedupeLines(preference.filter(substantive)), + antiPattern: dedupeLines(antiPattern.filter(substantive)), }; } +// Corrective-direction cues ("do Y"), conservative on purpose: meta prompts +// like "reflect on what to improve" are NOT cues, so a bare verdict mints no +// fix. The capture group holds the fix text. 
+const CORRECTIVE_CLAUSE_PATTERNS: readonly RegExp[] = [ + /\binstead[,\s]+\s*(?:use|try|apply|do|switch to)\s+(.{4,240})/i, + /\b(?:use|prefer|switch to|apply)\s+(.{4,240}?)\s+instead\b/i, + /\b(?:should|must|need to|needs to|have to)\s+(?:use|be|do|switch to|apply)\s+(.{4,240})/i, + /\b(?:use|prefer|switch to|apply)\s+(.{4,240})/i, + /(?:改用|应该用|应改为|换成|建议用|下次用|应该)\s*(.{2,120})/, +]; + +/** + * Reject only empty / pure-punctuation captures (e.g. the classifier's stray + * ".") — anything with at least one letter or digit is real content. A higher + * bar would wrongly drop short CJK guidance like "重复". + */ +function substantive(s: string | null | undefined): boolean { + if (!s) return false; + return /[\p{L}\p{N}]/u.test(s); +} + +function extractCorrectiveClause(text: string): string | null { + for (const re of CORRECTIVE_CLAUSE_PATTERNS) { + const m = re.exec(text); + const clause = m?.[1]?.trim(); + if (clause && substantive(clause)) return clause; + } + return null; +} + +/** + * For a failed/negative feedback, extract the *corrective direction* ("do Y") + * when the feedback actually contains one. Returns null when the feedback only + * delivers a verdict ("wrong", reward 0, plain TLE) with no reusable fix — those + * stay a pure avoidance warning and mint no repair candidate downstream + * (Q5: 没建设性就不沉淀修法). + * + * Gates on *substantive corrective text* rather than the classifier shape: the + * lexical classifier is noisy here (soft "instead/use" hits extract nothing, + * and pattern captures can grab punctuation like "."). The LLM's refined + * procedure is preferred when present; otherwise we extract a clause ourselves. 
+ */ +function constructiveFix( + classified: ReturnType, + llmProcedure: string | null, +): string | null { + const candidates = [ + llmProcedure, + extractCorrectiveClause(classified.text), + classified.prefer, + classified.correction, + classified.constraint, + ]; + for (const c of candidates) { + const s = c?.trim(); + if (s && substantive(s)) return cleanLine(s, MAX_LINE_CHARS); + } + return null; +} + async function embedPolicy( text: string, deps: FeedbackExperienceDeps, @@ -574,15 +653,19 @@ function verifierStats(raw: unknown): VerifierStats { }; } -type ObjectiveOutcome = "pass" | "fail" | "unknown"; +export type ObjectiveOutcome = "pass" | "fail" | "unknown"; /** * Authoritative success/failure from the verifier payload, falling back to the * episode reward. Strict scenarios (coding/math/verifier) treat ONLY a full pass * as positive: a partial pass (passed < total) or reward below full credit is a * failure, never a positive exemplar. + * + * Pass `rTask = null` for a *verifier-only* verdict: with no reward fallback it + * returns "unknown" when the payload carries no verifier signal. Used by strict + * repair-candidate trial resolution, which must never pass on a loose reward. */ -function objectiveOutcome(raw: unknown, rTask: number | null | undefined): ObjectiveOutcome { +export function objectiveOutcome(raw: unknown, rTask: number | null | undefined): ObjectiveOutcome { const { reward, passed, total } = verifierStats(raw); if (passed != null && total != null && total > 0) { return passed >= total ? 
"pass" : "fail"; diff --git a/apps/memos-local-plugin/core/pipeline/memory-core.ts b/apps/memos-local-plugin/core/pipeline/memory-core.ts index 0015be1c0..03b2386f4 100644 --- a/apps/memos-local-plugin/core/pipeline/memory-core.ts +++ b/apps/memos-local-plugin/core/pipeline/memory-core.ts @@ -75,6 +75,7 @@ import type { import type { ResolvedConfig, ResolvedHome } from "../config/index.js"; import { loadConfig, resolveHome, SECRET_FIELD_PATHS } from "../config/index.js"; import { feedbackText, runFeedbackExperience } from "../experience/feedback-builder.js"; +import { isRepairCandidatePolicy, mintRepairCandidate } from "../skill/repair-candidate.js"; import { rootLogger } from "../logger/index.js"; import type { Logger } from "../logger/types.js"; import { openDb } from "../storage/connection.js"; @@ -1915,6 +1916,27 @@ export function createMemoryCore( try { await handle.l2.drain(); if (policyId) { + // A constructive negative (failure + named fix) mints an unproven + // repair *candidate* skill that earns trust via trials. The normal + // crystallization below skips negatives, so there is no conflict; the + // candidate dedups against it via sourcePolicyIds. + const pol = handle.repos.policies.getById(policyId); + if (pol && isRepairCandidatePolicy(pol)) { + // Best-effort: a mint failure must never block crystallization / L3. + try { + mintRepairCandidate(pol, { + repos: handle.repos, + embedder: handle.embedder, + now: Date.now, + log, + }); + } catch (err) { + log.warn("feedback.repair_candidate_failed", { + policyId, + err: err instanceof Error ? 
err.message : String(err), + }); + } + } await handle.skills.runOnce({ trigger: "manual", policyId }); } if (episode) { diff --git a/apps/memos-local-plugin/core/skill/index.ts b/apps/memos-local-plugin/core/skill/index.ts index 3d11349f2..9a79a4f71 100644 --- a/apps/memos-local-plugin/core/skill/index.ts +++ b/apps/memos-local-plugin/core/skill/index.ts @@ -40,6 +40,13 @@ export { runSkill, type RunSkillDeps, } from "./skill.js"; +export { + isRepairCandidatePolicy, + deriveStrictTrial, + mintRepairCandidate, + REPAIR_CANDIDATE_INITIAL_ETA, + type MintRepairCandidateDeps, +} from "./repair-candidate.js"; export { attachSkillSubscriber, type SkillSubscriberDeps, type SkillSubscriberHandle } from "./subscriber.js"; export { createSkillEventBus } from "./events.js"; export { extractToolNames } from "./tool-names.js"; diff --git a/apps/memos-local-plugin/core/skill/lifecycle.ts b/apps/memos-local-plugin/core/skill/lifecycle.ts index 49f43bb02..3bb1f2b65 100644 --- a/apps/memos-local-plugin/core/skill/lifecycle.ts +++ b/apps/memos-local-plugin/core/skill/lifecycle.ts @@ -73,8 +73,14 @@ function applyTrial( let status: SkillRow["status"] = skill.status; let transition: SkillLifecycleTransition | undefined; + // Repair-origin candidates have no success anchor, so they graduate at a + // stricter η floor — promotion must be earned by real (full-pass) trials, + // not a single lucky one (Q3). + const promoteFloor = skill.repairOrigin + ? cfg.repairCandidateMinEta ?? 
cfg.minEtaForRetrieval + : cfg.minEtaForRetrieval; if (status === "candidate" && trialsAttempted >= cfg.candidateTrials) { - if (eta >= cfg.minEtaForRetrieval) { + if (eta >= promoteFloor) { status = "active"; transition = "promoted"; } else { diff --git a/apps/memos-local-plugin/core/skill/repair-candidate.ts b/apps/memos-local-plugin/core/skill/repair-candidate.ts new file mode 100644 index 000000000..441914275 --- /dev/null +++ b/apps/memos-local-plugin/core/skill/repair-candidate.ts @@ -0,0 +1,185 @@ +/** + * Repair candidates — minting an unproven skill from a *constructive negative*. + * + * A failed episode whose feedback named a concrete fix produces a negative + * policy that also carries the suggested fix as a `decisionGuidance.preference` + * (see `feedback-builder.ts`). That policy is NOT skill-eligible through the + * normal `hasSuccessAnchor` gate — and it should not be: the fix is unverified. + * + * Instead we mint it directly as a **candidate** skill with: + * - `eta = REPAIR_CANDIDATE_INITIAL_ETA` (just at the retrieval floor — visible + * enough to be tried, zero success credit), and + * - `repairOrigin = true` (uses the stricter promotion bar; surfaced as + * "unverified" in retrieval), and + * - `strictTrial` stamped from the source (verifier origin → full-pass-only + * trial judging; soft feedback → loose). + * + * It earns trust the same way every other candidate does — via `skill_trials` + * resolved by the real re-run outcome — and is deduped against the normal + * crystallization path through `sourcePolicyIds` (a later positive feedback + * rebuilds *this* skill rather than minting a second one). + * + * No LLM call: the policy already carries refined guidance, so this works in + * the no-LLM fallback path too (unlike `crystallizeDraft`). 
+ */ + +import { ids } from "../id.js"; +import type { Embedder } from "../embedding/types.js"; +import type { Logger } from "../logger/types.js"; +import type { Repos } from "../storage/repos/index.js"; +import type { PolicyRow, SkillId, SkillRow, TraceId } from "../types.js"; + +/** + * Q3: born at the retrieval floor — visible enough to be tried, no head start. + * MUST stay ≥ `retrieval.minSkillEta` / `skill.minEtaForRetrieval` (both default + * 0.1): tier-1 hides skills with `eta < minSkillEta`, so a candidate born below + * the floor would never surface, never get a trial, and never validate. Keep + * this aligned if that floor is raised. + */ +export const REPAIR_CANDIDATE_INITIAL_ETA = 0.1; + +export interface MintRepairCandidateDeps { + repos: Pick; + embedder: Embedder | null; + now?: () => number; + log?: Logger; +} + +/** + * A constructive negative: a failure (negative polarity, not skill-eligible) + * whose feedback named a concrete fix (a non-empty `preference`). That fix is + * the repair we mint as a candidate. + */ +export function isRepairCandidatePolicy(policy: PolicyRow): boolean { + if ((policy.evidencePolarity ?? "positive") !== "negative") return false; + if (policy.skillEligible !== false) return false; + return (policy.decisionGuidance?.preference ?? []).some((s) => s.trim().length > 0); +} + +/** + * Strict when the source carried an objective all-or-nothing verifier signal — + * those trials must judge by full credit only (Q2). Soft-feedback origin → loose. + */ +export function deriveStrictTrial(policy: PolicyRow): boolean { + const m = policy.verifierMeta as Record | null | undefined; + if (!m) return false; + return m.passed != null || m.total != null || m.reward != null || m.score != null; +} + +/** + * Mint a candidate repair skill from a constructive-negative policy. Returns the + * new skill id, or null when the policy is not a repair candidate or a skill + * already cites it (the normal rebuild path owns updates from then on). 
+ */ +export function mintRepairCandidate( + policy: PolicyRow, + deps: MintRepairCandidateDeps, +): SkillId | null { + if (!isRepairCandidatePolicy(policy)) return null; + const now = deps.now?.() ?? Date.now(); + + // Dedup (Q4): if any non-archived skill already cites this policy, let the + // normal crystallization/rebuild path own it — don't mint a second skill. + const already = deps.repos.skills + .list({ limit: 500 }) + .some((s) => s.status !== "archived" && s.sourcePolicyIds.includes(policy.id)); + if (already) return null; + + const fix = (policy.decisionGuidance?.preference ?? []).find((s) => s.trim().length > 0) ?? ""; + // Build the name so the policy-id suffix always survives truncation — + // slicing the title *before* appending it keeps names unique even for long + // titles (a collision would otherwise silently drop the mint). + const titleSlug = slugName(stripPrefix(policy.title)).slice(0, 28) || "fix"; + const idSuffix = slugName(policy.id.slice(-5)) || "x"; + const name = `repair_${titleSlug}_${idSuffix}`; + const id = ids.skill() as SkillId; + const invocationGuide = renderRepairGuide(policy, fix); + + const row: SkillRow = { + id, + ownerAgentKind: policy.ownerAgentKind, + ownerProfileId: policy.ownerProfileId, + ownerWorkspaceId: policy.ownerWorkspaceId, + name, + status: "candidate", + invocationGuide, + procedureJson: null, + eta: REPAIR_CANDIDATE_INITIAL_ETA, + support: 1, + gain: policy.gain, + trialsAttempted: 0, + trialsPassed: 0, + sourcePolicyIds: [policy.id], + sourceWorldModelIds: [], + evidenceAnchors: (policy.sourceTraceIds ?? []) as TraceId[], + vec: null, + createdAt: now, + updatedAt: now, + version: 1, + repairOrigin: true, + strictTrial: deriveStrictTrial(policy), + }; + + try { + deps.repos.skills.insert(row); + } catch (err) { + // e.g. a unique-name collision — non-fatal, just skip the mint. + deps.log?.warn("skill.repair_candidate.insert_failed", { + policyId: policy.id, + err: err instanceof Error ? 
err.message : String(err), + }); + return null; + } + + if (deps.embedder) { + deps.repos.embeddingRetryQueue.enqueue({ + id: `er_${ids.span()}`, + targetKind: "skill", + targetId: id, + vectorField: "vec", + sourceText: invocationGuide || name, + embedRole: "document", + now, + }); + } + + return id; +} + +function stripPrefix(title: string): string { + return title.replace(/^(avoid|repair|prefer|success)\s*:\s*/i, "").trim(); +} + +function slugName(raw: string): string { + return raw + .toLowerCase() + .replace(/[^a-z0-9]+/g, "_") + .replace(/^_+|_+$/g, ""); +} + +function renderRepairGuide(policy: PolicyRow, fix: string): string { + const lines: string[] = []; + // Keep the "[unverified repair]" marker in the SAME paragraph as the title + // (no blank line) so it also surfaces in the Tier-1 teaser, not just the + // full procedure the agent loads on demand. + lines.push(`# ${stripPrefix(policy.title) || "Repair candidate"}`); + lines.push("[unverified repair] distilled from a failure — validate by re-running before trusting."); + lines.push(""); + if (policy.trigger?.trim()) { + lines.push("**When to use**"); + lines.push(policy.trigger.trim()); + lines.push(""); + } + if (fix.trim()) { + lines.push("**Suggested fix**"); + lines.push(fix.trim()); + lines.push(""); + } + const antiPattern = policy.decisionGuidance?.antiPattern ?? []; + if (antiPattern.length) { + lines.push("**Avoid**"); + for (const a of antiPattern) lines.push(`- ${a}`); + lines.push(""); + } + return lines.join("\n").trim(); +} diff --git a/apps/memos-local-plugin/core/skill/skill.ts b/apps/memos-local-plugin/core/skill/skill.ts index d0cdc48ae..d52e0ad12 100644 --- a/apps/memos-local-plugin/core/skill/skill.ts +++ b/apps/memos-local-plugin/core/skill/skill.ts @@ -205,7 +205,14 @@ export async function runSkill( // reset η toward the recomputed value — existing practitioner skills // lose credibility when the underlying policy shifts materially. 
if (decision.action === "rebuild" && decision.existingSkill) { - row.eta = recomputeEta(decision.existingSkill, decision.policy, config); + const recomputed = recomputeEta(decision.existingSkill, decision.policy, config); + // Q4: a repair candidate that earned trust via real trials must not have + // it wiped when a later positive feedback rebuilds it into a success- + // backed skill — take the higher of earned vs. recomputed η. (The + // rebuilt row drops `repairOrigin`, so it graduates on normal thresholds.) + row.eta = decision.existingSkill.repairOrigin + ? Math.max(recomputed, decision.existingSkill.eta) + : recomputed; } repos.skills.upsert(row); diff --git a/apps/memos-local-plugin/core/skill/subscriber.ts b/apps/memos-local-plugin/core/skill/subscriber.ts index 52ae6a4f6..f531f733e 100644 --- a/apps/memos-local-plugin/core/skill/subscriber.ts +++ b/apps/memos-local-plugin/core/skill/subscriber.ts @@ -20,6 +20,7 @@ import type { L2Event, L2EventBus } from "../memory/l2/types.js"; import type { Logger } from "../logger/types.js"; import type { RewardEvent, RewardEventBus } from "../reward/types.js"; import { rootLogger } from "../logger/index.js"; +import { objectiveOutcome } from "../experience/feedback-builder.js"; import { applySkillFeedback, runSkill, @@ -32,7 +33,7 @@ import type { SkillFeedbackKind, SkillTrigger, } from "./types.js"; -import type { SkillId } from "../types.js"; +import type { EpisodeId, SkillId } from "../types.js"; export interface SkillSubscriberDeps extends Omit { @@ -157,20 +158,41 @@ export function attachSkillSubscriber( function resolveTrialsForReward(evt: Extract): void { const rTask = evt.result.rHuman; - const outcome = + const looseOutcome = rTask >= 0.5 ? "pass" : rTask <= -0.5 ? "fail" : "unknown"; const trials = deps.repos.skillTrials.listPendingForEpisode(evt.result.episodeId); if (trials.length === 0) return; + + // Strict (verifier-origin repair) trials judge by full credit only — never + // the loose rTask threshold. 
Computed lazily (and once) since most trials + // are loose. + let strictMemo: "pass" | "fail" | "unknown" | undefined; + const strictOutcome = (): "pass" | "fail" | "unknown" => { + if (strictMemo === undefined) { + strictMemo = computeStrictOutcome(evt.result.episodeId); + } + return strictMemo; + }; + for (const trial of trials) { + const skill = deps.repos.skills.getById(trial.skillId); + const strict = skill?.strictTrial === true; + const outcome = strict ? strictOutcome() : looseOutcome; const evidence = { source: "reward.updated", episodeId: evt.result.episodeId, rTask, - threshold: { pass: 0.5, fail: -0.5 }, - reason: - outcome === "pass" + mode: strict ? "strict-full-pass" : "loose-threshold", + threshold: strict ? { fullPassOnly: true } : { pass: 0.5, fail: -0.5 }, + reason: strict + ? outcome === "pass" + ? "verifier full pass" + : outcome === "fail" + ? "verifier not a full pass" + : "no verifier signal on this episode" + : outcome === "pass" ? "rTask >= 0.5" : outcome === "fail" ? "rTask <= -0.5" @@ -195,11 +217,27 @@ export function attachSkillSubscriber( skillId: trial.skillId, episodeId: evt.result.episodeId, outcome, + mode: strict ? "strict" : "loose", rTask, }); } } + /** + * Verifier-only verdict for a strict repair trial: scan the episode's + * feedback for an objective verifier signal and require a full pass. Returns + * "unknown" when no verifier payload exists — strict trials never pass on a + * loose reward. + */ + function computeStrictOutcome(episodeId: EpisodeId): "pass" | "fail" | "unknown" { + const rows = deps.repos.feedback.list({ episodeId }); + for (const fb of rows) { + const o = objectiveOutcome(fb.raw, null); + if (o !== "unknown") return o; + } + return "unknown"; + } + async function flush(): Promise { // Loop in case additional events arrive while we're draining. 
while (inflight) { diff --git a/apps/memos-local-plugin/core/skill/types.ts b/apps/memos-local-plugin/core/skill/types.ts index a2799b061..6e1a67a4f 100644 --- a/apps/memos-local-plugin/core/skill/types.ts +++ b/apps/memos-local-plugin/core/skill/types.ts @@ -118,6 +118,12 @@ export interface SkillConfig { archiveEta: number; /** Below this η, skills never surface in Tier-1 — matches retrieval config. */ minEtaForRetrieval: number; + /** + * Graduation floor for repair-origin candidates (unproven fixes minted from + * a failure). Higher than `minEtaForRetrieval` so promotion needs a majority + * of real trial passes. Falls back to `minEtaForRetrieval` when unset. + */ + repairCandidateMinEta?: number; } /** diff --git a/apps/memos-local-plugin/core/storage/migrations/013-skill-repair-origin.sql b/apps/memos-local-plugin/core/storage/migrations/013-skill-repair-origin.sql new file mode 100644 index 000000000..345fcefa6 --- /dev/null +++ b/apps/memos-local-plugin/core/storage/migrations/013-skill-repair-origin.sql @@ -0,0 +1,7 @@ +-- Repair-candidate skills: minted from a constructive negative (a failure whose +-- feedback named a corrective fix), unproven until trials validate them. +-- repair_origin = 1 → unvalidated repair; uses the stricter promotion bar. +-- strict_trial = 1 → trials judge pass by full credit only (verifier origin), +-- not the loose r_task >= 0.5 threshold. 
+ALTER TABLE skills ADD COLUMN repair_origin INTEGER NOT NULL DEFAULT 0 CHECK (repair_origin IN (0,1)); +ALTER TABLE skills ADD COLUMN strict_trial INTEGER NOT NULL DEFAULT 0 CHECK (strict_trial IN (0,1)); diff --git a/apps/memos-local-plugin/core/storage/migrator.ts b/apps/memos-local-plugin/core/storage/migrator.ts index efefe1885..306d2d2b4 100644 --- a/apps/memos-local-plugin/core/storage/migrator.ts +++ b/apps/memos-local-plugin/core/storage/migrator.ts @@ -202,9 +202,29 @@ function applyMigration(db: StorageDb, file: MigrationFile): void { } return; } + if (file.version === 13 && file.name === "skill-repair-origin") { + ensureSkillRepairOriginColumns(db); + return; + } db.exec(fs.readFileSync(file.fullPath, "utf8")); } +function ensureSkillRepairOriginColumns(db: StorageDb): void { + if (!tableExists(db, "skills")) return; + ensureColumn( + db, + "skills", + "repair_origin", + "INTEGER NOT NULL DEFAULT 0 CHECK (repair_origin IN (0,1))", + ); + ensureColumn( + db, + "skills", + "strict_trial", + "INTEGER NOT NULL DEFAULT 0 CHECK (strict_trial IN (0,1))", + ); +} + function ensureEmbeddingRetryLeaseColumns(db: StorageDb): void { const columns = new Set( db.prepare(`PRAGMA table_info(embedding_retry_queue)`) diff --git a/apps/memos-local-plugin/core/storage/repos/skills.ts b/apps/memos-local-plugin/core/storage/repos/skills.ts index db5a972e7..90af683ab 100644 --- a/apps/memos-local-plugin/core/storage/repos/skills.ts +++ b/apps/memos-local-plugin/core/storage/repos/skills.ts @@ -41,6 +41,8 @@ const COLUMNS = [ "evidence_anchors_json", "usage_count", "last_used_at", + "repair_origin", + "strict_trial", ]; export interface SkillSearchMeta { @@ -401,6 +403,8 @@ interface RawSkillRow { evidence_anchors_json: string; usage_count: number; last_used_at: number | null; + repair_origin: number; + strict_trial: number; } function rowToParams(row: SkillRow): Record { @@ -429,6 +433,8 @@ function rowToParams(row: SkillRow): Record { evidence_anchors_json: 
toJsonText(row.evidenceAnchors), usage_count: row.usageCount ?? 0, last_used_at: row.lastUsedAt ?? null, + repair_origin: row.repairOrigin ? 1 : 0, + strict_trial: row.strictTrial ? 1 : 0, }; } @@ -470,5 +476,7 @@ function mapRow(r: RawSkillRow): SkillRow { editedAt: r.edited_at, usageCount: r.usage_count ?? 0, lastUsedAt: r.last_used_at ?? null, + repairOrigin: (r.repair_origin ?? 0) !== 0, + strictTrial: (r.strict_trial ?? 0) !== 0, }; } diff --git a/apps/memos-local-plugin/core/types.ts b/apps/memos-local-plugin/core/types.ts index 0e4f87f52..7e805c6a9 100644 --- a/apps/memos-local-plugin/core/types.ts +++ b/apps/memos-local-plugin/core/types.ts @@ -309,6 +309,19 @@ export interface SkillRow extends OwnedRow { usageCount?: number; /** Last successful `memos_skill_get` time, or null when never loaded. */ lastUsedAt?: EpochMs | null; + /** + * Migration 013 — unvalidated repair candidate, minted from a constructive + * negative (a failure whose feedback named a fix). Uses the stricter + * promotion bar (`SkillConfig.repairCandidateMinEta`) and is surfaced as + * "unverified" in retrieval until trials validate it. Defaults to false. + */ + repairOrigin?: boolean; + /** + * Migration 013 — trials for this skill judge a pass by full credit only + * (verifier origin: passed===total / reward>=1), not the loose + * `r_task >= 0.5` threshold. Defaults to false (loose, soft-feedback origin).
+ */ + strictTrial?: boolean; } export interface SkillTrialRow extends OwnedRow { diff --git a/apps/memos-local-plugin/tests/unit/experience/feedback-builder.test.ts b/apps/memos-local-plugin/tests/unit/experience/feedback-builder.test.ts index c1a24c5a4..ee8d15a52 100644 --- a/apps/memos-local-plugin/tests/unit/experience/feedback-builder.test.ts +++ b/apps/memos-local-plugin/tests/unit/experience/feedback-builder.test.ts @@ -156,6 +156,56 @@ describe("feedback experience builder", () => { expect(row?.skillEligible).toBe(false); }); + it("records the suggested fix as a preference on a constructive negative (avoid + do-Y in one record)", async () => { + const result = await runFeedbackExperience( + { + feedback: feedback({ + id: "fb_fix" as FeedbackRow["id"], + polarity: "neutral", + // Failed, but the feedback names a concrete corrective direction. + rationale: + "Verifier feedback: failed, Time Limit Exceeded on the O(n^2) bitset. Instead use FFT/autocorrelation to count the triplets in O(n log n).", + raw: { source: "verifier", verifier: { reward: 0, passed: 3, total: 4 } }, + }), + episode: { id: "ep_feedback" as EpisodeId, traceIds: [trace.id], rTask: -0.51 }, + trace, + }, + { repos: handle.repos, embedder: fakeEmbedder(), namespace, now: () => NOW }, + ); + + expect(result.policyId).toBeTruthy(); + const row = handle.repos.policies.getById(result.policyId!); + // Stays a negative, non-skill-eligible record... + expect(row?.evidencePolarity).toBe("negative"); + expect(row?.skillEligible).toBe(false); + // ...but now also carries the suggested fix as a preference. + expect(row?.decisionGuidance.preference.join("\n").toLowerCase()).toContain("fft"); + }); + + it("does NOT record a fix on a bare-verdict negative (no constructive direction)", async () => { + const result = await runFeedbackExperience( + { + feedback: feedback({ + id: "fb_bare" as FeedbackRow["id"], + polarity: "neutral", + rationale: + "Verifier feedback for the previous attempt. 
Verifier reward: 0.0. passed: 3, total: 4. TimeoutException(): Time Limit Exceeded. Please briefly reflect on what you would keep and what you would improve next time.", + raw: { source: "verifier", verifier: { reward: 0, passed: 3, total: 4 } }, + }), + episode: { id: "ep_feedback" as EpisodeId, traceIds: [trace.id], rTask: -0.51 }, + trace, + }, + { repos: handle.repos, embedder: fakeEmbedder(), namespace, now: () => NOW }, + ); + + expect(result.policyId).toBeTruthy(); + const row = handle.repos.policies.getById(result.policyId!); + expect(row?.evidencePolarity).toBe("negative"); + expect(row?.skillEligible).toBe(false); + // Pure warning: the avoidance is present, no fabricated fix. + expect(row?.decisionGuidance.preference).toEqual([]); + }); + it("merges later avoidance feedback into a success-backed experience without losing skill eligibility", async () => { const ok = await runFeedbackExperience( { diff --git a/apps/memos-local-plugin/tests/unit/skill/lifecycle.test.ts b/apps/memos-local-plugin/tests/unit/skill/lifecycle.test.ts index 86101908e..1e0b622d4 100644 --- a/apps/memos-local-plugin/tests/unit/skill/lifecycle.test.ts +++ b/apps/memos-local-plugin/tests/unit/skill/lifecycle.test.ts @@ -23,6 +23,8 @@ function mkSkill(partial: Partial = {}): SkillRow { createdAt: partial.createdAt ?? NOW, updatedAt: partial.updatedAt ?? NOW, version: partial.version ?? 1, + repairOrigin: partial.repairOrigin, + strictTrial: partial.strictTrial, }; } @@ -59,6 +61,41 @@ describe("skill/lifecycle", () => { expect(fifth.transition).toBe("archived"); }); + it("holds repair-origin candidates to a stricter promotion floor (1-of-3 archives where a normal candidate promotes)", () => { + const cfg = makeSkillConfig({ + candidateTrials: 3, + minEtaForRetrieval: 0.1, + repairCandidateMinEta: 0.5, + archiveEta: 0.1, + }); + // 1 pass, 2 fails → η ≈ 0.275 after 3 trials. 
+ const run = (repairOrigin: boolean) => { + let s = mkSkill({ status: "candidate", eta: 0.1, repairOrigin }); + s = { ...s, ...applyFeedback(s, "trial.pass", cfg) }; + s = { ...s, ...applyFeedback(s, "trial.fail", cfg) }; + return applyFeedback(s, "trial.fail", cfg); + }; + // A normal candidate clears the 0.1 floor and promotes... + expect(run(false).status).toBe("active"); + // ...but the unproven repair needs a majority of real passes — 1-of-3 archives. + expect(run(true).status).toBe("archived"); + }); + + it("promotes a repair-origin candidate once a majority of trials pass (2-of-3)", () => { + const cfg = makeSkillConfig({ + candidateTrials: 3, + minEtaForRetrieval: 0.1, + repairCandidateMinEta: 0.5, + archiveEta: 0.1, + }); + let s = mkSkill({ status: "candidate", eta: 0.1, repairOrigin: true }); + s = { ...s, ...applyFeedback(s, "trial.pass", cfg) }; + s = { ...s, ...applyFeedback(s, "trial.pass", cfg) }; + const after = applyFeedback(s, "trial.fail", cfg); + expect(after.status).toBe("active"); + expect(after.transition).toBe("promoted"); + }); + it("promotes candidate → active once enough passing trials accrue", () => { const cfg = makeSkillConfig({ candidateTrials: 3, minEtaForRetrieval: 0.5 }); let s = mkSkill({ status: "candidate" }); diff --git a/apps/memos-local-plugin/tests/unit/skill/repair-candidate.test.ts b/apps/memos-local-plugin/tests/unit/skill/repair-candidate.test.ts new file mode 100644 index 000000000..e319dbfb4 --- /dev/null +++ b/apps/memos-local-plugin/tests/unit/skill/repair-candidate.test.ts @@ -0,0 +1,150 @@ +import { afterEach, beforeEach, describe, expect, it } from "vitest"; + +import { runFeedbackExperience } from "../../../core/experience/feedback-builder.js"; +import { + isRepairCandidatePolicy, + mintRepairCandidate, +} from "../../../core/skill/repair-candidate.js"; +import type { + EpisodeId, + FeedbackRow, + PolicyId, + RuntimeNamespace, + TraceRow, +} from "../../../core/types.js"; +import { makeTmpDb, type TmpDbHandle } 
from "../../helpers/tmp-db.js"; +import { NOW, seedTrace, vec } from "../feedback/_helpers.js"; + +const namespace: RuntimeNamespace = { + agentKind: "hermes", + profileId: "default", + workspaceId: "workspace", +}; + +function feedback(partial: Partial<FeedbackRow> = {}): FeedbackRow { + return { + id: "fb_1" as FeedbackRow["id"], + ownerAgentKind: "hermes", + ownerProfileId: "default", + ownerWorkspaceId: "workspace", + ts: NOW, + episodeId: "ep_feedback" as EpisodeId, + traceId: "tr_feedback" as TraceRow["id"], + channel: "explicit", + polarity: "neutral", + magnitude: 1, + rationale: + "Verifier feedback: failed, Time Limit Exceeded on the O(n^2) bitset. Instead use FFT/autocorrelation to count the triplets in O(n log n).", + raw: { source: "verifier", verifier: { reward: 0, passed: 3, total: 4 } }, + ...partial, + }; +} + +async function makeConstructiveNegative(handle: TmpDbHandle, trace: TraceRow): Promise<PolicyId> { + const result = await runFeedbackExperience( + { + feedback: feedback(), + episode: { id: "ep_feedback" as EpisodeId, traceIds: [trace.id], rTask: -0.51 }, + trace, + }, + { repos: handle.repos, embedder: null, namespace, now: () => NOW }, + ); + expect(result.policyId).toBeTruthy(); + return result.policyId!; +} + +describe("repair candidate minting", () => { + let handle: TmpDbHandle; + let trace: TraceRow; + + beforeEach(() => { + handle = makeTmpDb({ agent: "hermes" }); + trace = seedTrace(handle, { + id: "tr_feedback", + episodeId: "ep_feedback", + sessionId: "se_feedback", + userText: "Count arithmetic-progression triplets in the array.", + agentText: "Used an O(n^2) bitset and timed out.", + vec: vec([1, 0, 0]), + }); + }); + + afterEach(() => { + handle.cleanup(); + }); + + it("mints a candidate skill (eta=floor, repairOrigin, strict) from a constructive negative", async () => { + const policyId = await makeConstructiveNegative(handle, trace); + const policy = handle.repos.policies.getById(policyId)!; + expect(isRepairCandidatePolicy(policy)).toBe(true); +
const skillId = mintRepairCandidate(policy, { + repos: handle.repos, + embedder: null, + now: () => NOW, + }); + expect(skillId).toBeTruthy(); + + const skill = handle.repos.skills.getById(skillId!)!; + expect(skill.status).toBe("candidate"); + expect(skill.eta).toBeCloseTo(0.1, 6); // born at the retrieval floor, no head start + expect(skill.repairOrigin).toBe(true); + expect(skill.strictTrial).toBe(true); // verifier origin → full-pass-only trials + expect(skill.sourcePolicyIds).toEqual([policyId]); + expect(skill.trialsAttempted).toBe(0); + expect(skill.invocationGuide.toLowerCase()).toContain("fft"); + }); + + it("dedups: a second mint for the same policy returns null (rebuild path owns it)", async () => { + const policyId = await makeConstructiveNegative(handle, trace); + const policy = handle.repos.policies.getById(policyId)!; + + const first = mintRepairCandidate(policy, { repos: handle.repos, embedder: null, now: () => NOW }); + expect(first).toBeTruthy(); + const second = mintRepairCandidate(policy, { repos: handle.repos, embedder: null, now: () => NOW }); + expect(second).toBeNull(); + expect(handle.repos.skills.list({ limit: 50 }).length).toBe(1); + }); + + it("keeps the policy-id suffix in the name even for a long title (no silent collision)", async () => { + const longFix = + "Verifier feedback failed with Time Limit Exceeded so instead use the fast fourier transform autocorrelation counting technique to avoid the quadratic blowup in this arithmetic progression triplet problem"; + const result = await runFeedbackExperience( + { + feedback: feedback({ id: "fb_long" as FeedbackRow["id"], rationale: longFix }), + episode: { id: "ep_feedback" as EpisodeId, traceIds: [trace.id], rTask: -0.51 }, + trace, + }, + { repos: handle.repos, embedder: null, namespace, now: () => NOW }, + ); + const policy = handle.repos.policies.getById(result.policyId!)!; + expect(isRepairCandidatePolicy(policy)).toBe(true); + + const skillId = mintRepairCandidate(policy, { repos: 
handle.repos, embedder: null, now: () => NOW }); + expect(skillId).toBeTruthy(); + const skill = handle.repos.skills.getById(skillId!)!; + // The last-5 of the policy id must survive truncation so names stay unique. + const idSuffix = policy.id.slice(-5).toLowerCase().replace(/[^a-z0-9]+/g, ""); + expect(skill.name).toContain(idSuffix); + expect(skill.name.length).toBeLessThanOrEqual(41); + }); + + it("does not mint from a bare-verdict negative (no fix → not a repair candidate)", async () => { + const result = await runFeedbackExperience( + { + feedback: feedback({ + id: "fb_bare" as FeedbackRow["id"], + rationale: + "Verifier feedback: failed. Verifier reward: 0.0. passed: 3, total: 4. Time Limit Exceeded. Please reflect on what to improve next time.", + raw: { source: "verifier", verifier: { reward: 0, passed: 3, total: 4 } }, + }), + episode: { id: "ep_feedback" as EpisodeId, traceIds: [trace.id], rTask: -0.51 }, + trace, + }, + { repos: handle.repos, embedder: null, namespace, now: () => NOW }, + ); + const policy = handle.repos.policies.getById(result.policyId!)!; + expect(isRepairCandidatePolicy(policy)).toBe(false); + expect(mintRepairCandidate(policy, { repos: handle.repos, embedder: null, now: () => NOW })).toBeNull(); + }); +}); diff --git a/apps/memos-local-plugin/tests/unit/skill/subscriber.test.ts b/apps/memos-local-plugin/tests/unit/skill/subscriber.test.ts index dbda7394c..378eb5066 100644 --- a/apps/memos-local-plugin/tests/unit/skill/subscriber.test.ts +++ b/apps/memos-local-plugin/tests/unit/skill/subscriber.test.ts @@ -154,4 +154,108 @@ describe("skill/subscriber", () => { expect(r.crystallized).toBe(1); sub.dispose(); }); + + it("resolves a strict (repair) trial by full-pass-only while a loose trial passes on the same reward", async () => { + handle = makeTmpDb(); + const h = handle; + const l2Bus = createL2EventBus(); + const rewardBus = createRewardEventBus(); + const bus = createSkillEventBus(); + + const { episodeId } = 
seedTracesForPolicy(h, "po_strict" as PolicyId); + + const baseSkill = { + ownerAgentKind: "openclaw" as const, + ownerProfileId: "default", + ownerWorkspaceId: null, + invocationGuide: "guide", + procedureJson: null, + eta: 0.5, + support: 1, + gain: 0.3, + trialsAttempted: 0, + trialsPassed: 0, + sourcePolicyIds: [], + sourceWorldModelIds: [], + evidenceAnchors: [], + vec: null, + createdAt: 1 as never, + updatedAt: 1 as never, + version: 1, + }; + h.repos.skills.insert({ + ...baseSkill, + id: "sk_strict" as never, + name: "strict_repair", + status: "candidate", + strictTrial: true, + repairOrigin: true, + } as never); + h.repos.skills.insert({ + ...baseSkill, + id: "sk_loose" as never, + name: "loose_skill", + status: "candidate", + strictTrial: false, + } as never); + + const baseTrial = { + ownerAgentKind: "openclaw" as const, + ownerProfileId: "default", + ownerWorkspaceId: null, + sessionId: null, + episodeId, + traceId: null, + turnId: null, + toolCallId: null, + status: "pending" as const, + createdAt: 1, + resolvedAt: null, + evidence: {}, + }; + h.repos.skillTrials.createPending({ ...baseTrial, id: "st_strict", skillId: "sk_strict" as never } as never); + h.repos.skillTrials.createPending({ ...baseTrial, id: "st_loose", skillId: "sk_loose" as never } as never); + + // The re-run's verifier: a PARTIAL pass (3/4, reward 0) — a failure under + // full-pass-only, even though r_task=0.6 would loosely pass. 
+ h.repos.feedback.insert({ + id: "fb_v" as never, + ts: 5, + episodeId: episodeId as never, + traceId: null, + channel: "explicit", + polarity: "neutral", + magnitude: 1, + rationale: "Verifier: passed 3/4.", + raw: { source: "verifier", verifier: { reward: 0, passed: 3, total: 4 } }, + } as never); + + const sub = attachSkillSubscriber({ + l2Bus, + rewardBus, + bus, + repos: h.repos, + embedder: null, + llm: null, + log: rootLogger.child({ channel: "core.skill.subscriber" }), + config: makeSkillConfig({ cooldownMs: 0, candidateTrials: 5 }), + }); + + rewardBus.emit({ + kind: "reward.updated", + result: { episodeId, sessionId: `s-po_strict`, rHuman: 0.6, completedAt: 10 } as never, + }); + await new Promise((r) => setTimeout(r, 20)); + await sub.flush(); + + const strict = h.repos.skills.getById("sk_strict" as never)!; + const loose = h.repos.skills.getById("sk_loose" as never)!; + // Strict: verifier was a partial pass → trial fails (no pass credit). + expect(strict.trialsAttempted).toBe(1); + expect(strict.trialsPassed).toBe(0); + // Loose: r_task 0.6 ≥ 0.5 → trial passes. + expect(loose.trialsAttempted).toBe(1); + expect(loose.trialsPassed).toBe(1); + sub.dispose(); + }); }); From 064d434ce9e204d82ee0347ee5004083aed4d115 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=91=E5=B8=83=E6=9E=97?= <11641432+heiheiyouyou@user.noreply.gitee.com> Date: Wed, 27 May 2026 11:42:26 +0800 Subject: [PATCH 2/6] fix: Modify the prompt for negative examples that do not validate skills, and increase the probability of their usage. 
--- .../core/skill/lifecycle.ts | 18 +++++++++-- .../core/skill/repair-candidate.ts | 12 ++++--- .../tests/unit/skill/lifecycle.test.ts | 31 +++++++++++++++++++ 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/apps/memos-local-plugin/core/skill/lifecycle.ts b/apps/memos-local-plugin/core/skill/lifecycle.ts index 3bb1f2b65..a2f088a64 100644 --- a/apps/memos-local-plugin/core/skill/lifecycle.ts +++ b/apps/memos-local-plugin/core/skill/lifecycle.ts @@ -121,14 +121,28 @@ function applyThumbs( magnitudeOverride: number | undefined, ): LifecycleUpdate { const delta = (magnitudeOverride ?? cfg.etaDelta) * sign; - const eta = clamp01(skill.eta + delta); + let eta = clamp01(skill.eta + delta); let status = skill.status; let transition: SkillLifecycleTransition | undefined; + // An unproven repair candidate's fate is decided by real trials, not by a + // thumbs-down (which often targets the whole response, not this specific + // fix). On a down-vote: deduct η but keep it at/above the retrieval floor so + // it stays surfaceable (η below the floor = invisible = de-facto dead), and + // never auto-archive it here. A failing trial can still archive it. 
+ const protectRepairCandidate = skill.repairOrigin === true && skill.status === "candidate"; + if (protectRepairCandidate && sign < 0) { + eta = Math.max(eta, cfg.minEtaForRetrieval); + } + if (skill.status === "archived" && eta >= cfg.minEtaForRetrieval) { status = "candidate"; transition = "promoted"; - } else if (skill.status !== "archived" && eta < cfg.archiveEta) { + } else if ( + !protectRepairCandidate && + skill.status !== "archived" && + eta < cfg.archiveEta + ) { status = "archived"; transition = "archived"; } diff --git a/apps/memos-local-plugin/core/skill/repair-candidate.ts b/apps/memos-local-plugin/core/skill/repair-candidate.ts index 441914275..fe88eac46 100644 --- a/apps/memos-local-plugin/core/skill/repair-candidate.ts +++ b/apps/memos-local-plugin/core/skill/repair-candidate.ts @@ -159,11 +159,15 @@ function slugName(raw: string): string { function renderRepairGuide(policy: PolicyRow, fix: string): string { const lines: string[] = []; - // Keep the "[unverified repair]" marker in the SAME paragraph as the title - // (no blank line) so it also surfaces in the Tier-1 teaser, not just the - // full procedure the agent loads on demand. + // Invitational framing, kept in the SAME paragraph as the title (no blank + // line) so it surfaces in the Tier-1 teaser too. Natural relevance recall is + // the only thing that gets this candidate validated, so the wording must + // encourage trying it — frame applying it as double-value (solve + confirm), + // not as a "low-confidence, avoid" warning that would starve it of trials. 
lines.push(`# ${stripPrefix(policy.title) || "Repair candidate"}`); - lines.push("[unverified repair] distilled from a failure — validate by re-running before trusting."); + lines.push( + "Candidate fix distilled from a past failure on a similar task — applying it here both solves the task and confirms the fix.", + ); lines.push(""); if (policy.trigger?.trim()) { lines.push("**When to use**"); diff --git a/apps/memos-local-plugin/tests/unit/skill/lifecycle.test.ts b/apps/memos-local-plugin/tests/unit/skill/lifecycle.test.ts index 1e0b622d4..3374ce4e9 100644 --- a/apps/memos-local-plugin/tests/unit/skill/lifecycle.test.ts +++ b/apps/memos-local-plugin/tests/unit/skill/lifecycle.test.ts @@ -116,6 +116,37 @@ describe("skill/lifecycle", () => { expect(after.transition).toBe("archived"); }); + it("protects an unproven repair candidate from a thumbs-down (deduct but never archive/hide)", () => { + const cfg = makeSkillConfig({ etaDelta: 0.1, archiveEta: 0.1, minEtaForRetrieval: 0.1 }); + // The repair candidate survives a down-vote and stays surfaceable — + // its fate is left to real trials. + const repair = applyFeedback( + mkSkill({ status: "candidate", eta: 0.1, repairOrigin: true }), + "user.negative", + cfg, + ); + expect(repair.status).toBe("candidate"); + expect(repair.eta).toBeGreaterThanOrEqual(0.1); + // A normal candidate at the same η does get archived by the down-vote. 
+ const normal = applyFeedback(mkSkill({ status: "candidate", eta: 0.1 }), "user.negative", cfg); + expect(normal.status).toBe("archived"); + }); + + it("still lets a failing trial archive a repair candidate (fate decided by trials, not thumbs)", () => { + const cfg = makeSkillConfig({ + candidateTrials: 1, + archiveEta: 0.1, + minEtaForRetrieval: 0.1, + repairCandidateMinEta: 0.5, + }); + const after = applyFeedback( + mkSkill({ status: "candidate", eta: 0.1, repairOrigin: true }), + "trial.fail", + cfg, + ); + expect(after.status).toBe("archived"); + }); + it("handles user thumbs", () => { const cfg = makeSkillConfig({ etaDelta: 0.1 }); let s = mkSkill({ eta: 0.5, status: "active" }); From 6d71025897fb04fce08c8422a6fb57a9cc4ed7de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=91=E5=B8=83=E6=9E=97?= <11641432+heiheiyouyou@user.noreply.gitee.com> Date: Thu, 28 May 2026 13:41:00 +0800 Subject: [PATCH 3/6] fix:reflection work too much --- .../adapters/openclaw/tools.ts | 3 +- .../core/capture/ALGORITHMS.md | 203 ++++---- .../memos-local-plugin/core/capture/README.md | 208 ++++---- .../core/capture/alpha-scorer.ts | 210 -------- .../core/capture/batch-scorer.ts | 164 ++----- .../core/capture/capture.ts | 456 +++++++----------- apps/memos-local-plugin/core/capture/index.ts | 4 +- .../core/capture/reflection-extractor.ts | 59 --- .../core/capture/reflection-synth.ts | 158 ------ apps/memos-local-plugin/core/capture/types.ts | 52 +- .../core/config/defaults.ts | 42 +- apps/memos-local-plugin/core/config/schema.ts | 29 +- .../core/experience/feedback-builder.ts | 6 +- .../core/feedback/evidence.ts | 5 +- .../core/feedback/synthesize.ts | 10 +- apps/memos-local-plugin/core/index.ts | 5 - .../core/llm/prompts/index.ts | 2 +- .../core/llm/prompts/reflection.ts | 144 +----- .../core/memory/l2/induce.ts | 5 +- .../core/memory/l2/signature.ts | 3 +- .../core/memory/l3/abstract.ts | 5 +- .../core/pipeline/memory-core.ts | 5 +- .../core/retrieval/injector.ts | 6 +- 
.../core/retrieval/llm-filter.ts | 7 +- .../core/retrieval/tier2-trace.ts | 3 +- .../core/skill/crystallize.ts | 7 +- .../memos-local-plugin/core/skill/verifier.ts | 3 +- .../docs/CONFIG-ADVANCED.md | 30 +- .../templates/config.hermes.yaml | 5 + .../templates/config.openclaw.yaml | 5 + .../tests/unit/capture/alpha-scorer.test.ts | 151 ------ .../tests/unit/capture/batch-scorer.test.ts | 53 +- .../tests/unit/capture/capture-batch.test.ts | 347 +++---------- .../tests/unit/capture/capture.test.ts | 61 ++- .../tests/unit/capture/normalizer.test.ts | 2 +- .../unit/capture/reflection-extractor.test.ts | 94 ---- .../unit/capture/reflection-synth.test.ts | 133 ----- .../tests/unit/llm/prompts.test.ts | 4 +- .../viewer/src/stores/i18n.ts | 4 +- 39 files changed, 673 insertions(+), 2020 deletions(-) delete mode 100644 apps/memos-local-plugin/core/capture/alpha-scorer.ts delete mode 100644 apps/memos-local-plugin/core/capture/reflection-extractor.ts delete mode 100644 apps/memos-local-plugin/core/capture/reflection-synth.ts delete mode 100644 apps/memos-local-plugin/tests/unit/capture/alpha-scorer.test.ts delete mode 100644 apps/memos-local-plugin/tests/unit/capture/reflection-extractor.test.ts delete mode 100644 apps/memos-local-plugin/tests/unit/capture/reflection-synth.test.ts diff --git a/apps/memos-local-plugin/adapters/openclaw/tools.ts b/apps/memos-local-plugin/adapters/openclaw/tools.ts index f8492e87a..76a74c3d0 100644 --- a/apps/memos-local-plugin/adapters/openclaw/tools.ts +++ b/apps/memos-local-plugin/adapters/openclaw/tools.ts @@ -18,6 +18,7 @@ import { Type, type Static } from "@sinclair/typebox"; import type { AgentKind, RuntimeNamespace, SkillId, TraceId } from "../../agent-contract/dto.js"; import type { MemoryCore } from "../../agent-contract/memory-core.js"; +import { reflectionAsText } from "../../core/capture/types.js"; import { bridgeSessionId } from "./bridge.js"; import type { @@ -242,7 +243,7 @@ export function registerOpenClawTools(api: 
OpenClawPluginApi, opts: ToolsOptions episodeId: trace.episodeId, ts: trace.ts, value: trace.value, - reflection: clip(trace.reflection, bodyCap), + reflection: clip(reflectionAsText(trace.reflection) ?? undefined, bodyCap), userText: clip(trace.userText, bodyCap), toolCalls: trace.toolCalls.map((tc) => ({ name: tc.name, diff --git a/apps/memos-local-plugin/core/capture/ALGORITHMS.md b/apps/memos-local-plugin/core/capture/ALGORITHMS.md index 15a15ac64..953a197a0 100644 --- a/apps/memos-local-plugin/core/capture/ALGORITHMS.md +++ b/apps/memos-local-plugin/core/capture/ALGORITHMS.md @@ -26,103 +26,101 @@ Edge cases: The V7 spec keeps all sub-agent traces under the root episode so `R_task` backprops correctly up the decision tree. -## V7 §3.2.2 — Reflection extraction +## V7 §3.2 — Windowed binary path-relevance scoring -Procedure `ExtractReflection(τ_t)`: +The original per-step reflection scorer (`reflection-extractor` → +`reflection-synth` → `alpha-scorer`) was removed in the 2026-05 redesign +(see [docs/superpowers/specs/2026-05-27-l1-batch-reflection-binary-design.md](../../docs/superpowers/specs/2026-05-27-l1-batch-reflection-binary-design.md)). +Reflection no longer produces free-form natural-language text and `α` is +no longer a continuous quality score. Instead, every step gets a binary +"is this step on the final trajectory?" judgement: ``` -if τ_t.meta.reflection is non-empty: - return τ_t.meta.reflection # adapter-native -elif regex_match(τ_t.agentText): - return cleaned_match(…) # inline reasoning -elif config.synthReflections: - return LLM(Synthesis, τ_t) # synthesized -else: - return ∅ +α_t ∈ {0, 1} +reflection_t ∈ { "RELATED", "IRRELEVANT", "RELATED_DEFAULT" } ``` -Implemented by `reflection-extractor.ts` (steps 1-2) + -`reflection-synth.ts` (step 3). Prompt for synthesis is minimal and -temperature=0.1 — we want a terse, agent-voiced explanation, never a -judgment. 
+with the semantics: +- `α_t = 1` / `RELATED` — the step is effective and downstream actions + continue from it. +- `α_t = 0` / `IRRELEVANT` — the step is a detour / dead-end that did + not influence the final path. +- `RELATED_DEFAULT` — episode-level safe default written by the fallback + path when the windowed scorer never produced a usable result for a + step (or for the whole episode). -## V7 §3.2.3 — α scoring +### Window topology -V7 defines the "reflection utility" α via a four-axis rubric: +Windows are owned by `runEpisodeBatchScoring` in `capture.ts`. Two passes: -``` -α_t = judge(state_t, action_t, outcome_t, reflection_t) - = weighted_mean(faithfulness, causal_insight, - transferability, concreteness) -usable_t = 1 iff α_t ≥ 0.4 AND non_tautological(reflection_t) -if usable_t = 0: - α_t ← 0 # equation 5: unusable reflections cannot skew backprop -``` +| Pass | `windowSize` | `overlap` | per-window retries | +|---------|--------------|-----------|--------------------| +| primary | 20 | 3 | 1 | +| degrade | 9 | 3 | 2 | + +Stride is `windowSize − overlap` (17 for primary, 6 for degrade). The +last window of either pass is allowed to be shorter than `windowSize`. +`buildWindows(length, windowSize, overlap)` returns half-open `[start, +end)` pairs in ascending order. -The judge is `REFLECTION_SCORE_PROMPT` (see -`core/llm/prompts/reflection.ts`), which returns a JSON object. Our -implementation clamps α to [0, 1], applies the `usable` mask, and -guarantees finite values. +### Merge rule -When `alphaScoring=false` OR the LLM fails: +`mergeWindowScores` aggregates per-window results by absolute +`global_idx = win.start + i`. 
Per-step combination is: ``` -α_t = 0.5 # neutral; Phase 7 backprop still runs, half-weighted -usable_t = 1 +if any window assigned alpha=1 → final alpha = 1, label = RELATED +elif any window assigned alpha=0 → final alpha = 0, label = IRRELEVANT +else → final alpha = 1, label = RELATED_DEFAULT + (MISSING_WINDOW_DEFAULT) ``` -This preserves the "graceful degradation" property V7 asks for: a local -setup without a paid LLM still accrues L1 traces with meaningful -priority once reward arrives. - -## V7 §3.2 batched variant — `batch-scorer.ts` - -The per-step path (`reflection-synth.ts` + `alpha-scorer.ts`) issues 2N -LLM calls per N-step episode. `batch-scorer.ts` collapses them into ONE: +The "1-over-0" rule is intentional: overlapping windows often disagree +about a borderline step at the seam; counting it as RELATED is the +safer default because the downstream reward/L2/Skill chain treats +`α = 0` as a hard mask. + +### Failure ladder + +1. **Per-window** — up to `maxRetries+1` calls (1 attempt + retries). + A malformed payload from the LLM is one of: array length ≠ window + length, non-numeric / non-{0,1} `alpha`, `relevance` outside + {RELATED, IRRELEVANT}, missing `idx`. The validator in + `batch-scorer.ts :: validateBatchPayload` raises + `LLM_OUTPUT_MALFORMED` and the facade's own malformed-retry triggers + once before our outer retry kicks in. +2. **Window pass** — if every window in the primary pass eventually + succeeded, we accept its results. Otherwise we discard the partial + primary results and re-run with the degrade pass over the whole + episode. +3. **Episode-wide fallback** — if the degrade pass also has any failed + window, every step in the episode is overwritten with + `{ alpha: 1, text: "RELATED_DEFAULT", reason: "FALLBACK_ALL_ONE" }` + and we log `reflection_fallback_all_one` at error level with + `{ degraded: true, episodeId, stepsCount, failedWindows }`. +4. 
**No reflect LLM wired** — short-circuits straight to the + episode-wide fallback (`reason: "no_llm"`). + +The downstream reward / L2 / Skill chain runs in every case; the +fallback is meant to keep the pipeline available, not to gate it. + +### Bookkeeping (`CaptureResult.llmCalls`) + +- `batchedReflection` — number of successful batch calls this episode. + One per window that actually returned a usable payload (so a long + episode can be >1, and the degrade pass can add more). +- `reflectionSynth` / `alphaScoring` — permanently `0`. Retained on the + `CaptureResult` interface for backward-compatible analytics consumers. + +### Stable prompt fingerprint ``` -inputs = [{idx, state, action, outcome, reflection, synth_allowed}, …] - ↓ BATCH_REFLECTION_PROMPT -outputs = {scores: [{idx, reflection_text, alpha, usable, reason}, …]} +op = capture.reflection.batch.v<BATCH_REFLECTION_PROMPT.version> ``` -Dispatch (in `capture.ts`): - -| `cfg.batchMode` | `cfg.batchThreshold` | behavior | -|-------------------|----------------------|----------| -| `per_step` | (ignored) | legacy: 2N calls | -| `per_episode` | (ignored) | always batch | -| `auto` (default) | `12` | batch when `N ≤ 12`; else per-step | - -The dispatcher also refuses to batch when no LLM is wired — same fallback -path as missing-LLM in per-step mode. - -Why batched mode tends to produce **better** reflections (not just cheaper): -the prompt sees the full episode timeline including the final outcome, so -it can credit-attribute across steps. V7 §3.2.3's `causal_insight` and -`transferability` axes both benefit from the wider context. Per-step -synth, in contrast, can only rationalize from local `(s, a, o)`. - -Failure handling: - -- LLM throws / facade gives up after `malformedRetries=1` → capture - catches in `runBatchScoring`, surfaces a `{stage: "batch"}` warning, - and the per-step path runs as a fallback. -- Validator rejects on length mismatch, missing/non-numeric `alpha`, - non-boolean `usable`, non-string `reflection_text`. 
Same fallback. - -Bookkeeping (`CaptureResult.llmCalls`): - -- `batchedReflection`: 0 or 1 per episode (1 on a successful batch). -- `reflectionSynth` / `alphaScoring`: only nonzero when the per-step path - ran (either selected directly, or as fallback after a batch failure). - -Stable prompt fingerprint: - -- `op = capture.reflection.batch.v3` (see `BATCH_OP_TAG` constant; version - matches `BATCH_REFLECTION_PROMPT.version`). - Bumping `BATCH_REFLECTION_PROMPT.version` changes the op tag so audit - rows remain attributable. +Bumping `BATCH_REFLECTION_PROMPT.version` in +`core/llm/prompts/reflection.ts` rolls the op tag automatically so audit +rows stay attributable to a specific prompt revision. ## V7 §3.2.4 — Reward wiring @@ -131,8 +129,8 @@ Capture does NOT compute `r_step` or `V_t`. It writes: ``` trace.value = 0 # V_t will be filled by Phase 7 trace.r_human = null # assigned on feedback (Phase 7 R_human path) -trace.alpha = α_t # from §3.2.3 -trace.priority = 0 # recomputed after backprop +trace.alpha = α_t # binary {0, 1} from the windowed scorer +trace.priority = 0.5 # seeded so retrieval can find it pre-reward ``` Phase 7 updates these via `tracesRepo.updateScore` once the @@ -148,7 +146,7 @@ priority(f¹_t) ∝ max(V_t, 0) · decay(Δt) - `decay(Δt)` = half-life ≈ 30 days (Phase 7 constant) - `V_t` = backpropagated value from the R_task + step rewards (Phase 7) -Capture initialises `priority=0`. The formula activates in +Capture initialises `priority=0.5`. The formula activates in `core/reward/backprop.ts` (Phase 7). ## Text & vector conventions @@ -171,26 +169,31 @@ marker. Rationale: - Tail keeps "what the agent concluded with" — often the most useful sentence for Tier 2 recall. - Dropping the middle rarely hurts (that's usually thinking + tool - rationales that the reflection already summarises). + rationales that the windowed scorer already collapses into a binary + judgement). Per-tool-call outputs use the same clamp with `maxToolOutputChars`. 
## Concurrency -Reflection + α stages iterate per-step. We run them with -`config.capture.llmConcurrency` workers (default 4). The embedding stage -uses the embedder's own batching — one call for ALL steps. - -Typical budget for a 10-step episode with alpha scoring on and an -external LLM: 10 α calls ÷ 4 workers ≈ 3 batches, plus one embed call. -Wall clock usually 3-10s on a mid-tier OpenAI-compat endpoint. - -## Stable prompt fingerprints - -Every LLM call carries: -- `op = capture.alpha.reflection.score.v1` (alpha scorer) -- `op = capture.reflection.synth` (reflection synth) - -Bumping `REFLECTION_SCORE_PROMPT.version` in `core/llm/prompts/reflection.ts` -changes the op tag automatically, so historical α values remain -attributable to their scoring prompt generation. +The windowed scorer is sequential per episode (windows run in order, +not in parallel) because the merge rule benefits from short feedback +loops on failures — a failing primary pass is detected before the +degrade pass starts. Summariser and embedder stages still use +`config.capture.llmConcurrency` workers (default 4). + +Typical budget for a 60-step episode with the primary pass succeeding: +`ceil((60 - 3) / 17) = 4` batch calls, plus one embed call. Wall clock +is dominated by the batch latency of the reflect model. + +## Downstream consumers and the enum reflection field + +`traces.reflection` is now one of `RELATED | IRRELEVANT | +RELATED_DEFAULT` (plus legacy free-form text from pre-2026-05 traces). +Downstream modules that previously fed the reflection string into LLM +prompts, error-signature heuristics, or keyword blobs use the +`reflectionAsText` helper exported from `core/capture/types.ts` to +filter the three fixed labels back to `null`. That keeps the L2 +signature bucket, L2/L3 induction prompts, skill crystallisation / +verification, feedback evidence, and feedback-builder notes from +treating `RELATED_DEFAULT` as natural language. 
diff --git a/apps/memos-local-plugin/core/capture/README.md b/apps/memos-local-plugin/core/capture/README.md index 49adcf329..6e28ecc7d 100644 --- a/apps/memos-local-plugin/core/capture/README.md +++ b/apps/memos-local-plugin/core/capture/README.md @@ -11,39 +11,51 @@ sessionBus.on("episode.finalized") ↓ attachCaptureSubscriber(...) ← this module ↓ -createCaptureRunner.run({ episode, closedBy }) +captureRunner.runReflect({ episode, closedBy }) ↓ -INSERT INTO traces ... (×N) +INSERT INTO traces ... (×N) / UPDATE traces SET reflection, alpha ↓ -sessionBus.emit({ kind: "capture.done", result }) +captureBus.emit({ kind: "capture.done", result }) ``` -- One episode → 0..N trace rows (one per agent step). -- Abandoned episodes are captured too (V7 treats them as R_task=−1, which - Phase 7 assigns). Toggle with `captureAbandoned: false` if you need to. -- Fire-and-forget by default; tests call `drain()` to await all pending. +- Per-turn `runLite` writes trace rows with `reflection=null` / + `alpha=0` immediately, so the viewer sees the memory card. +- Topic-end `runReflect` re-runs the windowed binary scorer over the + whole (now-closed) episode and patches each existing row. +- Abandoned episodes go through the same pipeline; Phase 7 still + assigns `R_task = −1`. ## 2. 
Data flow ``` -episode.turns ──► step-extractor one StepCandidate per decision point +episode.turns ──► step-extractor one StepCandidate per decision point │ ▼ - normalizer truncate / dedup / drop empty + normalizer truncate / dedup / drop empty │ ▼ - reflection-extractor prefer adapter-provided; else regex - │ ←─ (optional) reflection-synth (LLM) + batch-scorer (windowed binary) primary {batch=20, overlap=3, 1 retry} + │ ↓ on any failed window + │ degrade {batch=9, overlap=3, 2 retries} + │ ↓ on any failed window + │ episode-wide RELATED_DEFAULT fallback ▼ - alpha-scorer REFLECTION_SCORE_PROMPT → α ∈ [0,1] - │ usable=false ⇒ α = 0 + merge by global_idx 1-over-0; missing window → RELATED_DEFAULT + │ ▼ - embedder vec_summary + vec_action (Phase 3) + embedder vec_summary + vec_action (Phase 3) │ ▼ - tracesRepo.insert + episodesRepo.updateTraceIds + tracesRepo.insert / + episodesRepo.updateTraceIds + tracesRepo.updateReflection ``` +`traces.reflection` is always one of `RELATED | IRRELEVANT | +RELATED_DEFAULT` after `runReflect`. There is no natural-language +reflection text; downstream consumers use `reflectionAsText` (exported +from `core/capture/types.ts`) to filter the fixed labels out of prompts +and keyword blobs. + ## 3. Public API ```ts @@ -57,17 +69,22 @@ const runner = createCaptureRunner({ tracesRepo, episodesRepo, embedder, // nullable (then vec is null) - llm, // nullable (then α stays neutral 0.5 if reflection exists) + llm, // main LLM, used by the summariser + reflectLlm, // dedicated reflect LLM; falls back to `llm` bus: captureBus, cfg: { maxTextChars: 4000, maxToolOutputChars: 2000, embedTraces: true, + llmConcurrency: 4, + // Windowed binary reflection is the only supported mode. + batchMode: "windowed", + // alphaScoring / synthReflections / batchThreshold / + // reflectionContextMode / longEpisodeReflectMode are retained for + // backward config compatibility but ignored by the windowed + // pipeline. 
alphaScoring: true, synthReflections: false, - llmConcurrency: 4, - // V7 §3.2 batched variant — one LLM call per episode. See §6a. - batchMode: "auto", batchThreshold: 12, }, }); @@ -79,8 +96,8 @@ sub.stop(); await sub.drain(); ``` -You can also call `runner.run({episode, closedBy})` synchronously (tests -and integration tests do this). +You can also call `runner.runLite(...)` / `runner.runReflect(...)` +directly (tests and integration tests do this). ## 4. Step extraction rules (V7 §3.2.1) @@ -89,104 +106,51 @@ and integration tests do this). - **Merge tool turns** into the assistant step that preceded them within the same segment. `tool` turns emit `ToolCallDTO` entries with inputs, outputs, errors, and timing. -- **Sub-agent depth**: passed through from `turn.meta.depth` / `turn.meta.isSubagent`. - The extractor doesn't create new episodes for sub-agents — they are - extra traces under the same episode with `isSubagent=true`. +- **Sub-agent depth**: passed through from `turn.meta.depth` / + `turn.meta.isSubagent`. The extractor doesn't create new episodes for + sub-agents — they are extra traces under the same episode with + `isSubagent=true`. - **Synthetic fallback**: an episode with a user turn but no assistant turn still produces one skeletal trace so Phase 7 has somewhere to assign R_task. -## 5. Reflection resolution - -Order (highest-precedence first): - -1. `step.rawReflection` (from `turn.meta.reflection`, set by the adapter - when the host agent emits self-reflections natively). Source: `adapter`. -2. `extractReflection(step)` — regex over `agentText` for Markdown - `### Reasoning:` blocks, `...` tags, and a - small Chinese/English heuristic set. Source: `extracted`. -3. `synthesizeReflection(llm, step)` — only when - `config.capture.synthReflections=true`. Source: `synth`. -4. Otherwise `reflection.text = null`, `alpha = 0`, `usable = false`. - Source: `none`. - -## 6. α scoring (V7 §3.2.3, eq. 
5) - -When a reflection exists: - -- If `config.capture.alphaScoring=false`: α defaults to `0.5` (neutral), - `usable=true`. Phase 7 will backprop but weighted half-strength. -- Otherwise: call `REFLECTION_SCORE_PROMPT` with - `{state, action, outcome, reflection}` and parse JSON `{alpha, usable, reason}`. - When `usable=false`, we clamp `α=0` before persisting. - -LLM failures fall back to neutral α (same as "scoring disabled") plus a -warning in `CaptureResult.warnings`. Capture NEVER throws on LLM failure -alone — only a DB `INSERT` failure is fatal. +## 5. Windowed binary reflection (V7 §3.2) -## 6a. Batched ρ+α (V7 §3.2 batched variant) +Per-step reflection / α scoring was replaced by a path-relevance +judgement. See [ALGORITHMS.md](./ALGORITHMS.md) for the full derivation; +the highlights: -Per-step calls are expensive on long episodes (2N LLM calls for N steps). -`batch-scorer.ts` collapses synth + α into ONE LLM call covering every -step. Activated by `algorithm.capture.batchMode`: +- Each window is `≤ batch_size` consecutive steps, sliced with a fixed + `overlap` so seam steps appear in two windows. +- The batch scorer returns `{ alpha: 0|1, relevance: "RELATED" | + "IRRELEVANT" }` per step. Validator rejects any other shape. +- Overlap merge: any window calling a step `RELATED` (`alpha=1`) wins. +- If a step has no window result after both passes, it is written as + `RELATED_DEFAULT` (the safe default). +- If any window in both passes failed, the whole episode is overwritten + with `RELATED_DEFAULT`. +- The dispatcher never throws on reflection failure — only a DB + `INSERT` is fatal. -| value | behavior | -|-------|----------| -| `per_step` | legacy path; one synth + one α call per step (`llmConcurrency` workers in parallel) | -| `per_episode` | always batch; one call per episode | -| `auto` (default) | batch when `stepCount ≤ batchThreshold` (default 12); else fall back to per-step | +## 6. 
α scoring -Batched mode also gives the LLM access to the **full causal chain** of the -episode in one shot, so reflections it writes can credit-attribute across -steps (V7 §3.2.3 axes `causal_insight` / `transferability` benefit). - -Bookkeeping is split across `CaptureResult.llmCalls`: -- `batchedReflection`: 0 or 1 per episode (1 on a clean batched call). -- `reflectionSynth` / `alphaScoring`: only nonzero in per-step mode. - -Failures in the batched call (LLM throw, malformed JSON, length mismatch) -are logged as a `stage: "batch"` warning and capture **automatically falls -back** to the per-step path — no traces are lost. - -## 6b. Downstream preview for long per-step reflection - -For long episodes, `batchMode: "auto"` still falls back to per-step scoring -when `stepCount > batchThreshold`. Operators can enrich that fallback without -making it serial: - -```yaml -algorithm: - capture: - reflectionContextMode: task_downstream - longEpisodeReflectMode: per_step_downstream -``` - -This keeps `runConcurrently(...)` intact. Before launching the per-step work, -capture precomputes a read-only preview for each step from the already -normalized episode: - -- up to `downstreamStepCount` following steps, capped at 3; -- labels are always `step+1`, `step+2`, `step+3`; -- `text` steps are inserted as standalone downstream text blocks; -- `tooluse` steps include tool names and tool output; -- if a downstream tool step already has adapter/extracted reflection, that - existing reflection is included; newly synthesized reflections from the - same run are not used, so there is no reverse-order dependency. - -`reflectionContextMode: task` is the default, preserving task-summary -enrichment while leaving downstream preview opt-in. +`α_t ∈ {0, 1}` only. There is no continuous score, no +`alphaScoring=false` neutral path, and no LLM-quality rubric. The +`alphaScoring` config flag is preserved for back-compat but has no +effect. ## 7. 
Embedding - When `config.capture.embedTraces=true` and `embedder` is non-null, we build two texts per step — "state" (userText) and "action" (agentText + tool signatures) — and batch them through `embedder.embedMany(...)`. -- Failures fall back to `vecSummary=null / vecAction=null`. Vector search - will just skip these rows. +- Failures fall back to `vecSummary=null / vecAction=null`. Vector + search will just skip these rows. ## 8. Priority (V7 §3.3) -Initial `priority = 0` for every new trace. The formula +Initial `priority = 0.5` for every new trace so retrieval can find it +before reward backprop runs. The formula `priority(f1) ∝ max(V, 0) · decay(Δt)` activates in Phase 7 after backprop, when `tracesRepo.updateScore` runs. @@ -197,11 +161,12 @@ Capture runs on a dedicated `CaptureEventBus` (create via stable. The orchestrator (Phase 15) bridges session.* and capture.* into one unified stream for the viewer. -| Event | Payload | When | -|----------------------|---------------------------------------------|--------------------------------------------| -| `capture.started` | `{episodeId, sessionId}` | Before stage 1. | -| `capture.done` | `{result: CaptureResult}` | After all rows are persisted (happy path). | -| `capture.failed` | `{episodeId, sessionId, stage, error}` | DB insert failed; throws afterwards. | +| Event | Payload | When | +|----------------------|---------------------------------------------|-----------------------------------------------------------------------| +| `capture.started` | `{episodeId, sessionId}` | Before stage 1. | +| `capture.lite.done` | `{result: CaptureResult}` | After each per-turn `runLite` (no reward trigger). | +| `capture.done` | `{result: CaptureResult}` | After `runReflect` completes; gates the reward / L2 / Skill cascade. | +| `capture.failed` | `{episodeId, sessionId, stage, error}` | DB insert failed; throws afterwards. 
| Subscribers: - **Phase 7 reward orchestrator** listens for `capture.done` to run @@ -213,29 +178,38 @@ Subscribers: - `internal` — DB insert raw throw. - `llm_unavailable` / `llm_timeout` / `llm_output_malformed` — surfaced - from alpha / synth stages but converted to warnings (non-fatal). + from the windowed scorer but converted to warnings (non-fatal). The + episode-wide fallback writes `RELATED_DEFAULT` and the chain + continues. ## 11. Logging channels - `core.capture` — top-level run summary, warnings, timings. - `core.capture.extractor` — extractor debug (segment counts, synthetic fallbacks). -- `core.capture.reflection` — extraction/synth details. -- `core.capture.alpha` — α scores per step, model id, reason. -- `core.capture.batch` — batched ρ+α run summary (steps, synthAccepted, model). +- `core.capture.batch` — per-window batch run summary (steps, model, durationMs). +- `core.capture.summarizer` — per-turn summariser fallbacks. - `core.capture.embed` — embed failures (1 line per batch). +Top-level events to watch: +- `capture.reflect.scoring.start` — kicks off `runEpisodeBatchScoring` + for an episode. +- `capture.reflect.trace.scored` — per-trace patch result with final + alpha + reflection label + reason. +- `capture.reflect.done` / `capture.lite.done` / + `capture.lightweight.done` — phase completion summaries. +- `reflection_fallback_all_one` — episode-wide fallback was triggered. + Includes `degraded=true`, `episodeId`, `stepsCount`, + `failedWindows`. + ## 12. Testing Under `tests/unit/capture/`: - `step-extractor.test.ts` — split rules, tool merging, sub-agent depth, synthetic fallback. - `normalizer.test.ts` — truncation, dedup, drop-empty. -- `reflection-extractor.test.ts` — adapter-priority, regex matches per language, length cap. -- `alpha-scorer.test.ts` — JSON parse, clamp, `usable=false → α=0`, LLM error path. -- `reflection-synth.test.ts` — happy path, `NO_REFLECTION` sentinel, LLM error. 
-- `batch-scorer.test.ts` — batched ρ+α validator, order-independence, synth-disabled fallback. +- `batch-scorer.test.ts` — binary validator, order-independence, payload shape. - `embedder.test.ts` — pair interleaving, failure → null vectors. -- `capture.test.ts` (integration) — end-to-end with in-memory repos (per-step path). -- `capture-batch.test.ts` — end-to-end with batched ρ+α + auto-mode threshold fallback. +- `capture.test.ts` (integration) — end-to-end with in-memory repos. +- `capture-batch.test.ts` — end-to-end with the windowed binary scorer. - `subscriber.test.ts` — finalized→run wiring, abandoned opt-out, drain. See `ALGORITHMS.md` for V7 formula derivations and prompt fingerprints. diff --git a/apps/memos-local-plugin/core/capture/alpha-scorer.ts b/apps/memos-local-plugin/core/capture/alpha-scorer.ts deleted file mode 100644 index 11ccee959..000000000 --- a/apps/memos-local-plugin/core/capture/alpha-scorer.ts +++ /dev/null @@ -1,210 +0,0 @@ -/** - * `alpha-scorer` — grade a reflection with the `REFLECTION_SCORE_PROMPT` - * (defined in `core/llm/prompts/reflection.ts`). - * - * Implements V7 eq. 5: - * α_t = judge(state_t, action_t, outcome_t, reflection_t) - * usable = α ≥ 0.4 ∧ non-tautological - * if ¬usable then α ← 0 - * - * We parse a `{alpha: number, usable: boolean, reason?: string}` JSON - * response, clamp α to [0, 1], and force α = 0 when `usable=false`. - * - * Failures (LLM unavailable, malformed JSON) return a neutral - * `{alpha: null, usable: false}` — the caller decides what to do - * (capture.ts falls back to α=0 so nothing is trained on ungraded data). 
- */ - -import { ERROR_CODES, MemosError } from "../../agent-contract/errors.js"; -import type { LlmClient } from "../llm/index.js"; -import { - detectDominantLanguage, - languageSteeringLine, -} from "../llm/prompts/index.js"; -import { REFLECTION_SCORE_PROMPT } from "../llm/prompts/reflection.js"; -import { rootLogger } from "../logger/index.js"; -import { sanitizeDerivedText } from "../safety/content.js"; -import type { NormalizedStep, ReflectionContext, ReflectionScore } from "./types.js"; - -export interface AlphaInput extends ReflectionContext { - step: NormalizedStep; - reflectionText: string; - episodeId?: string; - phase?: string; - outcomeMaxChars?: number; -} - -export interface AlphaOutput { - alpha: number; - usable: boolean; - reason: string | null; - model: string; -} - -export async function scoreReflection( - llm: LlmClient, - input: AlphaInput, -): Promise { - const log = rootLogger.child({ channel: "core.capture.alpha" }); - - const thinking = (input.step.agentThinking ?? "").trim(); - const userPayload = [ - `TASK CONTEXT:`, - input.taskSummary?.trim().slice(0, 1_200) || "(none)", - ``, - `STATE:`, - input.step.userText.slice(0, 1_200) || "(none)", - ``, - `THINKING:`, - thinking ? thinking.slice(0, 1_500) : "(none — model did not emit thinking this step)", - ``, - `ACTION:`, - input.step.agentText.slice(0, 1_500) || "(none)", - input.step.toolCalls.length > 0 - ? `\nTOOL_CALLS:\n${input.step.toolCalls - .map((t) => - t.errorCode - ? `- ${t.name}(${summarizeInput(t.input)}) → ERROR[${t.errorCode}] ${truncate(outputOf(t), 300)}` - : `- ${t.name}(${summarizeInput(t.input)}) → ${truncate(outputOf(t), 300)}`, - ) - .join("\n")}` - : "\nTOOL_CALLS: (none)", - ``, - `OUTCOME:`, - // Use the last 1 tool output as the "outcome" signal if present. - lastToolOutcome(input.step, input.outcomeMaxChars ?? 
600), - ``, - `DOWNSTREAM STEP PREVIEW:`, - formatDownstreamPreview(input), - ``, - `REFLECTION:`, - input.reflectionText.slice(0, 1_500), - ] - .filter(Boolean) - .join("\n"); - - // Match the `reason` string's language to the step's own language so - // the Memories viewer doesn't mix 中文 + English per row. - const stepLang = detectDominantLanguage([ - input.step.userText, - input.step.agentText, - input.step.agentThinking, - input.reflectionText, - ]); - - const rsp = await llm.completeJson<{ - alpha: unknown; - usable: unknown; - reason?: unknown; - }>( - [ - { role: "system", content: REFLECTION_SCORE_PROMPT.system }, - { role: "system", content: languageSteeringLine(stepLang) }, - { role: "user", content: userPayload }, - ], - { - op: `capture.alpha.${REFLECTION_SCORE_PROMPT.id}.v${REFLECTION_SCORE_PROMPT.version}`, - episodeId: input.episodeId, - phase: input.phase, - schemaHint: `{"alpha": 0..1, "usable": true|false, "reason": "short string"}`, - validate: (v) => { - const o = v as Record; - if (typeof o.alpha !== "number") { - throw new MemosError(ERROR_CODES.LLM_OUTPUT_MALFORMED, "alpha must be number", { - got: o.alpha, - }); - } - if (typeof o.usable !== "boolean") { - throw new MemosError(ERROR_CODES.LLM_OUTPUT_MALFORMED, "usable must be boolean", { - got: o.usable, - }); - } - }, - malformedRetries: 1, - temperature: 0, - }, - ); - - const rawAlpha = rsp.value.alpha as number; - const usable = Boolean(rsp.value.usable); - const alpha = clamp01(rawAlpha); - const finalAlpha = usable ? alpha : 0; - const reason = typeof rsp.value.reason === "string" ? sanitizeDerivedText(rsp.value.reason) : null; - - log.debug("alpha.scored", { - key: input.step.key, - alpha: finalAlpha, - usable, - rawAlpha, - model: rsp.servedBy, - reason, - }); - - return { alpha: finalAlpha, usable, reason, model: rsp.servedBy }; -} - -export function disabledScore(text: string | null, source: ReflectionScore["source"]): ReflectionScore { - return { - text, - alpha: text ? 
0.5 : 0, - usable: text !== null, - source, - }; -} - -function clamp01(v: number): number { - if (!Number.isFinite(v)) return 0; - return Math.max(0, Math.min(1, v)); -} - -function summarizeInput(v: unknown): string { - if (v === undefined || v === null) return ""; - if (typeof v === "string") return v.slice(0, 200); - try { - return JSON.stringify(v).slice(0, 200); - } catch { - return String(v).slice(0, 200); - } -} - -function outputOf(t: { output?: unknown }): string { - if (t.output === undefined || t.output === null) return ""; - if (typeof t.output === "string") return t.output; - try { - return JSON.stringify(t.output); - } catch { - return String(t.output); - } -} - -function lastToolOutcome(step: NormalizedStep, maxChars: number): string { - const last = step.toolCalls[step.toolCalls.length - 1]; - if (!last) return "(assistant-only step)"; - return (last.errorCode ? `ERROR[${last.errorCode}] ` : "") + truncate(outputOf(last), maxChars); -} - -function truncate(s: string, n: number): string { - return s.length > n ? s.slice(0, n) + "..." : s; -} - -function formatDownstreamPreview(input: AlphaInput): string { - const preview = input.downstream ?? 
[]; - if (preview.length === 0) return "(none)"; - return preview - .map((item) => { - const label = `step+${item.offset}`; - if (item.kind === "tooluse") { - const lines = [ - `[${label}] type=tooluse`, - `tool_names: ${item.toolNames?.join(", ") || "(unknown)"}`, - `tool_output: ${item.toolOutput?.trim() || "(none)"}`, - ]; - if (item.reflection?.trim()) { - lines.push(`existing_reflection: ${item.reflection.trim()}`); - } - return lines.join("\n"); - } - return [`[${label}] type=text`, item.text?.trim() || "(empty)"].join("\n"); - }) - .join("\n\n"); -} diff --git a/apps/memos-local-plugin/core/capture/batch-scorer.ts b/apps/memos-local-plugin/core/capture/batch-scorer.ts index e7b8ab50f..86bde9b73 100644 --- a/apps/memos-local-plugin/core/capture/batch-scorer.ts +++ b/apps/memos-local-plugin/core/capture/batch-scorer.ts @@ -1,40 +1,22 @@ /** - * `batch-scorer` — episode-level reflection synthesis + α scoring in ONE - * LLM call. Activated by `algorithm.capture.batchMode` + `batchThreshold`. - * - * Why this exists (V7 §3.2 batched variant): - * - * The per-step path (`reflection-synth.ts` + `alpha-scorer.ts`) issues - * 2 LLM calls per agent step (synth + α). For a 10-step episode that's - * ~20 calls — slow and expensive. This module folds them into one call - * that processes the whole episode at once. - * - * Beyond cost: the LLM here sees the *complete* causal chain (every - * step in order, including the final outcome), so reflections it - * writes can credit-attribute across steps in a way grounded - * per-step reflections never can. V7 §3.2.3's `causal_insight` and - * `transferability` axes benefit directly. - * - * Trade-offs (encoded in capture.ts dispatch): - * - Prompt grows linearly with N steps. Capped via `batchThreshold`; - * long episodes degrade to the per-step path automatically. 
- * - One bad output value forces a single batched retry instead of N - * isolated retries — but the facade already does `malformedRetries` - * for us, and on hard failure capture.ts falls back to per-step. + * `batch-scorer` — windowed binary path-relevance scoring for one episode + * window. Always invoked through `capture.ts :: runEpisodeBatchScoring`, + * which owns the primary/degrade window topology and retry ladder. * * Wire format ↔ prompt: - * Send `{ host_context?, task_context?, steps: [{idx, state, action, outcome, reflection, synth_allowed}] }`. - * `task_context` is episode-level task summary (nullable string). - * Receive `{scores: [{idx, reflection_text, alpha, usable, reason}]}`. + * Send `{ host_context?, task_context?, steps: [{idx, state, thinking, + * action, tool_calls, outcome}] }`. + * Receive `{ scores: [{idx, alpha: 0|1, relevance: "RELATED" | + * "IRRELEVANT", reason: str}] }`. * See `core/llm/prompts/reflection.ts :: BATCH_REFLECTION_PROMPT`. + * + * Validation is strict: any non-{0,1} alpha or relevance outside + * {RELATED, IRRELEVANT} raises `LLM_OUTPUT_MALFORMED` so the caller's + * window retry ladder can take over. */ import { ERROR_CODES, MemosError } from "../../agent-contract/errors.js"; import type { LlmClient } from "../llm/index.js"; -import { - detectDominantLanguage, - languageSteeringLine, -} from "../llm/prompts/index.js"; import { BATCH_REFLECTION_PROMPT } from "../llm/prompts/reflection.js"; import { rootLogger } from "../logger/index.js"; import { sanitizeDerivedText } from "../safety/content.js"; @@ -42,32 +24,16 @@ import type { NormalizedStep, ReflectionScore } from "./types.js"; export interface BatchScoreInput { step: NormalizedStep; - /** - * Reflection already extracted (adapter / regex). `null` when none — the - * LLM may synthesize one if `synthReflections` is enabled. - */ - existingReflection: string | null; } export interface BatchScoreOptions { - /** - * Mirror of `CaptureConfig.synthReflections`. 
When `false`, any reflection - * the LLM writes for steps that came in empty is discarded - * (text→null, α→0, source→none) — preserves the per-step contract. - */ - synthReflections: boolean; episodeId?: string; phase?: string; taskSummary?: string | null; - /** - * Cap per-field text we shovel into the prompt. Default 1_200 chars per - * `state`/`outcome`, 1_500 per `action`. Mirrors per-step prompts. - */ perFieldChars?: { state: number; action: number; outcome: number; - reflection: number; }; } @@ -76,15 +42,12 @@ export interface BatchScoreResult { scores: ReflectionScore[]; /** `servedBy` model id from the underlying LLM call. */ model: string; - /** Number of steps where we accepted a newly-synthesized reflection. */ - synthAccepted: number; } interface RawScoreEntry { idx: number; - reflection_text: unknown; alpha: unknown; - usable: unknown; + relevance: unknown; reason?: unknown; } @@ -96,14 +59,13 @@ const DEFAULT_FIELD_CHARS = { state: 1_200, action: 1_500, outcome: 600, - reflection: 1_200, thinking: 1_500, } as const; export const BATCH_OP_TAG = `capture.${BATCH_REFLECTION_PROMPT.id}.v${BATCH_REFLECTION_PROMPT.version}`; /** - * One LLM call → reflections + α for every input step. + * One LLM call → binary relevance + α(0/1) for every input step. * * Throws `MemosError` with `LLM_OUTPUT_MALFORMED` when the LLM returns a * shape we cannot parse even after the facade's malformed-retry. Caller @@ -118,7 +80,7 @@ export async function batchScoreReflections( ): Promise { const log = rootLogger.child({ channel: "core.capture.batch" }); if (inputs.length === 0) { - return { scores: [], model: "none", synthAccepted: 0 }; + return { scores: [], model: "none" }; } const fieldChars = { ...DEFAULT_FIELD_CHARS, ...(opts.perFieldChars ?? {}) }; @@ -137,28 +99,12 @@ export async function batchScoreReflections( errorCode: t.errorCode ?? null, })), outcome: lastToolOutcome(input.step, fieldChars.outcome), - reflection: clip(input.existingReflection ?? 
"", fieldChars.reflection), - synth_allowed: opts.synthReflections, })), }; - // Reflections are first-person narrations — written in the same - // language the user + agent were speaking so the Memories panel - // stays coherent. Detect once per batch from the aggregate turn - // texts; all steps in one episode share a language in practice. - const reflectionLang = detectDominantLanguage( - inputs.flatMap((i) => [ - i.step.userText, - i.step.agentText, - i.step.agentThinking, - i.existingReflection, - ]), - ); - const rsp = await llm.completeJson( [ { role: "system", content: BATCH_REFLECTION_PROMPT.system }, - { role: "system", content: languageSteeringLine(reflectionLang) }, { role: "user", content: JSON.stringify(payload) }, ], { @@ -166,7 +112,7 @@ export async function batchScoreReflections( episodeId: opts.episodeId, phase: opts.phase, schemaHint: - '{"scores": [{"idx": int, "reflection_text": "str", "alpha": 0..1, "usable": bool, "reason": "str"}]}', + '{"scores": [{"idx": int, "alpha": 0|1, "relevance": "RELATED|IRRELEVANT", "reason": "str"}]}', validate: (v) => validateBatchPayload(v, inputs.length), malformedRetries: 1, temperature: 0, @@ -178,56 +124,36 @@ export async function batchScoreReflections( const byIdx = new Map(); for (const entry of rsp.value.scores) byIdx.set(Number(entry.idx), entry); - let synthAccepted = 0; const scores: ReflectionScore[] = inputs.map((input, i) => { const raw = byIdx.get(i); if (!raw) { - // Should be impossible after validateBatchPayload, but degrade - // safely: treat as no-reflection. - return disabledScoreFor(input); + return { + text: "IRRELEVANT", + alpha: 0, + usable: false, + source: "none", + }; } - const incomingText = (input.existingReflection ?? "").trim(); - const llmText = typeof raw.reflection_text === "string" ? sanitizeDerivedText(raw.reflection_text) : ""; - const usable = Boolean(raw.usable); - const rawAlpha = clamp01(numOrZero(raw.alpha)); - const alpha = usable ? 
rawAlpha : 0; + const alpha = clamp01(numOrZero(raw.alpha)) >= 0.5 ? 1 : 0; + const relevance = raw.relevance === "RELATED" ? "RELATED" : "IRRELEVANT"; const reason = typeof raw.reason === "string" ? sanitizeDerivedText(raw.reason) : null; - - let finalText: string | null; - let source: ReflectionScore["source"]; - if (incomingText.length > 0) { - // Caller already had a reflection; never let the LLM rewrite it - // (the prompt asks for verbatim copy, but we double-enforce here). - finalText = incomingText.slice(0, 1_500); - source = sourceForExisting(input); - } else if (llmText.length > 0 && opts.synthReflections) { - finalText = llmText.slice(0, 1_500); - source = "synth"; - synthAccepted += 1; - } else { - // Either the LLM didn't write one (incoherent step) or synth is - // disabled and we discard whatever it wrote. - return disabledScoreFor(input); - } - return { - text: finalText, + text: relevance, alpha, - usable: usable && finalText !== null, + usable: alpha === 1, reason, - source, + source: "synth", model: rsp.servedBy, }; }); log.debug("batch.scored", { steps: inputs.length, - synthAccepted, model: rsp.servedBy, durationMs: rsp.durationMs, }); - return { scores, model: rsp.servedBy, synthAccepted }; + return { scores, model: rsp.servedBy }; } // ─── helpers ──────────────────────────────────────────────────────────────── @@ -252,28 +178,6 @@ function batchHostContext( return out; } -function disabledScoreFor(input: BatchScoreInput): ReflectionScore { - const text = (input.existingReflection ?? "").trim(); - if (text.length === 0) { - return { text: null, alpha: 0, usable: false, source: "none" }; - } - // We had a reflection but the LLM result was unusable — keep the text, - // attribute α=0.5 the same way `disabledScore` does for non-LLM paths so - // backprop still has a non-zero weight. 
- return { - text: text.slice(0, 1_500), - alpha: 0.5, - usable: true, - source: sourceForExisting(input), - }; -} - -function sourceForExisting(input: BatchScoreInput): ReflectionScore["source"] { - return input.step.rawReflection !== null && input.step.rawReflection.trim().length > 0 - ? "adapter" - : "extracted"; -} - function validateBatchPayload(v: unknown, expected: number): void { const o = v as { scores?: unknown }; if (!o || !Array.isArray(o.scores)) { @@ -299,23 +203,23 @@ function validateBatchPayload(v: unknown, expected: number): void { got: entry.idx, }); } - if (typeof entry.alpha !== "number") { + if (typeof entry.alpha !== "number" || !Number.isFinite(entry.alpha)) { throw new MemosError(ERROR_CODES.LLM_OUTPUT_MALFORMED, "batch reflection: alpha must be number", { idx: entry.idx, got: entry.alpha, }); } - if (typeof entry.usable !== "boolean") { - throw new MemosError(ERROR_CODES.LLM_OUTPUT_MALFORMED, "batch reflection: usable must be boolean", { + if (entry.alpha !== 0 && entry.alpha !== 1) { + throw new MemosError(ERROR_CODES.LLM_OUTPUT_MALFORMED, "batch reflection: alpha must be 0 or 1", { idx: entry.idx, - got: entry.usable, + got: entry.alpha, }); } - if (entry.reflection_text != null && typeof entry.reflection_text !== "string") { + if (entry.relevance !== "RELATED" && entry.relevance !== "IRRELEVANT") { throw new MemosError( ERROR_CODES.LLM_OUTPUT_MALFORMED, - "batch reflection: reflection_text must be string when present", - { idx: entry.idx, got: typeof entry.reflection_text }, + "batch reflection: relevance must be RELATED or IRRELEVANT", + { idx: entry.idx, got: entry.relevance }, ); } } diff --git a/apps/memos-local-plugin/core/capture/capture.ts b/apps/memos-local-plugin/core/capture/capture.ts index 9d52f749e..e4228aacd 100644 --- a/apps/memos-local-plugin/core/capture/capture.ts +++ b/apps/memos-local-plugin/core/capture/capture.ts @@ -18,17 +18,15 @@ import { ERROR_CODES, MemosError } from "../../agent-contract/errors.js"; import 
type { Embedder } from "../embedding/index.js"; import type { LlmClient } from "../llm/index.js"; import { rootLogger } from "../logger/index.js"; +import type { Logger } from "../logger/types.js"; import { ids } from "../id.js"; import type { EpisodeRow, TraceRow, TraceId, EpochMs } from "../types.js"; import type { makeEmbeddingRetryQueueRepo } from "../storage/repos/embedding_retry_queue.js"; import type { makeTracesRepo } from "../storage/repos/traces.js"; import type { EpisodesRepo } from "../session/persistence.js"; -import { disabledScore, scoreReflection } from "./alpha-scorer.js"; import { batchScoreReflections, type BatchScoreInput } from "./batch-scorer.js"; import { embedSteps, type VecPair } from "./embedder.js"; import { normalizeSteps } from "./normalizer.js"; -import { extractReflection } from "./reflection-extractor.js"; -import { synthesizeReflection } from "./reflection-synth.js"; import { extractSteps } from "./step-extractor.js"; import { createSummarizer, type Summarizer } from "./summarizer.js"; import { tagsForStep } from "./tagger.js"; @@ -39,9 +37,7 @@ import type { CaptureEventBus, CaptureInput, CaptureResult, - DownstreamStepPreview, NormalizedStep, - ReflectionContext, ReflectionScore, ScoredStep, StepCandidate, @@ -434,47 +430,30 @@ export function createCaptureRunner(deps: CaptureDeps): CaptureRunner { return result; } - // Batch reflection + α across every step of the now-closed - // episode. Falls back to per-step scoring when over the threshold - // or when batching fails / no LLM is wired. The reflect pass uses - // `reflectLlm` (skill-evolver model when configured) for higher - // quality reflections; per-turn lite capture still uses `llm`. + // Episode-level binary reflection/alpha scoring with fixed windows. + // Per-step scoring is removed; all failures degrade through the + // window retry ladder and finally default to RELATED_DEFAULT. const reflectStart = now(); const rLlm = deps.reflectLlm ?? 
deps.llm; - const useBatch = shouldBatch(deps.cfg, normalized.length, rLlm !== null); - const contextEnabled = contextModeFor(deps.cfg, useBatch, normalized.length); - const taskSummary = contextEnabled.includeTask - ? buildTaskReflectionSummary(input.episode, normalized, deps.cfg.taskContextMaxChars) - : null; - const downstreamByStep = contextEnabled.includeDownstream - ? buildDownstreamStepPreviews(normalized, deps.cfg) - : normalized.map(() => []); + const taskSummary = buildTaskReflectionSummary(input.episode, normalized, deps.cfg.taskContextMaxChars); log.info("capture.reflect.scoring.start", { episodeId: input.episode.id, sessionId: input.episode.sessionId, steps: normalized.length, - mode: useBatch ? "batch" : contextEnabled.includeDownstream ? "per_step_downstream" : "per_step", - reflectionContextMode: deps.cfg.reflectionContextMode, - downstreamPreview: contextEnabled.includeDownstream, + mode: "batch_windowed_binary", provider: rLlm?.provider ?? "none", model: rLlm?.model ?? "none", taskSummary: taskSummary ? taskSummary.slice(0, 240) : null, }); - let scored: ScoredStep[] = []; - if (useBatch) { - scored = await runBatchScoring(normalized, rLlm!, deps, warnings, llmCalls, input.episode.id, taskSummary); - } - if (!useBatch || scored.length === 0) { - scored = await runPerStepScoring( - normalized, - rLlm, - deps, - warnings, - llmCalls, - input.episode.id, - buildReflectionContexts(normalized, taskSummary, downstreamByStep), - ); - } + const scored = await runEpisodeBatchScoring( + normalized, + rLlm, + warnings, + llmCalls, + input.episode.id, + taskSummary, + log, + ); const reflectMs = now() - reflectStart; // Patch each existing trace with the freshly-computed reflection + @@ -1017,191 +996,183 @@ export function createCaptureRunner(deps: CaptureDeps): CaptureRunner { // ─── helpers ──────────────────────────────────────────────────────────────── -/** - * Decide whether to use the batched reflection+α path. - * - * `per_step` → never (legacy path). 
- * `per_episode` → always, when an LLM is available. - * `auto` → batch when step count fits inside `batchThreshold`. - */ -function shouldBatch(cfg: CaptureConfig, stepCount: number, hasLlm: boolean): boolean { - if (!hasLlm) return false; - if (stepCount === 0) return false; - if (cfg.batchMode === "per_step") return false; - if (cfg.batchMode === "per_episode") return true; - // "auto" - return stepCount <= cfg.batchThreshold; -} - -function contextModeFor( - cfg: CaptureConfig, - useBatch: boolean, - stepCount: number, -): { includeTask: boolean; includeDownstream: boolean } { - const mode = cfg.reflectionContextMode; - const includeTask = mode === "task" || mode === "task_downstream"; - const wantsDownstream = mode === "downstream" || mode === "task_downstream"; - const longPerStep = !useBatch && stepCount > cfg.batchThreshold; - const includeDownstream = - wantsDownstream && - cfg.longEpisodeReflectMode === "per_step_downstream" && - cfg.downstreamStepCount > 0 && - cfg.downstreamContextMaxChars > 0 && - longPerStep; - return { includeTask, includeDownstream }; -} - -function buildReflectionContexts( - steps: readonly NormalizedStep[], - taskSummary: string | null, - downstreamByStep: readonly DownstreamStepPreview[][], -): ReflectionContext[] { - return steps.map((_, idx) => ({ - taskSummary, - downstream: downstreamByStep[idx] ?? 
[], - })); -} - -async function runBatchScoring( +async function runEpisodeBatchScoring( normalized: NormalizedStep[], - llm: LlmClient, - deps: CaptureDeps, + llm: LlmClient | null, warnings: CaptureResult["warnings"], - llmCalls: { reflectionSynth: number; alphaScoring: number; batchedReflection: number }, + llmCalls: { batchedReflection: number }, episodeId: string, taskSummary: string | null, + log: Logger, ): Promise { - const inputs: BatchScoreInput[] = normalized.map((step) => ({ - step, - existingReflection: extractReflection(step), - })); - - try { - const out = await batchScoreReflections(llm, inputs, { - synthReflections: deps.cfg.synthReflections, - episodeId, - phase: "reflect", - taskSummary, - }); - llmCalls.batchedReflection += 1; - return normalized.map((step, i) => ({ + const fallbackAllOne = (): ScoredStep[] => + normalized.map((step) => ({ ...step, - reflection: out.scores[i] ?? disabledScore(null, "none"), + reflection: { + text: "RELATED_DEFAULT", + alpha: 1, + usable: true, + source: "none", + reason: "FALLBACK_ALL_ONE", + }, })); - } catch (err) { - // Single failure mode: the batched call (or its validator) threw. - // Fall back to per-step in the caller. We surface a warning so the - // viewer can show "batch path degraded" without crashing capture. + + if (!llm) { warnings.push({ stage: "batch", - message: "batched reflection scoring failed; falling back to per-step", - detail: errDetail(err), + message: "no reflect llm; using episode-wide RELATED_DEFAULT fallback", + }); + log.warn("reflection_fallback_all_one", { + degraded: true, + episodeId, + stepsCount: normalized.length, + failedWindows: normalized.length > 0 ? 
1 : 0, + reason: "no_llm", }); - return []; + return fallbackAllOne(); } -} -async function runPerStepScoring( - normalized: NormalizedStep[], - llm: LlmClient | null, - deps: CaptureDeps, - warnings: CaptureResult["warnings"], - llmCalls: { reflectionSynth: number; alphaScoring: number }, - episodeId: string, - contexts: ReflectionContext[], -): Promise { - const concurrency = Math.max(1, deps.cfg.llmConcurrency); - return runConcurrently(normalized, concurrency, async (step, idx): Promise => { - const context = contexts[idx] ?? {}; - const { score, synthCount } = await resolveReflection(step, llm, deps, warnings, episodeId, context); - llmCalls.reflectionSynth += synthCount; - const finalScore = await resolveAlpha(step, score, llm, deps, warnings, episodeId, context); - if (finalScore !== score) llmCalls.alphaScoring += 1; - return { ...step, reflection: finalScore }; + const primary = await runWindowPass({ + normalized, + llm, + episodeId, + taskSummary, + windowSize: 20, + overlap: 3, + maxRetries: 1, + warnings, + llmCalls, + }); + if (primary.success) return mergeWindowScores(normalized, primary.results); + + warnings.push({ + stage: "batch", + message: "primary window pass failed; degrading to smaller windows", + detail: { windowSize: 9, overlap: 3 }, + }); + + const degraded = await runWindowPass({ + normalized, + llm, + episodeId, + taskSummary, + windowSize: 9, + overlap: 3, + maxRetries: 2, + warnings, + llmCalls, }); + if (degraded.success) return mergeWindowScores(normalized, degraded.results); + + log.error("reflection_fallback_all_one", { + degraded: true, + episodeId, + stepsCount: normalized.length, + failedWindows: degraded.failedWindows, + }); + warnings.push({ + stage: "batch", + message: "all window retries exhausted; force RELATED_DEFAULT for episode", + detail: { failedWindows: degraded.failedWindows }, + }); + return fallbackAllOne(); } -async function resolveReflection( - step: NormalizedStep, - llm: LlmClient | null, - deps: CaptureDeps, - 
warnings: CaptureResult["warnings"], - episodeId: string, - context: ReflectionContext, -): Promise<{ score: ReflectionScore; synthCount: number }> { - const adapterProvided = step.rawReflection !== null && step.rawReflection.trim().length > 0; - const extracted = extractReflection(step); - if (extracted) { - return { - score: disabledScore(extracted, adapterProvided ? "adapter" : "extracted"), - synthCount: 0, - }; - } - if (!deps.cfg.synthReflections || !llm) { - return { score: disabledScore(null, "none"), synthCount: 0 }; - } - try { - const synth = await synthesizeReflection(llm, step, { - episodeId, - phase: "reflect", - taskSummary: context.taskSummary, - downstream: context.downstream, - outcomeMaxChars: deps.cfg.synthOutcomeMaxChars, - }); - if (synth.text) { - return { - score: { text: synth.text, alpha: null, usable: true, source: "synth", model: synth.model }, - synthCount: 1, - }; +async function runWindowPass(args: { + normalized: NormalizedStep[]; + llm: LlmClient; + episodeId: string; + taskSummary: string | null; + windowSize: number; + overlap: number; + maxRetries: number; + warnings: CaptureResult["warnings"]; + llmCalls: { batchedReflection: number }; +}): Promise<{ success: boolean; results: Map; failedWindows: number }> { + const windows = buildWindows(args.normalized.length, args.windowSize, args.overlap); + const results = new Map(); + let failedWindows = 0; + for (const win of windows) { + let ok = false; + for (let attempt = 0; attempt <= args.maxRetries; attempt++) { + try { + const inputs: BatchScoreInput[] = args.normalized + .slice(win.start, win.end) + .map((step) => ({ step })); + const out = await batchScoreReflections(args.llm, inputs, { + episodeId: args.episodeId, + phase: "reflect", + taskSummary: args.taskSummary, + }); + args.llmCalls.batchedReflection += 1; + results.set(win.start, out.scores); + ok = true; + break; + } catch (err) { + if (attempt === args.maxRetries) { + args.warnings.push({ + stage: "batch", + message: 
"window batch scoring failed", + detail: { ...errDetail(err), windowStart: win.start, windowEnd: win.end, attempts: attempt + 1 }, + }); + } + } } - return { score: disabledScore(null, "none"), synthCount: 1 }; - } catch (err) { - warnings.push({ - stage: "reflection.synth", - message: "synth failed", - detail: errDetail(err), - }); - return { score: disabledScore(null, "none"), synthCount: 0 }; + if (!ok) failedWindows += 1; } + return { success: failedWindows === 0, results, failedWindows }; } -async function resolveAlpha( - step: NormalizedStep, - current: ReflectionScore, - llm: LlmClient | null, - deps: CaptureDeps, - warnings: CaptureResult["warnings"], - episodeId: string, - context: ReflectionContext, -): Promise { - if (!current.text) return current; // nothing to grade - if (!deps.cfg.alphaScoring || !llm) return current; - - try { - const scored = await scoreReflection(llm, { - step, - reflectionText: current.text, - episodeId, - phase: "reflect", - taskSummary: context.taskSummary, - downstream: context.downstream, - outcomeMaxChars: deps.cfg.synthOutcomeMaxChars, - }); +function mergeWindowScores( + normalized: NormalizedStep[], + windowScores: Map, +): ScoredStep[] { + const merged = new Map(); + const starts = [...windowScores.keys()].sort((a, b) => a - b); + for (const start of starts) { + const scores = windowScores.get(start) ?? []; + for (let i = 0; i < scores.length; i++) { + const idx = start + i; + const next = scores[i]; + if (!next) continue; + const prev = merged.get(idx); + if (!prev) { + merged.set(idx, next); + continue; + } + const prevAlpha = prev.alpha === 1 ? 1 : 0; + const nextAlpha = next.alpha === 1 ? 
1 : 0; + if (nextAlpha > prevAlpha) merged.set(idx, next); + } + } + return normalized.map((step, idx) => { + const score = merged.get(idx); + if (score) return { ...step, reflection: score }; return { - ...current, - alpha: scored.alpha, - usable: scored.usable, - reason: scored.reason, - model: scored.model, + ...step, + reflection: { + text: "RELATED_DEFAULT", + alpha: 1, + usable: true, + source: "none", + reason: "MISSING_WINDOW_DEFAULT", + }, }; - } catch (err) { - warnings.push({ - stage: "alpha", - message: "alpha scoring failed; keeping neutral α", - detail: errDetail(err), - }); - return current; + }); +} + +function buildWindows(length: number, windowSize: number, overlap: number): Array<{ start: number; end: number }> { + if (length <= 0) return []; + const out: Array<{ start: number; end: number }> = []; + const stride = Math.max(1, windowSize - overlap); + let start = 0; + while (start < length) { + const end = Math.min(length, start + windowSize); + out.push({ start, end }); + if (end >= length) break; + start += stride; } + return out; } async function runConcurrently( @@ -1263,93 +1234,6 @@ function buildTaskReflectionSummary( return summary ? 
clipForPrompt(summary, maxChars) : null; } -function buildDownstreamStepPreviews( - steps: readonly NormalizedStep[], - cfg: CaptureConfig, -): DownstreamStepPreview[][] { - return steps.map((_, idx) => { - const out: DownstreamStepPreview[] = []; - let usedChars = 0; - const count = Math.max(0, Math.min(3, cfg.downstreamStepCount)); - for (let offset = 1; offset <= count; offset++) { - const step = steps[idx + offset]; - if (!step) break; - const remaining = cfg.downstreamContextMaxChars - usedChars; - if (remaining <= 0) break; - const item = downstreamPreviewForStep( - step, - offset as 1 | 2 | 3, - Math.min(cfg.downstreamPerStepMaxChars, remaining), - ); - usedChars += previewSize(item); - out.push(item); - } - return out; - }); -} - -function downstreamPreviewForStep( - step: NormalizedStep, - offset: 1 | 2 | 3, - maxChars: number, -): DownstreamStepPreview { - const existingReflection = extractReflection(step); - if (step.toolCalls.length > 0) { - return { - offset, - kind: "tooluse", - toolNames: step.toolCalls.map((t) => t.name).filter(Boolean), - toolOutput: clipForPrompt(summarizeToolOutputs(step), maxChars), - reflection: existingReflection ? clipForPrompt(existingReflection, Math.floor(maxChars / 2)) : null, - }; - } - return { - offset, - kind: "text", - text: clipForPrompt(textPreviewForStep(step), maxChars), - }; -} - -function summarizeToolOutputs(step: NormalizedStep): string { - return step.toolCalls - .map((t) => { - const label = t.errorCode ? 
`${t.name} ERROR[${t.errorCode}]` : t.name; - const output = outputOfToolCall(t); - return `${label}: ${output || "(no output)"}`; - }) - .join("\n"); -} - -function outputOfToolCall(t: { output?: unknown }): string { - if (t.output === undefined || t.output === null) return ""; - if (typeof t.output === "string") return t.output; - try { - return JSON.stringify(t.output); - } catch { - return String(t.output); - } -} - -function textPreviewForStep(step: NormalizedStep): string { - const parts = [ - step.userText.trim() ? `state: ${step.userText.trim()}` : "", - step.agentText.trim() ? `action: ${step.agentText.trim()}` : "", - ].filter(Boolean); - return parts.join("\n") || "(empty)"; -} - -function previewSize(item: DownstreamStepPreview): number { - return [ - item.kind, - item.text, - item.toolNames?.join(", "), - item.toolOutput, - item.reflection, - ] - .filter(Boolean) - .join("\n").length; -} - function stringMeta(meta: Record, key: string): string | undefined { const value = meta[key]; return typeof value === "string" && value.trim() ? 
value.trim() : undefined; diff --git a/apps/memos-local-plugin/core/capture/index.ts b/apps/memos-local-plugin/core/capture/index.ts index 344952747..4667e1ccd 100644 --- a/apps/memos-local-plugin/core/capture/index.ts +++ b/apps/memos-local-plugin/core/capture/index.ts @@ -13,9 +13,6 @@ export { export { createCaptureEventBus } from "./events.js"; export { extractSteps } from "./step-extractor.js"; export { normalizeSteps } from "./normalizer.js"; -export { extractReflection } from "./reflection-extractor.js"; -export { synthesizeReflection } from "./reflection-synth.js"; -export { scoreReflection, disabledScore } from "./alpha-scorer.js"; export { batchScoreReflections, type BatchScoreInput, @@ -24,6 +21,7 @@ export { BATCH_OP_TAG as CAPTURE_BATCH_OP_TAG, } from "./batch-scorer.js"; export { embedSteps } from "./embedder.js"; +export { REFLECTION_ENUM_LABELS, reflectionAsText } from "./types.js"; export type { CaptureConfig, CaptureEvent, diff --git a/apps/memos-local-plugin/core/capture/reflection-extractor.ts b/apps/memos-local-plugin/core/capture/reflection-extractor.ts deleted file mode 100644 index e3f3037cd..000000000 --- a/apps/memos-local-plugin/core/capture/reflection-extractor.ts +++ /dev/null @@ -1,59 +0,0 @@ -/** - * `reflection-extractor` — try to lift a self-reflection out of the - * assistant text for free (no LLM required). - * - * The V7 spec defines a reflection as "the agent's own explanation of - * why it made this decision". Hosts sometimes emit this inline: - * - An OpenClaw assistant block containing `### Reasoning:` or - * `I chose this because …`. - * - A Hermes `` tag (legacy). - * - A Chinese-language agent producing "我这样做是因为…" or "思考过程:". - * - * We recognise a handful of high-precision patterns and return the cleaned - * snippet. Never throws, never invokes an LLM. - * - * If the step already has `rawReflection` set (from adapter-provided meta), - * that wins unchanged. 
- */ - -import type { NormalizedStep } from "./types.js"; - -const INLINE_PATTERNS: RegExp[] = [ - // Markdown heading-style reasoning blocks. - /^###?\s*(reasoning|rationale|why|思考(?:过程|过程如下)?|我的理由)[::]?\s*\n([\s\S]+?)(?=\n(?:###?\s|$))/im, - - // "..." legacy tags. - /\s*([\s\S]+?)\s*<\/reflection>/i, - - // English inline phrase "Reflection: ..." / "Reasoning: ..." - /\b(reflection|reasoning|rationale)\s*[::]\s*([\s\S]{20,})/i, - - // Chinese phrases. - /(我(?:这么|这样)做的?(?:原因|理由)[是::]?)\s*([\s\S]{10,})/m, - /(思考(?:过程|过程如下))\s*[::]?\s*([\s\S]{10,})/m, -]; - -/** - * Extract a reflection from the step. Prefers the adapter-provided value; - * falls back to parsing `agentText`. Returns `null` when no signal found. - */ -export function extractReflection(step: NormalizedStep): string | null { - if (step.rawReflection && step.rawReflection.trim().length > 0) { - return step.rawReflection.trim(); - } - const text = step.agentText ?? ""; - if (text.length === 0) return null; - - for (const pat of INLINE_PATTERNS) { - const m = pat.exec(text); - if (m) { - // The actual body is the last capturing group. - const body = (m[m.length - 1] ?? "").trim(); - if (body.length >= 10) { - // Cap so a misbehaving pattern can't swallow the whole message. - return body.slice(0, 1_500); - } - } - } - return null; -} diff --git a/apps/memos-local-plugin/core/capture/reflection-synth.ts b/apps/memos-local-plugin/core/capture/reflection-synth.ts deleted file mode 100644 index bc076a320..000000000 --- a/apps/memos-local-plugin/core/capture/reflection-synth.ts +++ /dev/null @@ -1,158 +0,0 @@ -/** - * `reflection-synth` — optionally ask the LLM to WRITE a reflection when - * the agent turn contained none. Off by default (costly). - * - * This is strictly a fallback path; the extractor runs first. - * - * The prompt is deliberately minimal — we don't want the LLM to grade or - * judge (that's `alpha-scorer`), just to produce a first-person - * "here's what I was trying to do" summary. 
The α scorer gets the next - * crack and can still mark it unusable. - */ - -import { MemosError } from "../../agent-contract/errors.js"; -import type { LlmClient } from "../llm/index.js"; -import { rootLogger } from "../logger/index.js"; -import { sanitizeDerivedText } from "../safety/content.js"; -import type { NormalizedStep, ReflectionContext } from "./types.js"; - -const SYSTEM = `You are reviewing a single step of an AI agent's decision. - -Write a first-person reflection from the agent's perspective explaining WHY -it produced this response / tool calls given the user input. Keep it to -2–4 sentences, concrete, avoid repeating the visible action. - -If the step is empty or incoherent, return exactly: NO_REFLECTION`; - -export interface SynthesizedReflection { - text: string | null; - model: string; -} - -export interface ReflectionSynthContext extends ReflectionContext { - episodeId?: string; - phase?: string; - outcomeMaxChars?: number; -} - -export async function synthesizeReflection( - llm: LlmClient, - step: NormalizedStep, - context?: ReflectionSynthContext, -): Promise { - const log = rootLogger.child({ channel: "core.capture.reflection" }); - - const thinking = (step.agentThinking ?? "").trim(); - const userPayload = [ - `TASK CONTEXT:`, - context?.taskSummary?.trim().slice(0, 1_200) || "(none)", - ``, - `USER/OBSERVATION:`, - step.userText.slice(0, 1_200) || "(none)", - ``, - `THINKING (model's native chain-of-thought, if any):`, - thinking ? thinking.slice(0, 1_500) : "(none)", - ``, - `AGENT ACTION:`, - step.agentText.slice(0, 1_500) || "(none)", - step.toolCalls.length > 0 - ? `\nTOOL CALLS:\n${step.toolCalls - .map((t) => - t.errorCode - ? `- ${t.name}(${safeStringify(t.input).slice(0, 400)}) → ERROR[${t.errorCode}]` - : `- ${t.name}(${safeStringify(t.input).slice(0, 400)})`, - ) - .join("\n")}` - : "", - ``, - `OUTCOME:`, - lastToolOutcome(step, context?.outcomeMaxChars ?? 
600), - ``, - `DOWNSTREAM STEP PREVIEW:`, - formatDownstreamPreview(context), - ] - .filter(Boolean) - .join("\n"); - - try { - const rsp = await llm.complete( - [ - { role: "system", content: SYSTEM }, - { role: "user", content: userPayload }, - ], - { - op: "capture.reflection.synth", - episodeId: context?.episodeId, - phase: context?.phase, - temperature: 0.1, - }, - ); - const raw = sanitizeDerivedText(rsp.text); - if (raw === "" || raw === "NO_REFLECTION") { - log.debug("synth.no_reflection", { key: step.key }); - return { text: null, model: rsp.servedBy }; - } - return { text: raw.slice(0, 1_500), model: rsp.servedBy }; - } catch (err) { - log.warn("synth.failed", { key: step.key, err: errDetail(err) }); - return { text: null, model: "none" }; - } -} - -function errDetail(err: unknown): Record { - if (err instanceof MemosError) return { code: err.code, message: err.message }; - if (err instanceof Error) return { name: err.name, message: err.message }; - return { value: String(err) }; -} - -function safeStringify(v: unknown): string { - if (v === undefined || v === null) return ""; - if (typeof v === "string") return v; - try { - return JSON.stringify(v); - } catch { - return String(v); - } -} - -function lastToolOutcome(step: NormalizedStep, maxChars: number): string { - const last = step.toolCalls[step.toolCalls.length - 1]; - if (!last) return "(assistant-only step)"; - return (last.errorCode ? `ERROR[${last.errorCode}] ` : "") + truncate(outputOf(last), maxChars); -} - -function outputOf(t: { output?: unknown }): string { - if (t.output === undefined || t.output === null) return ""; - if (typeof t.output === "string") return t.output; - try { - return JSON.stringify(t.output); - } catch { - return String(t.output); - } -} - -function truncate(s: string, n: number): string { - return s.length > n ? s.slice(0, n) + "..." : s; -} - -function formatDownstreamPreview(context?: ReflectionSynthContext): string { - const preview = context?.downstream ?? 
[]; - if (preview.length === 0) return "(none)"; - return preview - .map((item) => { - const label = `step+${item.offset}`; - if (item.kind === "tooluse") { - const lines = [ - `[${label}] type=tooluse`, - `tool_names: ${item.toolNames?.join(", ") || "(unknown)"}`, - `tool_output: ${item.toolOutput?.trim() || "(none)"}`, - ]; - if (item.reflection?.trim()) { - lines.push(`existing_reflection: ${item.reflection.trim()}`); - } - return lines.join("\n"); - } - return [`[${label}] type=text`, item.text?.trim() || "(empty)"].join("\n"); - }) - .join("\n\n"); -} diff --git a/apps/memos-local-plugin/core/capture/types.ts b/apps/memos-local-plugin/core/capture/types.ts index efdc637f2..7c115ed08 100644 --- a/apps/memos-local-plugin/core/capture/types.ts +++ b/apps/memos-local-plugin/core/capture/types.ts @@ -2,8 +2,8 @@ * Internal DTOs for `core/capture`. * * These are the stage-to-stage contracts between: - * step-extractor → normalizer → reflection-extractor → (reflection-synth?) - * → alpha-scorer → embedder → traces repo + * step-extractor → normalizer → batch-scorer (windowed binary) → + * embedder → traces repo * * Not exported through the plugin's public surface (adapters don't care). * Exposed to Phase 15 via the pipeline event bus as `CaptureResult` so the @@ -63,6 +63,30 @@ export interface NormalizedStep extends StepCandidate { // ─── Stage 3: with a scored reflection ────────────────────────────────────── +/** + * Fixed-enum values written into `traces.reflection` by the windowed binary + * reflection pipeline. Anything outside this set is legacy natural-language + * reflection text from before the 2026-05 redesign. 
+ */ +export const REFLECTION_ENUM_LABELS = new Set([ + "RELATED", + "IRRELEVANT", + "RELATED_DEFAULT", +]); + +/** + * Return the reflection value only when it carries free-form natural-language + * signal — the three fixed labels are converted to `null` so downstream + * consumers don't feed `RELATED_DEFAULT` (or similar) into LLM prompts, + * keyword blobs, or error-signature heuristics. + */ +export function reflectionAsText(value: string | null | undefined): string | null { + if (!value) return null; + const trimmed = value.trim(); + if (!trimmed) return null; + return REFLECTION_ENUM_LABELS.has(trimmed) ? null : value; +} + export interface ReflectionScore { /** The final reflection text (may differ from `rawReflection` if synthed). */ text: string | null; @@ -193,27 +217,9 @@ export interface CaptureConfig { alphaScoring: boolean; synthReflections: boolean; llmConcurrency: number; - /** - * V7 §3.2 batched variant. Controls when reflection synthesis + α scoring - * collapse into ONE LLM call per episode instead of N per-step calls. - * - * - "per_step" — legacy path; one synth/α call per step. Predictable - * prompt size, slow & costly on long episodes. - * - "per_episode" — always batch the entire episode into one call. - * Long episodes risk overflowing the model context. - * - "auto" — batch when `stepCount ≤ batchThreshold`; otherwise - * fall back to per-step. Recommended default. - * - * Either way, `R_human` (the terminal reward) is computed independently - * by `core/reward` after user feedback arrives — batching only affects - * capture-stage LLM usage. - */ - batchMode: "per_step" | "per_episode" | "auto"; - /** - * In `batchMode: "auto"`, episodes with strictly more than this many - * normalized steps fall back to the per-step path. Acts as a guard - * against prompt-window overflow on very long agent traces. - */ + /** Reflection mode. "windowed" enforces fixed-size episode windows only. 
*/ + batchMode: "windowed"; + /** Retained for backward config compatibility; ignored by windowed mode. */ batchThreshold: number; /** * Controls which extra context is included in per-step reflection and α diff --git a/apps/memos-local-plugin/core/config/defaults.ts b/apps/memos-local-plugin/core/config/defaults.ts index 3d59646b1..439f2b175 100644 --- a/apps/memos-local-plugin/core/config/defaults.ts +++ b/apps/memos-local-plugin/core/config/defaults.ts @@ -61,42 +61,20 @@ export const DEFAULT_CONFIG: ResolvedConfig = { maxTextChars: 4_000, maxToolOutputChars: 2_000, embedTraces: true, + // alphaScoring / synthReflections are no-ops under the windowed + // binary pipeline (kept for backward config compatibility). alphaScoring: true, - // OpenClaw's tool messages don't include explicit "reflection" - // blocks; without synthesis the alpha scorer sees an empty - // reflection and forces α = 0 (see `core/capture/alpha-scorer.ts` - // line 97). That makes reflection-weighted backprop degenerate - // into pure γ-discount and produces flat V distributions — - // L2 association + skill crystallization both starve. Enable - // synth by default so even turns without explicit reflections - // still contribute useful α values. synthReflections: true, llmConcurrency: 4, - // V7 §3.2 batched variant. With "auto" we issue a single LLM call - // per episode for both reflection synth and α scoring as long as - // the episode is short enough — this collapses 2N per-step calls - // (N synth + N α) into 1 batched call. Long episodes (>12 steps) - // automatically fall back to the per-step path so the prompt - // never overflows the model's context window. R_human + backprop - // remain task-end events handled by `core/reward`, unchanged. - batchMode: "auto", + // Windowed binary reflection mode is the only supported value. + // Fixed strategy: primary window 20 (overlap 3), degrade to 9 + // (overlap 3), then episode-level fallback writes + // RELATED_DEFAULT + alpha=1. 
See core/capture/capture.ts. + batchMode: "windowed", batchThreshold: 12, - // `reflectionContextMode` controls which extra prompt context blocks - // topic-end reflection receives: - // - "none": no TASK CONTEXT and no DOWNSTREAM STEP PREVIEW - // - "task": inject TASK CONTEXT only - // - "downstream": inject DOWNSTREAM STEP PREVIEW only - // - "task_downstream": inject both blocks - // `longEpisodeReflectMode` controls the fallback used when an episode is - // too long for batch scoring: - // - "per_step_parallel": keep the current parallel per-step path. Each - // step is reflected independently, using only the context blocks - // enabled by `reflectionContextMode` that are available without - // downstream preview. - // - "per_step_downstream": still run per-step work in parallel, but - // prebuild a bounded DOWNSTREAM STEP PREVIEW for each step (step+1 - // through step+N, capped by `downstreamStepCount`) and inject it when - // `reflectionContextMode` includes "downstream". + // reflectionContextMode / longEpisodeReflectMode are retained for + // backward config compatibility but have no effect on the windowed + // binary reflection pipeline. reflectionContextMode: "task_downstream", longEpisodeReflectMode: "per_step_downstream", downstreamStepCount: 3, diff --git a/apps/memos-local-plugin/core/config/schema.ts b/apps/memos-local-plugin/core/config/schema.ts index 8d485a36f..de2871c99 100644 --- a/apps/memos-local-plugin/core/config/schema.ts +++ b/apps/memos-local-plugin/core/config/schema.ts @@ -114,27 +114,11 @@ const AlgorithmSchema = Type.Object({ synthReflections: Bool(false), /** Concurrency for α scoring + synth LLM calls (per_step mode only). */ llmConcurrency: NumberInRange(4, 1, 32), - /** - * V7 §3.2 batched variant. 
When/how to fold per-step reflection synth + - * α scoring into one episode-level LLM call: - * - "per_step" : legacy path, N per-step LLM calls - * - "per_episode" : always batch - * - "auto" : batch when stepCount ≤ batchThreshold, else per-step - */ - batchMode: Type.Union( - [Type.Literal("per_step"), Type.Literal("per_episode"), Type.Literal("auto")], - { default: "auto" }, - ), - /** - * Step-count cap for "auto" mode. Episodes above this limit fall back - * to per-step calls so the batched prompt cannot overflow context. - */ + /** Windowed-only reflection mode (per-step path removed). */ + batchMode: Type.Literal("windowed", { default: "windowed" }), + /** Retained for backward compatibility; ignored by windowed mode. */ batchThreshold: NumberInRange(12, 1, 64), - /** - * Optional context blocks for per-step reflection and α prompts. - * Defaults to "task" to preserve the current task-summary enrichment; - * downstream preview remains opt-in. - */ + /** Retained for compatibility; no effect in windowed binary mode. */ reflectionContextMode: Type.Union( [ Type.Literal("none"), @@ -144,10 +128,7 @@ const AlgorithmSchema = Type.Object({ ], { default: "task" }, ), - /** - * Long-episode fallback mode after batch auto-threshold is exceeded. - * `per_step_downstream` keeps parallelism but adds step+1..step+3 preview. - */ + /** Retained for compatibility; no effect in windowed binary mode. 
*/ longEpisodeReflectMode: Type.Union( [Type.Literal("per_step_parallel"), Type.Literal("per_step_downstream")], { default: "per_step_parallel" }, diff --git a/apps/memos-local-plugin/core/experience/feedback-builder.ts b/apps/memos-local-plugin/core/experience/feedback-builder.ts index 0eb87812d..77c4b9011 100644 --- a/apps/memos-local-plugin/core/experience/feedback-builder.ts +++ b/apps/memos-local-plugin/core/experience/feedback-builder.ts @@ -11,6 +11,7 @@ import type { import type { Embedder } from "../embedding/types.js"; import type { LlmClient } from "../llm/index.js"; import { classifyFeedback } from "../feedback/classifier.js"; +import { reflectionAsText } from "../capture/types.js"; import { createFeedbackRefiner } from "./feedback-refiner.js"; import { ids } from "../id.js"; import { ownerFromNamespace } from "../runtime/namespace.js"; @@ -690,7 +691,10 @@ function traceHint(trace: TraceRow): string { const parts = [ trace.summary ? `summary=${cleanLine(trace.summary, 140)}` : null, trace.userText ? `user=${cleanLine(trace.userText, 140)}` : null, - trace.reflection ? `note=${cleanLine(trace.reflection, 140)}` : null, + (() => { + const refl = reflectionAsText(trace.reflection); + return refl ? 
`note=${cleanLine(refl, 140)}` : null; + })(), ]; return parts.filter(Boolean).join(" | "); } diff --git a/apps/memos-local-plugin/core/feedback/evidence.ts b/apps/memos-local-plugin/core/feedback/evidence.ts index 4bcaa0368..c4fdcdb6d 100644 --- a/apps/memos-local-plugin/core/feedback/evidence.ts +++ b/apps/memos-local-plugin/core/feedback/evidence.ts @@ -18,6 +18,7 @@ import type { Logger } from "../logger/types.js"; import type { Repos } from "../storage/repos/index.js"; import type { SessionId, TraceRow } from "../types.js"; +import { reflectionAsText } from "../capture/types.js"; import type { FeedbackConfig } from "./types.js"; export interface EvidenceInput { @@ -111,12 +112,12 @@ function partition( function traceContains(trace: TraceRow, needle: string): boolean { const blob = - `${trace.userText}\n${trace.agentText}\n${trace.reflection ?? ""}`.toLowerCase(); + `${trace.userText}\n${trace.agentText}\n${reflectionAsText(trace.reflection) ?? ""}`.toLowerCase(); return blob.includes(needle); } function isFailureLike(trace: TraceRow): boolean { - const blob = `${trace.agentText}\n${trace.reflection ?? ""}`.toLowerCase(); + const blob = `${trace.agentText}\n${reflectionAsText(trace.reflection) ?? 
""}`.toLowerCase(); return ( /(error|failed|failure|exception|traceback|timeout|retry)/.test(blob) || trace.toolCalls.some((call) => Boolean(call.errorCode)) diff --git a/apps/memos-local-plugin/core/feedback/synthesize.ts b/apps/memos-local-plugin/core/feedback/synthesize.ts index e5249878f..1f8443b25 100644 --- a/apps/memos-local-plugin/core/feedback/synthesize.ts +++ b/apps/memos-local-plugin/core/feedback/synthesize.ts @@ -21,6 +21,7 @@ import type { Logger } from "../logger/types.js"; import { DECISION_REPAIR_PROMPT } from "../llm/prompts/decision-repair.js"; import { sanitizeDerivedMarkdown, sanitizeDerivedText } from "../safety/content.js"; import type { PolicyId, PolicyRow, TraceId, TraceRow } from "../types.js"; +import { reflectionAsText } from "../capture/types.js"; import { capTrace } from "./evidence.js"; import type { ClassifiedFeedback, @@ -134,7 +135,10 @@ function packPrompt( `value: ${t.value.toFixed(2)}`, `user: ${capped.userText.trim()}`, `agent: ${capped.agentText.trim()}`, - capped.reflection ? `reflection: ${capped.reflection.trim()}` : "", + (() => { + const r = reflectionAsText(capped.reflection); + return r ? 
`reflection: ${r.trim()}` : ""; + })(), ] .filter(Boolean) .join("\n"); @@ -214,8 +218,8 @@ function templateDraft( const best = input.highValue.slice().sort((a, b) => b.value - a.value)[0]; const worst = input.lowValue.slice().sort((a, b) => a.value - b.value)[0]; const hint = input.classifiedFeedback; - const preferText = sanitizeDerivedMarkdown(hint?.prefer) || sanitizeDerivedMarkdown(firstNonEmpty(best?.reflection, best?.agentText)); - const avoidText = sanitizeDerivedMarkdown(hint?.avoid) || sanitizeDerivedMarkdown(firstNonEmpty(worst?.reflection, worst?.agentText)); + const preferText = sanitizeDerivedMarkdown(hint?.prefer) || sanitizeDerivedMarkdown(firstNonEmpty(reflectionAsText(best?.reflection), best?.agentText)); + const avoidText = sanitizeDerivedMarkdown(hint?.avoid) || sanitizeDerivedMarkdown(firstNonEmpty(reflectionAsText(worst?.reflection), worst?.agentText)); if (!preferText && !avoidText) return null; return { contextHash: input.contextHash, diff --git a/apps/memos-local-plugin/core/index.ts b/apps/memos-local-plugin/core/index.ts index d2b979bce..3be7b231b 100644 --- a/apps/memos-local-plugin/core/index.ts +++ b/apps/memos-local-plugin/core/index.ts @@ -62,7 +62,6 @@ export { BedrockLlmProvider, HostLlmProvider, LocalOnlyLlmProvider, - REFLECTION_SCORE_PROMPT, REWARD_R_HUMAN_PROMPT, L2_INDUCTION_PROMPT, L3_ABSTRACTION_PROMPT, @@ -164,10 +163,6 @@ export { createCaptureEventBus, extractSteps as extractCaptureSteps, normalizeSteps as normalizeCaptureSteps, - extractReflection, - synthesizeReflection, - scoreReflection, - disabledScore as disabledReflectionScore, embedSteps as embedCaptureSteps, type CaptureConfig, type CaptureDeps, diff --git a/apps/memos-local-plugin/core/llm/prompts/index.ts b/apps/memos-local-plugin/core/llm/prompts/index.ts index 504398a92..68ce91271 100644 --- a/apps/memos-local-plugin/core/llm/prompts/index.ts +++ b/apps/memos-local-plugin/core/llm/prompts/index.ts @@ -17,7 +17,7 @@ export interface PromptDef { system: 
string; } -export { REFLECTION_SCORE_PROMPT, BATCH_REFLECTION_PROMPT } from "./reflection.js"; +export { BATCH_REFLECTION_PROMPT } from "./reflection.js"; export { REWARD_R_HUMAN_PROMPT } from "./reward.js"; export { L2_INDUCTION_PROMPT } from "./l2-induction.js"; export { L3_ABSTRACTION_PROMPT } from "./l3-abstraction.js"; diff --git a/apps/memos-local-plugin/core/llm/prompts/reflection.ts b/apps/memos-local-plugin/core/llm/prompts/reflection.ts index 063bf8e85..f83497fbc 100644 --- a/apps/memos-local-plugin/core/llm/prompts/reflection.ts +++ b/apps/memos-local-plugin/core/llm/prompts/reflection.ts @@ -1,86 +1,27 @@ import type { PromptDef } from "./index.js"; /** - * V7 §3.2 — Reflection scorer. + * V7 §3.2 — Windowed binary path-relevance scoring. * - * Given an L1 trace (state, action, outcome, reflection_text), return a - * quality score α_t ∈ [0, 1] and a boolean `usable` flag. The facade parses - * the JSON output; we validate structure at the call site. - */ -export const REFLECTION_SCORE_PROMPT: PromptDef = { - id: "reflection.score", - version: 2, - description: "Score an agent reflection for quality and usability, with full-step context.", - system: `You are a strict reviewer of agent self-reflections. - -You see the FULL context of one agent step: -- STATE — what the agent saw before acting (user prompt, prior observation) -- THINKING — the LLM's own native chain-of-thought for this step, if any - (Claude extended-thinking, pi-ai ThinkingContent). Empty when - the model didn't emit thinking this turn. -- ACTION — what the agent produced (assistant text output) -- TOOL_CALLS — tools the agent invoked this step, with inputs and outputs - (or errors). Tool usage + outcomes are part of the action - chain and carry their own signal about what the agent did. 
-- OUTCOME — the final observable result of the step (last tool outcome - or "(assistant-only step)" for pure text turns) -- REFLECTION — the text being graded: the agent's first-person explanation - of WHY it acted this way and WHAT it learned. - -Score the REFLECTION on four axes, combined into ONE number α ∈ [0, 1]: - - 1. faithfulness — does the reflection match what ACTUALLY happened - across THINKING + ACTION + TOOL_CALLS + OUTCOME? - 2. causal insight — does it identify why the action / tool choice - worked or failed? Bonus when it connects the - model's visible THINKING to the resulting action. - 3. transferability — does it surface a lesson useful on a similar - future task? - 4. concreteness — are the details specific (real command names, - real error messages, real decisions) rather than - generic platitudes like "I should do better"? - -Rules: -- THINKING and TOOL_CALLS are first-class evidence for grading α — - a reflection that ignores a visible thinking chain or misreports a - tool call should score LOW on faithfulness. -- TOOL_CALLS that errored are strong signal: the reflection should - name the error and what it implied. Missing that is a faithfulness - penalty. -- An empty / purely-tautological reflection → α = 0, usable = false. -- α ≥ 0.4 AND reflection non-tautological → usable = true; else false. - -Return JSON: -{ - "alpha": 0.0-1.0, - "usable": true | false, - "reason": "one-sentence justification" -}`, -}; - -/** - * V7 §3.2 — *Batched* reflection synthesis + α scoring. + * One LLM call per episode window. The LLM sees the full causal chain of + * the window in order and returns a binary `alpha ∈ {0, 1}` plus a fixed + * `RELATED | IRRELEVANT` label per step. There is no natural-language + * reflection synthesis: `traces.reflection` is overwritten by the label + * (or `RELATED_DEFAULT` when the windowed pipeline falls back to its + * episode-wide safe default). * - * One LLM call per episode instead of N synth + N α calls. 
The LLM sees the - * complete causal chain (every step in order, including the final outcome), - * which lets it write better-grounded reflections than per-step grounded - * ones — V7 §3.2.3 axes "causal_insight" and "transferability" benefit - * directly from the wider context window. - * - * Activated by `algorithm.capture.batchMode: "auto" | "per_episode"` in - * `core/config`. The dispatcher in `core/capture/capture.ts` also enforces - * `algorithm.capture.batchThreshold` so very long episodes degrade to the - * per-step path instead of overflowing the prompt window. - * - * Output schema is documented inside the prompt — `core/capture/batch-scorer.ts` - * validates each entry and falls back to per-step on any malformed value. + * Window topology and retry ladder are owned by `core/capture/capture.ts` + * (primary `batch=20, overlap=3` → degrade `batch=9, overlap=3` → + * episode-wide `RELATED_DEFAULT` fallback). `core/capture/batch-scorer.ts` + * validates each entry's shape and rejects any `alpha` that is not exactly + * 0 or 1 / `relevance` that is not exactly RELATED|IRRELEVANT. */ export const BATCH_REFLECTION_PROMPT: PromptDef = { id: "reflection.batch", - version: 3, + version: 4, description: - "Score (and optionally synthesize) reflections for an entire episode in one call, with full thinking + tool-call context.", - system: `You are reviewing every step of one AI agent episode in a single pass. + "Binary path-relevance scoring for every step in one episode window.", + system: `You are reviewing a WINDOW of one AI agent episode. INPUT: a JSON array under "steps". Each entry has: - "idx": step index (integer, 0-based, sequential) @@ -94,62 +35,27 @@ INPUT: a JSON array under "steps". Each entry has: first-class evidence for scoring the step. 
- "outcome": the step's final observable outcome (last tool output, error, or "(assistant-only step)" for pure text turns) -- "reflection": the agent's own first-person reflection (may be empty string) -- "synth_allowed": boolean — when true and "reflection" is empty, you SHOULD - write a brand-new 2–3 sentence first-person reflection for that step. When - false, leave "reflection_text" empty for steps that came in with empty - "reflection". +- "task_context": optional episode-level task summary. The user payload may also include "host_context". That describes the host agent being reviewed and the separate reflection model doing this review. -Do NOT project the reflection model's own identity/provider/capabilities onto -the host agent. If hostModel/hostProvider are present, treat them as the -authoritative runtime context unless the episode itself contains a correction. -The user payload may also include "task_context" (string or null). When -non-null and non-empty, it is the **episode-level task summary**: a compact -overview of what this episode was about (e.g. initial user goal, intent -metadata, closing assistant reply, and tools used across the episode). It -applies to **every** step — use it to keep per-step reflections and α scores -aligned with the overall task; it is NOT a substitute for each step's own -"state", "outcome", or "tool_calls". When "task_context" is null or missing, -infer the episode goal only from the "steps" timeline. +Goal: decide whether each step is RELEVANT to the final trajectory. +You must NOT produce long natural-language reflection text. For EACH input step, return one object containing: - "idx": copy the input idx exactly -- "reflection_text": - * If input "reflection" was non-empty → copy it verbatim, do not rewrite. - * If input "reflection" was empty AND "synth_allowed" is true → write a - NEW 2–3 sentence first-person explanation of WHY the agent acted this - way and WHAT it learned. 
Concrete, no judgment, no repeating the - visible action. - * If input "reflection" was empty AND "synth_allowed" is false → return - the empty string "". - * If the step is incoherent or completely empty → return "". -- "alpha": one number in [0, 1] grading the reflection on: - 1. faithfulness — does it describe what actually happened across - thinking + action + tool_calls + outcome? Missing or misnaming - a visible thinking block / tool call / tool error is a penalty. - 2. causal insight — does it identify why the action or tool choice - worked / failed? Bonus when it ties visible thinking to action. - 3. transferability — does it surface a lesson useful on a similar task? - 4. concreteness — are the details specific (real command names, - real error messages) rather than generic platitudes? - When "reflection_text" is empty, return alpha=0. -- "usable": true when alpha ≥ 0.4 AND the reflection is not tautological. - When "reflection_text" is empty, return usable=false. -- "reason": one short sentence justifying the alpha score. - -Knowing the FULL episode timeline (including the final outcome) is permitted -and encouraged — that is the whole point of batched scoring. Reflections -written here may show better causal insight than per-step ones because you -can see how each step contributed to the eventual result. +- "alpha": MUST be integer 0 or 1 only + * 1 => this step is effective and downstream steps continue from it + * 0 => detour / ineffective / irrelevant to trajectory +- "relevance": MUST be one of "RELATED" or "IRRELEVANT" +- "reason": short code-like reason, <= 8 words (e.g. 
"ON_PATH", "DETOUR") Return JSON of the form: { "scores": [ - {"idx": 0, "reflection_text": "...", "alpha": 0.7, "usable": true, "reason": "..."}, - {"idx": 1, "reflection_text": "...", "alpha": 0.3, "usable": false, "reason": "..."} + {"idx": 0, "alpha": 1, "relevance": "RELATED", "reason": "ON_PATH"}, + {"idx": 1, "alpha": 0, "relevance": "IRRELEVANT", "reason": "DETOUR"} ] } diff --git a/apps/memos-local-plugin/core/memory/l2/induce.ts b/apps/memos-local-plugin/core/memory/l2/induce.ts index 2ea750cd2..d1835c002 100644 --- a/apps/memos-local-plugin/core/memory/l2/induce.ts +++ b/apps/memos-local-plugin/core/memory/l2/induce.ts @@ -16,6 +16,7 @@ import { languageSteeringLine, } from "../../llm/prompts/index.js"; import { L2_INDUCTION_PROMPT } from "../../llm/prompts/l2-induction.js"; +import { reflectionAsText } from "../../capture/types.js"; import type { LlmClient } from "../../llm/index.js"; import type { Logger } from "../../logger/types.js"; import { sanitizeDerivedMarkdown, sanitizeDerivedMarkdownList, sanitizeDerivedText } from "../../safety/content.js"; @@ -81,7 +82,7 @@ export async function induceDraft( // dominant language of the evidence bucket — Chinese users expect // their own L2 memories in 中文, English users expect English. const evidenceLang = detectDominantLanguage( - input.evidenceTraces.flatMap((t) => [t.userText, t.agentText, t.reflection]), + input.evidenceTraces.flatMap((t) => [t.userText, t.agentText, reflectionAsText(t.reflection)]), ); try { @@ -199,7 +200,7 @@ function packTraces( `user: ${truncate(t.userText, 200)}`, `agent: ${truncate(t.agentText, 300)}`, `tools: ${formatTools(t.toolCalls)}`, - `reflection: ${truncate(t.reflection ?? "-", 300)}`, + `reflection: ${truncate(reflectionAsText(t.reflection) ?? 
"-", 300)}`, `V: ${t.value.toFixed(2)} alpha: ${t.alpha.toFixed(2)}`, ].join("\n"); if (block.length > budget) { diff --git a/apps/memos-local-plugin/core/memory/l2/signature.ts b/apps/memos-local-plugin/core/memory/l2/signature.ts index 7d3135667..7fa729ec3 100644 --- a/apps/memos-local-plugin/core/memory/l2/signature.ts +++ b/apps/memos-local-plugin/core/memory/l2/signature.ts @@ -11,6 +11,7 @@ */ import type { ToolCallDTO } from "../../../agent-contract/dto.js"; +import { reflectionAsText } from "../../capture/types.js"; import type { TraceRow } from "../../types.js"; import type { PatternSignature, SignatureComponents } from "./types.js"; @@ -86,7 +87,7 @@ function firstErrCode(trace: TraceRow): string { return `EXIT_${n}`; } } - const refl = trace.reflection ?? ""; + const refl = reflectionAsText(trace.reflection) ?? ""; const m2 = refl.match(/\b([A-Z][A-Z0-9_]{2,}_[A-Z0-9_]+)\b/); if (m2) return m2[1].slice(0, 48); return MISSING; diff --git a/apps/memos-local-plugin/core/memory/l3/abstract.ts b/apps/memos-local-plugin/core/memory/l3/abstract.ts index 0bde96fb3..666097580 100644 --- a/apps/memos-local-plugin/core/memory/l3/abstract.ts +++ b/apps/memos-local-plugin/core/memory/l3/abstract.ts @@ -12,6 +12,7 @@ import { languageSteeringLine, } from "../../llm/prompts/index.js"; import { L3_ABSTRACTION_PROMPT } from "../../llm/prompts/l3-abstraction.js"; +import { reflectionAsText } from "../../capture/types.js"; import type { LlmClient } from "../../llm/index.js"; import type { Logger } from "../../logger/types.js"; import { sanitizeDerivedMarkdown, sanitizeDerivedText } from "../../safety/content.js"; @@ -84,7 +85,7 @@ export async function abstractDraft( langSamples.push(p.title, p.trigger, p.procedure, p.boundary, p.verification); } for (const traces of input.evidenceByPolicy.values()) { - for (const t of traces) langSamples.push(t.userText, t.agentText, t.reflection); + for (const t of traces) langSamples.push(t.userText, t.agentText, 
reflectionAsText(t.reflection)); } const evidenceLang = detectDominantLanguage(langSamples); @@ -248,7 +249,7 @@ function packPolicy( ` tags: ${(t.tags ?? []).join(",") || "-"}`, ` user: ${truncate(t.userText, 160)}`, ` agent: ${truncate(t.agentText, 240)}`, - ` reflection: ${truncate(t.reflection ?? "-", 200)}`, + ` reflection: ${truncate(reflectionAsText(t.reflection) ?? "-", 200)}`, ].join("\n"), cfg.traceCharCap, ), diff --git a/apps/memos-local-plugin/core/pipeline/memory-core.ts b/apps/memos-local-plugin/core/pipeline/memory-core.ts index 03b2386f4..1a1a5290f 100644 --- a/apps/memos-local-plugin/core/pipeline/memory-core.ts +++ b/apps/memos-local-plugin/core/pipeline/memory-core.ts @@ -74,6 +74,7 @@ import type { } from "../types.js"; import type { ResolvedConfig, ResolvedHome } from "../config/index.js"; import { loadConfig, resolveHome, SECRET_FIELD_PATHS } from "../config/index.js"; +import { reflectionAsText } from "../capture/types.js"; import { feedbackText, runFeedbackExperience } from "../experience/feedback-builder.js"; import { isRepairCandidatePolicy, mintRepairCandidate } from "../skill/repair-candidate.js"; import { rootLogger } from "../logger/index.js"; @@ -934,7 +935,7 @@ export function createMemoryCore( const details = r.traces.map((tc) => ({ role: inferTurnRole(tc), action: phase === "lite" ? ("stored" as const) : ("reflected" as const), - summary: tc.reflection?.text ?? null, + summary: reflectionAsText(tc.reflection?.text ?? 
null), content: ( tc.userText || tc.agentText || @@ -5554,7 +5555,7 @@ export function deriveSkillStatus( if (relatedPolicies.length === 0) { return { status: "not_generated", - reason: "暂未归纳出 L2 经验", + reason: "L2 经验归纳尚未产出(可能仍在异步处理中)", reasonKey: "tasks.skillReason.not_generated.noPolicy", reasonParams: thresholds, linkedSkillId: null, diff --git a/apps/memos-local-plugin/core/retrieval/injector.ts b/apps/memos-local-plugin/core/retrieval/injector.ts index 610326372..cd1fdff8b 100644 --- a/apps/memos-local-plugin/core/retrieval/injector.ts +++ b/apps/memos-local-plugin/core/retrieval/injector.ts @@ -20,6 +20,7 @@ import type { SessionId, } from "../../agent-contract/dto.js"; import { ids } from "../id.js"; +import { reflectionAsText } from "../capture/types.js"; import type { CollectedGuidance } from "./decision-guidance.js"; import type { RankedCandidate } from "./ranker.js"; import type { @@ -266,7 +267,10 @@ function renderTrace(c: TraceCandidate): InjectionSnippet { if (summaryLine) parts.push(summaryLine); if (c.userText) parts.push(`[user] ${c.userText}`); if (c.agentText) parts.push(`[assistant] ${c.agentText}`); - if (c.reflection) parts.push(`[note] ${c.reflection}`); + { + const refl = reflectionAsText(c.reflection); + if (refl) parts.push(`[note] ${refl}`); + } const body = withToolFollowUp( truncate(parts.join("\n")), `→ call \`memos_get(id="${c.refId}", kind="trace")\` for the full turn`, diff --git a/apps/memos-local-plugin/core/retrieval/llm-filter.ts b/apps/memos-local-plugin/core/retrieval/llm-filter.ts index 15868fb68..10a73cd6d 100644 --- a/apps/memos-local-plugin/core/retrieval/llm-filter.ts +++ b/apps/memos-local-plugin/core/retrieval/llm-filter.ts @@ -23,6 +23,7 @@ import type { LlmClient } from "../llm/index.js"; import type { Logger } from "../logger/types.js"; import { RETRIEVAL_FILTER_PROMPT } from "../llm/prompts/index.js"; +import { reflectionAsText } from "../capture/types.js"; import type { RankedCandidate } from "./ranker.js"; import 
type { RetrievalConfig } from "./types.js"; @@ -313,8 +314,10 @@ function describeCandidate(r: RankedCandidate, bodyChars: number): string { if (tr.userText?.trim()) parts.push(`[user] ${tr.userText.trim()}`); if (tr.agentText?.trim()) parts.push(`[assistant] ${tr.agentText.trim()}`); - if (tr.reflection?.trim()) - parts.push(`[note] ${tr.reflection.trim()}`); + { + const refl = reflectionAsText(tr.reflection)?.trim(); + if (refl) parts.push(`[note] ${refl}`); + } const body = squashBody(parts.join(" "), bodyChars); return `[TRACE] ${body}`; } diff --git a/apps/memos-local-plugin/core/retrieval/tier2-trace.ts b/apps/memos-local-plugin/core/retrieval/tier2-trace.ts index 1f902f54c..a1cf91d62 100644 --- a/apps/memos-local-plugin/core/retrieval/tier2-trace.ts +++ b/apps/memos-local-plugin/core/retrieval/tier2-trace.ts @@ -24,6 +24,7 @@ * "single-channel false positive" hole that pure-cosine retrieval has. */ +import { reflectionAsText } from "../capture/types.js"; import { rootLogger } from "../logger/index.js"; import { priorityFor } from "../reward/backprop.js"; import type { EmbeddingVector, EpisodeId, SessionId, TraceId } from "../types.js"; @@ -464,7 +465,7 @@ function renderEpisodeSummary(_best: TraceCandidate, members: readonly TraceCand const a = m.agentText?.trim().replace(/\s+/g, " ") ?? ""; if (a) parts.push(`agent: ${a.slice(0, 120)}`); } - const r = m.reflection?.trim() ?? ""; + const r = reflectionAsText(m.reflection)?.trim() ?? 
""; if (r) parts.push(`reflection: ${r.slice(0, 160)}`); return parts.join("\n "); }); diff --git a/apps/memos-local-plugin/core/skill/crystallize.ts b/apps/memos-local-plugin/core/skill/crystallize.ts index fd6a70c67..7b2b04bd4 100644 --- a/apps/memos-local-plugin/core/skill/crystallize.ts +++ b/apps/memos-local-plugin/core/skill/crystallize.ts @@ -16,6 +16,7 @@ import { languageSteeringLine, } from "../llm/prompts/index.js"; import { SKILL_CRYSTALLIZE_PROMPT } from "../llm/prompts/skill-crystallize.js"; +import { reflectionAsText } from "../capture/types.js"; import type { Logger } from "../logger/types.js"; import { sanitizeDerivedList, @@ -108,7 +109,7 @@ export async function crystallizeDraft( input.policy.title, input.policy.trigger, input.policy.procedure, - ...input.evidence.flatMap((t) => [t.userText, t.agentText, t.reflection]), + ...input.evidence.flatMap((t) => [t.userText, t.agentText, reflectionAsText(t.reflection)]), ]); try { @@ -222,7 +223,7 @@ function packPrompt(input: CrystallizeInput, config: SkillConfig): string { const evidence = input.evidence.slice(0, config.evidenceLimit).map((t) => ({ id: t.id, episodeId: t.episodeId, - reflection: t.reflection, + reflection: reflectionAsText(t.reflection), user: capString(t.userText, config.traceCharCap), agent: capString(t.agentText, config.traceCharCap), value: Number.isFinite(t.value) ? t.value : 0, @@ -235,7 +236,7 @@ function packPrompt(input: CrystallizeInput, config: SkillConfig): string { .map((t) => ({ id: t.id, episodeId: t.episodeId, - reflection: t.reflection, + reflection: reflectionAsText(t.reflection), user: capString(t.userText, config.traceCharCap), agent: capString(t.agentText, config.traceCharCap), value: Number.isFinite(t.value) ? 
t.value : 0, diff --git a/apps/memos-local-plugin/core/skill/verifier.ts b/apps/memos-local-plugin/core/skill/verifier.ts index 276d21815..6927da30d 100644 --- a/apps/memos-local-plugin/core/skill/verifier.ts +++ b/apps/memos-local-plugin/core/skill/verifier.ts @@ -21,6 +21,7 @@ import type { Logger } from "../logger/types.js"; import type { TraceRow } from "../types.js"; +import { reflectionAsText } from "../capture/types.js"; import type { SkillCrystallizationDraft } from "./types.js"; import { extractToolNames } from "./tool-names.js"; @@ -118,7 +119,7 @@ function computeResonance( if (draftTokens.size === 0) return 0; let hit = 0; for (const t of evidence) { - const txt = `${t.userText}\n${t.agentText}\n${t.reflection ?? ""}`.toLowerCase(); + const txt = `${t.userText}\n${t.agentText}\n${reflectionAsText(t.reflection) ?? ""}`.toLowerCase(); const toks = tokensOf(txt); let overlap = 0; for (const tok of draftTokens) if (toks.has(tok)) overlap += 1; diff --git a/apps/memos-local-plugin/docs/CONFIG-ADVANCED.md b/apps/memos-local-plugin/docs/CONFIG-ADVANCED.md index 014e3deea..7d8cb8afe 100644 --- a/apps/memos-local-plugin/docs/CONFIG-ADVANCED.md +++ b/apps/memos-local-plugin/docs/CONFIG-ADVANCED.md @@ -77,9 +77,18 @@ algorithm: maxTextChars: 4000 # per-turn text cap before truncation maxToolOutputChars: 2000 # per-tool-call output cap embedTraces: true # vectorize state+action with the embedder - alphaScoring: true # ask the LLM to grade each reflection (α ∈ [0,1]) - synthReflections: false # ask the LLM to WRITE a reflection when missing - llmConcurrency: 4 # parallel LLM calls per episode + alphaScoring: true # binary alpha enabled (0/1 only) + synthReflections: false # retained for compatibility (windowed mode ignores synthesis) + llmConcurrency: 4 # retained for compatibility + batchMode: windowed # only supported mode + batchThreshold: 12 # retained for compatibility; ignored in windowed mode + reflectionContextMode: task_downstream # retained for 
compatibility + longEpisodeReflectMode: per_step_downstream # retained for compatibility + downstreamStepCount: 3 # retained for compatibility + taskContextMaxChars: 800 # used as prompt context cap + downstreamContextMaxChars: 1200 # retained for compatibility + downstreamPerStepMaxChars: 400 # retained for compatibility + synthOutcomeMaxChars: 600 # retained for compatibility reward: gamma: 0.9 # γ discount factor (V7 §0.6 eq. 4/5) tauSoftmax: 0.5 # τ for softmax reweighting in Phase 9 L2 induction @@ -165,6 +174,21 @@ algorithm: # core/retrieval/README.md.) ``` +### Reflection batching (current behavior) + +`algorithm.capture.batchMode` 现在固定为 `windowed`,per-step 反思链路已删除。 + +- 主窗口:`20`,`overlap=3`,每窗重试 1 次 +- 降级窗口:`9`,`overlap=3`,每窗重试 2 次 +- 全部失败:整集 episode 强制写入 `reflection=RELATED_DEFAULT`、`alpha=1` +- overlap 冲突合并:`alpha=1` 覆盖 `alpha=0` + +`traces.reflection` 为固定枚举: + +- `RELATED` +- `IRRELEVANT` +- `RELATED_DEFAULT` + #### Tuning cheat-sheet | Symptom | Try | diff --git a/apps/memos-local-plugin/templates/config.hermes.yaml b/apps/memos-local-plugin/templates/config.hermes.yaml index 191580f85..d08cc72e6 100644 --- a/apps/memos-local-plugin/templates/config.hermes.yaml +++ b/apps/memos-local-plugin/templates/config.hermes.yaml @@ -30,6 +30,11 @@ llm: algorithm: lightweightMemory: enabled: true # true = low-cost summaries only; false = memory self-evolution with tasks/experiences/world models/skills + capture: + # reflection/alpha now runs in windowed batch mode only. + # fixed strategy: primary window 20 (overlap 3), degrade to 9 (overlap 3), + # then episode-level fallback writes RELATED_DEFAULT + alpha=1. 
+ batchMode: windowed hub: enabled: false diff --git a/apps/memos-local-plugin/templates/config.openclaw.yaml b/apps/memos-local-plugin/templates/config.openclaw.yaml index e263dd3c2..e172bdf50 100644 --- a/apps/memos-local-plugin/templates/config.openclaw.yaml +++ b/apps/memos-local-plugin/templates/config.openclaw.yaml @@ -29,6 +29,11 @@ llm: algorithm: lightweightMemory: enabled: true # true = low-cost summaries only; false = memory self-evolution with tasks/experiences/world models/skills + capture: + # reflection/alpha now runs in windowed batch mode only. + # fixed strategy: primary window 20 (overlap 3), degrade to 9 (overlap 3), + # then episode-level fallback writes RELATED_DEFAULT + alpha=1. + batchMode: windowed hub: enabled: false diff --git a/apps/memos-local-plugin/tests/unit/capture/alpha-scorer.test.ts b/apps/memos-local-plugin/tests/unit/capture/alpha-scorer.test.ts deleted file mode 100644 index f035372ed..000000000 --- a/apps/memos-local-plugin/tests/unit/capture/alpha-scorer.test.ts +++ /dev/null @@ -1,151 +0,0 @@ -import { beforeAll, describe, expect, it } from "vitest"; - -import { disabledScore, scoreReflection } from "../../../core/capture/alpha-scorer.js"; -import type { NormalizedStep } from "../../../core/capture/types.js"; -import { REFLECTION_SCORE_PROMPT } from "../../../core/llm/prompts/reflection.js"; -import { initTestLogger } from "../../../core/logger/index.js"; -import { fakeLlm, throwingLlm } from "../../helpers/fake-llm.js"; - -const op = `capture.alpha.${REFLECTION_SCORE_PROMPT.id}.v${REFLECTION_SCORE_PROMPT.version}`; - -function step(partial: Partial = {}): NormalizedStep { - return { - key: "k", - ts: 1_000, - userText: partial.userText ?? "do a thing", - agentText: partial.agentText ?? "did the thing", - toolCalls: partial.toolCalls ?? 
[], - rawReflection: null, - depth: 0, - isSubagent: false, - meta: {}, - truncated: false, - }; -} - -describe("capture/alpha-scorer", () => { - beforeAll(() => initTestLogger()); - - it("returns clamped α and model for usable reflections", async () => { - const llm = fakeLlm({ - completeJson: { - [op]: { alpha: 0.82, usable: true, reason: "clear causal claim" }, - }, - }); - const out = await scoreReflection(llm, { - step: step(), - reflectionText: "I tried X because Y, which matches last week's pattern.", - }); - expect(out.alpha).toBeCloseTo(0.82, 5); - expect(out.usable).toBe(true); - expect(out.model).toBe("openai_compatible"); - expect(out.reason).toContain("clear"); - }); - - it("forces α=0 when usable=false", async () => { - const llm = fakeLlm({ - completeJson: { - [op]: { alpha: 0.9, usable: false, reason: "tautological" }, - }, - }); - const out = await scoreReflection(llm, { - step: step(), - reflectionText: "I did this because I needed to do this.", - }); - expect(out.alpha).toBe(0); - expect(out.usable).toBe(false); - }); - - it("clamps out-of-range α values", async () => { - const llm = fakeLlm({ - completeJson: { - [op]: { alpha: 1.7, usable: true, reason: "x" }, - }, - }); - const out = await scoreReflection(llm, { - step: step(), - reflectionText: "r", - }); - expect(out.alpha).toBe(1); - }); - - it("negative α is clamped to 0", async () => { - const llm = fakeLlm({ - completeJson: { - [op]: { alpha: -0.5, usable: true, reason: "x" }, - }, - }); - const out = await scoreReflection(llm, { step: step(), reflectionText: "r" }); - expect(out.alpha).toBe(0); - }); - - it("non-finite α becomes 0", async () => { - const llm = fakeLlm({ - completeJson: { - [op]: { alpha: Number.NaN, usable: true, reason: "x" }, - }, - }); - const out = await scoreReflection(llm, { step: step(), reflectionText: "r" }); - expect(out.alpha).toBe(0); - }); - - it("propagates LLM errors to the caller", async () => { - const llm = throwingLlm(new Error("boom")); - await 
expect( - scoreReflection(llm, { step: step(), reflectionText: "r" }), - ).rejects.toThrow(); - }); - - it("disabledScore assigns neutral α=0.5 when text present", () => { - const s = disabledScore("something real", "extracted"); - expect(s.alpha).toBe(0.5); - expect(s.usable).toBe(true); - expect(s.source).toBe("extracted"); - }); - - it("disabledScore assigns α=0, usable=false when null", () => { - const s = disabledScore(null, "none"); - expect(s.alpha).toBe(0); - expect(s.usable).toBe(false); - expect(s.text).toBeNull(); - }); - - it("uses the right op so the prompt registry tag is stable", async () => { - const captured: string[] = []; - const llm = fakeLlm({ - completeJson: { - [op]: (_input: unknown) => { - captured.push(op); - return { alpha: 0.5, usable: true, reason: "ok" }; - }, - }, - }); - await scoreReflection(llm, { step: step(), reflectionText: "r" }); - expect(captured).toEqual([op]); - }); - - it("injects downstream preview without breaking JSON scoring", async () => { - let userPrompt = ""; - const llm = fakeLlm({ - completeJson: { - [op]: (input) => { - const messages = input as Array<{ role: string; content: string }>; - userPrompt = messages.find((m) => m.role === "user")?.content ?? 
""; - return { alpha: 0.5, usable: true, reason: "ok" }; - }, - }, - }); - const out = await scoreReflection(llm, { - step: step(), - reflectionText: "I checked the first fact before using the downstream evidence.", - downstream: [ - { offset: 1, kind: "text", text: "action: next step used the result" }, - { offset: 2, kind: "tooluse", toolNames: ["shell"], toolOutput: "ok" }, - ], - }); - expect(out.alpha).toBe(0.5); - expect(userPrompt).toContain("[step+1] type=text"); - expect(userPrompt).toContain("[step+2] type=tooluse"); - expect(userPrompt).toContain("tool_output: ok"); - }); -}); diff --git a/apps/memos-local-plugin/tests/unit/capture/batch-scorer.test.ts b/apps/memos-local-plugin/tests/unit/capture/batch-scorer.test.ts index 8cb440699..6b7d975ee 100644 --- a/apps/memos-local-plugin/tests/unit/capture/batch-scorer.test.ts +++ b/apps/memos-local-plugin/tests/unit/capture/batch-scorer.test.ts @@ -35,7 +35,8 @@ function step( } function input(s: NormalizedStep, existing: string | null = null): BatchScoreInput { - return { step: s, existingReflection: existing }; + void existing; + return { step: s }; } describe("batchScoreReflections", () => { @@ -43,9 +44,8 @@ describe("batchScoreReflections", () => { it("empty inputs short-circuit without an LLM call", async () => { const llm = throwingLlm(new Error("would have crashed")); - const out = await batchScoreReflections(llm, [], { synthReflections: true }); + const out = await batchScoreReflections(llm, [], {}); expect(out.scores).toEqual([]); - expect(out.synthAccepted).toBe(0); }); it("respects out-of-order idx in the LLM response", async () => { @@ -53,8 +53,8 @@ describe("batchScoreReflections", () => { completeJson: { [BATCH_OP_TAG]: { scores: [ - { idx: 1, reflection_text: "second", alpha: 0.4, usable: true }, - { idx: 0, reflection_text: "first", alpha: 0.7, usable: true }, + { idx: 1, alpha: 0, relevance: "IRRELEVANT" }, + { idx: 0, alpha: 1, relevance: "RELATED" }, ], }, }, @@ -65,12 +65,12 @@ 
describe("batchScoreReflections", () => { input(step({ userText: "u0", agentText: "a0" }), "first"), input(step({ userText: "u1", agentText: "a1" }), "second"), ], - { synthReflections: true }, + {}, ); - expect(out.scores[0]!.text).toBe("first"); - expect(out.scores[0]!.alpha).toBeCloseTo(0.7, 5); - expect(out.scores[1]!.text).toBe("second"); - expect(out.scores[1]!.alpha).toBeCloseTo(0.4, 5); + expect(out.scores[0]!.text).toBe("RELATED"); + expect(out.scores[0]!.alpha).toBe(1); + expect(out.scores[1]!.text).toBe("IRRELEVANT"); + expect(out.scores[1]!.alpha).toBe(0); }); it("rejects responses with mismatched length", async () => { @@ -86,7 +86,7 @@ describe("batchScoreReflections", () => { input(step({ userText: "u0", agentText: "a0" }), "x"), input(step({ userText: "u1", agentText: "a1" }), "y"), ], - { synthReflections: true }, + {}, ), ).rejects.toThrow(/length mismatch/); }); @@ -100,22 +100,19 @@ describe("batchScoreReflections", () => { }, }); await expect( - batchScoreReflections(llm, [input(step({ userText: "u", agentText: "a" }), "x")], { - synthReflections: true, - }), + batchScoreReflections(llm, [input(step({ userText: "u", agentText: "a" }), "x")], {}), ).rejects.toThrow(/alpha must be number/); }); - it("synth disabled + empty existing → discards LLM-written text, α=0", async () => { + it("maps IRRELEVANT to alpha=0", async () => { const llm = fakeLlm({ completeJson: { [BATCH_OP_TAG]: { scores: [ { idx: 0, - reflection_text: "Confidently fabricated reflection.", - alpha: 0.8, - usable: true, + alpha: 0, + relevance: "IRRELEVANT", }, ], }, @@ -124,24 +121,22 @@ describe("batchScoreReflections", () => { const out = await batchScoreReflections( llm, [input(step({ userText: "u", agentText: "a" }), null)], - { synthReflections: false }, + {}, ); - expect(out.scores[0]!.text).toBeNull(); + expect(out.scores[0]!.text).toBe("IRRELEVANT"); expect(out.scores[0]!.alpha).toBe(0); - expect(out.scores[0]!.source).toBe("none"); - expect(out.synthAccepted).toBe(0); 
+ expect(out.scores[0]!.source).toBe("synth"); }); - it("synth enabled + empty existing → adopts LLM text and reports synthAccepted", async () => { + it("maps RELATED to alpha=1", async () => { const llm = fakeLlm({ completeJson: { [BATCH_OP_TAG]: { scores: [ { idx: 0, - reflection_text: "I picked tool X because the user asked for Y.", - alpha: 0.6, - usable: true, + alpha: 1, + relevance: "RELATED", }, ], }, @@ -150,10 +145,10 @@ describe("batchScoreReflections", () => { const out = await batchScoreReflections( llm, [input(step({ userText: "u", agentText: "a" }), null)], - { synthReflections: true }, + {}, ); - expect(out.scores[0]!.text).toContain("tool X"); + expect(out.scores[0]!.text).toBe("RELATED"); expect(out.scores[0]!.source).toBe("synth"); - expect(out.synthAccepted).toBe(1); + expect(out.scores[0]!.alpha).toBe(1); }); }); diff --git a/apps/memos-local-plugin/tests/unit/capture/capture-batch.test.ts b/apps/memos-local-plugin/tests/unit/capture/capture-batch.test.ts index d86290517..bc4e8cad5 100644 --- a/apps/memos-local-plugin/tests/unit/capture/capture-batch.test.ts +++ b/apps/memos-local-plugin/tests/unit/capture/capture-batch.test.ts @@ -1,25 +1,12 @@ /** - * Capture pipeline — batched reflection+α path (V7 §3.2 batched variant). - * - * These tests exercise `algorithm.capture.batchMode = "auto" | "per_episode"` - * and prove that: - * 1. one LLM call covers all step's ρ + α (no per-step calls); - * 2. existing reflections are preserved verbatim; - * 3. synth-disabled steps stay at α=0 even when the LLM tries to write - * one for them; - * 4. `auto` mode falls back to per-step when stepCount > batchThreshold; - * 5. a malformed batched response degrades into the per-step path - * instead of crashing capture. + * Capture pipeline — windowed binary reflection/alpha path. 
*/ import { afterEach, beforeAll, beforeEach, describe, expect, it } from "vitest"; import { createCaptureRunner, type CaptureRunner } from "../../../core/capture/capture.js"; import { createCaptureEventBus } from "../../../core/capture/events.js"; -import { - BATCH_REFLECTION_PROMPT, - REFLECTION_SCORE_PROMPT, -} from "../../../core/llm/prompts/reflection.js"; +import { BATCH_REFLECTION_PROMPT } from "../../../core/llm/prompts/reflection.js"; import type { CaptureConfig, CaptureEvent, @@ -38,7 +25,6 @@ import { fakeLlm } from "../../helpers/fake-llm.js"; import { makeTmpDb, type TmpDbHandle } from "../../helpers/tmp-db.js"; const batchOp = `capture.${BATCH_REFLECTION_PROMPT.id}.v${BATCH_REFLECTION_PROMPT.version}`; -const alphaOp = `capture.alpha.${REFLECTION_SCORE_PROMPT.id}.v${REFLECTION_SCORE_PROMPT.version}`; /** * Drives both phases of the new capture lifecycle (lite write → reflect @@ -80,7 +66,7 @@ function baseConfig(overrides: Partial = {}): CaptureConfig { alphaScoring: true, synthReflections: true, llmConcurrency: 2, - batchMode: "auto", + batchMode: "windowed", batchThreshold: 12, reflectionContextMode: "none", longEpisodeReflectMode: "per_step_parallel", @@ -128,7 +114,7 @@ function episodeSnapshot(opts: { }; } -describe("capture/pipeline (batched ρ+α path)", () => { +describe("capture/pipeline (windowed binary path)", () => { beforeAll(() => initTestLogger()); let tmp: TmpDbHandle; @@ -180,32 +166,14 @@ describe("capture/pipeline (batched ρ+α path)", () => { }); } - it("3-step episode → ONE batched LLM call (no per-step alpha/synth)", async () => { + it("single window writes RELATED/IRRELEVANT with alpha 1/0", async () => { const llm = fakeLlm({ completeJson: { [batchOp]: { scores: [ - { - idx: 0, - reflection_text: "I asked for the file list because it was needed.", - alpha: 0.6, - usable: true, - reason: "ok", - }, - { - idx: 1, - reflection_text: "I narrowed the search to the src tree.", - alpha: 0.7, - usable: true, - reason: "good", - }, - { - 
idx: 2, - reflection_text: "I confirmed the result and stopped.", - alpha: 0.5, - usable: true, - reason: "ok", - }, + { idx: 0, alpha: 1, relevance: "RELATED", reason: "ON_PATH" }, + { idx: 1, alpha: 0, relevance: "IRRELEVANT", reason: "DETOUR" }, + { idx: 2, alpha: 1, relevance: "RELATED", reason: "ON_PATH" }, ], }, }, @@ -233,281 +201,92 @@ describe("capture/pipeline (batched ρ+α path)", () => { expect(result.llmCalls.alphaScoring).toBe(0); const rows = result.traceIds.map((id) => tmp.repos.traces.getById(id)!); - expect(rows[0]!.reflection).toContain("file list"); - expect(rows[0]!.alpha).toBeCloseTo(0.6, 5); - expect(rows[1]!.alpha).toBeCloseTo(0.7, 5); - expect(rows[2]!.alpha).toBeCloseTo(0.5, 5); - }); - - it("preserves existing adapter-provided reflection verbatim (no rewrite)", async () => { - const llm = fakeLlm({ - completeJson: { - // The LLM tries to "improve" the reflection. We must IGNORE that - // text and copy the adapter-provided one through. - [batchOp]: { - scores: [ - { - idx: 0, - reflection_text: "LLM-rewritten reflection that should be ignored.", - alpha: 0.8, - usable: true, - reason: "good", - }, - ], - }, - }, - }); - const runner = buildRunner({}, llm); - - const ep = episodeSnapshot({ - id: "ep_1", - sessionId: "se_1", - turns: [ - turn("user", "do x", 1_000), - turn("assistant", "done", 1_100, { - reflection: "I picked the cheapest tool because user said so.", - }), - ], - }); - - const result = await runCapture(runner, ep); - const t = tmp.repos.traces.getById(result.traceIds[0]!)!; - // Original reflection survives intact. - expect(t.reflection).toBe("I picked the cheapest tool because user said so."); - // α is taken from the LLM grading. 
- expect(t.alpha).toBeCloseTo(0.8, 5); - expect(result.llmCalls.batchedReflection).toBe(1); - }); - - it("synthReflections=false discards LLM-written reflections for empty steps", async () => { - const llm = fakeLlm({ - completeJson: { - [batchOp]: { - scores: [ - // LLM tries to invent a reflection; with synth disabled we drop it. - { - idx: 0, - reflection_text: "Fabricated reflection by the LLM.", - alpha: 0.7, - usable: true, - reason: "n/a", - }, - ], - }, - }, - }); - const runner = buildRunner({ synthReflections: false }, llm); - - const ep = episodeSnapshot({ - id: "ep_1", - sessionId: "se_1", - turns: [ - turn("user", "list files", 1_000), - turn("assistant", "ok", 1_100), // no reflection pattern - ], - }); - const result = await runCapture(runner, ep); - const t = tmp.repos.traces.getById(result.traceIds[0]!)!; - expect(t.reflection).toBeNull(); - expect(t.alpha).toBe(0); // V7 disabledScore semantics - }); - - it("auto mode falls back to per-step when stepCount > batchThreshold", async () => { - const llm = fakeLlm({ - completeJson: { - // ONLY per-step alpha mock; if batched gets called, the test fails - // with "no completeJson mock for op=...batch...". - [alphaOp]: { alpha: 0.5, usable: true, reason: "ok" }, - }, - complete: { - "capture.reflection.synth": "I made this decision deliberately.", - }, - }); - const runner = buildRunner({ batchMode: "auto", batchThreshold: 2 }, llm); - - // 3 steps → above threshold → per-step path. - const ep = episodeSnapshot({ - id: "ep_1", - sessionId: "se_1", - turns: [ - turn("user", "a", 1_000), - turn("assistant", "1", 1_010), - turn("user", "b", 1_020), - turn("assistant", "2", 1_030), - turn("user", "c", 1_040), - turn("assistant", "3", 1_050), - ], - }); - - const result = await runCapture(runner, ep); - expect(result.traceIds).toHaveLength(3); - expect(result.llmCalls.batchedReflection).toBe(0); - // 3 synth + 3 alpha calls in per-step mode. 
- expect(result.llmCalls.reflectionSynth).toBe(3); - expect(result.llmCalls.alphaScoring).toBe(3); + expect(rows[0]!.reflection).toBe("RELATED"); + expect(rows[0]!.alpha).toBe(1); + expect(rows[1]!.reflection).toBe("IRRELEVANT"); + expect(rows[1]!.alpha).toBe(0); + expect(rows[2]!.reflection).toBe("RELATED"); + expect(rows[2]!.alpha).toBe(1); }); - it("long per-step downstream mode injects up to three following steps", async () => { - const synthPrompts: string[] = []; - const alphaPrompts: string[] = []; + it("window overlap conflict uses alpha=1 override", async () => { const llm = fakeLlm({ - complete: { - "capture.reflection.synth": (input) => { - const messages = input as Array<{ role: string; content: string }>; - synthPrompts.push(messages.find((m) => m.role === "user")?.content ?? ""); - return "I used this step because it shaped a following decision."; - }, - }, completeJson: { - [alphaOp]: (input) => { + [batchOp]: (input) => { const messages = input as Array<{ role: string; content: string }>; - alphaPrompts.push(messages.find((m) => m.role === "user")?.content ?? 
""); - return { alpha: 0.5, usable: true, reason: "ok" }; + const payload = JSON.parse(messages.find((m) => m.role === "user")!.content) as { + steps: Array<{ idx: number }>; + }; + if (payload.steps.length === 20) { + return { scores: payload.steps.map((s) => ({ idx: s.idx, alpha: 0, relevance: "IRRELEVANT" })) }; + } + return { scores: payload.steps.map((s) => ({ idx: s.idx, alpha: 1, relevance: "RELATED" })) }; }, }, }); - const runner = buildRunner( - { - batchMode: "auto", - batchThreshold: 2, - reflectionContextMode: "task_downstream", - longEpisodeReflectMode: "per_step_downstream", - downstreamStepCount: 3, - }, - llm, - ); - - const ep = episodeSnapshot({ - id: "ep_1", - sessionId: "se_1", - turns: [ - turn("user", "step zero", 1_000), - turn("assistant", "inspect first", 1_010), - turn("user", "step one", 1_020), - turn("assistant", "tool follows", 1_030, { - toolCalls: [{ name: "shell", input: { command: "pwd" }, output: "/tmp/project" }], - }), - turn("user", "step two", 1_050), - turn("assistant", "### Reasoning:\nI reused the tool result.\n\nnext action", 1_060), - turn("user", "step three", 1_070), - turn("assistant", "finish", 1_080), - ], - }); - - const result = await runCapture(runner, ep); - - expect(result.traceIds).toHaveLength(4); - expect(result.llmCalls.batchedReflection).toBe(0); - expect(result.llmCalls.reflectionSynth).toBe(3); - expect(result.llmCalls.alphaScoring).toBe(4); - - const firstPrompt = synthPrompts[0]!; - expect(firstPrompt).toContain("TASK CONTEXT:"); - expect(firstPrompt).toContain("[step+1] type=tooluse"); - expect(firstPrompt).toContain("tool_names: shell"); - expect(firstPrompt).toContain("tool_output: shell: /tmp/project"); - expect(firstPrompt).toContain("[step+2] type=text"); - expect(firstPrompt).toContain("[step+3] type=text"); - - const step3Prompt = alphaPrompts[2]!; - expect(step3Prompt).toContain("[step+1] type=text"); - expect(step3Prompt).not.toContain("[step+2]"); - 
expect(step3Prompt).not.toContain("[step+3]"); - }); - - it("per_episode mode batches even when step count is large", async () => { - const scores = Array.from({ length: 5 }, (_, i) => ({ - idx: i, - reflection_text: `reflection #${i}`, - alpha: 0.4, - usable: true, - reason: "ok", - })); - const llm = fakeLlm({ - completeJson: { [batchOp]: { scores } }, - }); - const runner = buildRunner({ batchMode: "per_episode", batchThreshold: 2 }, llm); - + const runner = buildRunner({}, llm); const turns: EpisodeTurn[] = []; - for (let i = 0; i < 5; i++) { - turns.push(turn("user", `q${i}`, 1_000 + i * 100)); - turns.push(turn("assistant", `a${i}`, 1_050 + i * 100)); + for (let i = 0; i < 21; i++) { + turns.push(turn("user", `q${i}`, 1_000 + i * 10)); + turns.push(turn("assistant", `a${i}`, 1_005 + i * 10)); } - const ep = episodeSnapshot({ id: "ep_1", sessionId: "se_1", turns }); - const result = await runCapture(runner, ep); - expect(result.traceIds).toHaveLength(5); - expect(result.llmCalls.batchedReflection).toBe(1); - expect(result.llmCalls.alphaScoring).toBe(0); + const result = await runCapture(runner, episodeSnapshot({ id: "ep_1", sessionId: "se_1", turns })); + expect(result.llmCalls.batchedReflection).toBe(2); + const rows = result.traceIds.map((id) => tmp.repos.traces.getById(id)!); + // idx 17..19 are overlap, should be upgraded to RELATED (alpha=1). + expect(rows[17]!.alpha).toBe(1); + expect(rows[18]!.alpha).toBe(1); + expect(rows[19]!.alpha).toBe(1); }); - it("malformed batched response → falls back to per-step + emits warning", async () => { + it("all retries failed => episode fallback RELATED_DEFAULT + alpha=1", async () => { const llm = fakeLlm({ - completeJson: { - // Wrong shape: scores has fewer entries than steps. Validator throws, - // capture catches and falls back to per-step. 
- [batchOp]: { scores: [] }, - [alphaOp]: { alpha: 0.5, usable: true, reason: "ok" }, - }, - complete: { - "capture.reflection.synth": "I responded after thinking it through.", - }, + completeJson: {}, }); - const runner = buildRunner({ batchMode: "per_episode" }, llm); + const runner = buildRunner({}, llm); const ep = episodeSnapshot({ id: "ep_1", sessionId: "se_1", - turns: [ - turn("user", "do x", 1_000), - turn("assistant", "done", 1_100), - ], + turns: [turn("user", "do x", 1_000), turn("assistant", "done", 1_100)], }); + const result = await runCapture(runner, ep); - expect(result.traceIds).toHaveLength(1); - expect(result.warnings.some((w) => w.stage === "batch")).toBe(true); - expect(result.llmCalls.batchedReflection).toBe(0); - // Per-step fallback ran. - expect(result.llmCalls.reflectionSynth).toBe(1); - expect(result.llmCalls.alphaScoring).toBe(1); + const t = tmp.repos.traces.getById(result.traceIds[0]!)!; + expect(t.reflection).toBe("RELATED_DEFAULT"); + expect(t.alpha).toBe(1); + expect(result.warnings.some((w) => w.message.includes("force RELATED_DEFAULT"))).toBe(true); }); - it("usable=false in batched response forces α=0 (V7 eq.5)", async () => { + it("degraded pass uses 9-size windows after primary fail", async () => { const llm = fakeLlm({ completeJson: { - [batchOp]: { - scores: [ - { - idx: 0, - reflection_text: "I did things.", - alpha: 0.9, - usable: false, - reason: "tautology", - }, - ], + [batchOp]: (input) => { + const messages = input as Array<{ role: string; content: string }>; + const payload = JSON.parse(messages.find((m) => m.role === "user")!.content) as { + steps: Array<{ idx: number }>; + }; + if (payload.steps.length === 20) throw new Error("fail primary window"); + return { scores: payload.steps.map((s) => ({ idx: s.idx, alpha: 1, relevance: "RELATED" })) }; }, }, }); const runner = buildRunner({}, llm); + const ep = episodeSnapshot({ id: "ep_1", sessionId: "se_1", - turns: [ - turn("user", "q", 1_000), - turn( - "assistant", - 
"### Reasoning:\nI executed the obvious action that any agent would, period.", - 1_100, - ), - ], + turns: Array.from({ length: 25 }).flatMap((_, i) => [ + turn("user", `u${i}`, 1_000 + i * 20), + turn("assistant", `a${i}`, 1_010 + i * 20), + ]), }); const result = await runCapture(runner, ep); - const t = tmp.repos.traces.getById(result.traceIds[0]!)!; - // Reflection text preserved (came from regex extractor), but α clamped. - expect(t.reflection).toContain("obvious action"); - expect(t.alpha).toBe(0); + expect(result.warnings.some((w) => w.message.includes("degrading to smaller windows"))).toBe(true); + expect(result.traceIds).toHaveLength(25); + expect(result.traceIds.every((id) => tmp.repos.traces.getById(id)!.alpha === 1)).toBe(true); }); - - it("no LLM available → batch dispatch refuses, per-step path runs as today", async () => { + it("no LLM available => directly fallback to RELATED_DEFAULT", async () => { const runner = buildRunner({ alphaScoring: false }, null); const ep = episodeSnapshot({ id: "ep_1", @@ -516,8 +295,8 @@ describe("capture/pipeline (batched ρ+α path)", () => { }); const result = await runCapture(runner, ep); expect(result.traceIds).toHaveLength(1); - expect(result.llmCalls.batchedReflection).toBe(0); - expect(result.llmCalls.reflectionSynth).toBe(0); - expect(result.llmCalls.alphaScoring).toBe(0); + const t = tmp.repos.traces.getById(result.traceIds[0]!)!; + expect(t.reflection).toBe("RELATED_DEFAULT"); + expect(t.alpha).toBe(1); }); }); diff --git a/apps/memos-local-plugin/tests/unit/capture/capture.test.ts b/apps/memos-local-plugin/tests/unit/capture/capture.test.ts index 95d728338..0df39c470 100644 --- a/apps/memos-local-plugin/tests/unit/capture/capture.test.ts +++ b/apps/memos-local-plugin/tests/unit/capture/capture.test.ts @@ -11,7 +11,7 @@ import { afterEach, beforeAll, beforeEach, describe, expect, it } from "vitest"; import { createCaptureEventBus } from "../../../core/capture/events.js"; import { createCaptureRunner, type 
CaptureRunner } from "../../../core/capture/capture.js"; import type { Embedder } from "../../../core/embedding/types.js"; -import { REFLECTION_SCORE_PROMPT } from "../../../core/llm/prompts/reflection.js"; +import { BATCH_REFLECTION_PROMPT } from "../../../core/llm/prompts/reflection.js"; import type { CaptureConfig, CaptureEvent, @@ -32,7 +32,7 @@ import { fakeEmbedder } from "../../helpers/fake-embedder.js"; import { fakeLlm } from "../../helpers/fake-llm.js"; import { makeTmpDb, type TmpDbHandle } from "../../helpers/tmp-db.js"; -const alphaOp = `capture.alpha.${REFLECTION_SCORE_PROMPT.id}.v${REFLECTION_SCORE_PROMPT.version}`; +const batchOp = `capture.${BATCH_REFLECTION_PROMPT.id}.v${BATCH_REFLECTION_PROMPT.version}`; /** * End-to-end test helper: runs the lite-phase capture (which writes @@ -87,10 +87,7 @@ function baseConfig(overrides: Partial = {}): CaptureConfig { alphaScoring: true, synthReflections: false, llmConcurrency: 2, - // Default to per-step here so the existing assertions on - // `llmCalls.alphaScoring`/`reflectionSynth` continue to hold. The - // batched path has its own dedicated test file. 
- batchMode: "per_step", + batchMode: "windowed", batchThreshold: 12, ...overrides, }; @@ -261,8 +258,13 @@ describe("capture/pipeline (end-to-end)", () => { expect(seen.map((e) => e.kind)).toEqual(["capture.started", "capture.lite.done"]); }); - it("writes one trace per step with α=0 when alpha disabled and no reflection present", async () => { - const runner = buildRunner({ alphaScoring: false }); + it("writes one trace per step with binary reflection fields", async () => { + const llm = fakeLlm({ + completeJson: { + [batchOp]: { scores: [{ idx: 0, alpha: 0, relevance: "IRRELEVANT" }] }, + }, + }); + const runner = buildRunner({ alphaScoring: false }, llm); const ep = episodeSnapshot({ id: "ep_1", sessionId: "se_1", @@ -275,7 +277,7 @@ describe("capture/pipeline (end-to-end)", () => { expect(persisted).not.toBeNull(); expect(persisted!.userText).toBe("say hi"); expect(persisted!.agentText).toBe("hi"); - expect(persisted!.reflection).toBeNull(); + expect(persisted!.reflection).toBe("IRRELEVANT"); expect(persisted!.alpha).toBe(0); expect(persisted!.value).toBe(0); // Newly-captured rows seed `priority` at 0.5 so they're visible to @@ -397,10 +399,10 @@ describe("capture/pipeline (end-to-end)", () => { expect(tmp.repos.episodes.getById("ep_1" as EpisodeId)!.traceIds).toEqual(lite.traceIds); }); - it("passes through adapter-provided reflection and stores α from LLM", async () => { + it("stores binary alpha/reflection from batch scorer", async () => { const llm = fakeLlm({ completeJson: { - [alphaOp]: { alpha: 0.8, usable: true, reason: "concrete" }, + [batchOp]: { scores: [{ idx: 0, alpha: 1, relevance: "RELATED", reason: "ON_PATH" }] }, }, }); const runner = buildRunner({}, llm); @@ -417,19 +419,18 @@ describe("capture/pipeline (end-to-end)", () => { const result = await runCapture(runner, ep); expect(result.traceIds).toHaveLength(1); - expect(result.llmCalls.alphaScoring).toBe(1); - expect(result.llmCalls.reflectionSynth).toBe(0); + 
expect(result.llmCalls.batchedReflection).toBe(1); const t = tmp.repos.traces.getById(result.traceIds[0]!)!; - expect(t.reflection).toContain("shell tool"); - expect(t.alpha).toBeCloseTo(0.8, 5); - expect(result.traces[0]?.reflection.reason).toBe("concrete"); + expect(t.reflection).toBe("RELATED"); + expect(t.alpha).toBe(1); + expect(result.traces[0]?.reflection.reason).toBe("ON_PATH"); }); - it("clamps α to 0 when LLM marks reflection unusable", async () => { + it("sets alpha=0 when batch returns IRRELEVANT", async () => { const llm = fakeLlm({ completeJson: { - [alphaOp]: { alpha: 0.9, usable: false, reason: "tautology" }, + [batchOp]: { scores: [{ idx: 0, alpha: 0, relevance: "IRRELEVANT" }] }, }, }); const runner = buildRunner({}, llm); @@ -447,11 +448,11 @@ describe("capture/pipeline (end-to-end)", () => { }); const result = await runCapture(runner, ep); const t = tmp.repos.traces.getById(result.traceIds[0]!)!; - expect(t.reflection).toBeTruthy(); + expect(t.reflection).toBe("IRRELEVANT"); expect(t.alpha).toBe(0); }); - it("alpha LLM failure is non-fatal — trace still persists with neutral α", async () => { + it("batch LLM failure is non-fatal and falls back to RELATED_DEFAULT", async () => { const llm = fakeLlm({ completeJson: {} }); // no mocks → throws const runner = buildRunner({}, llm); const ep = episodeSnapshot({ @@ -465,20 +466,16 @@ describe("capture/pipeline (end-to-end)", () => { const result = await runCapture(runner, ep); expect(result.traceIds).toHaveLength(1); - expect(result.warnings.some((w) => w.stage === "alpha")).toBe(true); + expect(result.warnings.some((w) => w.stage === "batch")).toBe(true); const t = tmp.repos.traces.getById(result.traceIds[0]!)!; - expect(t.reflection).toBeTruthy(); - expect(t.alpha).toBeCloseTo(0.5, 5); // neutral fallback from disabledScore + expect(t.reflection).toBe("RELATED_DEFAULT"); + expect(t.alpha).toBe(1); }); - it("synthesizes reflection when configured and extraction found nothing", async () => { + 
it("reflect phase writes binary enums without synthesis", async () => { const llm = fakeLlm({ - complete: { - "capture.reflection.synth": - "I decided to run ls because the user requested a directory listing.", - }, completeJson: { - [alphaOp]: { alpha: 0.6, usable: true, reason: "ok" }, + [batchOp]: { scores: [{ idx: 0, alpha: 1, relevance: "RELATED" }] }, }, }); const runner = buildRunner({ synthReflections: true }, llm); @@ -491,10 +488,10 @@ describe("capture/pipeline (end-to-end)", () => { ], }); const result = await runCapture(runner, ep); - expect(result.llmCalls.reflectionSynth).toBe(1); + expect(result.llmCalls.batchedReflection).toBe(1); const t = tmp.repos.traces.getById(result.traceIds[0]!)!; - expect(t.reflection).toContain("directory listing"); - expect(t.alpha).toBeCloseTo(0.6, 5); + expect(t.reflection).toBe("RELATED"); + expect(t.alpha).toBe(1); }); it("updates episode.trace_ids_json with new ids", async () => { diff --git a/apps/memos-local-plugin/tests/unit/capture/normalizer.test.ts b/apps/memos-local-plugin/tests/unit/capture/normalizer.test.ts index a09f7d1f5..c91a77b84 100644 --- a/apps/memos-local-plugin/tests/unit/capture/normalizer.test.ts +++ b/apps/memos-local-plugin/tests/unit/capture/normalizer.test.ts @@ -11,7 +11,7 @@ const cfg: CaptureConfig = { alphaScoring: false, synthReflections: false, llmConcurrency: 1, - batchMode: "per_step", + batchMode: "windowed", batchThreshold: 12, }; diff --git a/apps/memos-local-plugin/tests/unit/capture/reflection-extractor.test.ts b/apps/memos-local-plugin/tests/unit/capture/reflection-extractor.test.ts deleted file mode 100644 index 4ff9e19e5..000000000 --- a/apps/memos-local-plugin/tests/unit/capture/reflection-extractor.test.ts +++ /dev/null @@ -1,94 +0,0 @@ -import { describe, expect, it } from "vitest"; - -import { extractReflection } from "../../../core/capture/reflection-extractor.js"; -import type { NormalizedStep } from "../../../core/capture/types.js"; - -function step(partial: Partial): 
NormalizedStep { - return { - key: "k", - ts: 1_000, - userText: partial.userText ?? "", - agentText: partial.agentText ?? "", - toolCalls: partial.toolCalls ?? [], - rawReflection: partial.rawReflection ?? null, - depth: 0, - isSubagent: false, - meta: {}, - truncated: false, - }; -} - -describe("capture/reflection-extractor", () => { - it("prefers adapter-provided rawReflection", () => { - const r = extractReflection( - step({ - rawReflection: "I picked X because Y.", - agentText: "### Reasoning:\nI picked something else", - }), - ); - expect(r).toBe("I picked X because Y."); - }); - - it("extracts markdown ### Reasoning block", () => { - const r = extractReflection( - step({ - agentText: - "Sure, here's the fix.\n\n### Reasoning:\nThe bug was in the null check because the cache was cold.\n\n### Result:\nAll green.", - }), - ); - expect(r).toContain("The bug was in the null check"); - expect(r).not.toContain("Result"); - }); - - it("extracts ... legacy tags", () => { - const r = extractReflection( - step({ - agentText: - "Run passed.\nI retried the request once, then gave up to avoid an infinite loop.", - }), - ); - expect(r).toContain("retried the request once"); - }); - - it("extracts inline 'Reflection:' phrase", () => { - const r = extractReflection( - step({ - agentText: - "Final answer is 42.\n\nReflection: I chose 42 because it's the right length for the docstring. This approach keeps the function small.", - }), - ); - expect(r?.length).toBeGreaterThanOrEqual(20); - expect(r).toContain("docstring"); - }); - - it("extracts Chinese 思考过程", () => { - const r = extractReflection( - step({ - agentText: "我先运行了 ls,然后读取 log。\n\n思考过程:我觉得这个错误应该是权限问题导致的,所以先 chmod。", - }), - ); - expect(r).toBeTruthy(); - expect(r).toContain("权限问题"); - }); - - it("returns null when no pattern matches", () => { - const r = extractReflection(step({ agentText: "Done." 
})); - expect(r).toBeNull(); - }); - - it("returns null when agent text is empty and no rawReflection", () => { - expect(extractReflection(step({ agentText: "" }))).toBeNull(); - }); - - it("ignores too-short matches", () => { - const r = extractReflection(step({ agentText: "Reflection: ok" })); - expect(r).toBeNull(); - }); - - it("caps extracted reflection at 1500 chars", () => { - const body = "X".repeat(5_000); - const r = extractReflection(step({ agentText: `Reasoning: ${body}` })); - expect(r).not.toBeNull(); - expect(r!.length).toBeLessThanOrEqual(1_500); - }); -}); diff --git a/apps/memos-local-plugin/tests/unit/capture/reflection-synth.test.ts b/apps/memos-local-plugin/tests/unit/capture/reflection-synth.test.ts deleted file mode 100644 index 6e8bb910d..000000000 --- a/apps/memos-local-plugin/tests/unit/capture/reflection-synth.test.ts +++ /dev/null @@ -1,133 +0,0 @@ -import { beforeAll, describe, expect, it } from "vitest"; - -import { synthesizeReflection } from "../../../core/capture/reflection-synth.js"; -import type { NormalizedStep } from "../../../core/capture/types.js"; -import { initTestLogger } from "../../../core/logger/index.js"; -import { fakeLlm, throwingLlm } from "../../helpers/fake-llm.js"; - -function step(partial: Partial): NormalizedStep { - return { - key: "k", - ts: 1_000, - userText: partial.userText ?? "", - agentText: partial.agentText ?? "", - toolCalls: partial.toolCalls ?? 
[], - rawReflection: null, - depth: 0, - isSubagent: false, - meta: {}, - truncated: false, - }; -} - -describe("capture/reflection-synth", () => { - beforeAll(() => initTestLogger()); - - it("returns the LLM text when the model answers", async () => { - const llm = fakeLlm({ - complete: { - "capture.reflection.synth": - "I tried the shell first because the prompt looked shell-shaped, then verified with a read.", - }, - }); - const out = await synthesizeReflection( - llm, - step({ userText: "ls", agentText: "running ls", toolCalls: [] }), - ); - expect(out.text).toContain("shell first"); - expect(out.model).toBe("openai_compatible"); - }); - - it("injects task context and last tool outcome into the prompt", async () => { - let userPrompt = ""; - const llm = fakeLlm({ - complete: { - "capture.reflection.synth": (input) => { - const messages = input as Array<{ role: string; content: string }>; - userPrompt = messages.find((m) => m.role === "user")?.content ?? ""; - return "I checked the working directory because the task needed the project path."; - }, - }, - }); - - await synthesizeReflection( - llm, - step({ - userText: "where am I?", - agentText: "checking pwd", - toolCalls: [{ name: "shell", input: { command: "pwd" }, output: "/tmp/project" }], - }), - { episodeId: "ep_1", phase: "reflect", taskSummary: "Task: inspect current project" }, - ); - - expect(userPrompt).toContain("TASK CONTEXT:"); - expect(userPrompt).toContain("Task: inspect current project"); - expect(userPrompt).toContain("OUTCOME:"); - expect(userPrompt).toContain("/tmp/project"); - }); - - it("injects downstream preview with explicit step offsets and types", async () => { - let userPrompt = ""; - const llm = fakeLlm({ - complete: { - "capture.reflection.synth": (input) => { - const messages = input as Array<{ role: string; content: string }>; - userPrompt = messages.find((m) => m.role === "user")?.content ?? 
""; - return "I used the first check because the downstream output confirmed the path."; - }, - }, - }); - - await synthesizeReflection( - llm, - step({ userText: "inspect", agentText: "checking", toolCalls: [] }), - { - downstream: [ - { offset: 1, kind: "text", text: "state: user asked for tests\naction: searched test files" }, - { - offset: 2, - kind: "tooluse", - toolNames: ["shell"], - toolOutput: "tests passed", - reflection: "I ran tests to validate the change.", - }, - ], - }, - ); - - expect(userPrompt).toContain("DOWNSTREAM STEP PREVIEW:"); - expect(userPrompt).toContain("[step+1] type=text"); - expect(userPrompt).toContain("[step+2] type=tooluse"); - expect(userPrompt).toContain("tool_names: shell"); - expect(userPrompt).toContain("tool_output: tests passed"); - expect(userPrompt).toContain("existing_reflection: I ran tests"); - }); - - it("returns null on the NO_REFLECTION sentinel", async () => { - const llm = fakeLlm({ - complete: { "capture.reflection.synth": "NO_REFLECTION" }, - }); - const out = await synthesizeReflection(llm, step({ userText: "q", agentText: "a" })); - expect(out.text).toBeNull(); - }); - - it("returns null on empty response", async () => { - const llm = fakeLlm({ complete: { "capture.reflection.synth": " " } }); - const out = await synthesizeReflection(llm, step({ agentText: "a" })); - expect(out.text).toBeNull(); - }); - - it("falls back to text=null on LLM error", async () => { - const llm = throwingLlm(new Error("boom")); - const out = await synthesizeReflection(llm, step({ agentText: "a" })); - expect(out.text).toBeNull(); - expect(out.model).toBe("none"); - }); - - it("caps text at 1500 chars", async () => { - const big = "Z".repeat(5_000); - const llm = fakeLlm({ complete: { "capture.reflection.synth": big } }); - const out = await synthesizeReflection(llm, step({ agentText: "a" })); - expect(out.text?.length).toBeLessThanOrEqual(1_500); - }); -}); diff --git a/apps/memos-local-plugin/tests/unit/llm/prompts.test.ts 
b/apps/memos-local-plugin/tests/unit/llm/prompts.test.ts index bee52730a..95c26bdd2 100644 --- a/apps/memos-local-plugin/tests/unit/llm/prompts.test.ts +++ b/apps/memos-local-plugin/tests/unit/llm/prompts.test.ts @@ -1,9 +1,9 @@ import { describe, expect, it } from "vitest"; import { + BATCH_REFLECTION_PROMPT, DECISION_REPAIR_PROMPT, L2_INDUCTION_PROMPT, - REFLECTION_SCORE_PROMPT, RETRIEVAL_FILTER_PROMPT, REWARD_R_HUMAN_PROMPT, SKILL_CRYSTALLIZE_PROMPT, @@ -13,7 +13,7 @@ import { describe("llm/prompts", () => { const all = [ - REFLECTION_SCORE_PROMPT, + BATCH_REFLECTION_PROMPT, REWARD_R_HUMAN_PROMPT, L2_INDUCTION_PROMPT, DECISION_REPAIR_PROMPT, diff --git a/apps/memos-local-plugin/viewer/src/stores/i18n.ts b/apps/memos-local-plugin/viewer/src/stores/i18n.ts index 28c9b0d0e..8fd5348de 100644 --- a/apps/memos-local-plugin/viewer/src/stores/i18n.ts +++ b/apps/memos-local-plugin/viewer/src/stores/i18n.ts @@ -569,7 +569,7 @@ const en = { "tasks.skillReason.not_generated.belowThreshold": "Task score R={rTask} is below the induction threshold (≥ {threshold}) — the conversation was normal, but not strong enough to generalize into an L2 experience; similar tasks will accumulate over time.", "tasks.skillReason.not_generated.noPolicy": - "No L2 experience induced yet — requires at least {minEpisodesForInduction} similar task(s) (minEpisodesForInduction) with V ≥ {minTraceValue} to trigger L2 induction, then support ≥ {skillMinSupport} and gain ≥ {skillMinGain} to crystallize into a skill.", + "No L2 experience is linked to this task yet — induction may still be processing asynchronously. 
Refresh in a moment to see the latest status.", "tasks.skillReason.generated": "Skill \"{skillName}\" crystallized from experience {policyId}.", "tasks.skillReason.upgraded": @@ -1425,7 +1425,7 @@ const zh: Record = { "tasks.skillReason.not_generated.belowThreshold": "任务评分 R={rTask} 未达到沉淀阈值 (≥ {threshold})——对话本身正常,只是还不够强到能泛化成 L2 经验;多做几个相似任务后会自动积累。", "tasks.skillReason.not_generated.noPolicy": - "暂未归纳出 L2 经验——需要至少 {minEpisodesForInduction} 个相似任务(minEpisodesForInduction),且 V 值 ≥ {minTraceValue} 才能触发 L2 诱导,之后 support ≥ {skillMinSupport} 且 gain ≥ {skillMinGain} 才会结晶为技能。", + "该任务暂未关联到 L2 经验——可能仍在异步归纳处理中。稍后刷新可查看最新状态。", "tasks.skillReason.generated": "技能「{skillName}」已从经验 {policyId} 结晶。", "tasks.skillReason.upgraded": From d7bd3f54ac0f6bf1198148bb8e0a4a693b43d9d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=91=E5=B8=83=E6=9E=97?= <11641432+heiheiyouyou@user.noreply.gitee.com> Date: Thu, 28 May 2026 16:25:48 +0800 Subject: [PATCH 4/6] fix: L1 Trace Value Redesign --- .../core/capture/ALGORITHMS.md | 55 ++--- .../memos-local-plugin/core/capture/README.md | 27 ++- .../core/capture/batch-scorer.ts | 77 +++--- .../core/capture/capture.ts | 27 ++- apps/memos-local-plugin/core/capture/types.ts | 1 + .../core/config/defaults.ts | 2 + apps/memos-local-plugin/core/config/schema.ts | 4 + .../core/llm/prompts/reflection.ts | 39 ++- .../core/reward/ALGORITHMS.md | 45 ++-- apps/memos-local-plugin/core/reward/README.md | 25 +- .../core/reward/backprop.ts | 86 ++++--- apps/memos-local-plugin/core/reward/reward.ts | 4 + apps/memos-local-plugin/core/reward/types.ts | 10 + .../docs/CONFIG-ADVANCED.md | 11 +- .../templates/config.hermes.yaml | 8 +- .../templates/config.openclaw.yaml | 8 +- .../tests/unit/capture/batch-scorer.test.ts | 55 ++++- .../tests/unit/capture/capture-batch.test.ts | 26 +- .../tests/unit/capture/capture.test.ts | 14 +- .../tests/unit/memory/l2/subscriber.test.ts | 2 +- .../tests/unit/reward/backprop.test.ts | 226 +++++++++++++----- 
.../unit/reward/reward.integration.test.ts | 18 +- .../tests/unit/reward/subscriber.test.ts | 6 +- .../unit/skill/skill.integration.test.ts | 2 +- 24 files changed, 506 insertions(+), 272 deletions(-) diff --git a/apps/memos-local-plugin/core/capture/ALGORITHMS.md b/apps/memos-local-plugin/core/capture/ALGORITHMS.md index 953a197a0..3bac5d63f 100644 --- a/apps/memos-local-plugin/core/capture/ALGORITHMS.md +++ b/apps/memos-local-plugin/core/capture/ALGORITHMS.md @@ -31,23 +31,19 @@ Edge cases: The original per-step reflection scorer (`reflection-extractor` → `reflection-synth` → `alpha-scorer`) was removed in the 2026-05 redesign (see [docs/superpowers/specs/2026-05-27-l1-batch-reflection-binary-design.md](../../docs/superpowers/specs/2026-05-27-l1-batch-reflection-binary-design.md)). -Reflection no longer produces free-form natural-language text and `α` is -no longer a continuous quality score. Instead, every step gets a binary -"is this step on the final trajectory?" judgement: +Reflection no longer produces free-form natural-language text. Instead, every +step gets a fixed-label path relevance judgement and an aligned numeric `α`: ``` -α_t ∈ {0, 1} -reflection_t ∈ { "RELATED", "IRRELEVANT", "RELATED_DEFAULT" } +α_t ∈ {0, 0.5, 1} +reflection_t ∈ { "PIVOTAL", "RELATED", "IRRELEVANT", "RELATED_DEFAULT" } ``` with the semantics: -- `α_t = 1` / `RELATED` — the step is effective and downstream actions - continue from it. -- `α_t = 0` / `IRRELEVANT` — the step is a detour / dead-end that did - not influence the final path. -- `RELATED_DEFAULT` — episode-level safe default written by the fallback - path when the windowed scorer never produced a usable result for a - step (or for the whole episode). +- `PIVOTAL` → `α_t = 1` —关键转折点。 +- `RELATED` → `α_t = 0.5` —相关但非关键路径。 +- `IRRELEVANT` → `α_t = 0` —无关/偏航路径。 +- `RELATED_DEFAULT` → `α_t = 0.5` —missing-window 或 episode fallback 的安全默认值。 ### Window topology @@ -68,35 +64,40 @@ end)` pairs in ascending order. 
`mergeWindowScores` aggregates per-window results by absolute `global_idx = win.start + i`. Per-step combination is: +Window overlap 合并按标签优先级(已替代旧的二值 merge 口径): + +``` +PIVOTAL > RELATED / RELATED_DEFAULT > IRRELEVANT +``` + +Numeric `alpha` follows final label mapping: + ``` -if any window assigned alpha=1 → final alpha = 1, label = RELATED -elif any window assigned alpha=0 → final alpha = 0, label = IRRELEVANT -else → final alpha = 1, label = RELATED_DEFAULT - (MISSING_WINDOW_DEFAULT) +PIVOTAL=1, RELATED=0.5, RELATED_DEFAULT=0.5, IRRELEVANT=0 ``` -The "1-over-0" rule is intentional: overlapping windows often disagree -about a borderline step at the seam; counting it as RELATED is the -safer default because the downstream reward/L2/Skill chain treats -`α = 0` as a hard mask. +> 旧口径(`alpha=1` 覆盖 `alpha=0`,且 missing-window 默认 `alpha=1`)已废弃。 ### Failure ladder 1. **Per-window** — up to `maxRetries+1` calls (1 attempt + retries). A malformed payload from the LLM is one of: array length ≠ window - length, non-numeric / non-{0,1} `alpha`, `relevance` outside - {RELATED, IRRELEVANT}, missing `idx`. The validator in + length, `relevance` outside {IRRELEVANT, RELATED, PIVOTAL}, or + missing `idx`. The validator in `batch-scorer.ts :: validateBatchPayload` raises `LLM_OUTPUT_MALFORMED` and the facade's own malformed-retry triggers - once before our outer retry kicks in. + once before our outer retry kicks in. A missing/empty `reason` is + NOT malformed — the entry is kept and we emit a `batch.reason_missing` + warn instead, so a stray reason omission never costs the whole + episode its (relevance, alpha) signal. 2. **Window pass** — if every window in the primary pass eventually succeeded, we accept its results. Otherwise we discard the partial primary results and re-run with the degrade pass over the whole episode. 3. 
**Episode-wide fallback** — if the degrade pass also has any failed window, every step in the episode is overwritten with - `{ alpha: 1, text: "RELATED_DEFAULT", reason: "FALLBACK_ALL_ONE" }` - and we log `reflection_fallback_all_one` at error level with + `{ alpha: 0.5, text: "RELATED_DEFAULT", reason: "FALLBACK_RELATED_DEFAULT" }` + and we log `reflection_fallback_related_default` at error level with `{ degraded: true, episodeId, stepsCount, failedWindows }`. 4. **No reflect LLM wired** — short-circuits straight to the episode-wide fallback (`reason: "no_llm"`). @@ -129,7 +130,7 @@ Capture does NOT compute `r_step` or `V_t`. It writes: ``` trace.value = 0 # V_t will be filled by Phase 7 trace.r_human = null # assigned on feedback (Phase 7 R_human path) -trace.alpha = α_t # binary {0, 1} from the windowed scorer +trace.alpha = α_t # {0, 0.5, 1} from relevance mapping trace.priority = 0.5 # seeded so retrieval can find it pre-reward ``` @@ -188,7 +189,7 @@ is dominated by the batch latency of the reflect model. ## Downstream consumers and the enum reflection field -`traces.reflection` is now one of `RELATED | IRRELEVANT | +`traces.reflection` is now one of `PIVOTAL | RELATED | IRRELEVANT | RELATED_DEFAULT` (plus legacy free-form text from pre-2026-05 traces). 
Downstream modules that previously fed the reflection string into LLM prompts, error-signature heuristics, or keyword blobs use the diff --git a/apps/memos-local-plugin/core/capture/README.md b/apps/memos-local-plugin/core/capture/README.md index 6e28ecc7d..ea9d22911 100644 --- a/apps/memos-local-plugin/core/capture/README.md +++ b/apps/memos-local-plugin/core/capture/README.md @@ -34,13 +34,13 @@ episode.turns ──► step-extractor one StepCandidate per decis normalizer truncate / dedup / drop empty │ ▼ - batch-scorer (windowed binary) primary {batch=20, overlap=3, 1 retry} + batch-scorer (windowed relevance) primary {batch=20, overlap=3, 1 retry} │ ↓ on any failed window │ degrade {batch=9, overlap=3, 2 retries} │ ↓ on any failed window │ episode-wide RELATED_DEFAULT fallback ▼ - merge by global_idx 1-over-0; missing window → RELATED_DEFAULT + merge by global_idx PIVOTAL > RELATED/RELATED_DEFAULT > IRRELEVANT │ ▼ embedder vec_summary + vec_action (Phase 3) @@ -50,7 +50,7 @@ episode.turns ──► step-extractor one StepCandidate per decis tracesRepo.updateReflection ``` -`traces.reflection` is always one of `RELATED | IRRELEVANT | +`traces.reflection` is always one of `PIVOTAL | RELATED | IRRELEVANT | RELATED_DEFAULT` after `runReflect`. There is no natural-language reflection text; downstream consumers use `reflectionAsText` (exported from `core/capture/types.ts`) to filter the fixed labels out of prompts @@ -114,7 +114,7 @@ directly (tests and integration tests do this). turn still produces one skeletal trace so Phase 7 has somewhere to assign R_task. -## 5. Windowed binary reflection (V7 §3.2) +## 5. Windowed reflection (V7 §3.2) Per-step reflection / α scoring was replaced by a path-relevance judgement. See [ALGORITHMS.md](./ALGORITHMS.md) for the full derivation; @@ -122,19 +122,21 @@ the highlights: - Each window is `≤ batch_size` consecutive steps, sliced with a fixed `overlap` so seam steps appear in two windows. 
-- The batch scorer returns `{ alpha: 0|1, relevance: "RELATED" | - "IRRELEVANT" }` per step. Validator rejects any other shape. -- Overlap merge: any window calling a step `RELATED` (`alpha=1`) wins. +- The batch scorer returns per-step `relevance` in + `IRRELEVANT | RELATED | PIVOTAL` plus a short `reason` code. +- Reflection→alpha mapping is fixed: `IRRELEVANT=0`, + `RELATED=0.5`, `PIVOTAL=1`, `RELATED_DEFAULT=0.5`. +- Overlap merge uses priority: `PIVOTAL > RELATED/RELATED_DEFAULT > IRRELEVANT`. - If a step has no window result after both passes, it is written as - `RELATED_DEFAULT` (the safe default). + `RELATED_DEFAULT + alpha=0.5` (the safe default). - If any window in both passes failed, the whole episode is overwritten - with `RELATED_DEFAULT`. + with `RELATED_DEFAULT + alpha=0.5`. - The dispatcher never throws on reflection failure — only a DB `INSERT` is fatal. ## 6. α scoring -`α_t ∈ {0, 1}` only. There is no continuous score, no +`α_t ∈ {0, 0.5, 1}` only. There is no continuous score, no `alphaScoring=false` neutral path, and no LLM-quality rubric. The `alphaScoring` config flag is preserved for back-compat but has no effect. @@ -197,9 +199,8 @@ Top-level events to watch: alpha + reflection label + reason. - `capture.reflect.done` / `capture.lite.done` / `capture.lightweight.done` — phase completion summaries. -- `reflection_fallback_all_one` — episode-wide fallback was triggered. - Includes `degraded=true`, `episodeId`, `stepsCount`, - `failedWindows`. +- `reflection_fallback_related_default` — episode-wide fallback was triggered. + Includes `degraded=true`, `episodeId`, `stepsCount`, `failedWindows`. ## 12. 
Testing diff --git a/apps/memos-local-plugin/core/capture/batch-scorer.ts b/apps/memos-local-plugin/core/capture/batch-scorer.ts index 86bde9b73..31578555b 100644 --- a/apps/memos-local-plugin/core/capture/batch-scorer.ts +++ b/apps/memos-local-plugin/core/capture/batch-scorer.ts @@ -1,18 +1,20 @@ /** - * `batch-scorer` — windowed binary path-relevance scoring for one episode + * `batch-scorer` — windowed tri-valued path-relevance scoring for one episode * window. Always invoked through `capture.ts :: runEpisodeBatchScoring`, * which owns the primary/degrade window topology and retry ladder. * * Wire format ↔ prompt: * Send `{ host_context?, task_context?, steps: [{idx, state, thinking, * action, tool_calls, outcome}] }`. - * Receive `{ scores: [{idx, alpha: 0|1, relevance: "RELATED" | - * "IRRELEVANT", reason: str}] }`. + * Receive `{ scores: [{idx, + * relevance: "IRRELEVANT" | "RELATED" | "PIVOTAL", reason: str}] }`. * See `core/llm/prompts/reflection.ts :: BATCH_REFLECTION_PROMPT`. * - * Validation is strict: any non-{0,1} alpha or relevance outside - * {RELATED, IRRELEVANT} raises `LLM_OUTPUT_MALFORMED` so the caller's - * window retry ladder can take over. + * Validation: relevance outside {IRRELEVANT, RELATED, PIVOTAL} raises + * `LLM_OUTPUT_MALFORMED` so the caller's window retry ladder can take over. + * A missing/empty `reason` is downgraded to a per-window warn — we keep the + * (relevance, alpha) signal rather than fall the whole episode into + * `RELATED_DEFAULT` just because the model dropped the reason code. 
*/ import { ERROR_CODES, MemosError } from "../../agent-contract/errors.js"; @@ -46,7 +48,6 @@ export interface BatchScoreResult { interface RawScoreEntry { idx: number; - alpha: unknown; relevance: unknown; reason?: unknown; } @@ -65,7 +66,7 @@ const DEFAULT_FIELD_CHARS = { export const BATCH_OP_TAG = `capture.${BATCH_REFLECTION_PROMPT.id}.v${BATCH_REFLECTION_PROMPT.version}`; /** - * One LLM call → binary relevance + α(0/1) for every input step. + * One LLM call → tri-valued relevance; backend maps α for every input step. * * Throws `MemosError` with `LLM_OUTPUT_MALFORMED` when the LLM returns a * shape we cannot parse even after the facade's malformed-retry. Caller @@ -112,7 +113,7 @@ export async function batchScoreReflections( episodeId: opts.episodeId, phase: opts.phase, schemaHint: - '{"scores": [{"idx": int, "alpha": 0|1, "relevance": "RELATED|IRRELEVANT", "reason": "str"}]}', + '{"scores": [{"idx": int, "relevance": "IRRELEVANT|RELATED|PIVOTAL", "reason": "str"}]}', validate: (v) => validateBatchPayload(v, inputs.length), malformedRetries: 1, temperature: 0, @@ -124,6 +125,7 @@ export async function batchScoreReflections( const byIdx = new Map(); for (const entry of rsp.value.scores) byIdx.set(Number(entry.idx), entry); + let missingReasonCount = 0; const scores: ReflectionScore[] = inputs.map((input, i) => { const raw = byIdx.get(i); if (!raw) { @@ -134,19 +136,30 @@ export async function batchScoreReflections( source: "none", }; } - const alpha = clamp01(numOrZero(raw.alpha)) >= 0.5 ? 1 : 0; - const relevance = raw.relevance === "RELATED" ? "RELATED" : "IRRELEVANT"; - const reason = typeof raw.reason === "string" ? 
sanitizeDerivedText(raw.reason) : null; + const label = mapRawRelevance(raw.relevance); + const alpha = alphaForReflection(label); + const reason = sanitizeReason(raw.reason); + if (reason === null) missingReasonCount += 1; return { - text: relevance, + text: label, alpha, - usable: alpha === 1, + usable: alpha > 0, reason, source: "synth", model: rsp.servedBy, }; }); + if (missingReasonCount > 0) { + log.warn("batch.reason_missing", { + episodeId: opts.episodeId, + phase: opts.phase, + steps: inputs.length, + missingReasonCount, + model: rsp.servedBy, + }); + } + log.debug("batch.scored", { steps: inputs.length, model: rsp.servedBy, @@ -203,22 +216,10 @@ function validateBatchPayload(v: unknown, expected: number): void { got: entry.idx, }); } - if (typeof entry.alpha !== "number" || !Number.isFinite(entry.alpha)) { - throw new MemosError(ERROR_CODES.LLM_OUTPUT_MALFORMED, "batch reflection: alpha must be number", { - idx: entry.idx, - got: entry.alpha, - }); - } - if (entry.alpha !== 0 && entry.alpha !== 1) { - throw new MemosError(ERROR_CODES.LLM_OUTPUT_MALFORMED, "batch reflection: alpha must be 0 or 1", { - idx: entry.idx, - got: entry.alpha, - }); - } - if (entry.relevance !== "RELATED" && entry.relevance !== "IRRELEVANT") { + if (entry.relevance !== "IRRELEVANT" && entry.relevance !== "RELATED" && entry.relevance !== "PIVOTAL") { throw new MemosError( ERROR_CODES.LLM_OUTPUT_MALFORMED, - "batch reflection: relevance must be RELATED or IRRELEVANT", + "batch reflection: relevance must be IRRELEVANT/RELATED/PIVOTAL", { idx: entry.idx, got: entry.relevance }, ); } @@ -257,11 +258,21 @@ function clip(s: string, n: number): string { return s.length > n ? 
s.slice(0, n) + "…" : s; } -function clamp01(v: number): number { - if (!Number.isFinite(v)) return 0; - return Math.max(0, Math.min(1, v)); +function alphaForReflection(label: ReflectionScore["text"]): number { + if (label === "PIVOTAL") return 1; + if (label === "RELATED" || label === "RELATED_DEFAULT") return 0.5; + return 0; +} + +function mapRawRelevance(relevance: unknown): ReflectionScore["text"] { + if (relevance === "PIVOTAL") return "PIVOTAL"; + if (relevance === "RELATED") return "RELATED"; + return "IRRELEVANT"; } -function numOrZero(v: unknown): number { - return typeof v === "number" && Number.isFinite(v) ? v : 0; +function sanitizeReason(value: unknown): string | null { + if (typeof value !== "string") return null; + const cleaned = sanitizeDerivedText(value).trim(); + if (!cleaned) return null; + return cleaned.slice(0, 80); } diff --git a/apps/memos-local-plugin/core/capture/capture.ts b/apps/memos-local-plugin/core/capture/capture.ts index e4228aacd..91f0466ab 100644 --- a/apps/memos-local-plugin/core/capture/capture.ts +++ b/apps/memos-local-plugin/core/capture/capture.ts @@ -1005,15 +1005,15 @@ async function runEpisodeBatchScoring( taskSummary: string | null, log: Logger, ): Promise { - const fallbackAllOne = (): ScoredStep[] => + const fallbackRelatedDefault = (): ScoredStep[] => normalized.map((step) => ({ ...step, reflection: { text: "RELATED_DEFAULT", - alpha: 1, + alpha: 0.5, usable: true, source: "none", - reason: "FALLBACK_ALL_ONE", + reason: "FALLBACK_RELATED_DEFAULT", }, })); @@ -1022,14 +1022,14 @@ async function runEpisodeBatchScoring( stage: "batch", message: "no reflect llm; using episode-wide RELATED_DEFAULT fallback", }); - log.warn("reflection_fallback_all_one", { + log.warn("reflection_fallback_related_default", { degraded: true, episodeId, stepsCount: normalized.length, failedWindows: normalized.length > 0 ? 
1 : 0, reason: "no_llm", }); - return fallbackAllOne(); + return fallbackRelatedDefault(); } const primary = await runWindowPass({ @@ -1064,7 +1064,7 @@ async function runEpisodeBatchScoring( }); if (degraded.success) return mergeWindowScores(normalized, degraded.results); - log.error("reflection_fallback_all_one", { + log.error("reflection_fallback_related_default", { degraded: true, episodeId, stepsCount: normalized.length, @@ -1075,7 +1075,7 @@ async function runEpisodeBatchScoring( message: "all window retries exhausted; force RELATED_DEFAULT for episode", detail: { failedWindows: degraded.failedWindows }, }); - return fallbackAllOne(); + return fallbackRelatedDefault(); } async function runWindowPass(args: { @@ -1140,9 +1140,7 @@ function mergeWindowScores( merged.set(idx, next); continue; } - const prevAlpha = prev.alpha === 1 ? 1 : 0; - const nextAlpha = next.alpha === 1 ? 1 : 0; - if (nextAlpha > prevAlpha) merged.set(idx, next); + if (reflectionRank(next) > reflectionRank(prev)) merged.set(idx, next); } } return normalized.map((step, idx) => { @@ -1152,7 +1150,7 @@ function mergeWindowScores( ...step, reflection: { text: "RELATED_DEFAULT", - alpha: 1, + alpha: 0.5, usable: true, source: "none", reason: "MISSING_WINDOW_DEFAULT", @@ -1161,6 +1159,13 @@ function mergeWindowScores( }); } +function reflectionRank(score: ReflectionScore): number { + const label = (score.text ?? 
"").trim(); + if (label === "PIVOTAL") return 2; + if (label === "RELATED" || label === "RELATED_DEFAULT") return 1; + return 0; +} + function buildWindows(length: number, windowSize: number, overlap: number): Array<{ start: number; end: number }> { if (length <= 0) return []; const out: Array<{ start: number; end: number }> = []; diff --git a/apps/memos-local-plugin/core/capture/types.ts b/apps/memos-local-plugin/core/capture/types.ts index 7c115ed08..548b473f8 100644 --- a/apps/memos-local-plugin/core/capture/types.ts +++ b/apps/memos-local-plugin/core/capture/types.ts @@ -70,6 +70,7 @@ export interface NormalizedStep extends StepCandidate { */ export const REFLECTION_ENUM_LABELS = new Set([ "RELATED", + "PIVOTAL", "IRRELEVANT", "RELATED_DEFAULT", ]); diff --git a/apps/memos-local-plugin/core/config/defaults.ts b/apps/memos-local-plugin/core/config/defaults.ts index 439f2b175..af9deb945 100644 --- a/apps/memos-local-plugin/core/config/defaults.ts +++ b/apps/memos-local-plugin/core/config/defaults.ts @@ -85,6 +85,8 @@ export const DEFAULT_CONFIG: ResolvedConfig = { }, reward: { gamma: 0.9, + lambda: 0.5, + delta: 0.1, tauSoftmax: 0.5, decayHalfLifeDays: 30, llmScoring: true, diff --git a/apps/memos-local-plugin/core/config/schema.ts b/apps/memos-local-plugin/core/config/schema.ts index de2871c99..d831240a2 100644 --- a/apps/memos-local-plugin/core/config/schema.ts +++ b/apps/memos-local-plugin/core/config/schema.ts @@ -147,6 +147,10 @@ const AlgorithmSchema = Type.Object({ reward: Type.Object({ /** V7 §0.6 eq. 4/5: discount factor γ for reflection-weighted backprop. */ gamma: NumberInRange(0.9, 0, 1), + /** Position-bias mix λ: 0 => flat, 1 => pure γ^(T-t). */ + lambda: NumberInRange(0.5, 0, 1), + /** Recovery boost δ for first non-zero step after an IRRELEVANT step. */ + delta: NumberInRange(0.1, 0, 10), /** V7 §2.4.5 eq. 3: temperature τ for softmax reweighting in L2 induction. 
*/ tauSoftmax: NumberInRange(0.5, 0.01, 10), /** V7 §3.3: priority decay half-life in days. */ diff --git a/apps/memos-local-plugin/core/llm/prompts/reflection.ts b/apps/memos-local-plugin/core/llm/prompts/reflection.ts index f83497fbc..8e52b484e 100644 --- a/apps/memos-local-plugin/core/llm/prompts/reflection.ts +++ b/apps/memos-local-plugin/core/llm/prompts/reflection.ts @@ -1,26 +1,21 @@ import type { PromptDef } from "./index.js"; /** - * V7 §3.2 — Windowed binary path-relevance scoring. + * V7 §3.2 — Windowed path-relevance scoring (tri-valued relevance). * - * One LLM call per episode window. The LLM sees the full causal chain of - * the window in order and returns a binary `alpha ∈ {0, 1}` plus a fixed - * `RELATED | IRRELEVANT` label per step. There is no natural-language - * reflection synthesis: `traces.reflection` is overwritten by the label - * (or `RELATED_DEFAULT` when the windowed pipeline falls back to its - * episode-wide safe default). + * One LLM call per episode window. The LLM returns only: + * - `idx` + * - `relevance ∈ {IRRELEVANT, RELATED, PIVOTAL}` + * - `reason` (short reason code) * - * Window topology and retry ladder are owned by `core/capture/capture.ts` - * (primary `batch=20, overlap=3` → degrade `batch=9, overlap=3` → - * episode-wide `RELATED_DEFAULT` fallback). `core/capture/batch-scorer.ts` - * validates each entry's shape and rejects any `alpha` that is not exactly - * 0 or 1 / `relevance` that is not exactly RELATED|IRRELEVANT. + * `alpha` is mapped in backend: IRRELEVANT=0, RELATED=0.5, PIVOTAL=1. + * `RELATED_DEFAULT` is backend fallback only and must not be emitted by LLM. */ export const BATCH_REFLECTION_PROMPT: PromptDef = { id: "reflection.batch", - version: 4, + version: 6, description: - "Binary path-relevance scoring for every step in one episode window.", + "Tri-valued path-relevance scoring for each step in an episode window.", system: `You are reviewing a WINDOW of one AI agent episode. 
INPUT: a JSON array under "steps". Each entry has: @@ -40,22 +35,24 @@ INPUT: a JSON array under "steps". Each entry has: The user payload may also include "host_context". That describes the host agent being reviewed and the separate reflection model doing this review. -Goal: decide whether each step is RELEVANT to the final trajectory. +Goal: decide each step's relevance to the final trajectory. You must NOT produce long natural-language reflection text. For EACH input step, return one object containing: - "idx": copy the input idx exactly -- "alpha": MUST be integer 0 or 1 only - * 1 => this step is effective and downstream steps continue from it - * 0 => detour / ineffective / irrelevant to trajectory -- "relevance": MUST be one of "RELATED" or "IRRELEVANT" +- "relevance": MUST be one of "IRRELEVANT", "RELATED", "PIVOTAL" + * IRRELEVANT => detour / ineffective / not on useful path + * RELATED => useful on-path support step + * PIVOTAL => key turning point, removing it would cause major rework/failure + * IMPORTANT: NEVER output "RELATED_DEFAULT" - "reason": short code-like reason, <= 8 words (e.g. "ON_PATH", "DETOUR") Return JSON of the form: { "scores": [ - {"idx": 0, "alpha": 1, "relevance": "RELATED", "reason": "ON_PATH"}, - {"idx": 1, "alpha": 0, "relevance": "IRRELEVANT", "reason": "DETOUR"} + {"idx": 0, "relevance": "RELATED", "reason": "ON_PATH"}, + {"idx": 1, "relevance": "PIVOTAL", "reason": "RECOVERY"}, + {"idx": 2, "relevance": "IRRELEVANT", "reason": "DETOUR"} ] } diff --git a/apps/memos-local-plugin/core/reward/ALGORITHMS.md b/apps/memos-local-plugin/core/reward/ALGORITHMS.md index a6864f8fc..71fefd6aa 100644 --- a/apps/memos-local-plugin/core/reward/ALGORITHMS.md +++ b/apps/memos-local-plugin/core/reward/ALGORITHMS.md @@ -64,36 +64,31 @@ final outcome — "did it end well?" matters most). Deterministic, no LLM. 
--- -## V7 §0.6 — reflection-weighted backprop +## V7 §0.6 — normalized credit backprop -> *Equations 4–5:* -> - `V(f¹_{k,T_k}) = R_human(h_k)` (terminal case) -> - `V(f¹_{k,t}) = α_t · R_human(h_k) + (1 - α_t) · γ · V(f¹_{k,t+1})` +> 当前实现口径(新公式): +> - `f_t = (1 - λ) + λ · γ^(T - t)` +> - `recovery_t = 1 if α_t>0 and t>0 and α_{t-1}=0 else 0` +> - `r_t = 1 + δ · recovery_t` +> - `w_t = α_t · f_t · r_t` +> - `S = Σ_t w_t` +> - `V_t = 0 (if S=0) else (w_t / S) · R_human` ### Our implementation (`core/reward/backprop.ts`) -```ts -let nextV = rHuman; // V_{T+1} — sentinel -for (let i = traces.length - 1; i >= 0; i--) { - const alpha = clamp(t.alpha, 0, 1); - const V = i === traces.length - 1 - ? rHuman // V_T = R_human - : alpha * rHuman + (1 - alpha) * gamma * nextV; - ... - nextV = V; -} -``` +Implementation computes one normalized weight per step, then scales by +`R_human`: + +- **α source**: `TraceRow.alpha` from capture (`0 / 0.5 / 1`), defensively + clamped to `[0, 1]`. +- **fading term `f_t`**: mixes flat mass and temporal decay by `lambda`. +- **recovery boost `r_t`**: only applies when trajectory re-enters + relevant path (`α: 0 → >0`). +- **normalization**: `Σ V_t = R_human` whenever `S>0`; all values stay in + `[-1, 1]` after scaling. +- **degenerate case**: `S=0` (all `α=0`) writes `V_t=0` for all steps. -- **Walk direction**: right → left so `V_{t+1}` is always available. -- **α source**: `TraceRow.alpha` filled by capture's alpha-scorer. We - clamp defensively to `[0, 1]` in case of bad data. -- **γ clamp**: `gamma ∈ [0, 1]`. Outside-range values are clipped. -- **`R_human` clamp**: `[-1, 1]`. Keeps `V_t` in range by construction. -- **Semantics reproduced**: - - α=1 ("aha!" step): `V_t = R_human` — immediate credit, no γ discount. - - α=0 (blind trial): `V_t = γ·V_{t+1}` — pure temporal propagation. - - middle α: linear blend. Matches V7's "explicit distinction between - key findings and blind trial-and-error". 
+> 旧递推公式 `V_t = α_t·R_human + (1-α_t)·γ·V_{t+1}` 已废弃,不再作为实现口径。 --- diff --git a/apps/memos-local-plugin/core/reward/README.md b/apps/memos-local-plugin/core/reward/README.md index b16085c46..8142d068d 100644 --- a/apps/memos-local-plugin/core/reward/README.md +++ b/apps/memos-local-plugin/core/reward/README.md @@ -1,9 +1,9 @@ -# core/reward — Phase 7 (R_human + reflection-weighted backprop) +# core/reward — Phase 7 (R_human + normalized credit backprop) > V7 §0.6 / §2.4.2 / §3.3. Converts user feedback into a per-episode -> scalar `R_human ∈ [-1, 1]`, then distributes credit backward over the -> episode's L1 traces using reflection weights `α_t` (from capture) and -> an exponential time-decay for the retrieval `priority`. +> scalar `R_human ∈ [-1, 1]`, then distributes credit over the +> episode's L1 traces using normalized weights (`alpha + gamma + lambda + delta`) +> and an exponential time-decay for the retrieval `priority`. ## 1. When it runs @@ -56,10 +56,14 @@ run is not aborted. `R_human = 0.45·goal_achievement + 0.30·process_quality + 0.25·user_satisfaction` -- **Reflection-weighted backprop** (V7 §0.6 eq. 4/5): +- **Normalized credit backprop**: - `V_T = R_human` - `V_t = α_t · R_human + (1 − α_t) · γ · V_{t+1}` + `f_t = (1 − λ) + λ · γ^(T − t)` + `recovery_t = 1 if α_t>0 and t>0 and α_{t−1}=0 else 0` + `r_t = 1 + δ · recovery_t` + `w_t = α_t · f_t · r_t` + `S = Σ_t w_t` + `V_t = 0 (if S=0) else (w_t/S) · R_human` - **Priority with time decay** (V7 §3.3): @@ -73,7 +77,9 @@ backprop. 
| Key | Default | Meaning | |------------------------|---------|-------------------------------------------------| -| `gamma` | 0.9 | γ discount factor | +| `gamma` | 0.9 | γ 衰减因子,范围 `[0,1]` | +| `lambda` | 0.5 | λ 混合系数(平坦分配 vs 时间衰减),范围 `[0,1]` | +| `delta` | 0.1 | δ recovery 增益,范围 `>=0` | | `tauSoftmax` | 0.5 | τ for softmax reweighting in L2 induction (Phase 9 uses) | | `decayHalfLifeDays` | 30 | Half-life for priority decay | | `llmScoring` | true | Use LLM rubric (v2); off = heuristic only | @@ -84,6 +90,9 @@ backprop. All documented in `docs/CONFIG-ADVANCED.md`. +> 旧递推口径 `V_t = α_t · R_human + (1 − α_t) · γ · V_{t+1}` 已废弃, +> 文档与实现以当前归一化信用分配公式为准。 + ## 6. Public API ```ts diff --git a/apps/memos-local-plugin/core/reward/backprop.ts b/apps/memos-local-plugin/core/reward/backprop.ts index 4426d6f51..67972625a 100644 --- a/apps/memos-local-plugin/core/reward/backprop.ts +++ b/apps/memos-local-plugin/core/reward/backprop.ts @@ -1,30 +1,18 @@ /** - * `backprop` — V7 §0.6 eq. 4+5 + §3.3 priority formula. + * `backprop` — normalized credit assignment with position smoothing. * - * Given traces in chronological order and a terminal reward `rHuman`, - * compute `V_t` for each step by walking RIGHT-TO-LEFT: + * For traces in chronological order (t = 1..T): * - * V_T = R_human - * V_t = α_t · R_human + (1 − α_t) · γ · V_{t+1} - * - * Then compute priority with exponential time decay: + * f_t = (1-λ) + λ·γ^(T-t) + * recovery_t = 1 if α_t>0 and α_{t-1}=0 else 0 + * r_t = 1 + δ·recovery_t + * w_t = α_t·f_t·r_t + * V_t = (w_t / Σw)·R_human when Σw>0 + * V_t = 0 when Σw=0 * + * Priority stays: * priority(f1_t) = max(V_t, 0) · decay(Δt) * decay(Δt) = 0.5 ^ (Δt_days / halfLifeDays) - * - * Pure function — no I/O. The caller persists via `tracesRepo.updateScore`. - * - * Design notes: - * - `alpha` is already clamped to [0, 1] by capture, but we clamp again - * defensively in case a downstream rescoring widened it. 
- * - `rHuman` is clamped to [-1, 1] to guarantee `V_t` stays in range. - * - A trace with no reflection (α=0) gets V_t via pure γ-discount, which - * matches V7 §0.6: "pure trial-and-error steps propagate by γ only". - * - Priority uses `max(V, 0)` because V7 §3.3 says negative value traces - * sink to the bottom but MUST remain on disk — they can still be - * surfaced by Decision Repair. - * - We do NOT touch `r_human` or `alpha` on the trace row: α stays - * capture-owned; r_human is episode-level and lives in `episodes.r_task`. */ import { rootLogger } from "../logger/index.js"; @@ -36,6 +24,8 @@ export function backprop(input: BackpropInput): BackpropResult { const log = rootLogger.child({ channel: "core.reward.backprop" }); const gamma = clamp(input.gamma, 0, 1); + const lambda = clamp(input.lambda, 0, 1); + const delta = Math.max(0, Number.isFinite(input.delta) ? input.delta : 0); const rHuman = clamp(input.rHuman, -1, 1); const now = input.now ?? Date.now(); const halfLife = Math.max(1, input.decayHalfLifeDays); @@ -46,21 +36,36 @@ export function backprop(input: BackpropInput): BackpropResult { updates: [], meanAbsValue: 0, maxPriority: 0, - echoParams: { gamma, decayHalfLifeDays: halfLife, now }, + echoParams: { gamma, lambda, delta, decayHalfLifeDays: halfLife, now }, }; } - // Walk last → first so V_{t+1} is always available. - let nextV = rHuman; + const effectiveAlpha: number[] = input.traces.map((trace) => alphaFromTrace(trace)); + const weights: number[] = new Array(input.traces.length).fill(0); + let sumW = 0; + let fallbackAlphaCount = 0; + let unknownReflectionCount = 0; + for (let i = 0; i < input.traces.length; i++) { + const alpha = effectiveAlpha[i]!; + const prevAlpha = i > 0 ? effectiveAlpha[i - 1]! : 0; + const recovery = i > 0 && alpha > 0 && prevAlpha === 0 ? 
1 : 0; + const positional = (1 - lambda) + lambda * Math.pow(gamma, input.traces.length - 1 - i); + const boost = 1 + delta * recovery; + const w = alpha * positional * boost; + weights[i] = w; + sumW += w; + const reflection = input.traces[i]!.reflection; + if (!reflection || !reflection.trim()) fallbackAlphaCount += 1; + else if (!KNOWN_REFLECTION_LABELS.has(reflection.trim())) unknownReflectionCount += 1; + } + let sumAbsV = 0; let maxPriority = 0; - for (let i = input.traces.length - 1; i >= 0; i--) { + for (let i = 0; i < input.traces.length; i++) { const t = input.traces[i]!; - const alpha = clamp(t.alpha, 0, 1); - const V = i === input.traces.length - 1 - ? rHuman // V_T = R_human (V7 §0.6 boundary case) - : alpha * rHuman + (1 - alpha) * gamma * nextV; + const alpha = effectiveAlpha[i]!; + const V = sumW > 0 ? (weights[i]! / sumW) * rHuman : 0; const dtDays = Math.max(0, (now - t.ts) / MS_PER_DAY); const decay = Math.pow(0.5, dtDays / halfLife); @@ -74,7 +79,6 @@ export function backprop(input: BackpropInput): BackpropResult { }; sumAbsV += Math.abs(V); if (priority > maxPriority) maxPriority = priority; - nextV = V; } const meanAbsValue = sumAbsV / updates.length; @@ -83,6 +87,11 @@ export function backprop(input: BackpropInput): BackpropResult { traces: updates.length, rHuman, gamma, + lambda, + delta, + sumW, + fallbackAlphaCount, + unknownReflectionCount, meanAbsValue, maxPriority, }); @@ -91,7 +100,7 @@ export function backprop(input: BackpropInput): BackpropResult { updates, meanAbsValue, maxPriority, - echoParams: { gamma, decayHalfLifeDays: halfLife, now }, + echoParams: { gamma, lambda, delta, decayHalfLifeDays: halfLife, now }, }; } @@ -116,3 +125,18 @@ function clamp(v: number, lo: number, hi: number): number { if (!Number.isFinite(v)) return 0; return Math.max(lo, Math.min(hi, v)); } + +const KNOWN_REFLECTION_LABELS = new Set([ + "IRRELEVANT", + "RELATED", + "PIVOTAL", + "RELATED_DEFAULT", +]); + +function alphaFromTrace(trace: 
BackpropInput["traces"][number]): number { + const reflection = trace.reflection?.trim(); + if (reflection === "IRRELEVANT") return 0; + if (reflection === "RELATED" || reflection === "RELATED_DEFAULT") return 0.5; + if (reflection === "PIVOTAL") return 1; + return clamp(trace.alpha, 0, 1); +} diff --git a/apps/memos-local-plugin/core/reward/reward.ts b/apps/memos-local-plugin/core/reward/reward.ts index 1b6d3354c..172ee388f 100644 --- a/apps/memos-local-plugin/core/reward/reward.ts +++ b/apps/memos-local-plugin/core/reward/reward.ts @@ -159,6 +159,8 @@ export function createRewardRunner(deps: RewardDeps): RewardRunner { maxPriority: 0, echoParams: { gamma: deps.cfg.gamma, + lambda: deps.cfg.lambda, + delta: deps.cfg.delta, decayHalfLifeDays: deps.cfg.decayHalfLifeDays, now: startedAt, }, @@ -222,6 +224,8 @@ export function createRewardRunner(deps: RewardDeps): RewardRunner { traces, rHuman: humanScore.rHuman, gamma: deps.cfg.gamma, + lambda: deps.cfg.lambda, + delta: deps.cfg.delta, decayHalfLifeDays: deps.cfg.decayHalfLifeDays, now: startedAt, }); diff --git a/apps/memos-local-plugin/core/reward/types.ts b/apps/memos-local-plugin/core/reward/types.ts index 0e545ef0e..2c8df6548 100644 --- a/apps/memos-local-plugin/core/reward/types.ts +++ b/apps/memos-local-plugin/core/reward/types.ts @@ -23,6 +23,10 @@ import type { export interface RewardConfig { /** V7 §0.6 eq. 4/5: discount factor γ. */ gamma: number; + /** Position-bias mixing factor λ: 0 => flat, 1 => pure γ^(T-t). */ + lambda: number; + /** Recovery boost δ for first non-zero step after an IRRELEVANT step. */ + delta: number; /** V7 §2.4.5 eq. 3: softmax τ (used downstream by L2 induction; we just expose it here). */ tauSoftmax: number; /** V7 §3.3: priority decay half-life in days. */ @@ -136,6 +140,10 @@ export interface BackpropInput { rHuman: number; /** Discount factor γ ∈ [0, 1]. */ gamma: number; + /** Position-bias mixing factor λ ∈ [0, 1]. */ + lambda: number; + /** Recovery boost δ ∈ [0, +∞). 
*/ + delta: number; /** Decay half-life in days (for priority). */ decayHalfLifeDays: number; /** Anchor time for the decay calculation (ms). Defaults to `Date.now()`. */ @@ -161,6 +169,8 @@ export interface BackpropResult { /** γ / half-life / anchor recorded for audit. */ echoParams: { gamma: number; + lambda: number; + delta: number; decayHalfLifeDays: number; now: EpochMs; }; diff --git a/apps/memos-local-plugin/docs/CONFIG-ADVANCED.md b/apps/memos-local-plugin/docs/CONFIG-ADVANCED.md index 7d8cb8afe..4fef78fdd 100644 --- a/apps/memos-local-plugin/docs/CONFIG-ADVANCED.md +++ b/apps/memos-local-plugin/docs/CONFIG-ADVANCED.md @@ -90,7 +90,9 @@ algorithm: downstreamPerStepMaxChars: 400 # retained for compatibility synthOutcomeMaxChars: 600 # retained for compatibility reward: - gamma: 0.9 # γ discount factor (V7 §0.6 eq. 4/5) + gamma: 0.9 # γ 衰减因子,控制远端 step 权重衰减,范围 [0,1] + lambda: 0.5 # λ 混合系数:0=仅平坦分配,1=仅按 gamma^(T-t) 衰减,范围 [0,1] + delta: 0.1 # δ recovery 增益:当 alpha 从 0 恢复到 >0 时放大权重,范围 >=0 tauSoftmax: 0.5 # τ for softmax reweighting in Phase 9 L2 induction decayHalfLifeDays: 30 # priority decay half-life (V7 §3.3) llmScoring: true # use rubric LLM for R_human; off = heuristic only @@ -180,11 +182,14 @@ algorithm: - 主窗口:`20`,`overlap=3`,每窗重试 1 次 - 降级窗口:`9`,`overlap=3`,每窗重试 2 次 -- 全部失败:整集 episode 强制写入 `reflection=RELATED_DEFAULT`、`alpha=1` -- overlap 冲突合并:`alpha=1` 覆盖 `alpha=0` +- 输出字段支持 `pivotal(0/1)`;仅当 `alpha=1` 时允许 `pivotal=1` +- 映射固定:`IRRELEVANT=0`,`RELATED=0.5`,`PIVOTAL=1`,`RELATED_DEFAULT=0.5` +- 全部失败:整集 episode 强制写入 `reflection=RELATED_DEFAULT`、`alpha=0.5` +- overlap 冲突合并优先级:`PIVOTAL > RELATED/RELATED_DEFAULT > IRRELEVANT` `traces.reflection` 为固定枚举: +- `PIVOTAL` - `RELATED` - `IRRELEVANT` - `RELATED_DEFAULT` diff --git a/apps/memos-local-plugin/templates/config.hermes.yaml b/apps/memos-local-plugin/templates/config.hermes.yaml index d08cc72e6..877fe5f0c 100644 --- a/apps/memos-local-plugin/templates/config.hermes.yaml +++ 
b/apps/memos-local-plugin/templates/config.hermes.yaml @@ -33,8 +33,14 @@ algorithm: capture: # reflection/alpha now runs in windowed batch mode only. # fixed strategy: primary window 20 (overlap 3), degrade to 9 (overlap 3), - # then episode-level fallback writes RELATED_DEFAULT + alpha=1. + # then episode-level fallback writes RELATED_DEFAULT + alpha=0.5. + # merge priority: PIVOTAL > RELATED/RELATED_DEFAULT > IRRELEVANT. + # batch output may include pivotal(0/1); pivotal=1 only when alpha=1. batchMode: windowed + reward: + gamma: 0.9 # γ in [0,1], temporal decay term in backprop weight + lambda: 0.5 # λ in [0,1], mixes flat vs gamma^(T-t) weighting + delta: 0.1 # δ >= 0, recovery boost when alpha transitions 0 -> >0 hub: enabled: false diff --git a/apps/memos-local-plugin/templates/config.openclaw.yaml b/apps/memos-local-plugin/templates/config.openclaw.yaml index e172bdf50..2f1f6b03e 100644 --- a/apps/memos-local-plugin/templates/config.openclaw.yaml +++ b/apps/memos-local-plugin/templates/config.openclaw.yaml @@ -32,8 +32,14 @@ algorithm: capture: # reflection/alpha now runs in windowed batch mode only. # fixed strategy: primary window 20 (overlap 3), degrade to 9 (overlap 3), - # then episode-level fallback writes RELATED_DEFAULT + alpha=1. + # then episode-level fallback writes RELATED_DEFAULT + alpha=0.5. + # merge priority: PIVOTAL > RELATED/RELATED_DEFAULT > IRRELEVANT. + # batch output may include pivotal(0/1); pivotal=1 only when alpha=1. 
batchMode: windowed + reward: + gamma: 0.9 # γ in [0,1], temporal decay term in backprop weight + lambda: 0.5 # λ in [0,1], mixes flat vs gamma^(T-t) weighting + delta: 0.1 # δ >= 0, recovery boost when alpha transitions 0 -> >0 hub: enabled: false diff --git a/apps/memos-local-plugin/tests/unit/capture/batch-scorer.test.ts b/apps/memos-local-plugin/tests/unit/capture/batch-scorer.test.ts index 6b7d975ee..b81365076 100644 --- a/apps/memos-local-plugin/tests/unit/capture/batch-scorer.test.ts +++ b/apps/memos-local-plugin/tests/unit/capture/batch-scorer.test.ts @@ -53,8 +53,8 @@ describe("batchScoreReflections", () => { completeJson: { [BATCH_OP_TAG]: { scores: [ - { idx: 1, alpha: 0, relevance: "IRRELEVANT" }, - { idx: 0, alpha: 1, relevance: "RELATED" }, + { idx: 1, relevance: "IRRELEVANT", reason: "DETOUR" }, + { idx: 0, relevance: "PIVOTAL", reason: "TURNING_POINT" }, ], }, }, @@ -67,7 +67,7 @@ describe("batchScoreReflections", () => { ], {}, ); - expect(out.scores[0]!.text).toBe("RELATED"); + expect(out.scores[0]!.text).toBe("PIVOTAL"); expect(out.scores[0]!.alpha).toBe(1); expect(out.scores[1]!.text).toBe("IRRELEVANT"); expect(out.scores[1]!.alpha).toBe(0); @@ -76,7 +76,7 @@ describe("batchScoreReflections", () => { it("rejects responses with mismatched length", async () => { const llm = fakeLlm({ completeJson: { - [BATCH_OP_TAG]: { scores: [{ idx: 0, reflection_text: "x", alpha: 0.5, usable: true }] }, + [BATCH_OP_TAG]: { scores: [{ idx: 0, relevance: "IRRELEVANT", reason: "DETOUR" }] }, }, }); await expect( @@ -91,17 +91,35 @@ describe("batchScoreReflections", () => { ).rejects.toThrow(/length mismatch/); }); - it("rejects entries with non-number alpha", async () => { + it("keeps (relevance, alpha) when reason is missing — no MALFORMED throw", async () => { + const llm = fakeLlm({ + completeJson: { + [BATCH_OP_TAG]: { + scores: [{ idx: 0, relevance: "PIVOTAL" }], + }, + }, + }); + const out = await batchScoreReflections( + llm, + [input(step({ userText: "u", 
agentText: "a" }), "x")], + {}, + ); + expect(out.scores[0]!.text).toBe("PIVOTAL"); + expect(out.scores[0]!.alpha).toBe(1); + expect(out.scores[0]!.reason).toBeNull(); + }); + + it("rejects entries with illegal relevance", async () => { const llm = fakeLlm({ completeJson: { [BATCH_OP_TAG]: { - scores: [{ idx: 0, reflection_text: "x", alpha: "bad", usable: true }], + scores: [{ idx: 0, relevance: "RELATED_DEFAULT", reason: "BAD_ENUM" }], }, }, }); await expect( batchScoreReflections(llm, [input(step({ userText: "u", agentText: "a" }), "x")], {}), - ).rejects.toThrow(/alpha must be number/); + ).rejects.toThrow(/relevance must be IRRELEVANT\/RELATED\/PIVOTAL/); }); it("maps IRRELEVANT to alpha=0", async () => { @@ -111,8 +129,8 @@ describe("batchScoreReflections", () => { scores: [ { idx: 0, - alpha: 0, relevance: "IRRELEVANT", + reason: "DETOUR", }, ], }, @@ -128,15 +146,15 @@ describe("batchScoreReflections", () => { expect(out.scores[0]!.source).toBe("synth"); }); - it("maps RELATED to alpha=1", async () => { + it("maps RELATED to alpha=0.5", async () => { const llm = fakeLlm({ completeJson: { [BATCH_OP_TAG]: { scores: [ { idx: 0, - alpha: 1, relevance: "RELATED", + reason: "ON_PATH", }, ], }, @@ -149,6 +167,23 @@ describe("batchScoreReflections", () => { ); expect(out.scores[0]!.text).toBe("RELATED"); expect(out.scores[0]!.source).toBe("synth"); + expect(out.scores[0]!.alpha).toBe(0.5); + }); + + it("maps PIVOTAL to alpha=1", async () => { + const llm = fakeLlm({ + completeJson: { + [BATCH_OP_TAG]: { + scores: [{ idx: 0, relevance: "PIVOTAL", reason: "RECOVERY" }], + }, + }, + }); + const out = await batchScoreReflections( + llm, + [input(step({ userText: "u", agentText: "a" }), null)], + {}, + ); + expect(out.scores[0]!.text).toBe("PIVOTAL"); expect(out.scores[0]!.alpha).toBe(1); }); }); diff --git a/apps/memos-local-plugin/tests/unit/capture/capture-batch.test.ts b/apps/memos-local-plugin/tests/unit/capture/capture-batch.test.ts index bc4e8cad5..381e9f3ac 
100644 --- a/apps/memos-local-plugin/tests/unit/capture/capture-batch.test.ts +++ b/apps/memos-local-plugin/tests/unit/capture/capture-batch.test.ts @@ -171,9 +171,9 @@ describe("capture/pipeline (windowed binary path)", () => { completeJson: { [batchOp]: { scores: [ - { idx: 0, alpha: 1, relevance: "RELATED", reason: "ON_PATH" }, - { idx: 1, alpha: 0, relevance: "IRRELEVANT", reason: "DETOUR" }, - { idx: 2, alpha: 1, relevance: "RELATED", reason: "ON_PATH" }, + { idx: 0, relevance: "RELATED", reason: "ON_PATH" }, + { idx: 1, relevance: "IRRELEVANT", reason: "DETOUR" }, + { idx: 2, relevance: "PIVOTAL", reason: "TURNING_POINT" }, ], }, }, @@ -202,10 +202,10 @@ describe("capture/pipeline (windowed binary path)", () => { const rows = result.traceIds.map((id) => tmp.repos.traces.getById(id)!); expect(rows[0]!.reflection).toBe("RELATED"); - expect(rows[0]!.alpha).toBe(1); + expect(rows[0]!.alpha).toBe(0.5); expect(rows[1]!.reflection).toBe("IRRELEVANT"); expect(rows[1]!.alpha).toBe(0); - expect(rows[2]!.reflection).toBe("RELATED"); + expect(rows[2]!.reflection).toBe("PIVOTAL"); expect(rows[2]!.alpha).toBe(1); }); @@ -218,9 +218,9 @@ describe("capture/pipeline (windowed binary path)", () => { steps: Array<{ idx: number }>; }; if (payload.steps.length === 20) { - return { scores: payload.steps.map((s) => ({ idx: s.idx, alpha: 0, relevance: "IRRELEVANT" })) }; + return { scores: payload.steps.map((s) => ({ idx: s.idx, relevance: "IRRELEVANT", reason: "DETOUR" })) }; } - return { scores: payload.steps.map((s) => ({ idx: s.idx, alpha: 1, relevance: "RELATED" })) }; + return { scores: payload.steps.map((s) => ({ idx: s.idx, relevance: "PIVOTAL", reason: "RECOVERY" })) }; }, }, }); @@ -233,13 +233,13 @@ describe("capture/pipeline (windowed binary path)", () => { const result = await runCapture(runner, episodeSnapshot({ id: "ep_1", sessionId: "se_1", turns })); expect(result.llmCalls.batchedReflection).toBe(2); const rows = result.traceIds.map((id) => 
tmp.repos.traces.getById(id)!); - // idx 17..19 are overlap, should be upgraded to RELATED (alpha=1). + // idx 17..19 are overlap, should be upgraded to PIVOTAL (alpha=1). expect(rows[17]!.alpha).toBe(1); expect(rows[18]!.alpha).toBe(1); expect(rows[19]!.alpha).toBe(1); }); - it("all retries failed => episode fallback RELATED_DEFAULT + alpha=1", async () => { + it("all retries failed => episode fallback RELATED_DEFAULT + alpha=0.5", async () => { const llm = fakeLlm({ completeJson: {}, }); @@ -254,7 +254,7 @@ describe("capture/pipeline (windowed binary path)", () => { const result = await runCapture(runner, ep); const t = tmp.repos.traces.getById(result.traceIds[0]!)!; expect(t.reflection).toBe("RELATED_DEFAULT"); - expect(t.alpha).toBe(1); + expect(t.alpha).toBe(0.5); expect(result.warnings.some((w) => w.message.includes("force RELATED_DEFAULT"))).toBe(true); }); @@ -267,7 +267,7 @@ describe("capture/pipeline (windowed binary path)", () => { steps: Array<{ idx: number }>; }; if (payload.steps.length === 20) throw new Error("fail primary window"); - return { scores: payload.steps.map((s) => ({ idx: s.idx, alpha: 1, relevance: "RELATED" })) }; + return { scores: payload.steps.map((s) => ({ idx: s.idx, relevance: "RELATED", reason: "ON_PATH" })) }; }, }, }); @@ -284,7 +284,7 @@ describe("capture/pipeline (windowed binary path)", () => { const result = await runCapture(runner, ep); expect(result.warnings.some((w) => w.message.includes("degrading to smaller windows"))).toBe(true); expect(result.traceIds).toHaveLength(25); - expect(result.traceIds.every((id) => tmp.repos.traces.getById(id)!.alpha === 1)).toBe(true); + expect(result.traceIds.every((id) => tmp.repos.traces.getById(id)!.alpha === 0.5)).toBe(true); }); it("no LLM available => directly fallback to RELATED_DEFAULT", async () => { const runner = buildRunner({ alphaScoring: false }, null); @@ -297,6 +297,6 @@ describe("capture/pipeline (windowed binary path)", () => { expect(result.traceIds).toHaveLength(1); 
const t = tmp.repos.traces.getById(result.traceIds[0]!)!; expect(t.reflection).toBe("RELATED_DEFAULT"); - expect(t.alpha).toBe(1); + expect(t.alpha).toBe(0.5); }); }); diff --git a/apps/memos-local-plugin/tests/unit/capture/capture.test.ts b/apps/memos-local-plugin/tests/unit/capture/capture.test.ts index 0df39c470..4782c85f0 100644 --- a/apps/memos-local-plugin/tests/unit/capture/capture.test.ts +++ b/apps/memos-local-plugin/tests/unit/capture/capture.test.ts @@ -261,7 +261,7 @@ describe("capture/pipeline (end-to-end)", () => { it("writes one trace per step with binary reflection fields", async () => { const llm = fakeLlm({ completeJson: { - [batchOp]: { scores: [{ idx: 0, alpha: 0, relevance: "IRRELEVANT" }] }, + [batchOp]: { scores: [{ idx: 0, relevance: "IRRELEVANT", reason: "DETOUR" }] }, }, }); const runner = buildRunner({ alphaScoring: false }, llm); @@ -402,7 +402,7 @@ describe("capture/pipeline (end-to-end)", () => { it("stores binary alpha/reflection from batch scorer", async () => { const llm = fakeLlm({ completeJson: { - [batchOp]: { scores: [{ idx: 0, alpha: 1, relevance: "RELATED", reason: "ON_PATH" }] }, + [batchOp]: { scores: [{ idx: 0, relevance: "RELATED", reason: "ON_PATH" }] }, }, }); const runner = buildRunner({}, llm); @@ -423,14 +423,14 @@ describe("capture/pipeline (end-to-end)", () => { const t = tmp.repos.traces.getById(result.traceIds[0]!)!; expect(t.reflection).toBe("RELATED"); - expect(t.alpha).toBe(1); + expect(t.alpha).toBe(0.5); expect(result.traces[0]?.reflection.reason).toBe("ON_PATH"); }); it("sets alpha=0 when batch returns IRRELEVANT", async () => { const llm = fakeLlm({ completeJson: { - [batchOp]: { scores: [{ idx: 0, alpha: 0, relevance: "IRRELEVANT" }] }, + [batchOp]: { scores: [{ idx: 0, relevance: "IRRELEVANT", reason: "DETOUR" }] }, }, }); const runner = buildRunner({}, llm); @@ -469,13 +469,13 @@ describe("capture/pipeline (end-to-end)", () => { expect(result.warnings.some((w) => w.stage === "batch")).toBe(true); const t 
= tmp.repos.traces.getById(result.traceIds[0]!)!; expect(t.reflection).toBe("RELATED_DEFAULT"); - expect(t.alpha).toBe(1); + expect(t.alpha).toBe(0.5); }); it("reflect phase writes binary enums without synthesis", async () => { const llm = fakeLlm({ completeJson: { - [batchOp]: { scores: [{ idx: 0, alpha: 1, relevance: "RELATED" }] }, + [batchOp]: { scores: [{ idx: 0, relevance: "RELATED", reason: "ON_PATH" }] }, }, }); const runner = buildRunner({ synthReflections: true }, llm); @@ -491,7 +491,7 @@ describe("capture/pipeline (end-to-end)", () => { expect(result.llmCalls.batchedReflection).toBe(1); const t = tmp.repos.traces.getById(result.traceIds[0]!)!; expect(t.reflection).toBe("RELATED"); - expect(t.alpha).toBe(1); + expect(t.alpha).toBe(0.5); }); it("updates episode.trace_ids_json with new ids", async () => { diff --git a/apps/memos-local-plugin/tests/unit/memory/l2/subscriber.test.ts b/apps/memos-local-plugin/tests/unit/memory/l2/subscriber.test.ts index f5ac75e09..6d9a140c1 100644 --- a/apps/memos-local-plugin/tests/unit/memory/l2/subscriber.test.ts +++ b/apps/memos-local-plugin/tests/unit/memory/l2/subscriber.test.ts @@ -88,7 +88,7 @@ function fakeRewardResult(episodeId: string, traceIds: string[]): RewardResult { updates: [], meanAbsValue: 0.8, maxPriority: 0.8, - echoParams: { gamma: 0.9, decayHalfLifeDays: 30, now: NOW }, + echoParams: { gamma: 0.9, lambda: 0.5, delta: 0.1, decayHalfLifeDays: 30, now: NOW }, }, traceIds: traceIds as RewardResult["traceIds"], timings: { summary: 0, score: 0, backprop: 0, persist: 0, total: 0 }, diff --git a/apps/memos-local-plugin/tests/unit/reward/backprop.test.ts b/apps/memos-local-plugin/tests/unit/reward/backprop.test.ts index c78480ac0..e91113a07 100644 --- a/apps/memos-local-plugin/tests/unit/reward/backprop.test.ts +++ b/apps/memos-local-plugin/tests/unit/reward/backprop.test.ts @@ -12,7 +12,7 @@ function makeTrace(partial: Partial & { id: string; ts: number; alpha? 
userText: "", agentText: "", toolCalls: [], - reflection: null, + reflection: partial.reflection ?? null, value: 0, alpha: (partial.alpha ?? 0) as TraceRow["alpha"], rHuman: null, @@ -28,86 +28,202 @@ function makeTrace(partial: Partial & { id: string; ts: number; alpha? describe("reward/backprop", () => { const now = (1_700_000_000_000) as EpochMs; - it("V_T = R_human on the last step (boundary case)", () => { - const t1 = makeTrace({ id: "t1", ts: now - 60_000, alpha: 0 }); - const t2 = makeTrace({ id: "t2", ts: now - 30_000, alpha: 0 }); - const t3 = makeTrace({ id: "t3", ts: now, alpha: 0 }); - - const res = backprop({ traces: [t1, t2, t3], rHuman: 0.8, gamma: 0.9, decayHalfLifeDays: 30, now }); - expect(res.updates).toHaveLength(3); - expect(res.updates[2]!.value).toBeCloseTo(0.8, 6); + it("uses normalized credit assignment and conserves ΣV=R", () => { + const traces = [ + makeTrace({ id: "t1", ts: now - 5_000, reflection: "RELATED" }), + makeTrace({ id: "t2", ts: now - 4_000, reflection: "RELATED" }), + makeTrace({ id: "t3", ts: now - 3_000, reflection: "IRRELEVANT" }), + makeTrace({ id: "t4", ts: now - 2_000, reflection: "RELATED" }), + makeTrace({ id: "t5", ts: now - 1_000, reflection: "RELATED" }), + ]; + const res = backprop({ + traces, + rHuman: 0.5, + gamma: 0.9, + lambda: 0.5, + delta: 0, + decayHalfLifeDays: 30, + now, + }); + const values = res.updates.map((u) => u.value); + expect(values[2]).toBe(0); + const sum = values.reduce((a, b) => a + b, 0); + expect(sum).toBeCloseTo(0.5, 6); }); - it("pure γ-discount when α=0 everywhere", () => { - const t1 = makeTrace({ id: "t1", ts: now - 2_000, alpha: 0 }); - const t2 = makeTrace({ id: "t2", ts: now - 1_000, alpha: 0 }); - const t3 = makeTrace({ id: "t3", ts: now, alpha: 0 }); - - const res = backprop({ traces: [t1, t2, t3], rHuman: 1.0, gamma: 0.9, decayHalfLifeDays: 365, now }); - // V3 = 1, V2 = 0.9 * 1 = 0.9, V1 = 0.9 * 0.9 = 0.81 - expect(res.updates[2]!.value).toBeCloseTo(1.0); - 
expect(res.updates[1]!.value).toBeCloseTo(0.9); - expect(res.updates[0]!.value).toBeCloseTo(0.81); + it("applies recovery boost to first non-zero step after zero", () => { + const traces = [ + makeTrace({ id: "t1", ts: now - 6_000, reflection: "RELATED" }), + makeTrace({ id: "t2", ts: now - 5_000, reflection: "RELATED" }), + makeTrace({ id: "t3", ts: now - 4_000, reflection: "IRRELEVANT" }), + makeTrace({ id: "t4", ts: now - 3_000, reflection: "PIVOTAL" }), + makeTrace({ id: "t5", ts: now - 2_000, reflection: "RELATED" }), + makeTrace({ id: "t6", ts: now - 1_000, reflection: "PIVOTAL" }), + ]; + const res = backprop({ + traces, + rHuman: 1, + gamma: 0.9, + lambda: 0, + delta: 0.1, + decayHalfLifeDays: 30, + now, + }); + // With lambda=0, positional factor is flat, so recovery decides t4 > t6. + expect(res.updates[3]!.value).toBeGreaterThan(res.updates[5]!.value); }); - it("α=1 pulls that step straight to R_human (no discount)", () => { - const t1 = makeTrace({ id: "t1", ts: now - 2_000, alpha: 1 }); // "aha!" 
step - const t2 = makeTrace({ id: "t2", ts: now - 1_000, alpha: 0 }); - const t3 = makeTrace({ id: "t3", ts: now, alpha: 0 }); - - const res = backprop({ traces: [t1, t2, t3], rHuman: 0.6, gamma: 0.5, decayHalfLifeDays: 365, now }); - expect(res.updates[0]!.value).toBeCloseTo(0.6); // 1·R + 0·γ·… = R + it("keeps negative reward proportional and conserved", () => { + const traces = [ + makeTrace({ id: "t1", ts: now - 3_000, reflection: "RELATED" }), + makeTrace({ id: "t2", ts: now - 2_000, reflection: "PIVOTAL" }), + makeTrace({ id: "t3", ts: now - 1_000, reflection: "RELATED" }), + ]; + const res = backprop({ + traces, + rHuman: -0.8, + gamma: 0.9, + lambda: 0.5, + delta: 0.1, + decayHalfLifeDays: 30, + now, + }); + const sum = res.updates.reduce((a, b) => a + b.value, 0); + expect(sum).toBeCloseTo(-0.8, 6); + expect(res.updates.every((u) => u.value <= 0)).toBe(true); }); - it("mixes α between 0 and 1 using the V7 formula", () => { - const t1 = makeTrace({ id: "t1", ts: now - 2_000, alpha: 0.5 }); - const t2 = makeTrace({ id: "t2", ts: now - 1_000, alpha: 0 }); - const t3 = makeTrace({ id: "t3", ts: now, alpha: 0 }); + it("falls back to alpha when reflection is missing", () => { + const traces = [ + makeTrace({ id: "t1", ts: now - 2_000, reflection: null, alpha: 1 }), + makeTrace({ id: "t2", ts: now - 1_000, reflection: null, alpha: 0 }), + ]; + const res = backprop({ + traces, + rHuman: 1, + gamma: 0.9, + lambda: 0.5, + delta: 0, + decayHalfLifeDays: 30, + now, + }); + expect(res.updates[0]!.value).toBeCloseTo(1, 6); + expect(res.updates[1]!.value).toBe(0); + }); - const r = 1.0; - const gamma = 0.9; - const res = backprop({ traces: [t1, t2, t3], rHuman: r, gamma, decayHalfLifeDays: 365, now }); - const v3 = r; - const v2 = 0 * r + 1 * gamma * v3; // 0.9 - const v1 = 0.5 * r + 0.5 * gamma * v2; // 0.5 + 0.5·0.81 = 0.905 - expect(res.updates[2]!.value).toBeCloseTo(v3, 6); - expect(res.updates[1]!.value).toBeCloseTo(v2, 6); - 
expect(res.updates[0]!.value).toBeCloseTo(v1, 6); + it("returns all-zero values when Σw=0 (all irrelevant)", () => { + const traces = [ + makeTrace({ id: "t1", ts: now - 2_000, reflection: "IRRELEVANT" }), + makeTrace({ id: "t2", ts: now - 1_000, reflection: "IRRELEVANT" }), + ]; + const res = backprop({ + traces, + rHuman: 0.9, + gamma: 0.9, + lambda: 0.5, + delta: 0.1, + decayHalfLifeDays: 30, + now, + }); + expect(res.updates[0]!.value).toBe(0); + expect(res.updates[1]!.value).toBe(0); }); it("clamps R_human and γ to their legal ranges", () => { - const t = makeTrace({ id: "t", ts: now, alpha: 0 }); - const res = backprop({ traces: [t], rHuman: 5 /* > 1 */, gamma: 2 /* > 1 */, decayHalfLifeDays: 1, now }); + const t = makeTrace({ id: "t", ts: now, reflection: "PIVOTAL", alpha: 0 }); + const res = backprop({ + traces: [t], + rHuman: 5 /* > 1 */, + gamma: 2 /* > 1 */, + lambda: 2, + delta: -1, + decayHalfLifeDays: 1, + now, + }); expect(res.updates[0]!.value).toBeCloseTo(1); expect(res.echoParams.gamma).toBeCloseTo(1); - - const res2 = backprop({ traces: [t], rHuman: -5, gamma: -1, decayHalfLifeDays: 1, now }); + expect(res.echoParams.lambda).toBeCloseTo(1); + expect(res.echoParams.delta).toBeCloseTo(0); + + const res2 = backprop({ + traces: [t], + rHuman: -5, + gamma: -1, + lambda: -1, + delta: 0, + decayHalfLifeDays: 1, + now, + }); expect(res2.updates[0]!.value).toBeCloseTo(-1); expect(res2.echoParams.gamma).toBeCloseTo(0); + expect(res2.echoParams.lambda).toBeCloseTo(0); }); - it("priority = max(V, 0) · decay(Δt)", () => { - const halfLife = 30; // days - const t1 = makeTrace({ id: "t1", ts: now - 30 * 86_400_000, alpha: 0 }); // 1 half-life ago - const t2 = makeTrace({ id: "t2", ts: now, alpha: 0 }); + it("zero-α traces collapse to V=0 and priority=0 (overrides capture seed)", () => { + const t1 = makeTrace({ id: "t1", ts: now - 30 * 86_400_000, reflection: "IRRELEVANT" }); + const t2 = makeTrace({ id: "t2", ts: now, reflection: "IRRELEVANT" }); + const res = 
backprop({ + traces: [t1, t2], + rHuman: 1.0, + gamma: 1.0, + lambda: 1, + delta: 0, + decayHalfLifeDays: 30, + now, + }); + expect(res.updates[0]!.value).toBe(0); + expect(res.updates[1]!.value).toBe(0); + expect(res.updates[0]!.priority).toBe(0); + expect(res.updates[1]!.priority).toBe(0); + }); - const res = backprop({ traces: [t1, t2], rHuman: 1.0, gamma: 1.0, decayHalfLifeDays: halfLife, now }); - // V1 = γ·V2 = 1·1 = 1, decay = 0.5 → priority = 0.5 - expect(res.updates[0]!.value).toBeCloseTo(1, 6); - expect(res.updates[0]!.priority).toBeCloseTo(0.5, 6); - // V2 = 1, decay = 1 → priority = 1 - expect(res.updates[1]!.priority).toBeCloseTo(1, 6); + it("priority = max(V, 0) · decay(Δt) under normalized weights", () => { + const halfLife = 30; // days + // Two equal-weight RELATED traces with flat positional (lambda=0): + // positional = 1, recovery=0, w = 0.5 each, S = 1 → V_t = R/2 = 0.5. + // Decay: t1 is one half-life old → 0.5; t2 is current → 1.0. + const t1 = makeTrace({ id: "t1", ts: now - 30 * 86_400_000, reflection: "RELATED" }); + const t2 = makeTrace({ id: "t2", ts: now, reflection: "RELATED" }); + const res = backprop({ + traces: [t1, t2], + rHuman: 1.0, + gamma: 1.0, + lambda: 0, + delta: 0, + decayHalfLifeDays: halfLife, + now, + }); + expect(res.updates[0]!.value).toBeCloseTo(0.5, 6); + expect(res.updates[1]!.value).toBeCloseTo(0.5, 6); + expect(res.updates[0]!.priority).toBeCloseTo(0.25, 6); // 0.5 · 0.5 + expect(res.updates[1]!.priority).toBeCloseTo(0.5, 6); // 0.5 · 1 }); it("negative V produces zero priority (V7 §3.3 max(V,0))", () => { const t = makeTrace({ id: "t", ts: now, alpha: 1 }); - const res = backprop({ traces: [t], rHuman: -0.8, gamma: 0.9, decayHalfLifeDays: 30, now }); + const res = backprop({ + traces: [t], + rHuman: -0.8, + gamma: 0.9, + lambda: 0.5, + delta: 0.1, + decayHalfLifeDays: 30, + now, + }); expect(res.updates[0]!.value).toBeCloseTo(-0.8); expect(res.updates[0]!.priority).toBe(0); }); it("empty trace list returns zeros 
without throwing", () => { - const res = backprop({ traces: [], rHuman: 0.5, gamma: 0.9, decayHalfLifeDays: 30, now }); + const res = backprop({ + traces: [], + rHuman: 0.5, + gamma: 0.9, + lambda: 0.5, + delta: 0.1, + decayHalfLifeDays: 30, + now, + }); expect(res.updates).toEqual([]); expect(res.meanAbsValue).toBe(0); expect(res.maxPriority).toBe(0); diff --git a/apps/memos-local-plugin/tests/unit/reward/reward.integration.test.ts b/apps/memos-local-plugin/tests/unit/reward/reward.integration.test.ts index 238713a29..477595425 100644 --- a/apps/memos-local-plugin/tests/unit/reward/reward.integration.test.ts +++ b/apps/memos-local-plugin/tests/unit/reward/reward.integration.test.ts @@ -29,6 +29,8 @@ const NOW = 1_700_000_000_000 as EpochMs; function cfg(): RewardConfig { return { gamma: 0.9, + lambda: 0.5, + delta: 0.1, tauSoftmax: 0.5, decayHalfLifeDays: 30, llmScoring: true, @@ -234,17 +236,15 @@ describe("reward/integration", () => { const tA = handle.repos.traces.getById("tr_a" as unknown as TraceRow["id"])!; const tB = handle.repos.traces.getById("tr_b" as unknown as TraceRow["id"])!; const tC = handle.repos.traces.getById("tr_c" as unknown as TraceRow["id"])!; - // V_C = R_human, V_B = γ·V_C, V_A = 0.5·R + 0.5·γ·V_B. + // With normalized credit + reflection/alpha fallback: + // alphas are [0.5, 0, 0] => only tr_a gets non-zero weight. const r = result.rHuman; - const vC = r; - const vB = 0.9 * vC; - const vA = 0.5 * r + 0.5 * 0.9 * vB; - expect(tC.value).toBeCloseTo(vC, 5); - expect(tB.value).toBeCloseTo(vB, 5); - expect(tA.value).toBeCloseTo(vA, 5); + expect(tA.value).toBeCloseTo(r, 5); + expect(tB.value).toBeCloseTo(0, 5); + expect(tC.value).toBeCloseTo(0, 5); // Priority for all three should be positive and ≤ V (decay ≤ 1). 
- expect(tC.priority).toBeGreaterThan(0); - expect(tC.priority).toBeLessThanOrEqual(vC + 1e-9); + expect(tA.priority).toBeGreaterThan(0); + expect(tA.priority).toBeLessThanOrEqual(r + 1e-9); const ep = handle.repos.episodes.getById(eid as unknown as EpisodeRow["id"])!; expect(ep.rTask).toBeCloseTo(result.rHuman, 5); diff --git a/apps/memos-local-plugin/tests/unit/reward/subscriber.test.ts b/apps/memos-local-plugin/tests/unit/reward/subscriber.test.ts index 6664ac62a..7516cc82b 100644 --- a/apps/memos-local-plugin/tests/unit/reward/subscriber.test.ts +++ b/apps/memos-local-plugin/tests/unit/reward/subscriber.test.ts @@ -40,7 +40,7 @@ function makeRunner(spy: RunSpy, behavior: "ok" | "pending" | "error" = "ok") { updates: [], meanAbsValue: 0, maxPriority: 0, - echoParams: { gamma: 0.9, decayHalfLifeDays: 30, now: Date.now() }, + echoParams: { gamma: 0.9, lambda: 0.5, delta: 0.1, decayHalfLifeDays: 30, now: Date.now() }, }, traceIds: [], timings: { summary: 0, score: 0, backprop: 0, persist: 0, total: 0 }, @@ -55,6 +55,8 @@ function makeRunner(spy: RunSpy, behavior: "ok" | "pending" | "error" = "ok") { function cfg(windowSec = 0): RewardConfig { return { gamma: 0.9, + lambda: 0.5, + delta: 0.1, tauSoftmax: 0.5, decayHalfLifeDays: 30, llmScoring: false, @@ -238,7 +240,7 @@ describe("reward/subscriber", () => { updates: [], meanAbsValue: 0, maxPriority: 0, - echoParams: { gamma: 0.9, decayHalfLifeDays: 30, now: Date.now() }, + echoParams: { gamma: 0.9, lambda: 0.5, delta: 0.1, decayHalfLifeDays: 30, now: Date.now() }, }, traceIds: [], timings: { summary: 0, score: 0, backprop: 0, persist: 0, total: 0 }, diff --git a/apps/memos-local-plugin/tests/unit/skill/skill.integration.test.ts b/apps/memos-local-plugin/tests/unit/skill/skill.integration.test.ts index d2280fdc9..771f52c50 100644 --- a/apps/memos-local-plugin/tests/unit/skill/skill.integration.test.ts +++ b/apps/memos-local-plugin/tests/unit/skill/skill.integration.test.ts @@ -255,7 +255,7 @@ describe("skill/runSkill 
(integration)", () => { updates: [], meanAbsValue: 0, maxPriority: 0, - echoParams: { gamma: 0.9, decayHalfLifeDays: 30, now: Date.now() }, + echoParams: { gamma: 0.9, lambda: 0.5, delta: 0.1, decayHalfLifeDays: 30, now: Date.now() }, }, traceIds: [], timings: { summary: 0, score: 0, backprop: 0, persist: 0, total: 0 }, From 617e8d64edc71da49ab8ba8c97073fba77bbf934 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=91=E5=B8=83=E6=9E=97?= <11641432+heiheiyouyou@user.noreply.gitee.com> Date: Thu, 28 May 2026 17:30:42 +0800 Subject: [PATCH 5/6] fix:viewer problems --- .../core/pipeline/memory-core.ts | 14 ++- .../core/pipeline/orchestrator.ts | 68 ++++++++++++++ .../core/session/manager.ts | 3 + .../server/routes/policies.ts | 5 + .../server/routes/session.ts | 4 +- .../memos-local-plugin/server/routes/skill.ts | 41 +++++++- .../memos-local-plugin/server/routes/trace.ts | 11 ++- .../tests/unit/pipeline/orchestrator.test.ts | 93 +++++++++++++++++++ .../viewer/src/views/MemoriesView.tsx | 2 +- 9 files changed, 228 insertions(+), 13 deletions(-) diff --git a/apps/memos-local-plugin/core/pipeline/memory-core.ts b/apps/memos-local-plugin/core/pipeline/memory-core.ts index 1a1a5290f..1261fd5e5 100644 --- a/apps/memos-local-plugin/core/pipeline/memory-core.ts +++ b/apps/memos-local-plugin/core/pipeline/memory-core.ts @@ -473,6 +473,8 @@ export function createMemoryCore( pkgVersion: string, options: CreateMemoryCoreOptions = {}, ): MemoryCore { + // "经验" 列表的 q 过滤是内存子串匹配;扫描深度不足会导致"展示全部"漏项。 + const POLICY_SCAN_LIMIT = 100_000; const bootAt = Date.now(); const log = rootLogger.child({ channel: "core.pipeline.memory-core" }); let telemetry = options.telemetry ?? null; @@ -899,6 +901,7 @@ export function createMemoryCore( topicState: (ep.meta?.topicState as string | undefined) ?? "interrupted", pauseReason: (ep.meta?.pauseReason as string | undefined) ?? "startup_recovered_open_topic", recoveredAtStartup: nowMs, + pausedAt: typeof ep.meta?.pausedAt === "number" ? 
ep.meta.pausedAt : nowMs, }); } if (stale.length > 0) { @@ -2613,9 +2616,10 @@ export function createMemoryCore( const offset = Math.max(0, input?.offset ?? 0); const needle = (input?.q ?? "").trim().toLowerCase(); const namespaceFiltered = Boolean(input?.ownerAgentKind || input?.ownerProfileId); + const shouldDeepScan = namespaceFiltered || needle.length > 0; const rows = handle.repos.policies.list({ status: input?.status, - limit: namespaceFiltered ? 100_000 : limit + offset + (needle ? 200 : 0), + limit: shouldDeepScan ? POLICY_SCAN_LIMIT : limit + offset, offset: 0, }); const visibleRows = rows.filter((r) => @@ -2623,7 +2627,7 @@ export function createMemoryCore( ); const filtered = needle ? visibleRows.filter((r) => - (r.title + "\n" + r.trigger + "\n" + r.procedure) + (r.id + "\n" + r.title + "\n" + r.trigger + "\n" + r.procedure) .toLowerCase() .includes(needle), ) @@ -2648,11 +2652,11 @@ export function createMemoryCore( // q is a client-side substring match; mirror `listPolicies` and // walk the full filtered result. Caller passes no limit/offset // so the natural list pages through everything. 
- const rows = handle.repos.policies.list({ status: input?.status }).filter((r) => + const rows = handle.repos.policies.list({ status: input?.status, limit: POLICY_SCAN_LIMIT }).filter((r) => (input?.includeAllNamespaces || visibleToCurrent(r)) && matchesNamespaceFilter(r, input) ); return rows.filter((r) => - (r.title + "\n" + r.trigger + "\n" + r.procedure) + (r.id + "\n" + r.title + "\n" + r.trigger + "\n" + r.procedure) .toLowerCase() .includes(needle), ).length; @@ -5555,7 +5559,7 @@ export function deriveSkillStatus( if (relatedPolicies.length === 0) { return { status: "not_generated", - reason: "L2 经验归纳尚未产出(可能仍在异步处理中)", + reason: "暂未归纳出 L2 经验", reasonKey: "tasks.skillReason.not_generated.noPolicy", reasonParams: thresholds, linkedSkillId: null, diff --git a/apps/memos-local-plugin/core/pipeline/orchestrator.ts b/apps/memos-local-plugin/core/pipeline/orchestrator.ts index 75dc7e244..fc8d18345 100644 --- a/apps/memos-local-plugin/core/pipeline/orchestrator.ts +++ b/apps/memos-local-plugin/core/pipeline/orchestrator.ts @@ -265,6 +265,19 @@ export function createPipeline(deps: PipelineDeps): PipelineHandle { // `addTurn` calls without a repo round-trip. const openEpisodeBySession = new Map(); + // Paused/interrupted episodes serve as restart-recovery hooks for + // `findRecoverableOpenTopic`. Without a time cap they linger forever + // and steal turns from genuinely-new dashboard sessions (because the + // recovery path explicitly ignores sessionId for paused candidates). + // 1 minute keeps the "user reopened openclaw mid-task" intent intact + // while a brand-new dashboard session 60s+ later gets a fresh episode. + const PAUSED_AUTO_FINALIZE_MS = 60_000; + // Cap the sweep cost to once per N seconds — we trigger it from every + // `onTurnStart`, so without throttling a burst of turns scans the + // episodes table repeatedly. 
+ const PAUSED_SWEEP_THROTTLE_MS = 30_000; + let lastPausedSweepAt = 0; + // Track the most-recently-closed episode per session so V7 §0.1 // "revision" can reopen it. Cleared on `new_task`. const lastEpisodeBySession = new Map< @@ -812,6 +825,60 @@ export function createPipeline(deps: PipelineDeps): PipelineHandle { ); } + /** + * Finalize any episode that has been `topicState: paused | interrupted` + * for longer than `PAUSED_AUTO_FINALIZE_MS`. Legacy paused rows without a + * `pausedAt` stamp are treated as immediately stale (they predate this + * cleanup and should be closed on the first opportunity). + * + * Hooked into `onTurnStart` so a new dashboard session — the common case + * the user actually sees — cannot accidentally graft itself onto a + * weeks-old paused episode via `findRecoverableOpenTopic`. + */ + function sweepStalePausedEpisodes(nowMs: number): void { + if (nowMs - lastPausedSweepAt < PAUSED_SWEEP_THROTTLE_MS) return; + lastPausedSweepAt = nowMs; + try { + const rows = deps.repos.episodes.list({ status: "open", limit: 200 }); + for (const ep of rows) { + const meta = (ep as { meta?: Record }).meta ?? {}; + const topicState = meta.topicState; + if (topicState !== "paused" && topicState !== "interrupted") continue; + const pausedAt = typeof meta.pausedAt === "number" ? meta.pausedAt : null; + const fallbackStamp = ep.endedAt ?? ep.startedAt; + const stamp = pausedAt ?? fallbackStamp; + const ageMs = nowMs - stamp; + if (ageMs < PAUSED_AUTO_FINALIZE_MS) continue; + log.info("episode.paused_auto_finalized", { + episodeId: ep.id, + sessionId: ep.sessionId, + topicState, + ageMs, + thresholdMs: PAUSED_AUTO_FINALIZE_MS, + pausedAt, + }); + try { + session.sessionManager.finalizeEpisode(ep.id as EpisodeId, { + patchMeta: { + topicState: "ended", + closeReason: "auto_finalized_after_pause", + pausedExceededMs: ageMs, + }, + }); + } catch (err) { + log.debug("paused_auto_finalize.finalize_failed", { + episodeId: ep.id, + err: err instanceof Error ? 
err.message : String(err), + }); + } + } + } catch (err) { + log.debug("paused_auto_finalize.scan_failed", { + err: err instanceof Error ? err.message : String(err), + }); + } + } + function findRecoverableOpenTopic( currentSessionId: SessionId, atTs: number, @@ -1048,6 +1115,7 @@ export function createPipeline(deps: PipelineDeps): PipelineHandle { async function onTurnStart(input: TurnInputDTO): Promise { const t0 = now(); + sweepStalePausedEpisodes(t0); const initialSessionId = await ensureSession( input.agent, input.sessionId, diff --git a/apps/memos-local-plugin/core/session/manager.ts b/apps/memos-local-plugin/core/session/manager.ts index 44da570b2..13b28107d 100644 --- a/apps/memos-local-plugin/core/session/manager.ts +++ b/apps/memos-local-plugin/core/session/manager.ts @@ -186,6 +186,7 @@ export function createSessionManager(deps: SessionManagerDeps): SessionManager { topicState: "paused", pauseReason: `session_closed:${reason}`, sessionCloseReason: reason, + pausedAt: now(), }); continue; } @@ -199,6 +200,7 @@ export function createSessionManager(deps: SessionManagerDeps): SessionManager { topicState: "paused", pauseReason: `session_closed:${reason}`, sessionCloseReason: reason, + pausedAt: now(), }); } live.delete(id); @@ -383,6 +385,7 @@ export function createSessionManager(deps: SessionManagerDeps): SessionManager { topicState: "paused", pauseReason: `shutdown:${reason}`, sessionCloseReason: `shutdown:${reason}`, + pausedAt: now(), }); } } diff --git a/apps/memos-local-plugin/server/routes/policies.ts b/apps/memos-local-plugin/server/routes/policies.ts index 14b236cf2..51b9671b4 100644 --- a/apps/memos-local-plugin/server/routes/policies.ts +++ b/apps/memos-local-plugin/server/routes/policies.ts @@ -39,6 +39,9 @@ export function registerPoliciesRoutes(routes: Routes, deps: ServerDeps): void { const q = params.get("q") || undefined; const ownerAgentKind = params.get("ownerAgentKind") || undefined; const ownerProfileId = params.get("ownerProfileId") || 
undefined; + // Viewer 的实例切换(包含“全部实例”与显式实例)应以筛选条件为准, + // 不能再被当前会话命名空间二次裁剪,否则会出现“选了 default 但空列表”。 + const includeAllNamespaces = true; const policies = await deps.core.listPolicies({ status, limit, @@ -46,12 +49,14 @@ export function registerPoliciesRoutes(routes: Routes, deps: ServerDeps): void { q, ownerAgentKind, ownerProfileId, + includeAllNamespaces, }); const total = await deps.core.countPolicies({ status, q, ownerAgentKind, ownerProfileId, + includeAllNamespaces, }); return { policies, diff --git a/apps/memos-local-plugin/server/routes/session.ts b/apps/memos-local-plugin/server/routes/session.ts index 135b55704..0bcf51b51 100644 --- a/apps/memos-local-plugin/server/routes/session.ts +++ b/apps/memos-local-plugin/server/routes/session.ts @@ -120,7 +120,9 @@ export function registerSessionRoutes(routes: Routes, deps: ServerDeps): void { }); if (q) { rows = rows.filter( - (ep: EpisodeListItemDTO) => !!ep.preview && ep.preview.toLowerCase().includes(q), + (ep: EpisodeListItemDTO) => + ep.id.toLowerCase().includes(q) || + (!!ep.preview && ep.preview.toLowerCase().includes(q)), ); } if (status) { diff --git a/apps/memos-local-plugin/server/routes/skill.ts b/apps/memos-local-plugin/server/routes/skill.ts index cf8edd4b2..cfbc03a62 100644 --- a/apps/memos-local-plugin/server/routes/skill.ts +++ b/apps/memos-local-plugin/server/routes/skill.ts @@ -14,10 +14,10 @@ import { parseJson, writeError, type Routes } from "./registry.js"; export function registerSkillRoutes(routes: Routes, deps: ServerDeps): void { routes.set("GET /api/v1/skills", async (ctx) => { const params = ctx.url.searchParams; - const status = (params.get("status") as SkillDTO["status"] | null) ?? 
undefined; + const status = parseSkillStatus(params.get("status")); const q = (params.get("q") || "").trim().toLowerCase(); - const ownerAgentKind = params.get("ownerAgentKind") || undefined; - const ownerProfileId = params.get("ownerProfileId") || undefined; + const ownerAgentKind = parseOwnerFilter(params.get("ownerAgentKind")); + const ownerProfileId = parseOwnerFilter(params.get("ownerProfileId")); // Viewer needs prev/next pagination — ask for one extra page so we // can tell the client whether there's more without a count query. const pageSize = limitOrUndefined(params.get("limit")) ?? 50; @@ -27,10 +27,14 @@ export function registerSkillRoutes(routes: Routes, deps: ServerDeps): void { limit: q ? 5000 : pageSize + offset + 1, ownerAgentKind, ownerProfileId, + includeAllNamespaces: true, }); if (q) { all = all.filter( - (s) => s.name.toLowerCase().includes(q) || s.invocationGuide.toLowerCase().includes(q), + (s) => + s.id.toLowerCase().includes(q) || + s.name.toLowerCase().includes(q) || + s.invocationGuide.toLowerCase().includes(q), ); } const page = all.slice(offset, offset + pageSize); @@ -39,13 +43,30 @@ export function registerSkillRoutes(routes: Routes, deps: ServerDeps): void { status, ownerAgentKind, ownerProfileId, + includeAllNamespaces: true, }); + const debug = params.get("debug") === "1"; return { skills: page, limit: pageSize, offset, total, nextOffset: hasMore ? offset + pageSize : undefined, + ...(debug + ? 
{ + _debug: { + q, + status: params.get("status"), + parsedStatus: status, + ownerAgentKind: params.get("ownerAgentKind"), + parsedOwnerAgentKind: ownerAgentKind, + ownerProfileId: params.get("ownerProfileId"), + parsedOwnerProfileId: ownerProfileId, + allCount: all.length, + pageCount: page.length, + }, + } + : {}), }; }); @@ -298,6 +319,18 @@ export function registerSkillRoutes(routes: Routes, deps: ServerDeps): void { }); } +function parseSkillStatus(raw: string | null): SkillDTO["status"] | undefined { + if (raw === "active" || raw === "candidate" || raw === "archived") return raw; + return undefined; +} + +function parseOwnerFilter(raw: string | null): string | undefined { + if (!raw) return undefined; + const v = raw.trim().toLowerCase(); + if (!v || v === "all" || v === "*") return undefined; + return raw; +} + function renderSkillMarkdown(skill: SkillDTO): string { const name = skill.name || skill.id; const description = skillDescription(skill.invocationGuide, name); diff --git a/apps/memos-local-plugin/server/routes/trace.ts b/apps/memos-local-plugin/server/routes/trace.ts index ec1dca4c8..409e919fe 100644 --- a/apps/memos-local-plugin/server/routes/trace.ts +++ b/apps/memos-local-plugin/server/routes/trace.ts @@ -35,8 +35,8 @@ export function registerTraceRoutes(routes: Routes, deps: ServerDeps): void { const offset = Number.isFinite(parsedOffset) && parsedOffset >= 0 ? 
parsedOffset : 0; const sessionId = params.get("sessionId") || undefined; const namespace = parseNamespace(params.get("namespace")); - const ownerAgentKind = params.get("ownerAgentKind") || namespace?.ownerAgentKind || undefined; - const ownerProfileId = params.get("ownerProfileId") || namespace?.ownerProfileId || undefined; + const ownerAgentKind = parseOwnerFilter(params.get("ownerAgentKind")) || namespace?.ownerAgentKind || undefined; + const ownerProfileId = parseOwnerFilter(params.get("ownerProfileId")) || namespace?.ownerProfileId || undefined; const q = params.get("q") || undefined; // When `groupByTurn=true`, pagination treats each (episodeId, turnId) // pair as one "memory" — matching the viewer's grouped display where @@ -224,3 +224,10 @@ function parseNamespace(value: string | null): { ownerAgentKind: string; ownerPr if (!ownerAgentKind || !ownerProfileId) return null; return { ownerAgentKind, ownerProfileId }; } + +function parseOwnerFilter(raw: string | null): string | undefined { + if (!raw) return undefined; + const v = raw.trim().toLowerCase(); + if (!v || v === "all" || v === "*") return undefined; + return raw; +} diff --git a/apps/memos-local-plugin/tests/unit/pipeline/orchestrator.test.ts b/apps/memos-local-plugin/tests/unit/pipeline/orchestrator.test.ts index f4826a6f3..bcac43cab 100644 --- a/apps/memos-local-plugin/tests/unit/pipeline/orchestrator.test.ts +++ b/apps/memos-local-plugin/tests/unit/pipeline/orchestrator.test.ts @@ -258,6 +258,99 @@ describe("pipeline/orchestrator", () => { expect(packet.tierLatencyMs).toBeDefined(); }); + it("auto-finalizes paused episodes older than 60s so a new session starts fresh", async () => { + // Reproduces the bug where a new dashboard session was silently grafted + // onto a previous session's paused-but-open episode after openclaw + // restart. With the fix, the per-turn sweep finalizes paused episodes + // older than 60s so `findRecoverableOpenTopic` cannot grab them. 
+ let nowMs = 1_700_000_000_000; + const deps: PipelineDeps = { ...buildDeps(dbHandle!), now: () => nowMs }; + pipeline = createPipeline(deps); + + // Session A — one full turn so the episode has real content. + const sidA = "s-prev-session"; + const startA = await pipeline.onTurnStart({ + agent: "openclaw", + sessionId: sidA, + userText: "fix the broken build", + ts: nowMs, + }); + await pipeline.onTurnEnd({ + agent: "openclaw", + sessionId: sidA, + episodeId: startA.episodeId ?? "ep-ignored", + agentText: "Running make…", + toolCalls: [], + ts: nowMs + 5_000, + }); + const epA = startA.episodeId!; + + // Simulate openclaw shutdown — pauses (not finalizes) the open episode. + pipeline.sessionManager.closeSession(sidA, "shutdown:test"); + const pausedRow = dbHandle!.repos.episodes.getById(epA); + expect(pausedRow!.status).toBe("open"); + const pausedMeta = (pausedRow as unknown as { meta: Record }).meta; + expect(pausedMeta.topicState).toBe("paused"); + expect(typeof pausedMeta.pausedAt).toBe("number"); + + // 95s later — past the 60s pause window. A brand-new dashboard session + // arrives. Sweep should finalize epA BEFORE `findRecoverableOpenTopic` + // looks at it, so the new session starts its own episode. + nowMs += 95_000; + const sidB = "s-new-session"; + const startB = await pipeline.onTurnStart({ + agent: "openclaw", + sessionId: sidB, + userText: "starting fresh", + ts: nowMs, + }); + expect(dbHandle!.repos.episodes.getById(epA)!.status).toBe("closed"); + expect(startB.episodeId).toBeDefined(); + expect(startB.episodeId).not.toBe(epA); + const epB = dbHandle!.repos.episodes.getById(startB.episodeId!); + expect(epB!.sessionId).toBe(sidB); + }); + + it("preserves recovery when a new session arrives within the 60s pause window", async () => { + // Conjugate of the test above: if openclaw restarts and the user picks + // up within 60s, `findRecoverableOpenTopic` should still graft the new + // turn onto the prior open episode — the recovery feature is intact. 
+ let nowMs = 1_700_000_000_000; + const deps: PipelineDeps = { ...buildDeps(dbHandle!), now: () => nowMs }; + pipeline = createPipeline(deps); + + const sidA = "s-prior"; + const startA = await pipeline.onTurnStart({ + agent: "openclaw", + sessionId: sidA, + userText: "hello", + ts: nowMs, + }); + await pipeline.onTurnEnd({ + agent: "openclaw", + sessionId: sidA, + episodeId: startA.episodeId ?? "ep-ignored", + agentText: "hi", + toolCalls: [], + ts: nowMs + 1_000, + }); + const epA = startA.episodeId!; + pipeline.sessionManager.closeSession(sidA, "shutdown:test"); + expect((dbHandle!.repos.episodes.getById(epA)! as unknown as { meta: Record }).meta.topicState).toBe("paused"); + + // 31s later — within the 60s window. New session recovers epA. + nowMs += 31_000; + const sidB = "s-resumed"; + const startB = await pipeline.onTurnStart({ + agent: "openclaw", + sessionId: sidB, + userText: "i'm back", + ts: nowMs, + }); + expect(startB.episodeId).toBe(epA); + expect(dbHandle!.repos.episodes.getById(epA)!.status).toBe("open"); + }); + it("shutdown drains async work before detaching subscribers", async () => { pipeline = createPipeline(buildDeps(dbHandle!)); await pipeline.onTurnStart({ diff --git a/apps/memos-local-plugin/viewer/src/views/MemoriesView.tsx b/apps/memos-local-plugin/viewer/src/views/MemoriesView.tsx index 189bb3208..0aaa7bde0 100644 --- a/apps/memos-local-plugin/viewer/src/views/MemoriesView.tsx +++ b/apps/memos-local-plugin/viewer/src/views/MemoriesView.tsx @@ -1150,7 +1150,7 @@ function TraceDrawer({
{group.episodeId - ? t("memories.detail.fromTask", { id: group.episodeId.slice(0, 10) }) + ? t("memories.detail.fromTask", { id: group.episodeId }) : t("memories.detail.oneMemory")}

{title}

From 5331c227d82cf8aee40b3e4d11a27f80c1746b5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=91=E5=B8=83=E6=9E=97?= <11641432+heiheiyouyou@user.noreply.gitee.com> Date: Fri, 29 May 2026 11:25:12 +0800 Subject: [PATCH 6/6] fix: update reflection template --- .../adapters/openclaw/index.ts | 92 +++++++++++----- .../core/capture/batch-scorer.ts | 19 +++- .../core/llm/prompts/reflection.ts | 103 ++++++++++++------ .../unit/adapters/openclaw-runtime.test.ts | 11 +- .../tests/unit/capture/batch-scorer.test.ts | 27 +++++ .../tests/unit/capture/capture-batch.test.ts | 6 +- .../tests/unit/viewer/tasks-chat.test.ts | 29 +++-- .../viewer/src/stores/i18n.ts | 2 + .../viewer/src/views/tasks-chat-data.ts | 44 +++++++- .../viewer/src/views/tasks-chat.tsx | 29 +++++ 10 files changed, 284 insertions(+), 78 deletions(-) diff --git a/apps/memos-local-plugin/adapters/openclaw/index.ts b/apps/memos-local-plugin/adapters/openclaw/index.ts index 9c318889a..013d3a0a3 100644 --- a/apps/memos-local-plugin/adapters/openclaw/index.ts +++ b/apps/memos-local-plugin/adapters/openclaw/index.ts @@ -89,6 +89,28 @@ interface PluginRuntime { shutdown: () => Promise; } +interface SharedRuntimeState { + registrations: number; + runtime: PluginRuntime | null; + bootstrapError: Error | null; + bootstrapPromise: Promise; +} + +const SHARED_RUNTIME_KEY = "__memos_local_openclaw_runtime_v1__"; + +function readSharedRuntimeState(): SharedRuntimeState | null { + const g = globalThis as Record; + const state = g[SHARED_RUNTIME_KEY]; + if (!state || typeof state !== "object") return null; + return state as SharedRuntimeState; +} + +function writeSharedRuntimeState(state: SharedRuntimeState | null): void { + const g = globalThis as Record; + if (state) g[SHARED_RUNTIME_KEY] = state; + else delete g[SHARED_RUNTIME_KEY]; +} + /** * Locate the plugin source root (the directory holding `package.json`, * `bridge.cts`, etc.). 
Two layouts to support: built tarball @@ -276,9 +298,7 @@ async function closeViewerAfterFailedBootstrap( } } -// ─── Registration ────────────────────────────────────────────────────────── - -function register(api: OpenClawPluginApi): void { +function createSharedRuntimeState(api: OpenClawPluginApi): SharedRuntimeState { let runtimeLock: OpenClawRuntimeLockHandle; try { runtimeLock = acquireOpenClawRuntimeLock({ @@ -296,6 +316,40 @@ function register(api: OpenClawPluginApi): void { throw err; } + const state: SharedRuntimeState = { + registrations: 0, + runtime: null, + bootstrapError: null, + bootstrapPromise: Promise.resolve(), + }; + state.bootstrapPromise = createRuntime(api, runtimeLock) + .then((runtime) => { + state.runtime = runtime; + api.logger.info("memos-local: plugin ready"); + }) + .catch((err) => { + state.bootstrapError = err instanceof Error ? err : new Error(String(err)); + const duplicate = err instanceof DuplicateOpenClawRuntimeError; + api.logger.error("memos-local: bootstrap failed", { + err: state.bootstrapError.message, + code: duplicate ? err.code : (err as { code?: unknown }).code, + }); + }); + return state; +} + +// ─── Registration ────────────────────────────────────────────────────────── + +function register(api: OpenClawPluginApi): void { + let state = readSharedRuntimeState(); + if (!state) { + state = createSharedRuntimeState(api); + writeSharedRuntimeState(state); + } else { + api.logger.info("memos-local: reusing in-process shared runtime"); + } + state.registrations += 1; + // 1. Memory capability (prompt prelude) — register synchronously so the // host immediately knows who owns the memory slot, even if bootstrap // fails later. @@ -351,26 +405,10 @@ function register(api: OpenClawPluginApi): void { // 2. Kick off core bootstrap. OpenClaw only accepts tool / hook // registration during the synchronous `register(api)` window, so // tools register a shell now and wait for runtime inside execute(). 
- let runtime: PluginRuntime | null = null; - let bootstrapError: Error | null = null; - const bootstrapPromise = createRuntime(api, runtimeLock) - .then((r) => { - runtime = r; - api.logger.info("memos-local: plugin ready"); - }) - .catch((err) => { - bootstrapError = err instanceof Error ? err : new Error(String(err)); - const duplicate = err instanceof DuplicateOpenClawRuntimeError; - api.logger.error("memos-local: bootstrap failed", { - err: bootstrapError.message, - code: duplicate ? err.code : (err as { code?: unknown }).code, - }); - }); - const ensureRuntime = async (): Promise => { - if (runtime) return runtime; - await bootstrapPromise; - return runtime; + if (state.runtime) return state.runtime; + await state.bootstrapPromise; + return state.runtime; }; registerOpenClawTools(api, { @@ -448,11 +486,15 @@ function register(api: OpenClawPluginApi): void { id: "memos-local", name: "memos-local", async start() { - await bootstrapPromise; - if (bootstrapError) throw bootstrapError; + await state.bootstrapPromise; + if (state.bootstrapError) throw state.bootstrapError; }, async stop() { - if (runtime) await runtime.shutdown(); + state.registrations = Math.max(0, state.registrations - 1); + if (state.registrations > 0) return; + const runtimeToStop = state.runtime; + writeSharedRuntimeState(null); + if (runtimeToStop) await runtimeToStop.shutdown(); }, }); } diff --git a/apps/memos-local-plugin/core/capture/batch-scorer.ts b/apps/memos-local-plugin/core/capture/batch-scorer.ts index 31578555b..74a89f8c2 100644 --- a/apps/memos-local-plugin/core/capture/batch-scorer.ts +++ b/apps/memos-local-plugin/core/capture/batch-scorer.ts @@ -136,7 +136,9 @@ export async function batchScoreReflections( source: "none", }; } - const label = mapRawRelevance(raw.relevance); + const baseLabel = mapRawRelevance(raw.relevance); + const socialOnly = isSocialOnlyStep(input.step); + const label: ReflectionScore["text"] = socialOnly ? 
"IRRELEVANT" : baseLabel; const alpha = alphaForReflection(label); const reason = sanitizeReason(raw.reason); if (reason === null) missingReasonCount += 1; @@ -144,7 +146,7 @@ export async function batchScoreReflections( text: label, alpha, usable: alpha > 0, - reason, + reason: socialOnly ? "SOCIAL_ONLY" : reason, source: "synth", model: rsp.servedBy, }; @@ -276,3 +278,16 @@ function sanitizeReason(value: unknown): string | null { if (!cleaned) return null; return cleaned.slice(0, 80); } + +function isSocialOnlyStep(step: NormalizedStep): boolean { + if (step.toolCalls.length > 0) return false; + const combined = `${step.userText}\n${step.agentText}\n${step.agentThinking ?? ""}`.toLowerCase(); + if (!combined.trim()) return false; + + const socialPattern = + /(谢谢|感谢|辛苦|棒|很好|很对|厉害|夸奖|客气|不用谢|再见|拜拜|你好|您好|早上好|晚上好|thank(s| you)?|appreciate|great job|well done|awesome|nice|you're welcome|no problem|bye|goodbye|hello|hi)/i; + const taskSignalPattern = + /(修复|实现|改|更新|测试|报错|错误|命令|脚本|代码|函数|文件|数据库|sql|trace|episode|reward|reflection|alpha|value|fix|implement|update|test|error|command|script|code|function|file|db|database|query|bug|issue|task)/i; + + return socialPattern.test(combined) && !taskSignalPattern.test(combined); +} diff --git a/apps/memos-local-plugin/core/llm/prompts/reflection.ts b/apps/memos-local-plugin/core/llm/prompts/reflection.ts index 8e52b484e..f544ee81b 100644 --- a/apps/memos-local-plugin/core/llm/prompts/reflection.ts +++ b/apps/memos-local-plugin/core/llm/prompts/reflection.ts @@ -13,49 +13,90 @@ import type { PromptDef } from "./index.js"; */ export const BATCH_REFLECTION_PROMPT: PromptDef = { id: "reflection.batch", - version: 6, + version: 9, description: "Tri-valued path-relevance scoring for each step in an episode window.", system: `You are reviewing a WINDOW of one AI agent episode. -INPUT: a JSON array under "steps". Each entry has: +Payload top-level fields: "steps" (required, array) and "task_context" +(optional episode-level task summary). 
Each entry in "steps" has: - "idx": step index (integer, 0-based, sequential) - "state": what the agent saw before acting (user prompt / prior obs) -- "thinking": the LLM's native chain-of-thought for this step - (Claude extended-thinking / pi-ai ThinkingContent). May - be empty string. +- "thinking": the LLM's chain-of-thought for this step. May be empty. - "action": what the agent chose to do (assistant text) -- "tool_calls": the tools invoked, with inputs + outputs + errorCode. - May be empty array. Tool usage + outcomes are - first-class evidence for scoring the step. +- "tool_calls": tools invoked, with inputs + outputs + errorCode. May + be empty. Tool usage + outcomes are first-class evidence. - "outcome": the step's final observable outcome (last tool output, error, or "(assistant-only step)" for pure text turns) -- "task_context": optional episode-level task summary. - -The user payload may also include "host_context". That describes the host -agent being reviewed and the separate reflection model doing this review. Goal: decide each step's relevance to the final trajectory. You must NOT produce long natural-language reflection text. -For EACH input step, return one object containing: +Hard override (must follow): if a step is purely social/polite phatic +exchange (praise, thanks, greetings, apologies, small talk — "you did +great", "thank you", "bye", etc.) and does not add task constraints, +technical decisions, executable actions, debugging evidence, or progress +toward completion, label it IRRELEVANT — even if sentiment is positive. + +Scoring rubric (apply in order: IRRELEVANT vs on-path, then RELATED vs PIVOTAL): + +- IRRELEVANT => off-path, ineffective, or social-only (see hard override above). +- RELATED => any step that is useful and on the task path. This is the default + for on-path work. Do NOT reserve RELATED only for "deletable" steps; many + RELATED steps are necessary, and deletion cost is NOT the criterion. 
+- PIVOTAL => a strict subset of RELATED: mark PIVOTAL only when the step is + a path-critical turning point or foundational decision for the episode. + Prefer few PIVOTAL labels per window. Typical PIVOTAL cases: + * Prior exploration failed or stalled; this step finds the correct + approach, root cause, or workable fix that later steps build on. + * The step establishes the episode's core plan, architecture, constraints, + or governing principles that shape how the rest of the task runs. + Do NOT use counterfactual deletion ("if removed, major rework/failure") as + the main test — many RELATED steps would also be costly to remove. Reserve + PIVOTAL for steps that change direction or set the backbone of the solution, + not for routine on-path execution (reading files, minor edits, status updates, + generic tool calls that merely continue an already-correct plan). + +Calibration examples (PIVOTAL is RELATIVE to prior steps in the window — +look at the sequence, not the step in isolation): + +Sequence A — recovery after exploration: + step 0: try \`from foo import bar\` -> ImportError + -> RELATED, reason "EXPLORATION" + step 1: try \`from foo.bar import baz\` -> ImportError + -> RELATED, reason "EXPLORATION" + step 2: grep project, discover \`bar\` lives under \`foo.utils.bar\` + -> PIVOTAL, reason "ROOT_CAUSE" + (prior two steps stalled; this step unblocks the rest) + step 3: rewrite import -> tests pass + -> RELATED, reason "EXECUTION" + +Sequence B — plan anchor at the start: + step 0: user gives vague request "build me a chat bot" + -> IRRELEVANT, reason "NO_ACTION" + step 1: after clarifying, lock in "FastAPI + WebSocket, single room" + -> PIVOTAL, reason "PLAN_ANCHOR" + (every later step is built on this architectural choice) + step 2: scaffold the project directory + -> RELATED, reason "EXECUTION" + step 3: implement WebSocket handler + -> RELATED, reason "EXECUTION" + +Sequence C — routine on-path, NO PIVOTAL needed: + step 0: read config.json + -> RELATED, 
reason "READ_CONFIG" + step 1: change port field 8080 -> 9090 + -> RELATED, reason "CONFIG_EDIT" + step 2: restart service -> ok + -> RELATED, reason "VERIFY" + (Linear execution with no turning point. A window can legitimately + contain zero PIVOTAL steps — do NOT force one.) + +Output: a JSON object \`{"scores": [...]}\` with exactly one entry per input +step, in input order — no skips, no extras. Each entry: - "idx": copy the input idx exactly -- "relevance": MUST be one of "IRRELEVANT", "RELATED", "PIVOTAL" - * IRRELEVANT => detour / ineffective / not on useful path - * RELATED => useful on-path support step - * PIVOTAL => key turning point, removing it would cause major rework/failure - * IMPORTANT: NEVER output "RELATED_DEFAULT" -- "reason": short code-like reason, <= 8 words (e.g. "ON_PATH", "DETOUR") - -Return JSON of the form: -{ - "scores": [ - {"idx": 0, "relevance": "RELATED", "reason": "ON_PATH"}, - {"idx": 1, "relevance": "PIVOTAL", "reason": "RECOVERY"}, - {"idx": 2, "relevance": "IRRELEVANT", "reason": "DETOUR"} - ] -} - -The "scores" array MUST contain exactly one entry per input step, in input -order. Do not skip steps. 
Do not invent extra entries.`, +- "relevance": one of "IRRELEVANT" | "RELATED" | "PIVOTAL" (NEVER emit + "RELATED_DEFAULT" — that label is backend-only) +- "reason": short code-like reason, <= 8 words (see calibration sequences + above for example codes)`, }; diff --git a/apps/memos-local-plugin/tests/unit/adapters/openclaw-runtime.test.ts b/apps/memos-local-plugin/tests/unit/adapters/openclaw-runtime.test.ts index 19378853a..b7e112ee3 100644 --- a/apps/memos-local-plugin/tests/unit/adapters/openclaw-runtime.test.ts +++ b/apps/memos-local-plugin/tests/unit/adapters/openclaw-runtime.test.ts @@ -108,7 +108,7 @@ function deferred() { } describe("OpenClaw adapter runtime lifecycle", () => { - it("blocks a duplicate register before the second runtime bootstraps", async () => { + it("reuses the in-process runtime across repeated register() calls", async () => { const home = useTempMemosHome(); const firstCore = makeCore(); const boot = deferred<{ core: ReturnType; config: typeof DEFAULT_CONFIG; home: ResolvedHome }>(); @@ -126,14 +126,17 @@ describe("OpenClaw adapter runtime lifecycle", () => { expect(bootstrapMemoryCoreFull).toHaveBeenCalledTimes(1); const api2 = makeApi(); - expect(() => plugin.register(api2)).toThrow(/already active/); + expect(() => plugin.register(api2)).not.toThrow(); expect(bootstrapMemoryCoreFull).toHaveBeenCalledTimes(1); - expect(api2.registerTool).not.toHaveBeenCalled(); - expect(api2.on).not.toHaveBeenCalled(); + expect(api2.registerTool).toHaveBeenCalled(); + expect(api2.on).toHaveBeenCalled(); boot.resolve({ core: firstCore, config: DEFAULT_CONFIG, home }); await api1.services[0]!.start?.(); + await api2.services[0]!.start?.(); await api1.services[0]!.stop?.(); + expect(fs.existsSync(path.join(home.daemonDir, "openclaw-runtime.lock"))).toBe(true); + await api2.services[0]!.stop?.(); expect(fs.existsSync(path.join(home.daemonDir, "openclaw-runtime.lock"))).toBe(false); }); diff --git 
a/apps/memos-local-plugin/tests/unit/capture/batch-scorer.test.ts b/apps/memos-local-plugin/tests/unit/capture/batch-scorer.test.ts index b81365076..78290e97f 100644 --- a/apps/memos-local-plugin/tests/unit/capture/batch-scorer.test.ts +++ b/apps/memos-local-plugin/tests/unit/capture/batch-scorer.test.ts @@ -186,4 +186,31 @@ describe("batchScoreReflections", () => { expect(out.scores[0]!.text).toBe("PIVOTAL"); expect(out.scores[0]!.alpha).toBe(1); }); + + it("forces social-only turns to IRRELEVANT as fallback", async () => { + const llm = fakeLlm({ + completeJson: { + [BATCH_OP_TAG]: { + scores: [{ idx: 0, relevance: "PIVOTAL", reason: "TURNING_POINT" }], + }, + }, + }); + const out = await batchScoreReflections( + llm, + [ + input( + step({ + userText: "你做的很对,运行起来也很流畅,棒!", + agentText: "谢谢夸奖!有需要随时说。", + toolCalls: [], + }), + null, + ), + ], + {}, + ); + expect(out.scores[0]!.text).toBe("IRRELEVANT"); + expect(out.scores[0]!.alpha).toBe(0); + expect(out.scores[0]!.reason).toBe("SOCIAL_ONLY"); + }); }); diff --git a/apps/memos-local-plugin/tests/unit/capture/capture-batch.test.ts b/apps/memos-local-plugin/tests/unit/capture/capture-batch.test.ts index 381e9f3ac..acce77303 100644 --- a/apps/memos-local-plugin/tests/unit/capture/capture-batch.test.ts +++ b/apps/memos-local-plugin/tests/unit/capture/capture-batch.test.ts @@ -166,7 +166,7 @@ describe("capture/pipeline (windowed binary path)", () => { }); } - it("single window writes RELATED/IRRELEVANT with alpha 1/0", async () => { + it("single window writes tri-valued reflections and social fallback", async () => { const llm = fakeLlm({ completeJson: { [batchOp]: { @@ -205,8 +205,8 @@ describe("capture/pipeline (windowed binary path)", () => { expect(rows[0]!.alpha).toBe(0.5); expect(rows[1]!.reflection).toBe("IRRELEVANT"); expect(rows[1]!.alpha).toBe(0); - expect(rows[2]!.reflection).toBe("PIVOTAL"); - expect(rows[2]!.alpha).toBe(1); + expect(rows[2]!.reflection).toBe("IRRELEVANT"); + expect(rows[2]!.alpha).toBe(0); 
}); it("window overlap conflict uses alpha=1 override", async () => { diff --git a/apps/memos-local-plugin/tests/unit/viewer/tasks-chat.test.ts b/apps/memos-local-plugin/tests/unit/viewer/tasks-chat.test.ts index a4c0ba970..281371434 100644 --- a/apps/memos-local-plugin/tests/unit/viewer/tasks-chat.test.ts +++ b/apps/memos-local-plugin/tests/unit/viewer/tasks-chat.test.ts @@ -27,12 +27,13 @@ function trace(part: Partial): TimelineTrace { agentThinking: part.agentThinking ?? null, reflection: part.reflection ?? null, value: part.value ?? 0, + alpha: part.alpha, toolCalls: part.toolCalls ?? [], }; } describe("flattenChat", () => { - it("emits user → tool cards with thinking → assistant; reflection is dropped", () => { + it("emits user → tool cards with thinking → assistant and attaches reflection metadata", () => { const t = trace({ id: "tr1", userText: "go fix the deploy", @@ -74,15 +75,12 @@ describe("flattenChat", () => { expect(msgs[1]!.errorCode).toBe("EXIT_1"); expect(msgs[1]!.toolDurationMs).toBe(190); expect(msgs[3]!.text).toBe("done — see PR #42"); - for (const m of msgs) { - expect(m.text).not.toContain("INTERNAL: scoring note"); - } + expect(msgs[1]!.relatedLabel).toContain("INTERNAL: scoring note"); + expect(msgs[3]!.relatedLabel).toContain("INTERNAL: scoring note"); + expect(msgs[1]!.scoreLabel).toBe("V 0.00"); }); - it("never emits a thinking bubble when the trace only has a reflection", () => { - // V7 §0.1 separation regression: reflection is plugin-internal - // scoring data and must NOT pollute the conversation log even - // when no agentThinking is present. 
+ it("does not emit a standalone bubble when only reflection exists", () => { const t = trace({ id: "tr_nothink", userText: "x", @@ -91,6 +89,7 @@ describe("flattenChat", () => { }); const msgs = flattenChat([t]); expect(msgs.map((m) => m.role)).toEqual(["user", "assistant"]); + expect(msgs[1]!.relatedLabel).toBe("this should not appear in the chat log"); }); it("sorts tool calls within a trace by startedAt", () => { @@ -179,6 +178,20 @@ describe("flattenChat", () => { expect(msgs.map((m) => m.role)).toEqual(["tool"]); }); + it("formats score label with alpha when available", () => { + const t = trace({ + id: "tr_score", + userText: "q", + agentText: "a", + value: 0.88, + alpha: 0.42, + }); + const msgs = flattenChat([t]); + const nonUser = msgs.filter((m) => m.role !== "user"); + expect(nonUser).toHaveLength(1); + expect(nonUser[0]!.scoreLabel).toBe("V 0.88 · α 0.42"); + }); + it("serialises object tool inputs as pretty JSON, leaves strings alone", () => { const t = trace({ id: "tr5", diff --git a/apps/memos-local-plugin/viewer/src/stores/i18n.ts b/apps/memos-local-plugin/viewer/src/stores/i18n.ts index 8fd5348de..ddf595438 100644 --- a/apps/memos-local-plugin/viewer/src/stores/i18n.ts +++ b/apps/memos-local-plugin/viewer/src/stores/i18n.ts @@ -505,6 +505,7 @@ const en = { "tasks.chat.role.assistant": "Assistant", "tasks.chat.role.tool": "Tool", "tasks.chat.role.thinking": "Thinking", + "tasks.chat.role.reflection": "related", "tasks.chat.tool.assistantTextBefore": "Assistant text before tool", "tasks.chat.tool.thinking": "Thinking", "tasks.chat.tool.input": "Input", @@ -1370,6 +1371,7 @@ const zh: Record = { "tasks.chat.role.assistant": "助手", "tasks.chat.role.tool": "工具", "tasks.chat.role.thinking": "思考", + "tasks.chat.role.reflection": "相关性", "tasks.chat.tool.assistantTextBefore": "工具前回复", "tasks.chat.tool.thinking": "工具前思考", "tasks.chat.tool.input": "输入", diff --git a/apps/memos-local-plugin/viewer/src/views/tasks-chat-data.ts 
b/apps/memos-local-plugin/viewer/src/views/tasks-chat-data.ts index fe12010e8..fe2e0518a 100644 --- a/apps/memos-local-plugin/viewer/src/views/tasks-chat-data.ts +++ b/apps/memos-local-plugin/viewer/src/views/tasks-chat-data.ts @@ -40,6 +40,7 @@ export interface TimelineTrace { */ reflection?: string | null; value: number; + alpha?: number; toolCalls?: TimelineToolCall[]; } @@ -54,6 +55,12 @@ export interface ChatMsg { key: string; /** Trace id this message originates from (so we can deep-link later). */ traceId: string; + /** Per-trace score label rendered in message meta (e.g. V 0.83 · α 0.41). */ + scoreLabel?: string; + /** MemOS reflection label (e.g. PIVOTAL / IRRELEVANT). */ + relatedLabel?: string; + /** Localized relatedness title chosen by trace query language. */ + relatedTitle?: string; // Tool-only fields: toolName?: string; /** Visible assistant narration emitted before this tool call. */ @@ -107,11 +114,8 @@ const TOOL_OUTPUT_PREVIEW_CHARS = 1_600; * inside the corresponding tool card. * 3. `assistant` — the assistant's final text reply (if non-empty). * - * `trace.reflection` is **deliberately not** turned into a chat bubble. - * Reflection is the MemOS plugin's own post-hoc note used to compute - * α + R_human backprop — an internal scoring signal, not part of the - * user↔agent conversation. The trace drawer surfaces it under a - * dedicated "Reflection" panel. + * `trace.reflection` is attached as per-trace metadata (`relatedLabel`) + * and rendered in each trace bubble footer rather than a standalone bubble. * * The function never throws on malformed input — missing fields are * dropped silently, unknown JSON is best-effort serialised, and tool @@ -123,12 +127,18 @@ export function flattenChat(traces: readonly TimelineTrace[]): ChatMsg[] { const userTrace = group.find((tr) => (tr.userText ?? "").trim().length > 0); const userText = (userTrace?.userText ?? 
"").trim(); if (userTrace && userText) { + const userScoreLabel = formatTraceScore(userTrace); + const userRelated = (userTrace.reflection ?? "").trim(); + const userRelatedTitle = pickRelatedTitleByTrace(userTrace); out.push({ role: "user", text: userText, ts: userTrace.turnId ?? userTrace.ts, key: `${userTrace.id}:user`, traceId: userTrace.id, + scoreLabel: userScoreLabel, + relatedLabel: userRelated || undefined, + relatedTitle: userRelatedTitle, }); } @@ -168,6 +178,9 @@ function groupTracesByTurn(traces: readonly TimelineTrace[]): TimelineTrace[][] } function appendTraceMessages(out: ChatMsg[], tr: TimelineTrace): void { + const scoreLabel = formatTraceScore(tr); + const relatedLabel = (tr.reflection ?? "").trim() || undefined; + const relatedTitle = pickRelatedTitleByTrace(tr); const tools = [...(tr.toolCalls ?? [])].sort((a, b) => { const at = a.startedAt ?? Number.POSITIVE_INFINITY; const bt = b.startedAt ?? Number.POSITIVE_INFINITY; @@ -186,6 +199,9 @@ function appendTraceMessages(out: ChatMsg[], tr: TimelineTrace): void { ts: tr.ts, key: `${tr.id}:thinking`, traceId: tr.id, + scoreLabel, + relatedLabel, + relatedTitle, }); } } @@ -205,6 +221,9 @@ function appendTraceMessages(out: ChatMsg[], tr: TimelineTrace): void { ts: tc.startedAt, key: `${tr.id}:tool:${idx}`, traceId: tr.id, + scoreLabel, + relatedLabel, + relatedTitle, toolName: tc.name, toolAssistantTextBefore: assistantBefore || undefined, toolThinking: tb || undefined, @@ -223,6 +242,9 @@ function appendTraceMessages(out: ChatMsg[], tr: TimelineTrace): void { ts: tr.ts, key: `${tr.id}:assistant`, traceId: tr.id, + scoreLabel, + relatedLabel, + relatedTitle, }); } } @@ -334,3 +356,15 @@ function serializeToolPayload(v: unknown): string { function clip(s: string, n: number): string { return s.length > n ? `${s.slice(0, n)}…` : s; } + +function formatTraceScore(tr: TimelineTrace): string { + const value = Number.isFinite(tr.value) ? 
tr.value : 0; + const alpha = tr.alpha; + if (Number.isFinite(alpha)) return `V ${value.toFixed(2)} · α ${alpha!.toFixed(2)}`; + return `V ${value.toFixed(2)}`; +} + +function pickRelatedTitleByTrace(tr: TimelineTrace): string { + const basis = `${tr.userText ?? ""}\n${tr.agentText ?? ""}`; + return /[\u3400-\u9fff]/.test(basis) ? "相关性" : "related"; +} diff --git a/apps/memos-local-plugin/viewer/src/views/tasks-chat.tsx b/apps/memos-local-plugin/viewer/src/views/tasks-chat.tsx index 4a3a3c669..f48bc2904 100644 --- a/apps/memos-local-plugin/viewer/src/views/tasks-chat.tsx +++ b/apps/memos-local-plugin/viewer/src/views/tasks-chat.tsx @@ -144,6 +144,7 @@ export function ChatBubble({ msg }: { msg: ChatMsg }) { (msg.role === "user" || msg.role === "assistant") && msg.text.length > threshold; const [expanded, setExpanded] = useState(false); + const footer = traceFooter(msg); return (
@@ -154,6 +155,9 @@ export function ChatBubble({ msg }: { msg: ChatMsg }) {
{roleLabel(msg)} {time} + {msg.scoreLabel && ( + {msg.scoreLabel} + )} {msg.role === "tool" && msg.toolDurationMs != null && ( {msg.toolDurationMs}ms )} @@ -166,12 +170,22 @@ export function ChatBubble({ msg }: { msg: ChatMsg }) { ) : msg.role === "thinking" ? (
+ {footer && ( +
+ {footer} +
+ )}
) : (
+ {footer && ( +
+ {footer} +
+ )} {collapsible && !expanded && (
)} @@ -196,6 +210,7 @@ function ToolBubble({ msg }: { msg: ChatMsg }) { const klass = "chat-item__bubble chat-item__bubble--tool" + (errored ? " chat-item__bubble--error" : ""); + const footer = traceFooter(msg); return (
@@ -256,6 +271,11 @@ function ToolBubble({ msg }: { msg: ChatMsg }) { {t("tasks.chat.tool.noPayload")}
)} + {footer && ( +
+ {footer} +
+ )}
); } @@ -267,6 +287,15 @@ function roleLabel(msg: ChatMsg): string { return t(`tasks.chat.role.${msg.role}` as "tasks.chat.role.user"); } +function traceFooter(msg: ChatMsg): string { + const parts: string[] = []; + if (msg.relatedLabel) { + parts.push(`${msg.relatedTitle ?? t("tasks.chat.role.reflection" as "tasks.chat.role.user")}: ${msg.relatedLabel}`); + } + if (msg.scoreLabel) parts.push(msg.scoreLabel); + return parts.join(" · "); +} + function formatTime(ts?: number): string { if (!ts) return ""; try {