From c4a7e407b45d410ff3a0ca9dd526264d9ffd9e51 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Mon, 4 May 2026 14:49:42 -0700 Subject: [PATCH] Fix context summary continuation format --- agents/__tests__/context-pruner.test.ts | 259 +++++++++++++----- agents/context-pruner.ts | 220 ++++++++++----- .../e2e/base2-free-summary-format.e2e.test.ts | 57 ++-- 3 files changed, 387 insertions(+), 149 deletions(-) diff --git a/agents/__tests__/context-pruner.test.ts b/agents/__tests__/context-pruner.test.ts index b691f33a9f..4837740e79 100644 --- a/agents/__tests__/context-pruner.test.ts +++ b/agents/__tests__/context-pruner.test.ts @@ -292,9 +292,12 @@ describe('context-pruner handleSteps', () => { expect(content).toContain('') expect(content).toContain('') - // Should contain the user and assistant markers - expect(content).toContain('[USER]') - expect(content).toContain('[ASSISTANT]') + // Should use a memory artifact format, not transcript role markers + expect(content).toContain('') + expect(content).toContain('User request:') + expect(content).toContain('Progress note:') + expect(content).not.toContain('[USER]') + expect(content).not.toContain('[ASSISTANT]') }) test('includes tool call summaries in the output', () => { @@ -303,7 +306,9 @@ describe('context-pruner handleSteps', () => { createToolCallMessage('call-1', 'read_files', { paths: ['file1.ts', 'file2.ts'], }), - createToolResultMessage('call-1', 'read_files', { content: 'file data' } as JSONValue), + createToolResultMessage('call-1', 'read_files', { + content: 'file data', + } as JSONValue), createMessage('user', 'Now edit this file'), createToolCallMessage('call-2', 'str_replace', { path: 'file1.ts', @@ -316,8 +321,8 @@ describe('context-pruner handleSteps', () => { const content = results[0].input.messages[0].content[0].text // Should contain tool summaries - expect(content).toContain('Read files: file1.ts, file2.ts') - expect(content).toContain('Edited file: file1.ts') + expect(content).toContain('Previously inspected files: file1.ts, file2.ts') + expect(content).toContain('Previously edited file: file1.ts') }) test('summarizes various tool types correctly', () => { @@ -345,10 +350,10 @@ describe('context-pruner handleSteps', () => { const results = runHandleSteps(messages, 50000, 10000) const content = results[0].input.messages[0].content[0].text - expect(content).toContain('Wrote file: new-file.ts') - expect(content).toContain('Ran command: npm test') - expect(content).toContain('Code search: "function"') - expect(content).toContain('Spawned agents:') + expect(content).toContain('Previously wrote file: new-file.ts') + expect(content).toContain('Previously ran command: npm test') + expect(content).toContain('Previous code search for "function"') + expect(content).toContain('Previously delegated agents:') expect(content).toContain('- file-picker') expect(content).toContain('- commander') }) @@ -365,7 +370,7 @@ describe('context-pruner handleSteps', () => { const results = runHandleSteps(messages, 50000, 10000) const content = results[0].input.messages[0].content[0].text - expect(content).toContain('[TOOL ERROR: read_files] File not found') + expect(content).toContain('Tool error from read_files: File not found') }) test('notes when user messages have images', () => { @@ -382,7 +387,7 @@ describe('context-pruner handleSteps', () => { const results = runHandleSteps(messages, 50000, 10000) const content = results[0].input.messages[0].content[0].text - expect(content).toContain('[USER] [with image(s)]') + expect(content).toContain('User request [image(s) were attached]:') }) test('removes only INSTRUCTIONS_PROMPT and SUBAGENT_SPAWN when under context limit', () => { @@ -490,6 +495,90 @@ describe('context-pruner handleSteps', () => { expect(instructionsContent).toBe('Parent agent instructions') }) + test('preserves tagged live user prompt as a real message after summary', () => { + const liveUserPrompt: Message = { + role: 'user', + content: [{ type: 'text', text: 'LATEST LIVE REQUEST' }], + tags: ['USER_PROMPT'], + } + const instructionsPrompt: Message = { + role: 'user', + content: [{ type: 'text', text: 'Parent instructions' }], + tags: ['INSTRUCTIONS_PROMPT'], + } + const prunerParamsPrompt: Message = { + role: 'user', + content: [{ type: 'text', text: '{"maxContextLength":200000}' }], + tags: ['USER_PROMPT'], + } + const messages: Message[] = [ + createMessage('user', 'Older request'), + createMessage('assistant', 'Older answer'), + liveUserPrompt, + instructionsPrompt, + prunerParamsPrompt, + ] + + const results = runHandleSteps(messages, 250000, 200000) + const resultMessages = results[0].input.messages + + expect(resultMessages).toHaveLength(2) + const summaryContent = (resultMessages[0].content[0] as { text: string }) + .text + expect(summaryContent).toContain('Older request') + expect(summaryContent).not.toContain('LATEST LIVE REQUEST') + expect(resultMessages[1]).toEqual( + expect.objectContaining({ + role: 'user', + tags: ['USER_PROMPT'], + }), + ) + expect((resultMessages[1].content[0] as { text: string }).text).toBe( + 'LATEST LIVE REQUEST', + ) + }) + + test('keeps live user prompt in memory and adds continuation prompt when pruning mid-turn', () => { + const liveUserPrompt: Message = { + role: 'user', + content: [{ type: 'text', text: 'PLEASE FIX THE BUG' }], + tags: ['USER_PROMPT'], + } + const prunerParamsPrompt: Message = { + role: 'user', + content: [{ type: 'text', text: '{"maxContextLength":200000}' }], + tags: ['USER_PROMPT'], + } + const messages: Message[] = [ + liveUserPrompt, + createMessage('assistant', 'I found the likely issue.'), + createToolCallMessage('call-1', 'read_files', { + paths: ['src/bug.ts'], + }), + createToolResultMessage('call-1', 'read_files', { + content: 'buggy code', + }), + prunerParamsPrompt, + ] + + const results = runHandleSteps(messages, 250000, 200000) + const resultMessages = results[0].input.messages + + expect(resultMessages).toHaveLength(2) + const summaryContent = (resultMessages[0].content[0] as { text: string }) + .text + expect(summaryContent).toContain('PLEASE FIX THE BUG') + expect(summaryContent).toContain('I found the likely issue.') + expect(summaryContent).toContain('Previously inspected files: src/bug.ts') + + expect(resultMessages[1].role).toBe('user') + expect(resultMessages[1].tags).toBeUndefined() + const continuationText = (resultMessages[1].content[0] as { text: string }) + .text + expect(continuationText).toContain('Continue the existing assistant turn') + expect(continuationText).toContain('Do not restart completed work') + }) + test('handles empty message history', () => { const messages: Message[] = [] @@ -564,7 +653,7 @@ describe('context-pruner handleSteps', () => { const results = runHandleSteps(messages, 50000, 10000) const content = results[0].input.messages[0].content[0].text - expect(content).toContain('Spawned agent: file-picker') + expect(content).toContain('Previously delegated agent file-picker') }) test('handles long terminal commands by truncating', () => { @@ -583,7 +672,7 @@ describe('context-pruner handleSteps', () => { // Should truncate to 50 chars + ... expect(content).toContain( - 'Ran command: npm run build -- --config=production --verbose --o...', + 'Previously ran command: npm run build -- --config=production --verbose --o...', ) }) @@ -597,7 +686,7 @@ describe('context-pruner handleSteps', () => { const results = runHandleSteps(messages, 50000, 10000) const content = results[0].input.messages[0].content[0].text - expect(content).toContain('Used tool: unknown_tool_name') + expect(content).toContain('Previously used tool unknown_tool_name') }) test('handles multiple tool calls in single assistant message', () => { @@ -630,8 +719,8 @@ describe('context-pruner handleSteps', () => { const content = results[0].input.messages[0].content[0].text // Both tool calls should be in the summary - expect(content).toContain('Read files: a.ts') - expect(content).toContain('Read files: b.ts') + expect(content).toContain('Previously inspected files: a.ts') + expect(content).toContain('Previously inspected files: b.ts') }) test('handles mixed text and tool calls in assistant message', () => { @@ -659,7 +748,7 @@ describe('context-pruner handleSteps', () => { // Should have both text and tool summary expect(content).toContain('Let me read that file for you') - expect(content).toContain('Read files: test.ts') + expect(content).toContain('Previously inspected files: test.ts') }) }) @@ -803,7 +892,9 @@ describe('context-pruner code_search with flags', () => { const results = runHandleSteps(messages) const content = results[0].input.messages[0].content[0].text - expect(content).toContain('Code search: "myFunction" (-g *.ts -i)') + expect(content).toContain( + 'Previous code search for "myFunction" (-g *.ts -i)', + ) }) }) @@ -877,7 +968,7 @@ describe('context-pruner ask_user with questions and answers', () => { const results = runHandleSteps(messages) const content = results[0].input.messages[0].content[0].text - expect(content).toContain('[USER ANSWERED] Option B was selected') + expect(content).toContain('User answered: Option B was selected') }) test('includes multi-select answers', () => { @@ -896,7 +987,7 @@ describe('context-pruner ask_user with questions and answers', () => { const results = runHandleSteps(messages) const content = results[0].input.messages[0].content[0].text - expect(content).toContain('[USER ANSWERED] Caching, Logging, Monitoring') + expect(content).toContain('User answered: Caching, Logging, Monitoring') }) test('shows when user skipped question', () => { @@ -913,7 +1004,7 @@ describe('context-pruner ask_user with questions and answers', () => { const results = runHandleSteps(messages) const content = results[0].input.messages[0].content[0].text - expect(content).toContain('[USER SKIPPED QUESTION]') + expect(content).toContain('User skipped question') }) }) @@ -964,7 +1055,7 @@ describe('context-pruner terminal command exit codes', () => { const results = runHandleSteps(messages) const content = results[0].input.messages[0].content[0].text - expect(content).toContain('[COMMAND FAILED] Exit code: 1') + expect(content).toContain('Command failed with exit code: 1') }) test('does not show failure for successful command (exit code 0)', () => { @@ -982,7 +1073,7 @@ describe('context-pruner terminal command exit codes', () => { const results = runHandleSteps(messages) const content = results[0].input.messages[0].content[0].text - expect(content).not.toContain('[COMMAND FAILED]') + expect(content).not.toContain('Command failed with exit code') }) }) @@ -1257,9 +1348,7 @@ First assistant response }) test('keeps multi-part tool entries grouped across compaction cycles', () => { - const simulateCompaction = ( - inputMessages: Message[], - ): Message => { + const simulateCompaction = (inputMessages: Message[]): Message => { const result = runHandleSteps(inputMessages, 250000, 200000) return result[0].input.messages[0] } @@ -1285,8 +1374,10 @@ First assistant response .text // Both parts should be present in cycle 1 - expect(summary1Text).toContain('[TOOL ERROR: run_terminal_command] Test suite failed') - expect(summary1Text).toContain('[COMMAND FAILED] Exit code: 1') + expect(summary1Text).toContain( + 'Tool error from run_terminal_command: Test suite failed', + ) + expect(summary1Text).toContain('Command failed with exit code: 1') // Cycle 2: re-compact — the multi-part entry should stay as one entry const cycle2Messages: Message[] = [ @@ -1299,8 +1390,10 @@ First assistant response .text // Both parts should still be present together after re-compaction - expect(summary2Text).toContain('[TOOL ERROR: run_terminal_command] Test suite failed') - expect(summary2Text).toContain('[COMMAND FAILED] Exit code: 1') + expect(summary2Text).toContain( + 'Tool error from run_terminal_command: Test suite failed', + ) + expect(summary2Text).toContain('Command failed with exit code: 1') // They should be within the same --- delimited chunk (not split apart) const separator = '\n\n---\n\n' @@ -1308,9 +1401,9 @@ First assistant response .replace(/[\s\S]*?\n\n/, '') .replace(/<\/conversation_summary>[\s\S]*/, '') .split(separator) - const errorChunk = chunks.find((c) => c.includes('[TOOL ERROR:')) + const errorChunk = chunks.find((c) => c.includes('Tool error from')) expect(errorChunk).toBeDefined() - expect(errorChunk).toContain('[COMMAND FAILED] Exit code: 1') + expect(errorChunk).toContain('Command failed with exit code: 1') }) test('handles 3+ compaction cycles without nested PREVIOUS SUMMARY markers', () => { @@ -1562,14 +1655,15 @@ describe('context-pruner str_replace and write_file tool results', () => { createToolResultMessage('call-1', 'str_replace', { file: 'src/utils.ts', message: 'Updated file', - unifiedDiff: '--- a/src/utils.ts\n+++ b/src/utils.ts\n@@ -1,1 +1,1 @@\n-foo\n+bar', + unifiedDiff: + '--- a/src/utils.ts\n+++ b/src/utils.ts\n@@ -1,1 +1,1 @@\n-foo\n+bar', }), ] const results = runHandleSteps(messages) const content = results[0].input.messages[0].content[0].text - expect(content).toContain('[EDIT RESULT: str_replace]') + expect(content).toContain('Edit result from str_replace:') expect(content).toContain('unifiedDiff') expect(content).toContain('-foo') expect(content).toContain('+bar') @@ -1585,14 +1679,15 @@ describe('context-pruner str_replace and write_file tool results', () => { createToolResultMessage('call-1', 'write_file', { file: 'src/new-file.ts', message: 'Created file', - unifiedDiff: '--- /dev/null\n+++ b/src/new-file.ts\n@@ -0,0 +1 @@\n+export const hello = "world"', + unifiedDiff: + '--- /dev/null\n+++ b/src/new-file.ts\n@@ -0,0 +1 @@\n+export const hello = "world"', }), ] const results = runHandleSteps(messages) const content = results[0].input.messages[0].content[0].text - expect(content).toContain('[EDIT RESULT: write_file]') + expect(content).toContain('Edit result from write_file:') expect(content).toContain('export const hello') }) @@ -1614,7 +1709,7 @@ describe('context-pruner str_replace and write_file tool results', () => { const results = runHandleSteps(messages) const content = results[0].input.messages[0].content[0].text - expect(content).toContain('[EDIT RESULT: str_replace]') + expect(content).toContain('Edit result from str_replace:') expect(content).toContain('...') // Should not contain the full diff expect(content).not.toContain(longDiff) @@ -1680,8 +1775,8 @@ describe('context-pruner str_replace and write_file tool results', () => { const content = results[0].input.messages[0].content[0].text // Should have both the tool call summary and the full result - expect(content).toContain('Edited file: src/file.ts') - expect(content).toContain('[EDIT RESULT: str_replace]') + expect(content).toContain('Previously edited file: src/file.ts') + expect(content).toContain('Edit result from str_replace:') expect(content).toContain('errorMessage') expect(content).toContain('No match found for old string') }) @@ -1731,7 +1826,7 @@ describe('context-pruner glob and list_directory tools', () => { const results = runHandleSteps(messages) const content = results[0].input.messages[0].content[0].text - expect(content).toContain('Glob: **/*.ts') + expect(content).toContain('Previous glob search for **/*.ts') }) test('summarizes list_directory tool with path', () => { @@ -1746,7 +1841,7 @@ describe('context-pruner glob and list_directory tools', () => { const results = runHandleSteps(messages) const content = results[0].input.messages[0].content[0].text - expect(content).toContain('Listed dir: src') + expect(content).toContain('Previously listed directory: src') }) test('summarizes read_subtree tool with paths', () => { @@ -1761,7 +1856,9 @@ describe('context-pruner glob and list_directory tools', () => { const results = runHandleSteps(messages) const content = results[0].input.messages[0].content[0].text - expect(content).toContain('Read subtree: src/components, src/utils') + expect(content).toContain( + 'Previously inspected subtrees: src/components, src/utils', + ) }) }) @@ -1920,17 +2017,24 @@ describe('context-pruner dual-budget behavior', () => { }) test('counts tool result summaries against assistant+tool budget', () => { - // Use str_replace with a large result — this produces a summarized [EDIT RESULT] entry + // Use str_replace with a large result — this produces a summarized edit-result entry const largeDiff = 'LARGE_DIFF_CONTENT_' + 'X'.repeat(900) const messages = [ createMessage('user', 'Do something'), - createToolCallMessage('call-1', 'str_replace', { path: 'big.ts', replacements: [] }), - createToolResultMessage('call-1', 'str_replace', { file: 'big.ts', message: 'Updated', unifiedDiff: largeDiff }), + createToolCallMessage('call-1', 'str_replace', { + path: 'big.ts', + replacements: [], + }), + createToolResultMessage('call-1', 'str_replace', { + file: 'big.ts', + message: 'Updated', + unifiedDiff: largeDiff, + }), createMessage('user', 'Recent question'), createMessage('assistant', 'Recent answer'), ] - // Assistant budget too small for the large [EDIT RESULT] summary entry + // Assistant budget too small for the large edit-result summary entry const results = runHandleSteps(messages, 250000, 200000, { assistantToolBudget: 100, userBudget: 5000, @@ -2133,11 +2237,23 @@ describe('context-pruner dual-budget behavior', () => { // Long user message (~45k chars, exceeds USER_MESSAGE_LIMIT of 13k tokens = 39k chars) // Middle marker placed ~85% through so it falls in the truncated gap // (past the 80% prefix but before the 20% suffix) - const longUserMessage = 'LONG_USER_START_' + 'Here is a detailed specification for the new feature. '.repeat(650) + '_LONG_USER_MIDDLE_MARKER_' + 'Here is a detailed specification for the new feature. '.repeat(150) + const longUserMessage = + 'LONG_USER_START_' + + 'Here is a detailed specification for the new feature. '.repeat(650) + + '_LONG_USER_MIDDLE_MARKER_' + + 'Here is a detailed specification for the new feature. '.repeat(150) // Long assistant message with text (~8k chars, exceeds ASSISTANT_MESSAGE_LIMIT of 1.3k tokens = 3.9k chars) // plus multiple tool calls. Middle marker placed ~60% through so it falls in the truncated gap. - const longAssistantText = 'LONG_ASSISTANT_START_' + 'I will implement this step by step, starting with the data model changes. '.repeat(60) + '_LONG_ASST_MIDDLE_MARKER_' + 'I will implement this step by step, starting with the data model changes. '.repeat(40) + const longAssistantText = + 'LONG_ASSISTANT_START_' + + 'I will implement this step by step, starting with the data model changes. '.repeat( + 60, + ) + + '_LONG_ASST_MIDDLE_MARKER_' + + 'I will implement this step by step, starting with the data model changes. '.repeat( + 40, + ) const assistantWithToolCalls: Message = { role: 'assistant', content: [ @@ -2172,7 +2288,8 @@ describe('context-pruner dual-budget behavior', () => { } // str_replace result with a large diff (~3k chars, exceeds 2k truncation limit) - const largeDiff = 'DIFF_START_MARKER_' + '+added line\n'.repeat(250) + '_DIFF_END_MARKER' + const largeDiff = + 'DIFF_START_MARKER_' + '+added line\n'.repeat(250) + '_DIFF_END_MARKER' // spawn_agents result with 5 non-blacklisted agents producing large outputs // Each ~4k chars, total ~20k, exceeds TOOL_ENTRY_LIMIT of 5k tokens = 15k chars @@ -2180,7 +2297,10 @@ describe('context-pruner dual-budget behavior', () => { agentType: 'editor', value: { type: 'string', - value: `AGENT_${i}_OUTPUT_START_` + 'Implementation details. '.repeat(160) + `_AGENT_${i}_OUTPUT_END`, + value: + `AGENT_${i}_OUTPUT_START_` + + 'Implementation details. '.repeat(160) + + `_AGENT_${i}_OUTPUT_END`, }, })) @@ -2188,8 +2308,14 @@ describe('context-pruner dual-budget behavior', () => { previousSummary, createMessage('user', longUserMessage), assistantWithToolCalls, - createToolResultMessage('call-1', 'read_files', { content: 'file data' } as JSONValue), - createToolResultMessage('call-2', 'str_replace', { file: 'src/model.ts', message: 'Updated', unifiedDiff: largeDiff }), + createToolResultMessage('call-1', 'read_files', { + content: 'file data', + } as JSONValue), + createToolResultMessage('call-2', 'str_replace', { + file: 'src/model.ts', + message: 'Updated', + unifiedDiff: largeDiff, + }), { role: 'tool', toolCallId: 'call-3', @@ -2210,7 +2336,8 @@ describe('context-pruner dual-budget behavior', () => { // === Structure checks === expect(content).toContain('') expect(content).toContain('') - const summaryTagCount = (content.match(//g) || []).length + const summaryTagCount = (content.match(//g) || []) + .length expect(summaryTagCount).toBe(1) // === Previous summary entries preserved === @@ -2229,12 +2356,14 @@ describe('context-pruner dual-budget behavior', () => { expect(content).not.toContain('_LONG_ASST_MIDDLE_MARKER_') // Middle marker falls in truncated gap // === Tool call summaries present === - expect(content).toContain('Read files: src/model.ts, src/service.ts') - expect(content).toContain('Edited file: src/model.ts') - expect(content).toContain('Spawned agents:') + expect(content).toContain( + 'Previously inspected files: src/model.ts, src/service.ts', + ) + expect(content).toContain('Previously edited file: src/model.ts') + expect(content).toContain('Previously delegated agents:') // === str_replace result: present but truncated at 2k chars === - expect(content).toContain('[EDIT RESULT: str_replace]') + expect(content).toContain('Edit result from str_replace:') expect(content).toContain('DIFF_START_MARKER_') expect(content).not.toContain('_DIFF_END_MARKER') // Truncated by 2k result limit @@ -2258,13 +2387,16 @@ describe('context-pruner dual-budget behavior', () => { content: [ { type: 'text', - text: `\nThis is a summary of the conversation so far. The original messages have been condensed to save context space.\n\n[USER]\nOLD_DROPPED_USER: ${'X'.repeat(600)}\n\n---\n\n[ASSISTANT]\nOLD_DROPPED_ASSISTANT: ${'Y'.repeat(600)}\n\n---\n\n[USER]\nOLD_DROPPED_USER_2: Asked about deployment\n\n---\n\n[ASSISTANT]\nOLD_DROPPED_ASSISTANT_2: Explained deployment process\n`, + text: `\nThis is a summary of the conversation so far. The original messages have been condensed to save context space.\n\n[USER]\nOLD_DROPPED_USER: ${'X'.repeat(600)}\n\n---\n\n[ASSISTANT]\nOLD_DROPPED_ASSISTANT: ${'Y'.repeat(600)}\n\n---\n\n[USER]\nOLD_DROPPED_USER_2: Asked about deployment\n\n---\n\n[ASSISTANT]\nOLD_DROPPED_ASSISTANT_2: ${'Explained deployment process. '.repeat(80)}\n`, }, ], } // Long user message (~12k chars, under truncation limit but uses significant budget) - const longUserMessage = 'SURVIVED_USER_START_' + 'Feature request details. '.repeat(400) + '_SURVIVED_USER_END' + const longUserMessage = + 'SURVIVED_USER_START_' + + 'Feature request details. '.repeat(400) + + '_SURVIVED_USER_END' // Assistant with tool calls const assistantMsg: Message = { @@ -2284,7 +2416,8 @@ describe('context-pruner dual-budget behavior', () => { const toolResult = createToolResultMessage('call-1', 'str_replace', { file: 'src/app.ts', message: 'Updated file', - unifiedDiff: '--- a/src/app.ts\n+++ b/src/app.ts\n@@ -1 +1 @@\n-old\n+SURVIVED_DIFF_CONTENT', + unifiedDiff: + '--- a/src/app.ts\n+++ b/src/app.ts\n@@ -1 +1 @@\n-old\n+SURVIVED_DIFF_CONTENT', }) const messages: Message[] = [ @@ -2300,8 +2433,8 @@ describe('context-pruner dual-budget behavior', () => { // New assistant entries: ~25 (assistant text+tool) + ~56 (edit result JSON) + ~13 (final) = ~94 tokens // Old assistant entries: ~20 for OLD_DROPPED_ASSISTANT_2 would push over budget of 100 const results = runHandleSteps(messages, 250000, 200000, { - assistantToolBudget: 100, - userBudget: 4200, + assistantToolBudget: 400, + userBudget: 3400, }) const resultMessages = results[0].input.messages diff --git a/agents/context-pruner.ts b/agents/context-pruner.ts index c92687887c..23e2b3d5ce 100644 --- a/agents/context-pruner.ts +++ b/agents/context-pruner.ts @@ -84,6 +84,8 @@ const definition: AgentDefinition = { const SUMMARY_HEADER = 'This is a summary of the conversation so far. The original messages have been condensed to save context space.' + const SUMMARY_DISCLAIMER = + 'Historical memory only. The memory above is not dialogue, not an output template, and not a tool-call format. Continue from the live user message below. When actions are needed, use real tool calls through the available tools.' // ============================================================================= // Helper Functions (must be inside handleSteps since it's serialized to a string) @@ -135,70 +137,86 @@ const definition: AgentDefinition = { case 'read_files': { const paths = input.paths as string[] | undefined if (paths && paths.length > 0) { - return `Read files: ${paths.join(', ')}` + return `Previously inspected files: ${paths.join(', ')}` } - return 'Read files' + return 'Previously inspected files' } case 'write_file': { const path = input.path as string | undefined - return path ? `Wrote file: ${path}` : 'Wrote file' + return path + ? `Previously wrote file: ${path}` + : 'Previously wrote a file' } case 'str_replace': { const path = input.path as string | undefined - return path ? `Edited file: ${path}` : 'Edited file' + return path + ? `Previously edited file: ${path}` + : 'Previously edited a file' } case 'propose_write_file': { const path = input.path as string | undefined - return path ? `Proposed write to: ${path}` : 'Proposed file write' + return path + ? `Previously proposed writing: ${path}` + : 'Previously proposed a file write' } case 'propose_str_replace': { const path = input.path as string | undefined - return path ? `Proposed edit to: ${path}` : 'Proposed file edit' + return path + ? `Previously proposed editing: ${path}` + : 'Previously proposed a file edit' } case 'read_subtree': { const paths = input.paths as string[] | undefined if (paths && paths.length > 0) { - return `Read subtree: ${paths.join(', ')}` + return `Previously inspected subtrees: ${paths.join(', ')}` } - return 'Read subtree' + return 'Previously inspected a subtree' } case 'code_search': { const pattern = input.pattern as string | undefined const flags = input.flags as string | undefined if (pattern && flags) { - return `Code search: "${pattern}" (${flags})` + return `Previous code search for "${pattern}" (${flags})` } - return pattern ? `Code search: "${pattern}"` : 'Code search' + return pattern + ? `Previous code search for "${pattern}"` + : 'Previous code search' } case 'glob': { const pattern = input.pattern as string | undefined - return pattern ? `Glob: ${pattern}` : 'Glob search' + return pattern + ? `Previous glob search for ${pattern}` + : 'Previous glob search' } case 'list_directory': { const path = input.path as string | undefined - return path ? `Listed dir: ${path}` : 'Listed directory' + return path + ? `Previously listed directory: ${path}` + : 'Previously listed a directory' } case 'find_files': { const prompt = input.prompt as string | undefined - return prompt ? `Find files: "${prompt}"` : 'Find files' + return prompt + ? `Previous file-finding request: "${prompt}"` + : 'Previous file-finding request' } case 'run_terminal_command': { const command = input.command as string | undefined if (command) { const shortCmd = command.length > 50 ? command.slice(0, 50) + '...' : command - return `Ran command: ${shortCmd}` + return `Previously ran command: ${shortCmd}` } - return 'Ran terminal command' + return 'Previously ran a terminal command' } case 'spawn_agents': case 'spawn_agent_inline': { const agents = input.agents as | Array<{ - agent_type: string - prompt?: string - params?: Record - }> + agent_type: string + prompt?: string + params?: Record + }> | undefined const agentType = input.agent_type as string | undefined const prompt = input.prompt as string | undefined @@ -230,7 +248,7 @@ const definition: AgentDefinition = { } return detail }) - return `Spawned agents:\n${agentDetails.map((d) => `- ${d}`).join('\n')}` + return `Previously delegated agents:\n${agentDetails.map((d) => `- ${d}`).join('\n')}` } if (agentType) { const extras: string[] = [] @@ -248,11 +266,11 @@ const definition: AgentDefinition = { extras.push(`params: ${truncatedParams}`) } if (extras.length > 0) { - return `Spawned agent: ${agentType} (${extras.join(', ')})` + return `Previously delegated agent ${agentType} (${extras.join(', ')})` } - return `Spawned agent: ${agentType}` + return `Previously delegated agent ${agentType}` } - return 'Spawned agent(s)' + return 'Previously delegated agent work' } case 'write_todos': { const todos = input.todos as @@ -289,30 +307,36 @@ const definition: AgentDefinition = { return 'Suggested followups' case 'web_search': { const query = input.query as string | undefined - return query ? `Web search: "${query}"` : 'Web search' + return query + ? `Previous web search for "${query}"` + : 'Previous web search' } case 'gravity_index': { const query = input.query as string | undefined const action = input.action as string | undefined if (query) { - return `Gravity Index ${action ?? 'search'}: "${query}"` + return `Previous Gravity Index ${action ?? 'search'} for "${query}"` } - return action ? `Gravity Index ${action}` : 'Gravity Index' + return action + ? `Previous Gravity Index ${action}` + : 'Previous Gravity Index use' } case 'read_docs': { const libraryTitle = input.libraryTitle as string | undefined const topic = input.topic as string | undefined if (libraryTitle && topic) { - return `Read docs: ${libraryTitle} - ${topic}` + return `Previously consulted docs: ${libraryTitle} - ${topic}` } - return libraryTitle ? `Read docs: ${libraryTitle}` : 'Read docs' + return libraryTitle + ? `Previously consulted docs: ${libraryTitle}` + : 'Previously consulted docs' } case 'set_output': - return 'Set output' + return 'Previously set structured output' case 'set_messages': - return 'Set messages' + return 'Previously updated message history' default: - return `Used tool: ${toolName}` + return `Previously used tool ${toolName}` } } @@ -377,7 +401,11 @@ const definition: AgentDefinition = { // - Prune when context exceeds max, OR // - Prune when prompt cache will miss (>5 min gap) to take advantage of fresh context // If not, return messages with just the subagent-specific tags removed - if (agentState.contextTokenCount + TOKEN_COUNT_FUDGE_FACTOR <= maxContextLength && !cacheWillMiss) { + if ( + agentState.contextTokenCount + TOKEN_COUNT_FUDGE_FACTOR <= + maxContextLength && + !cacheWillMiss + ) { yield { toolName: 'set_messages', input: { messages: currentMessages }, @@ -404,7 +432,8 @@ const definition: AgentDefinition = { // 2. Walk backwards through summarized parts to apply token budgets // 3. Older summarized parts beyond the budgets are dropped - const assistantToolBudget: number = params?.assistantToolBudget ?? ASSISTANT_TOOL_BUDGET + const assistantToolBudget: number = + params?.assistantToolBudget ?? ASSISTANT_TOOL_BUDGET const userBudget: number = params?.userBudget ?? USER_BUDGET function shouldExcludeMessage(message: Message): boolean { @@ -429,6 +458,12 @@ const definition: AgentDefinition = { if (content.startsWith(SUMMARY_HEADER)) { content = content.slice(SUMMARY_HEADER.length).trim() } + const memoryMatch = content.match( + /([\s\S]*?)<\/historical_memory>/, + ) + if (memoryMatch) { + content = memoryMatch[1].trim() + } return content } @@ -449,7 +484,10 @@ const definition: AgentDefinition = { const trimmed = chunk.trim() const isUser = trimmed.startsWith('[USER]\n') || - trimmed.startsWith('[USER] [with image') + trimmed.startsWith('[USER] [with image') || + trimmed.startsWith('User request') || + trimmed.startsWith('User message') || + trimmed.startsWith('Current unresolved user request') return { role: isUser ? ('user' as const) : ('assistant_tool' as const), parts: [trimmed], @@ -465,10 +503,37 @@ const definition: AgentDefinition = { } } - // Filter out excluded and conversation summary messages for summarization - const messagesToSummarize = currentMessages.filter( - (message) => !shouldExcludeMessage(message) && !isConversationSummary(message), + // If pruning happens before the assistant has started responding to the + // current user prompt, preserve that prompt as a real message after the + // memory artifact. If pruning happens mid-turn, keep the prompt in the + // historical memory with the assistant/tool progress that followed it and + // append a synthetic continuation prompt instead. + const latestLiveUserPromptIndex = currentMessages.findLastIndex((message) => + message.tags?.includes('USER_PROMPT'), ) + const latestLiveUserPromptMessage = + latestLiveUserPromptIndex !== -1 + ? currentMessages[latestLiveUserPromptIndex] + : null + const isMidTurnPrune = + latestLiveUserPromptIndex !== -1 && + currentMessages + .slice(latestLiveUserPromptIndex + 1) + .some( + (message) => + !shouldExcludeMessage(message) && !isConversationSummary(message), + ) + + // Filter out excluded, conversation summary, and live-prompt messages for summarization + const messagesToSummarize = currentMessages + .filter( + (_message, index) => + isMidTurnPrune || index !== latestLiveUserPromptIndex, + ) + .filter( + (message) => + !shouldExcludeMessage(message) && !isConversationSummary(message), + ) // Find the last user message with images to preserve in the final output let lastUserImageParts: Array> = [] @@ -487,7 +552,10 @@ const definition: AgentDefinition = { } // Phase 1: Summarize ALL messages into tagged entries - const summarizedEntries: Array<{ role: 'user' | 'assistant_tool'; parts: string[] }> = [] + const summarizedEntries: Array<{ + role: 'user' | 'assistant_tool' + parts: string[] + }> = [] for (const message of messagesToSummarize) { if (message.role === 'user') { @@ -501,10 +569,10 @@ const definition: AgentDefinition = { part.type === 'image' || part.type === 'media', ) } - const imageNote = hasImages ? ' [with image(s)]' : '' + const imageNote = hasImages ? ' [image(s) were attached]' : '' summarizedEntries.push({ role: 'user', - parts: [`[USER]${imageNote}\n${text}`], + parts: [`User request${imageNote}:\n${text}`], }) } } else if (message.role === 'assistant') { @@ -531,17 +599,20 @@ const definition: AgentDefinition = { const parts: string[] = [] if (textParts.length > 0) { let combinedText = textParts.join('\n') - combinedText = truncateLongText(combinedText, ASSISTANT_MESSAGE_LIMIT * CHARS_PER_TOKEN) - parts.push(combinedText) + combinedText = truncateLongText( + combinedText, + ASSISTANT_MESSAGE_LIMIT * CHARS_PER_TOKEN, + ) + parts.push(`Progress note:\n${combinedText}`) } if (toolSummaries.length > 0) { - parts.push(toolSummaries.join('; ')) + parts.push(`Prior action record:\n${toolSummaries.join('\n')}`) } if (parts.length > 0) { summarizedEntries.push({ role: 'assistant_tool', - parts: [`[ASSISTANT]\n${parts.join('\n')}`], + parts, }) } } else if (message.role === 'tool') { @@ -559,7 +630,7 @@ const definition: AgentDefinition = { errorText = errorText.slice(0, 100) + '...' } entryParts.push( - `[TOOL ERROR: ${toolMessage.toolName}] ${errorText}`, + `Tool error from ${toolMessage.toolName}: ${errorText}`, ) } @@ -569,20 +640,20 @@ const definition: AgentDefinition = { ) { const exitCode = value.exitCode as number if (exitCode !== 0) { - entryParts.push(`[COMMAND FAILED] Exit code: ${exitCode}`) + entryParts.push(`Command failed with exit code: ${exitCode}`) } } if (toolMessage.toolName === 'ask_user') { if (value.skipped) { - entryParts.push('[USER SKIPPED QUESTION]') + entryParts.push('User skipped question') } else if ('answers' in value) { const answers = value.answers as | Array<{ - selectedOption?: string - selectedOptions?: string[] - otherText?: string - }> + selectedOption?: string + selectedOptions?: string[] + otherText?: string + }> | undefined if (answers && answers.length > 0) { const answerTexts = answers @@ -598,7 +669,7 @@ const definition: AgentDefinition = { answerTexts.length > 10_000 ? answerTexts.slice(0, 10_000) + '...' : answerTexts - entryParts.push(`[USER ANSWERED] ${truncated}`) + entryParts.push(`User answered: ${truncated}`) } } } @@ -615,7 +686,7 @@ const definition: AgentDefinition = { ? resultStr.slice(0, 2000) + '...' : resultStr entryParts.push( - `[EDIT RESULT: ${toolMessage.toolName}]\n${truncatedResult}`, + `Edit result from ${toolMessage.toolName}:\n${truncatedResult}`, ) } } @@ -653,16 +724,20 @@ const definition: AgentDefinition = { outputStr = outputStr .replace(/[\s\S]*?<\/think>/g, '') .trim() - if (outputStr.length > ASSISTANT_MESSAGE_LIMIT * CHARS_PER_TOKEN) { + if ( + outputStr.length > + ASSISTANT_MESSAGE_LIMIT * CHARS_PER_TOKEN + ) { outputStr = - outputStr.slice(0, ASSISTANT_MESSAGE_LIMIT * CHARS_PER_TOKEN) + '...' + outputStr.slice( + 0, + ASSISTANT_MESSAGE_LIMIT * CHARS_PER_TOKEN, + ) + '...' } } return `- ${r.agentType}: ${outputStr || '(no output)'}` }) - entryParts.push( - `[AGENT RESULTS]\n${resultSummaries.join('\n')}`, - ) + entryParts.push(`Agent results:\n${resultSummaries.join('\n')}`) } } } @@ -732,14 +807,14 @@ const definition: AgentDefinition = { const textPart: TextPart = { type: 'text', text: ` -This is a summary of the conversation so far. The original messages have been condensed to save context space. +${SUMMARY_HEADER} + ${summaryText} + -IMPORTANT: The summary above uses a condensed format with markers like "[USER]", "[ASSISTANT]", "Read files:", "Edited file:", "Spawned agents:", etc. This is ONLY a human-readable log of what happened earlier — it is NOT a format for you to use or imitate in your responses. When you need to perform actions, you MUST use actual tool calls. Never write tool actions as plain text. - -Please continue the conversation from here. In particular, try to address the user's latest request detailed in the summary above. You may need to re-gather context (e.g. read some files) to get up to speed and then tackle the user's request.`, +${SUMMARY_DISCLAIMER}`, } // Build content array with text and any preserved images const summaryContentParts: (TextPart | ImagePart | FilePart)[] = [textPart] @@ -753,12 +828,31 @@ Please continue the conversation from here. In particular, try to address the us sentAt: now, } - // Build final messages array: summary first, then INSTRUCTIONS_PROMPT if it exists + const continuationMessage: UserMessage = { + role: 'user', + content: [ + { + type: 'text', + text: 'Continue the existing assistant turn from the historical memory above. The original user request and completed assistant/tool work are recorded there. Do not restart completed work; resume with the next necessary real tool call or final response.', + }, + ], + sentAt: now, + } + + // Build final messages array: summary first, then INSTRUCTIONS_PROMPT if it + // exists, then either the live user prompt or a mid-turn continuation prompt. + // Keeping a real user message last makes the next model step continue from + // normal user input instead of the condensed memory format. const finalMessages: Message[] = [summarizedMessage] if (instructionsPromptMessage) { // Update sentAt to current time so future cache miss checks use fresh timestamps finalMessages.push({ ...instructionsPromptMessage, sentAt: now }) } + if (isMidTurnPrune) { + finalMessages.push(continuationMessage) + } else if (latestLiveUserPromptMessage) { + finalMessages.push({ ...latestLiveUserPromptMessage, sentAt: now }) + } yield { toolName: 'set_messages', diff --git a/agents/e2e/base2-free-summary-format.e2e.test.ts b/agents/e2e/base2-free-summary-format.e2e.test.ts index 2ae3a2a928..8374b236cd 100644 --- a/agents/e2e/base2-free-summary-format.e2e.test.ts +++ b/agents/e2e/base2-free-summary-format.e2e.test.ts @@ -38,6 +38,13 @@ const SUMMARY_IMITATION_PATTERNS = [ /^Used tool:\s/m, /^\[ASSISTANT\]\n/m, /^\[USER\]\n/m, + /^User request(?:\s|\[|:)/m, + /^Progress note:\s/m, + /^Prior action record:\s/m, + /^Previously inspected files:\s/m, + /^Previously edited file:\s/m, + /^Previously delegated agents:\s*\n/m, + /^Edit result from \w+:/m, ] /** @@ -59,8 +66,8 @@ function detectSummaryImitation(text: string): string[] { /** * Creates a pre-summarized conversation that mimics what the context pruner produces. - * NOTE: The IMPORTANT disclaimer text here must be kept in sync with the one in - * agents/context-pruner.ts. If you change the disclaimer there, update it here too. + * NOTE: The disclaimer text here must be kept in sync with the one in + * agents/context-pruner.ts. If you change the memory artifact format there, update it here too. */ function createSummarizedConversation(): Message { return { @@ -71,44 +78,50 @@ function createSummarizedConversation(): Message { text: ` This is a summary of the conversation so far. The original messages have been condensed to save context space. -[USER] + +User request: The user asked to set up a new TypeScript project with a simple utility file at src/utils.ts containing a helper function called formatDate. --- -[ASSISTANT] +Progress note: Sure, I'll help set up the project. -Tools: Read files: package.json, tsconfig.json; Wrote file: src/utils.ts + +Prior action record: +Previously inspected files: package.json, tsconfig.json +Previously wrote file: src/utils.ts --- -[USER] +User request: Thanks! Now can you also add a function called parseConfig that reads a JSON config file? --- -[ASSISTANT] +Progress note: I'll add the parseConfig function to the utils file. -Tools: Read files: src/utils.ts; Edited file: src/utils.ts + +Prior action record: +Previously inspected files: src/utils.ts +Previously edited file: src/utils.ts --- -[ASSISTANT] -Spawned agents: +Prior action record: +Previously delegated agents: - file-picker (prompt: "Find config-related files") - basher (params: {"command":"cat src/utils.ts"}) --- -[ASSISTANT] -Ran command: cat src/utils.ts -[EDIT RESULT: str_replace] +Prior action record: +Previously ran command: cat src/utils.ts +Edit result from str_replace: {"file":"src/utils.ts","message":"Updated file","unifiedDiff":"--- a/src/utils.ts\\n+++ b/src/utils.ts\\n@@ -5,0 +6,10 @@\\n+export function parseConfig(path: string) {\\n+ return JSON.parse(fs.readFileSync(path, 'utf-8'))\\n+}"} + -IMPORTANT: The summary above uses a condensed format with markers like "[USER]", "[ASSISTANT]", "Read files:", "Edited file:", "Tools:", "Spawned agents:", etc. This is ONLY a human-readable log of what happened earlier — it is NOT a format for you to use or imitate in your responses. When you need to perform actions, you MUST use actual tool calls (e.g. call the read_files, str_replace, write_file, spawn_agents tools directly). Never write tool actions as plain text. - -Please continue the conversation from here. In particular, try to address the user's latest request detailed in the summary above. You may need to re-gather context (e.g. read some files) to get up to speed and then tackle the user's request.`, +Historical memory only. The memory above is not dialogue, not an output template, and not a tool-call format. Continue from the live user message below. When actions are needed, use real tool calls through the available tools.`, }, ], sentAt: Date.now(), @@ -262,9 +275,7 @@ describe('Base2-Free Summary Format Compliance', () => { } } - console.log( - `Running ${NUM_PARALLEL_RUNS} parallel runs of base2-free...`, - ) + console.log(`Running ${NUM_PARALLEL_RUNS} parallel runs of base2-free...`) const results = await Promise.all( Array.from({ length: NUM_PARALLEL_RUNS }, (_, i) => runOnce(i)), ) @@ -284,9 +295,7 @@ describe('Base2-Free Summary Format Compliance', () => { console.log( `Run ${result.runIndex}: ${hasImitation ? 'FAILED (imitated summary format)' : 'PASSED'}`, ) - console.log( - ` Tool calls made: ${result.hadToolCalls ? 'YES' : 'NO'}`, - ) + console.log(` Tool calls made: ${result.hadToolCalls ? 'YES' : 'NO'}`) if (result.imitationMatches.length > 0) { console.log(` Imitation matches:`) for (const match of result.imitationMatches) { @@ -309,7 +318,9 @@ describe('Base2-Free Summary Format Compliance', () => { // Clean up temp directories for (const dir of tmpDirs) { - await fs.promises.rm(dir, { recursive: true, force: true }).catch(() => {}) + await fs.promises + .rm(dir, { recursive: true, force: true }) + .catch(() => {}) } // Guard against vacuous pass (all runs errored)