From cc12d9573de50c71bd5c5e7bd8f7e0aee63f616b Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 9 Apr 2026 00:21:01 +0800 Subject: [PATCH 01/44] debug code about compile --- openkb/__main__.py | 4 + openkb/agent/compiler.py | 635 ++++++++++++++++++++++++++------------- openkb/agent/tools.py | 39 +++ openkb/cli.py | 47 ++- pyproject.toml | 2 +- tests/test_compiler.py | 354 ++++++++++++++-------- 6 files changed, 742 insertions(+), 339 deletions(-) create mode 100644 openkb/__main__.py diff --git a/openkb/__main__.py b/openkb/__main__.py new file mode 100644 index 0000000..28f9e41 --- /dev/null +++ b/openkb/__main__.py @@ -0,0 +1,4 @@ +"""Allow running OpenKB as ``python -m openkb``.""" +from openkb.cli import cli + +cli() diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index d4e34e3..8307abb 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -1,202 +1,281 @@ -"""Wiki compilation agent for OpenKB. - -Provides an agent that reads converted documents, generates summaries, -updates concept pages, and maintains the wiki index. +"""Wiki compilation pipeline for OpenKB. + +Pipeline leveraging LLM prompt caching: + Step 1: Build base context A (schema + document content). + Step 2: A → generate summary. + Step 3: A + summary → extract concept list. + Step 4: Concurrent LLM calls (A cached) → generate each concept page. + Step 5: Code writes all files, updates index, appends log. 
""" from __future__ import annotations +import asyncio +import json +import logging +import sys +import time from pathlib import Path -from agents import Agent, Runner, function_tool -import os +import litellm + +from openkb.schema import get_agents_md -from pageindex import PageIndexClient +logger = logging.getLogger(__name__) -from openkb.agent.tools import list_wiki_files, read_wiki_file, write_wiki_file -from openkb.schema import SCHEMA_MD, get_agents_md +# --------------------------------------------------------------------------- +# Prompt templates +# --------------------------------------------------------------------------- -_COMPILER_INSTRUCTIONS_TEMPLATE = """\ +_SYSTEM_TEMPLATE = """\ You are a wiki compilation agent for a personal knowledge base. {schema_md} -## Your job -When given a new document, you must: -1. Write a summary page to summaries/.md with: - - A YAML frontmatter block: `sources: [filename]` - - Key concepts, findings, and ideas from the document - - [[wikilinks]] to related concepts -2. Update or create concept pages in concepts/ for any significant cross-document themes. -3. Update index.md: - - Under ## Documents: add a one-liner entry for the new document - - Under ## Concepts: add/update entries for any concepts you touched - -Always use the provided tools to read existing wiki pages before writing, -so you can append or update without losing prior content. -Use [[wikilinks]] consistently to connect related pages. +Write all content in {language} language. +Use [[wikilinks]] to connect related pages (e.g. [[concepts/attention]]). """ -_LONG_DOC_INSTRUCTIONS_TEMPLATE = """\ -You are a wiki compilation agent for a personal knowledge base. +_SUMMARY_USER = """\ +New document: {doc_name} -{schema_md} +Full text: +{content} -## Your job for long documents (already summarised by PageIndex) -The summary and source pages are already written. Your tasks are: -1. Update or create concept pages in concepts/ for significant themes. -2. 
Update index.md: - - Under ## Documents: add a one-liner entry referencing the document - - Under ## Concepts: add/update entries for any concepts you touched -3. Do NOT regenerate or overwrite the existing summary page. - -Use get_page_content to fetch specific page ranges from long documents when -you need more detail before writing concept pages. -Always read existing wiki pages before writing to preserve prior content. -Use [[wikilinks]] consistently to connect related pages. +Write a summary page for this document in Markdown. Include: +- Key concepts, findings, and ideas +- [[wikilinks]] to concepts that could become cross-document concept pages + +Return ONLY the Markdown content (no frontmatter, no code fences). """ +_CONCEPTS_LIST_USER = """\ +Based on the summary above, identify the key concepts worth creating as \ +standalone wiki concept pages. -def build_compiler_agent(wiki_root: str, model: str, language: str = "en") -> Agent: - """Build and return the wiki-compiler agent. +Existing concept pages: {existing_concepts} - Creates @function_tool wrappers that bind *wiki_root* so the agent - doesn't need to supply it explicitly. +Return a JSON array of objects, each with: +- "name": concept slug (e.g. "transformer-architecture") +- "title": human-readable title (e.g. "Transformer Architecture") +- "is_update": true if this concept already exists and should be updated - Args: - wiki_root: Absolute path to the wiki directory. - model: LLM model name to use for the agent. - language: Language code for wiki content (e.g. 'en', 'fr'). +Only include concepts for significant themes. For the first document, \ +create 2-3 foundational concepts at most. Do NOT create concepts that are \ +just the document topic itself (e.g. don't create "machine-translation" \ +for a translation paper). - Returns: - Configured :class:`~agents.Agent` instance. 
- """ - schema_md = get_agents_md(Path(wiki_root)) - instructions = _COMPILER_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md) - instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language." - - @function_tool - def list_files(directory: str) -> str: - """List all Markdown files in a wiki subdirectory. - - Args: - directory: Subdirectory path relative to wiki root (e.g. 'sources'). - """ - return list_wiki_files(directory, wiki_root) - - @function_tool - def read_file(path: str) -> str: - """Read a Markdown file from the wiki. - - Args: - path: File path relative to wiki root (e.g. 'sources/notes.md'). - """ - return read_wiki_file(path, wiki_root) - - @function_tool - def write_file(path: str, content: str) -> str: - """Write or overwrite a Markdown file in the wiki. - - Args: - path: File path relative to wiki root (e.g. 'concepts/attention.md'). - content: Markdown content to write. - """ - return write_wiki_file(path, content, wiki_root) - - from agents.model_settings import ModelSettings - - return Agent( - name="wiki-compiler", - instructions=instructions, - tools=[list_files, read_file, write_file], - model=f"litellm/{model}", - model_settings=ModelSettings(parallel_tool_calls=False), - ) +Return ONLY valid JSON array, no fences, no explanation. +""" +_CONCEPT_PAGE_USER = """\ +Write the concept page for: {title} -def build_long_doc_compiler_agent(wiki_root: str, kb_dir: str, model: str, language: str = "en") -> Agent: - """Build the wiki-compiler agent with an extra get_page_content tool. +This concept relates to the document "{doc_name}" summarized above. +{update_instruction} - Args: - wiki_root: Absolute path to the wiki directory. - kb_dir: Absolute path to the knowledge base root (contains .openkb/). - model: LLM model name to use for the agent. - language: Language code for wiki content (e.g. 'en', 'fr'). +Return ONLY the Markdown content (no frontmatter, no code fences). 
Include: +- Clear explanation of the concept +- Key details from the source document +- [[wikilinks]] to related concepts and [[summaries/{doc_name}]] +""" - Returns: - Configured :class:`~agents.Agent` instance. - """ - from openkb.config import load_config +_LONG_DOC_SUMMARY_USER = """\ +This is a PageIndex summary for long document "{doc_name}" (doc_id: {doc_id}): - openkb_dir = Path(kb_dir) / ".openkb" - config = load_config(openkb_dir / "config.yaml") - _model = config.get("model", model) - pageindex_api_key = os.environ.get("PAGEINDEX_API_KEY", "") - client = PageIndexClient( - api_key=pageindex_api_key or None, - model=_model, - storage_path=str(openkb_dir), - ) - col = client.collection() - - schema_md = get_agents_md(Path(wiki_root)) - instructions = _LONG_DOC_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md) - instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language." - - @function_tool - def list_files(directory: str) -> str: - """List all Markdown files in a wiki subdirectory. - - Args: - directory: Subdirectory path relative to wiki root (e.g. 'sources'). - """ - return list_wiki_files(directory, wiki_root) - - @function_tool - def read_file(path: str) -> str: - """Read a Markdown file from the wiki. - - Args: - path: File path relative to wiki root (e.g. 'sources/notes.md'). - """ - return read_wiki_file(path, wiki_root) - - @function_tool - def write_file(path: str, content: str) -> str: - """Write or overwrite a Markdown file in the wiki. - - Args: - path: File path relative to wiki root (e.g. 'concepts/attention.md'). - content: Markdown content to write. - """ - return write_wiki_file(path, content, wiki_root) - - @function_tool - def get_page_content(doc_id: str, pages: str) -> str: - """Retrieve text content for specific pages of a long document. - - Args: - doc_id: Document identifier from PageIndex. - pages: Page range string, e.g. '1-5' or '3,7,12'. 
- """ - results = col.get_page_content(doc_id, pages) - if not results: - return "No content found for the given pages." - parts = [] - for item in results: - page_num = item.get("page_index", "?") - text = item.get("text", "") - parts.append(f"[Page {page_num}]\n{text}") - return "\n\n".join(parts) - - from agents.model_settings import ModelSettings - - return Agent( - name="wiki-compiler", - instructions=instructions, - tools=[list_files, read_file, write_file, get_page_content], - model=f"litellm/{_model}", - model_settings=ModelSettings(parallel_tool_calls=False), - ) +{content} + +Based on this structured summary, write a concise overview that captures \ +the key themes and findings. This will be used to generate concept pages. + +Return ONLY the Markdown content (no frontmatter, no code fences). +""" + + +# --------------------------------------------------------------------------- +# LLM helpers +# --------------------------------------------------------------------------- + +import threading + + +class _Spinner: + """Animated dots spinner that runs in a background thread.""" + + def __init__(self, label: str): + self._label = label + self._stop = threading.Event() + self._thread: threading.Thread | None = None + + def start(self) -> None: + sys.stdout.write(f" {self._label}") + sys.stdout.flush() + self._thread = threading.Thread(target=self._run, daemon=True) + self._thread.start() + + def _run(self) -> None: + while not self._stop.wait(timeout=1.0): + sys.stdout.write(".") + sys.stdout.flush() + + def stop(self, suffix: str = "") -> None: + self._stop.set() + if self._thread: + self._thread.join() + sys.stdout.write(f" {suffix}\n") + sys.stdout.flush() + + +def _format_usage(elapsed: float, usage) -> str: + """Format timing and token usage into a short summary string.""" + cached = getattr(usage, "prompt_tokens_details", None) + cache_info = "" + if cached and hasattr(cached, "cached_tokens") and cached.cached_tokens: + cache_info = f", 
cached={cached.cached_tokens}" + return f"{elapsed:.1f}s (in={usage.prompt_tokens}, out={usage.completion_tokens}{cache_info})" + + +def _fmt_messages(messages: list[dict], max_content: int = 200) -> str: + """Format messages for debug output, truncating long content.""" + parts = [] + for msg in messages: + role = msg["role"] + content = msg["content"] + if len(content) > max_content: + preview = content[:max_content] + f"... ({len(content)} chars)" + else: + preview = content + parts.append(f" [{role}] {preview}") + return "\n".join(parts) + + +def _llm_call(model: str, messages: list[dict], step_name: str, **kwargs) -> str: + """Single LLM call with animated progress and debug logging.""" + logger.debug("LLM request [%s]:\n%s", step_name, _fmt_messages(messages)) + if kwargs: + logger.debug("LLM kwargs [%s]: %s", step_name, kwargs) + + spinner = _Spinner(step_name) + spinner.start() + t0 = time.time() + + response = litellm.completion(model=model, messages=messages, **kwargs) + content = response.choices[0].message.content or "" + + spinner.stop(_format_usage(time.time() - t0, response.usage)) + logger.debug("LLM response [%s]:\n%s", step_name, content[:500] + ("..." if len(content) > 500 else "")) + return content.strip() + + +async def _llm_call_async(model: str, messages: list[dict], step_name: str) -> str: + """Async LLM call with timing output and debug logging.""" + logger.debug("LLM request [%s]:\n%s", step_name, _fmt_messages(messages)) + + t0 = time.time() + + response = await litellm.acompletion(model=model, messages=messages) + content = response.choices[0].message.content or "" + + elapsed = time.time() - t0 + sys.stdout.write(f" {step_name}... {_format_usage(elapsed, response.usage)}\n") + sys.stdout.flush() + logger.debug("LLM response [%s]:\n%s", step_name, content[:500] + ("..." 
if len(content) > 500 else "")) + return content.strip() + + +def _parse_json(text: str) -> list | dict: + """Parse JSON from LLM response, stripping markdown fences if present.""" + cleaned = text.strip() + if cleaned.startswith("```"): + first_nl = cleaned.index("\n") + cleaned = cleaned[first_nl + 1:] + if cleaned.endswith("```"): + cleaned = cleaned[:-3] + return json.loads(cleaned.strip()) + + +# --------------------------------------------------------------------------- +# File I/O helpers +# --------------------------------------------------------------------------- + +def _read_wiki_context(wiki_dir: Path) -> tuple[str, list[str]]: + """Read current index.md content and list of existing concept slugs.""" + index_path = wiki_dir / "index.md" + index_content = index_path.read_text(encoding="utf-8") if index_path.exists() else "" + + concepts_dir = wiki_dir / "concepts" + existing = sorted(p.stem for p in concepts_dir.glob("*.md")) if concepts_dir.exists() else [] + + return index_content, existing + + +def _find_source_filename(doc_name: str, kb_dir: Path) -> str: + """Find the original filename in raw/ for a given doc stem.""" + raw_dir = kb_dir / "raw" + if raw_dir.exists(): + for f in raw_dir.iterdir(): + if f.stem == doc_name: + return f.name + return f"{doc_name}.pdf" + + +def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str) -> None: + """Write summary page with frontmatter.""" + summaries_dir = wiki_dir / "summaries" + summaries_dir.mkdir(parents=True, exist_ok=True) + frontmatter = f"---\nsources: [{source_file}]\n---\n\n" + (summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8") + + +def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is_update: bool) -> None: + """Write or update a concept page, managing the sources frontmatter.""" + concepts_dir = wiki_dir / "concepts" + concepts_dir.mkdir(parents=True, exist_ok=True) + path = concepts_dir / f"{name}.md" + + if 
is_update and path.exists(): + existing = path.read_text(encoding="utf-8") + if source_file not in existing: + if existing.startswith("---"): + end = existing.index("---", 3) + fm = existing[:end + 3] + body = existing[end + 3:] + if "sources:" in fm: + fm = fm.replace("sources: [", f"sources: [{source_file}, ") + existing = fm + body + existing += f"\n\n{content}" + path.write_text(existing, encoding="utf-8") + else: + frontmatter = f"---\nsources: [{source_file}]\n---\n\n" + path.write_text(frontmatter + content, encoding="utf-8") + + +def _update_index(wiki_dir: Path, doc_name: str, concept_names: list[str]) -> None: + """Append document and concept entries to index.md.""" + index_path = wiki_dir / "index.md" + if not index_path.exists(): + return + + text = index_path.read_text(encoding="utf-8") + + doc_entry = f"- [[summaries/{doc_name}]]" + if doc_entry not in text: + if "## Documents" in text: + text = text.replace("## Documents\n", f"## Documents\n{doc_entry}\n", 1) + + for name in concept_names: + concept_entry = f"- [[concepts/{name}]]" + if concept_entry not in text: + if "## Concepts" in text: + text = text.replace("## Concepts\n", f"## Concepts\n{concept_entry}\n", 1) + + index_path.write_text(text, encoding="utf-8") + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +DEFAULT_COMPILE_CONCURRENCY = 5 async def compile_short_doc( @@ -204,17 +283,15 @@ async def compile_short_doc( source_path: Path, kb_dir: Path, model: str, + max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, ) -> None: - """Run the compiler agent for a short (non-PageIndex) document. + """Compile a short document using a multi-step LLM pipeline with caching. - Reads the converted source Markdown, then asks the agent to generate a - summary, update concept pages, and update the index. - - Args: - doc_name: Document stem name (no extension). 
- source_path: Path to the converted Markdown in wiki/sources/. - kb_dir: Root of the knowledge base (contains wiki/ and .openkb/). - model: LLM model name. + Step 1: Build base context A (schema + doc content). + Step 2: A → generate summary. + Step 3: A + summary → extract concept list. + Step 4: Concurrent LLM calls (A cached) → generate each concept page. + Step 5: Code writes files, updates index. """ from openkb.config import load_config @@ -222,17 +299,92 @@ async def compile_short_doc( config = load_config(openkb_dir / "config.yaml") language: str = config.get("language", "en") - wiki_root = str(kb_dir / "wiki") - agent = build_compiler_agent(wiki_root, model, language=language) - + wiki_dir = kb_dir / "wiki" + schema_md = get_agents_md(wiki_dir) + source_file = _find_source_filename(doc_name, kb_dir) content = source_path.read_text(encoding="utf-8") - message = ( - f"New document: {doc_name}\n\n" - f"Full text:\n{content}\n\n" - "Generate summary, update concepts, update index." 
+ + # Base context A: system + document + system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format( + schema_md=schema_md, language=language, + )} + doc_msg = {"role": "user", "content": _SUMMARY_USER.format( + doc_name=doc_name, content=content, + )} + + # --- Step 1: Generate summary --- + summary = _llm_call(model, [system_msg, doc_msg], "summary") + _write_summary(wiki_dir, doc_name, source_file, summary) + + # --- Step 2: Extract concept list (A cached) --- + _, existing_concepts = _read_wiki_context(wiki_dir) + + concepts_list_raw = _llm_call(model, [ + system_msg, + doc_msg, + {"role": "assistant", "content": summary}, + {"role": "user", "content": _CONCEPTS_LIST_USER.format( + existing_concepts=", ".join(existing_concepts) if existing_concepts else "(none yet)", + )}, + ], "concepts-list", max_tokens=512) + + try: + concepts_list = _parse_json(concepts_list_raw) + except (json.JSONDecodeError, ValueError) as exc: + logger.warning("Failed to parse concepts list: %s", exc) + logger.debug("Raw: %s", concepts_list_raw) + _update_index(wiki_dir, doc_name, []) + return + + if not concepts_list: + _update_index(wiki_dir, doc_name, []) + return + + # --- Step 3: Generate concept pages concurrently (A cached) --- + semaphore = asyncio.Semaphore(max_concurrency) + + async def _gen_concept(concept: dict) -> tuple[str, str, bool]: + name = concept["name"] + title = concept.get("title", name) + is_update = concept.get("is_update", False) + update_instruction = ( + "This concept page already exists. Add new information from this document " + "without duplicating existing content." 
+ if is_update else "" + ) + + async with semaphore: + page_content = await _llm_call_async(model, [ + system_msg, + doc_msg, + {"role": "assistant", "content": summary}, + {"role": "user", "content": _CONCEPT_PAGE_USER.format( + title=title, doc_name=doc_name, + update_instruction=update_instruction, + )}, + ], f"concept:{name}") + + return name, page_content, is_update + + sys.stdout.write(f" Generating {len(concepts_list)} concept(s) (concurrency={max_concurrency})...\n") + sys.stdout.flush() + + results = await asyncio.gather( + *[_gen_concept(c) for c in concepts_list], + return_exceptions=True, ) - await Runner.run(agent, message) + concept_names = [] + for r in results: + if isinstance(r, Exception): + logger.warning("Concept generation failed: %s", r) + continue + name, page_content, is_update = r + _write_concept(wiki_dir, name, page_content, source_file, is_update) + concept_names.append(name) + + # --- Step 4: Update index (code only) --- + _update_index(wiki_dir, doc_name, concept_names) async def compile_long_doc( @@ -241,18 +393,12 @@ async def compile_long_doc( doc_id: str, kb_dir: Path, model: str, + max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, ) -> None: - """Run the compiler agent for a long (PageIndex) document. - - The summary page is already written. The agent updates concept pages and - the index without regenerating the summary. + """Compile a long (PageIndex) document's concepts and index. - Args: - doc_name: Document stem name (no extension). - summary_path: Path to the existing summary Markdown in wiki/summaries/. - doc_id: PageIndex document identifier. - kb_dir: Root of the knowledge base. - model: LLM model name. + The summary page is already written by the indexer. This function + generates concept pages and updates the index. 
""" from openkb.config import load_config @@ -260,14 +406,87 @@ async def compile_long_doc( config = load_config(openkb_dir / "config.yaml") language: str = config.get("language", "en") - wiki_root = str(kb_dir / "wiki") - agent = build_long_doc_compiler_agent(wiki_root, str(kb_dir), model, language=language) - - content = summary_path.read_text(encoding="utf-8") - message = ( - f"New long document: {doc_name} (doc_id: {doc_id})\n" - f"Summary tree:\n{content}\n" - "Update concepts and index. Do NOT regenerate summary." + wiki_dir = kb_dir / "wiki" + schema_md = get_agents_md(wiki_dir) + source_file = _find_source_filename(doc_name, kb_dir) + summary = summary_path.read_text(encoding="utf-8") + + # Base context A + system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format( + schema_md=schema_md, language=language, + )} + doc_msg = {"role": "user", "content": _LONG_DOC_SUMMARY_USER.format( + doc_name=doc_name, doc_id=doc_id, content=summary, + )} + + # --- Step 1: Extract concept list --- + _, existing_concepts = _read_wiki_context(wiki_dir) + + # Get a concise overview first (for concept generation context) + overview = _llm_call(model, [system_msg, doc_msg], "overview") + + concepts_list_raw = _llm_call(model, [ + system_msg, + doc_msg, + {"role": "assistant", "content": overview}, + {"role": "user", "content": _CONCEPTS_LIST_USER.format( + existing_concepts=", ".join(existing_concepts) if existing_concepts else "(none yet)", + )}, + ], "concepts-list", max_tokens=512) + + try: + concepts_list = _parse_json(concepts_list_raw) + except (json.JSONDecodeError, ValueError) as exc: + logger.warning("Failed to parse concepts list: %s", exc) + logger.debug("Raw: %s", concepts_list_raw) + _update_index(wiki_dir, doc_name, []) + return + + if not concepts_list: + _update_index(wiki_dir, doc_name, []) + return + + # --- Step 2: Generate concept pages concurrently --- + semaphore = asyncio.Semaphore(max_concurrency) + + async def _gen_concept(concept: dict) -> 
tuple[str, str, bool]: + name = concept["name"] + title = concept.get("title", name) + is_update = concept.get("is_update", False) + update_instruction = ( + "This concept page already exists. Add new information." + if is_update else "" + ) + + async with semaphore: + page_content = await _llm_call_async(model, [ + system_msg, + doc_msg, + {"role": "assistant", "content": overview}, + {"role": "user", "content": _CONCEPT_PAGE_USER.format( + title=title, doc_name=doc_name, + update_instruction=update_instruction, + )}, + ], f"concept:{name}") + + return name, page_content, is_update + + sys.stdout.write(f" Generating {len(concepts_list)} concept(s) (concurrency={max_concurrency})...\n") + sys.stdout.flush() + + results = await asyncio.gather( + *[_gen_concept(c) for c in concepts_list], + return_exceptions=True, ) - await Runner.run(agent, message) + concept_names = [] + for r in results: + if isinstance(r, Exception): + logger.warning("Concept generation failed: %s", r) + continue + name, page_content, is_update = r + _write_concept(wiki_dir, name, page_content, source_file, is_update) + concept_names.append(name) + + # --- Step 3: Update index (code only) --- + _update_index(wiki_dir, doc_name, concept_names) diff --git a/openkb/agent/tools.py b/openkb/agent/tools.py index 7a5b1ca..40875f3 100644 --- a/openkb/agent/tools.py +++ b/openkb/agent/tools.py @@ -72,3 +72,42 @@ def write_wiki_file(path: str, content: str, wiki_root: str) -> str: full_path.parent.mkdir(parents=True, exist_ok=True) full_path.write_text(content, encoding="utf-8") return f"Written: {path}" + + +def write_wiki_files(files_json: str, wiki_root: str) -> str: + """Write multiple Markdown files to the wiki in one call. + + Args: + files_json: JSON array of objects, each with ``"path"`` and ``"content"`` keys. + Example: ``[{"path": "concepts/foo.md", "content": "# Foo\\n..."}]`` + wiki_root: Absolute path to the wiki root directory. 
+ + Returns: + Summary of written files, or error message on failure. + """ + import json + + try: + files = json.loads(files_json) + except json.JSONDecodeError as exc: + return f"Invalid JSON: {exc}" + + if not isinstance(files, list): + return "Expected a JSON array of {path, content} objects." + + root = Path(wiki_root).resolve() + written: list[str] = [] + for entry in files: + path = entry.get("path", "") + content = entry.get("content", "") + if not path: + continue + full_path = (root / path).resolve() + if not full_path.is_relative_to(root): + written.append(f"Skipped (path escape): {path}") + continue + full_path.parent.mkdir(parents=True, exist_ok=True) + full_path.write_text(content, encoding="utf-8") + written.append(path) + + return f"Written {len(written)} files: {', '.join(written)}" diff --git a/openkb/cli.py b/openkb/cli.py index da664f5..388ac87 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -3,13 +3,20 @@ import asyncio import json +import logging import time from pathlib import Path import os +# Disable Agents SDK tracing (requires OPENAI_API_KEY otherwise) +os.environ.setdefault("OPENAI_AGENTS_DISABLE_TRACING", "1") +# Use local model cost map — skip fetching from GitHub on every invocation +os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "True") + import click import litellm +litellm.suppress_debug_info = True from dotenv import load_dotenv from openkb.config import DEFAULT_CONFIG, load_config, save_config @@ -17,14 +24,28 @@ from openkb.log import append_log from openkb.schema import AGENTS_MD -load_dotenv() +load_dotenv() # load from cwd (covers running inside the KB dir) + +def _setup_llm_key(kb_dir: Path | None = None) -> None: + """Set LiteLLM API key from LLM_API_KEY env var if present. + + If *kb_dir* is given, also loads ``.env`` from the KB root so that + the key is found even when the CLI is invoked from another directory. + Also propagates to provider-specific env vars (OPENAI_API_KEY, etc.) 
+ so that the Agents SDK litellm provider can pick them up. + """ + if kb_dir is not None: + env_file = kb_dir / ".env" + if env_file.exists(): + load_dotenv(env_file, override=False) -def _setup_llm_key() -> None: - """Set LiteLLM API key from LLM_API_KEY env var if present.""" api_key = os.environ.get("LLM_API_KEY", "") if api_key: litellm.api_key = api_key + for env_var in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GEMINI_API_KEY"): + if not os.environ.get(env_var): + os.environ[env_var] = api_key # Supported document extensions for the `add` command SUPPORTED_EXTENSIONS = { @@ -73,9 +94,10 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: from openkb.agent.compiler import compile_long_doc, compile_short_doc from openkb.state import HashRegistry + logger = logging.getLogger(__name__) openkb_dir = kb_dir / ".openkb" config = load_config(openkb_dir / "config.yaml") - _setup_llm_key() + _setup_llm_key(kb_dir) model: str = config.get("model", DEFAULT_CONFIG["model"]) registry = HashRegistry(openkb_dir / "hashes.json") @@ -85,6 +107,7 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: result = convert_document(file_path, kb_dir) except Exception as exc: click.echo(f" [ERROR] Conversion failed: {exc}") + logger.debug("Conversion traceback:", exc_info=True) return if result.skipped: @@ -101,6 +124,7 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: index_result = index_long_document(result.raw_path, kb_dir) except Exception as exc: click.echo(f" [ERROR] Indexing failed: {exc}") + logger.debug("Indexing traceback:", exc_info=True) return summary_path = kb_dir / "wiki" / "summaries" / f"{doc_name}.md" @@ -117,6 +141,7 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: time.sleep(2) else: click.echo(f" [ERROR] Compilation failed: {exc}") + logger.debug("Compilation traceback:", exc_info=True) return else: click.echo(f" Compiling short doc…") @@ -130,6 +155,7 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: 
time.sleep(2) else: click.echo(f" [ERROR] Compilation failed: {exc}") + logger.debug("Compilation traceback:", exc_info=True) return # Register hash only after successful compilation @@ -146,8 +172,15 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: # --------------------------------------------------------------------------- @click.group() -def cli(): +@click.option("-v", "--verbose", is_flag=True, default=False, help="Enable verbose logging.") +def cli(verbose): """OpenKB — Karpathy's LLM Knowledge Base workflow, powered by PageIndex.""" + logging.basicConfig( + format="%(name)s %(levelname)s: %(message)s", + level=logging.WARNING, + ) + if verbose: + logging.getLogger("openkb").setLevel(logging.DEBUG) @cli.command() @@ -249,7 +282,7 @@ def query(question, save): openkb_dir = kb_dir / ".openkb" config = load_config(openkb_dir / "config.yaml") - _setup_llm_key() + _setup_llm_key(kb_dir) model: str = config.get("model", DEFAULT_CONFIG["model"]) try: @@ -314,7 +347,7 @@ def lint(fix): openkb_dir = kb_dir / ".openkb" config = load_config(openkb_dir / "config.yaml") - _setup_llm_key() + _setup_llm_key(kb_dir) model: str = config.get("model", DEFAULT_CONFIG["model"]) # Structural lint diff --git a/pyproject.toml b/pyproject.toml index 393cbd0..eb1cdde 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ classifiers = [ keywords = ["ai", "rag", "retrieval", "knowledge-base", "llm", "pageindex", "agents", "document"] dependencies = [ "pageindex==0.3.0.dev0", - "markitdown[all]", + "markitdown", "click>=8.0", "watchdog>=3.0", "litellm", diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 0549bb9..1d17c6c 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -1,158 +1,266 @@ -"""Tests for openkb.agent.compiler.""" +"""Tests for openkb.agent.compiler pipeline.""" from __future__ import annotations +import json from pathlib import Path -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import 
MagicMock, patch, AsyncMock import pytest from openkb.agent.compiler import ( - build_compiler_agent, compile_long_doc, compile_short_doc, + _parse_json, + _write_summary, + _write_concept, + _update_index, + _read_wiki_context, ) -from openkb.schema import SCHEMA_MD -class TestBuildCompilerAgent: - def test_agent_name(self, tmp_path): - agent = build_compiler_agent(str(tmp_path), "gpt-4o-mini") - assert agent.name == "wiki-compiler" - - def test_agent_tools_count(self, tmp_path): - agent = build_compiler_agent(str(tmp_path), "gpt-4o-mini") - # list_files, read_file, write_file - assert len(agent.tools) == 3 - - def test_schema_in_instructions(self, tmp_path): - agent = build_compiler_agent(str(tmp_path), "gpt-4o-mini") - assert SCHEMA_MD in agent.instructions - - def test_agent_model(self, tmp_path): - agent = build_compiler_agent(str(tmp_path), "my-custom-model") - assert agent.model == "litellm/my-custom-model" - - def test_tool_names(self, tmp_path): - agent = build_compiler_agent(str(tmp_path), "gpt-4o-mini") - tool_names = {t.name for t in agent.tools} - assert "list_files" in tool_names - assert "read_file" in tool_names - assert "write_file" in tool_names +class TestParseJson: + def test_plain_json(self): + assert _parse_json('[{"name": "foo"}]') == [{"name": "foo"}] + + def test_fenced_json(self): + text = '```json\n[{"name": "bar"}]\n```' + assert _parse_json(text) == [{"name": "bar"}] + + def test_invalid_json(self): + with pytest.raises((json.JSONDecodeError, ValueError)): + _parse_json("not json") + + +class TestWriteSummary: + def test_writes_with_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.") + path = wiki / "summaries" / "my-doc.md" + assert path.exists() + text = path.read_text() + assert "sources: [my-doc.pdf]" in text + assert "# Summary" in text + + +class TestWriteConcept: + def test_new_concept(self, tmp_path): + wiki = tmp_path / "wiki" + 
wiki.mkdir() + _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False) + path = wiki / "concepts" / "attention.md" + assert path.exists() + text = path.read_text() + assert "sources: [paper.pdf]" in text + assert "# Attention" in text + + def test_update_concept_appends_source(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nOld content.", + encoding="utf-8", + ) + _write_concept(wiki, "attention", "New info from paper2.", "paper2.pdf", True) + text = (concepts / "attention.md").read_text() + assert "paper2.pdf" in text + assert "paper1.pdf" in text + assert "New info from paper2." in text + + +class TestUpdateIndex: + def test_appends_entries(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + _update_index(wiki, "my-doc", ["attention", "transformer"]) + text = (wiki / "index.md").read_text() + assert "[[summaries/my-doc]]" in text + assert "[[concepts/attention]]" in text + assert "[[concepts/transformer]]" in text + + def test_no_duplicates(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n- [[summaries/my-doc]]\n\n## Concepts\n", + encoding="utf-8", + ) + _update_index(wiki, "my-doc", []) + text = (wiki / "index.md").read_text() + assert text.count("[[summaries/my-doc]]") == 1 + + +class TestReadWikiContext: + def test_empty_wiki(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + index, concepts = _read_wiki_context(wiki) + assert index == "" + assert concepts == [] + + def test_with_content(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text("# Index\n", encoding="utf-8") + concepts_dir = wiki / "concepts" + concepts_dir.mkdir() + 
(concepts_dir / "attention.md").write_text("# Attention", encoding="utf-8") + (concepts_dir / "transformer.md").write_text("# Transformer", encoding="utf-8") + index, concepts = _read_wiki_context(wiki) + assert "# Index" in index + assert concepts == ["attention", "transformer"] + + +def _mock_completion(responses: list[str]): + """Create a mock for litellm.completion that returns responses in order.""" + call_count = {"n": 0} + + def side_effect(*args, **kwargs): + idx = min(call_count["n"], len(responses) - 1) + call_count["n"] += 1 + mock_resp = MagicMock() + mock_resp.choices = [MagicMock()] + mock_resp.choices[0].message.content = responses[idx] + mock_resp.usage = MagicMock(prompt_tokens=100, completion_tokens=50) + mock_resp.usage.prompt_tokens_details = None + return mock_resp + + return side_effect + + +def _mock_acompletion(responses: list[str]): + """Create an async mock for litellm.acompletion.""" + call_count = {"n": 0} + + async def side_effect(*args, **kwargs): + idx = min(call_count["n"], len(responses) - 1) + call_count["n"] += 1 + mock_resp = MagicMock() + mock_resp.choices = [MagicMock()] + mock_resp.choices[0].message.content = responses[idx] + mock_resp.usage = MagicMock(prompt_tokens=100, completion_tokens=50) + mock_resp.usage.prompt_tokens_details = None + return mock_resp + + return side_effect class TestCompileShortDoc: @pytest.mark.asyncio - async def test_calls_runner_run(self, tmp_path): - # Create a source file - wiki_dir = tmp_path / "wiki" - wiki_dir.mkdir() - source_path = wiki_dir / "sources" / "my_doc.md" - source_path.parent.mkdir(parents=True) - source_path.write_text("# My Doc\n\nSome content.", encoding="utf-8") - - # Create .openkb dir for agent build - openkb_dir = tmp_path / ".openkb" - openkb_dir.mkdir() - - mock_result = MagicMock() - mock_result.final_output = "Done" + async def test_full_pipeline(self, tmp_path): + # Setup KB structure + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / 
"summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + source_path = wiki / "sources" / "test-doc.md" + source_path.write_text("# Test Doc\n\nSome content about transformers.", encoding="utf-8") + (tmp_path / ".openkb").mkdir() + (tmp_path / "raw").mkdir() + (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake") + + summary_response = "# Summary\n\nThis document discusses transformers." + concepts_list_response = json.dumps([ + {"name": "transformer", "title": "Transformer", "is_update": False}, + ]) + concept_page_response = "# Transformer\n\nA neural network architecture." + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_response, concepts_list_response]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([concept_page_response]) + ) + await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini") - with patch("openkb.agent.compiler.Runner.run", new_callable=AsyncMock) as mock_run: - mock_run.return_value = mock_result - await compile_short_doc("my_doc", source_path, tmp_path, "gpt-4o-mini") + # Verify summary written + summary_path = wiki / "summaries" / "test-doc.md" + assert summary_path.exists() + assert "sources: [test-doc.pdf]" in summary_path.read_text() - mock_run.assert_called_once() - call_args = mock_run.call_args - agent_arg = call_args[0][0] - message_arg = call_args[0][1] + # Verify concept written + concept_path = wiki / "concepts" / "transformer.md" + assert concept_path.exists() + assert "sources: [test-doc.pdf]" in concept_path.read_text() - assert agent_arg.name == "wiki-compiler" - assert "my_doc" in message_arg - assert "Some content." 
in message_arg - assert "Generate summary" in message_arg + # Verify index updated + index_text = (wiki / "index.md").read_text() + assert "[[summaries/test-doc]]" in index_text + assert "[[concepts/transformer]]" in index_text @pytest.mark.asyncio - async def test_message_contains_doc_name_and_content(self, tmp_path): - wiki_dir = tmp_path / "wiki" - source_path = wiki_dir / "sources" / "test_paper.md" - source_path.parent.mkdir(parents=True) - source_path.write_text("# Test Paper\n\nKey findings here.", encoding="utf-8") - + async def test_handles_bad_json(self, tmp_path): + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / "summaries").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n", + encoding="utf-8", + ) + source_path = wiki / "sources" / "doc.md" + source_path.write_text("Content", encoding="utf-8") (tmp_path / ".openkb").mkdir() - captured = {} - - async def fake_run(agent, message, **kwargs): - captured["message"] = message - return MagicMock(final_output="ok") - - with patch("openkb.agent.compiler.Runner.run", side_effect=fake_run): - await compile_short_doc("test_paper", source_path, tmp_path, "gpt-4o-mini") + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion(["Summary text", "not valid json"]) + ) + # Should not raise + await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini") - assert "test_paper" in captured["message"] - assert "Key findings here." 
in captured["message"] + # Summary should still be written + assert (wiki / "summaries" / "doc.md").exists() class TestCompileLongDoc: @pytest.mark.asyncio - async def test_calls_runner_run(self, tmp_path): - wiki_dir = tmp_path / "wiki" - summary_path = wiki_dir / "summaries" / "big_doc.md" - summary_path.parent.mkdir(parents=True) - summary_path.write_text("# Big Doc Summary\n\nSection tree.", encoding="utf-8") - + async def test_full_pipeline(self, tmp_path): + wiki = tmp_path / "wiki" + (wiki / "summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n", + encoding="utf-8", + ) + summary_path = wiki / "summaries" / "big-doc.md" + summary_path.write_text("# Big Doc\n\nPageIndex summary tree.", encoding="utf-8") openkb_dir = tmp_path / ".openkb" openkb_dir.mkdir() - # Write minimal config (openkb_dir / "config.yaml").write_text("model: gpt-4o-mini\n") - - mock_result = MagicMock() - mock_result.final_output = "Done" - - with patch("openkb.agent.compiler.Runner.run", new_callable=AsyncMock) as mock_run, \ - patch("openkb.agent.compiler.PageIndexClient") as mock_client_cls: - mock_client = MagicMock() - mock_client_cls.return_value = mock_client - mock_run.return_value = mock_result - - await compile_long_doc( - "big_doc", summary_path, "doc-abc123", tmp_path, "gpt-4o-mini" + (tmp_path / "raw").mkdir() + (tmp_path / "raw" / "big-doc.pdf").write_bytes(b"fake") + + overview_response = "Overview of the big document." + concepts_list_response = json.dumps([ + {"name": "deep-learning", "title": "Deep Learning", "is_update": False}, + ]) + concept_page_response = "# Deep Learning\n\nA subfield of ML." 
+ + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([overview_response, concepts_list_response]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([concept_page_response]) ) - - mock_run.assert_called_once() - call_args = mock_run.call_args - message_arg = call_args[0][1] - - assert "big_doc" in message_arg - assert "doc-abc123" in message_arg - assert "Do NOT regenerate summary" in message_arg - - @pytest.mark.asyncio - async def test_long_doc_agent_has_four_tools(self, tmp_path): - wiki_dir = tmp_path / "wiki" - summary_path = wiki_dir / "summaries" / "big.md" - summary_path.parent.mkdir(parents=True) - summary_path.write_text("Summary content", encoding="utf-8") - - openkb_dir = tmp_path / ".openkb" - openkb_dir.mkdir() - (openkb_dir / "config.yaml").write_text("model: gpt-4o-mini\n") - - captured_agent = {} - - async def fake_run(agent, message, **kwargs): - captured_agent["agent"] = agent - return MagicMock(final_output="ok") - - with patch("openkb.agent.compiler.Runner.run", side_effect=fake_run), \ - patch("openkb.agent.compiler.PageIndexClient") as mock_client_cls: - mock_client_cls.return_value = MagicMock() - await compile_long_doc( - "big", summary_path, "doc-xyz", tmp_path, "gpt-4o-mini" + "big-doc", summary_path, "doc-123", tmp_path, "gpt-4o-mini" ) - agent = captured_agent["agent"] - assert len(agent.tools) == 4 - tool_names = {t.name for t in agent.tools} - assert "get_page_content" in tool_names + concept_path = wiki / "concepts" / "deep-learning.md" + assert concept_path.exists() + assert "Deep Learning" in concept_path.read_text() + + index_text = (wiki / "index.md").read_text() + assert "[[summaries/big-doc]]" in index_text + assert "[[concepts/deep-learning]]" in index_text From 864068173cde37571b59fcbff00f3f50e104ba85 Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 9 Apr 2026 21:07:37 +0800 Subject: [PATCH 02/44] feat: add 
_read_concept_briefs for concept dedup context --- openkb/agent/compiler.py | 55 +++++++++++++++++++++++++++++++--- tests/test_compiler.py | 64 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 4 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 8307abb..9d721ca 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -12,7 +12,9 @@ import asyncio import json import logging +import re import sys +import threading import time from pathlib import Path @@ -95,9 +97,6 @@ # LLM helpers # --------------------------------------------------------------------------- -import threading - - class _Spinner: """Animated dots spinner that runs in a background thread.""" @@ -208,6 +207,37 @@ def _read_wiki_context(wiki_dir: Path) -> tuple[str, list[str]]: return index_content, existing +def _read_concept_briefs(wiki_dir: Path) -> str: + """Read existing concept pages and return compact one-line summaries. + + For each concept, skips YAML frontmatter, takes the first 150 chars of the + body (newlines collapsed to spaces), and formats as ``- {slug}: {brief}``. + + Returns "(none yet)" if the concepts directory is missing or empty. 
+ """ + concepts_dir = wiki_dir / "concepts" + if not concepts_dir.exists(): + return "(none yet)" + + md_files = sorted(concepts_dir.glob("*.md")) + if not md_files: + return "(none yet)" + + lines: list[str] = [] + for path in md_files: + text = path.read_text(encoding="utf-8") + # Strip YAML frontmatter if present + if text.startswith("---"): + end = text.find("---", 3) + if end != -1: + text = text[end + 3:] + body = text.strip().replace("\n", " ") + brief = body[:150] + lines.append(f"- {path.stem}: {brief}") + + return "\n".join(lines) + + def _find_source_filename(doc_name: str, kb_dir: Path) -> str: """Find the original filename in raw/ for a given doc stem.""" raw_dir = kb_dir / "raw" @@ -226,11 +256,24 @@ def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str (summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8") +_SAFE_NAME_RE = re.compile(r'[^a-zA-Z0-9_\-]') + + +def _sanitize_concept_name(name: str) -> str: + """Sanitize a concept name for safe use as a filename.""" + sanitized = _SAFE_NAME_RE.sub("-", name).strip("-") + return sanitized or "unnamed-concept" + + def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is_update: bool) -> None: """Write or update a concept page, managing the sources frontmatter.""" concepts_dir = wiki_dir / "concepts" concepts_dir.mkdir(parents=True, exist_ok=True) - path = concepts_dir / f"{name}.md" + safe_name = _sanitize_concept_name(name) + path = (concepts_dir / f"{safe_name}.md").resolve() + if not path.is_relative_to(concepts_dir.resolve()): + logger.warning("Concept name escapes concepts dir: %s", name) + return if is_update and path.exists(): existing = path.read_text(encoding="utf-8") @@ -241,7 +284,11 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is body = existing[end + 3:] if "sources:" in fm: fm = fm.replace("sources: [", f"sources: [{source_file}, ") + else: + fm = fm.replace("---\n", 
f"---\nsources: [{source_file}]\n", 1) existing = fm + body + else: + existing = f"---\nsources: [{source_file}]\n---\n\n" + existing existing += f"\n\n{content}" path.write_text(existing, encoding="utf-8") else: diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 1d17c6c..4be4aa7 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -15,6 +15,7 @@ _write_concept, _update_index, _read_wiki_context, + _read_concept_briefs, ) @@ -116,6 +117,69 @@ def test_with_content(self, tmp_path): assert concepts == ["attention", "transformer"] +class TestReadConceptBriefs: + def test_empty_wiki(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "concepts").mkdir() + assert _read_concept_briefs(wiki) == "(none yet)" + + def test_no_concepts_dir(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + assert _read_concept_briefs(wiki) == "(none yet)" + + def test_reads_briefs_with_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper.pdf]\n---\n\nAttention is a mechanism that allows models to focus on relevant parts.", + encoding="utf-8", + ) + result = _read_concept_briefs(wiki) + assert "- attention:" in result + assert "Attention is a mechanism" in result + assert "sources" not in result + assert "---" not in result + + def test_reads_briefs_without_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "transformer.md").write_text( + "Transformer is a neural network architecture based on attention.", + encoding="utf-8", + ) + result = _read_concept_briefs(wiki) + assert "- transformer:" in result + assert "Transformer is a neural network" in result + + def test_truncates_long_content(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + long_body = "A" * 300 + (concepts 
/ "longconcept.md").write_text(long_body, encoding="utf-8") + result = _read_concept_briefs(wiki) + # The brief part should be truncated at 150 chars + brief = result.split("- longconcept: ", 1)[1] + assert len(brief) == 150 + assert brief == "A" * 150 + + def test_sorted_alphabetically(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "zebra.md").write_text("Zebra concept.", encoding="utf-8") + (concepts / "apple.md").write_text("Apple concept.", encoding="utf-8") + (concepts / "mango.md").write_text("Mango concept.", encoding="utf-8") + result = _read_concept_briefs(wiki) + lines = result.strip().splitlines() + slugs = [line.split(":")[0].lstrip("- ") for line in lines] + assert slugs == ["apple", "mango", "zebra"] + + def _mock_completion(responses: list[str]): """Create a mock for litellm.completion that returns responses in order.""" call_count = {"n": 0} From 4f1d3323cc01f4be1cbd053c90773e3349b6748f Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 9 Apr 2026 21:10:30 +0800 Subject: [PATCH 03/44] feat: add concepts plan and update prompt templates Add _CONCEPTS_PLAN_USER (create/update/related JSON structure) and _CONCEPT_UPDATE_USER templates; add TestParseConceptsPlan tests. --- openkb/agent/compiler.py | 71 ++++++++++++++++++++++++++++++++++++++++ tests/test_compiler.py | 59 +++++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 9d721ca..6830b69 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -69,6 +69,33 @@ Return ONLY valid JSON array, no fences, no explanation. """ +_CONCEPTS_PLAN_USER = """\ +Based on the summary above, decide how to update the wiki's concept pages. + +Existing concept pages: +{concept_briefs} + +Return a JSON object with three keys: + +1. "create" — new concepts not covered by any existing page. 
Array of objects: + {{"name": "concept-slug", "title": "Human-Readable Title"}} + +2. "update" — existing concepts that have significant new information from \ +this document worth integrating. Array of objects: + {{"name": "existing-slug", "title": "Existing Title"}} + +3. "related" — existing concepts tangentially related to this document but \ +not needing content changes, just a cross-reference link. Array of slug strings. + +Rules: +- For the first few documents, create 2-3 foundational concepts at most. +- Do NOT create a concept that overlaps with an existing one — use "update". +- Do NOT create concepts that are just the document topic itself. +- "related" is for lightweight cross-linking only, no content rewrite needed. + +Return ONLY valid JSON, no fences, no explanation. +""" + _CONCEPT_PAGE_USER = """\ Write the concept page for: {title} @@ -81,6 +108,20 @@ - [[wikilinks]] to related concepts and [[summaries/{doc_name}]] """ +_CONCEPT_UPDATE_USER = """\ +Update the concept page for: {title} + +Current content of this page: +{existing_content} + +New information from document "{doc_name}" (summarized above) should be \ +integrated into this page. Rewrite the full page incorporating the new \ +information naturally — do not just append. Maintain existing \ +[[wikilinks]] and add new ones where appropriate. + +Return ONLY the Markdown content (no frontmatter, no code fences). 
+""" + _LONG_DOC_SUMMARY_USER = """\ This is a PageIndex summary for long document "{doc_name}" (doc_id: {doc_id}): @@ -296,6 +337,36 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is path.write_text(frontmatter + content, encoding="utf-8") +def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_file: str) -> None: + """Add a cross-reference link to an existing concept page (no LLM call).""" + concepts_dir = wiki_dir / "concepts" + path = concepts_dir / f"{concept_slug}.md" + if not path.exists(): + return + + text = path.read_text(encoding="utf-8") + link = f"[[summaries/{doc_name}]]" + if link in text: + return + + # Update sources in frontmatter + if source_file not in text: + if text.startswith("---"): + end = text.index("---", 3) + fm = text[:end + 3] + body = text[end + 3:] + if "sources:" in fm: + fm = fm.replace("sources: [", f"sources: [{source_file}, ") + else: + fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1) + text = fm + body + else: + text = f"---\nsources: [{source_file}]\n---\n\n" + text + + text += f"\n\nSee also: {link}" + path.write_text(text, encoding="utf-8") + + def _update_index(wiki_dir: Path, doc_name: str, concept_names: list[str]) -> None: """Append document and concept entries to index.md.""" index_path = wiki_dir / "index.md" diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 4be4aa7..fd5b249 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -16,6 +16,7 @@ _update_index, _read_wiki_context, _read_concept_briefs, + _add_related_link, ) @@ -32,6 +33,31 @@ def test_invalid_json(self): _parse_json("not json") +class TestParseConceptsPlan: + def test_dict_format(self): + text = json.dumps({ + "create": [{"name": "foo", "title": "Foo"}], + "update": [{"name": "bar", "title": "Bar"}], + "related": ["baz"], + }) + parsed = _parse_json(text) + assert isinstance(parsed, dict) + assert len(parsed["create"]) == 1 + assert 
len(parsed["update"]) == 1 + assert parsed["related"] == ["baz"] + + def test_fallback_list_format(self): + text = json.dumps([{"name": "foo", "title": "Foo"}]) + parsed = _parse_json(text) + assert isinstance(parsed, list) + + def test_fenced_dict(self): + text = '```json\n{"create": [], "update": [], "related": []}\n```' + parsed = _parse_json(text) + assert isinstance(parsed, dict) + assert parsed["create"] == [] + + class TestWriteSummary: def test_writes_with_frontmatter(self, tmp_path): wiki = tmp_path / "wiki" @@ -180,6 +206,39 @@ def test_sorted_alphabetically(self, tmp_path): assert slugs == ["apple", "mango", "zebra"] +class TestAddRelatedLink: + def test_adds_see_also_link(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nSome content.", + encoding="utf-8", + ) + _add_related_link(wiki, "attention", "new-doc", "paper2.pdf") + text = (concepts / "attention.md").read_text() + assert "[[summaries/new-doc]]" in text + assert "paper2.pdf" in text + + def test_skips_if_already_linked(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nSee also: [[summaries/new-doc]]", + encoding="utf-8", + ) + _add_related_link(wiki, "attention", "new-doc", "paper1.pdf") + text = (concepts / "attention.md").read_text() + assert text.count("[[summaries/new-doc]]") == 1 + + def test_skips_if_file_missing(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + # Should not raise + _add_related_link(wiki, "nonexistent", "doc", "file.pdf") + + def _mock_completion(responses: list[str]): """Create a mock for litellm.completion that returns responses in order.""" call_count = {"n": 0} From fc0857e4109e93a6dff2d9ba21422572bc055813 Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 9 
Apr 2026 21:14:15 +0800 Subject: [PATCH 04/44] feat: concept dedup with briefs, update/related paths, extract _compile_concepts --- openkb/agent/compiler.py | 270 +++++++++++++++++++-------------------- tests/test_compiler.py | 174 ++++++++++++++++++++++++- 2 files changed, 301 insertions(+), 143 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 6830b69..a6f5bdc 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -396,81 +396,67 @@ def _update_index(wiki_dir: Path, doc_name: str, concept_names: list[str]) -> No DEFAULT_COMPILE_CONCURRENCY = 5 -async def compile_short_doc( - doc_name: str, - source_path: Path, +async def _compile_concepts( + wiki_dir: Path, kb_dir: Path, model: str, - max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, + system_msg: dict, + doc_msg: dict, + summary: str, + doc_name: str, + max_concurrency: int, ) -> None: - """Compile a short document using a multi-step LLM pipeline with caching. + """Shared Steps 2-4: concepts plan → generate/update → index. - Step 1: Build base context A (schema + doc content). - Step 2: A → generate summary. - Step 3: A + summary → extract concept list. - Step 4: Concurrent LLM calls (A cached) → generate each concept page. - Step 5: Code writes files, updates index. + Uses ``_CONCEPTS_PLAN_USER`` to get a plan with create/update/related + actions, then executes each action type accordingly. 
""" - from openkb.config import load_config - - openkb_dir = kb_dir / ".openkb" - config = load_config(openkb_dir / "config.yaml") - language: str = config.get("language", "en") - - wiki_dir = kb_dir / "wiki" - schema_md = get_agents_md(wiki_dir) source_file = _find_source_filename(doc_name, kb_dir) - content = source_path.read_text(encoding="utf-8") - # Base context A: system + document - system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format( - schema_md=schema_md, language=language, - )} - doc_msg = {"role": "user", "content": _SUMMARY_USER.format( - doc_name=doc_name, content=content, - )} + # --- Step 2: Get concepts plan (A cached) --- + concept_briefs = _read_concept_briefs(wiki_dir) - # --- Step 1: Generate summary --- - summary = _llm_call(model, [system_msg, doc_msg], "summary") - _write_summary(wiki_dir, doc_name, source_file, summary) - - # --- Step 2: Extract concept list (A cached) --- - _, existing_concepts = _read_wiki_context(wiki_dir) - - concepts_list_raw = _llm_call(model, [ + plan_raw = _llm_call(model, [ system_msg, doc_msg, {"role": "assistant", "content": summary}, - {"role": "user", "content": _CONCEPTS_LIST_USER.format( - existing_concepts=", ".join(existing_concepts) if existing_concepts else "(none yet)", + {"role": "user", "content": _CONCEPTS_PLAN_USER.format( + concept_briefs=concept_briefs, )}, - ], "concepts-list", max_tokens=512) + ], "concepts-plan", max_tokens=1024) try: - concepts_list = _parse_json(concepts_list_raw) + parsed = _parse_json(plan_raw) except (json.JSONDecodeError, ValueError) as exc: - logger.warning("Failed to parse concepts list: %s", exc) - logger.debug("Raw: %s", concepts_list_raw) + logger.warning("Failed to parse concepts plan: %s", exc) + logger.debug("Raw: %s", plan_raw) _update_index(wiki_dir, doc_name, []) return - if not concepts_list: + # Fallback: if LLM returns a flat list, treat all items as "create" + if isinstance(parsed, list): + plan = {"create": parsed, "update": [], "related": []} 
+ else: + plan = { + "create": parsed.get("create", []), + "update": parsed.get("update", []), + "related": parsed.get("related", []), + } + + create_items = plan["create"] + update_items = plan["update"] + related_items = plan["related"] + + if not create_items and not update_items and not related_items: _update_index(wiki_dir, doc_name, []) return - # --- Step 3: Generate concept pages concurrently (A cached) --- + # --- Step 3: Generate/update concept pages concurrently (A cached) --- semaphore = asyncio.Semaphore(max_concurrency) - async def _gen_concept(concept: dict) -> tuple[str, str, bool]: + async def _gen_create(concept: dict) -> tuple[str, str, bool]: name = concept["name"] title = concept.get("title", name) - is_update = concept.get("is_update", False) - update_instruction = ( - "This concept page already exists. Add new information from this document " - "without duplicating existing content." - if is_update else "" - ) - async with semaphore: page_content = await _llm_call_async(model, [ system_msg, @@ -478,45 +464,76 @@ async def _gen_concept(concept: dict) -> tuple[str, str, bool]: {"role": "assistant", "content": summary}, {"role": "user", "content": _CONCEPT_PAGE_USER.format( title=title, doc_name=doc_name, - update_instruction=update_instruction, + update_instruction="", )}, ], f"concept:{name}") + return name, page_content, False - return name, page_content, is_update + async def _gen_update(concept: dict) -> tuple[str, str, bool]: + name = concept["name"] + title = concept.get("title", name) + concept_path = wiki_dir / "concepts" / f"{name}.md" + if concept_path.exists(): + raw_text = concept_path.read_text(encoding="utf-8") + if raw_text.startswith("---"): + parts = raw_text.split("---", 2) + existing_content = parts[2].strip() if len(parts) >= 3 else raw_text + else: + existing_content = raw_text + else: + existing_content = "(page not found — create from scratch)" + async with semaphore: + page_content = await _llm_call_async(model, [ + 
system_msg, + doc_msg, + {"role": "assistant", "content": summary}, + {"role": "user", "content": _CONCEPT_UPDATE_USER.format( + title=title, doc_name=doc_name, + existing_content=existing_content, + )}, + ], f"update:{name}") + return name, page_content, True - sys.stdout.write(f" Generating {len(concepts_list)} concept(s) (concurrency={max_concurrency})...\n") - sys.stdout.flush() + tasks = [] + tasks.extend(_gen_create(c) for c in create_items) + tasks.extend(_gen_update(c) for c in update_items) - results = await asyncio.gather( - *[_gen_concept(c) for c in concepts_list], - return_exceptions=True, - ) + concept_names: list[str] = [] - concept_names = [] - for r in results: - if isinstance(r, Exception): - logger.warning("Concept generation failed: %s", r) - continue - name, page_content, is_update = r - _write_concept(wiki_dir, name, page_content, source_file, is_update) - concept_names.append(name) + if tasks: + total = len(tasks) + sys.stdout.write(f" Generating {total} concept(s) (concurrency={max_concurrency})...\n") + sys.stdout.flush() + + results = await asyncio.gather(*tasks, return_exceptions=True) + + for r in results: + if isinstance(r, Exception): + logger.warning("Concept generation failed: %s", r) + continue + name, page_content, is_update = r + _write_concept(wiki_dir, name, page_content, source_file, is_update) + concept_names.append(name) + + # --- Step 3b: Process related items (code only, no LLM) --- + for slug in related_items: + _add_related_link(wiki_dir, slug, doc_name, source_file) # --- Step 4: Update index (code only) --- _update_index(wiki_dir, doc_name, concept_names) -async def compile_long_doc( +async def compile_short_doc( doc_name: str, - summary_path: Path, - doc_id: str, + source_path: Path, kb_dir: Path, model: str, max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, ) -> None: - """Compile a long (PageIndex) document's concepts and index. + """Compile a short document using a multi-step LLM pipeline with caching. 
- The summary page is already written by the indexer. This function - generates concept pages and updates the index. + Step 1: Build base context A (schema + doc content), generate summary. + Steps 2-4: Delegated to ``_compile_concepts``. """ from openkb.config import load_config @@ -527,84 +544,63 @@ async def compile_long_doc( wiki_dir = kb_dir / "wiki" schema_md = get_agents_md(wiki_dir) source_file = _find_source_filename(doc_name, kb_dir) - summary = summary_path.read_text(encoding="utf-8") + content = source_path.read_text(encoding="utf-8") - # Base context A + # Base context A: system + document system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format( schema_md=schema_md, language=language, )} - doc_msg = {"role": "user", "content": _LONG_DOC_SUMMARY_USER.format( - doc_name=doc_name, doc_id=doc_id, content=summary, + doc_msg = {"role": "user", "content": _SUMMARY_USER.format( + doc_name=doc_name, content=content, )} - # --- Step 1: Extract concept list --- - _, existing_concepts = _read_wiki_context(wiki_dir) - - # Get a concise overview first (for concept generation context) - overview = _llm_call(model, [system_msg, doc_msg], "overview") + # --- Step 1: Generate summary --- + summary = _llm_call(model, [system_msg, doc_msg], "summary") + _write_summary(wiki_dir, doc_name, source_file, summary) - concepts_list_raw = _llm_call(model, [ - system_msg, - doc_msg, - {"role": "assistant", "content": overview}, - {"role": "user", "content": _CONCEPTS_LIST_USER.format( - existing_concepts=", ".join(existing_concepts) if existing_concepts else "(none yet)", - )}, - ], "concepts-list", max_tokens=512) + # --- Steps 2-4: Concept plan → generate/update → index --- + await _compile_concepts( + wiki_dir, kb_dir, model, system_msg, doc_msg, + summary, doc_name, max_concurrency, + ) - try: - concepts_list = _parse_json(concepts_list_raw) - except (json.JSONDecodeError, ValueError) as exc: - logger.warning("Failed to parse concepts list: %s", exc) - 
logger.debug("Raw: %s", concepts_list_raw) - _update_index(wiki_dir, doc_name, []) - return - if not concepts_list: - _update_index(wiki_dir, doc_name, []) - return +async def compile_long_doc( + doc_name: str, + summary_path: Path, + doc_id: str, + kb_dir: Path, + model: str, + max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, +) -> None: + """Compile a long (PageIndex) document's concepts and index. - # --- Step 2: Generate concept pages concurrently --- - semaphore = asyncio.Semaphore(max_concurrency) + The summary page is already written by the indexer. This function + generates concept pages and updates the index. + """ + from openkb.config import load_config - async def _gen_concept(concept: dict) -> tuple[str, str, bool]: - name = concept["name"] - title = concept.get("title", name) - is_update = concept.get("is_update", False) - update_instruction = ( - "This concept page already exists. Add new information." - if is_update else "" - ) + openkb_dir = kb_dir / ".openkb" + config = load_config(openkb_dir / "config.yaml") + language: str = config.get("language", "en") - async with semaphore: - page_content = await _llm_call_async(model, [ - system_msg, - doc_msg, - {"role": "assistant", "content": overview}, - {"role": "user", "content": _CONCEPT_PAGE_USER.format( - title=title, doc_name=doc_name, - update_instruction=update_instruction, - )}, - ], f"concept:{name}") + wiki_dir = kb_dir / "wiki" + schema_md = get_agents_md(wiki_dir) + summary_content = summary_path.read_text(encoding="utf-8") - return name, page_content, is_update + # Base context A + system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format( + schema_md=schema_md, language=language, + )} + doc_msg = {"role": "user", "content": _LONG_DOC_SUMMARY_USER.format( + doc_name=doc_name, doc_id=doc_id, content=summary_content, + )} - sys.stdout.write(f" Generating {len(concepts_list)} concept(s) (concurrency={max_concurrency})...\n") - sys.stdout.flush() + # --- Step 1: Generate overview --- 
+ overview = _llm_call(model, [system_msg, doc_msg], "overview") - results = await asyncio.gather( - *[_gen_concept(c) for c in concepts_list], - return_exceptions=True, + # --- Steps 2-4: Concept plan → generate/update → index --- + await _compile_concepts( + wiki_dir, kb_dir, model, system_msg, doc_msg, + overview, doc_name, max_concurrency, ) - - concept_names = [] - for r in results: - if isinstance(r, Exception): - logger.warning("Concept generation failed: %s", r) - continue - name, page_content, is_update = r - _write_concept(wiki_dir, name, page_content, source_file, is_update) - concept_names.append(name) - - # --- Step 3: Update index (code only) --- - _update_index(wiki_dir, doc_name, concept_names) diff --git a/tests/test_compiler.py b/tests/test_compiler.py index fd5b249..d0903f5 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -10,6 +10,7 @@ from openkb.agent.compiler import ( compile_long_doc, compile_short_doc, + _compile_concepts, _parse_json, _write_summary, _write_concept, @@ -292,9 +293,11 @@ async def test_full_pipeline(self, tmp_path): (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake") summary_response = "# Summary\n\nThis document discusses transformers." - concepts_list_response = json.dumps([ - {"name": "transformer", "title": "Transformer", "is_update": False}, - ]) + concepts_list_response = json.dumps({ + "create": [{"name": "transformer", "title": "Transformer"}], + "update": [], + "related": [], + }) concept_page_response = "# Transformer\n\nA neural network architecture." with patch("openkb.agent.compiler.litellm") as mock_litellm: @@ -364,9 +367,11 @@ async def test_full_pipeline(self, tmp_path): (tmp_path / "raw" / "big-doc.pdf").write_bytes(b"fake") overview_response = "Overview of the big document." 
- concepts_list_response = json.dumps([ - {"name": "deep-learning", "title": "Deep Learning", "is_update": False}, - ]) + concepts_list_response = json.dumps({ + "create": [{"name": "deep-learning", "title": "Deep Learning"}], + "update": [], + "related": [], + }) concept_page_response = "# Deep Learning\n\nA subfield of ML." with patch("openkb.agent.compiler.litellm") as mock_litellm: @@ -387,3 +392,160 @@ async def test_full_pipeline(self, tmp_path): index_text = (wiki / "index.md").read_text() assert "[[summaries/big-doc]]" in index_text assert "[[concepts/deep-learning]]" in index_text + + +class TestCompileConceptsPlan: + """Integration tests for _compile_concepts with the new plan format.""" + + def _setup_wiki(self, tmp_path, existing_concepts=None): + """Helper to set up a wiki directory with optional existing concepts.""" + wiki = tmp_path / "wiki" + (wiki / "summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n", + encoding="utf-8", + ) + (tmp_path / "raw").mkdir(exist_ok=True) + (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake") + + if existing_concepts: + for name, content in existing_concepts.items(): + (wiki / "concepts" / f"{name}.md").write_text( + content, encoding="utf-8", + ) + + return wiki + + @pytest.mark.asyncio + async def test_create_and_update_flow(self, tmp_path): + """Pre-existing 'attention' concept; plan creates 'flash-attention' and updates 'attention'.""" + wiki = self._setup_wiki(tmp_path, existing_concepts={ + "attention": "---\nsources: [old-paper.pdf]\n---\n\n# Attention\n\nOriginal content about attention.", + }) + + plan_response = json.dumps({ + "create": [{"name": "flash-attention", "title": "Flash Attention"}], + "update": [{"name": "attention", "title": "Attention"}], + "related": [], + }) + create_page_response = "# Flash Attention\n\nAn efficient attention algorithm." 
+ update_page_response = "# Attention\n\nUpdated content with new info." + + system_msg = {"role": "system", "content": "You are a wiki agent."} + doc_msg = {"role": "user", "content": "Document about attention mechanisms."} + summary = "Summary of the document." + + call_order = {"n": 0} + + async def ordered_acompletion(*args, **kwargs): + idx = call_order["n"] + call_order["n"] += 1 + mock_resp = MagicMock() + mock_resp.choices = [MagicMock()] + # create tasks come first, then update tasks + if idx == 0: + mock_resp.choices[0].message.content = create_page_response + else: + mock_resp.choices[0].message.content = update_page_response + mock_resp.usage = MagicMock(prompt_tokens=100, completion_tokens=50) + mock_resp.usage.prompt_tokens_details = None + return mock_resp + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([plan_response]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=ordered_acompletion + ) + await _compile_concepts( + wiki, tmp_path, "gpt-4o-mini", system_msg, doc_msg, + summary, "test-doc", 5, + ) + + # Verify flash-attention created + fa_path = wiki / "concepts" / "flash-attention.md" + assert fa_path.exists() + fa_text = fa_path.read_text() + assert "sources: [test-doc.pdf]" in fa_text + assert "Flash Attention" in fa_text + + # Verify attention updated (is_update=True path in _write_concept) + att_path = wiki / "concepts" / "attention.md" + assert att_path.exists() + att_text = att_path.read_text() + assert "test-doc.pdf" in att_text + assert "old-paper.pdf" in att_text + + # Verify index updated + index_text = (wiki / "index.md").read_text() + assert "[[concepts/flash-attention]]" in index_text + assert "[[concepts/attention]]" in index_text + + @pytest.mark.asyncio + async def test_related_adds_link_no_llm(self, tmp_path): + """Plan has only related items. 
No acompletion calls should be made.""" + wiki = self._setup_wiki(tmp_path, existing_concepts={ + "transformer": "---\nsources: [old.pdf]\n---\n\n# Transformer\n\nContent about transformers.", + }) + + plan_response = json.dumps({ + "create": [], + "update": [], + "related": ["transformer"], + }) + + system_msg = {"role": "system", "content": "You are a wiki agent."} + doc_msg = {"role": "user", "content": "Document content."} + summary = "Summary." + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([plan_response]) + ) + mock_litellm.acompletion = AsyncMock() + await _compile_concepts( + wiki, tmp_path, "gpt-4o-mini", system_msg, doc_msg, + summary, "test-doc", 5, + ) + # acompletion should never be called — related is code-only + mock_litellm.acompletion.assert_not_called() + + # Verify link added to transformer page + transformer_text = (wiki / "concepts" / "transformer.md").read_text() + assert "[[summaries/test-doc]]" in transformer_text + assert "test-doc.pdf" in transformer_text + + @pytest.mark.asyncio + async def test_fallback_list_format(self, tmp_path): + """LLM returns a flat array instead of dict — treated as all create.""" + wiki = self._setup_wiki(tmp_path) + + plan_response = json.dumps([ + {"name": "attention", "title": "Attention"}, + ]) + concept_page_response = "# Attention\n\nA mechanism for focusing." + + system_msg = {"role": "system", "content": "You are a wiki agent."} + doc_msg = {"role": "user", "content": "Document content."} + summary = "Summary." 
+ + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([plan_response]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([concept_page_response]) + ) + await _compile_concepts( + wiki, tmp_path, "gpt-4o-mini", system_msg, doc_msg, + summary, "test-doc", 5, + ) + + # Verify concept was created (not updated) + att_path = wiki / "concepts" / "attention.md" + assert att_path.exists() + att_text = att_path.read_text() + assert "sources: [test-doc.pdf]" in att_text + assert "Attention" in att_text From 4249d5374b4e6f86b7bbb0c16dcc5781883f0bae Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 9 Apr 2026 21:16:27 +0800 Subject: [PATCH 05/44] chore: update compiler docstring, remove dead _CONCEPTS_LIST_USER --- openkb/agent/compiler.py | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index a6f5bdc..326708a 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -3,9 +3,9 @@ Pipeline leveraging LLM prompt caching: Step 1: Build base context A (schema + document content). Step 2: A → generate summary. - Step 3: A + summary → extract concept list. - Step 4: Concurrent LLM calls (A cached) → generate each concept page. - Step 5: Code writes all files, updates index, appends log. + Step 3: A + summary → concepts plan (create/update/related). + Step 4: Concurrent LLM calls (A cached) → generate new + rewrite updated concepts. + Step 5: Code adds cross-ref links to related concepts, updates index. """ from __future__ import annotations @@ -50,24 +50,6 @@ Return ONLY the Markdown content (no frontmatter, no code fences). """ -_CONCEPTS_LIST_USER = """\ -Based on the summary above, identify the key concepts worth creating as \ -standalone wiki concept pages. 
- -Existing concept pages: {existing_concepts} - -Return a JSON array of objects, each with: -- "name": concept slug (e.g. "transformer-architecture") -- "title": human-readable title (e.g. "Transformer Architecture") -- "is_update": true if this concept already exists and should be updated - -Only include concepts for significant themes. For the first document, \ -create 2-3 foundational concepts at most. Do NOT create concepts that are \ -just the document topic itself (e.g. don't create "machine-translation" \ -for a translation paper). - -Return ONLY valid JSON array, no fences, no explanation. -""" _CONCEPTS_PLAN_USER = """\ Based on the summary above, decide how to update the wiki's concept pages. From 1a28c11f24a8412bb709c79c180e9502c38dcf45 Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 9 Apr 2026 21:24:48 +0800 Subject: [PATCH 06/44] =?UTF-8?q?fix:=20code=20review=20fixes=20=E2=80=94?= =?UTF-8?q?=20security,=20robustness,=20tests,=20and=20CI=20hardening?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Restore markitdown[all] extras for docx/pptx/xlsx support - Sanitize concept names to prevent path traversal in compiler - Add path traversal guard in copy_relative_images - Fix _write_concept duplicate append when frontmatter lacks sources key - Remove dead write_wiki_files function - Fix watcher thread race in _schedule_flush - Warn when unimplemented --fix flag is used in lint command - Harden CI publish workflow with environment gate and SHA-pinned actions - Fix test_indexer to actually assert IndexConfig flag values - Fix test_converter to test correct PDF code path (pymupdf, not markitdown) - Use str.find() instead of str.index() in frontmatter parsing to avoid ValueError --- .github/workflows/publish.yml | 7 ++++--- openkb/agent/compiler.py | 34 ++++++++++++++++--------------- openkb/agent/tools.py | 38 ----------------------------------- openkb/cli.py | 4 +++- openkb/images.py | 5 ++++- 
openkb/watcher.py | 11 +++++----- pyproject.toml | 2 +- tests/test_converter.py | 11 ++++------ tests/test_indexer.py | 11 +++++----- 9 files changed, 46 insertions(+), 77 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 78fd0e0..17b26c2 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -8,12 +8,13 @@ on: jobs: publish: runs-on: ubuntu-latest + environment: pypi permissions: id-token: write steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.2.2 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: "3.12" @@ -24,4 +25,4 @@ jobs: run: python -m build - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 + uses: pypa/gh-action-pypi-publish@fb13cb306901256ace3dab689990e13a5550ffaa # release/v1.11.0 diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 326708a..9119b03 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -302,14 +302,15 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is existing = path.read_text(encoding="utf-8") if source_file not in existing: if existing.startswith("---"): - end = existing.index("---", 3) - fm = existing[:end + 3] - body = existing[end + 3:] - if "sources:" in fm: - fm = fm.replace("sources: [", f"sources: [{source_file}, ") - else: - fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1) - existing = fm + body + end = existing.find("---", 3) + if end != -1: + fm = existing[:end + 3] + body = existing[end + 3:] + if "sources:" in fm: + fm = fm.replace("sources: [", f"sources: [{source_file}, ") + else: + fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1) + existing = fm + body else: existing = f"---\nsources: [{source_file}]\n---\n\n" + existing existing += f"\n\n{content}" @@ -334,14 +335,15 @@ def 
_add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_f # Update sources in frontmatter if source_file not in text: if text.startswith("---"): - end = text.index("---", 3) - fm = text[:end + 3] - body = text[end + 3:] - if "sources:" in fm: - fm = fm.replace("sources: [", f"sources: [{source_file}, ") - else: - fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1) - text = fm + body + end = text.find("---", 3) + if end != -1: + fm = text[:end + 3] + body = text[end + 3:] + if "sources:" in fm: + fm = fm.replace("sources: [", f"sources: [{source_file}, ") + else: + fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1) + text = fm + body else: text = f"---\nsources: [{source_file}]\n---\n\n" + text diff --git a/openkb/agent/tools.py b/openkb/agent/tools.py index 40875f3..185344b 100644 --- a/openkb/agent/tools.py +++ b/openkb/agent/tools.py @@ -73,41 +73,3 @@ def write_wiki_file(path: str, content: str, wiki_root: str) -> str: full_path.write_text(content, encoding="utf-8") return f"Written: {path}" - -def write_wiki_files(files_json: str, wiki_root: str) -> str: - """Write multiple Markdown files to the wiki in one call. - - Args: - files_json: JSON array of objects, each with ``"path"`` and ``"content"`` keys. - Example: ``[{"path": "concepts/foo.md", "content": "# Foo\\n..."}]`` - wiki_root: Absolute path to the wiki root directory. - - Returns: - Summary of written files, or error message on failure. - """ - import json - - try: - files = json.loads(files_json) - except json.JSONDecodeError as exc: - return f"Invalid JSON: {exc}" - - if not isinstance(files, list): - return "Expected a JSON array of {path, content} objects." 
- - root = Path(wiki_root).resolve() - written: list[str] = [] - for entry in files: - path = entry.get("path", "") - content = entry.get("content", "") - if not path: - continue - full_path = (root / path).resolve() - if not full_path.is_relative_to(root): - written.append(f"Skipped (path escape): {path}") - continue - full_path.parent.mkdir(parents=True, exist_ok=True) - full_path.write_text(content, encoding="utf-8") - written.append(path) - - return f"Written {len(written)} files: {', '.join(written)}" diff --git a/openkb/cli.py b/openkb/cli.py index 388ac87..149f391 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -334,9 +334,11 @@ def on_new_files(paths): @cli.command() -@click.option("--fix", is_flag=True, default=False, help="Automatically fix lint issues.") # TODO: --fix not yet implemented +@click.option("--fix", is_flag=True, default=False, help="Automatically fix lint issues (not yet implemented).") def lint(fix): """Lint the knowledge base for structural and semantic inconsistencies.""" + if fix: + click.echo("Warning: --fix is not yet implemented. Running lint in report-only mode.") kb_dir = _find_kb_dir() if kb_dir is None: click.echo("No knowledge base found. 
Run `openkb init` first.") diff --git a/openkb/images.py b/openkb/images.py index 80ef37f..d72cec7 100644 --- a/openkb/images.py +++ b/openkb/images.py @@ -171,7 +171,10 @@ def copy_relative_images( for match in _RELATIVE_RE.finditer(markdown): alt, rel_path = match.group(1), match.group(2) - src = source_dir / rel_path + src = (source_dir / rel_path).resolve() + if not src.is_relative_to(source_dir.resolve()): + logger.warning("Image path escapes source dir: %s; skipping.", rel_path) + continue if not src.exists(): logger.warning( "Relative image not found: %s; leaving original link.", src diff --git a/openkb/watcher.py b/openkb/watcher.py index 77fdf24..2a0fae9 100644 --- a/openkb/watcher.py +++ b/openkb/watcher.py @@ -37,11 +37,12 @@ def __init__(self, callback: Callable[[list[str]], None], debounce_seconds: floa def _schedule_flush(self) -> None: """Cancel any existing timer and start a fresh debounce timer.""" - if self._timer is not None: - self._timer.cancel() - self._timer = threading.Timer(self._debounce_seconds, self._flush) - self._timer.daemon = True - self._timer.start() + with self._lock: + if self._timer is not None: + self._timer.cancel() + self._timer = threading.Timer(self._debounce_seconds, self._flush) + self._timer.daemon = True + self._timer.start() def _flush(self) -> None: """Call the callback with all collected pending paths, then clear.""" diff --git a/pyproject.toml b/pyproject.toml index eb1cdde..393cbd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ classifiers = [ keywords = ["ai", "rag", "retrieval", "knowledge-base", "llm", "pageindex", "agents", "document"] dependencies = [ "pageindex==0.3.0.dev0", - "markitdown", + "markitdown[all]", "click>=8.0", "watchdog>=3.0", "litellm", diff --git a/tests/test_converter.py b/tests/test_converter.py index 5efb6eb..6c184fd 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -81,27 +81,24 @@ def test_md_raw_file_copied(self, kb_dir): class 
TestConvertDocumentPdfShort: - def test_short_pdf_converted_via_markitdown(self, kb_dir, tmp_path): - """PDF under threshold is converted with markitdown.""" + def test_short_pdf_converted_via_pymupdf(self, kb_dir, tmp_path): + """PDF under threshold is converted with pymupdf (convert_pdf_with_images).""" src = tmp_path / "short.pdf" src.write_bytes(b"%PDF-1.4 fake content") - fake_result = MagicMock() - fake_result.text_content = "# Short PDF\n\nConverted content." - with ( patch("openkb.converter.pymupdf.open") as mock_mu, - patch("openkb.converter.MarkItDown") as mock_mid_cls, + patch("openkb.converter.convert_pdf_with_images", return_value="# Short PDF\n\nConverted.") as mock_cpwi, ): fake_doc = MagicMock() fake_doc.page_count = 5 # below default threshold of 20 fake_doc.__enter__ = MagicMock(return_value=fake_doc) fake_doc.__exit__ = MagicMock(return_value=False) mock_mu.return_value = fake_doc - mock_mid_cls.return_value.convert.return_value = fake_result result = convert_document(src, kb_dir) + mock_cpwi.assert_called_once() assert result.skipped is False assert result.is_long_doc is False assert result.source_path is not None diff --git a/tests/test_indexer.py b/tests/test_indexer.py index c9c7101..e35c969 100644 --- a/tests/test_indexer.py +++ b/tests/test_indexer.py @@ -95,10 +95,11 @@ def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_pat with patch("openkb.indexer.PageIndexClient", return_value=fake_client) as mock_cls: index_long_document(pdf_path, kb_dir) - # Verify PageIndexClient was instantiated + # Verify PageIndexClient was instantiated with correct IndexConfig mock_cls.assert_called_once() - # Check that index_config with correct flags was passed _, kwargs = mock_cls.call_args - ic = kwargs.get("index_config") or mock_cls.call_args[0][0] if mock_cls.call_args[0] else None - # Either as positional or keyword — either way PageIndexClient was called - assert mock_cls.called + ic = kwargs.get("index_config") + assert ic is 
not None, "index_config must be passed to PageIndexClient" + assert ic.if_add_node_text is True + assert ic.if_add_node_summary is True + assert ic.if_add_doc_description is True From 4b891fa0db1df37eb822842ede0ef5114a234b60 Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 9 Apr 2026 23:07:38 +0800 Subject: [PATCH 07/44] feat: bidirectional backlinks between summaries and concepts - Add _backlink_summary: ensures summary pages link to all related concepts - Add _backlink_concepts: ensures concept pages link back to source summaries - _update_index auto-creates index.md if missing - Both merge into existing sections instead of duplicating --- openkb/agent/compiler.py | 65 ++++++++++++++++++++++- tests/test_compiler.py | 110 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 174 insertions(+), 1 deletion(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 9119b03..5075278 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -351,11 +351,68 @@ def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_f path.write_text(text, encoding="utf-8") +def _backlink_summary(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None: + """Append missing concept wikilinks to the summary page (no LLM call). + + After all concepts are generated, this ensures the summary page links + back to every related concept — closing the bidirectional link that + concept pages already have toward the summary. + + If a ``## Related Concepts`` section already exists, new links are + appended into it rather than creating a duplicate section. 
+ """ + summary_path = wiki_dir / "summaries" / f"{doc_name}.md" + if not summary_path.exists(): + return + + text = summary_path.read_text(encoding="utf-8") + missing = [slug for slug in concept_slugs if f"[[concepts/{slug}]]" not in text] + if not missing: + return + + new_links = "\n".join(f"- [[concepts/{s}]]" for s in missing) + if "## Related Concepts" in text: + # Append into existing section + text = text.replace("## Related Concepts\n", f"## Related Concepts\n{new_links}\n", 1) + else: + text += f"\n\n## Related Concepts\n{new_links}\n" + summary_path.write_text(text, encoding="utf-8") + + +def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None: + """Append missing summary wikilink to each concept page (no LLM call). + + Ensures every concept page links back to the source document's summary, + regardless of whether the LLM included the link in its output. + + If a ``## Related Documents`` section already exists, the link is + appended into it rather than creating a duplicate section. 
+ """ + link = f"[[summaries/{doc_name}]]" + concepts_dir = wiki_dir / "concepts" + + for slug in concept_slugs: + path = concepts_dir / f"{slug}.md" + if not path.exists(): + continue + text = path.read_text(encoding="utf-8") + if link in text: + continue + if "## Related Documents" in text: + text = text.replace("## Related Documents\n", f"## Related Documents\n- {link}\n", 1) + else: + text += f"\n\n## Related Documents\n- {link}\n" + path.write_text(text, encoding="utf-8") + + def _update_index(wiki_dir: Path, doc_name: str, concept_names: list[str]) -> None: """Append document and concept entries to index.md.""" index_path = wiki_dir / "index.md" if not index_path.exists(): - return + index_path.write_text( + "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) text = index_path.read_text(encoding="utf-8") @@ -503,6 +560,12 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool]: for slug in related_items: _add_related_link(wiki_dir, slug, doc_name, source_file) + # --- Step 3c: Backlink — summary ↔ concepts (code only) --- + all_concept_slugs = concept_names + [s for s in related_items] + if all_concept_slugs: + _backlink_summary(wiki_dir, doc_name, all_concept_slugs) + _backlink_concepts(wiki_dir, doc_name, all_concept_slugs) + # --- Step 4: Update index (code only) --- _update_index(wiki_dir, doc_name, concept_names) diff --git a/tests/test_compiler.py b/tests/test_compiler.py index d0903f5..e1238df 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -18,6 +18,8 @@ _read_wiki_context, _read_concept_briefs, _add_related_link, + _backlink_summary, + _backlink_concepts, ) @@ -207,6 +209,114 @@ def test_sorted_alphabetically(self, tmp_path): assert slugs == ["apple", "mango", "zebra"] +class TestBacklinkSummary: + def test_adds_missing_concept_links(self, tmp_path): + wiki = tmp_path / "wiki" + summaries = wiki / "summaries" + summaries.mkdir(parents=True) + (summaries / 
"paper.md").write_text( + "---\nsources: [paper.pdf]\n---\n\n# Summary\n\nContent about attention.", + encoding="utf-8", + ) + _backlink_summary(wiki, "paper", ["attention", "transformer"]) + text = (summaries / "paper.md").read_text() + assert "[[concepts/attention]]" in text + assert "[[concepts/transformer]]" in text + + def test_skips_already_linked(self, tmp_path): + wiki = tmp_path / "wiki" + summaries = wiki / "summaries" + summaries.mkdir(parents=True) + (summaries / "paper.md").write_text( + "---\nsources: [paper.pdf]\n---\n\n# Summary\n\nSee [[concepts/attention]].", + encoding="utf-8", + ) + _backlink_summary(wiki, "paper", ["attention", "transformer"]) + text = (summaries / "paper.md").read_text() + # attention already linked, should not duplicate + assert text.count("[[concepts/attention]]") == 1 + # transformer should be added + assert "[[concepts/transformer]]" in text + + def test_no_op_when_all_linked(self, tmp_path): + wiki = tmp_path / "wiki" + summaries = wiki / "summaries" + summaries.mkdir(parents=True) + original = "# Summary\n\n[[concepts/attention]] and [[concepts/transformer]]" + (summaries / "paper.md").write_text(original, encoding="utf-8") + _backlink_summary(wiki, "paper", ["attention", "transformer"]) + assert (summaries / "paper.md").read_text() == original + + def test_skips_if_file_missing(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + # Should not raise + _backlink_summary(wiki, "nonexistent", ["attention"]) + + def test_merges_into_existing_section(self, tmp_path): + """Second add should merge into existing ## Related Concepts, not duplicate.""" + wiki = tmp_path / "wiki" + summaries = wiki / "summaries" + summaries.mkdir(parents=True) + (summaries / "paper.md").write_text( + "# Summary\n\nContent.\n\n## Related Concepts\n- [[concepts/attention]]\n", + encoding="utf-8", + ) + _backlink_summary(wiki, "paper", ["attention", "transformer"]) + text = (summaries / "paper.md").read_text() + assert text.count("## Related 
Concepts") == 1 + assert "[[concepts/transformer]]" in text + assert text.count("[[concepts/attention]]") == 1 + + +class TestBacklinkConcepts: + def test_adds_summary_link_to_concept(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper.pdf]\n---\n\n# Attention\n\nContent.", + encoding="utf-8", + ) + _backlink_concepts(wiki, "paper", ["attention"]) + text = (concepts / "attention.md").read_text() + assert "[[summaries/paper]]" in text + assert "## Related Documents" in text + + def test_skips_if_already_linked(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "# Attention\n\nBased on [[summaries/paper]].", + encoding="utf-8", + ) + _backlink_concepts(wiki, "paper", ["attention"]) + text = (concepts / "attention.md").read_text() + assert text.count("[[summaries/paper]]") == 1 + assert "## Related Documents" not in text + + def test_merges_into_existing_section(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "# Attention\n\n## Related Documents\n- [[summaries/old-paper]]\n", + encoding="utf-8", + ) + _backlink_concepts(wiki, "new-paper", ["attention"]) + text = (concepts / "attention.md").read_text() + assert text.count("## Related Documents") == 1 + assert "[[summaries/old-paper]]" in text + assert "[[summaries/new-paper]]" in text + + def test_skips_missing_concept_file(self, tmp_path): + wiki = tmp_path / "wiki" + (wiki / "concepts").mkdir(parents=True) + # Should not raise + _backlink_concepts(wiki, "paper", ["nonexistent"]) + + class TestAddRelatedLink: def test_adds_see_also_link(self, tmp_path): wiki = tmp_path / "wiki" From 072d9f557e66dfa3f4b50bccc52eb96466e8d434 Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 9 Apr 2026 23:31:38 +0800 
Subject: [PATCH 08/44] docs: specs and plans for concept dedup and retrieve redesign --- .../2026-04-09-concept-dedup-and-update.md | 888 +++++++++++++ .../plans/2026-04-09-retrieve-redesign.md | 1104 +++++++++++++++++ ...6-04-09-concept-dedup-and-update-design.md | 163 +++ .../specs/2026-04-09-retrieve-redesign.md | 262 ++++ 4 files changed, 2417 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md create mode 100644 docs/superpowers/plans/2026-04-09-retrieve-redesign.md create mode 100644 docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md create mode 100644 docs/superpowers/specs/2026-04-09-retrieve-redesign.md diff --git a/docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md b/docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md new file mode 100644 index 0000000..1a312a6 --- /dev/null +++ b/docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md @@ -0,0 +1,888 @@ +# Concept Dedup & Existing Page Update — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Give the compiler enough context about existing concepts to make smart dedup/update decisions, and add the ability to rewrite existing concept pages with new information — all without breaking prompt caching. + +**Architecture:** Extend the deterministic pipeline in `compiler.py` with: (1) concept briefs read from disk before the concepts-plan LLM call, (2) a new JSON output format with create/update/related actions, (3) a new concurrent "update" path that sends existing page content to the LLM for rewriting, (4) a code-only "related" path for cross-ref links. Extract shared logic between `compile_short_doc` and `compile_long_doc` into `_compile_concepts`. 
+ +**Tech Stack:** Python, litellm, asyncio, pytest + +--- + +### Task 1: Add `_read_concept_briefs` and test + +**Files:** +- Modify: `openkb/agent/compiler.py:199-207` (File I/O helpers section) +- Modify: `tests/test_compiler.py:98-116` (TestReadWikiContext section) + +- [ ] **Step 1: Write the failing test** + +Add to `tests/test_compiler.py`: + +```python +from openkb.agent.compiler import _read_concept_briefs + +class TestReadConceptBriefs: + def test_empty_wiki(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + assert _read_concept_briefs(wiki) == "(none yet)" + + def test_no_concepts_dir(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + assert _read_concept_briefs(wiki) == "(none yet)" + + def test_reads_briefs_with_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper.pdf]\n---\n\nAttention allows models to focus on relevant input parts selectively.", + encoding="utf-8", + ) + result = _read_concept_briefs(wiki) + assert "- attention: Attention allows models" in result + + def test_reads_briefs_without_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "rnn.md").write_text( + "Recurrent neural networks process sequences step by step.", + encoding="utf-8", + ) + result = _read_concept_briefs(wiki) + assert "- rnn: Recurrent neural networks" in result + + def test_truncates_long_content(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "long.md").write_text("A" * 300, encoding="utf-8") + result = _read_concept_briefs(wiki) + brief_line = result.split("\n")[0] + # slug + ": " + 150 chars = well under 200 + assert len(brief_line) < 200 + + def test_sorted_alphabetically(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + 
concepts.mkdir(parents=True) + (concepts / "zebra.md").write_text("Zebra concept.", encoding="utf-8") + (concepts / "alpha.md").write_text("Alpha concept.", encoding="utf-8") + result = _read_concept_briefs(wiki) + lines = result.strip().split("\n") + assert lines[0].startswith("- alpha:") + assert lines[1].startswith("- zebra:") +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `pytest tests/test_compiler.py::TestReadConceptBriefs -v` +Expected: FAIL with `ImportError: cannot import name '_read_concept_briefs'` + +- [ ] **Step 3: Implement `_read_concept_briefs`** + +Add to `openkb/agent/compiler.py` in the File I/O helpers section (after `_read_wiki_context`): + +```python +def _read_concept_briefs(wiki_dir: Path) -> str: + """Read existing concept pages and return compact briefs for the LLM. + + Returns a string like: + - attention: Attention allows models to focus on relevant input parts... + - transformer: The Transformer is a neural network architecture... + + Or "(none yet)" if no concept pages exist. 
+ """ + concepts_dir = wiki_dir / "concepts" + if not concepts_dir.exists(): + return "(none yet)" + briefs = [] + for p in sorted(concepts_dir.glob("*.md")): + text = p.read_text(encoding="utf-8") + # Skip YAML frontmatter + if text.startswith("---"): + parts = text.split("---", 2) + body = parts[2].strip() if len(parts) >= 3 else "" + else: + body = text.strip() + brief = body[:150].replace("\n", " ") + if brief: + briefs.append(f"- {p.stem}: {brief}") + return "\n".join(briefs) or "(none yet)" +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `pytest tests/test_compiler.py::TestReadConceptBriefs -v` +Expected: All 6 tests PASS + +- [ ] **Step 5: Update the import in test file** + +Add `_read_concept_briefs` to the existing import block at the top of `tests/test_compiler.py`: + +```python +from openkb.agent.compiler import ( + compile_long_doc, + compile_short_doc, + _parse_json, + _write_summary, + _write_concept, + _update_index, + _read_wiki_context, + _read_concept_briefs, +) +``` + +- [ ] **Step 6: Commit** + +```bash +git add openkb/agent/compiler.py tests/test_compiler.py +git commit -m "feat: add _read_concept_briefs for concept dedup context" +``` + +--- + +### Task 2: Replace prompt template and update JSON parsing + +**Files:** +- Modify: `openkb/agent/compiler.py:53-70` (prompt templates section) +- Modify: `tests/test_compiler.py:21-31` (TestParseJson section) + +- [ ] **Step 1: Write the failing test for new JSON format** + +Add to `tests/test_compiler.py`: + +```python +class TestParseConceptsPlan: + def test_dict_format(self): + text = json.dumps({ + "create": [{"name": "foo", "title": "Foo"}], + "update": [{"name": "bar", "title": "Bar"}], + "related": ["baz"], + }) + parsed = _parse_json(text) + assert isinstance(parsed, dict) + assert len(parsed["create"]) == 1 + assert len(parsed["update"]) == 1 + assert parsed["related"] == ["baz"] + + def test_fallback_list_format(self): + """If LLM returns old flat array, _parse_json still 
works.""" + text = json.dumps([{"name": "foo", "title": "Foo"}]) + parsed = _parse_json(text) + assert isinstance(parsed, list) + + def test_fenced_dict(self): + text = '```json\n{"create": [], "update": [], "related": []}\n```' + parsed = _parse_json(text) + assert isinstance(parsed, dict) + assert parsed["create"] == [] +``` + +- [ ] **Step 2: Run test to verify it passes (these use existing `_parse_json`)** + +Run: `pytest tests/test_compiler.py::TestParseConceptsPlan -v` +Expected: All 3 PASS — `_parse_json` already handles dicts. This confirms compatibility. + +- [ ] **Step 3: Replace `_CONCEPTS_LIST_USER` with `_CONCEPTS_PLAN_USER`** + +In `openkb/agent/compiler.py`, replace the `_CONCEPTS_LIST_USER` template (lines 53-70) with: + +```python +_CONCEPTS_PLAN_USER = """\ +Based on the summary above, decide how to update the wiki's concept pages. + +Existing concept pages: +{concept_briefs} + +Return a JSON object with three keys: + +1. "create" — new concepts not covered by any existing page. Array of objects: + {{"name": "concept-slug", "title": "Human-Readable Title"}} + +2. "update" — existing concepts that have significant new information from \ +this document worth integrating. Array of objects: + {{"name": "existing-slug", "title": "Existing Title"}} + +3. "related" — existing concepts tangentially related to this document but \ +not needing content changes, just a cross-reference link. Array of slug strings. + +Rules: +- For the first few documents, create 2-3 foundational concepts at most. +- Do NOT create a concept that overlaps with an existing one — use "update". +- Do NOT create concepts that are just the document topic itself. +- "related" is for lightweight cross-linking only, no content rewrite needed. + +Return ONLY valid JSON, no fences, no explanation. 
+""" +``` + +- [ ] **Step 4: Add `_CONCEPT_UPDATE_USER` template** + +Add after `_CONCEPT_PAGE_USER` (after line 82): + +```python +_CONCEPT_UPDATE_USER = """\ +Update the concept page for: {title} + +Current content of this page: +{existing_content} + +New information from document "{doc_name}" (summarized above) should be \ +integrated into this page. Rewrite the full page incorporating the new \ +information naturally — do not just append. Maintain existing \ +[[wikilinks]] and add new ones where appropriate. + +Return ONLY the Markdown content (no frontmatter, no code fences). +""" +``` + +- [ ] **Step 5: Run all existing tests to verify nothing breaks** + +Run: `pytest tests/test_compiler.py -v` +Expected: All PASS (templates aren't tested directly, only via integration tests which we'll update later) + +- [ ] **Step 6: Commit** + +```bash +git add openkb/agent/compiler.py tests/test_compiler.py +git commit -m "feat: add concepts plan and update prompt templates" +``` + +--- + +### Task 3: Add `_add_related_link` and test + +**Files:** +- Modify: `openkb/agent/compiler.py` (File I/O helpers section, after `_write_concept`) +- Modify: `tests/test_compiler.py` + +- [ ] **Step 1: Write the failing test** + +Add to `tests/test_compiler.py`: + +```python +from openkb.agent.compiler import _add_related_link + +class TestAddRelatedLink: + def test_adds_see_also_link(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nSome content.", + encoding="utf-8", + ) + _add_related_link(wiki, "attention", "new-doc", "paper2.pdf") + text = (concepts / "attention.md").read_text() + assert "[[summaries/new-doc]]" in text + assert "paper2.pdf" in text + + def test_skips_if_already_linked(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + 
"---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nSee also: [[summaries/new-doc]]", + encoding="utf-8", + ) + _add_related_link(wiki, "attention", "new-doc", "paper1.pdf") + text = (concepts / "attention.md").read_text() + # Should not duplicate + assert text.count("[[summaries/new-doc]]") == 1 + + def test_skips_if_file_missing(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + # Should not raise + _add_related_link(wiki, "nonexistent", "doc", "file.pdf") +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `pytest tests/test_compiler.py::TestAddRelatedLink -v` +Expected: FAIL with `ImportError: cannot import name '_add_related_link'` + +- [ ] **Step 3: Implement `_add_related_link`** + +Add to `openkb/agent/compiler.py` after `_write_concept`: + +```python +def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_file: str) -> None: + """Add a cross-reference link to an existing concept page (no LLM call).""" + concepts_dir = wiki_dir / "concepts" + path = concepts_dir / f"{concept_slug}.md" + if not path.exists(): + return + + text = path.read_text(encoding="utf-8") + link = f"[[summaries/{doc_name}]]" + if link in text: + return + + # Update sources in frontmatter + if source_file not in text: + if text.startswith("---"): + end = text.index("---", 3) + fm = text[:end + 3] + body = text[end + 3:] + if "sources:" in fm: + fm = fm.replace("sources: [", f"sources: [{source_file}, ") + else: + fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1) + text = fm + body + else: + text = f"---\nsources: [{source_file}]\n---\n\n" + text + + text += f"\n\nSee also: {link}" + path.write_text(text, encoding="utf-8") +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `pytest tests/test_compiler.py::TestAddRelatedLink -v` +Expected: All 3 tests PASS + +- [ ] **Step 5: Update the import in test file** + +Add `_add_related_link` to the import block at top of `tests/test_compiler.py`. 
+ +- [ ] **Step 6: Commit** + +```bash +git add openkb/agent/compiler.py tests/test_compiler.py +git commit -m "feat: add _add_related_link for code-only cross-referencing" +``` + +--- + +### Task 4: Extract `_compile_concepts` and refactor both public functions + +**Files:** +- Modify: `openkb/agent/compiler.py:290-509` (Public API section — full rewrite) +- Modify: `tests/test_compiler.py:153-267` (integration tests) + +This is the core task. It extracts the shared Steps 2-4 into `_compile_concepts`, updates both public functions to call it, and switches to the new concepts plan format. + +- [ ] **Step 1: Write integration test for new create/update/related flow** + +Add to `tests/test_compiler.py`: + +```python +class TestCompileConceptsPlan: + """Integration tests for the new create/update/related flow.""" + + @pytest.mark.asyncio + async def test_create_and_update_flow(self, tmp_path): + """New doc creates one concept and updates an existing one.""" + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / "summaries").mkdir(parents=True) + concepts_dir = wiki / "concepts" + concepts_dir.mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + # Pre-existing concept + (concepts_dir / "attention.md").write_text( + "---\nsources: [old-paper.pdf]\n---\n\n# Attention\n\nOld content about attention.", + encoding="utf-8", + ) + + source_path = wiki / "sources" / "new-paper.md" + source_path.write_text("# New Paper\n\nContent about flash attention and transformers.", encoding="utf-8") + (tmp_path / ".openkb").mkdir() + (tmp_path / "raw").mkdir() + (tmp_path / "raw" / "new-paper.pdf").write_bytes(b"fake") + + summary_resp = "This paper introduces flash attention, improving on attention mechanisms." 
+ plan_resp = json.dumps({ + "create": [{"name": "flash-attention", "title": "Flash Attention"}], + "update": [{"name": "attention", "title": "Attention Mechanism"}], + "related": [], + }) + create_page_resp = "# Flash Attention\n\nAn efficient attention algorithm." + update_page_resp = "# Attention\n\nUpdated content with flash attention details." + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_resp, plan_resp]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([create_page_resp, update_page_resp]) + ) + await compile_short_doc("new-paper", source_path, tmp_path, "gpt-4o-mini") + + # New concept created + flash_path = concepts_dir / "flash-attention.md" + assert flash_path.exists() + assert "sources: [new-paper.pdf]" in flash_path.read_text() + + # Existing concept rewritten (not appended) + attn_text = (concepts_dir / "attention.md").read_text() + assert "new-paper.pdf" in attn_text + assert "Updated content with flash attention details" in attn_text + + # Index updated for both + index_text = (wiki / "index.md").read_text() + assert "[[concepts/flash-attention]]" in index_text + + @pytest.mark.asyncio + async def test_related_adds_link_no_llm(self, tmp_path): + """Related concepts get cross-ref links without LLM calls.""" + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / "summaries").mkdir(parents=True) + concepts_dir = wiki / "concepts" + concepts_dir.mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + (concepts_dir / "transformer.md").write_text( + "---\nsources: [old.pdf]\n---\n\n# Transformer\n\nArchitecture details.", + encoding="utf-8", + ) + + source_path = wiki / "sources" / "doc.md" + source_path.write_text("Content", encoding="utf-8") + (tmp_path / ".openkb").mkdir() + (tmp_path / "raw").mkdir() + (tmp_path / 
"raw" / "doc.pdf").write_bytes(b"fake") + + summary_resp = "A short summary." + plan_resp = json.dumps({ + "create": [], + "update": [], + "related": ["transformer"], + }) + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_resp, plan_resp]) + ) + # acompletion should NOT be called (no create/update) + mock_litellm.acompletion = AsyncMock(side_effect=AssertionError("should not be called")) + await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini") + + # Related concept should have cross-ref link + transformer_text = (concepts_dir / "transformer.md").read_text() + assert "[[summaries/doc]]" in transformer_text + + @pytest.mark.asyncio + async def test_fallback_list_format(self, tmp_path): + """If LLM returns old flat array, treat all as create.""" + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / "summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + source_path = wiki / "sources" / "doc.md" + source_path.write_text("Content", encoding="utf-8") + (tmp_path / ".openkb").mkdir() + (tmp_path / "raw").mkdir() + (tmp_path / "raw" / "doc.pdf").write_bytes(b"fake") + + summary_resp = "Summary." + # Old format: flat array + plan_resp = json.dumps([{"name": "foo", "title": "Foo"}]) + page_resp = "# Foo\n\nContent." 
+ + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_resp, plan_resp]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([page_resp]) + ) + await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini") + + assert (wiki / "concepts" / "foo.md").exists() +``` + +- [ ] **Step 2: Run the new tests to verify they fail** + +Run: `pytest tests/test_compiler.py::TestCompileConceptsPlan -v` +Expected: FAIL — the current code uses old prompt format and doesn't handle dict responses + +- [ ] **Step 3: Implement `_compile_concepts` and refactor public functions** + +Replace the entire Public API section (from `DEFAULT_COMPILE_CONCURRENCY` to end of file) in `openkb/agent/compiler.py` with: + +```python +DEFAULT_COMPILE_CONCURRENCY = 5 + + +async def _compile_concepts( + wiki_dir: Path, + kb_dir: Path, + model: str, + system_msg: dict, + doc_msg: dict, + summary: str, + doc_name: str, + max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, +) -> None: + """Shared concept compilation logic: plan → create/update/related → index. + + This is the core of the compilation pipeline, shared by both + compile_short_doc and compile_long_doc. 
+ """ + source_file = _find_source_filename(doc_name, kb_dir) + concept_briefs = _read_concept_briefs(wiki_dir) + + # --- Concepts plan (A cached) --- + plan_raw = _llm_call(model, [ + system_msg, + doc_msg, + {"role": "assistant", "content": summary}, + {"role": "user", "content": _CONCEPTS_PLAN_USER.format( + concept_briefs=concept_briefs, + )}, + ], "concepts-plan", max_tokens=1024) + + try: + parsed = _parse_json(plan_raw) + except (json.JSONDecodeError, ValueError) as exc: + logger.warning("Failed to parse concepts plan: %s", exc) + logger.debug("Raw: %s", plan_raw) + _update_index(wiki_dir, doc_name, []) + return + + # Fallback: if LLM returns flat array, treat all as create + if isinstance(parsed, list): + create_list, update_list, related_list = parsed, [], [] + else: + create_list = parsed.get("create", []) + update_list = parsed.get("update", []) + related_list = parsed.get("related", []) + + if not create_list and not update_list and not related_list: + _update_index(wiki_dir, doc_name, []) + return + + # --- Concurrent concept generation (A cached) --- + semaphore = asyncio.Semaphore(max_concurrency) + + async def _gen_create(concept: dict) -> tuple[str, str, bool]: + name = concept["name"] + title = concept.get("title", name) + async with semaphore: + page_content = await _llm_call_async(model, [ + system_msg, + doc_msg, + {"role": "assistant", "content": summary}, + {"role": "user", "content": _CONCEPT_PAGE_USER.format( + title=title, doc_name=doc_name, + update_instruction="", + )}, + ], f"create:{name}") + return name, page_content, False + + async def _gen_update(concept: dict) -> tuple[str, str, bool]: + name = concept["name"] + title = concept.get("title", name) + # Read existing page content for the LLM to integrate + concept_path = wiki_dir / "concepts" / f"{name}.md" + if concept_path.exists(): + raw_text = concept_path.read_text(encoding="utf-8") + # Strip frontmatter for the LLM + if raw_text.startswith("---"): + parts = 
raw_text.split("---", 2) + existing_content = parts[2].strip() if len(parts) >= 3 else raw_text + else: + existing_content = raw_text + else: + existing_content = "(page not found — create from scratch)" + async with semaphore: + page_content = await _llm_call_async(model, [ + system_msg, + doc_msg, + {"role": "assistant", "content": summary}, + {"role": "user", "content": _CONCEPT_UPDATE_USER.format( + title=title, doc_name=doc_name, + existing_content=existing_content, + )}, + ], f"update:{name}") + return name, page_content, True + + tasks = [] + tasks.extend(_gen_create(c) for c in create_list) + tasks.extend(_gen_update(c) for c in update_list) + + if tasks: + total = len(tasks) + sys.stdout.write(f" Generating {total} concept(s) (concurrency={max_concurrency})...\n") + sys.stdout.flush() + + results = await asyncio.gather(*tasks, return_exceptions=True) + else: + results = [] + + concept_names = [] + for r in results: + if isinstance(r, Exception): + logger.warning("Concept generation failed: %s", r) + continue + name, page_content, is_update = r + _write_concept(wiki_dir, name, page_content, source_file, is_update) + concept_names.append(name) + + # --- Related: code-only cross-ref links --- + for slug in related_list: + _add_related_link(wiki_dir, slug, doc_name, source_file) + + # --- Update index --- + _update_index(wiki_dir, doc_name, concept_names) + + +async def compile_short_doc( + doc_name: str, + source_path: Path, + kb_dir: Path, + model: str, + max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, +) -> None: + """Compile a short document into wiki pages. + + Step 1: Generate summary from full document text. + Step 2: Plan + generate/update concept pages (via _compile_concepts). 
+ """ + from openkb.config import load_config + + openkb_dir = kb_dir / ".openkb" + config = load_config(openkb_dir / "config.yaml") + language: str = config.get("language", "en") + + wiki_dir = kb_dir / "wiki" + schema_md = get_agents_md(wiki_dir) + source_file = _find_source_filename(doc_name, kb_dir) + content = source_path.read_text(encoding="utf-8") + + system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format( + schema_md=schema_md, language=language, + )} + doc_msg = {"role": "user", "content": _SUMMARY_USER.format( + doc_name=doc_name, content=content, + )} + + # Step 1: Generate summary + summary = _llm_call(model, [system_msg, doc_msg], "summary") + _write_summary(wiki_dir, doc_name, source_file, summary) + + # Step 2: Compile concepts + await _compile_concepts( + wiki_dir, kb_dir, model, system_msg, doc_msg, summary, + doc_name, max_concurrency, + ) + + +async def compile_long_doc( + doc_name: str, + summary_path: Path, + doc_id: str, + kb_dir: Path, + model: str, + max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, +) -> None: + """Compile a long (PageIndex) document into wiki concept pages. + + The summary page is already written by the indexer. This function + generates an overview, then plans + generates/updates concept pages. 
+ """ + from openkb.config import load_config + + openkb_dir = kb_dir / ".openkb" + config = load_config(openkb_dir / "config.yaml") + language: str = config.get("language", "en") + + wiki_dir = kb_dir / "wiki" + schema_md = get_agents_md(wiki_dir) + summary_text = summary_path.read_text(encoding="utf-8") + + system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format( + schema_md=schema_md, language=language, + )} + doc_msg = {"role": "user", "content": _LONG_DOC_SUMMARY_USER.format( + doc_name=doc_name, doc_id=doc_id, content=summary_text, + )} + + # Step 1: Generate overview + overview = _llm_call(model, [system_msg, doc_msg], "overview") + + # Step 2: Compile concepts + await _compile_concepts( + wiki_dir, kb_dir, model, system_msg, doc_msg, overview, + doc_name, max_concurrency, + ) +``` + +- [ ] **Step 4: Update existing integration tests** + +Update `TestCompileShortDoc.test_full_pipeline` — the concepts-list response now needs to be the new dict format: + +```python +class TestCompileShortDoc: + @pytest.mark.asyncio + async def test_full_pipeline(self, tmp_path): + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / "summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + source_path = wiki / "sources" / "test-doc.md" + source_path.write_text("# Test Doc\n\nSome content about transformers.", encoding="utf-8") + (tmp_path / ".openkb").mkdir() + (tmp_path / "raw").mkdir() + (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake") + + summary_response = "# Summary\n\nThis document discusses transformers." + plan_response = json.dumps({ + "create": [{"name": "transformer", "title": "Transformer"}], + "update": [], + "related": [], + }) + concept_page_response = "# Transformer\n\nA neural network architecture." 
+ + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_response, plan_response]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([concept_page_response]) + ) + await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini") + + summary_path = wiki / "summaries" / "test-doc.md" + assert summary_path.exists() + assert "sources: [test-doc.pdf]" in summary_path.read_text() + + concept_path = wiki / "concepts" / "transformer.md" + assert concept_path.exists() + assert "sources: [test-doc.pdf]" in concept_path.read_text() + + index_text = (wiki / "index.md").read_text() + assert "[[summaries/test-doc]]" in index_text + assert "[[concepts/transformer]]" in index_text +``` + +Update `TestCompileShortDoc.test_handles_bad_json` — no changes needed (bad JSON still triggers fallback). + +Update `TestCompileLongDoc.test_full_pipeline`: + +```python +class TestCompileLongDoc: + @pytest.mark.asyncio + async def test_full_pipeline(self, tmp_path): + wiki = tmp_path / "wiki" + (wiki / "summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n", + encoding="utf-8", + ) + summary_path = wiki / "summaries" / "big-doc.md" + summary_path.write_text("# Big Doc\n\nPageIndex summary tree.", encoding="utf-8") + openkb_dir = tmp_path / ".openkb" + openkb_dir.mkdir() + (openkb_dir / "config.yaml").write_text("model: gpt-4o-mini\n") + (tmp_path / "raw").mkdir() + (tmp_path / "raw" / "big-doc.pdf").write_bytes(b"fake") + + overview_response = "Overview of the big document." + plan_response = json.dumps({ + "create": [{"name": "deep-learning", "title": "Deep Learning"}], + "update": [], + "related": [], + }) + concept_page_response = "# Deep Learning\n\nA subfield of ML." 
+ + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([overview_response, plan_response]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([concept_page_response]) + ) + await compile_long_doc( + "big-doc", summary_path, "doc-123", tmp_path, "gpt-4o-mini" + ) + + concept_path = wiki / "concepts" / "deep-learning.md" + assert concept_path.exists() + assert "Deep Learning" in concept_path.read_text() + + index_text = (wiki / "index.md").read_text() + assert "[[summaries/big-doc]]" in index_text + assert "[[concepts/deep-learning]]" in index_text +``` + +- [ ] **Step 5: Run all tests** + +Run: `pytest tests/test_compiler.py -v` +Expected: All PASS + +- [ ] **Step 6: Run the full test suite** + +Run: `pytest tests/ -v` +Expected: All 149+ tests PASS + +- [ ] **Step 7: Commit** + +```bash +git add openkb/agent/compiler.py tests/test_compiler.py +git commit -m "feat: concept dedup with briefs, update/related paths, extract _compile_concepts" +``` + +--- + +### Task 5: Clean up old references and update module docstring + +**Files:** +- Modify: `openkb/agent/compiler.py:1-9` (module docstring) + +- [ ] **Step 1: Update module docstring** + +Replace the docstring at the top of `openkb/agent/compiler.py`: + +```python +"""Wiki compilation pipeline for OpenKB. + +Pipeline leveraging LLM prompt caching: + Step 1: Build base context A (schema + document content). + Step 2: A → generate summary. + Step 3: A + summary → concepts plan (create/update/related). + Step 4: Concurrent LLM calls (A cached) → generate new + rewrite updated concepts. + Step 5: Code adds cross-ref links to related concepts, updates index. 
+""" +``` + +- [ ] **Step 2: Verify `_CONCEPTS_LIST_USER` is fully removed** + +Search for any remaining references to `_CONCEPTS_LIST_USER` in the codebase: + +Run: `grep -r "_CONCEPTS_LIST_USER" openkb/ tests/` +Expected: No matches + +- [ ] **Step 3: Run full test suite one final time** + +Run: `pytest tests/ -q` +Expected: All tests pass + +- [ ] **Step 4: Commit** + +```bash +git add openkb/agent/compiler.py +git commit -m "chore: update compiler docstring for new pipeline" +``` diff --git a/docs/superpowers/plans/2026-04-09-retrieve-redesign.md b/docs/superpowers/plans/2026-04-09-retrieve-redesign.md new file mode 100644 index 0000000..3c659bc --- /dev/null +++ b/docs/superpowers/plans/2026-04-09-retrieve-redesign.md @@ -0,0 +1,1104 @@ +# Retrieve Redesign Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Unify query across long/short docs, add brief summaries to index.md and frontmatter, store long doc sources as JSON with per-page access. + +**Architecture:** (1) LLM prompts return `{"brief", "content"}` JSON — briefs flow into frontmatter and index.md. (2) Indexer stores long doc pages as JSON array. (3) New `get_page_content` tool replaces `pageindex_retrieve`. (4) Query agent uses same tools for all docs. 
+ +**Tech Stack:** Python, litellm, asyncio, pytest + +--- + +### Task 1: Add `get_page_content` tool and `parse_pages` helper + +**Files:** +- Modify: `openkb/agent/tools.py` +- Modify: `tests/test_agent_tools.py` + +- [ ] **Step 1: Write failing tests** + +Add to `tests/test_agent_tools.py`: + +```python +from openkb.agent.tools import get_page_content, parse_pages + +class TestParsePages: + def test_single_page(self): + assert parse_pages("3") == [3] + + def test_range(self): + assert parse_pages("3-5") == [3, 4, 5] + + def test_comma_separated(self): + assert parse_pages("1,3,5") == [1, 3, 5] + + def test_mixed(self): + assert parse_pages("1-3,7,10-12") == [1, 2, 3, 7, 10, 11, 12] + + def test_deduplication(self): + assert parse_pages("3,3,3") == [3] + + def test_sorted(self): + assert parse_pages("5,1,3") == [1, 3, 5] + + def test_ignores_zero_and_negative(self): + assert parse_pages("0,-1,3") == [3] + + +class TestGetPageContent: + def test_reads_pages_from_json(self, tmp_path): + import json + wiki_root = str(tmp_path) + sources = tmp_path / "sources" + sources.mkdir() + pages = [ + {"page": 1, "content": "Page one text."}, + {"page": 2, "content": "Page two text."}, + {"page": 3, "content": "Page three text."}, + ] + (sources / "paper.json").write_text(json.dumps(pages), encoding="utf-8") + + result = get_page_content("paper", "1,3", wiki_root) + assert "[Page 1]" in result + assert "Page one text." in result + assert "[Page 3]" in result + assert "Page three text." 
in result + assert "Page two" not in result + + def test_returns_error_for_missing_file(self, tmp_path): + wiki_root = str(tmp_path) + (tmp_path / "sources").mkdir() + result = get_page_content("nonexistent", "1", wiki_root) + assert "not found" in result.lower() + + def test_returns_error_for_no_matching_pages(self, tmp_path): + import json + wiki_root = str(tmp_path) + sources = tmp_path / "sources" + sources.mkdir() + pages = [{"page": 1, "content": "Only page."}] + (sources / "paper.json").write_text(json.dumps(pages), encoding="utf-8") + + result = get_page_content("paper", "99", wiki_root) + assert "no content" in result.lower() or result.strip() == "" + + def test_includes_images_info(self, tmp_path): + import json + wiki_root = str(tmp_path) + sources = tmp_path / "sources" + sources.mkdir() + pages = [ + {"page": 1, "content": "Text.", "images": [{"path": "images/p/img.png", "width": 100, "height": 80}]}, + ] + (sources / "doc.json").write_text(json.dumps(pages), encoding="utf-8") + + result = get_page_content("doc", "1", wiki_root) + assert "img.png" in result + + def test_path_escape_denied(self, tmp_path): + wiki_root = str(tmp_path) + (tmp_path / "sources").mkdir() + result = get_page_content("../../etc/passwd", "1", wiki_root) + assert "denied" in result.lower() or "not found" in result.lower() +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pytest tests/test_agent_tools.py::TestParsePages tests/test_agent_tools.py::TestGetPageContent -v` +Expected: FAIL with `ImportError` + +- [ ] **Step 3: Implement `parse_pages` and `get_page_content`** + +Add to `openkb/agent/tools.py`: + +```python +import json as _json + + +def parse_pages(pages: str) -> list[int]: + """Parse a page specification like '3-5,7,10-12' into a sorted list of ints.""" + result: set[int] = set() + for part in pages.split(","): + part = part.strip() + if "-" in part: + start_str, end_str = part.split("-", 1) + try: + start, end = int(start_str), int(end_str) + 
result.update(range(start, end + 1)) + except ValueError: + continue + else: + try: + result.add(int(part)) + except ValueError: + continue + return sorted(n for n in result if n >= 1) + + +def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str: + """Get text content of specific pages from a long document. + + Reads from ``wiki/sources/{doc_name}.json`` which contains a JSON array + of ``{"page": int, "content": str, "images": [...]}`` objects. + + Args: + doc_name: Document name (stem, e.g. ``'attention-is-all-you-need'``). + pages: Page specification (e.g. ``'3-5,7,10-12'``). + wiki_root: Absolute path to the wiki root directory. + + Returns: + Formatted text of requested pages, or error message if not found. + """ + root = Path(wiki_root).resolve() + json_path = (root / "sources" / f"{doc_name}.json").resolve() + if not json_path.is_relative_to(root): + return "Access denied: path escapes wiki root." + if not json_path.exists(): + return f"Document not found: {doc_name}. No sources/{doc_name}.json file." 
+ + data = _json.loads(json_path.read_text(encoding="utf-8")) + page_nums = set(parse_pages(pages)) + matched = [p for p in data if p["page"] in page_nums] + + if not matched: + return f"No content found for pages: {pages}" + + parts: list[str] = [] + for p in matched: + header = f"[Page {p['page']}]" + text = p.get("content", "") + if "images" in p: + img_refs = ", ".join(img["path"] for img in p["images"]) + text += f"\n[Images: {img_refs}]" + parts.append(f"{header}\n{text}") + + return "\n\n".join(parts) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `pytest tests/test_agent_tools.py -v` +Expected: All PASS + +- [ ] **Step 5: Commit** + +```bash +git add openkb/agent/tools.py tests/test_agent_tools.py +git commit -m "feat: add get_page_content tool and parse_pages helper" +``` + +--- + +### Task 2: Change LLM prompts to return `{"brief", "content"}` JSON + +**Files:** +- Modify: `openkb/agent/compiler.py` (prompt templates, lines 40-105) +- Modify: `tests/test_compiler.py` (TestParseConceptsPlan) + +- [ ] **Step 1: Write test for brief+content JSON parsing** + +Add to `tests/test_compiler.py`: + +```python +class TestParseBriefContent: + def test_dict_with_brief_and_content(self): + text = json.dumps({"brief": "A short desc", "content": "# Full page\n\nDetails."}) + parsed = _parse_json(text) + assert parsed["brief"] == "A short desc" + assert "# Full page" in parsed["content"] + + def test_plain_text_fallback(self): + """If LLM returns plain text, _parse_json raises — caller handles fallback.""" + with pytest.raises((json.JSONDecodeError, ValueError)): + _parse_json("Just plain markdown text without JSON") +``` + +- [ ] **Step 2: Run test to verify it passes (existing _parse_json handles dicts)** + +Run: `pytest tests/test_compiler.py::TestParseBriefContent -v` +Expected: PASS — `_parse_json` already handles dicts + +- [ ] **Step 3: Update `_SUMMARY_USER` prompt** + +Replace in `openkb/agent/compiler.py`: + +```python +_SUMMARY_USER = """\ 
+New document: {doc_name} + +Full text: +{content} + +Write a summary page for this document in Markdown. + +Return a JSON object with two keys: +- "brief": A single sentence (under 100 chars) describing the document's main contribution +- "content": The full summary in Markdown. Include key concepts, findings, ideas, \ +and [[wikilinks]] to concepts that could become cross-document concept pages + +Return ONLY valid JSON, no fences. +""" +``` + +- [ ] **Step 4: Update `_CONCEPT_PAGE_USER` prompt** + +Replace in `openkb/agent/compiler.py`: + +```python +_CONCEPT_PAGE_USER = """\ +Write the concept page for: {title} + +This concept relates to the document "{doc_name}" summarized above. +{update_instruction} + +Return a JSON object with two keys: +- "brief": A single sentence (under 100 chars) defining this concept +- "content": The full concept page in Markdown. Include clear explanation, \ +key details from the source document, and [[wikilinks]] to related concepts \ +and [[summaries/{doc_name}]] + +Return ONLY valid JSON, no fences. +""" +``` + +- [ ] **Step 5: Update `_CONCEPT_UPDATE_USER` prompt** + +Replace in `openkb/agent/compiler.py`: + +```python +_CONCEPT_UPDATE_USER = """\ +Update the concept page for: {title} + +Current content of this page: +{existing_content} + +New information from document "{doc_name}" (summarized above) should be \ +integrated into this page. Rewrite the full page incorporating the new \ +information naturally — do not just append. Maintain existing \ +[[wikilinks]] and add new ones where appropriate. + +Return a JSON object with two keys: +- "brief": A single sentence (under 100 chars) defining this concept (may differ from before) +- "content": The rewritten full concept page in Markdown + +Return ONLY valid JSON, no fences. 
+""" +``` + +- [ ] **Step 6: Run all tests (prompts aren't tested directly)** + +Run: `pytest tests/test_compiler.py -v` +Expected: All PASS + +- [ ] **Step 7: Commit** + +```bash +git add openkb/agent/compiler.py tests/test_compiler.py +git commit -m "feat: update LLM prompts to return brief+content JSON" +``` + +--- + +### Task 3: Update `_write_summary` and `_write_concept` to store `brief` in frontmatter + +**Files:** +- Modify: `openkb/agent/compiler.py` (lines 274-320, `_write_summary` and `_write_concept`) +- Modify: `tests/test_compiler.py` + +- [ ] **Step 1: Write failing tests** + +Update existing and add new tests in `tests/test_compiler.py`: + +```python +class TestWriteSummary: + def test_writes_with_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.", brief="Introduces transformers") + path = wiki / "summaries" / "my-doc.md" + assert path.exists() + text = path.read_text() + assert "sources: [my-doc.pdf]" in text + assert "brief: Introduces transformers" in text + assert "# Summary" in text + + def test_writes_without_brief(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.") + path = wiki / "summaries" / "my-doc.md" + text = path.read_text() + assert "sources: [my-doc.pdf]" in text + assert "brief:" not in text +``` + +Update `TestWriteConcept`: + +```python +class TestWriteConcept: + def test_new_concept_with_brief(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False, brief="Mechanism for selective focus") + path = wiki / "concepts" / "attention.md" + assert path.exists() + text = path.read_text() + assert "sources: [paper.pdf]" in text + assert "brief: Mechanism for selective focus" in text + assert "# Attention" in text + + def test_new_concept(self, tmp_path): + wiki = tmp_path / 
"wiki" + wiki.mkdir() + _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False) + path = wiki / "concepts" / "attention.md" + assert path.exists() + text = path.read_text() + assert "sources: [paper.pdf]" in text + assert "# Attention" in text + + def test_update_concept_appends_source(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper1.pdf]\nbrief: Old brief\n---\n\n# Attention\n\nOld content.", + encoding="utf-8", + ) + _write_concept(wiki, "attention", "New info from paper2.", "paper2.pdf", True, brief="Updated brief") + text = (concepts / "attention.md").read_text() + assert "paper2.pdf" in text + assert "paper1.pdf" in text + assert "brief: Updated brief" in text + assert "New info from paper2." in text +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pytest tests/test_compiler.py::TestWriteSummary tests/test_compiler.py::TestWriteConcept -v` +Expected: FAIL — `_write_summary` and `_write_concept` don't accept `brief` parameter + +- [ ] **Step 3: Update `_write_summary` to accept `brief`** + +```python +def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str, brief: str = "") -> None: + """Write summary page with frontmatter.""" + summaries_dir = wiki_dir / "summaries" + summaries_dir.mkdir(parents=True, exist_ok=True) + fm_lines = [f"sources: [{source_file}]"] + if brief: + fm_lines.append(f"brief: {brief}") + frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n" + (summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8") +``` + +- [ ] **Step 4: Update `_write_concept` to accept `brief`** + +Add `brief: str = ""` parameter to `_write_concept`. 
In the new-concept branch: + +```python + else: + fm_lines = [f"sources: [{source_file}]"] + if brief: + fm_lines.append(f"brief: {brief}") + frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n" + path.write_text(frontmatter + content, encoding="utf-8") +``` + +In the update branch, after updating sources in frontmatter, also update brief: + +```python + if is_update and path.exists(): + existing = path.read_text(encoding="utf-8") + if source_file not in existing: + # ... existing frontmatter update logic ... + # Update brief in frontmatter if provided + if brief and existing.startswith("---"): + end = existing.find("---", 3) + if end != -1: + fm = existing[:end + 3] + body = existing[end + 3:] + if "brief:" in fm: + import re + fm = re.sub(r"brief:.*", f"brief: {brief}", fm) + else: + fm = fm.replace("---\n", f"---\nbrief: {brief}\n", 1) + existing = fm + body + path.write_text(existing, encoding="utf-8") +``` + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `pytest tests/test_compiler.py::TestWriteSummary tests/test_compiler.py::TestWriteConcept -v` +Expected: All PASS + +- [ ] **Step 6: Commit** + +```bash +git add openkb/agent/compiler.py tests/test_compiler.py +git commit -m "feat: store brief in frontmatter of summary and concept pages" +``` + +--- + +### Task 4: Update `_update_index` to include briefs, and update `_read_concept_briefs` to read from frontmatter + +**Files:** +- Modify: `openkb/agent/compiler.py` (lines 233-261 and 408-430) +- Modify: `tests/test_compiler.py` + +- [ ] **Step 1: Write failing tests for `_update_index` with briefs** + +```python +class TestUpdateIndex: + def test_appends_entries_with_briefs(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + _update_index(wiki, "my-doc", ["attention", "transformer"], + doc_brief="Introduces transformers", + concept_briefs={"attention": "Focus 
mechanism", "transformer": "NN architecture"}) + text = (wiki / "index.md").read_text() + assert "[[summaries/my-doc]] — Introduces transformers" in text + assert "[[concepts/attention]] — Focus mechanism" in text + assert "[[concepts/transformer]] — NN architecture" in text + + def test_no_duplicates(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n- [[summaries/my-doc]] — Old brief\n\n## Concepts\n", + encoding="utf-8", + ) + _update_index(wiki, "my-doc", [], doc_brief="New brief") + text = (wiki / "index.md").read_text() + assert text.count("[[summaries/my-doc]]") == 1 + + def test_backwards_compat_no_briefs(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + _update_index(wiki, "my-doc", ["attention"]) + text = (wiki / "index.md").read_text() + assert "[[summaries/my-doc]]" in text + assert "[[concepts/attention]]" in text +``` + +Write test for updated `_read_concept_briefs`: + +```python +class TestReadConceptBriefs: + # ... keep existing tests ... + + def test_reads_brief_from_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper.pdf]\nbrief: Selective focus mechanism\n---\n\n# Attention\n\nLong content...", + encoding="utf-8", + ) + result = _read_concept_briefs(wiki) + assert "- attention: Selective focus mechanism" in result + + def test_falls_back_to_body_truncation(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "old.md").write_text( + "---\nsources: [paper.pdf]\n---\n\nOld concept without brief field.", + encoding="utf-8", + ) + result = _read_concept_briefs(wiki) + assert "- old: Old concept without brief field." 
in result +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pytest tests/test_compiler.py::TestUpdateIndex tests/test_compiler.py::TestReadConceptBriefs -v` +Expected: FAIL — `_update_index` doesn't accept `doc_brief`/`concept_briefs` parameters + +- [ ] **Step 3: Update `_update_index`** + +```python +def _update_index( + wiki_dir: Path, doc_name: str, concept_names: list[str], + doc_brief: str = "", concept_briefs: dict[str, str] | None = None, +) -> None: + """Append document and concept entries to index.md with optional briefs.""" + index_path = wiki_dir / "index.md" + if not index_path.exists(): + index_path.write_text( + "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + + text = index_path.read_text(encoding="utf-8") + + doc_link = f"[[summaries/{doc_name}]]" + if doc_link not in text: + doc_entry = f"- {doc_link}" + if doc_brief: + doc_entry += f" — {doc_brief}" + if "## Documents" in text: + text = text.replace("## Documents\n", f"## Documents\n{doc_entry}\n", 1) + + if concept_briefs is None: + concept_briefs = {} + for name in concept_names: + concept_link = f"[[concepts/{name}]]" + if concept_link not in text: + concept_entry = f"- {concept_link}" + if name in concept_briefs: + concept_entry += f" — {concept_briefs[name]}" + if "## Concepts" in text: + text = text.replace("## Concepts\n", f"## Concepts\n{concept_entry}\n", 1) + + index_path.write_text(text, encoding="utf-8") +``` + +- [ ] **Step 4: Update `_read_concept_briefs` to read from frontmatter `brief:` field** + +```python +def _read_concept_briefs(wiki_dir: Path) -> str: + """Read existing concept pages and return compact one-line summaries. + + Reads ``brief:`` from YAML frontmatter if available, otherwise falls back + to the first 150 characters of the body text. 
+ """ + concepts_dir = wiki_dir / "concepts" + if not concepts_dir.exists(): + return "(none yet)" + + md_files = sorted(concepts_dir.glob("*.md")) + if not md_files: + return "(none yet)" + + lines: list[str] = [] + for path in md_files: + text = path.read_text(encoding="utf-8") + brief = "" + body = text + if text.startswith("---"): + end = text.find("---", 3) + if end != -1: + fm = text[:end + 3] + body = text[end + 3:] + # Try to extract brief from frontmatter + for line in fm.split("\n"): + if line.startswith("brief:"): + brief = line[len("brief:"):].strip() + break + if not brief: + brief = body.strip().replace("\n", " ")[:150] + if brief: + lines.append(f"- {path.stem}: {brief}") + + return "\n".join(lines) or "(none yet)" +``` + +- [ ] **Step 5: Run tests** + +Run: `pytest tests/test_compiler.py -v` +Expected: All PASS + +- [ ] **Step 6: Commit** + +```bash +git add openkb/agent/compiler.py tests/test_compiler.py +git commit -m "feat: add briefs to index.md entries and read from frontmatter" +``` + +--- + +### Task 5: Wire briefs through `_compile_concepts` and public functions + +**Files:** +- Modify: `openkb/agent/compiler.py` (lines 438-611, `_compile_concepts`, `compile_short_doc`, `compile_long_doc`) +- Modify: `tests/test_compiler.py` + +This task connects the brief+content JSON parsing to the write functions and index update. 
+ +- [ ] **Step 1: Write integration test** + +```python +class TestBriefIntegration: + @pytest.mark.asyncio + async def test_short_doc_briefs_in_index_and_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / "summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + source_path = wiki / "sources" / "test-doc.md" + source_path.write_text("# Test Doc\n\nContent.", encoding="utf-8") + (tmp_path / ".openkb").mkdir() + (tmp_path / "raw").mkdir() + (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake") + + summary_resp = json.dumps({ + "brief": "A paper about transformers", + "content": "# Summary\n\nThis paper discusses transformers.", + }) + plan_resp = json.dumps({ + "create": [{"name": "transformer", "title": "Transformer"}], + "update": [], + "related": [], + }) + concept_resp = json.dumps({ + "brief": "NN architecture using self-attention", + "content": "# Transformer\n\nA neural network architecture.", + }) + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_resp, plan_resp]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([concept_resp]) + ) + await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini") + + # Check summary frontmatter has brief + summary_text = (wiki / "summaries" / "test-doc.md").read_text() + assert "brief: A paper about transformers" in summary_text + + # Check concept frontmatter has brief + concept_text = (wiki / "concepts" / "transformer.md").read_text() + assert "brief: NN architecture using self-attention" in concept_text + + # Check index has briefs + index_text = (wiki / "index.md").read_text() + assert "[[summaries/test-doc]] — A paper about transformers" in index_text + assert "[[concepts/transformer]] — NN 
architecture using self-attention" in index_text +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `pytest tests/test_compiler.py::TestBriefIntegration -v` +Expected: FAIL + +- [ ] **Step 3: Update `compile_short_doc` to parse brief+content from summary response** + +In `compile_short_doc`, replace: + +```python + # --- Step 1: Generate summary --- + summary = _llm_call(model, [system_msg, doc_msg], "summary") + _write_summary(wiki_dir, doc_name, source_file, summary) +``` + +With: + +```python + # --- Step 1: Generate summary --- + summary_raw = _llm_call(model, [system_msg, doc_msg], "summary") + try: + summary_parsed = _parse_json(summary_raw) + doc_brief = summary_parsed.get("brief", "") + summary = summary_parsed.get("content", summary_raw) + except (json.JSONDecodeError, ValueError): + doc_brief = "" + summary = summary_raw + _write_summary(wiki_dir, doc_name, source_file, summary, brief=doc_brief) +``` + +- [ ] **Step 4: Update `_compile_concepts` signature and wiring** + +Add `doc_brief: str = ""` parameter to `_compile_concepts`. + +In `_gen_create`, parse the response: + +```python + async def _gen_create(concept: dict) -> tuple[str, str, bool, str]: + name = concept["name"] + title = concept.get("title", name) + async with semaphore: + raw = await _llm_call_async(model, [ + system_msg, doc_msg, + {"role": "assistant", "content": summary}, + {"role": "user", "content": _CONCEPT_PAGE_USER.format( + title=title, doc_name=doc_name, update_instruction="", + )}, + ], f"create:{name}") + try: + parsed = _parse_json(raw) + brief = parsed.get("brief", "") + content = parsed.get("content", raw) + except (json.JSONDecodeError, ValueError): + brief, content = "", raw + return name, content, False, brief +``` + +Same for `_gen_update` — returns `tuple[str, str, bool, str]` (name, content, is_update, brief). 
+ +In the results processing loop: + +```python + concept_briefs_map: dict[str, str] = {} + for r in results: + if isinstance(r, Exception): + logger.warning("Concept generation failed: %s", r) + continue + name, page_content, is_update, brief = r + _write_concept(wiki_dir, name, page_content, source_file, is_update, brief=brief) + concept_names.append(name) + if brief: + concept_briefs_map[name] = brief +``` + +Pass briefs to `_update_index`: + +```python + _update_index(wiki_dir, doc_name, concept_names, + doc_brief=doc_brief, concept_briefs=concept_briefs_map) +``` + +- [ ] **Step 5: Update `compile_short_doc` to pass `doc_brief` to `_compile_concepts`** + +```python + await _compile_concepts( + wiki_dir, kb_dir, model, system_msg, doc_msg, + summary, doc_name, max_concurrency, doc_brief=doc_brief, + ) +``` + +- [ ] **Step 6: Update `compile_long_doc` to pass `doc_brief` from `IndexResult.description`** + +`compile_long_doc` currently takes `doc_id` but not `description`. Add `doc_description: str = ""` parameter: + +```python +async def compile_long_doc( + doc_name: str, + summary_path: Path, + doc_id: str, + kb_dir: Path, + model: str, + doc_description: str = "", + max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, +) -> None: +``` + +The `_LONG_DOC_SUMMARY_USER` stays unchanged (returns plain text, not JSON). 
Pass `doc_description` as `doc_brief`: + +```python + await _compile_concepts( + wiki_dir, kb_dir, model, system_msg, doc_msg, + overview, doc_name, max_concurrency, doc_brief=doc_description, + ) +``` + +Also update the CLI call in `cli.py` line 135: + +```python +asyncio.run( + compile_long_doc(doc_name, summary_path, index_result.doc_id, kb_dir, model, + doc_description=index_result.description) +) +``` + +- [ ] **Step 7: Update existing integration tests for new JSON response format** + +Update all mock LLM responses in `TestCompileShortDoc`, `TestCompileLongDoc`, and `TestCompileConceptsPlan` to return `{"brief": "...", "content": "..."}` JSON instead of plain text for summary and concept responses. + +- [ ] **Step 8: Run all tests** + +Run: `pytest tests/ -q` +Expected: All PASS + +- [ ] **Step 9: Commit** + +```bash +git add openkb/agent/compiler.py openkb/cli.py tests/test_compiler.py +git commit -m "feat: wire brief+content JSON through compile pipeline to index and frontmatter" +``` + +--- + +### Task 6: Indexer — long doc sources from markdown to JSON + +**Files:** +- Modify: `openkb/indexer.py` +- Modify: `openkb/tree_renderer.py` (remove `render_source_md`) +- Modify: `tests/test_indexer.py` + +- [ ] **Step 1: Write failing test** + +Update `tests/test_indexer.py`: + +```python + def test_source_page_written_as_json(self, kb_dir, sample_tree, tmp_path): + """Long doc source should be written as JSON, not markdown.""" + import json as json_mod + doc_id = "abc-123" + fake_col = self._make_fake_collection(doc_id, sample_tree) + + fake_client = MagicMock() + fake_client.collection.return_value = fake_col + # Mock get_page_content to return page data + fake_col.get_page_content.return_value = [ + {"page": 1, "content": "Page one text."}, + {"page": 2, "content": "Page two text."}, + ] + + pdf_path = tmp_path / "sample.pdf" + pdf_path.write_bytes(b"%PDF-1.4 fake") + + with patch("openkb.indexer.PageIndexClient", return_value=fake_client): + 
index_long_document(pdf_path, kb_dir) + + # Should be JSON, not MD + json_file = kb_dir / "wiki" / "sources" / "sample.json" + assert json_file.exists() + assert not (kb_dir / "wiki" / "sources" / "sample.md").exists() + data = json_mod.loads(json_file.read_text()) + assert len(data) == 2 + assert data[0]["page"] == 1 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `pytest tests/test_indexer.py::TestIndexLongDocument::test_source_page_written_as_json -v` +Expected: FAIL + +- [ ] **Step 3: Update `indexer.py` to write JSON sources** + +Replace the source writing block (lines 103-110) with: + +```python + # Write wiki/sources/ as JSON (per-page content from PageIndex) + sources_dir = kb_dir / "wiki" / "sources" + sources_dir.mkdir(parents=True, exist_ok=True) + dest_images_dir = sources_dir / "images" / pdf_path.stem + + # Get per-page content from PageIndex + all_pages = col.get_page_content(doc_id, f"1-{doc.get('page_count', 9999)}") + + # Relocate image paths + dest_images_dir.mkdir(parents=True, exist_ok=True) + for page in all_pages: + if "images" in page: + for img in page["images"]: + src_path = Path(img["path"]) + if src_path.exists(): + filename = src_path.name + dest = dest_images_dir / filename + if not dest.exists(): + shutil.copy2(src_path, dest) + img["path"] = f"images/{pdf_path.stem}/{filename}" + + import json as json_mod + (sources_dir / f"{pdf_path.stem}.json").write_text( + json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8", + ) +``` + +Remove the `render_source_md` import and `_relocate_images` call. + +- [ ] **Step 4: Remove `render_source_md` from tree_renderer.py** + +Remove the `render_source_md` function and `_render_nodes_source` helper from `openkb/tree_renderer.py`. Keep `render_summary_md` and `_render_nodes_summary`. + +- [ ] **Step 5: Update existing test `test_source_page_written`** + +The old test checks for `.md` — update it to check for `.json` or remove it (replaced by the new test). 
+ +- [ ] **Step 6: Run all tests** + +Run: `pytest tests/ -q` +Expected: All PASS + +- [ ] **Step 7: Commit** + +```bash +git add openkb/indexer.py openkb/tree_renderer.py tests/test_indexer.py +git commit -m "feat: store long doc sources as per-page JSON, remove render_source_md" +``` + +--- + +### Task 7: Query agent — remove `pageindex_retrieve`, add `get_page_content`, update instructions + +**Files:** +- Modify: `openkb/agent/query.py` +- Modify: `openkb/schema.py` +- Modify: `tests/test_query.py` + +- [ ] **Step 1: Write failing tests** + +Update `tests/test_query.py`: + +```python +class TestBuildQueryAgent: + def test_agent_name(self, tmp_path): + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") + assert agent.name == "wiki-query" + + def test_agent_has_three_tools(self, tmp_path): + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") + assert len(agent.tools) == 3 + + def test_agent_tool_names(self, tmp_path): + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") + names = {t.name for t in agent.tools} + assert "list_files" in names + assert "read_file" in names + assert "get_page_content" in names + assert "pageindex_retrieve" not in names + + def test_instructions_mention_get_page_content(self, tmp_path): + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") + assert "get_page_content" in agent.instructions +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pytest tests/test_query.py::TestBuildQueryAgent -v` +Expected: FAIL — old signature requires `openkb_dir` + +- [ ] **Step 3: Rewrite `query.py`** + +Remove `_pageindex_retrieve_impl` entirely (~110 lines). Remove `PageIndexClient` import. 
Update `build_query_agent`: + +```python +def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent: + """Build and return the Q&A agent.""" + schema_md = get_agents_md(Path(wiki_root)) + instructions = _QUERY_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md) + instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language." + + @function_tool + def list_files(directory: str) -> str: + """List all Markdown files in a wiki subdirectory.""" + return list_wiki_files(directory, wiki_root) + + @function_tool + def read_file(path: str) -> str: + """Read a Markdown file from the wiki.""" + return read_wiki_file(path, wiki_root) + + @function_tool + def get_page_content_tool(doc_name: str, pages: str) -> str: + """Get text content of specific pages from a long document. + + Args: + doc_name: Document name (e.g. 'attention-is-all-you-need'). + pages: Page specification (e.g. '3-5,7,10-12'). + """ + from openkb.agent.tools import get_page_content + return get_page_content(doc_name, pages, wiki_root) + + from agents.model_settings import ModelSettings + + return Agent( + name="wiki-query", + instructions=instructions, + tools=[list_files, read_file, get_page_content_tool], + model=f"litellm/{model}", + model_settings=ModelSettings(parallel_tool_calls=False), + ) +``` + +Update `_QUERY_INSTRUCTIONS_TEMPLATE`: + +```python +_QUERY_INSTRUCTIONS_TEMPLATE = """\ +You are a knowledge-base Q&A agent. You answer questions by searching the wiki. + +{schema_md} + +## Search strategy +1. Read index.md to understand what documents and concepts are available. + Each entry has a brief summary to help you judge relevance. +2. Read relevant summary pages (summaries/) for document overviews. +3. Read concept pages (concepts/) for cross-document synthesis. +4. For long documents, use get_page_content(doc_name, pages) to read + specific pages when you need detailed content. 
The summary page + shows chapter structure with page ranges to help you decide which + pages to read. +5. Synthesise a clear, well-cited answer. + +Always ground your answer in the wiki content. If you cannot find relevant +information, say so clearly. +""" +``` + +Update `run_query` to match new `build_query_agent` signature (remove `openkb_dir` param): + +```python +async def run_query(question: str, kb_dir: Path, model: str, stream: bool = False) -> str: + from openkb.config import load_config + openkb_dir = kb_dir / ".openkb" + config = load_config(openkb_dir / "config.yaml") + language: str = config.get("language", "en") + + wiki_root = str(kb_dir / "wiki") + agent = build_query_agent(wiki_root, model, language=language) + # ... rest unchanged ... +``` + +- [ ] **Step 4: Update `openkb/schema.py` AGENTS_MD** + +Add a note about `get_page_content` for long documents in the Schema: + +```python +## Page Types +- **Summary Page** (summaries/): Key content of a single source document. +- **Concept Page** (concepts/): Cross-document topic synthesis with [[wikilinks]]. +- **Exploration Page** (explorations/): Saved query results — analyses, comparisons, syntheses. +- **Source Page** (sources/): Full-text for short docs (.md) or per-page JSON for long docs (.json). +- **Index Page** (index.md): One-liner summary of every page in the wiki. Auto-maintained. +``` + +- [ ] **Step 5: Run all tests** + +Run: `pytest tests/ -q` +Expected: All PASS + +- [ ] **Step 6: Commit** + +```bash +git add openkb/agent/query.py openkb/schema.py tests/test_query.py +git commit -m "feat: replace pageindex_retrieve with get_page_content, unify query for all docs" +``` + +--- + +### Task 8: Final cleanup and full verification + +**Files:** +- Modify: `openkb/indexer.py` (remove unused imports) +- Verify all files + +- [ ] **Step 1: Remove unused imports** + +In `indexer.py`, remove `from openkb.tree_renderer import render_source_md` if still present (keep `render_summary_md`). 
+ +In `query.py`, verify `PageIndexClient` import is removed. + +- [ ] **Step 2: Run full test suite** + +Run: `pytest tests/ -v` +Expected: All PASS + +- [ ] **Step 3: Grep for dead references** + +Run: `grep -r "pageindex_retrieve\|render_source_md\|_relocate_images" openkb/ tests/` +Expected: No matches + +- [ ] **Step 4: Commit** + +```bash +git add -A +git commit -m "chore: remove dead imports and references" +``` diff --git a/docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md b/docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md new file mode 100644 index 0000000..2fcd853 --- /dev/null +++ b/docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md @@ -0,0 +1,163 @@ +# Concept Dedup & Existing Page Update + +**Date:** 2026-04-09 +**Status:** Approved +**Branch:** bugfix/compile + +## Problem + +The compiler pipeline generates concept pages per document, but: + +1. **No dedup** — LLM only sees concept slug names, not content. It can't reliably judge whether a new concept overlaps with an existing one. As the KB grows, concepts duplicate and diverge. +2. **No update of existing pages** — When a new document has information relevant to existing concepts, those pages are not updated. Knowledge doesn't compound across documents. + +The old agent-based approach solved this (the agent could read/write wiki files freely), but was too slow — 20-30 tool-call round-trips per document. + +## Design + +Extend the existing deterministic pipeline to give the LLM enough context for dedup/update decisions, without adding agent loops or breaking prompt caching. + +### Prompt Caching Invariant + +The cached prefix `[system_msg, doc_msg]` must remain identical across all LLM calls within a single document compilation. All new context (concept briefs, existing page content) goes into messages **after** the cached prefix. 
+ +### Pipeline Overview + +``` +Step 1: [system, doc] → summary (unchanged) +Step 2: [system, doc, summary, concepts_plan_prompt] → concepts plan JSON +Step 3a: [system, doc, summary, create_prompt] × N → new concept pages (concurrent) +Step 3b: [system, doc, summary, update_prompt] × M → rewritten concept pages (concurrent) +Step 3c: code-only × K → add cross-ref links to related concepts +Step 4: update index (unchanged) +``` + +Steps 3a and 3b share a single semaphore and run concurrently. + +### Part 1: Concept Briefs + +New function `_read_concept_briefs(wiki_dir)` reads existing concept pages and returns a compact summary string: + +``` +- attention: Attention is a mechanism that allows models to focus on relevant parts... +- transformer-architecture: The Transformer is a neural network architecture... +``` + +For each concept file in `wiki/concepts/*.md`: +- Skip YAML frontmatter +- Take first 150 characters of body text +- Format as `- {slug}: {brief}` + +This replaces the current `", ".join(existing_concepts)` in the concepts-list prompt. Pure file I/O, no LLM call. + +### Part 2: Concepts Plan Prompt + +The `_CONCEPTS_LIST_USER` template is replaced with a new `_CONCEPTS_PLAN_USER` template that asks the LLM to return a JSON object with three action types: + +```json +{ + "create": [{"name": "flash-attention", "title": "Flash Attention"}], + "update": [{"name": "attention", "title": "Attention Mechanism"}], + "related": ["transformer-architecture"] +} +``` + +- **create** — New concept not covered by any existing page. +- **update** — Existing concept with significant new information worth integrating. +- **related** — Existing concept tangentially related; only needs a cross-reference link. + +The prompt includes rules: +- Don't create concepts that overlap with existing ones — use "update" instead. +- Don't create concepts that are just the document topic itself. +- For the first few documents, create 2-3 foundational concepts at most. 
+- "related" is for lightweight cross-linking only. + +### Part 3: Three Execution Paths + +#### create (unchanged) + +Same as current: concurrent `_llm_call_async` with `_CONCEPT_PAGE_USER` template. Written via `_write_concept` with `is_update=False`. + +#### update (new) + +New template `_CONCEPT_UPDATE_USER`: + +``` +Update the concept page for: {title} + +Current content of this page: +{existing_content} + +New information from document "{doc_name}" (summarized above) should be +integrated into this page. Rewrite the full page incorporating the new +information naturally. Maintain existing cross-references and add new ones +where appropriate. + +Return ONLY the Markdown content (no frontmatter, no code fences). +``` + +Call structure: `[system_msg, doc_msg, {assistant: summary}, update_user_msg]` + +The cached prefix `[system_msg, doc_msg]` is shared with create calls. The `existing_content` (typically 200-500 tokens) is in the final user message only. + +Written via `_write_concept` with `is_update=True`. The frontmatter `sources:` list is updated to include the new source file. + +#### related (code-only, no LLM) + +For each related slug: +1. Read the concept file +2. If `summaries/{doc_name}` is not already linked, append `\n\nSee also: [[summaries/{doc_name}]]` +3. Update frontmatter `sources:` list + +Pure file I/O, millisecond-level. + +### Part 4: Shared Logic Between Short and Long Doc + +Current `compile_short_doc` and `compile_long_doc` duplicate Steps 2-4. Extract shared logic into `_compile_concepts(wiki_dir, model, system_msg, doc_msg, summary, doc_name, kb_dir, max_concurrency)`. 
+ +Public functions become: +- `compile_short_doc`: builds context A from source text → calls `_compile_concepts` +- `compile_long_doc`: builds context A from PageIndex summary → calls `_compile_concepts` + +### Part 5: JSON Parsing Fallback + +If the LLM returns a flat JSON array instead of the expected dict, treat it as all "create" actions: + +```python +if isinstance(parsed, list): + create_list, update_list, related_list = parsed, [], [] +else: + create_list = parsed.get("create", []) + update_list = parsed.get("update", []) + related_list = parsed.get("related", []) +``` + +This ensures backward compatibility if the LLM doesn't follow the new format. + +## Token Cost Analysis + +Compared to current pipeline (per document with C existing concepts): + +| Step | Current | New | Delta | +|------|---------|-----|-------| +| concepts-list prompt | ~50 tokens (slug names) | ~50 + C×30 tokens (briefs) | +C×30 | +| update calls | 0 | M × ~500 tokens (existing content) | +M×500 | +| related | 0 | 0 (code-only) | 0 | + +At C=30 existing concepts: +900 tokens in concepts-list prompt. +At M=2 update calls: +1000 tokens total. + +Total overhead: ~2000 tokens per document. Negligible compared to document content (5K-20K tokens). 
+ +## Files Changed + +- `openkb/agent/compiler.py` — all changes + - New: `_read_concept_briefs()`, `_CONCEPTS_PLAN_USER`, `_CONCEPT_UPDATE_USER`, `_add_related_link()`, `_compile_concepts()` + - Modified: `compile_short_doc()`, `compile_long_doc()`, `_parse_json()` caller logic +- `tests/test_compiler.py` — update tests for new JSON format and update/related paths + +## Not In Scope + +- Concept briefs truncation/filtering for very large KBs (100+ concepts) — revisit when needed +- Interactive ingest (human-in-the-loop checkpoint) — separate feature +- Lint --fix auto-repair — separate feature diff --git a/docs/superpowers/specs/2026-04-09-retrieve-redesign.md b/docs/superpowers/specs/2026-04-09-retrieve-redesign.md new file mode 100644 index 0000000..15224be --- /dev/null +++ b/docs/superpowers/specs/2026-04-09-retrieve-redesign.md @@ -0,0 +1,262 @@ +# Retrieve Redesign: Unified Query, Brief Summaries, and Local Page Content + +**Date:** 2026-04-09 +**Status:** Approved +**Branch:** bugfix/compile + +## Problems + +### 1. Long vs Short Doc Split in Query + +The query agent treats long documents (PageIndex-indexed) and short documents differently: + +- **Short docs**: agent reads `wiki/sources/{name}.md` via `read_file` +- **Long docs**: agent calls `pageindex_retrieve(doc_id, question)` — a black-box RAG call + +**Design Principle**: PageIndex is an indexer, not a retriever. Query-time retrieval should be done by the agent navigating the wiki, using the same tools for all documents. + +### 2. index.md Has No Brief Summaries + +Karpathy's gist says index.md should have "each page listed with a link, **a one-line summary**". Currently it only has wikilinks with no descriptions. The query agent must open every file to understand what's available. + +### 3. No Brief Summaries on Concepts Either + +Same problem: concept entries in index.md have no description. The agent can't judge relevance from the index alone. 
+ +## Design + +### Part 1: Structured LLM Output with Brief Summaries + +All LLM generation steps (summary, concept create, concept update) now return a JSON object with both a one-line brief and the full content. + +#### Summary Generation + +`_SUMMARY_USER` prompt changes to request JSON output: + +``` +Write a summary page for this document in Markdown. + +Return a JSON object with two keys: +- "brief": A single sentence (under 100 chars) describing the document's main contribution +- "content": The full summary in Markdown. Include key concepts, findings, and [[wikilinks]] + +Return ONLY valid JSON, no fences. +``` + +LLM returns: +```json +{ + "brief": "Introduces the Transformer architecture based entirely on self-attention", + "content": "# Attention Is All You Need\n\nThis paper proposes..." +} +``` + +The `brief` is: +- Written into summary frontmatter: `brief: Introduces the Transformer...` +- Passed to `_update_index` for the Documents section + +The `content` is written to `wiki/summaries/{name}.md` as before. + +#### Concept Generation (create) + +`_CONCEPT_PAGE_USER` prompt changes similarly: + +``` +Write the concept page for: {title} + +Return a JSON object with two keys: +- "brief": A single sentence (under 100 chars) defining this concept +- "content": The full concept page in Markdown with [[wikilinks]] + +Return ONLY valid JSON, no fences. +``` + +The `brief` is: +- Written into concept frontmatter: `brief: Mechanism allowing each position to attend to all others` +- Passed to `_update_index` for the Concepts section +- Used by `_read_concept_briefs` (read from frontmatter instead of truncating body text) + +#### Concept Generation (update) + +`_CONCEPT_UPDATE_USER` also returns `{"brief": "...", "content": "..."}`. The brief may change as the concept evolves with new information. + +#### Long Doc Summary (overview) + +Long documents do NOT need the LLM to generate a brief. 
The brief comes directly from PageIndex's `doc_description` field (available via `IndexResult.description`), which is already a document-level summary generated during indexing. `_LONG_DOC_SUMMARY_USER` stays unchanged (returns plain markdown overview, not JSON) — the brief is passed through from the indexer. + +In `compile_long_doc`, the `doc_description` is passed to `_compile_concepts` which forwards it to `_update_index` as the doc brief. + +#### Parsing + +All LLM responses go through `_parse_json`. Callers extract `brief` and `content`: + +```python +parsed = _parse_json(raw) +brief = parsed.get("brief", "") +content = parsed.get("content", raw) # fallback: treat raw as content if not JSON +``` + +The fallback ensures backward compatibility if the LLM returns plain text instead of JSON. + +### Part 2: index.md with Brief Summaries + +`_update_index` signature changes: + +```python +def _update_index(wiki_dir, doc_name, concept_names, doc_brief="", concept_briefs=None): +``` + +Output format: + +```markdown +## Documents +- [[summaries/attention-is-all-you-need]] — Introduces the Transformer architecture based on self-attention +- [[summaries/flash-attention]] — Efficient attention algorithm reducing memory from quadratic to linear + +## Concepts +- [[concepts/self-attention]] — Mechanism allowing each position to attend to all others in a sequence +- [[concepts/transformer]] — Neural network architecture based entirely on attention mechanisms +``` + +When updating an existing entry (re-compile), the brief is updated in place. + +### Part 3: Frontmatter with Brief + +Summary and concept pages get a `brief` field in frontmatter: + +```markdown +--- +sources: [paper.pdf] +brief: Introduces the Transformer architecture based on self-attention +--- + +# Attention Is All You Need +... +``` + +`_read_concept_briefs` is updated to read from `brief:` frontmatter field instead of truncating body text. 
Fallback to body truncation if `brief:` is absent (backward compat with existing pages). + +### Part 4: Long Doc Sources from Markdown to JSON + +Store per-page content as JSON instead of a giant markdown file. + +**Current**: +``` +wiki/sources/paper.md ← rendered markdown, 10K-50K tokens +``` + +**New**: +``` +wiki/sources/paper.json ← per-page JSON array +``` + +**JSON format** (only the `pages` array from PageIndex, not the full doc object): +```json +[ + { + "page": 1, + "content": "Full text of page 1...", + "images": [{"path": "images/paper/p1_img1.png", "width": 400, "height": 300}] + }, + { + "page": 2, + "content": "Full text of page 2..." + } +] +``` + +`images` field is optional. Image paths are relative to `wiki/sources/`. Short documents are not affected — they stay as `.md`. + +#### Indexer Changes + +In `indexer.py`, replace `render_source_md` + `_relocate_images` with: +1. `col.get_page_content(doc_id, "1-9999")` to get all pages +2. Relocate image paths in each page's `images` array +3. Write as JSON to `wiki/sources/{name}.json` + +### Part 5: New Tool `get_page_content` + +Add to `openkb/agent/tools.py`: + +```python +def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str: + """Get text content of specific pages from a long document. + + Args: + doc_name: Document name (e.g. 'attention-is-all-you-need'). + pages: Page specification (e.g. '3-5,7,10-12'). + wiki_root: Absolute path to the wiki root directory. + """ +``` + +Implementation: +1. Read `wiki/sources/{doc_name}.json` +2. Parse `pages` spec into a set of page numbers (comma-separated, ranges with `-`) +3. Filter pages, format as `[Page N]\n{content}\n\n` +4. Return concatenated text, or error if file not found + +### Part 6: Query Agent Changes + +**Remove**: `pageindex_retrieve` tool and `_pageindex_retrieve_impl` entirely. + +**Add**: `get_page_content` tool. + +**Update instructions**: +``` +## Search strategy +1. 
Read index.md to understand what documents and concepts are available. + Each entry has a brief summary to help you judge relevance. +2. Read relevant summary pages (summaries/) for document overviews. +3. Read concept pages (concepts/) for cross-document synthesis. +4. For long documents, use get_page_content(doc_name, pages) to read + specific pages. The summary page shows chapter structure with page + ranges to help you decide which pages to read. +5. Synthesise a clear, well-cited answer. +``` + +**Remove**: the `openkb_dir` parameter from `build_query_agent` (`model` stays; `run_query` still passes it). + +### What Gets Removed + +- `_pageindex_retrieve_impl` (~110 lines) +- `pageindex_retrieve` tool +- `render_source_md` from `tree_renderer.py` +- `_relocate_images` in current form (replaced by per-page relocation) +- PageIndex imports in `query.py` + +### What Stays + +- `render_summary_md` — summaries still markdown +- Short doc pipeline — unchanged +- Image files in `wiki/sources/images/` +- PageIndex in `indexer.py` — still used for tree building + +## Compile Pipeline Changes Summary + +The compile pipeline (`_compile_concepts`, `compile_short_doc`, `compile_long_doc`) changes: + +1. **Summary step**: parse JSON response, extract `brief` + `content` +2. **Concept create/update steps**: parse JSON response, extract `brief` + `content` +3. **`_write_summary`**: add `brief` to frontmatter +4. **`_write_concept`**: add/update `brief` in frontmatter +5. **`_update_index`**: write `— {brief}` after each wikilink +6. 
**`_read_concept_briefs`**: read from `brief:` frontmatter field (fallback to body truncation) + +## Files Changed + +- `openkb/agent/compiler.py` — prompt templates return JSON with brief+content, parse responses, pass briefs to index/frontmatter +- `openkb/indexer.py` — sources output from md to json, image relocation per-page +- `openkb/agent/tools.py` — add `get_page_content` +- `openkb/agent/query.py` — remove `pageindex_retrieve`, add `get_page_content`, update instructions +- `openkb/tree_renderer.py` — remove `render_source_md` +- `openkb/schema.py` — update AGENTS_MD +- `tests/test_compiler.py` — update for JSON LLM responses +- `tests/test_indexer.py` — update for JSON output +- `tests/test_query.py` — update for new tool set +- `tests/test_agent_tools.py` — add tests for `get_page_content` + +## Not In Scope + +- Cloud PageIndex query support (removed entirely) +- Changes to the lint pipeline +- Interactive ingest From 39ae5c5fa8a0a3bbe5698574a5b62e48ff5d23f2 Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 9 Apr 2026 23:33:27 +0800 Subject: [PATCH 09/44] feat: add get_page_content tool and parse_pages helper Adds parse_pages() to expand page specs like "1-3,7" into sorted deduplicated int lists, and get_page_content() to read per-page JSON (sources/{doc}.json) and format output with optional image paths. Includes path-traversal guard consistent with existing tools. 
--- openkb/agent/tools.py | 81 ++++++++++++++++++++++++++++++++++++ tests/test_agent_tools.py | 87 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 167 insertions(+), 1 deletion(-) diff --git a/openkb/agent/tools.py b/openkb/agent/tools.py index 185344b..0d1164c 100644 --- a/openkb/agent/tools.py +++ b/openkb/agent/tools.py @@ -6,6 +6,7 @@ """ from __future__ import annotations +import json as _json from pathlib import Path @@ -52,6 +53,86 @@ def read_wiki_file(path: str, wiki_root: str) -> str: return full_path.read_text(encoding="utf-8") +def parse_pages(pages: str) -> list[int]: + """Parse a page specification string into a sorted, deduplicated list of page numbers. + + Args: + pages: Page spec such as ``"3-5,7,10-12"``. + + Returns: + Sorted list of positive page numbers, e.g. ``[3, 4, 5, 7, 10, 11, 12]``. + """ + result: set[int] = set() + for part in pages.split(","): + part = part.strip() + if "-" in part: + # Handle ranges like "3-5"; also handle negative numbers by only + # splitting on the first "-" that follows a digit. + segments = part.split("-") + # Re-join to handle leading negatives: segments[0] may be empty + # if part starts with "-". We just try to parse start/end. + try: + if len(segments) == 2: + start, end = int(segments[0]), int(segments[1]) + result.update(range(start, end + 1)) + elif len(segments) == 3 and segments[0] == "": + # e.g. "-1" split gives ['', '1'] + result.add(-int(segments[1])) + # More complex cases (e.g. negative range) are ignored. + except ValueError: + pass + else: + try: + result.add(int(part)) + except ValueError: + pass + return sorted(n for n in result if n > 0) + + +def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str: + """Return formatted content for specified pages of a document. 
+ + Reads ``{wiki_root}/sources/{doc_name}.json`` which must be a JSON array of + objects with at least ``{"page": int, "content": str}`` fields and an + optional ``"images"`` list of ``{"path": str, ...}`` objects. + + Args: + doc_name: Document name without extension (e.g. ``"paper"``). + pages: Page specification string (e.g. ``"1-3,7"``). + wiki_root: Absolute path to the wiki root directory. + + Returns: + Formatted page content, or an error message string. + """ + root = Path(wiki_root).resolve() + target = (root / "sources" / f"{doc_name}.json").resolve() + if not target.is_relative_to(root): + return "Access denied: path escapes wiki root." + if not target.exists(): + return f"File not found: sources/{doc_name}.json" + + data = _json.loads(target.read_text(encoding="utf-8")) + requested = set(parse_pages(pages)) + matches = [entry for entry in data if entry.get("page") in requested] + + if not matches: + return f"No content found for pages {pages} in {doc_name}." + + parts: list[str] = [] + for entry in matches: + page_num = entry["page"] + content = entry.get("content", "") + block = f"[Page {page_num}]\n{content}" + images = entry.get("images") + if images: + paths = ", ".join(img["path"] for img in images if "path" in img) + if paths: + block += f"\n[Images: {paths}]" + parts.append(block) + + return "\n\n".join(parts) + "\n\n" + + def write_wiki_file(path: str, content: str, wiki_root: str) -> str: """Write or overwrite a Markdown file in the wiki. 
diff --git a/tests/test_agent_tools.py b/tests/test_agent_tools.py index bfffc2f..3d95a88 100644 --- a/tests/test_agent_tools.py +++ b/tests/test_agent_tools.py @@ -5,7 +5,7 @@ import pytest -from openkb.agent.tools import list_wiki_files, read_wiki_file, write_wiki_file +from openkb.agent.tools import get_page_content, list_wiki_files, parse_pages, read_wiki_file, write_wiki_file # --------------------------------------------------------------------------- @@ -128,3 +128,88 @@ def test_returns_written_path(self, tmp_path): result = write_wiki_file("reports/health.md", "All good.", wiki_root) assert result == "Written: reports/health.md" + + +# --------------------------------------------------------------------------- +# parse_pages +# --------------------------------------------------------------------------- + + +class TestParsePages: + def test_single_page(self): + assert parse_pages("3") == [3] + + def test_range(self): + assert parse_pages("3-5") == [3, 4, 5] + + def test_comma_separated(self): + assert parse_pages("1,3,5") == [1, 3, 5] + + def test_mixed(self): + assert parse_pages("1-3,7,10-12") == [1, 2, 3, 7, 10, 11, 12] + + def test_deduplication(self): + assert parse_pages("3,3,3") == [3] + + def test_sorted(self): + assert parse_pages("5,1,3") == [1, 3, 5] + + def test_ignores_zero_and_negative(self): + assert parse_pages("0,-1,3") == [3] + + +# --------------------------------------------------------------------------- +# get_page_content +# --------------------------------------------------------------------------- + + +class TestGetPageContent: + def test_reads_pages_from_json(self, tmp_path): + import json + wiki_root = str(tmp_path) + sources = tmp_path / "sources" + sources.mkdir() + pages = [ + {"page": 1, "content": "Page one text."}, + {"page": 2, "content": "Page two text."}, + {"page": 3, "content": "Page three text."}, + ] + (sources / "paper.json").write_text(json.dumps(pages), encoding="utf-8") + result = get_page_content("paper", "1,3", 
wiki_root) + assert "[Page 1]" in result + assert "Page one text." in result + assert "[Page 3]" in result + assert "Page three text." in result + assert "Page two" not in result + + def test_returns_error_for_missing_file(self, tmp_path): + wiki_root = str(tmp_path) + (tmp_path / "sources").mkdir() + result = get_page_content("nonexistent", "1", wiki_root) + assert "not found" in result.lower() + + def test_returns_error_for_no_matching_pages(self, tmp_path): + import json + wiki_root = str(tmp_path) + sources = tmp_path / "sources" + sources.mkdir() + pages = [{"page": 1, "content": "Only page."}] + (sources / "paper.json").write_text(json.dumps(pages), encoding="utf-8") + result = get_page_content("paper", "99", wiki_root) + assert "no content" in result.lower() + + def test_includes_images_info(self, tmp_path): + import json + wiki_root = str(tmp_path) + sources = tmp_path / "sources" + sources.mkdir() + pages = [{"page": 1, "content": "Text.", "images": [{"path": "images/p/img.png", "width": 100, "height": 80}]}] + (sources / "doc.json").write_text(json.dumps(pages), encoding="utf-8") + result = get_page_content("doc", "1", wiki_root) + assert "img.png" in result + + def test_path_escape_denied(self, tmp_path): + wiki_root = str(tmp_path) + (tmp_path / "sources").mkdir() + result = get_page_content("../../etc/passwd", "1", wiki_root) + assert "denied" in result.lower() or "not found" in result.lower() From b6ce04e02267751c423c205a80543474568bf4f2 Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 9 Apr 2026 23:33:29 +0800 Subject: [PATCH 10/44] feat: update LLM prompts to return brief+content JSON Replace _SUMMARY_USER, _CONCEPT_PAGE_USER, and _CONCEPT_UPDATE_USER to request a JSON object with "brief" (one-line summary) and "content" (full Markdown). Add TestParseBriefContent to tests/test_compiler.py. 
--- openkb/agent/compiler.py | 28 +++++++++++++++++++--------- tests/test_compiler.py | 13 +++++++++++++ 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 5075278..947b0cc 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -43,11 +43,14 @@ Full text: {content} -Write a summary page for this document in Markdown. Include: -- Key concepts, findings, and ideas -- [[wikilinks]] to concepts that could become cross-document concept pages +Write a summary page for this document in Markdown. -Return ONLY the Markdown content (no frontmatter, no code fences). +Return a JSON object with two keys: +- "brief": A single sentence (under 100 chars) describing the document's main contribution +- "content": The full summary in Markdown. Include key concepts, findings, ideas, \ +and [[wikilinks]] to concepts that could become cross-document concept pages + +Return ONLY valid JSON, no fences. """ @@ -84,10 +87,13 @@ This concept relates to the document "{doc_name}" summarized above. {update_instruction} -Return ONLY the Markdown content (no frontmatter, no code fences). Include: -- Clear explanation of the concept -- Key details from the source document -- [[wikilinks]] to related concepts and [[summaries/{doc_name}]] +Return a JSON object with two keys: +- "brief": A single sentence (under 100 chars) defining this concept +- "content": The full concept page in Markdown. Include clear explanation, \ +key details from the source document, and [[wikilinks]] to related concepts \ +and [[summaries/{doc_name}]] + +Return ONLY valid JSON, no fences. """ _CONCEPT_UPDATE_USER = """\ @@ -101,7 +107,11 @@ information naturally — do not just append. Maintain existing \ [[wikilinks]] and add new ones where appropriate. -Return ONLY the Markdown content (no frontmatter, no code fences). 
+Return a JSON object with two keys: +- "brief": A single sentence (under 100 chars) defining this concept (may differ from before) +- "content": The rewritten full concept page in Markdown + +Return ONLY valid JSON, no fences. """ _LONG_DOC_SUMMARY_USER = """\ diff --git a/tests/test_compiler.py b/tests/test_compiler.py index e1238df..2d5f376 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -61,6 +61,19 @@ def test_fenced_dict(self): assert parsed["create"] == [] +class TestParseBriefContent: + def test_dict_with_brief_and_content(self): + text = json.dumps({"brief": "A short desc", "content": "# Full page\n\nDetails."}) + parsed = _parse_json(text) + assert parsed["brief"] == "A short desc" + assert "# Full page" in parsed["content"] + + def test_plain_text_fallback(self): + """If LLM returns plain text, _parse_json raises — caller handles fallback.""" + with pytest.raises((json.JSONDecodeError, ValueError)): + _parse_json("Just plain markdown text without JSON") + + class TestWriteSummary: def test_writes_with_frontmatter(self, tmp_path): wiki = tmp_path / "wiki" From a172c433ac23d9f5ebca38adba98813a8b7c3214 Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 9 Apr 2026 23:35:25 +0800 Subject: [PATCH 11/44] feat: store brief in frontmatter of summary and concept pages --- openkb/agent/compiler.py | 48 ++++++++++++++++++++++++++++++---------- tests/test_compiler.py | 42 ++++++++++++++++++++++++++++++++--- 2 files changed, 75 insertions(+), 15 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 947b0cc..62ab44f 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -243,8 +243,9 @@ def _read_wiki_context(wiki_dir: Path) -> tuple[str, list[str]]: def _read_concept_briefs(wiki_dir: Path) -> str: """Read existing concept pages and return compact one-line summaries. 
- For each concept, skips YAML frontmatter, takes the first 150 chars of the - body (newlines collapsed to spaces), and formats as ``- {slug}: {brief}``. + For each concept, reads the ``brief:`` field from YAML frontmatter if + present; otherwise falls back to truncating the first 150 chars of the body + (newlines collapsed to spaces). Formats each as ``- {slug}: {brief}``. Returns "(none yet)" if the concepts directory is missing or empty. """ @@ -259,16 +260,23 @@ def _read_concept_briefs(wiki_dir: Path) -> str: lines: list[str] = [] for path in md_files: text = path.read_text(encoding="utf-8") - # Strip YAML frontmatter if present + brief = "" + body = text if text.startswith("---"): end = text.find("---", 3) if end != -1: - text = text[end + 3:] - body = text.strip().replace("\n", " ") - brief = body[:150] - lines.append(f"- {path.stem}: {brief}") + fm = text[:end + 3] + body = text[end + 3:] + for line in fm.split("\n"): + if line.startswith("brief:"): + brief = line[len("brief:"):].strip() + break + if not brief: + brief = body.strip().replace("\n", " ")[:150] + if brief: + lines.append(f"- {path.stem}: {brief}") - return "\n".join(lines) + return "\n".join(lines) or "(none yet)" def _find_source_filename(doc_name: str, kb_dir: Path) -> str: @@ -281,11 +289,14 @@ def _find_source_filename(doc_name: str, kb_dir: Path) -> str: return f"{doc_name}.pdf" -def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str) -> None: +def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str, brief: str = "") -> None: """Write summary page with frontmatter.""" summaries_dir = wiki_dir / "summaries" summaries_dir.mkdir(parents=True, exist_ok=True) - frontmatter = f"---\nsources: [{source_file}]\n---\n\n" + fm_lines = [f"sources: [{source_file}]"] + if brief: + fm_lines.append(f"brief: {brief}") + frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n" (summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, 
encoding="utf-8") @@ -298,7 +309,7 @@ def _sanitize_concept_name(name: str) -> str: return sanitized or "unnamed-concept" -def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is_update: bool) -> None: +def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is_update: bool, brief: str = "") -> None: """Write or update a concept page, managing the sources frontmatter.""" concepts_dir = wiki_dir / "concepts" concepts_dir.mkdir(parents=True, exist_ok=True) @@ -324,9 +335,22 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is else: existing = f"---\nsources: [{source_file}]\n---\n\n" + existing existing += f"\n\n{content}" + if brief and existing.startswith("---"): + end = existing.find("---", 3) + if end != -1: + fm = existing[:end + 3] + body = existing[end + 3:] + if "brief:" in fm: + fm = re.sub(r"brief:.*", f"brief: {brief}", fm) + else: + fm = fm.replace("---\n", f"---\nbrief: {brief}\n", 1) + existing = fm + body path.write_text(existing, encoding="utf-8") else: - frontmatter = f"---\nsources: [{source_file}]\n---\n\n" + fm_lines = [f"sources: [{source_file}]"] + if brief: + fm_lines.append(f"brief: {brief}") + frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n" path.write_text(frontmatter + content, encoding="utf-8") diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 2d5f376..b64ce31 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -78,25 +78,61 @@ class TestWriteSummary: def test_writes_with_frontmatter(self, tmp_path): wiki = tmp_path / "wiki" wiki.mkdir() - _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.") + _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.", brief="Introduces transformers") path = wiki / "summaries" / "my-doc.md" assert path.exists() text = path.read_text() assert "sources: [my-doc.pdf]" in text + assert "brief: Introduces transformers" in text assert "# Summary" in text + 
def test_writes_without_brief(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.") + path = wiki / "summaries" / "my-doc.md" + text = path.read_text() + assert "sources: [my-doc.pdf]" in text + assert "brief:" not in text + class TestWriteConcept: - def test_new_concept(self, tmp_path): + def test_new_concept_with_brief(self, tmp_path): wiki = tmp_path / "wiki" wiki.mkdir() - _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False) + _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False, brief="Mechanism for selective focus") path = wiki / "concepts" / "attention.md" assert path.exists() text = path.read_text() assert "sources: [paper.pdf]" in text + assert "brief: Mechanism for selective focus" in text assert "# Attention" in text + def test_new_concept_without_brief(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False) + path = wiki / "concepts" / "attention.md" + assert path.exists() + text = path.read_text() + assert "sources: [paper.pdf]" in text + assert "brief:" not in text + + def test_update_concept_updates_brief(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper1.pdf]\nbrief: Old brief\n---\n\n# Attention\n\nOld content.", + encoding="utf-8", + ) + _write_concept(wiki, "attention", "New info.", "paper2.pdf", True, brief="Updated brief") + text = (concepts / "attention.md").read_text() + assert "paper2.pdf" in text + assert "paper1.pdf" in text + assert "brief: Updated brief" in text + assert "Old brief" not in text + def test_update_concept_appends_source(self, tmp_path): wiki = tmp_path / "wiki" concepts = wiki / "concepts" From ca2391297f2c16621ff5eeaab4503f9e151d233a Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 
9 Apr 2026 23:36:44 +0800 Subject: [PATCH 12/44] feat: add briefs to index.md entries and read from frontmatter --- openkb/agent/compiler.py | 29 ++++++++++++++++++----- tests/test_compiler.py | 50 ++++++++++++++++++++++++++++++++++------ 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 62ab44f..9a169b0 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -439,8 +439,19 @@ def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) path.write_text(text, encoding="utf-8") -def _update_index(wiki_dir: Path, doc_name: str, concept_names: list[str]) -> None: - """Append document and concept entries to index.md.""" +def _update_index( + wiki_dir: Path, doc_name: str, concept_names: list[str], + doc_brief: str = "", concept_briefs: dict[str, str] | None = None, +) -> None: + """Append document and concept entries to index.md. + + When ``doc_brief`` or entries in ``concept_briefs`` are provided, entries + are written as ``- [[link]] — brief text``. Existing entries are detected + by the link part only, so updating a brief on a re-compile works correctly. 
+ """ + if concept_briefs is None: + concept_briefs = {} + index_path = wiki_dir / "index.md" if not index_path.exists(): index_path.write_text( @@ -450,14 +461,20 @@ def _update_index(wiki_dir: Path, doc_name: str, concept_names: list[str]) -> No text = index_path.read_text(encoding="utf-8") - doc_entry = f"- [[summaries/{doc_name}]]" - if doc_entry not in text: + doc_link = f"[[summaries/{doc_name}]]" + if doc_link not in text: + doc_entry = f"- {doc_link}" + if doc_brief: + doc_entry += f" — {doc_brief}" if "## Documents" in text: text = text.replace("## Documents\n", f"## Documents\n{doc_entry}\n", 1) for name in concept_names: - concept_entry = f"- [[concepts/{name}]]" - if concept_entry not in text: + concept_link = f"[[concepts/{name}]]" + if concept_link not in text: + concept_entry = f"- {concept_link}" + if name in concept_briefs: + concept_entry += f" — {concept_briefs[name]}" if "## Concepts" in text: text = text.replace("## Concepts\n", f"## Concepts\n{concept_entry}\n", 1) diff --git a/tests/test_compiler.py b/tests/test_compiler.py index b64ce31..085116d 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -149,30 +149,44 @@ def test_update_concept_appends_source(self, tmp_path): class TestUpdateIndex: - def test_appends_entries(self, tmp_path): + def test_appends_entries_with_briefs(self, tmp_path): wiki = tmp_path / "wiki" wiki.mkdir() (wiki / "index.md").write_text( "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", encoding="utf-8", ) - _update_index(wiki, "my-doc", ["attention", "transformer"]) + _update_index(wiki, "my-doc", ["attention", "transformer"], + doc_brief="Introduces transformers", + concept_briefs={"attention": "Focus mechanism", "transformer": "NN architecture"}) text = (wiki / "index.md").read_text() - assert "[[summaries/my-doc]]" in text - assert "[[concepts/attention]]" in text - assert "[[concepts/transformer]]" in text + assert "[[summaries/my-doc]] — Introduces transformers" in text + assert 
"[[concepts/attention]] — Focus mechanism" in text + assert "[[concepts/transformer]] — NN architecture" in text def test_no_duplicates(self, tmp_path): wiki = tmp_path / "wiki" wiki.mkdir() (wiki / "index.md").write_text( - "# Index\n\n## Documents\n- [[summaries/my-doc]]\n\n## Concepts\n", + "# Index\n\n## Documents\n- [[summaries/my-doc]] — Old brief\n\n## Concepts\n", encoding="utf-8", ) - _update_index(wiki, "my-doc", []) + _update_index(wiki, "my-doc", [], doc_brief="New brief") text = (wiki / "index.md").read_text() assert text.count("[[summaries/my-doc]]") == 1 + def test_backwards_compat_no_briefs(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + _update_index(wiki, "my-doc", ["attention"]) + text = (wiki / "index.md").read_text() + assert "[[summaries/my-doc]]" in text + assert "[[concepts/attention]]" in text + class TestReadWikiContext: def test_empty_wiki(self, tmp_path): @@ -257,6 +271,28 @@ def test_sorted_alphabetically(self, tmp_path): slugs = [line.split(":")[0].lstrip("- ") for line in lines] assert slugs == ["apple", "mango", "zebra"] + def test_reads_brief_from_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper.pdf]\nbrief: Selective focus mechanism\n---\n\n# Attention\n\nLong content...", + encoding="utf-8", + ) + result = _read_concept_briefs(wiki) + assert "- attention: Selective focus mechanism" in result + + def test_falls_back_to_body_truncation(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "old.md").write_text( + "---\nsources: [paper.pdf]\n---\n\nOld concept without brief field.", + encoding="utf-8", + ) + result = _read_concept_briefs(wiki) + assert "- old: Old concept without brief field." 
in result + class TestBacklinkSummary: def test_adds_missing_concept_links(self, tmp_path): From 5b086a54835a2050f7997496b633d988864f4355 Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 9 Apr 2026 23:40:28 +0800 Subject: [PATCH 13/44] feat: wire brief+content JSON through compile pipeline to index and frontmatter --- openkb/agent/compiler.py | 53 ++++++++++++++++++------- openkb/cli.py | 3 +- tests/test_compiler.py | 86 ++++++++++++++++++++++++++++++++++++---- 3 files changed, 120 insertions(+), 22 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 9a169b0..b4b549e 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -497,6 +497,7 @@ async def _compile_concepts( summary: str, doc_name: str, max_concurrency: int, + doc_brief: str = "", ) -> None: """Shared Steps 2-4: concepts plan → generate/update → index. @@ -546,11 +547,11 @@ async def _compile_concepts( # --- Step 3: Generate/update concept pages concurrently (A cached) --- semaphore = asyncio.Semaphore(max_concurrency) - async def _gen_create(concept: dict) -> tuple[str, str, bool]: + async def _gen_create(concept: dict) -> tuple[str, str, bool, str]: name = concept["name"] title = concept.get("title", name) async with semaphore: - page_content = await _llm_call_async(model, [ + raw = await _llm_call_async(model, [ system_msg, doc_msg, {"role": "assistant", "content": summary}, @@ -559,9 +560,15 @@ async def _gen_create(concept: dict) -> tuple[str, str, bool]: update_instruction="", )}, ], f"concept:{name}") - return name, page_content, False - - async def _gen_update(concept: dict) -> tuple[str, str, bool]: + try: + parsed = _parse_json(raw) + brief = parsed.get("brief", "") + content = parsed.get("content", raw) + except (json.JSONDecodeError, ValueError): + brief, content = "", raw + return name, content, False, brief + + async def _gen_update(concept: dict) -> tuple[str, str, bool, str]: name = concept["name"] title = concept.get("title", name) 
concept_path = wiki_dir / "concepts" / f"{name}.md" @@ -575,7 +582,7 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool]: else: existing_content = "(page not found — create from scratch)" async with semaphore: - page_content = await _llm_call_async(model, [ + raw = await _llm_call_async(model, [ system_msg, doc_msg, {"role": "assistant", "content": summary}, @@ -584,13 +591,20 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool]: existing_content=existing_content, )}, ], f"update:{name}") - return name, page_content, True + try: + parsed = _parse_json(raw) + brief = parsed.get("brief", "") + content = parsed.get("content", raw) + except (json.JSONDecodeError, ValueError): + brief, content = "", raw + return name, content, True, brief tasks = [] tasks.extend(_gen_create(c) for c in create_items) tasks.extend(_gen_update(c) for c in update_items) concept_names: list[str] = [] + concept_briefs_map: dict[str, str] = {} if tasks: total = len(tasks) @@ -603,9 +617,11 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool]: if isinstance(r, Exception): logger.warning("Concept generation failed: %s", r) continue - name, page_content, is_update = r - _write_concept(wiki_dir, name, page_content, source_file, is_update) + name, page_content, is_update, brief = r + _write_concept(wiki_dir, name, page_content, source_file, is_update, brief=brief) concept_names.append(name) + if brief: + concept_briefs_map[name] = brief # --- Step 3b: Process related items (code only, no LLM) --- for slug in related_items: @@ -618,7 +634,8 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool]: _backlink_concepts(wiki_dir, doc_name, all_concept_slugs) # --- Step 4: Update index (code only) --- - _update_index(wiki_dir, doc_name, concept_names) + _update_index(wiki_dir, doc_name, concept_names, + doc_brief=doc_brief, concept_briefs=concept_briefs_map) async def compile_short_doc( @@ -653,13 +670,20 @@ async def compile_short_doc( )} # --- Step 1: 
Generate summary --- - summary = _llm_call(model, [system_msg, doc_msg], "summary") - _write_summary(wiki_dir, doc_name, source_file, summary) + summary_raw = _llm_call(model, [system_msg, doc_msg], "summary") + try: + summary_parsed = _parse_json(summary_raw) + doc_brief = summary_parsed.get("brief", "") + summary = summary_parsed.get("content", summary_raw) + except (json.JSONDecodeError, ValueError): + doc_brief = "" + summary = summary_raw + _write_summary(wiki_dir, doc_name, source_file, summary, brief=doc_brief) # --- Steps 2-4: Concept plan → generate/update → index --- await _compile_concepts( wiki_dir, kb_dir, model, system_msg, doc_msg, - summary, doc_name, max_concurrency, + summary, doc_name, max_concurrency, doc_brief=doc_brief, ) @@ -669,6 +693,7 @@ async def compile_long_doc( doc_id: str, kb_dir: Path, model: str, + doc_description: str = "", max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, ) -> None: """Compile a long (PageIndex) document's concepts and index. @@ -700,5 +725,5 @@ async def compile_long_doc( # --- Steps 2-4: Concept plan → generate/update → index --- await _compile_concepts( wiki_dir, kb_dir, model, system_msg, doc_msg, - overview, doc_name, max_concurrency, + overview, doc_name, max_concurrency, doc_brief=doc_description, ) diff --git a/openkb/cli.py b/openkb/cli.py index 149f391..d8ec0fd 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -132,7 +132,8 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: for attempt in range(2): try: asyncio.run( - compile_long_doc(doc_name, summary_path, index_result.doc_id, kb_dir, model) + compile_long_doc(doc_name, summary_path, index_result.doc_id, kb_dir, model, + doc_description=index_result.description) ) break except Exception as exc: diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 085116d..bbb6259 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -487,13 +487,19 @@ async def test_full_pipeline(self, tmp_path): (tmp_path / "raw").mkdir() 
(tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake") - summary_response = "# Summary\n\nThis document discusses transformers." + summary_response = json.dumps({ + "brief": "Discusses transformers", + "content": "# Summary\n\nThis document discusses transformers.", + }) concepts_list_response = json.dumps({ "create": [{"name": "transformer", "title": "Transformer"}], "update": [], "related": [], }) - concept_page_response = "# Transformer\n\nA neural network architecture." + concept_page_response = json.dumps({ + "brief": "NN architecture using self-attention", + "content": "# Transformer\n\nA neural network architecture.", + }) with patch("openkb.agent.compiler.litellm") as mock_litellm: mock_litellm.completion = MagicMock( @@ -534,7 +540,7 @@ async def test_handles_bad_json(self, tmp_path): with patch("openkb.agent.compiler.litellm") as mock_litellm: mock_litellm.completion = MagicMock( - side_effect=_mock_completion(["Summary text", "not valid json"]) + side_effect=_mock_completion(["Plain summary text", "not valid json"]) ) # Should not raise await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini") @@ -567,7 +573,10 @@ async def test_full_pipeline(self, tmp_path): "update": [], "related": [], }) - concept_page_response = "# Deep Learning\n\nA subfield of ML." + concept_page_response = json.dumps({ + "brief": "Subfield of ML using neural networks", + "content": "# Deep Learning\n\nA subfield of ML.", + }) with patch("openkb.agent.compiler.litellm") as mock_litellm: mock_litellm.completion = MagicMock( @@ -624,8 +633,14 @@ async def test_create_and_update_flow(self, tmp_path): "update": [{"name": "attention", "title": "Attention"}], "related": [], }) - create_page_response = "# Flash Attention\n\nAn efficient attention algorithm." - update_page_response = "# Attention\n\nUpdated content with new info." 
+ create_page_response = json.dumps({ + "brief": "Efficient attention algorithm", + "content": "# Flash Attention\n\nAn efficient attention algorithm.", + }) + update_page_response = json.dumps({ + "brief": "Updated attention mechanism", + "content": "# Attention\n\nUpdated content with new info.", + }) system_msg = {"role": "system", "content": "You are a wiki agent."} doc_msg = {"role": "user", "content": "Document about attention mechanisms."} @@ -720,7 +735,10 @@ async def test_fallback_list_format(self, tmp_path): plan_response = json.dumps([ {"name": "attention", "title": "Attention"}, ]) - concept_page_response = "# Attention\n\nA mechanism for focusing." + concept_page_response = json.dumps({ + "brief": "A mechanism for focusing", + "content": "# Attention\n\nA mechanism for focusing.", + }) system_msg = {"role": "system", "content": "You are a wiki agent."} doc_msg = {"role": "user", "content": "Document content."} @@ -744,3 +762,57 @@ async def test_fallback_list_format(self, tmp_path): att_text = att_path.read_text() assert "sources: [test-doc.pdf]" in att_text assert "Attention" in att_text + + +class TestBriefIntegration: + @pytest.mark.asyncio + async def test_short_doc_briefs_in_index_and_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / "summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + source_path = wiki / "sources" / "test-doc.md" + source_path.write_text("# Test Doc\n\nContent.", encoding="utf-8") + (tmp_path / ".openkb").mkdir() + (tmp_path / "raw").mkdir() + (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake") + + summary_resp = json.dumps({ + "brief": "A paper about transformers", + "content": "# Summary\n\nThis paper discusses transformers.", + }) + plan_resp = json.dumps({ + "create": [{"name": "transformer", "title": "Transformer"}], 
+ "update": [], + "related": [], + }) + concept_resp = json.dumps({ + "brief": "NN architecture using self-attention", + "content": "# Transformer\n\nA neural network architecture.", + }) + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_resp, plan_resp]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([concept_resp]) + ) + await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini") + + # Summary frontmatter has brief + summary_text = (wiki / "summaries" / "test-doc.md").read_text() + assert "brief: A paper about transformers" in summary_text + + # Concept frontmatter has brief + concept_text = (wiki / "concepts" / "transformer.md").read_text() + assert "brief: NN architecture using self-attention" in concept_text + + # Index has briefs + index_text = (wiki / "index.md").read_text() + assert "— A paper about transformers" in index_text + assert "— NN architecture using self-attention" in index_text From cc6215a23b93beaab0be3509c39d0eb4760ef519 Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 9 Apr 2026 23:42:39 +0800 Subject: [PATCH 14/44] feat: store long doc sources as per-page JSON, remove render_source_md Replace markdown source generation with per-page JSON from PageIndex get_page_content; remove render_source_md, _render_nodes_source, _relocate_images, and _IMG_REF_RE. Image relocation is now done inline per page. Update tests to assert .json output and mock get_page_content. 
--- openkb/indexer.py | 57 +++++++++++++++++------------------------ openkb/tree_renderer.py | 31 ---------------------- tests/test_indexer.py | 24 ++++++++++++----- 3 files changed, 41 insertions(+), 71 deletions(-) diff --git a/openkb/indexer.py b/openkb/indexer.py index 18aafc6..c8b81f3 100644 --- a/openkb/indexer.py +++ b/openkb/indexer.py @@ -1,8 +1,8 @@ """PageIndex indexer for long documents.""" from __future__ import annotations +import json as json_mod import logging -import re import shutil from dataclasses import dataclass from pathlib import Path @@ -12,12 +12,10 @@ from pageindex import IndexConfig, PageIndexClient from openkb.config import load_config -from openkb.tree_renderer import render_source_md, render_summary_md +from openkb.tree_renderer import render_summary_md logger = logging.getLogger(__name__) -_IMG_REF_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)") - @dataclass class IndexResult: @@ -28,31 +26,6 @@ class IndexResult: tree: dict -def _relocate_images(markdown: str, doc_stem: str, dest_images_dir: Path) -> str: - """Copy images from PageIndex internal paths to wiki/sources/images/ and rewrite refs. - - PageIndex stores images internally (e.g. .openkb/files/{collection}/{doc_id}/images/). - We copy them to dest_images_dir and rewrite paths to be relative to the .md file - (i.e. images/{doc_stem}/filename). 
- """ - dest_images_dir.mkdir(parents=True, exist_ok=True) - - def _replace(match: re.Match) -> str: - alt = match.group(1) - src_path_str = match.group(2) - src_path = Path(src_path_str) - if not src_path.exists(): - logger.warning("Image not found: %s", src_path) - return match.group(0) - filename = src_path.name - dest = dest_images_dir / filename - if not dest.exists(): - shutil.copy2(src_path, dest) - return f"![{alt}](images/{doc_stem}/{filename})" - - return _IMG_REF_RE.sub(_replace, markdown) - - def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult: """Index a long PDF document using PageIndex and write wiki pages.""" openkb_dir = kb_dir / ".openkb" @@ -100,14 +73,30 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult: "structure": structure, } - # Write wiki/sources/ — copy images from PageIndex internal location - # and rewrite paths to be relative to the .md file (images/{stem}/filename) + # Write wiki/sources/ — get per-page content from PageIndex and store as JSON sources_dir = kb_dir / "wiki" / "sources" sources_dir.mkdir(parents=True, exist_ok=True) dest_images_dir = sources_dir / "images" / pdf_path.stem - source_md = render_source_md(tree, doc_name, doc_id) - source_md = _relocate_images(source_md, pdf_path.stem, dest_images_dir) - (sources_dir / f"{pdf_path.stem}.md").write_text(source_md, encoding="utf-8") + + # Get per-page content from PageIndex + all_pages = col.get_page_content(doc_id, f"1-{doc.get('page_count', 9999)}") + + # Relocate image paths in each page + dest_images_dir.mkdir(parents=True, exist_ok=True) + for page in all_pages: + if "images" in page: + for img in page["images"]: + src_path = Path(img["path"]) + if src_path.exists(): + filename = src_path.name + dest = dest_images_dir / filename + if not dest.exists(): + shutil.copy2(src_path, dest) + img["path"] = f"images/{pdf_path.stem}/{filename}" + + (sources_dir / f"{pdf_path.stem}.json").write_text( + json_mod.dumps(all_pages, 
ensure_ascii=False, indent=2), encoding="utf-8", + ) # Write wiki/summaries/ (no images, just summaries) summaries_dir = kb_dir / "wiki" / "summaries" diff --git a/openkb/tree_renderer.py b/openkb/tree_renderer.py index f745b48..991434e 100644 --- a/openkb/tree_renderer.py +++ b/openkb/tree_renderer.py @@ -13,25 +13,6 @@ def _yaml_frontmatter(source_name: str, doc_id: str) -> str: ) -def _render_nodes_source(nodes: list[dict], depth: int) -> str: - """Recursively render nodes for the *source* view (text content).""" - lines: list[str] = [] - heading_prefix = "#" * min(depth, 6) - for node in nodes: - title = node.get("title", "") - start = node.get("start_index", "") - end = node.get("end_index", "") - text = node.get("text", "") - children = node.get("nodes", []) - - lines.append(f"{heading_prefix} {title} (pages {start}\u2013{end})\n") - if text: - lines.append(f"{text}\n") - if children: - lines.append(_render_nodes_source(children, depth + 1)) - - return "\n".join(lines) - def _render_nodes_summary(nodes: list[dict], depth: int) -> str: """Recursively render nodes for the *summary* view (summaries only).""" @@ -53,18 +34,6 @@ def _render_nodes_summary(nodes: list[dict], depth: int) -> str: return "\n".join(lines) -def render_source_md(tree: dict, source_name: str, doc_id: str) -> str: - """Render the full-text (source) Markdown page for a PageIndex tree. - - The page begins with YAML frontmatter, then recursively renders - every node as a heading with its ``(pages X–Y)`` range and full text. - Heading level equals tree depth (h1 at root), capped at h6. - """ - frontmatter = _yaml_frontmatter(source_name, doc_id) - structure = tree.get("structure", []) - body = _render_nodes_source(structure, depth=1) - return frontmatter + "\n" + body - def render_summary_md(tree: dict, source_name: str, doc_id: str) -> str: """Render the summary Markdown page for a PageIndex tree. 
diff --git a/tests/test_indexer.py b/tests/test_indexer.py index e35c969..0948d64 100644 --- a/tests/test_indexer.py +++ b/tests/test_indexer.py @@ -23,6 +23,9 @@ def _make_fake_collection(self, doc_id: str, sample_tree: dict): "doc_type": "pdf", "structure": sample_tree["structure"], } + + # get_page_content returns empty list by default (overridden per test as needed) + col.get_page_content.return_value = [] return col def test_returns_index_result(self, kb_dir, sample_tree, tmp_path): @@ -43,12 +46,19 @@ def test_returns_index_result(self, kb_dir, sample_tree, tmp_path): assert result.description == sample_tree["doc_description"] assert result.tree is not None - def test_source_page_written(self, kb_dir, sample_tree, tmp_path): + def test_source_page_written_as_json(self, kb_dir, sample_tree, tmp_path): + """Long doc source should be written as JSON, not markdown.""" + import json as json_mod doc_id = "abc-123" fake_col = self._make_fake_collection(doc_id, sample_tree) fake_client = MagicMock() fake_client.collection.return_value = fake_col + # Mock get_page_content to return page data + fake_col.get_page_content.return_value = [ + {"page": 1, "content": "Page one text."}, + {"page": 2, "content": "Page two text."}, + ] pdf_path = tmp_path / "sample.pdf" pdf_path.write_bytes(b"%PDF-1.4 fake") @@ -56,11 +66,13 @@ def test_source_page_written(self, kb_dir, sample_tree, tmp_path): with patch("openkb.indexer.PageIndexClient", return_value=fake_client): index_long_document(pdf_path, kb_dir) - source_file = kb_dir / "wiki" / "sources" / "sample.md" - assert source_file.exists() - content = source_file.read_text(encoding="utf-8") - assert "type: pageindex" in content - assert "Introduction" in content + json_file = kb_dir / "wiki" / "sources" / "sample.json" + assert json_file.exists() + assert not (kb_dir / "wiki" / "sources" / "sample.md").exists() + data = json_mod.loads(json_file.read_text()) + assert len(data) == 2 + assert data[0]["page"] == 1 + assert 
data[0]["content"] == "Page one text." def test_summary_page_written(self, kb_dir, sample_tree, tmp_path): doc_id = "abc-123" From 49afbdb508281e987c9a95422abd604500c9a3d3 Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 9 Apr 2026 23:43:09 +0800 Subject: [PATCH 15/44] feat: replace pageindex_retrieve with get_page_content, unify query for all docs Remove _pageindex_retrieve_impl and the pageindex_retrieve tool; add get_page_content_tool that uses the local JSON-based page store for all long documents. Update instructions and schema description accordingly. --- openkb/agent/query.py | 167 +++++------------------------------------- openkb/schema.py | 2 +- tests/test_query.py | 102 +++----------------------- 3 files changed, 32 insertions(+), 239 deletions(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 6a740fb..051d8e7 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -3,11 +3,7 @@ from pathlib import Path -import litellm from agents import Agent, Runner, function_tool -import os - -from pageindex import PageIndexClient from openkb.agent.tools import list_wiki_files, read_wiki_file from openkb.schema import SCHEMA_MD, get_agents_md @@ -18,11 +14,14 @@ {schema_md} ## Search strategy -1. Start by reading index.md to understand what documents and concepts are available. -2. Read relevant summary pages (summaries/) to get document overviews. +1. Read index.md to understand what documents and concepts are available. + Each entry has a brief summary to help you judge relevance. +2. Read relevant summary pages (summaries/) for document overviews. 3. Read concept pages (concepts/) for cross-document synthesis. -4. For long documents indexed with PageIndex, call pageindex_retrieve with the - document ID and the user's question to get detailed page-level content. +4. For long documents, use get_page_content(doc_name, pages) to read + specific pages when you need detailed content. 
The summary page + shows chapter structure with page ranges to help you decide which + pages to read. 5. Synthesise a clear, well-cited answer. Always ground your answer in the wiki content. If you cannot find relevant @@ -30,132 +29,8 @@ """ -def _pageindex_retrieve_impl(doc_id: str, question: str, openkb_dir: str, model: str) -> str: - """Retrieve relevant content from a long document via PageIndex. - - For cloud-indexed docs: delegates to col.query() directly. - For local docs: uses structure-based page selection + get_page_content. - """ - pageindex_api_key = os.environ.get("PAGEINDEX_API_KEY", "") - # Determine if this doc was cloud-indexed (cloud doc_ids have "pi-" prefix) - is_cloud_doc = doc_id.startswith("pi-") - - if is_cloud_doc: - # Cloud doc: use PageIndex streaming query (avoids timeout, shows progress) - import sys - import asyncio - import threading - - client = PageIndexClient(api_key=pageindex_api_key or None, model=model) - col = client.collection() - try: - stream = col.query(question, doc_ids=[doc_id], stream=True) - collected: list[str] = [] - done = threading.Event() - - async def _consume(): - try: - async for event in stream: - if event.type == "answer_delta": - sys.stdout.write(event.data) - sys.stdout.flush() - collected.append(event.data) - elif event.type == "tool_call": - name = event.data.get("name", "") - args = event.data.get("args", "") - sys.stdout.write(f"\n [PageIndex] {name}({args})\n") - sys.stdout.flush() - sys.stdout.write("\n") - sys.stdout.flush() - finally: - done.set() - - # Run streaming in a separate thread with its own event loop - def _run(): - loop = asyncio.new_event_loop() - loop.run_until_complete(_consume()) - loop.close() - - t = threading.Thread(target=_run, daemon=True) - t.start() - t.join(timeout=120) - return "".join(collected) if collected else "No answer from PageIndex." 
- except Exception as exc: - return f"Error querying cloud PageIndex: {exc}" - - # Local doc: use local PageIndex with structure-based retrieval - client = PageIndexClient(model=model, storage_path=openkb_dir) - col = client.collection() - - try: - structure = col.get_document_structure(doc_id) - except Exception as exc: - return f"Error retrieving document structure: {exc}" - - if not structure: - return "No structure found for document." - sections = [] - for idx, node in enumerate(structure): - title = node.get("title", f"Section {idx + 1}") - node_id = node.get("node_id", str(idx)) - summary = node.get("summary", "") - start = node.get("start_index", idx) - end = node.get("end_index", idx) - sections.append( - f"node_id={node_id} title='{title}' pages={start}-{end} summary='{summary}'" - ) - - sections_text = "\n".join(sections) - prompt = ( - f"Given the following document sections:\n{sections_text}\n\n" - f"Which page ranges are most relevant to this question: '{question}'?\n" - "Reply with a comma-separated list of page numbers or ranges (e.g. '1-3,7,10-12'). " - "Return ONLY the page specification, nothing else." - ) - - # 2. Ask LLM which pages are relevant - try: - response = litellm.completion( - model=model, - messages=[{"role": "user", "content": prompt}], - ) - page_spec = response.choices[0].message.content.strip() - except Exception as exc: - return f"Error selecting relevant pages: {exc}" - - if not page_spec: - return "Could not determine relevant pages." - - # 3. 
Fetch those pages - try: - pages = col.get_page_content(doc_id, page_spec) - except Exception as exc: - return f"Error fetching page content: {exc}" - - if not pages: - return f"No content found for pages: {page_spec}" - - parts = [] - for item in pages: - page_num = item.get("page_index", "?") - text = item.get("text", "") - parts.append(f"[Page {page_num}]\n{text}") - - return "\n\n".join(parts) - - -def build_query_agent(wiki_root: str, openkb_dir: str, model: str, language: str = "en") -> Agent: - """Build and return the Q&A agent. - - Args: - wiki_root: Absolute path to the wiki directory. - openkb_dir: Path to the .openkb/ state directory. - model: LLM model name. - language: Language code for wiki content (e.g. 'en', 'fr'). - - Returns: - Configured :class:`~agents.Agent` instance. - """ +def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent: + """Build and return the Q&A agent.""" schema_md = get_agents_md(Path(wiki_root)) instructions = _QUERY_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md) instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language." @@ -163,7 +38,6 @@ def build_query_agent(wiki_root: str, openkb_dir: str, model: str, language: str @function_tool def list_files(directory: str) -> str: """List all Markdown files in a wiki subdirectory. - Args: directory: Subdirectory path relative to wiki root (e.g. 'sources'). """ @@ -172,31 +46,29 @@ def list_files(directory: str) -> str: @function_tool def read_file(path: str) -> str: """Read a Markdown file from the wiki. - Args: path: File path relative to wiki root (e.g. 'summaries/paper.md'). """ return read_wiki_file(path, wiki_root) @function_tool - def pageindex_retrieve(doc_id: str, question: str) -> str: - """Retrieve relevant content from a long document via PageIndex. - - Use this when you need detailed content from a document that was - indexed with PageIndex (long documents). 
- + def get_page_content_tool(doc_name: str, pages: str) -> str: + """Get text content of specific pages from a long document. + Use this when you need detailed content from a document. The summary + page shows chapter structure with page ranges. Args: - doc_id: PageIndex document identifier (found in index.md). - question: The question you are trying to answer. + doc_name: Document name (e.g. 'attention-is-all-you-need'). + pages: Page specification (e.g. '3-5,7,10-12'). """ - return _pageindex_retrieve_impl(doc_id, question, openkb_dir, model) + from openkb.agent.tools import get_page_content + return get_page_content(doc_name, pages, wiki_root) from agents.model_settings import ModelSettings return Agent( name="wiki-query", instructions=instructions, - tools=[list_files, read_file, pageindex_retrieve], + tools=[list_files, read_file, get_page_content_tool], model=f"litellm/{model}", model_settings=ModelSettings(parallel_tool_calls=False), ) @@ -224,9 +96,8 @@ async def run_query(question: str, kb_dir: Path, model: str, stream: bool = Fals language: str = config.get("language", "en") wiki_root = str(kb_dir / "wiki") - openkb_path = str(openkb_dir) - agent = build_query_agent(wiki_root, openkb_path, model, language=language) + agent = build_query_agent(wiki_root, model, language=language) if not stream: result = await Runner.run(agent, question) diff --git a/openkb/schema.py b/openkb/schema.py index d0fc602..1911e86 100644 --- a/openkb/schema.py +++ b/openkb/schema.py @@ -6,7 +6,7 @@ # Wiki Schema ## Directory Structure -- sources/ — Full-text converted from raw documents. Do not modify directly. +- sources/ — Document content. Short docs as .md, long docs as .json (per-page). Do not modify directly. - sources/images/ — Extracted images from documents, referenced by sources. - summaries/ — One per source document. Summary of key content. - concepts/ — Cross-document topic synthesis. Created when a theme spans multiple documents. 
diff --git a/tests/test_query.py b/tests/test_query.py index 084fc9e..dc14779 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -6,119 +6,41 @@ import pytest -from openkb.agent.query import _pageindex_retrieve_impl, build_query_agent, run_query +from openkb.agent.query import build_query_agent, run_query from openkb.schema import SCHEMA_MD class TestBuildQueryAgent: def test_agent_name(self, tmp_path): - agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini") + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") assert agent.name == "wiki-query" def test_agent_has_three_tools(self, tmp_path): - agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini") + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") assert len(agent.tools) == 3 def test_agent_tool_names(self, tmp_path): - agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini") + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") names = {t.name for t in agent.tools} assert "list_files" in names assert "read_file" in names - assert "pageindex_retrieve" in names + assert "get_page_content_tool" in names + assert "pageindex_retrieve" not in names - def test_instructions_reference_registered_pageindex_tool(self, tmp_path): - agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini") - tool_names = {t.name for t in agent.tools} - assert "pageindex_retrieve" in agent.instructions - assert "pageindex_retrieve" in tool_names + def test_instructions_mention_get_page_content(self, tmp_path): + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") + assert "get_page_content" in agent.instructions + assert "pageindex_retrieve" not in agent.instructions def test_schema_in_instructions(self, tmp_path): - agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini") + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") assert SCHEMA_MD in agent.instructions def test_agent_model(self, tmp_path): - 
agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "my-model") + agent = build_query_agent(str(tmp_path), "my-model") assert agent.model == "litellm/my-model" -class TestPageindexRetrieve: - def test_returns_page_content(self, tmp_path): - mock_structure = [ - { - "node_id": "n1", - "title": "Introduction", - "start_index": 1, - "end_index": 5, - "summary": "Overview section", - } - ] - mock_pages = [ - {"page_index": 1, "text": "Introduction text here."}, - {"page_index": 2, "text": "More intro content."}, - ] - - mock_col = MagicMock() - mock_col.get_document_structure.return_value = mock_structure - mock_col.get_page_content.return_value = mock_pages - - mock_client = MagicMock() - mock_client.collection.return_value = mock_col - - with patch("openkb.agent.query.PageIndexClient", return_value=mock_client), \ - patch("openkb.agent.query.litellm.completion") as mock_llm, \ - patch.dict("os.environ", {"PAGEINDEX_API_KEY": ""}, clear=False): - mock_llm.return_value = MagicMock( - choices=[MagicMock(message=MagicMock(content="1-2"))] - ) - result = _pageindex_retrieve_impl("doc123", "What is the intro?", "/db", "gpt-4o-mini") - - assert "Introduction text here." in result - assert "More intro content." 
in result - - def test_cloud_doc_uses_streaming_query(self, tmp_path): - """Cloud doc (pi- prefix) delegates to col.query(stream=True).""" - from dataclasses import dataclass - from typing import Any - - @dataclass - class FakeEvent: - type: str - data: Any - - class FakeStream: - async def __aiter__(self): - yield FakeEvent(type="answer_delta", data="Cloud ") - yield FakeEvent(type="answer_delta", data="answer about MCP.") - - mock_stream = FakeStream() - - mock_col = MagicMock() - mock_col.query.return_value = mock_stream - - mock_client = MagicMock() - mock_client.collection.return_value = mock_col - - with patch("openkb.agent.query.PageIndexClient", return_value=mock_client): - result = _pageindex_retrieve_impl("pi-abc123", "What is MCP?", "/db", "gpt-4o-mini") - - assert "Cloud answer about MCP." in result - mock_col.query.assert_called_once_with("What is MCP?", doc_ids=["pi-abc123"], stream=True) - - def test_local_empty_structure_returns_error(self, tmp_path): - """Local doc with empty structure returns error.""" - mock_col = MagicMock() - mock_col.get_document_structure.return_value = [] - - mock_client = MagicMock() - mock_client.collection.return_value = mock_col - - with patch("openkb.agent.query.PageIndexClient", return_value=mock_client), \ - patch.dict("os.environ", {"PAGEINDEX_API_KEY": ""}, clear=False): - result = _pageindex_retrieve_impl("local-uuid-123", "What?", "/db", "gpt-4o-mini") - - assert "No structure found" in result - - class TestRunQuery: @pytest.mark.asyncio async def test_run_query_returns_final_output(self, tmp_path): From 8b75b7e3416a06fcb3c9ba39f06daebcd840e65a Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 9 Apr 2026 23:44:15 +0800 Subject: [PATCH 16/44] fix: remove tests for deleted render_source_md --- tests/test_tree_renderer.py | 119 +----------------------------------- 1 file changed, 1 insertion(+), 118 deletions(-) diff --git a/tests/test_tree_renderer.py b/tests/test_tree_renderer.py index 1d81b3b..d636d5b 100644 --- 
a/tests/test_tree_renderer.py +++ b/tests/test_tree_renderer.py @@ -3,124 +3,7 @@ import pytest -from openkb.tree_renderer import render_source_md, render_summary_md - - -# --------------------------------------------------------------------------- -# render_source_md -# --------------------------------------------------------------------------- - - -class TestRenderSourceMd: - def test_has_yaml_frontmatter(self, sample_tree): - output = render_source_md(sample_tree, "Sample Document", "doc-abc") - assert output.startswith("---\n") - assert "source: Sample Document" in output - assert "type: pageindex" in output - assert "doc_id: doc-abc" in output - assert "---\n" in output - - def test_top_level_nodes_are_h1(self, sample_tree): - output = render_source_md(sample_tree, "Sample Document", "doc-abc") - assert "# Introduction" in output - assert "# Conclusion" in output - - def test_nested_nodes_are_h2(self, sample_tree): - output = render_source_md(sample_tree, "Sample Document", "doc-abc") - assert "## Background" in output - assert "## Motivation" in output - - def test_page_range_included(self, sample_tree): - output = render_source_md(sample_tree, "Sample Document", "doc-abc") - assert "(pages 0–120)" in output # Introduction - assert "(pages 0–60)" in output # Background - assert "(pages 61–120)" in output # Motivation - assert "(pages 121–200)" in output # Conclusion - - def test_node_text_included(self, sample_tree): - output = render_source_md(sample_tree, "Sample Document", "doc-abc") - assert "This document introduces the core concepts of the system." in output - assert "Background information on the subject." 
in output - - def test_no_summary_in_source(self, sample_tree): - output = render_source_md(sample_tree, "Sample Document", "doc-abc") - # Source pages show text, not summaries - assert "Summary:" not in output - - def test_heading_depth_capped_at_6(self): - """Deeply nested nodes must not exceed h6.""" - deep_tree = { - "doc_name": "Deep", - "doc_description": "A deeply nested doc.", - "structure": [ - { - "title": "L1", - "start_index": 0, - "end_index": 10, - "text": "L1 text", - "summary": "L1 summary", - "nodes": [ - { - "title": "L2", - "start_index": 0, - "end_index": 5, - "text": "L2 text", - "summary": "L2 summary", - "nodes": [ - { - "title": "L3", - "start_index": 0, - "end_index": 3, - "text": "L3 text", - "summary": "L3 summary", - "nodes": [ - { - "title": "L4", - "start_index": 0, - "end_index": 1, - "text": "L4 text", - "summary": "L4 summary", - "nodes": [ - { - "title": "L5", - "start_index": 0, - "end_index": 1, - "text": "L5 text", - "summary": "L5 summary", - "nodes": [ - { - "title": "L6", - "start_index": 0, - "end_index": 1, - "text": "L6 text", - "summary": "L6 summary", - "nodes": [ - { - "title": "L7", - "start_index": 0, - "end_index": 1, - "text": "L7 text", - "summary": "L7 summary", - "nodes": [], - } - ], - } - ], - } - ], - } - ], - } - ], - } - ], - } - ], - } - output = render_source_md(deep_tree, "Deep", "doc-deep") - # L7 is at depth 7 — must render as h6, not h7 - assert "#######" not in output - assert "L7 text" in output +from openkb.tree_renderer import render_summary_md # --------------------------------------------------------------------------- From 36ae619cfb72c48b3954ae93e674c97ca97dde67 Mon Sep 17 00:00:00 2001 From: mountain Date: Thu, 9 Apr 2026 23:45:01 +0800 Subject: [PATCH 17/44] chore: remove dead references to render_source_md --- openkb/tree_renderer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/openkb/tree_renderer.py b/openkb/tree_renderer.py index 991434e..6770d7e 100644 --- 
a/openkb/tree_renderer.py +++ b/openkb/tree_renderer.py @@ -38,8 +38,7 @@ def _render_nodes_summary(nodes: list[dict], depth: int) -> str: def render_summary_md(tree: dict, source_name: str, doc_id: str) -> str: """Render the summary Markdown page for a PageIndex tree. - Identical structure to :func:`render_source_md` but replaces node text - with ``Summary: {summary}`` for each node. + Renders each node as a heading with page range and its summary text. """ frontmatter = _yaml_frontmatter(source_name, doc_id) structure = tree.get("structure", []) From 27a9e3a89e59d443dfc8487e94a3bd9770a24b11 Mon Sep 17 00:00:00 2001 From: mountain Date: Fri, 10 Apr 2026 00:19:20 +0800 Subject: [PATCH 18/44] fix: change default model to gpt-5.4-mini, fix page_count fallback in indexer - Default model changed from gpt-5.4 to gpt-5.4-mini - Indexer get_page_content no longer uses hardcoded 9999 fallback - Infers page_count from structure end_index when doc lacks page_count field - Added debug logging for doc keys and page_count diagnosis --- openkb/config.py | 32 +++++++++++++++++++++++++++++++- openkb/indexer.py | 18 ++++++++++++++++-- 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/openkb/config.py b/openkb/config.py index fbd7bca..b83e134 100644 --- a/openkb/config.py +++ b/openkb/config.py @@ -6,11 +6,14 @@ import yaml DEFAULT_CONFIG: dict[str, Any] = { - "model": "gpt-5.4", + "model": "gpt-5.4-mini", "language": "en", "pageindex_threshold": 20, } +GLOBAL_CONFIG_DIR = Path.home() / ".config" / "openkb" +GLOBAL_CONFIG_PATH = GLOBAL_CONFIG_DIR / "global.yaml" + def load_config(config_path: Path) -> dict[str, Any]: """Load YAML config from config_path, merged with DEFAULT_CONFIG. 
@@ -30,3 +33,30 @@ def save_config(config_path: Path, config: dict) -> None: config_path.parent.mkdir(parents=True, exist_ok=True) with config_path.open("w", encoding="utf-8") as fh: yaml.safe_dump(config, fh, allow_unicode=True, sort_keys=True) + + +def load_global_config() -> dict[str, Any]: + """Load the global config from ~/.config/openkb/global.yaml.""" + if GLOBAL_CONFIG_PATH.exists(): + with GLOBAL_CONFIG_PATH.open("r", encoding="utf-8") as fh: + return yaml.safe_load(fh) or {} + return {} + + +def save_global_config(config: dict[str, Any]) -> None: + """Save the global config to ~/.config/openkb/global.yaml.""" + GLOBAL_CONFIG_DIR.mkdir(parents=True, exist_ok=True) + with GLOBAL_CONFIG_PATH.open("w", encoding="utf-8") as fh: + yaml.safe_dump(config, fh, allow_unicode=True, sort_keys=True) + + +def register_kb(kb_path: Path) -> None: + """Register a KB path in the global config's known_kbs list.""" + gc = load_global_config() + known = gc.get("known_kbs", []) + resolved = str(kb_path.resolve()) + if resolved not in known: + known.append(resolved) + gc["known_kbs"] = known + gc["default_kb"] = resolved + save_global_config(gc) diff --git a/openkb/indexer.py b/openkb/indexer.py index c8b81f3..8cd6913 100644 --- a/openkb/indexer.py +++ b/openkb/indexer.py @@ -67,6 +67,10 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult: description: str = doc.get("doc_description", "") structure: list = doc.get("structure", []) + # Debug: print doc keys and page_count to diagnose get_page_content range + logger.info("Doc keys: %s", list(doc.keys())) + logger.info("page_count from doc: %s", doc.get("page_count", "NOT PRESENT")) + tree = { "doc_name": doc_name, "doc_description": description, @@ -78,8 +82,18 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult: sources_dir.mkdir(parents=True, exist_ok=True) dest_images_dir = sources_dir / "images" / pdf_path.stem - # Get per-page content from PageIndex - all_pages = 
col.get_page_content(doc_id, f"1-{doc.get('page_count', 9999)}") + # Get per-page content from PageIndex — use actual page count + page_count = doc.get("page_count") + if page_count is None: + # Fallback: count pages from structure's max end_index + max_page = 0 + for node in structure: + end = node.get("end_index", 0) + if end > max_page: + max_page = end + page_count = max_page if max_page > 0 else 100 + logger.info("page_count not in doc, inferred from structure: %d", page_count) + all_pages = col.get_page_content(doc_id, f"1-{page_count}") # Relocate image paths in each page dest_images_dir.mkdir(parents=True, exist_ok=True) From 0bc0b441497c4d32cfe1c427dc39c9e943874d51 Mon Sep 17 00:00:00 2001 From: mountain Date: Fri, 10 Apr 2026 00:40:09 +0800 Subject: [PATCH 19/44] feat: doc type in index.md, remove list_files from query agent, source backlink for short docs - index.md entries now show (short) or (pageindex) type marker - Query agent prompt updated: guides agent to read sources for detail - Removed list_files tool from query agent (index.md is sufficient) - Short doc summaries now have source_doc frontmatter linking to sources/ - Reverted list_wiki_files to only list .md files - Fixed tests for model name change and agent tool count --- openkb/agent/compiler.py | 26 ++++++++++++++++++++------ openkb/agent/query.py | 34 +++++++++++++--------------------- tests/test_compiler.py | 2 +- tests/test_config.py | 2 +- tests/test_query.py | 6 +++--- 5 files changed, 38 insertions(+), 32 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index b4b549e..f79e0f4 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -289,13 +289,20 @@ def _find_source_filename(doc_name: str, kb_dir: Path) -> str: return f"{doc_name}.pdf" -def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str, brief: str = "") -> None: - """Write summary page with frontmatter.""" +def _write_summary(wiki_dir: Path, doc_name: str, 
source_file: str, summary: str, + brief: str = "", doc_type: str = "short") -> None: + """Write summary page with frontmatter. + + For short docs, includes a ``source_doc`` field linking to the full + source text in ``sources/{doc_name}.md``. + """ summaries_dir = wiki_dir / "summaries" summaries_dir.mkdir(parents=True, exist_ok=True) fm_lines = [f"sources: [{source_file}]"] if brief: fm_lines.append(f"brief: {brief}") + if doc_type == "short": + fm_lines.append(f"source_doc: sources/{doc_name}.md") frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n" (summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8") @@ -442,12 +449,15 @@ def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) def _update_index( wiki_dir: Path, doc_name: str, concept_names: list[str], doc_brief: str = "", concept_briefs: dict[str, str] | None = None, + doc_type: str = "short", ) -> None: """Append document and concept entries to index.md. When ``doc_brief`` or entries in ``concept_briefs`` are provided, entries - are written as ``- [[link]] — brief text``. Existing entries are detected - by the link part only, so updating a brief on a re-compile works correctly. + are written as ``- [[link]] (type) — brief text``. Existing entries are + detected by the link part only, so updating a brief on a re-compile works. + ``doc_type`` is ``"short"`` or ``"pageindex"`` — shown in the entry so the + query agent knows how to access detailed content. """ if concept_briefs is None: concept_briefs = {} @@ -463,7 +473,7 @@ def _update_index( doc_link = f"[[summaries/{doc_name}]]" if doc_link not in text: - doc_entry = f"- {doc_link}" + doc_entry = f"- {doc_link} ({doc_type})" if doc_brief: doc_entry += f" — {doc_brief}" if "## Documents" in text: @@ -498,6 +508,7 @@ async def _compile_concepts( doc_name: str, max_concurrency: int, doc_brief: str = "", + doc_type: str = "short", ) -> None: """Shared Steps 2-4: concepts plan → generate/update → index. 
@@ -635,7 +646,8 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]: # --- Step 4: Update index (code only) --- _update_index(wiki_dir, doc_name, concept_names, - doc_brief=doc_brief, concept_briefs=concept_briefs_map) + doc_brief=doc_brief, concept_briefs=concept_briefs_map, + doc_type=doc_type) async def compile_short_doc( @@ -684,6 +696,7 @@ async def compile_short_doc( await _compile_concepts( wiki_dir, kb_dir, model, system_msg, doc_msg, summary, doc_name, max_concurrency, doc_brief=doc_brief, + doc_type="short", ) @@ -726,4 +739,5 @@ async def compile_long_doc( await _compile_concepts( wiki_dir, kb_dir, model, system_msg, doc_msg, overview, doc_name, max_concurrency, doc_brief=doc_description, + doc_type="pageindex", ) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 051d8e7..134901a 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -5,8 +5,8 @@ from agents import Agent, Runner, function_tool -from openkb.agent.tools import list_wiki_files, read_wiki_file -from openkb.schema import SCHEMA_MD, get_agents_md +from openkb.agent.tools import read_wiki_file +from openkb.schema import get_agents_md _QUERY_INSTRUCTIONS_TEMPLATE = """\ You are a knowledge-base Q&A agent. You answer questions by searching the wiki. @@ -14,18 +14,18 @@ {schema_md} ## Search strategy -1. Read index.md to understand what documents and concepts are available. - Each entry has a brief summary to help you judge relevance. +1. Read index.md to see all documents and concepts with brief summaries. + Each document is marked (short) or (pageindex) to indicate its type. 2. Read relevant summary pages (summaries/) for document overviews. 3. Read concept pages (concepts/) for cross-document synthesis. -4. For long documents, use get_page_content(doc_name, pages) to read - specific pages when you need detailed content. The summary page - shows chapter structure with page ranges to help you decide which - pages to read. -5. 
Synthesise a clear, well-cited answer. - -Always ground your answer in the wiki content. If you cannot find relevant -information, say so clearly. +4. When you need detailed source content: + - Short documents: read_file("sources/{{doc_name}}.md") for the full text. + - PageIndex documents: use get_page_content(doc_name, pages) to read + specific pages. The summary page shows chapter structure with page + ranges to help you decide which pages to read. +5. Synthesise a clear, well-cited answer grounded in wiki content. + +If you cannot find relevant information, say so clearly. """ @@ -35,14 +35,6 @@ def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent instructions = _QUERY_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md) instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language." - @function_tool - def list_files(directory: str) -> str: - """List all Markdown files in a wiki subdirectory. - Args: - directory: Subdirectory path relative to wiki root (e.g. 'sources'). - """ - return list_wiki_files(directory, wiki_root) - @function_tool def read_file(path: str) -> str: """Read a Markdown file from the wiki. 
@@ -68,7 +60,7 @@ def get_page_content_tool(doc_name: str, pages: str) -> str: return Agent( name="wiki-query", instructions=instructions, - tools=[list_files, read_file, get_page_content_tool], + tools=[read_file, get_page_content_tool], model=f"litellm/{model}", model_settings=ModelSettings(parallel_tool_calls=False), ) diff --git a/tests/test_compiler.py b/tests/test_compiler.py index bbb6259..b3746d1 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -160,7 +160,7 @@ def test_appends_entries_with_briefs(self, tmp_path): doc_brief="Introduces transformers", concept_briefs={"attention": "Focus mechanism", "transformer": "NN architecture"}) text = (wiki / "index.md").read_text() - assert "[[summaries/my-doc]] — Introduces transformers" in text + assert "[[summaries/my-doc]] (short) — Introduces transformers" in text assert "[[concepts/attention]] — Focus mechanism" in text assert "[[concepts/transformer]] — NN architecture" in text diff --git a/tests/test_config.py b/tests/test_config.py index 31bd0ab..495e075 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -10,7 +10,7 @@ def test_default_config_keys(): def test_default_config_values(): - assert DEFAULT_CONFIG["model"] == "gpt-5.4" + assert DEFAULT_CONFIG["model"] == "gpt-5.4-mini" assert DEFAULT_CONFIG["language"] == "en" assert DEFAULT_CONFIG["pageindex_threshold"] == 20 diff --git a/tests/test_query.py b/tests/test_query.py index dc14779..8be4cb9 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -15,16 +15,16 @@ def test_agent_name(self, tmp_path): agent = build_query_agent(str(tmp_path), "gpt-4o-mini") assert agent.name == "wiki-query" - def test_agent_has_three_tools(self, tmp_path): + def test_agent_has_two_tools(self, tmp_path): agent = build_query_agent(str(tmp_path), "gpt-4o-mini") - assert len(agent.tools) == 3 + assert len(agent.tools) == 2 def test_agent_tool_names(self, tmp_path): agent = build_query_agent(str(tmp_path), "gpt-4o-mini") names = {t.name for t 
in agent.tools} - assert "list_files" in names assert "read_file" in names assert "get_page_content_tool" in names + assert "list_files" not in names assert "pageindex_retrieve" not in names def test_instructions_mention_get_page_content(self, tmp_path): From 739c8eb5d507d7a23a71cb2c38cb1dcfe8328634 Mon Sep 17 00:00:00 2001 From: mountain Date: Fri, 10 Apr 2026 00:50:30 +0800 Subject: [PATCH 20/44] feat: warn when no LLM API key found instead of failing silently --- openkb/cli.py | 112 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 90 insertions(+), 22 deletions(-) diff --git a/openkb/cli.py b/openkb/cli.py index d8ec0fd..b14f45c 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -19,7 +19,7 @@ litellm.suppress_debug_info = True from dotenv import load_dotenv -from openkb.config import DEFAULT_CONFIG, load_config, save_config +from openkb.config import DEFAULT_CONFIG, load_config, save_config, load_global_config, register_kb from openkb.converter import convert_document from openkb.log import append_log from openkb.schema import AGENTS_MD @@ -30,8 +30,11 @@ def _setup_llm_key(kb_dir: Path | None = None) -> None: """Set LiteLLM API key from LLM_API_KEY env var if present. - If *kb_dir* is given, also loads ``.env`` from the KB root so that - the key is found even when the CLI is invoked from another directory. + Load order (override=False, so first one wins): + 1. System environment variables (already set) + 2. KB-local .env (kb_dir/.env) + 3. Global .env (~/.config/openkb/.env) + Also propagates to provider-specific env vars (OPENAI_API_KEY, etc.) so that the Agents SDK litellm provider can pick them up. 
""" @@ -40,8 +43,23 @@ def _setup_llm_key(kb_dir: Path | None = None) -> None: if env_file.exists(): load_dotenv(env_file, override=False) + from openkb.config import GLOBAL_CONFIG_DIR + global_env = GLOBAL_CONFIG_DIR / ".env" + if global_env.exists(): + load_dotenv(global_env, override=False) + api_key = os.environ.get("LLM_API_KEY", "") - if api_key: + if not api_key: + # Check if any provider key is already set + has_key = any(os.environ.get(k) for k in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GEMINI_API_KEY")) + if not has_key: + click.echo( + "Warning: No LLM API key found. Set one of:\n" + f" 1. {kb_dir / '.env' if kb_dir else '/.env'} — LLM_API_KEY=sk-...\n" + f" 2. {GLOBAL_CONFIG_DIR / '.env'} — LLM_API_KEY=sk-...\n" + " 3. Export LLM_API_KEY in your shell profile" + ) + else: litellm.api_key = api_key for env_var in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GEMINI_API_KEY"): if not os.environ.get(env_var): @@ -74,11 +92,29 @@ def _display_type(raw_type: str) -> str: # Helpers # --------------------------------------------------------------------------- -def _find_kb_dir() -> Path | None: - """Return the knowledge-base root if .openkb/ exists in cwd, else None.""" - candidate = Path(".openkb") - if candidate.exists() and candidate.is_dir(): - return Path(".") +def _find_kb_dir(override: Path | None = None) -> Path | None: + """Find the KB root: explicit override → walk up from cwd → global default_kb.""" + # 0. Explicit override (--kb-dir or OPENKB_DIR) + if override is not None: + if (override / ".openkb").is_dir(): + return override + return None + # 1. Walk up from cwd + current = Path.cwd().resolve() + while True: + if (current / ".openkb").is_dir(): + return current + parent = current.parent + if parent == current: + break + current = parent + # 2. 
Fall back to global config default_kb + gc = load_global_config() + default = gc.get("default_kb") + if default: + p = Path(default) + if (p / ".openkb").is_dir(): + return p return None @@ -174,7 +210,9 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: @click.group() @click.option("-v", "--verbose", is_flag=True, default=False, help="Enable verbose logging.") -def cli(verbose): +@click.option("--kb-dir", "kb_dir_override", default=None, type=click.Path(exists=True, file_okay=False, resolve_path=True), help="Path to a KB root directory (overrides auto-detection).") +@click.pass_context +def cli(ctx, verbose, kb_dir_override): """OpenKB — Karpathy's LLM Knowledge Base workflow, powered by PageIndex.""" logging.basicConfig( format="%(name)s %(levelname)s: %(message)s", @@ -182,6 +220,27 @@ def cli(verbose): ) if verbose: logging.getLogger("openkb").setLevel(logging.DEBUG) + ctx.ensure_object(dict) + if kb_dir_override: + ctx.obj["kb_dir_override"] = Path(kb_dir_override) + else: + env_kb = os.environ.get("OPENKB_DIR") + if env_kb: + ctx.obj["kb_dir_override"] = Path(env_kb).resolve() + else: + ctx.obj["kb_dir_override"] = None + + +@cli.command() +@click.argument("path", default=".") +def use(path): + """Set PATH as the default knowledge base.""" + target = Path(path).resolve() + if not (target / ".openkb").is_dir(): + click.echo(f"Not a knowledge base: {target}") + return + register_kb(target) + click.echo(f"Default KB set to: {target}") @cli.command() @@ -229,14 +288,18 @@ def init(): save_config(openkb_dir / "config.yaml", config) (openkb_dir / "hashes.json").write_text(json.dumps({}), encoding="utf-8") + # Register this KB in the global config + register_kb(Path.cwd()) + click.echo("Knowledge base initialised.") @cli.command() @click.argument("path") -def add(path): +@click.pass_context +def add(ctx, path): """Add a document or directory of documents at PATH to the knowledge base.""" - kb_dir = _find_kb_dir() + kb_dir = 
_find_kb_dir(ctx.obj.get("kb_dir_override")) if kb_dir is None: click.echo("No knowledge base found. Run `openkb init` first.") return @@ -272,9 +335,10 @@ def add(path): @cli.command() @click.argument("question") @click.option("--save", is_flag=True, default=False, help="Save the answer to wiki/explorations/.") -def query(question, save): +@click.pass_context +def query(ctx, question, save): """Query the knowledge base with QUESTION.""" - kb_dir = _find_kb_dir() + kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override")) if kb_dir is None: click.echo("No knowledge base found. Run `openkb init` first.") return @@ -307,9 +371,10 @@ def query(question, save): @cli.command() -def watch(): +@click.pass_context +def watch(ctx): """Watch the raw/ directory for new documents and process them automatically.""" - kb_dir = _find_kb_dir() + kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override")) if kb_dir is None: click.echo("No knowledge base found. Run `openkb init` first.") return @@ -336,11 +401,12 @@ def on_new_files(paths): @cli.command() @click.option("--fix", is_flag=True, default=False, help="Automatically fix lint issues (not yet implemented).") -def lint(fix): +@click.pass_context +def lint(ctx, fix): """Lint the knowledge base for structural and semantic inconsistencies.""" if fix: click.echo("Warning: --fix is not yet implemented. Running lint in report-only mode.") - kb_dir = _find_kb_dir() + kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override")) if kb_dir is None: click.echo("No knowledge base found. Run `openkb init` first.") return @@ -379,9 +445,10 @@ def lint(fix): @cli.command(name="list") -def list_cmd(): +@click.pass_context +def list_cmd(ctx): """List all documents in the knowledge base.""" - kb_dir = _find_kb_dir() + kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override")) if kb_dir is None: click.echo("No knowledge base found. 
Run `openkb init` first.") return @@ -439,9 +506,10 @@ def list_cmd(): @cli.command() -def status(): +@click.pass_context +def status(ctx): """Show the current status of the knowledge base.""" - kb_dir = _find_kb_dir() + kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override")) if kb_dir is None: click.echo("No knowledge base found. Run `openkb init` first.") return From be66e31281fdeae1aa4eede5c606eb22f70e7c81 Mon Sep 17 00:00:00 2001 From: mountain Date: Fri, 10 Apr 2026 00:51:05 +0800 Subject: [PATCH 21/44] fix: strengthen query agent instructions to always read source content --- openkb/agent/query.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 134901a..0904571 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -18,14 +18,17 @@ Each document is marked (short) or (pageindex) to indicate its type. 2. Read relevant summary pages (summaries/) for document overviews. 3. Read concept pages (concepts/) for cross-document synthesis. -4. When you need detailed source content: +4. **Always read source content before answering.** Summaries and concepts + are overviews — for accurate, detailed answers you MUST consult sources: - Short documents: read_file("sources/{{doc_name}}.md") for the full text. - PageIndex documents: use get_page_content(doc_name, pages) to read specific pages. The summary page shows chapter structure with page ranges to help you decide which pages to read. -5. Synthesise a clear, well-cited answer grounded in wiki content. +5. Synthesise a clear, well-cited answer grounded in source content. -If you cannot find relevant information, say so clearly. +IMPORTANT: Do NOT answer based on summaries alone. Always verify and enrich +your answer by reading the actual source content. If the question asks about +details, experiments, specific data, or quotes, reading the source is mandatory. 
""" From 7b3bc0ca0e5efc8c03a2bcaeb64c8fb1d688bbec Mon Sep 17 00:00:00 2001 From: mountain Date: Fri, 10 Apr 2026 00:53:12 +0800 Subject: [PATCH 22/44] Revert "fix: strengthen query agent instructions to always read source content" This reverts commit be66e31281fdeae1aa4eede5c606eb22f70e7c81. --- openkb/agent/query.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 0904571..134901a 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -18,17 +18,14 @@ Each document is marked (short) or (pageindex) to indicate its type. 2. Read relevant summary pages (summaries/) for document overviews. 3. Read concept pages (concepts/) for cross-document synthesis. -4. **Always read source content before answering.** Summaries and concepts - are overviews — for accurate, detailed answers you MUST consult sources: +4. When you need detailed source content: - Short documents: read_file("sources/{{doc_name}}.md") for the full text. - PageIndex documents: use get_page_content(doc_name, pages) to read specific pages. The summary page shows chapter structure with page ranges to help you decide which pages to read. -5. Synthesise a clear, well-cited answer grounded in source content. +5. Synthesise a clear, well-cited answer grounded in wiki content. -IMPORTANT: Do NOT answer based on summaries alone. Always verify and enrich -your answer by reading the actual source content. If the question asks about -details, experiments, specific data, or quotes, reading the source is mandatory. +If you cannot find relevant information, say so clearly. 
""" From 634b212fcdc43dd5a92969e1d68c714753259f4a Mon Sep 17 00:00:00 2001 From: mountain Date: Fri, 10 Apr 2026 00:53:50 +0800 Subject: [PATCH 23/44] fix: isolate tests from real KB directories via mocking --- tests/test_add_command.py | 5 +++-- tests/test_cli.py | 11 ++++++++--- tests/test_list_status.py | 6 ++++-- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/test_add_command.py b/tests/test_add_command.py index 0ad9397..ca97d26 100644 --- a/tests/test_add_command.py +++ b/tests/test_add_command.py @@ -37,8 +37,9 @@ def test_finds_openkb_dir(self, tmp_path, monkeypatch): def test_returns_none_if_no_openkb(self, tmp_path, monkeypatch): monkeypatch.chdir(tmp_path) - result = _find_kb_dir() - assert result is None + with patch("openkb.cli.load_global_config", return_value={}): + result = _find_kb_dir() + assert result is None class TestAddCommand: diff --git a/tests/test_cli.py b/tests/test_cli.py index 1ad10b3..22c27fc 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,4 +1,6 @@ import json +from unittest.mock import patch + import pytest from click.testing import CliRunner @@ -8,7 +10,8 @@ def test_init_creates_structure(tmp_path): runner = CliRunner() - with runner.isolated_filesystem(temp_dir=tmp_path): + with runner.isolated_filesystem(temp_dir=tmp_path), \ + patch("openkb.cli.register_kb"): result = runner.invoke(cli, ["init"]) assert result.exit_code == 0 @@ -42,7 +45,8 @@ def test_init_creates_structure(tmp_path): def test_init_schema_content(tmp_path): runner = CliRunner() - with runner.isolated_filesystem(temp_dir=tmp_path): + with runner.isolated_filesystem(temp_dir=tmp_path), \ + patch("openkb.cli.register_kb"): result = runner.invoke(cli, ["init"]) assert result.exit_code == 0 @@ -53,7 +57,8 @@ def test_init_schema_content(tmp_path): def test_init_already_exists(tmp_path): runner = CliRunner() - with runner.isolated_filesystem(temp_dir=tmp_path): + with runner.isolated_filesystem(temp_dir=tmp_path), \ + 
patch("openkb.cli.register_kb"): # First run should succeed result = runner.invoke(cli, ["init"]) assert result.exit_code == 0 diff --git a/tests/test_list_status.py b/tests/test_list_status.py index 0ef9f56..21b8de4 100644 --- a/tests/test_list_status.py +++ b/tests/test_list_status.py @@ -32,7 +32,8 @@ def _setup_kb(tmp_path: Path) -> Path: class TestListCommand: def test_list_no_kb(self, tmp_path): runner = CliRunner() - with runner.isolated_filesystem(temp_dir=tmp_path): + with runner.isolated_filesystem(temp_dir=tmp_path), \ + patch("openkb.cli._find_kb_dir", return_value=None): result = runner.invoke(cli, ["list"]) assert "No knowledge base found" in result.output @@ -91,7 +92,8 @@ def test_list_no_concepts_section_when_empty(self, tmp_path): class TestStatusCommand: def test_status_no_kb(self, tmp_path): runner = CliRunner() - with runner.isolated_filesystem(temp_dir=tmp_path): + with runner.isolated_filesystem(temp_dir=tmp_path), \ + patch("openkb.cli._find_kb_dir", return_value=None): result = runner.invoke(cli, ["status"]) assert "No knowledge base found" in result.output From 19ebfeda9fca3b29a442372c45aa74d3894ccd35 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 03:35:37 +0800 Subject: [PATCH 24/44] fix: suppress warnings and disable agents SDK tracing via API --- openkb/cli.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/openkb/cli.py b/openkb/cli.py index b14f45c..c43e588 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -9,11 +9,14 @@ import os -# Disable Agents SDK tracing (requires OPENAI_API_KEY otherwise) -os.environ.setdefault("OPENAI_AGENTS_DISABLE_TRACING", "1") +from agents import set_tracing_disabled +set_tracing_disabled(True) # Use local model cost map — skip fetching from GitHub on every invocation os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "True") +import warnings +warnings.filterwarnings("ignore") + import click import litellm litellm.suppress_debug_info = True From 
dde64d1d9baa68b11ec20a84beef0a8a4415660f Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 03:35:43 +0800 Subject: [PATCH 25/44] fix: add MAX_TURNS limit to agent Runner calls --- openkb/agent/linter.py | 4 +++- openkb/agent/query.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/openkb/agent/linter.py b/openkb/agent/linter.py index 5201949..fb81da7 100644 --- a/openkb/agent/linter.py +++ b/openkb/agent/linter.py @@ -6,6 +6,8 @@ from agents import Agent, Runner, function_tool from openkb.agent.tools import list_wiki_files, read_wiki_file + +MAX_TURNS = 50 from openkb.schema import SCHEMA_MD, get_agents_md _LINTER_INSTRUCTIONS_TEMPLATE = """\ @@ -102,5 +104,5 @@ async def run_knowledge_lint(kb_dir: Path, model: str) -> str: "Produce a structured Markdown report." ) - result = await Runner.run(agent, prompt) + result = await Runner.run(agent, prompt, max_turns=MAX_TURNS) return result.final_output or "Knowledge lint completed. No output produced." diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 134901a..c9a8986 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -6,6 +6,8 @@ from agents import Agent, Runner, function_tool from openkb.agent.tools import read_wiki_file + +MAX_TURNS = 50 from openkb.schema import get_agents_md _QUERY_INSTRUCTIONS_TEMPLATE = """\ @@ -92,10 +94,10 @@ async def run_query(question: str, kb_dir: Path, model: str, stream: bool = Fals agent = build_query_agent(wiki_root, model, language=language) if not stream: - result = await Runner.run(agent, question) + result = await Runner.run(agent, question, max_turns=MAX_TURNS) return result.final_output or "" - result = Runner.run_streamed(agent, question) + result = Runner.run_streamed(agent, question, max_turns=MAX_TURNS) collected = [] async for event in result.stream_events(): if isinstance(event, RawResponsesStreamEvent): From 63da1fe4d9aeeed5a2aba2697990d53755d32298 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 
04:24:34 +0800 Subject: [PATCH 26/44] refactor: unify summary frontmatter to doc_type + full_text Replace sources/brief/source_doc/doc_id/source fields with two consistent fields: doc_type (short|pageindex) and full_text pointing to the actual source content under sources/. --- openkb/agent/compiler.py | 22 +++++++++------------- openkb/schema.py | 2 +- openkb/tree_renderer.py | 5 ++--- tests/test_compiler.py | 19 ++++++++++--------- tests/test_indexer.py | 2 +- tests/test_tree_renderer.py | 5 ++--- 6 files changed, 25 insertions(+), 30 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index f79e0f4..d97b0eb 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -289,20 +289,16 @@ def _find_source_filename(doc_name: str, kb_dir: Path) -> str: return f"{doc_name}.pdf" -def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str, - brief: str = "", doc_type: str = "short") -> None: - """Write summary page with frontmatter. - - For short docs, includes a ``source_doc`` field linking to the full - source text in ``sources/{doc_name}.md``. 
- """ +def _write_summary(wiki_dir: Path, doc_name: str, summary: str, + doc_type: str = "short") -> None: + """Write summary page with frontmatter.""" summaries_dir = wiki_dir / "summaries" summaries_dir.mkdir(parents=True, exist_ok=True) - fm_lines = [f"sources: [{source_file}]"] - if brief: - fm_lines.append(f"brief: {brief}") - if doc_type == "short": - fm_lines.append(f"source_doc: sources/{doc_name}.md") + ext = "md" if doc_type == "short" else "json" + fm_lines = [ + f"doc_type: {doc_type}", + f"full_text: sources/{doc_name}.{ext}", + ] frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n" (summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8") @@ -690,7 +686,7 @@ async def compile_short_doc( except (json.JSONDecodeError, ValueError): doc_brief = "" summary = summary_raw - _write_summary(wiki_dir, doc_name, source_file, summary, brief=doc_brief) + _write_summary(wiki_dir, doc_name, summary) # --- Steps 2-4: Concept plan → generate/update → index --- await _compile_concepts( diff --git a/openkb/schema.py b/openkb/schema.py index 1911e86..9684733 100644 --- a/openkb/schema.py +++ b/openkb/schema.py @@ -35,7 +35,7 @@ ## Format - Use [[wikilink]] to link other wiki pages (e.g., [[concepts/attention]]) -- Summary pages header: `sources: [paper.pdf]` +- Summary pages header: `doc_type: short|pageindex` and `full_text: sources/{name}.md|.json` - Concept pages header: `sources: [paper1.pdf, paper2.pdf, ...]` - Standard Markdown heading hierarchy - Keep each page focused on a single topic diff --git a/openkb/tree_renderer.py b/openkb/tree_renderer.py index 6770d7e..efad980 100644 --- a/openkb/tree_renderer.py +++ b/openkb/tree_renderer.py @@ -6,9 +6,8 @@ def _yaml_frontmatter(source_name: str, doc_id: str) -> str: """Return a YAML frontmatter block for a PageIndex wiki page.""" return ( "---\n" - f"source: {source_name}\n" - "type: pageindex\n" - f"doc_id: {doc_id}\n" + "doc_type: pageindex\n" + f"full_text: 
sources/{source_name}.json\n" "---\n" ) diff --git a/tests/test_compiler.py b/tests/test_compiler.py index b3746d1..6b3ad0d 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -78,22 +78,22 @@ class TestWriteSummary: def test_writes_with_frontmatter(self, tmp_path): wiki = tmp_path / "wiki" wiki.mkdir() - _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.", brief="Introduces transformers") + _write_summary(wiki, "my-doc", "# Summary\n\nContent here.") path = wiki / "summaries" / "my-doc.md" assert path.exists() text = path.read_text() - assert "sources: [my-doc.pdf]" in text - assert "brief: Introduces transformers" in text + assert "doc_type: short" in text + assert "full_text: sources/my-doc.md" in text assert "# Summary" in text def test_writes_without_brief(self, tmp_path): wiki = tmp_path / "wiki" wiki.mkdir() - _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.") + _write_summary(wiki, "my-doc", "# Summary\n\nContent here.") path = wiki / "summaries" / "my-doc.md" text = path.read_text() - assert "sources: [my-doc.pdf]" in text - assert "brief:" not in text + assert "doc_type: short" in text + assert "full_text: sources/my-doc.md" in text class TestWriteConcept: @@ -513,7 +513,7 @@ async def test_full_pipeline(self, tmp_path): # Verify summary written summary_path = wiki / "summaries" / "test-doc.md" assert summary_path.exists() - assert "sources: [test-doc.pdf]" in summary_path.read_text() + assert "full_text: sources/test-doc.md" in summary_path.read_text() # Verify concept written concept_path = wiki / "concepts" / "transformer.md" @@ -804,9 +804,10 @@ async def test_short_doc_briefs_in_index_and_frontmatter(self, tmp_path): ) await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini") - # Summary frontmatter has brief + # Summary frontmatter has doc_type and full_text summary_text = (wiki / "summaries" / "test-doc.md").read_text() - assert "brief: A paper about transformers" in 
summary_text + assert "doc_type: short" in summary_text + assert "full_text: sources/test-doc.md" in summary_text # Concept frontmatter has brief concept_text = (wiki / "concepts" / "transformer.md").read_text() diff --git a/tests/test_indexer.py b/tests/test_indexer.py index 0948d64..ee7909c 100644 --- a/tests/test_indexer.py +++ b/tests/test_indexer.py @@ -90,7 +90,7 @@ def test_summary_page_written(self, kb_dir, sample_tree, tmp_path): summary_file = kb_dir / "wiki" / "summaries" / "sample.md" assert summary_file.exists() content = summary_file.read_text(encoding="utf-8") - assert "type: pageindex" in content + assert "doc_type: pageindex" in content assert "Summary:" in content def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_path): diff --git a/tests/test_tree_renderer.py b/tests/test_tree_renderer.py index d636d5b..f20e174 100644 --- a/tests/test_tree_renderer.py +++ b/tests/test_tree_renderer.py @@ -15,9 +15,8 @@ class TestRenderSummaryMd: def test_has_yaml_frontmatter(self, sample_tree): output = render_summary_md(sample_tree, "Sample Document", "doc-abc") assert output.startswith("---\n") - assert "source: Sample Document" in output - assert "type: pageindex" in output - assert "doc_id: doc-abc" in output + assert "doc_type: pageindex" in output + assert "full_text: sources/Sample Document.json" in output def test_top_level_nodes_are_h1(self, sample_tree): output = render_summary_md(sample_tree, "Sample Document", "doc-abc") From 06e26cea7d950cb734f10d43b66a039914f95034 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 04:26:04 +0800 Subject: [PATCH 27/44] fix: concept sources link to summaries and strip duplicate frontmatter Concept pages now reference summaries/{doc}.md instead of raw PDF filenames. Also strips frontmatter from LLM content during concept updates to prevent duplicate YAML blocks. Removes unused _find_source_filename. 
--- openkb/agent/compiler.py | 20 ++++++++------------ openkb/schema.py | 2 +- tests/test_compiler.py | 10 +++++----- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index d97b0eb..df417ba 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -279,15 +279,6 @@ def _read_concept_briefs(wiki_dir: Path) -> str: return "\n".join(lines) or "(none yet)" -def _find_source_filename(doc_name: str, kb_dir: Path) -> str: - """Find the original filename in raw/ for a given doc stem.""" - raw_dir = kb_dir / "raw" - if raw_dir.exists(): - for f in raw_dir.iterdir(): - if f.stem == doc_name: - return f.name - return f"{doc_name}.pdf" - def _write_summary(wiki_dir: Path, doc_name: str, summary: str, doc_type: str = "short") -> None: @@ -337,7 +328,13 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is existing = fm + body else: existing = f"---\nsources: [{source_file}]\n---\n\n" + existing - existing += f"\n\n{content}" + # Strip frontmatter from LLM content to avoid duplicate blocks + clean = content + if clean.startswith("---"): + end = clean.find("---", 3) + if end != -1: + clean = clean[end + 3:].lstrip("\n") + existing += f"\n\n{clean}" if brief and existing.startswith("---"): end = existing.find("---", 3) if end != -1: @@ -511,7 +508,7 @@ async def _compile_concepts( Uses ``_CONCEPTS_PLAN_USER`` to get a plan with create/update/related actions, then executes each action type accordingly. 
""" - source_file = _find_source_filename(doc_name, kb_dir) + source_file = f"summaries/{doc_name}.md" # --- Step 2: Get concepts plan (A cached) --- concept_briefs = _read_concept_briefs(wiki_dir) @@ -666,7 +663,6 @@ async def compile_short_doc( wiki_dir = kb_dir / "wiki" schema_md = get_agents_md(wiki_dir) - source_file = _find_source_filename(doc_name, kb_dir) content = source_path.read_text(encoding="utf-8") # Base context A: system + document diff --git a/openkb/schema.py b/openkb/schema.py index 9684733..8642521 100644 --- a/openkb/schema.py +++ b/openkb/schema.py @@ -36,7 +36,7 @@ ## Format - Use [[wikilink]] to link other wiki pages (e.g., [[concepts/attention]]) - Summary pages header: `doc_type: short|pageindex` and `full_text: sources/{name}.md|.json` -- Concept pages header: `sources: [paper1.pdf, paper2.pdf, ...]` +- Concept pages header: `sources: [summaries/doc1.md, summaries/doc2.md, ...]` - Standard Markdown heading hierarchy - Keep each page focused on a single topic """ diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 6b3ad0d..a895b79 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -518,7 +518,7 @@ async def test_full_pipeline(self, tmp_path): # Verify concept written concept_path = wiki / "concepts" / "transformer.md" assert concept_path.exists() - assert "sources: [test-doc.pdf]" in concept_path.read_text() + assert "sources: [summaries/test-doc.md]" in concept_path.read_text() # Verify index updated index_text = (wiki / "index.md").read_text() @@ -678,14 +678,14 @@ async def ordered_acompletion(*args, **kwargs): fa_path = wiki / "concepts" / "flash-attention.md" assert fa_path.exists() fa_text = fa_path.read_text() - assert "sources: [test-doc.pdf]" in fa_text + assert "sources: [summaries/test-doc.md]" in fa_text assert "Flash Attention" in fa_text # Verify attention updated (is_update=True path in _write_concept) att_path = wiki / "concepts" / "attention.md" assert att_path.exists() att_text = 
att_path.read_text() - assert "test-doc.pdf" in att_text + assert "summaries/test-doc.md" in att_text assert "old-paper.pdf" in att_text # Verify index updated @@ -725,7 +725,7 @@ async def test_related_adds_link_no_llm(self, tmp_path): # Verify link added to transformer page transformer_text = (wiki / "concepts" / "transformer.md").read_text() assert "[[summaries/test-doc]]" in transformer_text - assert "test-doc.pdf" in transformer_text + assert "summaries/test-doc.md" in transformer_text @pytest.mark.asyncio async def test_fallback_list_format(self, tmp_path): @@ -760,7 +760,7 @@ async def test_fallback_list_format(self, tmp_path): att_path = wiki / "concepts" / "attention.md" assert att_path.exists() att_text = att_path.read_text() - assert "sources: [test-doc.pdf]" in att_text + assert "sources: [summaries/test-doc.md]" in att_text assert "Attention" in att_text From f38781e4c032638513ce508c687081181c21613d Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 04:30:01 +0800 Subject: [PATCH 28/44] fix: update query agent to use summary full_text field Add hint that summaries may omit details. Update search strategy to reference the full_text frontmatter field instead of hardcoded paths. --- openkb/agent/query.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index c9a8986..8bbc93b 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -19,12 +19,12 @@ 1. Read index.md to see all documents and concepts with brief summaries. Each document is marked (short) or (pageindex) to indicate its type. 2. Read relevant summary pages (summaries/) for document overviews. + Note: summaries may omit details. 3. Read concept pages (concepts/) for cross-document synthesis. -4. When you need detailed source content: - - Short documents: read_file("sources/{{doc_name}}.md") for the full text. - - PageIndex documents: use get_page_content(doc_name, pages) to read - specific pages. 
The summary page shows chapter structure with page - ranges to help you decide which pages to read. +4. When you need detailed source content, check the summary's full_text field: + - Short documents (full_text ends in .md): read_file(full_text) for the full text. + - PageIndex documents (full_text ends in .json): use get_page_content(doc_name, pages) + to read specific pages. The summary shows chapter structure with page ranges. 5. Synthesise a clear, well-cited answer grounded in wiki content. If you cannot find relevant information, say so clearly. From bebfbdb5e2989778d3afd1f66e4e8d13c5cd2dd0 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 04:30:01 +0800 Subject: [PATCH 29/44] fix: remove page marker comments from short doc source markdown --- openkb/images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openkb/images.py b/openkb/images.py index d72cec7..6916842 100644 --- a/openkb/images.py +++ b/openkb/images.py @@ -89,7 +89,7 @@ def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) -> for page_idx in range(len(doc)): page = doc[page_idx] page_num = page_idx + 1 - parts.append(f"\n\n\n") + parts.append("\n\n") for block in page.get_text("dict")["blocks"]: if block["type"] == 0: # text block From 4d34baf816f6aa5c26bbfa34e4c323cbe8ae920d Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 04:31:52 +0800 Subject: [PATCH 30/44] fix: rename chapter structure to document tree structure in query prompt --- openkb/agent/query.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 8bbc93b..3f12ddf 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -24,7 +24,7 @@ 4. When you need detailed source content, check the summary's full_text field: - Short documents (full_text ends in .md): read_file(full_text) for the full text. 
- PageIndex documents (full_text ends in .json): use get_page_content(doc_name, pages) - to read specific pages. The summary shows chapter structure with page ranges. + to read specific pages. The summary shows document tree structure with page ranges. 5. Synthesise a clear, well-cited answer grounded in wiki content. If you cannot find relevant information, say so clearly. @@ -49,7 +49,7 @@ def read_file(path: str) -> str: def get_page_content_tool(doc_name: str, pages: str) -> str: """Get text content of specific pages from a long document. Use this when you need detailed content from a document. The summary - page shows chapter structure with page ranges. + page shows document tree structure with page ranges. Args: doc_name: Document name (e.g. 'attention-is-all-you-need'). pages: Page specification (e.g. '3-5,7,10-12'). From 5f563eeafff7596d60341febbb0915b417cb1d51 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 04:35:41 +0800 Subject: [PATCH 31/44] fix: improve query agent prompt wording for source content --- openkb/agent/query.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 3f12ddf..234863a 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -21,9 +21,10 @@ 2. Read relevant summary pages (summaries/) for document overviews. Note: summaries may omit details. 3. Read concept pages (concepts/) for cross-document synthesis. -4. When you need detailed source content, check the summary's full_text field: - - Short documents (full_text ends in .md): read_file(full_text) for the full text. - - PageIndex documents (full_text ends in .json): use get_page_content(doc_name, pages) +4. When you need detailed source document content, each summary page has a + `full_text` frontmatter field with the path to the original document content: + - Short documents (doc_type: short): read_file with that path. 
+ - PageIndex documents (doc_type: pageindex): use get_page_content(doc_name, pages) to read specific pages. The summary shows document tree structure with page ranges. 5. Synthesise a clear, well-cited answer grounded in wiki content. From 0b07a8edb53fbe1a2cf67d103d0c9e22058df6d4 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 04:54:49 +0800 Subject: [PATCH 32/44] fix: move warning suppression after imports to avoid markitdown override --- openkb/cli.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/openkb/cli.py b/openkb/cli.py index c43e588..f5c271b 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -14,9 +14,6 @@ # Use local model cost map — skip fetching from GitHub on every invocation os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "True") -import warnings -warnings.filterwarnings("ignore") - import click import litellm litellm.suppress_debug_info = True @@ -27,6 +24,10 @@ from openkb.log import append_log from openkb.schema import AGENTS_MD +# Suppress warnings after all imports — markitdown overrides filters at import time +import warnings +warnings.filterwarnings("ignore") + load_dotenv() # load from cwd (covers running inside the KB dir) From 45c5b6ce0baf8351f910fd4119338257dd09846f Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 05:03:14 +0800 Subject: [PATCH 33/44] fix: add blank line between tool calls and before answer in query output --- openkb/agent/query.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 234863a..98a7f21 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -113,13 +113,10 @@ async def run_query(question: str, kb_dir: Path, model: str, stream: bool = Fals if item.type == "tool_call_item": raw = item.raw_item args = getattr(raw, "arguments", "{}") - sys.stdout.write(f"\n[tool call] {raw.name}({args})\n") + sys.stdout.write(f"[tool call] {raw.name}({args})\n\n") sys.stdout.flush() elif item.type 
== "tool_call_output_item": - output = str(item.output) - preview = output[:200] + "..." if len(output) > 200 else output - sys.stdout.write(f"[tool output] {preview}\n\n") - sys.stdout.flush() + pass sys.stdout.write("\n") sys.stdout.flush() return "".join(collected) if collected else result.final_output or "" From 0118d2d3d0d71790be6ab4271292ca7f206edd22 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 05:11:11 +0800 Subject: [PATCH 34/44] fix: add self-talk before tool calls and fix output formatting --- openkb/agent/query.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 98a7f21..3cbe6e0 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -28,6 +28,8 @@ to read specific pages. The summary shows document tree structure with page ranges. 5. Synthesise a clear, well-cited answer grounded in wiki content. +Before each tool call, briefly state what you are about to do. + If you cannot find relevant information, say so clearly. 
""" @@ -113,7 +115,7 @@ async def run_query(question: str, kb_dir: Path, model: str, stream: bool = Fals if item.type == "tool_call_item": raw = item.raw_item args = getattr(raw, "arguments", "{}") - sys.stdout.write(f"[tool call] {raw.name}({args})\n\n") + sys.stdout.write(f"\n[tool call] {raw.name}({args})\n\n") sys.stdout.flush() elif item.type == "tool_call_output_item": pass From 15f970d529219a4c507279fffde6eff16126637e Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 05:21:22 +0800 Subject: [PATCH 35/44] fix: add space after colon in concept/update step names --- openkb/agent/compiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index df417ba..64bc204 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -563,7 +563,7 @@ async def _gen_create(concept: dict) -> tuple[str, str, bool, str]: title=title, doc_name=doc_name, update_instruction="", )}, - ], f"concept:{name}") + ], f"concept: {name}") try: parsed = _parse_json(raw) brief = parsed.get("brief", "") @@ -594,7 +594,7 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]: title=title, doc_name=doc_name, existing_content=existing_content, )}, - ], f"update:{name}") + ], f"update: {name}") try: parsed = _parse_json(raw) brief = parsed.get("brief", "") From c8f96ebc56a2a13aab7825c196f4c5a4f05444a5 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 05:33:51 +0800 Subject: [PATCH 36/44] fix: prevent duplicate frontmatter in LLM-generated content Remove frontmatter format from schema to avoid LLM copying it. Add strip as fallback in _write_summary and _write_concept create path. 
--- openkb/agent/compiler.py | 8 ++++++++ openkb/schema.py | 3 +-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 64bc204..73b1a9c 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -283,6 +283,10 @@ def _read_concept_briefs(wiki_dir: Path) -> str: def _write_summary(wiki_dir: Path, doc_name: str, summary: str, doc_type: str = "short") -> None: """Write summary page with frontmatter.""" + if summary.startswith("---"): + end = summary.find("---", 3) + if end != -1: + summary = summary[end + 3:].lstrip("\n") summaries_dir = wiki_dir / "summaries" summaries_dir.mkdir(parents=True, exist_ok=True) ext = "md" if doc_type == "short" else "json" @@ -347,6 +351,10 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is existing = fm + body path.write_text(existing, encoding="utf-8") else: + if content.startswith("---"): + end = content.find("---", 3) + if end != -1: + content = content[end + 3:].lstrip("\n") fm_lines = [f"sources: [{source_file}]"] if brief: fm_lines.append(f"brief: {brief}") diff --git a/openkb/schema.py b/openkb/schema.py index 8642521..b2c8cf0 100644 --- a/openkb/schema.py +++ b/openkb/schema.py @@ -35,10 +35,9 @@ ## Format - Use [[wikilink]] to link other wiki pages (e.g., [[concepts/attention]]) -- Summary pages header: `doc_type: short|pageindex` and `full_text: sources/{name}.md|.json` -- Concept pages header: `sources: [summaries/doc1.md, summaries/doc2.md, ...]` - Standard Markdown heading hierarchy - Keep each page focused on a single topic +- Do not include YAML frontmatter (---) in generated content; it is managed by code """ # Backward compat alias From febc8c98bf7e64229604e839b1536a61ee22a945 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 05:41:10 +0800 Subject: [PATCH 37/44] fix: improve init prompts, prevent duplicate frontmatter, use American English --- openkb/agent/query.py | 8 +++++--- openkb/cli.py | 14 
++++++++++---- openkb/lint.py | 2 +- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 3cbe6e0..5f3fc77 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -25,10 +25,12 @@ `full_text` frontmatter field with the path to the original document content: - Short documents (doc_type: short): read_file with that path. - PageIndex documents (doc_type: pageindex): use get_page_content(doc_name, pages) - to read specific pages. The summary shows document tree structure with page ranges. -5. Synthesise a clear, well-cited answer grounded in wiki content. + with tight page ranges. The summary shows document tree structure with page + ranges to help you target. Never fetch the whole document. +5. Synthesize a clear, well-cited answer grounded in wiki content. -Before each tool call, briefly state what you are about to do. +Answer based only on wiki content. Before each tool call, briefly state what you +are about to do. Be concise. If you cannot find relevant information, say so clearly. """ diff --git a/openkb/cli.py b/openkb/cli.py index f5c271b..b29d5be 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -257,14 +257,20 @@ def init(): # Interactive prompts model = click.prompt( - "Model (e.g. gpt-5.4, anthropic/claude-sonnet-4-6, gemini/gemini-3.1-pro-preview)", + f"Model (e.g. 
gpt-5.4, anthropic/claude-sonnet-4-6) [default: {DEFAULT_CONFIG['model']}]", default=DEFAULT_CONFIG["model"], + show_default=False, + ) + language = click.prompt( + f"Language [default: {DEFAULT_CONFIG['language']}]", + default=DEFAULT_CONFIG["language"], + show_default=False, ) - language = click.prompt("Language", default=DEFAULT_CONFIG["language"]) pageindex_threshold = click.prompt( - "PageIndex threshold (pages)", + f"PageIndex threshold (pages) [default: {DEFAULT_CONFIG['pageindex_threshold']}]", default=DEFAULT_CONFIG["pageindex_threshold"], type=int, + show_default=False, ) # Create directory structure Path("raw").mkdir(exist_ok=True) @@ -295,7 +301,7 @@ def init(): # Register this KB in the global config register_kb(Path.cwd()) - click.echo("Knowledge base initialised.") + click.echo("Knowledge base initialized.") @cli.command() diff --git a/openkb/lint.py b/openkb/lint.py index c1c9105..78b22e5 100644 --- a/openkb/lint.py +++ b/openkb/lint.py @@ -29,7 +29,7 @@ def _read_md(path: Path) -> str: def _all_wiki_pages(wiki: Path) -> dict[str, Path]: """Return a mapping of stem/relative-path → absolute Path for all .md files. - Keys are normalised: 'concepts/attention', 'summaries/paper', 'index', etc. + Keys are normalized: 'concepts/attention', 'summaries/paper', 'index', etc. """ pages: dict[str, Path] = {} for md in wiki.rglob("*.md"): From 4938cd7d194d93b524d38c8ecc5f1f4b180715b6 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 05:54:12 +0800 Subject: [PATCH 38/44] fix: improve query agent tool descriptions and prompt clarity --- openkb/agent/query.py | 12 ++++++------ openkb/cli.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 5f3fc77..5c24dba 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -27,10 +27,10 @@ - PageIndex documents (doc_type: pageindex): use get_page_content(doc_name, pages) with tight page ranges. 
The summary shows document tree structure with page ranges to help you target. Never fetch the whole document. -5. Synthesize a clear, well-cited answer grounded in wiki content. +5. Synthesize a clear, concise, well-cited answer grounded in wiki content. -Answer based only on wiki content. Before each tool call, briefly state what you -are about to do. Be concise. +Answer based only on wiki content. Be concise. +Before each tool call, briefly state what you are about to do. If you cannot find relevant information, say so clearly. """ @@ -52,9 +52,9 @@ def read_file(path: str) -> str: @function_tool def get_page_content_tool(doc_name: str, pages: str) -> str: - """Get text content of specific pages from a long document. - Use this when you need detailed content from a document. The summary - page shows document tree structure with page ranges. + """Get text content of specific pages from a PageIndex (long) document. + Only use for documents with doc_type: pageindex. For short documents, + use read_file instead. Args: doc_name: Document name (e.g. 'attention-is-all-you-need'). pages: Page specification (e.g. '3-5,7,10-12'). diff --git a/openkb/cli.py b/openkb/cli.py index b29d5be..6495a85 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -257,7 +257,7 @@ def init(): # Interactive prompts model = click.prompt( - f"Model (e.g. gpt-5.4, anthropic/claude-sonnet-4-6) [default: {DEFAULT_CONFIG['model']}]", + f"Model (e.g. 
gpt-5.4-mini, anthropic/claude-sonnet-4-6) [default: {DEFAULT_CONFIG['model']}]", default=DEFAULT_CONFIG["model"], show_default=False, ) From 5a1f014f59ed5a4a129ad56238b17127157fac9f Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 06:23:34 +0800 Subject: [PATCH 39/44] fix: replace unicode ellipsis, fix image paths in pageindex content, remove empty dirs on init --- openkb/cli.py | 12 +++++------- openkb/indexer.py | 6 +++++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/openkb/cli.py b/openkb/cli.py index 6495a85..3683371 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -158,7 +158,7 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: # 3/4. Index and compile if result.is_long_doc: - click.echo(f" Long document detected — indexing with PageIndex…") + click.echo(f" Long document detected — indexing with PageIndex...") try: from openkb.indexer import index_long_document index_result = index_long_document(result.raw_path, kb_dir) @@ -168,7 +168,7 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: return summary_path = kb_dir / "wiki" / "summaries" / f"{doc_name}.md" - click.echo(f" Compiling long doc (doc_id={index_result.doc_id})…") + click.echo(f" Compiling long doc (doc_id={index_result.doc_id})...") for attempt in range(2): try: asyncio.run( @@ -185,7 +185,7 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: logger.debug("Compilation traceback:", exc_info=True) return else: - click.echo(f" Compiling short doc…") + click.echo(f" Compiling short doc...") for attempt in range(2): try: asyncio.run(compile_short_doc(doc_name, result.source_path, kb_dir, model)) @@ -277,8 +277,6 @@ def init(): Path("wiki/sources/images").mkdir(parents=True, exist_ok=True) Path("wiki/summaries").mkdir(parents=True, exist_ok=True) Path("wiki/concepts").mkdir(parents=True, exist_ok=True) - Path("wiki/explorations").mkdir(parents=True, exist_ok=True) - Path("wiki/reports").mkdir(parents=True, exist_ok=True) # Write wiki files 
Path("wiki/AGENTS.md").write_text(AGENTS_MD, encoding="utf-8") @@ -430,12 +428,12 @@ def lint(ctx, fix): model: str = config.get("model", DEFAULT_CONFIG["model"]) # Structural lint - click.echo("Running structural lint…") + click.echo("Running structural lint...") structural_report = run_structural_lint(kb_dir) click.echo(structural_report) # Knowledge lint (semantic) - click.echo("Running knowledge lint…") + click.echo("Running knowledge lint...") try: knowledge_report = asyncio.run(run_knowledge_lint(kb_dir, model)) except Exception as exc: diff --git a/openkb/indexer.py b/openkb/indexer.py index 8cd6913..78ebf36 100644 --- a/openkb/indexer.py +++ b/openkb/indexer.py @@ -106,7 +106,11 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult: dest = dest_images_dir / filename if not dest.exists(): shutil.copy2(src_path, dest) - img["path"] = f"images/{pdf_path.stem}/{filename}" + new_path = f"images/{pdf_path.stem}/{filename}" + # Also fix image references in page content + if "content" in page: + page["content"] = page["content"].replace(str(src_path), new_path) + img["path"] = new_path (sources_dir / f"{pdf_path.stem}.json").write_text( json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8", From ad05577f355af9351b5ff2b42176bb68d4fa9d45 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 07:46:34 +0800 Subject: [PATCH 40/44] refactor: use pymupdf for page content extraction, unify image paths Replace PageIndex get_page_content with pymupdf-based convert_pdf_to_pages for long doc JSON generation. All image paths now use sources/images/ prefix relative to wiki root. Removes dependency on PageIndex for source content. 
--- openkb/images.py | 67 ++++++++++++++++++++++++++++++++++++++++++----- openkb/indexer.py | 39 +++++---------------------- 2 files changed, 67 insertions(+), 39 deletions(-) diff --git a/openkb/images.py b/openkb/images.py index 6916842..7628414 100644 --- a/openkb/images.py +++ b/openkb/images.py @@ -67,11 +67,66 @@ def extract_pdf_images(pdf_path: Path, doc_name: str, images_dir: Path) -> dict[ logger.warning("Failed to save image block on page %d", page_num) continue - rel_path = f"images/{doc_name}/{filename}" + rel_path = f"sources/images/{doc_name}/{filename}" page_images.setdefault(page_num, []).append(rel_path) return page_images +def convert_pdf_to_pages(pdf_path: Path, doc_name: str, images_dir: Path) -> list[dict]: + """Convert a PDF to per-page dicts with text content and images. + + Each dict has ``{"page": int, "content": str, "images": [{"path": str}]}``. + Images are saved to *images_dir* and referenced with wiki-root-relative paths. + """ + images_dir.mkdir(parents=True, exist_ok=True) + pages: list[dict] = [] + img_counter = 0 + + with pymupdf.open(str(pdf_path)) as doc: + for page_idx in range(len(doc)): + page = doc[page_idx] + page_num = page_idx + 1 + parts: list[str] = [] + page_images: list[dict] = [] + + for block in page.get_text("dict")["blocks"]: + if block["type"] == 0: # text block + lines = [] + for line in block["lines"]: + spans_text = "".join(span["text"] for span in line["spans"]) + lines.append(spans_text) + parts.append("\n".join(lines)) + + elif block["type"] == 1: # image block + width = block.get("width", 0) + height = block.get("height", 0) + if width < _MIN_IMAGE_DIM or height < _MIN_IMAGE_DIM: + continue + image_bytes = block.get("image") + if not image_bytes: + continue + try: + pix = pymupdf.Pixmap(image_bytes) + if pix.n > 4: + pix = pymupdf.Pixmap(pymupdf.csRGB, pix) + img_counter += 1 + filename = f"p{page_num}_img{img_counter}.png" + (images_dir / filename).write_bytes(pix.tobytes("png")) + pix = None + img_path = 
f"sources/images/{doc_name}/{filename}" + parts.append(f"\n![image]({img_path})\n") + page_images.append({"path": img_path}) + except Exception: + logger.warning("Failed to save image block on page %d", page_num) + + pages.append({ + "page": page_num, + "content": "\n".join(parts), + "images": page_images, + }) + return pages + + def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) -> str: """Convert a PDF to markdown with inline images using pymupdf dict-mode. @@ -115,7 +170,7 @@ def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) -> filename = f"p{page_num}_img{img_counter}.png" (images_dir / filename).write_bytes(pix.tobytes("png")) pix = None - parts.append(f"\n![image](images/{doc_name}/{filename})\n") + parts.append(f"\n![image](sources/images/{doc_name}/{filename})\n") except Exception: logger.warning("Failed to save image block on page %d", page_num) return "\n".join(parts) @@ -126,7 +181,7 @@ def extract_base64_images(markdown: str, doc_name: str, images_dir: Path) -> str For each ``![alt](data:image/ext;base64,DATA)`` match: - Decode base64 bytes → save to ``images_dir/img_NNN.ext`` - - Replace the link with ``![alt](images/{doc_name}/img_NNN.ext)`` + - Replace the link with ``![alt](sources/images/{doc_name}/img_NNN.ext)`` - On decode failure: log a warning and leave the original text unchanged. 
""" counter = 0 @@ -150,7 +205,7 @@ def extract_base64_images(markdown: str, doc_name: str, images_dir: Path) -> str images_dir.mkdir(parents=True, exist_ok=True) dest.write_bytes(image_bytes) - new_ref = f"![{alt}](images/{doc_name}/{filename})" + new_ref = f"![{alt}](sources/images/{doc_name}/{filename})" result = result.replace(match.group(0), new_ref, 1) return result @@ -164,7 +219,7 @@ def copy_relative_images( For each ``![alt](relative/path)`` match (skipping http/https and data URIs): - Resolve path relative to ``source_dir`` - Copy to ``images_dir/{filename}`` - - Replace link with ``![alt](images/{doc_name}/{filename})`` + - Replace link with ``![alt](sources/images/{doc_name}/{filename})`` - Missing source file: log a warning and leave the original text unchanged. """ result = markdown @@ -186,7 +241,7 @@ def copy_relative_images( images_dir.mkdir(parents=True, exist_ok=True) shutil.copy2(src, dest) - new_ref = f"![{alt}](images/{doc_name}/{filename})" + new_ref = f"![{alt}](sources/images/{doc_name}/{filename})" result = result.replace(match.group(0), new_ref, 1) return result diff --git a/openkb/indexer.py b/openkb/indexer.py index 78ebf36..dd8ddaf 100644 --- a/openkb/indexer.py +++ b/openkb/indexer.py @@ -3,7 +3,7 @@ import json as json_mod import logging -import shutil + from dataclasses import dataclass from pathlib import Path @@ -77,40 +77,13 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult: "structure": structure, } - # Write wiki/sources/ — get per-page content from PageIndex and store as JSON + # Write wiki/sources/ — extract per-page content with pymupdf (not PageIndex) sources_dir = kb_dir / "wiki" / "sources" sources_dir.mkdir(parents=True, exist_ok=True) - dest_images_dir = sources_dir / "images" / pdf_path.stem - - # Get per-page content from PageIndex — use actual page count - page_count = doc.get("page_count") - if page_count is None: - # Fallback: count pages from structure's max end_index - max_page = 0 - for 
node in structure: - end = node.get("end_index", 0) - if end > max_page: - max_page = end - page_count = max_page if max_page > 0 else 100 - logger.info("page_count not in doc, inferred from structure: %d", page_count) - all_pages = col.get_page_content(doc_id, f"1-{page_count}") - - # Relocate image paths in each page - dest_images_dir.mkdir(parents=True, exist_ok=True) - for page in all_pages: - if "images" in page: - for img in page["images"]: - src_path = Path(img["path"]) - if src_path.exists(): - filename = src_path.name - dest = dest_images_dir / filename - if not dest.exists(): - shutil.copy2(src_path, dest) - new_path = f"images/{pdf_path.stem}/{filename}" - # Also fix image references in page content - if "content" in page: - page["content"] = page["content"].replace(str(src_path), new_path) - img["path"] = new_path + images_dir = sources_dir / "images" / pdf_path.stem + + from openkb.images import convert_pdf_to_pages + all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir) (sources_dir / f"{pdf_path.stem}.json").write_text( json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8", From 0340cb13557701b4c3da08e707f956976a4e0d09 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 07:46:42 +0800 Subject: [PATCH 41/44] feat: add multimodal get_image tool to query agent Query agent can now view images referenced in source documents via get_image tool, which returns ToolOutputImage for the LLM to inspect. Prompt updated to use images when questions involve figures or visuals. 
--- openkb/agent/query.py | 24 ++++++++++++++++++++---- openkb/agent/tools.py | 35 +++++++++++++++++++++++++++++++++++ tests/test_query.py | 7 +++---- 3 files changed, 58 insertions(+), 8 deletions(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 5c24dba..d252ee6 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -5,7 +5,8 @@ from agents import Agent, Runner, function_tool -from openkb.agent.tools import read_wiki_file +from agents import ToolOutputImage, ToolOutputText +from openkb.agent.tools import read_wiki_file, read_wiki_image MAX_TURNS = 50 from openkb.schema import get_agents_md @@ -27,10 +28,13 @@ - PageIndex documents (doc_type: pageindex): use get_page_content(doc_name, pages) with tight page ranges. The summary shows document tree structure with page ranges to help you target. Never fetch the whole document. -5. Synthesize a clear, concise, well-cited answer grounded in wiki content. +5. When source content references images (e.g. ![image](sources/images/doc/file.png)), + use get_image to view them. Always view images when the question asks about + a figure, chart, diagram, or visual content. +6. Synthesize a clear, concise, well-cited answer grounded in wiki content. Answer based only on wiki content. Be concise. -Before each tool call, briefly state what you are about to do. +Before each tool call, output one short sentence explaining the reason. If you cannot find relevant information, say so clearly. """ @@ -62,12 +66,24 @@ def get_page_content_tool(doc_name: str, pages: str) -> str: from openkb.agent.tools import get_page_content return get_page_content(doc_name, pages, wiki_root) + @function_tool + def get_image(image_path: str) -> ToolOutputImage | ToolOutputText: + """View an image from the wiki. + Use when source content references images you need to see. + Args: + image_path: Image path relative to wiki root (e.g. 'sources/images/doc/p1_img1.png'). 
+ """ + result = read_wiki_image(image_path, wiki_root) + if result["type"] == "image": + return ToolOutputImage(image_url=result["image_url"]) + return ToolOutputText(text=result["text"]) + from agents.model_settings import ModelSettings return Agent( name="wiki-query", instructions=instructions, - tools=[read_file, get_page_content_tool], + tools=[read_file, get_page_content_tool, get_image], model=f"litellm/{model}", model_settings=ModelSettings(parallel_tool_calls=False), ) diff --git a/openkb/agent/tools.py b/openkb/agent/tools.py index 0d1164c..2fe930b 100644 --- a/openkb/agent/tools.py +++ b/openkb/agent/tools.py @@ -133,6 +133,41 @@ def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str: return "\n\n".join(parts) + "\n\n" +_MIME_TYPES = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".webp": "image/webp", + ".bmp": "image/bmp", +} + + +def read_wiki_image(path: str, wiki_root: str) -> dict: + """Read an image file from the wiki and return as base64 data URL. + + Args: + path: Image path relative to *wiki_root* (e.g. ``"sources/images/doc/p1_img1.png"``). + wiki_root: Absolute path to the wiki root directory. + + Returns: + A dict with ``type``, ``image_url`` keys for ``ToolOutputImage``, + or a dict with ``type``, ``text`` keys on error. + """ + import base64 + + root = Path(wiki_root).resolve() + full_path = (root / path).resolve() + if not full_path.is_relative_to(root): + return {"type": "text", "text": "Access denied: path escapes wiki root."} + if not full_path.exists(): + return {"type": "text", "text": f"Image not found: {path}"} + + mime = _MIME_TYPES.get(full_path.suffix.lower(), "image/png") + b64 = base64.b64encode(full_path.read_bytes()).decode() + return {"type": "image", "image_url": f"data:{mime};base64,{b64}"} + + def write_wiki_file(path: str, content: str, wiki_root: str) -> str: """Write or overwrite a Markdown file in the wiki. 
diff --git a/tests/test_query.py b/tests/test_query.py index 8be4cb9..e00d2ea 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -15,17 +15,16 @@ def test_agent_name(self, tmp_path): agent = build_query_agent(str(tmp_path), "gpt-4o-mini") assert agent.name == "wiki-query" - def test_agent_has_two_tools(self, tmp_path): + def test_agent_has_three_tools(self, tmp_path): agent = build_query_agent(str(tmp_path), "gpt-4o-mini") - assert len(agent.tools) == 2 + assert len(agent.tools) == 3 def test_agent_tool_names(self, tmp_path): agent = build_query_agent(str(tmp_path), "gpt-4o-mini") names = {t.name for t in agent.tools} assert "read_file" in names assert "get_page_content_tool" in names - assert "list_files" not in names - assert "pageindex_retrieve" not in names + assert "get_image" in names def test_instructions_mention_get_page_content(self, tmp_path): agent = build_query_agent(str(tmp_path), "gpt-4o-mini") From 151b90e8a69dab7e4a25db30d14e217d4ea48f25 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 07:52:30 +0800 Subject: [PATCH 42/44] fix: update tests for image path changes and removed init dirs --- tests/test_cli.py | 2 -- tests/test_images.py | 14 +++++++------- tests/test_indexer.py | 18 ++++++++++++++---- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 22c27fc..afb961d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -23,13 +23,11 @@ def test_init_creates_structure(tmp_path): assert (cwd / "wiki" / "sources" / "images").is_dir() assert (cwd / "wiki" / "summaries").is_dir() assert (cwd / "wiki" / "concepts").is_dir() - assert (cwd / "wiki" / "reports").is_dir() assert (cwd / ".openkb").is_dir() # Files assert (cwd / "wiki" / "AGENTS.md").is_file() assert (cwd / "wiki" / "log.md").is_file() - assert (cwd / "wiki" / "explorations").is_dir() assert (cwd / "wiki" / "index.md").is_file() assert (cwd / ".openkb" / "config.yaml").is_file() assert (cwd / ".openkb" / 
"hashes.json").is_file() diff --git a/tests/test_images.py b/tests/test_images.py index 0b3be21..8bbc722 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -44,7 +44,7 @@ def test_single_base64_image_extracted(self, tmp_path): # Result should reference a saved file, not the raw base64 assert "data:image/png;base64," not in result - assert "![alt text](images/doc/img_001.png)" == result + assert "![alt text](sources/images/doc/img_001.png)" == result # File should exist on disk saved = images_dir / "img_001.png" @@ -62,8 +62,8 @@ def test_multiple_base64_images_numbered_sequentially(self, tmp_path): ) result = extract_base64_images(md, "doc", images_dir) - assert "![fig1](images/doc/img_001.png)" in result - assert "![fig2](images/doc/img_002.jpeg)" in result + assert "![fig1](sources/images/doc/img_001.png)" in result + assert "![fig2](sources/images/doc/img_002.jpeg)" in result assert (images_dir / "img_001.png").exists() assert (images_dir / "img_002.jpeg").exists() @@ -92,7 +92,7 @@ def test_mixed_valid_invalid_base64(self, tmp_path, caplog): import logging with caplog.at_level(logging.WARNING, logger="openkb.images"): result = extract_base64_images(md, "doc", images_dir) - assert "![good](images/doc/img_001.png)" in result + assert "![good](sources/images/doc/img_001.png)" in result assert f"data:image/png;base64,{bad}" in result @@ -114,7 +114,7 @@ def test_existing_relative_image_copied_and_rewritten(self, tmp_path): md = "![diagram](diagram.png)" result = copy_relative_images(md, source_dir, "doc", images_dir) - assert "![diagram](images/doc/diagram.png)" == result + assert "![diagram](sources/images/doc/diagram.png)" == result assert (images_dir / "diagram.png").read_bytes() == FAKE_PNG def test_missing_relative_image_leaves_original(self, tmp_path, caplog): @@ -163,7 +163,7 @@ def test_multiple_relative_images_all_copied(self, tmp_path): md = "![a](a.png)\n![b](b.jpg)" result = copy_relative_images(md, source_dir, "doc", images_dir) - assert 
"![a](images/doc/a.png)" in result - assert "![b](images/doc/b.jpg)" in result + assert "![a](sources/images/doc/a.png)" in result + assert "![b](sources/images/doc/b.jpg)" in result assert (images_dir / "a.png").exists() assert (images_dir / "b.jpg").exists() diff --git a/tests/test_indexer.py b/tests/test_indexer.py index ee7909c..3dbb677 100644 --- a/tests/test_indexer.py +++ b/tests/test_indexer.py @@ -28,6 +28,12 @@ def _make_fake_collection(self, doc_id: str, sample_tree: dict): col.get_page_content.return_value = [] return col + def _fake_pages(self): + return [ + {"page": 1, "content": "Page one text.", "images": []}, + {"page": 2, "content": "Page two text.", "images": []}, + ] + def test_returns_index_result(self, kb_dir, sample_tree, tmp_path): doc_id = "abc-123" fake_col = self._make_fake_collection(doc_id, sample_tree) @@ -38,7 +44,8 @@ def test_returns_index_result(self, kb_dir, sample_tree, tmp_path): pdf_path = tmp_path / "sample.pdf" pdf_path.write_bytes(b"%PDF-1.4 fake") - with patch("openkb.indexer.PageIndexClient", return_value=fake_client): + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ + patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()): result = index_long_document(pdf_path, kb_dir) assert isinstance(result, IndexResult) @@ -63,7 +70,8 @@ def test_source_page_written_as_json(self, kb_dir, sample_tree, tmp_path): pdf_path = tmp_path / "sample.pdf" pdf_path.write_bytes(b"%PDF-1.4 fake") - with patch("openkb.indexer.PageIndexClient", return_value=fake_client): + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ + patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()): index_long_document(pdf_path, kb_dir) json_file = kb_dir / "wiki" / "sources" / "sample.json" @@ -84,7 +92,8 @@ def test_summary_page_written(self, kb_dir, sample_tree, tmp_path): pdf_path = tmp_path / "sample.pdf" pdf_path.write_bytes(b"%PDF-1.4 fake") - with 
patch("openkb.indexer.PageIndexClient", return_value=fake_client): + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ + patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()): index_long_document(pdf_path, kb_dir) summary_file = kb_dir / "wiki" / "summaries" / "sample.md" @@ -104,7 +113,8 @@ def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_pat pdf_path = tmp_path / "report.pdf" pdf_path.write_bytes(b"%PDF-1.4 fake") - with patch("openkb.indexer.PageIndexClient", return_value=fake_client) as mock_cls: + with patch("openkb.indexer.PageIndexClient", return_value=fake_client) as mock_cls, \ + patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()): index_long_document(pdf_path, kb_dir) # Verify PageIndexClient was instantiated with correct IndexConfig From f383fbec6f68d40c1162fd4d37fc1fa31a1cf925 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 07:55:23 +0800 Subject: [PATCH 43/44] fix: mock _find_kb_dir in test_add_missing_init to isolate from real KB dirs --- tests/test_add_command.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_add_command.py b/tests/test_add_command.py index ca97d26..2ad22e7 100644 --- a/tests/test_add_command.py +++ b/tests/test_add_command.py @@ -58,7 +58,8 @@ def _setup_kb(self, tmp_path): def test_add_missing_init(self, tmp_path): runner = CliRunner() - with runner.isolated_filesystem(temp_dir=tmp_path): + with runner.isolated_filesystem(temp_dir=tmp_path), \ + patch("openkb.cli._find_kb_dir", return_value=None): result = runner.invoke(cli, ["add", "somefile.pdf"]) assert "No knowledge base found" in result.output From a1460b407252081bdfd50eedd24af7092c543f34 Mon Sep 17 00:00:00 2001 From: mountain Date: Sat, 11 Apr 2026 10:56:36 +0800 Subject: [PATCH 44/44] chore: remove docs/ directory from branch --- .../2026-04-09-concept-dedup-and-update.md | 888 ------------- .../plans/2026-04-09-retrieve-redesign.md | 
1104 ----------------- ...6-04-09-concept-dedup-and-update-design.md | 163 --- .../specs/2026-04-09-retrieve-redesign.md | 262 ---- 4 files changed, 2417 deletions(-) delete mode 100644 docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md delete mode 100644 docs/superpowers/plans/2026-04-09-retrieve-redesign.md delete mode 100644 docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md delete mode 100644 docs/superpowers/specs/2026-04-09-retrieve-redesign.md diff --git a/docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md b/docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md deleted file mode 100644 index 1a312a6..0000000 --- a/docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md +++ /dev/null @@ -1,888 +0,0 @@ -# Concept Dedup & Existing Page Update — Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Give the compiler enough context about existing concepts to make smart dedup/update decisions, and add the ability to rewrite existing concept pages with new information — all without breaking prompt caching. - -**Architecture:** Extend the deterministic pipeline in `compiler.py` with: (1) concept briefs read from disk before the concepts-plan LLM call, (2) a new JSON output format with create/update/related actions, (3) a new concurrent "update" path that sends existing page content to the LLM for rewriting, (4) a code-only "related" path for cross-ref links. Extract shared logic between `compile_short_doc` and `compile_long_doc` into `_compile_concepts`. 
- -**Tech Stack:** Python, litellm, asyncio, pytest - ---- - -### Task 1: Add `_read_concept_briefs` and test - -**Files:** -- Modify: `openkb/agent/compiler.py:199-207` (File I/O helpers section) -- Modify: `tests/test_compiler.py:98-116` (TestReadWikiContext section) - -- [ ] **Step 1: Write the failing test** - -Add to `tests/test_compiler.py`: - -```python -from openkb.agent.compiler import _read_concept_briefs - -class TestReadConceptBriefs: - def test_empty_wiki(self, tmp_path): - wiki = tmp_path / "wiki" - wiki.mkdir() - assert _read_concept_briefs(wiki) == "(none yet)" - - def test_no_concepts_dir(self, tmp_path): - wiki = tmp_path / "wiki" - wiki.mkdir() - assert _read_concept_briefs(wiki) == "(none yet)" - - def test_reads_briefs_with_frontmatter(self, tmp_path): - wiki = tmp_path / "wiki" - concepts = wiki / "concepts" - concepts.mkdir(parents=True) - (concepts / "attention.md").write_text( - "---\nsources: [paper.pdf]\n---\n\nAttention allows models to focus on relevant input parts selectively.", - encoding="utf-8", - ) - result = _read_concept_briefs(wiki) - assert "- attention: Attention allows models" in result - - def test_reads_briefs_without_frontmatter(self, tmp_path): - wiki = tmp_path / "wiki" - concepts = wiki / "concepts" - concepts.mkdir(parents=True) - (concepts / "rnn.md").write_text( - "Recurrent neural networks process sequences step by step.", - encoding="utf-8", - ) - result = _read_concept_briefs(wiki) - assert "- rnn: Recurrent neural networks" in result - - def test_truncates_long_content(self, tmp_path): - wiki = tmp_path / "wiki" - concepts = wiki / "concepts" - concepts.mkdir(parents=True) - (concepts / "long.md").write_text("A" * 300, encoding="utf-8") - result = _read_concept_briefs(wiki) - brief_line = result.split("\n")[0] - # slug + ": " + 150 chars = well under 200 - assert len(brief_line) < 200 - - def test_sorted_alphabetically(self, tmp_path): - wiki = tmp_path / "wiki" - concepts = wiki / "concepts" - 
concepts.mkdir(parents=True) - (concepts / "zebra.md").write_text("Zebra concept.", encoding="utf-8") - (concepts / "alpha.md").write_text("Alpha concept.", encoding="utf-8") - result = _read_concept_briefs(wiki) - lines = result.strip().split("\n") - assert lines[0].startswith("- alpha:") - assert lines[1].startswith("- zebra:") -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `pytest tests/test_compiler.py::TestReadConceptBriefs -v` -Expected: FAIL with `ImportError: cannot import name '_read_concept_briefs'` - -- [ ] **Step 3: Implement `_read_concept_briefs`** - -Add to `openkb/agent/compiler.py` in the File I/O helpers section (after `_read_wiki_context`): - -```python -def _read_concept_briefs(wiki_dir: Path) -> str: - """Read existing concept pages and return compact briefs for the LLM. - - Returns a string like: - - attention: Attention allows models to focus on relevant input parts... - - transformer: The Transformer is a neural network architecture... - - Or "(none yet)" if no concept pages exist. 
- """ - concepts_dir = wiki_dir / "concepts" - if not concepts_dir.exists(): - return "(none yet)" - briefs = [] - for p in sorted(concepts_dir.glob("*.md")): - text = p.read_text(encoding="utf-8") - # Skip YAML frontmatter - if text.startswith("---"): - parts = text.split("---", 2) - body = parts[2].strip() if len(parts) >= 3 else "" - else: - body = text.strip() - brief = body[:150].replace("\n", " ") - if brief: - briefs.append(f"- {p.stem}: {brief}") - return "\n".join(briefs) or "(none yet)" -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `pytest tests/test_compiler.py::TestReadConceptBriefs -v` -Expected: All 6 tests PASS - -- [ ] **Step 5: Update the import in test file** - -Add `_read_concept_briefs` to the existing import block at the top of `tests/test_compiler.py`: - -```python -from openkb.agent.compiler import ( - compile_long_doc, - compile_short_doc, - _parse_json, - _write_summary, - _write_concept, - _update_index, - _read_wiki_context, - _read_concept_briefs, -) -``` - -- [ ] **Step 6: Commit** - -```bash -git add openkb/agent/compiler.py tests/test_compiler.py -git commit -m "feat: add _read_concept_briefs for concept dedup context" -``` - ---- - -### Task 2: Replace prompt template and update JSON parsing - -**Files:** -- Modify: `openkb/agent/compiler.py:53-70` (prompt templates section) -- Modify: `tests/test_compiler.py:21-31` (TestParseJson section) - -- [ ] **Step 1: Write the failing test for new JSON format** - -Add to `tests/test_compiler.py`: - -```python -class TestParseConceptsPlan: - def test_dict_format(self): - text = json.dumps({ - "create": [{"name": "foo", "title": "Foo"}], - "update": [{"name": "bar", "title": "Bar"}], - "related": ["baz"], - }) - parsed = _parse_json(text) - assert isinstance(parsed, dict) - assert len(parsed["create"]) == 1 - assert len(parsed["update"]) == 1 - assert parsed["related"] == ["baz"] - - def test_fallback_list_format(self): - """If LLM returns old flat array, _parse_json still 
works.""" - text = json.dumps([{"name": "foo", "title": "Foo"}]) - parsed = _parse_json(text) - assert isinstance(parsed, list) - - def test_fenced_dict(self): - text = '```json\n{"create": [], "update": [], "related": []}\n```' - parsed = _parse_json(text) - assert isinstance(parsed, dict) - assert parsed["create"] == [] -``` - -- [ ] **Step 2: Run test to verify it passes (these use existing `_parse_json`)** - -Run: `pytest tests/test_compiler.py::TestParseConceptsPlan -v` -Expected: All 3 PASS — `_parse_json` already handles dicts. This confirms compatibility. - -- [ ] **Step 3: Replace `_CONCEPTS_LIST_USER` with `_CONCEPTS_PLAN_USER`** - -In `openkb/agent/compiler.py`, replace the `_CONCEPTS_LIST_USER` template (lines 53-70) with: - -```python -_CONCEPTS_PLAN_USER = """\ -Based on the summary above, decide how to update the wiki's concept pages. - -Existing concept pages: -{concept_briefs} - -Return a JSON object with three keys: - -1. "create" — new concepts not covered by any existing page. Array of objects: - {{"name": "concept-slug", "title": "Human-Readable Title"}} - -2. "update" — existing concepts that have significant new information from \ -this document worth integrating. Array of objects: - {{"name": "existing-slug", "title": "Existing Title"}} - -3. "related" — existing concepts tangentially related to this document but \ -not needing content changes, just a cross-reference link. Array of slug strings. - -Rules: -- For the first few documents, create 2-3 foundational concepts at most. -- Do NOT create a concept that overlaps with an existing one — use "update". -- Do NOT create concepts that are just the document topic itself. -- "related" is for lightweight cross-linking only, no content rewrite needed. - -Return ONLY valid JSON, no fences, no explanation. 
-""" -``` - -- [ ] **Step 4: Add `_CONCEPT_UPDATE_USER` template** - -Add after `_CONCEPT_PAGE_USER` (after line 82): - -```python -_CONCEPT_UPDATE_USER = """\ -Update the concept page for: {title} - -Current content of this page: -{existing_content} - -New information from document "{doc_name}" (summarized above) should be \ -integrated into this page. Rewrite the full page incorporating the new \ -information naturally — do not just append. Maintain existing \ -[[wikilinks]] and add new ones where appropriate. - -Return ONLY the Markdown content (no frontmatter, no code fences). -""" -``` - -- [ ] **Step 5: Run all existing tests to verify nothing breaks** - -Run: `pytest tests/test_compiler.py -v` -Expected: All PASS (templates aren't tested directly, only via integration tests which we'll update later) - -- [ ] **Step 6: Commit** - -```bash -git add openkb/agent/compiler.py tests/test_compiler.py -git commit -m "feat: add concepts plan and update prompt templates" -``` - ---- - -### Task 3: Add `_add_related_link` and test - -**Files:** -- Modify: `openkb/agent/compiler.py` (File I/O helpers section, after `_write_concept`) -- Modify: `tests/test_compiler.py` - -- [ ] **Step 1: Write the failing test** - -Add to `tests/test_compiler.py`: - -```python -from openkb.agent.compiler import _add_related_link - -class TestAddRelatedLink: - def test_adds_see_also_link(self, tmp_path): - wiki = tmp_path / "wiki" - concepts = wiki / "concepts" - concepts.mkdir(parents=True) - (concepts / "attention.md").write_text( - "---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nSome content.", - encoding="utf-8", - ) - _add_related_link(wiki, "attention", "new-doc", "paper2.pdf") - text = (concepts / "attention.md").read_text() - assert "[[summaries/new-doc]]" in text - assert "paper2.pdf" in text - - def test_skips_if_already_linked(self, tmp_path): - wiki = tmp_path / "wiki" - concepts = wiki / "concepts" - concepts.mkdir(parents=True) - (concepts / "attention.md").write_text( - 
"---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nSee also: [[summaries/new-doc]]", - encoding="utf-8", - ) - _add_related_link(wiki, "attention", "new-doc", "paper1.pdf") - text = (concepts / "attention.md").read_text() - # Should not duplicate - assert text.count("[[summaries/new-doc]]") == 1 - - def test_skips_if_file_missing(self, tmp_path): - wiki = tmp_path / "wiki" - wiki.mkdir() - # Should not raise - _add_related_link(wiki, "nonexistent", "doc", "file.pdf") -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `pytest tests/test_compiler.py::TestAddRelatedLink -v` -Expected: FAIL with `ImportError: cannot import name '_add_related_link'` - -- [ ] **Step 3: Implement `_add_related_link`** - -Add to `openkb/agent/compiler.py` after `_write_concept`: - -```python -def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_file: str) -> None: - """Add a cross-reference link to an existing concept page (no LLM call).""" - concepts_dir = wiki_dir / "concepts" - path = concepts_dir / f"{concept_slug}.md" - if not path.exists(): - return - - text = path.read_text(encoding="utf-8") - link = f"[[summaries/{doc_name}]]" - if link in text: - return - - # Update sources in frontmatter - if source_file not in text: - if text.startswith("---"): - end = text.index("---", 3) - fm = text[:end + 3] - body = text[end + 3:] - if "sources:" in fm: - fm = fm.replace("sources: [", f"sources: [{source_file}, ") - else: - fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1) - text = fm + body - else: - text = f"---\nsources: [{source_file}]\n---\n\n" + text - - text += f"\n\nSee also: {link}" - path.write_text(text, encoding="utf-8") -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `pytest tests/test_compiler.py::TestAddRelatedLink -v` -Expected: All 3 tests PASS - -- [ ] **Step 5: Update the import in test file** - -Add `_add_related_link` to the import block at top of `tests/test_compiler.py`. 
- -- [ ] **Step 6: Commit** - -```bash -git add openkb/agent/compiler.py tests/test_compiler.py -git commit -m "feat: add _add_related_link for code-only cross-referencing" -``` - ---- - -### Task 4: Extract `_compile_concepts` and refactor both public functions - -**Files:** -- Modify: `openkb/agent/compiler.py:290-509` (Public API section — full rewrite) -- Modify: `tests/test_compiler.py:153-267` (integration tests) - -This is the core task. It extracts the shared Steps 2-4 into `_compile_concepts`, updates both public functions to call it, and switches to the new concepts plan format. - -- [ ] **Step 1: Write integration test for new create/update/related flow** - -Add to `tests/test_compiler.py`: - -```python -class TestCompileConceptsPlan: - """Integration tests for the new create/update/related flow.""" - - @pytest.mark.asyncio - async def test_create_and_update_flow(self, tmp_path): - """New doc creates one concept and updates an existing one.""" - wiki = tmp_path / "wiki" - (wiki / "sources").mkdir(parents=True) - (wiki / "summaries").mkdir(parents=True) - concepts_dir = wiki / "concepts" - concepts_dir.mkdir(parents=True) - (wiki / "index.md").write_text( - "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", - encoding="utf-8", - ) - # Pre-existing concept - (concepts_dir / "attention.md").write_text( - "---\nsources: [old-paper.pdf]\n---\n\n# Attention\n\nOld content about attention.", - encoding="utf-8", - ) - - source_path = wiki / "sources" / "new-paper.md" - source_path.write_text("# New Paper\n\nContent about flash attention and transformers.", encoding="utf-8") - (tmp_path / ".openkb").mkdir() - (tmp_path / "raw").mkdir() - (tmp_path / "raw" / "new-paper.pdf").write_bytes(b"fake") - - summary_resp = "This paper introduces flash attention, improving on attention mechanisms." 
- plan_resp = json.dumps({ - "create": [{"name": "flash-attention", "title": "Flash Attention"}], - "update": [{"name": "attention", "title": "Attention Mechanism"}], - "related": [], - }) - create_page_resp = "# Flash Attention\n\nAn efficient attention algorithm." - update_page_resp = "# Attention\n\nUpdated content with flash attention details." - - with patch("openkb.agent.compiler.litellm") as mock_litellm: - mock_litellm.completion = MagicMock( - side_effect=_mock_completion([summary_resp, plan_resp]) - ) - mock_litellm.acompletion = AsyncMock( - side_effect=_mock_acompletion([create_page_resp, update_page_resp]) - ) - await compile_short_doc("new-paper", source_path, tmp_path, "gpt-4o-mini") - - # New concept created - flash_path = concepts_dir / "flash-attention.md" - assert flash_path.exists() - assert "sources: [new-paper.pdf]" in flash_path.read_text() - - # Existing concept rewritten (not appended) - attn_text = (concepts_dir / "attention.md").read_text() - assert "new-paper.pdf" in attn_text - assert "Updated content with flash attention details" in attn_text - - # Index updated for both - index_text = (wiki / "index.md").read_text() - assert "[[concepts/flash-attention]]" in index_text - - @pytest.mark.asyncio - async def test_related_adds_link_no_llm(self, tmp_path): - """Related concepts get cross-ref links without LLM calls.""" - wiki = tmp_path / "wiki" - (wiki / "sources").mkdir(parents=True) - (wiki / "summaries").mkdir(parents=True) - concepts_dir = wiki / "concepts" - concepts_dir.mkdir(parents=True) - (wiki / "index.md").write_text( - "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", - encoding="utf-8", - ) - (concepts_dir / "transformer.md").write_text( - "---\nsources: [old.pdf]\n---\n\n# Transformer\n\nArchitecture details.", - encoding="utf-8", - ) - - source_path = wiki / "sources" / "doc.md" - source_path.write_text("Content", encoding="utf-8") - (tmp_path / ".openkb").mkdir() - (tmp_path / "raw").mkdir() - (tmp_path / 
"raw" / "doc.pdf").write_bytes(b"fake") - - summary_resp = "A short summary." - plan_resp = json.dumps({ - "create": [], - "update": [], - "related": ["transformer"], - }) - - with patch("openkb.agent.compiler.litellm") as mock_litellm: - mock_litellm.completion = MagicMock( - side_effect=_mock_completion([summary_resp, plan_resp]) - ) - # acompletion should NOT be called (no create/update) - mock_litellm.acompletion = AsyncMock(side_effect=AssertionError("should not be called")) - await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini") - - # Related concept should have cross-ref link - transformer_text = (concepts_dir / "transformer.md").read_text() - assert "[[summaries/doc]]" in transformer_text - - @pytest.mark.asyncio - async def test_fallback_list_format(self, tmp_path): - """If LLM returns old flat array, treat all as create.""" - wiki = tmp_path / "wiki" - (wiki / "sources").mkdir(parents=True) - (wiki / "summaries").mkdir(parents=True) - (wiki / "concepts").mkdir(parents=True) - (wiki / "index.md").write_text( - "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", - encoding="utf-8", - ) - source_path = wiki / "sources" / "doc.md" - source_path.write_text("Content", encoding="utf-8") - (tmp_path / ".openkb").mkdir() - (tmp_path / "raw").mkdir() - (tmp_path / "raw" / "doc.pdf").write_bytes(b"fake") - - summary_resp = "Summary." - # Old format: flat array - plan_resp = json.dumps([{"name": "foo", "title": "Foo"}]) - page_resp = "# Foo\n\nContent." 
- - with patch("openkb.agent.compiler.litellm") as mock_litellm: - mock_litellm.completion = MagicMock( - side_effect=_mock_completion([summary_resp, plan_resp]) - ) - mock_litellm.acompletion = AsyncMock( - side_effect=_mock_acompletion([page_resp]) - ) - await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini") - - assert (wiki / "concepts" / "foo.md").exists() -``` - -- [ ] **Step 2: Run the new tests to verify they fail** - -Run: `pytest tests/test_compiler.py::TestCompileConceptsPlan -v` -Expected: FAIL — the current code uses old prompt format and doesn't handle dict responses - -- [ ] **Step 3: Implement `_compile_concepts` and refactor public functions** - -Replace the entire Public API section (from `DEFAULT_COMPILE_CONCURRENCY` to end of file) in `openkb/agent/compiler.py` with: - -```python -DEFAULT_COMPILE_CONCURRENCY = 5 - - -async def _compile_concepts( - wiki_dir: Path, - kb_dir: Path, - model: str, - system_msg: dict, - doc_msg: dict, - summary: str, - doc_name: str, - max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, -) -> None: - """Shared concept compilation logic: plan → create/update/related → index. - - This is the core of the compilation pipeline, shared by both - compile_short_doc and compile_long_doc. 
- """ - source_file = _find_source_filename(doc_name, kb_dir) - concept_briefs = _read_concept_briefs(wiki_dir) - - # --- Concepts plan (A cached) --- - plan_raw = _llm_call(model, [ - system_msg, - doc_msg, - {"role": "assistant", "content": summary}, - {"role": "user", "content": _CONCEPTS_PLAN_USER.format( - concept_briefs=concept_briefs, - )}, - ], "concepts-plan", max_tokens=1024) - - try: - parsed = _parse_json(plan_raw) - except (json.JSONDecodeError, ValueError) as exc: - logger.warning("Failed to parse concepts plan: %s", exc) - logger.debug("Raw: %s", plan_raw) - _update_index(wiki_dir, doc_name, []) - return - - # Fallback: if LLM returns flat array, treat all as create - if isinstance(parsed, list): - create_list, update_list, related_list = parsed, [], [] - else: - create_list = parsed.get("create", []) - update_list = parsed.get("update", []) - related_list = parsed.get("related", []) - - if not create_list and not update_list and not related_list: - _update_index(wiki_dir, doc_name, []) - return - - # --- Concurrent concept generation (A cached) --- - semaphore = asyncio.Semaphore(max_concurrency) - - async def _gen_create(concept: dict) -> tuple[str, str, bool]: - name = concept["name"] - title = concept.get("title", name) - async with semaphore: - page_content = await _llm_call_async(model, [ - system_msg, - doc_msg, - {"role": "assistant", "content": summary}, - {"role": "user", "content": _CONCEPT_PAGE_USER.format( - title=title, doc_name=doc_name, - update_instruction="", - )}, - ], f"create:{name}") - return name, page_content, False - - async def _gen_update(concept: dict) -> tuple[str, str, bool]: - name = concept["name"] - title = concept.get("title", name) - # Read existing page content for the LLM to integrate - concept_path = wiki_dir / "concepts" / f"{name}.md" - if concept_path.exists(): - raw_text = concept_path.read_text(encoding="utf-8") - # Strip frontmatter for the LLM - if raw_text.startswith("---"): - parts = 
raw_text.split("---", 2) - existing_content = parts[2].strip() if len(parts) >= 3 else raw_text - else: - existing_content = raw_text - else: - existing_content = "(page not found — create from scratch)" - async with semaphore: - page_content = await _llm_call_async(model, [ - system_msg, - doc_msg, - {"role": "assistant", "content": summary}, - {"role": "user", "content": _CONCEPT_UPDATE_USER.format( - title=title, doc_name=doc_name, - existing_content=existing_content, - )}, - ], f"update:{name}") - return name, page_content, True - - tasks = [] - tasks.extend(_gen_create(c) for c in create_list) - tasks.extend(_gen_update(c) for c in update_list) - - if tasks: - total = len(tasks) - sys.stdout.write(f" Generating {total} concept(s) (concurrency={max_concurrency})...\n") - sys.stdout.flush() - - results = await asyncio.gather(*tasks, return_exceptions=True) - else: - results = [] - - concept_names = [] - for r in results: - if isinstance(r, Exception): - logger.warning("Concept generation failed: %s", r) - continue - name, page_content, is_update = r - _write_concept(wiki_dir, name, page_content, source_file, is_update) - concept_names.append(name) - - # --- Related: code-only cross-ref links --- - for slug in related_list: - _add_related_link(wiki_dir, slug, doc_name, source_file) - - # --- Update index --- - _update_index(wiki_dir, doc_name, concept_names) - - -async def compile_short_doc( - doc_name: str, - source_path: Path, - kb_dir: Path, - model: str, - max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, -) -> None: - """Compile a short document into wiki pages. - - Step 1: Generate summary from full document text. - Step 2: Plan + generate/update concept pages (via _compile_concepts). 
- """ - from openkb.config import load_config - - openkb_dir = kb_dir / ".openkb" - config = load_config(openkb_dir / "config.yaml") - language: str = config.get("language", "en") - - wiki_dir = kb_dir / "wiki" - schema_md = get_agents_md(wiki_dir) - source_file = _find_source_filename(doc_name, kb_dir) - content = source_path.read_text(encoding="utf-8") - - system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format( - schema_md=schema_md, language=language, - )} - doc_msg = {"role": "user", "content": _SUMMARY_USER.format( - doc_name=doc_name, content=content, - )} - - # Step 1: Generate summary - summary = _llm_call(model, [system_msg, doc_msg], "summary") - _write_summary(wiki_dir, doc_name, source_file, summary) - - # Step 2: Compile concepts - await _compile_concepts( - wiki_dir, kb_dir, model, system_msg, doc_msg, summary, - doc_name, max_concurrency, - ) - - -async def compile_long_doc( - doc_name: str, - summary_path: Path, - doc_id: str, - kb_dir: Path, - model: str, - max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, -) -> None: - """Compile a long (PageIndex) document into wiki concept pages. - - The summary page is already written by the indexer. This function - generates an overview, then plans + generates/updates concept pages. 
- """ - from openkb.config import load_config - - openkb_dir = kb_dir / ".openkb" - config = load_config(openkb_dir / "config.yaml") - language: str = config.get("language", "en") - - wiki_dir = kb_dir / "wiki" - schema_md = get_agents_md(wiki_dir) - summary_text = summary_path.read_text(encoding="utf-8") - - system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format( - schema_md=schema_md, language=language, - )} - doc_msg = {"role": "user", "content": _LONG_DOC_SUMMARY_USER.format( - doc_name=doc_name, doc_id=doc_id, content=summary_text, - )} - - # Step 1: Generate overview - overview = _llm_call(model, [system_msg, doc_msg], "overview") - - # Step 2: Compile concepts - await _compile_concepts( - wiki_dir, kb_dir, model, system_msg, doc_msg, overview, - doc_name, max_concurrency, - ) -``` - -- [ ] **Step 4: Update existing integration tests** - -Update `TestCompileShortDoc.test_full_pipeline` — the concepts-list response now needs to be the new dict format: - -```python -class TestCompileShortDoc: - @pytest.mark.asyncio - async def test_full_pipeline(self, tmp_path): - wiki = tmp_path / "wiki" - (wiki / "sources").mkdir(parents=True) - (wiki / "summaries").mkdir(parents=True) - (wiki / "concepts").mkdir(parents=True) - (wiki / "index.md").write_text( - "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", - encoding="utf-8", - ) - source_path = wiki / "sources" / "test-doc.md" - source_path.write_text("# Test Doc\n\nSome content about transformers.", encoding="utf-8") - (tmp_path / ".openkb").mkdir() - (tmp_path / "raw").mkdir() - (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake") - - summary_response = "# Summary\n\nThis document discusses transformers." - plan_response = json.dumps({ - "create": [{"name": "transformer", "title": "Transformer"}], - "update": [], - "related": [], - }) - concept_page_response = "# Transformer\n\nA neural network architecture." 
- - with patch("openkb.agent.compiler.litellm") as mock_litellm: - mock_litellm.completion = MagicMock( - side_effect=_mock_completion([summary_response, plan_response]) - ) - mock_litellm.acompletion = AsyncMock( - side_effect=_mock_acompletion([concept_page_response]) - ) - await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini") - - summary_path = wiki / "summaries" / "test-doc.md" - assert summary_path.exists() - assert "sources: [test-doc.pdf]" in summary_path.read_text() - - concept_path = wiki / "concepts" / "transformer.md" - assert concept_path.exists() - assert "sources: [test-doc.pdf]" in concept_path.read_text() - - index_text = (wiki / "index.md").read_text() - assert "[[summaries/test-doc]]" in index_text - assert "[[concepts/transformer]]" in index_text -``` - -Update `TestCompileShortDoc.test_handles_bad_json` — no changes needed (bad JSON still triggers fallback). - -Update `TestCompileLongDoc.test_full_pipeline`: - -```python -class TestCompileLongDoc: - @pytest.mark.asyncio - async def test_full_pipeline(self, tmp_path): - wiki = tmp_path / "wiki" - (wiki / "summaries").mkdir(parents=True) - (wiki / "concepts").mkdir(parents=True) - (wiki / "index.md").write_text( - "# Index\n\n## Documents\n\n## Concepts\n", - encoding="utf-8", - ) - summary_path = wiki / "summaries" / "big-doc.md" - summary_path.write_text("# Big Doc\n\nPageIndex summary tree.", encoding="utf-8") - openkb_dir = tmp_path / ".openkb" - openkb_dir.mkdir() - (openkb_dir / "config.yaml").write_text("model: gpt-4o-mini\n") - (tmp_path / "raw").mkdir() - (tmp_path / "raw" / "big-doc.pdf").write_bytes(b"fake") - - overview_response = "Overview of the big document." - plan_response = json.dumps({ - "create": [{"name": "deep-learning", "title": "Deep Learning"}], - "update": [], - "related": [], - }) - concept_page_response = "# Deep Learning\n\nA subfield of ML." 
- - with patch("openkb.agent.compiler.litellm") as mock_litellm: - mock_litellm.completion = MagicMock( - side_effect=_mock_completion([overview_response, plan_response]) - ) - mock_litellm.acompletion = AsyncMock( - side_effect=_mock_acompletion([concept_page_response]) - ) - await compile_long_doc( - "big-doc", summary_path, "doc-123", tmp_path, "gpt-4o-mini" - ) - - concept_path = wiki / "concepts" / "deep-learning.md" - assert concept_path.exists() - assert "Deep Learning" in concept_path.read_text() - - index_text = (wiki / "index.md").read_text() - assert "[[summaries/big-doc]]" in index_text - assert "[[concepts/deep-learning]]" in index_text -``` - -- [ ] **Step 5: Run all tests** - -Run: `pytest tests/test_compiler.py -v` -Expected: All PASS - -- [ ] **Step 6: Run the full test suite** - -Run: `pytest tests/ -v` -Expected: All 149+ tests PASS - -- [ ] **Step 7: Commit** - -```bash -git add openkb/agent/compiler.py tests/test_compiler.py -git commit -m "feat: concept dedup with briefs, update/related paths, extract _compile_concepts" -``` - ---- - -### Task 5: Clean up old references and update module docstring - -**Files:** -- Modify: `openkb/agent/compiler.py:1-9` (module docstring) - -- [ ] **Step 1: Update module docstring** - -Replace the docstring at the top of `openkb/agent/compiler.py`: - -```python -"""Wiki compilation pipeline for OpenKB. - -Pipeline leveraging LLM prompt caching: - Step 1: Build base context A (schema + document content). - Step 2: A → generate summary. - Step 3: A + summary → concepts plan (create/update/related). - Step 4: Concurrent LLM calls (A cached) → generate new + rewrite updated concepts. - Step 5: Code adds cross-ref links to related concepts, updates index. 
-""" -``` - -- [ ] **Step 2: Verify `_CONCEPTS_LIST_USER` is fully removed** - -Search for any remaining references to `_CONCEPTS_LIST_USER` in the codebase: - -Run: `grep -r "_CONCEPTS_LIST_USER" openkb/ tests/` -Expected: No matches - -- [ ] **Step 3: Run full test suite one final time** - -Run: `pytest tests/ -q` -Expected: All tests pass - -- [ ] **Step 4: Commit** - -```bash -git add openkb/agent/compiler.py -git commit -m "chore: update compiler docstring for new pipeline" -``` diff --git a/docs/superpowers/plans/2026-04-09-retrieve-redesign.md b/docs/superpowers/plans/2026-04-09-retrieve-redesign.md deleted file mode 100644 index 3c659bc..0000000 --- a/docs/superpowers/plans/2026-04-09-retrieve-redesign.md +++ /dev/null @@ -1,1104 +0,0 @@ -# Retrieve Redesign Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Unify query across long/short docs, add brief summaries to index.md and frontmatter, store long doc sources as JSON with per-page access. - -**Architecture:** (1) LLM prompts return `{"brief", "content"}` JSON — briefs flow into frontmatter and index.md. (2) Indexer stores long doc pages as JSON array. (3) New `get_page_content` tool replaces `pageindex_retrieve`. (4) Query agent uses same tools for all docs. 
- -**Tech Stack:** Python, litellm, asyncio, pytest - ---- - -### Task 1: Add `get_page_content` tool and `parse_pages` helper - -**Files:** -- Modify: `openkb/agent/tools.py` -- Modify: `tests/test_agent_tools.py` - -- [ ] **Step 1: Write failing tests** - -Add to `tests/test_agent_tools.py`: - -```python -from openkb.agent.tools import get_page_content, parse_pages - -class TestParsePages: - def test_single_page(self): - assert parse_pages("3") == [3] - - def test_range(self): - assert parse_pages("3-5") == [3, 4, 5] - - def test_comma_separated(self): - assert parse_pages("1,3,5") == [1, 3, 5] - - def test_mixed(self): - assert parse_pages("1-3,7,10-12") == [1, 2, 3, 7, 10, 11, 12] - - def test_deduplication(self): - assert parse_pages("3,3,3") == [3] - - def test_sorted(self): - assert parse_pages("5,1,3") == [1, 3, 5] - - def test_ignores_zero_and_negative(self): - assert parse_pages("0,-1,3") == [3] - - -class TestGetPageContent: - def test_reads_pages_from_json(self, tmp_path): - import json - wiki_root = str(tmp_path) - sources = tmp_path / "sources" - sources.mkdir() - pages = [ - {"page": 1, "content": "Page one text."}, - {"page": 2, "content": "Page two text."}, - {"page": 3, "content": "Page three text."}, - ] - (sources / "paper.json").write_text(json.dumps(pages), encoding="utf-8") - - result = get_page_content("paper", "1,3", wiki_root) - assert "[Page 1]" in result - assert "Page one text." in result - assert "[Page 3]" in result - assert "Page three text." 
in result - assert "Page two" not in result - - def test_returns_error_for_missing_file(self, tmp_path): - wiki_root = str(tmp_path) - (tmp_path / "sources").mkdir() - result = get_page_content("nonexistent", "1", wiki_root) - assert "not found" in result.lower() - - def test_returns_error_for_no_matching_pages(self, tmp_path): - import json - wiki_root = str(tmp_path) - sources = tmp_path / "sources" - sources.mkdir() - pages = [{"page": 1, "content": "Only page."}] - (sources / "paper.json").write_text(json.dumps(pages), encoding="utf-8") - - result = get_page_content("paper", "99", wiki_root) - assert "no content" in result.lower() or result.strip() == "" - - def test_includes_images_info(self, tmp_path): - import json - wiki_root = str(tmp_path) - sources = tmp_path / "sources" - sources.mkdir() - pages = [ - {"page": 1, "content": "Text.", "images": [{"path": "images/p/img.png", "width": 100, "height": 80}]}, - ] - (sources / "doc.json").write_text(json.dumps(pages), encoding="utf-8") - - result = get_page_content("doc", "1", wiki_root) - assert "img.png" in result - - def test_path_escape_denied(self, tmp_path): - wiki_root = str(tmp_path) - (tmp_path / "sources").mkdir() - result = get_page_content("../../etc/passwd", "1", wiki_root) - assert "denied" in result.lower() or "not found" in result.lower() -``` - -- [ ] **Step 2: Run tests to verify they fail** - -Run: `pytest tests/test_agent_tools.py::TestParsePages tests/test_agent_tools.py::TestGetPageContent -v` -Expected: FAIL with `ImportError` - -- [ ] **Step 3: Implement `parse_pages` and `get_page_content`** - -Add to `openkb/agent/tools.py`: - -```python -import json as _json - - -def parse_pages(pages: str) -> list[int]: - """Parse a page specification like '3-5,7,10-12' into a sorted list of ints.""" - result: set[int] = set() - for part in pages.split(","): - part = part.strip() - if "-" in part: - start_str, end_str = part.split("-", 1) - try: - start, end = int(start_str), int(end_str) - 
result.update(range(start, end + 1)) - except ValueError: - continue - else: - try: - result.add(int(part)) - except ValueError: - continue - return sorted(n for n in result if n >= 1) - - -def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str: - """Get text content of specific pages from a long document. - - Reads from ``wiki/sources/{doc_name}.json`` which contains a JSON array - of ``{"page": int, "content": str, "images": [...]}`` objects. - - Args: - doc_name: Document name (stem, e.g. ``'attention-is-all-you-need'``). - pages: Page specification (e.g. ``'3-5,7,10-12'``). - wiki_root: Absolute path to the wiki root directory. - - Returns: - Formatted text of requested pages, or error message if not found. - """ - root = Path(wiki_root).resolve() - json_path = (root / "sources" / f"{doc_name}.json").resolve() - if not json_path.is_relative_to(root): - return "Access denied: path escapes wiki root." - if not json_path.exists(): - return f"Document not found: {doc_name}. No sources/{doc_name}.json file." 
- - data = _json.loads(json_path.read_text(encoding="utf-8")) - page_nums = set(parse_pages(pages)) - matched = [p for p in data if p["page"] in page_nums] - - if not matched: - return f"No content found for pages: {pages}" - - parts: list[str] = [] - for p in matched: - header = f"[Page {p['page']}]" - text = p.get("content", "") - if "images" in p: - img_refs = ", ".join(img["path"] for img in p["images"]) - text += f"\n[Images: {img_refs}]" - parts.append(f"{header}\n{text}") - - return "\n\n".join(parts) -``` - -- [ ] **Step 4: Run tests to verify they pass** - -Run: `pytest tests/test_agent_tools.py -v` -Expected: All PASS - -- [ ] **Step 5: Commit** - -```bash -git add openkb/agent/tools.py tests/test_agent_tools.py -git commit -m "feat: add get_page_content tool and parse_pages helper" -``` - ---- - -### Task 2: Change LLM prompts to return `{"brief", "content"}` JSON - -**Files:** -- Modify: `openkb/agent/compiler.py` (prompt templates, lines 40-105) -- Modify: `tests/test_compiler.py` (TestParseConceptsPlan) - -- [ ] **Step 1: Write test for brief+content JSON parsing** - -Add to `tests/test_compiler.py`: - -```python -class TestParseBriefContent: - def test_dict_with_brief_and_content(self): - text = json.dumps({"brief": "A short desc", "content": "# Full page\n\nDetails."}) - parsed = _parse_json(text) - assert parsed["brief"] == "A short desc" - assert "# Full page" in parsed["content"] - - def test_plain_text_fallback(self): - """If LLM returns plain text, _parse_json raises — caller handles fallback.""" - with pytest.raises((json.JSONDecodeError, ValueError)): - _parse_json("Just plain markdown text without JSON") -``` - -- [ ] **Step 2: Run test to verify it passes (existing _parse_json handles dicts)** - -Run: `pytest tests/test_compiler.py::TestParseBriefContent -v` -Expected: PASS — `_parse_json` already handles dicts - -- [ ] **Step 3: Update `_SUMMARY_USER` prompt** - -Replace in `openkb/agent/compiler.py`: - -```python -_SUMMARY_USER = """\ 
-New document: {doc_name} - -Full text: -{content} - -Write a summary page for this document in Markdown. - -Return a JSON object with two keys: -- "brief": A single sentence (under 100 chars) describing the document's main contribution -- "content": The full summary in Markdown. Include key concepts, findings, ideas, \ -and [[wikilinks]] to concepts that could become cross-document concept pages - -Return ONLY valid JSON, no fences. -""" -``` - -- [ ] **Step 4: Update `_CONCEPT_PAGE_USER` prompt** - -Replace in `openkb/agent/compiler.py`: - -```python -_CONCEPT_PAGE_USER = """\ -Write the concept page for: {title} - -This concept relates to the document "{doc_name}" summarized above. -{update_instruction} - -Return a JSON object with two keys: -- "brief": A single sentence (under 100 chars) defining this concept -- "content": The full concept page in Markdown. Include clear explanation, \ -key details from the source document, and [[wikilinks]] to related concepts \ -and [[summaries/{doc_name}]] - -Return ONLY valid JSON, no fences. -""" -``` - -- [ ] **Step 5: Update `_CONCEPT_UPDATE_USER` prompt** - -Replace in `openkb/agent/compiler.py`: - -```python -_CONCEPT_UPDATE_USER = """\ -Update the concept page for: {title} - -Current content of this page: -{existing_content} - -New information from document "{doc_name}" (summarized above) should be \ -integrated into this page. Rewrite the full page incorporating the new \ -information naturally — do not just append. Maintain existing \ -[[wikilinks]] and add new ones where appropriate. - -Return a JSON object with two keys: -- "brief": A single sentence (under 100 chars) defining this concept (may differ from before) -- "content": The rewritten full concept page in Markdown - -Return ONLY valid JSON, no fences. 
-""" -``` - -- [ ] **Step 6: Run all tests (prompts aren't tested directly)** - -Run: `pytest tests/test_compiler.py -v` -Expected: All PASS - -- [ ] **Step 7: Commit** - -```bash -git add openkb/agent/compiler.py tests/test_compiler.py -git commit -m "feat: update LLM prompts to return brief+content JSON" -``` - ---- - -### Task 3: Update `_write_summary` and `_write_concept` to store `brief` in frontmatter - -**Files:** -- Modify: `openkb/agent/compiler.py` (lines 274-320, `_write_summary` and `_write_concept`) -- Modify: `tests/test_compiler.py` - -- [ ] **Step 1: Write failing tests** - -Update existing and add new tests in `tests/test_compiler.py`: - -```python -class TestWriteSummary: - def test_writes_with_frontmatter(self, tmp_path): - wiki = tmp_path / "wiki" - wiki.mkdir() - _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.", brief="Introduces transformers") - path = wiki / "summaries" / "my-doc.md" - assert path.exists() - text = path.read_text() - assert "sources: [my-doc.pdf]" in text - assert "brief: Introduces transformers" in text - assert "# Summary" in text - - def test_writes_without_brief(self, tmp_path): - wiki = tmp_path / "wiki" - wiki.mkdir() - _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.") - path = wiki / "summaries" / "my-doc.md" - text = path.read_text() - assert "sources: [my-doc.pdf]" in text - assert "brief:" not in text -``` - -Update `TestWriteConcept`: - -```python -class TestWriteConcept: - def test_new_concept_with_brief(self, tmp_path): - wiki = tmp_path / "wiki" - wiki.mkdir() - _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False, brief="Mechanism for selective focus") - path = wiki / "concepts" / "attention.md" - assert path.exists() - text = path.read_text() - assert "sources: [paper.pdf]" in text - assert "brief: Mechanism for selective focus" in text - assert "# Attention" in text - - def test_new_concept(self, tmp_path): - wiki = tmp_path / 
"wiki" - wiki.mkdir() - _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False) - path = wiki / "concepts" / "attention.md" - assert path.exists() - text = path.read_text() - assert "sources: [paper.pdf]" in text - assert "# Attention" in text - - def test_update_concept_appends_source(self, tmp_path): - wiki = tmp_path / "wiki" - concepts = wiki / "concepts" - concepts.mkdir(parents=True) - (concepts / "attention.md").write_text( - "---\nsources: [paper1.pdf]\nbrief: Old brief\n---\n\n# Attention\n\nOld content.", - encoding="utf-8", - ) - _write_concept(wiki, "attention", "New info from paper2.", "paper2.pdf", True, brief="Updated brief") - text = (concepts / "attention.md").read_text() - assert "paper2.pdf" in text - assert "paper1.pdf" in text - assert "brief: Updated brief" in text - assert "New info from paper2." in text -``` - -- [ ] **Step 2: Run tests to verify they fail** - -Run: `pytest tests/test_compiler.py::TestWriteSummary tests/test_compiler.py::TestWriteConcept -v` -Expected: FAIL — `_write_summary` and `_write_concept` don't accept `brief` parameter - -- [ ] **Step 3: Update `_write_summary` to accept `brief`** - -```python -def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str, brief: str = "") -> None: - """Write summary page with frontmatter.""" - summaries_dir = wiki_dir / "summaries" - summaries_dir.mkdir(parents=True, exist_ok=True) - fm_lines = [f"sources: [{source_file}]"] - if brief: - fm_lines.append(f"brief: {brief}") - frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n" - (summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8") -``` - -- [ ] **Step 4: Update `_write_concept` to accept `brief`** - -Add `brief: str = ""` parameter to `_write_concept`. 
In the new-concept branch: - -```python - else: - fm_lines = [f"sources: [{source_file}]"] - if brief: - fm_lines.append(f"brief: {brief}") - frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n" - path.write_text(frontmatter + content, encoding="utf-8") -``` - -In the update branch, after updating sources in frontmatter, also update brief: - -```python - if is_update and path.exists(): - existing = path.read_text(encoding="utf-8") - if source_file not in existing: - # ... existing frontmatter update logic ... - # Update brief in frontmatter if provided - if brief and existing.startswith("---"): - end = existing.find("---", 3) - if end != -1: - fm = existing[:end + 3] - body = existing[end + 3:] - if "brief:" in fm: - import re - fm = re.sub(r"brief:.*", f"brief: {brief}", fm) - else: - fm = fm.replace("---\n", f"---\nbrief: {brief}\n", 1) - existing = fm + body - path.write_text(existing, encoding="utf-8") -``` - -- [ ] **Step 5: Run tests to verify they pass** - -Run: `pytest tests/test_compiler.py::TestWriteSummary tests/test_compiler.py::TestWriteConcept -v` -Expected: All PASS - -- [ ] **Step 6: Commit** - -```bash -git add openkb/agent/compiler.py tests/test_compiler.py -git commit -m "feat: store brief in frontmatter of summary and concept pages" -``` - ---- - -### Task 4: Update `_update_index` to include briefs, and update `_read_concept_briefs` to read from frontmatter - -**Files:** -- Modify: `openkb/agent/compiler.py` (lines 233-261 and 408-430) -- Modify: `tests/test_compiler.py` - -- [ ] **Step 1: Write failing tests for `_update_index` with briefs** - -```python -class TestUpdateIndex: - def test_appends_entries_with_briefs(self, tmp_path): - wiki = tmp_path / "wiki" - wiki.mkdir() - (wiki / "index.md").write_text( - "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", - encoding="utf-8", - ) - _update_index(wiki, "my-doc", ["attention", "transformer"], - doc_brief="Introduces transformers", - concept_briefs={"attention": "Focus 
mechanism", "transformer": "NN architecture"}) - text = (wiki / "index.md").read_text() - assert "[[summaries/my-doc]] — Introduces transformers" in text - assert "[[concepts/attention]] — Focus mechanism" in text - assert "[[concepts/transformer]] — NN architecture" in text - - def test_no_duplicates(self, tmp_path): - wiki = tmp_path / "wiki" - wiki.mkdir() - (wiki / "index.md").write_text( - "# Index\n\n## Documents\n- [[summaries/my-doc]] — Old brief\n\n## Concepts\n", - encoding="utf-8", - ) - _update_index(wiki, "my-doc", [], doc_brief="New brief") - text = (wiki / "index.md").read_text() - assert text.count("[[summaries/my-doc]]") == 1 - - def test_backwards_compat_no_briefs(self, tmp_path): - wiki = tmp_path / "wiki" - wiki.mkdir() - (wiki / "index.md").write_text( - "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", - encoding="utf-8", - ) - _update_index(wiki, "my-doc", ["attention"]) - text = (wiki / "index.md").read_text() - assert "[[summaries/my-doc]]" in text - assert "[[concepts/attention]]" in text -``` - -Write test for updated `_read_concept_briefs`: - -```python -class TestReadConceptBriefs: - # ... keep existing tests ... - - def test_reads_brief_from_frontmatter(self, tmp_path): - wiki = tmp_path / "wiki" - concepts = wiki / "concepts" - concepts.mkdir(parents=True) - (concepts / "attention.md").write_text( - "---\nsources: [paper.pdf]\nbrief: Selective focus mechanism\n---\n\n# Attention\n\nLong content...", - encoding="utf-8", - ) - result = _read_concept_briefs(wiki) - assert "- attention: Selective focus mechanism" in result - - def test_falls_back_to_body_truncation(self, tmp_path): - wiki = tmp_path / "wiki" - concepts = wiki / "concepts" - concepts.mkdir(parents=True) - (concepts / "old.md").write_text( - "---\nsources: [paper.pdf]\n---\n\nOld concept without brief field.", - encoding="utf-8", - ) - result = _read_concept_briefs(wiki) - assert "- old: Old concept without brief field." 
in result -``` - -- [ ] **Step 2: Run tests to verify they fail** - -Run: `pytest tests/test_compiler.py::TestUpdateIndex tests/test_compiler.py::TestReadConceptBriefs -v` -Expected: FAIL — `_update_index` doesn't accept `doc_brief`/`concept_briefs` parameters - -- [ ] **Step 3: Update `_update_index`** - -```python -def _update_index( - wiki_dir: Path, doc_name: str, concept_names: list[str], - doc_brief: str = "", concept_briefs: dict[str, str] | None = None, -) -> None: - """Append document and concept entries to index.md with optional briefs.""" - index_path = wiki_dir / "index.md" - if not index_path.exists(): - index_path.write_text( - "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", - encoding="utf-8", - ) - - text = index_path.read_text(encoding="utf-8") - - doc_link = f"[[summaries/{doc_name}]]" - if doc_link not in text: - doc_entry = f"- {doc_link}" - if doc_brief: - doc_entry += f" — {doc_brief}" - if "## Documents" in text: - text = text.replace("## Documents\n", f"## Documents\n{doc_entry}\n", 1) - - if concept_briefs is None: - concept_briefs = {} - for name in concept_names: - concept_link = f"[[concepts/{name}]]" - if concept_link not in text: - concept_entry = f"- {concept_link}" - if name in concept_briefs: - concept_entry += f" — {concept_briefs[name]}" - if "## Concepts" in text: - text = text.replace("## Concepts\n", f"## Concepts\n{concept_entry}\n", 1) - - index_path.write_text(text, encoding="utf-8") -``` - -- [ ] **Step 4: Update `_read_concept_briefs` to read from frontmatter `brief:` field** - -```python -def _read_concept_briefs(wiki_dir: Path) -> str: - """Read existing concept pages and return compact one-line summaries. - - Reads ``brief:`` from YAML frontmatter if available, otherwise falls back - to the first 150 characters of the body text. 
- """ - concepts_dir = wiki_dir / "concepts" - if not concepts_dir.exists(): - return "(none yet)" - - md_files = sorted(concepts_dir.glob("*.md")) - if not md_files: - return "(none yet)" - - lines: list[str] = [] - for path in md_files: - text = path.read_text(encoding="utf-8") - brief = "" - body = text - if text.startswith("---"): - end = text.find("---", 3) - if end != -1: - fm = text[:end + 3] - body = text[end + 3:] - # Try to extract brief from frontmatter - for line in fm.split("\n"): - if line.startswith("brief:"): - brief = line[len("brief:"):].strip() - break - if not brief: - brief = body.strip().replace("\n", " ")[:150] - if brief: - lines.append(f"- {path.stem}: {brief}") - - return "\n".join(lines) or "(none yet)" -``` - -- [ ] **Step 5: Run tests** - -Run: `pytest tests/test_compiler.py -v` -Expected: All PASS - -- [ ] **Step 6: Commit** - -```bash -git add openkb/agent/compiler.py tests/test_compiler.py -git commit -m "feat: add briefs to index.md entries and read from frontmatter" -``` - ---- - -### Task 5: Wire briefs through `_compile_concepts` and public functions - -**Files:** -- Modify: `openkb/agent/compiler.py` (lines 438-611, `_compile_concepts`, `compile_short_doc`, `compile_long_doc`) -- Modify: `tests/test_compiler.py` - -This task connects the brief+content JSON parsing to the write functions and index update. 
- -- [ ] **Step 1: Write integration test** - -```python -class TestBriefIntegration: - @pytest.mark.asyncio - async def test_short_doc_briefs_in_index_and_frontmatter(self, tmp_path): - wiki = tmp_path / "wiki" - (wiki / "sources").mkdir(parents=True) - (wiki / "summaries").mkdir(parents=True) - (wiki / "concepts").mkdir(parents=True) - (wiki / "index.md").write_text( - "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", - encoding="utf-8", - ) - source_path = wiki / "sources" / "test-doc.md" - source_path.write_text("# Test Doc\n\nContent.", encoding="utf-8") - (tmp_path / ".openkb").mkdir() - (tmp_path / "raw").mkdir() - (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake") - - summary_resp = json.dumps({ - "brief": "A paper about transformers", - "content": "# Summary\n\nThis paper discusses transformers.", - }) - plan_resp = json.dumps({ - "create": [{"name": "transformer", "title": "Transformer"}], - "update": [], - "related": [], - }) - concept_resp = json.dumps({ - "brief": "NN architecture using self-attention", - "content": "# Transformer\n\nA neural network architecture.", - }) - - with patch("openkb.agent.compiler.litellm") as mock_litellm: - mock_litellm.completion = MagicMock( - side_effect=_mock_completion([summary_resp, plan_resp]) - ) - mock_litellm.acompletion = AsyncMock( - side_effect=_mock_acompletion([concept_resp]) - ) - await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini") - - # Check summary frontmatter has brief - summary_text = (wiki / "summaries" / "test-doc.md").read_text() - assert "brief: A paper about transformers" in summary_text - - # Check concept frontmatter has brief - concept_text = (wiki / "concepts" / "transformer.md").read_text() - assert "brief: NN architecture using self-attention" in concept_text - - # Check index has briefs - index_text = (wiki / "index.md").read_text() - assert "[[summaries/test-doc]] — A paper about transformers" in index_text - assert "[[concepts/transformer]] — NN 
architecture using self-attention" in index_text -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `pytest tests/test_compiler.py::TestBriefIntegration -v` -Expected: FAIL - -- [ ] **Step 3: Update `compile_short_doc` to parse brief+content from summary response** - -In `compile_short_doc`, replace: - -```python - # --- Step 1: Generate summary --- - summary = _llm_call(model, [system_msg, doc_msg], "summary") - _write_summary(wiki_dir, doc_name, source_file, summary) -``` - -With: - -```python - # --- Step 1: Generate summary --- - summary_raw = _llm_call(model, [system_msg, doc_msg], "summary") - try: - summary_parsed = _parse_json(summary_raw) - doc_brief = summary_parsed.get("brief", "") - summary = summary_parsed.get("content", summary_raw) - except (json.JSONDecodeError, ValueError): - doc_brief = "" - summary = summary_raw - _write_summary(wiki_dir, doc_name, source_file, summary, brief=doc_brief) -``` - -- [ ] **Step 4: Update `_compile_concepts` signature and wiring** - -Add `doc_brief: str = ""` parameter to `_compile_concepts`. - -In `_gen_create`, parse the response: - -```python - async def _gen_create(concept: dict) -> tuple[str, str, bool, str]: - name = concept["name"] - title = concept.get("title", name) - async with semaphore: - raw = await _llm_call_async(model, [ - system_msg, doc_msg, - {"role": "assistant", "content": summary}, - {"role": "user", "content": _CONCEPT_PAGE_USER.format( - title=title, doc_name=doc_name, update_instruction="", - )}, - ], f"create:{name}") - try: - parsed = _parse_json(raw) - brief = parsed.get("brief", "") - content = parsed.get("content", raw) - except (json.JSONDecodeError, ValueError): - brief, content = "", raw - return name, content, False, brief -``` - -Same for `_gen_update` — returns `tuple[str, str, bool, str]` (name, content, is_update, brief). 
- -In the results processing loop: - -```python - concept_briefs_map: dict[str, str] = {} - for r in results: - if isinstance(r, Exception): - logger.warning("Concept generation failed: %s", r) - continue - name, page_content, is_update, brief = r - _write_concept(wiki_dir, name, page_content, source_file, is_update, brief=brief) - concept_names.append(name) - if brief: - concept_briefs_map[name] = brief -``` - -Pass briefs to `_update_index`: - -```python - _update_index(wiki_dir, doc_name, concept_names, - doc_brief=doc_brief, concept_briefs=concept_briefs_map) -``` - -- [ ] **Step 5: Update `compile_short_doc` to pass `doc_brief` to `_compile_concepts`** - -```python - await _compile_concepts( - wiki_dir, kb_dir, model, system_msg, doc_msg, - summary, doc_name, max_concurrency, doc_brief=doc_brief, - ) -``` - -- [ ] **Step 6: Update `compile_long_doc` to pass `doc_brief` from `IndexResult.description`** - -`compile_long_doc` currently takes `doc_id` but not `description`. Add `doc_description: str = ""` parameter: - -```python -async def compile_long_doc( - doc_name: str, - summary_path: Path, - doc_id: str, - kb_dir: Path, - model: str, - doc_description: str = "", - max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, -) -> None: -``` - -The `_LONG_DOC_SUMMARY_USER` stays unchanged (returns plain text, not JSON). 
Pass `doc_description` as `doc_brief`: - -```python - await _compile_concepts( - wiki_dir, kb_dir, model, system_msg, doc_msg, - overview, doc_name, max_concurrency, doc_brief=doc_description, - ) -``` - -Also update the CLI call in `cli.py` line 135: - -```python -asyncio.run( - compile_long_doc(doc_name, summary_path, index_result.doc_id, kb_dir, model, - doc_description=index_result.description) -) -``` - -- [ ] **Step 7: Update existing integration tests for new JSON response format** - -Update all mock LLM responses in `TestCompileShortDoc`, `TestCompileLongDoc`, and `TestCompileConceptsPlan` to return `{"brief": "...", "content": "..."}` JSON instead of plain text for summary and concept responses. - -- [ ] **Step 8: Run all tests** - -Run: `pytest tests/ -q` -Expected: All PASS - -- [ ] **Step 9: Commit** - -```bash -git add openkb/agent/compiler.py openkb/cli.py tests/test_compiler.py -git commit -m "feat: wire brief+content JSON through compile pipeline to index and frontmatter" -``` - ---- - -### Task 6: Indexer — long doc sources from markdown to JSON - -**Files:** -- Modify: `openkb/indexer.py` -- Modify: `openkb/tree_renderer.py` (remove `render_source_md`) -- Modify: `tests/test_indexer.py` - -- [ ] **Step 1: Write failing test** - -Update `tests/test_indexer.py`: - -```python - def test_source_page_written_as_json(self, kb_dir, sample_tree, tmp_path): - """Long doc source should be written as JSON, not markdown.""" - import json as json_mod - doc_id = "abc-123" - fake_col = self._make_fake_collection(doc_id, sample_tree) - - fake_client = MagicMock() - fake_client.collection.return_value = fake_col - # Mock get_page_content to return page data - fake_col.get_page_content.return_value = [ - {"page": 1, "content": "Page one text."}, - {"page": 2, "content": "Page two text."}, - ] - - pdf_path = tmp_path / "sample.pdf" - pdf_path.write_bytes(b"%PDF-1.4 fake") - - with patch("openkb.indexer.PageIndexClient", return_value=fake_client): - 
index_long_document(pdf_path, kb_dir) - - # Should be JSON, not MD - json_file = kb_dir / "wiki" / "sources" / "sample.json" - assert json_file.exists() - assert not (kb_dir / "wiki" / "sources" / "sample.md").exists() - data = json_mod.loads(json_file.read_text()) - assert len(data) == 2 - assert data[0]["page"] == 1 -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `pytest tests/test_indexer.py::TestIndexLongDocument::test_source_page_written_as_json -v` -Expected: FAIL - -- [ ] **Step 3: Update `indexer.py` to write JSON sources** - -Replace the source writing block (lines 103-110) with: - -```python - # Write wiki/sources/ as JSON (per-page content from PageIndex) - sources_dir = kb_dir / "wiki" / "sources" - sources_dir.mkdir(parents=True, exist_ok=True) - dest_images_dir = sources_dir / "images" / pdf_path.stem - - # Get per-page content from PageIndex - all_pages = col.get_page_content(doc_id, f"1-{doc.get('page_count', 9999)}") - - # Relocate image paths - dest_images_dir.mkdir(parents=True, exist_ok=True) - for page in all_pages: - if "images" in page: - for img in page["images"]: - src_path = Path(img["path"]) - if src_path.exists(): - filename = src_path.name - dest = dest_images_dir / filename - if not dest.exists(): - shutil.copy2(src_path, dest) - img["path"] = f"images/{pdf_path.stem}/{filename}" - - import json as json_mod - (sources_dir / f"{pdf_path.stem}.json").write_text( - json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8", - ) -``` - -Remove the `render_source_md` import and `_relocate_images` call. - -- [ ] **Step 4: Remove `render_source_md` from tree_renderer.py** - -Remove the `render_source_md` function and `_render_nodes_source` helper from `openkb/tree_renderer.py`. Keep `render_summary_md` and `_render_nodes_summary`. - -- [ ] **Step 5: Update existing test `test_source_page_written`** - -The old test checks for `.md` — update it to check for `.json` or remove it (replaced by the new test). 
- -- [ ] **Step 6: Run all tests** - -Run: `pytest tests/ -q` -Expected: All PASS - -- [ ] **Step 7: Commit** - -```bash -git add openkb/indexer.py openkb/tree_renderer.py tests/test_indexer.py -git commit -m "feat: store long doc sources as per-page JSON, remove render_source_md" -``` - ---- - -### Task 7: Query agent — remove `pageindex_retrieve`, add `get_page_content`, update instructions - -**Files:** -- Modify: `openkb/agent/query.py` -- Modify: `openkb/schema.py` -- Modify: `tests/test_query.py` - -- [ ] **Step 1: Write failing tests** - -Update `tests/test_query.py`: - -```python -class TestBuildQueryAgent: - def test_agent_name(self, tmp_path): - agent = build_query_agent(str(tmp_path), "gpt-4o-mini") - assert agent.name == "wiki-query" - - def test_agent_has_three_tools(self, tmp_path): - agent = build_query_agent(str(tmp_path), "gpt-4o-mini") - assert len(agent.tools) == 3 - - def test_agent_tool_names(self, tmp_path): - agent = build_query_agent(str(tmp_path), "gpt-4o-mini") - names = {t.name for t in agent.tools} - assert "list_files" in names - assert "read_file" in names - assert "get_page_content" in names - assert "pageindex_retrieve" not in names - - def test_instructions_mention_get_page_content(self, tmp_path): - agent = build_query_agent(str(tmp_path), "gpt-4o-mini") - assert "get_page_content" in agent.instructions -``` - -- [ ] **Step 2: Run tests to verify they fail** - -Run: `pytest tests/test_query.py::TestBuildQueryAgent -v` -Expected: FAIL — old signature requires `openkb_dir` - -- [ ] **Step 3: Rewrite `query.py`** - -Remove `_pageindex_retrieve_impl` entirely (~110 lines). Remove `PageIndexClient` import. 
Update `build_query_agent`: - -```python -def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent: - """Build and return the Q&A agent.""" - schema_md = get_agents_md(Path(wiki_root)) - instructions = _QUERY_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md) - instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language." - - @function_tool - def list_files(directory: str) -> str: - """List all Markdown files in a wiki subdirectory.""" - return list_wiki_files(directory, wiki_root) - - @function_tool - def read_file(path: str) -> str: - """Read a Markdown file from the wiki.""" - return read_wiki_file(path, wiki_root) - - @function_tool - def get_page_content_tool(doc_name: str, pages: str) -> str: - """Get text content of specific pages from a long document. - - Args: - doc_name: Document name (e.g. 'attention-is-all-you-need'). - pages: Page specification (e.g. '3-5,7,10-12'). - """ - from openkb.agent.tools import get_page_content - return get_page_content(doc_name, pages, wiki_root) - - from agents.model_settings import ModelSettings - - return Agent( - name="wiki-query", - instructions=instructions, - tools=[list_files, read_file, get_page_content_tool], - model=f"litellm/{model}", - model_settings=ModelSettings(parallel_tool_calls=False), - ) -``` - -Update `_QUERY_INSTRUCTIONS_TEMPLATE`: - -```python -_QUERY_INSTRUCTIONS_TEMPLATE = """\ -You are a knowledge-base Q&A agent. You answer questions by searching the wiki. - -{schema_md} - -## Search strategy -1. Read index.md to understand what documents and concepts are available. - Each entry has a brief summary to help you judge relevance. -2. Read relevant summary pages (summaries/) for document overviews. -3. Read concept pages (concepts/) for cross-document synthesis. -4. For long documents, use get_page_content(doc_name, pages) to read - specific pages when you need detailed content. 
The summary page - shows chapter structure with page ranges to help you decide which - pages to read. -5. Synthesise a clear, well-cited answer. - -Always ground your answer in the wiki content. If you cannot find relevant -information, say so clearly. -""" -``` - -Update `run_query` to match new `build_query_agent` signature (remove `openkb_dir` param): - -```python -async def run_query(question: str, kb_dir: Path, model: str, stream: bool = False) -> str: - from openkb.config import load_config - openkb_dir = kb_dir / ".openkb" - config = load_config(openkb_dir / "config.yaml") - language: str = config.get("language", "en") - - wiki_root = str(kb_dir / "wiki") - agent = build_query_agent(wiki_root, model, language=language) - # ... rest unchanged ... -``` - -- [ ] **Step 4: Update `openkb/schema.py` AGENTS_MD** - -Add a note about `get_page_content` for long documents in the Schema: - -```python -## Page Types -- **Summary Page** (summaries/): Key content of a single source document. -- **Concept Page** (concepts/): Cross-document topic synthesis with [[wikilinks]]. -- **Exploration Page** (explorations/): Saved query results — analyses, comparisons, syntheses. -- **Source Page** (sources/): Full-text for short docs (.md) or per-page JSON for long docs (.json). -- **Index Page** (index.md): One-liner summary of every page in the wiki. Auto-maintained. -``` - -- [ ] **Step 5: Run all tests** - -Run: `pytest tests/ -q` -Expected: All PASS - -- [ ] **Step 6: Commit** - -```bash -git add openkb/agent/query.py openkb/schema.py tests/test_query.py -git commit -m "feat: replace pageindex_retrieve with get_page_content, unify query for all docs" -``` - ---- - -### Task 8: Final cleanup and full verification - -**Files:** -- Modify: `openkb/indexer.py` (remove unused imports) -- Verify all files - -- [ ] **Step 1: Remove unused imports** - -In `indexer.py`, remove `from openkb.tree_renderer import render_source_md` if still present (keep `render_summary_md`). 
- -In `query.py`, verify `PageIndexClient` import is removed. - -- [ ] **Step 2: Run full test suite** - -Run: `pytest tests/ -v` -Expected: All PASS - -- [ ] **Step 3: Grep for dead references** - -Run: `grep -r "pageindex_retrieve\|render_source_md\|_relocate_images" openkb/ tests/` -Expected: No matches - -- [ ] **Step 4: Commit** - -```bash -git add -A -git commit -m "chore: remove dead imports and references" -``` diff --git a/docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md b/docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md deleted file mode 100644 index 2fcd853..0000000 --- a/docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md +++ /dev/null @@ -1,163 +0,0 @@ -# Concept Dedup & Existing Page Update - -**Date:** 2026-04-09 -**Status:** Approved -**Branch:** bugfix/compile - -## Problem - -The compiler pipeline generates concept pages per document, but: - -1. **No dedup** — LLM only sees concept slug names, not content. It can't reliably judge whether a new concept overlaps with an existing one. As the KB grows, concepts duplicate and diverge. -2. **No update of existing pages** — When a new document has information relevant to existing concepts, those pages are not updated. Knowledge doesn't compound across documents. - -The old agent-based approach solved this (the agent could read/write wiki files freely), but was too slow — 20-30 tool-call round-trips per document. - -## Design - -Extend the existing deterministic pipeline to give the LLM enough context for dedup/update decisions, without adding agent loops or breaking prompt caching. - -### Prompt Caching Invariant - -The cached prefix `[system_msg, doc_msg]` must remain identical across all LLM calls within a single document compilation. All new context (concept briefs, existing page content) goes into messages **after** the cached prefix. 
- -### Pipeline Overview - -``` -Step 1: [system, doc] → summary (unchanged) -Step 2: [system, doc, summary, concepts_plan_prompt] → concepts plan JSON -Step 3a: [system, doc, summary, create_prompt] × N → new concept pages (concurrent) -Step 3b: [system, doc, summary, update_prompt] × M → rewritten concept pages (concurrent) -Step 3c: code-only × K → add cross-ref links to related concepts -Step 4: update index (unchanged) -``` - -Steps 3a and 3b share a single semaphore and run concurrently together. - -### Part 1: Concept Briefs - -New function `_read_concept_briefs(wiki_dir)` reads existing concept pages and returns a compact summary string: - -``` -- attention: Attention is a mechanism that allows models to focus on relevant parts... -- transformer-architecture: The Transformer is a neural network architecture... -``` - -For each concept file in `wiki/concepts/*.md`: -- Skip YAML frontmatter -- Take first 150 characters of body text -- Format as `- {slug}: {brief}` - -This replaces the current `", ".join(existing_concepts)` in the concepts-list prompt. Pure file I/O, no LLM call. - -### Part 2: Concepts Plan Prompt - -The `_CONCEPTS_LIST_USER` template is replaced with a new `_CONCEPTS_PLAN_USER` template that asks the LLM to return a JSON object with three action types: - -```json -{ - "create": [{"name": "flash-attention", "title": "Flash Attention"}], - "update": [{"name": "attention", "title": "Attention Mechanism"}], - "related": ["transformer-architecture"] -} -``` - -- **create** — New concept not covered by any existing page. -- **update** — Existing concept with significant new information worth integrating. -- **related** — Existing concept tangentially related; only needs a cross-reference link. - -The prompt includes rules: -- Don't create concepts that overlap with existing ones — use "update" instead. -- Don't create concepts that are just the document topic itself. -- For first few documents, create 2-3 foundational concepts at most. 
-- "related" is for lightweight cross-linking only. - -### Part 3: Three Execution Paths - -#### create (unchanged) - -Same as current: concurrent `_llm_call_async` with `_CONCEPT_PAGE_USER` template. Written via `_write_concept` with `is_update=False`. - -#### update (new) - -New template `_CONCEPT_UPDATE_USER`: - -``` -Update the concept page for: {title} - -Current content of this page: -{existing_content} - -New information from document "{doc_name}" (summarized above) should be -integrated into this page. Rewrite the full page incorporating the new -information naturally. Maintain existing cross-references and add new ones -where appropriate. - -Return ONLY the Markdown content (no frontmatter, no code fences). -``` - -Call structure: `[system_msg, doc_msg, {assistant: summary}, update_user_msg]` - -The cached prefix `[system_msg, doc_msg]` is shared with create calls. The `existing_content` (typically 200-500 tokens) is in the final user message only. - -Written via `_write_concept` with `is_update=True`. The frontmatter `sources:` list is updated to include the new source file. - -#### related (code-only, no LLM) - -For each related slug: -1. Read the concept file -2. If `summaries/{doc_name}` is not already linked, append `\n\nSee also: [[summaries/{doc_name}]]` -3. Update frontmatter `sources:` list - -Pure file I/O, millisecond-level. - -### Part 4: Shared Logic Between Short and Long Doc - -Current `compile_short_doc` and `compile_long_doc` duplicate Steps 2-4. Extract shared logic into `_compile_concepts(wiki_dir, model, system_msg, doc_msg, summary, doc_name, kb_dir, max_concurrency)`. 
- -Public functions become: -- `compile_short_doc`: builds context A from source text → calls `_compile_concepts` -- `compile_long_doc`: builds context A from PageIndex summary → calls `_compile_concepts` - -### Part 5: JSON Parsing Fallback - -If the LLM returns a flat JSON array instead of the expected dict, treat it as all "create" actions: - -```python -if isinstance(parsed, list): - create_list, update_list, related_list = parsed, [], [] -else: - create_list = parsed.get("create", []) - update_list = parsed.get("update", []) - related_list = parsed.get("related", []) -``` - -This ensures backward compatibility if the LLM doesn't follow the new format. - -## Token Cost Analysis - -Compared to current pipeline (per document with C existing concepts): - -| Step | Current | New | Delta | -|------|---------|-----|-------| -| concepts-list prompt | ~50 tokens (slug names) | ~50 + C×30 tokens (briefs) | +C×30 | -| update calls | 0 | M × ~500 tokens (existing content) | +M×500 | -| related | 0 | 0 (code-only) | 0 | - -At C=30 existing concepts: +900 tokens in concepts-list prompt. -At M=2 update calls: +1000 tokens total. - -Total overhead: ~2000 tokens per document. Negligible compared to document content (5K-20K tokens). 
- -## Files Changed - -- `openkb/agent/compiler.py` — all changes - - New: `_read_concept_briefs()`, `_CONCEPTS_PLAN_USER`, `_CONCEPT_UPDATE_USER`, `_add_related_link()`, `_compile_concepts()` - - Modified: `compile_short_doc()`, `compile_long_doc()`, `_parse_json()` caller logic -- `tests/test_compiler.py` — update tests for new JSON format and update/related paths - -## Not In Scope - -- Concept briefs truncation/filtering for very large KBs (100+ concepts) — revisit when needed -- Interactive ingest (human-in-the-loop checkpoint) — separate feature -- Lint --fix auto-repair — separate feature diff --git a/docs/superpowers/specs/2026-04-09-retrieve-redesign.md b/docs/superpowers/specs/2026-04-09-retrieve-redesign.md deleted file mode 100644 index 15224be..0000000 --- a/docs/superpowers/specs/2026-04-09-retrieve-redesign.md +++ /dev/null @@ -1,262 +0,0 @@ -# Retrieve Redesign: Unified Query, Brief Summaries, and Local Page Content - -**Date:** 2026-04-09 -**Status:** Approved -**Branch:** bugfix/compile - -## Problems - -### 1. Long vs Short Doc Split in Query - -The query agent treats long documents (PageIndex-indexed) and short documents differently: - -- **Short docs**: agent reads `wiki/sources/{name}.md` via `read_file` -- **Long docs**: agent calls `pageindex_retrieve(doc_id, question)` — a black-box RAG call - -**Design Principle**: PageIndex is an indexer, not a retriever. Query-time retrieval should be done by the agent navigating the wiki, using the same tools for all documents. - -### 2. index.md Has No Brief Summaries - -Karpathy's gist says index.md should have "each page listed with a link, **a one-line summary**". Currently it only has wikilinks with no descriptions. The query agent must open every file to understand what's available. - -### 3. No Brief Summaries on Concepts Either - -Same problem: concept entries in index.md have no description. The agent can't judge relevance from the index alone. 
- -## Design - -### Part 1: Structured LLM Output with Brief Summaries - -All LLM generation steps (summary, concept create, concept update) now return a JSON object with both a one-line brief and the full content. - -#### Summary Generation - -`_SUMMARY_USER` prompt changes to request JSON output: - -``` -Write a summary page for this document in Markdown. - -Return a JSON object with two keys: -- "brief": A single sentence (under 100 chars) describing the document's main contribution -- "content": The full summary in Markdown. Include key concepts, findings, and [[wikilinks]] - -Return ONLY valid JSON, no fences. -``` - -LLM returns: -```json -{ - "brief": "Introduces the Transformer architecture based entirely on self-attention", - "content": "# Attention Is All You Need\n\nThis paper proposes..." -} -``` - -The `brief` is: -- Written into summary frontmatter: `brief: Introduces the Transformer...` -- Passed to `_update_index` for the Documents section - -The `content` is written to `wiki/summaries/{name}.md` as before. - -#### Concept Generation (create) - -`_CONCEPT_PAGE_USER` prompt changes similarly: - -``` -Write the concept page for: {title} - -Return a JSON object with two keys: -- "brief": A single sentence (under 100 chars) defining this concept -- "content": The full concept page in Markdown with [[wikilinks]] - -Return ONLY valid JSON, no fences. -``` - -The `brief` is: -- Written into concept frontmatter: `brief: Mechanism allowing each position to attend to all others` -- Passed to `_update_index` for the Concepts section -- Used by `_read_concept_briefs` (read from frontmatter instead of truncating body text) - -#### Concept Generation (update) - -`_CONCEPT_UPDATE_USER` also returns `{"brief": "...", "content": "..."}`. The brief may change as the concept evolves with new information. - -#### Long Doc Summary (overview) - -Long documents do NOT need the LLM to generate a brief. 
The brief comes directly from PageIndex's `doc_description` field (available via `IndexResult.description`), which is already a document-level summary generated during indexing. `_LONG_DOC_SUMMARY_USER` stays unchanged (returns plain markdown overview, not JSON) — the brief is passed through from the indexer. - -In `compile_long_doc`, the `doc_description` is passed to `_compile_concepts` which forwards it to `_update_index` as the doc brief. - -#### Parsing - -All LLM responses go through `_parse_json`. Callers extract `brief` and `content`: - -```python -parsed = _parse_json(raw) -brief = parsed.get("brief", "") -content = parsed.get("content", raw) # fallback: treat raw as content if not JSON -``` - -The fallback ensures backward compatibility if the LLM returns plain text instead of JSON. - -### Part 2: index.md with Brief Summaries - -`_update_index` signature changes: - -```python -def _update_index(wiki_dir, doc_name, concept_names, doc_brief="", concept_briefs=None): -``` - -Output format: - -```markdown -## Documents -- [[summaries/attention-is-all-you-need]] — Introduces the Transformer architecture based on self-attention -- [[summaries/flash-attention]] — Efficient attention algorithm reducing memory from quadratic to linear - -## Concepts -- [[concepts/self-attention]] — Mechanism allowing each position to attend to all others in a sequence -- [[concepts/transformer]] — Neural network architecture based entirely on attention mechanisms -``` - -When updating an existing entry (re-compile), the brief is updated in place. - -### Part 3: Frontmatter with Brief - -Summary and concept pages get a `brief` field in frontmatter: - -```markdown ---- -sources: [paper.pdf] -brief: Introduces the Transformer architecture based on self-attention ---- - -# Attention Is All You Need -... -``` - -`_read_concept_briefs` is updated to read from `brief:` frontmatter field instead of truncating body text. 
Fallback to body truncation if `brief:` is absent (backward compat with existing pages). - -### Part 4: Long Doc Sources from Markdown to JSON - -Store per-page content as JSON instead of a giant markdown file. - -**Current**: -``` -wiki/sources/paper.md ← rendered markdown, 10K-50K tokens -``` - -**New**: -``` -wiki/sources/paper.json ← per-page JSON array -``` - -**JSON format** (only the `pages` array from PageIndex, not the full doc object): -```json -[ - { - "page": 1, - "content": "Full text of page 1...", - "images": [{"path": "images/paper/p1_img1.png", "width": 400, "height": 300}] - }, - { - "page": 2, - "content": "Full text of page 2..." - } -] -``` - -`images` field is optional. Image paths are relative to `wiki/sources/`. Short documents are not affected — they stay as `.md`. - -#### Indexer Changes - -In `indexer.py`, replace `render_source_md` + `_relocate_images` with: -1. `col.get_page_content(doc_id, "1-9999")` to get all pages -2. Relocate image paths in each page's `images` array -3. Write as JSON to `wiki/sources/{name}.json` - -### Part 5: New Tool `get_page_content` - -Add to `openkb/agent/tools.py`: - -```python -def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str: - """Get text content of specific pages from a long document. - - Args: - doc_name: Document name (e.g. 'attention-is-all-you-need'). - pages: Page specification (e.g. '3-5,7,10-12'). - wiki_root: Absolute path to the wiki root directory. - """ -``` - -Implementation: -1. Read `wiki/sources/{doc_name}.json` -2. Parse `pages` spec into a set of page numbers (comma-separated, ranges with `-`) -3. Filter pages, format as `[Page N]\n{content}\n\n` -4. Return concatenated text, or error if file not found - -### Part 6: Query Agent Changes - -**Remove**: `pageindex_retrieve` tool and `_pageindex_retrieve_impl` entirely. - -**Add**: `get_page_content` tool. - -**Update instructions**: -``` -## Search strategy -1. 
Read index.md to understand what documents and concepts are available. - Each entry has a brief summary to help you judge relevance. -2. Read relevant summary pages (summaries/) for document overviews. -3. Read concept pages (concepts/) for cross-document synthesis. -4. For long documents, use get_page_content(doc_name, pages) to read - specific pages. The summary page shows chapter structure with page - ranges to help you decide which pages to read. -5. Synthesise a clear, well-cited answer. -``` - -**Remove**: `openkb_dir` and `model` parameters from `build_query_agent`. - -### What Gets Removed - -- `_pageindex_retrieve_impl` (~110 lines) -- `pageindex_retrieve` tool -- `render_source_md` from `tree_renderer.py` -- `_relocate_images` in current form (replaced by per-page relocation) -- PageIndex imports in `query.py` - -### What Stays - -- `render_summary_md` — summaries still markdown -- Short doc pipeline — unchanged -- Image files in `wiki/sources/images/` -- PageIndex in `indexer.py` — still used for tree building - -## Compile Pipeline Changes Summary - -The compile pipeline (`_compile_concepts`, `compile_short_doc`, `compile_long_doc`) changes: - -1. **Summary step**: parse JSON response, extract `brief` + `content` -2. **Concept create/update steps**: parse JSON response, extract `brief` + `content` -3. **`_write_summary`**: add `brief` to frontmatter -4. **`_write_concept`**: add/update `brief` in frontmatter -5. **`_update_index`**: write `— {brief}` after each wikilink -6. 
**`_read_concept_briefs`**: read from `brief:` frontmatter field (fallback to body truncation) - -## Files Changed - -- `openkb/agent/compiler.py` — prompt templates return JSON with brief+content, parse responses, pass briefs to index/frontmatter -- `openkb/indexer.py` — sources output from md to json, image relocation per-page -- `openkb/agent/tools.py` — add `get_page_content` -- `openkb/agent/query.py` — remove `pageindex_retrieve`, add `get_page_content`, update instructions -- `openkb/tree_renderer.py` — remove `render_source_md` -- `openkb/schema.py` — update AGENTS_MD -- `tests/test_compiler.py` — update for JSON LLM responses -- `tests/test_indexer.py` — update for JSON output -- `tests/test_query.py` — update for new tool set -- `tests/test_agent_tools.py` — add tests for `get_page_content` - -## Not In Scope - -- Cloud PageIndex query support (removed entirely) -- Changes to the lint pipeline -- Interactive ingest