From cc12d9573de50c71bd5c5e7bd8f7e0aee63f616b Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 00:21:01 +0800
Subject: [PATCH 01/44] debug code about compile

---
 openkb/__main__.py       |   4 +
 openkb/agent/compiler.py | 635 ++++++++++++++++++++++++++-------------
 openkb/agent/tools.py    |  39 +++
 openkb/cli.py            |  47 ++-
 pyproject.toml           |   2 +-
 tests/test_compiler.py   | 354 ++++++++++++++--------
 6 files changed, 742 insertions(+), 339 deletions(-)
 create mode 100644 openkb/__main__.py

diff --git a/openkb/__main__.py b/openkb/__main__.py
new file mode 100644
index 0000000..28f9e41
--- /dev/null
+++ b/openkb/__main__.py
@@ -0,0 +1,4 @@
+"""Allow running OpenKB as ``python -m openkb``."""
+from openkb.cli import cli
+
+cli()
diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index d4e34e3..8307abb 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -1,202 +1,281 @@
-"""Wiki compilation agent for OpenKB.
-
-Provides an agent that reads converted documents, generates summaries,
-updates concept pages, and maintains the wiki index.
+"""Wiki compilation pipeline for OpenKB.
+
+Pipeline leveraging LLM prompt caching:
+  Step 1: Build base context A (schema + document content).
+  Step 2: A → generate summary.
+  Step 3: A + summary → extract concept list.
+  Step 4: Concurrent LLM calls (A cached) → generate each concept page.
+  Step 5: Code writes all files, updates index, appends log.
 """
 from __future__ import annotations
 
+import asyncio
+import json
+import logging
+import sys
+import time
 from pathlib import Path
 
-from agents import Agent, Runner, function_tool
-import os
+import litellm
+
+from openkb.schema import get_agents_md
 
-from pageindex import PageIndexClient
+logger = logging.getLogger(__name__)
 
-from openkb.agent.tools import list_wiki_files, read_wiki_file, write_wiki_file
-from openkb.schema import SCHEMA_MD, get_agents_md
+# ---------------------------------------------------------------------------
+# Prompt templates
+# ---------------------------------------------------------------------------
 
-_COMPILER_INSTRUCTIONS_TEMPLATE = """\
+_SYSTEM_TEMPLATE = """\
 You are a wiki compilation agent for a personal knowledge base.
 
 {schema_md}
 
-## Your job
-When given a new document, you must:
-1. Write a summary page to summaries/<doc_name>.md with:
-   - A YAML frontmatter block: `sources: [filename]`
-   - Key concepts, findings, and ideas from the document
-   - [[wikilinks]] to related concepts
-2. Update or create concept pages in concepts/ for any significant cross-document themes.
-3. Update index.md:
-   - Under ## Documents: add a one-liner entry for the new document
-   - Under ## Concepts: add/update entries for any concepts you touched
-
-Always use the provided tools to read existing wiki pages before writing,
-so you can append or update without losing prior content.
-Use [[wikilinks]] consistently to connect related pages.
+Write all content in {language} language.
+Use [[wikilinks]] to connect related pages (e.g. [[concepts/attention]]).
 """
 
-_LONG_DOC_INSTRUCTIONS_TEMPLATE = """\
-You are a wiki compilation agent for a personal knowledge base.
+_SUMMARY_USER = """\
+New document: {doc_name}
 
-{schema_md}
+Full text:
+{content}
 
-## Your job for long documents (already summarised by PageIndex)
-The summary and source pages are already written. Your tasks are:
-1. Update or create concept pages in concepts/ for significant themes.
-2. Update index.md:
-   - Under ## Documents: add a one-liner entry referencing the document
-   - Under ## Concepts: add/update entries for any concepts you touched
-3. Do NOT regenerate or overwrite the existing summary page.
-
-Use get_page_content to fetch specific page ranges from long documents when
-you need more detail before writing concept pages.
-Always read existing wiki pages before writing to preserve prior content.
-Use [[wikilinks]] consistently to connect related pages.
+Write a summary page for this document in Markdown. Include:
+- Key concepts, findings, and ideas
+- [[wikilinks]] to concepts that could become cross-document concept pages
+
+Return ONLY the Markdown content (no frontmatter, no code fences).
 """
 
+_CONCEPTS_LIST_USER = """\
+Based on the summary above, identify the key concepts worth creating as \
+standalone wiki concept pages.
 
-def build_compiler_agent(wiki_root: str, model: str, language: str = "en") -> Agent:
-    """Build and return the wiki-compiler agent.
+Existing concept pages: {existing_concepts}
 
-    Creates @function_tool wrappers that bind *wiki_root* so the agent
-    doesn't need to supply it explicitly.
+Return a JSON array of objects, each with:
+- "name": concept slug (e.g. "transformer-architecture")
+- "title": human-readable title (e.g. "Transformer Architecture")
+- "is_update": true if this concept already exists and should be updated
 
-    Args:
-        wiki_root: Absolute path to the wiki directory.
-        model: LLM model name to use for the agent.
-        language: Language code for wiki content (e.g. 'en', 'fr').
+Only include concepts for significant themes. For the first document, \
+create 2-3 foundational concepts at most. Do NOT create concepts that are \
+just the document topic itself (e.g. don't create "machine-translation" \
+for a translation paper).
 
-    Returns:
-        Configured :class:`~agents.Agent` instance.
-    """
-    schema_md = get_agents_md(Path(wiki_root))
-    instructions = _COMPILER_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md)
-    instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language."
-
-    @function_tool
-    def list_files(directory: str) -> str:
-        """List all Markdown files in a wiki subdirectory.
-
-        Args:
-            directory: Subdirectory path relative to wiki root (e.g. 'sources').
-        """
-        return list_wiki_files(directory, wiki_root)
-
-    @function_tool
-    def read_file(path: str) -> str:
-        """Read a Markdown file from the wiki.
-
-        Args:
-            path: File path relative to wiki root (e.g. 'sources/notes.md').
-        """
-        return read_wiki_file(path, wiki_root)
-
-    @function_tool
-    def write_file(path: str, content: str) -> str:
-        """Write or overwrite a Markdown file in the wiki.
-
-        Args:
-            path: File path relative to wiki root (e.g. 'concepts/attention.md').
-            content: Markdown content to write.
-        """
-        return write_wiki_file(path, content, wiki_root)
-
-    from agents.model_settings import ModelSettings
-
-    return Agent(
-        name="wiki-compiler",
-        instructions=instructions,
-        tools=[list_files, read_file, write_file],
-        model=f"litellm/{model}",
-        model_settings=ModelSettings(parallel_tool_calls=False),
-    )
+Return ONLY valid JSON array, no fences, no explanation.
+"""
 
+_CONCEPT_PAGE_USER = """\
+Write the concept page for: {title}
 
-def build_long_doc_compiler_agent(wiki_root: str, kb_dir: str, model: str, language: str = "en") -> Agent:
-    """Build the wiki-compiler agent with an extra get_page_content tool.
+This concept relates to the document "{doc_name}" summarized above.
+{update_instruction}
 
-    Args:
-        wiki_root: Absolute path to the wiki directory.
-        kb_dir: Absolute path to the knowledge base root (contains .openkb/).
-        model: LLM model name to use for the agent.
-        language: Language code for wiki content (e.g. 'en', 'fr').
+Return ONLY the Markdown content (no frontmatter, no code fences). Include:
+- Clear explanation of the concept
+- Key details from the source document
+- [[wikilinks]] to related concepts and [[summaries/{doc_name}]]
+"""
 
-    Returns:
-        Configured :class:`~agents.Agent` instance.
-    """
-    from openkb.config import load_config
+_LONG_DOC_SUMMARY_USER = """\
+This is a PageIndex summary for long document "{doc_name}" (doc_id: {doc_id}):
 
-    openkb_dir = Path(kb_dir) / ".openkb"
-    config = load_config(openkb_dir / "config.yaml")
-    _model = config.get("model", model)
-    pageindex_api_key = os.environ.get("PAGEINDEX_API_KEY", "")
-    client = PageIndexClient(
-        api_key=pageindex_api_key or None,
-        model=_model,
-        storage_path=str(openkb_dir),
-    )
-    col = client.collection()
-
-    schema_md = get_agents_md(Path(wiki_root))
-    instructions = _LONG_DOC_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md)
-    instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language."
-
-    @function_tool
-    def list_files(directory: str) -> str:
-        """List all Markdown files in a wiki subdirectory.
-
-        Args:
-            directory: Subdirectory path relative to wiki root (e.g. 'sources').
-        """
-        return list_wiki_files(directory, wiki_root)
-
-    @function_tool
-    def read_file(path: str) -> str:
-        """Read a Markdown file from the wiki.
-
-        Args:
-            path: File path relative to wiki root (e.g. 'sources/notes.md').
-        """
-        return read_wiki_file(path, wiki_root)
-
-    @function_tool
-    def write_file(path: str, content: str) -> str:
-        """Write or overwrite a Markdown file in the wiki.
-
-        Args:
-            path: File path relative to wiki root (e.g. 'concepts/attention.md').
-            content: Markdown content to write.
-        """
-        return write_wiki_file(path, content, wiki_root)
-
-    @function_tool
-    def get_page_content(doc_id: str, pages: str) -> str:
-        """Retrieve text content for specific pages of a long document.
-
-        Args:
-            doc_id: Document identifier from PageIndex.
-            pages: Page range string, e.g. '1-5' or '3,7,12'.
-        """
-        results = col.get_page_content(doc_id, pages)
-        if not results:
-            return "No content found for the given pages."
-        parts = []
-        for item in results:
-            page_num = item.get("page_index", "?")
-            text = item.get("text", "")
-            parts.append(f"[Page {page_num}]\n{text}")
-        return "\n\n".join(parts)
-
-    from agents.model_settings import ModelSettings
-
-    return Agent(
-        name="wiki-compiler",
-        instructions=instructions,
-        tools=[list_files, read_file, write_file, get_page_content],
-        model=f"litellm/{_model}",
-        model_settings=ModelSettings(parallel_tool_calls=False),
-    )
+{content}
+
+Based on this structured summary, write a concise overview that captures \
+the key themes and findings. This will be used to generate concept pages.
+
+Return ONLY the Markdown content (no frontmatter, no code fences).
+"""
+
+
+# ---------------------------------------------------------------------------
+# LLM helpers
+# ---------------------------------------------------------------------------
+
+import threading
+
+
+class _Spinner:
+    """Print a label, then one dot per second from a daemon thread until stopped."""
+
+    def __init__(self, label: str):
+        self._label = label
+        self._stop = threading.Event()
+        self._thread: threading.Thread | None = None
+
+    def start(self) -> None:
+        """Emit the indented label and launch the dot-printing thread."""
+        print(f"    {self._label}", end="", flush=True)
+        self._thread = threading.Thread(target=self._run, daemon=True)
+        self._thread.start()
+
+    def _run(self) -> None:
+        # Event.wait returns False on timeout and True once stop() has set the event.
+        while not self._stop.wait(timeout=1.0):
+            print(".", end="", flush=True)
+
+    def stop(self, suffix: str = "") -> None:
+        """Halt the dot thread, then finish the line with *suffix*."""
+        self._stop.set()
+        if self._thread is not None:
+            self._thread.join()
+        print(f" {suffix}", flush=True)
+
+
+def _format_usage(elapsed: float, usage) -> str:
+    """Format elapsed seconds and token counts; token fields missing from *usage* print as ``?``."""
+    # Read every field defensively so an absent or partial usage object cannot raise after a successful call.
+    details = getattr(usage, "prompt_tokens_details", None)
+    cached = getattr(details, "cached_tokens", None)
+    tokens = f"in={getattr(usage, 'prompt_tokens', '?')}, out={getattr(usage, 'completion_tokens', '?')}"
+    return f"{elapsed:.1f}s ({tokens}{f', cached={cached}' if cached else ''})"
+
+
+def _fmt_messages(messages: list[dict], max_content: int = 200) -> str:
+    """Render messages for debug output.
+
+    Each message becomes one indented ``[role] content`` line; long content is truncated.
+    """
+    def _preview(body: str) -> str:
+        # Long bodies keep their first max_content characters plus the original length.
+        if len(body) > max_content:
+            return body[:max_content] + f"... ({len(body)} chars)"
+        return body
+
+    return "\n".join(f"      [{msg['role']}] {_preview(msg['content'])}" for msg in messages)
+
+
+def _llm_call(model: str, messages: list[dict], step_name: str, **kwargs) -> str:
+    """Run one blocking LLM completion behind a progress spinner; return the stripped reply text."""
+    logger.debug("LLM request [%s]:\n%s", step_name, _fmt_messages(messages))
+    if kwargs:
+        logger.debug("LLM kwargs [%s]: %s", step_name, kwargs)
+    spinner = _Spinner(step_name)
+    spinner.start()
+    t0, status = time.time(), "failed"  # status is replaced only once the call has succeeded
+    try:
+        response = litellm.completion(model=model, messages=messages, **kwargs)
+        content = response.choices[0].message.content or ""
+        status = _format_usage(time.time() - t0, response.usage)
+    finally:
+        spinner.stop(status)  # always stop, or the dot thread keeps printing over the error output
+    logger.debug("LLM response [%s]:\n%s", step_name, content[:500] + ("..." if len(content) > 500 else ""))
+    return content.strip()
+
+
+async def _llm_call_async(model: str, messages: list[dict], step_name: str) -> str:
+    """Await one LLM completion, print its timing/usage line, and return the stripped text."""
+    logger.debug("LLM request [%s]:\n%s", step_name, _fmt_messages(messages))
+
+    started = time.time()
+    response = await litellm.acompletion(model=model, messages=messages)
+    reply = response.choices[0].message.content or ""
+    usage_line = _format_usage(time.time() - started, response.usage)
+
+    # One complete progress line per call, written only after the response has arrived.
+    print(f"    {step_name}... {usage_line}", flush=True)
+    preview = reply[:500] + ("..." if len(reply) > 500 else "")
+    logger.debug("LLM response [%s]:\n%s", step_name, preview)
+    return reply.strip()
+
+
+def _parse_json(text: str) -> list | dict:
+    """Decode JSON from an LLM reply, tolerating a surrounding Markdown code fence."""
+    payload = text.strip()
+    if payload.startswith("```"):
+        # Drop the opening fence line; str.index raises ValueError when the fence has no newline.
+        payload = payload[payload.index("\n") + 1:]
+        if payload.endswith("```"):
+            payload = payload[:-3]
+    return json.loads(payload.strip())
+
+
+# ---------------------------------------------------------------------------
+# File I/O helpers
+# ---------------------------------------------------------------------------
+
+def _read_wiki_context(wiki_dir: Path) -> tuple[str, list[str]]:
+    """Return ``(index.md text, sorted concept slugs)``; each part is empty when its path is absent."""
+    index_path = wiki_dir / "index.md"
+    concepts_dir = wiki_dir / "concepts"
+    index_content = ""
+    if index_path.exists():
+        index_content = index_path.read_text(encoding="utf-8")
+    slugs = [page.stem for page in concepts_dir.glob("*.md")] if concepts_dir.exists() else []
+    return index_content, sorted(slugs)
+
+
+def _find_source_filename(doc_name: str, kb_dir: Path) -> str:
+    """Return the name of the first ``raw/`` entry whose stem is *doc_name*, else ``<doc_name>.pdf``."""
+    raw_dir = kb_dir / "raw"
+    if not raw_dir.exists():
+        return f"{doc_name}.pdf"
+    # next() with a default gives "first match in directory order, or the fallback name".
+    matches = (entry.name for entry in raw_dir.iterdir() if entry.stem == doc_name)
+    return next(matches, f"{doc_name}.pdf")
+
+
+def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str) -> None:
+    """Save *summary* to ``summaries/<doc_name>.md`` beneath a ``sources`` frontmatter block."""
+    summaries_dir = wiki_dir / "summaries"
+    summaries_dir.mkdir(parents=True, exist_ok=True)
+    page = f"---\nsources: [{source_file}]\n---\n\n{summary}"
+    (summaries_dir / f"{doc_name}.md").write_text(page, encoding="utf-8")
+
+
+def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is_update: bool) -> None:
+    """Write a new concept page, or append *content* to an existing one when *is_update* is set."""
+    concepts_dir = wiki_dir / "concepts"
+    concepts_dir.mkdir(parents=True, exist_ok=True)
+    path = concepts_dir / f"{name}.md"
+
+    if is_update and path.exists():
+        existing = path.read_text(encoding="utf-8")
+        # A source file already mentioned anywhere in the page is treated as merged: nothing is added.
+        if source_file not in existing:
+            # Offset of the closing frontmatter fence; -1 when there is no frontmatter or it is unterminated.
+            end = existing.find("---", 3) if existing.startswith("---") else -1
+            if end != -1 and "sources:" in existing[:end]:
+                # Register the new source once, inside the frontmatter only, never in the body.
+                head = existing[:end].replace("sources: [", f"sources: [{source_file}, ", 1)
+                existing = head + existing[end:]
+            existing += f"\n\n{content}"
+        path.write_text(existing, encoding="utf-8")
+    else:
+        frontmatter = f"---\nsources: [{source_file}]\n---\n\n"
+        path.write_text(frontmatter + content, encoding="utf-8")
+
+
+def _update_index(wiki_dir: Path, doc_name: str, concept_names: list[str]) -> None:
+    """Insert missing document/concept links directly under their headings in index.md (no-op if absent)."""
+    index_path = wiki_dir / "index.md"
+    if not index_path.exists():
+        return
+
+    text = index_path.read_text(encoding="utf-8")
+    if text and not text.endswith("\n"):
+        text += "\n"  # lets a heading on the final line match the "heading + newline" patterns below
+
+    doc_entry = f"- [[summaries/{doc_name}]]"
+    if doc_entry not in text:
+        text = text.replace("## Documents\n", f"## Documents\n{doc_entry}\n", 1)
+
+    for name in concept_names:
+        concept_entry = f"- [[concepts/{name}]]"
+        if concept_entry not in text:
+            text = text.replace("## Concepts\n", f"## Concepts\n{concept_entry}\n", 1)
+
+    index_path.write_text(text, encoding="utf-8")
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+DEFAULT_COMPILE_CONCURRENCY = 5
 
 
 async def compile_short_doc(
@@ -204,17 +283,15 @@ async def compile_short_doc(
     source_path: Path,
     kb_dir: Path,
     model: str,
+    max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
 ) -> None:
-    """Run the compiler agent for a short (non-PageIndex) document.
+    """Compile a short document using a multi-step LLM pipeline with caching.
 
-    Reads the converted source Markdown, then asks the agent to generate a
-    summary, update concept pages, and update the index.
-
-    Args:
-        doc_name: Document stem name (no extension).
-        source_path: Path to the converted Markdown in wiki/sources/.
-        kb_dir: Root of the knowledge base (contains wiki/ and .openkb/).
-        model: LLM model name.
+    Step 1: Build base context A (schema + doc content).
+    Step 2: A → generate summary.
+    Step 3: A + summary → extract concept list.
+    Step 4: Concurrent LLM calls (A cached) → generate each concept page.
+    Step 5: Code writes files, updates index.
     """
     from openkb.config import load_config
 
@@ -222,17 +299,92 @@ async def compile_short_doc(
     config = load_config(openkb_dir / "config.yaml")
     language: str = config.get("language", "en")
 
-    wiki_root = str(kb_dir / "wiki")
-    agent = build_compiler_agent(wiki_root, model, language=language)
-
+    wiki_dir = kb_dir / "wiki"
+    schema_md = get_agents_md(wiki_dir)
+    source_file = _find_source_filename(doc_name, kb_dir)
     content = source_path.read_text(encoding="utf-8")
-    message = (
-        f"New document: {doc_name}\n\n"
-        f"Full text:\n{content}\n\n"
-        "Generate summary, update concepts, update index."
+
+    # Base context A: system + document
+    system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format(
+        schema_md=schema_md, language=language,
+    )}
+    doc_msg = {"role": "user", "content": _SUMMARY_USER.format(
+        doc_name=doc_name, content=content,
+    )}
+
+    # --- Step 1: Generate summary ---
+    summary = _llm_call(model, [system_msg, doc_msg], "summary")
+    _write_summary(wiki_dir, doc_name, source_file, summary)
+
+    # --- Step 2: Extract concept list (A cached) ---
+    _, existing_concepts = _read_wiki_context(wiki_dir)
+
+    concepts_list_raw = _llm_call(model, [
+        system_msg,
+        doc_msg,
+        {"role": "assistant", "content": summary},
+        {"role": "user", "content": _CONCEPTS_LIST_USER.format(
+            existing_concepts=", ".join(existing_concepts) if existing_concepts else "(none yet)",
+        )},
+    ], "concepts-list", max_tokens=512)
+
+    try:
+        concepts_list = _parse_json(concepts_list_raw)
+    except (json.JSONDecodeError, ValueError) as exc:
+        logger.warning("Failed to parse concepts list: %s", exc)
+        logger.debug("Raw: %s", concepts_list_raw)
+        _update_index(wiki_dir, doc_name, [])
+        return
+
+    if not concepts_list:
+        _update_index(wiki_dir, doc_name, [])
+        return
+
+    # --- Step 3: Generate concept pages concurrently (A cached) ---
+    semaphore = asyncio.Semaphore(max_concurrency)
+
+    async def _gen_concept(concept: dict) -> tuple[str, str, bool]:
+        name = concept["name"]
+        title = concept.get("title", name)
+        is_update = concept.get("is_update", False)
+        update_instruction = (
+            "This concept page already exists. Add new information from this document "
+            "without duplicating existing content."
+            if is_update else ""
+        )
+
+        async with semaphore:
+            page_content = await _llm_call_async(model, [
+                system_msg,
+                doc_msg,
+                {"role": "assistant", "content": summary},
+                {"role": "user", "content": _CONCEPT_PAGE_USER.format(
+                    title=title, doc_name=doc_name,
+                    update_instruction=update_instruction,
+                )},
+            ], f"concept:{name}")
+
+        return name, page_content, is_update
+
+    sys.stdout.write(f"    Generating {len(concepts_list)} concept(s) (concurrency={max_concurrency})...\n")
+    sys.stdout.flush()
+
+    results = await asyncio.gather(
+        *[_gen_concept(c) for c in concepts_list],
+        return_exceptions=True,
     )
 
-    await Runner.run(agent, message)
+    concept_names = []
+    for r in results:
+        if isinstance(r, Exception):
+            logger.warning("Concept generation failed: %s", r)
+            continue
+        name, page_content, is_update = r
+        _write_concept(wiki_dir, name, page_content, source_file, is_update)
+        concept_names.append(name)
+
+    # --- Step 4: Update index (code only) ---
+    _update_index(wiki_dir, doc_name, concept_names)
 
 
 async def compile_long_doc(
@@ -241,18 +393,12 @@ async def compile_long_doc(
     doc_id: str,
     kb_dir: Path,
     model: str,
+    max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
 ) -> None:
-    """Run the compiler agent for a long (PageIndex) document.
-
-    The summary page is already written. The agent updates concept pages and
-    the index without regenerating the summary.
+    """Compile a long (PageIndex) document's concepts and index.
 
-    Args:
-        doc_name: Document stem name (no extension).
-        summary_path: Path to the existing summary Markdown in wiki/summaries/.
-        doc_id: PageIndex document identifier.
-        kb_dir: Root of the knowledge base.
-        model: LLM model name.
+    The summary page is already written by the indexer. This function
+    generates concept pages and updates the index.
     """
     from openkb.config import load_config
 
@@ -260,14 +406,87 @@ async def compile_long_doc(
     config = load_config(openkb_dir / "config.yaml")
     language: str = config.get("language", "en")
 
-    wiki_root = str(kb_dir / "wiki")
-    agent = build_long_doc_compiler_agent(wiki_root, str(kb_dir), model, language=language)
-
-    content = summary_path.read_text(encoding="utf-8")
-    message = (
-        f"New long document: {doc_name} (doc_id: {doc_id})\n"
-        f"Summary tree:\n{content}\n"
-        "Update concepts and index. Do NOT regenerate summary."
+    wiki_dir = kb_dir / "wiki"
+    schema_md = get_agents_md(wiki_dir)
+    source_file = _find_source_filename(doc_name, kb_dir)
+    summary = summary_path.read_text(encoding="utf-8")
+
+    # Base context A
+    system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format(
+        schema_md=schema_md, language=language,
+    )}
+    doc_msg = {"role": "user", "content": _LONG_DOC_SUMMARY_USER.format(
+        doc_name=doc_name, doc_id=doc_id, content=summary,
+    )}
+
+    # --- Step 1: Extract concept list ---
+    _, existing_concepts = _read_wiki_context(wiki_dir)
+
+    # Get a concise overview first (for concept generation context)
+    overview = _llm_call(model, [system_msg, doc_msg], "overview")
+
+    concepts_list_raw = _llm_call(model, [
+        system_msg,
+        doc_msg,
+        {"role": "assistant", "content": overview},
+        {"role": "user", "content": _CONCEPTS_LIST_USER.format(
+            existing_concepts=", ".join(existing_concepts) if existing_concepts else "(none yet)",
+        )},
+    ], "concepts-list", max_tokens=512)
+
+    try:
+        concepts_list = _parse_json(concepts_list_raw)
+    except (json.JSONDecodeError, ValueError) as exc:
+        logger.warning("Failed to parse concepts list: %s", exc)
+        logger.debug("Raw: %s", concepts_list_raw)
+        _update_index(wiki_dir, doc_name, [])
+        return
+
+    if not concepts_list:
+        _update_index(wiki_dir, doc_name, [])
+        return
+
+    # --- Step 2: Generate concept pages concurrently ---
+    semaphore = asyncio.Semaphore(max_concurrency)
+
+    async def _gen_concept(concept: dict) -> tuple[str, str, bool]:
+        name = concept["name"]
+        title = concept.get("title", name)
+        is_update = concept.get("is_update", False)
+        update_instruction = (
+            "This concept page already exists. Add new information."
+            if is_update else ""
+        )
+
+        async with semaphore:
+            page_content = await _llm_call_async(model, [
+                system_msg,
+                doc_msg,
+                {"role": "assistant", "content": overview},
+                {"role": "user", "content": _CONCEPT_PAGE_USER.format(
+                    title=title, doc_name=doc_name,
+                    update_instruction=update_instruction,
+                )},
+            ], f"concept:{name}")
+
+        return name, page_content, is_update
+
+    sys.stdout.write(f"    Generating {len(concepts_list)} concept(s) (concurrency={max_concurrency})...\n")
+    sys.stdout.flush()
+
+    results = await asyncio.gather(
+        *[_gen_concept(c) for c in concepts_list],
+        return_exceptions=True,
     )
 
-    await Runner.run(agent, message)
+    concept_names = []
+    for r in results:
+        if isinstance(r, Exception):
+            logger.warning("Concept generation failed: %s", r)
+            continue
+        name, page_content, is_update = r
+        _write_concept(wiki_dir, name, page_content, source_file, is_update)
+        concept_names.append(name)
+
+    # --- Step 3: Update index (code only) ---
+    _update_index(wiki_dir, doc_name, concept_names)
diff --git a/openkb/agent/tools.py b/openkb/agent/tools.py
index 7a5b1ca..40875f3 100644
--- a/openkb/agent/tools.py
+++ b/openkb/agent/tools.py
@@ -72,3 +72,42 @@ def write_wiki_file(path: str, content: str, wiki_root: str) -> str:
     full_path.parent.mkdir(parents=True, exist_ok=True)
     full_path.write_text(content, encoding="utf-8")
     return f"Written: {path}"
+
+
+def write_wiki_files(files_json: str, wiki_root: str) -> str:
+    """Write multiple Markdown files to the wiki in one call.
+
+    Args:
+        files_json: JSON array of objects, each with ``"path"`` and ``"content"`` keys.
+            Example: ``[{"path": "concepts/foo.md", "content": "# Foo\\n..."}]``
+        wiki_root: Absolute path to the wiki root directory.
+
+    Returns:
+        Summary whose count covers only files actually written, or an error message on failure.
+    """
+    import json
+
+    try:
+        files = json.loads(files_json)
+    except json.JSONDecodeError as exc:
+        return f"Invalid JSON: {exc}"
+    if not isinstance(files, list):
+        return "Expected a JSON array of {path, content} objects."
+
+    root = Path(wiki_root).resolve()
+    report: list[str] = []  # written paths plus "Skipped ..." notes, in input order
+    written = 0  # skipped entries are reported but must not inflate the count
+    for entry in files:
+        # Ignore entries that are not objects with a non-empty string path instead of raising mid-batch.
+        path = entry.get("path") if isinstance(entry, dict) else None
+        if not path or not isinstance(path, str):
+            continue
+        full_path = (root / path).resolve()
+        if not full_path.is_relative_to(root):
+            report.append(f"Skipped (path escape): {path}")
+            continue
+        full_path.parent.mkdir(parents=True, exist_ok=True)
+        full_path.write_text(entry.get("content", ""), encoding="utf-8")
+        report.append(path)
+        written += 1
+    return f"Written {written} files: {', '.join(report)}"
diff --git a/openkb/cli.py b/openkb/cli.py
index da664f5..388ac87 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -3,13 +3,20 @@
 
 import asyncio
 import json
+import logging
 import time
 from pathlib import Path
 
 import os
 
+# Disable Agents SDK tracing (requires OPENAI_API_KEY otherwise)
+os.environ.setdefault("OPENAI_AGENTS_DISABLE_TRACING", "1")
+# Use local model cost map — skip fetching from GitHub on every invocation
+os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "True")
+
 import click
 import litellm
+litellm.suppress_debug_info = True
 from dotenv import load_dotenv
 
 from openkb.config import DEFAULT_CONFIG, load_config, save_config
@@ -17,14 +24,28 @@
 from openkb.log import append_log
 from openkb.schema import AGENTS_MD
 
-load_dotenv()
+load_dotenv()  # load from cwd (covers running inside the KB dir)
+
 
+def _setup_llm_key(kb_dir: Path | None = None) -> None:
+    """Set LiteLLM API key from LLM_API_KEY env var if present.
+
+    If *kb_dir* is given, also loads ``.env`` from the KB root so that
+    the key is found even when the CLI is invoked from another directory.
+    Also propagates to provider-specific env vars (OPENAI_API_KEY, etc.)
+    so that the Agents SDK litellm provider can pick them up.
+    """
+    if kb_dir is not None:
+        env_file = kb_dir / ".env"
+        if env_file.exists():
+            load_dotenv(env_file, override=False)
 
-def _setup_llm_key() -> None:
-    """Set LiteLLM API key from LLM_API_KEY env var if present."""
     api_key = os.environ.get("LLM_API_KEY", "")
     if api_key:
         litellm.api_key = api_key
+        for env_var in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GEMINI_API_KEY"):
+            if not os.environ.get(env_var):
+                os.environ[env_var] = api_key
 
 # Supported document extensions for the `add` command
 SUPPORTED_EXTENSIONS = {
@@ -73,9 +94,10 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
     from openkb.agent.compiler import compile_long_doc, compile_short_doc
     from openkb.state import HashRegistry
 
+    logger = logging.getLogger(__name__)
     openkb_dir = kb_dir / ".openkb"
     config = load_config(openkb_dir / "config.yaml")
-    _setup_llm_key()
+    _setup_llm_key(kb_dir)
     model: str = config.get("model", DEFAULT_CONFIG["model"])
     registry = HashRegistry(openkb_dir / "hashes.json")
 
@@ -85,6 +107,7 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
         result = convert_document(file_path, kb_dir)
     except Exception as exc:
         click.echo(f"  [ERROR] Conversion failed: {exc}")
+        logger.debug("Conversion traceback:", exc_info=True)
         return
 
     if result.skipped:
@@ -101,6 +124,7 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
             index_result = index_long_document(result.raw_path, kb_dir)
         except Exception as exc:
             click.echo(f"  [ERROR] Indexing failed: {exc}")
+            logger.debug("Indexing traceback:", exc_info=True)
             return
 
         summary_path = kb_dir / "wiki" / "summaries" / f"{doc_name}.md"
@@ -117,6 +141,7 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
                     time.sleep(2)
                 else:
                     click.echo(f"  [ERROR] Compilation failed: {exc}")
+                    logger.debug("Compilation traceback:", exc_info=True)
                     return
     else:
         click.echo(f"  Compiling short doc…")
@@ -130,6 +155,7 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
                     time.sleep(2)
                 else:
                     click.echo(f"  [ERROR] Compilation failed: {exc}")
+                    logger.debug("Compilation traceback:", exc_info=True)
                     return
 
     # Register hash only after successful compilation
@@ -146,8 +172,15 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
 # ---------------------------------------------------------------------------
 
 @click.group()
-def cli():
+@click.option("-v", "--verbose", is_flag=True, default=False, help="Enable verbose logging.")
+def cli(verbose):
     """OpenKB — Karpathy's LLM Knowledge Base workflow, powered by PageIndex."""
+    logging.basicConfig(
+        format="%(name)s %(levelname)s: %(message)s",
+        level=logging.WARNING,
+    )
+    if verbose:
+        logging.getLogger("openkb").setLevel(logging.DEBUG)
 
 
 @cli.command()
@@ -249,7 +282,7 @@ def query(question, save):
 
     openkb_dir = kb_dir / ".openkb"
     config = load_config(openkb_dir / "config.yaml")
-    _setup_llm_key()
+    _setup_llm_key(kb_dir)
     model: str = config.get("model", DEFAULT_CONFIG["model"])
 
     try:
@@ -314,7 +347,7 @@ def lint(fix):
 
     openkb_dir = kb_dir / ".openkb"
     config = load_config(openkb_dir / "config.yaml")
-    _setup_llm_key()
+    _setup_llm_key(kb_dir)
     model: str = config.get("model", DEFAULT_CONFIG["model"])
 
     # Structural lint
diff --git a/pyproject.toml b/pyproject.toml
index 393cbd0..eb1cdde 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,7 @@ classifiers = [
 keywords = ["ai", "rag", "retrieval", "knowledge-base", "llm", "pageindex", "agents", "document"]
 dependencies = [
     "pageindex==0.3.0.dev0",
-    "markitdown[all]",
+    "markitdown",
     "click>=8.0",
     "watchdog>=3.0",
     "litellm",
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index 0549bb9..1d17c6c 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -1,158 +1,266 @@
-"""Tests for openkb.agent.compiler."""
+"""Tests for openkb.agent.compiler pipeline."""
 from __future__ import annotations
 
+import json
 from pathlib import Path
-from unittest.mock import AsyncMock, MagicMock, patch
+from unittest.mock import MagicMock, patch, AsyncMock
 
 import pytest
 
 from openkb.agent.compiler import (
-    build_compiler_agent,
     compile_long_doc,
     compile_short_doc,
+    _parse_json,
+    _write_summary,
+    _write_concept,
+    _update_index,
+    _read_wiki_context,
 )
-from openkb.schema import SCHEMA_MD
 
 
-class TestBuildCompilerAgent:
-    def test_agent_name(self, tmp_path):
-        agent = build_compiler_agent(str(tmp_path), "gpt-4o-mini")
-        assert agent.name == "wiki-compiler"
-
-    def test_agent_tools_count(self, tmp_path):
-        agent = build_compiler_agent(str(tmp_path), "gpt-4o-mini")
-        # list_files, read_file, write_file
-        assert len(agent.tools) == 3
-
-    def test_schema_in_instructions(self, tmp_path):
-        agent = build_compiler_agent(str(tmp_path), "gpt-4o-mini")
-        assert SCHEMA_MD in agent.instructions
-
-    def test_agent_model(self, tmp_path):
-        agent = build_compiler_agent(str(tmp_path), "my-custom-model")
-        assert agent.model == "litellm/my-custom-model"
-
-    def test_tool_names(self, tmp_path):
-        agent = build_compiler_agent(str(tmp_path), "gpt-4o-mini")
-        tool_names = {t.name for t in agent.tools}
-        assert "list_files" in tool_names
-        assert "read_file" in tool_names
-        assert "write_file" in tool_names
+class TestParseJson:
+    def test_plain_json(self):
+        assert _parse_json('[{"name": "foo"}]') == [{"name": "foo"}]
+
+    def test_fenced_json(self):
+        text = '```json\n[{"name": "bar"}]\n```'
+        assert _parse_json(text) == [{"name": "bar"}]
+
+    def test_invalid_json(self):
+        with pytest.raises((json.JSONDecodeError, ValueError)):
+            _parse_json("not json")
+
+
+class TestWriteSummary:
+    def test_writes_with_frontmatter(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.")
+        path = wiki / "summaries" / "my-doc.md"
+        assert path.exists()
+        text = path.read_text()
+        assert "sources: [my-doc.pdf]" in text
+        assert "# Summary" in text
+
+
+class TestWriteConcept:
+    def test_new_concept(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False)
+        path = wiki / "concepts" / "attention.md"
+        assert path.exists()
+        text = path.read_text()
+        assert "sources: [paper.pdf]" in text
+        assert "# Attention" in text
+
+    def test_update_concept_appends_source(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nOld content.",
+            encoding="utf-8",
+        )
+        _write_concept(wiki, "attention", "New info from paper2.", "paper2.pdf", True)
+        text = (concepts / "attention.md").read_text()
+        assert "paper2.pdf" in text
+        assert "paper1.pdf" in text
+        assert "New info from paper2." in text
+
+
+class TestUpdateIndex:
+    def test_appends_entries(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
+            encoding="utf-8",
+        )
+        _update_index(wiki, "my-doc", ["attention", "transformer"])
+        text = (wiki / "index.md").read_text()
+        assert "[[summaries/my-doc]]" in text
+        assert "[[concepts/attention]]" in text
+        assert "[[concepts/transformer]]" in text
+
+    def test_no_duplicates(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n- [[summaries/my-doc]]\n\n## Concepts\n",
+            encoding="utf-8",
+        )
+        _update_index(wiki, "my-doc", [])
+        text = (wiki / "index.md").read_text()
+        assert text.count("[[summaries/my-doc]]") == 1
+
+
+class TestReadWikiContext:
+    def test_empty_wiki(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        index, concepts = _read_wiki_context(wiki)
+        assert index == ""
+        assert concepts == []
+
+    def test_with_content(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        (wiki / "index.md").write_text("# Index\n", encoding="utf-8")
+        concepts_dir = wiki / "concepts"
+        concepts_dir.mkdir()
+        (concepts_dir / "attention.md").write_text("# Attention", encoding="utf-8")
+        (concepts_dir / "transformer.md").write_text("# Transformer", encoding="utf-8")
+        index, concepts = _read_wiki_context(wiki)
+        assert "# Index" in index
+        assert concepts == ["attention", "transformer"]
+
+
+def _mock_completion(responses: list[str]):
+    """Create a mock for litellm.completion that returns responses in order."""
+    call_count = {"n": 0}
+
+    def side_effect(*args, **kwargs):
+        idx = min(call_count["n"], len(responses) - 1)
+        call_count["n"] += 1
+        mock_resp = MagicMock()
+        mock_resp.choices = [MagicMock()]
+        mock_resp.choices[0].message.content = responses[idx]
+        mock_resp.usage = MagicMock(prompt_tokens=100, completion_tokens=50)
+        mock_resp.usage.prompt_tokens_details = None
+        return mock_resp
+
+    return side_effect
+
+
+def _mock_acompletion(responses: list[str]):
+    """Create an async mock for litellm.acompletion."""
+    call_count = {"n": 0}
+
+    async def side_effect(*args, **kwargs):
+        idx = min(call_count["n"], len(responses) - 1)
+        call_count["n"] += 1
+        mock_resp = MagicMock()
+        mock_resp.choices = [MagicMock()]
+        mock_resp.choices[0].message.content = responses[idx]
+        mock_resp.usage = MagicMock(prompt_tokens=100, completion_tokens=50)
+        mock_resp.usage.prompt_tokens_details = None
+        return mock_resp
+
+    return side_effect
 
 
 class TestCompileShortDoc:
     @pytest.mark.asyncio
-    async def test_calls_runner_run(self, tmp_path):
-        # Create a source file
-        wiki_dir = tmp_path / "wiki"
-        wiki_dir.mkdir()
-        source_path = wiki_dir / "sources" / "my_doc.md"
-        source_path.parent.mkdir(parents=True)
-        source_path.write_text("# My Doc\n\nSome content.", encoding="utf-8")
-
-        # Create .openkb dir for agent build
-        openkb_dir = tmp_path / ".openkb"
-        openkb_dir.mkdir()
-
-        mock_result = MagicMock()
-        mock_result.final_output = "Done"
+    async def test_full_pipeline(self, tmp_path):
+        # Setup KB structure
+        wiki = tmp_path / "wiki"
+        (wiki / "sources").mkdir(parents=True)
+        (wiki / "summaries").mkdir(parents=True)
+        (wiki / "concepts").mkdir(parents=True)
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
+            encoding="utf-8",
+        )
+        source_path = wiki / "sources" / "test-doc.md"
+        source_path.write_text("# Test Doc\n\nSome content about transformers.", encoding="utf-8")
+        (tmp_path / ".openkb").mkdir()
+        (tmp_path / "raw").mkdir()
+        (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake")
+
+        summary_response = "# Summary\n\nThis document discusses transformers."
+        concepts_list_response = json.dumps([
+            {"name": "transformer", "title": "Transformer", "is_update": False},
+        ])
+        concept_page_response = "# Transformer\n\nA neural network architecture."
+
+        with patch("openkb.agent.compiler.litellm") as mock_litellm:
+            mock_litellm.completion = MagicMock(
+                side_effect=_mock_completion([summary_response, concepts_list_response])
+            )
+            mock_litellm.acompletion = AsyncMock(
+                side_effect=_mock_acompletion([concept_page_response])
+            )
+            await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini")
 
-        with patch("openkb.agent.compiler.Runner.run", new_callable=AsyncMock) as mock_run:
-            mock_run.return_value = mock_result
-            await compile_short_doc("my_doc", source_path, tmp_path, "gpt-4o-mini")
+        # Verify summary written
+        summary_path = wiki / "summaries" / "test-doc.md"
+        assert summary_path.exists()
+        assert "sources: [test-doc.pdf]" in summary_path.read_text()
 
-        mock_run.assert_called_once()
-        call_args = mock_run.call_args
-        agent_arg = call_args[0][0]
-        message_arg = call_args[0][1]
+        # Verify concept written
+        concept_path = wiki / "concepts" / "transformer.md"
+        assert concept_path.exists()
+        assert "sources: [test-doc.pdf]" in concept_path.read_text()
 
-        assert agent_arg.name == "wiki-compiler"
-        assert "my_doc" in message_arg
-        assert "Some content." in message_arg
-        assert "Generate summary" in message_arg
+        # Verify index updated
+        index_text = (wiki / "index.md").read_text()
+        assert "[[summaries/test-doc]]" in index_text
+        assert "[[concepts/transformer]]" in index_text
 
     @pytest.mark.asyncio
-    async def test_message_contains_doc_name_and_content(self, tmp_path):
-        wiki_dir = tmp_path / "wiki"
-        source_path = wiki_dir / "sources" / "test_paper.md"
-        source_path.parent.mkdir(parents=True)
-        source_path.write_text("# Test Paper\n\nKey findings here.", encoding="utf-8")
-
+    async def test_handles_bad_json(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        (wiki / "sources").mkdir(parents=True)
+        (wiki / "summaries").mkdir(parents=True)
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Concepts\n",
+            encoding="utf-8",
+        )
+        source_path = wiki / "sources" / "doc.md"
+        source_path.write_text("Content", encoding="utf-8")
         (tmp_path / ".openkb").mkdir()
 
-        captured = {}
-
-        async def fake_run(agent, message, **kwargs):
-            captured["message"] = message
-            return MagicMock(final_output="ok")
-
-        with patch("openkb.agent.compiler.Runner.run", side_effect=fake_run):
-            await compile_short_doc("test_paper", source_path, tmp_path, "gpt-4o-mini")
+        with patch("openkb.agent.compiler.litellm") as mock_litellm:
+            mock_litellm.completion = MagicMock(
+                side_effect=_mock_completion(["Summary text", "not valid json"])
+            )
+            # Should not raise
+            await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini")
 
-        assert "test_paper" in captured["message"]
-        assert "Key findings here." in captured["message"]
+        # Summary should still be written
+        assert (wiki / "summaries" / "doc.md").exists()
 
 
 class TestCompileLongDoc:
     @pytest.mark.asyncio
-    async def test_calls_runner_run(self, tmp_path):
-        wiki_dir = tmp_path / "wiki"
-        summary_path = wiki_dir / "summaries" / "big_doc.md"
-        summary_path.parent.mkdir(parents=True)
-        summary_path.write_text("# Big Doc Summary\n\nSection tree.", encoding="utf-8")
-
+    async def test_full_pipeline(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        (wiki / "summaries").mkdir(parents=True)
+        (wiki / "concepts").mkdir(parents=True)
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Concepts\n",
+            encoding="utf-8",
+        )
+        summary_path = wiki / "summaries" / "big-doc.md"
+        summary_path.write_text("# Big Doc\n\nPageIndex summary tree.", encoding="utf-8")
         openkb_dir = tmp_path / ".openkb"
         openkb_dir.mkdir()
-        # Write minimal config
         (openkb_dir / "config.yaml").write_text("model: gpt-4o-mini\n")
-
-        mock_result = MagicMock()
-        mock_result.final_output = "Done"
-
-        with patch("openkb.agent.compiler.Runner.run", new_callable=AsyncMock) as mock_run, \
-             patch("openkb.agent.compiler.PageIndexClient") as mock_client_cls:
-            mock_client = MagicMock()
-            mock_client_cls.return_value = mock_client
-            mock_run.return_value = mock_result
-
-            await compile_long_doc(
-                "big_doc", summary_path, "doc-abc123", tmp_path, "gpt-4o-mini"
+        (tmp_path / "raw").mkdir()
+        (tmp_path / "raw" / "big-doc.pdf").write_bytes(b"fake")
+
+        overview_response = "Overview of the big document."
+        concepts_list_response = json.dumps([
+            {"name": "deep-learning", "title": "Deep Learning", "is_update": False},
+        ])
+        concept_page_response = "# Deep Learning\n\nA subfield of ML."
+
+        with patch("openkb.agent.compiler.litellm") as mock_litellm:
+            mock_litellm.completion = MagicMock(
+                side_effect=_mock_completion([overview_response, concepts_list_response])
+            )
+            mock_litellm.acompletion = AsyncMock(
+                side_effect=_mock_acompletion([concept_page_response])
             )
-
-        mock_run.assert_called_once()
-        call_args = mock_run.call_args
-        message_arg = call_args[0][1]
-
-        assert "big_doc" in message_arg
-        assert "doc-abc123" in message_arg
-        assert "Do NOT regenerate summary" in message_arg
-
-    @pytest.mark.asyncio
-    async def test_long_doc_agent_has_four_tools(self, tmp_path):
-        wiki_dir = tmp_path / "wiki"
-        summary_path = wiki_dir / "summaries" / "big.md"
-        summary_path.parent.mkdir(parents=True)
-        summary_path.write_text("Summary content", encoding="utf-8")
-
-        openkb_dir = tmp_path / ".openkb"
-        openkb_dir.mkdir()
-        (openkb_dir / "config.yaml").write_text("model: gpt-4o-mini\n")
-
-        captured_agent = {}
-
-        async def fake_run(agent, message, **kwargs):
-            captured_agent["agent"] = agent
-            return MagicMock(final_output="ok")
-
-        with patch("openkb.agent.compiler.Runner.run", side_effect=fake_run), \
-             patch("openkb.agent.compiler.PageIndexClient") as mock_client_cls:
-            mock_client_cls.return_value = MagicMock()
-
             await compile_long_doc(
-                "big", summary_path, "doc-xyz", tmp_path, "gpt-4o-mini"
+                "big-doc", summary_path, "doc-123", tmp_path, "gpt-4o-mini"
             )
 
-        agent = captured_agent["agent"]
-        assert len(agent.tools) == 4
-        tool_names = {t.name for t in agent.tools}
-        assert "get_page_content" in tool_names
+        concept_path = wiki / "concepts" / "deep-learning.md"
+        assert concept_path.exists()
+        assert "Deep Learning" in concept_path.read_text()
+
+        index_text = (wiki / "index.md").read_text()
+        assert "[[summaries/big-doc]]" in index_text
+        assert "[[concepts/deep-learning]]" in index_text

From 864068173cde37571b59fcbff00f3f50e104ba85 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 21:07:37 +0800
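Subject: [PATCH 02/44] feat: add _read_concept_briefs for concept dedup
 context

Add _read_concept_briefs, which returns one-line summaries of existing
concept pages. Also sanitize concept names before using them as
filenames in _write_concept, add a sources frontmatter entry when an
updated page has none, and move the threading import to the top of the
module.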
---
 openkb/agent/compiler.py | 55 +++++++++++++++++++++++++++++++---
 tests/test_compiler.py   | 64 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 115 insertions(+), 4 deletions(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index 8307abb..9d721ca 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -12,7 +12,9 @@
 import asyncio
 import json
 import logging
+import re
 import sys
+import threading
 import time
 from pathlib import Path
 
@@ -95,9 +97,6 @@
 # LLM helpers
 # ---------------------------------------------------------------------------
 
-import threading
-
-
 class _Spinner:
     """Animated dots spinner that runs in a background thread."""
 
@@ -208,6 +207,37 @@ def _read_wiki_context(wiki_dir: Path) -> tuple[str, list[str]]:
     return index_content, existing
 
 
+def _read_concept_briefs(wiki_dir: Path) -> str:
+    """Read existing concept pages and return compact one-line summaries.
+
+    For each concept, skips YAML frontmatter, takes the first 150 chars of the
+    body (newlines collapsed to spaces), and formats as ``- {slug}: {brief}``.
+
+    Returns "(none yet)" if the concepts directory is missing or empty.
+    """
+    concepts_dir = wiki_dir / "concepts"
+    if not concepts_dir.exists():
+        return "(none yet)"
+
+    md_files = sorted(concepts_dir.glob("*.md"))
+    if not md_files:
+        return "(none yet)"
+
+    lines: list[str] = []
+    for path in md_files:
+        text = path.read_text(encoding="utf-8")
+        # Strip YAML frontmatter if present
+        if text.startswith("---"):
+            end = text.find("---", 3)
+            if end != -1:
+                text = text[end + 3:]
+        body = text.strip().replace("\n", " ")
+        brief = body[:150]
+        lines.append(f"- {path.stem}: {brief}")
+
+    return "\n".join(lines)
+
+
 def _find_source_filename(doc_name: str, kb_dir: Path) -> str:
     """Find the original filename in raw/ for a given doc stem."""
     raw_dir = kb_dir / "raw"
@@ -226,11 +256,24 @@ def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str
     (summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8")
 
 
+_SAFE_NAME_RE = re.compile(r'[^a-zA-Z0-9_\-]')
+
+
+def _sanitize_concept_name(name: str) -> str:
+    """Sanitize a concept name for safe use as a filename."""
+    sanitized = _SAFE_NAME_RE.sub("-", name).strip("-")
+    return sanitized or "unnamed-concept"
+
+
 def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is_update: bool) -> None:
     """Write or update a concept page, managing the sources frontmatter."""
     concepts_dir = wiki_dir / "concepts"
     concepts_dir.mkdir(parents=True, exist_ok=True)
-    path = concepts_dir / f"{name}.md"
+    safe_name = _sanitize_concept_name(name)
+    path = (concepts_dir / f"{safe_name}.md").resolve()
+    if not path.is_relative_to(concepts_dir.resolve()):
+        logger.warning("Concept name escapes concepts dir: %s", name)
+        return
 
     if is_update and path.exists():
         existing = path.read_text(encoding="utf-8")
@@ -241,7 +284,11 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
                 body = existing[end + 3:]
                 if "sources:" in fm:
                     fm = fm.replace("sources: [", f"sources: [{source_file}, ")
+                else:
+                    fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
                 existing = fm + body
+            else:
+                existing = f"---\nsources: [{source_file}]\n---\n\n" + existing
             existing += f"\n\n{content}"
         path.write_text(existing, encoding="utf-8")
     else:
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index 1d17c6c..4be4aa7 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -15,6 +15,7 @@
     _write_concept,
     _update_index,
     _read_wiki_context,
+    _read_concept_briefs,
 )
 
 
@@ -116,6 +117,69 @@ def test_with_content(self, tmp_path):
         assert concepts == ["attention", "transformer"]
 
 
+class TestReadConceptBriefs:
+    def test_empty_wiki(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        (wiki / "concepts").mkdir()
+        assert _read_concept_briefs(wiki) == "(none yet)"
+
+    def test_no_concepts_dir(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        assert _read_concept_briefs(wiki) == "(none yet)"
+
+    def test_reads_briefs_with_frontmatter(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nsources: [paper.pdf]\n---\n\nAttention is a mechanism that allows models to focus on relevant parts.",
+            encoding="utf-8",
+        )
+        result = _read_concept_briefs(wiki)
+        assert "- attention:" in result
+        assert "Attention is a mechanism" in result
+        assert "sources" not in result
+        assert "---" not in result
+
+    def test_reads_briefs_without_frontmatter(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "transformer.md").write_text(
+            "Transformer is a neural network architecture based on attention.",
+            encoding="utf-8",
+        )
+        result = _read_concept_briefs(wiki)
+        assert "- transformer:" in result
+        assert "Transformer is a neural network" in result
+
+    def test_truncates_long_content(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        long_body = "A" * 300
+        (concepts / "longconcept.md").write_text(long_body, encoding="utf-8")
+        result = _read_concept_briefs(wiki)
+        # The brief part should be truncated at 150 chars
+        brief = result.split("- longconcept: ", 1)[1]
+        assert len(brief) == 150
+        assert brief == "A" * 150
+
+    def test_sorted_alphabetically(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "zebra.md").write_text("Zebra concept.", encoding="utf-8")
+        (concepts / "apple.md").write_text("Apple concept.", encoding="utf-8")
+        (concepts / "mango.md").write_text("Mango concept.", encoding="utf-8")
+        result = _read_concept_briefs(wiki)
+        lines = result.strip().splitlines()
+        slugs = [line.split(":")[0].lstrip("- ") for line in lines]
+        assert slugs == ["apple", "mango", "zebra"]
+
+
 def _mock_completion(responses: list[str]):
     """Create a mock for litellm.completion that returns responses in order."""
     call_count = {"n": 0}

From 4f1d3323cc01f4be1cbd053c90773e3349b6748f Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 21:10:30 +0800
Subject: [PATCH 03/44] feat: add concepts plan and update prompt templates

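Add _CONCEPTS_PLAN_USER (create/update/related JSON structure) and
_CONCEPT_UPDATE_USER prompt templates, plus the _add_related_link helper
that appends a "See also" cross-reference to an existing concept page
without an LLM call. Add TestParseConceptsPlan and TestAddRelatedLink
tests.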
---
 openkb/agent/compiler.py | 71 ++++++++++++++++++++++++++++++++++++++++
 tests/test_compiler.py   | 59 +++++++++++++++++++++++++++++++++
 2 files changed, 130 insertions(+)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index 9d721ca..6830b69 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -69,6 +69,33 @@
 Return ONLY valid JSON array, no fences, no explanation.
 """
 
+_CONCEPTS_PLAN_USER = """\
+Based on the summary above, decide how to update the wiki's concept pages.
+
+Existing concept pages:
+{concept_briefs}
+
+Return a JSON object with three keys:
+
+1. "create" — new concepts not covered by any existing page. Array of objects:
+   {{"name": "concept-slug", "title": "Human-Readable Title"}}
+
+2. "update" — existing concepts that have significant new information from \
+this document worth integrating. Array of objects:
+   {{"name": "existing-slug", "title": "Existing Title"}}
+
+3. "related" — existing concepts tangentially related to this document but \
+not needing content changes, just a cross-reference link. Array of slug strings.
+
+Rules:
+- For the first few documents, create 2-3 foundational concepts at most.
+- Do NOT create a concept that overlaps with an existing one — use "update".
+- Do NOT create concepts that are just the document topic itself.
+- "related" is for lightweight cross-linking only, no content rewrite needed.
+
+Return ONLY valid JSON, no fences, no explanation.
+"""
+
 _CONCEPT_PAGE_USER = """\
 Write the concept page for: {title}
 
@@ -81,6 +108,20 @@
 - [[wikilinks]] to related concepts and [[summaries/{doc_name}]]
 """
 
+_CONCEPT_UPDATE_USER = """\
+Update the concept page for: {title}
+
+Current content of this page:
+{existing_content}
+
+New information from document "{doc_name}" (summarized above) should be \
+integrated into this page. Rewrite the full page incorporating the new \
+information naturally — do not just append. Maintain existing \
+[[wikilinks]] and add new ones where appropriate.
+
+Return ONLY the Markdown content (no frontmatter, no code fences).
+"""
+
 _LONG_DOC_SUMMARY_USER = """\
 This is a PageIndex summary for long document "{doc_name}" (doc_id: {doc_id}):
 
@@ -296,6 +337,36 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
         path.write_text(frontmatter + content, encoding="utf-8")
 
 
+def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_file: str) -> None:
+    """Add a cross-reference link to an existing concept page (no LLM call)."""
+    concepts_dir = wiki_dir / "concepts"
+    path = concepts_dir / f"{concept_slug}.md"
+    if not path.exists():
+        return
+
+    text = path.read_text(encoding="utf-8")
+    link = f"[[summaries/{doc_name}]]"
+    if link in text:
+        return
+
+    # Update sources in frontmatter
+    if source_file not in text:
+        if text.startswith("---"):
+            end = text.index("---", 3)
+            fm = text[:end + 3]
+            body = text[end + 3:]
+            if "sources:" in fm:
+                fm = fm.replace("sources: [", f"sources: [{source_file}, ")
+            else:
+                fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
+            text = fm + body
+        else:
+            text = f"---\nsources: [{source_file}]\n---\n\n" + text
+
+    text += f"\n\nSee also: {link}"
+    path.write_text(text, encoding="utf-8")
+
+
 def _update_index(wiki_dir: Path, doc_name: str, concept_names: list[str]) -> None:
     """Append document and concept entries to index.md."""
     index_path = wiki_dir / "index.md"
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index 4be4aa7..fd5b249 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -16,6 +16,7 @@
     _update_index,
     _read_wiki_context,
     _read_concept_briefs,
+    _add_related_link,
 )
 
 
@@ -32,6 +33,31 @@ def test_invalid_json(self):
             _parse_json("not json")
 
 
+class TestParseConceptsPlan:
+    def test_dict_format(self):
+        text = json.dumps({
+            "create": [{"name": "foo", "title": "Foo"}],
+            "update": [{"name": "bar", "title": "Bar"}],
+            "related": ["baz"],
+        })
+        parsed = _parse_json(text)
+        assert isinstance(parsed, dict)
+        assert len(parsed["create"]) == 1
+        assert len(parsed["update"]) == 1
+        assert parsed["related"] == ["baz"]
+
+    def test_fallback_list_format(self):
+        text = json.dumps([{"name": "foo", "title": "Foo"}])
+        parsed = _parse_json(text)
+        assert isinstance(parsed, list)
+
+    def test_fenced_dict(self):
+        text = '```json\n{"create": [], "update": [], "related": []}\n```'
+        parsed = _parse_json(text)
+        assert isinstance(parsed, dict)
+        assert parsed["create"] == []
+
+
 class TestWriteSummary:
     def test_writes_with_frontmatter(self, tmp_path):
         wiki = tmp_path / "wiki"
@@ -180,6 +206,39 @@ def test_sorted_alphabetically(self, tmp_path):
         assert slugs == ["apple", "mango", "zebra"]
 
 
+class TestAddRelatedLink:
+    def test_adds_see_also_link(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nSome content.",
+            encoding="utf-8",
+        )
+        _add_related_link(wiki, "attention", "new-doc", "paper2.pdf")
+        text = (concepts / "attention.md").read_text()
+        assert "[[summaries/new-doc]]" in text
+        assert "paper2.pdf" in text
+
+    def test_skips_if_already_linked(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nSee also: [[summaries/new-doc]]",
+            encoding="utf-8",
+        )
+        _add_related_link(wiki, "attention", "new-doc", "paper1.pdf")
+        text = (concepts / "attention.md").read_text()
+        assert text.count("[[summaries/new-doc]]") == 1
+
+    def test_skips_if_file_missing(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        # Should not raise
+        _add_related_link(wiki, "nonexistent", "doc", "file.pdf")
+
+
 def _mock_completion(responses: list[str]):
     """Create a mock for litellm.completion that returns responses in order."""
     call_count = {"n": 0}

From fc0857e4109e93a6dff2d9ba21422572bc055813 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 21:14:15 +0800
Subject: [PATCH 04/44] feat: concept dedup with briefs, update/related paths,
 extract _compile_concepts

---
 openkb/agent/compiler.py | 270 +++++++++++++++++++--------------------
 tests/test_compiler.py   | 174 ++++++++++++++++++++++++-
 2 files changed, 301 insertions(+), 143 deletions(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index 6830b69..a6f5bdc 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -396,81 +396,67 @@ def _update_index(wiki_dir: Path, doc_name: str, concept_names: list[str]) -> No
 DEFAULT_COMPILE_CONCURRENCY = 5
 
 
-async def compile_short_doc(
-    doc_name: str,
-    source_path: Path,
+async def _compile_concepts(
+    wiki_dir: Path,
     kb_dir: Path,
     model: str,
-    max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
+    system_msg: dict,
+    doc_msg: dict,
+    summary: str,
+    doc_name: str,
+    max_concurrency: int,
 ) -> None:
-    """Compile a short document using a multi-step LLM pipeline with caching.
+    """Shared Steps 2-4: concepts plan → generate/update → index.
 
-    Step 1: Build base context A (schema + doc content).
-    Step 2: A → generate summary.
-    Step 3: A + summary → extract concept list.
-    Step 4: Concurrent LLM calls (A cached) → generate each concept page.
-    Step 5: Code writes files, updates index.
+    Uses ``_CONCEPTS_PLAN_USER`` to get a plan with create/update/related
+    actions, then executes each action type accordingly.
     """
-    from openkb.config import load_config
-
-    openkb_dir = kb_dir / ".openkb"
-    config = load_config(openkb_dir / "config.yaml")
-    language: str = config.get("language", "en")
-
-    wiki_dir = kb_dir / "wiki"
-    schema_md = get_agents_md(wiki_dir)
     source_file = _find_source_filename(doc_name, kb_dir)
-    content = source_path.read_text(encoding="utf-8")
 
-    # Base context A: system + document
-    system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format(
-        schema_md=schema_md, language=language,
-    )}
-    doc_msg = {"role": "user", "content": _SUMMARY_USER.format(
-        doc_name=doc_name, content=content,
-    )}
+    # --- Step 2: Get concepts plan (A cached) ---
+    concept_briefs = _read_concept_briefs(wiki_dir)
 
-    # --- Step 1: Generate summary ---
-    summary = _llm_call(model, [system_msg, doc_msg], "summary")
-    _write_summary(wiki_dir, doc_name, source_file, summary)
-
-    # --- Step 2: Extract concept list (A cached) ---
-    _, existing_concepts = _read_wiki_context(wiki_dir)
-
-    concepts_list_raw = _llm_call(model, [
+    plan_raw = _llm_call(model, [
         system_msg,
         doc_msg,
         {"role": "assistant", "content": summary},
-        {"role": "user", "content": _CONCEPTS_LIST_USER.format(
-            existing_concepts=", ".join(existing_concepts) if existing_concepts else "(none yet)",
+        {"role": "user", "content": _CONCEPTS_PLAN_USER.format(
+            concept_briefs=concept_briefs,
         )},
-    ], "concepts-list", max_tokens=512)
+    ], "concepts-plan", max_tokens=1024)
 
     try:
-        concepts_list = _parse_json(concepts_list_raw)
+        parsed = _parse_json(plan_raw)
     except (json.JSONDecodeError, ValueError) as exc:
-        logger.warning("Failed to parse concepts list: %s", exc)
-        logger.debug("Raw: %s", concepts_list_raw)
+        logger.warning("Failed to parse concepts plan: %s", exc)
+        logger.debug("Raw: %s", plan_raw)
         _update_index(wiki_dir, doc_name, [])
         return
 
-    if not concepts_list:
+    # Fallback: if LLM returns a flat list, treat all items as "create"
+    if isinstance(parsed, list):
+        plan = {"create": parsed, "update": [], "related": []}
+    else:
+        plan = {
+            "create": parsed.get("create", []),
+            "update": parsed.get("update", []),
+            "related": parsed.get("related", []),
+        }
+
+    create_items = plan["create"]
+    update_items = plan["update"]
+    related_items = plan["related"]
+
+    if not create_items and not update_items and not related_items:
         _update_index(wiki_dir, doc_name, [])
         return
 
-    # --- Step 3: Generate concept pages concurrently (A cached) ---
+    # --- Step 3: Generate/update concept pages concurrently (A cached) ---
     semaphore = asyncio.Semaphore(max_concurrency)
 
-    async def _gen_concept(concept: dict) -> tuple[str, str, bool]:
+    async def _gen_create(concept: dict) -> tuple[str, str, bool]:
         name = concept["name"]
         title = concept.get("title", name)
-        is_update = concept.get("is_update", False)
-        update_instruction = (
-            "This concept page already exists. Add new information from this document "
-            "without duplicating existing content."
-            if is_update else ""
-        )
-
         async with semaphore:
             page_content = await _llm_call_async(model, [
                 system_msg,
@@ -478,45 +464,76 @@ async def _gen_concept(concept: dict) -> tuple[str, str, bool]:
                 {"role": "assistant", "content": summary},
                 {"role": "user", "content": _CONCEPT_PAGE_USER.format(
                     title=title, doc_name=doc_name,
-                    update_instruction=update_instruction,
+                    update_instruction="",
                 )},
             ], f"concept:{name}")
+        return name, page_content, False
 
-        return name, page_content, is_update
+    async def _gen_update(concept: dict) -> tuple[str, str, bool]:
+        name = concept["name"]
+        title = concept.get("title", name)
+        concept_path = wiki_dir / "concepts" / f"{name}.md"
+        if concept_path.exists():
+            raw_text = concept_path.read_text(encoding="utf-8")
+            if raw_text.startswith("---"):
+                parts = raw_text.split("---", 2)
+                existing_content = parts[2].strip() if len(parts) >= 3 else raw_text
+            else:
+                existing_content = raw_text
+        else:
+            existing_content = "(page not found — create from scratch)"
+        async with semaphore:
+            page_content = await _llm_call_async(model, [
+                system_msg,
+                doc_msg,
+                {"role": "assistant", "content": summary},
+                {"role": "user", "content": _CONCEPT_UPDATE_USER.format(
+                    title=title, doc_name=doc_name,
+                    existing_content=existing_content,
+                )},
+            ], f"update:{name}")
+        return name, page_content, True
 
-    sys.stdout.write(f"    Generating {len(concepts_list)} concept(s) (concurrency={max_concurrency})...\n")
-    sys.stdout.flush()
+    tasks = []
+    tasks.extend(_gen_create(c) for c in create_items)
+    tasks.extend(_gen_update(c) for c in update_items)
 
-    results = await asyncio.gather(
-        *[_gen_concept(c) for c in concepts_list],
-        return_exceptions=True,
-    )
+    concept_names: list[str] = []
 
-    concept_names = []
-    for r in results:
-        if isinstance(r, Exception):
-            logger.warning("Concept generation failed: %s", r)
-            continue
-        name, page_content, is_update = r
-        _write_concept(wiki_dir, name, page_content, source_file, is_update)
-        concept_names.append(name)
+    if tasks:
+        total = len(tasks)
+        sys.stdout.write(f"    Generating {total} concept(s) (concurrency={max_concurrency})...\n")
+        sys.stdout.flush()
+
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        for r in results:
+            if isinstance(r, Exception):
+                logger.warning("Concept generation failed: %s", r)
+                continue
+            name, page_content, is_update = r
+            _write_concept(wiki_dir, name, page_content, source_file, is_update)
+            concept_names.append(name)
+
+    # --- Step 3b: Process related items (code only, no LLM) ---
+    for slug in related_items:
+        _add_related_link(wiki_dir, slug, doc_name, source_file)
 
     # --- Step 4: Update index (code only) ---
     _update_index(wiki_dir, doc_name, concept_names)
 
 
-async def compile_long_doc(
+async def compile_short_doc(
     doc_name: str,
-    summary_path: Path,
-    doc_id: str,
+    source_path: Path,
     kb_dir: Path,
     model: str,
     max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
 ) -> None:
-    """Compile a long (PageIndex) document's concepts and index.
+    """Compile a short document using a multi-step LLM pipeline with caching.
 
-    The summary page is already written by the indexer. This function
-    generates concept pages and updates the index.
+    Step 1: Build base context A (schema + doc content), generate summary.
+    Steps 2-4: Delegated to ``_compile_concepts``.
     """
     from openkb.config import load_config
 
@@ -527,84 +544,63 @@ async def compile_long_doc(
     wiki_dir = kb_dir / "wiki"
     schema_md = get_agents_md(wiki_dir)
     source_file = _find_source_filename(doc_name, kb_dir)
-    summary = summary_path.read_text(encoding="utf-8")
+    content = source_path.read_text(encoding="utf-8")
 
-    # Base context A
+    # Base context A: system + document
     system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format(
         schema_md=schema_md, language=language,
     )}
-    doc_msg = {"role": "user", "content": _LONG_DOC_SUMMARY_USER.format(
-        doc_name=doc_name, doc_id=doc_id, content=summary,
+    doc_msg = {"role": "user", "content": _SUMMARY_USER.format(
+        doc_name=doc_name, content=content,
     )}
 
-    # --- Step 1: Extract concept list ---
-    _, existing_concepts = _read_wiki_context(wiki_dir)
-
-    # Get a concise overview first (for concept generation context)
-    overview = _llm_call(model, [system_msg, doc_msg], "overview")
+    # --- Step 1: Generate summary ---
+    summary = _llm_call(model, [system_msg, doc_msg], "summary")
+    _write_summary(wiki_dir, doc_name, source_file, summary)
 
-    concepts_list_raw = _llm_call(model, [
-        system_msg,
-        doc_msg,
-        {"role": "assistant", "content": overview},
-        {"role": "user", "content": _CONCEPTS_LIST_USER.format(
-            existing_concepts=", ".join(existing_concepts) if existing_concepts else "(none yet)",
-        )},
-    ], "concepts-list", max_tokens=512)
+    # --- Steps 2-4: Concept plan → generate/update → index ---
+    await _compile_concepts(
+        wiki_dir, kb_dir, model, system_msg, doc_msg,
+        summary, doc_name, max_concurrency,
+    )
 
-    try:
-        concepts_list = _parse_json(concepts_list_raw)
-    except (json.JSONDecodeError, ValueError) as exc:
-        logger.warning("Failed to parse concepts list: %s", exc)
-        logger.debug("Raw: %s", concepts_list_raw)
-        _update_index(wiki_dir, doc_name, [])
-        return
 
-    if not concepts_list:
-        _update_index(wiki_dir, doc_name, [])
-        return
+async def compile_long_doc(
+    doc_name: str,
+    summary_path: Path,
+    doc_id: str,
+    kb_dir: Path,
+    model: str,
+    max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
+) -> None:
+    """Compile a long (PageIndex) document's concepts and index.
 
-    # --- Step 2: Generate concept pages concurrently ---
-    semaphore = asyncio.Semaphore(max_concurrency)
+    The summary page is already written by the indexer. This function
+    generates concept pages and updates the index.
+    """
+    from openkb.config import load_config
 
-    async def _gen_concept(concept: dict) -> tuple[str, str, bool]:
-        name = concept["name"]
-        title = concept.get("title", name)
-        is_update = concept.get("is_update", False)
-        update_instruction = (
-            "This concept page already exists. Add new information."
-            if is_update else ""
-        )
+    openkb_dir = kb_dir / ".openkb"
+    config = load_config(openkb_dir / "config.yaml")
+    language: str = config.get("language", "en")
 
-        async with semaphore:
-            page_content = await _llm_call_async(model, [
-                system_msg,
-                doc_msg,
-                {"role": "assistant", "content": overview},
-                {"role": "user", "content": _CONCEPT_PAGE_USER.format(
-                    title=title, doc_name=doc_name,
-                    update_instruction=update_instruction,
-                )},
-            ], f"concept:{name}")
+    wiki_dir = kb_dir / "wiki"
+    schema_md = get_agents_md(wiki_dir)
+    summary_content = summary_path.read_text(encoding="utf-8")
 
-        return name, page_content, is_update
+    # Base context A
+    system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format(
+        schema_md=schema_md, language=language,
+    )}
+    doc_msg = {"role": "user", "content": _LONG_DOC_SUMMARY_USER.format(
+        doc_name=doc_name, doc_id=doc_id, content=summary_content,
+    )}
 
-    sys.stdout.write(f"    Generating {len(concepts_list)} concept(s) (concurrency={max_concurrency})...\n")
-    sys.stdout.flush()
+    # --- Step 1: Generate overview ---
+    overview = _llm_call(model, [system_msg, doc_msg], "overview")
 
-    results = await asyncio.gather(
-        *[_gen_concept(c) for c in concepts_list],
-        return_exceptions=True,
+    # --- Steps 2-4: Concept plan → generate/update → index ---
+    await _compile_concepts(
+        wiki_dir, kb_dir, model, system_msg, doc_msg,
+        overview, doc_name, max_concurrency,
     )
-
-    concept_names = []
-    for r in results:
-        if isinstance(r, Exception):
-            logger.warning("Concept generation failed: %s", r)
-            continue
-        name, page_content, is_update = r
-        _write_concept(wiki_dir, name, page_content, source_file, is_update)
-        concept_names.append(name)
-
-    # --- Step 3: Update index (code only) ---
-    _update_index(wiki_dir, doc_name, concept_names)
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index fd5b249..d0903f5 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -10,6 +10,7 @@
 from openkb.agent.compiler import (
     compile_long_doc,
     compile_short_doc,
+    _compile_concepts,
     _parse_json,
     _write_summary,
     _write_concept,
@@ -292,9 +293,11 @@ async def test_full_pipeline(self, tmp_path):
         (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake")
 
         summary_response = "# Summary\n\nThis document discusses transformers."
-        concepts_list_response = json.dumps([
-            {"name": "transformer", "title": "Transformer", "is_update": False},
-        ])
+        concepts_list_response = json.dumps({
+            "create": [{"name": "transformer", "title": "Transformer"}],
+            "update": [],
+            "related": [],
+        })
         concept_page_response = "# Transformer\n\nA neural network architecture."
 
         with patch("openkb.agent.compiler.litellm") as mock_litellm:
@@ -364,9 +367,11 @@ async def test_full_pipeline(self, tmp_path):
         (tmp_path / "raw" / "big-doc.pdf").write_bytes(b"fake")
 
         overview_response = "Overview of the big document."
-        concepts_list_response = json.dumps([
-            {"name": "deep-learning", "title": "Deep Learning", "is_update": False},
-        ])
+        concepts_list_response = json.dumps({
+            "create": [{"name": "deep-learning", "title": "Deep Learning"}],
+            "update": [],
+            "related": [],
+        })
         concept_page_response = "# Deep Learning\n\nA subfield of ML."
 
         with patch("openkb.agent.compiler.litellm") as mock_litellm:
@@ -387,3 +392,160 @@ async def test_full_pipeline(self, tmp_path):
         index_text = (wiki / "index.md").read_text()
         assert "[[summaries/big-doc]]" in index_text
         assert "[[concepts/deep-learning]]" in index_text
+
+
+class TestCompileConceptsPlan:
+    """Integration tests for _compile_concepts with the new plan format."""
+
+    def _setup_wiki(self, tmp_path, existing_concepts=None):
+        """Helper to set up a wiki directory with optional existing concepts."""
+        wiki = tmp_path / "wiki"
+        (wiki / "summaries").mkdir(parents=True)
+        (wiki / "concepts").mkdir(parents=True)
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Concepts\n",
+            encoding="utf-8",
+        )
+        (tmp_path / "raw").mkdir(exist_ok=True)
+        (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake")
+
+        if existing_concepts:
+            for name, content in existing_concepts.items():
+                (wiki / "concepts" / f"{name}.md").write_text(
+                    content, encoding="utf-8",
+                )
+
+        return wiki
+
+    @pytest.mark.asyncio
+    async def test_create_and_update_flow(self, tmp_path):
+        """Pre-existing 'attention' concept; plan creates 'flash-attention' and updates 'attention'."""
+        wiki = self._setup_wiki(tmp_path, existing_concepts={
+            "attention": "---\nsources: [old-paper.pdf]\n---\n\n# Attention\n\nOriginal content about attention.",
+        })
+
+        plan_response = json.dumps({
+            "create": [{"name": "flash-attention", "title": "Flash Attention"}],
+            "update": [{"name": "attention", "title": "Attention"}],
+            "related": [],
+        })
+        create_page_response = "# Flash Attention\n\nAn efficient attention algorithm."
+        update_page_response = "# Attention\n\nUpdated content with new info."
+
+        system_msg = {"role": "system", "content": "You are a wiki agent."}
+        doc_msg = {"role": "user", "content": "Document about attention mechanisms."}
+        summary = "Summary of the document."
+
+        call_order = {"n": 0}
+
+        async def ordered_acompletion(*args, **kwargs):
+            idx = call_order["n"]
+            call_order["n"] += 1
+            mock_resp = MagicMock()
+            mock_resp.choices = [MagicMock()]
+            # create tasks come first, then update tasks
+            if idx == 0:
+                mock_resp.choices[0].message.content = create_page_response
+            else:
+                mock_resp.choices[0].message.content = update_page_response
+            mock_resp.usage = MagicMock(prompt_tokens=100, completion_tokens=50)
+            mock_resp.usage.prompt_tokens_details = None
+            return mock_resp
+
+        with patch("openkb.agent.compiler.litellm") as mock_litellm:
+            mock_litellm.completion = MagicMock(
+                side_effect=_mock_completion([plan_response])
+            )
+            mock_litellm.acompletion = AsyncMock(
+                side_effect=ordered_acompletion
+            )
+            await _compile_concepts(
+                wiki, tmp_path, "gpt-4o-mini", system_msg, doc_msg,
+                summary, "test-doc", 5,
+            )
+
+        # Verify flash-attention created
+        fa_path = wiki / "concepts" / "flash-attention.md"
+        assert fa_path.exists()
+        fa_text = fa_path.read_text()
+        assert "sources: [test-doc.pdf]" in fa_text
+        assert "Flash Attention" in fa_text
+
+        # Verify attention updated (is_update=True path in _write_concept)
+        att_path = wiki / "concepts" / "attention.md"
+        assert att_path.exists()
+        att_text = att_path.read_text()
+        assert "test-doc.pdf" in att_text
+        assert "old-paper.pdf" in att_text
+
+        # Verify index updated
+        index_text = (wiki / "index.md").read_text()
+        assert "[[concepts/flash-attention]]" in index_text
+        assert "[[concepts/attention]]" in index_text
+
+    @pytest.mark.asyncio
+    async def test_related_adds_link_no_llm(self, tmp_path):
+        """Plan has only related items. No acompletion calls should be made."""
+        wiki = self._setup_wiki(tmp_path, existing_concepts={
+            "transformer": "---\nsources: [old.pdf]\n---\n\n# Transformer\n\nContent about transformers.",
+        })
+
+        plan_response = json.dumps({
+            "create": [],
+            "update": [],
+            "related": ["transformer"],
+        })
+
+        system_msg = {"role": "system", "content": "You are a wiki agent."}
+        doc_msg = {"role": "user", "content": "Document content."}
+        summary = "Summary."
+
+        with patch("openkb.agent.compiler.litellm") as mock_litellm:
+            mock_litellm.completion = MagicMock(
+                side_effect=_mock_completion([plan_response])
+            )
+            mock_litellm.acompletion = AsyncMock()
+            await _compile_concepts(
+                wiki, tmp_path, "gpt-4o-mini", system_msg, doc_msg,
+                summary, "test-doc", 5,
+            )
+            # acompletion should never be called — related is code-only
+            mock_litellm.acompletion.assert_not_called()
+
+        # Verify link added to transformer page
+        transformer_text = (wiki / "concepts" / "transformer.md").read_text()
+        assert "[[summaries/test-doc]]" in transformer_text
+        assert "test-doc.pdf" in transformer_text
+
+    @pytest.mark.asyncio
+    async def test_fallback_list_format(self, tmp_path):
+        """LLM returns a flat array instead of dict — treated as all create."""
+        wiki = self._setup_wiki(tmp_path)
+
+        plan_response = json.dumps([
+            {"name": "attention", "title": "Attention"},
+        ])
+        concept_page_response = "# Attention\n\nA mechanism for focusing."
+
+        system_msg = {"role": "system", "content": "You are a wiki agent."}
+        doc_msg = {"role": "user", "content": "Document content."}
+        summary = "Summary."
+
+        with patch("openkb.agent.compiler.litellm") as mock_litellm:
+            mock_litellm.completion = MagicMock(
+                side_effect=_mock_completion([plan_response])
+            )
+            mock_litellm.acompletion = AsyncMock(
+                side_effect=_mock_acompletion([concept_page_response])
+            )
+            await _compile_concepts(
+                wiki, tmp_path, "gpt-4o-mini", system_msg, doc_msg,
+                summary, "test-doc", 5,
+            )
+
+        # Verify concept was created (not updated)
+        att_path = wiki / "concepts" / "attention.md"
+        assert att_path.exists()
+        att_text = att_path.read_text()
+        assert "sources: [test-doc.pdf]" in att_text
+        assert "Attention" in att_text

From 4249d5374b4e6f86b7bbb0c16dcc5781883f0bae Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 21:16:27 +0800
Subject: [PATCH 05/44] chore: update compiler docstring, remove dead
 _CONCEPTS_LIST_USER

---
 openkb/agent/compiler.py | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index a6f5bdc..326708a 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -3,9 +3,9 @@
 Pipeline leveraging LLM prompt caching:
   Step 1: Build base context A (schema + document content).
   Step 2: A → generate summary.
-  Step 3: A + summary → extract concept list.
-  Step 4: Concurrent LLM calls (A cached) → generate each concept page.
-  Step 5: Code writes all files, updates index, appends log.
+  Step 3: A + summary → concepts plan (create/update/related).
+  Step 4: Concurrent LLM calls (A cached) → generate new + rewrite updated concepts.
+  Step 5: Code adds cross-ref links to related concepts, updates index.
 """
 from __future__ import annotations
 
@@ -50,24 +50,6 @@
 Return ONLY the Markdown content (no frontmatter, no code fences).
 """
 
-_CONCEPTS_LIST_USER = """\
-Based on the summary above, identify the key concepts worth creating as \
-standalone wiki concept pages.
-
-Existing concept pages: {existing_concepts}
-
-Return a JSON array of objects, each with:
-- "name": concept slug (e.g. "transformer-architecture")
-- "title": human-readable title (e.g. "Transformer Architecture")
-- "is_update": true if this concept already exists and should be updated
-
-Only include concepts for significant themes. For the first document, \
-create 2-3 foundational concepts at most. Do NOT create concepts that are \
-just the document topic itself (e.g. don't create "machine-translation" \
-for a translation paper).
-
-Return ONLY valid JSON array, no fences, no explanation.
-"""
 
 _CONCEPTS_PLAN_USER = """\
 Based on the summary above, decide how to update the wiki's concept pages.

From 1a28c11f24a8412bb709c79c180e9502c38dcf45 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 21:24:48 +0800
Subject: [PATCH 06/44] =?UTF-8?q?fix:=20code=20review=20fixes=20=E2=80=94?=
 =?UTF-8?q?=20security,=20robustness,=20tests,=20and=20CI=20hardening?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Restore markitdown[all] extras for docx/pptx/xlsx support
- Sanitize concept names to prevent path traversal in compiler
- Add path traversal guard in copy_relative_images
- Fix _write_concept duplicate append when frontmatter lacks sources key
- Remove dead write_wiki_files function
- Fix watcher thread race in _schedule_flush
- Warn when unimplemented --fix flag is used in lint command
- Harden CI publish workflow with environment gate and SHA-pinned actions
- Fix test_indexer to actually assert IndexConfig flag values
- Fix test_converter to test correct PDF code path (pymupdf, not markitdown)
- Use str.find() instead of str.index() in frontmatter parsing to avoid ValueError
---
 .github/workflows/publish.yml |  7 ++++---
 openkb/agent/compiler.py      | 34 ++++++++++++++++---------------
 openkb/agent/tools.py         | 38 -----------------------------------
 openkb/cli.py                 |  4 +++-
 openkb/images.py              |  5 ++++-
 openkb/watcher.py             | 11 +++++-----
 pyproject.toml                |  2 +-
 tests/test_converter.py       | 11 ++++------
 tests/test_indexer.py         | 11 +++++-----
 9 files changed, 46 insertions(+), 77 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 78fd0e0..17b26c2 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -8,12 +8,13 @@ on:
 jobs:
   publish:
     runs-on: ubuntu-latest
+    environment: pypi
     permissions:
       id-token: write
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332  # v4.1.7
 
-      - uses: actions/setup-python@v5
+      - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3  # v5.2.0
         with:
           python-version: "3.12"
 
@@ -24,4 +25,4 @@ jobs:
         run: python -m build
 
       - name: Publish to PyPI
-        uses: pypa/gh-action-pypi-publish@release/v1
+        uses: pypa/gh-action-pypi-publish@fb13cb306901256ace3dab689990e13a5550ffaa  # release/v1.11.0
diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index 326708a..9119b03 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -302,14 +302,15 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
         existing = path.read_text(encoding="utf-8")
         if source_file not in existing:
             if existing.startswith("---"):
-                end = existing.index("---", 3)
-                fm = existing[:end + 3]
-                body = existing[end + 3:]
-                if "sources:" in fm:
-                    fm = fm.replace("sources: [", f"sources: [{source_file}, ")
-                else:
-                    fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
-                existing = fm + body
+                end = existing.find("---", 3)
+                if end != -1:
+                    fm = existing[:end + 3]
+                    body = existing[end + 3:]
+                    if "sources:" in fm:
+                        fm = fm.replace("sources: [", f"sources: [{source_file}, ")
+                    else:
+                        fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
+                    existing = fm + body
             else:
                 existing = f"---\nsources: [{source_file}]\n---\n\n" + existing
             existing += f"\n\n{content}"
@@ -334,14 +335,15 @@ def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_f
     # Update sources in frontmatter
     if source_file not in text:
         if text.startswith("---"):
-            end = text.index("---", 3)
-            fm = text[:end + 3]
-            body = text[end + 3:]
-            if "sources:" in fm:
-                fm = fm.replace("sources: [", f"sources: [{source_file}, ")
-            else:
-                fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
-            text = fm + body
+            end = text.find("---", 3)
+            if end != -1:
+                fm = text[:end + 3]
+                body = text[end + 3:]
+                if "sources:" in fm:
+                    fm = fm.replace("sources: [", f"sources: [{source_file}, ")
+                else:
+                    fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
+                text = fm + body
         else:
             text = f"---\nsources: [{source_file}]\n---\n\n" + text
 
diff --git a/openkb/agent/tools.py b/openkb/agent/tools.py
index 40875f3..185344b 100644
--- a/openkb/agent/tools.py
+++ b/openkb/agent/tools.py
@@ -73,41 +73,3 @@ def write_wiki_file(path: str, content: str, wiki_root: str) -> str:
     full_path.write_text(content, encoding="utf-8")
     return f"Written: {path}"
 
-
-def write_wiki_files(files_json: str, wiki_root: str) -> str:
-    """Write multiple Markdown files to the wiki in one call.
-
-    Args:
-        files_json: JSON array of objects, each with ``"path"`` and ``"content"`` keys.
-            Example: ``[{"path": "concepts/foo.md", "content": "# Foo\\n..."}]``
-        wiki_root: Absolute path to the wiki root directory.
-
-    Returns:
-        Summary of written files, or error message on failure.
-    """
-    import json
-
-    try:
-        files = json.loads(files_json)
-    except json.JSONDecodeError as exc:
-        return f"Invalid JSON: {exc}"
-
-    if not isinstance(files, list):
-        return "Expected a JSON array of {path, content} objects."
-
-    root = Path(wiki_root).resolve()
-    written: list[str] = []
-    for entry in files:
-        path = entry.get("path", "")
-        content = entry.get("content", "")
-        if not path:
-            continue
-        full_path = (root / path).resolve()
-        if not full_path.is_relative_to(root):
-            written.append(f"Skipped (path escape): {path}")
-            continue
-        full_path.parent.mkdir(parents=True, exist_ok=True)
-        full_path.write_text(content, encoding="utf-8")
-        written.append(path)
-
-    return f"Written {len(written)} files: {', '.join(written)}"
diff --git a/openkb/cli.py b/openkb/cli.py
index 388ac87..149f391 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -334,9 +334,11 @@ def on_new_files(paths):
 
 
 @cli.command()
-@click.option("--fix", is_flag=True, default=False, help="Automatically fix lint issues.")  # TODO: --fix not yet implemented
+@click.option("--fix", is_flag=True, default=False, help="Automatically fix lint issues (not yet implemented).")
 def lint(fix):
     """Lint the knowledge base for structural and semantic inconsistencies."""
+    if fix:
+        click.echo("Warning: --fix is not yet implemented. Running lint in report-only mode.")
     kb_dir = _find_kb_dir()
     if kb_dir is None:
         click.echo("No knowledge base found. Run `openkb init` first.")
diff --git a/openkb/images.py b/openkb/images.py
index 80ef37f..d72cec7 100644
--- a/openkb/images.py
+++ b/openkb/images.py
@@ -171,7 +171,10 @@ def copy_relative_images(
 
     for match in _RELATIVE_RE.finditer(markdown):
         alt, rel_path = match.group(1), match.group(2)
-        src = source_dir / rel_path
+        src = (source_dir / rel_path).resolve()
+        if not src.is_relative_to(source_dir.resolve()):
+            logger.warning("Image path escapes source dir: %s; skipping.", rel_path)
+            continue
         if not src.exists():
             logger.warning(
                 "Relative image not found: %s; leaving original link.", src
diff --git a/openkb/watcher.py b/openkb/watcher.py
index 77fdf24..2a0fae9 100644
--- a/openkb/watcher.py
+++ b/openkb/watcher.py
@@ -37,11 +37,12 @@ def __init__(self, callback: Callable[[list[str]], None], debounce_seconds: floa
 
     def _schedule_flush(self) -> None:
         """Cancel any existing timer and start a fresh debounce timer."""
-        if self._timer is not None:
-            self._timer.cancel()
-        self._timer = threading.Timer(self._debounce_seconds, self._flush)
-        self._timer.daemon = True
-        self._timer.start()
+        with self._lock:
+            if self._timer is not None:
+                self._timer.cancel()
+            self._timer = threading.Timer(self._debounce_seconds, self._flush)
+            self._timer.daemon = True
+            self._timer.start()
 
     def _flush(self) -> None:
         """Call the callback with all collected pending paths, then clear."""
diff --git a/pyproject.toml b/pyproject.toml
index eb1cdde..393cbd0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,7 @@ classifiers = [
 keywords = ["ai", "rag", "retrieval", "knowledge-base", "llm", "pageindex", "agents", "document"]
 dependencies = [
     "pageindex==0.3.0.dev0",
-    "markitdown",
+    "markitdown[all]",
     "click>=8.0",
     "watchdog>=3.0",
     "litellm",
diff --git a/tests/test_converter.py b/tests/test_converter.py
index 5efb6eb..6c184fd 100644
--- a/tests/test_converter.py
+++ b/tests/test_converter.py
@@ -81,27 +81,24 @@ def test_md_raw_file_copied(self, kb_dir):
 
 
 class TestConvertDocumentPdfShort:
-    def test_short_pdf_converted_via_markitdown(self, kb_dir, tmp_path):
-        """PDF under threshold is converted with markitdown."""
+    def test_short_pdf_converted_via_pymupdf(self, kb_dir, tmp_path):
+        """PDF under threshold is converted with pymupdf (convert_pdf_with_images)."""
         src = tmp_path / "short.pdf"
         src.write_bytes(b"%PDF-1.4 fake content")
 
-        fake_result = MagicMock()
-        fake_result.text_content = "# Short PDF\n\nConverted content."
-
         with (
             patch("openkb.converter.pymupdf.open") as mock_mu,
-            patch("openkb.converter.MarkItDown") as mock_mid_cls,
+            patch("openkb.converter.convert_pdf_with_images", return_value="# Short PDF\n\nConverted.") as mock_cpwi,
         ):
             fake_doc = MagicMock()
             fake_doc.page_count = 5  # below default threshold of 20
             fake_doc.__enter__ = MagicMock(return_value=fake_doc)
             fake_doc.__exit__ = MagicMock(return_value=False)
             mock_mu.return_value = fake_doc
-            mock_mid_cls.return_value.convert.return_value = fake_result
 
             result = convert_document(src, kb_dir)
 
+        mock_cpwi.assert_called_once()
         assert result.skipped is False
         assert result.is_long_doc is False
         assert result.source_path is not None
diff --git a/tests/test_indexer.py b/tests/test_indexer.py
index c9c7101..e35c969 100644
--- a/tests/test_indexer.py
+++ b/tests/test_indexer.py
@@ -95,10 +95,11 @@ def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_pat
         with patch("openkb.indexer.PageIndexClient", return_value=fake_client) as mock_cls:
             index_long_document(pdf_path, kb_dir)
 
-        # Verify PageIndexClient was instantiated
+        # Verify PageIndexClient was instantiated with correct IndexConfig
         mock_cls.assert_called_once()
-        # Check that index_config with correct flags was passed
         _, kwargs = mock_cls.call_args
-        ic = kwargs.get("index_config") or mock_cls.call_args[0][0] if mock_cls.call_args[0] else None
-        # Either as positional or keyword — either way PageIndexClient was called
-        assert mock_cls.called
+        ic = kwargs.get("index_config")
+        assert ic is not None, "index_config must be passed to PageIndexClient"
+        assert ic.if_add_node_text is True
+        assert ic.if_add_node_summary is True
+        assert ic.if_add_doc_description is True

From 4b891fa0db1df37eb822842ede0ef5114a234b60 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 23:07:38 +0800
Subject: [PATCH 07/44] feat: bidirectional backlinks between summaries and
 concepts

- Add _backlink_summary: ensures summary pages link to all related concepts
- Add _backlink_concepts: ensures concept pages link back to source summaries
- _update_index auto-creates index.md if missing
- Both merge into existing sections instead of duplicating
---
 openkb/agent/compiler.py |  65 ++++++++++++++++++++++-
 tests/test_compiler.py   | 110 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 174 insertions(+), 1 deletion(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index 9119b03..5075278 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -351,11 +351,68 @@ def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_f
     path.write_text(text, encoding="utf-8")
 
 
+def _backlink_summary(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None:
+    """Append missing concept wikilinks to the summary page (no LLM call).
+
+    After all concepts are generated, this ensures the summary page links
+    back to every related concept — closing the bidirectional link that
+    concept pages already have toward the summary.
+
+    If a ``## Related Concepts`` section already exists, new links are
+    appended into it rather than creating a duplicate section.
+    """
+    summary_path = wiki_dir / "summaries" / f"{doc_name}.md"
+    if not summary_path.exists():
+        return
+
+    text = summary_path.read_text(encoding="utf-8")
+    missing = [slug for slug in concept_slugs if f"[[concepts/{slug}]]" not in text]
+    if not missing:
+        return
+
+    new_links = "\n".join(f"- [[concepts/{s}]]" for s in missing)
+    if "## Related Concepts" in text:
+        # Insert the new links directly under the existing heading
+        text = text.replace("## Related Concepts\n", f"## Related Concepts\n{new_links}\n", 1)
+    else:
+        text += f"\n\n## Related Concepts\n{new_links}\n"
+    summary_path.write_text(text, encoding="utf-8")
+
+
+def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None:
+    """Append missing summary wikilink to each concept page (no LLM call).
+
+    Ensures every concept page links back to the source document's summary,
+    regardless of whether the LLM included the link in its output.
+
+    If a ``## Related Documents`` section already exists, the link is
+    appended into it rather than creating a duplicate section.
+    """
+    link = f"[[summaries/{doc_name}]]"
+    concepts_dir = wiki_dir / "concepts"
+
+    for slug in concept_slugs:
+        path = concepts_dir / f"{slug}.md"
+        if not path.exists():
+            continue
+        text = path.read_text(encoding="utf-8")
+        if link in text:
+            continue
+        if "## Related Documents" in text:
+            text = text.replace("## Related Documents\n", f"## Related Documents\n- {link}\n", 1)
+        else:
+            text += f"\n\n## Related Documents\n- {link}\n"
+        path.write_text(text, encoding="utf-8")
+
+
 def _update_index(wiki_dir: Path, doc_name: str, concept_names: list[str]) -> None:
     """Append document and concept entries to index.md."""
     index_path = wiki_dir / "index.md"
     if not index_path.exists():
-        return
+        index_path.write_text(
+            "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
+            encoding="utf-8",
+        )
 
     text = index_path.read_text(encoding="utf-8")
 
@@ -503,6 +560,12 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool]:
     for slug in related_items:
         _add_related_link(wiki_dir, slug, doc_name, source_file)
 
+    # --- Step 3c: Backlink — summary ↔ concepts (code only) ---
+    all_concept_slugs = concept_names + [s for s in related_items]
+    if all_concept_slugs:
+        _backlink_summary(wiki_dir, doc_name, all_concept_slugs)
+        _backlink_concepts(wiki_dir, doc_name, all_concept_slugs)
+
     # --- Step 4: Update index (code only) ---
     _update_index(wiki_dir, doc_name, concept_names)
 
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index d0903f5..e1238df 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -18,6 +18,8 @@
     _read_wiki_context,
     _read_concept_briefs,
     _add_related_link,
+    _backlink_summary,
+    _backlink_concepts,
 )
 
 
@@ -207,6 +209,114 @@ def test_sorted_alphabetically(self, tmp_path):
         assert slugs == ["apple", "mango", "zebra"]
 
 
+class TestBacklinkSummary:
+    def test_adds_missing_concept_links(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        summaries = wiki / "summaries"
+        summaries.mkdir(parents=True)
+        (summaries / "paper.md").write_text(
+            "---\nsources: [paper.pdf]\n---\n\n# Summary\n\nContent about attention.",
+            encoding="utf-8",
+        )
+        _backlink_summary(wiki, "paper", ["attention", "transformer"])
+        text = (summaries / "paper.md").read_text()
+        assert "[[concepts/attention]]" in text
+        assert "[[concepts/transformer]]" in text
+
+    def test_skips_already_linked(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        summaries = wiki / "summaries"
+        summaries.mkdir(parents=True)
+        (summaries / "paper.md").write_text(
+            "---\nsources: [paper.pdf]\n---\n\n# Summary\n\nSee [[concepts/attention]].",
+            encoding="utf-8",
+        )
+        _backlink_summary(wiki, "paper", ["attention", "transformer"])
+        text = (summaries / "paper.md").read_text()
+        # attention already linked, should not duplicate
+        assert text.count("[[concepts/attention]]") == 1
+        # transformer should be added
+        assert "[[concepts/transformer]]" in text
+
+    def test_no_op_when_all_linked(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        summaries = wiki / "summaries"
+        summaries.mkdir(parents=True)
+        original = "# Summary\n\n[[concepts/attention]] and [[concepts/transformer]]"
+        (summaries / "paper.md").write_text(original, encoding="utf-8")
+        _backlink_summary(wiki, "paper", ["attention", "transformer"])
+        assert (summaries / "paper.md").read_text() == original
+
+    def test_skips_if_file_missing(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        # Should not raise
+        _backlink_summary(wiki, "nonexistent", ["attention"])
+
+    def test_merges_into_existing_section(self, tmp_path):
+        """Second add should merge into existing ## Related Concepts, not duplicate."""
+        wiki = tmp_path / "wiki"
+        summaries = wiki / "summaries"
+        summaries.mkdir(parents=True)
+        (summaries / "paper.md").write_text(
+            "# Summary\n\nContent.\n\n## Related Concepts\n- [[concepts/attention]]\n",
+            encoding="utf-8",
+        )
+        _backlink_summary(wiki, "paper", ["attention", "transformer"])
+        text = (summaries / "paper.md").read_text()
+        assert text.count("## Related Concepts") == 1
+        assert "[[concepts/transformer]]" in text
+        assert text.count("[[concepts/attention]]") == 1
+
+
+class TestBacklinkConcepts:
+    def test_adds_summary_link_to_concept(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nsources: [paper.pdf]\n---\n\n# Attention\n\nContent.",
+            encoding="utf-8",
+        )
+        _backlink_concepts(wiki, "paper", ["attention"])
+        text = (concepts / "attention.md").read_text()
+        assert "[[summaries/paper]]" in text
+        assert "## Related Documents" in text
+
+    def test_skips_if_already_linked(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "# Attention\n\nBased on [[summaries/paper]].",
+            encoding="utf-8",
+        )
+        _backlink_concepts(wiki, "paper", ["attention"])
+        text = (concepts / "attention.md").read_text()
+        assert text.count("[[summaries/paper]]") == 1
+        assert "## Related Documents" not in text
+
+    def test_merges_into_existing_section(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "# Attention\n\n## Related Documents\n- [[summaries/old-paper]]\n",
+            encoding="utf-8",
+        )
+        _backlink_concepts(wiki, "new-paper", ["attention"])
+        text = (concepts / "attention.md").read_text()
+        assert text.count("## Related Documents") == 1
+        assert "[[summaries/old-paper]]" in text
+        assert "[[summaries/new-paper]]" in text
+
+    def test_skips_missing_concept_file(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        (wiki / "concepts").mkdir(parents=True)
+        # Should not raise
+        _backlink_concepts(wiki, "paper", ["nonexistent"])
+
+
 class TestAddRelatedLink:
     def test_adds_see_also_link(self, tmp_path):
         wiki = tmp_path / "wiki"

From 072d9f557e66dfa3f4b50bccc52eb96466e8d434 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 23:31:38 +0800
Subject: [PATCH 08/44] docs: specs and plans for concept dedup and retrieve
 redesign

---
 .../2026-04-09-concept-dedup-and-update.md    |  888 +++++++++++++
 .../plans/2026-04-09-retrieve-redesign.md     | 1104 +++++++++++++++++
 ...6-04-09-concept-dedup-and-update-design.md |  163 +++
 .../specs/2026-04-09-retrieve-redesign.md     |  262 ++++
 4 files changed, 2417 insertions(+)
 create mode 100644 docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md
 create mode 100644 docs/superpowers/plans/2026-04-09-retrieve-redesign.md
 create mode 100644 docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md
 create mode 100644 docs/superpowers/specs/2026-04-09-retrieve-redesign.md

diff --git a/docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md b/docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md
new file mode 100644
index 0000000..1a312a6
--- /dev/null
+++ b/docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md
@@ -0,0 +1,888 @@
+# Concept Dedup & Existing Page Update — Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Give the compiler enough context about existing concepts to make smart dedup/update decisions, and add the ability to rewrite existing concept pages with new information — all without breaking prompt caching.
+
+**Architecture:** Extend the deterministic pipeline in `compiler.py` with: (1) concept briefs read from disk before the concepts-plan LLM call, (2) a new JSON output format with create/update/related actions, (3) a new concurrent "update" path that sends existing page content to the LLM for rewriting, (4) a code-only "related" path for cross-ref links. Extract shared logic between `compile_short_doc` and `compile_long_doc` into `_compile_concepts`.
+
+**Tech Stack:** Python, litellm, asyncio, pytest
+
+---
+
+### Task 1: Add `_read_concept_briefs` and test
+
+**Files:**
+- Modify: `openkb/agent/compiler.py:199-207` (File I/O helpers section)
+- Modify: `tests/test_compiler.py:98-116` (TestReadWikiContext section)
+
+- [ ] **Step 1: Write the failing test**
+
+Add to `tests/test_compiler.py`:
+
+```python
+from openkb.agent.compiler import _read_concept_briefs
+
+class TestReadConceptBriefs:
+    def test_empty_wiki(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        assert _read_concept_briefs(wiki) == "(none yet)"
+
+    def test_no_concepts_dir(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        assert _read_concept_briefs(wiki) == "(none yet)"
+
+    def test_reads_briefs_with_frontmatter(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nsources: [paper.pdf]\n---\n\nAttention allows models to focus on relevant input parts selectively.",
+            encoding="utf-8",
+        )
+        result = _read_concept_briefs(wiki)
+        assert "- attention: Attention allows models" in result
+
+    def test_reads_briefs_without_frontmatter(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "rnn.md").write_text(
+            "Recurrent neural networks process sequences step by step.",
+            encoding="utf-8",
+        )
+        result = _read_concept_briefs(wiki)
+        assert "- rnn: Recurrent neural networks" in result
+
+    def test_truncates_long_content(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "long.md").write_text("A" * 300, encoding="utf-8")
+        result = _read_concept_briefs(wiki)
+        brief_line = result.split("\n")[0]
+        # slug + ": " + 150 chars = well under 200
+        assert len(brief_line) < 200
+
+    def test_sorted_alphabetically(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "zebra.md").write_text("Zebra concept.", encoding="utf-8")
+        (concepts / "alpha.md").write_text("Alpha concept.", encoding="utf-8")
+        result = _read_concept_briefs(wiki)
+        lines = result.strip().split("\n")
+        assert lines[0].startswith("- alpha:")
+        assert lines[1].startswith("- zebra:")
+```
+
+- [ ] **Step 2: Run test to verify it fails**
+
+Run: `pytest tests/test_compiler.py::TestReadConceptBriefs -v`
+Expected: FAIL with `ImportError: cannot import name '_read_concept_briefs'`
+
+- [ ] **Step 3: Implement `_read_concept_briefs`**
+
+Add to `openkb/agent/compiler.py` in the File I/O helpers section (after `_read_wiki_context`):
+
+```python
+def _read_concept_briefs(wiki_dir: Path) -> str:
+    """Read existing concept pages and return compact briefs for the LLM.
+
+    Returns a string like:
+        - attention: Attention allows models to focus on relevant input parts...
+        - transformer: The Transformer is a neural network architecture...
+
+    Or "(none yet)" if no concept pages exist.
+    """
+    concepts_dir = wiki_dir / "concepts"
+    if not concepts_dir.exists():
+        return "(none yet)"
+    briefs = []
+    for p in sorted(concepts_dir.glob("*.md")):
+        text = p.read_text(encoding="utf-8")
+        # Skip YAML frontmatter
+        if text.startswith("---"):
+            parts = text.split("---", 2)
+            body = parts[2].strip() if len(parts) >= 3 else ""
+        else:
+            body = text.strip()
+        brief = body[:150].replace("\n", " ")
+        if brief:
+            briefs.append(f"- {p.stem}: {brief}")
+    return "\n".join(briefs) or "(none yet)"
+```
+
+- [ ] **Step 4: Run test to verify it passes**
+
+Run: `pytest tests/test_compiler.py::TestReadConceptBriefs -v`
+Expected: All 6 tests PASS
+
+- [ ] **Step 5: Update the import in test file**
+
+Move `_read_concept_briefs` into the existing import block at the top of `tests/test_compiler.py`, replacing the standalone import added in Step 1:
+
+```python
+from openkb.agent.compiler import (
+    compile_long_doc,
+    compile_short_doc,
+    _parse_json,
+    _write_summary,
+    _write_concept,
+    _update_index,
+    _read_wiki_context,
+    _read_concept_briefs,
+)
+```
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add openkb/agent/compiler.py tests/test_compiler.py
+git commit -m "feat: add _read_concept_briefs for concept dedup context"
+```
+
+---
+
+### Task 2: Replace prompt template and update JSON parsing
+
+**Files:**
+- Modify: `openkb/agent/compiler.py:53-70` (prompt templates section)
+- Modify: `tests/test_compiler.py:21-31` (TestParseJson section)
+
+- [ ] **Step 1: Write a compatibility test for the new JSON format**
+
+Add to `tests/test_compiler.py`:
+
+```python
+class TestParseConceptsPlan:
+    def test_dict_format(self):
+        text = json.dumps({
+            "create": [{"name": "foo", "title": "Foo"}],
+            "update": [{"name": "bar", "title": "Bar"}],
+            "related": ["baz"],
+        })
+        parsed = _parse_json(text)
+        assert isinstance(parsed, dict)
+        assert len(parsed["create"]) == 1
+        assert len(parsed["update"]) == 1
+        assert parsed["related"] == ["baz"]
+
+    def test_fallback_list_format(self):
+        """If LLM returns old flat array, _parse_json still works."""
+        text = json.dumps([{"name": "foo", "title": "Foo"}])
+        parsed = _parse_json(text)
+        assert isinstance(parsed, list)
+
+    def test_fenced_dict(self):
+        text = '```json\n{"create": [], "update": [], "related": []}\n```'
+        parsed = _parse_json(text)
+        assert isinstance(parsed, dict)
+        assert parsed["create"] == []
+```
+
+- [ ] **Step 2: Run the tests to verify they pass (they exercise the existing `_parse_json`)**
+
+Run: `pytest tests/test_compiler.py::TestParseConceptsPlan -v`
+Expected: All 3 PASS — `_parse_json` already handles dicts. This confirms compatibility.
+
+- [ ] **Step 3: Replace `_CONCEPTS_LIST_USER` with `_CONCEPTS_PLAN_USER`**
+
+In `openkb/agent/compiler.py`, replace the `_CONCEPTS_LIST_USER` template (lines 53-70) with:
+
+```python
+_CONCEPTS_PLAN_USER = """\
+Based on the summary above, decide how to update the wiki's concept pages.
+
+Existing concept pages:
+{concept_briefs}
+
+Return a JSON object with three keys:
+
+1. "create" — new concepts not covered by any existing page. Array of objects:
+   {{"name": "concept-slug", "title": "Human-Readable Title"}}
+
+2. "update" — existing concepts that have significant new information from \
+this document worth integrating. Array of objects:
+   {{"name": "existing-slug", "title": "Existing Title"}}
+
+3. "related" — existing concepts tangentially related to this document but \
+not needing content changes, just a cross-reference link. Array of slug strings.
+
+Rules:
+- For the first few documents, create 2-3 foundational concepts at most.
+- Do NOT create a concept that overlaps with an existing one — use "update".
+- Do NOT create concepts that are just the document topic itself.
+- "related" is for lightweight cross-linking only, no content rewrite needed.
+
+Return ONLY valid JSON, no fences, no explanation.
+"""
+```
+
+- [ ] **Step 4: Add `_CONCEPT_UPDATE_USER` template**
+
+Add after `_CONCEPT_PAGE_USER` (after line 82):
+
+```python
+_CONCEPT_UPDATE_USER = """\
+Update the concept page for: {title}
+
+Current content of this page:
+{existing_content}
+
+New information from document "{doc_name}" (summarized above) should be \
+integrated into this page. Rewrite the full page incorporating the new \
+information naturally — do not just append. Maintain existing \
+[[wikilinks]] and add new ones where appropriate.
+
+Return ONLY the Markdown content (no frontmatter, no code fences).
+"""
+```
+
+- [ ] **Step 5: Run all existing tests to verify nothing breaks**
+
+Run: `pytest tests/test_compiler.py -v`
+Expected: All PASS (templates aren't tested directly, only via integration tests which we'll update later)
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add openkb/agent/compiler.py tests/test_compiler.py
+git commit -m "feat: add concepts plan and update prompt templates"
+```
+
+---
+
+### Task 3: Add `_add_related_link` and test
+
+**Files:**
+- Modify: `openkb/agent/compiler.py` (File I/O helpers section, after `_write_concept`)
+- Modify: `tests/test_compiler.py`
+
+- [ ] **Step 1: Write the failing test**
+
+Add to `tests/test_compiler.py`:
+
+```python
+from openkb.agent.compiler import _add_related_link
+
+class TestAddRelatedLink:
+    def test_adds_see_also_link(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nSome content.",
+            encoding="utf-8",
+        )
+        _add_related_link(wiki, "attention", "new-doc", "paper2.pdf")
+        text = (concepts / "attention.md").read_text()
+        assert "[[summaries/new-doc]]" in text
+        assert "paper2.pdf" in text
+
+    def test_skips_if_already_linked(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nSee also: [[summaries/new-doc]]",
+            encoding="utf-8",
+        )
+        _add_related_link(wiki, "attention", "new-doc", "paper1.pdf")
+        text = (concepts / "attention.md").read_text()
+        # Should not duplicate
+        assert text.count("[[summaries/new-doc]]") == 1
+
+    def test_skips_if_file_missing(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        # Should not raise
+        _add_related_link(wiki, "nonexistent", "doc", "file.pdf")
+```
+
+- [ ] **Step 2: Run test to verify it fails**
+
+Run: `pytest tests/test_compiler.py::TestAddRelatedLink -v`
+Expected: FAIL with `ImportError: cannot import name '_add_related_link'`
+
+- [ ] **Step 3: Implement `_add_related_link`**
+
+Add to `openkb/agent/compiler.py` after `_write_concept`:
+
+```python
+def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_file: str) -> None:
+    """Add a cross-reference link to an existing concept page (no LLM call)."""
+    concepts_dir = wiki_dir / "concepts"
+    path = concepts_dir / f"{concept_slug}.md"
+    if not path.exists():
+        return
+
+    text = path.read_text(encoding="utf-8")
+    link = f"[[summaries/{doc_name}]]"
+    if link in text:
+        return
+
+    # Update sources in frontmatter
+    if source_file not in text:
+        if text.startswith("---"):
+            end = text.index("---", 3)
+            fm = text[:end + 3]
+            body = text[end + 3:]
+            if "sources:" in fm:
+                fm = fm.replace("sources: [", f"sources: [{source_file}, ")
+            else:
+                fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
+            text = fm + body
+        else:
+            text = f"---\nsources: [{source_file}]\n---\n\n" + text
+
+    text += f"\n\nSee also: {link}"
+    path.write_text(text, encoding="utf-8")
+```
+
+- [ ] **Step 4: Run test to verify it passes**
+
+Run: `pytest tests/test_compiler.py::TestAddRelatedLink -v`
+Expected: All 3 tests PASS
+
+- [ ] **Step 5: Update the import in test file**
+
+Move `_add_related_link` into the import block at the top of `tests/test_compiler.py`, replacing the standalone import added in Step 1.
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add openkb/agent/compiler.py tests/test_compiler.py
+git commit -m "feat: add _add_related_link for code-only cross-referencing"
+```
+
+---
+
+### Task 4: Extract `_compile_concepts` and refactor both public functions
+
+**Files:**
+- Modify: `openkb/agent/compiler.py:290-509` (Public API section — full rewrite)
+- Modify: `tests/test_compiler.py:153-267` (integration tests)
+
+This is the core task. It extracts the shared Steps 2-4 into `_compile_concepts`, updates both public functions to call it, and switches to the new concepts plan format.
+
+- [ ] **Step 1: Write integration test for new create/update/related flow**
+
+Add to `tests/test_compiler.py`:
+
+```python
+class TestCompileConceptsPlan:
+    """Integration tests for the new create/update/related flow."""
+
+    @pytest.mark.asyncio
+    async def test_create_and_update_flow(self, tmp_path):
+        """New doc creates one concept and updates an existing one."""
+        wiki = tmp_path / "wiki"
+        (wiki / "sources").mkdir(parents=True)
+        (wiki / "summaries").mkdir(parents=True)
+        concepts_dir = wiki / "concepts"
+        concepts_dir.mkdir(parents=True)
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
+            encoding="utf-8",
+        )
+        # Pre-existing concept
+        (concepts_dir / "attention.md").write_text(
+            "---\nsources: [old-paper.pdf]\n---\n\n# Attention\n\nOld content about attention.",
+            encoding="utf-8",
+        )
+
+        source_path = wiki / "sources" / "new-paper.md"
+        source_path.write_text("# New Paper\n\nContent about flash attention and transformers.", encoding="utf-8")
+        (tmp_path / ".openkb").mkdir()
+        (tmp_path / "raw").mkdir()
+        (tmp_path / "raw" / "new-paper.pdf").write_bytes(b"fake")
+
+        summary_resp = "This paper introduces flash attention, improving on attention mechanisms."
+        plan_resp = json.dumps({
+            "create": [{"name": "flash-attention", "title": "Flash Attention"}],
+            "update": [{"name": "attention", "title": "Attention Mechanism"}],
+            "related": [],
+        })
+        create_page_resp = "# Flash Attention\n\nAn efficient attention algorithm."
+        update_page_resp = "# Attention\n\nUpdated content with flash attention details."
+
+        with patch("openkb.agent.compiler.litellm") as mock_litellm:
+            mock_litellm.completion = MagicMock(
+                side_effect=_mock_completion([summary_resp, plan_resp])
+            )
+            mock_litellm.acompletion = AsyncMock(
+                side_effect=_mock_acompletion([create_page_resp, update_page_resp])
+            )
+            await compile_short_doc("new-paper", source_path, tmp_path, "gpt-4o-mini")
+
+        # New concept created
+        flash_path = concepts_dir / "flash-attention.md"
+        assert flash_path.exists()
+        assert "sources: [new-paper.pdf]" in flash_path.read_text()
+
+        # Existing concept rewritten (not appended)
+        attn_text = (concepts_dir / "attention.md").read_text()
+        assert "new-paper.pdf" in attn_text
+        assert "Updated content with flash attention details" in attn_text
+
+        # Index updated for both
+        index_text = (wiki / "index.md").read_text()
+        assert "[[concepts/flash-attention]]" in index_text
+
+    @pytest.mark.asyncio
+    async def test_related_adds_link_no_llm(self, tmp_path):
+        """Related concepts get cross-ref links without LLM calls."""
+        wiki = tmp_path / "wiki"
+        (wiki / "sources").mkdir(parents=True)
+        (wiki / "summaries").mkdir(parents=True)
+        concepts_dir = wiki / "concepts"
+        concepts_dir.mkdir(parents=True)
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
+            encoding="utf-8",
+        )
+        (concepts_dir / "transformer.md").write_text(
+            "---\nsources: [old.pdf]\n---\n\n# Transformer\n\nArchitecture details.",
+            encoding="utf-8",
+        )
+
+        source_path = wiki / "sources" / "doc.md"
+        source_path.write_text("Content", encoding="utf-8")
+        (tmp_path / ".openkb").mkdir()
+        (tmp_path / "raw").mkdir()
+        (tmp_path / "raw" / "doc.pdf").write_bytes(b"fake")
+
+        summary_resp = "A short summary."
+        plan_resp = json.dumps({
+            "create": [],
+            "update": [],
+            "related": ["transformer"],
+        })
+
+        with patch("openkb.agent.compiler.litellm") as mock_litellm:
+            mock_litellm.completion = MagicMock(
+                side_effect=_mock_completion([summary_resp, plan_resp])
+            )
+            # acompletion should NOT be called (no create/update)
+            mock_litellm.acompletion = AsyncMock(side_effect=AssertionError("should not be called"))
+            await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini")
+
+        # Related concept should have cross-ref link
+        transformer_text = (concepts_dir / "transformer.md").read_text()
+        assert "[[summaries/doc]]" in transformer_text
+
+    @pytest.mark.asyncio
+    async def test_fallback_list_format(self, tmp_path):
+        """If LLM returns old flat array, treat all as create."""
+        wiki = tmp_path / "wiki"
+        (wiki / "sources").mkdir(parents=True)
+        (wiki / "summaries").mkdir(parents=True)
+        (wiki / "concepts").mkdir(parents=True)
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
+            encoding="utf-8",
+        )
+        source_path = wiki / "sources" / "doc.md"
+        source_path.write_text("Content", encoding="utf-8")
+        (tmp_path / ".openkb").mkdir()
+        (tmp_path / "raw").mkdir()
+        (tmp_path / "raw" / "doc.pdf").write_bytes(b"fake")
+
+        summary_resp = "Summary."
+        # Old format: flat array
+        plan_resp = json.dumps([{"name": "foo", "title": "Foo"}])
+        page_resp = "# Foo\n\nContent."
+
+        with patch("openkb.agent.compiler.litellm") as mock_litellm:
+            mock_litellm.completion = MagicMock(
+                side_effect=_mock_completion([summary_resp, plan_resp])
+            )
+            mock_litellm.acompletion = AsyncMock(
+                side_effect=_mock_acompletion([page_resp])
+            )
+            await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini")
+
+        assert (wiki / "concepts" / "foo.md").exists()
+```
+
+- [ ] **Step 2: Run the new tests to verify they fail**
+
+Run: `pytest tests/test_compiler.py::TestCompileConceptsPlan -v`
+Expected: FAIL — the current code uses old prompt format and doesn't handle dict responses
+
+- [ ] **Step 3: Implement `_compile_concepts` and refactor public functions**
+
+Replace the entire Public API section (from `DEFAULT_COMPILE_CONCURRENCY` to end of file) in `openkb/agent/compiler.py` with:
+
+```python
+DEFAULT_COMPILE_CONCURRENCY = 5
+
+
+async def _compile_concepts(
+    wiki_dir: Path,
+    kb_dir: Path,
+    model: str,
+    system_msg: dict,
+    doc_msg: dict,
+    summary: str,
+    doc_name: str,
+    max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
+) -> None:
+    """Shared concept compilation logic: plan → create/update/related → index.
+
+    This is the core of the compilation pipeline, shared by both
+    compile_short_doc and compile_long_doc.
+    """
+    source_file = _find_source_filename(doc_name, kb_dir)
+    concept_briefs = _read_concept_briefs(wiki_dir)
+
+    # --- Concepts plan (A cached) ---
+    plan_raw = _llm_call(model, [
+        system_msg,
+        doc_msg,
+        {"role": "assistant", "content": summary},
+        {"role": "user", "content": _CONCEPTS_PLAN_USER.format(
+            concept_briefs=concept_briefs,
+        )},
+    ], "concepts-plan", max_tokens=1024)
+
+    try:
+        parsed = _parse_json(plan_raw)
+    except (json.JSONDecodeError, ValueError) as exc:
+        logger.warning("Failed to parse concepts plan: %s", exc)
+        logger.debug("Raw: %s", plan_raw)
+        _update_index(wiki_dir, doc_name, [])
+        return
+
+    # Fallback: if LLM returns flat array, treat all as create
+    if isinstance(parsed, list):
+        create_list, update_list, related_list = parsed, [], []
+    else:
+        create_list = parsed.get("create", [])
+        update_list = parsed.get("update", [])
+        related_list = parsed.get("related", [])
+
+    if not create_list and not update_list and not related_list:
+        _update_index(wiki_dir, doc_name, [])
+        return
+
+    # --- Concurrent concept generation (A cached) ---
+    semaphore = asyncio.Semaphore(max_concurrency)
+
+    async def _gen_create(concept: dict) -> tuple[str, str, bool]:
+        name = concept["name"]
+        title = concept.get("title", name)
+        async with semaphore:
+            page_content = await _llm_call_async(model, [
+                system_msg,
+                doc_msg,
+                {"role": "assistant", "content": summary},
+                {"role": "user", "content": _CONCEPT_PAGE_USER.format(
+                    title=title, doc_name=doc_name,
+                    update_instruction="",
+                )},
+            ], f"create:{name}")
+        return name, page_content, False
+
+    async def _gen_update(concept: dict) -> tuple[str, str, bool]:
+        name = concept["name"]
+        title = concept.get("title", name)
+        # Read existing page content for the LLM to integrate
+        concept_path = wiki_dir / "concepts" / f"{name}.md"
+        if concept_path.exists():
+            raw_text = concept_path.read_text(encoding="utf-8")
+            # Strip frontmatter for the LLM
+            if raw_text.startswith("---"):
+                parts = raw_text.split("---", 2)
+                existing_content = parts[2].strip() if len(parts) >= 3 else raw_text
+            else:
+                existing_content = raw_text
+        else:
+            existing_content = "(page not found — create from scratch)"
+        async with semaphore:
+            page_content = await _llm_call_async(model, [
+                system_msg,
+                doc_msg,
+                {"role": "assistant", "content": summary},
+                {"role": "user", "content": _CONCEPT_UPDATE_USER.format(
+                    title=title, doc_name=doc_name,
+                    existing_content=existing_content,
+                )},
+            ], f"update:{name}")
+        return name, page_content, True
+
+    tasks = []
+    tasks.extend(_gen_create(c) for c in create_list)
+    tasks.extend(_gen_update(c) for c in update_list)
+
+    if tasks:
+        total = len(tasks)
+        sys.stdout.write(f"    Generating {total} concept(s) (concurrency={max_concurrency})...\n")
+        sys.stdout.flush()
+
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+    else:
+        results = []
+
+    concept_names = []
+    for r in results:
+        if isinstance(r, Exception):
+            logger.warning("Concept generation failed: %s", r)
+            continue
+        name, page_content, is_update = r
+        _write_concept(wiki_dir, name, page_content, source_file, is_update)
+        concept_names.append(name)
+
+    # --- Related: code-only cross-ref links ---
+    for slug in related_list:
+        _add_related_link(wiki_dir, slug, doc_name, source_file)
+
+    # --- Update index ---
+    _update_index(wiki_dir, doc_name, concept_names)
+
+
+async def compile_short_doc(
+    doc_name: str,
+    source_path: Path,
+    kb_dir: Path,
+    model: str,
+    max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
+) -> None:
+    """Compile a short document into wiki pages.
+
+    Step 1: Generate summary from full document text.
+    Step 2: Plan + generate/update concept pages (via _compile_concepts).
+    """
+    from openkb.config import load_config
+
+    openkb_dir = kb_dir / ".openkb"
+    config = load_config(openkb_dir / "config.yaml")
+    language: str = config.get("language", "en")
+
+    wiki_dir = kb_dir / "wiki"
+    schema_md = get_agents_md(wiki_dir)
+    source_file = _find_source_filename(doc_name, kb_dir)
+    content = source_path.read_text(encoding="utf-8")
+
+    system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format(
+        schema_md=schema_md, language=language,
+    )}
+    doc_msg = {"role": "user", "content": _SUMMARY_USER.format(
+        doc_name=doc_name, content=content,
+    )}
+
+    # Step 1: Generate summary
+    summary = _llm_call(model, [system_msg, doc_msg], "summary")
+    _write_summary(wiki_dir, doc_name, source_file, summary)
+
+    # Step 2: Compile concepts
+    await _compile_concepts(
+        wiki_dir, kb_dir, model, system_msg, doc_msg, summary,
+        doc_name, max_concurrency,
+    )
+
+
+async def compile_long_doc(
+    doc_name: str,
+    summary_path: Path,
+    doc_id: str,
+    kb_dir: Path,
+    model: str,
+    max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
+) -> None:
+    """Compile a long (PageIndex) document into wiki concept pages.
+
+    The summary page is already written by the indexer. This function
+    generates an overview, then plans + generates/updates concept pages.
+    """
+    from openkb.config import load_config
+
+    openkb_dir = kb_dir / ".openkb"
+    config = load_config(openkb_dir / "config.yaml")
+    language: str = config.get("language", "en")
+
+    wiki_dir = kb_dir / "wiki"
+    schema_md = get_agents_md(wiki_dir)
+    summary_text = summary_path.read_text(encoding="utf-8")
+
+    system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format(
+        schema_md=schema_md, language=language,
+    )}
+    doc_msg = {"role": "user", "content": _LONG_DOC_SUMMARY_USER.format(
+        doc_name=doc_name, doc_id=doc_id, content=summary_text,
+    )}
+
+    # Step 1: Generate overview
+    overview = _llm_call(model, [system_msg, doc_msg], "overview")
+
+    # Step 2: Compile concepts
+    await _compile_concepts(
+        wiki_dir, kb_dir, model, system_msg, doc_msg, overview,
+        doc_name, max_concurrency,
+    )
+```
+
+- [ ] **Step 4: Update existing integration tests**
+
+Update `TestCompileShortDoc.test_full_pipeline` — the concepts-list response now needs to be the new dict format:
+
+```python
+class TestCompileShortDoc:
+    @pytest.mark.asyncio
+    async def test_full_pipeline(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        (wiki / "sources").mkdir(parents=True)
+        (wiki / "summaries").mkdir(parents=True)
+        (wiki / "concepts").mkdir(parents=True)
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
+            encoding="utf-8",
+        )
+        source_path = wiki / "sources" / "test-doc.md"
+        source_path.write_text("# Test Doc\n\nSome content about transformers.", encoding="utf-8")
+        (tmp_path / ".openkb").mkdir()
+        (tmp_path / "raw").mkdir()
+        (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake")
+
+        summary_response = "# Summary\n\nThis document discusses transformers."
+        plan_response = json.dumps({
+            "create": [{"name": "transformer", "title": "Transformer"}],
+            "update": [],
+            "related": [],
+        })
+        concept_page_response = "# Transformer\n\nA neural network architecture."
+
+        with patch("openkb.agent.compiler.litellm") as mock_litellm:
+            mock_litellm.completion = MagicMock(
+                side_effect=_mock_completion([summary_response, plan_response])
+            )
+            mock_litellm.acompletion = AsyncMock(
+                side_effect=_mock_acompletion([concept_page_response])
+            )
+            await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini")
+
+        summary_path = wiki / "summaries" / "test-doc.md"
+        assert summary_path.exists()
+        assert "sources: [test-doc.pdf]" in summary_path.read_text()
+
+        concept_path = wiki / "concepts" / "transformer.md"
+        assert concept_path.exists()
+        assert "sources: [test-doc.pdf]" in concept_path.read_text()
+
+        index_text = (wiki / "index.md").read_text()
+        assert "[[summaries/test-doc]]" in index_text
+        assert "[[concepts/transformer]]" in index_text
+```
+
+`TestCompileShortDoc.test_handles_bad_json` needs no changes — bad JSON still triggers the fallback.
+
+Update `TestCompileLongDoc.test_full_pipeline`:
+
+```python
+class TestCompileLongDoc:
+    @pytest.mark.asyncio
+    async def test_full_pipeline(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        (wiki / "summaries").mkdir(parents=True)
+        (wiki / "concepts").mkdir(parents=True)
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Concepts\n",
+            encoding="utf-8",
+        )
+        summary_path = wiki / "summaries" / "big-doc.md"
+        summary_path.write_text("# Big Doc\n\nPageIndex summary tree.", encoding="utf-8")
+        openkb_dir = tmp_path / ".openkb"
+        openkb_dir.mkdir()
+        (openkb_dir / "config.yaml").write_text("model: gpt-4o-mini\n")
+        (tmp_path / "raw").mkdir()
+        (tmp_path / "raw" / "big-doc.pdf").write_bytes(b"fake")
+
+        overview_response = "Overview of the big document."
+        plan_response = json.dumps({
+            "create": [{"name": "deep-learning", "title": "Deep Learning"}],
+            "update": [],
+            "related": [],
+        })
+        concept_page_response = "# Deep Learning\n\nA subfield of ML."
+
+        with patch("openkb.agent.compiler.litellm") as mock_litellm:
+            mock_litellm.completion = MagicMock(
+                side_effect=_mock_completion([overview_response, plan_response])
+            )
+            mock_litellm.acompletion = AsyncMock(
+                side_effect=_mock_acompletion([concept_page_response])
+            )
+            await compile_long_doc(
+                "big-doc", summary_path, "doc-123", tmp_path, "gpt-4o-mini"
+            )
+
+        concept_path = wiki / "concepts" / "deep-learning.md"
+        assert concept_path.exists()
+        assert "Deep Learning" in concept_path.read_text()
+
+        index_text = (wiki / "index.md").read_text()
+        assert "[[summaries/big-doc]]" in index_text
+        assert "[[concepts/deep-learning]]" in index_text
+```
+
+- [ ] **Step 5: Run all tests**
+
+Run: `pytest tests/test_compiler.py -v`
+Expected: All PASS
+
+- [ ] **Step 6: Run the full test suite**
+
+Run: `pytest tests/ -v`
+Expected: All 149+ tests PASS
+
+- [ ] **Step 7: Commit**
+
+```bash
+git add openkb/agent/compiler.py tests/test_compiler.py
+git commit -m "feat: concept dedup with briefs, update/related paths, extract _compile_concepts"
+```
+
+---
+
+### Task 5: Clean up old references and update module docstring
+
+**Files:**
+- Modify: `openkb/agent/compiler.py:1-9` (module docstring)
+
+- [ ] **Step 1: Update module docstring**
+
+Replace the docstring at the top of `openkb/agent/compiler.py`:
+
+```python
+"""Wiki compilation pipeline for OpenKB.
+
+Pipeline leveraging LLM prompt caching:
+  Step 1: Build base context A (schema + document content).
+  Step 2: A → generate summary.
+  Step 3: A + summary → concepts plan (create/update/related).
+  Step 4: Concurrent LLM calls (A cached) → generate new + rewrite updated concepts.
+  Step 5: Code adds cross-ref links to related concepts, updates index.
+"""
+```
+
+- [ ] **Step 2: Verify `_CONCEPTS_LIST_USER` is fully removed**
+
+Search for any remaining references to `_CONCEPTS_LIST_USER` in the codebase:
+
+Run: `grep -r "_CONCEPTS_LIST_USER" openkb/ tests/`
+Expected: No matches
+
+- [ ] **Step 3: Run full test suite one final time**
+
+Run: `pytest tests/ -q`
+Expected: All tests pass
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add openkb/agent/compiler.py
+git commit -m "chore: update compiler docstring for new pipeline"
+```
diff --git a/docs/superpowers/plans/2026-04-09-retrieve-redesign.md b/docs/superpowers/plans/2026-04-09-retrieve-redesign.md
new file mode 100644
index 0000000..3c659bc
--- /dev/null
+++ b/docs/superpowers/plans/2026-04-09-retrieve-redesign.md
@@ -0,0 +1,1104 @@
+# Retrieve Redesign Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Unify query across long/short docs, add brief summaries to index.md and frontmatter, store long doc sources as JSON with per-page access.
+
+**Architecture:** (1) LLM prompts return `{"brief", "content"}` JSON — briefs flow into frontmatter and index.md. (2) Indexer stores long doc pages as JSON array. (3) New `get_page_content` tool replaces `pageindex_retrieve`. (4) Query agent uses same tools for all docs.
+
+**Tech Stack:** Python, litellm, asyncio, pytest
+
+---
+
+### Task 1: Add `get_page_content` tool and `parse_pages` helper
+
+**Files:**
+- Modify: `openkb/agent/tools.py`
+- Modify: `tests/test_agent_tools.py`
+
+- [ ] **Step 1: Write failing tests**
+
+Add to `tests/test_agent_tools.py`:
+
+```python
+from openkb.agent.tools import get_page_content, parse_pages
+
+class TestParsePages:
+    def test_single_page(self):
+        assert parse_pages("3") == [3]
+
+    def test_range(self):
+        assert parse_pages("3-5") == [3, 4, 5]
+
+    def test_comma_separated(self):
+        assert parse_pages("1,3,5") == [1, 3, 5]
+
+    def test_mixed(self):
+        assert parse_pages("1-3,7,10-12") == [1, 2, 3, 7, 10, 11, 12]
+
+    def test_deduplication(self):
+        assert parse_pages("3,3,3") == [3]
+
+    def test_sorted(self):
+        assert parse_pages("5,1,3") == [1, 3, 5]
+
+    def test_ignores_zero_and_negative(self):
+        assert parse_pages("0,-1,3") == [3]
+
+
+class TestGetPageContent:
+    def test_reads_pages_from_json(self, tmp_path):
+        import json
+        wiki_root = str(tmp_path)
+        sources = tmp_path / "sources"
+        sources.mkdir()
+        pages = [
+            {"page": 1, "content": "Page one text."},
+            {"page": 2, "content": "Page two text."},
+            {"page": 3, "content": "Page three text."},
+        ]
+        (sources / "paper.json").write_text(json.dumps(pages), encoding="utf-8")
+
+        result = get_page_content("paper", "1,3", wiki_root)
+        assert "[Page 1]" in result
+        assert "Page one text." in result
+        assert "[Page 3]" in result
+        assert "Page three text." in result
+        assert "Page two" not in result
+
+    def test_returns_error_for_missing_file(self, tmp_path):
+        wiki_root = str(tmp_path)
+        (tmp_path / "sources").mkdir()
+        result = get_page_content("nonexistent", "1", wiki_root)
+        assert "not found" in result.lower()
+
+    def test_returns_error_for_no_matching_pages(self, tmp_path):
+        import json
+        wiki_root = str(tmp_path)
+        sources = tmp_path / "sources"
+        sources.mkdir()
+        pages = [{"page": 1, "content": "Only page."}]
+        (sources / "paper.json").write_text(json.dumps(pages), encoding="utf-8")
+
+        result = get_page_content("paper", "99", wiki_root)
+        assert "no content" in result.lower() or result.strip() == ""
+
+    def test_includes_images_info(self, tmp_path):
+        import json
+        wiki_root = str(tmp_path)
+        sources = tmp_path / "sources"
+        sources.mkdir()
+        pages = [
+            {"page": 1, "content": "Text.", "images": [{"path": "images/p/img.png", "width": 100, "height": 80}]},
+        ]
+        (sources / "doc.json").write_text(json.dumps(pages), encoding="utf-8")
+
+        result = get_page_content("doc", "1", wiki_root)
+        assert "img.png" in result
+
+    def test_path_escape_denied(self, tmp_path):
+        wiki_root = str(tmp_path)
+        (tmp_path / "sources").mkdir()
+        result = get_page_content("../../etc/passwd", "1", wiki_root)
+        assert "denied" in result.lower() or "not found" in result.lower()
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run: `pytest tests/test_agent_tools.py::TestParsePages tests/test_agent_tools.py::TestGetPageContent -v`
+Expected: FAIL with `ImportError`
+
+- [ ] **Step 3: Implement `parse_pages` and `get_page_content`**
+
+Add to `openkb/agent/tools.py`:
+
+```python
+import json as _json
+
+
+def parse_pages(pages: str) -> list[int]:
+    """Parse a page specification like '3-5,7,10-12' into a sorted list of ints."""
+    result: set[int] = set()
+    for part in pages.split(","):
+        part = part.strip()
+        if "-" in part:
+            start_str, end_str = part.split("-", 1)
+            try:
+                start, end = int(start_str), int(end_str)
+                result.update(range(start, end + 1))
+            except ValueError:
+                continue
+        else:
+            try:
+                result.add(int(part))
+            except ValueError:
+                continue
+    return sorted(n for n in result if n >= 1)
+
+
+def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str:
+    """Get text content of specific pages from a long document.
+
+    Reads from ``wiki/sources/{doc_name}.json`` which contains a JSON array
+    of ``{"page": int, "content": str, "images": [...]}`` objects.
+
+    Args:
+        doc_name: Document name (stem, e.g. ``'attention-is-all-you-need'``).
+        pages: Page specification (e.g. ``'3-5,7,10-12'``).
+        wiki_root: Absolute path to the wiki root directory.
+
+    Returns:
+        Formatted text of requested pages, or error message if not found.
+    """
+    root = Path(wiki_root).resolve()
+    json_path = (root / "sources" / f"{doc_name}.json").resolve()
+    if not json_path.is_relative_to(root):
+        return "Access denied: path escapes wiki root."
+    if not json_path.exists():
+        return f"Document not found: {doc_name}. No sources/{doc_name}.json file."
+
+    data = _json.loads(json_path.read_text(encoding="utf-8"))
+    page_nums = set(parse_pages(pages))
+    matched = [p for p in data if p["page"] in page_nums]
+
+    if not matched:
+        return f"No content found for pages: {pages}"
+
+    parts: list[str] = []
+    for p in matched:
+        header = f"[Page {p['page']}]"
+        text = p.get("content", "")
+        if "images" in p:
+            img_refs = ", ".join(img["path"] for img in p["images"])
+            text += f"\n[Images: {img_refs}]"
+        parts.append(f"{header}\n{text}")
+
+    return "\n\n".join(parts)
+```
+
+- [ ] **Step 4: Run tests to verify they pass**
+
+Run: `pytest tests/test_agent_tools.py -v`
+Expected: All PASS
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add openkb/agent/tools.py tests/test_agent_tools.py
+git commit -m "feat: add get_page_content tool and parse_pages helper"
+```
+
+---
+
+### Task 2: Change LLM prompts to return `{"brief", "content"}` JSON
+
+**Files:**
+- Modify: `openkb/agent/compiler.py` (prompt templates, lines 40-105)
+- Modify: `tests/test_compiler.py` (add `TestParseBriefContent`)
+
+- [ ] **Step 1: Write test for brief+content JSON parsing**
+
+Add to `tests/test_compiler.py`:
+
+```python
+class TestParseBriefContent:
+    def test_dict_with_brief_and_content(self):
+        text = json.dumps({"brief": "A short desc", "content": "# Full page\n\nDetails."})
+        parsed = _parse_json(text)
+        assert parsed["brief"] == "A short desc"
+        assert "# Full page" in parsed["content"]
+
+    def test_plain_text_fallback(self):
+        """If LLM returns plain text, _parse_json raises — caller handles fallback."""
+        with pytest.raises((json.JSONDecodeError, ValueError)):
+            _parse_json("Just plain markdown text without JSON")
+```
+
+- [ ] **Step 2: Run test to verify it passes (existing _parse_json handles dicts)**
+
+Run: `pytest tests/test_compiler.py::TestParseBriefContent -v`
+Expected: PASS — `_parse_json` already handles dicts
+
+- [ ] **Step 3: Update `_SUMMARY_USER` prompt**
+
+Replace in `openkb/agent/compiler.py`:
+
+```python
+# User prompt asking for a document summary as a JSON object with "brief" and
+# "content" keys. Filled with str.format(doc_name=..., content=...), so any
+# literal brace added to this text must be doubled.
+_SUMMARY_USER = """\
+New document: {doc_name}
+
+Full text:
+{content}
+
+Write a summary page for this document in Markdown.
+
+Return a JSON object with two keys:
+- "brief": A single sentence (under 100 chars) describing the document's main contribution
+- "content": The full summary in Markdown. Include key concepts, findings, ideas, \
+and [[wikilinks]] to concepts that could become cross-document concept pages
+
+Return ONLY valid JSON, no fences.
+"""
+```
+
+- [ ] **Step 4: Update `_CONCEPT_PAGE_USER` prompt**
+
+Replace in `openkb/agent/compiler.py`:
+
+```python
+# User prompt asking for a new concept page as a JSON object with "brief" and
+# "content" keys. Filled with str.format(title=..., doc_name=...,
+# update_instruction=...), so any literal brace added here must be doubled.
+_CONCEPT_PAGE_USER = """\
+Write the concept page for: {title}
+
+This concept relates to the document "{doc_name}" summarized above.
+{update_instruction}
+
+Return a JSON object with two keys:
+- "brief": A single sentence (under 100 chars) defining this concept
+- "content": The full concept page in Markdown. Include clear explanation, \
+key details from the source document, and [[wikilinks]] to related concepts \
+and [[summaries/{doc_name}]]
+
+Return ONLY valid JSON, no fences.
+"""
+```
+
+- [ ] **Step 5: Update `_CONCEPT_UPDATE_USER` prompt**
+
+Replace in `openkb/agent/compiler.py`:
+
+```python
+# User prompt asking for a rewrite of an existing concept page as a JSON
+# object with "brief" and "content" keys. Filled with str.format(title=...,
+# doc_name=..., existing_content=...); literal braces in this template must be
+# doubled.
+_CONCEPT_UPDATE_USER = """\
+Update the concept page for: {title}
+
+Current content of this page:
+{existing_content}
+
+New information from document "{doc_name}" (summarized above) should be \
+integrated into this page. Rewrite the full page incorporating the new \
+information naturally — do not just append. Maintain existing \
+[[wikilinks]] and add new ones where appropriate.
+
+Return a JSON object with two keys:
+- "brief": A single sentence (under 100 chars) defining this concept (may differ from before)
+- "content": The rewritten full concept page in Markdown
+
+Return ONLY valid JSON, no fences.
+"""
+```
+
+- [ ] **Step 6: Run all tests (prompts aren't tested directly)**
+
+Run: `pytest tests/test_compiler.py -v`
+Expected: All PASS
+
+- [ ] **Step 7: Commit**
+
+```bash
+git add openkb/agent/compiler.py tests/test_compiler.py
+git commit -m "feat: update LLM prompts to return brief+content JSON"
+```
+
+---
+
+### Task 3: Update `_write_summary` and `_write_concept` to store `brief` in frontmatter
+
+**Files:**
+- Modify: `openkb/agent/compiler.py` (lines 274-320, `_write_summary` and `_write_concept`)
+- Modify: `tests/test_compiler.py`
+
+- [ ] **Step 1: Write failing tests**
+
+Update existing and add new tests in `tests/test_compiler.py`:
+
+```python
+class TestWriteSummary:
+    def test_writes_with_frontmatter(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.", brief="Introduces transformers")
+        path = wiki / "summaries" / "my-doc.md"
+        assert path.exists()
+        text = path.read_text()
+        assert "sources: [my-doc.pdf]" in text
+        assert "brief: Introduces transformers" in text
+        assert "# Summary" in text
+
+    def test_writes_without_brief(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.")
+        path = wiki / "summaries" / "my-doc.md"
+        text = path.read_text()
+        assert "sources: [my-doc.pdf]" in text
+        assert "brief:" not in text
+```
+
+Update `TestWriteConcept`:
+
+```python
+class TestWriteConcept:
+    def test_new_concept_with_brief(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False, brief="Mechanism for selective focus")
+        path = wiki / "concepts" / "attention.md"
+        assert path.exists()
+        text = path.read_text()
+        assert "sources: [paper.pdf]" in text
+        assert "brief: Mechanism for selective focus" in text
+        assert "# Attention" in text
+
+    def test_new_concept(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False)
+        path = wiki / "concepts" / "attention.md"
+        assert path.exists()
+        text = path.read_text()
+        assert "sources: [paper.pdf]" in text
+        assert "# Attention" in text
+
+    def test_update_concept_appends_source(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nsources: [paper1.pdf]\nbrief: Old brief\n---\n\n# Attention\n\nOld content.",
+            encoding="utf-8",
+        )
+        _write_concept(wiki, "attention", "New info from paper2.", "paper2.pdf", True, brief="Updated brief")
+        text = (concepts / "attention.md").read_text()
+        assert "paper2.pdf" in text
+        assert "paper1.pdf" in text
+        assert "brief: Updated brief" in text
+        assert "New info from paper2." in text
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run: `pytest tests/test_compiler.py::TestWriteSummary tests/test_compiler.py::TestWriteConcept -v`
+Expected: FAIL — `_write_summary` and `_write_concept` don't accept `brief` parameter
+
+- [ ] **Step 3: Update `_write_summary` to accept `brief`**
+
+```python
+def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str, brief: str = "") -> None:
+    """Write summary page with frontmatter."""
+    summaries_dir = wiki_dir / "summaries"
+    summaries_dir.mkdir(parents=True, exist_ok=True)
+    fm_lines = [f"sources: [{source_file}]"]
+    if brief:
+        fm_lines.append(f"brief: {brief}")
+    frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n"
+    (summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8")
+```
+
+- [ ] **Step 4: Update `_write_concept` to accept `brief`**
+
+Add `brief: str = ""` parameter to `_write_concept`. In the new-concept branch:
+
+```python
+    else:
+        fm_lines = [f"sources: [{source_file}]"]
+        if brief:
+            fm_lines.append(f"brief: {brief}")
+        frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n"
+        path.write_text(frontmatter + content, encoding="utf-8")
+```
+
+In the update branch, after updating sources in frontmatter, also update brief:
+
+```python
+    if is_update and path.exists():
+        existing = path.read_text(encoding="utf-8")
+        if source_file not in existing:
+            # ... existing frontmatter update logic ...
+        # Update brief in frontmatter if provided
+        if brief and existing.startswith("---"):
+            end = existing.find("---", 3)
+            if end != -1:
+                fm = existing[:end + 3]
+                body = existing[end + 3:]
+                if "brief:" in fm:
+                    import re
+                    fm = re.sub(r"brief:.*", f"brief: {brief}", fm)
+                else:
+                    fm = fm.replace("---\n", f"---\nbrief: {brief}\n", 1)
+                existing = fm + body
+        path.write_text(existing, encoding="utf-8")
+```
+
+- [ ] **Step 5: Run tests to verify they pass**
+
+Run: `pytest tests/test_compiler.py::TestWriteSummary tests/test_compiler.py::TestWriteConcept -v`
+Expected: All PASS
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add openkb/agent/compiler.py tests/test_compiler.py
+git commit -m "feat: store brief in frontmatter of summary and concept pages"
+```
+
+---
+
+### Task 4: Update `_update_index` to include briefs, and update `_read_concept_briefs` to read from frontmatter
+
+**Files:**
+- Modify: `openkb/agent/compiler.py` (lines 233-261 and 408-430)
+- Modify: `tests/test_compiler.py`
+
+- [ ] **Step 1: Write failing tests for `_update_index` with briefs**
+
+```python
+class TestUpdateIndex:
+    def test_appends_entries_with_briefs(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
+            encoding="utf-8",
+        )
+        _update_index(wiki, "my-doc", ["attention", "transformer"],
+                       doc_brief="Introduces transformers",
+                       concept_briefs={"attention": "Focus mechanism", "transformer": "NN architecture"})
+        text = (wiki / "index.md").read_text()
+        assert "[[summaries/my-doc]] — Introduces transformers" in text
+        assert "[[concepts/attention]] — Focus mechanism" in text
+        assert "[[concepts/transformer]] — NN architecture" in text
+
+    def test_no_duplicates(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n- [[summaries/my-doc]] — Old brief\n\n## Concepts\n",
+            encoding="utf-8",
+        )
+        _update_index(wiki, "my-doc", [], doc_brief="New brief")
+        text = (wiki / "index.md").read_text()
+        assert text.count("[[summaries/my-doc]]") == 1
+
+    def test_backwards_compat_no_briefs(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
+            encoding="utf-8",
+        )
+        _update_index(wiki, "my-doc", ["attention"])
+        text = (wiki / "index.md").read_text()
+        assert "[[summaries/my-doc]]" in text
+        assert "[[concepts/attention]]" in text
+```
+
+Write tests for the updated `_read_concept_briefs`:
+
+```python
+class TestReadConceptBriefs:
+    # ... keep existing tests ...
+
+    def test_reads_brief_from_frontmatter(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nsources: [paper.pdf]\nbrief: Selective focus mechanism\n---\n\n# Attention\n\nLong content...",
+            encoding="utf-8",
+        )
+        result = _read_concept_briefs(wiki)
+        assert "- attention: Selective focus mechanism" in result
+
+    def test_falls_back_to_body_truncation(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "old.md").write_text(
+            "---\nsources: [paper.pdf]\n---\n\nOld concept without brief field.",
+            encoding="utf-8",
+        )
+        result = _read_concept_briefs(wiki)
+        assert "- old: Old concept without brief field." in result
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run: `pytest tests/test_compiler.py::TestUpdateIndex tests/test_compiler.py::TestReadConceptBriefs -v`
+Expected: FAIL — `_update_index` doesn't accept `doc_brief`/`concept_briefs` parameters
+
+- [ ] **Step 3: Update `_update_index`**
+
+```python
+def _update_index(
+    wiki_dir: Path, doc_name: str, concept_names: list[str],
+    doc_brief: str = "", concept_briefs: dict[str, str] | None = None,
+) -> None:
+    """Append document and concept entries to index.md with optional briefs."""
+    index_path = wiki_dir / "index.md"
+    if not index_path.exists():
+        index_path.write_text(
+            "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
+            encoding="utf-8",
+        )
+
+    text = index_path.read_text(encoding="utf-8")
+
+    doc_link = f"[[summaries/{doc_name}]]"
+    if doc_link not in text:
+        doc_entry = f"- {doc_link}"
+        if doc_brief:
+            doc_entry += f" — {doc_brief}"
+        if "## Documents" in text:
+            text = text.replace("## Documents\n", f"## Documents\n{doc_entry}\n", 1)
+
+    if concept_briefs is None:
+        concept_briefs = {}
+    for name in concept_names:
+        concept_link = f"[[concepts/{name}]]"
+        if concept_link not in text:
+            concept_entry = f"- {concept_link}"
+            if name in concept_briefs:
+                concept_entry += f" — {concept_briefs[name]}"
+            if "## Concepts" in text:
+                text = text.replace("## Concepts\n", f"## Concepts\n{concept_entry}\n", 1)
+
+    index_path.write_text(text, encoding="utf-8")
+```
+
+- [ ] **Step 4: Update `_read_concept_briefs` to read from frontmatter `brief:` field**
+
+```python
+def _read_concept_briefs(wiki_dir: Path) -> str:
+    """Read existing concept pages and return compact one-line summaries.
+
+    Reads ``brief:`` from YAML frontmatter if available, otherwise falls back
+    to the first 150 characters of the body text.
+    """
+    concepts_dir = wiki_dir / "concepts"
+    if not concepts_dir.exists():
+        return "(none yet)"
+
+    md_files = sorted(concepts_dir.glob("*.md"))
+    if not md_files:
+        return "(none yet)"
+
+    lines: list[str] = []
+    for path in md_files:
+        text = path.read_text(encoding="utf-8")
+        brief = ""
+        body = text
+        if text.startswith("---"):
+            end = text.find("---", 3)
+            if end != -1:
+                fm = text[:end + 3]
+                body = text[end + 3:]
+                # Try to extract brief from frontmatter
+                for line in fm.split("\n"):
+                    if line.startswith("brief:"):
+                        brief = line[len("brief:"):].strip()
+                        break
+        if not brief:
+            brief = body.strip().replace("\n", " ")[:150]
+        if brief:
+            lines.append(f"- {path.stem}: {brief}")
+
+    return "\n".join(lines) or "(none yet)"
+```
+
+- [ ] **Step 5: Run tests**
+
+Run: `pytest tests/test_compiler.py -v`
+Expected: All PASS
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add openkb/agent/compiler.py tests/test_compiler.py
+git commit -m "feat: add briefs to index.md entries and read from frontmatter"
+```
+
+---
+
+### Task 5: Wire briefs through `_compile_concepts` and public functions
+
+**Files:**
+- Modify: `openkb/agent/compiler.py` (lines 438-611, `_compile_concepts`, `compile_short_doc`, `compile_long_doc`)
+- Modify: `tests/test_compiler.py`
+
+This task connects the brief+content JSON parsing to the write functions and index update.
+
+- [ ] **Step 1: Write integration test**
+
+```python
+class TestBriefIntegration:
+    @pytest.mark.asyncio
+    async def test_short_doc_briefs_in_index_and_frontmatter(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        (wiki / "sources").mkdir(parents=True)
+        (wiki / "summaries").mkdir(parents=True)
+        (wiki / "concepts").mkdir(parents=True)
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
+            encoding="utf-8",
+        )
+        source_path = wiki / "sources" / "test-doc.md"
+        source_path.write_text("# Test Doc\n\nContent.", encoding="utf-8")
+        (tmp_path / ".openkb").mkdir()
+        (tmp_path / "raw").mkdir()
+        (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake")
+
+        summary_resp = json.dumps({
+            "brief": "A paper about transformers",
+            "content": "# Summary\n\nThis paper discusses transformers.",
+        })
+        plan_resp = json.dumps({
+            "create": [{"name": "transformer", "title": "Transformer"}],
+            "update": [],
+            "related": [],
+        })
+        concept_resp = json.dumps({
+            "brief": "NN architecture using self-attention",
+            "content": "# Transformer\n\nA neural network architecture.",
+        })
+
+        with patch("openkb.agent.compiler.litellm") as mock_litellm:
+            mock_litellm.completion = MagicMock(
+                side_effect=_mock_completion([summary_resp, plan_resp])
+            )
+            mock_litellm.acompletion = AsyncMock(
+                side_effect=_mock_acompletion([concept_resp])
+            )
+            await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini")
+
+        # Check summary frontmatter has brief
+        summary_text = (wiki / "summaries" / "test-doc.md").read_text()
+        assert "brief: A paper about transformers" in summary_text
+
+        # Check concept frontmatter has brief
+        concept_text = (wiki / "concepts" / "transformer.md").read_text()
+        assert "brief: NN architecture using self-attention" in concept_text
+
+        # Check index has briefs
+        index_text = (wiki / "index.md").read_text()
+        assert "[[summaries/test-doc]] — A paper about transformers" in index_text
+        assert "[[concepts/transformer]] — NN architecture using self-attention" in index_text
+```
+
+- [ ] **Step 2: Run test to verify it fails**
+
+Run: `pytest tests/test_compiler.py::TestBriefIntegration -v`
+Expected: FAIL
+
+- [ ] **Step 3: Update `compile_short_doc` to parse brief+content from summary response**
+
+In `compile_short_doc`, replace:
+
+```python
+    # --- Step 1: Generate summary ---
+    summary = _llm_call(model, [system_msg, doc_msg], "summary")
+    _write_summary(wiki_dir, doc_name, source_file, summary)
+```
+
+With:
+
+```python
+    # --- Step 1: Generate summary ---
+    summary_raw = _llm_call(model, [system_msg, doc_msg], "summary")
+    try:
+        summary_parsed = _parse_json(summary_raw)
+        doc_brief = summary_parsed.get("brief", "")
+        summary = summary_parsed.get("content", summary_raw)
+    except (json.JSONDecodeError, ValueError):
+        doc_brief = ""
+        summary = summary_raw
+    _write_summary(wiki_dir, doc_name, source_file, summary, brief=doc_brief)
+```
+
+- [ ] **Step 4: Update `_compile_concepts` signature and wiring**
+
+Add `doc_brief: str = ""` parameter to `_compile_concepts`.
+
+In `_gen_create`, parse the response:
+
+```python
+    async def _gen_create(concept: dict) -> tuple[str, str, bool, str]:
+        name = concept["name"]
+        title = concept.get("title", name)
+        async with semaphore:
+            raw = await _llm_call_async(model, [
+                system_msg, doc_msg,
+                {"role": "assistant", "content": summary},
+                {"role": "user", "content": _CONCEPT_PAGE_USER.format(
+                    title=title, doc_name=doc_name, update_instruction="",
+                )},
+            ], f"create:{name}")
+        try:
+            parsed = _parse_json(raw)
+            brief = parsed.get("brief", "")
+            content = parsed.get("content", raw)
+        except (json.JSONDecodeError, ValueError):
+            brief, content = "", raw
+        return name, content, False, brief
+```
+
+Apply the same parsing in `_gen_update`, which now also returns `tuple[str, str, bool, str]` (name, content, is_update, brief).
+
+In the results processing loop:
+
+```python
+    concept_briefs_map: dict[str, str] = {}
+    for r in results:
+        if isinstance(r, Exception):
+            logger.warning("Concept generation failed: %s", r)
+            continue
+        name, page_content, is_update, brief = r
+        _write_concept(wiki_dir, name, page_content, source_file, is_update, brief=brief)
+        concept_names.append(name)
+        if brief:
+            concept_briefs_map[name] = brief
+```
+
+Pass briefs to `_update_index`:
+
+```python
+    _update_index(wiki_dir, doc_name, concept_names,
+                  doc_brief=doc_brief, concept_briefs=concept_briefs_map)
+```
+
+- [ ] **Step 5: Update `compile_short_doc` to pass `doc_brief` to `_compile_concepts`**
+
+```python
+    await _compile_concepts(
+        wiki_dir, kb_dir, model, system_msg, doc_msg,
+        summary, doc_name, max_concurrency, doc_brief=doc_brief,
+    )
+```
+
+- [ ] **Step 6: Update `compile_long_doc` to pass `doc_brief` from `IndexResult.description`**
+
+`compile_long_doc` currently takes `doc_id` but not the document description. Add a `doc_description: str = ""` parameter:
+
+```python
+async def compile_long_doc(
+    doc_name: str,
+    summary_path: Path,
+    doc_id: str,
+    kb_dir: Path,
+    model: str,
+    doc_description: str = "",
+    max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
+) -> None:
+```
+
+The `_LONG_DOC_SUMMARY_USER` prompt stays unchanged (it still returns plain text, not JSON), so no brief is parsed from its response. Pass `doc_description` as `doc_brief` instead:
+
+```python
+    await _compile_concepts(
+        wiki_dir, kb_dir, model, system_msg, doc_msg,
+        overview, doc_name, max_concurrency, doc_brief=doc_description,
+    )
+```
+
+Also update the CLI call in `cli.py` line 135:
+
+```python
+asyncio.run(
+    compile_long_doc(doc_name, summary_path, index_result.doc_id, kb_dir, model,
+                     doc_description=index_result.description)
+)
+```
+
+- [ ] **Step 7: Update existing integration tests for new JSON response format**
+
+Update all mock LLM responses in `TestCompileShortDoc`, `TestCompileLongDoc`, and `TestCompileConceptsPlan` to return `{"brief": "...", "content": "..."}` JSON instead of plain text for summary and concept responses.
+
+- [ ] **Step 8: Run all tests**
+
+Run: `pytest tests/ -q`
+Expected: All PASS
+
+- [ ] **Step 9: Commit**
+
+```bash
+git add openkb/agent/compiler.py openkb/cli.py tests/test_compiler.py
+git commit -m "feat: wire brief+content JSON through compile pipeline to index and frontmatter"
+```
+
+---
+
+### Task 6: Indexer — long doc sources from markdown to JSON
+
+**Files:**
+- Modify: `openkb/indexer.py`
+- Modify: `openkb/tree_renderer.py` (remove `render_source_md`)
+- Modify: `tests/test_indexer.py`
+
+- [ ] **Step 1: Write failing test**
+
+Update `tests/test_indexer.py`:
+
+```python
+    def test_source_page_written_as_json(self, kb_dir, sample_tree, tmp_path):
+        """Long doc source should be written as JSON, not markdown."""
+        import json as json_mod
+        doc_id = "abc-123"
+        fake_col = self._make_fake_collection(doc_id, sample_tree)
+
+        fake_client = MagicMock()
+        fake_client.collection.return_value = fake_col
+        # Mock get_page_content to return page data
+        fake_col.get_page_content.return_value = [
+            {"page": 1, "content": "Page one text."},
+            {"page": 2, "content": "Page two text."},
+        ]
+
+        pdf_path = tmp_path / "sample.pdf"
+        pdf_path.write_bytes(b"%PDF-1.4 fake")
+
+        with patch("openkb.indexer.PageIndexClient", return_value=fake_client):
+            index_long_document(pdf_path, kb_dir)
+
+        # Should be JSON, not MD
+        json_file = kb_dir / "wiki" / "sources" / "sample.json"
+        assert json_file.exists()
+        assert not (kb_dir / "wiki" / "sources" / "sample.md").exists()
+        data = json_mod.loads(json_file.read_text())
+        assert len(data) == 2
+        assert data[0]["page"] == 1
+```
+
+- [ ] **Step 2: Run test to verify it fails**
+
+Run: `pytest tests/test_indexer.py::TestIndexLongDocument::test_source_page_written_as_json -v`
+Expected: FAIL
+
+- [ ] **Step 3: Update `indexer.py` to write JSON sources**
+
+Replace the source writing block (lines 103-110) with:
+
+```python
+    # Write wiki/sources/ as JSON (per-page content from PageIndex)
+    sources_dir = kb_dir / "wiki" / "sources"
+    sources_dir.mkdir(parents=True, exist_ok=True)
+    dest_images_dir = sources_dir / "images" / pdf_path.stem
+
+    # Get per-page content from PageIndex
+    all_pages = col.get_page_content(doc_id, f"1-{doc.get('page_count', 9999)}")
+
+    # Relocate image paths
+    dest_images_dir.mkdir(parents=True, exist_ok=True)
+    for page in all_pages:
+        if "images" in page:
+            for img in page["images"]:
+                src_path = Path(img["path"])
+                if src_path.exists():
+                    filename = src_path.name
+                    dest = dest_images_dir / filename
+                    if not dest.exists():
+                        shutil.copy2(src_path, dest)
+                    img["path"] = f"images/{pdf_path.stem}/{filename}"
+
+    import json as json_mod
+    (sources_dir / f"{pdf_path.stem}.json").write_text(
+        json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",
+    )
+```
+
+Remove the `render_source_md` import and `_relocate_images` call.
+
+- [ ] **Step 4: Remove `render_source_md` from tree_renderer.py**
+
+Remove the `render_source_md` function and `_render_nodes_source` helper from `openkb/tree_renderer.py`. Keep `render_summary_md` and `_render_nodes_summary`.
+
+- [ ] **Step 5: Update existing test `test_source_page_written`**
+
+The old test checks for `.md` — update it to check for `.json` or remove it (replaced by the new test).
+
+- [ ] **Step 6: Run all tests**
+
+Run: `pytest tests/ -q`
+Expected: All PASS
+
+- [ ] **Step 7: Commit**
+
+```bash
+git add openkb/indexer.py openkb/tree_renderer.py tests/test_indexer.py
+git commit -m "feat: store long doc sources as per-page JSON, remove render_source_md"
+```
+
+---
+
+### Task 7: Query agent — remove `pageindex_retrieve`, add `get_page_content`, update instructions
+
+**Files:**
+- Modify: `openkb/agent/query.py`
+- Modify: `openkb/schema.py`
+- Modify: `tests/test_query.py`
+
+- [ ] **Step 1: Write failing tests**
+
+Update `tests/test_query.py`:
+
+```python
+class TestBuildQueryAgent:
+    def test_agent_name(self, tmp_path):
+        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
+        assert agent.name == "wiki-query"
+
+    def test_agent_has_three_tools(self, tmp_path):
+        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
+        assert len(agent.tools) == 3
+
+    def test_agent_tool_names(self, tmp_path):
+        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
+        names = {t.name for t in agent.tools}
+        assert "list_files" in names
+        assert "read_file" in names
+        assert "get_page_content" in names
+        assert "pageindex_retrieve" not in names
+
+    def test_instructions_mention_get_page_content(self, tmp_path):
+        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
+        assert "get_page_content" in agent.instructions
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run: `pytest tests/test_query.py::TestBuildQueryAgent -v`
+Expected: FAIL — old signature requires `openkb_dir`
+
+- [ ] **Step 3: Rewrite `query.py`**
+
+Remove `_pageindex_retrieve_impl` entirely (~110 lines). Remove `PageIndexClient` import. Update `build_query_agent`:
+
+```python
+def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent:
+    """Build and return the Q&A agent."""
+    schema_md = get_agents_md(Path(wiki_root))
+    instructions = _QUERY_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md)
+    instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language."
+
+    @function_tool
+    def list_files(directory: str) -> str:
+        """List all Markdown files in a wiki subdirectory."""
+        return list_wiki_files(directory, wiki_root)
+
+    @function_tool
+    def read_file(path: str) -> str:
+        """Read a Markdown file from the wiki."""
+        return read_wiki_file(path, wiki_root)
+
+    @function_tool(name_override="get_page_content")
+    def get_page_content_tool(doc_name: str, pages: str) -> str:
+        """Get text content of specific pages from a long document.
+
+        Args:
+            doc_name: Document name (e.g. 'attention-is-all-you-need').
+            pages: Page specification (e.g. '3-5,7,10-12').
+        """
+        from openkb.agent.tools import get_page_content
+        return get_page_content(doc_name, pages, wiki_root)
+
+    from agents.model_settings import ModelSettings
+
+    return Agent(
+        name="wiki-query",
+        instructions=instructions,
+        tools=[list_files, read_file, get_page_content_tool],
+        model=f"litellm/{model}",
+        model_settings=ModelSettings(parallel_tool_calls=False),
+    )
+```
+
+Update `_QUERY_INSTRUCTIONS_TEMPLATE`:
+
+```python
+_QUERY_INSTRUCTIONS_TEMPLATE = """\
+You are a knowledge-base Q&A agent. You answer questions by searching the wiki.
+
+{schema_md}
+
+## Search strategy
+1. Read index.md to understand what documents and concepts are available.
+   Each entry has a brief summary to help you judge relevance.
+2. Read relevant summary pages (summaries/) for document overviews.
+3. Read concept pages (concepts/) for cross-document synthesis.
+4. For long documents, use get_page_content(doc_name, pages) to read
+   specific pages when you need detailed content. The summary page
+   shows chapter structure with page ranges to help you decide which
+   pages to read.
+5. Synthesise a clear, well-cited answer.
+
+Always ground your answer in the wiki content. If you cannot find relevant
+information, say so clearly.
+"""
+```
+
+Update `run_query` to match the new `build_query_agent` signature (stop passing `openkb_dir` to it; the local variable is still needed to load the config):
+
+```python
+async def run_query(question: str, kb_dir: Path, model: str, stream: bool = False) -> str:
+    from openkb.config import load_config
+    openkb_dir = kb_dir / ".openkb"
+    config = load_config(openkb_dir / "config.yaml")
+    language: str = config.get("language", "en")
+
+    wiki_root = str(kb_dir / "wiki")
+    agent = build_query_agent(wiki_root, model, language=language)
+    # ... rest unchanged ...
+```
+
+- [ ] **Step 4: Update `openkb/schema.py` AGENTS_MD**
+
+Update the Page Types section of `AGENTS_MD` so that the Source Page entry covers the per-page JSON files that back `get_page_content` for long documents:
+
+```python
+## Page Types
+- **Summary Page** (summaries/): Key content of a single source document.
+- **Concept Page** (concepts/): Cross-document topic synthesis with [[wikilinks]].
+- **Exploration Page** (explorations/): Saved query results — analyses, comparisons, syntheses.
+- **Source Page** (sources/): Full-text for short docs (.md) or per-page JSON for long docs (.json).
+- **Index Page** (index.md): One-liner summary of every page in the wiki. Auto-maintained.
+```
+
+- [ ] **Step 5: Run all tests**
+
+Run: `pytest tests/ -q`
+Expected: All PASS
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add openkb/agent/query.py openkb/schema.py tests/test_query.py
+git commit -m "feat: replace pageindex_retrieve with get_page_content, unify query for all docs"
+```
+
+---
+
+### Task 8: Final cleanup and full verification
+
+**Files:**
+- Modify: `openkb/indexer.py` (remove unused imports)
+- Verify all files
+
+- [ ] **Step 1: Remove unused imports**
+
+In `indexer.py`, remove `from openkb.tree_renderer import render_source_md` if still present (keep `render_summary_md`).
+
+In `query.py`, verify `PageIndexClient` import is removed.
+
+- [ ] **Step 2: Run full test suite**
+
+Run: `pytest tests/ -v`
+Expected: All PASS
+
+- [ ] **Step 3: Grep for dead references**
+
+Run: `grep -r "pageindex_retrieve\|render_source_md\|_relocate_images" openkb/ tests/`
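+Expected: No matches, except the intentional `"pageindex_retrieve" not in names` assertion in `tests/test_query.py` (added in Task 7)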
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add -A
+git commit -m "chore: remove dead imports and references"
+```
diff --git a/docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md b/docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md
new file mode 100644
index 0000000..2fcd853
--- /dev/null
+++ b/docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md
@@ -0,0 +1,163 @@
+# Concept Dedup & Existing Page Update
+
+**Date:** 2026-04-09
+**Status:** Approved
+**Branch:** bugfix/compile
+
+## Problem
+
+The compiler pipeline generates concept pages per document, but:
+
+1. **No dedup** — The LLM only sees concept slug names, not content, so it can't reliably judge whether a new concept overlaps with an existing one. As the KB grows, concepts duplicate and diverge.
+2. **No update of existing pages** — When a new document has information relevant to existing concepts, those pages are not updated. Knowledge doesn't compound across documents.
+
+The old agent-based approach solved this (the agent could read/write wiki files freely), but was too slow — 20-30 tool-call round-trips per document.
+
+## Design
+
+Extend the existing deterministic pipeline to give the LLM enough context for dedup/update decisions, without adding agent loops or breaking prompt caching.
+
+### Prompt Caching Invariant
+
+The cached prefix `[system_msg, doc_msg]` must remain identical across all LLM calls within a single document compilation. All new context (concept briefs, existing page content) goes into messages **after** the cached prefix.
+
+### Pipeline Overview
+
+```
+Step 1: [system, doc] → summary                          (unchanged)
+Step 2: [system, doc, summary, concepts_plan_prompt] → concepts plan JSON
+Step 3a: [system, doc, summary, create_prompt] × N  → new concept pages     (concurrent)
+Step 3b: [system, doc, summary, update_prompt] × M  → rewritten concept pages (concurrent)
+Step 3c: code-only × K                              → add cross-ref links to related concepts
+Step 4: update index                                 (unchanged)
+```
+
+Steps 3a and 3b share a single semaphore and run concurrently together.
+
+### Part 1: Concept Briefs
+
+New function `_read_concept_briefs(wiki_dir)` reads existing concept pages and returns a compact summary string:
+
+```
+- attention: Attention is a mechanism that allows models to focus on relevant parts...
+- transformer-architecture: The Transformer is a neural network architecture...
+```
+
+For each concept file in `wiki/concepts/*.md`:
+- Skip YAML frontmatter
+- Take first 150 characters of body text
+- Format as `- {slug}: {brief}`
+
+This replaces the current `", ".join(existing_concepts)` in the concepts-list prompt. Pure file I/O, no LLM call.
+
+### Part 2: Concepts Plan Prompt
+
+The `_CONCEPTS_LIST_USER` template is replaced with a new `_CONCEPTS_PLAN_USER` template that asks the LLM to return a JSON object with three action types:
+
+```json
+{
+  "create": [{"name": "flash-attention", "title": "Flash Attention"}],
+  "update": [{"name": "attention", "title": "Attention Mechanism"}],
+  "related": ["transformer-architecture"]
+}
+```
+
+- **create** — New concept not covered by any existing page.
+- **update** — Existing concept with significant new information worth integrating.
+- **related** — Existing concept tangentially related; only needs a cross-reference link.
+
+The prompt includes rules:
+- Don't create concepts that overlap with existing ones — use "update" instead.
+- Don't create concepts that are just the document topic itself.
+- For the first few documents, create 2-3 foundational concepts at most.
+- "related" is for lightweight cross-linking only.
+
+### Part 3: Three Execution Paths
+
+#### create (unchanged)
+
+Same as current: concurrent `_llm_call_async` with `_CONCEPT_PAGE_USER` template. Written via `_write_concept` with `is_update=False`.
+
+#### update (new)
+
+New template `_CONCEPT_UPDATE_USER`:
+
+```
+Update the concept page for: {title}
+
+Current content of this page:
+{existing_content}
+
+New information from document "{doc_name}" (summarized above) should be
+integrated into this page. Rewrite the full page incorporating the new
+information naturally. Maintain existing cross-references and add new ones
+where appropriate.
+
+Return ONLY the Markdown content (no frontmatter, no code fences).
+```
+
+Call structure: `[system_msg, doc_msg, {assistant: summary}, update_user_msg]`
+
+The cached prefix `[system_msg, doc_msg]` is shared with create calls. The `existing_content` (typically 200-500 tokens) is in the final user message only.
+
+Written via `_write_concept` with `is_update=True`. The frontmatter `sources:` list is updated to include the new source file.
+
+#### related (code-only, no LLM)
+
+For each related slug:
+1. Read the concept file
+2. If `summaries/{doc_name}` is not already linked, append `\n\nSee also: [[summaries/{doc_name}]]`
+3. Update frontmatter `sources:` list
+
+Pure file I/O, millisecond-level.
+
+### Part 4: Shared Logic Between Short and Long Doc
+
+Currently, `compile_short_doc` and `compile_long_doc` duplicate Steps 2-4. Extract the shared logic into `_compile_concepts(wiki_dir, kb_dir, model, system_msg, doc_msg, summary, doc_name, max_concurrency)`.
+
+Public functions become:
+- `compile_short_doc`: builds context A from source text → calls `_compile_concepts`
+- `compile_long_doc`: builds context A from PageIndex summary → calls `_compile_concepts`
+
+### Part 5: JSON Parsing Fallback
+
+If the LLM returns a flat JSON array instead of the expected dict, treat it as all "create" actions:
+
+```python
+if isinstance(parsed, list):
+    create_list, update_list, related_list = parsed, [], []
+else:
+    create_list = parsed.get("create", [])
+    update_list = parsed.get("update", [])
+    related_list = parsed.get("related", [])
+```
+
+This ensures backward compatibility if the LLM doesn't follow the new format.
+
+## Token Cost Analysis
+
+Compared to current pipeline (per document with C existing concepts):
+
+| Step | Current | New | Delta |
+|------|---------|-----|-------|
+| concepts-list prompt | ~50 tokens (slug names) | ~50 + C×30 tokens (briefs) | +C×30 |
+| update calls | 0 | M × ~500 tokens (existing content) | +M×500 |
+| related | 0 | 0 (code-only) | 0 |
+
+At C=30 existing concepts: +900 tokens in concepts-list prompt.
+At M=2 update calls: +1000 tokens total.
+
+Total overhead: ~2000 tokens per document. Negligible compared to document content (5K-20K tokens).
+
+## Files Changed
+
+- `openkb/agent/compiler.py` — all changes
+  - New: `_read_concept_briefs()`, `_CONCEPTS_PLAN_USER`, `_CONCEPT_UPDATE_USER`, `_add_related_link()`, `_compile_concepts()`
+  - Modified: `compile_short_doc()`, `compile_long_doc()`, `_parse_json()` caller logic
+- `tests/test_compiler.py` — update tests for new JSON format and update/related paths
+
+## Not In Scope
+
+- Concept briefs truncation/filtering for very large KBs (100+ concepts) — revisit when needed
+- Interactive ingest (human-in-the-loop checkpoint) — separate feature
+- Lint --fix auto-repair — separate feature
diff --git a/docs/superpowers/specs/2026-04-09-retrieve-redesign.md b/docs/superpowers/specs/2026-04-09-retrieve-redesign.md
new file mode 100644
index 0000000..15224be
--- /dev/null
+++ b/docs/superpowers/specs/2026-04-09-retrieve-redesign.md
@@ -0,0 +1,262 @@
+# Retrieve Redesign: Unified Query, Brief Summaries, and Local Page Content
+
+**Date:** 2026-04-09
+**Status:** Approved
+**Branch:** bugfix/compile
+
+## Problems
+
+### 1. Long vs Short Doc Split in Query
+
+The query agent treats long documents (PageIndex-indexed) and short documents differently:
+
+- **Short docs**: agent reads `wiki/sources/{name}.md` via `read_file`
+- **Long docs**: agent calls `pageindex_retrieve(doc_id, question)` — a black-box RAG call
+
+**Design Principle**: PageIndex is an indexer, not a retriever. Query-time retrieval should be done by the agent navigating the wiki, using the same tools for all documents.
+
+### 2. index.md Has No Brief Summaries
+
+Karpathy's gist says index.md should have "each page listed with a link, **a one-line summary**". Currently it only has wikilinks with no descriptions. The query agent must open every file to understand what's available.
+
+### 3. No Brief Summaries on Concepts Either
+
+Same problem: concept entries in index.md have no description. The agent can't judge relevance from the index alone.
+
+## Design
+
+### Part 1: Structured LLM Output with Brief Summaries
+
+All LLM generation steps (summary, concept create, concept update) now return a JSON object with both a one-line brief and the full content.
+
+#### Summary Generation
+
+`_SUMMARY_USER` prompt changes to request JSON output:
+
+```
+Write a summary page for this document in Markdown.
+
+Return a JSON object with two keys:
+- "brief": A single sentence (under 100 chars) describing the document's main contribution
+- "content": The full summary in Markdown. Include key concepts, findings, and [[wikilinks]]
+
+Return ONLY valid JSON, no fences.
+```
+
+LLM returns:
+```json
+{
+  "brief": "Introduces the Transformer architecture based entirely on self-attention",
+  "content": "# Attention Is All You Need\n\nThis paper proposes..."
+}
+```
+
+The `brief` is:
+- Written into summary frontmatter: `brief: Introduces the Transformer...`
+- Passed to `_update_index` for the Documents section
+
+The `content` is written to `wiki/summaries/{name}.md` as before.
+
+#### Concept Generation (create)
+
+`_CONCEPT_PAGE_USER` prompt changes similarly:
+
+```
+Write the concept page for: {title}
+
+Return a JSON object with two keys:
+- "brief": A single sentence (under 100 chars) defining this concept
+- "content": The full concept page in Markdown with [[wikilinks]]
+
+Return ONLY valid JSON, no fences.
+```
+
+The `brief` is:
+- Written into concept frontmatter: `brief: Mechanism allowing each position to attend to all others`
+- Passed to `_update_index` for the Concepts section
+- Used by `_read_concept_briefs` (read from frontmatter instead of truncating body text)
+
+#### Concept Generation (update)
+
+`_CONCEPT_UPDATE_USER` also returns `{"brief": "...", "content": "..."}`. The brief may change as the concept evolves with new information.
+
+#### Long Doc Summary (overview)
+
+Long documents do NOT need the LLM to generate a brief. The brief comes directly from PageIndex's `doc_description` field (available via `IndexResult.description`), which is already a document-level summary generated during indexing. `_LONG_DOC_SUMMARY_USER` stays unchanged (returns plain markdown overview, not JSON) — the brief is passed through from the indexer.
+
+In `compile_long_doc`, the `doc_description` is passed to `_compile_concepts` which forwards it to `_update_index` as the doc brief.
+
+#### Parsing
+
+All JSON-returning LLM responses (summary, concept create, concept update) go through `_parse_json`. Callers extract `brief` and `content`:
+
+```python
+parsed = _parse_json(raw)
+brief = parsed.get("brief", "")
+content = parsed.get("content", raw)  # fallback: treat raw as content if not JSON
+```
+
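+The fallback ensures backward compatibility if the LLM returns plain text instead of JSON. Because `_parse_json` raises on input that is not valid JSON, callers must also catch that error and use `raw` as the content.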
+
+### Part 2: index.md with Brief Summaries
+
+`_update_index` signature changes:
+
+```python
+def _update_index(wiki_dir, doc_name, concept_names, doc_brief="", concept_briefs=None):
+```
+
+Output format:
+
+```markdown
+## Documents
+- [[summaries/attention-is-all-you-need]] — Introduces the Transformer architecture based on self-attention
+- [[summaries/flash-attention]] — Efficient attention algorithm reducing memory from quadratic to linear
+
+## Concepts
+- [[concepts/self-attention]] — Mechanism allowing each position to attend to all others in a sequence
+- [[concepts/transformer]] — Neural network architecture based entirely on attention mechanisms
+```
+
+When updating an existing entry (re-compile), the brief is updated in place.
+
+### Part 3: Frontmatter with Brief
+
+Summary and concept pages get a `brief` field in frontmatter:
+
+```markdown
+---
+sources: [paper.pdf]
+brief: Introduces the Transformer architecture based on self-attention
+---
+
+# Attention Is All You Need
+...
+```
+
+`_read_concept_briefs` is updated to read from the `brief:` frontmatter field instead of truncating body text. It falls back to body truncation if `brief:` is absent (backward compat with existing pages).
+
+### Part 4: Long Doc Sources from Markdown to JSON
+
+Store per-page content as JSON instead of a giant markdown file.
+
+**Current**:
+```
+wiki/sources/paper.md          ← rendered markdown, 10K-50K tokens
+```
+
+**New**:
+```
+wiki/sources/paper.json        ← per-page JSON array
+```
+
+**JSON format** (only the `pages` array from PageIndex, not the full doc object):
+```json
+[
+    {
+        "page": 1,
+        "content": "Full text of page 1...",
+        "images": [{"path": "images/paper/p1_img1.png", "width": 400, "height": 300}]
+    },
+    {
+        "page": 2,
+        "content": "Full text of page 2..."
+    }
+]
+```
+
+The `images` field is optional. Image paths are relative to `wiki/sources/`. Short documents are not affected — they stay as `.md`.
+
+#### Indexer Changes
+
+In `indexer.py`, replace `render_source_md` + `_relocate_images` with:
+1. `col.get_page_content(doc_id, "1-9999")` to get all pages
+2. Relocate image paths in each page's `images` array
+3. Write as JSON to `wiki/sources/{name}.json`
+
+### Part 5: New Tool `get_page_content`
+
+Add to `openkb/agent/tools.py`:
+
+```python
+def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str:
+    """Get text content of specific pages from a long document.
+
+    Args:
+        doc_name: Document name (e.g. 'attention-is-all-you-need').
+        pages: Page specification (e.g. '3-5,7,10-12').
+        wiki_root: Absolute path to the wiki root directory.
+    """
+```
+
+Implementation:
+1. Read `wiki/sources/{doc_name}.json`
+2. Parse `pages` spec into a set of page numbers (comma-separated, ranges with `-`)
+3. Filter pages, format as `[Page N]\n{content}\n\n`
+4. Return concatenated text, or error if file not found
+
+### Part 6: Query Agent Changes
+
+**Remove**: `pageindex_retrieve` tool and `_pageindex_retrieve_impl` entirely.
+
+**Add**: `get_page_content` tool.
+
+**Update instructions**:
+```
+## Search strategy
+1. Read index.md to understand what documents and concepts are available.
+   Each entry has a brief summary to help you judge relevance.
+2. Read relevant summary pages (summaries/) for document overviews.
+3. Read concept pages (concepts/) for cross-document synthesis.
+4. For long documents, use get_page_content(doc_name, pages) to read
+   specific pages. The summary page shows chapter structure with page
+   ranges to help you decide which pages to read.
+5. Synthesise a clear, well-cited answer.
+```
+
+**Remove**: `openkb_dir` and `model` parameters from `build_query_agent`.
+
+### What Gets Removed
+
+- `_pageindex_retrieve_impl` (~110 lines)
+- `pageindex_retrieve` tool
+- `render_source_md` from `tree_renderer.py`
+- `_relocate_images` in current form (replaced by per-page relocation)
+- PageIndex imports in `query.py`
+
+### What Stays
+
+- `render_summary_md` — summaries still markdown
+- Short doc pipeline — unchanged
+- Image files in `wiki/sources/images/`
+- PageIndex in `indexer.py` — still used for tree building
+
+## Compile Pipeline Changes Summary
+
+The compile pipeline (`_compile_concepts`, `compile_short_doc`, `compile_long_doc`) changes:
+
+1. **Summary step**: parse JSON response, extract `brief` + `content`
+2. **Concept create/update steps**: parse JSON response, extract `brief` + `content`
+3. **`_write_summary`**: add `brief` to frontmatter
+4. **`_write_concept`**: add/update `brief` in frontmatter
+5. **`_update_index`**: write `— {brief}` after each wikilink
+6. **`_read_concept_briefs`**: read from `brief:` frontmatter field (fallback to body truncation)
+
+## Files Changed
+
+- `openkb/agent/compiler.py` — prompt templates return JSON with brief+content, parse responses, pass briefs to index/frontmatter
+- `openkb/indexer.py` — sources output from md to json, image relocation per-page
+- `openkb/agent/tools.py` — add `get_page_content`
+- `openkb/agent/query.py` — remove `pageindex_retrieve`, add `get_page_content`, update instructions
+- `openkb/tree_renderer.py` — remove `render_source_md`
+- `openkb/schema.py` — update AGENTS_MD
+- `tests/test_compiler.py` — update for JSON LLM responses
+- `tests/test_indexer.py` — update for JSON output
+- `tests/test_query.py` — update for new tool set
+- `tests/test_agent_tools.py` — add tests for `get_page_content`
+
+## Not In Scope
+
+- Cloud PageIndex query support (removed entirely)
+- Changes to the lint pipeline
+- Interactive ingest

From 39ae5c5fa8a0a3bbe5698574a5b62e48ff5d23f2 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 23:33:27 +0800
Subject: [PATCH 09/44] feat: add get_page_content tool and parse_pages helper

Adds parse_pages() to expand page specs like "1-3,7" into sorted
deduplicated int lists, and get_page_content() to read per-page JSON
(sources/{doc}.json) and format output with optional image paths.
Includes path-traversal guard consistent with existing tools.
---
 openkb/agent/tools.py     | 81 ++++++++++++++++++++++++++++++++++++
 tests/test_agent_tools.py | 87 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 167 insertions(+), 1 deletion(-)

diff --git a/openkb/agent/tools.py b/openkb/agent/tools.py
index 185344b..0d1164c 100644
--- a/openkb/agent/tools.py
+++ b/openkb/agent/tools.py
@@ -6,6 +6,7 @@
 """
 from __future__ import annotations
 
+import json as _json
 from pathlib import Path
 
 
@@ -52,6 +53,86 @@ def read_wiki_file(path: str, wiki_root: str) -> str:
     return full_path.read_text(encoding="utf-8")
 
 
+def parse_pages(pages: str) -> list[int]:
+    """Parse a page specification string into a sorted, deduplicated list of page numbers.
+
+    Args:
+        pages: Page spec such as ``"3-5,7,10-12"``.
+
+    Returns:
+        Sorted list of positive page numbers, e.g. ``[3, 4, 5, 7, 10, 11, 12]``.
+    """
+    result: set[int] = set()
+    for part in pages.split(","):
+        part = part.strip()
+        if "-" in part:
+            # Handle ranges like "3-5".  The part is split on every "-", so a
+            # leading minus sign shows up as an empty first segment.
+            segments = part.split("-")
+            # segments[0] is empty if part starts with "-" (e.g. "-1" gives
+            # ['', '1']); int("") then raises ValueError and the part is skipped.
+            try:
+                if len(segments) == 2:
+                    start, end = int(segments[0]), int(segments[1])
+                    result.update(range(start, end + 1))
+                elif len(segments) == 3 and segments[0] == "":
+                    # e.g. "-1-3" splits into ['', '1', '3']; only -1 is recorded
+                    result.add(-int(segments[1]))
+                # More complex cases (e.g. negative range) are ignored.
+            except ValueError:
+                pass
+        else:
+            try:
+                result.add(int(part))
+            except ValueError:
+                pass
+    return sorted(n for n in result if n > 0)
+
+
+def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str:
+    """Return formatted content for specified pages of a document.
+
+    Reads ``{wiki_root}/sources/{doc_name}.json`` which must be a JSON array of
+    objects with at least ``{"page": int, "content": str}`` fields and an
+    optional ``"images"`` list of ``{"path": str, ...}`` objects.
+
+    Args:
+        doc_name: Document name without extension (e.g. ``"paper"``).
+        pages: Page specification string (e.g. ``"1-3,7"``).
+        wiki_root: Absolute path to the wiki root directory.
+
+    Returns:
+        Formatted page content, or an error message string.
+    """
+    root = Path(wiki_root).resolve()
+    target = (root / "sources" / f"{doc_name}.json").resolve()
+    if not target.is_relative_to(root):
+        return "Access denied: path escapes wiki root."
+    if not target.exists():
+        return f"File not found: sources/{doc_name}.json"
+
+    data = _json.loads(target.read_text(encoding="utf-8"))
+    requested = set(parse_pages(pages))
+    matches = [entry for entry in data if entry.get("page") in requested]
+
+    if not matches:
+        return f"No content found for pages {pages} in {doc_name}."
+
+    parts: list[str] = []
+    for entry in matches:
+        page_num = entry["page"]
+        content = entry.get("content", "")
+        block = f"[Page {page_num}]\n{content}"
+        images = entry.get("images")
+        if images:
+            paths = ", ".join(img["path"] for img in images if "path" in img)
+            if paths:
+                block += f"\n[Images: {paths}]"
+        parts.append(block)
+
+    return "\n\n".join(parts) + "\n\n"
+
+
 def write_wiki_file(path: str, content: str, wiki_root: str) -> str:
     """Write or overwrite a Markdown file in the wiki.
 
diff --git a/tests/test_agent_tools.py b/tests/test_agent_tools.py
index bfffc2f..3d95a88 100644
--- a/tests/test_agent_tools.py
+++ b/tests/test_agent_tools.py
@@ -5,7 +5,7 @@
 
 import pytest
 
-from openkb.agent.tools import list_wiki_files, read_wiki_file, write_wiki_file
+from openkb.agent.tools import get_page_content, list_wiki_files, parse_pages, read_wiki_file, write_wiki_file
 
 
 # ---------------------------------------------------------------------------
@@ -128,3 +128,88 @@ def test_returns_written_path(self, tmp_path):
         result = write_wiki_file("reports/health.md", "All good.", wiki_root)
 
         assert result == "Written: reports/health.md"
+
+
+# ---------------------------------------------------------------------------
+# parse_pages
+# ---------------------------------------------------------------------------
+
+
+class TestParsePages:
+    def test_single_page(self):
+        assert parse_pages("3") == [3]
+
+    def test_range(self):
+        assert parse_pages("3-5") == [3, 4, 5]
+
+    def test_comma_separated(self):
+        assert parse_pages("1,3,5") == [1, 3, 5]
+
+    def test_mixed(self):
+        assert parse_pages("1-3,7,10-12") == [1, 2, 3, 7, 10, 11, 12]
+
+    def test_deduplication(self):
+        assert parse_pages("3,3,3") == [3]
+
+    def test_sorted(self):
+        assert parse_pages("5,1,3") == [1, 3, 5]
+
+    def test_ignores_zero_and_negative(self):
+        assert parse_pages("0,-1,3") == [3]
+
+
+# ---------------------------------------------------------------------------
+# get_page_content
+# ---------------------------------------------------------------------------
+
+
+class TestGetPageContent:
+    def test_reads_pages_from_json(self, tmp_path):
+        import json
+        wiki_root = str(tmp_path)
+        sources = tmp_path / "sources"
+        sources.mkdir()
+        pages = [
+            {"page": 1, "content": "Page one text."},
+            {"page": 2, "content": "Page two text."},
+            {"page": 3, "content": "Page three text."},
+        ]
+        (sources / "paper.json").write_text(json.dumps(pages), encoding="utf-8")
+        result = get_page_content("paper", "1,3", wiki_root)
+        assert "[Page 1]" in result
+        assert "Page one text." in result
+        assert "[Page 3]" in result
+        assert "Page three text." in result
+        assert "Page two" not in result
+
+    def test_returns_error_for_missing_file(self, tmp_path):
+        wiki_root = str(tmp_path)
+        (tmp_path / "sources").mkdir()
+        result = get_page_content("nonexistent", "1", wiki_root)
+        assert "not found" in result.lower()
+
+    def test_returns_error_for_no_matching_pages(self, tmp_path):
+        import json
+        wiki_root = str(tmp_path)
+        sources = tmp_path / "sources"
+        sources.mkdir()
+        pages = [{"page": 1, "content": "Only page."}]
+        (sources / "paper.json").write_text(json.dumps(pages), encoding="utf-8")
+        result = get_page_content("paper", "99", wiki_root)
+        assert "no content" in result.lower()
+
+    def test_includes_images_info(self, tmp_path):
+        import json
+        wiki_root = str(tmp_path)
+        sources = tmp_path / "sources"
+        sources.mkdir()
+        pages = [{"page": 1, "content": "Text.", "images": [{"path": "images/p/img.png", "width": 100, "height": 80}]}]
+        (sources / "doc.json").write_text(json.dumps(pages), encoding="utf-8")
+        result = get_page_content("doc", "1", wiki_root)
+        assert "img.png" in result
+
+    def test_path_escape_denied(self, tmp_path):
+        wiki_root = str(tmp_path)
+        (tmp_path / "sources").mkdir()
+        result = get_page_content("../../etc/passwd", "1", wiki_root)
+        assert "denied" in result.lower() or "not found" in result.lower()

From b6ce04e02267751c423c205a80543474568bf4f2 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 23:33:29 +0800
Subject: [PATCH 10/44] feat: update LLM prompts to return brief+content JSON

Replace _SUMMARY_USER, _CONCEPT_PAGE_USER, and _CONCEPT_UPDATE_USER to
request a JSON object with "brief" (one-line summary) and "content" (full
Markdown). Add TestParseBriefContent to tests/test_compiler.py.
---
 openkb/agent/compiler.py | 28 +++++++++++++++++++---------
 tests/test_compiler.py   | 13 +++++++++++++
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index 5075278..947b0cc 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -43,11 +43,14 @@
 Full text:
 {content}
 
-Write a summary page for this document in Markdown. Include:
-- Key concepts, findings, and ideas
-- [[wikilinks]] to concepts that could become cross-document concept pages
+Write a summary page for this document in Markdown.
 
-Return ONLY the Markdown content (no frontmatter, no code fences).
+Return a JSON object with two keys:
+- "brief": A single sentence (under 100 chars) describing the document's main contribution
+- "content": The full summary in Markdown. Include key concepts, findings, ideas, \
+and [[wikilinks]] to concepts that could become cross-document concept pages
+
+Return ONLY valid JSON, no fences.
 """
 
 
@@ -84,10 +87,13 @@
 This concept relates to the document "{doc_name}" summarized above.
 {update_instruction}
 
-Return ONLY the Markdown content (no frontmatter, no code fences). Include:
-- Clear explanation of the concept
-- Key details from the source document
-- [[wikilinks]] to related concepts and [[summaries/{doc_name}]]
+Return a JSON object with two keys:
+- "brief": A single sentence (under 100 chars) defining this concept
+- "content": The full concept page in Markdown. Include clear explanation, \
+key details from the source document, and [[wikilinks]] to related concepts \
+and [[summaries/{doc_name}]]
+
+Return ONLY valid JSON, no fences.
 """
 
 _CONCEPT_UPDATE_USER = """\
@@ -101,7 +107,11 @@
 information naturally — do not just append. Maintain existing \
 [[wikilinks]] and add new ones where appropriate.
 
-Return ONLY the Markdown content (no frontmatter, no code fences).
+Return a JSON object with two keys:
+- "brief": A single sentence (under 100 chars) defining this concept (may differ from before)
+- "content": The rewritten full concept page in Markdown
+
+Return ONLY valid JSON, no fences.
 """
 
 _LONG_DOC_SUMMARY_USER = """\
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index e1238df..2d5f376 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -61,6 +61,19 @@ def test_fenced_dict(self):
         assert parsed["create"] == []
 
 
+class TestParseBriefContent:
+    def test_dict_with_brief_and_content(self):
+        text = json.dumps({"brief": "A short desc", "content": "# Full page\n\nDetails."})
+        parsed = _parse_json(text)
+        assert parsed["brief"] == "A short desc"
+        assert "# Full page" in parsed["content"]
+
+    def test_plain_text_fallback(self):
+        """If LLM returns plain text, _parse_json raises — caller handles fallback."""
+        with pytest.raises((json.JSONDecodeError, ValueError)):
+            _parse_json("Just plain markdown text without JSON")
+
+
 class TestWriteSummary:
     def test_writes_with_frontmatter(self, tmp_path):
         wiki = tmp_path / "wiki"

From a172c433ac23d9f5ebca38adba98813a8b7c3214 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 23:35:25 +0800
Subject: [PATCH 11/44] feat: store brief in frontmatter of summary and concept
 pages

---
 openkb/agent/compiler.py | 48 ++++++++++++++++++++++++++++++----------
 tests/test_compiler.py   | 42 ++++++++++++++++++++++++++++++++---
 2 files changed, 75 insertions(+), 15 deletions(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index 947b0cc..62ab44f 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -243,8 +243,9 @@ def _read_wiki_context(wiki_dir: Path) -> tuple[str, list[str]]:
 def _read_concept_briefs(wiki_dir: Path) -> str:
     """Read existing concept pages and return compact one-line summaries.
 
-    For each concept, skips YAML frontmatter, takes the first 150 chars of the
-    body (newlines collapsed to spaces), and formats as ``- {slug}: {brief}``.
+    For each concept, reads the ``brief:`` field from YAML frontmatter if
+    present; otherwise falls back to truncating the first 150 chars of the body
+    (newlines collapsed to spaces).  Formats each as ``- {slug}: {brief}``.
 
     Returns "(none yet)" if the concepts directory is missing or empty.
     """
@@ -259,16 +260,23 @@ def _read_concept_briefs(wiki_dir: Path) -> str:
     lines: list[str] = []
     for path in md_files:
         text = path.read_text(encoding="utf-8")
-        # Strip YAML frontmatter if present
+        brief = ""
+        body = text
         if text.startswith("---"):
             end = text.find("---", 3)
             if end != -1:
-                text = text[end + 3:]
-        body = text.strip().replace("\n", " ")
-        brief = body[:150]
-        lines.append(f"- {path.stem}: {brief}")
+                fm = text[:end + 3]
+                body = text[end + 3:]
+                for line in fm.split("\n"):
+                    if line.startswith("brief:"):
+                        brief = line[len("brief:"):].strip()
+                        break
+        if not brief:
+            brief = body.strip().replace("\n", " ")[:150]
+        if brief:
+            lines.append(f"- {path.stem}: {brief}")
 
-    return "\n".join(lines)
+    return "\n".join(lines) or "(none yet)"
 
 
 def _find_source_filename(doc_name: str, kb_dir: Path) -> str:
@@ -281,11 +289,14 @@ def _find_source_filename(doc_name: str, kb_dir: Path) -> str:
     return f"{doc_name}.pdf"
 
 
-def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str) -> None:
+def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str, brief: str = "") -> None:
     """Write summary page with frontmatter."""
     summaries_dir = wiki_dir / "summaries"
     summaries_dir.mkdir(parents=True, exist_ok=True)
-    frontmatter = f"---\nsources: [{source_file}]\n---\n\n"
+    fm_lines = [f"sources: [{source_file}]"]
+    if brief:
+        fm_lines.append(f"brief: {brief}")
+    frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n"
     (summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8")
 
 
@@ -298,7 +309,7 @@ def _sanitize_concept_name(name: str) -> str:
     return sanitized or "unnamed-concept"
 
 
-def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is_update: bool) -> None:
+def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is_update: bool, brief: str = "") -> None:
     """Write or update a concept page, managing the sources frontmatter."""
     concepts_dir = wiki_dir / "concepts"
     concepts_dir.mkdir(parents=True, exist_ok=True)
@@ -324,9 +335,22 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
             else:
                 existing = f"---\nsources: [{source_file}]\n---\n\n" + existing
             existing += f"\n\n{content}"
+        if brief and existing.startswith("---"):
+            end = existing.find("---", 3)
+            if end != -1:
+                fm = existing[:end + 3]
+                body = existing[end + 3:]
+                if "brief:" in fm:
+                    fm = re.sub(r"brief:.*", f"brief: {brief}", fm)
+                else:
+                    fm = fm.replace("---\n", f"---\nbrief: {brief}\n", 1)
+                existing = fm + body
         path.write_text(existing, encoding="utf-8")
     else:
-        frontmatter = f"---\nsources: [{source_file}]\n---\n\n"
+        fm_lines = [f"sources: [{source_file}]"]
+        if brief:
+            fm_lines.append(f"brief: {brief}")
+        frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n"
         path.write_text(frontmatter + content, encoding="utf-8")
 
 
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index 2d5f376..b64ce31 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -78,25 +78,61 @@ class TestWriteSummary:
     def test_writes_with_frontmatter(self, tmp_path):
         wiki = tmp_path / "wiki"
         wiki.mkdir()
-        _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.")
+        _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.", brief="Introduces transformers")
         path = wiki / "summaries" / "my-doc.md"
         assert path.exists()
         text = path.read_text()
         assert "sources: [my-doc.pdf]" in text
+        assert "brief: Introduces transformers" in text
         assert "# Summary" in text
 
+    def test_writes_without_brief(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.")
+        path = wiki / "summaries" / "my-doc.md"
+        text = path.read_text()
+        assert "sources: [my-doc.pdf]" in text
+        assert "brief:" not in text
+
 
 class TestWriteConcept:
-    def test_new_concept(self, tmp_path):
+    def test_new_concept_with_brief(self, tmp_path):
         wiki = tmp_path / "wiki"
         wiki.mkdir()
-        _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False)
+        _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False, brief="Mechanism for selective focus")
         path = wiki / "concepts" / "attention.md"
         assert path.exists()
         text = path.read_text()
         assert "sources: [paper.pdf]" in text
+        assert "brief: Mechanism for selective focus" in text
         assert "# Attention" in text
 
+    def test_new_concept_without_brief(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False)
+        path = wiki / "concepts" / "attention.md"
+        assert path.exists()
+        text = path.read_text()
+        assert "sources: [paper.pdf]" in text
+        assert "brief:" not in text
+
+    def test_update_concept_updates_brief(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nsources: [paper1.pdf]\nbrief: Old brief\n---\n\n# Attention\n\nOld content.",
+            encoding="utf-8",
+        )
+        _write_concept(wiki, "attention", "New info.", "paper2.pdf", True, brief="Updated brief")
+        text = (concepts / "attention.md").read_text()
+        assert "paper2.pdf" in text
+        assert "paper1.pdf" in text
+        assert "brief: Updated brief" in text
+        assert "Old brief" not in text
+
     def test_update_concept_appends_source(self, tmp_path):
         wiki = tmp_path / "wiki"
         concepts = wiki / "concepts"

From ca2391297f2c16621ff5eeaab4503f9e151d233a Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 23:36:44 +0800
Subject: [PATCH 12/44] feat: add briefs to index.md entries and read from
 frontmatter

---
 openkb/agent/compiler.py | 29 ++++++++++++++++++-----
 tests/test_compiler.py   | 50 ++++++++++++++++++++++++++++++++++------
 2 files changed, 66 insertions(+), 13 deletions(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index 62ab44f..9a169b0 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -439,8 +439,19 @@ def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str])
         path.write_text(text, encoding="utf-8")
 
 
-def _update_index(wiki_dir: Path, doc_name: str, concept_names: list[str]) -> None:
-    """Append document and concept entries to index.md."""
+def _update_index(
+    wiki_dir: Path, doc_name: str, concept_names: list[str],
+    doc_brief: str = "", concept_briefs: dict[str, str] | None = None,
+) -> None:
+    """Append document and concept entries to index.md.
+
+    When ``doc_brief`` or entries in ``concept_briefs`` are provided, entries
+    are written as ``- [[link]] — brief text``.  Existing entries are detected
+    by the link part only, so a re-compile with a new brief adds no duplicate.
+    """
+    if concept_briefs is None:
+        concept_briefs = {}
+
     index_path = wiki_dir / "index.md"
     if not index_path.exists():
         index_path.write_text(
@@ -450,14 +461,20 @@ def _update_index(wiki_dir: Path, doc_name: str, concept_names: list[str]) -> No
 
     text = index_path.read_text(encoding="utf-8")
 
-    doc_entry = f"- [[summaries/{doc_name}]]"
-    if doc_entry not in text:
+    doc_link = f"[[summaries/{doc_name}]]"
+    if doc_link not in text:
+        doc_entry = f"- {doc_link}"
+        if doc_brief:
+            doc_entry += f" — {doc_brief}"
         if "## Documents" in text:
             text = text.replace("## Documents\n", f"## Documents\n{doc_entry}\n", 1)
 
     for name in concept_names:
-        concept_entry = f"- [[concepts/{name}]]"
-        if concept_entry not in text:
+        concept_link = f"[[concepts/{name}]]"
+        if concept_link not in text:
+            concept_entry = f"- {concept_link}"
+            if name in concept_briefs:
+                concept_entry += f" — {concept_briefs[name]}"
             if "## Concepts" in text:
                 text = text.replace("## Concepts\n", f"## Concepts\n{concept_entry}\n", 1)
 
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index b64ce31..085116d 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -149,30 +149,44 @@ def test_update_concept_appends_source(self, tmp_path):
 
 
 class TestUpdateIndex:
-    def test_appends_entries(self, tmp_path):
+    def test_appends_entries_with_briefs(self, tmp_path):
         wiki = tmp_path / "wiki"
         wiki.mkdir()
         (wiki / "index.md").write_text(
             "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
             encoding="utf-8",
         )
-        _update_index(wiki, "my-doc", ["attention", "transformer"])
+        _update_index(wiki, "my-doc", ["attention", "transformer"],
+                       doc_brief="Introduces transformers",
+                       concept_briefs={"attention": "Focus mechanism", "transformer": "NN architecture"})
         text = (wiki / "index.md").read_text()
-        assert "[[summaries/my-doc]]" in text
-        assert "[[concepts/attention]]" in text
-        assert "[[concepts/transformer]]" in text
+        assert "[[summaries/my-doc]] — Introduces transformers" in text
+        assert "[[concepts/attention]] — Focus mechanism" in text
+        assert "[[concepts/transformer]] — NN architecture" in text
 
     def test_no_duplicates(self, tmp_path):
         wiki = tmp_path / "wiki"
         wiki.mkdir()
         (wiki / "index.md").write_text(
-            "# Index\n\n## Documents\n- [[summaries/my-doc]]\n\n## Concepts\n",
+            "# Index\n\n## Documents\n- [[summaries/my-doc]] — Old brief\n\n## Concepts\n",
             encoding="utf-8",
         )
-        _update_index(wiki, "my-doc", [])
+        _update_index(wiki, "my-doc", [], doc_brief="New brief")
         text = (wiki / "index.md").read_text()
         assert text.count("[[summaries/my-doc]]") == 1
 
+    def test_backwards_compat_no_briefs(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        wiki.mkdir()
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
+            encoding="utf-8",
+        )
+        _update_index(wiki, "my-doc", ["attention"])
+        text = (wiki / "index.md").read_text()
+        assert "[[summaries/my-doc]]" in text
+        assert "[[concepts/attention]]" in text
+
 
 class TestReadWikiContext:
     def test_empty_wiki(self, tmp_path):
@@ -257,6 +271,28 @@ def test_sorted_alphabetically(self, tmp_path):
         slugs = [line.split(":")[0].lstrip("- ") for line in lines]
         assert slugs == ["apple", "mango", "zebra"]
 
+    def test_reads_brief_from_frontmatter(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "attention.md").write_text(
+            "---\nsources: [paper.pdf]\nbrief: Selective focus mechanism\n---\n\n# Attention\n\nLong content...",
+            encoding="utf-8",
+        )
+        result = _read_concept_briefs(wiki)
+        assert "- attention: Selective focus mechanism" in result
+
+    def test_falls_back_to_body_truncation(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        concepts = wiki / "concepts"
+        concepts.mkdir(parents=True)
+        (concepts / "old.md").write_text(
+            "---\nsources: [paper.pdf]\n---\n\nOld concept without brief field.",
+            encoding="utf-8",
+        )
+        result = _read_concept_briefs(wiki)
+        assert "- old: Old concept without brief field." in result
+
 
 class TestBacklinkSummary:
     def test_adds_missing_concept_links(self, tmp_path):

From 5b086a54835a2050f7997496b633d988864f4355 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 23:40:28 +0800
Subject: [PATCH 13/44] feat: wire brief+content JSON through compile pipeline
 to index and frontmatter

---
 openkb/agent/compiler.py | 53 ++++++++++++++++++-------
 openkb/cli.py            |  3 +-
 tests/test_compiler.py   | 86 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 120 insertions(+), 22 deletions(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index 9a169b0..b4b549e 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -497,6 +497,7 @@ async def _compile_concepts(
     summary: str,
     doc_name: str,
     max_concurrency: int,
+    doc_brief: str = "",
 ) -> None:
     """Shared Steps 2-4: concepts plan → generate/update → index.
 
@@ -546,11 +547,11 @@ async def _compile_concepts(
     # --- Step 3: Generate/update concept pages concurrently (A cached) ---
     semaphore = asyncio.Semaphore(max_concurrency)
 
-    async def _gen_create(concept: dict) -> tuple[str, str, bool]:
+    async def _gen_create(concept: dict) -> tuple[str, str, bool, str]:
         name = concept["name"]
         title = concept.get("title", name)
         async with semaphore:
-            page_content = await _llm_call_async(model, [
+            raw = await _llm_call_async(model, [
                 system_msg,
                 doc_msg,
                 {"role": "assistant", "content": summary},
@@ -559,9 +560,15 @@ async def _gen_create(concept: dict) -> tuple[str, str, bool]:
                     update_instruction="",
                 )},
             ], f"concept:{name}")
-        return name, page_content, False
-
-    async def _gen_update(concept: dict) -> tuple[str, str, bool]:
+        try:
+            parsed = _parse_json(raw)
+            brief = parsed.get("brief", "")
+            content = parsed.get("content", raw)
+        except (json.JSONDecodeError, ValueError):
+            brief, content = "", raw
+        return name, content, False, brief
+
+    async def _gen_update(concept: dict) -> tuple[str, str, bool, str]:
         name = concept["name"]
         title = concept.get("title", name)
         concept_path = wiki_dir / "concepts" / f"{name}.md"
@@ -575,7 +582,7 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool]:
         else:
             existing_content = "(page not found — create from scratch)"
         async with semaphore:
-            page_content = await _llm_call_async(model, [
+            raw = await _llm_call_async(model, [
                 system_msg,
                 doc_msg,
                 {"role": "assistant", "content": summary},
@@ -584,13 +591,20 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool]:
                     existing_content=existing_content,
                 )},
             ], f"update:{name}")
-        return name, page_content, True
+        try:
+            parsed = _parse_json(raw)
+            brief = parsed.get("brief", "")
+            content = parsed.get("content", raw)
+        except (json.JSONDecodeError, ValueError):
+            brief, content = "", raw
+        return name, content, True, brief
 
     tasks = []
     tasks.extend(_gen_create(c) for c in create_items)
     tasks.extend(_gen_update(c) for c in update_items)
 
     concept_names: list[str] = []
+    concept_briefs_map: dict[str, str] = {}
 
     if tasks:
         total = len(tasks)
@@ -603,9 +617,11 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool]:
             if isinstance(r, Exception):
                 logger.warning("Concept generation failed: %s", r)
                 continue
-            name, page_content, is_update = r
-            _write_concept(wiki_dir, name, page_content, source_file, is_update)
+            name, page_content, is_update, brief = r
+            _write_concept(wiki_dir, name, page_content, source_file, is_update, brief=brief)
             concept_names.append(name)
+            if brief:
+                concept_briefs_map[name] = brief
 
     # --- Step 3b: Process related items (code only, no LLM) ---
     for slug in related_items:
@@ -618,7 +634,8 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool]:
         _backlink_concepts(wiki_dir, doc_name, all_concept_slugs)
 
     # --- Step 4: Update index (code only) ---
-    _update_index(wiki_dir, doc_name, concept_names)
+    _update_index(wiki_dir, doc_name, concept_names,
+                  doc_brief=doc_brief, concept_briefs=concept_briefs_map)
 
 
 async def compile_short_doc(
@@ -653,13 +670,20 @@ async def compile_short_doc(
     )}
 
     # --- Step 1: Generate summary ---
-    summary = _llm_call(model, [system_msg, doc_msg], "summary")
-    _write_summary(wiki_dir, doc_name, source_file, summary)
+    summary_raw = _llm_call(model, [system_msg, doc_msg], "summary")
+    try:
+        summary_parsed = _parse_json(summary_raw)
+        doc_brief = summary_parsed.get("brief", "")
+        summary = summary_parsed.get("content", summary_raw)
+    except (json.JSONDecodeError, ValueError):
+        doc_brief = ""
+        summary = summary_raw
+    _write_summary(wiki_dir, doc_name, source_file, summary, brief=doc_brief)
 
     # --- Steps 2-4: Concept plan → generate/update → index ---
     await _compile_concepts(
         wiki_dir, kb_dir, model, system_msg, doc_msg,
-        summary, doc_name, max_concurrency,
+        summary, doc_name, max_concurrency, doc_brief=doc_brief,
     )
 
 
@@ -669,6 +693,7 @@ async def compile_long_doc(
     doc_id: str,
     kb_dir: Path,
     model: str,
+    doc_description: str = "",
     max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
 ) -> None:
     """Compile a long (PageIndex) document's concepts and index.
@@ -700,5 +725,5 @@ async def compile_long_doc(
     # --- Steps 2-4: Concept plan → generate/update → index ---
     await _compile_concepts(
         wiki_dir, kb_dir, model, system_msg, doc_msg,
-        overview, doc_name, max_concurrency,
+        overview, doc_name, max_concurrency, doc_brief=doc_description,
     )
diff --git a/openkb/cli.py b/openkb/cli.py
index 149f391..d8ec0fd 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -132,7 +132,8 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
         for attempt in range(2):
             try:
                 asyncio.run(
-                    compile_long_doc(doc_name, summary_path, index_result.doc_id, kb_dir, model)
+                    compile_long_doc(doc_name, summary_path, index_result.doc_id, kb_dir, model,
+                                     doc_description=index_result.description)
                 )
                 break
             except Exception as exc:
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index 085116d..bbb6259 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -487,13 +487,19 @@ async def test_full_pipeline(self, tmp_path):
         (tmp_path / "raw").mkdir()
         (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake")
 
-        summary_response = "# Summary\n\nThis document discusses transformers."
+        summary_response = json.dumps({
+            "brief": "Discusses transformers",
+            "content": "# Summary\n\nThis document discusses transformers.",
+        })
         concepts_list_response = json.dumps({
             "create": [{"name": "transformer", "title": "Transformer"}],
             "update": [],
             "related": [],
         })
-        concept_page_response = "# Transformer\n\nA neural network architecture."
+        concept_page_response = json.dumps({
+            "brief": "NN architecture using self-attention",
+            "content": "# Transformer\n\nA neural network architecture.",
+        })
 
         with patch("openkb.agent.compiler.litellm") as mock_litellm:
             mock_litellm.completion = MagicMock(
@@ -534,7 +540,7 @@ async def test_handles_bad_json(self, tmp_path):
 
         with patch("openkb.agent.compiler.litellm") as mock_litellm:
             mock_litellm.completion = MagicMock(
-                side_effect=_mock_completion(["Summary text", "not valid json"])
+                side_effect=_mock_completion(["Plain summary text", "not valid json"])
             )
             # Should not raise
             await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini")
@@ -567,7 +573,10 @@ async def test_full_pipeline(self, tmp_path):
             "update": [],
             "related": [],
         })
-        concept_page_response = "# Deep Learning\n\nA subfield of ML."
+        concept_page_response = json.dumps({
+            "brief": "Subfield of ML using neural networks",
+            "content": "# Deep Learning\n\nA subfield of ML.",
+        })
 
         with patch("openkb.agent.compiler.litellm") as mock_litellm:
             mock_litellm.completion = MagicMock(
@@ -624,8 +633,14 @@ async def test_create_and_update_flow(self, tmp_path):
             "update": [{"name": "attention", "title": "Attention"}],
             "related": [],
         })
-        create_page_response = "# Flash Attention\n\nAn efficient attention algorithm."
-        update_page_response = "# Attention\n\nUpdated content with new info."
+        create_page_response = json.dumps({
+            "brief": "Efficient attention algorithm",
+            "content": "# Flash Attention\n\nAn efficient attention algorithm.",
+        })
+        update_page_response = json.dumps({
+            "brief": "Updated attention mechanism",
+            "content": "# Attention\n\nUpdated content with new info.",
+        })
 
         system_msg = {"role": "system", "content": "You are a wiki agent."}
         doc_msg = {"role": "user", "content": "Document about attention mechanisms."}
@@ -720,7 +735,10 @@ async def test_fallback_list_format(self, tmp_path):
         plan_response = json.dumps([
             {"name": "attention", "title": "Attention"},
         ])
-        concept_page_response = "# Attention\n\nA mechanism for focusing."
+        concept_page_response = json.dumps({
+            "brief": "A mechanism for focusing",
+            "content": "# Attention\n\nA mechanism for focusing.",
+        })
 
         system_msg = {"role": "system", "content": "You are a wiki agent."}
         doc_msg = {"role": "user", "content": "Document content."}
@@ -744,3 +762,57 @@ async def test_fallback_list_format(self, tmp_path):
         att_text = att_path.read_text()
         assert "sources: [test-doc.pdf]" in att_text
         assert "Attention" in att_text
+
+
+class TestBriefIntegration:
+    @pytest.mark.asyncio
+    async def test_short_doc_briefs_in_index_and_frontmatter(self, tmp_path):
+        wiki = tmp_path / "wiki"
+        (wiki / "sources").mkdir(parents=True)
+        (wiki / "summaries").mkdir(parents=True)
+        (wiki / "concepts").mkdir(parents=True)
+        (wiki / "index.md").write_text(
+            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
+            encoding="utf-8",
+        )
+        source_path = wiki / "sources" / "test-doc.md"
+        source_path.write_text("# Test Doc\n\nContent.", encoding="utf-8")
+        (tmp_path / ".openkb").mkdir()
+        (tmp_path / "raw").mkdir()
+        (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake")
+
+        summary_resp = json.dumps({
+            "brief": "A paper about transformers",
+            "content": "# Summary\n\nThis paper discusses transformers.",
+        })
+        plan_resp = json.dumps({
+            "create": [{"name": "transformer", "title": "Transformer"}],
+            "update": [],
+            "related": [],
+        })
+        concept_resp = json.dumps({
+            "brief": "NN architecture using self-attention",
+            "content": "# Transformer\n\nA neural network architecture.",
+        })
+
+        with patch("openkb.agent.compiler.litellm") as mock_litellm:
+            mock_litellm.completion = MagicMock(
+                side_effect=_mock_completion([summary_resp, plan_resp])
+            )
+            mock_litellm.acompletion = AsyncMock(
+                side_effect=_mock_acompletion([concept_resp])
+            )
+            await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini")
+
+        # Summary frontmatter has brief
+        summary_text = (wiki / "summaries" / "test-doc.md").read_text()
+        assert "brief: A paper about transformers" in summary_text
+
+        # Concept frontmatter has brief
+        concept_text = (wiki / "concepts" / "transformer.md").read_text()
+        assert "brief: NN architecture using self-attention" in concept_text
+
+        # Index has briefs
+        index_text = (wiki / "index.md").read_text()
+        assert "— A paper about transformers" in index_text
+        assert "— NN architecture using self-attention" in index_text

From cc6215a23b93beaab0be3509c39d0eb4760ef519 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 23:42:39 +0800
Subject: [PATCH 14/44] feat: store long doc sources as per-page JSON, remove
 render_source_md

Replace markdown source generation with per-page JSON from PageIndex
get_page_content; remove render_source_md, _render_nodes_source,
_relocate_images, and _IMG_REF_RE. Image relocation is now done inline
per page. Update tests to assert .json output and mock get_page_content.
---
 openkb/indexer.py       | 57 +++++++++++++++++------------------------
 openkb/tree_renderer.py | 31 ----------------------
 tests/test_indexer.py   | 24 ++++++++++++-----
 3 files changed, 41 insertions(+), 71 deletions(-)

diff --git a/openkb/indexer.py b/openkb/indexer.py
index 18aafc6..c8b81f3 100644
--- a/openkb/indexer.py
+++ b/openkb/indexer.py
@@ -1,8 +1,8 @@
 """PageIndex indexer for long documents."""
 from __future__ import annotations
 
+import json as json_mod
 import logging
-import re
 import shutil
 from dataclasses import dataclass
 from pathlib import Path
@@ -12,12 +12,10 @@
 from pageindex import IndexConfig, PageIndexClient
 
 from openkb.config import load_config
-from openkb.tree_renderer import render_source_md, render_summary_md
+from openkb.tree_renderer import render_summary_md
 
 logger = logging.getLogger(__name__)
 
-_IMG_REF_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
-
 
 @dataclass
 class IndexResult:
@@ -28,31 +26,6 @@ class IndexResult:
     tree: dict
 
 
-def _relocate_images(markdown: str, doc_stem: str, dest_images_dir: Path) -> str:
-    """Copy images from PageIndex internal paths to wiki/sources/images/ and rewrite refs.
-
-    PageIndex stores images internally (e.g. .openkb/files/{collection}/{doc_id}/images/).
-    We copy them to dest_images_dir and rewrite paths to be relative to the .md file
-    (i.e. images/{doc_stem}/filename).
-    """
-    dest_images_dir.mkdir(parents=True, exist_ok=True)
-
-    def _replace(match: re.Match) -> str:
-        alt = match.group(1)
-        src_path_str = match.group(2)
-        src_path = Path(src_path_str)
-        if not src_path.exists():
-            logger.warning("Image not found: %s", src_path)
-            return match.group(0)
-        filename = src_path.name
-        dest = dest_images_dir / filename
-        if not dest.exists():
-            shutil.copy2(src_path, dest)
-        return f"![{alt}](images/{doc_stem}/{filename})"
-
-    return _IMG_REF_RE.sub(_replace, markdown)
-
-
 def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
     """Index a long PDF document using PageIndex and write wiki pages."""
     openkb_dir = kb_dir / ".openkb"
@@ -100,14 +73,30 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
         "structure": structure,
     }
 
-    # Write wiki/sources/ — copy images from PageIndex internal location
-    # and rewrite paths to be relative to the .md file (images/{stem}/filename)
+    # Write wiki/sources/ — get per-page content from PageIndex and store as JSON
     sources_dir = kb_dir / "wiki" / "sources"
     sources_dir.mkdir(parents=True, exist_ok=True)
     dest_images_dir = sources_dir / "images" / pdf_path.stem
-    source_md = render_source_md(tree, doc_name, doc_id)
-    source_md = _relocate_images(source_md, pdf_path.stem, dest_images_dir)
-    (sources_dir / f"{pdf_path.stem}.md").write_text(source_md, encoding="utf-8")
+
+    # Get per-page content from PageIndex
+    all_pages = col.get_page_content(doc_id, f"1-{doc.get('page_count', 9999)}")
+
+    # Relocate image paths in each page
+    dest_images_dir.mkdir(parents=True, exist_ok=True)
+    for page in all_pages:
+        if "images" in page:
+            for img in page["images"]:
+                src_path = Path(img["path"])
+                if src_path.exists():
+                    filename = src_path.name
+                    dest = dest_images_dir / filename
+                    if not dest.exists():
+                        shutil.copy2(src_path, dest)
+                    img["path"] = f"images/{pdf_path.stem}/{filename}"
+
+    (sources_dir / f"{pdf_path.stem}.json").write_text(
+        json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",
+    )
 
     # Write wiki/summaries/ (no images, just summaries)
     summaries_dir = kb_dir / "wiki" / "summaries"
diff --git a/openkb/tree_renderer.py b/openkb/tree_renderer.py
index f745b48..991434e 100644
--- a/openkb/tree_renderer.py
+++ b/openkb/tree_renderer.py
@@ -13,25 +13,6 @@ def _yaml_frontmatter(source_name: str, doc_id: str) -> str:
     )
 
 
-def _render_nodes_source(nodes: list[dict], depth: int) -> str:
-    """Recursively render nodes for the *source* view (text content)."""
-    lines: list[str] = []
-    heading_prefix = "#" * min(depth, 6)
-    for node in nodes:
-        title = node.get("title", "")
-        start = node.get("start_index", "")
-        end = node.get("end_index", "")
-        text = node.get("text", "")
-        children = node.get("nodes", [])
-
-        lines.append(f"{heading_prefix} {title} (pages {start}\u2013{end})\n")
-        if text:
-            lines.append(f"{text}\n")
-        if children:
-            lines.append(_render_nodes_source(children, depth + 1))
-
-    return "\n".join(lines)
-
 
 def _render_nodes_summary(nodes: list[dict], depth: int) -> str:
     """Recursively render nodes for the *summary* view (summaries only)."""
@@ -53,18 +34,6 @@ def _render_nodes_summary(nodes: list[dict], depth: int) -> str:
     return "\n".join(lines)
 
 
-def render_source_md(tree: dict, source_name: str, doc_id: str) -> str:
-    """Render the full-text (source) Markdown page for a PageIndex tree.
-
-    The page begins with YAML frontmatter, then recursively renders
-    every node as a heading with its ``(pages X–Y)`` range and full text.
-    Heading level equals tree depth (h1 at root), capped at h6.
-    """
-    frontmatter = _yaml_frontmatter(source_name, doc_id)
-    structure = tree.get("structure", [])
-    body = _render_nodes_source(structure, depth=1)
-    return frontmatter + "\n" + body
-
 
 def render_summary_md(tree: dict, source_name: str, doc_id: str) -> str:
     """Render the summary Markdown page for a PageIndex tree.
diff --git a/tests/test_indexer.py b/tests/test_indexer.py
index e35c969..0948d64 100644
--- a/tests/test_indexer.py
+++ b/tests/test_indexer.py
@@ -23,6 +23,9 @@ def _make_fake_collection(self, doc_id: str, sample_tree: dict):
             "doc_type": "pdf",
             "structure": sample_tree["structure"],
         }
+
+        # get_page_content returns empty list by default (overridden per test as needed)
+        col.get_page_content.return_value = []
         return col
 
     def test_returns_index_result(self, kb_dir, sample_tree, tmp_path):
@@ -43,12 +46,19 @@ def test_returns_index_result(self, kb_dir, sample_tree, tmp_path):
         assert result.description == sample_tree["doc_description"]
         assert result.tree is not None
 
-    def test_source_page_written(self, kb_dir, sample_tree, tmp_path):
+    def test_source_page_written_as_json(self, kb_dir, sample_tree, tmp_path):
+        """Long doc source should be written as JSON, not markdown."""
+        import json as json_mod
         doc_id = "abc-123"
         fake_col = self._make_fake_collection(doc_id, sample_tree)
 
         fake_client = MagicMock()
         fake_client.collection.return_value = fake_col
+        # Mock get_page_content to return page data
+        fake_col.get_page_content.return_value = [
+            {"page": 1, "content": "Page one text."},
+            {"page": 2, "content": "Page two text."},
+        ]
 
         pdf_path = tmp_path / "sample.pdf"
         pdf_path.write_bytes(b"%PDF-1.4 fake")
@@ -56,11 +66,13 @@ def test_source_page_written(self, kb_dir, sample_tree, tmp_path):
         with patch("openkb.indexer.PageIndexClient", return_value=fake_client):
             index_long_document(pdf_path, kb_dir)
 
-        source_file = kb_dir / "wiki" / "sources" / "sample.md"
-        assert source_file.exists()
-        content = source_file.read_text(encoding="utf-8")
-        assert "type: pageindex" in content
-        assert "Introduction" in content
+        json_file = kb_dir / "wiki" / "sources" / "sample.json"
+        assert json_file.exists()
+        assert not (kb_dir / "wiki" / "sources" / "sample.md").exists()
+        data = json_mod.loads(json_file.read_text())
+        assert len(data) == 2
+        assert data[0]["page"] == 1
+        assert data[0]["content"] == "Page one text."
 
     def test_summary_page_written(self, kb_dir, sample_tree, tmp_path):
         doc_id = "abc-123"

From 49afbdb508281e987c9a95422abd604500c9a3d3 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 23:43:09 +0800
Subject: [PATCH 15/44] feat: replace pageindex_retrieve with get_page_content,
 unify query for all docs

Remove _pageindex_retrieve_impl and the pageindex_retrieve tool; add
get_page_content_tool that uses the local JSON-based page store for all
long documents. Update instructions and schema description accordingly.
---
 openkb/agent/query.py | 167 +++++-------------------------------------
 openkb/schema.py      |   2 +-
 tests/test_query.py   | 102 +++-----------------------
 3 files changed, 32 insertions(+), 239 deletions(-)

diff --git a/openkb/agent/query.py b/openkb/agent/query.py
index 6a740fb..051d8e7 100644
--- a/openkb/agent/query.py
+++ b/openkb/agent/query.py
@@ -3,11 +3,7 @@
 
 from pathlib import Path
 
-import litellm
 from agents import Agent, Runner, function_tool
-import os
-
-from pageindex import PageIndexClient
 
 from openkb.agent.tools import list_wiki_files, read_wiki_file
 from openkb.schema import SCHEMA_MD, get_agents_md
@@ -18,11 +14,14 @@
 {schema_md}
 
 ## Search strategy
-1. Start by reading index.md to understand what documents and concepts are available.
-2. Read relevant summary pages (summaries/) to get document overviews.
+1. Read index.md to understand what documents and concepts are available.
+   Each entry has a brief summary to help you judge relevance.
+2. Read relevant summary pages (summaries/) for document overviews.
 3. Read concept pages (concepts/) for cross-document synthesis.
-4. For long documents indexed with PageIndex, call pageindex_retrieve with the
-   document ID and the user's question to get detailed page-level content.
+4. For long documents, use get_page_content(doc_name, pages) to read
+   specific pages when you need detailed content. The summary page
+   shows chapter structure with page ranges to help you decide which
+   pages to read.
 5. Synthesise a clear, well-cited answer.
 
 Always ground your answer in the wiki content. If you cannot find relevant
@@ -30,132 +29,8 @@
 """
 
 
-def _pageindex_retrieve_impl(doc_id: str, question: str, openkb_dir: str, model: str) -> str:
-    """Retrieve relevant content from a long document via PageIndex.
-
-    For cloud-indexed docs: delegates to col.query() directly.
-    For local docs: uses structure-based page selection + get_page_content.
-    """
-    pageindex_api_key = os.environ.get("PAGEINDEX_API_KEY", "")
-    # Determine if this doc was cloud-indexed (cloud doc_ids have "pi-" prefix)
-    is_cloud_doc = doc_id.startswith("pi-")
-
-    if is_cloud_doc:
-        # Cloud doc: use PageIndex streaming query (avoids timeout, shows progress)
-        import sys
-        import asyncio
-        import threading
-
-        client = PageIndexClient(api_key=pageindex_api_key or None, model=model)
-        col = client.collection()
-        try:
-            stream = col.query(question, doc_ids=[doc_id], stream=True)
-            collected: list[str] = []
-            done = threading.Event()
-
-            async def _consume():
-                try:
-                    async for event in stream:
-                        if event.type == "answer_delta":
-                            sys.stdout.write(event.data)
-                            sys.stdout.flush()
-                            collected.append(event.data)
-                        elif event.type == "tool_call":
-                            name = event.data.get("name", "")
-                            args = event.data.get("args", "")
-                            sys.stdout.write(f"\n  [PageIndex] {name}({args})\n")
-                            sys.stdout.flush()
-                    sys.stdout.write("\n")
-                    sys.stdout.flush()
-                finally:
-                    done.set()
-
-            # Run streaming in a separate thread with its own event loop
-            def _run():
-                loop = asyncio.new_event_loop()
-                loop.run_until_complete(_consume())
-                loop.close()
-
-            t = threading.Thread(target=_run, daemon=True)
-            t.start()
-            t.join(timeout=120)
-            return "".join(collected) if collected else "No answer from PageIndex."
-        except Exception as exc:
-            return f"Error querying cloud PageIndex: {exc}"
-
-    # Local doc: use local PageIndex with structure-based retrieval
-    client = PageIndexClient(model=model, storage_path=openkb_dir)
-    col = client.collection()
-
-    try:
-        structure = col.get_document_structure(doc_id)
-    except Exception as exc:
-        return f"Error retrieving document structure: {exc}"
-
-    if not structure:
-        return "No structure found for document."
-    sections = []
-    for idx, node in enumerate(structure):
-        title = node.get("title", f"Section {idx + 1}")
-        node_id = node.get("node_id", str(idx))
-        summary = node.get("summary", "")
-        start = node.get("start_index", idx)
-        end = node.get("end_index", idx)
-        sections.append(
-            f"node_id={node_id} title='{title}' pages={start}-{end} summary='{summary}'"
-        )
-
-    sections_text = "\n".join(sections)
-    prompt = (
-        f"Given the following document sections:\n{sections_text}\n\n"
-        f"Which page ranges are most relevant to this question: '{question}'?\n"
-        "Reply with a comma-separated list of page numbers or ranges (e.g. '1-3,7,10-12'). "
-        "Return ONLY the page specification, nothing else."
-    )
-
-    # 2. Ask LLM which pages are relevant
-    try:
-        response = litellm.completion(
-            model=model,
-            messages=[{"role": "user", "content": prompt}],
-        )
-        page_spec = response.choices[0].message.content.strip()
-    except Exception as exc:
-        return f"Error selecting relevant pages: {exc}"
-
-    if not page_spec:
-        return "Could not determine relevant pages."
-
-    # 3. Fetch those pages
-    try:
-        pages = col.get_page_content(doc_id, page_spec)
-    except Exception as exc:
-        return f"Error fetching page content: {exc}"
-
-    if not pages:
-        return f"No content found for pages: {page_spec}"
-
-    parts = []
-    for item in pages:
-        page_num = item.get("page_index", "?")
-        text = item.get("text", "")
-        parts.append(f"[Page {page_num}]\n{text}")
-
-    return "\n\n".join(parts)
-
-
-def build_query_agent(wiki_root: str, openkb_dir: str, model: str, language: str = "en") -> Agent:
-    """Build and return the Q&A agent.
-
-    Args:
-        wiki_root: Absolute path to the wiki directory.
-        openkb_dir: Path to the .openkb/ state directory.
-        model: LLM model name.
-        language: Language code for wiki content (e.g. 'en', 'fr').
-
-    Returns:
-        Configured :class:`~agents.Agent` instance.
-    """
+def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent:
+    """Build and return the Q&A agent."""
     schema_md = get_agents_md(Path(wiki_root))
     instructions = _QUERY_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md)
     instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language."
@@ -163,7 +38,6 @@ def build_query_agent(wiki_root: str, openkb_dir: str, model: str, language: str
     @function_tool
     def list_files(directory: str) -> str:
         """List all Markdown files in a wiki subdirectory.
-
         Args:
             directory: Subdirectory path relative to wiki root (e.g. 'sources').
         """
@@ -172,31 +46,29 @@ def list_files(directory: str) -> str:
     @function_tool
     def read_file(path: str) -> str:
         """Read a Markdown file from the wiki.
-
         Args:
             path: File path relative to wiki root (e.g. 'summaries/paper.md').
         """
         return read_wiki_file(path, wiki_root)
 
     @function_tool
-    def pageindex_retrieve(doc_id: str, question: str) -> str:
-        """Retrieve relevant content from a long document via PageIndex.
-
-        Use this when you need detailed content from a document that was
-        indexed with PageIndex (long documents).
-
+    def get_page_content_tool(doc_name: str, pages: str) -> str:
+        """Get text content of specific pages from a long document.
+        Use this when you need detailed content from a document. The summary
+        page shows chapter structure with page ranges.
         Args:
-            doc_id: PageIndex document identifier (found in index.md).
-            question: The question you are trying to answer.
+            doc_name: Document name (e.g. 'attention-is-all-you-need').
+            pages: Page specification (e.g. '3-5,7,10-12').
         """
-        return _pageindex_retrieve_impl(doc_id, question, openkb_dir, model)
+        from openkb.agent.tools import get_page_content
+        return get_page_content(doc_name, pages, wiki_root)
 
     from agents.model_settings import ModelSettings
 
     return Agent(
         name="wiki-query",
         instructions=instructions,
-        tools=[list_files, read_file, pageindex_retrieve],
+        tools=[list_files, read_file, get_page_content_tool],
         model=f"litellm/{model}",
         model_settings=ModelSettings(parallel_tool_calls=False),
     )
@@ -224,9 +96,8 @@ async def run_query(question: str, kb_dir: Path, model: str, stream: bool = Fals
     language: str = config.get("language", "en")
 
     wiki_root = str(kb_dir / "wiki")
-    openkb_path = str(openkb_dir)
 
-    agent = build_query_agent(wiki_root, openkb_path, model, language=language)
+    agent = build_query_agent(wiki_root, model, language=language)
 
     if not stream:
         result = await Runner.run(agent, question)
diff --git a/openkb/schema.py b/openkb/schema.py
index d0fc602..1911e86 100644
--- a/openkb/schema.py
+++ b/openkb/schema.py
@@ -6,7 +6,7 @@
 # Wiki Schema
 
 ## Directory Structure
-- sources/ — Full-text converted from raw documents. Do not modify directly.
+- sources/ — Document content. Short docs as .md, long docs as .json (per-page). Do not modify directly.
 - sources/images/ — Extracted images from documents, referenced by sources.
 - summaries/ — One per source document. Summary of key content.
 - concepts/ — Cross-document topic synthesis. Created when a theme spans multiple documents.
diff --git a/tests/test_query.py b/tests/test_query.py
index 084fc9e..dc14779 100644
--- a/tests/test_query.py
+++ b/tests/test_query.py
@@ -6,119 +6,41 @@
 
 import pytest
 
-from openkb.agent.query import _pageindex_retrieve_impl, build_query_agent, run_query
+from openkb.agent.query import build_query_agent, run_query
 from openkb.schema import SCHEMA_MD
 
 
 class TestBuildQueryAgent:
     def test_agent_name(self, tmp_path):
-        agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini")
+        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
         assert agent.name == "wiki-query"
 
     def test_agent_has_three_tools(self, tmp_path):
-        agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini")
+        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
         assert len(agent.tools) == 3
 
     def test_agent_tool_names(self, tmp_path):
-        agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini")
+        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
         names = {t.name for t in agent.tools}
         assert "list_files" in names
         assert "read_file" in names
-        assert "pageindex_retrieve" in names
+        assert "get_page_content_tool" in names
+        assert "pageindex_retrieve" not in names
 
-    def test_instructions_reference_registered_pageindex_tool(self, tmp_path):
-        agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini")
-        tool_names = {t.name for t in agent.tools}
-        assert "pageindex_retrieve" in agent.instructions
-        assert "pageindex_retrieve" in tool_names
+    def test_instructions_mention_get_page_content(self, tmp_path):
+        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
+        assert "get_page_content" in agent.instructions
+        assert "pageindex_retrieve" not in agent.instructions
 
     def test_schema_in_instructions(self, tmp_path):
-        agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini")
+        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
         assert SCHEMA_MD in agent.instructions
 
     def test_agent_model(self, tmp_path):
-        agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "my-model")
+        agent = build_query_agent(str(tmp_path), "my-model")
         assert agent.model == "litellm/my-model"
 
 
-class TestPageindexRetrieve:
-    def test_returns_page_content(self, tmp_path):
-        mock_structure = [
-            {
-                "node_id": "n1",
-                "title": "Introduction",
-                "start_index": 1,
-                "end_index": 5,
-                "summary": "Overview section",
-            }
-        ]
-        mock_pages = [
-            {"page_index": 1, "text": "Introduction text here."},
-            {"page_index": 2, "text": "More intro content."},
-        ]
-
-        mock_col = MagicMock()
-        mock_col.get_document_structure.return_value = mock_structure
-        mock_col.get_page_content.return_value = mock_pages
-
-        mock_client = MagicMock()
-        mock_client.collection.return_value = mock_col
-
-        with patch("openkb.agent.query.PageIndexClient", return_value=mock_client), \
-             patch("openkb.agent.query.litellm.completion") as mock_llm, \
-             patch.dict("os.environ", {"PAGEINDEX_API_KEY": ""}, clear=False):
-            mock_llm.return_value = MagicMock(
-                choices=[MagicMock(message=MagicMock(content="1-2"))]
-            )
-            result = _pageindex_retrieve_impl("doc123", "What is the intro?", "/db", "gpt-4o-mini")
-
-        assert "Introduction text here." in result
-        assert "More intro content." in result
-
-    def test_cloud_doc_uses_streaming_query(self, tmp_path):
-        """Cloud doc (pi- prefix) delegates to col.query(stream=True)."""
-        from dataclasses import dataclass
-        from typing import Any
-
-        @dataclass
-        class FakeEvent:
-            type: str
-            data: Any
-
-        class FakeStream:
-            async def __aiter__(self):
-                yield FakeEvent(type="answer_delta", data="Cloud ")
-                yield FakeEvent(type="answer_delta", data="answer about MCP.")
-
-        mock_stream = FakeStream()
-
-        mock_col = MagicMock()
-        mock_col.query.return_value = mock_stream
-
-        mock_client = MagicMock()
-        mock_client.collection.return_value = mock_col
-
-        with patch("openkb.agent.query.PageIndexClient", return_value=mock_client):
-            result = _pageindex_retrieve_impl("pi-abc123", "What is MCP?", "/db", "gpt-4o-mini")
-
-        assert "Cloud answer about MCP." in result
-        mock_col.query.assert_called_once_with("What is MCP?", doc_ids=["pi-abc123"], stream=True)
-
-    def test_local_empty_structure_returns_error(self, tmp_path):
-        """Local doc with empty structure returns error."""
-        mock_col = MagicMock()
-        mock_col.get_document_structure.return_value = []
-
-        mock_client = MagicMock()
-        mock_client.collection.return_value = mock_col
-
-        with patch("openkb.agent.query.PageIndexClient", return_value=mock_client), \
-             patch.dict("os.environ", {"PAGEINDEX_API_KEY": ""}, clear=False):
-            result = _pageindex_retrieve_impl("local-uuid-123", "What?", "/db", "gpt-4o-mini")
-
-        assert "No structure found" in result
-
-
 class TestRunQuery:
     @pytest.mark.asyncio
     async def test_run_query_returns_final_output(self, tmp_path):

From 8b75b7e3416a06fcb3c9ba39f06daebcd840e65a Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 23:44:15 +0800
Subject: [PATCH 16/44] fix: remove tests for deleted render_source_md

---
 tests/test_tree_renderer.py | 119 +-----------------------------------
 1 file changed, 1 insertion(+), 118 deletions(-)

diff --git a/tests/test_tree_renderer.py b/tests/test_tree_renderer.py
index 1d81b3b..d636d5b 100644
--- a/tests/test_tree_renderer.py
+++ b/tests/test_tree_renderer.py
@@ -3,124 +3,7 @@
 
 import pytest
 
-from openkb.tree_renderer import render_source_md, render_summary_md
-
-
-# ---------------------------------------------------------------------------
-# render_source_md
-# ---------------------------------------------------------------------------
-
-
-class TestRenderSourceMd:
-    def test_has_yaml_frontmatter(self, sample_tree):
-        output = render_source_md(sample_tree, "Sample Document", "doc-abc")
-        assert output.startswith("---\n")
-        assert "source: Sample Document" in output
-        assert "type: pageindex" in output
-        assert "doc_id: doc-abc" in output
-        assert "---\n" in output
-
-    def test_top_level_nodes_are_h1(self, sample_tree):
-        output = render_source_md(sample_tree, "Sample Document", "doc-abc")
-        assert "# Introduction" in output
-        assert "# Conclusion" in output
-
-    def test_nested_nodes_are_h2(self, sample_tree):
-        output = render_source_md(sample_tree, "Sample Document", "doc-abc")
-        assert "## Background" in output
-        assert "## Motivation" in output
-
-    def test_page_range_included(self, sample_tree):
-        output = render_source_md(sample_tree, "Sample Document", "doc-abc")
-        assert "(pages 0–120)" in output  # Introduction
-        assert "(pages 0–60)" in output   # Background
-        assert "(pages 61–120)" in output  # Motivation
-        assert "(pages 121–200)" in output  # Conclusion
-
-    def test_node_text_included(self, sample_tree):
-        output = render_source_md(sample_tree, "Sample Document", "doc-abc")
-        assert "This document introduces the core concepts of the system." in output
-        assert "Background information on the subject." in output
-
-    def test_no_summary_in_source(self, sample_tree):
-        output = render_source_md(sample_tree, "Sample Document", "doc-abc")
-        # Source pages show text, not summaries
-        assert "Summary:" not in output
-
-    def test_heading_depth_capped_at_6(self):
-        """Deeply nested nodes must not exceed h6."""
-        deep_tree = {
-            "doc_name": "Deep",
-            "doc_description": "A deeply nested doc.",
-            "structure": [
-                {
-                    "title": "L1",
-                    "start_index": 0,
-                    "end_index": 10,
-                    "text": "L1 text",
-                    "summary": "L1 summary",
-                    "nodes": [
-                        {
-                            "title": "L2",
-                            "start_index": 0,
-                            "end_index": 5,
-                            "text": "L2 text",
-                            "summary": "L2 summary",
-                            "nodes": [
-                                {
-                                    "title": "L3",
-                                    "start_index": 0,
-                                    "end_index": 3,
-                                    "text": "L3 text",
-                                    "summary": "L3 summary",
-                                    "nodes": [
-                                        {
-                                            "title": "L4",
-                                            "start_index": 0,
-                                            "end_index": 1,
-                                            "text": "L4 text",
-                                            "summary": "L4 summary",
-                                            "nodes": [
-                                                {
-                                                    "title": "L5",
-                                                    "start_index": 0,
-                                                    "end_index": 1,
-                                                    "text": "L5 text",
-                                                    "summary": "L5 summary",
-                                                    "nodes": [
-                                                        {
-                                                            "title": "L6",
-                                                            "start_index": 0,
-                                                            "end_index": 1,
-                                                            "text": "L6 text",
-                                                            "summary": "L6 summary",
-                                                            "nodes": [
-                                                                {
-                                                                    "title": "L7",
-                                                                    "start_index": 0,
-                                                                    "end_index": 1,
-                                                                    "text": "L7 text",
-                                                                    "summary": "L7 summary",
-                                                                    "nodes": [],
-                                                                }
-                                                            ],
-                                                        }
-                                                    ],
-                                                }
-                                            ],
-                                        }
-                                    ],
-                                }
-                            ],
-                        }
-                    ],
-                }
-            ],
-        }
-        output = render_source_md(deep_tree, "Deep", "doc-deep")
-        # L7 is at depth 7 — must render as h6, not h7
-        assert "#######" not in output
-        assert "L7 text" in output
+from openkb.tree_renderer import render_summary_md
 
 
 # ---------------------------------------------------------------------------

From 36ae619cfb72c48b3954ae93e674c97ca97dde67 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Thu, 9 Apr 2026 23:45:01 +0800
Subject: [PATCH 17/44] chore: remove dead references to render_source_md

---
 openkb/tree_renderer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/openkb/tree_renderer.py b/openkb/tree_renderer.py
index 991434e..6770d7e 100644
--- a/openkb/tree_renderer.py
+++ b/openkb/tree_renderer.py
@@ -38,8 +38,7 @@ def _render_nodes_summary(nodes: list[dict], depth: int) -> str:
 def render_summary_md(tree: dict, source_name: str, doc_id: str) -> str:
     """Render the summary Markdown page for a PageIndex tree.
 
-    Identical structure to :func:`render_source_md` but replaces node text
-    with ``Summary: {summary}`` for each node.
+    Renders each node as a heading with page range and its summary text.
     """
     frontmatter = _yaml_frontmatter(source_name, doc_id)
     structure = tree.get("structure", [])

From 27a9e3a89e59d443dfc8487e94a3bd9770a24b11 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Fri, 10 Apr 2026 00:19:20 +0800
Subject: [PATCH 18/44] fix: change default model to gpt-5.4-mini, fix
 page_count fallback in indexer

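- Default model changed from gpt-5.4 to gpt-5.4-mini
- Indexer get_page_content no longer uses hardcoded 9999 fallback
- Infers page_count from the structure's max end_index when doc lacks a
  page_count field (falls back to 100 if the structure has no end_index)
- Added debug logging for doc keys and page_count diagnosis
- Added global config helpers in config.py (load_global_config,
  save_global_config, register_kb) backed by ~/.config/openkb/global.yaml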
---
 openkb/config.py  | 32 +++++++++++++++++++++++++++++++-
 openkb/indexer.py | 18 ++++++++++++++++--
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/openkb/config.py b/openkb/config.py
index fbd7bca..b83e134 100644
--- a/openkb/config.py
+++ b/openkb/config.py
@@ -6,11 +6,14 @@
 import yaml
 
 DEFAULT_CONFIG: dict[str, Any] = {
-    "model": "gpt-5.4",
+    "model": "gpt-5.4-mini",
     "language": "en",
     "pageindex_threshold": 20,
 }
 
+GLOBAL_CONFIG_DIR = Path.home() / ".config" / "openkb"
+GLOBAL_CONFIG_PATH = GLOBAL_CONFIG_DIR / "global.yaml"
+
 
 def load_config(config_path: Path) -> dict[str, Any]:
     """Load YAML config from config_path, merged with DEFAULT_CONFIG.
@@ -30,3 +33,30 @@ def save_config(config_path: Path, config: dict) -> None:
     config_path.parent.mkdir(parents=True, exist_ok=True)
     with config_path.open("w", encoding="utf-8") as fh:
         yaml.safe_dump(config, fh, allow_unicode=True, sort_keys=True)
+
+
+def load_global_config() -> dict[str, Any]:
+    """Load the global config from ~/.config/openkb/global.yaml."""
+    if GLOBAL_CONFIG_PATH.exists():
+        with GLOBAL_CONFIG_PATH.open("r", encoding="utf-8") as fh:
+            return yaml.safe_load(fh) or {}
+    return {}
+
+
+def save_global_config(config: dict[str, Any]) -> None:
+    """Save the global config to ~/.config/openkb/global.yaml."""
+    GLOBAL_CONFIG_DIR.mkdir(parents=True, exist_ok=True)
+    with GLOBAL_CONFIG_PATH.open("w", encoding="utf-8") as fh:
+        yaml.safe_dump(config, fh, allow_unicode=True, sort_keys=True)
+
+
+def register_kb(kb_path: Path) -> None:
+    """Register a KB path in the global config's known_kbs list."""
+    gc = load_global_config()
+    known = gc.get("known_kbs", [])
+    resolved = str(kb_path.resolve())
+    if resolved not in known:
+        known.append(resolved)
+        gc["known_kbs"] = known
+    gc["default_kb"] = resolved
+    save_global_config(gc)
diff --git a/openkb/indexer.py b/openkb/indexer.py
index c8b81f3..8cd6913 100644
--- a/openkb/indexer.py
+++ b/openkb/indexer.py
@@ -67,6 +67,10 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
     description: str = doc.get("doc_description", "")
     structure: list = doc.get("structure", [])
 
+    # Debug: print doc keys and page_count to diagnose get_page_content range
+    logger.info("Doc keys: %s", list(doc.keys()))
+    logger.info("page_count from doc: %s", doc.get("page_count", "NOT PRESENT"))
+
     tree = {
         "doc_name": doc_name,
         "doc_description": description,
@@ -78,8 +82,18 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
     sources_dir.mkdir(parents=True, exist_ok=True)
     dest_images_dir = sources_dir / "images" / pdf_path.stem
 
-    # Get per-page content from PageIndex
-    all_pages = col.get_page_content(doc_id, f"1-{doc.get('page_count', 9999)}")
+    # Get per-page content from PageIndex — use actual page count
+    page_count = doc.get("page_count")
+    if page_count is None:
+        # Fallback: count pages from structure's max end_index
+        max_page = 0
+        for node in structure:
+            end = node.get("end_index", 0)
+            if end > max_page:
+                max_page = end
+        page_count = max_page if max_page > 0 else 100
+        logger.info("page_count not in doc, inferred from structure: %d", page_count)
+    all_pages = col.get_page_content(doc_id, f"1-{page_count}")
 
     # Relocate image paths in each page
     dest_images_dir.mkdir(parents=True, exist_ok=True)

From 0bc0b441497c4d32cfe1c427dc39c9e943874d51 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Fri, 10 Apr 2026 00:40:09 +0800
Subject: [PATCH 19/44] feat: doc type in index.md, remove list_files from
 query agent, source backlink for short docs

- index.md document entries now show a (short) or (pageindex) type marker
  (concept entries are unchanged)
- Query agent prompt updated: guides agent to read sources for detail
- Removed list_files tool from query agent (index.md is sufficient)
- Short doc summaries now have source_doc frontmatter linking to sources/
- Reverted list_wiki_files to only list .md files
- Fixed tests for model name change and agent tool count
---
 openkb/agent/compiler.py | 26 ++++++++++++++++++++------
 openkb/agent/query.py    | 34 +++++++++++++---------------------
 tests/test_compiler.py   |  2 +-
 tests/test_config.py     |  2 +-
 tests/test_query.py      |  6 +++---
 5 files changed, 38 insertions(+), 32 deletions(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index b4b549e..f79e0f4 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -289,13 +289,20 @@ def _find_source_filename(doc_name: str, kb_dir: Path) -> str:
     return f"{doc_name}.pdf"
 
 
-def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str, brief: str = "") -> None:
-    """Write summary page with frontmatter."""
+def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str,
+                    brief: str = "", doc_type: str = "short") -> None:
+    """Write summary page with frontmatter.
+
+    For short docs, includes a ``source_doc`` field linking to the full
+    source text in ``sources/{doc_name}.md``.
+    """
     summaries_dir = wiki_dir / "summaries"
     summaries_dir.mkdir(parents=True, exist_ok=True)
     fm_lines = [f"sources: [{source_file}]"]
     if brief:
         fm_lines.append(f"brief: {brief}")
+    if doc_type == "short":
+        fm_lines.append(f"source_doc: sources/{doc_name}.md")
     frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n"
     (summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8")
 
@@ -442,12 +449,15 @@ def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str])
 def _update_index(
     wiki_dir: Path, doc_name: str, concept_names: list[str],
     doc_brief: str = "", concept_briefs: dict[str, str] | None = None,
+    doc_type: str = "short",
 ) -> None:
     """Append document and concept entries to index.md.
 
     When ``doc_brief`` or entries in ``concept_briefs`` are provided, entries
-    are written as ``- [[link]] — brief text``.  Existing entries are detected
-    by the link part only, so updating a brief on a re-compile works correctly.
+    are written as ``- [[link]] (type) — brief text``.  Existing entries are
+    detected by the link part only, so updating a brief on a re-compile works.
+    ``doc_type`` is ``"short"`` or ``"pageindex"`` — shown in the entry so the
+    query agent knows how to access detailed content.
     """
     if concept_briefs is None:
         concept_briefs = {}
@@ -463,7 +473,7 @@ def _update_index(
 
     doc_link = f"[[summaries/{doc_name}]]"
     if doc_link not in text:
-        doc_entry = f"- {doc_link}"
+        doc_entry = f"- {doc_link} ({doc_type})"
         if doc_brief:
             doc_entry += f" — {doc_brief}"
         if "## Documents" in text:
@@ -498,6 +508,7 @@ async def _compile_concepts(
     doc_name: str,
     max_concurrency: int,
     doc_brief: str = "",
+    doc_type: str = "short",
 ) -> None:
     """Shared Steps 2-4: concepts plan → generate/update → index.
 
@@ -635,7 +646,8 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]:
 
     # --- Step 4: Update index (code only) ---
     _update_index(wiki_dir, doc_name, concept_names,
-                  doc_brief=doc_brief, concept_briefs=concept_briefs_map)
+                  doc_brief=doc_brief, concept_briefs=concept_briefs_map,
+                  doc_type=doc_type)
 
 
 async def compile_short_doc(
@@ -684,6 +696,7 @@ async def compile_short_doc(
     await _compile_concepts(
         wiki_dir, kb_dir, model, system_msg, doc_msg,
         summary, doc_name, max_concurrency, doc_brief=doc_brief,
+        doc_type="short",
     )
 
 
@@ -726,4 +739,5 @@ async def compile_long_doc(
     await _compile_concepts(
         wiki_dir, kb_dir, model, system_msg, doc_msg,
         overview, doc_name, max_concurrency, doc_brief=doc_description,
+        doc_type="pageindex",
     )
diff --git a/openkb/agent/query.py b/openkb/agent/query.py
index 051d8e7..134901a 100644
--- a/openkb/agent/query.py
+++ b/openkb/agent/query.py
@@ -5,8 +5,8 @@
 
 from agents import Agent, Runner, function_tool
 
-from openkb.agent.tools import list_wiki_files, read_wiki_file
-from openkb.schema import SCHEMA_MD, get_agents_md
+from openkb.agent.tools import read_wiki_file
+from openkb.schema import get_agents_md
 
 _QUERY_INSTRUCTIONS_TEMPLATE = """\
 You are a knowledge-base Q&A agent. You answer questions by searching the wiki.
@@ -14,18 +14,18 @@
 {schema_md}
 
 ## Search strategy
-1. Read index.md to understand what documents and concepts are available.
-   Each entry has a brief summary to help you judge relevance.
+1. Read index.md to see all documents and concepts with brief summaries.
+   Each document is marked (short) or (pageindex) to indicate its type.
 2. Read relevant summary pages (summaries/) for document overviews.
 3. Read concept pages (concepts/) for cross-document synthesis.
-4. For long documents, use get_page_content(doc_name, pages) to read
-   specific pages when you need detailed content. The summary page
-   shows chapter structure with page ranges to help you decide which
-   pages to read.
-5. Synthesise a clear, well-cited answer.
-
-Always ground your answer in the wiki content. If you cannot find relevant
-information, say so clearly.
+4. When you need detailed source content:
+   - Short documents: read_file("sources/{{doc_name}}.md") for the full text.
+   - PageIndex documents: use get_page_content(doc_name, pages) to read
+     specific pages. The summary page shows chapter structure with page
+     ranges to help you decide which pages to read.
+5. Synthesise a clear, well-cited answer grounded in wiki content.
+
+If you cannot find relevant information, say so clearly.
 """
 
 
@@ -35,14 +35,6 @@ def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent
     instructions = _QUERY_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md)
     instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language."
 
-    @function_tool
-    def list_files(directory: str) -> str:
-        """List all Markdown files in a wiki subdirectory.
-        Args:
-            directory: Subdirectory path relative to wiki root (e.g. 'sources').
-        """
-        return list_wiki_files(directory, wiki_root)
-
     @function_tool
     def read_file(path: str) -> str:
         """Read a Markdown file from the wiki.
@@ -68,7 +60,7 @@ def get_page_content_tool(doc_name: str, pages: str) -> str:
     return Agent(
         name="wiki-query",
         instructions=instructions,
-        tools=[list_files, read_file, get_page_content_tool],
+        tools=[read_file, get_page_content_tool],
         model=f"litellm/{model}",
         model_settings=ModelSettings(parallel_tool_calls=False),
     )
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index bbb6259..b3746d1 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -160,7 +160,7 @@ def test_appends_entries_with_briefs(self, tmp_path):
                        doc_brief="Introduces transformers",
                        concept_briefs={"attention": "Focus mechanism", "transformer": "NN architecture"})
         text = (wiki / "index.md").read_text()
-        assert "[[summaries/my-doc]] — Introduces transformers" in text
+        assert "[[summaries/my-doc]] (short) — Introduces transformers" in text
         assert "[[concepts/attention]] — Focus mechanism" in text
         assert "[[concepts/transformer]] — NN architecture" in text
 
diff --git a/tests/test_config.py b/tests/test_config.py
index 31bd0ab..495e075 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -10,7 +10,7 @@ def test_default_config_keys():
 
 
 def test_default_config_values():
-    assert DEFAULT_CONFIG["model"] == "gpt-5.4"
+    assert DEFAULT_CONFIG["model"] == "gpt-5.4-mini"
     assert DEFAULT_CONFIG["language"] == "en"
     assert DEFAULT_CONFIG["pageindex_threshold"] == 20
 
diff --git a/tests/test_query.py b/tests/test_query.py
index dc14779..8be4cb9 100644
--- a/tests/test_query.py
+++ b/tests/test_query.py
@@ -15,16 +15,16 @@ def test_agent_name(self, tmp_path):
         agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
         assert agent.name == "wiki-query"
 
-    def test_agent_has_three_tools(self, tmp_path):
+    def test_agent_has_two_tools(self, tmp_path):
         agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
-        assert len(agent.tools) == 3
+        assert len(agent.tools) == 2
 
     def test_agent_tool_names(self, tmp_path):
         agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
         names = {t.name for t in agent.tools}
-        assert "list_files" in names
         assert "read_file" in names
         assert "get_page_content_tool" in names
+        assert "list_files" not in names
         assert "pageindex_retrieve" not in names
 
     def test_instructions_mention_get_page_content(self, tmp_path):

From 739c8eb5d507d7a23a71cb2c38cb1dcfe8328634 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Fri, 10 Apr 2026 00:50:30 +0800
Subject: [PATCH 20/44] feat: warn when no LLM API key found instead of failing
 silently

---
 openkb/cli.py | 112 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 90 insertions(+), 22 deletions(-)

diff --git a/openkb/cli.py b/openkb/cli.py
index d8ec0fd..b14f45c 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -19,7 +19,7 @@
 litellm.suppress_debug_info = True
 from dotenv import load_dotenv
 
-from openkb.config import DEFAULT_CONFIG, load_config, save_config
+from openkb.config import DEFAULT_CONFIG, load_config, save_config, load_global_config, register_kb
 from openkb.converter import convert_document
 from openkb.log import append_log
 from openkb.schema import AGENTS_MD
@@ -30,8 +30,11 @@
 def _setup_llm_key(kb_dir: Path | None = None) -> None:
     """Set LiteLLM API key from LLM_API_KEY env var if present.
 
-    If *kb_dir* is given, also loads ``.env`` from the KB root so that
-    the key is found even when the CLI is invoked from another directory.
+    Load order (override=False, so first one wins):
+    1. System environment variables (already set)
+    2. KB-local .env  (kb_dir/.env)
+    3. Global .env    (~/.config/openkb/.env)
+
     Also propagates to provider-specific env vars (OPENAI_API_KEY, etc.)
     so that the Agents SDK litellm provider can pick them up.
     """
@@ -40,8 +43,23 @@ def _setup_llm_key(kb_dir: Path | None = None) -> None:
         if env_file.exists():
             load_dotenv(env_file, override=False)
 
+    from openkb.config import GLOBAL_CONFIG_DIR
+    global_env = GLOBAL_CONFIG_DIR / ".env"
+    if global_env.exists():
+        load_dotenv(global_env, override=False)
+
     api_key = os.environ.get("LLM_API_KEY", "")
-    if api_key:
+    if not api_key:
+        # Check if any provider key is already set
+        has_key = any(os.environ.get(k) for k in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GEMINI_API_KEY"))
+        if not has_key:
+            click.echo(
+                "Warning: No LLM API key found. Set one of:\n"
+                f"  1. {kb_dir / '.env' if kb_dir else '<kb_dir>/.env'} — LLM_API_KEY=sk-...\n"
+                f"  2. {GLOBAL_CONFIG_DIR / '.env'} — LLM_API_KEY=sk-...\n"
+                "  3. Export LLM_API_KEY in your shell profile"
+            )
+    else:
         litellm.api_key = api_key
         for env_var in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GEMINI_API_KEY"):
             if not os.environ.get(env_var):
@@ -74,11 +92,29 @@ def _display_type(raw_type: str) -> str:
 # Helpers
 # ---------------------------------------------------------------------------
 
-def _find_kb_dir() -> Path | None:
-    """Return the knowledge-base root if .openkb/ exists in cwd, else None."""
-    candidate = Path(".openkb")
-    if candidate.exists() and candidate.is_dir():
-        return Path(".")
+def _find_kb_dir(override: Path | None = None) -> Path | None:
+    """Find the KB root: explicit override → walk up from cwd → global default_kb."""
+    # 0. Explicit override (--kb-dir or OPENKB_DIR)
+    if override is not None:
+        if (override / ".openkb").is_dir():
+            return override
+        return None
+    # 1. Walk up from cwd
+    current = Path.cwd().resolve()
+    while True:
+        if (current / ".openkb").is_dir():
+            return current
+        parent = current.parent
+        if parent == current:
+            break
+        current = parent
+    # 2. Fall back to global config default_kb
+    gc = load_global_config()
+    default = gc.get("default_kb")
+    if default:
+        p = Path(default)
+        if (p / ".openkb").is_dir():
+            return p
     return None
 
 
@@ -174,7 +210,9 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
 
 @click.group()
 @click.option("-v", "--verbose", is_flag=True, default=False, help="Enable verbose logging.")
-def cli(verbose):
+@click.option("--kb-dir", "kb_dir_override", default=None, type=click.Path(exists=True, file_okay=False, resolve_path=True), help="Path to a KB root directory (overrides auto-detection).")
+@click.pass_context
+def cli(ctx, verbose, kb_dir_override):
     """OpenKB — Karpathy's LLM Knowledge Base workflow, powered by PageIndex."""
     logging.basicConfig(
         format="%(name)s %(levelname)s: %(message)s",
@@ -182,6 +220,27 @@ def cli(verbose):
     )
     if verbose:
         logging.getLogger("openkb").setLevel(logging.DEBUG)
+    ctx.ensure_object(dict)
+    if kb_dir_override:
+        ctx.obj["kb_dir_override"] = Path(kb_dir_override)
+    else:
+        env_kb = os.environ.get("OPENKB_DIR")
+        if env_kb:
+            ctx.obj["kb_dir_override"] = Path(env_kb).resolve()
+        else:
+            ctx.obj["kb_dir_override"] = None
+
+
+@cli.command()
+@click.argument("path", default=".")
+def use(path):
+    """Set PATH as the default knowledge base."""
+    target = Path(path).resolve()
+    if not (target / ".openkb").is_dir():
+        click.echo(f"Not a knowledge base: {target}")
+        return
+    register_kb(target)
+    click.echo(f"Default KB set to: {target}")
 
 
 @cli.command()
@@ -229,14 +288,18 @@ def init():
     save_config(openkb_dir / "config.yaml", config)
     (openkb_dir / "hashes.json").write_text(json.dumps({}), encoding="utf-8")
 
+    # Register this KB in the global config
+    register_kb(Path.cwd())
+
     click.echo("Knowledge base initialised.")
 
 
 @cli.command()
 @click.argument("path")
-def add(path):
+@click.pass_context
+def add(ctx, path):
     """Add a document or directory of documents at PATH to the knowledge base."""
-    kb_dir = _find_kb_dir()
+    kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override"))
     if kb_dir is None:
         click.echo("No knowledge base found. Run `openkb init` first.")
         return
@@ -272,9 +335,10 @@ def add(path):
 @cli.command()
 @click.argument("question")
 @click.option("--save", is_flag=True, default=False, help="Save the answer to wiki/explorations/.")
-def query(question, save):
+@click.pass_context
+def query(ctx, question, save):
     """Query the knowledge base with QUESTION."""
-    kb_dir = _find_kb_dir()
+    kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override"))
     if kb_dir is None:
         click.echo("No knowledge base found. Run `openkb init` first.")
         return
@@ -307,9 +371,10 @@ def query(question, save):
 
 
 @cli.command()
-def watch():
+@click.pass_context
+def watch(ctx):
     """Watch the raw/ directory for new documents and process them automatically."""
-    kb_dir = _find_kb_dir()
+    kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override"))
     if kb_dir is None:
         click.echo("No knowledge base found. Run `openkb init` first.")
         return
@@ -336,11 +401,12 @@ def on_new_files(paths):
 
 @cli.command()
 @click.option("--fix", is_flag=True, default=False, help="Automatically fix lint issues (not yet implemented).")
-def lint(fix):
+@click.pass_context
+def lint(ctx, fix):
     """Lint the knowledge base for structural and semantic inconsistencies."""
     if fix:
         click.echo("Warning: --fix is not yet implemented. Running lint in report-only mode.")
-    kb_dir = _find_kb_dir()
+    kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override"))
     if kb_dir is None:
         click.echo("No knowledge base found. Run `openkb init` first.")
         return
@@ -379,9 +445,10 @@ def lint(fix):
 
 
 @cli.command(name="list")
-def list_cmd():
+@click.pass_context
+def list_cmd(ctx):
     """List all documents in the knowledge base."""
-    kb_dir = _find_kb_dir()
+    kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override"))
     if kb_dir is None:
         click.echo("No knowledge base found. Run `openkb init` first.")
         return
@@ -439,9 +506,10 @@ def list_cmd():
 
 
 @cli.command()
-def status():
+@click.pass_context
+def status(ctx):
     """Show the current status of the knowledge base."""
-    kb_dir = _find_kb_dir()
+    kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override"))
     if kb_dir is None:
         click.echo("No knowledge base found. Run `openkb init` first.")
         return

From be66e31281fdeae1aa4eede5c606eb22f70e7c81 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Fri, 10 Apr 2026 00:51:05 +0800
Subject: [PATCH 21/44] fix: strengthen query agent instructions to always read
 source content

---
 openkb/agent/query.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/openkb/agent/query.py b/openkb/agent/query.py
index 134901a..0904571 100644
--- a/openkb/agent/query.py
+++ b/openkb/agent/query.py
@@ -18,14 +18,17 @@
    Each document is marked (short) or (pageindex) to indicate its type.
 2. Read relevant summary pages (summaries/) for document overviews.
 3. Read concept pages (concepts/) for cross-document synthesis.
-4. When you need detailed source content:
+4. **Always read source content before answering.** Summaries and concepts
+   are overviews — for accurate, detailed answers you MUST consult sources:
    - Short documents: read_file("sources/{{doc_name}}.md") for the full text.
    - PageIndex documents: use get_page_content(doc_name, pages) to read
      specific pages. The summary page shows chapter structure with page
      ranges to help you decide which pages to read.
-5. Synthesise a clear, well-cited answer grounded in wiki content.
+5. Synthesise a clear, well-cited answer grounded in source content.
 
-If you cannot find relevant information, say so clearly.
+IMPORTANT: Do NOT answer based on summaries alone. Always verify and enrich
+your answer by reading the actual source content. If the question asks about
+details, experiments, specific data, or quotes, reading the source is mandatory.
 """
 
 

From 7b3bc0ca0e5efc8c03a2bcaeb64c8fb1d688bbec Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Fri, 10 Apr 2026 00:53:12 +0800
Subject: [PATCH 22/44] Revert "fix: strengthen query agent instructions to
 always read source content"

This reverts commit be66e31281fdeae1aa4eede5c606eb22f70e7c81.
---
 openkb/agent/query.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/openkb/agent/query.py b/openkb/agent/query.py
index 0904571..134901a 100644
--- a/openkb/agent/query.py
+++ b/openkb/agent/query.py
@@ -18,17 +18,14 @@
    Each document is marked (short) or (pageindex) to indicate its type.
 2. Read relevant summary pages (summaries/) for document overviews.
 3. Read concept pages (concepts/) for cross-document synthesis.
-4. **Always read source content before answering.** Summaries and concepts
-   are overviews — for accurate, detailed answers you MUST consult sources:
+4. When you need detailed source content:
    - Short documents: read_file("sources/{{doc_name}}.md") for the full text.
    - PageIndex documents: use get_page_content(doc_name, pages) to read
      specific pages. The summary page shows chapter structure with page
      ranges to help you decide which pages to read.
-5. Synthesise a clear, well-cited answer grounded in source content.
+5. Synthesise a clear, well-cited answer grounded in wiki content.
 
-IMPORTANT: Do NOT answer based on summaries alone. Always verify and enrich
-your answer by reading the actual source content. If the question asks about
-details, experiments, specific data, or quotes, reading the source is mandatory.
+If you cannot find relevant information, say so clearly.
 """
 
 

From 634b212fcdc43dd5a92969e1d68c714753259f4a Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Fri, 10 Apr 2026 00:53:50 +0800
Subject: [PATCH 23/44] fix: isolate tests from real KB directories via mocking

---
 tests/test_add_command.py |  5 +++--
 tests/test_cli.py         | 11 ++++++++---
 tests/test_list_status.py |  6 ++++--
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/tests/test_add_command.py b/tests/test_add_command.py
index 0ad9397..ca97d26 100644
--- a/tests/test_add_command.py
+++ b/tests/test_add_command.py
@@ -37,8 +37,9 @@ def test_finds_openkb_dir(self, tmp_path, monkeypatch):
 
     def test_returns_none_if_no_openkb(self, tmp_path, monkeypatch):
         monkeypatch.chdir(tmp_path)
-        result = _find_kb_dir()
-        assert result is None
+        with patch("openkb.cli.load_global_config", return_value={}):
+            result = _find_kb_dir()
+            assert result is None
 
 
 class TestAddCommand:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 1ad10b3..22c27fc 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,4 +1,6 @@
 import json
+from unittest.mock import patch
+
 import pytest
 from click.testing import CliRunner
 
@@ -8,7 +10,8 @@
 
 def test_init_creates_structure(tmp_path):
     runner = CliRunner()
-    with runner.isolated_filesystem(temp_dir=tmp_path):
+    with runner.isolated_filesystem(temp_dir=tmp_path), \
+         patch("openkb.cli.register_kb"):
         result = runner.invoke(cli, ["init"])
         assert result.exit_code == 0
 
@@ -42,7 +45,8 @@ def test_init_creates_structure(tmp_path):
 
 def test_init_schema_content(tmp_path):
     runner = CliRunner()
-    with runner.isolated_filesystem(temp_dir=tmp_path):
+    with runner.isolated_filesystem(temp_dir=tmp_path), \
+         patch("openkb.cli.register_kb"):
         result = runner.invoke(cli, ["init"])
         assert result.exit_code == 0
 
@@ -53,7 +57,8 @@ def test_init_schema_content(tmp_path):
 
 def test_init_already_exists(tmp_path):
     runner = CliRunner()
-    with runner.isolated_filesystem(temp_dir=tmp_path):
+    with runner.isolated_filesystem(temp_dir=tmp_path), \
+         patch("openkb.cli.register_kb"):
         # First run should succeed
         result = runner.invoke(cli, ["init"])
         assert result.exit_code == 0
diff --git a/tests/test_list_status.py b/tests/test_list_status.py
index 0ef9f56..21b8de4 100644
--- a/tests/test_list_status.py
+++ b/tests/test_list_status.py
@@ -32,7 +32,8 @@ def _setup_kb(tmp_path: Path) -> Path:
 class TestListCommand:
     def test_list_no_kb(self, tmp_path):
         runner = CliRunner()
-        with runner.isolated_filesystem(temp_dir=tmp_path):
+        with runner.isolated_filesystem(temp_dir=tmp_path), \
+             patch("openkb.cli._find_kb_dir", return_value=None):
             result = runner.invoke(cli, ["list"])
             assert "No knowledge base found" in result.output
 
@@ -91,7 +92,8 @@ def test_list_no_concepts_section_when_empty(self, tmp_path):
 class TestStatusCommand:
     def test_status_no_kb(self, tmp_path):
         runner = CliRunner()
-        with runner.isolated_filesystem(temp_dir=tmp_path):
+        with runner.isolated_filesystem(temp_dir=tmp_path), \
+             patch("openkb.cli._find_kb_dir", return_value=None):
             result = runner.invoke(cli, ["status"])
             assert "No knowledge base found" in result.output
 

From 19ebfeda9fca3b29a442372c45aa74d3894ccd35 Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 03:35:37 +0800
Subject: [PATCH 24/44] fix: suppress warnings and disable agents SDK tracing
 via API

---
 openkb/cli.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/openkb/cli.py b/openkb/cli.py
index b14f45c..c43e588 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -9,11 +9,14 @@
 
 import os
 
-# Disable Agents SDK tracing (requires OPENAI_API_KEY otherwise)
-os.environ.setdefault("OPENAI_AGENTS_DISABLE_TRACING", "1")
+from agents import set_tracing_disabled
+set_tracing_disabled(True)
 # Use local model cost map — skip fetching from GitHub on every invocation
 os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "True")
 
+import warnings
+warnings.filterwarnings("ignore")
+
 import click
 import litellm
 litellm.suppress_debug_info = True

From dde64d1d9baa68b11ec20a84beef0a8a4415660f Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 03:35:43 +0800
Subject: [PATCH 25/44] fix: add MAX_TURNS limit to agent Runner calls

---
 openkb/agent/linter.py | 4 +++-
 openkb/agent/query.py  | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/openkb/agent/linter.py b/openkb/agent/linter.py
index 5201949..fb81da7 100644
--- a/openkb/agent/linter.py
+++ b/openkb/agent/linter.py
@@ -6,6 +6,8 @@
 from agents import Agent, Runner, function_tool
 
 from openkb.agent.tools import list_wiki_files, read_wiki_file
+
+MAX_TURNS = 50
 from openkb.schema import SCHEMA_MD, get_agents_md
 
 _LINTER_INSTRUCTIONS_TEMPLATE = """\
@@ -102,5 +104,5 @@ async def run_knowledge_lint(kb_dir: Path, model: str) -> str:
         "Produce a structured Markdown report."
     )
 
-    result = await Runner.run(agent, prompt)
+    result = await Runner.run(agent, prompt, max_turns=MAX_TURNS)
     return result.final_output or "Knowledge lint completed. No output produced."
diff --git a/openkb/agent/query.py b/openkb/agent/query.py
index 134901a..c9a8986 100644
--- a/openkb/agent/query.py
+++ b/openkb/agent/query.py
@@ -6,6 +6,8 @@
 from agents import Agent, Runner, function_tool
 
 from openkb.agent.tools import read_wiki_file
+
+MAX_TURNS = 50
 from openkb.schema import get_agents_md
 
 _QUERY_INSTRUCTIONS_TEMPLATE = """\
@@ -92,10 +94,10 @@ async def run_query(question: str, kb_dir: Path, model: str, stream: bool = Fals
     agent = build_query_agent(wiki_root, model, language=language)
 
     if not stream:
-        result = await Runner.run(agent, question)
+        result = await Runner.run(agent, question, max_turns=MAX_TURNS)
         return result.final_output or ""
 
-    result = Runner.run_streamed(agent, question)
+    result = Runner.run_streamed(agent, question, max_turns=MAX_TURNS)
     collected = []
     async for event in result.stream_events():
         if isinstance(event, RawResponsesStreamEvent):

From 63da1fe4d9aeeed5a2aba2697990d53755d32298 Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 04:24:34 +0800
Subject: [PATCH 26/44] refactor: unify summary frontmatter to doc_type +
 full_text

Replace sources/brief/source_doc/doc_id/source fields with two
consistent fields: doc_type (short|pageindex) and full_text pointing
to the actual source content under sources/.
---
 openkb/agent/compiler.py    | 22 +++++++++-------------
 openkb/schema.py            |  2 +-
 openkb/tree_renderer.py     |  5 ++---
 tests/test_compiler.py      | 19 ++++++++++---------
 tests/test_indexer.py       |  2 +-
 tests/test_tree_renderer.py |  5 ++---
 6 files changed, 25 insertions(+), 30 deletions(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index f79e0f4..d97b0eb 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -289,20 +289,16 @@ def _find_source_filename(doc_name: str, kb_dir: Path) -> str:
     return f"{doc_name}.pdf"
 
 
-def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str,
-                    brief: str = "", doc_type: str = "short") -> None:
-    """Write summary page with frontmatter.
-
-    For short docs, includes a ``source_doc`` field linking to the full
-    source text in ``sources/{doc_name}.md``.
-    """
+def _write_summary(wiki_dir: Path, doc_name: str, summary: str,
+                    doc_type: str = "short") -> None:
+    """Write summary page with frontmatter."""
     summaries_dir = wiki_dir / "summaries"
     summaries_dir.mkdir(parents=True, exist_ok=True)
-    fm_lines = [f"sources: [{source_file}]"]
-    if brief:
-        fm_lines.append(f"brief: {brief}")
-    if doc_type == "short":
-        fm_lines.append(f"source_doc: sources/{doc_name}.md")
+    ext = "md" if doc_type == "short" else "json"
+    fm_lines = [
+        f"doc_type: {doc_type}",
+        f"full_text: sources/{doc_name}.{ext}",
+    ]
     frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n"
     (summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8")
 
@@ -690,7 +686,7 @@ async def compile_short_doc(
     except (json.JSONDecodeError, ValueError):
         doc_brief = ""
         summary = summary_raw
-    _write_summary(wiki_dir, doc_name, source_file, summary, brief=doc_brief)
+    _write_summary(wiki_dir, doc_name, summary)
 
     # --- Steps 2-4: Concept plan → generate/update → index ---
     await _compile_concepts(
diff --git a/openkb/schema.py b/openkb/schema.py
index 1911e86..9684733 100644
--- a/openkb/schema.py
+++ b/openkb/schema.py
@@ -35,7 +35,7 @@
 
 ## Format
 - Use [[wikilink]] to link other wiki pages (e.g., [[concepts/attention]])
-- Summary pages header: `sources: [paper.pdf]`
+- Summary pages header: `doc_type: short|pageindex` and `full_text: sources/{name}.md|.json`
 - Concept pages header: `sources: [paper1.pdf, paper2.pdf, ...]`
 - Standard Markdown heading hierarchy
 - Keep each page focused on a single topic
diff --git a/openkb/tree_renderer.py b/openkb/tree_renderer.py
index 6770d7e..efad980 100644
--- a/openkb/tree_renderer.py
+++ b/openkb/tree_renderer.py
@@ -6,9 +6,8 @@ def _yaml_frontmatter(source_name: str, doc_id: str) -> str:
     """Return a YAML frontmatter block for a PageIndex wiki page."""
     return (
         "---\n"
-        f"source: {source_name}\n"
-        "type: pageindex\n"
-        f"doc_id: {doc_id}\n"
+        "doc_type: pageindex\n"
+        f"full_text: sources/{source_name}.json\n"
         "---\n"
     )
 
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index b3746d1..6b3ad0d 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -78,22 +78,22 @@ class TestWriteSummary:
     def test_writes_with_frontmatter(self, tmp_path):
         wiki = tmp_path / "wiki"
         wiki.mkdir()
-        _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.", brief="Introduces transformers")
+        _write_summary(wiki, "my-doc", "# Summary\n\nContent here.")
         path = wiki / "summaries" / "my-doc.md"
         assert path.exists()
         text = path.read_text()
-        assert "sources: [my-doc.pdf]" in text
-        assert "brief: Introduces transformers" in text
+        assert "doc_type: short" in text
+        assert "full_text: sources/my-doc.md" in text
         assert "# Summary" in text
 
     def test_writes_without_brief(self, tmp_path):
         wiki = tmp_path / "wiki"
         wiki.mkdir()
-        _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.")
+        _write_summary(wiki, "my-doc", "# Summary\n\nContent here.")
         path = wiki / "summaries" / "my-doc.md"
         text = path.read_text()
-        assert "sources: [my-doc.pdf]" in text
-        assert "brief:" not in text
+        assert "doc_type: short" in text
+        assert "full_text: sources/my-doc.md" in text
 
 
 class TestWriteConcept:
@@ -513,7 +513,7 @@ async def test_full_pipeline(self, tmp_path):
         # Verify summary written
         summary_path = wiki / "summaries" / "test-doc.md"
         assert summary_path.exists()
-        assert "sources: [test-doc.pdf]" in summary_path.read_text()
+        assert "full_text: sources/test-doc.md" in summary_path.read_text()
 
         # Verify concept written
         concept_path = wiki / "concepts" / "transformer.md"
@@ -804,9 +804,10 @@ async def test_short_doc_briefs_in_index_and_frontmatter(self, tmp_path):
             )
             await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini")
 
-        # Summary frontmatter has brief
+        # Summary frontmatter has doc_type and full_text
         summary_text = (wiki / "summaries" / "test-doc.md").read_text()
-        assert "brief: A paper about transformers" in summary_text
+        assert "doc_type: short" in summary_text
+        assert "full_text: sources/test-doc.md" in summary_text
 
         # Concept frontmatter has brief
         concept_text = (wiki / "concepts" / "transformer.md").read_text()
diff --git a/tests/test_indexer.py b/tests/test_indexer.py
index 0948d64..ee7909c 100644
--- a/tests/test_indexer.py
+++ b/tests/test_indexer.py
@@ -90,7 +90,7 @@ def test_summary_page_written(self, kb_dir, sample_tree, tmp_path):
         summary_file = kb_dir / "wiki" / "summaries" / "sample.md"
         assert summary_file.exists()
         content = summary_file.read_text(encoding="utf-8")
-        assert "type: pageindex" in content
+        assert "doc_type: pageindex" in content
         assert "Summary:" in content
 
     def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_path):
diff --git a/tests/test_tree_renderer.py b/tests/test_tree_renderer.py
index d636d5b..f20e174 100644
--- a/tests/test_tree_renderer.py
+++ b/tests/test_tree_renderer.py
@@ -15,9 +15,8 @@ class TestRenderSummaryMd:
     def test_has_yaml_frontmatter(self, sample_tree):
         output = render_summary_md(sample_tree, "Sample Document", "doc-abc")
         assert output.startswith("---\n")
-        assert "source: Sample Document" in output
-        assert "type: pageindex" in output
-        assert "doc_id: doc-abc" in output
+        assert "doc_type: pageindex" in output
+        assert "full_text: sources/Sample Document.json" in output
 
     def test_top_level_nodes_are_h1(self, sample_tree):
         output = render_summary_md(sample_tree, "Sample Document", "doc-abc")

From 06e26cea7d950cb734f10d43b66a039914f95034 Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 04:26:04 +0800
Subject: [PATCH 27/44] fix: concept sources link to summaries and strip
 duplicate frontmatter

Concept pages now reference summaries/{doc}.md instead of raw PDF
filenames. Also strips frontmatter from LLM content during concept
updates to prevent duplicate YAML blocks. Removes the now-unused
_find_source_filename helper.
---
 openkb/agent/compiler.py | 20 ++++++++------------
 openkb/schema.py         |  2 +-
 tests/test_compiler.py   | 10 +++++-----
 3 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index d97b0eb..df417ba 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -279,15 +279,6 @@ def _read_concept_briefs(wiki_dir: Path) -> str:
     return "\n".join(lines) or "(none yet)"
 
 
-def _find_source_filename(doc_name: str, kb_dir: Path) -> str:
-    """Find the original filename in raw/ for a given doc stem."""
-    raw_dir = kb_dir / "raw"
-    if raw_dir.exists():
-        for f in raw_dir.iterdir():
-            if f.stem == doc_name:
-                return f.name
-    return f"{doc_name}.pdf"
-
 
 def _write_summary(wiki_dir: Path, doc_name: str, summary: str,
                     doc_type: str = "short") -> None:
@@ -337,7 +328,13 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
                     existing = fm + body
             else:
                 existing = f"---\nsources: [{source_file}]\n---\n\n" + existing
-            existing += f"\n\n{content}"
+            # Strip frontmatter from LLM content to avoid duplicate blocks
+            clean = content
+            if clean.startswith("---"):
+                end = clean.find("---", 3)
+                if end != -1:
+                    clean = clean[end + 3:].lstrip("\n")
+            existing += f"\n\n{clean}"
         if brief and existing.startswith("---"):
             end = existing.find("---", 3)
             if end != -1:
@@ -511,7 +508,7 @@ async def _compile_concepts(
     Uses ``_CONCEPTS_PLAN_USER`` to get a plan with create/update/related
     actions, then executes each action type accordingly.
     """
-    source_file = _find_source_filename(doc_name, kb_dir)
+    source_file = f"summaries/{doc_name}.md"
 
     # --- Step 2: Get concepts plan (A cached) ---
     concept_briefs = _read_concept_briefs(wiki_dir)
@@ -666,7 +663,6 @@ async def compile_short_doc(
 
     wiki_dir = kb_dir / "wiki"
     schema_md = get_agents_md(wiki_dir)
-    source_file = _find_source_filename(doc_name, kb_dir)
     content = source_path.read_text(encoding="utf-8")
 
     # Base context A: system + document
diff --git a/openkb/schema.py b/openkb/schema.py
index 9684733..8642521 100644
--- a/openkb/schema.py
+++ b/openkb/schema.py
@@ -36,7 +36,7 @@
 ## Format
 - Use [[wikilink]] to link other wiki pages (e.g., [[concepts/attention]])
 - Summary pages header: `doc_type: short|pageindex` and `full_text: sources/{name}.md|.json`
-- Concept pages header: `sources: [paper1.pdf, paper2.pdf, ...]`
+- Concept pages header: `sources: [summaries/doc1.md, summaries/doc2.md, ...]`
 - Standard Markdown heading hierarchy
 - Keep each page focused on a single topic
 """
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index 6b3ad0d..a895b79 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -518,7 +518,7 @@ async def test_full_pipeline(self, tmp_path):
         # Verify concept written
         concept_path = wiki / "concepts" / "transformer.md"
         assert concept_path.exists()
-        assert "sources: [test-doc.pdf]" in concept_path.read_text()
+        assert "sources: [summaries/test-doc.md]" in concept_path.read_text()
 
         # Verify index updated
         index_text = (wiki / "index.md").read_text()
@@ -678,14 +678,14 @@ async def ordered_acompletion(*args, **kwargs):
         fa_path = wiki / "concepts" / "flash-attention.md"
         assert fa_path.exists()
         fa_text = fa_path.read_text()
-        assert "sources: [test-doc.pdf]" in fa_text
+        assert "sources: [summaries/test-doc.md]" in fa_text
         assert "Flash Attention" in fa_text
 
         # Verify attention updated (is_update=True path in _write_concept)
         att_path = wiki / "concepts" / "attention.md"
         assert att_path.exists()
         att_text = att_path.read_text()
-        assert "test-doc.pdf" in att_text
+        assert "summaries/test-doc.md" in att_text
         assert "old-paper.pdf" in att_text
 
         # Verify index updated
@@ -725,7 +725,7 @@ async def test_related_adds_link_no_llm(self, tmp_path):
         # Verify link added to transformer page
         transformer_text = (wiki / "concepts" / "transformer.md").read_text()
         assert "[[summaries/test-doc]]" in transformer_text
-        assert "test-doc.pdf" in transformer_text
+        assert "summaries/test-doc.md" in transformer_text
 
     @pytest.mark.asyncio
     async def test_fallback_list_format(self, tmp_path):
@@ -760,7 +760,7 @@ async def test_fallback_list_format(self, tmp_path):
         att_path = wiki / "concepts" / "attention.md"
         assert att_path.exists()
         att_text = att_path.read_text()
-        assert "sources: [test-doc.pdf]" in att_text
+        assert "sources: [summaries/test-doc.md]" in att_text
         assert "Attention" in att_text
 
 

From f38781e4c032638513ce508c687081181c21613d Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 04:30:01 +0800
Subject: [PATCH 28/44] fix: update query agent to use summary full_text field

Add hint that summaries may omit details. Update search strategy to
reference the full_text frontmatter field instead of hardcoded paths.
---
 openkb/agent/query.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/openkb/agent/query.py b/openkb/agent/query.py
index c9a8986..8bbc93b 100644
--- a/openkb/agent/query.py
+++ b/openkb/agent/query.py
@@ -19,12 +19,12 @@
 1. Read index.md to see all documents and concepts with brief summaries.
    Each document is marked (short) or (pageindex) to indicate its type.
 2. Read relevant summary pages (summaries/) for document overviews.
+   Note: summaries may omit details.
 3. Read concept pages (concepts/) for cross-document synthesis.
-4. When you need detailed source content:
-   - Short documents: read_file("sources/{{doc_name}}.md") for the full text.
-   - PageIndex documents: use get_page_content(doc_name, pages) to read
-     specific pages. The summary page shows chapter structure with page
-     ranges to help you decide which pages to read.
+4. When you need detailed source content, check the summary's full_text field:
+   - Short documents (full_text ends in .md): read_file(full_text) for the full text.
+   - PageIndex documents (full_text ends in .json): use get_page_content(doc_name, pages)
+     to read specific pages. The summary shows chapter structure with page ranges.
 5. Synthesise a clear, well-cited answer grounded in wiki content.
 
 If you cannot find relevant information, say so clearly.

From bebfbdb5e2989778d3afd1f66e4e8d13c5cd2dd0 Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 04:30:01 +0800
Subject: [PATCH 29/44] fix: remove page marker comments from short doc source
 markdown

---
 openkb/images.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openkb/images.py b/openkb/images.py
index d72cec7..6916842 100644
--- a/openkb/images.py
+++ b/openkb/images.py
@@ -89,7 +89,7 @@ def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) ->
         for page_idx in range(len(doc)):
             page = doc[page_idx]
             page_num = page_idx + 1
-            parts.append(f"\n\n<!-- Page {page_num} -->\n")
+            parts.append("\n\n")
 
             for block in page.get_text("dict")["blocks"]:
                 if block["type"] == 0:  # text block

From 4d34baf816f6aa5c26bbfa34e4c323cbe8ae920d Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 04:31:52 +0800
Subject: [PATCH 30/44] fix: rename chapter structure to document tree
 structure in query prompt

---
 openkb/agent/query.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/openkb/agent/query.py b/openkb/agent/query.py
index 8bbc93b..3f12ddf 100644
--- a/openkb/agent/query.py
+++ b/openkb/agent/query.py
@@ -24,7 +24,7 @@
 4. When you need detailed source content, check the summary's full_text field:
    - Short documents (full_text ends in .md): read_file(full_text) for the full text.
    - PageIndex documents (full_text ends in .json): use get_page_content(doc_name, pages)
-     to read specific pages. The summary shows chapter structure with page ranges.
+     to read specific pages. The summary shows document tree structure with page ranges.
 5. Synthesise a clear, well-cited answer grounded in wiki content.
 
 If you cannot find relevant information, say so clearly.
@@ -49,7 +49,7 @@ def read_file(path: str) -> str:
     def get_page_content_tool(doc_name: str, pages: str) -> str:
         """Get text content of specific pages from a long document.
         Use this when you need detailed content from a document. The summary
-        page shows chapter structure with page ranges.
+        page shows document tree structure with page ranges.
         Args:
             doc_name: Document name (e.g. 'attention-is-all-you-need').
             pages: Page specification (e.g. '3-5,7,10-12').

From 5f563eeafff7596d60341febbb0915b417cb1d51 Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 04:35:41 +0800
Subject: [PATCH 31/44] fix: improve query agent prompt wording for source
 content

---
 openkb/agent/query.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/openkb/agent/query.py b/openkb/agent/query.py
index 3f12ddf..234863a 100644
--- a/openkb/agent/query.py
+++ b/openkb/agent/query.py
@@ -21,9 +21,10 @@
 2. Read relevant summary pages (summaries/) for document overviews.
    Note: summaries may omit details.
 3. Read concept pages (concepts/) for cross-document synthesis.
-4. When you need detailed source content, check the summary's full_text field:
-   - Short documents (full_text ends in .md): read_file(full_text) for the full text.
-   - PageIndex documents (full_text ends in .json): use get_page_content(doc_name, pages)
+4. When you need detailed source document content, each summary page has a
+   `full_text` frontmatter field with the path to the original document content:
+   - Short documents (doc_type: short): read_file with that path.
+   - PageIndex documents (doc_type: pageindex): use get_page_content(doc_name, pages)
      to read specific pages. The summary shows document tree structure with page ranges.
 5. Synthesise a clear, well-cited answer grounded in wiki content.
 

From 0b07a8edb53fbe1a2cf67d103d0c9e22058df6d4 Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 04:54:49 +0800
Subject: [PATCH 32/44] fix: move warning suppression after imports to avoid
 markitdown override

---
 openkb/cli.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/openkb/cli.py b/openkb/cli.py
index c43e588..f5c271b 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -14,9 +14,6 @@
 # Use local model cost map — skip fetching from GitHub on every invocation
 os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "True")
 
-import warnings
-warnings.filterwarnings("ignore")
-
 import click
 import litellm
 litellm.suppress_debug_info = True
@@ -27,6 +24,10 @@
 from openkb.log import append_log
 from openkb.schema import AGENTS_MD
 
+# Suppress warnings after all imports — markitdown overrides filters at import time
+import warnings
+warnings.filterwarnings("ignore")
+
 load_dotenv()  # load from cwd (covers running inside the KB dir)
 
 

From 45c5b6ce0baf8351f910fd4119338257dd09846f Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 05:03:14 +0800
Subject: [PATCH 33/44] fix: add blank line between tool calls and before
 answer in query output

---
 openkb/agent/query.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/openkb/agent/query.py b/openkb/agent/query.py
index 234863a..98a7f21 100644
--- a/openkb/agent/query.py
+++ b/openkb/agent/query.py
@@ -113,13 +113,10 @@ async def run_query(question: str, kb_dir: Path, model: str, stream: bool = Fals
             if item.type == "tool_call_item":
                 raw = item.raw_item
                 args = getattr(raw, "arguments", "{}")
-                sys.stdout.write(f"\n[tool call] {raw.name}({args})\n")
+                sys.stdout.write(f"[tool call] {raw.name}({args})\n\n")
                 sys.stdout.flush()
             elif item.type == "tool_call_output_item":
-                output = str(item.output)
-                preview = output[:200] + "..." if len(output) > 200 else output
-                sys.stdout.write(f"[tool output] {preview}\n\n")
-                sys.stdout.flush()
+                pass
     sys.stdout.write("\n")
     sys.stdout.flush()
     return "".join(collected) if collected else result.final_output or ""

From 0118d2d3d0d71790be6ab4271292ca7f206edd22 Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 05:11:11 +0800
Subject: [PATCH 34/44] fix: add self-talk before tool calls and fix output
 formatting

---
 openkb/agent/query.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/openkb/agent/query.py b/openkb/agent/query.py
index 98a7f21..3cbe6e0 100644
--- a/openkb/agent/query.py
+++ b/openkb/agent/query.py
@@ -28,6 +28,8 @@
      to read specific pages. The summary shows document tree structure with page ranges.
 5. Synthesise a clear, well-cited answer grounded in wiki content.
 
+Before each tool call, briefly state what you are about to do.
+
 If you cannot find relevant information, say so clearly.
 """
 
@@ -113,7 +115,7 @@ async def run_query(question: str, kb_dir: Path, model: str, stream: bool = Fals
             if item.type == "tool_call_item":
                 raw = item.raw_item
                 args = getattr(raw, "arguments", "{}")
-                sys.stdout.write(f"[tool call] {raw.name}({args})\n\n")
+                sys.stdout.write(f"\n[tool call] {raw.name}({args})\n\n")
                 sys.stdout.flush()
             elif item.type == "tool_call_output_item":
                 pass

From 15f970d529219a4c507279fffde6eff16126637e Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 05:21:22 +0800
Subject: [PATCH 35/44] fix: add space after colon in concept/update step names

---
 openkb/agent/compiler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index df417ba..64bc204 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -563,7 +563,7 @@ async def _gen_create(concept: dict) -> tuple[str, str, bool, str]:
                     title=title, doc_name=doc_name,
                     update_instruction="",
                 )},
-            ], f"concept:{name}")
+            ], f"concept: {name}")
         try:
             parsed = _parse_json(raw)
             brief = parsed.get("brief", "")
@@ -594,7 +594,7 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]:
                     title=title, doc_name=doc_name,
                     existing_content=existing_content,
                 )},
-            ], f"update:{name}")
+            ], f"update: {name}")
         try:
             parsed = _parse_json(raw)
             brief = parsed.get("brief", "")

From c8f96ebc56a2a13aab7825c196f4c5a4f05444a5 Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 05:33:51 +0800
Subject: [PATCH 36/44] fix: prevent duplicate frontmatter in LLM-generated
 content

Remove the frontmatter format from the schema so the LLM does not copy it.
As a fallback, strip any leading frontmatter block from generated content in
_write_summary and in the create path of _write_concept.
---
 openkb/agent/compiler.py | 8 ++++++++
 openkb/schema.py         | 3 +--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index 64bc204..73b1a9c 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -283,6 +283,10 @@ def _read_concept_briefs(wiki_dir: Path) -> str:
 def _write_summary(wiki_dir: Path, doc_name: str, summary: str,
                     doc_type: str = "short") -> None:
     """Write summary page with frontmatter."""
+    if summary.startswith("---"):
+        end = summary.find("---", 3)
+        if end != -1:
+            summary = summary[end + 3:].lstrip("\n")
     summaries_dir = wiki_dir / "summaries"
     summaries_dir.mkdir(parents=True, exist_ok=True)
     ext = "md" if doc_type == "short" else "json"
@@ -347,6 +351,10 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
                 existing = fm + body
         path.write_text(existing, encoding="utf-8")
     else:
+        if content.startswith("---"):
+            end = content.find("---", 3)
+            if end != -1:
+                content = content[end + 3:].lstrip("\n")
         fm_lines = [f"sources: [{source_file}]"]
         if brief:
             fm_lines.append(f"brief: {brief}")
diff --git a/openkb/schema.py b/openkb/schema.py
index 8642521..b2c8cf0 100644
--- a/openkb/schema.py
+++ b/openkb/schema.py
@@ -35,10 +35,9 @@
 
 ## Format
 - Use [[wikilink]] to link other wiki pages (e.g., [[concepts/attention]])
-- Summary pages header: `doc_type: short|pageindex` and `full_text: sources/{name}.md|.json`
-- Concept pages header: `sources: [summaries/doc1.md, summaries/doc2.md, ...]`
 - Standard Markdown heading hierarchy
 - Keep each page focused on a single topic
+- Do not include YAML frontmatter (---) in generated content; it is managed by code
 """
 
 # Backward compat alias

From febc8c98bf7e64229604e839b1536a61ee22a945 Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 05:41:10 +0800
Subject: [PATCH 37/44] fix: improve init prompts, prevent duplicate
 frontmatter, use American English

---
 openkb/agent/query.py |  8 +++++---
 openkb/cli.py         | 14 ++++++++++----
 openkb/lint.py        |  2 +-
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/openkb/agent/query.py b/openkb/agent/query.py
index 3cbe6e0..5f3fc77 100644
--- a/openkb/agent/query.py
+++ b/openkb/agent/query.py
@@ -25,10 +25,12 @@
    `full_text` frontmatter field with the path to the original document content:
    - Short documents (doc_type: short): read_file with that path.
    - PageIndex documents (doc_type: pageindex): use get_page_content(doc_name, pages)
-     to read specific pages. The summary shows document tree structure with page ranges.
-5. Synthesise a clear, well-cited answer grounded in wiki content.
+     with tight page ranges. The summary shows document tree structure with page
+     ranges to help you target. Never fetch the whole document.
+5. Synthesize a clear, well-cited answer grounded in wiki content.
 
-Before each tool call, briefly state what you are about to do.
+Answer based only on wiki content. Before each tool call, briefly state what you
+are about to do. Be concise.
 
 If you cannot find relevant information, say so clearly.
 """
diff --git a/openkb/cli.py b/openkb/cli.py
index f5c271b..b29d5be 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -257,14 +257,20 @@ def init():
 
     # Interactive prompts
     model = click.prompt(
-        "Model (e.g. gpt-5.4, anthropic/claude-sonnet-4-6, gemini/gemini-3.1-pro-preview)",
+        f"Model (e.g. gpt-5.4, anthropic/claude-sonnet-4-6) [default: {DEFAULT_CONFIG['model']}]",
         default=DEFAULT_CONFIG["model"],
+        show_default=False,
+    )
+    language = click.prompt(
+        f"Language [default: {DEFAULT_CONFIG['language']}]",
+        default=DEFAULT_CONFIG["language"],
+        show_default=False,
     )
-    language = click.prompt("Language", default=DEFAULT_CONFIG["language"])
     pageindex_threshold = click.prompt(
-        "PageIndex threshold (pages)",
+        f"PageIndex threshold (pages) [default: {DEFAULT_CONFIG['pageindex_threshold']}]",
         default=DEFAULT_CONFIG["pageindex_threshold"],
         type=int,
+        show_default=False,
     )
     # Create directory structure
     Path("raw").mkdir(exist_ok=True)
@@ -295,7 +301,7 @@ def init():
     # Register this KB in the global config
     register_kb(Path.cwd())
 
-    click.echo("Knowledge base initialised.")
+    click.echo("Knowledge base initialized.")
 
 
 @cli.command()
diff --git a/openkb/lint.py b/openkb/lint.py
index c1c9105..78b22e5 100644
--- a/openkb/lint.py
+++ b/openkb/lint.py
@@ -29,7 +29,7 @@ def _read_md(path: Path) -> str:
 def _all_wiki_pages(wiki: Path) -> dict[str, Path]:
     """Return a mapping of stem/relative-path → absolute Path for all .md files.
 
-    Keys are normalised: 'concepts/attention', 'summaries/paper', 'index', etc.
+    Keys are normalized: 'concepts/attention', 'summaries/paper', 'index', etc.
     """
     pages: dict[str, Path] = {}
     for md in wiki.rglob("*.md"):

From 4938cd7d194d93b524d38c8ecc5f1f4b180715b6 Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 05:54:12 +0800
Subject: [PATCH 38/44] fix: improve query agent tool descriptions and prompt
 clarity

---
 openkb/agent/query.py | 12 ++++++------
 openkb/cli.py         |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/openkb/agent/query.py b/openkb/agent/query.py
index 5f3fc77..5c24dba 100644
--- a/openkb/agent/query.py
+++ b/openkb/agent/query.py
@@ -27,10 +27,10 @@
    - PageIndex documents (doc_type: pageindex): use get_page_content(doc_name, pages)
      with tight page ranges. The summary shows document tree structure with page
      ranges to help you target. Never fetch the whole document.
-5. Synthesize a clear, well-cited answer grounded in wiki content.
+5. Synthesize a clear, concise, well-cited answer grounded in wiki content.
 
-Answer based only on wiki content. Before each tool call, briefly state what you
-are about to do. Be concise.
+Answer based only on wiki content. Be concise.
+Before each tool call, briefly state what you are about to do.
 
 If you cannot find relevant information, say so clearly.
 """
@@ -52,9 +52,9 @@ def read_file(path: str) -> str:
 
     @function_tool
     def get_page_content_tool(doc_name: str, pages: str) -> str:
-        """Get text content of specific pages from a long document.
-        Use this when you need detailed content from a document. The summary
-        page shows document tree structure with page ranges.
+        """Get text content of specific pages from a PageIndex (long) document.
+        Only use for documents with doc_type: pageindex. For short documents,
+        use read_file instead.
         Args:
             doc_name: Document name (e.g. 'attention-is-all-you-need').
             pages: Page specification (e.g. '3-5,7,10-12').
diff --git a/openkb/cli.py b/openkb/cli.py
index b29d5be..6495a85 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -257,7 +257,7 @@ def init():
 
     # Interactive prompts
     model = click.prompt(
-        f"Model (e.g. gpt-5.4, anthropic/claude-sonnet-4-6) [default: {DEFAULT_CONFIG['model']}]",
+        f"Model (e.g. gpt-5.4-mini, anthropic/claude-sonnet-4-6) [default: {DEFAULT_CONFIG['model']}]",
         default=DEFAULT_CONFIG["model"],
         show_default=False,
     )

From 5a1f014f59ed5a4a129ad56238b17127157fac9f Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 06:23:34 +0800
Subject: [PATCH 39/44] fix: replace unicode ellipsis, fix image paths in
 pageindex content, remove empty dirs on init

---
 openkb/cli.py     | 12 +++++-------
 openkb/indexer.py |  6 +++++-
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/openkb/cli.py b/openkb/cli.py
index 6495a85..3683371 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -158,7 +158,7 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
 
     # 3/4. Index and compile
     if result.is_long_doc:
-        click.echo(f"  Long document detected — indexing with PageIndex…")
+        click.echo(f"  Long document detected — indexing with PageIndex...")
         try:
             from openkb.indexer import index_long_document
             index_result = index_long_document(result.raw_path, kb_dir)
@@ -168,7 +168,7 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
             return
 
         summary_path = kb_dir / "wiki" / "summaries" / f"{doc_name}.md"
-        click.echo(f"  Compiling long doc (doc_id={index_result.doc_id})…")
+        click.echo(f"  Compiling long doc (doc_id={index_result.doc_id})...")
         for attempt in range(2):
             try:
                 asyncio.run(
@@ -185,7 +185,7 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
                     logger.debug("Compilation traceback:", exc_info=True)
                     return
     else:
-        click.echo(f"  Compiling short doc…")
+        click.echo(f"  Compiling short doc...")
         for attempt in range(2):
             try:
                 asyncio.run(compile_short_doc(doc_name, result.source_path, kb_dir, model))
@@ -277,8 +277,6 @@ def init():
     Path("wiki/sources/images").mkdir(parents=True, exist_ok=True)
     Path("wiki/summaries").mkdir(parents=True, exist_ok=True)
     Path("wiki/concepts").mkdir(parents=True, exist_ok=True)
-    Path("wiki/explorations").mkdir(parents=True, exist_ok=True)
-    Path("wiki/reports").mkdir(parents=True, exist_ok=True)
 
     # Write wiki files
     Path("wiki/AGENTS.md").write_text(AGENTS_MD, encoding="utf-8")
@@ -430,12 +428,12 @@ def lint(ctx, fix):
     model: str = config.get("model", DEFAULT_CONFIG["model"])
 
     # Structural lint
-    click.echo("Running structural lint…")
+    click.echo("Running structural lint...")
     structural_report = run_structural_lint(kb_dir)
     click.echo(structural_report)
 
     # Knowledge lint (semantic)
-    click.echo("Running knowledge lint…")
+    click.echo("Running knowledge lint...")
     try:
         knowledge_report = asyncio.run(run_knowledge_lint(kb_dir, model))
     except Exception as exc:
diff --git a/openkb/indexer.py b/openkb/indexer.py
index 8cd6913..78ebf36 100644
--- a/openkb/indexer.py
+++ b/openkb/indexer.py
@@ -106,7 +106,11 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
                     dest = dest_images_dir / filename
                     if not dest.exists():
                         shutil.copy2(src_path, dest)
-                    img["path"] = f"images/{pdf_path.stem}/{filename}"
+                    new_path = f"images/{pdf_path.stem}/{filename}"
+                    # Also fix image references in page content
+                    if "content" in page:
+                        page["content"] = page["content"].replace(str(src_path), new_path)
+                    img["path"] = new_path
 
     (sources_dir / f"{pdf_path.stem}.json").write_text(
         json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",

From ad05577f355af9351b5ff2b42176bb68d4fa9d45 Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 07:46:34 +0800
Subject: [PATCH 40/44] refactor: use pymupdf for page content extraction,
 unify image paths

Replace PageIndex get_page_content with pymupdf-based convert_pdf_to_pages
for long doc JSON generation. All image paths now use sources/images/ prefix
relative to wiki root. Removes dependency on PageIndex for source content.
---
 openkb/images.py  | 67 ++++++++++++++++++++++++++++++++++++++++++-----
 openkb/indexer.py | 39 +++++----------------------
 2 files changed, 67 insertions(+), 39 deletions(-)

diff --git a/openkb/images.py b/openkb/images.py
index 6916842..7628414 100644
--- a/openkb/images.py
+++ b/openkb/images.py
@@ -67,11 +67,66 @@ def extract_pdf_images(pdf_path: Path, doc_name: str, images_dir: Path) -> dict[
                     logger.warning("Failed to save image block on page %d", page_num)
                     continue
 
-                rel_path = f"images/{doc_name}/{filename}"
+                rel_path = f"sources/images/{doc_name}/{filename}"
                 page_images.setdefault(page_num, []).append(rel_path)
     return page_images
 
 
+def convert_pdf_to_pages(pdf_path: Path, doc_name: str, images_dir: Path) -> list[dict]:
+    """Convert a PDF to per-page dicts with text content and images.
+
+    Each dict has ``{"page": int, "content": str, "images": [{"path": str}]}``.
+    Images are saved to *images_dir* and referenced with wiki-root-relative paths.
+    """
+    images_dir.mkdir(parents=True, exist_ok=True)
+    pages: list[dict] = []
+    img_counter = 0
+
+    with pymupdf.open(str(pdf_path)) as doc:
+        for page_idx in range(len(doc)):
+            page = doc[page_idx]
+            page_num = page_idx + 1
+            parts: list[str] = []
+            page_images: list[dict] = []
+
+            for block in page.get_text("dict")["blocks"]:
+                if block["type"] == 0:  # text block
+                    lines = []
+                    for line in block["lines"]:
+                        spans_text = "".join(span["text"] for span in line["spans"])
+                        lines.append(spans_text)
+                    parts.append("\n".join(lines))
+
+                elif block["type"] == 1:  # image block
+                    width = block.get("width", 0)
+                    height = block.get("height", 0)
+                    if width < _MIN_IMAGE_DIM or height < _MIN_IMAGE_DIM:
+                        continue
+                    image_bytes = block.get("image")
+                    if not image_bytes:
+                        continue
+                    try:
+                        pix = pymupdf.Pixmap(image_bytes)
+                        if pix.n > 4:
+                            pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
+                        img_counter += 1
+                        filename = f"p{page_num}_img{img_counter}.png"
+                        (images_dir / filename).write_bytes(pix.tobytes("png"))
+                        pix = None
+                        img_path = f"sources/images/{doc_name}/{filename}"
+                        parts.append(f"\n![image]({img_path})\n")
+                        page_images.append({"path": img_path})
+                    except Exception:
+                        logger.warning("Failed to save image block on page %d", page_num)
+
+            pages.append({
+                "page": page_num,
+                "content": "\n".join(parts),
+                "images": page_images,
+            })
+    return pages
+
+
 def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) -> str:
     """Convert a PDF to markdown with inline images using pymupdf dict-mode.
 
@@ -115,7 +170,7 @@ def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) ->
                         filename = f"p{page_num}_img{img_counter}.png"
                         (images_dir / filename).write_bytes(pix.tobytes("png"))
                         pix = None
-                        parts.append(f"\n![image](images/{doc_name}/{filename})\n")
+                        parts.append(f"\n![image](sources/images/{doc_name}/{filename})\n")
                     except Exception:
                         logger.warning("Failed to save image block on page %d", page_num)
     return "\n".join(parts)
@@ -126,7 +181,7 @@ def extract_base64_images(markdown: str, doc_name: str, images_dir: Path) -> str
 
     For each ``![alt](data:image/ext;base64,DATA)`` match:
     - Decode base64 bytes → save to ``images_dir/img_NNN.ext``
-    - Replace the link with ``![alt](images/{doc_name}/img_NNN.ext)``
+    - Replace the link with ``![alt](sources/images/{doc_name}/img_NNN.ext)``
     - On decode failure: log a warning and leave the original text unchanged.
     """
     counter = 0
@@ -150,7 +205,7 @@ def extract_base64_images(markdown: str, doc_name: str, images_dir: Path) -> str
         images_dir.mkdir(parents=True, exist_ok=True)
         dest.write_bytes(image_bytes)
 
-        new_ref = f"![{alt}](images/{doc_name}/{filename})"
+        new_ref = f"![{alt}](sources/images/{doc_name}/{filename})"
         result = result.replace(match.group(0), new_ref, 1)
 
     return result
@@ -164,7 +219,7 @@ def copy_relative_images(
     For each ``![alt](relative/path)`` match (skipping http/https and data URIs):
     - Resolve path relative to ``source_dir``
     - Copy to ``images_dir/{filename}``
-    - Replace link with ``![alt](images/{doc_name}/{filename})``
+    - Replace link with ``![alt](sources/images/{doc_name}/{filename})``
     - Missing source file: log a warning and leave the original text unchanged.
     """
     result = markdown
@@ -186,7 +241,7 @@ def copy_relative_images(
         images_dir.mkdir(parents=True, exist_ok=True)
         shutil.copy2(src, dest)
 
-        new_ref = f"![{alt}](images/{doc_name}/{filename})"
+        new_ref = f"![{alt}](sources/images/{doc_name}/{filename})"
         result = result.replace(match.group(0), new_ref, 1)
 
     return result
diff --git a/openkb/indexer.py b/openkb/indexer.py
index 78ebf36..dd8ddaf 100644
--- a/openkb/indexer.py
+++ b/openkb/indexer.py
@@ -3,7 +3,7 @@
 
 import json as json_mod
 import logging
-import shutil
+
 from dataclasses import dataclass
 from pathlib import Path
 
@@ -77,40 +77,13 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
         "structure": structure,
     }
 
-    # Write wiki/sources/ — get per-page content from PageIndex and store as JSON
+    # Write wiki/sources/ — extract per-page content with pymupdf (not PageIndex)
     sources_dir = kb_dir / "wiki" / "sources"
     sources_dir.mkdir(parents=True, exist_ok=True)
-    dest_images_dir = sources_dir / "images" / pdf_path.stem
-
-    # Get per-page content from PageIndex — use actual page count
-    page_count = doc.get("page_count")
-    if page_count is None:
-        # Fallback: count pages from structure's max end_index
-        max_page = 0
-        for node in structure:
-            end = node.get("end_index", 0)
-            if end > max_page:
-                max_page = end
-        page_count = max_page if max_page > 0 else 100
-        logger.info("page_count not in doc, inferred from structure: %d", page_count)
-    all_pages = col.get_page_content(doc_id, f"1-{page_count}")
-
-    # Relocate image paths in each page
-    dest_images_dir.mkdir(parents=True, exist_ok=True)
-    for page in all_pages:
-        if "images" in page:
-            for img in page["images"]:
-                src_path = Path(img["path"])
-                if src_path.exists():
-                    filename = src_path.name
-                    dest = dest_images_dir / filename
-                    if not dest.exists():
-                        shutil.copy2(src_path, dest)
-                    new_path = f"images/{pdf_path.stem}/{filename}"
-                    # Also fix image references in page content
-                    if "content" in page:
-                        page["content"] = page["content"].replace(str(src_path), new_path)
-                    img["path"] = new_path
+    images_dir = sources_dir / "images" / pdf_path.stem
+
+    from openkb.images import convert_pdf_to_pages
+    all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)
 
     (sources_dir / f"{pdf_path.stem}.json").write_text(
         json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",

From 0340cb13557701b4c3da08e707f956976a4e0d09 Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 07:46:42 +0800
Subject: [PATCH 41/44] feat: add multimodal get_image tool to query agent

Query agent can now view images referenced in source documents via
get_image tool, which returns ToolOutputImage for the LLM to inspect.
Prompt updated to use images when questions involve figures or visuals.
---
 openkb/agent/query.py | 24 ++++++++++++++++++++----
 openkb/agent/tools.py | 35 +++++++++++++++++++++++++++++++++++
 tests/test_query.py   |  7 +++----
 3 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/openkb/agent/query.py b/openkb/agent/query.py
index 5c24dba..d252ee6 100644
--- a/openkb/agent/query.py
+++ b/openkb/agent/query.py
@@ -5,7 +5,8 @@
 
 from agents import Agent, Runner, function_tool
 
-from openkb.agent.tools import read_wiki_file
+from agents import ToolOutputImage, ToolOutputText
+from openkb.agent.tools import read_wiki_file, read_wiki_image
 
 MAX_TURNS = 50
 from openkb.schema import get_agents_md
@@ -27,10 +28,13 @@
    - PageIndex documents (doc_type: pageindex): use get_page_content(doc_name, pages)
      with tight page ranges. The summary shows document tree structure with page
      ranges to help you target. Never fetch the whole document.
-5. Synthesize a clear, concise, well-cited answer grounded in wiki content.
+5. When source content references images (e.g. ![image](sources/images/doc/file.png)),
+   use get_image to view them. Always view images when the question asks about
+   a figure, chart, diagram, or visual content.
+6. Synthesize a clear, concise, well-cited answer grounded in wiki content.
 
 Answer based only on wiki content. Be concise.
-Before each tool call, briefly state what you are about to do.
+Before each tool call, output one short sentence explaining the reason.
 
 If you cannot find relevant information, say so clearly.
 """
@@ -62,12 +66,24 @@ def get_page_content_tool(doc_name: str, pages: str) -> str:
         from openkb.agent.tools import get_page_content
         return get_page_content(doc_name, pages, wiki_root)
 
+    @function_tool
+    def get_image(image_path: str) -> ToolOutputImage | ToolOutputText:
+        """View an image from the wiki.
+        Use when source content references images you need to see.
+        Args:
+            image_path: Image path relative to wiki root (e.g. 'sources/images/doc/p1_img1.png').
+        """
+        result = read_wiki_image(image_path, wiki_root)
+        if result["type"] == "image":
+            return ToolOutputImage(image_url=result["image_url"])
+        return ToolOutputText(text=result["text"])
+
     from agents.model_settings import ModelSettings
 
     return Agent(
         name="wiki-query",
         instructions=instructions,
-        tools=[read_file, get_page_content_tool],
+        tools=[read_file, get_page_content_tool, get_image],
         model=f"litellm/{model}",
         model_settings=ModelSettings(parallel_tool_calls=False),
     )
diff --git a/openkb/agent/tools.py b/openkb/agent/tools.py
index 0d1164c..2fe930b 100644
--- a/openkb/agent/tools.py
+++ b/openkb/agent/tools.py
@@ -133,6 +133,41 @@ def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str:
     return "\n\n".join(parts) + "\n\n"
 
 
+_MIME_TYPES = {
+    ".png": "image/png",
+    ".jpg": "image/jpeg",
+    ".jpeg": "image/jpeg",
+    ".gif": "image/gif",
+    ".webp": "image/webp",
+    ".bmp": "image/bmp",
+}
+
+
+def read_wiki_image(path: str, wiki_root: str) -> dict:
+    """Read an image file from the wiki and return as base64 data URL.
+
+    Args:
+        path: Image path relative to *wiki_root* (e.g. ``"sources/images/doc/p1_img1.png"``).
+        wiki_root: Absolute path to the wiki root directory.
+
+    Returns:
+        A dict with ``type``, ``image_url`` keys for ``ToolOutputImage``,
+        or a dict with ``type``, ``text`` keys on error.
+    """
+    import base64
+
+    root = Path(wiki_root).resolve()
+    full_path = (root / path).resolve()
+    if not full_path.is_relative_to(root):
+        return {"type": "text", "text": "Access denied: path escapes wiki root."}
+    if not full_path.exists():
+        return {"type": "text", "text": f"Image not found: {path}"}
+
+    mime = _MIME_TYPES.get(full_path.suffix.lower(), "image/png")
+    b64 = base64.b64encode(full_path.read_bytes()).decode()
+    return {"type": "image", "image_url": f"data:{mime};base64,{b64}"}
+
+
 def write_wiki_file(path: str, content: str, wiki_root: str) -> str:
     """Write or overwrite a Markdown file in the wiki.
 
diff --git a/tests/test_query.py b/tests/test_query.py
index 8be4cb9..e00d2ea 100644
--- a/tests/test_query.py
+++ b/tests/test_query.py
@@ -15,17 +15,16 @@ def test_agent_name(self, tmp_path):
         agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
         assert agent.name == "wiki-query"
 
-    def test_agent_has_two_tools(self, tmp_path):
+    def test_agent_has_three_tools(self, tmp_path):
         agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
-        assert len(agent.tools) == 2
+        assert len(agent.tools) == 3
 
     def test_agent_tool_names(self, tmp_path):
         agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
         names = {t.name for t in agent.tools}
         assert "read_file" in names
         assert "get_page_content_tool" in names
-        assert "list_files" not in names
-        assert "pageindex_retrieve" not in names
+        assert "get_image" in names
 
     def test_instructions_mention_get_page_content(self, tmp_path):
         agent = build_query_agent(str(tmp_path), "gpt-4o-mini")

From 151b90e8a69dab7e4a25db30d14e217d4ea48f25 Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 07:52:30 +0800
Subject: [PATCH 42/44] fix: update tests for image path changes and removed
 init dirs

---
 tests/test_cli.py     |  2 --
 tests/test_images.py  | 14 +++++++-------
 tests/test_indexer.py | 18 ++++++++++++++----
 3 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index 22c27fc..afb961d 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -23,13 +23,11 @@ def test_init_creates_structure(tmp_path):
         assert (cwd / "wiki" / "sources" / "images").is_dir()
         assert (cwd / "wiki" / "summaries").is_dir()
         assert (cwd / "wiki" / "concepts").is_dir()
-        assert (cwd / "wiki" / "reports").is_dir()
         assert (cwd / ".openkb").is_dir()
 
         # Files
         assert (cwd / "wiki" / "AGENTS.md").is_file()
         assert (cwd / "wiki" / "log.md").is_file()
-        assert (cwd / "wiki" / "explorations").is_dir()
         assert (cwd / "wiki" / "index.md").is_file()
         assert (cwd / ".openkb" / "config.yaml").is_file()
         assert (cwd / ".openkb" / "hashes.json").is_file()
diff --git a/tests/test_images.py b/tests/test_images.py
index 0b3be21..8bbc722 100644
--- a/tests/test_images.py
+++ b/tests/test_images.py
@@ -44,7 +44,7 @@ def test_single_base64_image_extracted(self, tmp_path):
 
         # Result should reference a saved file, not the raw base64
         assert "data:image/png;base64," not in result
-        assert "![alt text](images/doc/img_001.png)" == result
+        assert "![alt text](sources/images/doc/img_001.png)" == result
 
         # File should exist on disk
         saved = images_dir / "img_001.png"
@@ -62,8 +62,8 @@ def test_multiple_base64_images_numbered_sequentially(self, tmp_path):
         )
         result = extract_base64_images(md, "doc", images_dir)
 
-        assert "![fig1](images/doc/img_001.png)" in result
-        assert "![fig2](images/doc/img_002.jpeg)" in result
+        assert "![fig1](sources/images/doc/img_001.png)" in result
+        assert "![fig2](sources/images/doc/img_002.jpeg)" in result
         assert (images_dir / "img_001.png").exists()
         assert (images_dir / "img_002.jpeg").exists()
 
@@ -92,7 +92,7 @@ def test_mixed_valid_invalid_base64(self, tmp_path, caplog):
         import logging
         with caplog.at_level(logging.WARNING, logger="openkb.images"):
             result = extract_base64_images(md, "doc", images_dir)
-        assert "![good](images/doc/img_001.png)" in result
+        assert "![good](sources/images/doc/img_001.png)" in result
         assert f"data:image/png;base64,{bad}" in result
 
 
@@ -114,7 +114,7 @@ def test_existing_relative_image_copied_and_rewritten(self, tmp_path):
         md = "![diagram](diagram.png)"
         result = copy_relative_images(md, source_dir, "doc", images_dir)
 
-        assert "![diagram](images/doc/diagram.png)" == result
+        assert "![diagram](sources/images/doc/diagram.png)" == result
         assert (images_dir / "diagram.png").read_bytes() == FAKE_PNG
 
     def test_missing_relative_image_leaves_original(self, tmp_path, caplog):
@@ -163,7 +163,7 @@ def test_multiple_relative_images_all_copied(self, tmp_path):
         md = "![a](a.png)\n![b](b.jpg)"
         result = copy_relative_images(md, source_dir, "doc", images_dir)
 
-        assert "![a](images/doc/a.png)" in result
-        assert "![b](images/doc/b.jpg)" in result
+        assert "![a](sources/images/doc/a.png)" in result
+        assert "![b](sources/images/doc/b.jpg)" in result
         assert (images_dir / "a.png").exists()
         assert (images_dir / "b.jpg").exists()
diff --git a/tests/test_indexer.py b/tests/test_indexer.py
index ee7909c..3dbb677 100644
--- a/tests/test_indexer.py
+++ b/tests/test_indexer.py
@@ -28,6 +28,12 @@ def _make_fake_collection(self, doc_id: str, sample_tree: dict):
         col.get_page_content.return_value = []
         return col
 
+    def _fake_pages(self):
+        return [
+            {"page": 1, "content": "Page one text.", "images": []},
+            {"page": 2, "content": "Page two text.", "images": []},
+        ]
+
     def test_returns_index_result(self, kb_dir, sample_tree, tmp_path):
         doc_id = "abc-123"
         fake_col = self._make_fake_collection(doc_id, sample_tree)
@@ -38,7 +44,8 @@ def test_returns_index_result(self, kb_dir, sample_tree, tmp_path):
         pdf_path = tmp_path / "sample.pdf"
         pdf_path.write_bytes(b"%PDF-1.4 fake")
 
-        with patch("openkb.indexer.PageIndexClient", return_value=fake_client):
+        with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \
+             patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()):
             result = index_long_document(pdf_path, kb_dir)
 
         assert isinstance(result, IndexResult)
@@ -63,7 +70,8 @@ def test_source_page_written_as_json(self, kb_dir, sample_tree, tmp_path):
         pdf_path = tmp_path / "sample.pdf"
         pdf_path.write_bytes(b"%PDF-1.4 fake")
 
-        with patch("openkb.indexer.PageIndexClient", return_value=fake_client):
+        with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \
+             patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()):
             index_long_document(pdf_path, kb_dir)
 
         json_file = kb_dir / "wiki" / "sources" / "sample.json"
@@ -84,7 +92,8 @@ def test_summary_page_written(self, kb_dir, sample_tree, tmp_path):
         pdf_path = tmp_path / "sample.pdf"
         pdf_path.write_bytes(b"%PDF-1.4 fake")
 
-        with patch("openkb.indexer.PageIndexClient", return_value=fake_client):
+        with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \
+             patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()):
             index_long_document(pdf_path, kb_dir)
 
         summary_file = kb_dir / "wiki" / "summaries" / "sample.md"
@@ -104,7 +113,8 @@ def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_pat
         pdf_path = tmp_path / "report.pdf"
         pdf_path.write_bytes(b"%PDF-1.4 fake")
 
-        with patch("openkb.indexer.PageIndexClient", return_value=fake_client) as mock_cls:
+        with patch("openkb.indexer.PageIndexClient", return_value=fake_client) as mock_cls, \
+             patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()):
             index_long_document(pdf_path, kb_dir)
 
         # Verify PageIndexClient was instantiated with correct IndexConfig

From f383fbec6f68d40c1162fd4d37fc1fa31a1cf925 Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Fri, 10 Apr 2026 07:55:23 +0800
Subject: [PATCH 43/44] fix: mock _find_kb_dir in test_add_missing_init to
 isolate from real KB dirs

---
 tests/test_add_command.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_add_command.py b/tests/test_add_command.py
index ca97d26..2ad22e7 100644
--- a/tests/test_add_command.py
+++ b/tests/test_add_command.py
@@ -58,7 +58,8 @@ def _setup_kb(self, tmp_path):
 
     def test_add_missing_init(self, tmp_path):
         runner = CliRunner()
-        with runner.isolated_filesystem(temp_dir=tmp_path):
+        with runner.isolated_filesystem(temp_dir=tmp_path), \
+             patch("openkb.cli._find_kb_dir", return_value=None):
             result = runner.invoke(cli, ["add", "somefile.pdf"])
             assert "No knowledge base found" in result.output
 

From a1460b407252081bdfd50eedd24af7092c543f34 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sat, 11 Apr 2026 10:56:36 +0800
Subject: [PATCH 44/44] chore: remove docs/ directory from branch

---
 .../2026-04-09-concept-dedup-and-update.md    |  888 -------------
 .../plans/2026-04-09-retrieve-redesign.md     | 1104 -----------------
 ...6-04-09-concept-dedup-and-update-design.md |  163 ---
 .../specs/2026-04-09-retrieve-redesign.md     |  262 ----
 4 files changed, 2417 deletions(-)
 delete mode 100644 docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md
 delete mode 100644 docs/superpowers/plans/2026-04-09-retrieve-redesign.md
 delete mode 100644 docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md
 delete mode 100644 docs/superpowers/specs/2026-04-09-retrieve-redesign.md

diff --git a/docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md b/docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md
deleted file mode 100644
index 1a312a6..0000000
--- a/docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md
+++ /dev/null
@@ -1,888 +0,0 @@
-# Concept Dedup & Existing Page Update — Implementation Plan
-
-> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
-
-**Goal:** Give the compiler enough context about existing concepts to make smart dedup/update decisions, and add the ability to rewrite existing concept pages with new information — all without breaking prompt caching.
-
-**Architecture:** Extend the deterministic pipeline in `compiler.py` with: (1) concept briefs read from disk before the concepts-plan LLM call, (2) a new JSON output format with create/update/related actions, (3) a new concurrent "update" path that sends existing page content to the LLM for rewriting, (4) a code-only "related" path for cross-ref links. Extract shared logic between `compile_short_doc` and `compile_long_doc` into `_compile_concepts`.
-
-**Tech Stack:** Python, litellm, asyncio, pytest
-
----
-
-### Task 1: Add `_read_concept_briefs` and test
-
-**Files:**
-- Modify: `openkb/agent/compiler.py:199-207` (File I/O helpers section)
-- Modify: `tests/test_compiler.py:98-116` (TestReadWikiContext section)
-
-- [ ] **Step 1: Write the failing test**
-
-Add to `tests/test_compiler.py`:
-
-```python
-from openkb.agent.compiler import _read_concept_briefs
-
-class TestReadConceptBriefs:
-    def test_empty_wiki(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        wiki.mkdir()
-        assert _read_concept_briefs(wiki) == "(none yet)"
-
-    def test_no_concepts_dir(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        wiki.mkdir()
-        assert _read_concept_briefs(wiki) == "(none yet)"
-
-    def test_reads_briefs_with_frontmatter(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        concepts = wiki / "concepts"
-        concepts.mkdir(parents=True)
-        (concepts / "attention.md").write_text(
-            "---\nsources: [paper.pdf]\n---\n\nAttention allows models to focus on relevant input parts selectively.",
-            encoding="utf-8",
-        )
-        result = _read_concept_briefs(wiki)
-        assert "- attention: Attention allows models" in result
-
-    def test_reads_briefs_without_frontmatter(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        concepts = wiki / "concepts"
-        concepts.mkdir(parents=True)
-        (concepts / "rnn.md").write_text(
-            "Recurrent neural networks process sequences step by step.",
-            encoding="utf-8",
-        )
-        result = _read_concept_briefs(wiki)
-        assert "- rnn: Recurrent neural networks" in result
-
-    def test_truncates_long_content(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        concepts = wiki / "concepts"
-        concepts.mkdir(parents=True)
-        (concepts / "long.md").write_text("A" * 300, encoding="utf-8")
-        result = _read_concept_briefs(wiki)
-        brief_line = result.split("\n")[0]
-        # slug + ": " + 150 chars = well under 200
-        assert len(brief_line) < 200
-
-    def test_sorted_alphabetically(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        concepts = wiki / "concepts"
-        concepts.mkdir(parents=True)
-        (concepts / "zebra.md").write_text("Zebra concept.", encoding="utf-8")
-        (concepts / "alpha.md").write_text("Alpha concept.", encoding="utf-8")
-        result = _read_concept_briefs(wiki)
-        lines = result.strip().split("\n")
-        assert lines[0].startswith("- alpha:")
-        assert lines[1].startswith("- zebra:")
-```
-
-- [ ] **Step 2: Run test to verify it fails**
-
-Run: `pytest tests/test_compiler.py::TestReadConceptBriefs -v`
-Expected: FAIL with `ImportError: cannot import name '_read_concept_briefs'`
-
-- [ ] **Step 3: Implement `_read_concept_briefs`**
-
-Add to `openkb/agent/compiler.py` in the File I/O helpers section (after `_read_wiki_context`):
-
-```python
-def _read_concept_briefs(wiki_dir: Path) -> str:
-    """Read existing concept pages and return compact briefs for the LLM.
-
-    Returns a string like:
-        - attention: Attention allows models to focus on relevant input parts...
-        - transformer: The Transformer is a neural network architecture...
-
-    Or "(none yet)" if no concept pages exist.
-    """
-    concepts_dir = wiki_dir / "concepts"
-    if not concepts_dir.exists():
-        return "(none yet)"
-    briefs = []
-    for p in sorted(concepts_dir.glob("*.md")):
-        text = p.read_text(encoding="utf-8")
-        # Skip YAML frontmatter
-        if text.startswith("---"):
-            parts = text.split("---", 2)
-            body = parts[2].strip() if len(parts) >= 3 else ""
-        else:
-            body = text.strip()
-        brief = body[:150].replace("\n", " ")
-        if brief:
-            briefs.append(f"- {p.stem}: {brief}")
-    return "\n".join(briefs) or "(none yet)"
-```
-
-- [ ] **Step 4: Run test to verify it passes**
-
-Run: `pytest tests/test_compiler.py::TestReadConceptBriefs -v`
-Expected: All 6 tests PASS
-
-- [ ] **Step 5: Update the import in test file**
-
-Add `_read_concept_briefs` to the existing import block at the top of `tests/test_compiler.py`:
-
-```python
-from openkb.agent.compiler import (
-    compile_long_doc,
-    compile_short_doc,
-    _parse_json,
-    _write_summary,
-    _write_concept,
-    _update_index,
-    _read_wiki_context,
-    _read_concept_briefs,
-)
-```
-
-- [ ] **Step 6: Commit**
-
-```bash
-git add openkb/agent/compiler.py tests/test_compiler.py
-git commit -m "feat: add _read_concept_briefs for concept dedup context"
-```
-
----
-
-### Task 2: Replace prompt template and update JSON parsing
-
-**Files:**
-- Modify: `openkb/agent/compiler.py:53-70` (prompt templates section)
-- Modify: `tests/test_compiler.py:21-31` (TestParseJson section)
-
-- [ ] **Step 1: Write the failing test for new JSON format**
-
-Add to `tests/test_compiler.py`:
-
-```python
-class TestParseConceptsPlan:
-    def test_dict_format(self):
-        text = json.dumps({
-            "create": [{"name": "foo", "title": "Foo"}],
-            "update": [{"name": "bar", "title": "Bar"}],
-            "related": ["baz"],
-        })
-        parsed = _parse_json(text)
-        assert isinstance(parsed, dict)
-        assert len(parsed["create"]) == 1
-        assert len(parsed["update"]) == 1
-        assert parsed["related"] == ["baz"]
-
-    def test_fallback_list_format(self):
-        """If LLM returns old flat array, _parse_json still works."""
-        text = json.dumps([{"name": "foo", "title": "Foo"}])
-        parsed = _parse_json(text)
-        assert isinstance(parsed, list)
-
-    def test_fenced_dict(self):
-        text = '```json\n{"create": [], "update": [], "related": []}\n```'
-        parsed = _parse_json(text)
-        assert isinstance(parsed, dict)
-        assert parsed["create"] == []
-```
-
-- [ ] **Step 2: Run test to verify it passes (these use existing `_parse_json`)**
-
-Run: `pytest tests/test_compiler.py::TestParseConceptsPlan -v`
-Expected: All 3 PASS — `_parse_json` already handles dicts. This confirms compatibility.
-
-- [ ] **Step 3: Replace `_CONCEPTS_LIST_USER` with `_CONCEPTS_PLAN_USER`**
-
-In `openkb/agent/compiler.py`, replace the `_CONCEPTS_LIST_USER` template (lines 53-70) with:
-
-```python
-_CONCEPTS_PLAN_USER = """\
-Based on the summary above, decide how to update the wiki's concept pages.
-
-Existing concept pages:
-{concept_briefs}
-
-Return a JSON object with three keys:
-
-1. "create" — new concepts not covered by any existing page. Array of objects:
-   {{"name": "concept-slug", "title": "Human-Readable Title"}}
-
-2. "update" — existing concepts that have significant new information from \
-this document worth integrating. Array of objects:
-   {{"name": "existing-slug", "title": "Existing Title"}}
-
-3. "related" — existing concepts tangentially related to this document but \
-not needing content changes, just a cross-reference link. Array of slug strings.
-
-Rules:
-- For the first few documents, create 2-3 foundational concepts at most.
-- Do NOT create a concept that overlaps with an existing one — use "update".
-- Do NOT create concepts that are just the document topic itself.
-- "related" is for lightweight cross-linking only, no content rewrite needed.
-
-Return ONLY valid JSON, no fences, no explanation.
-"""
-```
-
-- [ ] **Step 4: Add `_CONCEPT_UPDATE_USER` template**
-
-Add after `_CONCEPT_PAGE_USER` (after line 82):
-
-```python
-_CONCEPT_UPDATE_USER = """\
-Update the concept page for: {title}
-
-Current content of this page:
-{existing_content}
-
-New information from document "{doc_name}" (summarized above) should be \
-integrated into this page. Rewrite the full page incorporating the new \
-information naturally — do not just append. Maintain existing \
-[[wikilinks]] and add new ones where appropriate.
-
-Return ONLY the Markdown content (no frontmatter, no code fences).
-"""
-```
-
-- [ ] **Step 5: Run all existing tests to verify nothing breaks**
-
-Run: `pytest tests/test_compiler.py -v`
-Expected: All PASS (templates aren't tested directly, only via integration tests which we'll update later)
-
-- [ ] **Step 6: Commit**
-
-```bash
-git add openkb/agent/compiler.py tests/test_compiler.py
-git commit -m "feat: add concepts plan and update prompt templates"
-```
-
----
-
-### Task 3: Add `_add_related_link` and test
-
-**Files:**
-- Modify: `openkb/agent/compiler.py` (File I/O helpers section, after `_write_concept`)
-- Modify: `tests/test_compiler.py`
-
-- [ ] **Step 1: Write the failing test**
-
-Add to `tests/test_compiler.py`:
-
-```python
-from openkb.agent.compiler import _add_related_link
-
-class TestAddRelatedLink:
-    def test_adds_see_also_link(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        concepts = wiki / "concepts"
-        concepts.mkdir(parents=True)
-        (concepts / "attention.md").write_text(
-            "---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nSome content.",
-            encoding="utf-8",
-        )
-        _add_related_link(wiki, "attention", "new-doc", "paper2.pdf")
-        text = (concepts / "attention.md").read_text()
-        assert "[[summaries/new-doc]]" in text
-        assert "paper2.pdf" in text
-
-    def test_skips_if_already_linked(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        concepts = wiki / "concepts"
-        concepts.mkdir(parents=True)
-        (concepts / "attention.md").write_text(
-            "---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nSee also: [[summaries/new-doc]]",
-            encoding="utf-8",
-        )
-        _add_related_link(wiki, "attention", "new-doc", "paper1.pdf")
-        text = (concepts / "attention.md").read_text()
-        # Should not duplicate
-        assert text.count("[[summaries/new-doc]]") == 1
-
-    def test_skips_if_file_missing(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        wiki.mkdir()
-        # Should not raise
-        _add_related_link(wiki, "nonexistent", "doc", "file.pdf")
-```
-
-- [ ] **Step 2: Run test to verify it fails**
-
-Run: `pytest tests/test_compiler.py::TestAddRelatedLink -v`
-Expected: FAIL with `ImportError: cannot import name '_add_related_link'`
-
-- [ ] **Step 3: Implement `_add_related_link`**
-
-Add to `openkb/agent/compiler.py` after `_write_concept`:
-
-```python
-def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_file: str) -> None:
-    """Add a cross-reference link to an existing concept page (no LLM call)."""
-    concepts_dir = wiki_dir / "concepts"
-    path = concepts_dir / f"{concept_slug}.md"
-    if not path.exists():
-        return
-
-    text = path.read_text(encoding="utf-8")
-    link = f"[[summaries/{doc_name}]]"
-    if link in text:
-        return
-
-    # Update sources in frontmatter
-    if source_file not in text:
-        if text.startswith("---"):
-            end = text.index("---", 3)
-            fm = text[:end + 3]
-            body = text[end + 3:]
-            if "sources:" in fm:
-                fm = fm.replace("sources: [", f"sources: [{source_file}, ")
-            else:
-                fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
-            text = fm + body
-        else:
-            text = f"---\nsources: [{source_file}]\n---\n\n" + text
-
-    text += f"\n\nSee also: {link}"
-    path.write_text(text, encoding="utf-8")
-```
-
-- [ ] **Step 4: Run test to verify it passes**
-
-Run: `pytest tests/test_compiler.py::TestAddRelatedLink -v`
-Expected: All 3 tests PASS
-
-- [ ] **Step 5: Update the import in test file**
-
-Add `_add_related_link` to the import block at top of `tests/test_compiler.py`.
-
-- [ ] **Step 6: Commit**
-
-```bash
-git add openkb/agent/compiler.py tests/test_compiler.py
-git commit -m "feat: add _add_related_link for code-only cross-referencing"
-```
-
----
-
-### Task 4: Extract `_compile_concepts` and refactor both public functions
-
-**Files:**
-- Modify: `openkb/agent/compiler.py:290-509` (Public API section — full rewrite)
-- Modify: `tests/test_compiler.py:153-267` (integration tests)
-
-This is the core task. It extracts the shared Steps 2-4 into `_compile_concepts`, updates both public functions to call it, and switches to the new concepts plan format.
-
-- [ ] **Step 1: Write integration test for new create/update/related flow**
-
-Add to `tests/test_compiler.py`:
-
-```python
-class TestCompileConceptsPlan:
-    """Integration tests for the new create/update/related flow."""
-
-    @pytest.mark.asyncio
-    async def test_create_and_update_flow(self, tmp_path):
-        """New doc creates one concept and updates an existing one."""
-        wiki = tmp_path / "wiki"
-        (wiki / "sources").mkdir(parents=True)
-        (wiki / "summaries").mkdir(parents=True)
-        concepts_dir = wiki / "concepts"
-        concepts_dir.mkdir(parents=True)
-        (wiki / "index.md").write_text(
-            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
-            encoding="utf-8",
-        )
-        # Pre-existing concept
-        (concepts_dir / "attention.md").write_text(
-            "---\nsources: [old-paper.pdf]\n---\n\n# Attention\n\nOld content about attention.",
-            encoding="utf-8",
-        )
-
-        source_path = wiki / "sources" / "new-paper.md"
-        source_path.write_text("# New Paper\n\nContent about flash attention and transformers.", encoding="utf-8")
-        (tmp_path / ".openkb").mkdir()
-        (tmp_path / "raw").mkdir()
-        (tmp_path / "raw" / "new-paper.pdf").write_bytes(b"fake")
-
-        summary_resp = "This paper introduces flash attention, improving on attention mechanisms."
-        plan_resp = json.dumps({
-            "create": [{"name": "flash-attention", "title": "Flash Attention"}],
-            "update": [{"name": "attention", "title": "Attention Mechanism"}],
-            "related": [],
-        })
-        create_page_resp = "# Flash Attention\n\nAn efficient attention algorithm."
-        update_page_resp = "# Attention\n\nUpdated content with flash attention details."
-
-        with patch("openkb.agent.compiler.litellm") as mock_litellm:
-            mock_litellm.completion = MagicMock(
-                side_effect=_mock_completion([summary_resp, plan_resp])
-            )
-            mock_litellm.acompletion = AsyncMock(
-                side_effect=_mock_acompletion([create_page_resp, update_page_resp])
-            )
-            await compile_short_doc("new-paper", source_path, tmp_path, "gpt-4o-mini")
-
-        # New concept created
-        flash_path = concepts_dir / "flash-attention.md"
-        assert flash_path.exists()
-        assert "sources: [new-paper.pdf]" in flash_path.read_text()
-
-        # Existing concept rewritten (not appended)
-        attn_text = (concepts_dir / "attention.md").read_text()
-        assert "new-paper.pdf" in attn_text
-        assert "Updated content with flash attention details" in attn_text
-
-        # Index updated for both
-        index_text = (wiki / "index.md").read_text()
-        assert "[[concepts/flash-attention]]" in index_text
-
-    @pytest.mark.asyncio
-    async def test_related_adds_link_no_llm(self, tmp_path):
-        """Related concepts get cross-ref links without LLM calls."""
-        wiki = tmp_path / "wiki"
-        (wiki / "sources").mkdir(parents=True)
-        (wiki / "summaries").mkdir(parents=True)
-        concepts_dir = wiki / "concepts"
-        concepts_dir.mkdir(parents=True)
-        (wiki / "index.md").write_text(
-            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
-            encoding="utf-8",
-        )
-        (concepts_dir / "transformer.md").write_text(
-            "---\nsources: [old.pdf]\n---\n\n# Transformer\n\nArchitecture details.",
-            encoding="utf-8",
-        )
-
-        source_path = wiki / "sources" / "doc.md"
-        source_path.write_text("Content", encoding="utf-8")
-        (tmp_path / ".openkb").mkdir()
-        (tmp_path / "raw").mkdir()
-        (tmp_path / "raw" / "doc.pdf").write_bytes(b"fake")
-
-        summary_resp = "A short summary."
-        plan_resp = json.dumps({
-            "create": [],
-            "update": [],
-            "related": ["transformer"],
-        })
-
-        with patch("openkb.agent.compiler.litellm") as mock_litellm:
-            mock_litellm.completion = MagicMock(
-                side_effect=_mock_completion([summary_resp, plan_resp])
-            )
-            # acompletion should NOT be called (no create/update)
-            mock_litellm.acompletion = AsyncMock(side_effect=AssertionError("should not be called"))
-            await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini")
-
-        # Related concept should have cross-ref link
-        transformer_text = (concepts_dir / "transformer.md").read_text()
-        assert "[[summaries/doc]]" in transformer_text
-
-    @pytest.mark.asyncio
-    async def test_fallback_list_format(self, tmp_path):
-        """If LLM returns old flat array, treat all as create."""
-        wiki = tmp_path / "wiki"
-        (wiki / "sources").mkdir(parents=True)
-        (wiki / "summaries").mkdir(parents=True)
-        (wiki / "concepts").mkdir(parents=True)
-        (wiki / "index.md").write_text(
-            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
-            encoding="utf-8",
-        )
-        source_path = wiki / "sources" / "doc.md"
-        source_path.write_text("Content", encoding="utf-8")
-        (tmp_path / ".openkb").mkdir()
-        (tmp_path / "raw").mkdir()
-        (tmp_path / "raw" / "doc.pdf").write_bytes(b"fake")
-
-        summary_resp = "Summary."
-        # Old format: flat array
-        plan_resp = json.dumps([{"name": "foo", "title": "Foo"}])
-        page_resp = "# Foo\n\nContent."
-
-        with patch("openkb.agent.compiler.litellm") as mock_litellm:
-            mock_litellm.completion = MagicMock(
-                side_effect=_mock_completion([summary_resp, plan_resp])
-            )
-            mock_litellm.acompletion = AsyncMock(
-                side_effect=_mock_acompletion([page_resp])
-            )
-            await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini")
-
-        assert (wiki / "concepts" / "foo.md").exists()
-```
-
-- [ ] **Step 2: Run the new tests to verify they fail**
-
-Run: `pytest tests/test_compiler.py::TestCompileConceptsPlan -v`
-Expected: FAIL — the current code uses old prompt format and doesn't handle dict responses
-
-- [ ] **Step 3: Implement `_compile_concepts` and refactor public functions**
-
-Replace the entire Public API section (from `DEFAULT_COMPILE_CONCURRENCY` to end of file) in `openkb/agent/compiler.py` with:
-
-```python
-DEFAULT_COMPILE_CONCURRENCY = 5
-
-
-async def _compile_concepts(
-    wiki_dir: Path,
-    kb_dir: Path,
-    model: str,
-    system_msg: dict,
-    doc_msg: dict,
-    summary: str,
-    doc_name: str,
-    max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
-) -> None:
-    """Shared concept compilation logic: plan → create/update/related → index.
-
-    This is the core of the compilation pipeline, shared by both
-    compile_short_doc and compile_long_doc.
-    """
-    source_file = _find_source_filename(doc_name, kb_dir)
-    concept_briefs = _read_concept_briefs(wiki_dir)
-
-    # --- Concepts plan (A cached) ---
-    plan_raw = _llm_call(model, [
-        system_msg,
-        doc_msg,
-        {"role": "assistant", "content": summary},
-        {"role": "user", "content": _CONCEPTS_PLAN_USER.format(
-            concept_briefs=concept_briefs,
-        )},
-    ], "concepts-plan", max_tokens=1024)
-
-    try:
-        parsed = _parse_json(plan_raw)
-    except (json.JSONDecodeError, ValueError) as exc:
-        logger.warning("Failed to parse concepts plan: %s", exc)
-        logger.debug("Raw: %s", plan_raw)
-        _update_index(wiki_dir, doc_name, [])
-        return
-
-    # Fallback: if LLM returns flat array, treat all as create
-    if isinstance(parsed, list):
-        create_list, update_list, related_list = parsed, [], []
-    else:
-        create_list = parsed.get("create", [])
-        update_list = parsed.get("update", [])
-        related_list = parsed.get("related", [])
-
-    if not create_list and not update_list and not related_list:
-        _update_index(wiki_dir, doc_name, [])
-        return
-
-    # --- Concurrent concept generation (A cached) ---
-    semaphore = asyncio.Semaphore(max_concurrency)
-
-    async def _gen_create(concept: dict) -> tuple[str, str, bool]:
-        name = concept["name"]
-        title = concept.get("title", name)
-        async with semaphore:
-            page_content = await _llm_call_async(model, [
-                system_msg,
-                doc_msg,
-                {"role": "assistant", "content": summary},
-                {"role": "user", "content": _CONCEPT_PAGE_USER.format(
-                    title=title, doc_name=doc_name,
-                    update_instruction="",
-                )},
-            ], f"create:{name}")
-        return name, page_content, False
-
-    async def _gen_update(concept: dict) -> tuple[str, str, bool]:
-        name = concept["name"]
-        title = concept.get("title", name)
-        # Read existing page content for the LLM to integrate
-        concept_path = wiki_dir / "concepts" / f"{name}.md"
-        if concept_path.exists():
-            raw_text = concept_path.read_text(encoding="utf-8")
-            # Strip frontmatter for the LLM
-            if raw_text.startswith("---"):
-                parts = raw_text.split("---", 2)
-                existing_content = parts[2].strip() if len(parts) >= 3 else raw_text
-            else:
-                existing_content = raw_text
-        else:
-            existing_content = "(page not found — create from scratch)"
-        async with semaphore:
-            page_content = await _llm_call_async(model, [
-                system_msg,
-                doc_msg,
-                {"role": "assistant", "content": summary},
-                {"role": "user", "content": _CONCEPT_UPDATE_USER.format(
-                    title=title, doc_name=doc_name,
-                    existing_content=existing_content,
-                )},
-            ], f"update:{name}")
-        return name, page_content, True
-
-    tasks = []
-    tasks.extend(_gen_create(c) for c in create_list)
-    tasks.extend(_gen_update(c) for c in update_list)
-
-    if tasks:
-        total = len(tasks)
-        sys.stdout.write(f"    Generating {total} concept(s) (concurrency={max_concurrency})...\n")
-        sys.stdout.flush()
-
-        results = await asyncio.gather(*tasks, return_exceptions=True)
-    else:
-        results = []
-
-    concept_names = []
-    for r in results:
-        if isinstance(r, Exception):
-            logger.warning("Concept generation failed: %s", r)
-            continue
-        name, page_content, is_update = r
-        _write_concept(wiki_dir, name, page_content, source_file, is_update)
-        concept_names.append(name)
-
-    # --- Related: code-only cross-ref links ---
-    for slug in related_list:
-        _add_related_link(wiki_dir, slug, doc_name, source_file)
-
-    # --- Update index ---
-    _update_index(wiki_dir, doc_name, concept_names)
-
-
-async def compile_short_doc(
-    doc_name: str,
-    source_path: Path,
-    kb_dir: Path,
-    model: str,
-    max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
-) -> None:
-    """Compile a short document into wiki pages.
-
-    Step 1: Generate summary from full document text.
-    Step 2: Plan + generate/update concept pages (via _compile_concepts).
-    """
-    from openkb.config import load_config
-
-    openkb_dir = kb_dir / ".openkb"
-    config = load_config(openkb_dir / "config.yaml")
-    language: str = config.get("language", "en")
-
-    wiki_dir = kb_dir / "wiki"
-    schema_md = get_agents_md(wiki_dir)
-    source_file = _find_source_filename(doc_name, kb_dir)
-    content = source_path.read_text(encoding="utf-8")
-
-    system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format(
-        schema_md=schema_md, language=language,
-    )}
-    doc_msg = {"role": "user", "content": _SUMMARY_USER.format(
-        doc_name=doc_name, content=content,
-    )}
-
-    # Step 1: Generate summary
-    summary = _llm_call(model, [system_msg, doc_msg], "summary")
-    _write_summary(wiki_dir, doc_name, source_file, summary)
-
-    # Step 2: Compile concepts
-    await _compile_concepts(
-        wiki_dir, kb_dir, model, system_msg, doc_msg, summary,
-        doc_name, max_concurrency,
-    )
-
-
-async def compile_long_doc(
-    doc_name: str,
-    summary_path: Path,
-    doc_id: str,
-    kb_dir: Path,
-    model: str,
-    max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
-) -> None:
-    """Compile a long (PageIndex) document into wiki concept pages.
-
-    The summary page is already written by the indexer. This function
-    generates an overview, then plans + generates/updates concept pages.
-    """
-    from openkb.config import load_config
-
-    openkb_dir = kb_dir / ".openkb"
-    config = load_config(openkb_dir / "config.yaml")
-    language: str = config.get("language", "en")
-
-    wiki_dir = kb_dir / "wiki"
-    schema_md = get_agents_md(wiki_dir)
-    summary_text = summary_path.read_text(encoding="utf-8")
-
-    system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format(
-        schema_md=schema_md, language=language,
-    )}
-    doc_msg = {"role": "user", "content": _LONG_DOC_SUMMARY_USER.format(
-        doc_name=doc_name, doc_id=doc_id, content=summary_text,
-    )}
-
-    # Step 1: Generate overview
-    overview = _llm_call(model, [system_msg, doc_msg], "overview")
-
-    # Step 2: Compile concepts
-    await _compile_concepts(
-        wiki_dir, kb_dir, model, system_msg, doc_msg, overview,
-        doc_name, max_concurrency,
-    )
-```
-
-- [ ] **Step 4: Update existing integration tests**
-
-Update `TestCompileShortDoc.test_full_pipeline` — the concepts-list response now needs to be the new dict format:
-
-```python
-class TestCompileShortDoc:
-    @pytest.mark.asyncio
-    async def test_full_pipeline(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        (wiki / "sources").mkdir(parents=True)
-        (wiki / "summaries").mkdir(parents=True)
-        (wiki / "concepts").mkdir(parents=True)
-        (wiki / "index.md").write_text(
-            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
-            encoding="utf-8",
-        )
-        source_path = wiki / "sources" / "test-doc.md"
-        source_path.write_text("# Test Doc\n\nSome content about transformers.", encoding="utf-8")
-        (tmp_path / ".openkb").mkdir()
-        (tmp_path / "raw").mkdir()
-        (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake")
-
-        summary_response = "# Summary\n\nThis document discusses transformers."
-        plan_response = json.dumps({
-            "create": [{"name": "transformer", "title": "Transformer"}],
-            "update": [],
-            "related": [],
-        })
-        concept_page_response = "# Transformer\n\nA neural network architecture."
-
-        with patch("openkb.agent.compiler.litellm") as mock_litellm:
-            mock_litellm.completion = MagicMock(
-                side_effect=_mock_completion([summary_response, plan_response])
-            )
-            mock_litellm.acompletion = AsyncMock(
-                side_effect=_mock_acompletion([concept_page_response])
-            )
-            await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini")
-
-        summary_path = wiki / "summaries" / "test-doc.md"
-        assert summary_path.exists()
-        assert "sources: [test-doc.pdf]" in summary_path.read_text()
-
-        concept_path = wiki / "concepts" / "transformer.md"
-        assert concept_path.exists()
-        assert "sources: [test-doc.pdf]" in concept_path.read_text()
-
-        index_text = (wiki / "index.md").read_text()
-        assert "[[summaries/test-doc]]" in index_text
-        assert "[[concepts/transformer]]" in index_text
-```
-
-Update `TestCompileShortDoc.test_handles_bad_json` — no changes needed (bad JSON still triggers fallback).
-
-Update `TestCompileLongDoc.test_full_pipeline`:
-
-```python
-class TestCompileLongDoc:
-    @pytest.mark.asyncio
-    async def test_full_pipeline(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        (wiki / "summaries").mkdir(parents=True)
-        (wiki / "concepts").mkdir(parents=True)
-        (wiki / "index.md").write_text(
-            "# Index\n\n## Documents\n\n## Concepts\n",
-            encoding="utf-8",
-        )
-        summary_path = wiki / "summaries" / "big-doc.md"
-        summary_path.write_text("# Big Doc\n\nPageIndex summary tree.", encoding="utf-8")
-        openkb_dir = tmp_path / ".openkb"
-        openkb_dir.mkdir()
-        (openkb_dir / "config.yaml").write_text("model: gpt-4o-mini\n")
-        (tmp_path / "raw").mkdir()
-        (tmp_path / "raw" / "big-doc.pdf").write_bytes(b"fake")
-
-        overview_response = "Overview of the big document."
-        plan_response = json.dumps({
-            "create": [{"name": "deep-learning", "title": "Deep Learning"}],
-            "update": [],
-            "related": [],
-        })
-        concept_page_response = "# Deep Learning\n\nA subfield of ML."
-
-        with patch("openkb.agent.compiler.litellm") as mock_litellm:
-            mock_litellm.completion = MagicMock(
-                side_effect=_mock_completion([overview_response, plan_response])
-            )
-            mock_litellm.acompletion = AsyncMock(
-                side_effect=_mock_acompletion([concept_page_response])
-            )
-            await compile_long_doc(
-                "big-doc", summary_path, "doc-123", tmp_path, "gpt-4o-mini"
-            )
-
-        concept_path = wiki / "concepts" / "deep-learning.md"
-        assert concept_path.exists()
-        assert "Deep Learning" in concept_path.read_text()
-
-        index_text = (wiki / "index.md").read_text()
-        assert "[[summaries/big-doc]]" in index_text
-        assert "[[concepts/deep-learning]]" in index_text
-```
-
-- [ ] **Step 5: Run all tests**
-
-Run: `pytest tests/test_compiler.py -v`
-Expected: All PASS
-
-- [ ] **Step 6: Run the full test suite**
-
-Run: `pytest tests/ -v`
-Expected: All 149+ tests PASS
-
-- [ ] **Step 7: Commit**
-
-```bash
-git add openkb/agent/compiler.py tests/test_compiler.py
-git commit -m "feat: concept dedup with briefs, update/related paths, extract _compile_concepts"
-```
-
----
-
-### Task 5: Clean up old references and update module docstring
-
-**Files:**
-- Modify: `openkb/agent/compiler.py:1-9` (module docstring)
-
-- [ ] **Step 1: Update module docstring**
-
-Replace the docstring at the top of `openkb/agent/compiler.py`:
-
-```python
-"""Wiki compilation pipeline for OpenKB.
-
-Pipeline leveraging LLM prompt caching:
-  Step 1: Build base context A (schema + document content).
-  Step 2: A → generate summary.
-  Step 3: A + summary → concepts plan (create/update/related).
-  Step 4: Concurrent LLM calls (A cached) → generate new + rewrite updated concepts.
-  Step 5: Code adds cross-ref links to related concepts, updates index.
-"""
-```
-
-- [ ] **Step 2: Verify `_CONCEPTS_LIST_USER` is fully removed**
-
-Search for any remaining references to `_CONCEPTS_LIST_USER` in the codebase:
-
-Run: `grep -r "_CONCEPTS_LIST_USER" openkb/ tests/`
-Expected: No matches
-
-- [ ] **Step 3: Run full test suite one final time**
-
-Run: `pytest tests/ -q`
-Expected: All tests pass
-
-- [ ] **Step 4: Commit**
-
-```bash
-git add openkb/agent/compiler.py
-git commit -m "chore: update compiler docstring for new pipeline"
-```
diff --git a/docs/superpowers/plans/2026-04-09-retrieve-redesign.md b/docs/superpowers/plans/2026-04-09-retrieve-redesign.md
deleted file mode 100644
index 3c659bc..0000000
--- a/docs/superpowers/plans/2026-04-09-retrieve-redesign.md
+++ /dev/null
@@ -1,1104 +0,0 @@
-# Retrieve Redesign Implementation Plan
-
-> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
-
-**Goal:** Unify query across long/short docs, add brief summaries to index.md and frontmatter, store long doc sources as JSON with per-page access.
-
-**Architecture:** (1) LLM prompts return `{"brief", "content"}` JSON — briefs flow into frontmatter and index.md. (2) Indexer stores long doc pages as JSON array. (3) New `get_page_content` tool replaces `pageindex_retrieve`. (4) Query agent uses same tools for all docs.
-
-**Tech Stack:** Python, litellm, asyncio, pytest
-
----
-
-### Task 1: Add `get_page_content` tool and `parse_pages` helper
-
-**Files:**
-- Modify: `openkb/agent/tools.py`
-- Modify: `tests/test_agent_tools.py`
-
-- [ ] **Step 1: Write failing tests**
-
-Add to `tests/test_agent_tools.py`:
-
-```python
-from openkb.agent.tools import get_page_content, parse_pages
-
-class TestParsePages:
-    def test_single_page(self):
-        assert parse_pages("3") == [3]
-
-    def test_range(self):
-        assert parse_pages("3-5") == [3, 4, 5]
-
-    def test_comma_separated(self):
-        assert parse_pages("1,3,5") == [1, 3, 5]
-
-    def test_mixed(self):
-        assert parse_pages("1-3,7,10-12") == [1, 2, 3, 7, 10, 11, 12]
-
-    def test_deduplication(self):
-        assert parse_pages("3,3,3") == [3]
-
-    def test_sorted(self):
-        assert parse_pages("5,1,3") == [1, 3, 5]
-
-    def test_ignores_zero_and_negative(self):
-        assert parse_pages("0,-1,3") == [3]
-
-
-class TestGetPageContent:
-    def test_reads_pages_from_json(self, tmp_path):
-        import json
-        wiki_root = str(tmp_path)
-        sources = tmp_path / "sources"
-        sources.mkdir()
-        pages = [
-            {"page": 1, "content": "Page one text."},
-            {"page": 2, "content": "Page two text."},
-            {"page": 3, "content": "Page three text."},
-        ]
-        (sources / "paper.json").write_text(json.dumps(pages), encoding="utf-8")
-
-        result = get_page_content("paper", "1,3", wiki_root)
-        assert "[Page 1]" in result
-        assert "Page one text." in result
-        assert "[Page 3]" in result
-        assert "Page three text." in result
-        assert "Page two" not in result
-
-    def test_returns_error_for_missing_file(self, tmp_path):
-        wiki_root = str(tmp_path)
-        (tmp_path / "sources").mkdir()
-        result = get_page_content("nonexistent", "1", wiki_root)
-        assert "not found" in result.lower()
-
-    def test_returns_error_for_no_matching_pages(self, tmp_path):
-        import json
-        wiki_root = str(tmp_path)
-        sources = tmp_path / "sources"
-        sources.mkdir()
-        pages = [{"page": 1, "content": "Only page."}]
-        (sources / "paper.json").write_text(json.dumps(pages), encoding="utf-8")
-
-        result = get_page_content("paper", "99", wiki_root)
-        assert "no content" in result.lower() or result.strip() == ""
-
-    def test_includes_images_info(self, tmp_path):
-        import json
-        wiki_root = str(tmp_path)
-        sources = tmp_path / "sources"
-        sources.mkdir()
-        pages = [
-            {"page": 1, "content": "Text.", "images": [{"path": "images/p/img.png", "width": 100, "height": 80}]},
-        ]
-        (sources / "doc.json").write_text(json.dumps(pages), encoding="utf-8")
-
-        result = get_page_content("doc", "1", wiki_root)
-        assert "img.png" in result
-
-    def test_path_escape_denied(self, tmp_path):
-        wiki_root = str(tmp_path)
-        (tmp_path / "sources").mkdir()
-        result = get_page_content("../../etc/passwd", "1", wiki_root)
-        assert "denied" in result.lower() or "not found" in result.lower()
-```
-
-- [ ] **Step 2: Run tests to verify they fail**
-
-Run: `pytest tests/test_agent_tools.py::TestParsePages tests/test_agent_tools.py::TestGetPageContent -v`
-Expected: FAIL with `ImportError`
-
-- [ ] **Step 3: Implement `parse_pages` and `get_page_content`**
-
-Add to `openkb/agent/tools.py`:
-
-```python
-import json as _json
-
-
-def parse_pages(pages: str) -> list[int]:
-    """Parse a page specification like '3-5,7,10-12' into a sorted list of ints."""
-    result: set[int] = set()
-    for part in pages.split(","):
-        part = part.strip()
-        if "-" in part:
-            start_str, end_str = part.split("-", 1)
-            try:
-                start, end = int(start_str), int(end_str)
-                result.update(range(start, end + 1))
-            except ValueError:
-                continue
-        else:
-            try:
-                result.add(int(part))
-            except ValueError:
-                continue
-    return sorted(n for n in result if n >= 1)
-
-
-def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str:
-    """Get text content of specific pages from a long document.
-
-    Reads from ``wiki/sources/{doc_name}.json`` which contains a JSON array
-    of ``{"page": int, "content": str, "images": [...]}`` objects.
-
-    Args:
-        doc_name: Document name (stem, e.g. ``'attention-is-all-you-need'``).
-        pages: Page specification (e.g. ``'3-5,7,10-12'``).
-        wiki_root: Absolute path to the wiki root directory.
-
-    Returns:
-        Formatted text of requested pages, or error message if not found.
-    """
-    root = Path(wiki_root).resolve()
-    json_path = (root / "sources" / f"{doc_name}.json").resolve()
-    if not json_path.is_relative_to(root):
-        return "Access denied: path escapes wiki root."
-    if not json_path.exists():
-        return f"Document not found: {doc_name}. No sources/{doc_name}.json file."
-
-    data = _json.loads(json_path.read_text(encoding="utf-8"))
-    page_nums = set(parse_pages(pages))
-    matched = [p for p in data if p["page"] in page_nums]
-
-    if not matched:
-        return f"No content found for pages: {pages}"
-
-    parts: list[str] = []
-    for p in matched:
-        header = f"[Page {p['page']}]"
-        text = p.get("content", "")
-        if "images" in p:
-            img_refs = ", ".join(img["path"] for img in p["images"])
-            text += f"\n[Images: {img_refs}]"
-        parts.append(f"{header}\n{text}")
-
-    return "\n\n".join(parts)
-```
-
-- [ ] **Step 4: Run tests to verify they pass**
-
-Run: `pytest tests/test_agent_tools.py -v`
-Expected: All PASS
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add openkb/agent/tools.py tests/test_agent_tools.py
-git commit -m "feat: add get_page_content tool and parse_pages helper"
-```
-
----
-
-### Task 2: Change LLM prompts to return `{"brief", "content"}` JSON
-
-**Files:**
-- Modify: `openkb/agent/compiler.py` (prompt templates, lines 40-105)
-- Modify: `tests/test_compiler.py` (TestParseConceptsPlan)
-
-- [ ] **Step 1: Write test for brief+content JSON parsing**
-
-Add to `tests/test_compiler.py`:
-
-```python
-class TestParseBriefContent:
-    def test_dict_with_brief_and_content(self):
-        text = json.dumps({"brief": "A short desc", "content": "# Full page\n\nDetails."})
-        parsed = _parse_json(text)
-        assert parsed["brief"] == "A short desc"
-        assert "# Full page" in parsed["content"]
-
-    def test_plain_text_fallback(self):
-        """If LLM returns plain text, _parse_json raises — caller handles fallback."""
-        with pytest.raises((json.JSONDecodeError, ValueError)):
-            _parse_json("Just plain markdown text without JSON")
-```
-
-- [ ] **Step 2: Run test to verify it passes (existing _parse_json handles dicts)**
-
-Run: `pytest tests/test_compiler.py::TestParseBriefContent -v`
-Expected: PASS — `_parse_json` already handles dicts
-
-- [ ] **Step 3: Update `_SUMMARY_USER` prompt**
-
-Replace in `openkb/agent/compiler.py`:
-
-```python
-_SUMMARY_USER = """\
-New document: {doc_name}
-
-Full text:
-{content}
-
-Write a summary page for this document in Markdown.
-
-Return a JSON object with two keys:
-- "brief": A single sentence (under 100 chars) describing the document's main contribution
-- "content": The full summary in Markdown. Include key concepts, findings, ideas, \
-and [[wikilinks]] to concepts that could become cross-document concept pages
-
-Return ONLY valid JSON, no fences.
-"""
-```
-
-- [ ] **Step 4: Update `_CONCEPT_PAGE_USER` prompt**
-
-Replace in `openkb/agent/compiler.py`:
-
-```python
-_CONCEPT_PAGE_USER = """\
-Write the concept page for: {title}
-
-This concept relates to the document "{doc_name}" summarized above.
-{update_instruction}
-
-Return a JSON object with two keys:
-- "brief": A single sentence (under 100 chars) defining this concept
-- "content": The full concept page in Markdown. Include clear explanation, \
-key details from the source document, and [[wikilinks]] to related concepts \
-and [[summaries/{doc_name}]]
-
-Return ONLY valid JSON, no fences.
-"""
-```
-
-- [ ] **Step 5: Update `_CONCEPT_UPDATE_USER` prompt**
-
-Replace in `openkb/agent/compiler.py`:
-
-```python
-_CONCEPT_UPDATE_USER = """\
-Update the concept page for: {title}
-
-Current content of this page:
-{existing_content}
-
-New information from document "{doc_name}" (summarized above) should be \
-integrated into this page. Rewrite the full page incorporating the new \
-information naturally — do not just append. Maintain existing \
-[[wikilinks]] and add new ones where appropriate.
-
-Return a JSON object with two keys:
-- "brief": A single sentence (under 100 chars) defining this concept (may differ from before)
-- "content": The rewritten full concept page in Markdown
-
-Return ONLY valid JSON, no fences.
-"""
-```
-
-- [ ] **Step 6: Run all tests (prompts aren't tested directly)**
-
-Run: `pytest tests/test_compiler.py -v`
-Expected: All PASS
-
-- [ ] **Step 7: Commit**
-
-```bash
-git add openkb/agent/compiler.py tests/test_compiler.py
-git commit -m "feat: update LLM prompts to return brief+content JSON"
-```
-
----
-
-### Task 3: Update `_write_summary` and `_write_concept` to store `brief` in frontmatter
-
-**Files:**
-- Modify: `openkb/agent/compiler.py` (lines 274-320, `_write_summary` and `_write_concept`)
-- Modify: `tests/test_compiler.py`
-
-- [ ] **Step 1: Write failing tests**
-
-Update existing and add new tests in `tests/test_compiler.py`:
-
-```python
-class TestWriteSummary:
-    def test_writes_with_frontmatter(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        wiki.mkdir()
-        _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.", brief="Introduces transformers")
-        path = wiki / "summaries" / "my-doc.md"
-        assert path.exists()
-        text = path.read_text()
-        assert "sources: [my-doc.pdf]" in text
-        assert "brief: Introduces transformers" in text
-        assert "# Summary" in text
-
-    def test_writes_without_brief(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        wiki.mkdir()
-        _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.")
-        path = wiki / "summaries" / "my-doc.md"
-        text = path.read_text()
-        assert "sources: [my-doc.pdf]" in text
-        assert "brief:" not in text
-```
-
-Update `TestWriteConcept`:
-
-```python
-class TestWriteConcept:
-    def test_new_concept_with_brief(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        wiki.mkdir()
-        _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False, brief="Mechanism for selective focus")
-        path = wiki / "concepts" / "attention.md"
-        assert path.exists()
-        text = path.read_text()
-        assert "sources: [paper.pdf]" in text
-        assert "brief: Mechanism for selective focus" in text
-        assert "# Attention" in text
-
-    def test_new_concept(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        wiki.mkdir()
-        _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False)
-        path = wiki / "concepts" / "attention.md"
-        assert path.exists()
-        text = path.read_text()
-        assert "sources: [paper.pdf]" in text
-        assert "# Attention" in text
-
-    def test_update_concept_appends_source(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        concepts = wiki / "concepts"
-        concepts.mkdir(parents=True)
-        (concepts / "attention.md").write_text(
-            "---\nsources: [paper1.pdf]\nbrief: Old brief\n---\n\n# Attention\n\nOld content.",
-            encoding="utf-8",
-        )
-        _write_concept(wiki, "attention", "New info from paper2.", "paper2.pdf", True, brief="Updated brief")
-        text = (concepts / "attention.md").read_text()
-        assert "paper2.pdf" in text
-        assert "paper1.pdf" in text
-        assert "brief: Updated brief" in text
-        assert "New info from paper2." in text
-```
-
-- [ ] **Step 2: Run tests to verify they fail**
-
-Run: `pytest tests/test_compiler.py::TestWriteSummary tests/test_compiler.py::TestWriteConcept -v`
-Expected: FAIL — `_write_summary` and `_write_concept` don't accept `brief` parameter
-
-- [ ] **Step 3: Update `_write_summary` to accept `brief`**
-
-```python
-def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str, brief: str = "") -> None:
-    """Write summary page with frontmatter."""
-    summaries_dir = wiki_dir / "summaries"
-    summaries_dir.mkdir(parents=True, exist_ok=True)
-    fm_lines = [f"sources: [{source_file}]"]
-    if brief:
-        fm_lines.append(f"brief: {brief}")
-    frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n"
-    (summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8")
-```
-
-- [ ] **Step 4: Update `_write_concept` to accept `brief`**
-
-Add `brief: str = ""` parameter to `_write_concept`. In the new-concept branch:
-
-```python
-    else:
-        fm_lines = [f"sources: [{source_file}]"]
-        if brief:
-            fm_lines.append(f"brief: {brief}")
-        frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n"
-        path.write_text(frontmatter + content, encoding="utf-8")
-```
-
-In the update branch, after updating sources in frontmatter, also update brief:
-
-```python
-    if is_update and path.exists():
-        existing = path.read_text(encoding="utf-8")
-        if source_file not in existing:
-            # ... existing frontmatter update logic ...
-        # Update brief in frontmatter if provided
-        if brief and existing.startswith("---"):
-            end = existing.find("---", 3)
-            if end != -1:
-                fm = existing[:end + 3]
-                body = existing[end + 3:]
-                if "brief:" in fm:
-                    import re
-                    fm = re.sub(r"brief:.*", f"brief: {brief}", fm)
-                else:
-                    fm = fm.replace("---\n", f"---\nbrief: {brief}\n", 1)
-                existing = fm + body
-        path.write_text(existing, encoding="utf-8")
-```
-
-- [ ] **Step 5: Run tests to verify they pass**
-
-Run: `pytest tests/test_compiler.py::TestWriteSummary tests/test_compiler.py::TestWriteConcept -v`
-Expected: All PASS
-
-- [ ] **Step 6: Commit**
-
-```bash
-git add openkb/agent/compiler.py tests/test_compiler.py
-git commit -m "feat: store brief in frontmatter of summary and concept pages"
-```
-
----
-
-### Task 4: Update `_update_index` to include briefs, and update `_read_concept_briefs` to read from frontmatter
-
-**Files:**
-- Modify: `openkb/agent/compiler.py` (lines 233-261 and 408-430)
-- Modify: `tests/test_compiler.py`
-
-- [ ] **Step 1: Write failing tests for `_update_index` with briefs**
-
-```python
-class TestUpdateIndex:
-    def test_appends_entries_with_briefs(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        wiki.mkdir()
-        (wiki / "index.md").write_text(
-            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
-            encoding="utf-8",
-        )
-        _update_index(wiki, "my-doc", ["attention", "transformer"],
-                       doc_brief="Introduces transformers",
-                       concept_briefs={"attention": "Focus mechanism", "transformer": "NN architecture"})
-        text = (wiki / "index.md").read_text()
-        assert "[[summaries/my-doc]] — Introduces transformers" in text
-        assert "[[concepts/attention]] — Focus mechanism" in text
-        assert "[[concepts/transformer]] — NN architecture" in text
-
-    def test_no_duplicates(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        wiki.mkdir()
-        (wiki / "index.md").write_text(
-            "# Index\n\n## Documents\n- [[summaries/my-doc]] — Old brief\n\n## Concepts\n",
-            encoding="utf-8",
-        )
-        _update_index(wiki, "my-doc", [], doc_brief="New brief")
-        text = (wiki / "index.md").read_text()
-        assert text.count("[[summaries/my-doc]]") == 1
-
-    def test_backwards_compat_no_briefs(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        wiki.mkdir()
-        (wiki / "index.md").write_text(
-            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
-            encoding="utf-8",
-        )
-        _update_index(wiki, "my-doc", ["attention"])
-        text = (wiki / "index.md").read_text()
-        assert "[[summaries/my-doc]]" in text
-        assert "[[concepts/attention]]" in text
-```
-
-Write test for updated `_read_concept_briefs`:
-
-```python
-class TestReadConceptBriefs:
-    # ... keep existing tests ...
-
-    def test_reads_brief_from_frontmatter(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        concepts = wiki / "concepts"
-        concepts.mkdir(parents=True)
-        (concepts / "attention.md").write_text(
-            "---\nsources: [paper.pdf]\nbrief: Selective focus mechanism\n---\n\n# Attention\n\nLong content...",
-            encoding="utf-8",
-        )
-        result = _read_concept_briefs(wiki)
-        assert "- attention: Selective focus mechanism" in result
-
-    def test_falls_back_to_body_truncation(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        concepts = wiki / "concepts"
-        concepts.mkdir(parents=True)
-        (concepts / "old.md").write_text(
-            "---\nsources: [paper.pdf]\n---\n\nOld concept without brief field.",
-            encoding="utf-8",
-        )
-        result = _read_concept_briefs(wiki)
-        assert "- old: Old concept without brief field." in result
-```
-
-- [ ] **Step 2: Run tests to verify they fail**
-
-Run: `pytest tests/test_compiler.py::TestUpdateIndex tests/test_compiler.py::TestReadConceptBriefs -v`
-Expected: FAIL — `_update_index` doesn't accept `doc_brief`/`concept_briefs` parameters
-
-- [ ] **Step 3: Update `_update_index`**
-
-```python
-def _update_index(
-    wiki_dir: Path, doc_name: str, concept_names: list[str],
-    doc_brief: str = "", concept_briefs: dict[str, str] | None = None,
-) -> None:
-    """Append document and concept entries to index.md with optional briefs."""
-    index_path = wiki_dir / "index.md"
-    if not index_path.exists():
-        index_path.write_text(
-            "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
-            encoding="utf-8",
-        )
-
-    text = index_path.read_text(encoding="utf-8")
-
-    doc_link = f"[[summaries/{doc_name}]]"
-    if doc_link not in text:
-        doc_entry = f"- {doc_link}"
-        if doc_brief:
-            doc_entry += f" — {doc_brief}"
-        if "## Documents" in text:
-            text = text.replace("## Documents\n", f"## Documents\n{doc_entry}\n", 1)
-
-    if concept_briefs is None:
-        concept_briefs = {}
-    for name in concept_names:
-        concept_link = f"[[concepts/{name}]]"
-        if concept_link not in text:
-            concept_entry = f"- {concept_link}"
-            if name in concept_briefs:
-                concept_entry += f" — {concept_briefs[name]}"
-            if "## Concepts" in text:
-                text = text.replace("## Concepts\n", f"## Concepts\n{concept_entry}\n", 1)
-
-    index_path.write_text(text, encoding="utf-8")
-```
-
-- [ ] **Step 4: Update `_read_concept_briefs` to read from frontmatter `brief:` field**
-
-```python
-def _read_concept_briefs(wiki_dir: Path) -> str:
-    """Read existing concept pages and return compact one-line summaries.
-
-    Reads ``brief:`` from YAML frontmatter if available, otherwise falls back
-    to the first 150 characters of the body text.
-    """
-    concepts_dir = wiki_dir / "concepts"
-    if not concepts_dir.exists():
-        return "(none yet)"
-
-    md_files = sorted(concepts_dir.glob("*.md"))
-    if not md_files:
-        return "(none yet)"
-
-    lines: list[str] = []
-    for path in md_files:
-        text = path.read_text(encoding="utf-8")
-        brief = ""
-        body = text
-        if text.startswith("---"):
-            end = text.find("---", 3)
-            if end != -1:
-                fm = text[:end + 3]
-                body = text[end + 3:]
-                # Try to extract brief from frontmatter
-                for line in fm.split("\n"):
-                    if line.startswith("brief:"):
-                        brief = line[len("brief:"):].strip()
-                        break
-        if not brief:
-            brief = body.strip().replace("\n", " ")[:150]
-        if brief:
-            lines.append(f"- {path.stem}: {brief}")
-
-    return "\n".join(lines) or "(none yet)"
-```
-
-- [ ] **Step 5: Run tests**
-
-Run: `pytest tests/test_compiler.py -v`
-Expected: All PASS
-
-- [ ] **Step 6: Commit**
-
-```bash
-git add openkb/agent/compiler.py tests/test_compiler.py
-git commit -m "feat: add briefs to index.md entries and read from frontmatter"
-```
-
----
-
-### Task 5: Wire briefs through `_compile_concepts` and public functions
-
-**Files:**
-- Modify: `openkb/agent/compiler.py` (lines 438-611, `_compile_concepts`, `compile_short_doc`, `compile_long_doc`)
-- Modify: `tests/test_compiler.py`
-
-This task connects the brief+content JSON parsing to the write functions and index update.
-
-- [ ] **Step 1: Write integration test**
-
-```python
-class TestBriefIntegration:
-    @pytest.mark.asyncio
-    async def test_short_doc_briefs_in_index_and_frontmatter(self, tmp_path):
-        wiki = tmp_path / "wiki"
-        (wiki / "sources").mkdir(parents=True)
-        (wiki / "summaries").mkdir(parents=True)
-        (wiki / "concepts").mkdir(parents=True)
-        (wiki / "index.md").write_text(
-            "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
-            encoding="utf-8",
-        )
-        source_path = wiki / "sources" / "test-doc.md"
-        source_path.write_text("# Test Doc\n\nContent.", encoding="utf-8")
-        (tmp_path / ".openkb").mkdir()
-        (tmp_path / "raw").mkdir()
-        (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake")
-
-        summary_resp = json.dumps({
-            "brief": "A paper about transformers",
-            "content": "# Summary\n\nThis paper discusses transformers.",
-        })
-        plan_resp = json.dumps({
-            "create": [{"name": "transformer", "title": "Transformer"}],
-            "update": [],
-            "related": [],
-        })
-        concept_resp = json.dumps({
-            "brief": "NN architecture using self-attention",
-            "content": "# Transformer\n\nA neural network architecture.",
-        })
-
-        with patch("openkb.agent.compiler.litellm") as mock_litellm:
-            mock_litellm.completion = MagicMock(
-                side_effect=_mock_completion([summary_resp, plan_resp])
-            )
-            mock_litellm.acompletion = AsyncMock(
-                side_effect=_mock_acompletion([concept_resp])
-            )
-            await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini")
-
-        # Check summary frontmatter has brief
-        summary_text = (wiki / "summaries" / "test-doc.md").read_text()
-        assert "brief: A paper about transformers" in summary_text
-
-        # Check concept frontmatter has brief
-        concept_text = (wiki / "concepts" / "transformer.md").read_text()
-        assert "brief: NN architecture using self-attention" in concept_text
-
-        # Check index has briefs
-        index_text = (wiki / "index.md").read_text()
-        assert "[[summaries/test-doc]] — A paper about transformers" in index_text
-        assert "[[concepts/transformer]] — NN architecture using self-attention" in index_text
-```
-
-- [ ] **Step 2: Run test to verify it fails**
-
-Run: `pytest tests/test_compiler.py::TestBriefIntegration -v`
-Expected: FAIL
-
-- [ ] **Step 3: Update `compile_short_doc` to parse brief+content from summary response**
-
-In `compile_short_doc`, replace:
-
-```python
-    # --- Step 1: Generate summary ---
-    summary = _llm_call(model, [system_msg, doc_msg], "summary")
-    _write_summary(wiki_dir, doc_name, source_file, summary)
-```
-
-With:
-
-```python
-    # --- Step 1: Generate summary ---
-    summary_raw = _llm_call(model, [system_msg, doc_msg], "summary")
-    try:
-        summary_parsed = _parse_json(summary_raw)
-        doc_brief = summary_parsed.get("brief", "")
-        summary = summary_parsed.get("content", summary_raw)
-    except (json.JSONDecodeError, ValueError):
-        doc_brief = ""
-        summary = summary_raw
-    _write_summary(wiki_dir, doc_name, source_file, summary, brief=doc_brief)
-```
-
-- [ ] **Step 4: Update `_compile_concepts` signature and wiring**
-
-Add `doc_brief: str = ""` parameter to `_compile_concepts`.
-
-In `_gen_create`, parse the response:
-
-```python
-    async def _gen_create(concept: dict) -> tuple[str, str, bool, str]:
-        name = concept["name"]
-        title = concept.get("title", name)
-        async with semaphore:
-            raw = await _llm_call_async(model, [
-                system_msg, doc_msg,
-                {"role": "assistant", "content": summary},
-                {"role": "user", "content": _CONCEPT_PAGE_USER.format(
-                    title=title, doc_name=doc_name, update_instruction="",
-                )},
-            ], f"create:{name}")
-        try:
-            parsed = _parse_json(raw)
-            brief = parsed.get("brief", "")
-            content = parsed.get("content", raw)
-        except (json.JSONDecodeError, ValueError):
-            brief, content = "", raw
-        return name, content, False, brief
-```
-
-Same for `_gen_update` — returns `tuple[str, str, bool, str]` (name, content, is_update, brief).
-
-In the results processing loop:
-
-```python
-    concept_briefs_map: dict[str, str] = {}
-    for r in results:
-        if isinstance(r, Exception):
-            logger.warning("Concept generation failed: %s", r)
-            continue
-        name, page_content, is_update, brief = r
-        _write_concept(wiki_dir, name, page_content, source_file, is_update, brief=brief)
-        concept_names.append(name)
-        if brief:
-            concept_briefs_map[name] = brief
-```
-
-Pass briefs to `_update_index`:
-
-```python
-    _update_index(wiki_dir, doc_name, concept_names,
-                  doc_brief=doc_brief, concept_briefs=concept_briefs_map)
-```
-
-- [ ] **Step 5: Update `compile_short_doc` to pass `doc_brief` to `_compile_concepts`**
-
-```python
-    await _compile_concepts(
-        wiki_dir, kb_dir, model, system_msg, doc_msg,
-        summary, doc_name, max_concurrency, doc_brief=doc_brief,
-    )
-```
-
-- [ ] **Step 6: Update `compile_long_doc` to pass `doc_brief` from `IndexResult.description`**
-
-`compile_long_doc` currently takes `doc_id` but not `description`. Add `doc_description: str = ""` parameter:
-
-```python
-async def compile_long_doc(
-    doc_name: str,
-    summary_path: Path,
-    doc_id: str,
-    kb_dir: Path,
-    model: str,
-    doc_description: str = "",
-    max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
-) -> None:
-```
-
-The `_LONG_DOC_SUMMARY_USER` stays unchanged (returns plain text, not JSON). Pass `doc_description` as `doc_brief`:
-
-```python
-    await _compile_concepts(
-        wiki_dir, kb_dir, model, system_msg, doc_msg,
-        overview, doc_name, max_concurrency, doc_brief=doc_description,
-    )
-```
-
-Also update the CLI call in `cli.py` line 135:
-
-```python
-asyncio.run(
-    compile_long_doc(doc_name, summary_path, index_result.doc_id, kb_dir, model,
-                     doc_description=index_result.description)
-)
-```
-
-- [ ] **Step 7: Update existing integration tests for new JSON response format**
-
-Update all mock LLM responses in `TestCompileShortDoc`, `TestCompileLongDoc`, and `TestCompileConceptsPlan` to return `{"brief": "...", "content": "..."}` JSON instead of plain text for summary and concept responses.
-
-- [ ] **Step 8: Run all tests**
-
-Run: `pytest tests/ -q`
-Expected: All PASS
-
-- [ ] **Step 9: Commit**
-
-```bash
-git add openkb/agent/compiler.py openkb/cli.py tests/test_compiler.py
-git commit -m "feat: wire brief+content JSON through compile pipeline to index and frontmatter"
-```
-
----
-
-### Task 6: Indexer — long doc sources from markdown to JSON
-
-**Files:**
-- Modify: `openkb/indexer.py`
-- Modify: `openkb/tree_renderer.py` (remove `render_source_md`)
-- Modify: `tests/test_indexer.py`
-
-- [ ] **Step 1: Write failing test**
-
-Update `tests/test_indexer.py`:
-
-```python
-    def test_source_page_written_as_json(self, kb_dir, sample_tree, tmp_path):
-        """Long doc source should be written as JSON, not markdown."""
-        import json as json_mod
-        doc_id = "abc-123"
-        fake_col = self._make_fake_collection(doc_id, sample_tree)
-
-        fake_client = MagicMock()
-        fake_client.collection.return_value = fake_col
-        # Mock get_page_content to return page data
-        fake_col.get_page_content.return_value = [
-            {"page": 1, "content": "Page one text."},
-            {"page": 2, "content": "Page two text."},
-        ]
-
-        pdf_path = tmp_path / "sample.pdf"
-        pdf_path.write_bytes(b"%PDF-1.4 fake")
-
-        with patch("openkb.indexer.PageIndexClient", return_value=fake_client):
-            index_long_document(pdf_path, kb_dir)
-
-        # Should be JSON, not MD
-        json_file = kb_dir / "wiki" / "sources" / "sample.json"
-        assert json_file.exists()
-        assert not (kb_dir / "wiki" / "sources" / "sample.md").exists()
-        data = json_mod.loads(json_file.read_text())
-        assert len(data) == 2
-        assert data[0]["page"] == 1
-```
-
-- [ ] **Step 2: Run test to verify it fails**
-
-Run: `pytest tests/test_indexer.py::TestIndexLongDocument::test_source_page_written_as_json -v`
-Expected: FAIL
-
-- [ ] **Step 3: Update `indexer.py` to write JSON sources**
-
-Replace the source writing block (lines 103-110) with:
-
-```python
-    # Write wiki/sources/ as JSON (per-page content from PageIndex)
-    sources_dir = kb_dir / "wiki" / "sources"
-    sources_dir.mkdir(parents=True, exist_ok=True)
-    dest_images_dir = sources_dir / "images" / pdf_path.stem
-
-    # Get per-page content from PageIndex
-    all_pages = col.get_page_content(doc_id, f"1-{doc.get('page_count', 9999)}")
-
-    # Relocate image paths
-    dest_images_dir.mkdir(parents=True, exist_ok=True)
-    for page in all_pages:
-        if "images" in page:
-            for img in page["images"]:
-                src_path = Path(img["path"])
-                if src_path.exists():
-                    filename = src_path.name
-                    dest = dest_images_dir / filename
-                    if not dest.exists():
-                        shutil.copy2(src_path, dest)
-                    img["path"] = f"images/{pdf_path.stem}/{filename}"
-
-    import json as json_mod
-    (sources_dir / f"{pdf_path.stem}.json").write_text(
-        json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",
-    )
-```
-
-Remove the `render_source_md` import and `_relocate_images` call.
-
-- [ ] **Step 4: Remove `render_source_md` from tree_renderer.py**
-
-Remove the `render_source_md` function and `_render_nodes_source` helper from `openkb/tree_renderer.py`. Keep `render_summary_md` and `_render_nodes_summary`.
-
-- [ ] **Step 5: Update existing test `test_source_page_written`**
-
-The old test checks for `.md` — update it to check for `.json` or remove it (replaced by the new test).
-
-- [ ] **Step 6: Run all tests**
-
-Run: `pytest tests/ -q`
-Expected: All PASS
-
-- [ ] **Step 7: Commit**
-
-```bash
-git add openkb/indexer.py openkb/tree_renderer.py tests/test_indexer.py
-git commit -m "feat: store long doc sources as per-page JSON, remove render_source_md"
-```
-
----
-
-### Task 7: Query agent — remove `pageindex_retrieve`, add `get_page_content`, update instructions
-
-**Files:**
-- Modify: `openkb/agent/query.py`
-- Modify: `openkb/schema.py`
-- Modify: `tests/test_query.py`
-
-- [ ] **Step 1: Write failing tests**
-
-Update `tests/test_query.py`:
-
-```python
-class TestBuildQueryAgent:
-    def test_agent_name(self, tmp_path):
-        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
-        assert agent.name == "wiki-query"
-
-    def test_agent_has_three_tools(self, tmp_path):
-        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
-        assert len(agent.tools) == 3
-
-    def test_agent_tool_names(self, tmp_path):
-        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
-        names = {t.name for t in agent.tools}
-        assert "list_files" in names
-        assert "read_file" in names
-        assert "get_page_content" in names
-        assert "pageindex_retrieve" not in names
-
-    def test_instructions_mention_get_page_content(self, tmp_path):
-        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
-        assert "get_page_content" in agent.instructions
-```
-
-- [ ] **Step 2: Run tests to verify they fail**
-
-Run: `pytest tests/test_query.py::TestBuildQueryAgent -v`
-Expected: FAIL — old signature requires `openkb_dir`
-
-- [ ] **Step 3: Rewrite `query.py`**
-
-Remove `_pageindex_retrieve_impl` entirely (~110 lines). Remove `PageIndexClient` import. Update `build_query_agent`:
-
-```python
-def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent:
-    """Build and return the Q&A agent."""
-    schema_md = get_agents_md(Path(wiki_root))
-    instructions = _QUERY_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md)
-    instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language."
-
-    @function_tool
-    def list_files(directory: str) -> str:
-        """List all Markdown files in a wiki subdirectory."""
-        return list_wiki_files(directory, wiki_root)
-
-    @function_tool
-    def read_file(path: str) -> str:
-        """Read a Markdown file from the wiki."""
-        return read_wiki_file(path, wiki_root)
-
-    @function_tool
-    def get_page_content_tool(doc_name: str, pages: str) -> str:
-        """Get text content of specific pages from a long document.
-
-        Args:
-            doc_name: Document name (e.g. 'attention-is-all-you-need').
-            pages: Page specification (e.g. '3-5,7,10-12').
-        """
-        from openkb.agent.tools import get_page_content
-        return get_page_content(doc_name, pages, wiki_root)
-
-    from agents.model_settings import ModelSettings
-
-    return Agent(
-        name="wiki-query",
-        instructions=instructions,
-        tools=[list_files, read_file, get_page_content_tool],
-        model=f"litellm/{model}",
-        model_settings=ModelSettings(parallel_tool_calls=False),
-    )
-```
-
-Update `_QUERY_INSTRUCTIONS_TEMPLATE`:
-
-```python
-_QUERY_INSTRUCTIONS_TEMPLATE = """\
-You are a knowledge-base Q&A agent. You answer questions by searching the wiki.
-
-{schema_md}
-
-## Search strategy
-1. Read index.md to understand what documents and concepts are available.
-   Each entry has a brief summary to help you judge relevance.
-2. Read relevant summary pages (summaries/) for document overviews.
-3. Read concept pages (concepts/) for cross-document synthesis.
-4. For long documents, use get_page_content(doc_name, pages) to read
-   specific pages when you need detailed content. The summary page
-   shows chapter structure with page ranges to help you decide which
-   pages to read.
-5. Synthesise a clear, well-cited answer.
-
-Always ground your answer in the wiki content. If you cannot find relevant
-information, say so clearly.
-"""
-```
-
-Update `run_query` to match new `build_query_agent` signature (remove `openkb_dir` param):
-
-```python
-async def run_query(question: str, kb_dir: Path, model: str, stream: bool = False) -> str:
-    from openkb.config import load_config
-    openkb_dir = kb_dir / ".openkb"
-    config = load_config(openkb_dir / "config.yaml")
-    language: str = config.get("language", "en")
-
-    wiki_root = str(kb_dir / "wiki")
-    agent = build_query_agent(wiki_root, model, language=language)
-    # ... rest unchanged ...
-```
-
-- [ ] **Step 4: Update `openkb/schema.py` AGENTS_MD**
-
-Add a note about `get_page_content` for long documents in the Schema:
-
-```python
-## Page Types
-- **Summary Page** (summaries/): Key content of a single source document.
-- **Concept Page** (concepts/): Cross-document topic synthesis with [[wikilinks]].
-- **Exploration Page** (explorations/): Saved query results — analyses, comparisons, syntheses.
-- **Source Page** (sources/): Full-text for short docs (.md) or per-page JSON for long docs (.json).
-- **Index Page** (index.md): One-liner summary of every page in the wiki. Auto-maintained.
-```
-
-- [ ] **Step 5: Run all tests**
-
-Run: `pytest tests/ -q`
-Expected: All PASS
-
-- [ ] **Step 6: Commit**
-
-```bash
-git add openkb/agent/query.py openkb/schema.py tests/test_query.py
-git commit -m "feat: replace pageindex_retrieve with get_page_content, unify query for all docs"
-```
-
----
-
-### Task 8: Final cleanup and full verification
-
-**Files:**
-- Modify: `openkb/indexer.py` (remove unused imports)
-- Verify all files
-
-- [ ] **Step 1: Remove unused imports**
-
-In `indexer.py`, remove `from openkb.tree_renderer import render_source_md` if still present (keep `render_summary_md`).
-
-In `query.py`, verify `PageIndexClient` import is removed.
-
-- [ ] **Step 2: Run full test suite**
-
-Run: `pytest tests/ -v`
-Expected: All PASS
-
-- [ ] **Step 3: Grep for dead references**
-
-Run: `grep -r "pageindex_retrieve\|render_source_md\|_relocate_images" openkb/ tests/`
-Expected: No matches
-
-- [ ] **Step 4: Commit**
-
-```bash
-git add -A
-git commit -m "chore: remove dead imports and references"
-```
diff --git a/docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md b/docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md
deleted file mode 100644
index 2fcd853..0000000
--- a/docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md
+++ /dev/null
@@ -1,163 +0,0 @@
-# Concept Dedup & Existing Page Update
-
-**Date:** 2026-04-09
-**Status:** Approved
-**Branch:** bugfix/compile
-
-## Problem
-
-The compiler pipeline generates concept pages per document, but:
-
-1. **No dedup** — LLM only sees concept slug names, not content. It can't reliably judge whether a new concept overlaps with an existing one. As the KB grows, concepts duplicate and diverge.
-2. **No update of existing pages** — When a new document has information relevant to existing concepts, those pages are not updated. Knowledge doesn't compound across documents.
-
-The old agent-based approach solved this (the agent could read/write wiki files freely), but was too slow — 20-30 tool-call round-trips per document.
-
-## Design
-
-Extend the existing deterministic pipeline to give the LLM enough context for dedup/update decisions, without adding agent loops or breaking prompt caching.
-
-### Prompt Caching Invariant
-
-The cached prefix `[system_msg, doc_msg]` must remain identical across all LLM calls within a single document compilation. All new context (concept briefs, existing page content) goes into messages **after** the cached prefix.
-
-### Pipeline Overview
-
-```
-Step 1: [system, doc] → summary                          (unchanged)
-Step 2: [system, doc, summary, concepts_plan_prompt] → concepts plan JSON
-Step 3a: [system, doc, summary, create_prompt] × N  → new concept pages     (concurrent)
-Step 3b: [system, doc, summary, update_prompt] × M  → rewritten concept pages (concurrent)
-Step 3c: code-only × K                              → add cross-ref links to related concepts
-Step 4: update index                                 (unchanged)
-```
-
-Steps 3a and 3b share a single semaphore and run concurrently together.
-
-### Part 1: Concept Briefs
-
-New function `_read_concept_briefs(wiki_dir)` reads existing concept pages and returns a compact summary string:
-
-```
-- attention: Attention is a mechanism that allows models to focus on relevant parts...
-- transformer-architecture: The Transformer is a neural network architecture...
-```
-
-For each concept file in `wiki/concepts/*.md`:
-- Skip YAML frontmatter
-- Take first 150 characters of body text
-- Format as `- {slug}: {brief}`
-
-This replaces the current `", ".join(existing_concepts)` in the concepts-list prompt. Pure file I/O, no LLM call.
-
-### Part 2: Concepts Plan Prompt
-
-The `_CONCEPTS_LIST_USER` template is replaced with a new `_CONCEPTS_PLAN_USER` template that asks the LLM to return a JSON object with three action types:
-
-```json
-{
-  "create": [{"name": "flash-attention", "title": "Flash Attention"}],
-  "update": [{"name": "attention", "title": "Attention Mechanism"}],
-  "related": ["transformer-architecture"]
-}
-```
-
-- **create** — New concept not covered by any existing page.
-- **update** — Existing concept with significant new information worth integrating.
-- **related** — Existing concept tangentially related; only needs a cross-reference link.
-
-The prompt includes rules:
-- Don't create concepts that overlap with existing ones — use "update" instead.
-- Don't create concepts that are just the document topic itself.
-- For first few documents, create 2-3 foundational concepts at most.
-- "related" is for lightweight cross-linking only.
-
-### Part 3: Three Execution Paths
-
-#### create (unchanged)
-
-Same as current: concurrent `_llm_call_async` with `_CONCEPT_PAGE_USER` template. Written via `_write_concept` with `is_update=False`.
-
-#### update (new)
-
-New template `_CONCEPT_UPDATE_USER`:
-
-```
-Update the concept page for: {title}
-
-Current content of this page:
-{existing_content}
-
-New information from document "{doc_name}" (summarized above) should be
-integrated into this page. Rewrite the full page incorporating the new
-information naturally. Maintain existing cross-references and add new ones
-where appropriate.
-
-Return ONLY the Markdown content (no frontmatter, no code fences).
-```
-
-Call structure: `[system_msg, doc_msg, {assistant: summary}, update_user_msg]`
-
-The cached prefix `[system_msg, doc_msg]` is shared with create calls. The `existing_content` (typically 200-500 tokens) is in the final user message only.
-
-Written via `_write_concept` with `is_update=True`. The frontmatter `sources:` list is updated to include the new source file.
-
-#### related (code-only, no LLM)
-
-For each related slug:
-1. Read the concept file
-2. If `summaries/{doc_name}` is not already linked, append `\n\nSee also: [[summaries/{doc_name}]]`
-3. Update frontmatter `sources:` list
-
-Pure file I/O, millisecond-level.
-
-### Part 4: Shared Logic Between Short and Long Doc
-
-Current `compile_short_doc` and `compile_long_doc` duplicate Steps 2-4. Extract shared logic into `_compile_concepts(wiki_dir, model, system_msg, doc_msg, summary, doc_name, kb_dir, max_concurrency)`.
-
-Public functions become:
-- `compile_short_doc`: builds context A from source text → calls `_compile_concepts`
-- `compile_long_doc`: builds context A from PageIndex summary → calls `_compile_concepts`
-
-### Part 5: JSON Parsing Fallback
-
-If the LLM returns a flat JSON array instead of the expected dict, treat it as all "create" actions:
-
-```python
-if isinstance(parsed, list):
-    create_list, update_list, related_list = parsed, [], []
-else:
-    create_list = parsed.get("create", [])
-    update_list = parsed.get("update", [])
-    related_list = parsed.get("related", [])
-```
-
-This ensures backward compatibility if the LLM doesn't follow the new format.
-
-## Token Cost Analysis
-
-Compared to current pipeline (per document with C existing concepts):
-
-| Step | Current | New | Delta |
-|------|---------|-----|-------|
-| concepts-list prompt | ~50 tokens (slug names) | ~50 + C×30 tokens (briefs) | +C×30 |
-| update calls | 0 | M × ~500 tokens (existing content) | +M×500 |
-| related | 0 | 0 (code-only) | 0 |
-
-At C=30 existing concepts: +900 tokens in concepts-list prompt.
-At M=2 update calls: +1000 tokens total.
-
-Total overhead: ~2000 tokens per document. Negligible compared to document content (5K-20K tokens).
-
-## Files Changed
-
-- `openkb/agent/compiler.py` — all changes
-  - New: `_read_concept_briefs()`, `_CONCEPTS_PLAN_USER`, `_CONCEPT_UPDATE_USER`, `_add_related_link()`, `_compile_concepts()`
-  - Modified: `compile_short_doc()`, `compile_long_doc()`, `_parse_json()` caller logic
-- `tests/test_compiler.py` — update tests for new JSON format and update/related paths
-
-## Not In Scope
-
-- Concept briefs truncation/filtering for very large KBs (100+ concepts) — revisit when needed
-- Interactive ingest (human-in-the-loop checkpoint) — separate feature
-- Lint --fix auto-repair — separate feature
diff --git a/docs/superpowers/specs/2026-04-09-retrieve-redesign.md b/docs/superpowers/specs/2026-04-09-retrieve-redesign.md
deleted file mode 100644
index 15224be..0000000
--- a/docs/superpowers/specs/2026-04-09-retrieve-redesign.md
+++ /dev/null
@@ -1,262 +0,0 @@
-# Retrieve Redesign: Unified Query, Brief Summaries, and Local Page Content
-
-**Date:** 2026-04-09
-**Status:** Approved
-**Branch:** bugfix/compile
-
-## Problems
-
-### 1. Long vs Short Doc Split in Query
-
-The query agent treats long documents (PageIndex-indexed) and short documents differently:
-
-- **Short docs**: agent reads `wiki/sources/{name}.md` via `read_file`
-- **Long docs**: agent calls `pageindex_retrieve(doc_id, question)` — a black-box RAG call
-
-**Design Principle**: PageIndex is an indexer, not a retriever. Query-time retrieval should be done by the agent navigating the wiki, using the same tools for all documents.
-
-### 2. index.md Has No Brief Summaries
-
-Karpathy's gist says index.md should have "each page listed with a link, **a one-line summary**". Currently it only has wikilinks with no descriptions. The query agent must open every file to understand what's available.
-
-### 3. No Brief Summaries on Concepts Either
-
-Same problem: concept entries in index.md have no description. The agent can't judge relevance from the index alone.
-
-## Design
-
-### Part 1: Structured LLM Output with Brief Summaries
-
-All LLM generation steps (summary, concept create, concept update) now return a JSON object with both a one-line brief and the full content.
-
-#### Summary Generation
-
-`_SUMMARY_USER` prompt changes to request JSON output:
-
-```
-Write a summary page for this document in Markdown.
-
-Return a JSON object with two keys:
-- "brief": A single sentence (under 100 chars) describing the document's main contribution
-- "content": The full summary in Markdown. Include key concepts, findings, and [[wikilinks]]
-
-Return ONLY valid JSON, no fences.
-```
-
-LLM returns:
-```json
-{
-  "brief": "Introduces the Transformer architecture based entirely on self-attention",
-  "content": "# Attention Is All You Need\n\nThis paper proposes..."
-}
-```
-
-The `brief` is:
-- Written into summary frontmatter: `brief: Introduces the Transformer...`
-- Passed to `_update_index` for the Documents section
-
-The `content` is written to `wiki/summaries/{name}.md` as before.
-
-#### Concept Generation (create)
-
-`_CONCEPT_PAGE_USER` prompt changes similarly:
-
-```
-Write the concept page for: {title}
-
-Return a JSON object with two keys:
-- "brief": A single sentence (under 100 chars) defining this concept
-- "content": The full concept page in Markdown with [[wikilinks]]
-
-Return ONLY valid JSON, no fences.
-```
-
-The `brief` is:
-- Written into concept frontmatter: `brief: Mechanism allowing each position to attend to all others`
-- Passed to `_update_index` for the Concepts section
-- Used by `_read_concept_briefs` (read from frontmatter instead of truncating body text)
-
-#### Concept Generation (update)
-
-`_CONCEPT_UPDATE_USER` also returns `{"brief": "...", "content": "..."}`. The brief may change as the concept evolves with new information.
-
-#### Long Doc Summary (overview)
-
-Long documents do NOT need the LLM to generate a brief. The brief comes directly from PageIndex's `doc_description` field (available via `IndexResult.description`), which is already a document-level summary generated during indexing. `_LONG_DOC_SUMMARY_USER` stays unchanged (returns plain markdown overview, not JSON) — the brief is passed through from the indexer.
-
-In `compile_long_doc`, the `doc_description` is passed to `_compile_concepts` which forwards it to `_update_index` as the doc brief.
-
-#### Parsing
-
-All LLM responses go through `_parse_json`. Callers extract `brief` and `content`:
-
-```python
-parsed = _parse_json(raw)
-brief = parsed.get("brief", "")
-content = parsed.get("content", raw)  # fallback: treat raw as content if not JSON
-```
-
-The fallback ensures backward compatibility if the LLM returns plain text instead of JSON.
-
-### Part 2: index.md with Brief Summaries
-
-`_update_index` signature changes:
-
-```python
-def _update_index(wiki_dir, doc_name, concept_names, doc_brief="", concept_briefs=None):
-```
-
-Output format:
-
-```markdown
-## Documents
-- [[summaries/attention-is-all-you-need]] — Introduces the Transformer architecture based on self-attention
-- [[summaries/flash-attention]] — Efficient attention algorithm reducing memory from quadratic to linear
-
-## Concepts
-- [[concepts/self-attention]] — Mechanism allowing each position to attend to all others in a sequence
-- [[concepts/transformer]] — Neural network architecture based entirely on attention mechanisms
-```
-
-When updating an existing entry (re-compile), the brief is updated in place.
-
-### Part 3: Frontmatter with Brief
-
-Summary and concept pages get a `brief` field in frontmatter:
-
-```markdown
----
-sources: [paper.pdf]
-brief: Introduces the Transformer architecture based on self-attention
----
-
-# Attention Is All You Need
-...
-```
-
-`_read_concept_briefs` is updated to read from `brief:` frontmatter field instead of truncating body text. Fallback to body truncation if `brief:` is absent (backward compat with existing pages).
-
-### Part 4: Long Doc Sources from Markdown to JSON
-
-Store per-page content as JSON instead of a giant markdown file.
-
-**Current**:
-```
-wiki/sources/paper.md          ← rendered markdown, 10K-50K tokens
-```
-
-**New**:
-```
-wiki/sources/paper.json        ← per-page JSON array
-```
-
-**JSON format** (only the `pages` array from PageIndex, not the full doc object):
-```json
-[
-    {
-        "page": 1,
-        "content": "Full text of page 1...",
-        "images": [{"path": "images/paper/p1_img1.png", "width": 400, "height": 300}]
-    },
-    {
-        "page": 2,
-        "content": "Full text of page 2..."
-    }
-]
-```
-
-`images` field is optional. Image paths are relative to `wiki/sources/`. Short documents are not affected — they stay as `.md`.
-
-#### Indexer Changes
-
-In `indexer.py`, replace `render_source_md` + `_relocate_images` with:
-1. `col.get_page_content(doc_id, "1-9999")` to get all pages
-2. Relocate image paths in each page's `images` array
-3. Write as JSON to `wiki/sources/{name}.json`
-
-### Part 5: New Tool `get_page_content`
-
-Add to `openkb/agent/tools.py`:
-
-```python
-def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str:
-    """Get text content of specific pages from a long document.
-
-    Args:
-        doc_name: Document name (e.g. 'attention-is-all-you-need').
-        pages: Page specification (e.g. '3-5,7,10-12').
-        wiki_root: Absolute path to the wiki root directory.
-    """
-```
-
-Implementation:
-1. Read `wiki/sources/{doc_name}.json`
-2. Parse `pages` spec into a set of page numbers (comma-separated, ranges with `-`)
-3. Filter pages, format as `[Page N]\n{content}\n\n`
-4. Return concatenated text, or error if file not found
-
-### Part 6: Query Agent Changes
-
-**Remove**: `pageindex_retrieve` tool and `_pageindex_retrieve_impl` entirely.
-
-**Add**: `get_page_content` tool.
-
-**Update instructions**:
-```
-## Search strategy
-1. Read index.md to understand what documents and concepts are available.
-   Each entry has a brief summary to help you judge relevance.
-2. Read relevant summary pages (summaries/) for document overviews.
-3. Read concept pages (concepts/) for cross-document synthesis.
-4. For long documents, use get_page_content(doc_name, pages) to read
-   specific pages. The summary page shows chapter structure with page
-   ranges to help you decide which pages to read.
-5. Synthesise a clear, well-cited answer.
-```
-
-**Remove**: `openkb_dir` and `model` parameters from `build_query_agent`.
-
-### What Gets Removed
-
-- `_pageindex_retrieve_impl` (~110 lines)
-- `pageindex_retrieve` tool
-- `render_source_md` from `tree_renderer.py`
-- `_relocate_images` in current form (replaced by per-page relocation)
-- PageIndex imports in `query.py`
-
-### What Stays
-
-- `render_summary_md` — summaries still markdown
-- Short doc pipeline — unchanged
-- Image files in `wiki/sources/images/`
-- PageIndex in `indexer.py` — still used for tree building
-
-## Compile Pipeline Changes Summary
-
-The compile pipeline (`_compile_concepts`, `compile_short_doc`, `compile_long_doc`) changes:
-
-1. **Summary step**: parse JSON response, extract `brief` + `content`
-2. **Concept create/update steps**: parse JSON response, extract `brief` + `content`
-3. **`_write_summary`**: add `brief` to frontmatter
-4. **`_write_concept`**: add/update `brief` in frontmatter
-5. **`_update_index`**: write `— {brief}` after each wikilink
-6. **`_read_concept_briefs`**: read from `brief:` frontmatter field (fallback to body truncation)
-
-## Files Changed
-
-- `openkb/agent/compiler.py` — prompt templates return JSON with brief+content, parse responses, pass briefs to index/frontmatter
-- `openkb/indexer.py` — sources output from md to json, image relocation per-page
-- `openkb/agent/tools.py` — add `get_page_content`
-- `openkb/agent/query.py` — remove `pageindex_retrieve`, add `get_page_content`, update instructions
-- `openkb/tree_renderer.py` — remove `render_source_md`
-- `openkb/schema.py` — update AGENTS_MD
-- `tests/test_compiler.py` — update for JSON LLM responses
-- `tests/test_indexer.py` — update for JSON output
-- `tests/test_query.py` — update for new tool set
-- `tests/test_agent_tools.py` — add tests for `get_page_content`
-
-## Not In Scope
-
-- Cloud PageIndex query support (removed entirely)
-- Changes to the lint pipeline
-- Interactive ingest