diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 0674ae35d..3a9f10fdd 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -14,7 +14,7 @@ permissions:
   actions: read
 
 jobs:
-  analyze:
+  analyze-actions:
     name: Analyze (actions)
     runs-on: ubuntu-latest
     timeout-minutes: 30
@@ -32,3 +32,22 @@ jobs:
         uses: github/codeql-action/analyze@v4
         with:
           category: /language:actions
+
+  analyze-python:
+    name: Analyze (python)
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v6
+
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@v4
+        with:
+          languages: python
+          build-mode: none  # no build step is run for the Python sources
+
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@v4
+        with:
+          category: /language:python  # distinct from the /language:actions category used by analyze-actions
diff --git a/.gitignore b/.gitignore
index 23d6b5655..685045e17 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,8 @@ __pycache__
 .DS_Store
 .env*
 .venv/
+.claude/
+.codex/
 logs/
+examples/pifs_workspace/
+examples/Benchmark/
diff --git a/examples/pifs_demo.py b/examples/pifs_demo.py
new file mode 100644
index 000000000..7dcfd0d09
--- /dev/null
+++ b/examples/pifs_demo.py
@@ -0,0 +1,788 @@
+"""
+PageIndex FileSystem (PIFS) agent demo.
+
+This mirrors examples/agentic_vectorless_rag_demo.py, but exposes a corpus
+through the PageIndex FileSystem shell instead of direct PageIndex document
+tools. The agent receives one read-only bash-like PIFS tool and must retrieve
+evidence through commands such as ls, tree, find, grep, search-summary,
+cat <path> --structure, cat <path> --page, and cat <path> --node.
+
+The demo registers supported files under examples/documents. When a matching
+examples/documents/results/*_structure.json file exists, it is loaded into the
+PIFS workspace's PageIndexClient cache. Files without a cache exercise the
+normal PageIndexClient.index() path during register().
+
+Requirements:
+  pip install openai-agents
+
+Example:
+  python examples/pifs_demo.py --stream-mode all --verbose
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import os
+import re
+import shutil
+import sys
+import time
+from pathlib import Path
+from typing import Any
+
+import PyPDF2
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+# Keep the local demo quiet in offline environments.
+os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "true")
+
+from pageindex import PageIndexClient
+from pageindex.filesystem import MetadataGenerator, PageIndexFileSystem, PIFSCommandExecutor
+from pageindex.filesystem.agent import run_pifs_agent
+
+
+EXAMPLES_DIR = Path(__file__).parent  # directory containing this script
+DOCUMENTS_DIR = EXAMPLES_DIR / "documents"  # default for --documents-dir
+WORKSPACE = EXAMPLES_DIR / "pifs_workspace"  # default for --workspace
+DEFAULT_MODEL = os.environ.get("PIFS_DEMO_MODEL", "gpt-5.4-mini")  # default for --model
+DEFAULT_METADATA_PROVIDER = os.environ.get("PIFS_DEMO_METADATA_PROVIDER") or os.environ.get(
+    "PIFS_METADATA_PROVIDER", "openai"
+)  # demo-specific variable wins; unset or empty falls through to the generic one
+DEFAULT_EMBEDDING_PROVIDER = os.environ.get("PIFS_DEMO_EMBEDDING_PROVIDER") or os.environ.get(
+    "PIFS_EMBEDDING_PROVIDER", "openai"
+)  # same precedence as the metadata provider above
+DEFAULT_QUESTION = (
+    "Use the PIFS workspace to find the Federal Reserve annual report. "
+    "Which section covers supervision and regulation, and what page range "
+    "should I inspect? Cite the document and evidence you used."
+)  # default for --question
+
+PIFS_DEMO_AGENT_PROMPT = """
+You are a PageIndex FileSystem retrieval agent for a local demo workspace.
+
+Use only the bash tool. It is a read-only PIFS virtual shell, not a real OS
+shell. The workspace contains registered example PDFs.
+
+Retrieval strategy:
+- Start with ls or tree to understand the workspace.
+- Use concrete PIFS paths from ls/find output, such as /documents/report.pdf,
+  or stable file_ref/document ids. Do not invent temporary ref_N aliases.
+- Folder paths such as /documents are positional command targets; do not put
+  folder paths inside --where.
+- Use search-summary when available to find likely documents.
+  Quote multi-word queries and include a path, for example:
+  search-summary "Federal Reserve supervision regulation" /documents
+- Use find --where only with JSON metadata DSL, for example:
+  find /documents --where '{"file_format":"pdf"}'
+- Use grep -R only for lexical evidence; do not treat semantic candidates as
+  literal matches.
+- Run one evidence command at a time. Do not chain large commands like
+  cat <path> --structure, grep, and cat <path> --page in one bash call.
+- For PDFs, use cat <path> --structure to inspect the PageIndex tree, then
+  cat <path> --page <range> for evidence, for example:
+  cat /documents/2023-annual-report.pdf --page 31-35
+- For page-range questions, use cat <path> --structure to identify the full section
+  range. Then run cat <path> --page on the smallest useful evidence range, usually the
+  section start page or first 1-2 pages, before the final answer. Do not print
+  a broad multi-page section unless the user asks to read the whole section.
+- Do not use cat --all on PDFs.
+- Answer only from PIFS tool output and cite file refs or document ids.
+"""
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Run a PIFS document retrieval agent demo.")
+    parser.add_argument("--workspace", type=Path, default=WORKSPACE)
+    parser.add_argument("--documents-dir", type=Path, default=DOCUMENTS_DIR)
+    parser.add_argument(
+        "--document",
+        action="append",
+        default=[],
+        help="Specific document filename or path to register. May be repeated.",
+    )
+    parser.add_argument(
+        "--max-docs",
+        type=int,
+        default=0,
+        help="Limit number of cached example documents to register. 0 means all.",
+    )
+    parser.add_argument("--reset", action="store_true", help="Delete and rebuild the demo workspace.")
+    parser.add_argument(
+        "--prepare-only",
+        action="store_true",
+        help="Register documents and print PIFS smoke commands without running the agent.",
+    )
+    parser.add_argument("--question", default=DEFAULT_QUESTION)
+    parser.add_argument("--model", default=DEFAULT_MODEL)
+    parser.add_argument(
+        "--metadata-provider",
+        default=DEFAULT_METADATA_PROVIDER,
+        help="Provider used for register-time metadata generation.",
+    )
+    parser.add_argument(
+        "--metadata-model",
+        default=os.environ.get("PIFS_METADATA_MODEL", "gpt-5-nano"),
+        help="Model used for register-time metadata generation.",
+    )
+    parser.add_argument("--stream-mode", default="all", choices=["off", "tools", "model", "all"])
+    parser.add_argument("--verbose", action="store_true")
+    parser.add_argument("--max-turns", type=int, default=12)
+    parser.add_argument("--max-seconds", type=float, default=90)
+    parser.add_argument("--reasoning-effort", default=None)
+    parser.add_argument("--reasoning-summary", default="auto")
+    parser.add_argument(
+        "--embedding-provider",
+        default=DEFAULT_EMBEDDING_PROVIDER,
+        help="Provider used for register-time summary projection embeddings.",
+    )
+    parser.add_argument(
+        "--embedding-model",
+        default=os.environ.get("PIFS_DEMO_EMBEDDING_MODEL", "text-embedding-3-small"),
+        help="Embedding model used for register-time summary projection.",
+    )
+    parser.add_argument("--embedding-dimensions", type=int, default=256)
+    return parser.parse_args()
+
+
+def require_runtime_environment(*, metadata_provider: str, embedding_provider: str) -> None:
+    metadata_provider = metadata_provider.lower()
+    embedding_provider = embedding_provider.lower()
+    missing: list[str] = []
+    if not os.environ.get("OPENAI_API_KEY"):
+        missing.append("OPENAI_API_KEY for the OpenAI Agents SDK demo agent")
+    if metadata_provider == "openai" and not (
+        os.environ.get("PIFS_METADATA_API_KEY") or os.environ.get("OPENAI_API_KEY")
+    ):
+        missing.append("PIFS_METADATA_API_KEY or OPENAI_API_KEY for metadata generation")
+    if embedding_provider == "openai" and not (
+        os.environ.get("PIFS_EMBEDDING_API_KEY") or os.environ.get("OPENAI_API_KEY")
+    ):
+        missing.append("PIFS_EMBEDDING_API_KEY or OPENAI_API_KEY for summary embeddings")
+    if missing:
+        raise RuntimeError(
+            "Missing required environment variable(s): "
+            + "; ".join(missing)
+            + ". Source your .env or export the required key before running."
+        )
+
+
+SUPPORTED_DOCUMENT_SUFFIXES = {".pdf", ".md", ".markdown", ".txt", ".text"}
+
+
+def discover_documents(documents_dir: Path) -> list[Path]:
+    return sorted(
+        path
+        for path in documents_dir.iterdir()
+        if path.is_file() and path.suffix.lower() in SUPPORTED_DOCUMENT_SUFFIXES
+    )
+
+
+def resolve_requested_documents(documents_dir: Path, requested: list[str]) -> list[Path]:
+    if not requested:
+        return discover_documents(documents_dir)
+    paths: list[Path] = []
+    for item in requested:
+        path = Path(item).expanduser()
+        if not path.is_absolute():
+            path = documents_dir / path
+        if not path.exists():
+            raise FileNotFoundError(f"document not found: {path}")
+        paths.append(path)
+    return paths
+
+
+def structure_path_for(document_path: Path, documents_dir: Path) -> Path | None:
+    path = documents_dir / "results" / f"{document_path.stem}_structure.json"
+    return path if path.exists() else None
+
+
+def deterministic_doc_id(document_path: Path) -> str:
+    digest = hashlib.sha1(str(document_path.resolve()).encode("utf-8")).hexdigest()[:16]
+    return f"pifs_demo_{digest}"
+
+
+def read_pdf_pages(document_path: Path) -> list[dict[str, Any]]:
+    pages: list[dict[str, Any]] = []
+    with document_path.open("rb") as handle:
+        reader = PyPDF2.PdfReader(handle)
+        for page_num, page in enumerate(reader.pages, 1):
+            pages.append({"page": page_num, "content": page.extract_text() or ""})
+    return pages
+
+
+def load_structure_json(structure_path: Path) -> dict[str, Any]:
+    with structure_path.open("r", encoding="utf-8") as handle:
+        payload = json.load(handle)
+    if not isinstance(payload, dict) or not isinstance(payload.get("structure"), list):
+        raise ValueError(f"invalid PageIndex structure cache: {structure_path}")
+    return payload
+
+
+def seed_pageindex_cache(
+    filesystem: PageIndexFileSystem,
+    document_path: Path,
+    *,
+    documents_dir: Path,
+) -> str | None:  # id of the matching/seeded client document, or None when nothing was seeded
+    structure_path = structure_path_for(document_path, documents_dir)
+    if structure_path is None:
+        return None  # no results/<stem>_structure.json for this document
+
+    filesystem.pageindex_client_workspace.mkdir(parents=True, exist_ok=True)
+    meta_path = filesystem.pageindex_client_workspace / "_meta.json"
+    if not meta_path.exists():
+        meta_path.write_text("{}", encoding="utf-8")  # start from an empty JSON object
+    client = PageIndexClient(workspace=str(filesystem.pageindex_client_workspace))
+    canonical_path = str(document_path.resolve())
+    for doc_id, doc in client.documents.items():
+        if Path(str(doc.get("path") or "")).resolve(strict=False) == Path(canonical_path):
+            return doc_id  # this source file is already known to the client; reuse its id
+
+    payload = load_structure_json(structure_path)
+    doc_id = deterministic_doc_id(document_path)
+    suffix = document_path.suffix.lower()
+    if suffix == ".pdf":
+        pages = read_pdf_pages(document_path)
+        client.documents[doc_id] = {
+            "id": doc_id,
+            "type": "pdf",
+            "path": canonical_path,
+            "doc_name": payload.get("doc_name") or document_path.name,
+            "doc_description": payload.get("doc_description") or "",
+            "page_count": len(pages),
+            "structure": payload["structure"],
+            "pages": pages,
+        }
+    elif suffix in {".md", ".markdown"}:
+        text = document_path.read_text(encoding="utf-8")
+        client.documents[doc_id] = {
+            "id": doc_id,
+            "type": "md",
+            "path": canonical_path,
+            "doc_name": payload.get("doc_name") or document_path.name,
+            "doc_description": payload.get("doc_description") or "",
+            "line_count": len(text.splitlines()),
+            "structure": payload["structure"],
+        }
+    else:
+        return None  # only PDF and Markdown entries are seeded
+    client._save_doc(doc_id)  # NOTE(review): private PageIndexClient method; presumably persists the entry - confirm
+    return doc_id
+
+
+def content_type_for(path: Path) -> str:
+    suffix = path.suffix.lower()
+    if suffix == ".pdf":
+        return "application/pdf"
+    if suffix in {".md", ".markdown"}:
+        return "text/markdown"
+    return "text/plain"
+
+
+def external_id_for(path: Path) -> str:
+    slug = "".join(ch.lower() if ch.isalnum() else "_" for ch in path.stem).strip("_")
+    slug = "_".join(part for part in slug.split("_") if part)
+    return f"example_{slug}"
+
+
+def log_progress(message: str, *, indent: int = 0) -> None:
+    prefix = "  " * indent
+    print(f"{prefix}{message}", flush=True)
+
+
+def register_demo_metadata_schema(filesystem: PageIndexFileSystem) -> None:
+    filesystem.metadata.register_schema(
+        {
+            "fields": {
+                "source_collection": {
+                    "type": "string",
+                    "description": "Local example corpus collection.",
+                },
+                "file_format": {
+                    "type": "string",
+                    "description": "Source file extension without the leading dot.",
+                },
+            }
+        },
+        source="demo",
+    )
+
+
+def backfill_registered_metadata_values(filesystem: PageIndexFileSystem, file_ref: str) -> None:
+    entry = filesystem.store.get_file(file_ref)
+    indexed_metadata = dict(entry.metadata or {})
+    with filesystem.store.connect() as conn:
+        filesystem.store.replace_metadata_values(conn, file_ref, indexed_metadata)
+
+
+def configure_summary_projection_backend(
+    filesystem: PageIndexFileSystem,
+    *,
+    embedding_provider: str,
+    embedding_model: str,
+    embedding_dimensions: int,
+) -> None:
+    if not (filesystem.summary_projection_index_dir / "summary_only_vector.sqlite").exists():
+        return
+    filesystem.configure_hybrid_projection_retrieval(
+        filesystem.summary_projection_index_dir,
+        embedding_provider=embedding_provider,
+        embedding_model=embedding_model,
+        embedding_dimensions=embedding_dimensions,
+    )
+
+
+def has_ready_register_outputs(filesystem: PageIndexFileSystem, external_id: str) -> bool:
+    try:
+        file_ref = filesystem.store.resolve_file_ref(external_id)
+        entry = filesystem.store.get_file(file_ref)
+    except KeyError:
+        return False
+    status = entry.metadata_status or {}
+    fields = status.get("fields") or {}
+    required = ("summary", "doc_type", "domain", "topic")
+    if any(fields.get(field, {}).get("status") != "generated" for field in required):
+        return False
+    summary_projection = (status.get("projection_indexes") or {}).get("summary") or {}
+    return summary_projection.get("status") == "ready"
+
+
+def register_documents(
+    filesystem: PageIndexFileSystem,
+    documents: list[Path],
+    *,
+    documents_dir: Path,
+) -> list[dict[str, Any]]:
+    registered: list[dict[str, Any]] = []
+    total = len(documents)
+    for index, document_path in enumerate(documents, 1):
+        document_path = document_path.resolve()
+        external_id = external_id_for(document_path)
+        log_progress(f"[{index}/{total}] {document_path.name}")
+        log_progress("PageIndex tree cache: checking examples/documents/results", indent=1)
+        cache_started = time.perf_counter()
+        cached_doc_id = seed_pageindex_cache(
+            filesystem,
+            document_path,
+            documents_dir=documents_dir,
+        )
+        cache_seconds = time.perf_counter() - cache_started
+        if cached_doc_id:
+            log_progress(
+                f"PageIndex tree cache: ready doc_id={cached_doc_id} ({cache_seconds:.2f}s)",
+                indent=1,
+            )
+        else:
+            log_progress(
+                f"PageIndex tree cache: no cached structure; register() will index if supported ({cache_seconds:.2f}s)",
+                indent=1,
+            )
+        if has_ready_register_outputs(filesystem, external_id):
+            file_ref = filesystem.store.resolve_file_ref(external_id)
+            backfill_registered_metadata_values(filesystem, file_ref)
+            log_progress(
+                f"PIFS register: cached file_ref={file_ref}; metadata and summary projection already ready",
+                indent=1,
+            )
+            registered.append(
+                {
+                    "file_ref": file_ref,
+                    "external_id": external_id,
+                    "path": str(document_path),
+                    "status": "cached",
+                    "pageindex_doc_id": cached_doc_id,
+                }
+            )
+            continue
+
+        log_progress(
+            "PIFS register: running register() -> metadata generation -> summary embedding -> sqlite upsert",
+            indent=1,
+        )
+        register_started = time.perf_counter()
+        file_ref = filesystem.register(
+            storage_uri=document_path.as_uri(),
+            source_path=str(document_path),
+            folder_path="/documents",
+            external_id=external_id,
+            title=document_path.name,
+            content_type=content_type_for(document_path),
+            source_type="examples-documents",
+            metadata={
+                "title": document_path.name,
+                "source_collection": "examples/documents",
+                "file_format": document_path.suffix.lower().lstrip("."),
+            },
+        )
+        register_seconds = time.perf_counter() - register_started
+        entry = filesystem.store.get_file(file_ref)
+        field_status = {
+            field: state.get("status")
+            for field, state in (entry.metadata_status.get("fields") or {}).items()
+        }
+        summary_projection = (
+            entry.metadata_status.get("projection_indexes", {}).get("summary", {})
+        )
+        log_progress(
+            f"PIFS register: done file_ref={file_ref} ({register_seconds:.2f}s)",
+            indent=1,
+        )
+        log_progress(
+            f"metadata: {entry.metadata_status.get('status', 'unknown')} fields={field_status}",
+            indent=1,
+        )
+        log_progress(
+            "summary projection: "
+            f"{summary_projection.get('status', 'not_requested')} "
+            f"index={summary_projection.get('index_path', '')}",
+            indent=1,
+        )
+        registered.append(
+            {
+                "file_ref": file_ref,
+                "external_id": external_id,
+                "path": str(document_path),
+                "status": entry.metadata_status.get("status", "unknown"),
+                "pageindex_tree_status": entry.pageindex_tree_status,
+                "pageindex_doc_id": entry.pageindex_doc_id,
+            }
+        )
+    return registered
+
+
+def print_section(title: str) -> None:
+    print("\n" + "#" * 78, flush=True)
+    print(f"# {title}", flush=True)
+    print("#" * 78, flush=True)
+
+
+def print_step(title: str, detail: str = "") -> None:
+    print(f"\n>>> {title}", flush=True)
+    if detail:
+        print(f"    {detail}", flush=True)
+
+
+def sanitize_preview_text(text: str) -> str:
+    cleaned = str(text).replace("\r", "\n").replace("\f", "\n")
+    cleaned = "".join(
+        ch if ch == "\n" or ch == "\t" or ord(ch) >= 32 else " "
+        for ch in cleaned
+    )
+    return "\n".join(
+        re.sub(r"[ \t]{2,}", " ", line).strip()
+        for line in cleaned.splitlines()
+    )
+
+
+def compact_lines(text: str, *, max_lines: int = 6, max_chars: int = 900) -> str:
+    lines = [line for line in sanitize_preview_text(text).splitlines() if line.strip()]
+    preview = "\n".join(lines[:max_lines])
+    if len(preview) > max_chars:
+        preview = preview[:max_chars].rstrip() + "..."
+    omitted = len(lines) - min(len(lines), max_lines)
+    if omitted > 0:
+        preview += f"\n    ... {omitted} more lines"
+    return preview
+
+
+def find_structure_node(structure: Any, title_fragment: str) -> dict[str, Any] | None:
+    if isinstance(structure, list):
+        for item in structure:
+            found = find_structure_node(item, title_fragment)
+            if found:
+                return found
+        return None
+    if not isinstance(structure, dict):
+        return None
+    if title_fragment.lower() in str(structure.get("title", "")).lower():
+        return structure
+    return find_structure_node(structure.get("nodes", []), title_fragment)
+
+
+def page_range_for_node(node: dict[str, Any] | None) -> str:
+    if not node:
+        return ""
+    ranges: list[tuple[int, int]] = []
+
+    def collect(item: Any) -> None:
+        if not isinstance(item, dict):
+            return
+        start = item.get("start_index")
+        end = item.get("end_index")
+        if isinstance(start, int) and isinstance(end, int):
+            ranges.append((start, end))
+        for child in item.get("nodes") or []:
+            collect(child)
+
+    collect(node)
+    if not ranges:
+        return ""
+    start = min(item[0] for item in ranges)
+    end = max(item[1] for item in ranges)
+    return f"{start}-{end}" if start != end else str(start)
+
+
+def opening_page_range_for_node(node: dict[str, Any] | None, *, max_pages: int = 2) -> str:
+    if not node:
+        return ""
+    ranges: list[tuple[int, int]] = []
+
+    def collect(item: Any) -> None:
+        if not isinstance(item, dict):
+            return
+        start = item.get("start_index")
+        end = item.get("end_index")
+        if isinstance(start, int) and isinstance(end, int):
+            ranges.append((start, end))
+        for child in item.get("nodes") or []:
+            collect(child)
+
+    collect(node)
+    if not ranges:
+        return ""
+    start = min(item[0] for item in ranges)
+    end = max(item[1] for item in ranges)
+    preview_end = min(end, start + max_pages - 1)
+    return f"{start}-{preview_end}" if start != preview_end else str(start)
+
+
+def execute_json_command(executor: PIFSCommandExecutor, command: str) -> dict[str, Any]:
+    try:
+        return json.loads(executor.execute(command))  # the executor's output is parsed as JSON
+    except Exception as exc:  # any failure (execution or parsing) is reported as an error payload instead
+        return {"ok": False, "error": str(exc), "data": None}
+
+
+def show_capability(
+    *,
+    label: str,
+    command: str,
+    result: str,
+    raw: str = "",
+    verbose: bool = False,
+) -> None:
+    print_step(label, command)
+    print(f"    result: {result}", flush=True)
+    if verbose and raw:
+        print("    raw:", flush=True)
+        print(compact_lines(raw, max_lines=10, max_chars=1600), flush=True)
+
+
+def show_registered_documents(registered: list[dict[str, Any]], *, verbose: bool = False) -> None:
+    print(f"\nRegistered {len(registered)} document(s):", flush=True)
+    for item in registered:
+        print(
+            "  - "
+            f"{Path(str(item.get('path', ''))).name}: "
+            f"file_ref={item.get('file_ref')} | "
+            f"status={item.get('status')} | "
+            f"pageindex_doc_id={item.get('pageindex_doc_id')}",
+            flush=True,
+        )
+    if verbose:
+        print("\nRaw registration records:", flush=True)
+        print(json.dumps(registered, ensure_ascii=False, indent=2), flush=True)
+
+
+def run_smoke_commands(
+    filesystem: PageIndexFileSystem,
+    registered: list[dict[str, Any]],
+    *,
+    verbose: bool = False,
+) -> None:
+    json_executor = PIFSCommandExecutor(filesystem, json_output=True)
+    shell_executor = PIFSCommandExecutor(filesystem, json_output=False)
+
+    command = "tree / --depth 2"
+    tree = execute_json_command(json_executor, command)
+    folders = (tree.get("data") or {}).get("folders") or []
+    documents_folder = next((item for item in folders if item.get("path") == "/documents"), {})
+    show_capability(
+        label="Folder browse",
+        command=command,
+        result=f"/documents contains {documents_folder.get('file_count', len(registered))} files",
+        raw=shell_executor.execute(command) if verbose else "",
+        verbose=verbose,
+    )
+
+    command = "ls /documents"
+    listing = execute_json_command(json_executor, command)
+    files = (listing.get("data") or {}).get("files") or []
+    file_titles = ", ".join(item.get("title", "") for item in files[:3])
+    show_capability(
+        label="List registered files",
+        command=command,
+        result=f"{len(files)} files: {file_titles}",
+        raw=shell_executor.execute(command) if verbose else "",
+        verbose=verbose,
+    )
+
+    command = "stat --schema"
+    schema = execute_json_command(json_executor, command)
+    fields = sorted(((schema.get("data") or {}).get("fields") or {}).keys())
+    show_capability(
+        label="Metadata schema",
+        command=command,
+        result=", ".join(fields),
+        raw=shell_executor.execute(command) if verbose else "",
+        verbose=verbose,
+    )
+
+    command = "find /documents --where '{\"source_collection\":\"examples/documents\"}' --limit 5"
+    found = execute_json_command(json_executor, command)
+    found_files = found.get("data") or []
+    show_capability(
+        label="Metadata DSL filter",
+        command=command,
+        result=f"{len(found_files)} documents matched source_collection=examples/documents",
+        raw=shell_executor.execute(command) if verbose else "",
+        verbose=verbose,
+    )
+
+    command = 'search-summary "Federal Reserve annual report supervision regulation section page range" /documents'
+    summary = execute_json_command(json_executor, command)
+    summary_hits = ((summary.get("data") or {}).get("data") or [])
+    if summary_hits:
+        summary_result = f"{len(summary_hits)} summary-vector candidates; top={summary_hits[0].get('external_id')}"
+    else:
+        summary_result = "summary-vector command is available, but this tiny two-doc demo returned no candidates"
+    show_capability(
+        label="Semantic summary search",
+        command=command,
+        result=summary_result,
+        raw=shell_executor.execute(command) if verbose else "",
+        verbose=verbose,
+    )
+
+    first_target = f"/documents/{Path(str(registered[0]['path'])).name}" if registered else None
+    if not first_target:
+        return
+
+    command = f"stat {first_target}"
+    stat = execute_json_command(json_executor, command)
+    stat_data = stat.get("data") or {}
+    show_capability(
+        label="File stat",
+        command=command,
+        result=(
+            f"{stat_data.get('title')} | tree={stat_data.get('pageindex_tree_status')} | "
+            f"metadata_status={(stat_data.get('metadata_status') or {}).get('status')}"
+        ),
+        raw=shell_executor.execute(command) if verbose else "",
+        verbose=verbose,
+    )
+
+    command = f"cat {first_target} --structure"
+    structure_payload = execute_json_command(json_executor, command)
+    structure_data = structure_payload.get("data") or {}
+    structure = structure_data.get("structure") or []
+    supervision_node = find_structure_node(structure, "Supervision and Regulation")
+    supervision_range = page_range_for_node(supervision_node)
+    show_capability(
+        label="PageIndex document structure",
+        command=command,
+        result=(
+            "found section 'Supervision and Regulation'"
+            + (f" with page span {supervision_range}" if supervision_range else "")
+        ),
+        raw=shell_executor.execute(command) if verbose else "",
+        verbose=verbose,
+    )
+
+    evidence_range = opening_page_range_for_node(supervision_node) or "1-2"
+    command = f"cat {first_target} --page {evidence_range}"
+    page = execute_json_command(json_executor, command)
+    page_text = str((page.get("data") or {}).get("text") or "")
+    show_capability(
+        label="Page evidence",
+        command=command,
+        result=compact_lines(page_text, max_lines=3, max_chars=420),
+        raw=shell_executor.execute(command) if verbose else "",
+        verbose=verbose,
+    )
+
+    command = 'grep -R "Supervision and Regulation" /documents'
+    grep = execute_json_command(json_executor, command)
+    grep_hits = ((grep.get("data") or {}).get("data") or [])
+    show_capability(
+        label="Lexical grep",
+        command=command,
+        result=f"{len(grep_hits)} real text matches",
+        raw=shell_executor.execute(command) if verbose else "",
+        verbose=verbose,
+    )
+
+
+def main() -> None:
+    args = parse_args()
+    require_runtime_environment(
+        metadata_provider=args.metadata_provider,
+        embedding_provider=args.embedding_provider,
+    )
+    workspace = args.workspace.expanduser()
+    documents_dir = args.documents_dir.expanduser()
+    if args.reset and workspace.exists():
+        shutil.rmtree(workspace)
+    workspace.mkdir(parents=True, exist_ok=True)
+
+    documents = resolve_requested_documents(documents_dir, args.document)
+    if args.max_docs > 0:
+        documents = documents[: args.max_docs]
+    if not documents:
+        raise RuntimeError(f"no cached example documents found under {documents_dir}")
+
+    filesystem = PageIndexFileSystem(
+        workspace,
+        metadata_generator=MetadataGenerator(
+            provider=args.metadata_provider,
+            model=args.metadata_model,
+        ),
+        summary_projection_embedding_provider=args.embedding_provider,
+        summary_projection_embedding_model=args.embedding_model,
+        summary_projection_embedding_dimensions=args.embedding_dimensions,
+    )
+    register_demo_metadata_schema(filesystem)
+
+    print_section("STEP 1/3  Register Documents")
+    print(f"Workspace: {workspace}", flush=True)
+    print(f"Documents: {len(documents)}", flush=True)
+    registered = register_documents(filesystem, documents, documents_dir=documents_dir)
+    configure_summary_projection_backend(
+        filesystem,
+        embedding_provider=args.embedding_provider,
+        embedding_model=args.embedding_model,
+        embedding_dimensions=args.embedding_dimensions,
+    )
+    show_registered_documents(registered, verbose=args.verbose)
+
+    print_section("STEP 2/3  Explore PIFS Tool Surface")
+    run_smoke_commands(filesystem, registered, verbose=args.verbose)
+
+    if args.prepare_only:
+        return
+
+    print_section("STEP 3/3  Ask An Agent Using Only PIFS")
+    print(f"Question: {args.question}", flush=True)
+    answer = run_pifs_agent(
+        filesystem,
+        args.question,
+        model=args.model,
+        root="/",
+        system_prompt=PIFS_DEMO_AGENT_PROMPT,
+        max_turns=args.max_turns,
+        max_seconds=args.max_seconds,
+        verbose=args.verbose,
+        stream_mode=args.stream_mode,
+        reasoning_effort=args.reasoning_effort,
+        reasoning_summary=args.reasoning_summary,
+    )
+    if answer:
+        print("\nFinal answer:", flush=True)
+        print(answer, flush=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pageindex/__init__.py b/pageindex/__init__.py
index 658003bf5..c3fb0b0ae 100644
--- a/pageindex/__init__.py
+++ b/pageindex/__init__.py
@@ -1,4 +1,22 @@
-from .page_index import *
-from .page_index_md import md_to_tree
-from .retrieve import get_document, get_document_structure, get_page_content
-from .client import PageIndexClient
+import os
+
+os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "true")  # set before the imports below; an existing value is kept
+
+_OPTIONAL_CORE_IMPORTS = {"litellm", "openai", "PyPDF2", "pymupdf"}  # missing-module names tolerated by the guard below
+
+try:
+    from .page_index import *
+    from .page_index_md import md_to_tree
+    from .retrieve import get_document, get_document_structure, get_page_content
+    from .client import PageIndexClient
+except ModuleNotFoundError as exc:  # the first failing import skips the remaining ones
+    if exc.name not in _OPTIONAL_CORE_IMPORTS:  # any other missing module is a real error
+        raise
+
+
+def __getattr__(name: str):
+    """Resolve PageIndexFileSystem on first attribute access; reject any other name."""
+    if name != "PageIndexFileSystem":
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    from .filesystem import PageIndexFileSystem as lazy_class
+    return lazy_class
diff --git a/pageindex/filesystem/__init__.py b/pageindex/filesystem/__init__.py
new file mode 100644
index 000000000..7908393d8
--- /dev/null
+++ b/pageindex/filesystem/__init__.py
@@ -0,0 +1,63 @@
+from importlib import import_module
+from typing import TYPE_CHECKING
+
+from .commands import PIFSCommandExecutor
+from .core import PageIndexFileSystem
+from .metadata_generation import (
+    MetadataGenerationBackend,
+    MetadataGenerationError,
+    MetadataGenerationInput,
+    MetadataGenerationResult,
+    MetadataGenerator,
+)
+from .types import OpenResult, SearchResult
+
+if TYPE_CHECKING:
+    from .hybrid_projection import HybridProjectionSearchBackend
+    from .projection_indexing import SummaryProjectionIndexer
+    from .semantic_index import (
+        RebuildableSemanticIndex,
+        SemanticIndexRecord,
+        SemanticSearchResult,
+        SQLiteVecSemanticIndex,
+    )
+
+_LAZY_EXPORTS = {  # public name -> (relative module, attribute); resolved by the module-level __getattr__
+    "HybridProjectionSearchBackend": (".hybrid_projection", "HybridProjectionSearchBackend"),
+    "RebuildableSemanticIndex": (".semantic_index", "RebuildableSemanticIndex"),
+    "SemanticIndexRecord": (".semantic_index", "SemanticIndexRecord"),
+    "SemanticSearchResult": (".semantic_index", "SemanticSearchResult"),
+    "SQLiteVecSemanticIndex": (".semantic_index", "SQLiteVecSemanticIndex"),
+    "SummaryProjectionIndexer": (".projection_indexing", "SummaryProjectionIndexer"),
+}
+
+__all__ = [
+    "OpenResult",
+    "HybridProjectionSearchBackend",
+    "MetadataGenerationBackend",
+    "MetadataGenerationError",
+    "MetadataGenerationInput",
+    "MetadataGenerationResult",
+    "MetadataGenerator",
+    "PIFSCommandExecutor",
+    "PageIndexFileSystem",
+    "RebuildableSemanticIndex",
+    "SearchResult",
+    "SemanticIndexRecord",
+    "SemanticSearchResult",
+    "SummaryProjectionIndexer",
+    "SQLiteVecSemanticIndex",
+]
+
+
+def __getattr__(name: str):
+    """Import a lazily exported symbol on first access and cache it in module globals."""
+    target = _LAZY_EXPORTS.get(name)
+    if target is None:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    resolved = globals()[name] = getattr(import_module(target[0], __name__), target[1])
+    return resolved
+
+
+def __dir__() -> list[str]:
+    return sorted({*globals(), *__all__, *_LAZY_EXPORTS})  # defined names + declared API + lazy exports
diff --git a/pageindex/filesystem/agent.py b/pageindex/filesystem/agent.py
new file mode 100644
index 000000000..b1f162504
--- /dev/null
+++ b/pageindex/filesystem/agent.py
@@ -0,0 +1,661 @@
+from __future__ import annotations
+
+import asyncio
+import concurrent.futures
+import json
+import os
+import re
+import sys
+import time
+from dataclasses import asdict, is_dataclass
+from typing import Any, Mapping, TextIO
+
+from .commands import PIFSCommandError, PIFSCommandExecutor
+from .core import PageIndexFileSystem
+
+
+TRUTHY_ENV_VALUES = {"1", "true", "yes", "on"}  # compared after strip() + lower()
+PIFS_AGENT_TRACING_ENV = "PAGEINDEX_PIFS_AGENT_TRACING"  # truthy value keeps agent tracing enabled
+PIFS_AGENT_RAW_REASONING_ENV = "PAGEINDEX_PIFS_AGENT_RAW_REASONING"  # truthy value streams raw reasoning text
+
+AGENT_SYSTEM_PROMPT = """
+You are the PageIndex FileSystem Demo Agent, developed by the VectifyAI Team.
+Your job is to answer questions about the caller's current PageIndex FileSystem
+workspace.
+
+You can inspect the corpus only by calling the bash tool. The bash tool is a
+read-only PageIndex virtual shell, not a real operating-system shell.
+
+If the user asks who you are, answer with this identity and mention that you can
+help inspect and answer questions about the current PIFS workspace. If the user
+asks a general question unrelated to the current workspace, do not answer it as
+a general-purpose assistant; briefly say that you can only answer workspace-
+related questions and invite them to ask about files, folders, metadata, or
+document contents in the workspace.
+
+If the user asks what tools or capabilities you have, describe only the PIFS
+virtual shell capabilities available inside this workspace: ls, tree, find,
+stat, grep, cat, and semantic search commands such as search-summary when they
+are available. Do not mention host runtime tools, SDK internals, or orchestration
+helpers that are not part of the PIFS shell.
+
+If the user asks a workspace-related topic question without naming a specific
+file, treat it as a retrieval task. Use available PIFS discovery commands to
+look for relevant files and inspect evidence before answering. Ask the user to
+clarify only after a reasonable search cannot identify relevant evidence.
+Do not conclude that no relevant document exists from one failed grep. If grep
+returns no matches for a workspace topic, verify with available semantic
+candidate discovery such as search-summary, or inspect likely document
+structure, before saying that the workspace lacks evidence.
+
+Follow the task prompt for command policy, retrieval strategy, and answer
+format. If the caller needs stricter behavior, pass an explicit system_prompt.
+"""
+
+BASH_TOOL_DESCRIPTION = """
+Run a command in the PageIndex FileSystem virtual shell. This is not a real
+operating-system shell. By default the tool is read-only: use ls, tree, find,
+grep, cat, stat, head, tail, sed, and any dynamically available semantic search
+commands described in the workspace context. grep -R is lexical evidence search;
+grep does not support regex alternation such as "a|b"; run multiple grep
+commands or use search-summary for semantic candidate discovery instead.
+semantic search commands such as search-summary return candidate documents and
+do not guarantee literal text matches or final answer evidence. After choosing
+a likely search-summary candidate, verify the relevant claim with cat before
+answering. Use search-summary when the user asks for summary search, semantic
+search, or vector search and the command is listed as available. Quote
+multi-word semantic queries, for example:
+search-summary "Federal Reserve" /documents. Do not write
+search-summary Federal Reserve /documents. Errors are returned as text prefixed
+with ERROR. Do not call
+commands that are not listed as available. When evidence is required, inspect it
+with cat or grep before answering. Prefer shell-like target-first cat syntax
+with stable targets: cat <path> --structure, cat <path> --page 31-59, and
+cat <path> --node 0009. You may also use file_ref or document_id when a path is
+ambiguous. Do not reconstruct paths from document titles; use exact targets
+returned by PIFS commands and quote paths containing spaces. After structure
+identifies a relevant section node, prefer
+cat <path> --node <node_id>; use cat <path> --page <range> when the user asks
+for page-level evidence, no suitable node exists, or exact page text is needed.
+cat <path> --structure is paginated; request more with --offset if needed. Page
+reads are limited to five pages at once, node reads to at most ten node ids,
+and text cat --all returns only the first page of text lines. If a cat limit
+error requires a smaller call, stop when the evidence is sufficient; otherwise
+continue with another chunk before answering.
+For questions about metadata fields, available summaries, or whether metadata
+was provided, inspect stat --schema and stat <target> before making claims.
+Do not use stat as a general content/topic discovery step. For document Q&A,
+prefer search-summary/find/grep for candidates, then cat --structure and
+cat --node or cat --page for evidence.
+"""
+
+AGENT_TOOL_POLICY = """
+Tool policy:
+- The bash tool is a PageIndex virtual shell, not an operating-system shell.
+- The default agent tool surface is read-only.
+- Use only commands listed in the workspace capabilities.
+- Folder paths such as /documents are positional command targets; never put folder paths in --where.
+- Use --where only with metadata fields shown by stat --schema.
+- grep -R performs lexical evidence search.
+- grep does not support regex alternation such as "a|b"; run separate grep commands or use search-summary for semantic candidate discovery.
+- Semantic search commands are candidate-discovery tools and do not guarantee literal text matches or final answer evidence. After selecting a likely search-summary candidate, verify the relevant facts with cat before answering.
+- Do not use find | grep as an exhaustive search or as proof that no document exists; find output can be scoped or limited. Use metadata filters, search-summary, grep on a narrowed target, or cat on likely candidates instead.
+- A single failed grep is not enough evidence to say there is no relevant document. If grep returns no matches for a workspace-topic question, verify with search-summary or another available semantic/vector candidate command, or inspect likely document structure, before answering no-evidence.
+- If search-summary is available and the user asks for summary search, semantic search, vector search, or "用 summary 搜", use search-summary "<query>" <folder>; quote multi-word queries, for example search-summary "Federal Reserve" /documents; do not translate that request into find --where.
+- Tool errors are returned as ERROR text; recover by trying an available command.
+- Use cat or grep to gather evidence before making source-backed claims.
+- Do not reconstruct a file path from a title. Use exact paths returned by PIFS commands, or use file_ref/document_id when available; quote paths that contain spaces.
+- For broad topic, method, or "what solution" questions that are likely about the workspace, search for candidate documents before asking the user to choose a document.
+- Use stat only for metadata/schema/status questions or to resolve ambiguous target identity. Do not run stat merely to understand what a document says.
+- Prefer target-first cat syntax with stable targets: cat <path> --structure, cat <path> --page 31-59, cat <path> --node <node_id>.
+- cat <target> --structure returns at most 25 nodes; use --offset and --limit for more structure pages.
+- cat <target> --page accepts at most 5 pages at once. If a larger range is needed, first inspect cat <target> --structure and then read a smaller page range or node.
+- cat <target> --node accepts at most 10 node ids at once. Prefer relevant nodes from structure when possible.
+- When recovering from cat page/node/text limit errors, stop if the evidence is sufficient; if it is not sufficient, make another smaller call before answering.
+- cat <target> --all returns at most 100 text lines; use cat <target> --range <start>-<end> for the next page.
+- After cat <target> --structure finds a relevant section/subsection with a node_id, prefer cat <target> --node <node_id> for content from that semantic unit.
+- Use cat <target> --page <start>-<end> when the user explicitly asks for pages/page ranges, when no suitable node_id exists, or when you need exact page text to verify page-level evidence.
+- Avoid fetching a broad page span after a matching node is available unless page-level citation or verification is required.
+- Do not call cat --page <target> <start> <end>; if you need a page span, use cat <target> --page <start>-<end>.
+- For metadata or summary-field questions, run stat --schema and stat <target> for relevant files before answering; do not infer metadata presence or absence from ls/find output alone.
+- Distinguish default/register metadata from caller-provided custom metadata when the evidence supports it.
+"""
+
+STREAM_MODE_ALIASES = {  # accepted spelling -> canonical mode (off / tools / model / all)
+    "": "off",
+    "none": "off",
+    "false": "off",
+    "0": "off",
+    "off": "off",
+    "tool": "tools",
+    "tools": "tools",
+    "model": "model",
+    "output": "model",
+    "outputs": "model",
+    "think": "model",
+    "all": "all",
+    "debug": "all",
+}
+AGENT_STREAM_MODE_CHOICES = sorted(item for item in STREAM_MODE_ALIASES if item)  # every alias except the empty string
+REASONING_EFFORT_CHOICES = ["none", "minimal", "low", "medium", "high", "xhigh"]  # validated by normalize_reasoning_effort
+REASONING_SUMMARY_CHOICES = ["none", "auto", "concise", "detailed"]  # "none" is normalized to None
+
+
+def should_use_openai_compatible_chat_model(base_url: str | None) -> bool:
+    """Return True when base_url is set and is not one of the official OpenAI endpoints."""
+    endpoint = (base_url or "").strip().rstrip("/")
+    official = ("https://api.openai.com", "https://api.openai.com/v1")
+    return bool(base_url) and endpoint not in official
+
+
+def env_flag_enabled(name: str, environ: Mapping[str, str] | None = None) -> bool:
+    """Report whether variable `name` holds a truthy flag value (case-insensitive)."""
+    raw = (environ if environ is not None else os.environ).get(name, "")
+    return raw.strip().lower() in TRUTHY_ENV_VALUES
+
+
+def pifs_agent_tracing_enabled(environ: Mapping[str, str] | None = None) -> bool:
+    return env_flag_enabled(PIFS_AGENT_TRACING_ENV, environ)  # True only when the tracing flag is truthy
+
+
+def should_disable_pifs_agent_tracing(environ: Mapping[str, str] | None = None) -> bool:
+    return not pifs_agent_tracing_enabled(environ)  # tracing is disabled unless explicitly opted in
+
+
+def pifs_agent_raw_reasoning_enabled(environ: Mapping[str, str] | None = None) -> bool:
+    return env_flag_enabled(PIFS_AGENT_RAW_REASONING_ENV, environ)  # True only when the raw-reasoning flag is truthy
+
+
+def normalize_reasoning_effort(reasoning_effort: str | None) -> str | None:
+    """Lower-case and validate a reasoning effort; blank or None yields None."""
+    effort = (reasoning_effort or "").strip().lower()
+    # Blank input means "not configured", which is distinct from an invalid value.
+    if effort and effort not in REASONING_EFFORT_CHOICES:
+        allowed = ", ".join(REASONING_EFFORT_CHOICES)
+        raise ValueError(f"Unknown reasoning effort: {reasoning_effort!r}. Allowed: {allowed}")
+    return effort or None
+
+
+def normalize_reasoning_summary(reasoning_summary: str | None) -> str | None:
+    """Lower-case and validate a summary mode; blank, None, or "none" yields None."""
+    summary = (reasoning_summary or "").strip().lower()
+    if summary and summary not in REASONING_SUMMARY_CHOICES:
+        allowed = ", ".join(REASONING_SUMMARY_CHOICES)
+        raise ValueError(f"Unknown reasoning summary: {reasoning_summary!r}. Allowed: {allowed}")
+    # "none" is an accepted choice but is reported to callers as "no summary requested".
+    return summary if summary not in {"", "none"} else None
+
+
+def build_agent_model_settings(
+    *,
+    reasoning_effort: str | None = None,
+    reasoning_summary: str | None = None,
+) -> Any | None:
+    """Build ModelSettings with reasoning options, or None when none are requested."""
+    effort = normalize_reasoning_effort(reasoning_effort)
+    summary = normalize_reasoning_summary(reasoning_summary)
+    if effort is None and summary is None:
+        return None
+    # An effort other than "none" with no explicit summary mode defaults to "auto".
+    if summary is None and effort != "none":
+        summary = "auto"
+
+    # Imported at call time rather than at module import time.
+    from agents import ModelSettings
+    from openai.types.shared import Reasoning
+
+    requested = {"effort": effort, "summary": summary}
+    reasoning_kwargs = {key: val for key, val in requested.items() if val is not None}
+    return ModelSettings(reasoning=Reasoning(**reasoning_kwargs), verbosity="low")
+
+
+def normalize_agent_stream_mode(stream_mode: str | None) -> str:
+    """Map a stream mode or alias to off/tools/model/all; raise ValueError otherwise."""
+    alias = (stream_mode or "off").strip().lower()
+    if alias in STREAM_MODE_ALIASES:
+        return STREAM_MODE_ALIASES[alias]
+    raise ValueError(f"Unknown PIFS agent stream mode: {stream_mode!r}. Allowed: all, model, off, tools")
+
+
+def serialize_agent_final_output(value: Any) -> str:
+    """Render an agent's final output as text, using JSON for structured values."""
+    if value is None or isinstance(value, str):
+        return value or ""
+    if hasattr(value, "model_dump_json"):
+        return value.model_dump_json()
+    # is_dataclass() is also true for dataclass *types*; asdict() needs an instance.
+    if is_dataclass(value) and not isinstance(value, type):
+        value = asdict(value)
+    if isinstance(value, (dict, list)):
+        return json.dumps(value, ensure_ascii=False)
+    return str(value)
+
+
+def compact_tool_output_preview(
+    output: str,
+    *,
+    preview_chars: int = 700,
+    max_lines: int = 8,
+) -> str:
+    """Condense tool output into a short, annotated preview for logs and terminals.
+
+    Control characters are blanked, empty lines dropped, whitespace runs collapsed;
+    at most max_lines lines / preview_chars characters are kept, with notes on cuts.
+    """
+    unified = str(output).replace("\r", "\n").replace("\f", "\n")
+    cleaned = "".join(" " if ord(ch) < 32 and ch not in "\n\t" else ch for ch in unified)
+    rows = [re.sub(r"[ \t]{2,}", " ", row).strip() for row in cleaned.splitlines() if row.strip()]
+    body = "\n".join(rows[:max_lines])
+    if len(body) > preview_chars:
+        body = body[:preview_chars].rstrip() + "..."
+    over_chars = len(cleaned) > preview_chars
+    hidden = max(0, len(rows) - max_lines)
+    parts = []
+    if over_chars or len(rows) > max_lines:
+        parts.append(f"[large PIFS result: {len(cleaned)} chars; showing compact preview]")
+    parts.append(body)
+    if hidden > 0:
+        parts.append(f"... [{hidden} more lines omitted from preview]")
+    if over_chars:
+        parts.append("... [full result returned to agent; terminal preview shortened]")
+    return "\n".join(parts)
+
+
+def build_agent_initial_context(
+    filesystem: PageIndexFileSystem,
+    *,
+    root: str = "/",
+    executor: PIFSCommandExecutor | None = None,
+    query_context: str | None = None,
+) -> str:
+    """Describe the workspace for the agent: root listing, schema sample, capabilities.
+
+    Creates a text-mode executor when none is supplied; samples at most 50 fields.
+    """
+    if not executor:
+        executor = PIFSCommandExecutor(
+            filesystem,
+            json_output=False,
+            query_context=query_context,
+        )
+    fields = filesystem._metadata_schema().get("fields", {})
+    schema_summary = {
+        "field_count": len(fields),
+        "sample_fields": dict(list(fields.items())[:50]),
+    }
+    sections = [
+        f"Root path: {root}",
+        "Top-level listing:",
+        executor.execute(f"ls {root}"),
+        "Metadata schema summary:",
+        json.dumps(schema_summary, ensure_ascii=False),
+        "Workspace retrieval capabilities:",
+        executor.describe_available_command_surfaces(),
+    ]
+    return "\n".join(sections)
+
+
+def build_pifs_agent_instructions(
+    filesystem: PageIndexFileSystem,
+    *,
+    root: str = "/",
+    system_prompt: str | None = None,
+    executor: PIFSCommandExecutor | None = None,
+    query_context: str | None = None,
+) -> str:
+    """Assemble agent instructions: system prompt, tool policy, then workspace context.
+
+    Falls back to AGENT_SYSTEM_PROMPT when system_prompt is empty or None.
+    """
+    prompt = system_prompt if system_prompt else AGENT_SYSTEM_PROMPT
+    workspace_context = build_agent_initial_context(
+        filesystem,
+        root=root,
+        executor=executor,
+        query_context=query_context,
+    )
+    blocks = (prompt.strip(), AGENT_TOOL_POLICY.strip(), "Workspace context:\n" + workspace_context)
+    return "\n\n".join(blocks)
+
+
+class PIFSAgentStreamObserver:
+    """Print and/or record streamed agent events according to a stream mode.
+
+    Model text prints in "model"/"all" modes, tool traffic in "tools"/"all".
+    """
+
+    def __init__(
+        self,
+        stream_mode: str,
+        *,
+        stream_log: list[dict[str, Any]] | None = None,
+        output: TextIO | None = None,
+        include_raw_reasoning: bool | None = None,
+    ) -> None:
+        self.stream_mode = normalize_agent_stream_mode(stream_mode)
+        self.stream_log = stream_log
+        self.output = output or sys.stdout
+        if include_raw_reasoning is None:  # defer to the raw-reasoning environment flag
+            include_raw_reasoning = pifs_agent_raw_reasoning_enabled()
+        self.include_raw_reasoning = include_raw_reasoning
+        self._printed_section: str | None = None
+        channels = ("output", "think", "think_summary", "tool_args")
+        self._buffers: dict[str, list[str]] = {channel: [] for channel in channels}
+
+    @property
+    def wants_model_stream(self) -> bool:
+        return self.stream_mode in {"model", "all"}
+
+    @property
+    def wants_tool_stream(self) -> bool:
+        return self.stream_mode in {"tools", "all"}
+
+    @property
+    def has_output_text(self) -> bool:
+        return bool(self._buffers["output"])
+
+    def handle_event(self, event: Any) -> None:
+        """Dispatch a streamed event by its type; other event types are ignored."""
+        event_kind = getattr(event, "type", None)
+        if event_kind == "raw_response_event":
+            self._handle_raw_response_event(getattr(event, "data", None))
+        elif event_kind == "run_item_stream_event":
+            self._handle_run_item_event(event)
+
+    def finish(self, final_output: Any = None) -> None:
+        """Print unstreamed final output, close the open section, and log the buffers."""
+        if self.wants_model_stream and not self.has_output_text and final_output:
+            self._emit("output", str(final_output), "[llm final output stream]")
+        if self._printed_section is not None:
+            print(file=self.output, flush=True)
+            self._printed_section = None
+        if self.stream_log is not None:
+            texts = ((kind, "".join(parts)) for kind, parts in self._buffers.items())
+            self.stream_log.extend({"kind": kind, "text": text} for kind, text in texts if text)
+
+    def _handle_raw_response_event(self, data: Any) -> None:
+        """Route a non-empty text delta to the channel matching its event type."""
+        chunk = getattr(data, "delta", None)
+        if not isinstance(chunk, str) or not chunk:
+            return
+        event_type = getattr(data, "type", "")
+        if event_type == "response.function_call_arguments.delta":
+            self._buffers["tool_args"].append(chunk)  # buffered only; never printed here
+        elif event_type == "response.output_text.delta":
+            self._emit("output", chunk, "[llm final output stream]")
+        elif event_type == "response.reasoning_summary_text.delta":
+            self._emit("think_summary", chunk, "[llm reasoning summary stream]")
+        elif event_type == "response.reasoning_text.delta" and self.include_raw_reasoning:
+            self._emit("think", chunk, "[llm reasoning text stream]")
+
+    def _handle_run_item_event(self, event: Any) -> None:
+        name = getattr(event, "name", "")  # only message/reasoning item creations are logged
+        if self.stream_log is not None and name in {"message_output_created", "reasoning_item_created"}:
+            item_type = getattr(getattr(event, "item", None), "type", "")
+            self.stream_log.append({"kind": "run_item", "name": name, "item_type": item_type})
+
+    def _emit(self, kind: str, text: str, label: str) -> None:
+        """Buffer and print text for a channel, opening a labelled section on change.
+
+        Nothing is buffered or printed unless the stream mode enables the channel.
+        """
+        enabled = self.wants_tool_stream if kind == "tool_args" else self.wants_model_stream
+        if not enabled:
+            return
+        self._buffers[kind].append(text)
+        if self._printed_section != kind:
+            self._start_section(kind, label)
+        print(text, end="", file=self.output, flush=True)
+
+    def emit_tool_call(self, command: str, *, force: bool = False) -> None:
+        """Log a non-blank command and print it when forced or tool streaming is on."""
+        if not command.strip():
+            return
+        if self.stream_log is not None:
+            self.stream_log.append({"kind": "tool_call", "command": command})
+        if force or self.wants_tool_stream:
+            self._start_section("tool_call", "[llm -> pifs command]")
+            print(command, file=self.output, flush=True)
+
+    def emit_tool_result(
+        self,
+        *,
+        ok: bool,
+        output: str,
+        seconds: float,
+        force: bool = False,
+        preview_chars: int = 1000,
+    ) -> None:
+        """Log a tool result and print a compact preview when forced or tool streaming is on."""
+        should_print = force or self.wants_tool_stream
+        if self.stream_log is None and not should_print:
+            return
+        # Build the preview once; it feeds both the log entry and the terminal output.
+        preview = compact_tool_output_preview(output, preview_chars=preview_chars)
+        if self.stream_log is not None:
+            self.stream_log.append(
+                {
+                    "kind": "tool_result",
+                    "ok": ok,
+                    "seconds": round(seconds, 4),
+                    "output_chars": len(output),
+                    "preview": preview,
+                }
+            )
+        if not should_print:
+            return
+        self._start_section("tool_result", "[pifs -> llm result preview]")
+        status = f"ok={str(ok).lower()} seconds={seconds:.4f} output_chars={len(output)}"
+        print(status, file=self.output, flush=True)
+        print(preview, file=self.output, flush=True)
+
+    def _start_section(self, kind: str, label: str) -> None:
+        """End any open section with a newline, then print the new section label."""
+        if self._printed_section is not None:
+            print(file=self.output, flush=True)
+        print(f"\n{label}", file=self.output, flush=True)
+        self._printed_section = kind
+
+
+def run_pifs_agent(
+    filesystem: PageIndexFileSystem,
+    question: str,
+    *,
+    model: str,
+    root: str = "/",
+    system_prompt: str | None = None,
+    max_turns: int = 20,
+    max_seconds: float | None = 60,
+    verbose: bool = False,
+    stream_mode: str = "off",
+    reasoning_effort: str | None = None,
+    reasoning_summary: str | None = None,
+    output_type: type[Any] | None = None,
+    tool_log: list[dict[str, Any]] | None = None,
+    agent_log: list[dict[str, Any]] | None = None,
+) -> str:
+    """Answer one question with a single-use PIFSAgentSession that keeps no conversation."""
+    return PIFSAgentSession(
+        filesystem,
+        model=model,
+        root=root,
+        system_prompt=system_prompt,
+        max_turns=max_turns,
+        max_seconds=max_seconds,
+        verbose=verbose,
+        stream_mode=stream_mode,
+        reasoning_effort=reasoning_effort,
+        reasoning_summary=reasoning_summary,
+        output_type=output_type,
+        tool_log=tool_log,
+        agent_log=agent_log,
+        persist_conversation=False,
+    ).run(question)
+
+
+class PIFSAgentSession:
+    """Agent bound to one PageIndexFileSystem, exposing a single PIFS "bash" tool.
+
+    With persist_conversation=True, turns share a SQLiteSession named "pifs-chat".
+    """
+
+    def __init__(
+        self,
+        filesystem: PageIndexFileSystem,
+        *,
+        model: str,
+        root: str = "/",
+        system_prompt: str | None = None,
+        max_turns: int = 20,
+        max_seconds: float | None = 60,
+        verbose: bool = False,
+        stream_mode: str = "off",
+        reasoning_effort: str | None = None,
+        reasoning_summary: str | None = None,
+        output_type: type[Any] | None = None,
+        tool_log: list[dict[str, Any]] | None = None,
+        agent_log: list[dict[str, Any]] | None = None,
+        persist_conversation: bool = True,
+    ) -> None:
+        self.filesystem = filesystem
+        self.max_turns = max_turns
+        self.max_seconds = max_seconds
+        self.verbose = verbose
+        self.tool_log = tool_log
+        self.agent_log = agent_log
+        self.normalized_stream_mode = normalize_agent_stream_mode(stream_mode)
+        self.observer: PIFSAgentStreamObserver | None = None
+
+        try:
+            from agents import Agent, OpenAIChatCompletionsModel, function_tool, set_tracing_disabled
+            from agents.memory import SQLiteSession
+            from openai import AsyncOpenAI
+        except ModuleNotFoundError as exc:
+            if exc.name == "agents":
+                raise RuntimeError(
+                    "openai-agents is required to run the PageIndex FileSystem agent"
+                ) from exc
+            raise
+
+        set_tracing_disabled(should_disable_pifs_agent_tracing())
+        self.executor = PIFSCommandExecutor(filesystem, json_output=False)
+        instructions = build_pifs_agent_instructions(
+            filesystem,
+            root=root,
+            system_prompt=system_prompt,
+            executor=self.executor,
+        )
+
+        @function_tool(description_override=BASH_TOOL_DESCRIPTION.strip())
+        def bash(command: str) -> str:
+            """Run an allowed PageIndex FileSystem virtual shell command."""
+            return self._run_bash(command)
+
+        model_settings = build_agent_model_settings(
+            reasoning_effort=reasoning_effort,
+            reasoning_summary=reasoning_summary,
+        )
+        base_url = os.environ.get("OPENAI_BASE_URL")
+        model_config = model
+        if should_use_openai_compatible_chat_model(base_url):  # custom endpoint: use a Chat Completions model
+            model_config = OpenAIChatCompletionsModel(
+                model=model,
+                openai_client=AsyncOpenAI(
+                    api_key=os.environ.get("OPENAI_API_KEY"),
+                    base_url=base_url,
+                ),
+            )
+
+        agent_kwargs: dict[str, Any] = {
+            "name": "PageIndexFileSystem",
+            "instructions": instructions,
+            "tools": [bash],
+            "model": model_config,
+        }
+        if model_settings is not None:
+            agent_kwargs["model_settings"] = model_settings
+        if output_type is not None:
+            agent_kwargs["output_type"] = output_type
+        self.agent = Agent(**agent_kwargs)
+        self.session = SQLiteSession("pifs-chat") if persist_conversation else None
+
+    def _new_observer(self) -> PIFSAgentStreamObserver:
+        return PIFSAgentStreamObserver(self.normalized_stream_mode, stream_log=self.agent_log)
+
+    def run(self, question: str) -> str:
+        """Answer question and return the serialized final output (TimeoutError on overrun)."""
+        self.executor.query_context = extract_agent_question_text(question)
+        observer = self.observer = self._new_observer()
+
+        async def _run_streamed() -> str:
+            from agents import Runner
+
+            streamed_run = Runner.run_streamed(
+                self.agent,
+                question,
+                max_turns=self.max_turns,
+                session=self.session,
+            )
+            final_output = ""
+            try:
+                async for event in streamed_run.stream_events():
+                    observer.handle_event(event)
+                final_output = serialize_agent_final_output(streamed_run.final_output)
+                return final_output
+            finally:
+                # Also reached on errors and cancellation: flush whatever output exists.
+                if not final_output and streamed_run.final_output:
+                    final_output = serialize_agent_final_output(streamed_run.final_output)
+                observer.finish(final_output)
+
+        async def _run() -> str:
+            # A missing or non-positive budget disables the timeout.
+            if self.max_seconds is None or self.max_seconds <= 0:
+                return await _run_streamed()
+            try:
+                return await asyncio.wait_for(_run_streamed(), timeout=self.max_seconds)
+            except asyncio.TimeoutError as exc:
+                raise TimeoutError(f"MaxSecondsExceeded: exceeded {self.max_seconds:g}s") from exc
+
+        try:
+            asyncio.get_running_loop()
+        except RuntimeError:
+            return asyncio.run(_run())
+        # A loop is already running here, and asyncio.run() cannot nest: use a worker thread.
+        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
+            return pool.submit(asyncio.run, _run()).result()
+
+    def _run_bash(self, command: str) -> str:
+        """Execute one PIFS command for the agent; command errors become "ERROR: ..." text."""
+        started = time.perf_counter()  # monotonic clock, unaffected by wall-clock changes
+        ok = True
+        if self.observer is None:  # tool used outside run(); an assert would vanish under -O
+            self.observer = self._new_observer()
+        self.observer.emit_tool_call(command, force=self.verbose)
+        try:
+            output = self.executor.execute(command)
+        except PIFSCommandError as exc:
+            ok = False
+            output = f"ERROR: {exc}"
+        seconds = time.perf_counter() - started
+        if self.tool_log is not None:
+            self.tool_log.append(
+                {
+                    "command": command,
+                    "ok": ok,
+                    "seconds": round(seconds, 4),
+                    "output_chars": len(output),
+                    "preview": output[:500],
+                }
+            )
+        self.observer.emit_tool_result(ok=ok, output=output, seconds=seconds, force=self.verbose)
+        return output
+
+
+def extract_agent_question_text(prompt: str) -> str:
+    """Return the text of the first non-empty "Question:" line, else the stripped prompt."""
+    text = str(prompt or "")
+    for row in text.splitlines():
+        if row.startswith("Question:") and row.partition(":")[2].strip():
+            return row.partition(":")[2].strip()
+    return text.strip()
diff --git a/pageindex/filesystem/cli.py b/pageindex/filesystem/cli.py
new file mode 100644
index 000000000..e808d32ea
--- /dev/null
+++ b/pageindex/filesystem/cli.py
@@ -0,0 +1,350 @@
+from __future__ import annotations
+
+import argparse
+import contextlib
+import json
+import os
+import re
+import shlex
+import sys
+from pathlib import Path
+from typing import Iterator, TextIO
+
+from .agent import (
+    PIFSAgentSession,
+    REASONING_EFFORT_CHOICES,
+    REASONING_SUMMARY_CHOICES,
+    run_pifs_agent,
+)
+from .commands import PIFSCommandError, PIFSCommandExecutor
+from .core import PageIndexFileSystem
+
+
+AGENT_STREAM_MODE_CHOICES = ("off", "tools", "model", "all")  # accepted --stream-mode values
+DEFAULT_AGENT_MODEL = "gpt-5.4-mini"  # used when neither PIFS_AGENT_MODEL nor PIFS_MODEL is set
+EXIT_COMMANDS = {"exit", "quit", ":q"}  # chat inputs (compared lower-cased) that end the session
+ANSI_ESCAPE_RE = re.compile(r"\x1b(?:\[[0-?]*[ -/]*[@-~]|.)")  # ESC + CSI sequence, or ESC + one char
+PIFS_CONFIG_FILE_ENV = "PIFS_CONFIG_FILE"  # env var that overrides the config file location
+PIFS_WORKSPACE_ENV = "PIFS_WORKSPACE"  # env var consulted when no --workspace is given
+
+
+def _config_path() -> Path:
+    """Return the config file path: $PIFS_CONFIG_FILE if set, else under the XDG config dir."""
+    if os.environ.get(PIFS_CONFIG_FILE_ENV):
+        return Path(os.environ[PIFS_CONFIG_FILE_ENV]).expanduser()
+    xdg_home = os.environ.get("XDG_CONFIG_HOME")
+    base = Path(xdg_home).expanduser() if xdg_home else Path.home() / ".config"
+    return base / "pageindex" / "pifs.json"
+
+
+def _read_config() -> dict[str, str]:
+    """Load the CLI config as a str->str mapping without null values ({} if the file is absent)."""
+    config_file = _config_path()
+    if not config_file.exists():
+        return {}
+    raw = json.loads(config_file.read_text(encoding="utf-8"))
+    if not isinstance(raw, dict):
+        raise ValueError(f"invalid PIFS config file: {config_file}")
+    return {str(name): str(setting) for name, setting in raw.items() if setting is not None}
+
+
+def _write_config(config: dict[str, str]) -> Path:
+    """Save *config* as sorted, indented JSON in the config file and return the file path."""
+    target = _config_path()
+    target.parent.mkdir(parents=True, exist_ok=True)
+    serialized = json.dumps(config, indent=2, sort_keys=True) + "\n"  # before opening: a failure keeps the old file
+    target.write_text(serialized, encoding="utf-8")
+    return target
+
+
+def _configured_workspace() -> str | None:  # the "workspace" entry of the config file, if any
+    return _read_config().get("workspace")
+
+
+def _resolve_workspace(value: str | None) -> str | None:  # precedence: value, env var, saved config
+    return value or os.environ.get(PIFS_WORKSPACE_ENV) or _configured_workspace()
+
+
+def _load_env_file(path: str | None = None, *, workspace: str | None = None) -> Path | None:
+    """Load one dotenv file and return its path, or None when no file is found.
+
+    An explicit *path* (or $PIFS_ENV_FILE) must exist, else FileNotFoundError, and is loaded with
+    override=True; otherwise the first .env at or above cwd, then *workspace*, loads with override=False.
+    """
+    from dotenv import load_dotenv
+
+    if path:
+        env_path = Path(path).expanduser()
+        if not env_path.exists():
+            raise FileNotFoundError(f"env file not found: {env_path}")
+        load_dotenv(env_path, override=True)
+        return env_path
+    env_override = os.environ.get("PIFS_ENV_FILE")
+    if env_override:
+        return _load_env_file(env_override)
+    starts = [Path.cwd()] + ([Path(workspace).expanduser()] if workspace else [])
+    seen: set[Path] = set()
+    for start in starts:
+        # resolve() is non-strict by default, so a missing start path needs no special case.
+        current = start.resolve()
+        if current.is_file():
+            current = current.parent
+        for parent in (current, *current.parents):
+            candidate = parent / ".env"
+            if candidate not in seen and candidate.exists():
+                load_dotenv(candidate, override=False)
+                return candidate
+            seen.add(candidate)
+    return None
+
+
+def _agent_model_default() -> str:
+    """Pick the default agent model: PIFS_AGENT_MODEL, then PIFS_MODEL, then the built-in."""
+    for variable in ("PIFS_AGENT_MODEL", "PIFS_MODEL"):
+        if os.environ.get(variable):
+            return os.environ[variable]
+    return DEFAULT_AGENT_MODEL
+
+
+def _add_agent_arguments(
+    parser: argparse.ArgumentParser,
+    *,
+    workspace_default: str | None,
+    default_stream_mode: str,
+) -> None:
+    """Add the agent options used by ``pifs ask`` and ``pifs chat`` to *parser*.
+
+    *workspace_default* and *default_stream_mode* become the defaults of
+    ``--workspace`` and ``--stream-mode``; the ``--model`` default comes from
+    ``_agent_model_default()``.
+    """
+    add = parser.add_argument
+    add("--workspace", default=workspace_default)
+    add("--env-file", default=None)
+    add("--model", default=_agent_model_default())
+    add("--stream-mode", default=default_stream_mode, choices=AGENT_STREAM_MODE_CHOICES)
+    # Run limits handed to the agent.
+    add("--max-turns", type=int, default=20)
+    add("--max-seconds", type=float, default=60)
+    # Reasoning controls stay unset (None) unless given explicitly.
+    for flag, allowed in (
+        ("--reasoning-effort", REASONING_EFFORT_CHOICES),
+        ("--reasoning-summary", REASONING_SUMMARY_CHOICES),
+    ):
+        add(flag, choices=allowed, default=None)
+
+
+def _parse_agent_command(
+    command_name: str,
+    argv: list[str],
+    *,
+    workspace_default: str | None,
+    default_stream_mode: str,
+) -> argparse.Namespace:
+    """Parse ``pifs <command_name>`` options, load the env file and resolve the workspace.
+
+    Exits via ``parser.error`` when no workspace is given, set in PIFS_WORKSPACE or saved.
+    """
+    title = f"PageIndex FileSystem {command_name}"
+    parser = argparse.ArgumentParser(prog=f"pifs {command_name}", description=title)
+    shared = {"workspace_default": workspace_default, "default_stream_mode": default_stream_mode}
+    _add_agent_arguments(parser, **shared)
+    if command_name == "ask":
+        parser.add_argument("question", nargs=argparse.REMAINDER)
+    parsed = parser.parse_args(argv)
+    # The env file is loaded before the workspace falls back to PIFS_WORKSPACE / saved config.
+    _load_env_file(parsed.env_file, workspace=parsed.workspace)
+    parsed.workspace = _resolve_workspace(parsed.workspace)
+    if not parsed.workspace:
+        parser.error("--workspace is required unless PIFS_WORKSPACE is set or `pifs set workspace <path>` has been run")
+    return parsed
+
+
+def _filesystem_from_workspace(workspace: str) -> PageIndexFileSystem:  # open the workspace at this path
+    filesystem = PageIndexFileSystem(Path(workspace).expanduser())
+    with contextlib.suppress(Exception):  # best-effort: any failure of this step is ignored
+        filesystem.configure_existing_projection_retrieval()
+    return filesystem
+
+
+def _agent_kwargs(args: argparse.Namespace) -> dict[str, object]:
+    """Collect the agent options parsed from the command line as keyword arguments."""
+    names = (
+        "model",
+        "stream_mode",
+        "max_turns", "max_seconds",
+        "reasoning_effort", "reasoning_summary",
+    )
+    return {name: getattr(args, name) for name in names}
+
+
+def _sanitize_chat_question(raw: str) -> str:
+    """Clean a line typed at the chat prompt.
+
+    Removes ANSI escape sequences, applies backspace/delete characters to the
+    text kept before them, and drops every other control character.
+    """
+    kept: list[str] = []
+    for symbol in ANSI_ESCAPE_RE.sub("", raw):
+        if symbol in "\b\x7f":
+            # Erase the previously kept character, if any.
+            del kept[-1:]
+        elif ord(symbol) >= 32:
+            kept.append(symbol)
+    return "".join(kept).strip()
+
+
+@contextlib.contextmanager
+def _suppress_tty_input_echo(stdin: TextIO | None = None) -> Iterator[None]:  # mute terminal echo inside the block
+    stream = sys.stdin if stdin is None else stdin
+    if not hasattr(stream, "isatty") or not stream.isatty():
+        yield  # not a terminal: nothing to change
+        return
+    try:
+        import termios
+        # Save the current terminal attributes, then clear the ECHO bit in the flags at index 3.
+        fd = stream.fileno()
+        original = termios.tcgetattr(fd)
+        muted = original[:]
+        muted[3] = muted[3] & ~termios.ECHO
+        termios.tcsetattr(fd, termios.TCSADRAIN, muted)
+    except Exception:
+        yield  # termios import or terminal setup failed: run the block with echo unchanged
+        return
+    try:
+        yield
+    finally:
+        with contextlib.suppress(Exception):
+            termios.tcflush(fd, termios.TCIFLUSH)  # discard pending, unread input
+        with contextlib.suppress(Exception):
+            termios.tcsetattr(fd, termios.TCSADRAIN, original)  # restore the saved attributes
+
+
+def _run_ask(argv: list[str], *, workspace_default: str | None) -> int:
+    """Run ``pifs ask``: answer one question with the agent and return exit code 0.
+
+    The answer is printed here only when ``--stream-mode`` is ``off``.
+    Raises ValueError when no question text is given.
+    """
+    args = _parse_agent_command("ask", argv, workspace_default=workspace_default, default_stream_mode="off")
+    # A literal "--" token is dropped from the question words.
+    question = " ".join(word for word in args.question if word != "--").strip()
+    if not question:
+        raise ValueError("ask requires a question")
+
+    filesystem = _filesystem_from_workspace(args.workspace)
+    answer = run_pifs_agent(filesystem, question, **_agent_kwargs(args))
+    if args.stream_mode == "off":
+        print(answer)
+    return 0
+
+
+def _run_chat(argv: list[str], *, workspace_default: str | None) -> int:
+    """Run ``pifs chat``: an interactive ``pifs> `` prompt backed by one agent session.
+
+    The loop ends on EOF, Ctrl-C at the prompt or an exit command; returns exit code 0.
+    """
+    args = _parse_agent_command("chat", argv, workspace_default=workspace_default, default_stream_mode="all")
+    filesystem = _filesystem_from_workspace(args.workspace)
+    session = PIFSAgentSession(filesystem, **_agent_kwargs(args))
+    echo_answer = args.stream_mode == "off"
+    while True:
+        try:
+            question = _sanitize_chat_question(input("pifs> "))
+        except EOFError:
+            break
+        except KeyboardInterrupt:
+            print()  # end the interrupted prompt line
+            break
+        if question.lower() in EXIT_COMMANDS:
+            break
+        if question:
+            # Terminal echo stays off while the agent is running.
+            with _suppress_tty_input_echo():
+                answer = session.run(question)
+            if echo_answer:
+                print(answer)
+    return 0
+
+
+def _run_passthrough(
+    command_tokens: list[str],
+    *,
+    workspace: str,
+    json_output: bool,
+) -> int:
+    """Run one PIFS shell command against *workspace*, print its output and return 0."""
+    executor = PIFSCommandExecutor(_filesystem_from_workspace(workspace), json_output=json_output)
+    # Each token is shell-quoted before joining, so a token with spaces stays one word.
+    print(executor.execute(" ".join(map(shlex.quote, command_tokens))))
+    return 0
+
+
+def _run_set(argv: list[str]) -> int:
+    """Handle ``pifs set <name> <value>``: save a CLI default, print it and return 0."""
+    parser = argparse.ArgumentParser(prog="pifs set", description="Set PageIndex FileSystem CLI defaults")
+    parser.add_argument("name", choices=["workspace"])
+    parser.add_argument("value")
+    args = parser.parse_args(argv)
+
+    config = _read_config()
+    if args.name != "workspace":
+        # Defensive: argparse ``choices`` already rejects any other name.
+        raise ValueError(f"unknown config key: {args.name}")
+    # The path is expanded and resolved before it is stored.
+    workspace = Path(args.value).expanduser().resolve(strict=False)
+    config["workspace"] = str(workspace)
+    path = _write_config(config)
+    print(f"workspace: {workspace}")
+    print(f"config: {path}")
+    return 0
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run the ``pifs`` command line and return the process exit code.
+
+    ``set``, ``ask`` and ``chat`` go to their handlers; any other command is passed
+    to the PIFS command executor. Returns 0 on success, 2 for a PIFSCommandError and
+    1 for any other error; both error kinds are printed to stderr as ``ERROR: ...``.
+    """
+    argv = list(sys.argv[1:] if argv is None else argv)
+    parser = argparse.ArgumentParser(description="PageIndex FileSystem CLI")
+    parser.add_argument("--workspace", default=None)
+    parser.add_argument("--env-file", default=None)
+    parser.add_argument("--json", action="store_true", dest="json_output")
+    parser.add_argument("command", nargs=argparse.REMAINDER)
+    args = parser.parse_args(argv)
+
+    try:
+        # Env and config loading run inside the try so that a missing --env-file or an
+        # unreadable config file is reported as ERROR (exit 1) instead of a traceback.
+        _load_env_file()
+        _load_env_file(args.env_file, workspace=args.workspace)
+        args.workspace = _resolve_workspace(args.workspace)
+        command_tokens = [token for token in args.command if token != "--"]
+        if not command_tokens:
+            parser.error("a filesystem command is required")
+        command_name, command_args = command_tokens[0], command_tokens[1:]
+        if command_name == "set":
+            return _run_set(command_args)
+        if command_name == "ask":
+            return _run_ask(command_args, workspace_default=args.workspace)
+        if command_name == "chat":
+            return _run_chat(command_args, workspace_default=args.workspace)
+
+        # A --json placed after the command name is consumed here as well.
+        json_output = args.json_output or "--json" in command_tokens
+        command_tokens = [token for token in command_tokens if token != "--json"]
+        if not args.workspace:
+            parser.error("--workspace is required unless PIFS_WORKSPACE is set or `pifs set workspace <path>` has been run")
+        return _run_passthrough(command_tokens, workspace=args.workspace, json_output=json_output)
+    except PIFSCommandError as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
+        return 2
+    except Exception as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
+        return 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # exit with main()'s return code
diff --git a/pageindex/filesystem/commands.py b/pageindex/filesystem/commands.py
new file mode 100644
index 000000000..18a85cc2f
--- /dev/null
+++ b/pageindex/filesystem/commands.py
@@ -0,0 +1,2226 @@
+from __future__ import annotations
+
+import json
+import re
+import shlex
+import subprocess
+from dataclasses import asdict, is_dataclass
+from pathlib import Path
+from typing import Any
+
+from .core import SEMANTIC_GREP_CHANNELS, SEMANTIC_RETRIEVAL_CHANNELS, PageIndexFileSystem
+
+
+class PIFSCommandError(ValueError):
+    """Raised when a PIFS shell command is invalid or cannot be executed."""
+
+
+class PIFSCommandExecutor:
+    FORBIDDEN_SUBSTRINGS = (";", "`", "$(", "||", "\n", "\r")
+    FORBIDDEN_TOKENS = {"|", ">", "<", ">>", "<<", "&"}
+    BASE_ALLOWED_COMMANDS = {  # always part of allowed_commands()
+        "ls",
+        "tree",
+        "find",
+        "grep",
+        "cat",
+        "stat",
+        "head",
+        "tail",
+        "sed",
+    }
+    SEMANTIC_CHANNEL_COMMANDS = {  # semantic channel -> command allowed when that channel is available
+        "summary": "search-summary",
+        "entity": "search-entity",
+        "relation": "search-relation",
+    }
+    ALLOWED_COMMANDS = (
+        BASE_ALLOWED_COMMANDS
+        | {"semantic-grep"}
+        | set(SEMANTIC_CHANNEL_COMMANDS.values())
+    )
+    ALLOWED_PIPE_FILTERS = {"head", "tail", "grep", "sed"}  # commands accepted as a pipe stage
+    COMMAND_METHODS = {  # handlers whose method name is not simply _cmd_<name>
+        "search-summary": "_cmd_search_summary",
+        "search-entity": "_cmd_search_entity",
+        "search-relation": "_cmd_search_relation",
+        "semantic-grep": "_cmd_semantic_grep",
+    }
+    MAX_CHAINED_COMMANDS = 3  # enforced in execute()
+    MAX_PIPE_COMMANDS = 3  # enforced in _execute_pipeline()
+    MAX_LS_LIMIT = 100  # cap and default for ls --limit
+    MAX_TREE_LIMIT = 200  # cap and default for tree --limit
+    MAX_FIND_LIMIT = 50  # cap for find --limit
+    MAX_GREP_LIMIT = 20  # cap for grep --limit
+    MAX_SEMANTIC_LIMIT = 20
+    MAX_TEXT_LINES = 100
+    MAX_PAGE_SPAN = 5
+    MAX_STRUCTURE_NODES = 25  # cap and default for cat --structure --limit
+    MAX_NODE_IDS = 10
+    MAX_NODE_TEXT_LINES = 100
+    MAX_NODE_TEXT_CHARS = 12_000
+    MAX_STAT_FIELD_TARGETS = 20
+    MAX_TREE_DEPTH = 4  # larger tree depths are clamped to this value
+    MAX_LS_RENDER_FILES = 25
+    MAX_STAT_METADATA_FIELDS = 8
+    SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT = 20
+    GREP_RECURSIVE_FOLDER_DEPTH_LIMIT = 2
+    GREP_RECURSIVE_FOLDER_FILE_LIMIT = 10
+
+    def __init__(
+        self,
+        filesystem: PageIndexFileSystem,
+        *,
+        json_output: bool = False,
+        query_context: str | None = None,
+    ):
+        """Bind the executor to *filesystem*; *json_output* applies when a command has no --json."""
+        self.filesystem, self.json_output = filesystem, json_output
+        self.query_context = query_context
+
+    def allowed_commands(self) -> set[str]:
+        """Return base commands plus the semantic commands whose channels this workspace has."""
+        available = set(self.filesystem.semantic_retrieval_channels())
+        semantic = {
+            self.SEMANTIC_CHANNEL_COMMANDS[name] for name in SEMANTIC_RETRIEVAL_CHANNELS if name in available
+        }
+        if any(name in available for name in SEMANTIC_GREP_CHANNELS):
+            semantic.add("semantic-grep")
+        return set(self.BASE_ALLOWED_COMMANDS) | semantic
+
+    def command_capabilities(self) -> dict[str, Any]:
+        """Describe the allowed commands (sorted) and the filesystem's retrieval capabilities."""
+        allowed = sorted(self.allowed_commands())
+        retrieval = self.filesystem.retrieval_capabilities()
+        return {"allowed_commands": allowed, "retrieval": retrieval}
+
+    def describe_available_command_surfaces(self) -> str:
+        """Return a bullet-list description of the command surfaces this workspace offers."""
+        semantic = self.filesystem.retrieval_capabilities()["semantic"]
+        channels = set(semantic["channels"])
+        lines = [
+            "Available command surfaces for this workspace:",
+            "- mode: read-only inspection",
+            "- ls/tree: folder browsing",
+            "- find <folder>: folder path is positional; do not put paths in --where",
+            "- find --where: exact/canonical metadata DSL filtering using stat --schema fields only",
+            "- find <folder> -maxdepth N -type f|d: bounded folder traversal for find",
+            "- grep -R: recursive lexical/FTS search only; semantic vector prefilter is disabled",
+            "- cat <path|file_ref|document_id> --structure: cached PageIndex node list, paginated at 25 nodes",
+            "- cat <path|file_ref|document_id> --page: cached PageIndex page reads, limited to 5 pages",
+            "- cat <path|file_ref|document_id> --node: cached PageIndex node reads, limited to 10 node ids",
+            "- cat <path|file_ref|document_id> --all: text artifact reads for txt/text files, paginated at 100 lines",
+            "- stat --field <metadata_field> <target...>: one metadata field across up to 20 documents",
+        ]
+        if "entity" in channels:
+            lines.append("- find --name: entity semantic candidate discovery alias")
+        if "relation" in channels:
+            lines.append("- find --relation: relation semantic candidate discovery alias")
+        # One line per semantic channel that is available, in SEMANTIC_RETRIEVAL_CHANNELS order.
+        lines.extend(
+            f"- {self.SEMANTIC_CHANNEL_COMMANDS[channel]}: "
+            f"{channel} semantic vector candidate discovery"
+            for channel in SEMANTIC_RETRIEVAL_CHANNELS
+            if channel in channels
+        )
+        grep_channels = semantic.get("semantic_grep_channels") or []
+        if grep_channels:
+            lines.append(
+                "- semantic-grep -R: semantic candidates from "
+                + ", ".join(grep_channels)
+                + " indexes followed by real line matching"
+            )
+        if not semantic.get("commands"):
+            lines.append("- semantic vector commands: none available in this workspace")
+        lines.append("- grep <query> <path|file_ref|document_id>, cat, stat: evidence inspection")
+        return "\n".join(lines)
+
+    def execute(self, command: str) -> str:
+        """Run a command line (up to MAX_CHAINED_COMMANDS chained parts) and return the joined output."""
+        try:
+            if not command.strip():
+                raise PIFSCommandError("Empty command")
+            commands = self._split_chained_commands(command)
+            if len(commands) > self.MAX_CHAINED_COMMANDS:
+                raise PIFSCommandError(
+                    f"Command chain supports at most {self.MAX_CHAINED_COMMANDS} commands. Run fewer commands "
+                    "or narrow the request first; if you are unsure where to inspect, use cat <target> --structure."
+                )
+            if len(commands) > 1:
+                return "\n".join(self._execute_pipeline(part) for part in commands)
+            return self._execute_pipeline(commands[0])
+        except PIFSCommandError:
+            raise
+        except (LookupError, ValueError) as exc:  # LookupError covers IndexError (option without its value)
+            raise PIFSCommandError(self._clean_error_message(exc)) from exc
+
+    def _execute_pipeline(self, command: str) -> str:
+        """Run the first pipe stage as a command and pass its output through the later stages."""
+        stages = self._split_piped_commands(command)
+        if len(stages) > self.MAX_PIPE_COMMANDS:
+            raise PIFSCommandError(
+                f"Pipeline supports at most {self.MAX_PIPE_COMMANDS} commands. Use a smaller command and "
+                "explicit limits; if you are unsure where to inspect, use cat <target> --structure."
+            )
+        text = self._execute_single(stages[0])
+        for stage in stages[1:]:
+            text = self._execute_pipe_filter(text, stage)
+        return text
+
+    def _execute_single(self, command: str) -> str:
+        """Validate, tokenize and run one command (no pipes) and return its rendered result."""
+        self._validate_raw_command(command)
+        try:
+            tokens = shlex.split(command)
+        except ValueError as exc:
+            raise PIFSCommandError(f"Invalid command syntax: {exc}") from exc
+        if not tokens:
+            raise PIFSCommandError("Empty command")
+        self._validate_tokens(tokens)
+        # An inline --json flag forces JSON output; otherwise the executor default applies.
+        wants_json = True if "--json" in tokens else self.json_output
+        words = [token for token in tokens if token != "--json"]
+        name = words[0]
+        if name not in self.allowed_commands():
+            raise PIFSCommandError(f"Unsupported command: {name}")
+        # Handlers are named _cmd_<name> unless COMMAND_METHODS maps the command explicitly.
+        handler = getattr(self, self.COMMAND_METHODS.get(name, f"_cmd_{name}"))
+        data = handler(words[1:])
+        return self._render(data, json_output=wants_json, command_name=name)
+
+    def _execute_pipe_filter(self, input_text: str, command: str) -> str:
+        """Apply one pipe stage (head, tail, grep or sed) to *input_text* and return the result."""
+        self._validate_raw_command(command)
+        try:
+            tokens = shlex.split(command)
+        except ValueError as exc:
+            raise PIFSCommandError(f"Invalid command syntax: {exc}") from exc
+        if not tokens:
+            raise PIFSCommandError("Empty pipe command")
+        self._validate_tokens(tokens)
+        name, options = tokens[0], tokens[1:]
+        if name not in self.ALLOWED_PIPE_FILTERS:
+            raise PIFSCommandError(
+                f"Unsupported pipe command: {name}. Supported pipes are: "
+                f"{', '.join(sorted(self.ALLOWED_PIPE_FILTERS))}. "
+                "If you meant regex alternation such as a|b, PIFS grep/search "
+                "does not support it; run multiple grep or search-summary "
+                "commands with one phrase each."
+            )
+        # head and tail share one helper; from_tail selects which end is kept.
+        if name in {"head", "tail"}:
+            return self._pipe_head_tail(input_text, options, from_tail=(name == "tail"))
+        if name == "grep":
+            return self._pipe_grep(input_text, options)
+        if name == "sed":
+            return self._pipe_sed(input_text, options)
+        raise PIFSCommandError(f"Unsupported pipe command: {name}")
+
+    def _cmd_ls(self, args: list[str]) -> Any:
+        """List a folder: ``ls [-R] [--limit N] [path]`` (path defaults to ``/``)."""
+        recursive, path = False, "/"
+        limit = self.MAX_LS_LIMIT
+        i = 0
+        while i < len(args):
+            arg = args[i]
+            if arg in {"-R", "-r", "--recursive"}:
+                recursive = True
+            elif arg == "--limit":
+                i += 1
+                if i >= len(args):  # previously an uncaught IndexError
+                    raise PIFSCommandError("ls --limit requires a value")
+                limit = self._parse_bounded_int(args[i], "ls --limit", max_value=self.MAX_LS_LIMIT)
+            elif arg.startswith("-"):
+                raise PIFSCommandError(f"Unsupported ls option: {arg}")
+            else:
+                path = arg
+            i += 1
+        return self.filesystem.browse(path, recursive=recursive, limit=limit)
+
+    def _cmd_tree(self, args: list[str]) -> Any:
+        """Recursive listing: ``tree [--depth|-L N] [--limit N] [path]`` (path defaults to ``/``)."""
+        path, depth = "/", 2
+        limit = self.MAX_TREE_LIMIT
+        i = 0
+        while i < len(args):
+            arg = args[i]
+            if arg in {"--limit", "--depth", "-L"}:
+                i += 1
+                if i >= len(args):  # previously an uncaught IndexError
+                    raise PIFSCommandError(f"tree {arg} requires a value")
+                if arg == "--limit":
+                    limit = self._parse_bounded_int(args[i], "tree --limit", max_value=self.MAX_TREE_LIMIT)
+                else:
+                    depth = self._parse_non_negative_int(args[i], "tree --depth")
+            elif arg.startswith("-"):
+                raise PIFSCommandError(f"Unsupported tree option: {arg}")
+            else:
+                path = arg
+            i += 1
+        if depth < 1:
+            raise PIFSCommandError("tree --depth must be at least 1")
+        # Depths above MAX_TREE_DEPTH are clamped rather than rejected.
+        depth = min(depth, self.MAX_TREE_DEPTH)
+        listing = self.filesystem.browse(path, recursive=True, limit=limit)
+        return {"path": path, "depth": depth, "limit": limit, **listing}
+
+    def _cmd_find(self, args: list[str]) -> Any:
+        """Find documents, or folders with ``-type d``, below a folder (default ``/``).
+
+        Raises PIFSCommandError for unsupported options and for options missing their value.
+        """
+        path = "/"
+        limit = 10
+        where = name = relation = file_type = max_depth = None
+        i = 0
+        while i < len(args):
+            arg = args[i]
+            if arg in {"--where", "--name", "--relation", "--limit", "-type"} and i + 1 >= len(args):
+                raise PIFSCommandError(f"find {arg} requires a value")  # was an uncaught IndexError
+            if arg == "--where":
+                i += 1
+                where = args[i]
+            elif arg == "--name":
+                i += 1
+                name = args[i]
+            elif arg == "--relation":
+                i += 1
+                relation = args[i]
+            elif arg == "--limit":
+                i += 1
+                limit = self._parse_bounded_int(args[i], "find --limit", max_value=self.MAX_FIND_LIMIT)
+            elif arg == "-type":
+                i += 1
+                file_type = args[i]
+            elif arg == "-maxdepth":
+                i += 1
+                max_depth = self._parse_find_maxdepth(args[i] if i < len(args) else None)
+            elif arg.startswith("-"):
+                raise PIFSCommandError(f"Unsupported find option: {arg}")
+            else:
+                path = arg
+            i += 1
+        if file_type and file_type not in {"f", "d"}:
+            raise PIFSCommandError("find -type supports only f or d")
+        if name and relation:
+            raise PIFSCommandError("find supports only one of --name or --relation")
+        if file_type == "d":
+            if where:
+                return self.filesystem.find_folders(
+                    path,
+                    metadata_filter=where,
+                    limit=limit,
+                    max_depth=max_depth,
+                )
+            folders = self.filesystem.browse(
+                path,
+                recursive=True,
+                limit=limit,
+                max_depth=max_depth,
+            )["folders"]
+            if max_depth is not None and limit != 0:
+                return [self.filesystem.folder_info(path), *folders][:limit]
+            return folders
+        scope = {"folder_path": path, "recursive": True}
+        if max_depth is not None:
+            if max_depth == 0:
+                return []
+            scope["max_depth"] = max_depth
+        if relation:
+            if not self.filesystem.has_semantic_channel("relation"):
+                raise PIFSCommandError(
+                    "find --relation requires a relation semantic index in this workspace"
+                )
+            return self.filesystem.search_semantic_channel(
+                "relation",
+                self._semantic_retrieval_query(relation),
+                scope=scope,
+                metadata_filter=where,
+                limit=limit,
+            )
+        if name and self.filesystem.has_semantic_channel("entity"):
+            return self.filesystem.search_semantic_channel(
+                "entity",
+                self._semantic_retrieval_query(name),
+                scope=scope,
+                metadata_filter=where,
+                limit=limit,
+            )
+        return self.filesystem.search(
+            query=name,
+            scope=scope,
+            metadata_filter=where,
+            limit=limit,
+            semantic=False,
+        )
+
+    def _cmd_grep(self, args: list[str]) -> Any:
+        """Lexical search: ``grep [-R] [--where F] [--limit N] <query> [target]``.
+
+        A folder target yields file hits (mode "files"); a recursive search in a folder that
+        has sub-folders may instead yield ranked child folders (mode "folders") or a limit
+        notice. Any other target yields line matches (mode "matches").
+        Raises PIFSCommandError for a missing query, bad options or an option missing its value.
+        """
+        recursive, where, limit = False, None, 10
+        positionals = []
+        i = 0
+        while i < len(args):
+            arg = args[i]
+            if arg in {"--where", "--limit"} and i + 1 >= len(args):
+                raise PIFSCommandError(f"grep {arg} requires a value")  # was an uncaught IndexError
+            if arg in {"-R", "-r", "--recursive"}:
+                recursive = True
+            elif self._is_combined_grep_flag(arg):
+                recursive = recursive or "R" in arg or "r" in arg
+            elif arg in {"-n", "--line-number", "-i", "--ignore-case"}:
+                pass  # accepted but has no effect here
+            elif arg == "--where":
+                i += 1
+                where = args[i]
+            elif arg == "--limit":
+                i += 1
+                limit = self._parse_bounded_int(args[i], "grep --limit", max_value=self.MAX_GREP_LIMIT)
+            elif arg.startswith("-"):
+                raise PIFSCommandError(f"Unsupported grep option: {arg}")
+            else:
+                positionals.append(arg)
+            i += 1
+        if not positionals:
+            raise PIFSCommandError("grep requires a query")
+        query = positionals[0]
+        self._reject_regex_alternation_query(query, "grep")
+        path = positionals[1] if len(positionals) > 1 else "/"
+        # Folder targets are searched per file; any other target falls through to line matching.
+        if self._is_folder(path):
+            normalized = self._normalize_folder_path(path)
+            if recursive:
+                limit_notice = self._recursive_grep_limit_notice(normalized, query)
+                if limit_notice:
+                    return limit_notice
+                children = self.filesystem.browse(normalized, recursive=False, limit=1000)["folders"]
+                if children:
+                    # With sub-folders: return direct hits if any, otherwise rank the child folders.
+                    direct_results = self.filesystem.search(
+                        query=query,
+                        scope={"folder_path": normalized, "recursive": False},
+                        metadata_filter=where,
+                        limit=limit,
+                        semantic=False,
+                    )
+                    if direct_results:
+                        return {
+                            "mode": "files",
+                            "query": query,
+                            "scope": normalized,
+                            "data": self._grep_file_hits_from_results(direct_results, query),
+                        }
+                    if where is None:
+                        direct_source_hits = self._grep_source_file_hits(
+                            normalized,
+                            query,
+                            limit=limit,
+                            direct_only=True,
+                        )
+                        if direct_source_hits:
+                            return {
+                                "mode": "files",
+                                "query": query,
+                                "scope": normalized,
+                                "data": direct_source_hits,
+                            }
+                    ranked = self._rank_child_folders(
+                        query=query,
+                        children=children,
+                        metadata_filter=where,
+                        limit=limit,
+                    )
+                    if not ranked and where is None:
+                        ranked = self._rank_child_folders_from_source(
+                            query=query,
+                            parent_path=normalized,
+                            children=children,
+                            limit=limit,
+                        )
+                    return {
+                        "mode": "folders",
+                        "query": query,
+                        "scope": normalized,
+                        "data": ranked,
+                        "hint": "narrow into one directory, then run grep -R again",
+                    }
+            results = self.filesystem.search(
+                query=query,
+                scope={"folder_path": normalized, "recursive": recursive},
+                metadata_filter=where,
+                limit=limit,
+                semantic=False,
+            )
+            # Source-file hits are used only when the search found nothing and no --where is set.
+            if not results and where is None:
+                hits = self._grep_source_file_hits(normalized, query, limit=limit)
+            else:
+                hits = self._grep_file_hits_from_results(results, query)
+            return {"mode": "files", "query": query, "scope": normalized, "data": hits}
+        return {
+            "mode": "matches",
+            "query": query,
+            "target": path,
+            "data": self._grep_file_matches(path, query, limit=limit),
+        }
+
+    def _cmd_cat(self, args: list[str]) -> Any:
+        """Handle ``cat``: read a file as text lines, structure, nodes, or pages.
+
+        The first argument is the file target; every option must follow it.
+        ``--structure``, ``--node`` and ``--page`` each overwrite the
+        structural mode, so when several are given the one parsed last decides
+        which branch runs. Without any of them the command falls back to a
+        bounded line read controlled by ``--range`` / ``--all``.
+
+        Raises:
+            PIFSCommandError: for a missing target, an option placed before
+                the target, an option without its value, an unsupported
+                option, an extra positional argument, ``--node`` without ids,
+                a ``--structure --limit`` below 1, or a malformed ``--page``
+                selector.
+        """
+        if not args:
+            raise PIFSCommandError("cat requires a file target")
+        target = args[0]
+        if target.startswith("-"):
+            raise PIFSCommandError(
+                "cat syntax is target-first: cat <path|file_ref|document_id> --structure, "
+                "cat <path|file_ref|document_id> --page 31-59, or "
+                "cat <path|file_ref|document_id> --node 0009"
+            )
+        location = "all"
+        structural_mode: str | None = None
+        node_ids: list[str] = []
+        page_range: str | None = None
+        structure_offset = 0
+        structure_limit = self.MAX_STRUCTURE_NODES
+        # args[0] is the target, so option parsing starts at index 1.
+        i = 1
+        while i < len(args):
+            arg = args[i]
+            if arg == "--range":
+                i += 1
+                if i >= len(args):
+                    raise PIFSCommandError("cat --range requires a range")
+                location = args[i]
+            elif arg == "--all":
+                location = "all"
+            elif arg == "--structure":
+                structural_mode = "structure"
+            elif arg == "--offset":
+                i += 1
+                if i >= len(args):
+                    raise PIFSCommandError("cat --structure --offset requires a value")
+                structure_offset = self._parse_non_negative_int(args[i], "cat --structure --offset")
+            elif arg == "--limit":
+                i += 1
+                if i >= len(args):
+                    raise PIFSCommandError("cat --structure --limit requires a value")
+                structure_limit = self._parse_bounded_int(
+                    args[i],
+                    "cat --structure --limit",
+                    max_value=self.MAX_STRUCTURE_NODES,
+                )
+            elif arg == "--node":
+                i += 1
+                if i >= len(args):
+                    raise PIFSCommandError("cat --node requires a node id")
+                structural_mode = "node"
+                # Consume every following token up to the next option; each
+                # token may itself be a comma-separated list of node ids.
+                while i < len(args) and not args[i].startswith("-"):
+                    node_ids.extend(self._parse_node_ids(args[i]))
+                    i += 1
+                # Step back one so the shared ``i += 1`` at the bottom of the
+                # loop lands on the first token this branch did not consume.
+                i -= 1
+            elif arg == "--page":
+                i += 1
+                if i >= len(args):
+                    raise PIFSCommandError("cat --page requires a page range")
+                structural_mode = "page"
+                page_range = args[i]
+            elif arg.startswith("-"):
+                raise PIFSCommandError(f"Unsupported cat option: {arg}")
+            else:
+                # Any bare token after the target is rejected; the message
+                # explains quoting because unquoted spaces are a likely cause.
+                raise PIFSCommandError(
+                    "cat accepts one file target. Use target-first syntax: "
+                    "cat <path|file_ref|document_id> --structure, "
+                    "cat <path|file_ref|document_id> --node 0002 0004, or "
+                    "cat <path|file_ref|document_id> --page 31-33. "
+                    f"Unexpected extra argument: {arg!r}. If the target path or title contains "
+                    "spaces, quote the whole target, for example: cat \"/documents/report name.pdf\" "
+                    "--structure. If a title-derived path is ambiguous, use the file_ref or "
+                    "document_id instead."
+                )
+            i += 1
+        if structural_mode == "structure":
+            if structure_limit < 1:
+                raise PIFSCommandError(
+                    "cat --structure --limit must be at least 1 and at most "
+                    f"{self.MAX_STRUCTURE_NODES}."
+                )
+            data = self.filesystem.pageindex_structure(
+                target,
+                offset=structure_offset,
+                limit=structure_limit,
+            )
+            self._attach_structure_next_command(data, target)
+            return data
+        if structural_mode == "node":
+            self._require_at_most(
+                len(node_ids),
+                "cat --node node count",
+                self.MAX_NODE_IDS,
+            )
+            # Reached when --node was followed directly by another option or
+            # only by empty/comma-only tokens.
+            if not node_ids:
+                raise PIFSCommandError("cat --node requires a node id")
+            node_results = [
+                self._bounded_node_result(
+                    self.filesystem.pageindex_node(target, node_id),
+                    target=target,
+                    node_id=node_id,
+                )
+                for node_id in node_ids
+            ]
+            # A single node is returned unwrapped; several nodes are wrapped in
+            # a "nodes" envelope with their texts concatenated under headers.
+            if len(node_results) == 1:
+                return node_results[0]
+            return {
+                "mode": "nodes",
+                "target": target,
+                "available": all(result.get("available") is not False for result in node_results),
+                "node_ids": node_ids,
+                "nodes": node_results,
+                "text": "\n\n".join(
+                    f"[node {result.get('node_id') or node_id}]\n{result.get('text', '')}"
+                    for node_id, result in zip(node_ids, node_results)
+                ),
+            }
+        if structural_mode == "page":
+            # Only "N" or "N-M" is accepted; this regex check runs before the
+            # numeric parse so other shapes get the usage message.
+            if not page_range or not re.fullmatch(r"\d+(?:-\d+)?", page_range):
+                raise PIFSCommandError(
+                    "cat --page requires one page selector like 31 or 31-59. "
+                    "Use: cat <path|file_ref|document_id> --page <page-or-range>"
+                )
+            start, end = self._parse_numeric_range(page_range, "cat --page")
+            self._require_at_most(
+                end - start + 1,
+                "cat --page page count",
+                self.MAX_PAGE_SPAN,
+            )
+            data = self.filesystem.pageindex_pages(target, page_range)
+            self._attach_page_next_command(data, target, start=start, end=end)
+            return data
+        # No structural mode requested: plain bounded line read.
+        return self._bounded_text_artifact(target, location)
+
+    def _cmd_stat(self, args: list[str]) -> Any:
+        """Handle ``stat``: metadata schema, one field across files, or full file info.
+
+        ``--schema`` returns the metadata schema and may not be combined with
+        anything else. ``--field NAME`` returns one row per target for that
+        field. Otherwise each target's stat record is returned, unwrapped for a
+        single target and inside a ``"files"`` envelope for several.
+
+        Raises:
+            PIFSCommandError: for ``--field`` without a name, an unsupported
+                option, ``--schema`` mixed with other arguments, or no target.
+        """
+        want_schema = False
+        field_name: str | None = None
+        file_targets: list[str] = []
+        tokens = iter(args)
+        for token in tokens:
+            if token == "--schema":
+                want_schema = True
+            elif token == "--field":
+                # The field name is whatever token follows, even if it looks
+                # like an option.
+                field_name = next(tokens, None)
+                if field_name is None:
+                    raise PIFSCommandError("stat --field requires a metadata field name")
+            elif token.startswith("-"):
+                raise PIFSCommandError(f"Unsupported stat option: {token}")
+            else:
+                file_targets.append(token)
+
+        if want_schema:
+            if field_name or file_targets:
+                raise PIFSCommandError("stat --schema cannot be combined with file targets or --field")
+            return self.filesystem._metadata_schema()
+
+        target_total = len(file_targets)
+        if field_name:
+            if not file_targets:
+                raise PIFSCommandError("stat --field requires at least one file target")
+            self._require_at_most(
+                target_total,
+                "stat --field target count",
+                self.MAX_STAT_FIELD_TARGETS,
+            )
+            self._validate_metadata_field_for_stat(field_name)
+            return {
+                "mode": "field_values",
+                "field": field_name,
+                "target_count": target_total,
+                "max_targets": self.MAX_STAT_FIELD_TARGETS,
+                "data": [self._stat_field_row(field_name, name) for name in file_targets],
+            }
+
+        if not file_targets:
+            raise PIFSCommandError("stat requires a file target or --schema")
+        self._require_at_most(
+            target_total,
+            "stat target count",
+            self.MAX_STAT_FIELD_TARGETS,
+        )
+        records = [{"target": name, **self.filesystem._stat(name)} for name in file_targets]
+        if target_total == 1:
+            return records[0]
+        return {"mode": "files", "target_count": target_total, "data": records}
+
+    def _cmd_head(self, args: list[str]) -> Any:
+        """Handle ``head``: return the first lines of a file's text artifact.
+
+        The line count defaults to 10 and is checked against
+        ``MAX_TEXT_LINES``. The result is the opened artifact's JSON form with
+        ``text`` and ``end_line`` overridden for the selected prefix.
+        """
+        requested, target = self._parse_standalone_head_tail(args, default_count=10)
+        keep = self._require_at_most(requested, "head line count", self.MAX_TEXT_LINES)
+        artifact = self.filesystem.cat_text_artifact(target, "all")
+        every_line = artifact.text.splitlines()
+        prefix_text = "\n".join(every_line[:keep])
+        payload = {**self._jsonable(artifact)}
+        payload["text"] = prefix_text
+        # Never report more lines than the file actually has.
+        payload["end_line"] = min(keep, len(every_line))
+        return payload
+
+    def _cmd_tail(self, args: list[str]) -> Any:
+        """Handle ``tail``: return the last lines of a file's text artifact.
+
+        The line count defaults to 10 and is checked against
+        ``MAX_TEXT_LINES``. The result is the opened artifact's JSON form with
+        ``text``, ``start_line`` and ``end_line`` overridden for the suffix.
+        """
+        requested, target = self._parse_standalone_head_tail(args, default_count=10)
+        keep = self._require_at_most(requested, "tail line count", self.MAX_TEXT_LINES)
+        artifact = self.filesystem.cat_text_artifact(target, "all")
+        every_line = artifact.text.splitlines()
+        # A slice of [-0:] would be the whole list, so zero is special-cased.
+        suffix = every_line[-keep:] if keep else []
+        line_total = len(every_line)
+        payload = {**self._jsonable(artifact)}
+        payload.update(
+            {
+                "text": "\n".join(suffix),
+                "start_line": max(1, line_total - len(suffix) + 1),
+                "end_line": line_total,
+            }
+        )
+        return payload
+
+    def _cmd_sed(self, args: list[str]) -> Any:
+        """Handle the single supported ``sed`` form: ``sed -n 'A,Bp' <target>``.
+
+        Validates the ``A,Bp`` expression, checks the span against
+        ``MAX_TEXT_LINES`` and returns the text artifact for lines A to B.
+        Arguments after the target are not inspected.
+
+        Raises:
+            PIFSCommandError: when the invocation does not match the supported
+                form or the line range is invalid.
+        """
+        usage = "sed supports only: sed -n '<start>,<end>p' <target>"
+        if len(args) < 3 or args[0] != "-n":
+            raise PIFSCommandError(usage)
+        parsed = re.fullmatch(r"(\d+),(\d+)p", args[1])
+        if parsed is None:
+            raise PIFSCommandError(usage)
+        first_line = int(parsed.group(1))
+        last_line = int(parsed.group(2))
+        if first_line < 1 or last_line < first_line:
+            raise PIFSCommandError("Invalid sed line range")
+        span = last_line - first_line + 1
+        self._require_at_most(span, "sed line count", self.MAX_TEXT_LINES)
+        line_range = f"{first_line}-{last_line}"
+        return self.filesystem.cat_text_artifact(args[2], line_range)
+
+    def _cmd_search_summary(self, args: list[str]) -> Any:
+        """Handle ``search-summary`` by delegating to the ``summary`` semantic channel."""
+        return self._cmd_semantic_channel(channel="summary", args=args)
+
+    def _cmd_search_entity(self, args: list[str]) -> Any:
+        """Handle ``search-entity`` by delegating to the ``entity`` semantic channel."""
+        return self._cmd_semantic_channel(channel="entity", args=args)
+
+    def _cmd_search_relation(self, args: list[str]) -> Any:
+        """Handle ``search-relation`` by delegating to the ``relation`` semantic channel."""
+        return self._cmd_semantic_channel(channel="relation", args=args)
+
+    def _cmd_semantic_grep(self, args: list[str]) -> Any:
+        """Handle ``semantic-grep``: recursive grep over semantic-channel candidates.
+
+        Accepts ``-R``/``-r``/``--recursive`` (required), combined short flags,
+        ``--where <filter>`` and ``--limit <n>``. The grep-style flags
+        ``-n``/``--line-number``/``-i``/``--ignore-case`` are accepted and
+        ignored. Positionals are the query and an optional folder path, which
+        defaults to ``/``.
+
+        Raises:
+            PIFSCommandError: for ``--where`` or ``--limit`` without a value,
+                an unsupported option, a missing ``-R``, no configured
+                semantic channels, a missing query, or a non-folder target.
+        """
+        recursive = False
+        where = None
+        limit = 10
+        positionals = []
+        i = 0
+        while i < len(args):
+            arg = args[i]
+            if arg in {"-R", "-r", "--recursive"}:
+                recursive = True
+            elif self._is_combined_grep_flag(arg):
+                recursive = recursive or "R" in arg or "r" in arg
+            elif arg in {"-n", "--line-number", "-i", "--ignore-case"}:
+                # Accepted for grep compatibility; they have no effect here.
+                pass
+            elif arg == "--where":
+                i += 1
+                # A trailing --where must yield a command error, not an
+                # IndexError from reading past the end of args.
+                if i >= len(args):
+                    raise PIFSCommandError("semantic-grep --where requires a metadata filter")
+                where = args[i]
+            elif arg == "--limit":
+                i += 1
+                if i >= len(args):
+                    raise PIFSCommandError("semantic-grep --limit requires a value")
+                limit = self._parse_bounded_int(
+                    args[i], "semantic-grep --limit", max_value=self.MAX_SEMANTIC_LIMIT
+                )
+            elif arg.startswith("-"):
+                raise PIFSCommandError(f"Unsupported semantic-grep option: {arg}")
+            else:
+                positionals.append(arg)
+            i += 1
+        if not recursive:
+            raise PIFSCommandError("semantic-grep requires -R/--recursive")
+        channels = self._semantic_grep_channels()
+        if not channels:
+            raise PIFSCommandError(
+                "semantic-grep is not available; entity/relation semantic indexes are not configured"
+            )
+        if not positionals:
+            raise PIFSCommandError("semantic-grep requires a query")
+        self._validate_search_positionals("semantic-grep", positionals)
+        query = positionals[0]
+        self._reject_regex_alternation_query(query, "semantic-grep")
+        path = positionals[1] if len(positionals) > 1 else "/"
+        if not self._is_folder(path):
+            raise PIFSCommandError("semantic-grep target must be a folder")
+        return self._semantic_recursive_grep(
+            self._normalize_folder_path(path),
+            query,
+            metadata_filter=where,
+            limit=limit,
+            channels=channels,
+        )
+
+    def _cmd_semantic_channel(self, channel: str, args: list[str]) -> Any:
+        """Run a ``search-<channel>`` command against one semantic channel.
+
+        Accepts ``--where <filter>`` and ``--limit <n>`` plus positionals: the
+        query and an optional folder path, which defaults to ``/``. The search
+        is always recursive under the normalized folder.
+
+        Returns:
+            A ``"files"`` envelope with the query, the normalized scope, a
+            ``retrieval`` label of ``"<channel>_vector"`` and the hits.
+
+        Raises:
+            PIFSCommandError: when the channel is not configured, ``--where``
+                or ``--limit`` has no value, an option is unsupported, or the
+                query is missing.
+        """
+        if not self.filesystem.has_semantic_channel(channel):
+            raise PIFSCommandError(
+                f"search-{channel} is not available; {channel} semantic index is not configured"
+            )
+        where = None
+        limit = 10
+        positionals = []
+        i = 0
+        while i < len(args):
+            arg = args[i]
+            if arg == "--where":
+                i += 1
+                # A trailing --where must yield a command error, not an
+                # IndexError from reading past the end of args.
+                if i >= len(args):
+                    raise PIFSCommandError(f"search-{channel} --where requires a metadata filter")
+                where = args[i]
+            elif arg == "--limit":
+                i += 1
+                if i >= len(args):
+                    raise PIFSCommandError(f"search-{channel} --limit requires a value")
+                limit = self._parse_bounded_int(
+                    args[i],
+                    f"search-{channel} --limit",
+                    max_value=self.MAX_SEMANTIC_LIMIT,
+                )
+            elif arg.startswith("-"):
+                raise PIFSCommandError(f"Unsupported search-{channel} option: {arg}")
+            else:
+                positionals.append(arg)
+            i += 1
+        if not positionals:
+            raise PIFSCommandError(f"search-{channel} requires a query")
+        self._validate_search_positionals(f"search-{channel}", positionals)
+        query = positionals[0]
+        self._reject_regex_alternation_query(query, f"search-{channel}")
+        path = positionals[1] if len(positionals) > 1 else "/"
+        normalized = self._normalize_folder_path(path)
+        results = self.filesystem.search_semantic_channel(
+            channel,
+            self._semantic_retrieval_query(query),
+            scope={"folder_path": normalized, "recursive": True},
+            metadata_filter=where,
+            limit=limit,
+        )
+        return {
+            "mode": "files",
+            "query": query,
+            "scope": normalized,
+            "retrieval": f"{channel}_vector",
+            "data": self._semantic_channel_hits_from_results(channel, results, query),
+        }
+
+    def _semantic_recursive_grep(
+        self,
+        folder_path: str,
+        query: str,
+        *,
+        metadata_filter: str | None,
+        limit: int,
+        channels: tuple[str, ...],
+    ) -> dict[str, Any]:
+        """Try each semantic channel in order and return the first with line matches.
+
+        For every channel, candidates are fetched with the stripped query and
+        then filtered to hits that contain a match. The first channel
+        producing hits wins; its name is reported as ``matched_channel``. When
+        no channel matches, ``matched_channel`` is ``""`` and ``data`` is
+        empty. ``candidate_debug`` records per-channel counts and the first
+        five candidate ids for every channel tried so far.
+        """
+        stripped_query = str(query or "").strip()
+        per_channel_debug: dict[str, Any] = {}
+        candidate_cap = self.SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT
+
+        def envelope(matched_channel: str, hits: list) -> dict[str, Any]:
+            # Both outcomes share one response shape.
+            return {
+                "mode": "files",
+                "query": query,
+                "scope": folder_path,
+                "retrieval": "semantic_grep_" + "_then_".join(channels),
+                "candidate_limit_per_channel": self.SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT,
+                "matched_channel": matched_channel,
+                "candidate_debug": per_channel_debug,
+                "data": hits,
+            }
+
+        for channel in channels:
+            candidates = self.filesystem.search_semantic_channel(
+                channel,
+                stripped_query,
+                scope={"folder_path": folder_path, "recursive": True},
+                metadata_filter=metadata_filter,
+                limit=candidate_cap,
+            )
+            hits = self._grep_file_hits_from_results(
+                candidates,
+                query,
+                require_match=True,
+                limit=limit,
+            )
+            sample_ids = [getattr(candidate, "external_id", None) for candidate in candidates[:5]]
+            per_channel_debug[channel] = {
+                "candidates": len(candidates),
+                "line_matches": len(hits),
+                "candidate_doc_ids": sample_ids,
+            }
+            if hits:
+                return envelope(channel, hits)
+        return envelope("", [])
+
+    def _semantic_grep_channels(self) -> tuple[str, ...]:
+        """Return the ``SEMANTIC_GREP_CHANNELS`` entries the filesystem reports as available.
+
+        The order of ``SEMANTIC_GREP_CHANNELS`` is preserved.
+        """
+        configured = set(self.filesystem.semantic_retrieval_channels())
+        usable: list[str] = []
+        for channel in SEMANTIC_GREP_CHANNELS:
+            if channel in configured:
+                usable.append(channel)
+        return tuple(usable)
+
+    def _bounded_text_artifact(self, target: str, location: str) -> dict[str, Any]:
+        """Read at most ``MAX_TEXT_LINES`` lines of a file and attach pagination info.
+
+        ``location`` is either ``all``/``full``/``*`` (read from line 1 up to
+        the cap) or a numeric range checked against the cap. When the file has
+        lines beyond what was returned, a footer naming the follow-up command
+        is appended to ``text`` and the ``pagination`` entry carries
+        ``next_range`` / ``next_command``.
+        """
+        line_cap = self.MAX_TEXT_LINES
+        if str(location).strip().lower() in {"all", "full", "*"}:
+            first_line, last_line = 1, line_cap
+        else:
+            first_line, last_line = self._parse_numeric_range(location, "cat --range")
+            self._require_at_most(
+                last_line - first_line + 1,
+                "cat --range line count",
+                self.MAX_TEXT_LINES,
+            )
+        opened = self.filesystem.cat_text_artifact(target, f"{first_line}-{last_line}")
+        data = self._jsonable(opened)
+        total_lines = len(self.filesystem.store.read_text(opened.file_ref).splitlines())
+        # Fall back to the requested end when the artifact reports no end_line.
+        shown_end = int(data.get("end_line") or last_line)
+        has_more = shown_end < total_lines
+
+        next_range = None
+        next_command = None
+        if has_more:
+            next_start = shown_end + 1
+            next_end = min(total_lines, next_start + self.MAX_TEXT_LINES - 1)
+            next_range = f"{next_start}-{next_end}"
+            next_command = f"cat {shlex.quote(target)} --range {shlex.quote(next_range)}"
+            footer = self._pagination_footer(
+                "cat --all",
+                f"showing lines {first_line}-{data.get('end_line')} of {total_lines}",
+                str(next_command),
+            )
+            body = str(data.get("text") or "").rstrip()
+            data["text"] = (body + "\n" + footer).strip()
+
+        data["pagination"] = {
+            "offset_line": first_line,
+            "limit": self.MAX_TEXT_LINES,
+            "returned_lines": max(0, shown_end - first_line + 1),
+            "total_lines": total_lines,
+            "has_more": has_more,
+            "next_range": next_range,
+            "next_command": next_command,
+        }
+        return data
+
+    def _bounded_node_result(
+        self,
+        data: dict[str, Any],
+        *,
+        target: str,
+        node_id: str,
+    ) -> dict[str, Any]:
+        """Cap a node's text by line and character limits, mutating ``data`` in place.
+
+        Non-dict input and results flagged ``available is False`` are returned
+        untouched. Otherwise a ``node_pagination`` entry is added; when the
+        text exceeds ``MAX_NODE_TEXT_LINES`` or ``MAX_NODE_TEXT_CHARS`` it is
+        cut and a footer pointing at ``cat <target> --structure`` is appended.
+        """
+        if not isinstance(data, dict) or data.get("available") is False:
+            return data
+
+        line_cap = self.MAX_NODE_TEXT_LINES
+        char_cap = self.MAX_NODE_TEXT_CHARS
+        full_text = str(data.get("text") or "")
+        full_lines = full_text.splitlines()
+        within_limits = len(full_lines) <= line_cap and len(full_text) <= char_cap
+        if within_limits:
+            data["node_pagination"] = {
+                "limit_nodes": self.MAX_NODE_IDS,
+                "text_truncated": False,
+            }
+            return data
+
+        # Apply the line cap first, then the character cap on what remains.
+        clipped = "\n".join(full_lines[:line_cap])
+        if len(clipped) > char_cap:
+            clipped = clipped[:char_cap].rstrip()
+        structure_hint = f"cat {shlex.quote(target)} --structure"
+        footer = self._pagination_footer(
+            "cat --node",
+            (
+                f"node text limited to {self.MAX_NODE_TEXT_LINES} lines/"
+                f"{self.MAX_NODE_TEXT_CHARS} chars"
+            ),
+            structure_hint,
+        )
+        data["text"] = (clipped.rstrip() + "\n" + footer).strip()
+        data["node_pagination"] = {
+            "limit_nodes": self.MAX_NODE_IDS,
+            "line_limit": self.MAX_NODE_TEXT_LINES,
+            "char_limit": self.MAX_NODE_TEXT_CHARS,
+            "original_lines": len(full_lines),
+            "original_chars": len(full_text),
+            "text_truncated": True,
+            "suggested_command": f"cat {shlex.quote(target)} --structure",
+            "node_id": node_id,
+        }
+        return data
+
+    def _attach_structure_next_command(self, data: dict[str, Any], target: str) -> None:
+        """Set ``next_command`` on ``data["structure_pagination"]`` in place.
+
+        Does nothing when that entry is not a dict. The command is a ready-made
+        ``cat ... --structure --offset N --limit M`` when more nodes remain and
+        a next offset is known; otherwise it is ``None``.
+        """
+        page_info = data.get("structure_pagination")
+        if not isinstance(page_info, dict):
+            return
+        follow_up = None
+        if page_info.get("has_more") and page_info.get("next_offset") is not None:
+            follow_up = (
+                f"cat {shlex.quote(target)} --structure "
+                f"--offset {page_info['next_offset']} --limit {page_info['limit']}"
+            )
+        page_info["next_command"] = follow_up
+
+    def _attach_page_next_command(
+        self,
+        data: dict[str, Any],
+        target: str,
+        *,
+        start: int,
+        end: int,
+    ) -> None:
+        """Attach a ``page_pagination`` entry to ``data`` in place.
+
+        A follow-up ``cat ... --page`` command for the next window is offered
+        only when the requested span is exactly ``MAX_PAGE_SPAN`` pages;
+        otherwise ``next_command`` is ``None``.
+        """
+        span_cap = self.MAX_PAGE_SPAN
+        requested_pages = end - start + 1
+        if requested_pages == span_cap:
+            window_start = end + 1
+            window_end = window_start + span_cap - 1
+            follow_up = f"cat {shlex.quote(target)} --page {window_start}-{window_end}"
+        else:
+            follow_up = None
+        data["page_pagination"] = {
+            "start": start,
+            "end": end,
+            "returned_pages": requested_pages,
+            "limit": self.MAX_PAGE_SPAN,
+            "next_command": follow_up,
+        }
+
+    @staticmethod
+    def _pagination_footer(command: str, reason: str, next_command: str) -> str:
+        """Build the one-line ``#`` footer appended to output that was cut short."""
+        limited_by = f"# output limited by {command}: {reason}. "
+        what_next = f"Next: {next_command}. If unsure, use cat <target> --structure."
+        return limited_by + what_next
+
+    @staticmethod
+    def _parse_node_ids(value: str) -> list[str]:
+        """Split a comma-separated node id list, trimming whitespace and dropping empties."""
+        node_ids: list[str] = []
+        for piece in value.split(","):
+            cleaned = piece.strip()
+            if cleaned:
+                node_ids.append(cleaned)
+        return node_ids
+
+    @staticmethod
+    def _reject_regex_alternation_query(query: str, command_name: str) -> None:
+        """Raise ``PIFSCommandError`` when the query contains a ``|`` character.
+
+        Queries without ``|`` pass through silently.
+        """
+        if "|" in str(query):
+            raise PIFSCommandError(
+                f"{command_name} does not support regex alternation '|'. "
+                "Run multiple grep commands or multiple search-summary commands "
+                "with one phrase each."
+            )
+
+    @staticmethod
+    def _validate_search_positionals(command_name: str, positionals: list[str]) -> None:
+        """Check that a search command got one query plus an optional ``/`` path.
+
+        Raises:
+            PIFSCommandError: when more than two positionals are given, or the
+                second one does not start with ``/``. Both cases usually mean
+                a multi-word query was left unquoted, so the message shows a
+                quoted example.
+        """
+        quoted_example = f'{command_name} "Federal Reserve" /documents'
+        given = len(positionals)
+        if given > 2:
+            raise PIFSCommandError(
+                f"{command_name} accepts one query and an optional folder path. "
+                f"Quote multi-word queries, for example: {quoted_example}"
+            )
+        if given == 2 and not positionals[1].startswith("/"):
+            raise PIFSCommandError(
+                f"{command_name} target must be a PIFS folder path like /documents. "
+                f"If your query has spaces, quote it, for example: {quoted_example}"
+            )
+
+    @staticmethod
+    def _parse_numeric_range(value: str, label: str) -> tuple[int, int]:
+        """Parse ``"N"`` or ``"N-M"`` into an inclusive 1-based ``(start, end)`` pair.
+
+        A single number yields ``(N, N)``.
+
+        Raises:
+            PIFSCommandError: when a part is not an integer, the start is
+                below 1, or the end is before the start.
+        """
+        try:
+            # Splitting once leaves any further "-" in the right-hand part,
+            # where int() rejects it.
+            bounds = [int(part) for part in value.split("-", 1)]
+        except ValueError as exc:
+            raise PIFSCommandError(f"{label} requires a numeric range") from exc
+        start, end = bounds[0], bounds[-1]
+        if start < 1 or end < start:
+            raise PIFSCommandError(f"Invalid {label} range: {value}")
+        return start, end
+
+    def _validate_metadata_field_for_stat(self, field: str) -> None:
+        """Raise ``PIFSCommandError`` unless ``field`` is in the metadata schema.
+
+        The error lists up to 20 known field names, sorted, as a hint.
+        """
+        known_fields = self.filesystem._metadata_schema().get("fields", {})
+        if field in known_fields:
+            return
+        preview = ", ".join(sorted(known_fields)[:20]) or "(none)"
+        raise PIFSCommandError(
+            f"Unknown metadata field: {field}. Use stat --schema to inspect fields. "
+            f"Available fields include: {preview}"
+        )
+
+    def _stat_field_row(self, field: str, target: str) -> dict[str, Any]:
+        """Build one ``stat --field`` row for ``target``.
+
+        The row is a copy of the file's stat record extended with the target,
+        its non-empty folder paths, the field name, whether the field is
+        present in the metadata, its value (``None`` when absent) and a
+        display path.
+        """
+        info = self.filesystem._stat(target)
+        metadata = info.get("metadata") or {}
+        has_field = field in metadata
+
+        row = dict(info)
+        row["target"] = target
+        row["folder_paths"] = [
+            folder.get("path", "")
+            for folder in info.get("folders", [])
+            if folder.get("path")
+        ]
+        # display_target is computed from the row before the field keys are
+        # merged in.
+        display_target = self._file_target_path(row)
+        row["field"] = field
+        row["present"] = has_field
+        row["value"] = metadata.get(field) if has_field else None
+        row["display_target"] = display_target
+        return row
+
+    def _render(self, data: Any, *, json_output: bool, command_name: str) -> str:
+        """Serialize a command result as a JSON envelope or as shell-style text."""
+        payload = self._jsonable(data)
+        if not json_output:
+            return self._render_shell(command_name, payload)
+        envelope = {"ok": True, "data": payload}
+        return json.dumps(envelope, ensure_ascii=False)
+
+    def _render_shell(self, command_name: str, data: Any) -> str:
+        """Render JSON-ready command output as shell-style text.
+
+        Commands with a dedicated renderer are dispatched to it.
+        ``head``/``tail``/``sed`` print only the ``text`` entry. Anything else
+        gets a generic ``key: value`` listing for dicts, one item per line for
+        lists, or ``str()``.
+        """
+        dedicated = {
+            "cat": self._render_cat,
+            "ls": self._render_listing,
+            "tree": self._render_tree,
+            "grep": self._render_grep,
+            "semantic-grep": self._render_grep,
+            "search-summary": self._render_semantic_search,
+            "search-entity": self._render_semantic_search,
+            "search-relation": self._render_semantic_search,
+            "find": self._render_find,
+            "stat": self._render_stat,
+        }
+        renderer = dedicated.get(command_name)
+        if renderer is not None:
+            return renderer(data)
+
+        is_mapping = isinstance(data, dict)
+        if command_name in {"head", "tail", "sed"}:
+            return str(data.get("text", "")) if is_mapping else str(data)
+        if is_mapping:
+            return "\n".join(f"{key}: {value}" for key, value in data.items())
+        if isinstance(data, list):
+            return "\n".join(str(item) for item in data)
+        return str(data)
+
+    def _render_cat(self, data: Any) -> str:
+        """Render ``cat`` output.
+
+        Unavailable structural content becomes a ``#`` comment line, structure
+        mode becomes indented JSON with the structure and its pagination, and
+        everything else prints the ``text`` entry.
+        """
+        if not isinstance(data, dict):
+            return str(data)
+        if data.get("available") is False:
+            message = data.get("message", "PageIndex structural content is unavailable")
+            return f"# {message}"
+        if data.get("mode") != "structure":
+            return str(data.get("text", ""))
+        outline = {
+            "structure": data.get("structure", []),
+            "pagination": data.get("structure_pagination", {}),
+        }
+        return json.dumps(outline, ensure_ascii=False, indent=2)
+
+    def _render_listing(self, data: Any) -> str:
+        """Render ``ls`` output: folders first, then at most ``MAX_LS_RENDER_FILES`` files.
+
+        A folder is shown by its path when that path is absolute, otherwise by
+        its name, always with a trailing ``/``. Files beyond the cap are
+        summarised in a single ``#`` line.
+        """
+        if not isinstance(data, dict):
+            return str(data)
+        rows: list[str] = []
+        for folder in data.get("folders", []):
+            has_absolute_path = folder.get("path", "").startswith("/")
+            label = folder["path"] if has_absolute_path else folder["name"]
+            if not label.endswith("/"):
+                label = f"{label}/"
+            rows.append(
+                f"{label} folders={folder.get('children_count', 0)} files={folder.get('file_count', 0)}"
+            )
+        files = data.get("files", [])
+        file_cap = self.MAX_LS_RENDER_FILES
+        rows.extend(self._file_row_text(entry) for entry in files[:file_cap])
+        hidden = len(files) - file_cap
+        if hidden > 0:
+            rows.append(
+                f"# ... {hidden} more files omitted from ls output; use grep/find to search this folder"
+            )
+        return "\n".join(rows)
+
+    def _render_tree(self, data: Any) -> str:
+        """Render ``tree`` output as an indented folder list under the root path.
+
+        Folders deeper than the requested depth (default 2) are left out, and
+        a trailing ``#`` line reports the truncation when any were dropped.
+        Indentation is two spaces per level below the first.
+        """
+        if not isinstance(data, dict):
+            return str(data)
+        root = self._normalize_folder_path(data.get("path", "/"))
+        depth_cap = int(data.get("depth", 2))
+        every_folder = data.get("folders", [])
+        rows = [root]
+        shown = 0
+        for folder in every_folder:
+            level = self._relative_depth(root, folder["path"])
+            if level > depth_cap:
+                continue
+            shown += 1
+            pad = "  " * max(level - 1, 0)
+            rows.append(
+                f"{pad}{folder['name']}/ folders={folder.get('children_count', 0)} "
+                f"files={folder.get('file_count', 0)}"
+            )
+        if shown < len(every_folder):
+            rows.append(f"# truncated at depth={depth_cap}")
+        return "\n".join(rows)
+
+    def _render_grep(self, data: Any) -> str:
+        """Render grep-style output according to the result's ``mode``.
+
+        ``folders`` lists matching folders with a narrowing hint, ``limited``
+        explains why a broad recursive grep was skipped and what to try
+        instead, ``files`` prints one row per file hit (or a no-match
+        comment), and ``matches`` prints ``path:line: text`` rows. Any other
+        mode falls back to ``str(data)``.
+        """
+        if not isinstance(data, dict):
+            return str(data)
+        mode = data.get("mode")
+
+        if mode == "folders":
+            rows = [f"# folder matches for: {data.get('query', '')}"]
+            for folder in data.get("data", []):
+                folder_path = folder["path"]
+                if not folder_path.endswith("/"):
+                    folder_path = f"{folder_path}/"
+                rows.append(
+                    f"{folder_path} matched_files={folder.get('matched_files', 0)} "
+                    f"files={folder.get('files', 0)}"
+                )
+            rows.append(f"# {data.get('hint', 'narrow into one directory, then run grep -R again')}")
+            return "\n".join(rows)
+
+        if mode == "limited":
+            scope = str(data.get("scope") or "/")
+            suggestions = list(data.get("suggested_commands") or [])
+            depth_limit = data.get("folder_depth_limit", self.GREP_RECURSIVE_FOLDER_DEPTH_LIMIT)
+            file_limit = data.get("file_count_limit", self.GREP_RECURSIVE_FOLDER_FILE_LIMIT)
+            rows = [
+                f"# grep -R skipped for broad folder: {scope}",
+                "# reason: recursive lexical grep is limited when a folder is deeper "
+                f"than {depth_limit} "
+                f"levels or has more than {file_limit} files",
+            ]
+            if not suggestions:
+                rows.append("# suggested: narrow with ls/tree/find --where")
+            else:
+                for command in suggestions:
+                    rows.append(f"# suggested: {command}")
+                rows.append("# also try: narrow with ls/tree/find --where")
+            if data.get("sample_deep_folder_path"):
+                rows.append(f"# deep descendant example: {data['sample_deep_folder_path']}/")
+            return "\n".join(rows)
+
+        if mode == "files":
+            hits = data.get("data", [])
+            if not hits:
+                return f"# no matches for: {data.get('query', '')}"
+            return "\n".join(self._grep_file_hit_text(hit) for hit in hits)
+
+        if mode == "matches":
+            rows = []
+            for hit in data.get("data", []):
+                location = f"{self._file_target_path(hit)}:{hit['line']}: "
+                rows.append(location + f"{self._compact_text(hit['text'], max_chars=220)}")
+            return "\n".join(rows)
+
+        return str(data)
+
+    def _render_semantic_search(self, data: Any) -> str:
+        """Render semantic search hits as blank-line-separated ``key: value`` blocks.
+
+        Results whose mode is not ``files`` are handed to the grep renderer.
+        Each hit prints ``path``, ``summary``, then ``entity`` / ``relation``
+        when those keys exist, then ``line_text``; empty path and line text
+        are shown as ``-``.
+        """
+        if not isinstance(data, dict):
+            return str(data)
+        if data.get("mode") != "files":
+            return self._render_grep(data)
+        hits = data.get("data", [])
+        if not hits:
+            return f"# no matches for: {data.get('query', '')}"
+        blocks: list[str] = []
+        for hit in hits:
+            block = [
+                f"path: {hit.get('path') or '-'}",
+                f"summary: {self._one_line_value(hit.get('summary') or '')}",
+            ]
+            for optional_key in ("entity", "relation"):
+                if optional_key in hit:
+                    block.append(f"{optional_key}: {self._one_line_value(hit.get(optional_key) or '')}")
+            snippet = self._one_line_value(hit.get("line_text") or "")
+            block.append(f"line_text: {snippet or '-'}")
+            blocks.append("\n".join(block))
+        return "\n\n".join(blocks).rstrip()
+
+    def _render_find(self, data: Any) -> str:
+        """Render ``find`` output as folder rows or file rows.
+
+        The list is treated as folders when its first item is a dict with a
+        ``path`` key and no ``file_ref`` key. Folder rows show matched-file
+        counts when ``matched_files`` is truthy and child-folder counts
+        otherwise. Everything else is rendered as file rows.
+        """
+        if not isinstance(data, list):
+            return str(data)
+        first = data[0] if data else None
+        folder_rows = isinstance(first, dict) and "path" in first and "file_ref" not in first
+        if not folder_rows:
+            return "\n".join(self._file_row_text(item) for item in data)
+        rows: list[str] = []
+        for item in data:
+            if item.get("matched_files"):
+                rows.append(
+                    f"{self._folder_row_path(item['path'])} matched_files={item['matched_files']} "
+                    f"files={item.get('file_count', 0)}"
+                )
+            else:
+                rows.append(
+                    f"{self._folder_row_path(item['path'])} folders={item.get('children_count', 0)} "
+                    f"files={item.get('file_count', 0)}"
+                )
+        return "\n".join(rows)
+
+    def _folder_row_path(self, path: str) -> str:
+        """Return the normalized folder path with a trailing ``/``; the root stays ``/``."""
+        folder = self._normalize_folder_path(path)
+        if folder == "/":
+            return "/"
+        return f"{folder}/"
+
+    def _render_stat(self, data: Any) -> str:
+        """Render a ``stat`` payload as human-readable text.
+
+        Handles four payload shapes:
+
+        * a dict with ``fields``: a metadata schema listing (``name: type``,
+          sorted by name, type defaulting to ``string``);
+        * ``mode == "field_values"``: a target header and a ``field: value``
+          line per item (``-`` when the value is ``None``);
+        * ``mode == "files"``: each item rendered recursively, separated by a
+          blank line;
+        * anything else: a single file record with optional folders,
+          metadata (capped at ``MAX_STAT_METADATA_FIELDS`` entries) and
+          metadata / projection status lines.
+
+        Non-dict payloads are stringified.
+        """
+        if not isinstance(data, dict):
+            return str(data)
+        if "fields" in data:
+            lines = ["metadata schema:"]
+            for name, field in sorted(data["fields"].items()):
+                lines.append(f"{name}: {field.get('type', 'string')}")
+            return "\n".join(lines)
+        if data.get("mode") == "field_values":
+            field = data.get("field", "")
+            lines = []
+            for item in data.get("data", []):
+                lines.append(f"{item.get('display_target') or item.get('target')}:")
+                value = item.get("value")
+                if value is None:
+                    lines.append(f"{field}: -")
+                else:
+                    lines.append(f"{field}: {self._one_line_value(value)}")
+            # NOTE(review): every line, including a target header and its own
+            # value line, ends up separated by a blank line - confirm intended.
+            return "\n\n".join(lines)
+        if data.get("mode") == "files":
+            return "\n\n".join(self._render_stat(item) for item in data.get("data", []))
+        lines = [
+            f"target: {data.get('target') or data.get('file_ref')}",
+            f"file_ref: {data.get('file_ref')}",
+            f"document_id: {data.get('external_id') or data.get('document_id') or '-'}",
+            f"source_path: {data.get('source_path') or '-'}",
+            f"storage_uri: {data.get('storage_uri') or '-'}",
+        ]
+        folders = data.get("folders") or []
+        if folders:
+            lines.append("folders:")
+            lines.extend(f"  {folder['path']}" for folder in folders)
+        metadata = data.get("metadata") or {}
+        if metadata:
+            lines.append("metadata:")
+            metadata_items = sorted(metadata.items())[: self.MAX_STAT_METADATA_FIELDS]
+            for key, value in metadata_items:
+                lines.append(f"  {key}: {self._compact_value(value)}")
+            if len(metadata) > self.MAX_STAT_METADATA_FIELDS:
+                lines.append(f"  ... {len(metadata) - self.MAX_STAT_METADATA_FIELDS} more fields")
+        metadata_status = data.get("metadata_status") or {}
+        if metadata_status:
+            lines.append(f"metadata_status: {metadata_status.get('status', '-')}")
+            pageindex_tree = metadata_status.get("pageindex_tree") or {}
+            if isinstance(pageindex_tree, dict) and pageindex_tree:
+                lines.append(f"pageindex_tree_status: {pageindex_tree.get('status', '-')}")
+                message = str(pageindex_tree.get("message") or "").strip()
+                error_type = str(pageindex_tree.get("error_type") or "").strip()
+                if error_type and message:
+                    lines.append(f"pageindex_tree_error: {error_type}: {message}")
+                elif message or error_type:
+                    lines.append(f"pageindex_tree_error: {message or error_type}")
+            # "projection_indexes" (or its "summary" entry) may be present but
+            # null; treat anything that is not a dict as absent instead of
+            # raising AttributeError on the chained .get().
+            projections = metadata_status.get("projection_indexes")
+            summary_projection = (
+                projections.get("summary") if isinstance(projections, dict) else None
+            )
+            if isinstance(summary_projection, dict) and summary_projection:
+                lines.append(
+                    f"summary_projection_status: {summary_projection.get('status', '-')}"
+                )
+        return "\n".join(lines)
+
+    def _file_row_text(self, item: dict[str, Any]) -> str:
+        """Format one file entry as a single listing row.
+
+        The row is ``<target> id=<doc id> file_ref=<ref> title=<title>
+        source=<source path>`` followed by ``folders=a,b`` when folder paths
+        are known (taken from the item, else looked up by ``file_ref``).
+        Missing values are shown as ``-`` and the title is capped at 80
+        characters.
+        """
+        ref = item.get("file_ref")
+        identifier = item.get("external_id") or item.get("document_id") or "-"
+        heading = self._compact_text(item.get("title") or item.get("name") or "", max_chars=80)
+        origin = item.get("source_path") or "-"
+        memberships = item.get("folder_paths") or self._folder_paths_for_file(ref)
+        location = self._file_target_path(item)
+        row = " ".join(
+            [
+                f"{location}",
+                f"id={identifier}",
+                f"file_ref={ref or '-'}",
+                f"title={heading}",
+                f"source={origin}",
+            ]
+        )
+        if memberships:
+            row += f" folders={','.join(memberships)}"
+        return row.strip()
+
+    def _grep_file_hit_text(self, item: dict[str, Any]) -> str:
+        """Format a grep hit as ``<target>:<line>: id=<doc id> <snippet>``.
+
+        The line number defaults to 1, the id to ``-``, and the snippet is
+        whitespace-collapsed and capped at 180 characters.
+        """
+        identifier = item.get("external_id") or "-"
+        line_no = item.get("line") or 1
+        location = self._file_target_path(item)
+        snippet = self._compact_text(item.get("text") or "", max_chars=180)
+        return f"{location}:{line_no}: id={identifier} {snippet}"
+
+    def _file_target_path(self, item: dict[str, Any]) -> str:
+        """Build the display path ``<first folder>/<title>`` for a file entry.
+
+        Folder candidates come from ``folder_paths``, then ``folder_path``,
+        then a lookup by ``file_ref``. When no folder or no title is
+        available, falls back to ``source_path``, ``external_id``, the file
+        ref, or ``-`` (first truthy value wins).
+        """
+        ref = item.get("file_ref")
+        label = str(item.get("title") or item.get("name") or "").strip()
+        locations = item.get("folder_paths") or []
+        single = item.get("folder_path")
+        if not locations and single:
+            locations = [single]
+        if not locations:
+            locations = self._folder_paths_for_file(ref)
+        if not (locations and label):
+            return str(item.get("source_path") or item.get("external_id") or ref or "-")
+        # Root ("/") collapses to "" after rstrip, which still yields "/<title>".
+        base = str(locations[0] or "/").rstrip("/")
+        return base + "/" + label
+
+    def _stable_file_target_path(self, item: dict[str, Any]) -> str:
+        """Pick a target string for a file, preferring its source path.
+
+        ``/<source_path>`` is returned when there is no file ref to check
+        against, or when the store resolves that path back to the same file
+        ref (a ``KeyError`` from the store counts as "does not resolve").
+        Otherwise the external id, then the file ref, then ``-`` is used.
+        """
+        ref = str(item.get("file_ref") or "").strip()
+        origin = str(item.get("source_path") or "").strip()
+        if origin:
+            candidate = "/" + origin.strip("/")
+            if not ref:
+                return candidate
+            try:
+                resolved = self.filesystem.store.resolve_file_ref(candidate)
+            except KeyError:
+                resolved = None
+            if resolved == ref:
+                return candidate
+        doc_id = str(item.get("external_id") or "").strip()
+        for choice in (doc_id, ref):
+            if choice:
+                return choice
+        return str(item.get("external_id") or item.get("file_ref") or "-")
+
+    def _semantic_retrieval_query(self, query: str) -> str:
+        """Combine ``self.query_context`` with the search phrase.
+
+        Without a context the stripped query is returned. With a context, the
+        phrase is appended on a ``Search phrase:`` line unless it is empty or
+        already contained in the context (case-insensitive), in which case
+        the context alone is returned.
+        """
+        phrase = str(query or "").strip()
+        background = str(self.query_context or "").strip()
+        if not background:
+            return phrase
+        if phrase and phrase.lower() not in background.lower():
+            return f"{background}\nSearch phrase: {phrase}"
+        return background
+
+    def _recursive_grep_limit_notice(self, folder_path: str, query: str) -> dict[str, Any] | None:
+        """Return a "limited" notice when a recursive grep scope is too broad.
+
+        Asks the store for subtree thresholds using the class depth and file
+        limits. Returns ``None`` when neither limit is exceeded; otherwise a
+        dict describing the limits, the sampled stats, suggested semantic
+        commands and a human-readable hint.
+        """
+        thresholds = self.filesystem.store.folder_subtree_thresholds(
+            folder_path,
+            depth_limit=self.GREP_RECURSIVE_FOLDER_DEPTH_LIMIT,
+            file_limit=self.GREP_RECURSIVE_FOLDER_FILE_LIMIT,
+        )
+        within_limits = not (
+            thresholds["folder_depth_exceeds_limit"]
+            or thresholds["file_count_exceeds_limit"]
+        )
+        if within_limits:
+            return None
+        alternatives = self._semantic_alternative_commands(query, folder_path)
+        semantic_hint = ""
+        if alternatives:
+            semantic_hint = "Use " + "; ".join(alternatives) + " to discover candidates. "
+        hint = (
+            "Default grep -R remains lexical and is intentionally limited for broad deep folders "
+            "because the SQLite FTS path cannot guarantee fast recursive search at this scope. "
+            f"{semantic_hint}Use ls/tree or find --where to narrow first."
+        )
+        notice: dict[str, Any] = {"mode": "limited", "query": query, "scope": folder_path}
+        notice["folder_depth_limit"] = thresholds["depth_limit"]
+        notice["file_count_limit"] = thresholds["file_limit"]
+        # These keys are copied through from the store's stats unchanged.
+        for key in (
+            "folder_depth_exceeds_limit",
+            "file_count_exceeds_limit",
+            "sampled_file_count",
+            "sample_deep_folder_path",
+        ):
+            notice[key] = thresholds[key]
+        notice["suggested_commands"] = alternatives
+        notice["hint"] = hint
+        return notice
+
+    def _semantic_alternative_commands(self, query: str, folder_path: str) -> list[str]:
+        """List semantic commands that could replace a lexical grep.
+
+        ``semantic-grep -R`` comes first when ``_semantic_grep_channels()`` is
+        truthy, followed by one command per channel in
+        ``SEMANTIC_RETRIEVAL_CHANNELS`` that the filesystem reports as
+        available. Query and folder are shell-quoted.
+        """
+        safe_query = shlex.quote(query)
+        safe_folder = shlex.quote(folder_path)
+        suggestions = []
+        if self._semantic_grep_channels():
+            suggestions.append(f"semantic-grep -R {safe_query} {safe_folder}")
+        suggestions.extend(
+            f"{self.SEMANTIC_CHANNEL_COMMANDS[channel]} {safe_query} {safe_folder}"
+            for channel in SEMANTIC_RETRIEVAL_CHANNELS
+            if self.filesystem.has_semantic_channel(channel)
+        )
+        return suggestions
+
+    def _rank_child_folders(
+        self,
+        *,
+        query: str,
+        children: list[dict[str, Any]],
+        metadata_filter: str | None,
+        limit: int,
+    ) -> list[dict[str, Any]]:
+        """Rank child folders by how many files match a non-semantic search.
+
+        Each child is searched recursively (asking for at least 50 results);
+        children without results are dropped. The remainder is ordered by
+        descending match count, then path, and cut to ``limit`` rows.
+        """
+        scored: list[dict[str, Any]] = []
+        for folder in children:
+            folder_path = folder["path"]
+            matches = self.filesystem.search(
+                query=query,
+                scope={"folder_path": folder_path, "recursive": True},
+                metadata_filter=metadata_filter,
+                limit=max(limit, 50),
+                semantic=False,
+            )
+            if matches:
+                scored.append(
+                    {
+                        "path": folder_path,
+                        "name": folder["name"],
+                        "matched_files": len(matches),
+                        "files": self.filesystem.store.count_files_in_folder(folder_path, recursive=True),
+                        "children_count": folder.get("children_count", 0),
+                    }
+                )
+        ordered = sorted(scored, key=lambda row: (-row["matched_files"], row["path"]))
+        return ordered[:limit]
+
+    def _grep_file_hits_from_results(
+        self,
+        results: list[Any],
+        query: str,
+        *,
+        require_match: bool = False,
+        limit: int | None = None,
+    ) -> list[dict[str, Any]]:
+        """Convert search results into grep-style hit dicts.
+
+        For each result the first line matching ``query`` is located; when
+        none is found the result's snippet is used as text, or the result is
+        skipped if ``require_match`` is set. Collection stops once ``limit``
+        hits have been gathered (checked after each append).
+        """
+        collected: list[dict[str, Any]] = []
+        for entry in results:
+            line_no, matched_text = self._first_matching_line(entry.file_ref, query)
+            if require_match and not matched_text:
+                continue
+            hit = {
+                "file_ref": entry.file_ref,
+                "external_id": entry.external_id,
+                "title": entry.title,
+                "source_path": entry.source_path,
+                "folder_paths": entry.folder_paths,
+                "line": line_no,
+                "text": matched_text or entry.snippet,
+            }
+            collected.append(hit)
+            if limit is not None and len(collected) >= limit:
+                break
+        return collected
+
+    def _semantic_channel_hits_from_results(
+        self,
+        channel: str,
+        results: list[Any],
+        query: str,
+    ) -> list[dict[str, Any]]:
+        """Convert semantic channel results into renderable hit dicts.
+
+        Every hit carries ``path``, ``summary`` (from the result metadata) and
+        ``line_text`` (``"<line>: <text>"`` for the first line matching the
+        query, else empty). For the ``entity`` and ``relation`` channels the
+        matching metadata value is added under the channel name.
+        """
+        collected: list[dict[str, Any]] = []
+        for entry in results:
+            meta = entry.metadata or {}
+            line_no, matched_text = self._first_matching_line(entry.file_ref, query)
+            if matched_text:
+                line_text = f"{line_no}: {self._compact_text(matched_text, max_chars=220)}"
+            else:
+                line_text = ""
+            descriptor = {
+                "file_ref": entry.file_ref,
+                "title": entry.title,
+                "folder_paths": entry.folder_paths,
+                "source_path": entry.source_path,
+                "external_id": entry.external_id,
+            }
+            record = {
+                "path": self._stable_file_target_path(descriptor),
+                "summary": meta.get("summary") or "",
+                "line_text": line_text,
+            }
+            if channel in {"entity", "relation"}:
+                record[channel] = meta.get(channel) or ""
+            collected.append(record)
+        return collected
+
+    def _rank_child_folders_from_source(
+        self,
+        *,
+        query: str,
+        parent_path: str,
+        children: list[dict[str, Any]],
+        limit: int,
+    ) -> list[dict[str, Any]]:
+        """Rank child folders by matching source files found on disk.
+
+        Candidate files under the parent's source directory (up to 5000) are
+        attributed to the child folder that contains them. Children are
+        ordered by descending match count, then path, and cut to ``limit``.
+        Returns an empty list when the source location cannot be determined.
+        """
+        source_dir = self._source_dir_for_folder(parent_path)
+        source_root = self._source_root()
+        if source_dir is None or source_root is None:
+            return []
+        by_path = {child["path"]: child for child in children}
+        tally: dict[str, int] = {}
+        for candidate in self._rg_candidate_files(query, source_dir, max_files=5000):
+            relative = self._source_path_from_storage(candidate, source_root)
+            containing = "/" + str(Path(relative).parent).strip("/")
+            owner = self._matching_child_path(parent_path, containing, by_path)
+            if not owner:
+                continue
+            tally[owner] = tally.get(owner, 0) + 1
+        rows: list[dict[str, Any]] = []
+        for owner, hit_count in tally.items():
+            info = by_path[owner]
+            rows.append(
+                {
+                    "path": owner,
+                    "name": info["name"],
+                    "matched_files": hit_count,
+                    "files": self.filesystem.store.count_files_in_folder(owner, recursive=True),
+                    "children_count": info.get("children_count", 0),
+                }
+            )
+        rows.sort(key=lambda row: (-row["matched_files"], row["path"]))
+        return rows[:limit]
+
+    def _grep_source_file_hits(
+        self,
+        folder_path: str,
+        query: str,
+        *,
+        limit: int,
+        direct_only: bool = False,
+    ) -> list[dict[str, Any]]:
+        """Grep source files on disk and return hits for registered files.
+
+        Candidates (at most ``max(limit * 10, 50)``) that have no row in the
+        ``files`` table, that lie outside ``folder_path`` when
+        ``direct_only`` is set, or that contain no matching line are skipped.
+        Stops after ``limit`` hits. Returns an empty list when the source
+        location cannot be determined.
+        """
+        source_dir = self._source_dir_for_folder(folder_path)
+        source_root = self._source_root()
+        if source_dir is None or source_root is None:
+            return []
+        collected: list[dict[str, Any]] = []
+        candidates = self._rg_candidate_files(query, source_dir, max_files=max(limit * 10, 50))
+        for candidate in candidates:
+            record = self._file_row_for_storage(candidate)
+            if not record:
+                continue
+            if direct_only:
+                if self._folder_path_for_source_path(record["source_path"]) != folder_path:
+                    continue
+            line_no, matched_text = self._first_matching_source_line(candidate, query)
+            if line_no is None:
+                continue
+            collected.append(
+                {
+                    "file_ref": record["file_ref"],
+                    "external_id": record["external_id"],
+                    "title": record["title"],
+                    "source_path": record["source_path"],
+                    "folder_paths": self._folder_paths_for_file(record["file_ref"]),
+                    "line": line_no,
+                    "text": matched_text or record["title"],
+                }
+            )
+            if len(collected) >= limit:
+                break
+        return collected
+
+    def _grep_file_matches(self, target: str, query: str, *, limit: int) -> list[dict[str, Any]]:
+        """Return up to ``limit`` matching lines of a single file.
+
+        The target is resolved to a file ref, the file text is scanned line by
+        line (1-based numbering), and each matching line becomes a hit dict
+        with the text capped at 220 characters.
+        """
+        ref = self.filesystem._resolve_target(target)
+        record = self.filesystem.store.get_file(ref)
+        content = self.filesystem.store.read_text(ref)
+        found: list[dict[str, Any]] = []
+        for number, row in enumerate(content.splitlines(), start=1):
+            if not self._line_matches(row, query):
+                continue
+            found.append(
+                {
+                    "file_ref": ref,
+                    "external_id": record.external_id,
+                    "title": record.title,
+                    "source_path": record.source_path,
+                    "folder_paths": self._folder_paths_for_file(ref),
+                    "line": number,
+                    "text": self._compact_text(row, max_chars=220),
+                }
+            )
+            if len(found) >= limit:
+                break
+        return found
+
+    def _first_matching_line(self, file_ref: str, query: str) -> tuple[int, str]:
+        """Return ``(line number, compacted text)`` of the first matching line.
+
+        Line numbers are 1-based. When nothing matches, ``(1, "")`` is
+        returned, so callers must check the text rather than the number.
+        """
+        content = self.filesystem.store.read_text(file_ref)
+        for number, row in enumerate(content.splitlines(), start=1):
+            if not self._line_matches(row, query):
+                continue
+            return number, self._compact_text(row, max_chars=220)
+        return 1, ""
+
+    def _line_matches(self, line: str, query: str) -> bool:
+        """Case-insensitively test whether ``line`` matches ``query``.
+
+        A line matches when it contains the whole stripped query as a
+        substring, or when it contains every word (``[A-Za-z0-9_]+``) of the
+        query in any order. A query without any word never matches through
+        the second rule.
+        """
+        lowered_line = line.lower()
+        phrase = query.lower().strip()
+        if phrase and phrase in lowered_line:
+            return True
+        words = re.findall(r"[A-Za-z0-9_]+", phrase)
+        if not words:
+            return False
+        return all(word in lowered_line for word in words)
+
+    @staticmethod
+    def _is_combined_grep_flag(arg: str) -> bool:
+        """Return True for bundled short grep flags such as ``-Rn`` or ``-rni``.
+
+        Only the letters ``R``, ``r``, ``n`` and ``i`` are accepted, and at
+        least two of them must follow the dash.
+        """
+        return re.fullmatch(r"-[Rrni]{2,}", arg) is not None
+
+    def _rg_candidate_files(self, query: str, directory: Path, *, max_files: int) -> list[Path]:
+        """Find ``*.json`` files under ``directory`` containing every query term.
+
+        Terms are the lower-cased runs of three or more word characters in the
+        query. ``rg`` is invoked (fixed-string, case-insensitive, 20 s
+        timeout) for the longest term only; the listed files are then read and
+        kept when they contain all terms. At most ``max_files`` paths are
+        returned. A missing directory, a query without usable terms, a failure
+        to launch ``rg`` or a timeout all yield an empty list.
+        """
+        if not directory.exists():
+            return []
+        terms = [word.lower() for word in re.findall(r"[A-Za-z0-9_]{3,}", query)]
+        if not terms:
+            return []
+        primary = max(terms, key=len)
+        argv = [
+            "rg",
+            "-l",
+            "-i",
+            "-F",
+            primary,
+            str(directory),
+            "--glob",
+            "*.json",
+            "--no-messages",
+        ]
+        try:
+            completed = subprocess.run(
+                argv,
+                check=False,
+                capture_output=True,
+                text=True,
+                timeout=20,
+            )
+        except (OSError, subprocess.TimeoutExpired):
+            return []
+        listed = [Path(row) for row in completed.stdout.splitlines() if row.strip()]
+        # Cap how many rg hits are opened and re-checked against all terms.
+        inspect_cap = max(max_files * 20, max_files)
+        accepted: list[Path] = []
+        for candidate in listed[:inspect_cap]:
+            try:
+                body = candidate.read_text(encoding="utf-8", errors="ignore").lower()
+            except OSError:
+                continue
+            if not all(term in body for term in terms):
+                continue
+            accepted.append(candidate)
+            if len(accepted) >= max_files:
+                break
+        return accepted
+
+    def _first_matching_source_line(self, path: Path, query: str) -> tuple[int | None, str]:
+        """Return the first matching line of a file on disk.
+
+        Yields ``(1-based line number, compacted text)``, or ``(None, "")``
+        when the file cannot be read or no line matches.
+        """
+        try:
+            content = path.read_text(encoding="utf-8", errors="ignore")
+        except OSError:
+            return None, ""
+        for number, row in enumerate(content.splitlines(), start=1):
+            if not self._line_matches(row, query):
+                continue
+            return number, self._compact_text(row, max_chars=220)
+        return None, ""
+
+    def _source_root(self) -> Path | None:
+        """Derive the on-disk root directory of the registered sources.
+
+        Takes one non-deleted row from ``files`` and walks up from its
+        ``storage_uri`` once per component of its ``source_path``. Returns
+        ``None`` when the table has no such row.
+        """
+        with self.filesystem.store.connect() as conn:
+            record = conn.execute(
+                """
+                SELECT storage_uri, source_path
+                FROM files
+                WHERE deleted_at IS NULL
+                LIMIT 1
+                """
+            ).fetchone()
+        if record is None:
+            return None
+        location = Path(record["storage_uri"])
+        levels = len(Path(record["source_path"]).parts)
+        while levels > 0:
+            location = location.parent
+            levels -= 1
+        return location
+
+    def _source_dir_for_folder(self, folder_path: str) -> Path | None:
+        """Map a folder path onto a directory below the source root.
+
+        Returns the root itself for ``/`` (or an empty path) and ``None`` when
+        the source root is unknown.
+        """
+        root = self._source_root()
+        if root is None:
+            return None
+        relative = folder_path.strip("/")
+        if not relative:
+            return root
+        return root / relative
+
+    @staticmethod
+    def _source_path_from_storage(path: Path, source_root: Path) -> str:
+        """Return ``path`` relative to ``source_root`` in POSIX form.
+
+        Falls back to the bare file name when ``path`` is not located under
+        ``source_root``.
+        """
+        try:
+            relative = path.relative_to(source_root)
+        except ValueError:
+            return path.name
+        return relative.as_posix()
+
+    @staticmethod
+    def _matching_child_path(
+        parent_path: str,
+        folder_path: str,
+        child_paths: dict[str, dict[str, Any]],
+    ) -> str | None:
+        """Find the direct child of ``parent_path`` that contains ``folder_path``.
+
+        The candidate is the parent plus the first path segment of
+        ``folder_path`` below it. It is returned only when it is a key of
+        ``child_paths``; ``None`` is returned otherwise, including when
+        ``folder_path`` does not lie below the parent.
+        """
+        parent = parent_path.rstrip("/") or "/"
+        if parent == "/":
+            segments = [piece for piece in folder_path.strip("/").split("/") if piece]
+            candidate = f"/{segments[0]}" if segments else "/"
+        else:
+            prefix = parent + "/"
+            if not folder_path.startswith(prefix):
+                return None
+            head, _, _ = folder_path[len(prefix):].partition("/")
+            candidate = prefix + head
+        if candidate in child_paths:
+            return candidate
+        return None
+
+    def _file_row_for_storage(self, path: Path) -> dict[str, Any] | None:
+        """Look up the non-deleted ``files`` row stored at ``path``.
+
+        Returns a dict with ``file_ref``, ``external_id``, ``title`` and
+        ``source_path``, or ``None`` when no row has that ``storage_uri``.
+        """
+        columns = ("file_ref", "external_id", "title", "source_path")
+        with self.filesystem.store.connect() as conn:
+            record = conn.execute(
+                """
+                SELECT file_ref, external_id, title, source_path
+                FROM files
+                WHERE storage_uri = ? AND deleted_at IS NULL
+                LIMIT 1
+                """,
+                (str(path),),
+            ).fetchone()
+        if record is None:
+            return None
+        return {column: record[column] for column in columns}
+
+    @staticmethod
+    def _folder_path_for_source_path(source_path: str) -> str:
+        """Return the absolute folder path (e.g. ``/a/b``) containing ``source_path``.
+
+        A bare file name (no directory part) maps to the root folder ``/``.
+        Folder names that start or end with a dot (``.hidden``, ``v1.``) are
+        preserved; only the literal ``.`` parent that ``Path`` reports for a
+        bare file name is treated as root.
+        """
+        parent = Path(source_path).parent.as_posix()
+        if parent in ("", "."):
+            return "/"
+        return "/" + parent.strip("/")
+
+    def _folder_paths_for_file(self, file_ref: str | None) -> list[str]:
+        """Return the paths of all folders the file belongs to.
+
+        An empty list is returned for a falsy ``file_ref`` and whenever the
+        lookup raises ``KeyError``.
+        """
+        if not file_ref:
+            return []
+        try:
+            memberships = self.filesystem.store.folder_memberships(file_ref)
+            return [membership["path"] for membership in memberships]
+        except KeyError:
+            return []
+
+    def _is_folder(self, path: str) -> bool:
+        """Report whether ``path`` can be browsed as a folder.
+
+        A ``KeyError`` from ``filesystem.browse`` means "not a folder".
+        """
+        try:
+            self.filesystem.browse(path, recursive=False, limit=1)
+        except KeyError:
+            return False
+        else:
+            return True
+
+    @staticmethod
+    def _normalize_folder_path(path: str) -> str:
+        """Normalize a folder path to ``/a/b`` form.
+
+        Falsy or blank input and ``/`` map to ``/``; otherwise surrounding
+        whitespace and slashes are removed and a single leading slash added.
+        """
+        text = str(path or "/").strip()
+        if text in ("", "/"):
+            return "/"
+        return "/" + text.strip("/")
+
+    @classmethod
+    def _relative_depth(cls, root: str, path: str) -> int:
+        """Count the path segments of ``path`` below ``root``.
+
+        Both arguments are normalized first. The root itself has depth 0. For
+        a non-root ``root`` the first ``len(root)`` characters of ``path`` are
+        dropped without checking that ``path`` actually starts with ``root``.
+        """
+        base = cls._normalize_folder_path(root).rstrip("/") or "/"
+        target = cls._normalize_folder_path(path).rstrip("/")
+        remainder = target if base == "/" else target[len(base):]
+        remainder = remainder.strip("/")
+        if not remainder:
+            return 0
+        return len(remainder.split("/"))
+
+    @classmethod
+    def _compact_value(cls, value: Any) -> str:
+        """Render a metadata value as a short single-line string.
+
+        Lists show their first three items (40 characters each) followed by a
+        ``... N more`` marker; dicts are dumped as key-sorted JSON and other
+        values stringified, both capped at 120 characters.
+        """
+        if isinstance(value, list):
+            preview = [cls._compact_text(str(item), max_chars=40) for item in value[:3]]
+            text = ", ".join(preview)
+            hidden = len(value) - 3
+            if hidden > 0:
+                text += f", ... {hidden} more"
+            return text
+        if isinstance(value, dict):
+            encoded = json.dumps(value, ensure_ascii=False, sort_keys=True)
+        else:
+            encoded = str(value)
+        return cls._compact_text(encoded, max_chars=120)
+
+    @staticmethod
+    def _one_line_value(value: Any) -> str:
+        """Render ``value`` on a single line with whitespace collapsed.
+
+        Dicts and lists are dumped as key-sorted JSON. ``None`` becomes an
+        empty string. Falsy scalars such as ``0`` and ``False`` keep their
+        textual form rather than being blanked out.
+        """
+        if value is None:
+            return ""
+        if isinstance(value, (dict, list)):
+            value = json.dumps(value, ensure_ascii=False, sort_keys=True)
+        return re.sub(r"\s+", " ", str(value)).strip()
+
+    @staticmethod
+    def _compact_text(text: str, *, max_chars: int) -> str:
+        """Collapse whitespace in ``text`` and cap it at ``max_chars`` characters.
+
+        Text that is too long is cut and terminated with ``...``. When
+        ``max_chars`` is below 3 there is no room for the ellipsis, so the
+        text is simply cut to ``max_chars`` (never below zero) characters.
+        """
+        collapsed = re.sub(r"\s+", " ", text or "").strip()
+        if len(collapsed) <= max_chars:
+            return collapsed
+        if max_chars < 3:
+            # A negative slice bound here would return MORE than max_chars.
+            return collapsed[: max(max_chars, 0)]
+        return collapsed[: max_chars - 3].rstrip() + "..."
+
+    @staticmethod
+    def _clean_error_message(exc: BaseException) -> str:
+        """Return a readable message for ``exc``.
+
+        A single-argument ``KeyError`` yields that argument as text (so the
+        quotes ``str(KeyError)`` adds are avoided). An empty message falls
+        back to the exception class name.
+        """
+        if isinstance(exc, KeyError) and len(exc.args) == 1:
+            text = str(exc.args[0])
+        else:
+            text = str(exc)
+        return text if text else exc.__class__.__name__
+
+    @classmethod
+    def _jsonable(cls, value: Any) -> Any:
+        """Recursively convert dataclass instances into plain dicts.
+
+        Lists and dict values are walked recursively; dataclass instances are
+        converted with ``asdict``; everything else is returned unchanged.
+        """
+        # is_dataclass() is also true for dataclass *types*, but asdict()
+        # only accepts instances, so classes are passed through untouched.
+        if is_dataclass(value) and not isinstance(value, type):
+            return asdict(value)
+        if isinstance(value, list):
+            return [cls._jsonable(item) for item in value]
+        if isinstance(value, dict):
+            return {key: cls._jsonable(item) for key, item in value.items()}
+        return value
+
+    @classmethod
+    def _validate_raw_command(cls, command: str) -> None:
+        """Reject a raw command that contains any forbidden substring.
+
+        Raises:
+            PIFSCommandError: if an entry of ``FORBIDDEN_SUBSTRINGS`` occurs
+                anywhere in ``command``.
+        """
+        for fragment in cls.FORBIDDEN_SUBSTRINGS:
+            if fragment in command:
+                raise PIFSCommandError("Only PageIndex FileSystem commands are allowed")
+
+    @classmethod
+    def _validate_tokens(cls, tokens: list[str]) -> None:
+        """Reject a token list that contains any forbidden token.
+
+        Raises:
+            PIFSCommandError: if a token is a member of ``FORBIDDEN_TOKENS``.
+        """
+        for token in tokens:
+            if token in cls.FORBIDDEN_TOKENS:
+                raise PIFSCommandError("Only PageIndex FileSystem commands are allowed")
+
+    @classmethod
+    def _split_chained_commands(cls, command: str) -> list[str]:
+        """Split ``command`` on unquoted ``&&``; a lone unquoted ``&`` is rejected."""
+        return cls._split_unquoted_operator(
+            command,
+            operator="&&",
+            reject_single_amp=True,
+        )
+
+    @classmethod
+    def _split_piped_commands(cls, command: str) -> list[str]:
+        """Split ``command`` into pipeline stages on unquoted ``|``."""
+        return cls._split_unquoted_operator(command, operator="|")
+
+    @classmethod
+    def _split_unquoted_operator(
+        cls,
+        command: str,
+        operator: str,
+        *,
+        reject_single_amp: bool = False,
+    ) -> list[str]:
+        """Split ``command`` on ``operator`` wherever it appears outside quotes.
+
+        The raw command is validated first. Single and double quotes protect
+        their content, and a backslash (outside single quotes) protects the
+        character after it. Quotes and backslashes are kept in the returned
+        parts, which are stripped of surrounding whitespace.
+
+        Raises:
+            PIFSCommandError: on an empty part, an unterminated quote, or -
+                when ``reject_single_amp`` is set - an unquoted ``&`` that is
+                not part of ``operator``.
+        """
+        cls._validate_raw_command(command)
+        segments: list[str] = []
+        buffer: list[str] = []
+        active_quote: str | None = None
+        pending_escape = False
+        position = 0
+        while position < len(command):
+            char = command[position]
+            step = 1
+            if pending_escape:
+                # Character following a backslash is taken literally.
+                buffer.append(char)
+                pending_escape = False
+            elif char == "\\" and active_quote != "'":
+                buffer.append(char)
+                pending_escape = True
+            elif active_quote:
+                buffer.append(char)
+                if char == active_quote:
+                    active_quote = None
+            elif char in {"'", '"'}:
+                active_quote = char
+                buffer.append(char)
+            elif command.startswith(operator, position):
+                segment = "".join(buffer).strip()
+                if not segment:
+                    raise PIFSCommandError("Invalid command syntax")
+                segments.append(segment)
+                buffer = []
+                step = len(operator)
+            elif reject_single_amp and char == "&":
+                raise PIFSCommandError("Only PageIndex FileSystem commands are allowed")
+            else:
+                buffer.append(char)
+            position += step
+        tail = "".join(buffer).strip()
+        if active_quote:
+            raise PIFSCommandError("Invalid command syntax: No closing quotation")
+        if not tail:
+            raise PIFSCommandError("Invalid command syntax")
+        segments.append(tail)
+        return segments
+
+    def _pipe_head_tail(self, input_text: str, args: list[str], *, from_tail: bool) -> str:
+        """Apply a piped ``head`` or ``tail`` to ``input_text``.
+
+        The requested count is parsed from ``args`` and must not exceed
+        ``MAX_TEXT_LINES``. Input that parses as JSON is sliced structurally
+        and re-serialized; anything else is sliced line by line.
+        """
+        requested = self._require_at_most(
+            self._parse_head_tail_count(args),
+            "pipe head/tail line count",
+            self.MAX_TEXT_LINES,
+        )
+        parsed = self._try_json_loads(input_text)
+        if parsed is not None:
+            sliced = self._slice_payload(parsed, requested, from_tail=from_tail)
+            return self._render_json_payload(sliced)
+        rows = input_text.splitlines()
+        if requested == 0:
+            # rows[-0:] would be the whole list, so zero is handled explicitly.
+            kept: list[str] = []
+        elif from_tail:
+            kept = rows[-requested:]
+        else:
+            kept = rows[:requested]
+        return "\n".join(kept)
+
+    def _pipe_grep(self, input_text: str, args: list[str]) -> str:
+        """Apply a piped ``grep`` to ``input_text``.
+
+        Supported options are ``-i``/``--ignore-case``, ``-v``/``--invert-match``
+        and ``-E``/``--extended-regexp``; exactly one pattern is required.
+        Input that parses as JSON is filtered structurally and re-serialized;
+        anything else is filtered line by line.
+
+        Raises:
+            PIFSCommandError: on an unsupported option or when the number of
+                patterns is not exactly one.
+        """
+        option_flags = {
+            "-i": "ignore_case",
+            "--ignore-case": "ignore_case",
+            "-v": "invert",
+            "--invert-match": "invert",
+            "-E": "regex",
+            "--extended-regexp": "regex",
+        }
+        enabled = {"ignore_case": False, "invert": False, "regex": False}
+        positional: list[str] = []
+        for token in args:
+            flag = option_flags.get(token)
+            if flag is not None:
+                enabled[flag] = True
+            elif token.startswith("-"):
+                raise PIFSCommandError(f"Unsupported pipe grep option: {token}")
+            else:
+                positional.append(token)
+        if len(positional) != 1:
+            raise PIFSCommandError("pipe grep requires exactly one pattern")
+        (pattern,) = positional
+        self._reject_regex_alternation_query(pattern, "pipe grep")
+        parsed = self._try_json_loads(input_text)
+        if parsed is not None:
+            return self._render_json_payload(self._filter_payload(parsed, pattern, **enabled))
+        kept = []
+        for row in input_text.splitlines():
+            if self._text_matches(row, pattern, **enabled):
+                kept.append(row)
+        return "\n".join(kept)
+
+    def _pipe_sed(self, input_text: str, args: list[str]) -> str:
+        """Apply a piped ``sed -n '<start>,<end>p'`` line-range print.
+
+        A leading ``-n`` is optional and ``<end>`` defaults to ``<start>``.
+        Line numbers are 1-based and inclusive, and the range may span at most
+        ``MAX_TEXT_LINES`` lines. Input that parses as JSON is sliced
+        structurally and re-serialized; anything else is sliced line by line.
+
+        Raises:
+            PIFSCommandError: on a missing or unsupported expression or an
+                invalid range.
+        """
+        if not args:
+            raise PIFSCommandError("pipe sed requires an expression")
+        expression_args = args[1:] if args[0] == "-n" else args
+        if len(expression_args) != 1:
+            raise PIFSCommandError("pipe sed supports only -n '<start>,<end>p'")
+        parsed_range = re.fullmatch(r"(\d+)(?:,(\d+))?p", expression_args[0])
+        if parsed_range is None:
+            raise PIFSCommandError("pipe sed supports only -n '<start>,<end>p'")
+        first_text, last_text = parsed_range.groups()
+        first = int(first_text)
+        last = int(last_text or first_text)
+        if first < 1 or last < first:
+            raise PIFSCommandError("Invalid sed line range")
+        self._require_at_most(last - first + 1, "pipe sed line count", self.MAX_TEXT_LINES)
+        payload = self._try_json_loads(input_text)
+        if payload is not None:
+            return self._render_json_payload(self._slice_text_payload(payload, first, last))
+        rows = input_text.splitlines()
+        return "\n".join(rows[first - 1 : last])
+
+    @staticmethod
+    def _parse_head_tail_count(args: list[str]) -> int:
+        """Parse the line count for a piped ``head``/``tail``.
+
+        Accepts ``-n <count>``, ``-<count>`` or a bare ``<count>``; the last
+        one given wins and the default is 10.
+
+        Raises:
+            PIFSCommandError: when ``-n`` has no value, an option is not
+                supported, or a count is not a non-negative integer.
+        """
+        parse = PIFSCommandExecutor._parse_non_negative_int
+        missing = object()
+        count = 10
+        remaining = iter(args)
+        for token in remaining:
+            if token == "-n":
+                # The count is the argument that follows "-n".
+                value = next(remaining, missing)
+                if value is missing:
+                    raise PIFSCommandError("head/tail -n requires a count")
+                count = parse(value, "head/tail count")
+            elif re.fullmatch(r"-\d+", token):
+                count = parse(token[1:], "head/tail count")
+            elif token.startswith("-"):
+                raise PIFSCommandError(f"Unsupported head/tail option: {token}")
+            else:
+                count = parse(token, "head/tail count")
+        return count
+
+    @staticmethod
+    def _parse_standalone_head_tail(args: list[str], *, default_count: int) -> tuple[int, str]:
+        """Parse the arguments of a standalone ``head``/``tail`` command.
+
+        Accepts ``-n <count>`` or ``-<count>`` plus a file target (the last
+        non-option argument wins). Returns ``(count, target)``.
+
+        Raises:
+            PIFSCommandError: when ``-n`` has no value, an option is not
+                supported, a count is not a non-negative integer, or no
+                target is given.
+        """
+        parse = PIFSCommandExecutor._parse_non_negative_int
+        missing = object()
+        count = default_count
+        target = ""
+        remaining = iter(args)
+        for token in remaining:
+            if token == "-n":
+                value = next(remaining, missing)
+                if value is missing:
+                    raise PIFSCommandError("head/tail -n requires a count")
+                count = parse(value, "head/tail count")
+            elif re.fullmatch(r"-\d+", token):
+                count = parse(token[1:], "head/tail count")
+            elif token.startswith("-"):
+                raise PIFSCommandError(f"Unsupported head/tail option: {token}")
+            else:
+                target = token
+        if not target:
+            raise PIFSCommandError("head/tail requires a file target")
+        return count, target
+
+    @staticmethod
+    def _parse_non_negative_int(value: str, label: str) -> int:
+        """Parse ``value`` as an integer that is zero or greater.
+
+        Raises:
+            PIFSCommandError: when ``value`` is not an integer or is negative;
+                ``label`` names the offending argument in the message.
+        """
+        try:
+            number = int(value)
+        except ValueError as exc:
+            raise PIFSCommandError(f"{label} must be an integer") from exc
+        if number >= 0:
+            return number
+        raise PIFSCommandError(f"{label} must be non-negative")
+
+    @classmethod
+    def _parse_bounded_int(cls, value: str, label: str, *, max_value: int) -> int:
+        """Parse a non-negative integer and enforce an upper bound of ``max_value``."""
+        return cls._require_at_most(
+            cls._parse_non_negative_int(value, label),
+            label,
+            max_value,
+        )
+
+    @classmethod
+    def _require_at_most(cls, value: int, label: str, max_value: int) -> int:
+        """Return ``value`` unchanged unless it exceeds ``max_value`` (PIFSCommandError)."""
+        if value <= max_value:
+            return value
+        guidance = (
+            "Split it into a smaller call. If the evidence is sufficient, "
+            "stop; if not, continue with additional chunks before "
+            "answering. If you are unsure where to inspect, use cat <target> --structure first."
+        )
+        raise PIFSCommandError(f"{label} supports at most {max_value}; requested {value}. {guidance}")
+
+    @staticmethod
+    def _parse_find_maxdepth(value: str | None) -> int:
+        """Parse the ``find -maxdepth`` operand; missing, non-integer or negative input is rejected."""
+        message = "find -maxdepth requires an integer >= 0"
+        try:
+            depth = None if value is None else int(value)
+        except ValueError as exc:
+            raise PIFSCommandError(message) from exc
+        if depth is None or depth < 0:
+            raise PIFSCommandError(message)
+        return depth
+
+    @staticmethod
+    def _try_json_loads(input_text: str) -> Any | None:  # best-effort JSON decode
+        try:
+            return json.loads(input_text)
+        except json.JSONDecodeError:
+            return None  # not valid JSON; note a JSON ``null`` document also yields None
+
+    @staticmethod
+    def _render_json_payload(payload: Any) -> str:  # serialize ``payload`` as unindented JSON text
+        return json.dumps(payload, ensure_ascii=False)  # non-ASCII characters are kept unescaped
+
+    @classmethod
+    def _slice_payload(cls, payload: Any, count: int, *, from_tail: bool) -> Any:
+        """Keep the first (or last, when ``from_tail``) ``count`` entries of a list or dict payload."""
+        if isinstance(payload, list):
+            window = slice(-count, None) if from_tail and count else slice(count)
+            return payload[window]
+        if not isinstance(payload, dict):
+            return payload
+        if "data" not in payload:
+            return cls._slice_mapping_lists(dict(payload), count, from_tail=from_tail)
+        data = cls._slice_data(payload["data"], count, from_tail=from_tail)
+        return {**payload, "data": data}
+
+    @classmethod
+    def _slice_data(cls, data: Any, count: int, *, from_tail: bool) -> Any:
+        """Trim ``data`` to ``count`` list items or text lines, taken from the end when ``from_tail``."""
+        window = slice(-count, None) if from_tail and count else slice(count)
+        if isinstance(data, list):
+            return data[window]
+        if not isinstance(data, dict):
+            return data
+        text = data.get("text")
+        if not isinstance(text, str):
+            return cls._slice_mapping_lists(data, count, from_tail=from_tail)
+        return {**data, "text": "\n".join(text.splitlines()[window])}
+
+    @classmethod
+    def _slice_mapping_lists(cls, data: dict[str, Any], count: int, *, from_tail: bool) -> dict[str, Any]:
+        """Return a shallow copy of ``data`` whose list values are cut to ``count`` items."""
+        window = slice(-count, None) if from_tail and count else slice(count)
+        return {
+            key: value[window] if isinstance(value, list) else value for key, value in data.items()
+        }
+
+    @classmethod
+    def _filter_payload(
+        cls,
+        payload: Any,
+        pattern: str,
+        *,
+        ignore_case: bool,
+        invert: bool,
+        regex: bool,
+    ) -> Any:
+        """Keep only the parts of a JSON payload that match ``pattern``.
+
+        Dispatch depends on the payload shape:
+
+        * a list is filtered item by item with ``_json_matches``;
+        * a dict with a ``"data"`` key gets that value filtered by
+          ``_filter_data`` while its other keys are copied as-is;
+        * any other dict has each of its list values filtered by
+          ``_filter_mapping_lists``;
+        * every other value is returned untouched.
+
+        ``ignore_case``, ``invert`` and ``regex`` are forwarded unchanged to
+        the matching helpers. Dict inputs are shallow-copied, not mutated.
+        """
+        # Bundle the matcher flags once so every helper receives the same set.
+        options = {"ignore_case": ignore_case, "invert": invert, "regex": regex}
+        if isinstance(payload, list):
+            return [item for item in payload if cls._json_matches(item, pattern, **options)]
+        if not isinstance(payload, dict):
+            return payload
+        # No "data" envelope: filter the list values of the mapping itself.
+        if "data" not in payload:
+            return cls._filter_mapping_lists(dict(payload), pattern, **options)
+        # Envelope payload: only the "data" value is filtered.
+        filtered_data = cls._filter_data(payload["data"], pattern, **options)
+        return {**payload, "data": filtered_data}
+
+    @classmethod
+    def _filter_data(
+        cls,
+        data: Any,
+        pattern: str,
+        *,
+        ignore_case: bool,
+        invert: bool,
+        regex: bool,
+    ) -> Any:
+        """Filter the value stored under an envelope's ``"data"`` key.
+
+        * list: keep the items accepted by ``_json_matches``;
+        * dict: filter each list value via ``_filter_mapping_lists``;
+        * str: keep the lines accepted by ``_text_matches``, re-joined
+          with newlines;
+        * anything else: returned unchanged.
+
+        The three flags are passed through to the matching helpers as given.
+        """
+        options = {"ignore_case": ignore_case, "invert": invert, "regex": regex}
+        if isinstance(data, list):
+            return [item for item in data if cls._json_matches(item, pattern, **options)]
+        if isinstance(data, dict):
+            return cls._filter_mapping_lists(data, pattern, **options)
+        if not isinstance(data, str):
+            return data
+        kept_lines = [
+            line for line in data.splitlines() if cls._text_matches(line, pattern, **options)
+        ]
+        return "\n".join(kept_lines)
+
+    @classmethod
+    def _filter_mapping_lists(
+        cls,
+        data: dict[str, Any],
+        pattern: str,
+        *,
+        ignore_case: bool,
+        invert: bool,
+        regex: bool,
+    ) -> dict[str, Any]:
+        """Copy ``data`` with each list value reduced to the items accepted by ``_json_matches``."""
+        options = {"ignore_case": ignore_case, "invert": invert, "regex": regex}
+        # Non-list values are carried over untouched.
+        return {
+            key: [item for item in value if cls._json_matches(item, pattern, **options)]
+            if isinstance(value, list)
+            else value
+            for key, value in data.items()
+        }
+
+    @classmethod
+    def _json_matches(
+        cls, value: Any, pattern: str, *, ignore_case: bool, invert: bool, regex: bool
+    ) -> bool:
+        """Match ``pattern`` against the key-sorted JSON serialization of ``value``."""
+        return cls._text_matches(
+            json.dumps(value, ensure_ascii=False, sort_keys=True),
+            pattern,
+            ignore_case=ignore_case,
+            invert=invert,
+            regex=regex,
+        )
+
+    @staticmethod
+    def _text_matches(
+        text: str, pattern: str, *, ignore_case: bool, invert: bool, regex: bool
+    ) -> bool:
+        """Report whether ``text`` matches ``pattern``, flipped when ``invert`` is set.
+
+        ``regex`` selects ``re.search``; otherwise a plain substring test is
+        used, lower-casing both sides when ``ignore_case`` is set. An invalid
+        regular expression is re-raised as PIFSCommandError.
+        """
+        if regex:
+            try:
+                found = bool(re.search(pattern, text, re.IGNORECASE if ignore_case else 0))
+            except re.error as exc:
+                raise PIFSCommandError(f"Invalid grep regex: {exc}") from exc
+        elif ignore_case:
+            found = pattern.lower() in text.lower()
+        else:
+            found = pattern in text
+        return found ^ bool(invert)
+
+    @classmethod
+    def _slice_text_payload(cls, payload: Any, start: int, end: int) -> Any:
+        """Cut ``data.text`` to the 1-based inclusive lines ``start``..``end``; other payloads pass through."""
+        if not isinstance(payload, dict):
+            return payload
+        sliced = dict(payload)
+        data = sliced.get("data")
+        if isinstance(data, dict) and isinstance(data.get("text"), str):
+            lines = data["text"].splitlines()
+            # Clamp so no bound turns into a negative index that would slice from the end.
+            start, end = max(start, 1), max(end, 0)
+            window = {"text": "\n".join(lines[start - 1 : end]), "start_line": start}
+            sliced["data"] = {**data, **window, "end_line": min(end, len(lines))}
+        return sliced
diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py
new file mode 100644
index 000000000..72833b78a
--- /dev/null
+++ b/pageindex/filesystem/core.py
@@ -0,0 +1,1949 @@
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Optional, Union
+from urllib.parse import unquote, urlparse
+
+from .metadata import MetadataQueryEngine
+from .metadata_generation import (
+    MetadataGenerationBackend,
+    MetadataGenerationError,
+    MetadataGenerationInput,
+    MetadataGenerationResult,
+    MetadataGenerator,
+)
+from .semantic_folder_policy import (
+    SEMANTIC_FOLDER_BASE_FIELDS,
+    SEMANTIC_FOLDER_ROOT,
+    SEMANTIC_FOLDER_SYSTEM_FIELDS,
+    canonical_semantic_folder_field_name,
+    is_semantic_folder_forbidden_field,
+    semantic_folder_allowed_extension_fields,
+)
+from .store import (
+    SQLiteFileSystemStore,
+    fingerprint,
+    make_file_ref,
+    metadata_text,
+    normalize_path,
+)
+from .structural_read import (
+    flatten_pageindex_structure_nodes,
+    first_node_location,
+    find_pageindex_node,
+    strip_pageindex_text_fields,
+)
+from .types import OpenResult, SearchResult
+
+if TYPE_CHECKING:
+    from ..client import PageIndexClient
+    from .projection_indexing import SummaryProjectionIndexer
+
+DEFAULT_METADATA_GENERATION_FIELDS = {  # copied into default_metadata_policy()["fields"]
+    "summary": True,
+    "doc_type": True,
+    "domain": True,
+    "topic": True,
+    "entity": False,
+    "relation": False,
+}
+
+DEFAULT_METADATA_FIELD_TYPES = {  # every default field is typed "string"
+    "summary": "string",
+    "doc_type": "string",
+    "domain": "string",
+    "topic": "string",
+    "entity": "string",
+    "relation": "string",
+}
+
+METADATA_STATUSES = {  # NOTE(review): presumably the valid metadata_status["status"] values - confirm
+    "skipped",
+    "pending_submit",
+    "pending_generate",
+    "generated",
+    "failed",
+}
+
+PROJECTION_INDEX_STATUSES = {  # NOTE(review): presumably projection-index lifecycle states - confirm
+    "not_indexed",
+    "pending_index",
+    "generated",
+    "ready",
+    "failed",
+}
+
+SEMANTIC_RETRIEVAL_CHANNELS = ("summary", "entity", "relation")  # canonical channel order
+SEMANTIC_GREP_CHANNELS = ("entity", "relation")  # channels that enable "semantic-grep"
+PAGEINDEX_DOCUMENT_SUFFIXES = {".pdf", ".md", ".markdown"}
+PAGEINDEX_DOCUMENT_CONTENT_TYPES = {
+    "application/pdf",
+    "text/markdown",
+    "text/x-markdown",
+    "application/markdown",
+}
+TEXT_ARTIFACT_SUFFIXES = {".txt", ".text"}
+TEXT_ARTIFACT_CONTENT_TYPES = {"text/plain"}
+
+
+class PageIndexFileSystem:
+    def __init__(
+        self,
+        workspace: Union[str, Path],
+        *,
+        semantic_retrieval_backend: Any | None = None,
+        metadata_generator: MetadataGenerationBackend | None = None,
+        metadata_provider: str = "openai",
+        metadata_model: str | None = None,
+        metadata_base_url: str | None = None,
+        metadata_max_text_chars: int = 24000,
+        summary_projection_indexer: SummaryProjectionIndexer | None = None,
+        summary_projection_index: bool = True,
+        summary_projection_index_dir: Union[str, Path, None] = None,
+        summary_projection_embedding_provider: str = "openai",
+        summary_projection_embedding_model: str = "text-embedding-3-small",
+        summary_projection_embedding_dimensions: int = 256,
+        summary_projection_embedding_timeout: float = 60,
+    ):
+        self.workspace = Path(workspace).expanduser()  # "~" is expanded once, up front
+        self.store = SQLiteFileSystemStore(self.workspace)  # the store receives the workspace path
+        self.metadata = MetadataQueryEngine(self.store)  # query engine built over the same store
+        self.semantic_retrieval_backend = semantic_retrieval_backend  # may stay None until configured
+        self.metadata_generator = metadata_generator  # may stay None; register() can create a default
+        self.metadata_provider = metadata_provider
+        self.metadata_model = metadata_model
+        self.metadata_base_url = metadata_base_url
+        self.metadata_max_text_chars = metadata_max_text_chars
+        self.summary_projection_indexer = summary_projection_indexer
+        self.summary_projection_index = summary_projection_index
+        self.summary_projection_index_dir = (
+            Path(summary_projection_index_dir).expanduser()
+            if summary_projection_index_dir is not None
+            else self.workspace / "artifacts" / "projection_indexes"  # default location in the workspace
+        )
+        self.summary_projection_embedding_provider = summary_projection_embedding_provider
+        self.summary_projection_embedding_model = summary_projection_embedding_model
+        self.summary_projection_embedding_dimensions = summary_projection_embedding_dimensions
+        self.summary_projection_embedding_timeout = summary_projection_embedding_timeout
+
+    def register_file(
+        self,
+        *,
+        storage_uri: str,
+        source_path: str,
+        folder_path: Optional[str] = None,
+        metadata: Optional[dict[str, Any]] = None,
+        external_id: Optional[str] = None,
+        title: Optional[str] = None,
+        content: str = "",
+        content_type: str = "text/plain",
+        source_type: Optional[str] = None,
+        metadata_policy: Optional[dict[str, Any]] = None,
+        metadata_status: Optional[str] = None,
+    ) -> str:
+        """Register one file through ``register_files`` and return its file_ref."""
+        spec = dict(
+            storage_uri=storage_uri,
+            source_path=source_path,
+            folder_path=folder_path,
+            metadata=metadata,
+            external_id=external_id,
+            title=title,
+            content=content,
+            content_type=content_type,
+            source_type=source_type,
+            metadata_policy=metadata_policy,
+            metadata_status=metadata_status,
+        )
+
+        # register_files returns one file_ref per submitted spec.
+        return self.register_files([spec])[0]
+
+    def register(self, **kwargs: Any) -> str:  # register_file() plus lazy default wiring
+        if not self._register_uses_deferred_metadata(kwargs.get("metadata_policy")):
+            self._ensure_register_completion_defaults()  # skipped when the policy defers to batch mode
+        return self.register_file(**kwargs)
+
+    def register_files(self, files: list[dict[str, Any]]) -> list[str]:  # one file_ref per input
+        records = [self._prepare_file_record(file) for file in files]
+        try:
+            for record in records:
+                self._generate_register_metadata(record)
+            self._register_generation_policy_schema(records)
+            self.store.insert_files(records)  # all records are handed to the store in one call
+        except Exception:
+            self._cleanup_failed_register_artifacts(records)  # cleanup hook, then re-raise unchanged
+            raise
+        for record in records:  # post-insert steps run outside the cleanup guard above
+            if self._complete_summary_projection_index(record):  # truthy -> persist metadata/status
+                self.store.update_file_metadata_status(
+                    record["file_ref"],
+                    metadata=record["metadata"],
+                    metadata_status=record["metadata_status"],
+                )
+            self._sync_owned_raw_artifact(record)
+        return [record["file_ref"] for record in records]
+
+    def batch_generate(self, *, limit: int | None = None) -> dict[str, Any]:
+        """Generate metadata for files the store lists as pending and summarize the run.
+
+        Raises MetadataGenerationError when no ``metadata_generator`` is configured.
+        """
+        if self.metadata_generator is None:
+            raise MetadataGenerationError(
+                "metadata_generator is required to generate pending PIFS metadata"
+            )
+        pending = self.store.list_pending_metadata_status(limit=limit)
+        file_refs: list[str] = []
+        failed = 0
+        for entry in pending:
+            record = self._record_from_file_entry(entry)
+            self._generate_register_metadata(record, force=True)
+            self._complete_summary_projection_index(record)
+            self._register_generation_policy_schema([record])
+            self.store.update_file_metadata_status(
+                record["file_ref"],
+                metadata=record["metadata"],
+                metadata_status=record["metadata_status"],
+            )
+            self._sync_owned_raw_artifact(record)
+            file_refs.append(record["file_ref"])
+            failed += record["metadata_status"]["status"] == "failed"
+        return {
+            "processed": len(pending),
+            "generated": len(file_refs) - failed,
+            "failed": failed,
+            "file_refs": file_refs,
+        }
+
+    def _ensure_register_completion_defaults(self) -> None:
+        """Create whichever of generator, summary indexer and retrieval backend is still unset."""
+        if self.metadata_generator is None:
+            self.metadata_generator = MetadataGenerator(
+                provider=self.metadata_provider,
+                model=self.metadata_model,
+                base_url=self.metadata_base_url,
+                max_text_chars=self.metadata_max_text_chars,
+            )
+        if not self.summary_projection_index:
+            return
+        embedding_options = {
+            "embedding_provider": self.summary_projection_embedding_provider,
+            "embedding_model": self.summary_projection_embedding_model,
+            "embedding_dimensions": self.summary_projection_embedding_dimensions,
+            "embedding_timeout": self.summary_projection_embedding_timeout,
+        }
+        index_dir = self.summary_projection_index_dir
+        if self.summary_projection_indexer is None:
+            from .projection_indexing import SummaryProjectionIndexer
+
+            self.summary_projection_indexer = SummaryProjectionIndexer.from_provider(
+                index_dir, **embedding_options
+            )
+        if self.semantic_retrieval_backend is None:
+            self.configure_hybrid_projection_retrieval(index_dir, **embedding_options)
+
+    def configure_existing_projection_retrieval(self) -> bool:
+        """Wire semantic retrieval to projection indexes that already exist on disk.
+
+        Index files are built during register-time generation; reopening a
+        workspace should still offer read commands such as search-summary
+        without registering anything again. Embedding settings recorded in the
+        index metadata take precedence over this instance's configured values.
+
+        Returns whether any semantic retrieval channel is available afterwards.
+        """
+        if self.semantic_retrieval_backend is None:
+            index_config = self._existing_projection_index_config()
+            if index_config is None:
+                return False
+            # Prefer the settings the index itself reports.
+            recorded = dict(index_config.get("metadata") or {})
+            provider = (
+                recorded.get("embedding_provider")
+                or self.summary_projection_embedding_provider
+            )
+            model = recorded.get("embedding_model") or self.summary_projection_embedding_model
+            dimensions = (
+                recorded.get("embedding_dimensions")
+                or index_config.get("dimension")
+                or self.summary_projection_embedding_dimensions
+            )
+            self.configure_hybrid_projection_retrieval(
+                self.summary_projection_index_dir,
+                embedding_provider=str(provider),
+                embedding_model=str(model),
+                embedding_dimensions=int(dimensions),
+                embedding_timeout=self.summary_projection_embedding_timeout,
+            )
+        return bool(self.semantic_retrieval_channels())
+
+    def _existing_projection_index_config(self) -> dict[str, Any] | None:
+        """Return ``info()`` for the first usable projection index on disk, else None."""
+        from .hybrid_projection import INDEX_BY_CHANNEL
+        from .semantic_index import SQLiteVecSemanticIndex
+
+        for channel in SEMANTIC_RETRIEVAL_CHANNELS:
+            index_name = INDEX_BY_CHANNEL.get(channel)
+            if not index_name:
+                continue
+            index_path = self.summary_projection_index_dir / f"{index_name}.sqlite"
+            if not index_path.exists():
+                continue
+            try:
+                info = SQLiteVecSemanticIndex(index_path).info()
+            except Exception:  # best-effort probe: an index that cannot be read is skipped
+                continue
+            if int(info.get("document_count") or 0) <= 0:
+                continue
+            declared = dict(info.get("metadata") or {}).get("channel")
+            if not declared or declared == channel:
+                return info
+        return None
+
+    @staticmethod
+    def _register_uses_deferred_metadata(policy: Any) -> bool:
+        """True when ``policy`` is a dict with a truthy ``batch`` or ``mode == "batch"``."""
+        is_policy = isinstance(policy, dict)
+        return is_policy and (bool(policy.get("batch")) or policy.get("mode") == "batch")
+
+    @classmethod
+    def default_metadata_policy(cls) -> dict[str, Any]:
+        """Build a fresh default policy: stock fields, summary projection index on, no batching."""
+        policy: dict[str, Any] = {"fields": dict(DEFAULT_METADATA_GENERATION_FIELDS)}
+        policy["projection_indexes"] = {"summary": True}
+        policy["batch"] = False
+        return policy
+
+    def browse(
+        self,
+        path: str = "/",
+        recursive: bool = False,
+        limit: int = 100,
+        max_depth: int | None = None,
+    ) -> dict[str, list[dict[str, Any]]]:
+        """List folder contents through the store.
+
+        All arguments are forwarded unchanged to ``store.list_folder``.
+        """
+        listing_options = {"recursive": recursive, "limit": limit, "max_depth": max_depth}
+        return self.store.list_folder(path, **listing_options)
+
+    def folder_info(self, path: str = "/") -> dict[str, Any]:  # defaults to the root folder "/"
+        return self.store.folder_info(path)  # thin delegate to the store
+
+    def find_folders(
+        self,
+        path: str = "/",
+        metadata_filter: Optional[dict[str, Any] | str] = None,
+        limit: int = 100,
+        max_depth: int | None = None,
+    ) -> list[dict[str, Any]]:
+        """Find folders under ``path`` via the store, after parsing ``metadata_filter``."""
+        return self.store.find_folders(
+            path,
+            metadata_filter=self.metadata.parse_filter(metadata_filter),
+            limit=limit,
+            max_depth=max_depth,
+        )
+
+    def create_folder(
+        self,
+        path: str,
+        kind: str = "manual",
+        description: str = "",
+        metadata: Optional[dict[str, Any]] = None,
+    ) -> str:
+        """Create a folder through the store and return the store's result.
+
+        ``kind``, ``description`` and ``metadata`` are forwarded as keywords.
+        """
+        options = {"kind": kind, "description": description, "metadata": metadata}
+        return self.store.create_folder(path, **options)
+
+    def attach_file_to_folder(
+        self,
+        file_ref: str,
+        folder_path_or_id: str,  # forwarded as-is; the store decides how to interpret it
+        metadata: Optional[dict[str, Any]] = None,
+    ) -> None:
+        self.store.attach_file_to_folder(file_ref, folder_path_or_id, metadata=metadata)  # thin delegate
+
+    def attach_files_to_folders(self, items: list[dict[str, Any]]) -> None:  # batch variant
+        self.store.attach_files_to_folders(items)  # items are passed to the store untouched
+
+    def apply_semantic_folder_projection(
+        self,
+        projection_plan: dict[str, Any],
+        *,
+        file_ref_by_document_id: Optional[dict[str, str]] = None,
+    ) -> dict[str, Any]:
+        """Materialize a Semantic Folder Projection plan for registered files.
+
+        Registration remains the explicit folder placement step; this separate
+        API adds the derived `/semantic/...` memberships described by the plan.
+
+        Every folder and membership item is checked by
+        ``_validate_semantic_folder_projection_item`` before any folder is
+        created. All memberships are attached in one ``attach_files_to_folders`` call.
+
+        ``file_ref_by_document_id`` optionally maps document ids to file refs;
+        ids it does not cover are resolved through ``store.resolve_file_ref``.
+        """
+        folders = list(projection_plan.get("folders") or [])
+        memberships = list(projection_plan.get("memberships") or [])
+        policy = projection_plan.get("policy")
+        if not isinstance(policy, dict):
+            policy = {}
+        allowed_extension_fields = semantic_folder_allowed_extension_fields(
+            policy.get("allowed_extension_fields", [])
+        )
+        # Validate everything up front so a bad item aborts before any write.
+        for item in [*folders, *memberships]:
+            self._validate_semantic_folder_projection_item(item, allowed_extension_fields)
+
+        # Create the folders named by the plan.
+        for folder in folders:
+            folder_metadata = folder.get("metadata")
+            if not isinstance(folder_metadata, dict):
+                folder_metadata = {}
+            self.create_folder(
+                self._validate_semantic_folder_projection_path(str(folder["path"])),
+                kind=str(folder.get("kind") or "semantic_projection"),
+                description=str(folder.get("description") or ""),
+                metadata=folder_metadata,
+            )
+
+        # Resolve each membership to a file ref and collect the attachments.
+        known_refs = file_ref_by_document_id or {}
+        items: list[dict[str, Any]] = []
+        for membership in memberships:
+            document_id = self._semantic_folder_projection_document_id(membership)
+            file_ref = known_refs.get(document_id) or self.store.resolve_file_ref(document_id)
+            folder_metadata = membership.get("folder_metadata")
+            metadata = dict(folder_metadata) if isinstance(folder_metadata, dict) else {}
+            # Projection bookkeeping keys override same-named folder metadata.
+            metadata.update(
+                projection="Semantic Folder Projection",
+                field=membership.get("field", ""),
+                value=membership.get("value", ""),
+                mount_kind=membership.get("mount_kind", "semantic_folder_projection"),
+            )
+            folder_path = self._validate_semantic_folder_projection_path(
+                str(membership["folder_path"])
+            )
+            items.append(
+                {"file_ref": file_ref, "folder": folder_path, "metadata": metadata}
+            )
+
+        self.attach_files_to_folders(items)
+        return {
+            "projection": "Semantic Folder Projection",
+            "folders_applied": len(folders),
+            "memberships_attached": len(items),
+        }
+
+    def search(
+        self,
+        query: Union[str, list[str], None] = None,
+        scope: Optional[dict[str, Any]] = None,
+        metadata_filter: Optional[dict[str, Any] | str] = None,
+        limit: int = 10,
+        semantic: bool = True,
+    ) -> list[SearchResult]:
+        """Search registered files, preferring semantic retrieval when it applies.
+
+        When ``semantic`` is set and ``_should_use_semantic_retrieval`` accepts
+        the query and scope, the semantic results are returned if there are
+        any. Otherwise, or when the semantic search comes back empty, the
+        store's ``search_files`` query is used and every row is converted into
+        a ``SearchResult``.
+        """
+        search_options = {
+            "scope": scope,
+            "metadata_filter": self.metadata.parse_filter(metadata_filter),
+            "limit": limit,
+        }
+        if semantic and self._should_use_semantic_retrieval(query, scope):
+            semantic_results = self._semantic_search(query, **search_options)
+            if semantic_results:
+                return semantic_results
+
+        rows = self.store.search_files(query, **search_options)
+        scope_path = self._scope_folder_path(scope)
+        # Row columns copied verbatim into the SearchResult.
+        passthrough = (
+            "file_ref",
+            "external_id",
+            "title",
+            "snippet",
+            "metadata",
+            "metadata_status",
+            "source_path",
+            "id",
+            "document_id",
+            "name",
+            "description",
+            "status",
+            "pageNum",
+            "createdAt",
+            "folderId",
+        )
+        results = []
+        for row in rows:
+            memberships = self.store.folder_memberships(row["file_ref"])
+            folder_paths = [folder["path"] for folder in memberships]
+            preferred = self._preferred_folder_path(folder_paths, scope_path, row["folder_path"])
+            copied = {name: row[name] for name in passthrough}
+            results.append(SearchResult(folder_path=preferred, folder_paths=folder_paths, **copied))
+        return results
+
+    def search_semantic_channel(
+        self,
+        channel: str,
+        query: Union[str, list[str], None],
+        *,
+        scope: Optional[dict[str, Any]] = None,
+        metadata_filter: Optional[dict[str, Any] | str] = None,
+        limit: int = 10,
+    ) -> list[SearchResult]:
+        """Run a semantic search restricted to a single ``channel``.
+
+        Yields an empty list when no backend is configured, when ``channel``
+        is not among the available channels, or when ``_query_text(query)``
+        is falsy. The metadata filter is parsed before any of these checks.
+        """
+        parsed_filter = self.metadata.parse_filter(metadata_filter)
+        if self.semantic_retrieval_backend is None:
+            return []
+        if not (self.has_semantic_channel(channel) and self._query_text(query)):
+            return []
+        return self._semantic_search(
+            query, scope=scope, metadata_filter=parsed_filter, limit=limit, channel=channel
+        )
+
+    def configure_hybrid_projection_retrieval(
+        self,
+        index_dir: Union[str, Path],
+        *,
+        embedding_provider: str = "openai",
+        embedding_model: str = "text-embedding-3-small",
+        embedding_dimensions: int = 256,
+        embedding_timeout: float = 60,
+        per_channel_limit: int = 100,
+        fetch_multiplier: int = 100,
+    ) -> Any:
+        """Build a HybridProjectionSearchBackend, install it as the retrieval backend, return it."""
+        from .hybrid_projection import HybridProjectionSearchBackend
+        self.semantic_retrieval_backend = backend = HybridProjectionSearchBackend.from_provider(
+            index_dir,
+            embedding_provider=embedding_provider,
+            embedding_model=embedding_model,
+            embedding_dimensions=embedding_dimensions,
+            embedding_timeout=embedding_timeout,
+            per_channel_limit=per_channel_limit,
+            fetch_multiplier=fetch_multiplier,
+        )
+        return backend
+
+    @property
+    def has_semantic_retrieval_backend(self) -> bool:  # True once any backend object is set
+        return self.semantic_retrieval_backend is not None
+
+    def semantic_retrieval_channels(self) -> tuple[str, ...]:
+        """Return the backend's channels in canonical order (empty without a backend)."""
+        backend = self.semantic_retrieval_backend
+        if backend is None:
+            return ()
+        # Prefer a callable ``available_channels`` hook; otherwise read the
+        # ``semantic_tool_channels`` attribute.
+        probe = getattr(backend, "available_channels", None)
+        reported = probe() if callable(probe) else getattr(backend, "semantic_tool_channels", ())
+        supported = set(reported or ())
+        return tuple(filter(supported.__contains__, SEMANTIC_RETRIEVAL_CHANNELS))
+
+    def has_semantic_channel(self, channel: str) -> bool:  # membership test on the live channel list
+        return channel in self.semantic_retrieval_channels()
+
+    def retrieval_capabilities(self) -> dict[str, Any]:
+        """Describe the lexical and semantic retrieval features currently available."""
+        channels = self.semantic_retrieval_channels()
+        grep_channels = [name for name in SEMANTIC_GREP_CHANNELS if name in channels]
+        commands = [f"search-{name}" for name in channels]
+        if grep_channels:
+            # semantic-grep is offered only when one of its channels is available.
+            commands.append("semantic-grep")
+        # The lexical capabilities are fixed; only the semantic block varies.
+        lexical = {
+            "grep_recursive": True,
+            "grep_recursive_semantic_prefilter": False,
+            "grep_recursive_guard": "bounded broad-folder notice",
+            "find_maxdepth": True,
+        }
+        semantic = {
+            "backend_configured": self.semantic_retrieval_backend is not None,
+            "channels": list(channels),
+            "commands": commands,
+            "semantic_grep_channels": grep_channels,
+        }
+        return {"lexical": lexical, "semantic": semantic}
+
+    def find(
+        self,
+        target: str,
+        patterns: Union[str, list[str]],
+        limit: int = 20,
+    ) -> list[OpenResult]:
+        """Return up to ``limit`` windows (match line plus one line each side) for matching lines."""
+        file_ref = self._resolve_target(target)
+        patterns = [patterns] if isinstance(patterns, str) else list(patterns)
+        lowered_patterns = [pattern.lower() for pattern in patterns if pattern]
+        # A non-positive limit yields nothing; the in-loop check alone let one match through.
+        if not lowered_patterns or limit <= 0:
+            return []
+        lines = self.store.read_text(file_ref).splitlines()
+        matches = []
+        for i, line in enumerate(lines, 1):
+            haystack = line.lower()
+            if not any(pattern in haystack for pattern in lowered_patterns):
+                continue
+            matches.append(self._open_lines(file_ref, max(1, i - 1), min(len(lines), i + 1)))
+            if len(matches) >= limit:
+                break
+        return matches
+
+    def open(self, target: str, location: str = "all") -> OpenResult:
+        """Read a text artifact whole or by line range; PDF/Markdown PageIndex files are rejected."""
+        file_ref = self._resolve_target(target)
+        if self._file_format(self.store.get_file(file_ref)) in {"pdf", "markdown", "pageindex"}:
+            raise ValueError(
+                "open() text artifact reads are not supported for PDF/Markdown PageIndex files; "
+                "use pageindex_structure(), pageindex_pages(), or pageindex_node()."
+            )
+        if str(location).strip().lower() in {"all", "full", "*"}:
+            return self._open_all(file_ref)
+        first, last = self._parse_line_range(location)
+        return self._open_lines(file_ref, first, last)
+
+    def cat_text_artifact(self, target: str, location: str = "all") -> OpenResult:
+        """Return a text file's artifact, whole or restricted to a line range.
+
+        The entry must pass ``_require_text_artifact_file`` (checked under the
+        command label ``cat --all``). ``all``, ``full`` and ``*`` select the
+        whole artifact; anything else goes through ``_parse_line_range``.
+        """
+        ref = self._resolve_target(target)
+        record = self.store.get_file(ref)
+        self._require_text_artifact_file(record, "cat --all")
+        selector = str(location).strip().lower()
+        if selector not in {"all", "full", "*"}:
+            first, last = self._parse_line_range(location)
+            return self._open_lines(ref, first, last)
+        return self._open_all(ref)
+
+    def pageindex_structure(
+        self,
+        target: str,
+        *,
+        offset: int = 0,
+        limit: int = 25,
+    ) -> dict[str, Any]:
+        """Return one window of the flattened PageIndex structure for ``target``.
+
+        Negative ``offset`` and ``limit`` values are clamped to zero. When the
+        document is not present in the PageIndexClient workspace, or the client
+        reports an error, the ``_structural_unavailable`` payload is returned
+        instead.
+        """
+        file_ref = self._resolve_target(target)
+        entry = self.store.get_file(file_ref)
+        self._require_pageindex_document_file(entry, "cat --structure")
+        client, doc_id = self._pageindex_client_doc_for_entry(entry)
+
+        def unavailable(reason: str) -> dict[str, Any]:
+            return self._structural_unavailable("structure", entry, message=reason)
+
+        if doc_id is None:
+            return unavailable(
+                "PageIndex structure is not cached for this file in the "
+                "PageIndexClient workspace."
+            )
+        structure = self._client_json(client.get_document_structure(doc_id))
+        if isinstance(structure, dict) and structure.get("error"):
+            return unavailable(str(structure["error"]))
+
+        rows = flatten_pageindex_structure_nodes(structure)
+        start = max(0, offset)
+        size = max(0, limit)
+        page = rows[start : start + size] if size else []
+        stop = start + len(page)
+        more_available = stop < len(rows)
+        pagination = {
+            "offset": start,
+            "limit": size,
+            "returned_nodes": len(page),
+            "total_nodes": len(rows),
+            "has_more": more_available,
+            "next_offset": stop if more_available else None,
+        }
+        return {
+            "mode": "structure",
+            "file_ref": file_ref,
+            "external_id": entry.external_id,
+            "source_path": entry.source_path,
+            "status": entry.pageindex_tree_status,
+            "available": True,
+            "pageindex_doc_id": doc_id,
+            "structure": page,
+            "structure_pagination": pagination,
+        }
+
+    def pageindex_node(self, target: str, node_id: str) -> dict[str, Any]:
+        """Return the text and stripped metadata of one cached PageIndex node.
+
+        The node's own ``text`` is preferred. If it is empty, page content is
+        fetched for the location given by ``first_node_location``. Every
+        failure mode (document not cached, node missing, no text) is reported
+        through ``_structural_unavailable`` rather than raised.
+        """
+        file_ref = self._resolve_target(target)
+        entry = self.store.get_file(file_ref)
+        self._require_pageindex_document_file(entry, "cat --node")
+        client, doc_id = self._pageindex_client_doc_for_entry(entry)
+
+        def unavailable(reason: str) -> dict[str, Any]:
+            return self._structural_unavailable(
+                "node",
+                entry,
+                node_id=node_id,
+                message=reason,
+            )
+
+        if doc_id is None:
+            return unavailable(
+                "PageIndex structure is not cached for this file in the "
+                "PageIndexClient workspace."
+            )
+        client._ensure_doc_loaded(doc_id)
+        document = client.documents.get(doc_id, {})
+        node = find_pageindex_node(document.get("structure", []), node_id)
+        if node is None:
+            return unavailable("PageIndex node was not found in the cached structure.")
+
+        body = str(node.get("text") or "")
+        if not body:
+            # Fall back to page content when the node carries no inline text.
+            location = first_node_location(node)
+            if location:
+                fetched = self._client_json(client.get_page_content(doc_id, location))
+                if isinstance(fetched, list):
+                    body = "\n\n".join(str(page.get("content") or "") for page in fetched)
+        if not body:
+            return unavailable("Cached PageIndex node has no text content.")
+        return {
+            "mode": "node",
+            "file_ref": file_ref,
+            "external_id": entry.external_id,
+            "source_path": entry.source_path,
+            "status": entry.pageindex_tree_status,
+            "available": True,
+            "pageindex_doc_id": doc_id,
+            "node_id": node_id,
+            "node": strip_pageindex_text_fields(node),
+            "text": body,
+        }
+
+    def pageindex_pages(self, target: str, pages: str) -> dict[str, Any]:
+        """Return cached PageIndex page content for the ``pages`` selector.
+
+        The selector is passed unchanged to the client's ``get_page_content``.
+        A missing document, a client error, or an empty/non-list response all
+        produce the ``_structural_unavailable`` payload.
+        """
+        file_ref = self._resolve_target(target)
+        entry = self.store.get_file(file_ref)
+        self._require_pageindex_document_file(entry, "cat --page")
+        client, doc_id = self._pageindex_client_doc_for_entry(entry)
+
+        def unavailable(reason: str) -> dict[str, Any]:
+            return self._structural_unavailable(
+                "page",
+                entry,
+                pages=pages,
+                message=reason,
+            )
+
+        if doc_id is None:
+            return unavailable(
+                "PageIndex page content is not cached for this file in the "
+                "PageIndexClient workspace."
+            )
+        fetched = self._client_json(client.get_page_content(doc_id, pages))
+        if isinstance(fetched, dict) and fetched.get("error"):
+            return unavailable(str(fetched["error"]))
+        if not (isinstance(fetched, list) and fetched):
+            return unavailable("Requested PageIndex page content is not cached for this file.")
+        joined = "\n\n".join(str(page.get("content") or "") for page in fetched)
+        return {
+            "mode": "page",
+            "file_ref": file_ref,
+            "external_id": entry.external_id,
+            "source_path": entry.source_path,
+            "status": entry.pageindex_tree_status,
+            "available": True,
+            "pageindex_doc_id": doc_id,
+            "pages": pages,
+            "data": fetched,
+            "text": joined,
+        }
+
+    def _stat(self, target: str) -> dict[str, Any]:
+        """Resolve ``target`` and return the store's ``file_info`` for it."""
+        return self.store.file_info(self._resolve_target(target))
+
+    def _require_text_artifact_file(self, entry: Any, command: str) -> None:
+        """Raise ``ValueError`` unless ``entry`` is classified as a text file.
+
+        ``command`` is only used to label the error message.
+        """
+        if self._file_format(entry) != "text":
+            raise ValueError(
+                f"{command} is only supported for txt/text files; "
+                f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. "
+                "Use cat <path|file_ref|document_id> --structure, "
+                "cat <path|file_ref|document_id> --page, or "
+                "cat <path|file_ref|document_id> --node for PDF/Markdown PageIndex files."
+            )
+
+    def _require_pageindex_document_file(self, entry: Any, command: str) -> None:
+        """Raise ``ValueError`` unless ``entry`` is a PDF/Markdown/PageIndex file.
+
+        ``command`` is only used to label the error message.
+        """
+        kind = self._file_format(entry)
+        if kind not in {"pdf", "markdown", "pageindex"}:
+            raise ValueError(
+                f"{command} is only supported for PDF/Markdown PageIndex files; "
+                f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. "
+                "Use cat <path|file_ref|document_id> --all for txt/text files."
+            )
+
+    @classmethod
+    def _file_format(cls, entry: Any) -> str:
+        """Classify ``entry`` as pdf, markdown, text, pageindex or unsupported.
+
+        Checks run in priority order: PDF (suffix or content type), PageIndex
+        document suffix or content type, text suffix, an existing PageIndex
+        document id or a tree status other than ``not_built``, and finally a
+        text content type.
+        """
+        extension = Path(str(entry.source_path or "")).suffix.lower()
+        media_type = cls._normalized_content_type(entry.content_type)
+        if extension == ".pdf" or media_type == "application/pdf":
+            return "pdf"
+        if extension in PAGEINDEX_DOCUMENT_SUFFIXES:
+            return "markdown"
+        if media_type in PAGEINDEX_DOCUMENT_CONTENT_TYPES:
+            return "markdown"
+        if extension in TEXT_ARTIFACT_SUFFIXES:
+            return "text"
+        if entry.pageindex_doc_id:
+            return "pageindex"
+        if entry.pageindex_tree_status != "not_built":
+            return "pageindex"
+        return "text" if media_type in TEXT_ARTIFACT_CONTENT_TYPES else "unsupported"
+
+    @classmethod
+    def _source_format(cls, source_path: Any, content_type: str | None) -> str:
+        """Classify a source by path suffix and content type alone.
+
+        Returns ``pdf``, ``markdown``, ``text`` or ``unsupported``. Unlike
+        ``_file_format`` this never reports ``pageindex`` because no stored
+        entry state is consulted.
+        """
+        extension = Path(str(source_path or "")).suffix.lower()
+        media_type = cls._normalized_content_type(content_type)
+        if extension == ".pdf" or media_type == "application/pdf":
+            return "pdf"
+        if extension in PAGEINDEX_DOCUMENT_SUFFIXES:
+            return "markdown"
+        if media_type in PAGEINDEX_DOCUMENT_CONTENT_TYPES:
+            return "markdown"
+        if extension in TEXT_ARTIFACT_SUFFIXES or media_type in TEXT_ARTIFACT_CONTENT_TYPES:
+            return "text"
+        return "unsupported"
+
+    @staticmethod
+    def _normalized_content_type(content_type: str | None) -> str:
+        """Return the lower-cased media type with any ``;`` parameters removed."""
+        raw = str(content_type or "")
+        media_type, _, _ = raw.partition(";")
+        return media_type.strip().lower()
+
+    @property
+    def pageindex_client_workspace(self) -> Path:
+        """Directory under the workspace used for the PageIndexClient."""
+        artifacts_dir = self.workspace / "artifacts"
+        return artifacts_dir / "pageindex_client"
+
+    def _pageindex_client(self) -> PageIndexClient:
+        """Create a ``PageIndexClient`` rooted at ``pageindex_client_workspace``.
+
+        A new client object is constructed on every call.
+        """
+        # Imported at call time rather than at module import.
+        from ..client import PageIndexClient
+
+        workspace_dir = str(self.pageindex_client_workspace)
+        return PageIndexClient(workspace=workspace_dir)
+
+    def _pageindex_client_doc_for_entry(self, entry: Any) -> tuple[PageIndexClient, str | None]:
+        """Return a client plus the entry's document id if the client knows it.
+
+        The id is ``None`` when the entry has no ``pageindex_doc_id`` or the
+        id is absent from ``client.documents``.
+        """
+        client = self._pageindex_client()
+        doc_id = entry.pageindex_doc_id
+        if doc_id and doc_id in client.documents:
+            return client, doc_id
+        return client, None
+
+    def _registration_pageindex_pointer(
+        self,
+        *,
+        storage_uri: str,
+        source_path: str,
+        content_type: str,
+    ) -> tuple[str | None, str, dict[str, Any] | None]:
+        """Resolve or build the PageIndex document for a file being registered.
+
+        Returns:
+            A ``(doc_id, tree_status, failure)`` triple. Sources that are not
+            PDF/Markdown give ``(None, "not_built", None)``. A cached or newly
+            indexed document gives ``(doc_id, "built", None)``. An unresolvable
+            local path or an indexing exception gives ``(None, "failed",
+            failure_record)``.
+        """
+        if self._source_format(source_path, content_type) not in {"pdf", "markdown"}:
+            return None, "not_built", None
+        client = self._pageindex_client()
+        local_path = self._canonical_source_path(storage_uri=storage_uri, source_path=source_path)
+        existing = self._find_cached_pageindex_doc_id(client, local_path)
+        if existing:
+            return existing, "built", None
+        if local_path is None:
+            failure = self._pageindex_tree_failure_record(
+                source="PageIndexFileSystem.registration",
+                error_type="UnresolvableSourcePath",
+                message=(
+                    "PageIndex source path must resolve to a local file path for "
+                    "PDF/Markdown registration."
+                ),
+            )
+            return None, "failed", failure
+        try:
+            new_doc_id = client.index(local_path)
+        except Exception as exc:
+            # Indexing failures are recorded on the file instead of aborting registration.
+            failure = self._pageindex_tree_failure_record(
+                source="PageIndexClient.index",
+                error_type=exc.__class__.__name__,
+                message=str(exc) or exc.__class__.__name__,
+            )
+            return None, "failed", failure
+        return new_doc_id, "built", None
+
+    @staticmethod
+    def _pageindex_tree_failure_record(
+        *,
+        source: str,
+        error_type: str,
+        message: str,
+    ) -> dict[str, Any]:
+        """Build the failure record stored when a PageIndex tree build fails."""
+        return dict(
+            status="failed",
+            owner="pageindex",
+            source=source,
+            error_type=error_type,
+            message=message,
+        )
+
+    def _find_cached_pageindex_doc_id(
+        self,
+        client: PageIndexClient,
+        source_path: str | None,
+    ) -> str | None:
+        """Return the id of the first client document whose path matches.
+
+        Each document's ``path`` is canonicalised before it is compared with
+        ``source_path``. ``None`` is returned for a ``None`` path or when
+        nothing matches.
+        """
+        if source_path is None:
+            return None
+        matching_ids = (
+            doc_id
+            for doc_id, doc in client.documents.items()
+            if self._canonical_path(doc.get("path")) == source_path
+        )
+        return next(matching_ids, None)
+
+    def _canonical_source_path(self, *, storage_uri: str, source_path: str) -> str | None:
+        """Derive a canonical local path from the storage URI or source path.
+
+        Preference order: the unquoted path of a ``file`` URI, a non-empty
+        scheme-less ``storage_uri``, then ``source_path`` if it is absolute
+        after ``~`` expansion. ``None`` means no local path could be derived.
+        """
+        uri = urlparse(storage_uri)
+        scheme = uri.scheme
+        if scheme == "file":
+            candidate = unquote(uri.path)
+        elif storage_uri and not scheme:
+            candidate = storage_uri
+        elif Path(source_path).expanduser().is_absolute():
+            candidate = source_path
+        else:
+            return None
+        return self._canonical_path(candidate)
+
+    @staticmethod
+    def _canonical_path(path: Any) -> str | None:
+        """Expand ``~`` and resolve ``path`` non-strictly; falsy input gives ``None``."""
+        if path:
+            expanded = os.path.expanduser(str(path))
+            return str(Path(expanded).resolve(strict=False))
+        return None
+
+    @staticmethod
+    def _client_json(payload: str) -> Any:
+        """Decode a PageIndexClient JSON response.
+
+        Args:
+            payload: The raw response, expected to be a JSON string.
+
+        Returns:
+            The decoded value, or ``{"error": ...}`` when the payload cannot
+            be decoded. Callers already treat a dict with an ``error`` key as
+            an unavailable result.
+        """
+        try:
+            return json.loads(payload)
+        except (TypeError, ValueError):
+            # json.JSONDecodeError is a ValueError. TypeError covers payloads
+            # that are not str/bytes (for example None), which previously
+            # escaped as an unhandled exception.
+            return {"error": f"Invalid PageIndexClient JSON response: {payload}"}
+
+    def _metadata_schema(self) -> dict[str, Any]:
+        """Return the schema exported by ``self.metadata``."""
+        exported = self.metadata.export_schema()
+        return exported
+
+    def _register_metadata_schema(self, schema: dict[str, Any]) -> None:
+        """Register ``schema`` with ``self.metadata``."""
+        registry = self.metadata
+        registry.register_schema(schema)
+
+    def _create_folder(self, path: str) -> str:
+        """Delegate to the public ``create_folder`` and return its result."""
+        created = self.create_folder(path)
+        return created
+
+    def _prepare_file_record(self, file: dict[str, Any]) -> dict[str, Any]:
+        """Turn one registration input mapping into a normalized file record.
+
+        Required keys are ``storage_uri`` and ``source_path``; the remaining
+        keys are read with ``file.get``. For PDF/Markdown sources this resolves
+        or builds the PageIndex document through
+        ``_registration_pageindex_pointer``. When no ``text_artifact_path`` is
+        supplied, the text artifact is written to the store here.
+
+        Raises:
+            ValueError: If ``metadata`` is not a mapping, or if one of the
+                removed/renamed legacy keys is present in ``file``.
+        """
+        storage_uri = file["storage_uri"]
+        raw_source_path = str(file["source_path"])
+        # The stored source_path has surrounding slashes removed; the raw value
+        # is still what gets passed to the PageIndex registration helpers below.
+        source_path = raw_source_path.strip("/")
+        metadata = file.get("metadata") or {}
+        if not isinstance(metadata, dict):
+            raise ValueError("metadata must be a JSON object")
+        # NOTE(review): the legacy key names are assembled by concatenation,
+        # presumably so the removed names never appear literally in the source
+        # -- confirm the intent.
+        legacy_value_key = "derived_" + "metadata"
+        legacy_policy_key = "metadata_" + "generation_policy"
+        legacy_status_key = "metadata_" + "generation_status"
+        if legacy_value_key in file:
+            raise ValueError("legacy generated metadata map has been removed; put values in metadata")
+        if legacy_policy_key in file:
+            raise ValueError("legacy metadata policy key has been renamed to metadata_policy")
+        if legacy_status_key in file:
+            raise ValueError("legacy metadata status key has been renamed to metadata_status")
+        self._validate_register_metadata(metadata)
+        external_id = file.get("external_id")
+        content = file.get("content") or ""
+        content_type = file.get("content_type") or "text/plain"
+        # May index the document via PageIndexClient; failures come back as a
+        # failure record rather than an exception.
+        (
+            pageindex_doc_id,
+            pageindex_tree_status,
+            pageindex_tree_failure,
+        ) = self._registration_pageindex_pointer(
+            storage_uri=storage_uri,
+            source_path=raw_source_path,
+            content_type=content_type,
+        )
+        # Text extracted from a built PageIndex document replaces the supplied
+        # content; otherwise the supplied content is used as-is.
+        artifact_content = self._registration_text_artifact_content(
+            source_path=raw_source_path,
+            content_type=content_type,
+            pageindex_doc_id=pageindex_doc_id,
+            pageindex_tree_status=pageindex_tree_status,
+            fallback_content=content,
+        )
+        # An explicit fts_content overrides what is stored under "content".
+        fts_content = file.get("fts_content", artifact_content)
+        source_type = file.get("source_type") or self._infer_source_type(source_path)
+        metadata_policy = self._normalize_metadata_policy(
+            file.get("metadata_policy"),
+            metadata=metadata,
+        )
+        metadata_status = self._metadata_status_state(
+            metadata_policy,
+            metadata=metadata,
+            status=file.get("metadata_status"),
+        )
+        self._attach_pageindex_tree_failure(metadata_status, pageindex_tree_failure)
+        indexed_metadata = SQLiteFileSystemStore.indexed_metadata_values(metadata)
+        searchable_metadata = dict(metadata)
+        folder_path = normalize_path(file.get("folder_path") or "/")
+        title = file.get("title") or metadata.get("title") or Path(source_path).stem
+        file_ref = make_file_ref(external_id or source_path)
+        text_artifact_path = file.get("text_artifact_path")
+        # Ownership flags record which artifacts this call is responsible for;
+        # they are what _cleanup_failed_register_artifacts checks before unlinking.
+        owns_text_artifact = text_artifact_path is None
+        if text_artifact_path is None:
+            text_artifact_path = self.store.write_text_artifact(file_ref, artifact_content)
+        raw_artifact_path = file.get("raw_artifact_path")
+        owns_raw_artifact = False
+        # Only the default raw artifact path is computed here; nothing is
+        # written to it in this method.
+        if raw_artifact_path is None and file.get("write_raw_artifact", True):
+            raw_artifact_path = self.store.raw_dir / f"{file_ref}.json"
+            owns_raw_artifact = True
+        descriptor = self._build_descriptor(title, metadata)
+        return {
+            "file_ref": file_ref,
+            "external_id": external_id,
+            "storage_uri": storage_uri,
+            "source_path": source_path,
+            "title": title,
+            "descriptor": descriptor,
+            "content_type": content_type,
+            "source_type": source_type,
+            "fingerprint": fingerprint(artifact_content),
+            "text_artifact_path": str(text_artifact_path),
+            "raw_artifact_path": str(raw_artifact_path) if raw_artifact_path is not None else None,
+            "pageindex_doc_id": pageindex_doc_id,
+            "pageindex_tree_status": pageindex_tree_status,
+            "metadata": metadata,
+            "metadata_json": json.dumps(metadata, ensure_ascii=False),
+            "metadata_status": metadata_status,
+            "metadata_status_json": json.dumps(metadata_status, ensure_ascii=False),
+            "indexed_metadata": indexed_metadata,
+            "metadata_text": metadata_text(searchable_metadata),
+            "folder_path": folder_path,
+            "content": fts_content,
+            "skip_fts": bool(file.get("skip_fts", False)),
+            "_pifs_owned_text_artifact": owns_text_artifact,
+            "_pifs_owned_raw_artifact": owns_raw_artifact,
+        }
+
+    def _registration_text_artifact_content(
+        self,
+        *,
+        source_path: str,
+        content_type: str,
+        pageindex_doc_id: str | None,
+        pageindex_tree_status: str,
+        fallback_content: str,
+    ) -> str:
+        """Choose the text stored as a file's text artifact at registration.
+
+        Text extracted from PageIndex is used only for PDF/Markdown sources
+        whose tree status is ``built`` and that have a document id. Every
+        other case returns ``fallback_content`` unchanged.
+        """
+        is_document = self._source_format(source_path, content_type) in {"pdf", "markdown"}
+        if is_document and pageindex_tree_status == "built" and pageindex_doc_id:
+            return self._pageindex_extracted_text(pageindex_doc_id)
+        return fallback_content
+
+    def _pageindex_extracted_text(self, doc_id: str) -> str:
+        """Return the text held by the PageIndex client for ``doc_id``.
+
+        Page content is preferred; text collected from the structure tree is
+        the fallback. An unknown ``doc_id`` yields an empty string.
+        """
+        client = self._pageindex_client()
+        if doc_id not in client.documents:
+            return ""
+        client._ensure_doc_loaded(doc_id)
+        document = client.documents.get(doc_id) or {}
+        from_pages = self._pageindex_pages_text(document.get("pages"))
+        return from_pages or self._pageindex_structure_text(document.get("structure", []))
+
+    @staticmethod
+    def _pageindex_pages_text(pages: Any) -> str:
+        """Join the non-empty, stripped ``content`` of each page mapping.
+
+        Non-list input gives an empty string; list items that are not dicts
+        are skipped. Parts are separated by a blank line.
+        """
+        if not isinstance(pages, list):
+            return ""
+        stripped = (
+            str(page.get("content") or "").strip()
+            for page in pages
+            if isinstance(page, dict)
+        )
+        return "\n\n".join(chunk for chunk in stripped if chunk)
+
+    @classmethod
+    def _pageindex_structure_text(cls, structure: Any) -> str:
+        """Join all node text found in ``structure``, separated by blank lines."""
+        collected: list[str] = []
+        cls._collect_pageindex_node_text(structure, collected)
+        separator = "\n\n"
+        return separator.join(collected)
+
+    @classmethod
+    def _collect_pageindex_node_text(cls, node: Any, parts: list[str]) -> None:
+        """Append each node's stripped, non-empty ``text`` to ``parts``.
+
+        The traversal is depth-first in document order: a node's own text is
+        added before the text of its ``nodes`` children. Values that are
+        neither lists nor dicts are ignored.
+        """
+        if isinstance(node, dict):
+            own_text = str(node.get("text") or "").strip()
+            if own_text:
+                parts.append(own_text)
+            cls._collect_pageindex_node_text(node.get("nodes", []), parts)
+        elif isinstance(node, list):
+            for child in node:
+                cls._collect_pageindex_node_text(child, parts)
+
+    @staticmethod
+    def _raw_artifact_payload(
+        *,
+        storage_uri: str,
+        source_path: str,
+        folder_path: str,
+        metadata: dict[str, Any],
+        metadata_status: dict[str, Any],
+    ) -> dict[str, Any]:
+        """Assemble the mapping that is written as a file's raw artifact."""
+        return dict(
+            storage_uri=storage_uri,
+            source_path=source_path,
+            folder_path=folder_path,
+            metadata=metadata,
+            metadata_status=metadata_status,
+        )
+
+    def _sync_owned_raw_artifact(self, record: dict[str, Any]) -> None:
+        """Rewrite the raw artifact when it lives at the store's default path.
+
+        Records with no raw artifact path, or with a path other than
+        ``<raw_dir>/<file_ref>.json``, are left untouched. Otherwise the
+        payload is written and ``record["raw_artifact_path"]`` is updated to
+        the written path.
+        """
+        current = record.get("raw_artifact_path")
+        if not current:
+            return
+        default_path = self.store.raw_dir / f"{record['file_ref']}.json"
+        resolved_current = Path(current).expanduser().resolve(strict=False)
+        if resolved_current != default_path.resolve(strict=False):
+            return
+        payload = self._raw_artifact_payload(
+            storage_uri=record["storage_uri"],
+            source_path=record["source_path"],
+            folder_path=record["folder_path"],
+            metadata=record["metadata"],
+            metadata_status=record["metadata_status"],
+        )
+        written = self.store.write_raw_artifact(record["file_ref"], payload)
+        record["raw_artifact_path"] = str(written)
+
+    def _record_from_file_entry(self, entry: Any) -> dict[str, Any]:
+        """Rebuild a registration-style record from a stored file entry.
+
+        The stored text is read back as ``content``, the metadata status is
+        recomputed from the stored policy and status, and any stored
+        ``pageindex_tree`` record is carried over.
+        """
+        body = self.store.read_text(entry.file_ref)
+        policy = self._normalize_metadata_policy(
+            entry.metadata_status.get("policy", {}),
+            metadata=entry.metadata,
+        )
+        status_doc = self._metadata_status_state(
+            policy,
+            metadata=entry.metadata,
+            status=entry.metadata_status.get("status"),
+        )
+        stored_tree_record = entry.metadata_status.get("pageindex_tree")
+        self._attach_pageindex_tree_failure(status_doc, stored_tree_record)
+        record = {
+            "file_ref": entry.file_ref,
+            "external_id": entry.external_id,
+            "storage_uri": entry.storage_uri,
+            "source_path": entry.source_path,
+            "title": entry.title,
+            "descriptor": entry.descriptor,
+            "content_type": entry.content_type,
+            "source_type": entry.source_type,
+            "fingerprint": entry.fingerprint,
+            "text_artifact_path": entry.text_artifact_path,
+            "raw_artifact_path": entry.raw_artifact_path,
+            "pageindex_doc_id": entry.pageindex_doc_id,
+            "pageindex_tree_status": entry.pageindex_tree_status,
+            # Copied so later edits to the record do not touch the entry.
+            "metadata": dict(entry.metadata),
+            "metadata_json": json.dumps(entry.metadata, ensure_ascii=False),
+            "metadata_status": status_doc,
+            "metadata_status_json": json.dumps(status_doc, ensure_ascii=False),
+            "indexed_metadata": SQLiteFileSystemStore.indexed_metadata_values(entry.metadata),
+            "metadata_text": metadata_text(entry.metadata),
+            "folder_path": entry.folder_path,
+            "content": body,
+            "skip_fts": False,
+        }
+        return record
+
+    def _generate_register_metadata(self, record: dict[str, Any], *, force: bool = False) -> None:
+        """Generate the requested metadata fields for ``record`` in place.
+
+        Batch policies are deferred (requested fields are marked
+        ``pending_submit``) unless ``force`` is true. If no field needs
+        generating the method returns without changes. Generated values are
+        written into ``record["metadata"]`` and the per-field outcome into
+        ``record["metadata_status"]["fields"]``.
+
+        Raises:
+            MetadataGenerationError: If the policy requires synchronous
+                generation but no ``metadata_generator`` is configured.
+        """
+        status = record["metadata_status"]
+        policy = status.get("policy", {})
+        if self._metadata_policy_is_batch(policy) and not force:
+            self._mark_requested_generation_status(record, "pending_submit")
+            return
+        fields = self._metadata_fields_to_generate(record)
+        if not fields:
+            return
+        if self.metadata_generator is None:
+            if self._metadata_policy_requires_sync(policy):
+                raise MetadataGenerationError(
+                    "metadata_generator is required for synchronous PIFS metadata generation; "
+                    "set metadata_policy batch=true to defer"
+                )
+            # Without a generator and without a sync requirement, leave the record as-is.
+            return
+        try:
+            # The text artifact is read inside the try block, so a read error
+            # is treated the same way as a generator failure.
+            result = self.metadata_generator.generate(
+                MetadataGenerationInput(
+                    file_ref=record["file_ref"],
+                    external_id=record.get("external_id"),
+                    title=record["title"],
+                    source_path=record["source_path"],
+                    content_type=record["content_type"],
+                    source_type=record.get("source_type"),
+                    text=Path(record["text_artifact_path"]).read_text(encoding="utf-8"),
+                    metadata=dict(record.get("metadata") or {}),
+                    text_artifact_path=record.get("text_artifact_path"),
+                ),
+                fields=fields,
+            )
+            # A plain dict result is treated as the generated values.
+            if isinstance(result, dict):
+                result = MetadataGenerationResult(values=result)
+        except Exception as exc:
+            # Any failure marks every requested field as failed instead of propagating.
+            self._apply_metadata_status_failures(record, fields, str(exc))
+            return
+        failures = dict(result.failures)
+        for field in fields:
+            if field in result.values:
+                record["metadata"][field] = result.values[field]
+                status["fields"][field] = {
+                    "requested": True,
+                    "status": "generated",
+                    "owner": "pifs",
+                    "source": "llm",
+                }
+            else:
+                # A requested field missing from the result counts as a failure
+                # unless the generator already reported a reason for it.
+                failures.setdefault(field, "metadata generator did not return field")
+        # Failures are applied after successes, so a field present in both
+        # ``values`` and ``failures`` ends up with status "failed".
+        for field, reason in failures.items():
+            status["fields"][field] = {
+                "requested": True,
+                "status": "failed",
+                "owner": "pifs",
+                "source": "llm",
+                "error": str(reason),
+            }
+        self._refresh_record_metadata_status(record)
+
+    def _complete_summary_projection_index(self, record: dict[str, Any]) -> bool:
+        """Update the summary projection index state for ``record``.
+
+        Returns ``False`` when the summary projection was not requested or the
+        record has no non-blank ``summary`` metadata value. Otherwise the
+        record's metadata status is refreshed and ``True`` is returned; when
+        an indexer is configured, its result (or its failure) is recorded
+        first.
+        """
+        projection = record["metadata_status"].get("projection_indexes", {}).get("summary")
+        if not projection or not projection.get("requested"):
+            return False
+        if not str(record.get("metadata", {}).get("summary") or "").strip():
+            return False
+        if self.summary_projection_indexer is not None:
+            try:
+                outcome = self.summary_projection_indexer.upsert_summary(record)
+            except Exception as exc:
+                projection["status"] = "failed"
+                projection["error"] = str(exc)
+            else:
+                # Replace the previous state with the indexer's result.
+                projection.clear()
+                projection.update({"requested": True, **outcome})
+                if projection.get("status") != "ready":
+                    projection["status"] = "ready"
+        self._refresh_record_metadata_status(record)
+        return True
+
+    @staticmethod
+    def _unlink_artifact(path: Any) -> None:
+        """Delete the file at ``path``; a file that is already gone is ignored."""
+        Path(path).unlink(missing_ok=True)
+
+    def _cleanup_failed_register_artifacts(self, records: list[dict[str, Any]]) -> None:
+        """Remove the artifacts that the given records are flagged as owning.
+
+        Only paths guarded by the ``_pifs_owned_*`` flags are unlinked;
+        artifacts without the flag are left in place.
+        """
+        for record in records:
+            doomed = []
+            if record.get("_pifs_owned_text_artifact"):
+                doomed.append(record["text_artifact_path"])
+            if record.get("_pifs_owned_raw_artifact") and record.get("raw_artifact_path"):
+                doomed.append(record["raw_artifact_path"])
+            for artifact_path in doomed:
+                self._unlink_artifact(artifact_path)
+
+    @staticmethod
+    def _metadata_policy_is_batch(policy: dict[str, Any]) -> bool:
+        """Return True when ``batch`` is truthy or ``mode`` equals ``batch``."""
+        if policy.get("batch"):
+            return True
+        return policy.get("mode") == "batch"
+
+    @staticmethod
+    def _metadata_policy_requires_sync(policy: dict[str, Any]) -> bool:
+        """Return True when ``batch`` is exactly ``False`` or ``mode`` is ``sync``.
+
+        A missing ``batch`` key does not count as a sync requirement.
+        """
+        if policy.get("batch") is False:
+            return True
+        return policy.get("mode") == "sync"
+
+    def _metadata_fields_to_generate(self, record: dict[str, Any]) -> list[str]:
+        """List requested fields that still need a generated value.
+
+        A requested field is skipped only when its status is ``generated``
+        and its value is already present in ``record["metadata"]``.
+        """
+        field_states = record["metadata_status"].get("fields", {})
+        return [
+            name
+            for name, state in field_states.items()
+            if state.get("requested")
+            and not (state.get("status") == "generated" and name in record["metadata"])
+        ]
+
+    def _mark_requested_generation_status(self, record: dict[str, Any], status: str) -> None:
+        """Set ``status`` on every requested field that is not yet generated.
+
+        The record-level metadata status is then refreshed with ``status`` as
+        the explicit overall status.
+        """
+        field_states = record["metadata_status"].get("fields", {})
+        for name, state in field_states.items():
+            if not state.get("requested") or state.get("status") == "generated":
+                continue
+            # Only the value of an existing key is replaced, so iteration stays valid.
+            field_states[name] = dict(
+                requested=True,
+                status=status,
+                owner="pifs",
+                source="llm",
+            )
+        self._refresh_record_metadata_status(record, explicit_status=status)
+
+    def _apply_metadata_status_failures(
+        self,
+        record: dict[str, Any],
+        fields: list[str],
+        reason: str,
+    ) -> None:
+        """Mark each of ``fields`` as failed with ``reason``.
+
+        The record-level metadata status is then refreshed with ``failed`` as
+        the explicit overall status.
+        """
+        for name in fields:
+            record["metadata_status"]["fields"][name] = dict(
+                requested=True,
+                status="failed",
+                owner="pifs",
+                source="llm",
+                error=reason,
+            )
+        self._refresh_record_metadata_status(record, explicit_status="failed")
+
+    def _refresh_record_metadata_status(
+        self,
+        record: dict[str, Any],
+        *,
+        explicit_status: str | None = None,
+    ) -> None:
+        """Recompute the overall status and the derived metadata keys.
+
+        The overall status is ``explicit_status`` when given; otherwise it is
+        aggregated from the statuses of the requested fields. The JSON,
+        indexed-value and text projections of the metadata are then rebuilt on
+        ``record``.
+        """
+        status_doc = record["metadata_status"]
+        requested_states = []
+        for state in status_doc.get("fields", {}).values():
+            if state.get("requested") and state.get("status"):
+                requested_states.append(state.get("status"))
+        status_doc["status"] = explicit_status or self._aggregate_metadata_status(requested_states)
+        self._refresh_projection_index_statuses(status_doc, record["metadata"])
+        record["metadata_json"] = json.dumps(record["metadata"], ensure_ascii=False)
+        record["metadata_status_json"] = json.dumps(status_doc, ensure_ascii=False)
+        record["indexed_metadata"] = SQLiteFileSystemStore.indexed_metadata_values(record["metadata"])
+        record["metadata_text"] = metadata_text(record["metadata"])
+
+    def _open_lines(self, file_ref: str, start: int, end: int) -> OpenResult:
+        """Return the 1-based inclusive line range ``start``..``end`` of a file.
+
+        ``start`` is raised to at least 1, and ``end`` is clamped to lie
+        between ``start`` and the number of lines in the file.
+        """
+        entry = self.store.get_file(file_ref)
+        all_lines = self.store.read_text(file_ref).splitlines()
+        first = max(1, start)
+        last = min(max(first, end), len(all_lines))
+        selected = all_lines[first - 1:last]
+        return OpenResult(
+            file_ref=file_ref,
+            start_line=first,
+            end_line=last,
+            text="\n".join(selected),
+            external_id=entry.external_id,
+            folder_path=entry.folder_path,
+            source_path=entry.source_path,
+        )
+
+    def _open_all(self, file_ref: str) -> OpenResult:
+        """Return the whole text of a file, spanning line 1 to its last line."""
+        entry = self.store.get_file(file_ref)
+        full_text = self.store.read_text(file_ref)
+        return OpenResult(
+            file_ref=file_ref,
+            start_line=1,
+            end_line=len(full_text.splitlines()),
+            text=full_text,
+            external_id=entry.external_id,
+            folder_path=entry.folder_path,
+            source_path=entry.source_path,
+        )
+
+    @classmethod
+    def _structural_unavailable(
+        cls,
+        mode: str,
+        entry: Any,
+        *,
+        message: str,
+        node_id: str | None = None,
+        pages: str | None = None,
+    ) -> dict[str, Any]:
+        """Build the ``available: False`` payload for a structural read.
+
+        If the entry carries a recorded PageIndex tree failure and its tree
+        status is ``failed``, that failure replaces ``message``. The recorded
+        failure text is also exposed as ``pageindex_tree_error`` whenever it
+        exists. ``node_id`` and ``pages`` are included only when given.
+        """
+        tree_error = cls._pageindex_tree_failure_message(entry.metadata_status)
+        if tree_error and entry.pageindex_tree_status == "failed":
+            message = f"PageIndex tree build failed: {tree_error}"
+        payload = {
+            "mode": mode,
+            "file_ref": entry.file_ref,
+            "external_id": entry.external_id,
+            "source_path": entry.source_path,
+            "status": entry.pageindex_tree_status,
+            "available": False,
+            "message": message,
+        }
+        optional_items = (
+            ("pageindex_tree_error", tree_error, bool(tree_error)),
+            ("node_id", node_id, node_id is not None),
+            ("pages", pages, pages is not None),
+        )
+        for key, value, include in optional_items:
+            if include:
+                payload[key] = value
+        return payload
+
+    @staticmethod
+    def _attach_pageindex_tree_failure(
+        metadata_status: dict[str, Any], pageindex_tree_failure: Any
+    ) -> None:
+        """Copy a non-empty failure dict into ``metadata_status["pageindex_tree"]``."""
+        if isinstance(pageindex_tree_failure, dict) and len(pageindex_tree_failure) > 0:
+            metadata_status["pageindex_tree"] = {**pageindex_tree_failure}
+
+    @staticmethod
+    def _pageindex_tree_failure_message(metadata_status: Any) -> str | None:
+        """Describe a failed PageIndex tree build from ``metadata_status``, or return None."""
+        tree = None
+        if isinstance(metadata_status, dict):
+            tree = metadata_status.get("pageindex_tree")
+        if not isinstance(tree, dict) or tree.get("status") != "failed":
+            return None
+        parts = [
+            str(tree.get(key) or "").strip()
+            for key in ("error_type", "message")
+        ]
+        present = [part for part in parts if part]
+        return ": ".join(present) or None
+
+    def _resolve_target(self, target: str) -> str:
+        return self.store.resolve_file_ref(target)  # resolution is delegated entirely to the store
+
+    def _should_use_semantic_retrieval(
+        self,
+        query: Union[str, list[str], None],
+        scope: Optional[dict[str, Any]],
+    ) -> bool:
+        """Tell whether a search should go through the semantic backend.
+
+        Requires a backend, non-empty query text, and a recursive (or absent) scope.
+        """
+        if self.semantic_retrieval_backend is None or not self._query_text(query):
+            return False
+        return bool(scope.get("recursive", True)) if scope else True
+
+    def _semantic_search(
+        self,
+        query: Union[str, list[str], None],
+        *,
+        scope: Optional[dict[str, Any]],
+        metadata_filter: Optional[dict[str, Any]],
+        limit: int,
+        channel: str | None = None,
+    ) -> list[SearchResult]:
+        if self.semantic_retrieval_backend is None:  # no backend configured: nothing to search
+            return []
+        filters = self._semantic_filters_for_scope(scope)
+        fetch_limit = max(limit * 10, 50)  # over-fetch: candidates are filtered and de-duplicated below
+        query_text = self._query_text(query)
+        if channel:
+            search_channel = getattr(self.semantic_retrieval_backend, "search_channel", None)
+            if search_channel is None:  # backend offers no per-channel search
+                return []
+            candidates = search_channel(
+                channel,
+                query_text,
+                limit=fetch_limit,
+                filters=filters,
+            )
+        else:
+            candidates = self.semantic_retrieval_backend.search(
+                query_text,
+                limit=fetch_limit,
+                filters=filters,
+            )
+        results: list[SearchResult] = []
+        seen: set[str] = set()  # file_refs already emitted
+        scope_path = self._scope_folder_path(scope)
+        for candidate in candidates:
+            try:
+                file_ref = self.store.resolve_file_ref(candidate.document_id)
+            except KeyError:  # candidate the store cannot resolve: skip it
+                continue
+            if file_ref in seen:  # keep only the first hit per file
+                continue
+            if not self.store.file_matches(file_ref, scope=scope, metadata_filter=metadata_filter):
+                continue
+            seen.add(file_ref)
+            entry = self.store.get_file(file_ref)
+            folder_paths = [
+                folder["path"]
+                for folder in self.store.folder_memberships(file_ref)
+            ]
+            folder_path = self._preferred_folder_path(folder_paths, scope_path, entry.folder_path)
+            results.append(
+                SearchResult(
+                    file_ref=file_ref,
+                    external_id=entry.external_id,
+                    title=entry.title,
+                    snippet=candidate.snippet or entry.descriptor,  # descriptor stands in for a missing snippet
+                    folder_path=folder_path,
+                    folder_paths=folder_paths,
+                    metadata=entry.metadata,
+                    metadata_status=entry.metadata_status,
+                    source_path=entry.source_path,
+                    id=entry.external_id or file_ref,  # file_ref stands in for a missing external id
+                    document_id=entry.external_id,
+                    name=entry.title,
+                    description=entry.descriptor,
+                    status=entry.pageindex_tree_status,
+                    pageNum=None,
+                    createdAt=None,
+                    folderId=None,
+                )
+            )
+            if len(results) >= limit:  # stop once enough candidates survived filtering
+                break
+        return results
+
+    @staticmethod
+    def _build_descriptor(title: str, metadata: dict[str, Any]) -> str:
+        origin = next(filter(None, map(metadata.get, ("source_type", "repo", "channel"))), None)
+        return title if not origin else f"{title} ({origin})"  # bare title when no origin is set
+
+    @staticmethod
+    def _validate_register_metadata(metadata: dict[str, Any]) -> None:
+        """Reject caller metadata that names a field listed in DEFAULT_METADATA_GENERATION_FIELDS."""
+        reserved = set(DEFAULT_METADATA_GENERATION_FIELDS)
+        clashing = sorted(name for name in metadata if name in reserved)
+        if clashing:
+            raise ValueError(
+                f"metadata contains PIFS-owned generated field(s): {', '.join(clashing)}"
+                "; configure metadata_policy instead of passing generated fields"
+            )
+
+    def _register_generation_policy_schema(self, records: list[dict[str, Any]]) -> None:
+        """Register metadata field schemas derived from ``records``.
+
+        Within a record, keys requested by its generation policy go to the "pifs"
+        schema and every other metadata key goes to the "user" schema.
+        Types come from DEFAULT_METADATA_FIELD_TYPES when declared, else are inferred.
+        """
+        infer = self._infer_metadata_field_type
+        schemas: dict[str, dict[str, dict[str, str]]] = {"pifs": {}, "user": {}}
+        for record in records:
+            policy_fields = record["metadata_status"]["policy"]["fields"]
+            requested = [name for name, wanted in policy_fields.items() if wanted]
+            # Requested fields: a declared default type wins over the inferred one.
+            for name in requested:
+                inferred = infer(record.get("metadata", {}).get(name))
+                schemas["pifs"][name] = {"type": DEFAULT_METADATA_FIELD_TYPES.get(name, inferred)}
+            generated = {str(name) for name in requested}
+            # Other metadata keys keep the first type recorded for them.
+            for name, value in record.get("metadata", {}).items():
+                owner = "pifs" if name in generated else "user"
+                schemas[owner].setdefault(name, {"type": infer(value)})
+        # Empty schemas are not registered.
+        for owner, fields in schemas.items():
+            if fields:
+                self.metadata.register_schema({"fields": fields}, source=owner)
+
+    @classmethod
+    def _normalize_metadata_policy(
+        cls,
+        policy: Optional[dict[str, Any]],
+        *,
+        metadata: dict[str, Any],  # NOTE(review): not read anywhere in this method
+    ) -> dict[str, Any]:
+        fields = dict(DEFAULT_METADATA_GENERATION_FIELDS)  # start from the defaults; the policy overrides per field
+        field_statuses: dict[str, str] = {}
+        projection_indexes: dict[str, bool] | None = None  # None = policy did not mention projection_indexes
+        projection_index_statuses: dict[str, str] = {}
+        mode = None
+        batch = None
+        top_level_status = None
+        if policy is not None:
+            if not isinstance(policy, dict):
+                raise ValueError("metadata_policy must be a JSON object")
+            raw_fields = policy.get("fields")
+            if raw_fields is None:  # shorthand form: field declarations sit at the top level of the policy
+                raw_fields = {
+                    name: declaration
+                    for name, declaration in policy.items()
+                    if name not in {"batch", "mode", "status", "projection_indexes"}
+                }
+            if not isinstance(raw_fields, dict):
+                raise ValueError("metadata_policy fields must be a JSON object")
+            for name, declaration in raw_fields.items():
+                name = str(name)
+                if isinstance(declaration, bool):  # bare bool = requested flag
+                    fields[name] = declaration
+                    continue
+                if isinstance(declaration, dict):
+                    fields[name] = bool(  # "enabled" wins over "requested"; both absent means True
+                        declaration.get("enabled", declaration.get("requested", True))
+                    )
+                    field_status = declaration.get("status")
+                    if field_status is not None:
+                        cls._validate_metadata_status(str(field_status))
+                        field_statuses[name] = str(field_status)
+                    continue
+                raise ValueError(f"Invalid metadata generation policy for field: {name}")
+            mode = policy.get("mode")
+            if "batch" in policy:  # an explicit "batch" key takes precedence over mode
+                batch = bool(policy["batch"])
+            elif mode == "batch":
+                batch = True
+            top_level_status = policy.get("status")
+            if top_level_status is not None:
+                cls._validate_metadata_status(str(top_level_status))
+            if "projection_indexes" in policy:
+                projection_indexes, projection_index_statuses = (
+                    cls._normalize_projection_index_policy(policy["projection_indexes"])
+                )
+        normalized: dict[str, Any] = {
+            "fields": fields,
+            "projection_indexes": (
+                projection_indexes
+                if projection_indexes is not None
+                else {"summary": bool(fields.get("summary", False))}  # default follows the "summary" field flag
+            ),
+        }
+        if field_statuses:  # optional keys are emitted only when they carry a value
+            normalized["field_statuses"] = field_statuses
+        if projection_index_statuses:
+            normalized["projection_index_statuses"] = projection_index_statuses
+        if mode:
+            normalized["mode"] = str(mode)
+        if batch is not None:
+            normalized["batch"] = batch
+        if top_level_status:
+            normalized["status"] = str(top_level_status)
+        return normalized
+
+    @classmethod
+    def _metadata_status_state(
+        cls,
+        policy: dict[str, Any],
+        *,
+        metadata: dict[str, Any],
+        status: Optional[str],
+    ) -> dict[str, Any]:
+        explicit_status = status or policy.get("status")  # the argument takes precedence over the policy
+        if explicit_status is not None:
+            explicit_status = str(explicit_status)
+            cls._validate_metadata_status(explicit_status)
+        field_statuses = policy.get("field_statuses", {})
+        fields: dict[str, dict[str, Any]] = {}
+        for name, requested in policy["fields"].items():
+            if not requested:  # unrequested fields are recorded as skipped
+                fields[name] = {
+                    "requested": False,
+                    "status": "skipped",
+                    "owner": "pifs",
+                    "source": "llm",
+                }
+                continue
+            field_status = field_statuses.get(name)  # precedence: per-field, then explicit, then inferred
+            if field_status is None:
+                field_status = explicit_status
+            if field_status is None:
+                field_status = "generated" if name in metadata else "pending_generate"  # a present value counts as generated
+            fields[name] = {
+                "requested": True,
+                "status": field_status,
+                "owner": "pifs",
+                "source": "llm",
+            }
+
+        requested_statuses = [
+            item["status"]
+            for item in fields.values()
+            if item.get("requested") and item.get("status")
+        ]
+        aggregate_status = explicit_status or cls._aggregate_metadata_status(requested_statuses)
+        policy_summary = {
+            "fields": dict(policy["fields"]),  # copies, so the state does not alias the policy dicts
+            "projection_indexes": dict(policy.get("projection_indexes", {})),
+        }
+        if "mode" in policy:
+            policy_summary["mode"] = policy["mode"]
+        if "batch" in policy:
+            policy_summary["batch"] = policy["batch"]
+        state = {
+            "status": aggregate_status,
+            "policy": policy_summary,
+            "fields": fields,
+            "projection_indexes": {},
+        }
+        projection_statuses = policy.get("projection_index_statuses", {})
+        for name, requested in policy.get("projection_indexes", {}).items():
+            if not requested:  # unrequested projection indexes get no state entry
+                continue
+            state["projection_indexes"][name] = {
+                "requested": True,
+                "status": projection_statuses.get(name, "not_indexed"),
+                "owner": "pifs",
+                "source": "index",
+            }
+        cls._refresh_projection_index_statuses(state, metadata)  # may advance the summary index to "pending_index"
+        return state
+
+    @staticmethod
+    def _aggregate_metadata_status(statuses: list[str]) -> str:
+        """Return the first of failed/pending_submit/pending_generate present, else "generated"."""
+        present = statuses or ()
+        return next(
+            (level for level in ("failed", "pending_submit", "pending_generate") if level in present),
+            "generated",
+        )
+
+    @staticmethod
+    def _validate_metadata_status(status: str) -> None:
+        if status not in METADATA_STATUSES:  # only values listed in METADATA_STATUSES are accepted
+            raise ValueError(f"Unsupported metadata status: {status}")
+
+    @classmethod
+    def _normalize_projection_index_policy(
+        cls,
+        projection_policy: Any,
+    ) -> tuple[dict[str, bool], dict[str, str]]:
+        """Split a ``projection_indexes`` policy into enabled flags and explicit statuses."""
+        enabled: dict[str, bool] = {}
+        statuses: dict[str, str] = {}
+        if projection_policy is None:
+            return enabled, statuses
+        if not isinstance(projection_policy, dict):
+            raise ValueError("metadata_policy projection_indexes must be a JSON object")
+        for raw_name, declaration in projection_policy.items():
+            index_name = str(raw_name)
+            # Shorthand: a bare bool switches the index on or off.
+            if isinstance(declaration, bool):
+                enabled[index_name] = declaration
+            # Full form: an object with "enabled"/"requested" and an optional "status".
+            elif isinstance(declaration, dict):
+                flag = declaration.get("enabled", declaration.get("requested", True))
+                enabled[index_name] = bool(flag)
+                declared = declaration.get("status")
+                if declared is not None:
+                    cls._validate_projection_index_status(str(declared))
+                    statuses[index_name] = str(declared)
+            else:
+                raise ValueError(f"Invalid projection index policy for index: {index_name}")
+        return enabled, statuses
+
+    @staticmethod
+    def _validate_projection_index_status(status: str) -> None:
+        if status not in PROJECTION_INDEX_STATUSES:  # only values listed in PROJECTION_INDEX_STATUSES are accepted
+            raise ValueError(f"Unsupported projection index status: {status}")
+
+    @classmethod
+    def _refresh_projection_index_statuses(
+        cls,
+        metadata_status: dict[str, Any],
+        metadata: dict[str, Any],
+    ) -> None:
+        """Mark a requested, "not_indexed" summary index as "pending_index" once a summary exists."""
+        summary_entry = metadata_status.get("projection_indexes", {}).get("summary")
+        wanted = bool(summary_entry) and bool(summary_entry.get("requested"))
+        # Only the default "not_indexed" state is advanced; other statuses are kept.
+        if wanted and "summary" in metadata:
+            if summary_entry.get("status", "not_indexed") == "not_indexed":
+                summary_entry["status"] = "pending_index"
+
+    @staticmethod
+    def _infer_metadata_field_type(value: Any) -> str:
+        """Classify a value as "boolean", "number" or "string" (bool first: it subclasses int)."""
+        for python_types, schema_type in ((bool, "boolean"), ((int, float), "number")):
+            if isinstance(value, python_types):
+                return schema_type
+        return "string"
+
+    @staticmethod
+    def _infer_source_type(source_path: str) -> Optional[str]:
+        leading = (segment for segment in Path(source_path).parts if segment not in ("", "."))
+        return next(leading, None)  # first path component, or None when the path has none
+
+    @staticmethod
+    def _scope_folder_path(scope: Optional[dict[str, Any]]) -> Optional[str]:
+        """Return the normalized ``folder_path``/``path`` of a scope, or None when it has neither."""
+        scope = scope or {}
+        raw_path = scope.get("folder_path") or scope.get("path")
+        return None if not raw_path else normalize_path(raw_path)
+
+    @classmethod
+    def _semantic_filters_for_scope(cls, scope: Optional[dict[str, Any]]) -> dict[str, Any]:
+        """Derive backend filters from a scope: a ``source_type`` filter when its path names one."""
+        folder = cls._scope_folder_path(scope)
+        # The root folder (or no folder) narrows nothing.
+        source_type = cls._source_type_filter_from_path(folder) if folder and folder != "/" else ""
+        return {} if not source_type else {"source_type": source_type}
+
+    @staticmethod
+    def _source_type_filter_from_path(path: str) -> str:
+        """Extract the ``source_type`` value encoded in a folder path, or "" if there is none.
+
+        Only the first segment after an optional SEMANTIC_FOLDER_ROOT prefix is
+        inspected; a ``source_type=<value>`` segment yields ``<value>`` with
+        hyphens turned into underscores.
+        """
+        segments = [segment for segment in path.strip("/").split("/") if segment]
+        # Drop the projection root so the next segment is the one that is inspected.
+        if segments and segments[0] == SEMANTIC_FOLDER_ROOT.strip("/"):
+            segments = segments[1:]
+        if not segments or not segments[0].startswith("source_type="):
+            return ""
+        return segments[0].split("=", 1)[1].replace("-", "_")
+
+    @classmethod
+    def _validate_semantic_folder_projection_item(
+        cls,
+        item: dict[str, Any],
+        allowed_extension_fields: set[str],
+    ) -> None:
+        path = item.get("folder_path") or item.get("path")  # "folder_path" wins over "path"
+        if not path:
+            raise ValueError("Semantic Folder Projection items must include a folder path")
+        cls._validate_semantic_folder_projection_path(str(path))  # raises unless the path is under the root
+        allowed_fields = (
+            SEMANTIC_FOLDER_BASE_FIELDS
+            | SEMANTIC_FOLDER_SYSTEM_FIELDS
+            | allowed_extension_fields
+        )
+        if item.get("dataset_doc_uuid"):
+            raise ValueError(
+                "dataset_doc_uuid is not allowed in Semantic Folder Projection memberships; "
+                "use file_key or file_ref"
+            )
+        fields = []  # field names to check: the explicit "field" plus those encoded in the path
+        explicit_field = cls._canonical_semantic_folder_field_name(item.get("field"))
+        if explicit_field:
+            fields.append(explicit_field)
+        fields.extend(cls._semantic_folder_projection_fields_from_path(str(path)))
+        for payload_key in ("metadata", "folder_metadata"):  # payloads are validated before the field list
+            cls._validate_semantic_folder_projection_metadata_payload(
+                item.get(payload_key),
+                allowed_fields,
+            )
+        for field in fields:
+            if is_semantic_folder_forbidden_field(field) or field not in allowed_fields:
+                raise ValueError(f"Field is not allowed for Semantic Folder Projection: {field}")
+
+    @staticmethod
+    def _validate_semantic_folder_projection_path(path: str) -> str:
+        """Normalize ``path`` and require it to be SEMANTIC_FOLDER_ROOT or a descendant of it."""
+        candidate = normalize_path(path)
+        is_root = candidate == SEMANTIC_FOLDER_ROOT
+        if is_root or candidate.startswith(f"{SEMANTIC_FOLDER_ROOT}/"):
+            return candidate
+        raise ValueError("Semantic Folder Projection paths must be under /semantic")
+
+    @classmethod
+    def _semantic_folder_projection_fields_from_path(cls, path: str) -> list[str]:
+        """List the canonical field names of the ``field=value`` segments of a projection path."""
+        validated = cls._validate_semantic_folder_projection_path(path)
+        # [1:] skips the leading path segment, which is never treated as a field.
+        raw_names = (
+            segment.split("=", 1)[0]
+            for segment in validated.strip("/").split("/")[1:]
+            if "=" in segment
+        )
+        canonical = (cls._canonical_semantic_folder_field_name(name) for name in raw_names)
+        # Names that canonicalize to an empty value are dropped.
+        return [name for name in canonical if name]
+
+    @classmethod
+    def _validate_semantic_folder_projection_metadata_payload(
+        cls,
+        payload: Any,
+        allowed_fields: set[str],
+    ) -> None:
+        if isinstance(payload, dict):  # dicts: check every key, then recurse into every value
+            for key, value in payload.items():
+                key_text = str(key)
+                key_field = cls._canonical_semantic_folder_field_name(key)
+                if is_semantic_folder_forbidden_field(key_field):
+                    raise ValueError(
+                        "Forbidden metadata field in Semantic Folder Projection payload: "
+                        f"{key_text}"
+                    )
+                if key_field in {"field", "source_field", "metadata_field"}:  # these keys hold a field name as value
+                    field = cls._canonical_semantic_folder_field_name(value)
+                    if field and (
+                        is_semantic_folder_forbidden_field(field)
+                        or field not in allowed_fields
+                    ):
+                        raise ValueError(
+                            f"Field is not allowed for Semantic Folder Projection: {field}"
+                        )
+                cls._validate_semantic_folder_projection_metadata_payload(value, allowed_fields)
+        elif isinstance(payload, list):  # lists: validate each element
+            for item in payload:
+                cls._validate_semantic_folder_projection_metadata_payload(item, allowed_fields)
+        elif isinstance(payload, str):  # bare strings are checked against the forbidden list only
+            field = cls._canonical_semantic_folder_field_name(payload)
+            if is_semantic_folder_forbidden_field(field):
+                raise ValueError(
+                    "Forbidden metadata field label in Semantic Folder Projection payload: "
+                    f"{payload}"
+                )
+
+    @staticmethod
+    def _canonical_semantic_folder_field_name(value: Any) -> str:
+        return canonical_semantic_folder_field_name(value)  # thin wrapper over the module-level helper
+
+    @staticmethod
+    def _semantic_folder_projection_document_id(membership: dict[str, Any]) -> str:
+        keys = ("file_key", "file_ref", "document_ref")  # tried in this order
+        cleaned = (str(membership.get(key) or "").strip() for key in keys)
+        for candidate in filter(None, cleaned):
+            return candidate
+        raise ValueError("Semantic Folder Projection membership is missing file_key or file_ref")
+
+    @staticmethod
+    def _query_text(query: Union[str, list[str], None]) -> str:
+        """Flatten a query (None, one string, or a list of terms) into a single string."""
+        if isinstance(query, list):
+            terms = map(str, query)
+            return " ".join(terms)
+        return "" if query is None else str(query)
+
+    @staticmethod
+    def _preferred_folder_path(
+        folder_paths: list[str],
+        scope_path: Optional[str],
+        fallback: str,
+    ) -> str:
+        """Pick the shortest folder path, preferring paths at or below ``scope_path``."""
+        def shortest_first(path: str) -> tuple[int, str]:
+            return len(path), path
+
+        if scope_path:
+            prefix = f"{scope_path.rstrip('/')}/"
+            in_scope = [p for p in folder_paths if p == scope_path or p.startswith(prefix)]
+            if in_scope:
+                return min(in_scope, key=shortest_first)
+        below_root = [p for p in folder_paths if p != "/"]
+        # No non-root membership at all: use the caller's fallback.
+        return min(below_root, key=shortest_first) if below_root else fallback
+
+    @staticmethod
+    def _parse_line_range(location: str) -> tuple[int, int]:
+        """Parse "N" or "N-M" into 1-based ``(start, end)``; malformed input raises ValueError."""
+        first, dash, last = str(location).strip().partition("-")
+        try:
+            start, end = int(first), int(last if dash else first)
+        except ValueError:
+            start = end = 0  # not numeric: rejected by the range check below
+        if start < 1 or end < start:
+            raise ValueError(f"Invalid line range: {location}")
+        return start, end
diff --git a/pageindex/filesystem/hybrid_projection.py b/pageindex/filesystem/hybrid_projection.py
new file mode 100644
index 000000000..b49d49afa
--- /dev/null
+++ b/pageindex/filesystem/hybrid_projection.py
@@ -0,0 +1,649 @@
+from __future__ import annotations
+
+import json
+import os
+import re
+import sqlite3
+import struct
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, SemanticSearchResult
+
+
+INDEX_BY_CHANNEL = {  # channel name -> base name of its "<name>.sqlite" index file
+    "metadata": "metadata_composite_vector",
+    "summary": "summary_only_vector",
+    "entity": "entity_vectors",
+    "constraint": "constraint_vectors",
+    "relation": "relation_vectors",
+}
+HYBRID_ENTITY_RELATION_CHANNELS = ("metadata", "entity", "constraint", "relation")  # combined by search()
+SEMANTIC_TOOL_CHANNELS = ("summary", "entity", "relation")  # the only channels search_channel() accepts
+HYBRID_ENTITY_RELATION_WEIGHTS = {  # one weight per hybrid channel; the four values sum to 1.0
+    "metadata": 0.25,
+    "entity": 0.25,
+    "relation": 0.30,
+    "constraint": 0.20,
+}
+
+
+@dataclass(frozen=True)
+class QueryProjection:  # immutable per-query projection; search() gets one from heuristic_query_projection()
+    entities: list[str]
+    relations: list[str]
+    constraints: list[str]
+    expected_answer_type: str = ""  # optional; empty string when not set
+
+
+@dataclass(frozen=True)
+class HybridProjectionCandidate:  # immutable search hit returned by HybridProjectionSearchBackend
+    document_id: str  # id the catalog resolves to a file
+    score: float
+    sources: list[dict[str, Any]]  # presumably the per-channel hits behind the score — TODO confirm
+    source_type: str
+    source_path: str
+    title: str
+    metadata: dict[str, Any]
+    snippet: str
+
+
+class HybridProjectionSearchBackend:
+    """Hybrid entity/relation/vector retrieval over rebuildable projection indexes.
+
+    The SQLite catalog remains the source of truth. This backend only reads
+    external sqlite-vec projection indexes and returns candidate document ids
+    for the catalog to resolve and filter.
+    """
+
+    def __init__(
+        self,
+        index_dir: str | Path,
+        *,
+        embedder: Any,
+        embedding_provider: str,
+        embedding_model: str,
+        embedding_dimensions: int = 256,
+        embedding_cache_path: str | Path | None = None,
+        per_channel_limit: int = 100,
+        fetch_multiplier: int = 100,
+    ) -> None:
+        self.index_dir = Path(index_dir).expanduser()
+        self.embedder = embedder
+        self.embedding_provider = embedding_provider
+        self.embedding_model = embedding_model
+        self.embedding_dimensions = embedding_dimensions
+        self.cache_model = embedding_cache_model_key(embedding_model, embedding_dimensions)  # cache key built from model + dimensions
+        self.embedding_cache = EmbeddingCache(
+            Path(embedding_cache_path).expanduser()
+            if embedding_cache_path is not None
+            else self.index_dir / "embedding_cache.sqlite"  # default: cache file lives inside index_dir
+        )
+        self.per_channel_limit = per_channel_limit
+        self.fetch_multiplier = fetch_multiplier
+        self.indexes = {  # one index object per channel, at <index_dir>/<index_name>.sqlite
+            channel: SQLiteVecSemanticIndex(self.index_dir / f"{index_name}.sqlite")
+            for channel, index_name in INDEX_BY_CHANNEL.items()
+        }
+
+    @classmethod
+    def from_provider(
+        cls,
+        index_dir: str | Path,
+        *,
+        embedding_provider: str = "openai",
+        embedding_model: str = "text-embedding-3-small",
+        embedding_dimensions: int = 256,
+        embedding_timeout: float = 60,
+        **kwargs: Any,  # forwarded unchanged to __init__
+    ) -> "HybridProjectionSearchBackend":
+        return cls(
+            index_dir,
+            embedder=make_embedder(  # the embedder is built from the provider settings
+                embedding_provider,
+                embedding_model,
+                dimensions=embedding_dimensions,
+                timeout=embedding_timeout,
+            ),
+            embedding_provider=embedding_provider,
+            embedding_model=embedding_model,
+            embedding_dimensions=embedding_dimensions,
+            **kwargs,
+        )
+
+    def search(
+        self,
+        query: str,
+        *,
+        limit: int = 10,
+        filters: dict[str, Any] | None = None,
+    ) -> list[HybridProjectionCandidate]:
+        query = normalize_text(query)
+        if not query:
+            return []
+        projection = heuristic_query_projection(query)
+        channels = tuple(  # only hybrid channels whose index currently holds documents
+            channel
+            for channel in HYBRID_ENTITY_RELATION_CHANNELS
+            if self._channel_document_count(channel) > 0
+        )
+        if not channels:  # no hybrid channel is populated: fall back to the summary channel if it is
+            if self._channel_document_count("summary") > 0:
+                return self.search_channel("summary", query, limit=limit, filters=filters)
+            return []
+        channel_hits = self._search_channels(
+            query=query,
+            projection=projection,
+            limit=max(limit, self.per_channel_limit),  # never fetch fewer than per_channel_limit per channel
+            filters=filters,
+            channels=channels,
+        )
+        return aggregate_hybrid_entity_relation(channel_hits, projection)[:limit]
+
+    def search_channel(
+        self,
+        channel: str,
+        query: str,
+        *,
+        limit: int = 10,
+        filters: dict[str, Any] | None = None,
+    ) -> list[HybridProjectionCandidate]:
+        if channel not in SEMANTIC_TOOL_CHANNELS:
+            raise ValueError(f"unsupported semantic channel: {channel}")
+        if channel not in self.available_channels():  # supported channel, but its index is missing or empty
+            return []
+        query = normalize_text(query)
+        if not query:
+            return []
+        projection = heuristic_query_projection(query)
+        vector = self.embedding_cache.embed_texts(
+            [query_text_for_channel(channel, query, projection)],
+            provider=self.embedding_provider,
+            model=self.cache_model,
+            embedder=self.embedder,
+            batch_size=1,
+        )[0]  # a single text was embedded, so take its only vector
+        results = self.indexes[channel].search(
+            vector,
+            limit=limit,
+            filters=filters,
+            fetch_multiplier=self.fetch_multiplier,
+        )
+        return rank_single_semantic_channel(channel, results)
+
+    def available_channels(self) -> tuple[str, ...]:
+        return tuple(  # subset of SEMANTIC_TOOL_CHANNELS whose index holds documents
+            channel
+            for channel in SEMANTIC_TOOL_CHANNELS
+            if self._channel_document_count(channel) > 0
+        )
+
+    def info(self) -> dict[str, Any]:
+        return {
+            "index_dir": str(self.index_dir),
+            "embedding_provider": self.embedding_provider,
+            "embedding_model": self.embedding_model,
+            "embedding_dimensions": self.embedding_dimensions,
+            "strategy": "hybrid_entity_relation_vector",
+            "available_channels": list(self.available_channels()),
+            "channels": {  # covers every channel in INDEX_BY_CHANNEL, available or not
+                channel: self._safe_channel_info(channel)
+                for channel in self.indexes
+            },
+        }
+
+    def _channel_document_count(self, channel: str) -> int:
+        info = self._safe_channel_info(channel)
+        if not info.get("available"):  # unavailable channels count as empty
+            return 0
+        return int(info.get("document_count") or 0)
+
+    def _safe_channel_info(self, channel: str) -> dict[str, Any]:
+        index = self.indexes[channel]
+        if not index.db_path.exists():  # missing file is reported without touching the index
+            return {
+                "db_path": str(index.db_path),
+                "available": False,
+                "document_count": 0,
+                "error": "index file is missing",
+            }
+        try:
+            info = index.info()
+        except (OSError, sqlite3.Error, SemanticIndexError) as exc:  # report the failure instead of raising
+            return {
+                "db_path": str(index.db_path),
+                "available": False,
+                "document_count": 0,
+                "error": str(exc),
+            }
+        return {**info, "available": int(info.get("document_count") or 0) > 0}  # available only when it holds documents
+
+    def _search_channels(
+        self,
+        *,
+        query: str,
+        projection: QueryProjection,
+        limit: int,
+        filters: dict[str, Any] | None,
+        channels: tuple[str, ...],
+    ) -> dict[str, list[SemanticSearchResult]]:
+        query_texts = {
+            channel: query_text_for_channel(channel, query, projection)
+            for channel in channels
+        }
+        vectors = self.embedding_cache.embed_texts(
+            [query_texts[channel] for channel in channels],
+            provider=self.embedding_provider,
+            model=self.cache_model,
+            embedder=self.embedder,
+            batch_size=1,
+        )
+        return {
+            channel: self.indexes[channel].search(
+                vector,
+                limit=limit,
+                filters=filters,
+                fetch_multiplier=self.fetch_multiplier,
+            )
+            for channel, vector in zip(channels, vectors)  # assumes embed_texts preserves input order — TODO confirm
+        }
+
+
+class EmbeddingCache:
+    """SQLite-backed store of embedding vectors keyed by (provider, model, text hash)."""
+
+    def __init__(self, db_path: Path):
+        from contextlib import closing
+        self.db_path = db_path
+        self.db_path.parent.mkdir(parents=True, exist_ok=True)
+        # `with conn` only commits or rolls back; closing() is what releases the handle.
+        with closing(self.connect()) as conn, conn:
+            conn.execute(
+                """
+                CREATE TABLE IF NOT EXISTS embedding_cache (
+                    provider TEXT NOT NULL,
+                    model TEXT NOT NULL,
+                    text_hash TEXT NOT NULL,
+                    dimension INTEGER NOT NULL,
+                    vector_blob BLOB,
+                    vector_json TEXT,
+                    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+                    PRIMARY KEY(provider, model, text_hash)
+                )
+                """
+            )
+
+    def connect(self) -> sqlite3.Connection:
+        """Open a new connection whose rows support access by column name; callers close it."""
+        conn = sqlite3.connect(self.db_path)
+        conn.row_factory = sqlite3.Row
+        return conn
+
+    def embed_texts(
+        self,
+        texts: list[str],
+        *,
+        provider: str,
+        model: str,
+        embedder: Any,
+        batch_size: int,
+    ) -> list[list[float]]:
+        """Return one vector per text, in input order, embedding only uncached texts."""
+        from contextlib import closing
+        hashes = [SQLiteVecSemanticIndex.text_hash(text) for text in texts]
+        cached: dict[str, list[float]] = {}
+        with closing(self.connect()) as conn:
+            for text_hash in sorted(set(hashes)):
+                row = conn.execute(
+                    """
+                    SELECT vector_blob, vector_json
+                    FROM embedding_cache
+                    WHERE provider = ? AND model = ? AND text_hash = ?
+                    """,
+                    (provider, model, text_hash),
+                ).fetchone()
+                if row is not None:
+                    cached[text_hash] = decode_vector(row["vector_blob"], row["vector_json"])
+        # Embed each uncached hash once, at its first position, so duplicate texts cost one call.
+        first_position = {h: i for i, h in reversed(list(enumerate(hashes)))}
+        missing_positions = sorted(i for h, i in first_position.items() if h not in cached)
+        step = max(1, batch_size)
+        for start in range(0, len(missing_positions), step):
+            positions = missing_positions[start : start + step]
+            vectors = embed_with_retry(embedder, [texts[index] for index in positions])
+            rows = [
+                (provider, model, hashes[index], len(vector), encode_vector(vector))
+                for index, vector in zip(positions, vectors)
+            ]
+            with closing(self.connect()) as conn, conn:
+                conn.executemany(
+                    """
+                    INSERT OR REPLACE INTO embedding_cache(
+                        provider, model, text_hash, dimension, vector_blob, vector_json
+                    )
+                    VALUES (?, ?, ?, ?, ?, '')
+                    """,
+                    rows,
+                )
+            for index, vector in zip(positions, vectors):
+                cached[hashes[index]] = vector
+        return [cached[text_hash] for text_hash in hashes]
+
+
+class EmbeddingClient:
+    """Thin wrapper over the OpenAI embeddings endpoint (the only supported provider)."""
+
+    def __init__(self, *, provider: str, model: str, dimensions: int, timeout: float):
+        self.provider = provider.lower()
+        self.model = model
+        self.dimensions = dimensions
+        if self.provider != "openai":
+            raise ValueError(f"unknown embedding provider: {provider}")
+        from openai import OpenAI
+
+        key = os.environ.get("PIFS_EMBEDDING_API_KEY") or os.environ.get("OPENAI_API_KEY")
+        endpoint = os.environ.get("PIFS_EMBEDDING_BASE_URL") or os.environ.get("OPENAI_BASE_URL")
+        if not key:
+            message = "PIFS_EMBEDDING_API_KEY or OPENAI_API_KEY is required for PIFS embeddings"
+            raise ValueError(message)
+        self.client = OpenAI(api_key=key, base_url=endpoint or None, timeout=timeout)
+
+    def embed(self, texts: list[str]) -> list[list[float]]:
+        # A non-positive `dimensions` means the request omits the parameter entirely.
+        extra = {"dimensions": self.dimensions} if self.dimensions > 0 else {}
+        response = self.client.embeddings.create(model=self.model, input=texts, **extra)
+        return [list(item.embedding) for item in sorted(response.data, key=lambda item: item.index)]
+
+
+def make_embedder(provider: str, model: str, *, dimensions: int, timeout: float) -> Any:
+    """Build the embedding client for the given provider/model settings.
+
+    Every argument is forwarded unchanged to ``EmbeddingClient``, which raises ``ValueError``
+    for a provider other than "openai" or when no embedding API key is configured.
+    """
+    return EmbeddingClient(provider=provider, model=model, dimensions=dimensions, timeout=timeout)
+
+
+def query_text_for_channel(channel: str, query: str, projection: QueryProjection) -> str:
+    """Choose the text embedded for `channel`; empty projections fall back to `query`."""
+    if channel == "relation":
+        return "\n".join(projection.relations) or query
+    if channel in ("entity", "constraint"):
+        terms = projection.entities if channel == "entity" else projection.constraints
+        return compact_join(terms, limit=24) or query
+    if channel not in ("metadata", "summary"):
+        raise ValueError(f"unknown semantic channel: {channel}")
+    return query
+
+
+def rank_single_semantic_channel(
+    channel: str,
+    results: list[SemanticSearchResult],
+) -> list[HybridProjectionCandidate]:
+    """Convert one channel's hits into candidates, one per document.
+
+    Only the first hit of each document is kept; ``rank`` is the hit's 1-based position in
+    ``results`` (skipped duplicates still consume a rank) and the score is ``1 / (60 + rank)``.
+    """
+    first_by_doc: dict[str, HybridProjectionCandidate] = {}
+    for position, hit in enumerate(results, start=1):
+        key = str(hit.external_id or hit.file_ref)
+        if key not in first_by_doc:
+            first_by_doc[key] = HybridProjectionCandidate(
+                document_id=key,
+                score=1 / (60 + position),
+                sources=[{"channel": channel, "rank": position, "distance": hit.distance}],
+                source_type=hit.source_type,
+                source_path=hit.source_path,
+                title=hit.title,
+                metadata=hit.metadata,
+                snippet=f"{channel}_vector rank={position}",
+            )
+    return list(first_by_doc.values())
+
+
+def aggregate_hybrid_entity_relation(
+    channel_hits: dict[str, list[SemanticSearchResult]],
+    projection: QueryProjection,
+) -> list[HybridProjectionCandidate]:
+    """Fuse per-channel hits into one ranking of documents.
+
+    Each channel contributes ``weight / (60 + rank)`` for the first hit of a document in
+    that channel; ``exact_match_bonus`` is then added once per document. Candidates are
+    ordered by descending score, then best (lowest) source rank, then document id.
+    """
+    merged: dict[str, dict[str, Any]] = {}
+    for channel, results in channel_hits.items():
+        weight = HYBRID_ENTITY_RELATION_WEIGHTS[channel]
+        counted: set[str] = set()
+        for rank, hit in enumerate(results, start=1):
+            doc_id = str(hit.external_id or hit.file_ref)
+            if doc_id in counted:
+                continue
+            counted.add(doc_id)
+            if doc_id not in merged:
+                merged[doc_id] = {
+                    "document_id": doc_id,
+                    "score": 0.0,
+                    "sources": [],
+                    "source_type": hit.source_type,
+                    "source_path": hit.source_path,
+                    "title": hit.title,
+                    "metadata": hit.metadata,
+                }
+            entry = merged[doc_id]
+            entry["score"] += weight * (1 / (60 + rank))
+            entry["sources"].append({"channel": channel, "rank": rank, "distance": hit.distance})
+
+    def to_candidate(entry: dict[str, Any]) -> HybridProjectionCandidate:
+        entry["score"] += exact_match_bonus(entry, projection)
+        return HybridProjectionCandidate(
+            document_id=entry["document_id"],
+            score=float(entry["score"]),
+            sources=entry["sources"],
+            source_type=entry["source_type"],
+            source_path=entry["source_path"],
+            title=entry["title"],
+            metadata=entry["metadata"],
+            snippet=hybrid_snippet(entry),
+        )
+
+    def sort_key(candidate: HybridProjectionCandidate) -> tuple[float, int, str]:
+        return (-candidate.score, min(s["rank"] for s in candidate.sources), candidate.document_id)
+
+    return sorted((to_candidate(entry) for entry in merged.values()), key=sort_key)
+
+
+def exact_match_bonus(item: dict[str, Any], projection: QueryProjection) -> float:
+    """Return a small score bonus for projected terms that occur verbatim in the item.
+
+    The first 8 entities and 6 constraints are matched, case-insensitively, against the JSON
+    dump of title, source path and metadata; terms under 3 chars never match. Cap: 0.02.
+    """
+    described = {
+        "title": item.get("title", ""),
+        "source_path": item.get("source_path", ""),
+        "metadata": item.get("metadata", {}),
+    }
+    haystack = json.dumps(described, ensure_ascii=False).lower()
+    terms = [*projection.entities[:8], *projection.constraints[:6]]
+    normalized = [str(term).lower().strip() for term in terms]
+    matched = sum(1 for term in normalized if len(term) >= 3 and term in haystack)
+    return min(0.02, matched * 0.004)
+
+
+def hybrid_snippet(item: dict[str, Any]) -> str:
+    """Summarise up to four channel@rank sources of the item, plus its metadata topic."""
+    shown = item.get("sources", [])[:4]
+    label = ", ".join(f"{source['channel']}@{source['rank']}" for source in shown)
+    snippet = f"hybrid_entity_relation_vector {label}"
+    metadata = item.get("metadata") or {}
+    topic = str(metadata.get("topic") or "").strip()
+    # The topic suffix is omitted when the metadata has no non-blank topic.
+    return f"{snippet}; topic: {topic}" if topic else snippet
+
+
+def heuristic_query_projection(question: str) -> QueryProjection:
+    """Derive a ``QueryProjection`` from `question` using this module's text heuristics.
+
+    Entities are identifier-like terms followed by up to 16 keywords (16 kept after
+    de-duplication); constraints are constraint phrases plus numeric quantities (12 kept).
+    A single relation triple ``subject | predicate | question`` is emitted.
+    """
+    entity_pool = [*identifier_terms(question), *keyword_terms(question)[:16]]
+    entities = dedupe(entity_pool)[:16]
+    constraint_pool = [*extract_constraint_terms(question), *numeric_terms(question)]
+    constraints = dedupe(constraint_pool)[:12]
+    predicate = infer_query_predicate(question)
+    # The relation's subject is the first entity, or a fixed placeholder when none was found.
+    subject = entities[0] if entities else "question"
+
+    return QueryProjection(
+        entities=entities,
+        relations=[f"{subject} | {predicate} | {question}"],
+        constraints=constraints,
+        expected_answer_type=infer_answer_type(question),
+    )
+
+
+def compact_join(values: list[str], *, limit: int) -> str:
+    return " | ".join(values[:limit])  # joins at most `limit` leading values with " | "
+
+
+def identifier_terms(text: str) -> list[str]:
+    """Collect identifier-like tokens from `text`, grouped by pattern in pattern order.
+    Matches are not de-duplicated, so one token may appear under several patterns.
+    """
+    patterns = (
+        r"\b[A-Z]{2,12}-\d{2,}\b",  # ticket-style ids such as ABC-123
+        r"\b[A-Za-z_][A-Za-z0-9_]{2,}\b\s*(?:=|:)\s*[A-Za-z0-9_.:/-]+",  # key=value pairs
+        r"\b[A-Za-z][A-Za-z0-9_+-]+(?:[-_+][A-Za-z0-9]+)+\b",  # words joined by - _ +
+        r"\b[A-Z]{2,}[A-Za-z0-9_-]*\b",  # tokens starting with two or more capitals
+    )
+    return [match.strip() for pattern in patterns for match in re.findall(pattern, text)]
+
+
+def keyword_terms(text: str) -> list[str]:
+    """Return lower-cased word tokens of three or more characters, minus common stopwords.
+    Tokens start with a letter and may contain digits, "_", "+" and "-"; duplicates are
+    removed by ``dedupe``, which keeps the first occurrence of each token.
+    """
+    stopwords = {
+        "about",
+        "after",
+        "also",
+        "and",
+        "are",
+        "for",
+        "from",
+        "how",
+        "into",
+        "that",
+        "the",
+        "this",
+        "what",
+        "when",
+        "where",
+        "which",
+        "with",
+    }
+    lowered = (token.lower() for token in re.findall(r"[A-Za-z][A-Za-z0-9_+-]{2,}", text))
+    return dedupe(token for token in lowered if token not in stopwords)
+
+
+def extract_constraint_terms(text: str) -> list[str]:
+    """Find constraint phrases ("must ...", "limit ...") and key=value / key: value pairs."""
+    patterns = (
+        r"\b(?:must|should|required|requires?|default(?:s)?|limit(?:s)?|maximum|minimum)\b[^.!?\n]{0,120}",
+        r"\b[A-Za-z_][A-Za-z0-9_]{2,}\s*(?:=|:)\s*[A-Za-z0-9_.:/-]+",
+    )
+    hits = (m.strip() for p in patterns for m in re.findall(p, text, flags=re.IGNORECASE))
+    return dedupe(hits)
+
+
+def numeric_terms(text: str) -> list[str]:
+    """Return quantity-with-unit mentions such as "10 ms", "2.5 GiB" or "50%"."""
+    # "%" is a non-word character, so it must not be followed by the \b that anchors word units.
+    units = r"(?:MiB|GiB|MB|GB|ms|sec|seconds|minutes|hours|days|tokens?|req/s|rps)"
+    pattern = rf"\b\d+(?:\.\d+)?\s*(?:{units}\b|%)"
+    return re.findall(pattern, text, flags=re.IGNORECASE)
+
+
+def infer_query_predicate(question: str) -> str:
+    """Map a question to a coarse predicate label via ordered substring checks."""
+    lowered = question.lower()
+    # Order matters: the first rule with any matching substring wins.
+    rules = (
+        ("asks_default", ("default", "defaults")),
+        ("asks_limit", ("limit", "maximum", "minimum", "size")),
+        ("asks_cause", ("caused", "cause", "why")),
+        ("asks_owner", ("who", "owner", "assigned")),
+        ("asks_deadline", ("when", "deadline", "date")),
+        ("asks_status", ("status", "state")),
+        ("asks_requirement", ("required", "requirement", "must")),
+    )
+    matches = (label for label, needles in rules if any(n in lowered for n in needles))
+    return next(matches, "asks_about")
+
+
+def infer_answer_type(question: str) -> str:
+    """Guess the expected answer category from cue words in the question."""
+    lowered = question.lower()
+    if any(cue in lowered for cue in ("how many", "limit", "size")):
+        return "number_or_limit"
+    # "who"/"when" only count as leading words; the cause cues may appear anywhere.
+    for prefix, answer_type in (("who", "person_or_team"), ("when", "date_or_time")):
+        if lowered.startswith(prefix):
+            return answer_type
+    is_cause = "why" in lowered or "caused" in lowered
+    return "cause" if is_cause else "fact"
+
+
+def dedupe(values: Any) -> list[str]:
+    """Return the values as whitespace-collapsed strings, unique case-insensitively.
+
+    Blank entries are dropped; order and spelling follow each value's first occurrence.
+    """
+    unique: dict[str, str] = {}
+    for value in values:
+        cleaned = re.sub(r"\s+", " ", str(value)).strip()
+        if cleaned:
+            unique.setdefault(cleaned.lower(), cleaned)
+    return list(unique.values())
+
+
+def normalize_text(text: str) -> str:
+    return re.sub(r"\s+", " ", str(text or "")).strip()  # falsy input (None, "") gives ""
+
+
+def embedding_cache_model_key(model: str, dimensions: int) -> str:  # cache "model" key
+    return f"{model}:dimensions={dimensions}" if dimensions > 0 else model
+
+
+def embed_with_retry(embedder: Any, texts: list[str], *, max_attempts: int = 8) -> list[list[float]]:
+    for attempt in range(1, max_attempts + 1):  # attempts are numbered from 1
+        try:
+            return embedder.embed(texts)
+        except Exception:  # every failure is retried, whatever its type
+            if attempt >= max_attempts:
+                raise  # out of attempts: surface the last error unchanged
+            time.sleep(min(120.0, 2.0 ** (attempt - 1)))  # waits 1s, 2s, 4s, ... capped at 120s
+    raise RuntimeError("unreachable embedding retry state")  # only reached if max_attempts < 1
+
+
+def encode_vector(vector: list[float]) -> bytes:
+    return struct.pack(f"<{len(vector)}f", *vector)  # little-endian 4-byte floats, no header
+
+
+def decode_vector(blob: bytes | None, vector_json: str | None) -> list[float]:
+    """Decode a cached vector, preferring the packed float blob over the JSON text."""
+    if blob:
+        if len(blob) % 4:
+            raise ValueError("invalid cached vector blob length")
+        return list(struct.unpack(f"<{len(blob) // 4}f", blob))
+    parsed = json.loads(vector_json) if vector_json else None
+    if not isinstance(parsed, list):
+        raise ValueError("cached embedding row does not contain a vector")
+    return [float(item) for item in parsed]
diff --git a/pageindex/filesystem/metadata.py b/pageindex/filesystem/metadata.py
new file mode 100644
index 000000000..60d7beb97
--- /dev/null
+++ b/pageindex/filesystem/metadata.py
@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+import json
+import re
+from typing import Any
+
+from .types import MetadataField
+
+
+class MetadataQueryError(ValueError):
+    """Raised for an invalid metadata schema, filter, or field name."""
+
+
+class MetadataQueryEngine:
+    """Validate metadata schemas and filter documents against the fields known to a store."""
+    FIELD_RE = re.compile(r"^[A-Za-z][A-Za-z0-9_]*\Z")  # \Z: "$" would also accept "name\n"
+    OPERATORS = {"$eq", "$ne", "$in", "$gt", "$gte", "$lt", "$lte", "$contains"}
+    LOGICAL_OPERATORS = {"$and", "$or"}
+    FOLDER_SCOPE_FIELD_HINTS = {"path", "folder", "folders", "folder_path", "folder_paths"}
+    MAX_DEPTH = 5
+
+    def __init__(self, store: Any):
+        self.store = store
+
+    def register_schema(self, schema: dict[str, Any], source: str = "manual") -> None:
+        """Validate `schema` (or its "fields" object) and upsert the declared fields."""
+        fields = []
+        raw_fields = schema.get("fields", schema)
+        if not isinstance(raw_fields, dict):
+            raise MetadataQueryError("metadata schema must contain a fields object")
+        for name, declaration in raw_fields.items():
+            name = str(name)
+            self.validate_field_name(name)
+            if isinstance(declaration, str):  # shorthand: {"field": "string"}
+                field_type = declaration
+                description = ""
+            elif isinstance(declaration, dict):
+                field_type = str(declaration.get("type", ""))
+                description = str(declaration.get("description", ""))
+            else:
+                raise MetadataQueryError(f"Invalid schema declaration for field: {name}")
+            if field_type not in {"string", "number", "boolean"}:
+                raise MetadataQueryError(f"Unsupported metadata field type for {name}: {field_type}")
+            fields.append(
+                MetadataField(
+                    name=name,
+                    field_type=field_type,
+                    description=description,
+                    source=source,
+                )
+            )
+        if fields:
+            self.store.upsert_metadata_fields(fields)
+
+    def parse_filter(self, value: str | dict[str, Any] | None) -> dict[str, Any] | None:
+        """Parse a JSON string or accept a dict, validate it, and return it (None if empty)."""
+        if value is None or value == "":
+            return None
+        if isinstance(value, str):
+            value = self.parse_dsl(value)
+        if not isinstance(value, dict):
+            raise MetadataQueryError("metadata_filter must be a JSON object")
+        self.validate_filter(value)
+        return value
+
+    def parse_dsl(self, dsl: str) -> dict[str, Any]:
+        try:
+            parsed = json.loads(dsl)
+        except json.JSONDecodeError as exc:
+            raise MetadataQueryError(
+                "metadata DSL must be a JSON object, for example "
+                '\'{"$and":[{"repo":"redwood"},{"year":{"$gte":2024}}]}\''
+            ) from exc
+        if not isinstance(parsed, dict):
+            raise MetadataQueryError("metadata DSL must be a JSON object")
+        return parsed
+
+    def validate_filter(self, metadata_filter: dict[str, Any], depth: int = 1) -> None:
+        """Check fields, operators and nesting depth; raise MetadataQueryError on violations."""
+        if depth > self.MAX_DEPTH:
+            raise MetadataQueryError(f"metadata_filter nesting depth exceeds {self.MAX_DEPTH}")
+        if not metadata_filter:
+            return
+        for key, condition in metadata_filter.items():
+            if key in self.LOGICAL_OPERATORS:
+                self._validate_logical(key, condition, depth)
+                continue
+            self.validate_field(key)
+            self._validate_field_condition(key, condition)
+
+    def _validate_logical(self, operator: str, condition: Any, depth: int) -> None:
+        if not isinstance(condition, list) or not condition:
+            raise MetadataQueryError(f"{operator} requires a non-empty list")
+        for item in condition:
+            if not isinstance(item, dict):
+                raise MetadataQueryError(f"{operator} items must be metadata filter objects")
+            self.validate_filter(item, depth + 1)  # each nested filter is one level deeper
+
+    def _validate_field_condition(self, field: str, condition: Any) -> None:
+        if not isinstance(condition, dict) or not any(
+            str(key).startswith("$") for key in condition
+        ):
+            self._validate_scalar(condition, context=field)  # bare value: implicit equality
+            return
+        if len(condition) != 1:
+            raise MetadataQueryError(
+                f"Field {field} condition must contain exactly one metadata operator"
+            )
+        operator, expected = next(iter(condition.items()))
+        if operator not in self.OPERATORS:
+            raise MetadataQueryError(f"Unsupported metadata operator: {operator}")
+        if operator == "$in":
+            if not isinstance(expected, list):
+                raise MetadataQueryError(f"{field} $in requires a list")
+            for item in expected:
+                self._validate_scalar(item, context=f"{field} $in")
+            return
+        if operator == "$contains":
+            self._validate_scalar(expected, context=f"{field} $contains")
+            return
+        if operator in {"$gt", "$gte", "$lt", "$lte"}:
+            self._validate_range_value(expected, context=f"{field} {operator}")
+            return
+        self._validate_scalar(expected, context=f"{field} {operator}")  # $eq / $ne
+
+    def validate_field(self, field: str) -> None:
+        self.validate_field_name(field)
+        if not self.store.metadata_field_exists(field):
+            if field in self.FOLDER_SCOPE_FIELD_HINTS:
+                raise MetadataQueryError(
+                    f"Unknown metadata field: {field}. Folder paths are positional PIFS paths, "
+                    "not metadata fields; use `ls /documents` or `find /documents -type f`. "
+                    "Use --where only with fields from `stat --schema`."
+                )
+            raise MetadataQueryError(f"Unknown metadata field: {field}")
+
+    def validate_field_name(self, field: str) -> None:
+        # Non-string keys (possible in dict filters) are rejected instead of raising TypeError.
+        if not isinstance(field, str) or not self.FIELD_RE.match(field):
+            raise MetadataQueryError(f"Invalid metadata field: {field}")
+
+    def export_schema(self) -> dict[str, Any]:
+        fields = {}
+        for field in self.store.list_metadata_fields():
+            fields[field.name] = {
+                "type": field.field_type,
+                "description": field.description,
+            }
+        return {"fields": fields}
+
+    @staticmethod
+    def _validate_scalar(value: Any, *, context: str) -> None:
+        if not isinstance(value, (int, float, str)):  # bool passes too: it subclasses int
+            raise MetadataQueryError(f"{context} must be a string, number, or boolean")
+
+    @staticmethod
+    def _validate_range_value(value: Any, *, context: str) -> None:
+        if isinstance(value, bool) or not isinstance(value, (int, float, str)):
+            raise MetadataQueryError(f"{context} must be a string or number")
diff --git a/pageindex/filesystem/metadata_generation.py b/pageindex/filesystem/metadata_generation.py
new file mode 100644
index 000000000..86b2ac6e7
--- /dev/null
+++ b/pageindex/filesystem/metadata_generation.py
@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+import json
+import os
+from dataclasses import dataclass, field
+from typing import Any, Protocol
+
+
+GENERATED_METADATA_FIELDS = ("summary", "doc_type", "domain", "topic", "entity", "relation")
+
+
+class MetadataGenerationError(RuntimeError):
+    """Raised when metadata generation is misconfigured or asked for an unsupported field."""
+
+
+@dataclass(frozen=True)
+class MetadataGenerationInput:
+    file_ref: str  # not part of the prompt payload built by MetadataGenerator
+    external_id: str | None  # not part of the prompt payload
+    title: str  # sent to the model
+    source_path: str  # not part of the prompt payload
+    content_type: str  # sent to the model
+    source_type: str | None  # sent to the model
+    text: str  # sent to the model, truncated to the generator's max_text_chars
+    metadata: dict[str, Any] = field(default_factory=dict)  # sent to the model as-is
+    text_artifact_path: str | None = None  # not part of the prompt payload
+
+
+@dataclass(frozen=True)
+class MetadataGenerationResult:
+    values: dict[str, Any] = field(default_factory=dict)  # generated value per requested field
+    failures: dict[str, str] = field(default_factory=dict)  # NOTE(review): MetadataGenerator never sets this
+
+
+class MetadataGenerationBackend(Protocol):
+    def generate(
+        self,
+        request: MetadataGenerationInput,
+        *,
+        fields: list[str],
+    ) -> MetadataGenerationResult | dict[str, Any]:
+        """Produce values for the requested `fields` of the document in `request`."""
+
+
+class MetadataGenerator:
+    """Default product generator for retrieval metadata.
+
+    This intentionally lives under pageindex.filesystem instead of benchmark
+    paths. It uses registered text today; callers can pass PageIndex-extracted
+    text through the same MetadataGenerationInput without changing the API.
+    Provider selection is an instance parameter rather than a provider-specific
+    public class name.
+    """
+
+    def __init__(
+        self,
+        *,
+        provider: str | None = None,
+        model: str | None = None,
+        base_url: str | None = None,
+        max_text_chars: int = 24000,
+    ):
+        # `or` (not a .get default) so that empty environment values also fall back.
+        self.provider = (provider or os.environ.get("PIFS_METADATA_PROVIDER") or "openai").lower()
+        self.model = model or os.environ.get("PIFS_METADATA_MODEL") or "gpt-5-nano"
+        self.base_url = (
+            base_url
+            if base_url is not None
+            else os.environ.get("PIFS_METADATA_BASE_URL") or os.environ.get("OPENAI_BASE_URL")
+        )
+        self.max_text_chars = max_text_chars
+
+    def generate(
+        self,
+        request: MetadataGenerationInput,
+        *,
+        fields: list[str],
+    ) -> MetadataGenerationResult:
+        """Generate `fields` for `request`; only the "openai" provider is implemented."""
+        if self.provider != "openai":
+            raise MetadataGenerationError(f"unsupported metadata provider: {self.provider}")
+        return self._generate_openai(request, fields=fields)
+
+    def _generate_openai(
+        self,
+        request: MetadataGenerationInput,
+        *,
+        fields: list[str],
+    ) -> MetadataGenerationResult:
+        api_key = os.environ.get("PIFS_METADATA_API_KEY") or os.environ.get("OPENAI_API_KEY")
+        if not api_key:
+            raise MetadataGenerationError(
+                "PIFS_METADATA_API_KEY or OPENAI_API_KEY is required for PIFS metadata generation"
+            )
+
+        from openai import OpenAI
+
+        client = OpenAI(api_key=api_key, base_url=self.base_url or None)
+        response = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {
+                    "role": "system",
+                    "content": (
+                        "Generate grounded retrieval metadata for one document. "
+                        "Use only the provided document text and ordinary source metadata. "
+                        "The summary must be a retrieval summary, not a title rewrite. "
+                        "Do not use filenames, paths, URLs, storage URIs, or outside knowledge. "
+                        "Return strict JSON matching the requested fields."
+                    ),
+                },
+                {
+                    "role": "user",
+                    "content": json.dumps(
+                        {
+                            "requested_fields": fields,
+                            "document": {
+                                "title": request.title,
+                                "source_type": request.source_type,
+                                "content_type": request.content_type,
+                                "metadata": request.metadata,
+                                "text": request.text[: self.max_text_chars],
+                            },
+                        },
+                        ensure_ascii=False,
+                    ),
+                },
+            ],
+            response_format=self._response_format(fields),
+        )
+        content = response.choices[0].message.content or "{}"
+        values = json.loads(content)  # NOTE(review): malformed output raises JSONDecodeError
+        return MetadataGenerationResult(
+            values={name: values[name] for name in fields if name in values},
+        )
+
+    @staticmethod
+    def _response_format(fields: list[str]) -> dict[str, Any]:
+        """Build the strict JSON-schema response format: every requested field is a string."""
+        properties: dict[str, Any] = {}
+        for name in fields:
+            if name not in {"summary", "doc_type", "domain", "topic", "entity", "relation"}:
+                raise MetadataGenerationError(
+                    f"MetadataGenerator does not support generated metadata field: {name}"
+                )
+            properties[name] = {"type": "string"}
+        return {
+            "type": "json_schema",
+            "json_schema": {
+                "name": "pifs_metadata_generation",
+                "strict": True,
+                "schema": {
+                    "type": "object",
+                    "additionalProperties": False,
+                    "required": list(fields),
+                    "properties": properties,
+                },
+            },
+        }
diff --git a/pageindex/filesystem/projection_indexing.py b/pageindex/filesystem/projection_indexing.py
new file mode 100644
index 000000000..e5d7b829e
--- /dev/null
+++ b/pageindex/filesystem/projection_indexing.py
@@ -0,0 +1,135 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from .hybrid_projection import (
+    EmbeddingCache,
+    INDEX_BY_CHANNEL,
+    embedding_cache_model_key,
+    make_embedder,
+)
+from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexRecord
+
+
+class SummaryProjectionIndexer:
+    """Synchronous register-time summary projection indexer."""
+
+    def __init__(
+        self,
+        index_dir: str | Path,
+        *,
+        embedder: Any,
+        embedding_provider: str,
+        embedding_model: str,
+        embedding_dimensions: int = 256,
+        embedding_cache_path: str | Path | None = None,
+    ) -> None:
+        self.index_dir = Path(index_dir).expanduser()
+        self.index_dir.mkdir(parents=True, exist_ok=True)
+        self.embedder = embedder
+        self.embedding_provider = embedding_provider
+        self.embedding_model = embedding_model
+        self.embedding_dimensions = embedding_dimensions
+        self.cache_model = embedding_cache_model_key(embedding_model, embedding_dimensions)
+        # Without an explicit path the embedding cache lives next to the index files.
+        if embedding_cache_path is None:
+            cache_path = self.index_dir / "embedding_cache.sqlite"
+        else:
+            cache_path = Path(embedding_cache_path).expanduser()
+        self.embedding_cache = EmbeddingCache(cache_path)
+        index_file = f"{INDEX_BY_CHANNEL['summary']}.sqlite"
+        self.index = SQLiteVecSemanticIndex(self.index_dir / index_file)
+        self._ensure_index()
+
+    @classmethod
+    def from_provider(
+        cls,
+        index_dir: str | Path,
+        *,
+        embedding_provider: str = "openai",
+        embedding_model: str = "text-embedding-3-small",
+        embedding_dimensions: int = 256,
+        embedding_timeout: float = 60,
+        **kwargs: Any,
+    ) -> "SummaryProjectionIndexer":
+        """Alternate constructor that builds the embedder via ``make_embedder``.
+        Remaining keyword arguments are passed through to ``__init__`` unchanged.
+        """
+        client_options = {"dimensions": embedding_dimensions, "timeout": embedding_timeout}
+        embedder = make_embedder(embedding_provider, embedding_model, **client_options)
+        return cls(
+            index_dir,
+            embedder=embedder,
+            embedding_provider=embedding_provider,
+            embedding_model=embedding_model,
+            embedding_dimensions=embedding_dimensions,
+            **kwargs,
+        )
+
+    def upsert_summary(self, record: dict[str, Any]) -> dict[str, Any]:
+        """Embed the record's metadata summary and upsert it into the summary index."""
+        raw_metadata = record.get("metadata") or {}
+        summary = str(raw_metadata.get("summary") or "").strip()
+        if not summary:
+            return {"status": "skipped", "reason": "missing_summary"}
+        vectors = self.embedding_cache.embed_texts(
+            [summary],
+            provider=self.embedding_provider,
+            model=self.cache_model,
+            embedder=self.embedder,
+            batch_size=1,
+        )
+        # The index row stores a shallow copy of the metadata, not the caller's dict.
+        metadata = dict(raw_metadata)
+        entry = SemanticIndexRecord(
+            file_ref=str(record["file_ref"]),
+            vector=vectors[0],
+            text=summary,
+            external_id=record.get("external_id"),
+            source_type=str(record.get("source_type") or ""),
+            source_path=str(record.get("source_path") or ""),
+            title=str(record.get("title") or ""),
+            metadata=metadata,
+        )
+        indexed = self.index.upsert_many([entry])
+        return {
+            "status": "ready",
+            "indexed_rows": indexed,
+            "index_path": str(self.index.db_path),
+            "embedding_provider": self.embedding_provider,
+            "embedding_model": self.embedding_model,
+            "embedding_dimensions": self.embedding_dimensions,
+        }
+
+    def _ensure_index(self) -> None:
+        """Create the index file if absent; otherwise verify its stored dimension."""
+        path = self.index.db_path
+        if not path.exists():
+            self.index.reset(dimension=self.embedding_dimensions, metadata=self._index_metadata())
+            return
+        try:
+            found = self.index.dimension()
+        except Exception as exc:
+            raise RuntimeError(
+                "could not validate existing summary projection index config; "
+                f"refusing to reset {path}. Move the existing index "
+                "aside or rebuild it intentionally before changing embedding config."
+            ) from exc
+        if found == self.embedding_dimensions:
+            return
+        raise RuntimeError(
+            "summary projection index dimension mismatch: "
+            f"{path} was built with dimension {found}, "
+            f"but configured embedding_dimensions is {self.embedding_dimensions}. "
+            "Use the matching embedding config, or rebuild the projection index "
+            "at a new path after preserving the existing data."
+        )
+
+    def _index_metadata(self) -> dict[str, Any]:  # passed to index.reset() on creation
+        return dict(
+            channel="summary",
+            embedding_provider=self.embedding_provider,
+            embedding_model=self.embedding_model,
+            embedding_dimensions=self.embedding_dimensions,
+        )
diff --git a/pageindex/filesystem/semantic_folder_policy.py b/pageindex/filesystem/semantic_folder_policy.py
new file mode 100644
index 000000000..8e81d5f9a
--- /dev/null
+++ b/pageindex/filesystem/semantic_folder_policy.py
@@ -0,0 +1,72 @@
+from __future__ import annotations
+
+import re
+from typing import Any, Iterable
+
+
+# NOTE(review): presumably the virtual root path of semantic folders; it is not
+# referenced inside this module — confirm against callers.
+SEMANTIC_FOLDER_ROOT = "/semantic"
+# NOTE(review): base/system field names are not used in this module; presumably
+# consumed by the code that builds semantic folders — confirm.
+SEMANTIC_FOLDER_BASE_FIELDS = {"doc_type", "domain", "topic"}
+SEMANTIC_FOLDER_SYSTEM_FIELDS = {"source_type"}
+# Field names rejected as extension fields: is_semantic_folder_forbidden_field
+# matches against the normalized identities of these names.
+SEMANTIC_FOLDER_FORBIDDEN_FIELDS = {
+    "summary",
+    "entities",
+    "relations",
+    "constraints",
+    "retrieval_cues",
+    "dataset_doc_uuid",
+    "path",
+    "uri",
+    "source_path",
+    "storage_uri",
+    "title",
+    "content_type",
+    "created_at",
+    "updated_at",
+}
+
+
+def canonical_semantic_folder_field_name(value: Any) -> str:
+    """Normalize a field name to casefolded snake_case.
+
+    Falsy or blank input yields ``""``. Otherwise an underscore is inserted at
+    camelCase boundaries, every run of non-alphanumeric characters collapses to
+    a single underscore, and leading/trailing underscores are stripped.
+    """
+    stripped = str(value or "").strip()
+    if not stripped:
+        return ""
+    for boundary in (r"(.)([A-Z][a-z]+)", r"([a-z0-9])([A-Z])"):
+        stripped = re.sub(boundary, r"\1_\2", stripped)
+    snake = re.sub(r"[^A-Za-z0-9]+", "_", stripped)
+    return snake.strip("_").casefold()
+
+
+def compact_semantic_folder_field_name(value: Any) -> str:
+    """Return the canonical field name with every non-``[a-z0-9]`` run removed."""
+    canonical = canonical_semantic_folder_field_name(value)
+    return re.sub(r"[^a-z0-9]+", "", canonical)
+
+
+def semantic_folder_field_identity_keys(value: Any) -> frozenset[str]:
+    """Return the non-empty identity keys of a field name.
+
+    The candidates are the canonical (snake_case) form and the compact form;
+    an empty candidate is left out.
+    """
+    candidates = (
+        canonical_semantic_folder_field_name(value),
+        compact_semantic_folder_field_name(value),
+    )
+    return frozenset(filter(None, candidates))
+
+
+def semantic_folder_field_identity_set(fields: Iterable[Any]) -> frozenset[str]:
+    """Return the union of the identity keys of every entry in ``fields``."""
+    return frozenset(
+        identity
+        for field in fields
+        for identity in semantic_folder_field_identity_keys(field)
+    )
+
+
+# Identity keys (canonical and compact forms) of every forbidden field name,
+# computed once at import time for the membership test below.
+SEMANTIC_FOLDER_FORBIDDEN_FIELD_IDENTITIES = semantic_folder_field_identity_set(
+    SEMANTIC_FOLDER_FORBIDDEN_FIELDS
+)
+
+
+def is_semantic_folder_forbidden_field(value: Any) -> bool:
+    """Return True when any identity key of ``value`` is a forbidden identity."""
+    identities = semantic_folder_field_identity_keys(value)
+    return not identities.isdisjoint(SEMANTIC_FOLDER_FORBIDDEN_FIELD_IDENTITIES)
+
+
+def semantic_folder_allowed_extension_fields(fields: Iterable[Any]) -> set[str]:
+    """Return the canonical names of ``fields`` that are non-empty and not forbidden."""
+    named_fields = (
+        (field, canonical_semantic_folder_field_name(field)) for field in fields
+    )
+    return {
+        name
+        for field, name in named_fields
+        if name and not is_semantic_folder_forbidden_field(field)
+    }
diff --git a/pageindex/filesystem/semantic_index.py b/pageindex/filesystem/semantic_index.py
new file mode 100644
index 000000000..2453e1f35
--- /dev/null
+++ b/pageindex/filesystem/semantic_index.py
@@ -0,0 +1,362 @@
+from __future__ import annotations
+
+import hashlib
+import json
+import sqlite3
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Protocol
+
+import sqlite_vec
+
+
+class SemanticIndexError(RuntimeError):
+    """Raised for invalid semantic index state or input (e.g. a dimension mismatch)."""
+
+
+@dataclass(frozen=True)
+class SemanticIndexRecord:
+    """One document to upsert into the semantic index, keyed by ``file_ref``."""
+
+    file_ref: str  # unique key of the document row
+    vector: list[float]  # embedding; its length must equal the index dimension
+    text: str  # only its SHA-256 hash and character count are stored
+    external_id: str | None = None
+    source_type: str = ""  # also the partition key of the vector table
+    source_path: str = ""
+    title: str = ""
+    metadata: dict[str, Any] | None = None  # stored as JSON; None is stored as {}
+
+
+@dataclass(frozen=True)
+class SemanticSearchResult:
+    """One hit returned by a semantic index search."""
+
+    file_ref: str
+    distance: float  # distance reported by the vector query (results sort ascending)
+    external_id: str | None
+    source_type: str
+    source_path: str
+    title: str
+    text_hash: str  # hash of the indexed text as stored at upsert time
+    metadata: dict[str, Any]  # decoded from the stored metadata JSON
+
+
+class RebuildableSemanticIndex(Protocol):
+    """Structural interface of an index that can be reset, filled, searched and inspected."""
+
+    def reset(self, *, dimension: int, metadata: dict[str, Any] | None = None) -> None:
+        # Reinitialize the index for vectors of the given dimension.
+        ...
+
+    def upsert_many(self, records: list[SemanticIndexRecord]) -> int:
+        # Write the records and return a count.
+        ...
+
+    def search(
+        self,
+        vector: list[float],
+        *,
+        limit: int = 10,
+        filters: dict[str, Any] | None = None,
+        fetch_multiplier: int = 20,
+    ) -> list[SemanticSearchResult]:
+        # Return up to ``limit`` results for the query vector.
+        ...
+
+    def info(self) -> dict[str, Any]:
+        # Return a description of the index.
+        ...
+
+
+class SQLiteVecSemanticIndex:
+    """Rebuildable local semantic index backed by sqlite-vec.
+
+    This is intentionally separate from the PIFS catalog tables. The catalog
+    remains source of truth; this file is a rebuildable recall index.
+    """
+
+    def __init__(self, db_path: str | Path):
+        """Remember the user-expanded database path and create its parent directory.
+
+        The database file itself is not created here.
+        """
+        resolved = Path(db_path).expanduser()
+        resolved.parent.mkdir(parents=True, exist_ok=True)
+        self.db_path = resolved
+
+    def reset(self, *, dimension: int, metadata: dict[str, Any] | None = None) -> None:
+        """Drop and recreate all index tables for vectors of size ``dimension``.
+
+        Any existing rows are discarded. The dimension, the adapter name and
+        version, and the JSON-encoded ``metadata`` are stored in
+        ``semantic_index_config``.
+
+        Raises:
+            SemanticIndexError: If ``dimension`` is not positive.
+        """
+        from contextlib import closing
+
+        if dimension <= 0:
+            raise SemanticIndexError("semantic index dimension must be positive")
+        # ``with conn`` only scopes the transaction; ``closing`` also releases
+        # the connection, which the sqlite3 context manager never does.
+        with closing(self.connect()) as conn, conn:
+            conn.executescript(
+                """
+                DROP TABLE IF EXISTS semantic_index_vec;
+                DROP TABLE IF EXISTS semantic_index_docs;
+                DROP TABLE IF EXISTS semantic_index_config;
+                CREATE TABLE semantic_index_config (
+                    key TEXT PRIMARY KEY,
+                    value TEXT NOT NULL
+                );
+                CREATE TABLE semantic_index_docs (
+                    rowid INTEGER PRIMARY KEY,
+                    file_ref TEXT NOT NULL UNIQUE,
+                    external_id TEXT,
+                    source_type TEXT NOT NULL DEFAULT '',
+                    source_path TEXT NOT NULL DEFAULT '',
+                    title TEXT NOT NULL DEFAULT '',
+                    text_hash TEXT NOT NULL,
+                    text_chars INTEGER NOT NULL DEFAULT 0,
+                    metadata_json TEXT NOT NULL DEFAULT '{}',
+                    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+                    updated_at TEXT DEFAULT CURRENT_TIMESTAMP
+                );
+                CREATE INDEX idx_semantic_index_docs_file_ref
+                  ON semantic_index_docs(file_ref);
+                CREATE INDEX idx_semantic_index_docs_external_id
+                  ON semantic_index_docs(external_id);
+                CREATE INDEX idx_semantic_index_docs_source_type
+                  ON semantic_index_docs(source_type);
+                """
+            )
+            # The vector width is part of the virtual table definition, so it
+            # is formatted into the DDL; ``dimension`` was validated above.
+            conn.execute(
+                "CREATE VIRTUAL TABLE semantic_index_vec USING "
+                f"vec0(source_type TEXT partition key, embedding float[{dimension}])"
+            )
+            config = {
+                "dimension": str(dimension),
+                "adapter": "sqlite-vec",
+                "adapter_version": sqlite_vec.__version__,
+                "metadata": json.dumps(metadata or {}, ensure_ascii=False, sort_keys=True),
+            }
+            conn.executemany(
+                "INSERT INTO semantic_index_config(key, value) VALUES (?, ?)",
+                sorted(config.items()),
+            )
+            conn.commit()
+
+    def upsert_many(self, records: list[SemanticIndexRecord]) -> int:
+        """Insert or update ``records`` and their vectors in one transaction.
+
+        Returns the number of records written (0 for an empty list, in which
+        case the database is not touched).
+
+        Raises:
+            SemanticIndexError: If no dimension is stored for the index, or a
+                record's vector length differs from it; the transaction is
+                rolled back in that case.
+        """
+        from contextlib import closing
+
+        if not records:
+            return 0
+        dimension = self.dimension()
+        # ``with conn`` commits or rolls back; ``closing`` additionally closes
+        # the connection, which the sqlite3 context manager never does.
+        with closing(self.connect()) as conn, conn:
+            inserted = 0
+            for record in records:
+                if len(record.vector) != dimension:
+                    raise SemanticIndexError(
+                        f"vector dimension mismatch for {record.file_ref}: "
+                        f"expected {dimension}, got {len(record.vector)}"
+                    )
+                rowid = self._upsert_doc(conn, record)
+                # Replace the vector row: delete any previous one, then insert.
+                conn.execute("DELETE FROM semantic_index_vec WHERE rowid = ?", (rowid,))
+                conn.execute(
+                    "INSERT INTO semantic_index_vec(rowid, source_type, embedding) VALUES (?, ?, ?)",
+                    (
+                        rowid,
+                        record.source_type,
+                        sqlite_vec.serialize_float32(record.vector),
+                    ),
+                )
+                inserted += 1
+            conn.commit()
+            return inserted
+
+    def search(
+        self,
+        vector: list[float],
+        *,
+        limit: int = 10,
+        filters: dict[str, Any] | None = None,
+        fetch_multiplier: int = 20,
+    ) -> list[SemanticSearchResult]:
+        """Return up to ``limit`` nearest documents for ``vector``.
+
+        ``limit * fetch_multiplier`` candidates (capped at 4096) are fetched so
+        that post-filtering still has rows to choose from. A ``source_type``
+        filter is applied inside the vector query, once per requested value;
+        every filter is then checked against the row columns or, failing that,
+        the decoded metadata, compared as strings.
+
+        Raises:
+            SemanticIndexError: If ``vector`` does not match the index dimension.
+        """
+        from contextlib import closing
+
+        dimension = self.dimension()
+        if len(vector) != dimension:
+            raise SemanticIndexError(
+                f"query vector dimension mismatch: expected {dimension}, got {len(vector)}"
+            )
+        active_filters = filters or {}
+        fetch_k = min(4096, max(limit, limit * max(fetch_multiplier, 1)))
+        source_types = _source_type_filters(active_filters)
+        # ``with conn`` only scopes the transaction; ``closing`` also releases
+        # the connection, which the sqlite3 context manager never does.
+        with closing(self.connect()) as conn, conn:
+            rows = []
+            if source_types:
+                for source_type in source_types:
+                    rows.extend(
+                        conn.execute(
+                            """
+                            SELECT
+                                d.file_ref,
+                                d.external_id,
+                                d.source_type,
+                                d.source_path,
+                                d.title,
+                                d.text_hash,
+                                d.metadata_json,
+                                v.distance
+                            FROM semantic_index_vec v
+                            JOIN semantic_index_docs d ON d.rowid = v.rowid
+                            WHERE v.embedding MATCH ? AND k = ? AND v.source_type = ?
+                            ORDER BY v.distance
+                            """,
+                            (sqlite_vec.serialize_float32(vector), fetch_k, source_type),
+                        ).fetchall()
+                    )
+                # Merge the per-source-type result lists into one ranking.
+                rows.sort(key=lambda row: float(row["distance"]))
+            else:
+                rows = conn.execute(
+                    """
+                    SELECT
+                        d.file_ref,
+                        d.external_id,
+                        d.source_type,
+                        d.source_path,
+                        d.title,
+                        d.text_hash,
+                        d.metadata_json,
+                        v.distance
+                    FROM semantic_index_vec v
+                    JOIN semantic_index_docs d ON d.rowid = v.rowid
+                    WHERE v.embedding MATCH ? AND k = ?
+                    ORDER BY v.distance
+                    """,
+                    (sqlite_vec.serialize_float32(vector), fetch_k),
+                ).fetchall()
+        results: list[SemanticSearchResult] = []
+        for row in rows:
+            metadata = _json_obj(row["metadata_json"])
+            if not _matches_filters(row, metadata, active_filters):
+                continue
+            results.append(
+                SemanticSearchResult(
+                    file_ref=row["file_ref"],
+                    distance=float(row["distance"]),
+                    external_id=row["external_id"],
+                    source_type=row["source_type"],
+                    source_path=row["source_path"],
+                    title=row["title"],
+                    text_hash=row["text_hash"],
+                    metadata=metadata,
+                )
+            )
+            if len(results) >= limit:
+                break
+        return results
+
+    def info(self) -> dict[str, Any]:
+        """Return the stored index configuration and the document count.
+
+        The result has the keys ``db_path``, ``adapter``, ``adapter_version``,
+        ``dimension``, ``document_count`` and ``metadata``. ``metadata`` is
+        always a dict: undecodable or non-object JSON yields ``{}``.
+        """
+        from contextlib import closing
+
+        # ``with conn`` only scopes the transaction; ``closing`` also releases
+        # the connection, which the sqlite3 context manager never does.
+        with closing(self.connect()) as conn, conn:
+            config = {
+                row["key"]: row["value"]
+                for row in conn.execute(
+                    "SELECT key, value FROM semantic_index_config ORDER BY key"
+                ).fetchall()
+            }
+            count = conn.execute("SELECT COUNT(*) FROM semantic_index_docs").fetchone()[0]
+        parsed_metadata: Any
+        try:
+            parsed_metadata = json.loads(config.get("metadata", "{}"))
+        except json.JSONDecodeError:
+            parsed_metadata = {}
+        if not isinstance(parsed_metadata, dict):
+            # Valid JSON that is not an object would break the declared type.
+            parsed_metadata = {}
+        return {
+            "db_path": str(self.db_path),
+            "adapter": config.get("adapter", "sqlite-vec"),
+            "adapter_version": config.get("adapter_version", ""),
+            "dimension": int(config.get("dimension", "0") or 0),
+            "document_count": count,
+            "metadata": parsed_metadata,
+        }
+
+    def dimension(self) -> int:
+        """Return the vector dimension stored in ``semantic_index_config``.
+
+        Raises:
+            SemanticIndexError: If the config table has no ``dimension`` row.
+        """
+        from contextlib import closing
+
+        # ``with conn`` only scopes the transaction; ``closing`` also releases
+        # the connection, which the sqlite3 context manager never does.
+        with closing(self.connect()) as conn, conn:
+            row = conn.execute(
+                "SELECT value FROM semantic_index_config WHERE key = 'dimension'"
+            ).fetchone()
+        if row is None:
+            raise SemanticIndexError(
+                f"semantic index is not initialized; call reset() first: {self.db_path}"
+            )
+        return int(row["value"])
+
+    def connect(self) -> sqlite3.Connection:
+        """Open a connection with ``sqlite3.Row`` rows and sqlite-vec loaded.
+
+        Extension loading is switched off again even when loading sqlite-vec
+        fails, and the connection is closed before such a failure propagates.
+        The caller owns the returned connection and must close it.
+        """
+        conn = sqlite3.connect(self.db_path)
+        try:
+            conn.row_factory = sqlite3.Row
+            conn.enable_load_extension(True)
+            try:
+                sqlite_vec.load(conn)
+            finally:
+                # Never leave extension loading enabled on a live connection.
+                conn.enable_load_extension(False)
+        except BaseException:
+            conn.close()
+            raise
+        return conn
+
+    @staticmethod
+    def text_hash(text: str) -> str:
+        """Return the hex SHA-256 digest of ``text`` encoded as UTF-8."""
+        digest = hashlib.sha256()
+        digest.update(text.encode("utf-8"))
+        return digest.hexdigest()
+
+    @staticmethod
+    def _upsert_doc(conn: sqlite3.Connection, record: SemanticIndexRecord) -> int:
+        """Insert or update the docs row for ``record`` and return its rowid.
+
+        The row is looked up by ``file_ref``. Only the text's hash and character
+        count are stored, not the text itself. Nothing is committed here.
+        """
+        existing = conn.execute(
+            "SELECT rowid FROM semantic_index_docs WHERE file_ref = ?",
+            (record.file_ref,),
+        ).fetchone()
+        metadata_json = json.dumps(record.metadata or {}, ensure_ascii=False, sort_keys=True)
+        text_hash = SQLiteVecSemanticIndex.text_hash(record.text)
+        if existing is None:
+            # New file_ref: insert and report the rowid SQLite assigned.
+            cursor = conn.execute(
+                """
+                INSERT INTO semantic_index_docs(
+                    file_ref, external_id, source_type, source_path, title,
+                    text_hash, text_chars, metadata_json
+                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                """,
+                (
+                    record.file_ref,
+                    record.external_id,
+                    record.source_type,
+                    record.source_path,
+                    record.title,
+                    text_hash,
+                    len(record.text),
+                    metadata_json,
+                ),
+            )
+            return int(cursor.lastrowid)
+        # Known file_ref: keep its rowid and overwrite every mutable column.
+        rowid = int(existing["rowid"])
+        conn.execute(
+            """
+            UPDATE semantic_index_docs
+            SET external_id = ?,
+                source_type = ?,
+                source_path = ?,
+                title = ?,
+                text_hash = ?,
+                text_chars = ?,
+                metadata_json = ?,
+                updated_at = CURRENT_TIMESTAMP
+            WHERE rowid = ?
+            """,
+            (
+                record.external_id,
+                record.source_type,
+                record.source_path,
+                record.title,
+                text_hash,
+                len(record.text),
+                metadata_json,
+                rowid,
+            ),
+        )
+        return rowid
+
+
+def _json_obj(text: str | None) -> dict[str, Any]:
+    """Decode ``text`` as a JSON object.
+
+    Returns ``{}`` when ``text`` is empty, is not valid JSON, or decodes to
+    anything other than a dict.
+    """
+    if not text:
+        return {}
+    try:
+        decoded = json.loads(text)
+    except json.JSONDecodeError:
+        decoded = None
+    if isinstance(decoded, dict):
+        return decoded
+    return {}
+
+
+def _matches_filters(
+    row: sqlite3.Row,
+    metadata: dict[str, Any],
+    filters: dict[str, Any],
+) -> bool:
+    """Return True when ``row``/``metadata`` satisfy every entry of ``filters``.
+
+    A filter key is read from the row when it is one of its columns, otherwise
+    from ``metadata``. Values are compared as strings; a list filter matches
+    when the value equals any of its items.
+    """
+    columns = set(row.keys())
+    for key, expected in filters.items():
+        actual = str(row[key] if key in columns else metadata.get(key))
+        if isinstance(expected, list):
+            accepted = {str(item) for item in expected}
+        else:
+            accepted = {str(expected)}
+        if actual not in accepted:
+            return False
+    return True
+
+
+def _source_type_filters(filters: dict[str, Any]) -> list[str]:
+    """Return the non-empty ``source_type`` filter values as strings.
+
+    A missing or ``None`` filter yields ``[]``; a scalar is treated as a
+    one-item list; items whose string form is empty are dropped.
+    """
+    requested = filters.get("source_type")
+    if requested is None:
+        return []
+    candidates = requested if isinstance(requested, list) else [requested]
+    return [text for text in map(str, candidates) if text]
diff --git a/pageindex/filesystem/store.py b/pageindex/filesystem/store.py
new file mode 100644
index 000000000..7517d70ed
--- /dev/null
+++ b/pageindex/filesystem/store.py
@@ -0,0 +1,1928 @@
+from __future__ import annotations
+
+import hashlib
+import json
+import re
+import sqlite3
+from pathlib import Path
+from typing import Any, Iterable, Optional
+
+from .types import FileEntry, MetadataField
+
+# Written to SQLite's ``PRAGMA user_version`` by ``initialize_schema``.
+SCHEMA_VERSION = 1
+
+
+class SQLiteFileSystemStore:
+    def __init__(self, workspace: str | Path):
+        """Prepare the workspace directories and make sure the schema exists.
+
+        Creates ``workspace`` plus ``artifacts/text``, ``artifacts/raw`` and
+        ``artifacts/pageindex_client`` beneath it, then runs
+        ``initialize_schema`` against ``<workspace>/filesystem.sqlite``.
+        """
+        self.workspace = Path(workspace).expanduser()
+        self.workspace.mkdir(parents=True, exist_ok=True)
+        self.db_path = self.workspace / "filesystem.sqlite"
+        artifacts = self.workspace / "artifacts"
+        self.text_dir = artifacts / "text"
+        self.raw_dir = artifacts / "raw"
+        self.pageindex_client_dir = artifacts / "pageindex_client"
+        for directory in (self.text_dir, self.raw_dir, self.pageindex_client_dir):
+            directory.mkdir(parents=True, exist_ok=True)
+        self.initialize_schema()
+
+    def connect(self) -> sqlite3.Connection:
+        """Open a connection with ``sqlite3.Row`` rows and foreign keys enforced.
+
+        The caller owns the returned connection.
+        """
+        connection = sqlite3.connect(self.db_path)
+        connection.row_factory = sqlite3.Row
+        # SQLite leaves foreign-key enforcement off unless enabled per connection.
+        connection.execute("PRAGMA foreign_keys = ON")
+        return connection
+
+    def initialize_schema(self) -> None:
+        """Create the schema and the root folder, then stamp ``user_version``."""
+        from contextlib import closing
+
+        # ``with conn`` only commits or rolls back; ``closing`` also releases
+        # the connection, which the sqlite3 context manager never does.
+        with closing(self.connect()) as conn, conn:
+            self._create_current_schema(conn)
+            self.ensure_folder(conn, "/")
+            # PRAGMA values cannot be bound; SCHEMA_VERSION is a module constant.
+            conn.execute(f"PRAGMA user_version = {SCHEMA_VERSION}")
+
+    def _create_current_schema(self, conn: sqlite3.Connection) -> None:
+        """Create every table, the FTS table and all indexes if they are missing.
+
+        Also makes sure the ``default`` metadata schema row exists. All
+        statements use ``IF NOT EXISTS`` / ``INSERT OR IGNORE``, so running this
+        against an already initialized database changes nothing.
+        """
+        conn.executescript(
+            """
+            CREATE TABLE IF NOT EXISTS files (
+                file_ref TEXT PRIMARY KEY,
+                external_id TEXT,
+                storage_uri TEXT NOT NULL,
+                source_path TEXT NOT NULL,
+                title TEXT NOT NULL,
+                descriptor TEXT NOT NULL,
+                content_type TEXT NOT NULL,
+                source_type TEXT,
+                fingerprint TEXT NOT NULL,
+                text_artifact_path TEXT NOT NULL,
+                raw_artifact_path TEXT,
+                pageindex_doc_id TEXT,
+                pageindex_tree_status TEXT NOT NULL DEFAULT 'not_built',
+                metadata_json TEXT NOT NULL DEFAULT '{}',
+                metadata_status_json TEXT NOT NULL DEFAULT '{}',
+                created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+                updated_at TEXT DEFAULT CURRENT_TIMESTAMP,
+                deleted_at TEXT
+            );
+
+            CREATE TABLE IF NOT EXISTS folders (
+                folder_id TEXT PRIMARY KEY,
+                parent_id TEXT,
+                name TEXT NOT NULL,
+                path TEXT NOT NULL UNIQUE,
+                description TEXT NOT NULL DEFAULT '',
+                kind TEXT NOT NULL DEFAULT 'manual',
+                metadata_json TEXT NOT NULL DEFAULT '{}',
+                created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+                updated_at TEXT DEFAULT CURRENT_TIMESTAMP,
+                FOREIGN KEY(parent_id) REFERENCES folders(folder_id)
+            );
+
+            CREATE TABLE IF NOT EXISTS file_folders (
+                file_ref TEXT NOT NULL,
+                folder_id TEXT NOT NULL,
+                metadata_json TEXT NOT NULL DEFAULT '{}',
+                created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+                PRIMARY KEY (file_ref, folder_id),
+                FOREIGN KEY(file_ref) REFERENCES files(file_ref) ON DELETE CASCADE,
+                FOREIGN KEY(folder_id) REFERENCES folders(folder_id) ON DELETE CASCADE
+            );
+
+            CREATE TABLE IF NOT EXISTS metadata_schema (
+                schema_id TEXT PRIMARY KEY,
+                scope_path TEXT,
+                version INTEGER NOT NULL DEFAULT 1,
+                status TEXT NOT NULL DEFAULT 'active',
+                created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+                updated_at TEXT DEFAULT CURRENT_TIMESTAMP
+            );
+
+            CREATE TABLE IF NOT EXISTS metadata_fields (
+                field_id TEXT PRIMARY KEY,
+                schema_id TEXT NOT NULL DEFAULT 'default',
+                name TEXT NOT NULL,
+                type TEXT NOT NULL,
+                description TEXT NOT NULL DEFAULT '',
+                indexed INTEGER NOT NULL DEFAULT 1,
+                faceted INTEGER NOT NULL DEFAULT 0,
+                sortable INTEGER NOT NULL DEFAULT 0,
+                source TEXT NOT NULL DEFAULT 'manual',
+                created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+                updated_at TEXT DEFAULT CURRENT_TIMESTAMP,
+                UNIQUE(schema_id, name),
+                FOREIGN KEY(schema_id) REFERENCES metadata_schema(schema_id)
+            );
+
+            CREATE TABLE IF NOT EXISTS metadata_values (
+                file_ref TEXT NOT NULL,
+                field_id TEXT NOT NULL,
+                value_text TEXT,
+                value_number REAL,
+                value_bool INTEGER,
+                value_json TEXT,
+                created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+                FOREIGN KEY(file_ref) REFERENCES files(file_ref) ON DELETE CASCADE,
+                FOREIGN KEY(field_id) REFERENCES metadata_fields(field_id) ON DELETE CASCADE
+            );
+
+            CREATE VIRTUAL TABLE IF NOT EXISTS file_fts
+            USING fts5(file_ref UNINDEXED, title, body, metadata_text);
+
+            CREATE INDEX IF NOT EXISTS idx_files_external_id ON files(external_id);
+            CREATE INDEX IF NOT EXISTS idx_files_source_path ON files(source_path);
+            CREATE INDEX IF NOT EXISTS idx_files_source_type ON files(source_type);
+            CREATE INDEX IF NOT EXISTS idx_folders_path ON folders(path);
+            CREATE INDEX IF NOT EXISTS idx_folders_parent_id ON folders(parent_id);
+            CREATE INDEX IF NOT EXISTS idx_file_folders_folder ON file_folders(folder_id);
+            CREATE INDEX IF NOT EXISTS idx_metadata_fields_name ON metadata_fields(name);
+            CREATE INDEX IF NOT EXISTS idx_metadata_values_field_text ON metadata_values(field_id, value_text);
+            CREATE INDEX IF NOT EXISTS idx_metadata_values_field_number ON metadata_values(field_id, value_number);
+            """
+        )
+        # metadata_fields rows default to schema_id 'default', so that schema
+        # row must exist for the foreign key to hold.
+        conn.execute(
+            """
+            INSERT OR IGNORE INTO metadata_schema(schema_id, scope_path, version, status)
+            VALUES ('default', NULL, 1, 'active')
+            """
+        )
+
+    @staticmethod
+    def _json_object(value: Any) -> dict[str, Any]:
+        """Coerce ``value`` to a dict.
+
+        A string is decoded as JSON (the empty string counts as ``{}``); any
+        other value is used as is. Invalid JSON and non-dict results yield
+        ``{}``.
+        """
+        candidate = value
+        if isinstance(value, str):
+            try:
+                candidate = json.loads(value or "{}")
+            except json.JSONDecodeError:
+                return {}
+        if isinstance(candidate, dict):
+            return candidate
+        return {}
+
+    @staticmethod
+    def _columns(conn: sqlite3.Connection, table: str) -> set[str]:
+        """Return the column names of ``table`` (an empty set if it does not exist).
+
+        The ``pragma_table_info`` table-valued function is used so the table
+        name is bound as a parameter instead of being formatted into the SQL
+        text, which also makes names that need quoting work.
+        """
+        rows = conn.execute("SELECT name FROM pragma_table_info(?)", (table,)).fetchall()
+        # Index by position so this works with and without a Row factory.
+        return {row[0] for row in rows}
+
+    def insert_file(self, record: dict[str, Any]) -> None:
+        """Insert one file record by delegating to ``insert_files``."""
+        single_batch = [record]
+        self.insert_files(single_batch)
+
+    def insert_files(self, records: list[dict[str, Any]]) -> None:
+        """Insert or replace a batch of file records in one transaction.
+
+        For each record this writes the ``files`` row, its membership in
+        ``record["folder_path"]`` (the folder is ensured first), its indexed
+        metadata values for fields registered in the ``default`` schema, and,
+        unless ``record["skip_fts"]`` is truthy, its full-text row.
+
+        Indexed metadata comes from ``record["indexed_metadata"]`` when that key
+        is present and from ``record["metadata"]`` otherwise. An empty batch is
+        a no-op.
+        """
+        from contextlib import closing
+
+        if not records:
+            return
+        # ``with conn`` only commits or rolls back; ``closing`` also releases
+        # the connection, which the sqlite3 context manager never does.
+        with closing(self.connect()) as conn, conn:
+            conn.execute("PRAGMA temp_store = MEMORY")
+            folder_cache: dict[tuple[str, str], str] = {}
+            file_rows = []
+            membership_rows = []
+            file_ref_rows = []
+            fts_file_ref_rows = []
+            fts_rows = []
+            metadata_rows = []
+            # Resolve registered field names once instead of once per record.
+            metadata_field_ids = {
+                row["name"]: row["field_id"]
+                for row in conn.execute(
+                    "SELECT name, field_id FROM metadata_fields WHERE schema_id = 'default'"
+                ).fetchall()
+            }
+            for record in records:
+                folder_cache_key = (record["folder_path"], record.get("folder_kind", "manual"))
+                folder_id = folder_cache.get(folder_cache_key)
+                if folder_id is None:
+                    folder_id = self.ensure_folder(
+                        conn,
+                        record["folder_path"],
+                        kind=record.get("folder_kind", "manual"),
+                    )
+                    folder_cache[folder_cache_key] = folder_id
+                file_rows.append(self._file_insert_values(record))
+                membership_rows.append(
+                    (
+                        record["file_ref"],
+                        folder_id,
+                        json.dumps(record.get("folder_metadata") or {}, ensure_ascii=False),
+                    )
+                )
+                file_ref_rows.append((record["file_ref"],))
+                if not record.get("skip_fts", False):
+                    fts_file_ref_rows.append((record["file_ref"],))
+                    fts_rows.append(
+                        (
+                            record["file_ref"],
+                            record["title"],
+                            record["content"],
+                            record["metadata_text"],
+                        )
+                    )
+                # ``dict.get(key, default)`` evaluates the default eagerly, so
+                # the fallback key is only touched when it is actually needed.
+                indexed_metadata = (
+                    record["indexed_metadata"]
+                    if "indexed_metadata" in record
+                    else record["metadata"]
+                )
+                metadata_rows.extend(
+                    self._metadata_insert_values(
+                        record["file_ref"],
+                        indexed_metadata,
+                        metadata_field_ids,
+                    )
+                )
+            # NOTE(review): with foreign keys enforced, INSERT OR REPLACE on an
+            # existing file_ref presumably cascades and drops that file's rows in
+            # file_folders/metadata_values before they are rewritten below, so
+            # memberships in other folders would be lost — confirm this is intended.
+            conn.executemany(self._file_insert_sql(), file_rows)
+            conn.executemany(
+                """
+                INSERT OR REPLACE INTO file_folders(file_ref, folder_id, metadata_json)
+                VALUES (?, ?, ?)
+                """,
+                membership_rows,
+            )
+            conn.executemany("DELETE FROM metadata_values WHERE file_ref = ?", file_ref_rows)
+            if metadata_rows:
+                conn.executemany(
+                    """
+                    INSERT INTO metadata_values(
+                        file_ref, field_id, value_text, value_number, value_bool, value_json
+                    ) VALUES (?, ?, ?, ?, ?, ?)
+                    """,
+                    metadata_rows,
+                )
+            if fts_file_ref_rows:
+                conn.executemany("DELETE FROM file_fts WHERE file_ref = ?", fts_file_ref_rows)
+                conn.executemany(
+                    """
+                    INSERT INTO file_fts(file_ref, title, body, metadata_text)
+                    VALUES (?, ?, ?, ?)
+                    """,
+                    fts_rows,
+                )
+
+    @staticmethod
+    def _file_insert_sql() -> str:
+        """Return the ``INSERT OR REPLACE`` statement for one ``files`` row.
+
+        The bound columns get ``?`` placeholders in the order produced by
+        ``_file_insert_values``; ``deleted_at`` is set to ``NULL`` and
+        ``updated_at`` to ``CURRENT_TIMESTAMP``.
+        """
+        bound_columns = (
+            "file_ref",
+            "external_id",
+            "storage_uri",
+            "source_path",
+            "title",
+            "descriptor",
+            "content_type",
+            "source_type",
+            "fingerprint",
+            "text_artifact_path",
+            "raw_artifact_path",
+            "pageindex_doc_id",
+            "pageindex_tree_status",
+            "metadata_json",
+            "metadata_status_json",
+        )
+        columns = [*bound_columns, "deleted_at", "updated_at"]
+        markers = ["?" for _ in bound_columns]
+        markers += ["NULL", "CURRENT_TIMESTAMP"]
+        placeholders = ", ".join(markers)
+        return f"""
+            INSERT OR REPLACE INTO files ({", ".join(columns)})
+            VALUES ({placeholders})
+        """
+
+    @staticmethod
+    def _file_insert_values(record: dict[str, Any]) -> tuple[Any, ...]:
+        """Return the bound values for ``_file_insert_sql`` in column order.
+
+        ``pageindex_doc_id``, ``pageindex_tree_status`` and
+        ``metadata_status_json`` are optional (defaults ``None``,
+        ``"not_built"`` and ``"{}"``); every other key must be present.
+        """
+        required_keys = (
+            "file_ref",
+            "external_id",
+            "storage_uri",
+            "source_path",
+            "title",
+            "descriptor",
+            "content_type",
+            "source_type",
+            "fingerprint",
+            "text_artifact_path",
+            "raw_artifact_path",
+        )
+        leading = tuple(record[key] for key in required_keys)
+        trailing = (
+            record.get("pageindex_doc_id"),
+            record.get("pageindex_tree_status", "not_built"),
+            record["metadata_json"],
+            record.get("metadata_status_json", "{}"),
+        )
+        return leading + trailing
+
+    def _metadata_insert_values(
+        self,
+        file_ref: str,
+        metadata: dict[str, Any],
+        metadata_field_ids: dict[str, str],
+    ) -> list[tuple[Any, ...]]:
+        """Build the ``metadata_values`` rows for one file.
+
+        Names that fail ``_valid_field_name`` or have no entry in
+        ``metadata_field_ids`` are skipped. Each item produced by
+        ``_metadata_value_items`` for a value becomes one row.
+        """
+        rows: list[tuple[Any, ...]] = []
+        for name, value in metadata.items():
+            field_id = metadata_field_ids.get(name) if self._valid_field_name(name) else None
+            if field_id is None:
+                continue
+            rows.extend(
+                (
+                    file_ref,
+                    field_id,
+                    item["value_text"],
+                    item["value_number"],
+                    item["value_bool"],
+                    item["value_json"],
+                )
+                for item in self._metadata_value_items(value)
+            )
+        return rows
+
+    def create_folder(
+        self,
+        path: str,
+        *,
+        kind: str = "manual",
+        description: str = "",
+        metadata: dict[str, Any] | None = None,
+    ) -> str:
+        """Ensure the folder at ``path`` exists and return ``ensure_folder``'s result.
+
+        Runs on its own connection, which is committed and closed before
+        returning.
+        """
+        from contextlib import closing
+
+        # ``with conn`` only commits or rolls back; ``closing`` also releases
+        # the connection, which the sqlite3 context manager never does.
+        with closing(self.connect()) as conn, conn:
+            return self.ensure_folder(
+                conn,
+                path,
+                kind=kind,
+                description=description,
+                metadata=metadata,
+            )
+
+    def attach_file_to_folder(
+        self,
+        file_ref: str,
+        folder_path_or_id: str,
+        *,
+        metadata: dict[str, Any] | None = None,
+    ) -> None:
+        """Attach a file to a folder, creating or updating the membership row.
+
+        The file reference and the folder are resolved first (the folder via
+        ``_resolve_or_create_folder``). An existing membership keeps its row and
+        only gets its ``metadata_json`` replaced.
+        """
+        from contextlib import closing
+
+        # ``with conn`` only commits or rolls back; ``closing`` also releases
+        # the connection, which the sqlite3 context manager never does.
+        with closing(self.connect()) as conn, conn:
+            resolved_file_ref = self._resolve_file_ref(conn, file_ref)
+            folder_id = self._resolve_or_create_folder(conn, folder_path_or_id)
+            conn.execute(
+                """
+                INSERT INTO file_folders(file_ref, folder_id, metadata_json)
+                VALUES (?, ?, ?)
+                ON CONFLICT(file_ref, folder_id) DO UPDATE SET
+                    metadata_json = excluded.metadata_json
+                """,
+                (
+                    resolved_file_ref,
+                    folder_id,
+                    json.dumps(metadata or {}, ensure_ascii=False),
+                ),
+            )
+
+    def attach_files_to_folders(self, items: list[dict[str, Any]]) -> None:
+        """Attach several files to folders over a single connection.
+
+        Each item needs ``"file_ref"`` and ``"folder"`` and may carry
+        ``"metadata"``. An existing membership only gets its ``metadata_json``
+        replaced.
+        """
+        from contextlib import closing
+
+        # ``with conn`` only commits or rolls back; ``closing`` also releases
+        # the connection, which the sqlite3 context manager never does.
+        with closing(self.connect()) as conn, conn:
+            for item in items:
+                resolved_file_ref = self._resolve_file_ref(conn, item["file_ref"])
+                folder_id = self._resolve_or_create_folder(conn, item["folder"])
+                conn.execute(
+                    """
+                    INSERT INTO file_folders(file_ref, folder_id, metadata_json)
+                    VALUES (?, ?, ?)
+                    ON CONFLICT(file_ref, folder_id) DO UPDATE SET
+                        metadata_json = excluded.metadata_json
+                    """,
+                    (
+                        resolved_file_ref,
+                        folder_id,
+                        json.dumps(item.get("metadata") or {}, ensure_ascii=False),
+                    ),
+                )
+
+    def replace_metadata_values(
+        self,
+        conn: sqlite3.Connection,
+        file_ref: str,
+        metadata: dict[str, Any],
+    ) -> None:
+        """Replace all ``metadata_values`` rows of ``file_ref`` with ``metadata``.
+
+        Existing rows are deleted first. Names that fail ``_valid_field_name``
+        or are not registered in the ``default`` schema are skipped. Works on
+        the caller's connection and does not commit.
+        """
+        conn.execute("DELETE FROM metadata_values WHERE file_ref = ?", (file_ref,))
+        for name, value in metadata.items():
+            if not self._valid_field_name(name):
+                continue
+            field_id = self._registered_field_id(conn, name)
+            if field_id is None:
+                continue
+            # One row per item returned by _metadata_value_items for this value.
+            for item in self._metadata_value_items(value):
+                conn.execute(
+                    """
+                    INSERT INTO metadata_values(
+                        file_ref, field_id, value_text, value_number, value_bool, value_json
+                    ) VALUES (?, ?, ?, ?, ?, ?)
+                    """,
+                    (
+                        file_ref,
+                        field_id,
+                        item["value_text"],
+                        item["value_number"],
+                        item["value_bool"],
+                        item["value_json"],
+                    ),
+                )
+
+    @staticmethod
+    def _registered_field_id(conn: sqlite3.Connection, name: str) -> str | None:
+        row = conn.execute(
+            """
+            SELECT field_id
+            FROM metadata_fields
+            WHERE schema_id = 'default' AND name = ?
+            """,
+            (name,),
+        ).fetchone()
+        return None if row is None else row["field_id"]
+
+    def replace_fts(self, conn: sqlite3.Connection, record: dict[str, Any]) -> None:
+        conn.execute("DELETE FROM file_fts WHERE file_ref = ?", (record["file_ref"],))
+        conn.execute(
+            """
+            INSERT INTO file_fts(file_ref, title, body, metadata_text)
+            VALUES (?, ?, ?, ?)
+            """,
+            (
+                record["file_ref"],
+                record["title"],
+                record["content"],
+                record["metadata_text"],
+            ),
+        )
+
+    def upsert_metadata_fields(
+        self,
+        fields: Iterable[MetadataField],
+        *,
+        conn: sqlite3.Connection | None = None,
+    ) -> None:
+        owns_connection = conn is None
+        if conn is None:
+            conn = self.connect()
+        try:
+            conn.execute(
+                """
+                INSERT OR IGNORE INTO metadata_schema(schema_id, scope_path, version, status)
+                VALUES ('default', NULL, 1, 'active')
+                """
+            )
+            for field in fields:
+                conn.execute(
+                    """
+                    INSERT INTO metadata_fields(
+                        field_id, schema_id, name, type, description,
+                        indexed, faceted, sortable, source, updated_at
+                    ) VALUES (?, 'default', ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
+                    ON CONFLICT(schema_id, name) DO UPDATE SET
+                        type = excluded.type,
+                        source = excluded.source,
+                        updated_at = CURRENT_TIMESTAMP
+                    """,
+                    (
+                        self.field_id(field.name),
+                        field.name,
+                        field.field_type,
+                        field.description,
+                        int(field.indexed),
+                        int(field.faceted),
+                        int(field.sortable),
+                        field.source,
+                    ),
+                )
+            if owns_connection:
+                conn.commit()
+        finally:
+            if owns_connection:
+                conn.close()
+
+    def metadata_field_exists(self, name: str) -> bool:
+        with self.connect() as conn:
+            row = conn.execute(
+                "SELECT 1 FROM metadata_fields WHERE schema_id = 'default' AND name = ?",
+                (name,),
+            ).fetchone()
+        return row is not None
+
+    def list_metadata_fields(self) -> list[MetadataField]:
+        with self.connect() as conn:
+            rows = conn.execute(
+                """
+                SELECT name, type, description, indexed, faceted, sortable, source
+                FROM metadata_fields
+                WHERE schema_id = 'default'
+                ORDER BY name
+                """
+            ).fetchall()
+        return [
+            MetadataField(
+                name=row["name"],
+                field_type=row["type"],
+                description=row["description"],
+                indexed=bool(row["indexed"]),
+                faceted=bool(row["faceted"]),
+                sortable=bool(row["sortable"]),
+                source=row["source"],
+            )
+            for row in rows
+        ]
+
+    def list_folder(
+        self,
+        path: str = "/",
+        recursive: bool = False,
+        limit: int = 100,
+        max_depth: int | None = None,
+    ) -> dict[str, Any]:
+        path = normalize_path(path)
+        if max_depth is not None and max_depth < 0:
+            raise ValueError("max_depth must be non-negative")
+        with self.connect() as conn:
+            folder = self._folder_by_path(conn, path)
+            if folder is None:
+                raise KeyError(f"Unknown folder path: {path}")
+            if recursive:
+                folder_depth_clause = ""
+                folder_depth_params: list[Any] = []
+                if max_depth is not None:
+                    if max_depth == 0:
+                        folder_depth_clause = "AND 0"
+                    else:
+                        folder_depth_clause = (
+                            f"AND ({self._folder_depth_sql('fo.path')} - ?) <= ?"
+                        )
+                        folder_depth_params = [self._folder_depth(path), max_depth]
+                folder_rows = conn.execute(
+                    f"""
+                    SELECT
+                        fo.folder_id,
+                        fo.parent_id,
+                        fo.name,
+                        fo.path,
+                        fo.description,
+                        fo.kind,
+                        fo.metadata_json,
+                        fo.created_at,
+                        fo.updated_at,
+                        (
+                            SELECT COUNT(DISTINCT child_ff.file_ref)
+                            FROM file_folders child_ff
+                            JOIN files child_file
+                              ON child_file.file_ref = child_ff.file_ref
+                             AND child_file.deleted_at IS NULL
+                            WHERE child_ff.folder_id = fo.folder_id
+                        ) AS file_count,
+                        (
+                            SELECT COUNT(*)
+                            FROM folders child_folder
+                            WHERE child_folder.parent_id = fo.folder_id
+                        ) AS children_count
+                    FROM folders fo
+                    WHERE fo.path != ? AND (fo.path LIKE ? ESCAPE '\\')
+                      {folder_depth_clause}
+                    ORDER BY fo.path
+                    LIMIT ?
+                    """,
+                    (path, self._descendant_like(path), *folder_depth_params, limit),
+                ).fetchall()
+                file_rows = self._file_rows_for_scope(
+                    conn,
+                    path,
+                    True,
+                    limit,
+                    max_depth=max_depth,
+                )
+            else:
+                folder_rows = conn.execute(
+                    """
+                    SELECT
+                        fo.folder_id,
+                        fo.parent_id,
+                        fo.name,
+                        fo.path,
+                        fo.description,
+                        fo.kind,
+                        fo.metadata_json,
+                        fo.created_at,
+                        fo.updated_at,
+                        (
+                            SELECT COUNT(DISTINCT child_ff.file_ref)
+                            FROM file_folders child_ff
+                            JOIN files child_file
+                              ON child_file.file_ref = child_ff.file_ref
+                             AND child_file.deleted_at IS NULL
+                            WHERE child_ff.folder_id = fo.folder_id
+                        ) AS file_count,
+                        (
+                            SELECT COUNT(*)
+                            FROM folders child_folder
+                            WHERE child_folder.parent_id = fo.folder_id
+                        ) AS children_count
+                    FROM folders fo
+                    WHERE fo.parent_id = ?
+                    ORDER BY fo.kind, fo.name
+                    LIMIT ?
+                    """,
+                    (folder["folder_id"], limit),
+                ).fetchall()
+                file_rows = self._file_rows_for_scope(conn, path, False, limit)
+        return {
+            "folders": [self._folder_row_to_dict(row) for row in folder_rows],
+            "files": [self._file_summary(row) for row in file_rows],
+        }
+
+    def folder_info(self, path: str = "/") -> dict[str, Any]:
+        path = normalize_path(path)
+        with self.connect() as conn:
+            row = conn.execute(
+                """
+                SELECT
+                    fo.folder_id,
+                    fo.parent_id,
+                    fo.name,
+                    fo.path,
+                    fo.description,
+                    fo.kind,
+                    fo.metadata_json,
+                    fo.created_at,
+                    fo.updated_at,
+                    (
+                        SELECT COUNT(DISTINCT child_ff.file_ref)
+                        FROM file_folders child_ff
+                        JOIN files child_file
+                          ON child_file.file_ref = child_ff.file_ref
+                         AND child_file.deleted_at IS NULL
+                        WHERE child_ff.folder_id = fo.folder_id
+                    ) AS file_count,
+                    (
+                        SELECT COUNT(*)
+                        FROM folders child_folder
+                        WHERE child_folder.parent_id = fo.folder_id
+                    ) AS children_count
+                FROM folders fo
+                WHERE fo.path = ?
+                """,
+                (path,),
+            ).fetchone()
+        if row is None:
+            raise KeyError(f"Unknown folder path: {path}")
+        return self._folder_row_to_dict(row)
+
+    def find_folders(
+        self,
+        path: str = "/",
+        *,
+        metadata_filter: Optional[dict[str, Any]] = None,
+        limit: int = 100,
+        max_depth: int | None = None,
+    ) -> list[dict[str, Any]]:
+        path = normalize_path(path)
+        if max_depth is not None and max_depth < 0:
+            raise ValueError("max_depth must be non-negative")
+        metadata_sql, metadata_params = self._metadata_filter_sql(metadata_filter)
+        metadata_clause = f"AND {' AND '.join(metadata_sql)}" if metadata_sql else ""
+        folder_depth_clause = ""
+        folder_depth_params: list[Any] = []
+        if max_depth is not None:
+            if max_depth == 0:
+                folder_depth_clause = "AND 0"
+            else:
+                folder_depth_clause = f"AND ({self._folder_depth_sql('fo.path')} - ?) <= ?"
+                folder_depth_params = [self._folder_depth(path), max_depth]
+        sql = f"""
+            SELECT *
+            FROM (
+                SELECT
+                    fo.folder_id,
+                    fo.parent_id,
+                    fo.name,
+                    fo.path,
+                    fo.description,
+                    fo.kind,
+                    fo.metadata_json,
+                    fo.created_at,
+                    fo.updated_at,
+                    (
+                        SELECT COUNT(DISTINCT child_ff.file_ref)
+                        FROM file_folders child_ff
+                        JOIN files child_file
+                          ON child_file.file_ref = child_ff.file_ref
+                         AND child_file.deleted_at IS NULL
+                        WHERE child_ff.folder_id = fo.folder_id
+                    ) AS file_count,
+                    (
+                        SELECT COUNT(*)
+                        FROM folders child_folder
+                        WHERE child_folder.parent_id = fo.folder_id
+                    ) AS children_count,
+                    (
+                        SELECT COUNT(DISTINCT f.file_ref)
+                        FROM files f
+                        JOIN file_folders matched_ff
+                          ON matched_ff.file_ref = f.file_ref
+                        JOIN folders matched_folder
+                          ON matched_folder.folder_id = matched_ff.folder_id
+                        WHERE f.deleted_at IS NULL
+                          AND (
+                              matched_folder.folder_id = fo.folder_id
+                              OR matched_folder.path LIKE {self._descendant_like_sql_expr("fo.path")} ESCAPE '\\'
+                          )
+                          {metadata_clause}
+                    ) AS matched_files
+                FROM folders fo
+                WHERE fo.path != ? AND fo.path LIKE ? ESCAPE '\\'
+                  {folder_depth_clause}
+            )
+            WHERE matched_files > 0
+            ORDER BY path
+            LIMIT ?
+        """
+        params = [
+            *metadata_params,
+            path,
+            self._descendant_like(path),
+            *folder_depth_params,
+            limit,
+        ]
+        with self.connect() as conn:
+            folder = self._folder_by_path(conn, path)
+            if folder is None:
+                raise KeyError(f"Unknown folder path: {path}")
+            rows = conn.execute(sql, params).fetchall()
+        return [self._folder_row_to_dict(row) for row in rows]
+
+    def search_files(
+        self,
+        query: str | list[str] | None,
+        *,
+        scope: Optional[dict[str, Any]] = None,
+        metadata_filter: Optional[dict[str, Any]] = None,
+        limit: int = 10,
+    ) -> list[dict[str, Any]]:
+        query_text = self._query_text(query)
+        match_queries = self._fts_match_queries(query_text) if query_text else [None]
+        results: list[dict[str, Any]] = []
+        seen: set[str] = set()
+        for match_query in match_queries:
+            rows = self._search_once(match_query, scope, metadata_filter, max(limit * 25, limit))
+            for row in rows:
+                if row["file_ref"] in seen:
+                    continue
+                seen.add(row["file_ref"])
+                results.append(self._search_row_to_dict(row))
+                if len(results) >= limit:
+                    return results
+            if results:
+                return results
+        return results
+
+    def _search_once(
+        self,
+        match_query: str | None,
+        scope: Optional[dict[str, Any]],
+        metadata_filter: Optional[dict[str, Any]],
+        limit: int,
+    ) -> list[sqlite3.Row]:
+        joins = []
+        selects = [
+            "f.file_ref",
+            "f.external_id",
+            "f.source_path",
+            "f.title",
+            "f.descriptor",
+            "f.pageindex_tree_status",
+            "f.metadata_json",
+            "f.metadata_status_json",
+            "f.created_at",
+            """
+            (
+                SELECT display_folder.folder_id
+                FROM file_folders display_ff
+                JOIN folders display_folder
+                  ON display_folder.folder_id = display_ff.folder_id
+                WHERE display_ff.file_ref = f.file_ref
+                ORDER BY display_folder.path
+                LIMIT 1
+            ) AS folder_id
+            """,
+            """
+            (
+                SELECT display_folder.path
+                FROM file_folders display_ff
+                JOIN folders display_folder
+                  ON display_folder.folder_id = display_ff.folder_id
+                WHERE display_ff.file_ref = f.file_ref
+                ORDER BY display_folder.path
+                LIMIT 1
+            ) AS folder_path
+            """,
+        ]
+        where = ["f.deleted_at IS NULL"]
+        params: list[Any] = []
+        if match_query:
+            joins.append("JOIN file_fts ON file_fts.file_ref = f.file_ref")
+            selects.append("snippet(file_fts, 2, '', '', '...', 16) AS snippet")
+            selects.append("bm25(file_fts) AS rank")
+            where.append("file_fts MATCH ?")
+            params.append(match_query)
+            order_by = "rank"
+        else:
+            selects.append("f.descriptor AS snippet")
+            selects.append("0 AS rank")
+            order_by = "f.created_at DESC, f.title"
+        scope_sql, scope_params = self._scope_sql(scope)
+        if scope_sql:
+            where.append(scope_sql)
+            params.extend(scope_params)
+        metadata_sql, metadata_params = self._metadata_filter_sql(metadata_filter)
+        where.extend(metadata_sql)
+        params.extend(metadata_params)
+        sql = f"""
+            SELECT {", ".join(selects)}
+            FROM files f
+            {" ".join(joins)}
+            WHERE {" AND ".join(where)}
+            ORDER BY {order_by}
+            LIMIT ?
+        """
+        params.append(limit)
+        with self.connect() as conn:
+            return conn.execute(sql, params).fetchall()
+
+    def _metadata_filter_sql(self, metadata_filter: Optional[dict[str, Any]]) -> tuple[list[str], list[Any]]:
+        if not metadata_filter:
+            return [], []
+        clause, params = self._compile_metadata_filter(metadata_filter)
+        return [clause] if clause else [], params
+
+    def _compile_metadata_filter(self, metadata_filter: dict[str, Any]) -> tuple[str, list[Any]]:
+        clauses = []
+        params: list[Any] = []
+        for key, condition in metadata_filter.items():
+            if key in {"$and", "$or"}:
+                child_clauses = []
+                child_params: list[Any] = []
+                for item in condition:
+                    child_clause, item_params = self._compile_metadata_filter(item)
+                    if child_clause:
+                        child_clauses.append(f"({child_clause})")
+                        child_params.extend(item_params)
+                if child_clauses:
+                    joiner = " AND " if key == "$and" else " OR "
+                    clauses.append(joiner.join(child_clauses))
+                    params.extend(child_params)
+                continue
+            field_clause, field_params = self._compile_metadata_field_filter(key, condition)
+            clauses.append(field_clause)
+            params.extend(field_params)
+        return " AND ".join(f"({clause})" for clause in clauses), params
+
+    def _compile_metadata_field_filter(self, field: str, condition: Any) -> tuple[str, list[Any]]:
+        if not isinstance(condition, dict) or not any(str(key).startswith("$") for key in condition):
+            condition = {"$eq": condition}
+        operator, expected = next(iter(condition.items()))
+        field_id = self.field_id(field)
+        if operator == "$eq":
+            return (
+                """
+                EXISTS (
+                    SELECT 1 FROM metadata_values mv
+                    WHERE mv.file_ref = f.file_ref
+                      AND mv.field_id = ?
+                      AND mv.value_text = ?
+                )
+                """,
+                [field_id, self._metadata_compare_text(expected)],
+            )
+        if operator == "$ne":
+            return (
+                """
+                NOT EXISTS (
+                    SELECT 1 FROM metadata_values mv
+                    WHERE mv.file_ref = f.file_ref
+                      AND mv.field_id = ?
+                      AND mv.value_text = ?
+                )
+                """,
+                [field_id, self._metadata_compare_text(expected)],
+            )
+        if operator == "$in":
+            values = [self._metadata_compare_text(item) for item in expected]
+            if not values:
+                return "0", []
+            placeholders = ", ".join("?" for _ in values)
+            return (
+                f"""
+                EXISTS (
+                    SELECT 1 FROM metadata_values mv
+                    WHERE mv.file_ref = f.file_ref
+                      AND mv.field_id = ?
+                      AND mv.value_text IN ({placeholders})
+                )
+                """,
+                [field_id, *values],
+            )
+        if operator == "$contains":
+            return (
+                """
+                EXISTS (
+                    SELECT 1 FROM metadata_values mv
+                    WHERE mv.file_ref = f.file_ref
+                      AND mv.field_id = ?
+                      AND lower(mv.value_text) LIKE lower(?) ESCAPE '\\'
+                )
+                """,
+                [field_id, self._contains_like(self._metadata_compare_text(expected))],
+            )
+        if operator in {"$gt", "$gte", "$lt", "$lte"}:
+            comparator = {
+                "$gt": ">",
+                "$gte": ">=",
+                "$lt": "<",
+                "$lte": "<=",
+            }[operator]
+            if isinstance(expected, (int, float)) and not isinstance(expected, bool):
+                return (
+                    f"""
+                    EXISTS (
+                        SELECT 1 FROM metadata_values mv
+                        WHERE mv.file_ref = f.file_ref
+                          AND mv.field_id = ?
+                          AND mv.value_number IS NOT NULL
+                          AND mv.value_number {comparator} ?
+                    )
+                    """,
+                    [field_id, float(expected)],
+                )
+            return (
+                f"""
+                EXISTS (
+                    SELECT 1 FROM metadata_values mv
+                    WHERE mv.file_ref = f.file_ref
+                      AND mv.field_id = ?
+                      AND mv.value_text {comparator} ?
+                )
+                """,
+                [field_id, self._metadata_compare_text(expected)],
+            )
+        raise ValueError(f"Unsupported metadata operator: {operator}")
+
+    def get_file(self, file_ref: str) -> FileEntry:
+        with self.connect() as conn:
+            row = self._file_entry_row(conn, file_ref)
+        if row is None:
+            raise KeyError(f"Unknown file_ref: {file_ref}")
+        return self._file_entry(row)
+
+    def list_pending_metadata_status(self, *, limit: int | None = None) -> list[FileEntry]:
+        sql = """
+            SELECT
+                f.file_ref,
+                f.external_id,
+                f.storage_uri,
+                f.source_path,
+                f.title,
+                f.descriptor,
+                f.content_type,
+                f.source_type,
+                f.fingerprint,
+                f.text_artifact_path,
+                f.raw_artifact_path,
+                f.pageindex_doc_id,
+                f.pageindex_tree_status,
+                f.metadata_json,
+                f.metadata_status_json,
+                COALESCE(primary_folder.path, '/') AS folder_path
+            FROM files f
+            LEFT JOIN file_folders ff ON ff.file_ref = f.file_ref
+            LEFT JOIN folders primary_folder ON primary_folder.folder_id = ff.folder_id
+            WHERE f.deleted_at IS NULL
+              AND (
+                f.metadata_status_json LIKE '%pending_generate%'
+                OR f.metadata_status_json LIKE '%pending_submit%'
+              )
+            GROUP BY f.file_ref
+            ORDER BY f.created_at, f.file_ref
+        """
+        params: list[Any] = []
+        if limit is not None:
+            sql += " LIMIT ?"
+            params.append(int(limit))
+        with self.connect() as conn:
+            rows = conn.execute(sql, params).fetchall()
+        return [self._file_entry(row) for row in rows]
+
+    def update_file_metadata_status(
+        self,
+        file_ref: str,
+        *,
+        metadata: dict[str, Any],
+        metadata_status: dict[str, Any],
+    ) -> None:
+        with self.connect() as conn:
+            row = self._file_entry_row(conn, file_ref)
+            if row is None:
+                raise KeyError(f"Unknown file_ref: {file_ref}")
+            metadata_text_value = metadata_text(metadata)
+            conn.execute(
+                """
+                UPDATE files
+                SET metadata_json = ?,
+                    metadata_status_json = ?,
+                    updated_at = CURRENT_TIMESTAMP
+                WHERE file_ref = ? AND deleted_at IS NULL
+                """,
+                (
+                    json.dumps(metadata, ensure_ascii=False),
+                    json.dumps(metadata_status, ensure_ascii=False),
+                    file_ref,
+                ),
+            )
+            self.replace_metadata_values(
+                conn,
+                file_ref,
+                self.indexed_metadata_values(metadata),
+            )
+            conn.execute(
+                """
+                UPDATE file_fts
+                SET metadata_text = ?
+                WHERE file_ref = ?
+                """,
+                (metadata_text_value, file_ref),
+            )
+
+    def resolve_file_ref(self, target: str) -> str:
+        with self.connect() as conn:
+            return self._resolve_file_ref(conn, target)
+
+    def _resolve_file_ref(self, conn: sqlite3.Connection, target: str) -> str:
+        target = str(target).strip()
+        if not target:
+            raise KeyError("Empty file target")
+        row = conn.execute(
+            "SELECT file_ref FROM files WHERE file_ref = ? AND deleted_at IS NULL",
+            (target,),
+        ).fetchone()
+        if row:
+            return row["file_ref"]
+        row = conn.execute(
+            "SELECT file_ref FROM files WHERE external_id = ? AND deleted_at IS NULL",
+            (target,),
+        ).fetchone()
+        if row:
+            return row["file_ref"]
+        stripped = target.strip("/")
+        rows = conn.execute(
+            """
+            SELECT
+                f.file_ref,
+                f.external_id,
+                f.title,
+                f.source_path,
+                COALESCE(MIN(fo.path), '/') AS folder_path
+            FROM files f
+            LEFT JOIN file_folders ff ON ff.file_ref = f.file_ref
+            LEFT JOIN folders fo ON fo.folder_id = ff.folder_id
+            WHERE f.source_path = ? AND f.deleted_at IS NULL
+            GROUP BY f.file_ref, f.external_id, f.title, f.source_path
+            ORDER BY f.file_ref
+            LIMIT 2
+            """,
+            (stripped,),
+        ).fetchall()
+        if len(rows) > 1:
+            matches = "; ".join(self._virtual_match_summary(row) for row in rows)
+            raise KeyError(f"Ambiguous file target: {target}. Matches: {matches}")
+        if rows:
+            return rows[0]["file_ref"]
+        virtual_file_ref = self._resolve_virtual_file_ref(conn, target)
+        if virtual_file_ref:
+            return virtual_file_ref
+        raise KeyError(f"Unknown file target: {target}")
+
+    def _resolve_virtual_file_ref(self, conn: sqlite3.Connection, target: str) -> str | None:
+        virtual_target = normalize_path(target)
+        rows = conn.execute(
+            """
+            WITH virtual_matches AS (
+                SELECT
+                    f.file_ref,
+                    f.external_id,
+                    f.title,
+                    f.source_path,
+                    pf.path AS folder_path,
+                    (CASE WHEN pf.path = '/' THEN '/' ELSE pf.path || '/' END)
+                        || ltrim(f.title, '/') AS title_virtual_path,
+                    (CASE WHEN pf.path = '/' THEN '/' ELSE pf.path || '/' END)
+                        || ltrim(f.source_path, '/') AS source_virtual_path
+                FROM files f
+                JOIN file_folders ff ON ff.file_ref = f.file_ref
+                JOIN folders pf ON pf.folder_id = ff.folder_id
+                WHERE f.deleted_at IS NULL
+            )
+            SELECT
+                file_ref,
+                external_id,
+                title,
+                source_path,
+                MIN(folder_path) AS folder_path
+            FROM virtual_matches
+            WHERE title_virtual_path = ?
+               OR source_virtual_path = ?
+            GROUP BY file_ref, external_id, title, source_path
+            ORDER BY file_ref
+            LIMIT 2
+            """,
+            (virtual_target, virtual_target),
+        ).fetchall()
+        if not rows:
+            return None
+        if len(rows) > 1:
+            matches = "; ".join(self._virtual_match_summary(row) for row in rows)
+            raise KeyError(f"Ambiguous file target: {target}. Matches: {matches}")
+        return rows[0]["file_ref"]
+
+    @staticmethod
+    def _virtual_match_summary(row: sqlite3.Row) -> str:
+        external_id = row["external_id"] or "-"
+        return (
+            f"file_ref={row['file_ref']} external_id={external_id} "
+            f"folder={row['folder_path']} title={row['title']!r} "
+            f"source_path={row['source_path']!r}"
+        )
+
+    def ensure_folder(
+        self,
+        conn: sqlite3.Connection | None,
+        path: str,
+        *,
+        kind: str = "manual",
+        description: str = "",
+        metadata: dict[str, Any] | None = None,
+    ) -> str:
+        owns_connection = conn is None
+        if conn is None:
+            conn = self.connect()
+        try:
+            normalized = normalize_path(path)
+            metadata_json = json.dumps(metadata or {}, ensure_ascii=False)
+            if normalized == "/":
+                folder_id = self.folder_id("/")
+                existing = conn.execute(
+                    "SELECT folder_id FROM folders WHERE path = '/'"
+                ).fetchone()
+                if existing is not None and not description and metadata_json == "{}":
+                    if owns_connection:
+                        conn.commit()
+                    return folder_id
+                self._upsert_folder_row(
+                    conn,
+                    folder_id=folder_id,
+                    parent_id=None,
+                    name="/",
+                    path="/",
+                    kind=kind,
+                    description=description,
+                    metadata_json=metadata_json,
+                )
+                if owns_connection:
+                    conn.commit()
+                return folder_id
+            parent_id = self.ensure_folder(conn, str(Path(normalized).parent), kind=kind)
+            name = normalized.rsplit("/", 1)[-1]
+            folder_id = self.folder_id(normalized)
+            self._upsert_folder_row(
+                conn,
+                folder_id=folder_id,
+                parent_id=parent_id,
+                name=name,
+                path=normalized,
+                kind=kind,
+                description=description,
+                metadata_json=metadata_json,
+            )
+            if owns_connection:
+                conn.commit()
+            return folder_id
+        finally:
+            if owns_connection:
+                conn.close()
+
+    def _upsert_folder_row(
+        self,
+        conn: sqlite3.Connection,
+        *,
+        folder_id: str,
+        parent_id: str | None,
+        name: str,
+        path: str,
+        kind: str,
+        description: str,
+        metadata_json: str,
+    ) -> None:  # insert the folder row, or refresh the row that already owns `path`; does not commit
+        columns = self._columns(conn, "folders")  # presumably the column names of `folders` — confirm in _columns
+        insert_columns = ["folder_id", "parent_id", "name", "path", "description", "kind", "metadata_json"]
+        values: list[Any] = [folder_id, parent_id, name, path, description, kind, metadata_json]
+        if "source" in columns:  # optional column: only written when the table has it
+            insert_columns.append("source")
+            values.append("system")
+        if "sort_order" in columns:  # optional column, same treatment as "source"
+            insert_columns.append("sort_order")
+            values.append(0)
+        placeholders = ", ".join("?" for _ in values)  # one "?" per value keeps columns and values aligned
+        update_assignments = [  # columns rewritten when the path already exists
+            "parent_id = excluded.parent_id",
+            "name = excluded.name",
+            "kind = excluded.kind",
+            "updated_at = CURRENT_TIMESTAMP",
+        ]
+        if description:  # an empty description never overwrites the stored one
+            update_assignments.append("description = excluded.description")
+        if metadata_json != "{}":  # likewise, the literal "{}" keeps the stored metadata
+            update_assignments.append("metadata_json = excluded.metadata_json")
+        conn.execute(
+            f"""
+            INSERT INTO folders({", ".join(insert_columns)})
+            VALUES ({placeholders})
+            ON CONFLICT(path) DO UPDATE SET
+                {", ".join(update_assignments)}
+            """,
+            values,
+        )
+
+    def _resolve_or_create_folder(self, conn: sqlite3.Connection, folder_path_or_id: str) -> str:
+        """Return the folder_id for a folder id or path, creating the folder if it is unknown.
+
+        Raises KeyError when the target is empty after stripping whitespace.
+        """
+        target = str(folder_path_or_id).strip()
+        if not target:
+            raise KeyError("Empty folder target")
+        # An exact folder_id match wins over a match on the normalized path.
+        lookups = (("folder_id", target), ("path", normalize_path(target)))
+        for column, key in lookups:
+            row = conn.execute(
+                f"SELECT folder_id FROM folders WHERE {column} = ?", (key,)
+            ).fetchone()
+            if row:
+                return row["folder_id"]
+        return self.ensure_folder(conn, target)
+
+    def read_text(self, file_ref: str) -> str:
+        """Read and return, decoded as UTF-8, the text artifact recorded for ``file_ref``."""
+        return Path(self.get_file(file_ref).text_artifact_path).read_text(encoding="utf-8")
+
+    def write_text_artifact(self, file_ref: str, content: str) -> Path:
+        artifact = self.text_dir / f"{file_ref}.txt"  # one UTF-8 .txt artifact per file_ref
+        artifact.write_text(content, encoding="utf-8")
+        return artifact
+
+    def update_pageindex_pointer(
+        self,
+        file_ref: str,
+        *,
+        pageindex_doc_id: str | None,
+        pageindex_tree_status: str,
+    ) -> None:  # store the PageIndex doc id and tree status on one non-deleted file row
+        with self.connect() as conn:  # NOTE(review): a plain sqlite3 connection only commits/rolls back on exit, it does not close — confirm connect()
+            resolved = self._resolve_file_ref(conn, file_ref)  # presumably maps the reference to the stored file_ref — confirm
+            conn.execute(
+                """
+                UPDATE files
+                SET pageindex_doc_id = ?,
+                    pageindex_tree_status = ?,
+                    updated_at = CURRENT_TIMESTAMP
+                WHERE file_ref = ? AND deleted_at IS NULL
+                """,
+                (pageindex_doc_id, pageindex_tree_status, resolved),
+            )
+
+    def write_raw_artifact(self, file_ref: str, metadata: dict[str, Any]) -> Path:
+        artifact = self.raw_dir / f"{file_ref}.json"  # pretty-printed JSON, non-ASCII kept verbatim
+        artifact.write_text(json.dumps(metadata, ensure_ascii=False, indent=2), encoding="utf-8")
+        return artifact
+
+    def file_info(self, target: str) -> dict[str, Any]:
+        """Describe the file behind ``target``, adding a ``folders`` list of its memberships."""
+        file_ref = self.resolve_file_ref(target)
+        info = self._file_entry_to_dict(self.get_file(file_ref))
+        info["folders"] = self.folder_memberships(file_ref)
+        return info
+
+    def file_matches(
+        self,
+        file_ref: str,
+        *,
+        scope: Optional[dict[str, Any]] = None,
+        metadata_filter: Optional[dict[str, Any]] = None,
+    ) -> bool:  # True when the non-deleted file satisfies both the folder scope and the metadata filter
+        where = ["f.file_ref = ?", "f.deleted_at IS NULL"]
+        params: list[Any] = [file_ref]
+        scope_sql, scope_params = self._scope_sql(scope)
+        if scope_sql:  # an empty fragment means "no scope restriction"
+            where.append(scope_sql)
+            params.extend(scope_params)
+        metadata_sql, metadata_params = self._metadata_filter_sql(metadata_filter)
+        where.extend(metadata_sql)  # extended, not appended: the helper is expected to return a list of fragments
+        params.extend(metadata_params)
+        with self.connect() as conn:
+            row = conn.execute(
+                f"""
+                SELECT 1
+                FROM files f
+                WHERE {" AND ".join(where)}
+                LIMIT 1
+                """,
+                params,
+            ).fetchone()
+        return row is not None
+
+    def folder_memberships(self, file_ref: str) -> list[dict[str, Any]]:  # every folder linked to the file, ordered by folder path
+        with self.connect() as conn:
+            rows = conn.execute(
+                """
+                SELECT
+                    fo.folder_id,
+                    fo.parent_id,
+                    fo.name,
+                    fo.path,
+                    fo.description,
+                    fo.kind,
+                    fo.metadata_json AS folder_metadata_json,
+                    ff.metadata_json AS membership_metadata_json,
+                    ff.created_at
+                FROM file_folders ff
+                JOIN folders fo ON fo.folder_id = ff.folder_id
+                WHERE ff.file_ref = ?
+                ORDER BY fo.path
+                """,
+                (file_ref,),
+            ).fetchall()
+        return [
+            {
+                "folder_id": row["folder_id"],
+                "id": row["folder_id"],  # alias of folder_id
+                "parent_id": row["parent_id"],
+                "parent_folder_id": row["parent_id"],  # alias of parent_id
+                "name": row["name"],
+                "path": row["path"],
+                "kind": row["kind"],
+                "description": row["description"],
+                "folder_metadata": json.loads(row["folder_metadata_json"] or "{}"),  # metadata of the folder itself
+                "metadata": json.loads(row["membership_metadata_json"] or "{}"),  # metadata of the file-to-folder link
+                "created_at": row["created_at"],  # creation time of the link row, not of the folder
+            }
+            for row in rows
+        ]
+
+    def count_files_in_folder(self, path: str, *, recursive: bool = True) -> int:  # distinct non-deleted files linked to the folder
+        path = normalize_path(path)
+        with self.connect() as conn:
+            folder = self._folder_by_path(conn, path)
+            if folder is None:  # an unknown folder raises instead of counting as zero
+                raise KeyError(f"Unknown folder path: {path}")
+            if recursive:  # the folder itself plus every folder whose path lies below it
+                row = conn.execute(
+                    """
+                    SELECT COUNT(DISTINCT f.file_ref) AS count
+                    FROM files f
+                    JOIN file_folders ff ON ff.file_ref = f.file_ref
+                    JOIN folders fo ON fo.folder_id = ff.folder_id
+                    WHERE f.deleted_at IS NULL
+                      AND (fo.path = ? OR fo.path LIKE ? ESCAPE '\\')
+                    """,
+                    (path, self._descendant_like(path)),
+                ).fetchone()
+            else:  # only files linked directly to this folder
+                row = conn.execute(
+                    """
+                    SELECT COUNT(DISTINCT f.file_ref) AS count
+                    FROM files f
+                    JOIN file_folders ff ON ff.file_ref = f.file_ref
+                    JOIN folders fo ON fo.folder_id = ff.folder_id
+                    WHERE f.deleted_at IS NULL
+                      AND fo.path = ?
+                    """,
+                    (path,),
+                ).fetchone()
+        return int(row["count"] or 0)
+
+    def folder_subtree_thresholds(
+        self,
+        path: str,
+        *,
+        depth_limit: int,
+        file_limit: int,
+    ) -> dict[str, Any]:
+        """Report whether the subtree under ``path`` exceeds a folder-depth or file-count limit.
+
+        Both probes stop early, so ``sampled_file_count`` is capped at ``file_limit + 1``.
+        Raises KeyError when ``path`` is not a known folder.
+        """
+        path = normalize_path(path)
+        with self.connect() as conn:
+            folder = self._folder_by_path(conn, path)
+            if folder is None:
+                raise KeyError(f"Unknown folder path: {path}")
+            base_depth = self._folder_depth(path)
+            deep_folder = conn.execute(
+                f"""
+                SELECT path
+                FROM folders
+                WHERE path != ?
+                  AND path LIKE ? ESCAPE '\\'
+                  AND {self._folder_depth_sql('path')} - ? > ?
+                LIMIT 1
+                """,
+                (path, self._descendant_like(path), base_depth, depth_limit),
+            ).fetchone()
+            file_rows = conn.execute(
+                """
+                SELECT DISTINCT f.file_ref
+                FROM files f
+                JOIN file_folders ff ON ff.file_ref = f.file_ref
+                JOIN folders fo ON fo.folder_id = ff.folder_id
+                WHERE f.deleted_at IS NULL
+                  AND (fo.path = ? OR fo.path LIKE ? ESCAPE '\\')
+                LIMIT ?
+                """,
+                (path, self._descendant_like(path), file_limit + 1),
+            ).fetchall()
+        return {
+            "depth_limit": depth_limit,
+            "file_limit": file_limit,
+            "folder_depth_exceeds_limit": deep_folder is not None,
+            "file_count_exceeds_limit": len(file_rows) > file_limit,
+            "sampled_file_count": len(file_rows),
+            "sample_deep_folder_path": deep_folder["path"] if deep_folder is not None else "",
+        }
+
+    def _file_entry_row(self, conn: sqlite3.Connection, file_ref: str) -> sqlite3.Row | None:  # full row of one non-deleted file, or None
+        return conn.execute(  # folder_path is the alphabetically first folder path of the file, or '/' when it has none
+            """
+            SELECT
+                f.file_ref,
+                f.external_id,
+                f.storage_uri,
+                f.source_path,
+                f.title,
+                f.descriptor,
+                f.content_type,
+                f.source_type,
+                f.fingerprint,
+                f.text_artifact_path,
+                f.raw_artifact_path,
+                f.pageindex_doc_id,
+                f.pageindex_tree_status,
+                f.metadata_json,
+                f.metadata_status_json,
+                COALESCE(
+                    (
+                        SELECT display_folder.path
+                        FROM file_folders display_ff
+                        JOIN folders display_folder
+                          ON display_folder.folder_id = display_ff.folder_id
+                        WHERE display_ff.file_ref = f.file_ref
+                        ORDER BY display_folder.path
+                        LIMIT 1
+                    ),
+                    '/'
+                ) AS folder_path
+            FROM files f
+            WHERE f.file_ref = ? AND f.deleted_at IS NULL
+            """,
+            (file_ref,),
+        ).fetchone()
+
+    def _file_rows_for_scope(
+        self,
+        conn: sqlite3.Connection,
+        path: str,
+        recursive: bool,
+        limit: int,
+        max_depth: int | None = None,
+    ) -> list[sqlite3.Row]:
+        """Return up to ``limit`` non-deleted files in the folder scope, newest first, one row per file."""
+        # One MIN() only: SQLite then takes the bare pf.folder_id from the min-path row, keeping id and path paired.
+        sql = """
+            SELECT
+                f.file_ref,
+                f.external_id,
+                f.title,
+                f.descriptor,
+                f.source_path,
+                f.pageindex_tree_status,
+                f.metadata_json,
+                f.metadata_status_json,
+                f.created_at,
+                pf.folder_id AS folder_id,
+                MIN(pf.path) AS folder_path
+            FROM files f
+            JOIN file_folders ff ON ff.file_ref = f.file_ref
+            JOIN folders pf ON pf.folder_id = ff.folder_id
+            WHERE f.deleted_at IS NULL
+        """
+        if recursive:
+            sql += " AND (pf.path = ? OR pf.path LIKE ? ESCAPE '\\')"
+            params: list[Any] = [path, self._descendant_like(path)]
+            if max_depth is not None and max_depth <= 0:
+                sql += " AND 0"
+            elif max_depth is not None:
+                sql += f" AND ({self._folder_depth_sql('pf.path')} - ?) <= ?"
+                params.extend([self._folder_depth(path), max_depth - 1])
+        else:
+            sql += " AND pf.path = ?"
+            params = [path]
+        sql += " GROUP BY f.file_ref ORDER BY f.created_at DESC, f.title LIMIT ?"
+        params.append(limit)
+        return conn.execute(sql, params).fetchall()
+
+    def _scope_sql(self, scope: Optional[dict[str, Any]]) -> tuple[str, list[Any]]:  # WHERE fragment + params limiting alias "f" to a folder scope
+        if not scope:
+            return "", []  # empty fragment: no restriction
+        recursive = scope.get("recursive", True)  # descendants are included unless the caller opts out
+        max_depth = scope.get("max_depth")
+        if max_depth is not None:
+            max_depth = int(max_depth)
+            if max_depth < 0:
+                raise ValueError("max_depth must be non-negative")
+        folder_id = scope.get("folder_id")
+        if folder_id:  # folder_id takes precedence over folder_path / path
+            if folder_id == "root":  # the literal id "root" is handled by the path-based code further down
+                folder_path = "/"
+            else:
+                if recursive:
+                    if max_depth == 0:
+                        return "0", []  # constant-false fragment: depth 0 matches no file
+                    depth_clause = ""
+                    depth_params: list[Any] = []
+                    if max_depth is not None:
+                        depth_clause = (
+                            "AND "
+                            f"({self._folder_depth_sql('scope_folder.path')} - "
+                            f"{self._folder_depth_sql('base_folder.path')}) <= ?"
+                        )
+                        depth_params = [max_depth - 1]  # files directly in the base folder sit at relative depth 0
+                    return (
+                        f"""
+                        EXISTS (
+                            SELECT 1
+                            FROM file_folders scope_ff
+                            JOIN folders scope_folder
+                              ON scope_folder.folder_id = scope_ff.folder_id
+                            JOIN folders base_folder
+                              ON base_folder.folder_id = ?
+                            WHERE scope_ff.file_ref = f.file_ref
+                              AND (
+                                scope_folder.folder_id = base_folder.folder_id
+                                OR scope_folder.path LIKE {self._descendant_like_sql_expr("base_folder.path")} ESCAPE '\\'
+                              )
+                              {depth_clause}
+                        )
+                        """,
+                        [folder_id, *depth_params],
+                    )
+                return (  # non-recursive by id: the file must be linked to exactly this folder
+                    """
+                    EXISTS (
+                        SELECT 1
+                        FROM file_folders scope_ff
+                        WHERE scope_ff.file_ref = f.file_ref
+                          AND scope_ff.folder_id = ?
+                    )
+                    """,
+                    [folder_id],
+                )
+        elif scope.get("folder_path") or scope.get("path"):  # "folder_path" wins over "path" when both are set
+            folder_path = normalize_path(scope.get("folder_path") or scope.get("path"))
+        else:
+            return "", []  # scope carries no folder selector: no restriction
+        if recursive and max_depth == 0:
+            return "0", []  # same constant-false shortcut as in the folder_id branch
+        path_clause = (
+            "(scope_folder.path = ? OR scope_folder.path LIKE ? ESCAPE '\\')"
+            if recursive
+            else "scope_folder.path = ?"
+        )
+        params = [folder_path, self._descendant_like(folder_path)] if recursive else [folder_path]
+        depth_clause = ""
+        if recursive and max_depth is not None:
+            depth_clause = f"AND ({self._folder_depth_sql('scope_folder.path')} - ?) <= ?"
+            params.extend([self._folder_depth(folder_path), max_depth - 1])  # base depth is computed in Python here
+        return (
+            f"""
+            EXISTS (
+                SELECT 1
+                FROM file_folders scope_ff
+                JOIN folders scope_folder
+                  ON scope_folder.folder_id = scope_ff.folder_id
+                WHERE scope_ff.file_ref = f.file_ref
+                  AND {path_clause}
+                  {depth_clause}
+            )
+            """,
+            params,
+        )
+
+    def _folder_by_path(self, conn: sqlite3.Connection, path: str) -> sqlite3.Row | None:  # exact-path lookup; `path` is used as given, not normalized here
+        return conn.execute(
+            """
+            SELECT
+                folder_id,
+                parent_id,
+                name,
+                path,
+                description,
+                kind,
+                metadata_json,
+                created_at,
+                updated_at
+            FROM folders
+            WHERE path = ?
+            """,
+            (path,),
+        ).fetchone()
+
+    @classmethod
+    def _descendant_like(cls, path: str) -> str:
+        return f"{'' if path == '/' else cls._like_escape(path)}/%"  # LIKE pattern for paths strictly below `path`
+
+    @staticmethod
+    def _descendant_like_sql_expr(path_expr: str) -> str:
+        escaped = SQLiteFileSystemStore._like_escape_sql_expr(path_expr)  # SQL-side counterpart of _descendant_like
+        return "CASE WHEN " + path_expr + " = '/' THEN '/%' ELSE " + escaped + " || '/%' END"
+
+    @staticmethod
+    def _contains_like(value: str) -> str:
+        return "%" + SQLiteFileSystemStore._like_escape(value) + "%"  # substring LIKE pattern with wildcards escaped
+
+    @staticmethod
+    def _like_escape(value: str) -> str:
+        """Backslash-escape the LIKE wildcards ``%`` and ``_`` (and the backslash itself)."""
+        escaped = value
+        for special in ("\\", "%", "_"):  # the escape character must be doubled first
+            escaped = escaped.replace(special, "\\" + special)
+        return escaped
+
+    @staticmethod
+    def _like_escape_sql_expr(value_expr: str) -> str:
+        """Build the SQL expression that applies the ``_like_escape`` rules to ``value_expr``."""
+        for special in ("\\", "%", "_"):  # the innermost replace() handles the backslash first
+            value_expr = f"replace({value_expr}, '{special}', '\\{special}')"
+        return value_expr
+
+    @staticmethod
+    def _folder_depth(path: str) -> int:
+        segments = [part for part in normalize_path(path).split("/") if part]  # "/" yields no segments
+        return len(segments)
+
+    @staticmethod
+    def _folder_depth_sql(path_expr: str) -> str:
+        """Return SQL that counts the ``/``-separated segments of ``path_expr`` (0 for the root)."""
+        trimmed = f"TRIM({path_expr}, '/')"
+        return (
+            f"(CASE WHEN {trimmed} = '' THEN 0 "
+            f"ELSE LENGTH({trimmed}) - LENGTH(REPLACE({trimmed}, '/', '')) + 1 "
+            "END)"
+        )
+
+    @classmethod
+    def _folder_row_to_dict(cls, row: sqlite3.Row) -> dict[str, Any]:  # folder row -> dict; columns the query did not select get defaults
+        return {
+            "folder_id": row["folder_id"],
+            "id": row["folder_id"],  # alias of folder_id
+            "parent_id": row["parent_id"],
+            "parent_folder_id": row["parent_id"],  # alias of parent_id
+            "name": row["name"],
+            "description": cls._row_value(row, "description", ""),
+            "path": row["path"],
+            "kind": row["kind"],
+            "metadata": json.loads(cls._row_value(row, "metadata_json", "{}") or "{}"),  # absent or NULL column -> {}
+            "created_at": cls._row_value(row, "created_at"),
+            "updated_at": cls._row_value(row, "updated_at"),
+            "file_count": cls._row_value(row, "file_count", 0),  # the three counters default to 0 when absent
+            "children_count": cls._row_value(row, "children_count", 0),
+            "matched_files": cls._row_value(row, "matched_files", 0),
+        }
+
+    @classmethod
+    def _file_summary(cls, row: sqlite3.Row) -> dict[str, Any]:
+        """Shape a file listing row into its summary dict, defaulting columns the row lacks."""
+        external_id = row["external_id"]
+        optional = cls._row_value  # tolerant lookup for columns a query may not select
+        return {
+            "file_ref": row["file_ref"],
+            "id": external_id or row["file_ref"],
+            "document_id": external_id,
+            "external_id": external_id,
+            "name": row["title"],
+            "title": row["title"],
+            "description": optional(row, "descriptor", row["title"]),
+            "status": optional(row, "pageindex_tree_status", "not_built"),
+            "pageNum": None,
+            "createdAt": optional(row, "created_at"),
+            "folderId": optional(row, "folder_id"),
+            "source_path": row["source_path"],
+            "folder_path": row["folder_path"],
+            "metadata": json.loads(row["metadata_json"] or "{}"),
+            "metadata_status": json.loads(optional(row, "metadata_status_json", "{}") or "{}"),
+        }
+
+    @classmethod
+    def _search_row_to_dict(cls, row: sqlite3.Row) -> dict[str, Any]:
+        """Shape a search hit row into its result dict; an empty snippet falls back to the title."""
+        external_id = row["external_id"]
+        optional = cls._row_value  # tolerant lookup for columns a query may not select
+        return {
+            "file_ref": row["file_ref"],
+            "id": external_id or row["file_ref"],
+            "document_id": external_id,
+            "external_id": external_id,
+            "name": row["title"],
+            "title": row["title"],
+            "description": optional(row, "descriptor", row["title"]),
+            "status": optional(row, "pageindex_tree_status", "not_built"),
+            "pageNum": None,
+            "createdAt": optional(row, "created_at"),
+            "folderId": optional(row, "folder_id"),
+            "source_path": row["source_path"],
+            "snippet": row["snippet"] or row["title"],
+            "folder_path": row["folder_path"],
+            "metadata": json.loads(row["metadata_json"] or "{}"),
+            "metadata_status": json.loads(optional(row, "metadata_status_json", "{}") or "{}"),
+        }
+
+    @staticmethod
+    def _row_value(row: sqlite3.Row, key: str, default: Any = None) -> Any:
+        return default if key not in row.keys() else row[key]  # dict.get-style lookup for sqlite3.Row
+
+    @staticmethod
+    def _file_entry(row: sqlite3.Row) -> FileEntry:
+        """Build a FileEntry from a file row; ``metadata_status_json`` may be missing from the row."""
+        status_json = SQLiteFileSystemStore._row_value(row, "metadata_status_json", "{}")
+        return FileEntry(
+            file_ref=row["file_ref"],
+            external_id=row["external_id"],
+            storage_uri=row["storage_uri"],
+            source_path=row["source_path"],
+            title=row["title"],
+            descriptor=row["descriptor"],
+            content_type=row["content_type"],
+            source_type=row["source_type"],
+            fingerprint=row["fingerprint"],
+            text_artifact_path=row["text_artifact_path"],
+            raw_artifact_path=row["raw_artifact_path"],
+            pageindex_doc_id=row["pageindex_doc_id"],
+            pageindex_tree_status=row["pageindex_tree_status"],
+            metadata=json.loads(row["metadata_json"] or "{}"),
+            folder_path=row["folder_path"],
+            metadata_status=json.loads(status_json or "{}"),
+        )
+
+    @classmethod
+    def _file_entry_to_dict(cls, entry: FileEntry) -> dict[str, Any]:  # flatten a FileEntry into a dict; several keys are aliases of one field
+        return {
+            "file_ref": entry.file_ref,
+            "id": entry.external_id or entry.file_ref,  # external id when set, else the internal reference
+            "document_id": entry.external_id,
+            "external_id": entry.external_id,
+            "name": entry.title,  # alias of title
+            "storage_uri": entry.storage_uri,
+            "source_path": entry.source_path,
+            "title": entry.title,
+            "description": entry.descriptor,  # alias of descriptor
+            "status": entry.pageindex_tree_status,  # alias of pageindex_tree_status
+            "pageNum": None,  # always None here
+            "descriptor": entry.descriptor,
+            "content_type": entry.content_type,
+            "source_type": entry.source_type,
+            "fingerprint": entry.fingerprint,
+            "text_artifact_path": entry.text_artifact_path,
+            "raw_artifact_path": entry.raw_artifact_path,
+            "pageindex_doc_id": entry.pageindex_doc_id,
+            "pageindex_tree_status": entry.pageindex_tree_status,
+            "metadata": entry.metadata,  # same dict object as on the entry, not a copy
+            "metadata_status": entry.metadata_status,
+            "folder_path": entry.folder_path,
+        }
+
+    @staticmethod
+    def _query_text(query: str | list[str] | None) -> str:
+        """Collapse a query given as None, a string, or a list of parts into one string."""
+        if isinstance(query, list):
+            # List items are stringified and joined with single spaces.
+            return " ".join(map(str, query))
+        return "" if query is None else str(query)
+
+    @classmethod
+    def _fts_match_queries(cls, query: str) -> list[str]:
+        """Return the match strings to try in order: space-joined terms first, then OR-joined."""
+        terms = cls._fts_terms(query)
+        if not terms:
+            return []
+        strict, relaxed = " ".join(terms), " OR ".join(terms)
+        # With a single term both forms are identical, so only one query is returned.
+        return [strict, relaxed] if len(terms) > 1 else [strict]
+
+    @staticmethod
+    def _fts_terms(query: str) -> list[str]:
+        """Extract the distinct search terms of ``query`` in first-seen order.
+
+        The query is lowercased and split into runs of ASCII letters, digits
+        and underscores; stopwords and repeated terms are dropped.
+        """
+        # Glue words and question words that are removed from every query.
+        stopwords = {
+            "a",
+            "an",
+            "and",
+            "are",
+            "as",
+            "at",
+            "be",
+            "by",
+            "did",
+            "do",
+            "does",
+            "for",
+            "from",
+            "how",
+            "in",
+            "is",
+            "it",
+            "of",
+            "on",
+            "or",
+            "that",
+            "the",
+            "to",
+            "was",
+            "were",
+            "what",
+            "when",
+            "where",
+            "which",
+            "who",
+            "why",
+            "with",
+        }
+        tokens = re.findall(r"[A-Za-z0-9_]+", query.lower())
+        # dict.fromkeys keeps the first occurrence of each token and preserves order.
+        return list(dict.fromkeys(token for token in tokens if token not in stopwords))
+
+    @staticmethod
+    def _metadata_value_items(value: Any) -> list[dict[str, Any]]:
+        """Expand a metadata value into one row per non-list value; lists are flattened recursively."""
+        if value is None:
+            return []
+        flatten = SQLiteFileSystemStore._metadata_value_items
+        if isinstance(value, list):
+            return [row for item in value for row in flatten(item)]
+        is_bool = isinstance(value, bool)  # bool is an int subclass, so it is classified first
+        is_number = isinstance(value, (int, float)) and not is_bool
+        value_json = json.dumps(value, ensure_ascii=False, sort_keys=True)
+        # Exactly one typed column is populated for numbers and booleans; text and JSON always are.
+        return [
+            {
+                "value_text": SQLiteFileSystemStore._metadata_compare_text(value),
+                "value_number": float(value) if is_number else None,
+                "value_bool": int(value) if is_bool else None,
+                "value_json": value_json,
+            }
+        ]
+
+    @staticmethod
+    def _metadata_compare_text(value: Any) -> str:
+        """Render a metadata value as text: JSON for booleans and containers, "" for None."""
+        # json.dumps(True) is "true", so booleans can share the container branch.
+        if isinstance(value, (bool, dict, list)):
+            return json.dumps(value, ensure_ascii=False, sort_keys=True)
+        return "" if value is None else str(value)
+
+    @staticmethod
+    def indexed_metadata_values(metadata: dict[str, Any]) -> dict[str, Any]:
+        return dict(metadata)  # shallow copy; every key is passed through unchanged
+
+    @staticmethod
+    def _valid_field_name(name: str) -> bool:  # True for a letter followed by letters, digits or underscores
+        return re.fullmatch(r"[A-Za-z][A-Za-z0-9_]*", str(name)) is not None  # fullmatch: "$" would accept a trailing "\n"
+
+    @staticmethod
+    def folder_id(path: str) -> str:
+        """Return the deterministic id of a folder path: ``folder_root`` for ``/``, else hash-derived."""
+        normalized = normalize_path(path)
+        if normalized == "/":
+            return "folder_root"
+        return "folder_" + hashlib.sha1(normalized.encode("utf-8")).hexdigest()[:16]
+
+    @staticmethod
+    def field_id(name: str) -> str:
+        """Return ``field_`` followed by the first 16 hex digits of the SHA-1 of ``name``."""
+        return "field_" + hashlib.sha1(name.encode("utf-8")).hexdigest()[:16]
+
+
+def normalize_path(path: str | Path | None) -> str:
+    """Return ``path`` as an absolute ``/``-separated path; None and "root" both mean ``/``."""
+    text = "root" if path is None else str(path)
+    if text.strip().lower() == "root":
+        return "/"
+    segments = [seg for seg in text.replace("\\", "/").split("/") if seg not in ("", ".")]
+    return "/" + "/".join(segments)  # only empty and "." segments are dropped; ".." is kept as-is
+
+
+def make_file_ref(seed: str) -> str:
+    """Derive a deterministic ``file_<16 hex digits>`` reference from the SHA-1 of ``seed``."""
+    return "file_" + hashlib.sha1(seed.encode("utf-8")).hexdigest()[:16]
+
+
+def fingerprint(content: str) -> str:  # hex SHA-256 digest of the UTF-8 encoded content
+    return hashlib.sha256(content.encode("utf-8")).hexdigest()
+
+
+def metadata_text(metadata: dict[str, Any]) -> str:
+    """Join all metadata values into one space-separated string, skipping None values and None list items."""
+    values = []
+    for value in metadata.values():
+        for item in value if isinstance(value, list) else [value]:  # scalars share the list-item rules
+            if isinstance(item, (dict, list)):
+                values.append(json.dumps(item, ensure_ascii=False, sort_keys=True))
+            elif item is not None:
+                values.append(str(item))
+    return " ".join(values)
diff --git a/pageindex/filesystem/structural_read.py b/pageindex/filesystem/structural_read.py
new file mode 100644
index 000000000..aca2bcdcd
--- /dev/null
+++ b/pageindex/filesystem/structural_read.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+from copy import deepcopy
+from typing import Any
+
+
+def strip_pageindex_text_fields(value: Any) -> Any:
+    """Rebuild ``value`` without any ``"text"`` dict key, recursing through dicts and lists."""
+    if isinstance(value, dict):
+        return {
+            key: strip_pageindex_text_fields(item)
+            for key, item in value.items() if key != "text"
+        }
+    if isinstance(value, list):
+        return list(map(strip_pageindex_text_fields, value))
+    return value
+
+
+def flatten_pageindex_structure_nodes(structure: Any) -> list[dict[str, Any]]:
+    """Flatten a PageIndex tree into pre-order rows without text or child payloads.
+
+    Each row holds the node's own fields plus ``depth``, ``children_count`` and,
+    when the nearest ancestor with a ``node_id`` has a non-empty one, ``parent_node_id``.
+    """
+    flat: list[dict[str, Any]] = []
+    # Stack of (value, depth, parent id); children are pushed reversed so they pop in document order.
+    pending: list[tuple[Any, int, str | None]] = [(structure, 0, None)]
+    while pending:
+        value, depth, parent_node_id = pending.pop()
+        if isinstance(value, list):
+            pending.extend((item, depth, parent_node_id) for item in reversed(value))
+            continue
+        if not isinstance(value, dict):
+            continue
+        child_values = [
+            child
+            for child_key in ("nodes", "children")
+            if isinstance(value.get(child_key), list)
+            for child in value[child_key]
+        ]
+        row = {
+            key: strip_pageindex_text_fields(item)
+            for key, item in value.items()
+            if key not in {"text", "nodes", "children"}
+        }
+        row.update(depth=depth, children_count=len(child_values))
+        if parent_node_id:
+            row["parent_node_id"] = parent_node_id
+        flat.append(row)
+        node_id = value.get("node_id")
+        next_parent = parent_node_id if node_id is None else str(node_id)
+        pending.extend((child, depth + 1, next_parent) for child in reversed(child_values))
+    return flat
+
+
+def find_pageindex_node(structure: Any, node_id: str) -> dict[str, Any] | None:
+    """Return a deep copy of the first node, depth-first, whose ``node_id`` equals ``node_id``; else None."""
+    if isinstance(structure, dict):
+        own_id = structure.get("node_id")
+        if own_id is not None and str(own_id) == str(node_id):  # nodes without an id never match
+            return deepcopy(structure)
+        candidates = [structure.get(key) for key in ("nodes", "children")]
+    else:
+        candidates = structure if isinstance(structure, list) else []
+    for candidate in candidates:
+        found = find_pageindex_node(candidate, node_id)
+        if found is not None:
+            return found
+    return None
+
+
+def first_node_location(node: dict[str, Any]) -> str | None:
+    """Return, as a string, the first of line_num, physical_index, start_index that is set."""
+    candidates = (node.get(key) for key in ("line_num", "physical_index", "start_index"))
+    # 0 is a usable location; only None and "" count as missing.
+    found = next((value for value in candidates if value is not None and value != ""), None)
+    return None if found is None else str(found)
diff --git a/pageindex/filesystem/types.py b/pageindex/filesystem/types.py
new file mode 100644
index 000000000..103d28dd6
--- /dev/null
+++ b/pageindex/filesystem/types.py
@@ -0,0 +1,83 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Optional
+
+
+@dataclass(frozen=True)
+class SearchResult:  # one search hit; NOTE(review): fields appear to mirror the search result dict keys — confirm with the producer
+    file_ref: str
+    external_id: Optional[str]
+    title: str
+    snippet: str
+    folder_path: str
+    folder_paths: list[str]  # presumably every folder path the file belongs to — confirm
+    metadata: dict[str, Any]
+    source_path: str = ""
+    id: Optional[str] = None
+    document_id: Optional[str] = None
+    name: str = ""
+    description: str = ""
+    status: str = ""
+    pageNum: Optional[int] = None  # camelCase names are kept as declared
+    createdAt: Optional[str] = None
+    folderId: Optional[str] = None
+    metadata_status: dict[str, Any] = field(default_factory=dict)  # default_factory gives each instance its own dict
+
+
+@dataclass(frozen=True)
+class OpenResult:  # a span of text read from one file
+    file_ref: str
+    start_line: int  # NOTE(review): whether lines are 0- or 1-based and end_line inclusive is not visible here — confirm
+    end_line: int
+    text: str
+    external_id: Optional[str] = None
+    folder_path: str = ""
+    source_path: str = ""
+
+
+@dataclass(frozen=True)
+class FolderEntry:  # immutable record describing one folder
+    folder_id: str
+    parent_id: Optional[str]  # optional: presumably None for a folder without a parent — confirm
+    name: str
+    path: str
+    kind: str
+
+
+@dataclass(frozen=True)
+class FileEntry:  # immutable record describing one registered file
+    file_ref: str
+    external_id: Optional[str]
+    storage_uri: str
+    source_path: str
+    title: str
+    descriptor: str
+    content_type: str
+    source_type: Optional[str]
+    fingerprint: str
+    text_artifact_path: str  # path of the extracted-text artifact on disk
+    raw_artifact_path: Optional[str]
+    pageindex_doc_id: Optional[str]
+    pageindex_tree_status: str
+    metadata: dict[str, Any]  # frozen only blocks rebinding; the dict itself stays mutable
+    folder_path: str
+    metadata_status: dict[str, Any] = field(default_factory=dict)  # default_factory gives each instance its own dict
+
+
+@dataclass(frozen=True)
+class MetadataField:  # declaration of one metadata field
+    name: str
+    field_type: str  # NOTE(review): the accepted type names are not visible here — confirm with the schema code
+    description: str = ""
+    indexed: bool = True  # the only capability flag that defaults to on
+    faceted: bool = False
+    sortable: bool = False
+    source: str = "manual"
+
+
+@dataclass(frozen=True)
+class CommandResult:  # outcome of one command
+    command: str
+    data: Any  # presumably the structured payload behind `text` — confirm
+    text: str
diff --git a/pifs b/pifs
new file mode 100755
index 000000000..fb2dbc08e
--- /dev/null
+++ b/pifs
@@ -0,0 +1,10 @@
+#!/bin/sh
+set -eu
+# Put the repo root on PYTHONPATH so `-m pageindex...` resolves from any working directory.
+SCRIPT_DIR=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)
+export PYTHONPATH="$SCRIPT_DIR${PYTHONPATH:+:$PYTHONPATH}"
+if [ -x "$SCRIPT_DIR/.venv/bin/python" ]; then
+  exec "$SCRIPT_DIR/.venv/bin/python" -m pageindex.filesystem.cli "$@"
+fi
+
+exec python3 -m pageindex.filesystem.cli "$@"
diff --git a/requirements.txt b/requirements.txt
index e6ad80531..f88e7cb05 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 litellm==1.83.7
-# openai-agents  # optional: required for examples/agentic_vectorless_rag_demo.py
+# openai-agents==0.17.2  # optional: required for pifs chat/ask and examples/agentic_vectorless_rag_demo.py
 pymupdf==1.26.4
 PyPDF2==3.0.1
 python-dotenv==1.2.2
 pyyaml==6.0.2
+sqlite-vec>=0.1.9
diff --git a/tests/test_filesystem_store.py b/tests/test_filesystem_store.py
new file mode 100644
index 000000000..7f425038f
--- /dev/null
+++ b/tests/test_filesystem_store.py
@@ -0,0 +1,45 @@
+import json
+
+
+def test_insert_files_does_not_disable_sqlite_synchronous(tmp_path):
+    """Ensure insert_files never issues ``PRAGMA synchronous=OFF`` (or its ``=0`` alias)."""
+    from pageindex.filesystem.store import SQLiteFileSystemStore
+
+    statements = []  # SQL text captured through the connection trace callback
+
+    class RecordingStore(SQLiteFileSystemStore):
+        def connect(self):
+            conn = super().connect()
+            conn.set_trace_callback(statements.append)
+            return conn
+
+    store = RecordingStore(tmp_path / "workspace")
+    statements.clear()  # ignore anything traced while the store was being constructed
+
+    store.insert_files(
+        [
+            {
+                "file_ref": "ref_report",
+                "external_id": "doc_report",
+                "storage_uri": "file:///tmp/report.pdf",
+                "source_path": "documents/report.pdf",
+                "folder_path": "/documents",
+                "title": "Report",
+                "descriptor": "documents/report.pdf",
+                "content_type": "application/pdf",
+                "source_type": "documents",
+                "fingerprint": "fingerprint",
+                "text_artifact_path": "artifacts/text/ref_report.txt",
+                "raw_artifact_path": None,
+                "metadata": {},
+                "metadata_json": json.dumps({}),
+                "metadata_text": "",
+                "content": "",
+                "skip_fts": True,
+            }
+        ]
+    )
+
+    # Drop all whitespace and a trailing ';' so `PRAGMA synchronous = 0;` is caught too.
+    normalized = {"".join(sql.upper().split()).rstrip(";") for sql in statements}
+    assert not normalized & {"PRAGMASYNCHRONOUS=OFF", "PRAGMASYNCHRONOUS=0"}
diff --git a/tests/test_import_surface.py b/tests/test_import_surface.py
new file mode 100644
index 000000000..b4309cf05
--- /dev/null
+++ b/tests/test_import_surface.py
@@ -0,0 +1,35 @@
+from __future__ import annotations
+
+import builtins
+import importlib
+import sys
+
+
+def test_filesystem_import_works_without_eager_optional_dependencies(monkeypatch):
+    """Import pageindex.filesystem while the optional third-party roots are unimportable."""
+    optional_roots = {"litellm", "openai", "PyPDF2", "pymupdf", "sqlite_vec"}
+    original_import = builtins.__import__
+
+    def clear_pageindex_modules() -> None:
+        # Snapshot the matching keys first: sys.modules is mutated while purging.
+        cached = [key for key in sys.modules if key == "pageindex" or key.startswith("pageindex.")]
+        for key in cached:
+            sys.modules.pop(key, None)
+
+    def import_without_optional_deps(name, globals=None, locals=None, fromlist=(), level=0):
+        top_level = name.partition(".")[0]
+        if top_level not in optional_roots:
+            return original_import(name, globals, locals, fromlist, level)
+        raise ModuleNotFoundError(f"No module named '{top_level}'", name=top_level)
+
+    clear_pageindex_modules()
+    try:
+        with monkeypatch.context() as patch:
+            patch.setattr(builtins, "__import__", import_without_optional_deps)
+            filesystem_module = importlib.import_module("pageindex.filesystem")
+            from pageindex import PageIndexFileSystem as TopLevelPageIndexFileSystem
+            from pageindex.filesystem import PageIndexFileSystem
+            assert filesystem_module.PageIndexFileSystem is PageIndexFileSystem
+            assert TopLevelPageIndexFileSystem is PageIndexFileSystem
+    finally:
+        clear_pageindex_modules()
diff --git a/tests/test_metadata_generation.py b/tests/test_metadata_generation.py
new file mode 100644
index 000000000..3e64a4b9e
--- /dev/null
+++ b/tests/test_metadata_generation.py
@@ -0,0 +1,30 @@
+import sys
+from pathlib import Path
+
+import pytest
+
+REPO_ROOT = Path(__file__).resolve().parents[1]  # directory one level above this test file's folder
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))  # front of sys.path, so REPO_ROOT is searched first on import
+
+
+def test_metadata_generator_uses_provider_parameter():
+    """generate() must raise MetadataGenerationError naming the unsupported provider."""
+    from pageindex.filesystem.metadata_generation import (
+        MetadataGenerationError,
+        MetadataGenerationInput,
+        MetadataGenerator,
+    )
+
+    document = MetadataGenerationInput(
+        file_ref="file_a",
+        external_id="doc_a",
+        title="A",
+        source_path="docs/a.txt",
+        content_type="text/plain",
+        source_type=None,
+        text="hello",
+    )
+    unsupported = MetadataGenerator(provider="unsupported", model="unused")
+    with pytest.raises(MetadataGenerationError, match="unsupported metadata provider: unsupported"):
+        unsupported.generate(document, fields=["summary"])
diff --git a/tests/test_pageindex_filesystem_scope.py b/tests/test_pageindex_filesystem_scope.py
new file mode 100644
index 000000000..087473aab
--- /dev/null
+++ b/tests/test_pageindex_filesystem_scope.py
@@ -0,0 +1,496 @@
+import json
+from types import SimpleNamespace
+
+import pytest
+
+
+def test_filesystem_lazy_exports_remain_public():
+    """The semantic-index exports stay importable and listed in ``__all__``/``dir()``."""
+    import pageindex.filesystem as filesystem
+    from pageindex.filesystem import (
+        HybridProjectionSearchBackend,
+        RebuildableSemanticIndex,
+        SemanticIndexRecord,
+        SemanticSearchResult,
+        SQLiteVecSemanticIndex,
+        SummaryProjectionIndexer,
+    )
+
+    # Expected public name -> the object the import above resolved it to.
+    exported = {
+        "HybridProjectionSearchBackend": HybridProjectionSearchBackend,
+        "RebuildableSemanticIndex": RebuildableSemanticIndex,
+        "SemanticIndexRecord": SemanticIndexRecord,
+        "SemanticSearchResult": SemanticSearchResult,
+        "SQLiteVecSemanticIndex": SQLiteVecSemanticIndex,
+        "SummaryProjectionIndexer": SummaryProjectionIndexer,
+    }
+
+    for public_name in exported:
+        assert public_name in filesystem.__all__
+        assert public_name in dir(filesystem)
+
+    for public_name, resolved in exported.items():
+        assert resolved.__name__ == public_name
+
+
+class SummaryBackend:
+    """Stub retrieval backend exposing only the "summary" channel."""
+
+    def __init__(self, document_id):
+        self.calls = []
+        self.document_id = document_id
+
+    def available_channels(self):
+        return ("summary",)
+
+    def search_channel(self, channel, query, *, limit=10, filters=None):
+        # Log (channel, query, filters) so tests can inspect what was requested;
+        # `limit` is ignored and a single hit is always produced.
+        self.calls.append((channel, query, filters))
+        snippet = f"summary candidate: {query}"
+        return [SimpleNamespace(document_id=self.document_id, snippet=snippet)]
+
+
+class ChannelBackend:
+    """Stub retrieval backend advertising a configurable tuple of channels."""
+
+    def __init__(self, document_id, channels=("summary", "entity", "relation")):
+        self.channels = channels
+        self.document_id = document_id
+
+    def available_channels(self):
+        return self.channels
+
+    def search_channel(self, channel, query, *, limit=10, filters=None):
+        # `limit` and `filters` are accepted but unused; exactly one hit for
+        # the configured document is returned, labelled with the channel name.
+        snippet = f"{channel} candidate: {query}"
+        return [SimpleNamespace(document_id=self.document_id, snippet=snippet)]
+
+
+def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.metadata_generation import MetadataGenerationResult
+
+    class SummaryGenerator:  # stub generator: always returns the same summary value
+        def generate(self, document, *, fields):
+            return MetadataGenerationResult(
+                values={"summary": "Federal Reserve annual report summary"}
+            )
+
+    filesystem = PageIndexFileSystem(
+        workspace=tmp_path / "workspace",
+        metadata_generator=SummaryGenerator(),
+    )
+    file_ref = filesystem.register_file(
+        storage_uri="file:///tmp/report.pdf",
+        source_path="examples/documents/report.pdf",
+        folder_path="/documents",
+        external_id="dsid_report",
+        title="report.pdf",
+        metadata={"source_type": "examples-documents"},
+        content="Federal Reserve supervision and regulation annual report.",
+        metadata_policy={
+            "fields": {
+                "summary": True,
+                "doc_type": False,
+                "domain": False,
+                "topic": False,
+            }
+        },
+    )
+    backend = SummaryBackend("dsid_report")
+    filesystem.semantic_retrieval_backend = backend  # replace the backend with the recording stub
+    executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+    result = json.loads(
+        executor.execute('search-summary "Federal Reserve annual report" /documents')
+    )
+
+    assert backend.calls[0][2] == {}  # third item of the recorded call is `filters`: empty for /documents
+    assert result["data"]["data"][0] == {
+        "path": "/examples/documents/report.pdf",
+        "summary": "Federal Reserve annual report summary",
+        "line_text": "1: Federal Reserve supervision and regulation annual report.",
+    }
+    assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == file_ref  # path resolves back to the registered file
+
+    executor.json_output = False  # repeat the query using the plain-text rendering
+    rendered = executor.execute('search-summary "Federal Reserve annual report" /documents')
+    assert "path: /examples/documents/report.pdf" in rendered
+    assert "summary: Federal Reserve annual report summary" in rendered
+    assert "line_text: 1: Federal Reserve supervision and regulation annual report." in rendered
+    assert "id=dsid_report" not in rendered  # text output must not expose the external id ...
+    assert "file_ref=" not in rendered  # ... nor any file_ref
+
+
+def test_semantic_search_path_is_unique_source_target_when_titles_collide(tmp_path):
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.metadata_generation import MetadataGenerationResult
+
+    class SummaryGenerator:  # stub generator: summary text embeds the document's external_id
+        def generate(self, document, *, fields):
+            return MetadataGenerationResult(
+                values={"summary": f"summary for {document.external_id}"}
+            )
+
+    filesystem = PageIndexFileSystem(
+        workspace=tmp_path / "workspace",
+        metadata_generator=SummaryGenerator(),
+    )
+    first_ref = filesystem.register_file(
+        storage_uri="file:///tmp/first.json",
+        source_path="slack/dsid_first.json",
+        folder_path="/documents",
+        external_id="dsid_first",
+        title="announcements",
+        content="first announcement mentions H200 reservations.",
+        metadata_policy={
+            "fields": {
+                "summary": True,
+                "doc_type": False,
+                "domain": False,
+                "topic": False,
+            }
+        },
+    )
+    filesystem.register_file(  # same folder and title as the first file, different source_path
+        storage_uri="file:///tmp/second.json",
+        source_path="slack/dsid_second.json",
+        folder_path="/documents",
+        external_id="dsid_second",
+        title="announcements",
+        content="second announcement mentions unrelated maintenance.",
+        metadata_policy={
+            "fields": {
+                "summary": True,
+                "doc_type": False,
+                "domain": False,
+                "topic": False,
+            }
+        },
+    )
+    filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first")  # stub always returns the first document
+    executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+    result = json.loads(executor.execute('search-summary "H200 reservations" /documents'))
+
+    assert result["data"]["data"][0]["path"] == "/slack/dsid_first.json"  # path mirrors source_path, not the shared title
+    assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref
+    with pytest.raises(KeyError, match="Ambiguous file target"):  # folder + title target matches both files
+        filesystem.store.resolve_file_ref("/documents/announcements")
+
+
+def test_semantic_search_path_falls_back_when_source_target_is_ambiguous(tmp_path):
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.metadata_generation import MetadataGenerationResult
+
+    class SummaryGenerator:  # stub generator: summary text embeds the document's external_id
+        def generate(self, document, *, fields):
+            return MetadataGenerationResult(
+                values={"summary": f"summary for {document.external_id}"}
+            )
+
+    filesystem = PageIndexFileSystem(
+        workspace=tmp_path / "workspace",
+        metadata_generator=SummaryGenerator(),
+    )
+    first_ref = filesystem.register_file(
+        storage_uri="file:///tmp/first.json",
+        source_path="shared/source.json",
+        folder_path="/documents",
+        external_id="dsid_first",
+        title="First",
+        content="first content",
+        metadata_policy={
+            "fields": {
+                "summary": True,
+                "doc_type": False,
+                "domain": False,
+                "topic": False,
+            }
+        },
+    )
+    filesystem.register_file(  # deliberately reuses the first file's source_path
+        storage_uri="file:///tmp/second.json",
+        source_path="shared/source.json",
+        folder_path="/documents",
+        external_id="dsid_second",
+        title="Second",
+        content="second content",
+        metadata_policy={
+            "fields": {
+                "summary": True,
+                "doc_type": False,
+                "domain": False,
+                "topic": False,
+            }
+        },
+    )
+    filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first")  # stub always returns the first document
+    executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+    result = json.loads(executor.execute('search-summary "first" /documents'))
+
+    assert result["data"]["data"][0]["path"] == "dsid_first"  # reported path equals the external_id here
+    assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref  # and still resolves to the first file
+
+
+def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.metadata_generation import MetadataGenerationResult
+
+    class MetadataGenerator:  # stub generator with canned summary/entity/relation values
+        def generate(self, document, *, fields):
+            values = {
+                "summary": "Risk and compliance summary",
+                "entity": "Federal Reserve; Disney",
+                "relation": "Federal Reserve affects Disney valuation",
+            }
+            return MetadataGenerationResult(values={field: values[field] for field in fields})  # KeyError for any other requested field
+
+    filesystem = PageIndexFileSystem(
+        workspace=tmp_path / "workspace",
+        metadata_generator=MetadataGenerator(),
+    )
+    filesystem.register_file(
+        storage_uri="file:///tmp/market-note.pdf",
+        source_path="examples/documents/market-note.pdf",
+        folder_path="/documents",
+        external_id="dsid_market_note",
+        title="market-note.pdf",
+        content="Federal Reserve policy affects Disney valuation.",
+        metadata_policy={
+            "fields": {
+                "summary": True,
+                "doc_type": False,
+                "domain": False,
+                "topic": False,
+                "entity": True,
+                "relation": True,
+            }
+        },
+    )
+    filesystem.semantic_retrieval_backend = ChannelBackend("dsid_market_note")  # stub offering summary/entity/relation channels
+    executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+    entity = json.loads(executor.execute('search-entity "Federal Reserve" /documents'))
+    assert entity["data"]["data"][0] == {  # exactly these four keys: entity but no relation
+        "path": "/examples/documents/market-note.pdf",
+        "summary": "Risk and compliance summary",
+        "line_text": "1: Federal Reserve policy affects Disney valuation.",
+        "entity": "Federal Reserve; Disney",
+    }
+
+    relation = json.loads(executor.execute('search-relation "Disney valuation" /documents'))
+    assert relation["data"]["data"][0] == {  # exactly these four keys: relation but no entity
+        "path": "/examples/documents/market-note.pdf",
+        "summary": "Risk and compliance summary",
+        "line_text": "1: Federal Reserve policy affects Disney valuation.",
+        "relation": "Federal Reserve affects Disney valuation",
+    }
+
+    executor.json_output = False  # repeat the entity query using the plain-text rendering
+    rendered = executor.execute('search-entity "Federal Reserve" /documents')
+    assert "path: /examples/documents/market-note.pdf" in rendered
+    assert "summary: Risk and compliance summary" in rendered
+    assert "entity: Federal Reserve; Disney" in rendered
+    assert "file_ref=" not in rendered  # text output must not expose a file_ref
+
+
+def test_semantic_search_rejects_unquoted_multi_word_query(tmp_path):
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError
+
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    filesystem.register_file(
+        storage_uri="file:///tmp/report.pdf",
+        source_path="examples/documents/report.pdf",
+        folder_path="/documents",
+        external_id="dsid_report",
+        title="Annual report",
+        content="Federal Reserve supervision and regulation annual report.",
+    )
+    filesystem.semantic_retrieval_backend = SummaryBackend("dsid_report")
+    executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+    with pytest.raises(PIFSCommandError, match="Quote multi-word queries"):  # two bare words followed by a path
+        executor.execute("search-summary Federal Reserve /documents")
+
+    with pytest.raises(PIFSCommandError, match="quote it"):  # two bare words, no path argument
+        executor.execute("search-summary Federal Reserve")
+
+    with pytest.raises(PIFSCommandError, match="does not support regex alternation"):  # quoted query containing '|'
+        executor.execute('search-summary "Federal|Reserve" /documents')
+
+
+def test_semantic_search_scope_filters_explicit_source_type_facets():
+    """Only ``source_type=`` facet folders translate into a semantic filter."""
+    from pageindex.filesystem import PageIndexFileSystem
+
+    expected_filters = {
+        "/source_type=google-drive": {"source_type": "google_drive"},
+        "/semantic/source_type=google-drive": {"source_type": "google_drive"},
+        "/documents": {},
+    }
+    for folder_path, filters in expected_filters.items():
+        scope = {"folder_path": folder_path}
+        assert PageIndexFileSystem._semantic_filters_for_scope(scope) == filters
+
+
+def test_grep_source_file_requires_terms_on_same_line(tmp_path):
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+
+    source_dir = tmp_path / "source" / "documents"
+    source_dir.mkdir(parents=True)
+    source = source_dir / "split.json"
+    source.write_text(  # "alpha" ends up on line 2 of the file and "omega" on line 3
+        '{\n  "first": "alpha evidence lives here",\n'
+        '  "second": "omega evidence lives there"\n}\n',
+        encoding="utf-8",
+    )
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    filesystem.register_file(
+        storage_uri=str(source),
+        source_path="documents/split.json",
+        folder_path="/documents",
+        external_id="doc_split_terms",
+        title="Split source terms",
+        content="registered artifact without the searched tokens",  # registered text contains neither search term
+    )
+    executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+    result = json.loads(executor.execute('grep -R "alpha omega" /documents'))
+
+    assert result["data"]["mode"] == "files"
+    assert result["data"]["data"] == []  # the two terms never share a line, so nothing matches
+
+    matched = json.loads(executor.execute('grep -R "alpha evidence" /documents'))
+
+    assert matched["data"]["data"][0]["external_id"] == "doc_split_terms"
+    assert matched["data"]["data"][0]["line"] == 2  # both terms sit on the second line of split.json
+    assert "alpha evidence" in matched["data"]["data"][0]["text"]
+
+
+def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path, monkeypatch):
+    from pageindex.filesystem import PageIndexFileSystem
+    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex
+
+    workspace = tmp_path / "workspace"
+    index_dir = workspace / "artifacts" / "projection_indexes"
+    summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")  # index is created before the filesystem object
+    summary_index.reset(
+        dimension=3,
+        metadata={
+            "channel": "summary",
+            "embedding_provider": "openai",
+            "embedding_model": "test-embedding",
+            "embedding_dimensions": 3,
+        },
+    )
+    summary_index.upsert_many(
+        [
+            SemanticIndexRecord(
+                file_ref="file_a",
+                external_id="doc_a",
+                source_type="documents",
+                source_path="documents/a.pdf",
+                title="A",
+                text="summary",
+                vector=[1.0, 0.0, 0.0],
+            )
+        ]
+    )
+    filesystem = PageIndexFileSystem(workspace)
+    calls = []  # (index_dir, kwargs) for every fake_configure invocation
+
+    def fake_configure(index_dir_arg, **kwargs):  # stand-in: records its arguments and installs a stub backend
+        calls.append((index_dir_arg, kwargs))
+        filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
+        return filesystem.semantic_retrieval_backend
+
+    monkeypatch.setattr(
+        filesystem,
+        "configure_hybrid_projection_retrieval",
+        fake_configure,
+    )
+
+    assert filesystem.configure_existing_projection_retrieval() is True
+    assert calls == [  # exactly one call, carrying the embedding settings stored in the index metadata
+        (
+            filesystem.summary_projection_index_dir,
+            {
+                "embedding_provider": "openai",
+                "embedding_model": "test-embedding",
+                "embedding_dimensions": 3,
+                "embedding_timeout": 60,  # not in the metadata written above; NOTE(review): presumably a default — confirm
+            },
+        )
+    ]
+    assert filesystem.semantic_retrieval_channels() == ("summary",)
+
+
+def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):
+    from pageindex.filesystem import PageIndexFileSystem
+    from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend
+    from pageindex.filesystem.metadata_generation import MetadataGenerationResult
+    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+
+    class FixedEmbedder:  # stub embedder: no model call
+        def embed(self, texts):
+            return [[1.0, 0.0, 0.0] for _ in texts]  # the same 3-d vector for every input text
+
+    class SummaryGenerator:  # stub generator: always returns the same summary value
+        def generate(self, document, *, fields):
+            return MetadataGenerationResult(
+                values={"summary": "vendor renewal risk matrix"}
+            )
+
+    source = tmp_path / "source.txt"
+    source.write_text("ordinary fixture body", encoding="utf-8")
+    index_dir = tmp_path / "workspace" / "artifacts" / "projection_indexes"
+    indexer = SummaryProjectionIndexer(
+        index_dir,
+        embedder=FixedEmbedder(),
+        embedding_provider="test",
+        embedding_model="fake",
+        embedding_dimensions=3,
+    )
+    backend = HybridProjectionSearchBackend(  # same index_dir and embedding settings as the indexer
+        index_dir,
+        embedder=FixedEmbedder(),
+        embedding_provider="test",
+        embedding_model="fake",
+        embedding_dimensions=3,
+    )
+    filesystem = PageIndexFileSystem(
+        workspace=tmp_path / "workspace",
+        metadata_generator=SummaryGenerator(),
+        summary_projection_indexer=indexer,
+        semantic_retrieval_backend=backend,
+    )
+    filesystem.register_file(
+        storage_uri=source.as_uri(),
+        source_path="docs/source.txt",
+        folder_path="/documents",
+        external_id="doc_summary_only",
+        title="Operations note",
+        content=source.read_text(encoding="utf-8"),
+        metadata={"department": "ops"},
+        metadata_policy={
+            "fields": {
+                "summary": True,
+                "doc_type": False,
+                "domain": False,
+                "topic": False,
+            }
+        },
+    )
+
+    assert filesystem.search("purchase order exposure", semantic=False) == []  # query shares no word with the body or the summary
+
+    results = filesystem.search("purchase order exposure", semantic=True)
+
+    assert [result.external_id for result in results] == ["doc_summary_only"]
+    assert results[0].snippet == "summary_vector rank=1"
diff --git a/tests/test_pageindex_structural_read.py b/tests/test_pageindex_structural_read.py
new file mode 100644
index 000000000..3994aa413
--- /dev/null
+++ b/tests/test_pageindex_structural_read.py
@@ -0,0 +1,766 @@
+import json
+import tempfile
+from pathlib import Path
+
+import pytest
+
+
+def write_pageindex_client_doc(workspace: Path, doc_id: str, doc: dict) -> None:
+    """Persist ``doc`` as ``<doc_id>.json`` and merge its entry into ``_meta.json``."""
+    workspace.mkdir(parents=True, exist_ok=True)
+    (workspace / f"{doc_id}.json").write_text(
+        json.dumps(doc, ensure_ascii=False, indent=2),
+        encoding="utf-8",
+    )
+    entry = {
+        "type": doc.get("type", ""),
+        "doc_name": doc.get("doc_name", ""),
+        "doc_description": doc.get("doc_description", ""),
+        "path": doc.get("path", ""),
+    }
+    if doc.get("type") == "pdf":
+        entry["page_count"] = doc.get("page_count")
+    elif doc.get("type") == "md":
+        entry["line_count"] = doc.get("line_count")
+    meta_path = workspace / "_meta.json"
+    # Merge instead of overwrite so documents written earlier keep their entries.
+    meta = json.loads(meta_path.read_text(encoding="utf-8")) if meta_path.exists() else {}
+    meta[doc_id] = entry
+    meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+class RecordingMetadataGenerator:  # test double: records generate() calls, returns canned values
+    values = {
+        "summary": "Generated retrieval summary.",
+        "doc_type": "technical_note",
+        "domain": "documentation",
+        "topic": "pageindex extraction",
+    }
+
+    def __init__(self):
+        self.calls = []  # one (request, list of requested field names) tuple per call
+
+    def generate(self, request, *, fields):
+        self.calls.append((request, fields := list(fields)))  # materialise once: may be a one-shot iterable
+        return {field: self.values[field] for field in fields if field in self.values}
+
+
+def test_pageindex_structure_options_report_failed_register_build(monkeypatch):
+    from pageindex import PageIndexClient
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+
+    with tempfile.TemporaryDirectory() as tmp:
+        source = Path(tmp) / "report.md"
+        source.write_text("# Report\n\nCached structure is not built yet.", encoding="utf-8")
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+
+        def fail_index(*args, **kwargs):  # replacement for PageIndexClient.index that always raises
+            raise RuntimeError("index failed: extractor unavailable")
+
+        monkeypatch.setattr(PageIndexClient, "index", fail_index)
+        filesystem.register_file(  # registration itself must not raise despite the failing index call
+            storage_uri=source.as_uri(),
+            source_path="docs/report.md",
+            external_id="dsid_structural_missing",
+            title="Structural report",
+            content=source.read_text(encoding="utf-8"),
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        structure = json.loads(executor.execute("cat dsid_structural_missing --structure"))
+        node = json.loads(executor.execute("cat dsid_structural_missing --node 0001"))
+        pages = json.loads(executor.execute("cat dsid_structural_missing --page 1-2"))
+        stat = json.loads(executor.execute("stat dsid_structural_missing"))
+
+        assert structure["data"]["mode"] == "structure"
+        assert structure["data"]["available"] is False
+        assert structure["data"]["status"] == "failed"
+        assert "RuntimeError: index failed: extractor unavailable" in structure["data"]["message"]  # exception type and text are surfaced
+        assert stat["data"]["pageindex_tree_status"] == "failed"
+        assert stat["data"]["metadata_status"]["pageindex_tree"] == {
+            "status": "failed",
+            "owner": "pageindex",
+            "source": "PageIndexClient.index",
+            "error_type": "RuntimeError",
+            "message": "index failed: extractor unavailable",
+        }
+
+        assert node["data"]["mode"] == "node"
+        assert node["data"]["available"] is False
+        assert node["data"]["node_id"] == "0001"  # the requested node id is echoed back
+
+        assert pages["data"]["mode"] == "page"
+        assert pages["data"]["available"] is False
+        assert pages["data"]["pages"] == "1-2"  # the requested page range is echoed back
+
+        assert "cp" not in executor.allowed_commands()  # NOTE(review): presumably guards the read-only command set — confirm
+        assert "mkdir" not in executor.allowed_commands()
+
+
+def test_register_pdf_markdown_uses_pageindex_extracted_text_for_metadata_and_fts(monkeypatch):
+    from pageindex import PageIndexClient
+    from pageindex.filesystem import PageIndexFileSystem
+
+    def fake_index(self, file_path, mode="auto"):  # replacement for PageIndexClient.index: no real extraction
+        suffix = Path(file_path).suffix.lower()
+        doc_id = f"doc_{suffix.lstrip('.')}"  # "doc_pdf" or "doc_md" for the two fixtures below
+        if suffix == ".pdf":
+            doc = {
+                "id": doc_id,
+                "type": "pdf",
+                "path": str(Path(file_path).resolve()),
+                "doc_name": "report.pdf",
+                "doc_description": "",
+                "page_count": 2,
+                "structure": [{"title": "Report", "node_id": "0001", "nodes": []}],
+                "pages": [
+                    {"page": 1, "content": "PageIndex PDF extracted alpha text."},
+                    {"page": 2, "content": "Second PageIndex PDF extracted beta text."},
+                ],
+            }
+        else:
+            doc = {
+                "id": doc_id,
+                "type": "md",
+                "path": str(Path(file_path).resolve()),
+                "doc_name": "notes",
+                "doc_description": "",
+                "line_count": 3,
+                "structure": [
+                    {
+                        "title": "Notes",
+                        "node_id": "0001",
+                        "line_num": 1,
+                        "text": "# Notes\n\nPageIndex Markdown extracted gamma text.",
+                        "nodes": [],
+                    }
+                ],
+            }
+        write_pageindex_client_doc(self.workspace, doc_id, doc)  # write the document and meta files to disk
+        self.documents[doc_id] = doc  # and keep the document in the client's in-memory map
+        return doc_id
+
+    monkeypatch.setattr(PageIndexClient, "index", fake_index)
+    with tempfile.TemporaryDirectory() as tmp:
+        source_pdf = Path(tmp) / "report.pdf"
+        source_md = Path(tmp) / "notes.md"
+        source_pdf.write_bytes(b"%PDF-1.4\n% test fixture\n")
+        source_md.write_text("# Notes\n\nCaller markdown content", encoding="utf-8")
+        generator = RecordingMetadataGenerator()
+        filesystem = PageIndexFileSystem(
+            workspace=Path(tmp) / "workspace",
+            metadata_generator=generator,
+        )
+
+        filesystem.register_file(
+            storage_uri=source_pdf.as_uri(),
+            source_path="docs/report.pdf",
+            external_id="dsid_pdf_extracted",
+            title="PDF extracted",
+            content="CALLER PDF CONTENT MUST NOT REACH GENERATOR",
+        )
+        filesystem.register_file(
+            storage_uri=source_md.as_uri(),
+            source_path="docs/notes.md",
+            external_id="dsid_md_extracted",
+            title="Markdown extracted",
+            content="CALLER MD CONTENT MUST NOT REACH GENERATOR",
+        )
+
+        pdf_request = generator.calls[0][0]  # assumes one generate() call per registration, PDF first
+        md_request = generator.calls[1][0]
+        pdf_stat = filesystem.store.file_info("dsid_pdf_extracted")
+        md_stat = filesystem.store.file_info("dsid_md_extracted")
+
+        assert "PageIndex PDF extracted alpha text" in pdf_request.text
+        assert "Second PageIndex PDF extracted beta text" in pdf_request.text  # text from both pages reaches the generator
+        assert "CALLER PDF CONTENT" not in pdf_request.text
+        assert "PageIndex Markdown extracted gamma text" in md_request.text
+        assert "CALLER MD CONTENT" not in md_request.text
+        assert "PageIndex PDF extracted alpha text" in Path(
+            pdf_stat["text_artifact_path"]
+        ).read_text(encoding="utf-8")
+        assert "PageIndex Markdown extracted gamma text" in Path(
+            md_stat["text_artifact_path"]
+        ).read_text(encoding="utf-8")
+        assert [r.external_id for r in filesystem.search("alpha beta", limit=5)] == [  # "alpha" is on page 1, "beta" on page 2
+            "dsid_pdf_extracted"
+        ]
+        assert [r.external_id for r in filesystem.search("gamma", limit=5)] == [
+            "dsid_md_extracted"
+        ]
+        assert filesystem.search("CALLER", limit=5) == []  # caller-supplied content is not searchable
+
+
+def test_register_text_metadata_generation_keeps_caller_content_without_pageindex(monkeypatch):
+    from pageindex import PageIndexClient
+    from pageindex.filesystem import PageIndexFileSystem
+
+    def fail_index(*args, **kwargs):  # any PageIndexClient.index call raises AssertionError
+        raise AssertionError("PageIndexClient.index should not be called for text files")
+
+    monkeypatch.setattr(PageIndexClient, "index", fail_index)
+    with tempfile.TemporaryDirectory() as tmp:
+        generator = RecordingMetadataGenerator()
+        filesystem = PageIndexFileSystem(
+            workspace=Path(tmp) / "workspace",
+            metadata_generator=generator,
+        )
+
+        filesystem.register_file(
+            storage_uri="file:///tmp/readme.txt",
+            source_path="docs/readme.txt",
+            external_id="dsid_text_generation",
+            title="Text generation",
+            content="Plain text caller content stays authoritative.",
+            content_type="text/plain",
+        )
+
+        stat = filesystem.store.file_info("dsid_text_generation")
+
+        assert generator.calls[0][0].text == "Plain text caller content stays authoritative."  # generator saw the caller's text verbatim
+        assert stat["pageindex_doc_id"] is None
+        assert stat["pageindex_tree_status"] == "not_built"
+        assert Path(stat["text_artifact_path"]).read_text(  # the stored text artifact holds the same content
+            encoding="utf-8"
+        ) == "Plain text caller content stays authoritative."
+
+
+def test_register_pdf_markdown_cache_miss_invokes_pageindex_client_index(monkeypatch):
+    from pageindex import PageIndexClient
+    from pageindex.filesystem import PageIndexFileSystem
+    # Registering a PDF and a Markdown file here must call the patched index once per file.
+    calls: list[str] = []
+
+    def fake_index(self, file_path, mode="auto"):
+        calls.append(str(file_path))
+        doc_id = f"doc_{Path(file_path).suffix.lstrip('.')}"
+        doc_type = "pdf" if Path(file_path).suffix == ".pdf" else "md"
+        doc = {
+            "id": doc_id,
+            "type": doc_type,
+            "path": str(Path(file_path).resolve()),
+            "doc_name": Path(file_path).name,
+            "doc_description": "",
+            "structure": [{"title": Path(file_path).stem, "node_id": "0001", "nodes": []}],
+        }
+        if doc_type == "pdf":
+            doc["page_count"] = 1
+            doc["pages"] = [{"page": 1, "content": "Page one text"}]
+        else:
+            doc["line_count"] = 1
+        write_pageindex_client_doc(self.workspace, doc_id, doc)
+        self.documents[doc_id] = doc
+        return doc_id
+
+    monkeypatch.setattr(PageIndexClient, "index", fake_index)
+    with tempfile.TemporaryDirectory() as tmp:
+        source_pdf = Path(tmp) / "report.pdf"
+        source_md = Path(tmp) / "notes.md"
+        source_pdf.write_bytes(b"%PDF-1.4\n% test fixture\n")
+        source_md.write_text("# Notes", encoding="utf-8")
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+
+        filesystem.register_file(
+            storage_uri=str(source_pdf),
+            source_path="docs/report.pdf",
+            external_id="dsid_pdf_build",
+            title="PDF build",
+            content="pdf text",
+        )
+        filesystem.register_file(
+            storage_uri=source_md.as_uri(),
+            source_path="docs/notes.md",
+            external_id="dsid_md_build",
+            title="Markdown build",
+            content=source_md.read_text(encoding="utf-8"),
+        )
+
+        pdf_stat = filesystem.store.file_info("dsid_pdf_build")
+        md_stat = filesystem.store.file_info("dsid_md_build")
+        # Both storage_uri forms (plain path and file:// URI) must reach index as resolved paths.
+        assert calls == [str(source_pdf.resolve()), str(source_md.resolve())]
+        assert pdf_stat["pageindex_doc_id"] == "doc_pdf"
+        assert pdf_stat["pageindex_tree_status"] == "built"
+        assert md_stat["pageindex_doc_id"] == "doc_md"
+        assert md_stat["pageindex_tree_status"] == "built"
+
+
+def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monkeypatch):
+    from pageindex import PageIndexClient
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError
+    # A document pre-written with write_pageindex_client_doc must be served without indexing.
+    with tempfile.TemporaryDirectory() as tmp:
+        source = Path(tmp) / "report.pdf"
+        source.write_bytes(b"%PDF-1.4\n% test fixture\n")
+        workspace = Path(tmp) / "workspace"
+        filesystem = PageIndexFileSystem(workspace=workspace)
+        write_pageindex_client_doc(
+            filesystem.pageindex_client_workspace,
+            "doc_cached_pdf",
+            {
+                "id": "doc_cached_pdf",
+                "type": "pdf",
+                "path": str(source.resolve()),
+                "doc_name": "report.pdf",
+                "doc_description": "",
+                "page_count": 2,
+                "structure": [
+                    {
+                        "title": "Introduction",
+                        "node_id": "0001",
+                        "text": "Intro section text",
+                        "nodes": [
+                            {
+                                "title": "Findings",
+                                "node_id": "0002",
+                                "physical_index": 2,
+                                "nodes": [],
+                            }
+                        ],
+                    }
+                ],
+                "pages": [
+                    {"page": 1, "content": "Page one text"},
+                    {"page": 2, "content": "Page two text"},
+                ],
+            },
+        )
+
+        def fail_index(*args, **kwargs):
+            raise AssertionError("PageIndexClient.index should not be called on cache hit")
+        # From here on, any indexing attempt fails the test.
+        monkeypatch.setattr(PageIndexClient, "index", fail_index)
+        filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path="docs/report.pdf",
+            external_id="dsid_structural_cached",
+            title="Cached structural report",
+            content="text artifact remains available for grep, not cat --all",
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        structure = json.loads(executor.execute("cat dsid_structural_cached --structure"))
+        pages = json.loads(executor.execute("cat dsid_structural_cached --page 1-2"))
+        stat = json.loads(executor.execute("stat dsid_structural_cached"))
+        # The nested "Findings" node is expected as a second flat entry, with "text" omitted.
+        assert structure["data"]["available"] is True
+        assert structure["data"]["pageindex_doc_id"] == "doc_cached_pdf"
+        assert structure["data"]["structure"][0]["title"] == "Introduction"
+        assert structure["data"]["structure"][1]["title"] == "Findings"
+        assert structure["data"]["structure_pagination"]["limit"] == 25
+        assert "text" not in structure["data"]["structure"][0]
+        assert "text" not in structure["data"]["structure"][1]
+
+        assert pages["data"]["available"] is True
+        assert pages["data"]["text"] == "Page one text\n\nPage two text"
+        with pytest.raises(PIFSCommandError, match="target-first"):
+            executor.execute("cat --page 1-2 dsid_structural_cached")
+        with pytest.raises(PIFSCommandError, match="one file target"):
+            executor.execute("cat dsid_structural_cached --page 1 2")
+
+        assert stat["data"]["pageindex_doc_id"] == "doc_cached_pdf"
+        assert stat["data"]["pageindex_tree_status"] == "built"
+
+
+def test_cat_node_reads_pageindex_client_structure_without_custom_pifs_artifact():
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    # cat --node must return node text from the document written via write_pageindex_client_doc.
+    with tempfile.TemporaryDirectory() as tmp:
+        source = Path(tmp) / "notes.md"
+        source.write_text("# Notes\n\nBody", encoding="utf-8")
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+        write_pageindex_client_doc(
+            filesystem.pageindex_client_workspace,
+            "doc_cached_md",
+            {
+                "id": "doc_cached_md",
+                "type": "md",
+                "path": str(source.resolve()),
+                "doc_name": "notes",
+                "doc_description": "",
+                "line_count": 3,
+                "structure": [
+                    {
+                        "title": "Notes",
+                        "node_id": "0001",
+                        "line_num": 1,
+                        "text": "# Notes\n\nBody",
+                        "nodes": [],
+                    }
+                ],
+            },
+        )
+        filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path="docs/notes.md",
+            external_id="dsid_md_cached",
+            title="Cached markdown notes",
+            content=source.read_text(encoding="utf-8"),
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        node = json.loads(executor.execute("cat dsid_md_cached --node 0001"))
+        # Node text is reported at data["text"] and removed from the node entry itself.
+        assert node["data"]["available"] is True
+        assert node["data"]["pageindex_doc_id"] == "doc_cached_md"
+        assert node["data"]["node"]["title"] == "Notes"
+        assert node["data"]["text"] == "# Notes\n\nBody"
+        assert "text" not in node["data"]["node"]
+
+
+def test_cat_structure_page_node_and_text_outputs_are_hard_limited():
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError
+    # Pins the per-call caps asserted below: 25 structure nodes, 5 pages, 10 nodes, 100 lines.
+    with tempfile.TemporaryDirectory() as tmp:
+        source = Path(tmp) / "report.pdf"
+        source.write_bytes(b"%PDF-1.4\n% test fixture\n")
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+        structure_nodes = [
+            {
+                "title": f"Section {index}",
+                "node_id": f"{index:04d}",
+                "start_index": index,
+                "end_index": index,
+                "text": f"node {index} text",
+                "nodes": [],
+            }
+            for index in range(1, 31)
+        ]
+        write_pageindex_client_doc(
+            filesystem.pageindex_client_workspace,
+            "doc_limited_pdf",
+            {
+                "id": "doc_limited_pdf",
+                "type": "pdf",
+                "path": str(source.resolve()),
+                "doc_name": "report.pdf",
+                "doc_description": "",
+                "page_count": 10,
+                "structure": structure_nodes,
+                "pages": [
+                    {"page": index, "content": f"Page {index} text"}
+                    for index in range(1, 11)
+                ],
+            },
+        )
+        filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path="docs/report.pdf",
+            external_id="dsid_limited_pdf",
+            title="Limited structural report",
+            content="text artifact remains available for grep",
+        )
+        text_content = "\n".join(f"line {index}" for index in range(1, 106))
+        filesystem.register_file(
+            storage_uri="file:///tmp/long.txt",
+            source_path="docs/long.txt",
+            external_id="dsid_long_text",
+            title="Long text",
+            content=text_content,
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+        # Structure listing: the 30 nodes are served 25 per call and continued with --offset.
+        first_structure = json.loads(executor.execute("cat dsid_limited_pdf --structure"))
+        assert len(first_structure["data"]["structure"]) == 25
+        assert first_structure["data"]["structure_pagination"]["has_more"] is True
+        assert first_structure["data"]["structure_pagination"]["next_offset"] == 25
+
+        second_structure = json.loads(
+            executor.execute("cat dsid_limited_pdf --structure --offset 25")
+        )
+        assert len(second_structure["data"]["structure"]) == 5
+        assert second_structure["data"]["structure"][0]["node_id"] == "0026"
+        # Page reads: a 5-page range succeeds, a 6-page range is rejected.
+        pages = json.loads(executor.execute("cat dsid_limited_pdf --page 1-5"))
+        assert pages["data"]["text"] == (
+            "Page 1 text\n\nPage 2 text\n\nPage 3 text\n\nPage 4 text\n\nPage 5 text"
+        )
+        assert pages["data"]["page_pagination"]["limit"] == 5
+        with pytest.raises(PIFSCommandError, match="at most 5"):
+            executor.execute("cat dsid_limited_pdf --page 1-6")
+        with pytest.raises(PIFSCommandError, match="evidence is sufficient"):
+            executor.execute("cat dsid_limited_pdf --page 1-6")
+        # Node reads: 10 ids (space- or comma-separated) succeed, an 11th id is rejected.
+        nodes = json.loads(
+            executor.execute(
+                "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 "
+                "0006 0007 0008 0009 0010"
+            )
+        )
+        assert nodes["data"]["node_ids"] == [
+            "0001",
+            "0002",
+            "0003",
+            "0004",
+            "0005",
+            "0006",
+            "0007",
+            "0008",
+            "0009",
+            "0010",
+        ]
+        comma_nodes = json.loads(
+            executor.execute("cat dsid_limited_pdf --node 0001,0002")
+        )
+        assert comma_nodes["data"]["node_ids"] == ["0001", "0002"]
+        with pytest.raises(PIFSCommandError, match="at most 10"):
+            executor.execute(
+                "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 "
+                "0006 0007 0008 0009 0010 0011"
+            )
+        with pytest.raises(PIFSCommandError, match="continue with additional chunks"):
+            executor.execute(
+                "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 "
+                "0006 0007 0008 0009 0010 0011"
+            )
+
+        with pytest.raises(PIFSCommandError, match="quote the whole target"):
+            executor.execute("cat dsid_limited_pdf 0001")
+        # Text reads: the 105-line file is cut after line 100; a 101-line range is rejected.
+        text = json.loads(executor.execute("cat dsid_long_text --all"))
+        assert "line 100" in text["data"]["text"]
+        assert "line 101" not in text["data"]["text"]
+        assert text["data"]["pagination"]["has_more"] is True
+        assert text["data"]["pagination"]["next_range"] == "101-105"
+        with pytest.raises(PIFSCommandError, match="at most 100"):
+            executor.execute("cat dsid_long_text --range 1-101")
+
+
+def test_tree_folder_behavior_is_preserved():
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    # tree on /docs must list the nested /docs/reports folder the file was registered under.
+    with tempfile.TemporaryDirectory() as tmp:
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+        filesystem.register_file(
+            storage_uri="file:///tmp/report.txt",
+            source_path="docs/report.txt",
+            folder_path="/docs/reports",
+            external_id="dsid_folder_tree",
+            title="Folder report",
+            content="folder tree behavior remains intact",
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        folder_tree = json.loads(executor.execute("tree /docs --depth 2"))
+
+        assert folder_tree["data"]["path"] == "/docs"
+        assert folder_tree["data"]["folders"][0]["path"] == "/docs/reports"
+
+
+def test_tree_does_not_read_file_internal_pageindex_structure():
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError
+    # tree with a file target must raise, while cat --structure still serves the file outline.
+    with tempfile.TemporaryDirectory() as tmp:
+        source = Path(tmp) / "report.pdf"
+        source.write_bytes(b"%PDF-1.4\n% test fixture\n")
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+        write_pageindex_client_doc(
+            filesystem.pageindex_client_workspace,
+            "doc_tree_is_folder_only",
+            {
+                "id": "doc_tree_is_folder_only",
+                "type": "pdf",
+                "path": str(source.resolve()),
+                "doc_name": "report.pdf",
+                "doc_description": "",
+                "page_count": 1,
+                "structure": [
+                    {"title": "Introduction", "node_id": "0001", "nodes": []}
+                ],
+                "pages": [{"page": 1, "content": "Page one text"}],
+            },
+        )
+        filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path="docs/report.pdf",
+            external_id="dsid_tree_is_folder_only",
+            title="Cached structural report",
+            content="text artifact remains available",
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        with pytest.raises(PIFSCommandError):
+            executor.execute("tree dsid_tree_is_folder_only")
+
+        structure = json.loads(executor.execute("cat dsid_tree_is_folder_only --structure"))
+        assert structure["data"]["structure"][0]["title"] == "Introduction"
+
+
+def test_cat_all_is_limited_to_text_files():
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError
+    # Registers one .txt, .pdf, .md and .json file and checks which raw-text reads each allows.
+    with tempfile.TemporaryDirectory() as tmp:
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+        filesystem.register_file(
+            storage_uri="file:///tmp/readme.txt",
+            source_path="docs/readme.txt",
+            external_id="dsid_text_file",
+            title="Text readme",
+            content="plain text body",
+        )
+        filesystem.register_file(
+            storage_uri="file:///tmp/report.pdf",
+            source_path="docs/report.pdf",
+            external_id="dsid_pdf_file",
+            title="PDF report",
+            content="extracted text should not be served through cat --all",
+        )
+        filesystem.register_file(
+            storage_uri="file:///tmp/notes.md",
+            source_path="docs/notes.md",
+            external_id="dsid_md_file",
+            title="Markdown notes",
+            content="markdown text should use PageIndex structure reads",
+        )
+        filesystem.register_file(
+            storage_uri="file:///tmp/data.json",
+            source_path="docs/data.json",
+            external_id="dsid_json_file",
+            title="JSON record",
+            content='{"body":"json"}',
+            content_type="application/json",
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        text = json.loads(executor.execute("cat dsid_text_file --all"))
+        assert text["data"]["text"] == "plain text body"
+        # PDF/Markdown: cat --all and open() are refused. JSON: cat --all refused, open() works.
+        with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
+            executor.execute("cat dsid_pdf_file --all")
+        with pytest.raises(ValueError, match="not supported for PDF/Markdown"):
+            filesystem.open("dsid_pdf_file")
+        with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
+            executor.execute("cat dsid_md_file --all")
+        with pytest.raises(ValueError, match="not supported for PDF/Markdown"):
+            filesystem.open("dsid_md_file")
+        with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
+            executor.execute("cat dsid_json_file --all")
+        opened_json = filesystem.open("dsid_json_file")
+        assert opened_json.text == '{"body":"json"}'
+        for command in (
+            "head dsid_pdf_file",
+            "tail dsid_pdf_file",
+            "sed -n 1,1p dsid_pdf_file",
+            "head dsid_md_file",
+            "tail dsid_md_file",
+            "sed -n 1,1p dsid_md_file",
+        ):
+            with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
+                executor.execute(command)
+
+
+def test_pageindex_structure_commands_are_limited_to_pdf_and_markdown():
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError
+    # cat --structure, --page and --node must all be refused for a plain .txt registration.
+    with tempfile.TemporaryDirectory() as tmp:
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+        filesystem.register_file(
+            storage_uri="file:///tmp/readme.txt",
+            source_path="docs/readme.txt",
+            external_id="dsid_text_only",
+            title="Text readme",
+            content="plain text body",
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        for command in (
+            "cat dsid_text_only --structure",
+            "cat dsid_text_only --page 1",
+            "cat dsid_text_only --node 0001",
+        ):
+            with pytest.raises(PIFSCommandError, match="only supported for PDF/Markdown"):
+                executor.execute(command)
+
+
+def test_existing_pageindex_status_allows_legacy_record_without_format_suffix():
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError
+    # The source has no .pdf/.md suffix; an explicitly set pointer must still enable --structure.
+    with tempfile.TemporaryDirectory() as tmp:
+        source = Path(tmp) / "uploaded"
+        source.write_text("# Uploaded\n\nBody", encoding="utf-8")
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+        file_ref = filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path="uploads/uploaded",
+            external_id="dsid_legacy_pageindex",
+            title="Legacy PageIndex record",
+            content="text/plain is only a weak default here",
+        )
+        write_pageindex_client_doc(
+            filesystem.pageindex_client_workspace,
+            "doc_legacy_pageindex",
+            {
+                "id": "doc_legacy_pageindex",
+                "type": "md",
+                "path": str(source.resolve()),
+                "doc_name": "uploaded",
+                "doc_description": "",
+                "line_count": 3,
+                "structure": [
+                    {"title": "Uploaded", "node_id": "0001", "text": "Body", "nodes": []}
+                ],
+            },
+        )
+        filesystem.store.update_pageindex_pointer(
+            file_ref,
+            pageindex_doc_id="doc_legacy_pageindex",
+            pageindex_tree_status="built",
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+        # Structure comes from the linked document, and cat --all is refused for this record.
+        structure = json.loads(executor.execute("cat dsid_legacy_pageindex --structure"))
+        assert structure["data"]["structure"][0]["title"] == "Uploaded"
+        with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
+            executor.execute("cat dsid_legacy_pageindex --all")
+
+
+def test_read_commands_do_not_link_pageindex_cache_when_pointer_is_missing(monkeypatch):
+    from pageindex import PageIndexClient
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    # index raises during registration; a document written afterwards must not get linked.
+    with tempfile.TemporaryDirectory() as tmp:
+        source = Path(tmp) / "late.md"
+        source.write_text("# Late\n\nBody", encoding="utf-8")
+        filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
+
+        def fail_index(*args, **kwargs):
+            raise RuntimeError("index failed")
+
+        monkeypatch.setattr(PageIndexClient, "index", fail_index)
+        filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path="docs/late.md",
+            external_id="dsid_late_cache",
+            title="Late cache",
+            content=source.read_text(encoding="utf-8"),
+        )
+        write_pageindex_client_doc(
+            filesystem.pageindex_client_workspace,
+            "doc_late_cache",
+            {
+                "id": "doc_late_cache",
+                "type": "md",
+                "path": str(source.resolve()),
+                "doc_name": "late",
+                "doc_description": "",
+                "line_count": 3,
+                "structure": [
+                    {"title": "Late", "node_id": "0001", "text": "Body", "nodes": []}
+                ],
+            },
+        )
+        executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+        structure = json.loads(executor.execute("cat dsid_late_cache --structure"))
+        stat = json.loads(executor.execute("stat dsid_late_cache"))
+        # The read stays unavailable and stat still reports the failed, unlinked state.
+        assert structure["data"]["available"] is False
+        assert stat["data"]["pageindex_doc_id"] is None
+        assert stat["data"]["pageindex_tree_status"] == "failed"
diff --git a/tests/test_pifs_agent_stream.py b/tests/test_pifs_agent_stream.py
new file mode 100644
index 000000000..69f62edde
--- /dev/null
+++ b/tests/test_pifs_agent_stream.py
@@ -0,0 +1,272 @@
+import io
+import os
+import threading
+import unittest
+from unittest.mock import patch
+from types import SimpleNamespace
+
+from pydantic import BaseModel, ConfigDict
+
+from pageindex.filesystem import agent as agent_module
+from pageindex.filesystem.agent import (
+    AGENT_TOOL_POLICY,
+    AGENT_SYSTEM_PROMPT,
+    BASH_TOOL_DESCRIPTION,
+    PIFSAgentSession,
+    PIFSAgentStreamObserver,
+    build_agent_model_settings,
+    normalize_agent_stream_mode,
+    normalize_reasoning_effort,
+    normalize_reasoning_summary,
+    pifs_agent_raw_reasoning_enabled,
+    serialize_agent_final_output,
+    should_disable_pifs_agent_tracing,
+    should_use_openai_compatible_chat_model,
+)
+
+
+class StructuredAnswer(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+    # Two-field model used in this file as sample input for serialize_agent_final_output.
+    answer: str
+    document_ids: list[str]
+
+
+class PIFSAgentStreamTest(unittest.TestCase):
+    def raw_event(self, event_type, delta):
+        return SimpleNamespace(  # stand-in event exposing .type and .data.type / .data.delta
+            type="raw_response_event",
+            data=SimpleNamespace(type=event_type, delta=delta),
+        )
+
+    def test_model_stream_prints_output_and_think_deltas(self):
+        output = io.StringIO()
+        stream_log = []
+        observer = PIFSAgentStreamObserver("model", stream_log=stream_log, output=output)
+        # Feed one reasoning-summary delta followed by two output-text deltas, then finish.
+        observer.handle_event(self.raw_event("response.reasoning_summary_text.delta", "look up folder"))
+        observer.handle_event(self.raw_event("response.output_text.delta", '{"answer":'))
+        observer.handle_event(self.raw_event("response.output_text.delta", '"done"}'))
+        observer.finish()
+        # Expected log: the output deltas joined into one entry, listed before the summary entry.
+        printed = output.getvalue()
+        self.assertIn("[llm reasoning summary stream]", printed)
+        self.assertIn("look up folder", printed)
+        self.assertIn("[llm final output stream]", printed)
+        self.assertIn('{"answer":"done"}', printed.replace("\n", ""))
+        self.assertEqual(
+            stream_log,
+            [
+                {"kind": "output", "text": '{"answer":"done"}'},
+                {"kind": "think_summary", "text": "look up folder"},
+            ],
+        )
+
+    def test_tools_mode_does_not_print_model_text(self):
+        output = io.StringIO()
+        stream_log = []
+        observer = PIFSAgentStreamObserver("tools", stream_log=stream_log, output=output)
+
+        observer.handle_event(self.raw_event("response.output_text.delta", "hidden from tools mode"))
+        observer.handle_event(self.raw_event("response.function_call_arguments.delta", '{"command":"ls /"}'))
+        observer.emit_tool_call("ls /")
+        observer.emit_tool_result(ok=True, output='{"ok": true}', seconds=0.001)
+        observer.finish()
+        # Model text must stay unprinted; the log is expected as tool_call, tool_result, tool_args.
+        printed = output.getvalue()
+        self.assertNotIn("hidden from tools mode", printed)
+        self.assertIn("[llm -> pifs command]", printed)
+        self.assertIn("ls /", printed)
+        self.assertIn("[pifs -> llm result preview]", printed)
+        self.assertIn('{"ok": true}', printed)
+        self.assertEqual(stream_log[0], {"kind": "tool_call", "command": "ls /"})
+        self.assertEqual(stream_log[1]["kind"], "tool_result")
+        self.assertEqual(stream_log[2], {"kind": "tool_args", "text": '{"command":"ls /"}'})
+
+    def test_empty_tool_command_is_not_printed_or_logged(self):
+        output = io.StringIO()
+        stream_log = []
+        observer = PIFSAgentStreamObserver("tools", stream_log=stream_log, output=output)
+        # Empty and whitespace-only commands must produce no output and no log entries.
+        observer.emit_tool_call("")
+        observer.emit_tool_call("   ")
+
+        self.assertEqual(output.getvalue(), "")
+        self.assertEqual(stream_log, [])
+
+    def test_tool_result_preview_compacts_large_outputs(self):
+        output = io.StringIO()
+        observer = PIFSAgentStreamObserver("tools", output=output)
+        # A 50-line result must be previewed: the first line is shown, the last line is omitted.
+        observer.emit_tool_result(
+            ok=True,
+            output="\n".join(f"line {index}" for index in range(50)),
+            seconds=0.001,
+        )
+
+        printed = output.getvalue()
+        self.assertIn("[large PIFS result", printed)
+        self.assertIn("line 0", printed)
+        self.assertIn("more lines omitted from preview", printed)
+        self.assertNotIn("line 49", printed)
+
+    def test_raw_reasoning_is_not_logged_by_default_but_summary_is(self):
+        output = io.StringIO()
+        stream_log = []
+        previous = os.environ.pop("PAGEINDEX_PIFS_AGENT_RAW_REASONING", None)  # unset the flag
+        try:
+            observer = PIFSAgentStreamObserver("model", stream_log=stream_log, output=output)
+            observer.handle_event(self.raw_event("response.reasoning_text.delta", "private chain"))
+            observer.handle_event(
+                self.raw_event("response.reasoning_summary_text.delta", "visible summary")
+            )
+            observer.finish()
+        finally:
+            if previous is not None:
+                os.environ["PAGEINDEX_PIFS_AGENT_RAW_REASONING"] = previous
+        # Only the summary delta may be printed or logged; the raw reasoning delta must be absent.
+        printed = output.getvalue()
+        self.assertNotIn("private chain", printed)
+        self.assertIn("visible summary", printed)
+        self.assertEqual(stream_log, [{"kind": "think_summary", "text": "visible summary"}])
+
+    def test_raw_reasoning_requires_debug_env_flag(self):
+        self.assertFalse(pifs_agent_raw_reasoning_enabled({}))  # no flag in the mapping
+        self.assertTrue(
+            pifs_agent_raw_reasoning_enabled({"PAGEINDEX_PIFS_AGENT_RAW_REASONING": "on"})
+        )
+        self.assertTrue(  # upper-case value is accepted as well
+            pifs_agent_raw_reasoning_enabled({"PAGEINDEX_PIFS_AGENT_RAW_REASONING": "TRUE"})
+        )
+        self.assertFalse(
+            pifs_agent_raw_reasoning_enabled({"PAGEINDEX_PIFS_AGENT_RAW_REASONING": "0"})
+        )
+
+    def test_stream_mode_aliases(self):
+        self.assertEqual(normalize_agent_stream_mode("think"), "model")  # alias
+        self.assertEqual(normalize_agent_stream_mode("debug"), "all")  # alias
+        self.assertEqual(normalize_agent_stream_mode(""), "off")  # empty string maps to "off"
+        with self.assertRaises(ValueError):  # unknown mode names are rejected
+            normalize_agent_stream_mode("nope")
+
+    def test_reasoning_settings_enable_effort_and_summary(self):
+        settings = build_agent_model_settings(
+            reasoning_effort="medium",
+            reasoning_summary="detailed",
+        )
+        # Effort and summary must be passed through; verbosity is expected to be "low".
+        self.assertIsNotNone(settings)
+        self.assertEqual(settings.reasoning.effort, "medium")
+        self.assertEqual(settings.reasoning.summary, "detailed")
+        self.assertEqual(settings.verbosity, "low")
+
+    def test_reasoning_effort_defaults_to_visible_summary(self):
+        settings = build_agent_model_settings(reasoning_effort="low")
+        # With no reasoning_summary argument, the summary is expected to be "auto".
+        self.assertIsNotNone(settings)
+        self.assertEqual(settings.reasoning.effort, "low")
+        self.assertEqual(settings.reasoning.summary, "auto")
+
+    def test_reasoning_and_base_url_normalization(self):
+        self.assertEqual(normalize_reasoning_effort("xhigh"), "xhigh")
+        self.assertIsNone(normalize_reasoning_summary("none"))  # "none" normalizes to None
+        self.assertFalse(should_use_openai_compatible_chat_model(None))  # no base URL given
+        self.assertFalse(should_use_openai_compatible_chat_model("https://api.openai.com/v1/"))
+        self.assertTrue(should_use_openai_compatible_chat_model("https://example.test/v1"))
+        with self.assertRaises(ValueError):  # unknown effort names are rejected
+            normalize_reasoning_effort("maximum")
+
+    def test_tracing_is_disabled_by_default_unless_env_enables_it(self):
+        self.assertTrue(should_disable_pifs_agent_tracing({}))  # flag absent: tracing disabled
+        self.assertFalse(  # "1", "true" and "on" each enable tracing
+            should_disable_pifs_agent_tracing({"PAGEINDEX_PIFS_AGENT_TRACING": "1"})
+        )
+        self.assertFalse(
+            should_disable_pifs_agent_tracing({"PAGEINDEX_PIFS_AGENT_TRACING": "true"})
+        )
+        self.assertFalse(
+            should_disable_pifs_agent_tracing({"PAGEINDEX_PIFS_AGENT_TRACING": "on"})
+        )
+        self.assertTrue(  # "0" keeps tracing disabled
+            should_disable_pifs_agent_tracing({"PAGEINDEX_PIFS_AGENT_TRACING": "0"})
+        )
+
+    def test_structured_agent_output_serializes_to_json(self):
+        output = serialize_agent_final_output(
+            StructuredAnswer(answer="done", document_ids=["dsid_1"])
+        )
+        # The expected JSON text is compact: no whitespace after "," or ":".
+        self.assertEqual(output, '{"answer":"done","document_ids":["dsid_1"]}')
+
+    def test_prompt_tells_agent_when_to_choose_node_or_page(self):  # substring checks on the prompt constants
+        self.assertIn("prefer cat <target> --node <node_id>", AGENT_TOOL_POLICY)
+        self.assertIn("page-level evidence", AGENT_TOOL_POLICY)
+        self.assertIn("prefer\ncat <path> --node <node_id>", BASH_TOOL_DESCRIPTION)  # expected text spans a newline
+        self.assertIn("stop if the evidence is sufficient", AGENT_TOOL_POLICY)
+        self.assertIn("continue with another chunk before answering", BASH_TOOL_DESCRIPTION)
+        self.assertIn("Do not reconstruct paths from document titles", BASH_TOOL_DESCRIPTION)
+        self.assertIn("file_ref/document_id", AGENT_TOOL_POLICY)
+
+    def test_prompt_requires_stat_for_metadata_questions(self):  # pins the stat-related wording in both constants
+        self.assertIn("stat --schema and stat <target>", AGENT_TOOL_POLICY)
+        self.assertIn("do not infer metadata presence or absence", AGENT_TOOL_POLICY)
+        self.assertIn("questions about metadata fields", BASH_TOOL_DESCRIPTION)
+        self.assertIn("Use stat only for metadata/schema/status questions", AGENT_TOOL_POLICY)
+        self.assertIn("Do not run stat merely to understand what a document says", AGENT_TOOL_POLICY)
+        self.assertIn("Do not use stat as a general content/topic discovery step", BASH_TOOL_DESCRIPTION)
+
+    def test_prompt_routes_summary_search_to_search_summary(self):  # pins the search-summary wording
+        self.assertIn("search-summary when the user asks for", BASH_TOOL_DESCRIPTION)
+        self.assertIn('use search-summary "<query>" <folder>', AGENT_TOOL_POLICY)
+        self.assertIn('search-summary "Federal Reserve" /documents', BASH_TOOL_DESCRIPTION)  # worked example in the description
+        self.assertIn("do not translate that request into find --where", AGENT_TOOL_POLICY)
+        self.assertIn("verify the relevant facts with cat", AGENT_TOOL_POLICY)
+        self.assertIn("verify the relevant claim with cat", BASH_TOOL_DESCRIPTION)
+
+    def test_prompt_rejects_find_grep_as_exhaustive_search(self):  # both phrases must be present in the policy
+        self.assertIn("Do not use find | grep as an exhaustive search", AGENT_TOOL_POLICY)
+        self.assertIn("find output can be scoped or limited", AGENT_TOOL_POLICY)
+
+    def test_system_prompt_sets_workspace_identity_and_scope(self):  # substring checks; "\n" marks expected newlines
+        self.assertIn("PageIndex FileSystem Demo Agent", AGENT_SYSTEM_PROMPT)
+        self.assertIn("VectifyAI Team", AGENT_SYSTEM_PROMPT)
+        self.assertIn("current PageIndex FileSystem\nworkspace", AGENT_SYSTEM_PROMPT)
+        self.assertIn("unrelated to the current workspace", AGENT_SYSTEM_PROMPT)
+        self.assertIn("do not answer it as\na general-purpose assistant", AGENT_SYSTEM_PROMPT)
+        self.assertIn("workspace-related topic question", AGENT_SYSTEM_PROMPT)
+        self.assertIn("clarify only after a reasonable search", AGENT_SYSTEM_PROMPT)
+        self.assertIn("search for candidate documents before asking", AGENT_TOOL_POLICY)
+        self.assertIn("Do not conclude that no relevant document exists from one failed grep", AGENT_SYSTEM_PROMPT)
+        self.assertIn("A single failed grep is not enough evidence", AGENT_TOOL_POLICY)  # same rule, tool-policy wording
+
+    def test_threaded_runtime_error_is_not_retried_on_fresh_loop(self):
+        session = object.__new__(PIFSAgentSession)  # skip __init__; attributes are assigned by hand below
+        session.executor = SimpleNamespace(query_context=None)
+        session.normalized_stream_mode = "off"
+        session.agent_log = []
+        session.max_seconds = None
+        session.max_turns = 1
+        session.session = None
+        session.agent = object()
+
+        main_thread = threading.get_ident()
+        run_threads = []  # thread ids on which the patched asyncio.run was called
+
+        def fail_asyncio_run(coro):  # stand-in for asyncio.run
+            coro.close()  # the coroutine is never run, so close it explicitly
+            run_threads.append(threading.get_ident())
+            raise RuntimeError("threaded agent failure")
+        # get_running_loop is patched to return an object instead of raising; asyncio.run goes to the stub.
+        with (
+            patch.object(agent_module.asyncio, "get_running_loop", return_value=object()),
+            patch.object(agent_module.asyncio, "run", side_effect=fail_asyncio_run),
+        ):
+            with self.assertRaisesRegex(RuntimeError, "threaded agent failure"):
+                session.run("Question: inspect workspace")
+        # The stub must have been called exactly once, and from a thread other than this one.
+        self.assertEqual(len(run_threads), 1)
+        self.assertNotEqual(run_threads[0], main_thread)
+
+
+if __name__ == "__main__":  # allow running this test module directly
+    unittest.main()
diff --git a/tests/test_pifs_cli.py b/tests/test_pifs_cli.py
new file mode 100644
index 000000000..491cbb93c
--- /dev/null
+++ b/tests/test_pifs_cli.py
@@ -0,0 +1,339 @@
+import os
+from pathlib import Path
+
+
+class FakeFileSystem:  # test double patched in for cli.PageIndexFileSystem by the tests in this module
+    def __init__(self, workspace):
+        self.workspace = Path(workspace)
+        self.projection_retrieval_configured = False  # flipped by configure_existing_projection_retrieval()
+
+    def configure_existing_projection_retrieval(self):
+        self.projection_retrieval_configured = True  # record the call so tests can assert on it
+        return True
+
+
+def test_cli_workspace_configures_existing_projection_retrieval(monkeypatch, tmp_path):
+    from pageindex.filesystem import cli
+
+    workspace = tmp_path / "workspace"
+    # Replace the filesystem class on the cli module with the recording fake.
+    monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem)
+
+    filesystem = cli._filesystem_from_workspace(str(workspace))
+    # The helper must build the filesystem for this workspace and call the configure method on it.
+    assert filesystem.workspace == workspace
+    assert filesystem.projection_retrieval_configured is True
+
+
+def test_cli_passthrough_invokes_pifs_command_executor(monkeypatch, capsys, tmp_path):
+    from pageindex.filesystem import cli
+
+    workspace = tmp_path / "workspace"
+    executor_instances = []  # every FakeExecutor constructed during the run
+
+    class FakeExecutor:
+        def __init__(self, filesystem, *, json_output=False):
+            self.filesystem = filesystem
+            self.json_output = json_output
+            self.commands = []
+            executor_instances.append(self)
+
+        def execute(self, command):
+            self.commands.append(command)
+            return f"executed:{command}"
+
+    monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem)
+    monkeypatch.setattr(cli, "PIFSCommandExecutor", FakeExecutor)
+    # "--json" is expected to become json_output=True rather than part of the command string.
+    status = cli.main(["--workspace", str(workspace), "ls", "/documents", "--json"])
+
+    assert status == 0
+    assert capsys.readouterr().out == "executed:ls /documents\n"
+    assert len(executor_instances) == 1
+    assert executor_instances[0].filesystem.workspace == workspace
+    assert executor_instances[0].json_output is True
+    assert executor_instances[0].commands == ["ls /documents"]
+
+
+def test_cli_set_workspace_persists_default(monkeypatch, capsys, tmp_path):
+    from pageindex.filesystem import cli
+
+    config_path = tmp_path / "pifs.json"
+    workspace = tmp_path / "workspace"
+    monkeypatch.setenv("PIFS_CONFIG_FILE", str(config_path))
+    # PIFS_CONFIG_FILE names a path under tmp_path; the config is expected to be written there.
+    status = cli.main(["set", "workspace", str(workspace)])
+    # NOTE(review): the exact-text check below embeds str(workspace) unescaped; confirm for paths with backslashes.
+    assert status == 0
+    output = capsys.readouterr().out
+    assert f"workspace: {workspace}" in output
+    assert f"config: {config_path}" in output
+    assert config_path.read_text(encoding="utf-8") == (
+        '{\n  "workspace": "' + str(workspace) + '"\n}\n'
+    )
+
+
+def test_cli_passthrough_uses_configured_workspace(monkeypatch, capsys, tmp_path):
+    from pageindex.filesystem import cli
+
+    config_path = tmp_path / "pifs.json"
+    workspace = tmp_path / "workspace"
+    executor_instances = []
+    monkeypatch.setenv("PIFS_CONFIG_FILE", str(config_path))
+    monkeypatch.delenv("PIFS_WORKSPACE", raising=False)  # make sure no workspace comes from the environment
+
+    class FakeExecutor:
+        def __init__(self, filesystem, *, json_output=False):
+            self.filesystem = filesystem
+            self.json_output = json_output
+            self.commands = []
+            executor_instances.append(self)
+
+        def execute(self, command):
+            self.commands.append(command)
+            return f"executed:{command}"
+
+    monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem)
+    monkeypatch.setattr(cli, "PIFSCommandExecutor", FakeExecutor)
+    # First persist the workspace, then discard that command's captured output.
+    assert cli.main(["set", "workspace", str(workspace)]) == 0
+    capsys.readouterr()
+    # No --workspace flag is passed here, so the persisted value is expected to be used.
+    status = cli.main(["ls", "/documents"])
+
+    assert status == 0
+    assert capsys.readouterr().out == "executed:ls /documents\n"
+    assert executor_instances[0].filesystem.workspace == workspace
+
+
+def test_cli_ask_invokes_agent_with_question(monkeypatch, capsys, tmp_path):
+    from pageindex.filesystem import cli
+
+    workspace = tmp_path / "workspace"
+    agent_calls = []  # (filesystem, question, kwargs) for each call to the fake agent
+
+    def fake_run_pifs_agent(filesystem, question, **kwargs):
+        agent_calls.append((filesystem, question, kwargs))
+        return "agent answer"
+    # Swap in the fake filesystem and the recording agent function.
+    monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem)
+    monkeypatch.setattr(cli, "run_pifs_agent", fake_run_pifs_agent)
+
+    status = cli.main(
+        [
+            "ask",
+            "--workspace",
+            str(workspace),
+            "--model",
+            "test-model",
+            "--stream-mode",
+            "off",
+            "--max-turns",
+            "7",
+            "--max-seconds",
+            "3.5",
+            "--reasoning-effort",
+            "low",
+            "--reasoning-summary",
+            "concise",
+            "What",
+            "is",
+            "inside?",
+        ]
+    )
+    # Positional words are expected to be joined into one question; numeric options arrive as int/float.
+    assert status == 0
+    assert capsys.readouterr().out == "agent answer\n"
+    filesystem, question, kwargs = agent_calls[0]
+    assert filesystem.workspace == workspace
+    assert question == "What is inside?"
+    assert kwargs == {
+        "model": "test-model",
+        "stream_mode": "off",
+        "max_turns": 7,
+        "max_seconds": 3.5,
+        "reasoning_effort": "low",
+        "reasoning_summary": "concise",
+    }
+
+
+def test_cli_ask_loads_env_file_before_running_agent(monkeypatch, capsys, tmp_path):
+    """The ask command must load --env-file before the agent function runs."""
+    from pageindex.filesystem import cli
+
+    workspace = tmp_path / "workspace"
+    env_file = tmp_path / ".env"
+    env_file.write_text("OPENAI_API_KEY=from-dotenv\n", encoding="utf-8")
+    agent_keys = []
+
+    def fake_run_pifs_agent(filesystem, question, **kwargs):
+        # Capture the key as seen at the moment the agent is invoked.
+        agent_keys.append(os.environ.get("OPENAI_API_KEY"))
+        return "agent answer"
+
+    # setenv-then-delenv makes monkeypatch record the variable even when it was
+    # initially unset, so a value loaded from the env file is removed at teardown
+    # instead of leaking into later tests.
+    monkeypatch.setenv("OPENAI_API_KEY", "placeholder")
+    monkeypatch.delenv("OPENAI_API_KEY")
+    monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem)
+    monkeypatch.setattr(cli, "run_pifs_agent", fake_run_pifs_agent)
+
+    status = cli.main(
+        [
+            "ask", "--workspace", str(workspace), "--env-file", str(env_file),
+            "What", "is", "inside?",
+        ]
+    )
+
+    assert status == 0
+    assert capsys.readouterr().out == "agent answer\n"
+    assert agent_keys == ["from-dotenv"]
+
+
+def test_cli_chat_runs_one_question_and_exits(monkeypatch, capsys, tmp_path):
+    from pageindex.filesystem import cli
+
+    workspace = tmp_path / "workspace"
+    inputs = iter(["", "Summarize the workspace", "exit"])  # blank line, one question, then "exit"
+    session_instances = []
+    session_questions = []
+
+    class FakeSession:
+        def __init__(self, filesystem, **kwargs):
+            self.filesystem = filesystem
+            self.kwargs = kwargs
+            session_instances.append(self)
+
+        def run(self, question):
+            session_questions.append((self, question))
+            return f"answer:{question}"
+    # input() is replaced so the chat loop reads from the scripted list above.
+    monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem)
+    monkeypatch.setattr(cli, "PIFSAgentSession", FakeSession)
+    monkeypatch.setattr("builtins.input", lambda prompt="": next(inputs))
+
+    status = cli.main(["chat", "--workspace", str(workspace), "--model", "test-model"])
+    # No --stream-mode is given: "all" is expected, stdout stays empty, and the blank input is not sent.
+    assert status == 0
+    assert capsys.readouterr().out == ""
+    assert len(session_instances) == 1
+    assert session_instances[0].filesystem.workspace == workspace
+    assert session_questions == [(session_instances[0], "Summarize the workspace")]
+    assert session_instances[0].kwargs["model"] == "test-model"
+    assert session_instances[0].kwargs["stream_mode"] == "all"
+
+
+def test_cli_chat_sanitizes_control_input(monkeypatch, capsys, tmp_path):
+    from pageindex.filesystem import cli
+
+    workspace = tmp_path / "workspace"
+    inputs = iter(["\x12", "he\x7fllo\x1b[A", "exit"])  # lone control char; text with DEL and an escape sequence
+    agent_calls = []
+
+    class FakeSession:
+        def __init__(self, filesystem, **kwargs):
+            pass
+
+        def run(self, question):
+            agent_calls.append(question)
+            return f"answer:{question}"
+    # Scripted input() drives the chat loop.
+    monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem)
+    monkeypatch.setattr(cli, "PIFSAgentSession", FakeSession)
+    monkeypatch.setattr("builtins.input", lambda prompt="": next(inputs))
+
+    status = cli.main(["chat", "--workspace", str(workspace), "--stream-mode", "off"])
+    # Per the assertions: the control-only line yields no question and the second input becomes "hllo".
+    assert status == 0
+    assert agent_calls == ["hllo"]
+    assert capsys.readouterr().out == "answer:hllo\n"
+
+
+def test_cli_ask_does_not_reprint_streamed_agent_output(monkeypatch, capsys, tmp_path):
+    from pageindex.filesystem import cli
+
+    workspace = tmp_path / "workspace"
+
+    def fake_run_pifs_agent(filesystem, question, **kwargs):
+        print("streamed answer")  # stands in for output the agent prints while running
+        return "returned answer"
+    # Swap in the fake filesystem and the printing agent function.
+    monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem)
+    monkeypatch.setattr(cli, "run_pifs_agent", fake_run_pifs_agent)
+
+    status = cli.main(
+        [
+            "ask",
+            "--workspace",
+            str(workspace),
+            "--stream-mode",
+            "all",
+            "What",
+            "is",
+            "inside?",
+        ]
+    )
+    # With stream mode "all", only the printed text is expected on stdout; the return value is not printed again.
+    assert status == 0
+    assert capsys.readouterr().out == "streamed answer\n"
+
+
+def test_cli_chat_stream_mode_can_be_overridden(monkeypatch, tmp_path):
+    from pageindex.filesystem import cli
+
+    workspace = tmp_path / "workspace"
+    inputs = iter(["Summarize the workspace", "exit"])
+    session_kwargs = []  # keyword arguments each FakeSession was constructed with
+
+    class FakeSession:
+        def __init__(self, filesystem, **kwargs):
+            session_kwargs.append(kwargs)
+
+        def run(self, question):
+            return f"answer:{question}"
+    # Scripted input() drives the chat loop.
+    monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem)
+    monkeypatch.setattr(cli, "PIFSAgentSession", FakeSession)
+    monkeypatch.setattr("builtins.input", lambda prompt="": next(inputs))
+
+    status = cli.main(
+        [
+            "chat",
+            "--workspace",
+            str(workspace),
+            "--stream-mode",
+            "tools",
+        ]
+    )
+    # The explicit --stream-mode value must reach the session constructor.
+    assert status == 0
+    assert session_kwargs[0]["stream_mode"] == "tools"
+
+
+def test_cli_chat_reuses_one_agent_session_for_multiple_questions(monkeypatch, capsys, tmp_path):
+    from pageindex.filesystem import cli
+
+    workspace = tmp_path / "workspace"
+    inputs = iter(["first", "second", "exit"])
+    sessions = []  # every FakeSession constructed during the run
+
+    class FakeSession:
+        def __init__(self, filesystem, **kwargs):
+            self.questions = []
+            sessions.append(self)
+
+        def run(self, question):
+            self.questions.append(question)
+            return f"answer:{question}"
+    # Scripted input() drives the chat loop.
+    monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem)
+    monkeypatch.setattr(cli, "PIFSAgentSession", FakeSession)
+    monkeypatch.setattr("builtins.input", lambda prompt="": next(inputs))
+
+    status = cli.main(["chat", "--workspace", str(workspace), "--stream-mode", "off"])
+    # One session object must receive both questions; with stream mode "off" each answer is printed.
+    assert status == 0
+    assert len(sessions) == 1
+    assert sessions[0].questions == ["first", "second"]
+    assert capsys.readouterr().out == "answer:first\nanswer:second\n"
diff --git a/tests/test_pifs_find_maxdepth.py b/tests/test_pifs_find_maxdepth.py
new file mode 100644
index 000000000..c1afe9145
--- /dev/null
+++ b/tests/test_pifs_find_maxdepth.py
@@ -0,0 +1,399 @@
+import json
+from pathlib import Path
+
+import pytest
+
+
+def _register_find_fixture(tmp_path: Path):
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    # Fixture: four text files under /documents, /documents/team (x2) and /documents/team/deep.
+    source_dir = tmp_path / "source"
+    source_dir.mkdir()
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    filesystem.metadata.register_schema({"fields": {"department": "string"}})
+
+    def add_file(
+        filename: str,
+        *,
+        folder_path: str,
+        external_id: str,
+        title: str,
+        domain: str,
+    ) -> None:
+        source = source_dir / filename
+        source.write_text(f"{title} fixture text", encoding="utf-8")
+        filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path=f"docs/{filename}",
+            folder_path=folder_path,
+            external_id=external_id,
+            title=title,
+            content=source.read_text(encoding="utf-8"),
+            metadata={"department": domain},  # "domain" is stored under the "department" key
+        )
+    # Three files use department "ops"; "other.txt" uses "finance".
+    add_file(
+        "root.txt",
+        folder_path="/documents",
+        external_id="doc_root",
+        title="Root document",
+        domain="ops",
+    )
+    add_file(
+        "child.txt",
+        folder_path="/documents/team",
+        external_id="doc_child",
+        title="Child document",
+        domain="ops",
+    )
+    add_file(
+        "deep.txt",
+        folder_path="/documents/team/deep",
+        external_id="doc_deep",
+        title="Deep document",
+        domain="ops",
+    )
+    add_file(
+        "other.txt",
+        folder_path="/documents/team",
+        external_id="doc_other",
+        title="Other document",
+        domain="finance",
+    )
+    return PIFSCommandExecutor(filesystem, json_output=True)  # tests flip json_output when they need text output
+
+
+def _data(output: str):  # parse the executor's JSON output and return its "data" member
+    return json.loads(output)["data"]
+
+
+def test_find_maxdepth_one_returns_direct_files_only(tmp_path):
+    executor = _register_find_fixture(tmp_path)
+
+    rows = _data(executor.execute("find /documents -maxdepth 1 -type f"))
+    # Only the file registered directly under /documents is expected at depth 1.
+    assert [row["external_id"] for row in rows] == ["doc_root"]
+
+
+def test_find_output_is_path_first_without_session_refs(tmp_path):
+    executor = _register_find_fixture(tmp_path)
+    executor.json_output = False  # check the text rendering rather than JSON
+
+    output = executor.execute("find /documents -maxdepth 1 -type f")
+    # The text output must start with the path and must not contain a "ref_1" session ref.
+    assert output.startswith("/documents/Root document id=doc_root file_ref=file_")
+    assert "ref_1" not in output
+    assert "title=Root document" in output
+
+
+def test_stable_path_targets_work_without_session_refs(tmp_path):
+    executor = _register_find_fixture(tmp_path)
+    executor.json_output = False
+    # The path contains a space, so it is quoted inside the command string.
+    stat = executor.execute("stat '/documents/Root document'")
+    text = executor.execute("cat '/documents/Root document' --all")
+
+    assert "target: /documents/Root document" in stat
+    assert "document_id: doc_root" in stat
+    assert "Root document fixture text" in text
+
+
+def test_shell_limits_reject_context_expanding_counts(tmp_path):
+    from pageindex.filesystem.commands import PIFSCommandError
+
+    executor = _register_find_fixture(tmp_path)
+    # Each command asks for one more than the limit expected in its error message.
+    for command, limit in (
+        ("find /documents --limit 51", 50),
+        ("grep --limit 21 Root /documents", 20),
+        ("ls /documents --limit 101", 100),
+        ("tree /documents --limit 201", 200),
+        ("head -n 101 /documents/Root\\ document", 100),
+        ("tail -n 101 /documents/Root\\ document", 100),
+        ("sed -n 1,101p /documents/Root\\ document", 100),
+    ):
+        with pytest.raises(PIFSCommandError, match=f"at most {limit}"):
+            executor.execute(command)
+
+
+def test_grep_rejects_regex_alternation_patterns(tmp_path):
+    from pageindex.filesystem.commands import PIFSCommandError
+
+    executor = _register_find_fixture(tmp_path)
+    executor.json_output = False
+    # A "|" inside the grep pattern must be rejected, both standalone and at the end of a pipeline.
+    with pytest.raises(PIFSCommandError, match="does not support regex alternation"):
+        executor.execute('grep -R "Root|Child" /documents')
+
+    with pytest.raises(PIFSCommandError, match="multiple grep commands"):
+        executor.execute('find /documents -type f | grep "Root|Child"')
+
+
+def test_stat_shell_output_includes_unified_metadata_status(tmp_path):
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.metadata_generation import MetadataGenerationResult
+
+    source = tmp_path / "source.txt"
+    source.write_text("fixture text", encoding="utf-8")
+
+    class SummaryGenerator:
+        def generate(self, document, *, fields):
+            return MetadataGenerationResult(
+                values={field: "Generated summary for retrieval." for field in fields}
+            )
+    # The stub generator returns the same fixed text for every requested field.
+    filesystem = PageIndexFileSystem(
+        workspace=tmp_path / "workspace",
+        metadata_generator=SummaryGenerator(),
+    )
+    filesystem.register_file(
+        storage_uri=source.as_uri(),
+        source_path="docs/source.txt",
+        folder_path="/documents",
+        external_id="doc_generated",
+        title="Generated metadata document",
+        content=source.read_text(encoding="utf-8"),
+        metadata={"department": "ops"},
+        metadata_policy={
+            "fields": {
+                "summary": True,
+                "doc_type": False,
+                "domain": False,
+                "topic": False,
+            }
+        },
+    )
+    executor = PIFSCommandExecutor(filesystem, json_output=False)
+
+    stat = executor.execute("stat /documents/'Generated metadata document'")
+    # Caller metadata and the generated summary are both expected as indented "metadata:" entries.
+    assert "metadata:" in stat
+    assert "  department: ops" in stat
+    assert "  summary: Generated summary for retrieval." in stat
+    assert "metadata_status: generated" in stat
+
+
+def test_stat_field_reads_one_metadata_field_across_multiple_targets(tmp_path):
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError
+    from pageindex.filesystem.metadata_generation import MetadataGenerationResult
+
+    class SummaryGenerator:
+        def generate(self, document, *, fields):
+            return MetadataGenerationResult(
+                values={
+                    field: (
+                        f"Summary for {document.title}\n"
+                        + "full summary token " * 80
+                    )
+                    for field in fields
+                }
+            )
+    # The stub summary is deliberately long and multi-line: a title line plus 80 repeated tokens.
+    filesystem = PageIndexFileSystem(
+        workspace=tmp_path / "workspace",
+        metadata_generator=SummaryGenerator(),
+    )
+    for index in range(1, 3):
+        source = tmp_path / f"source{index}.txt"
+        source.write_text(f"fixture text {index}", encoding="utf-8")
+        filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path=f"docs/source{index}.txt",
+            folder_path="/documents",
+            external_id=f"doc_summary_{index}",
+            title=f"Summary document {index}",
+            content=source.read_text(encoding="utf-8"),
+            metadata_policy={
+                "fields": {
+                    "summary": True,
+                    "doc_type": False,
+                    "domain": False,
+                    "topic": False,
+                }
+            },
+        )
+    executor = PIFSCommandExecutor(filesystem, json_output=False)
+
+    output = executor.execute(
+        "stat --field summary /documents/'Summary document 1' /documents/'Summary document 2'"
+    )
+    # Text output: one section per target, and no "[truncated]" marker despite the long value.
+    assert "/documents/Summary document 1:" in output
+    assert "summary: Summary for Summary document 1" in output
+    assert "full summary token" in output
+    assert "[truncated]" not in output
+    assert "/documents/Summary document 2:" in output
+    assert "summary: Summary for Summary document 2" in output
+    # JSON output must carry the full value: all 80 repetitions of the token.
+    data = json.loads(
+        PIFSCommandExecutor(filesystem, json_output=True).execute(
+            "stat --field summary /documents/'Summary document 1' /documents/'Summary document 2'"
+        )
+    )["data"]
+    assert data["mode"] == "field_values"
+    assert data["target_count"] == 2
+    assert data["data"][0]["field"] == "summary"
+    assert data["data"][0]["value"].startswith("Summary for Summary document 1\n")
+    assert data["data"][0]["value"].count("full summary token") == 80
+    # An unknown field name is expected to raise.
+    with pytest.raises(PIFSCommandError, match="Unknown metadata field"):
+        executor.execute("stat --field missing_field /documents/'Summary document 1'")
+
+
+def test_stat_field_rejects_more_than_twenty_targets(tmp_path):
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.commands import PIFSCommandError
+
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    targets = []
+    for index in range(21):  # 21 targets: one more than the limit named in the expected error
+        source = tmp_path / f"source{index}.txt"
+        source.write_text(f"fixture text {index}", encoding="utf-8")
+        filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path=f"docs/source{index}.txt",
+            folder_path="/documents",
+            external_id=f"doc_{index}",
+            title=f"Document {index}",
+            content=source.read_text(encoding="utf-8"),
+            metadata={"department": "ops"},
+        )
+        targets.append(f"/documents/'Document {index}'")
+    executor = PIFSCommandExecutor(filesystem, json_output=False)
+    # All 21 quoted targets are passed in a single stat --field command.
+    with pytest.raises(PIFSCommandError, match="at most 20"):
+        executor.execute("stat --field department " + " ".join(targets))
+
+
+def test_register_rejects_pifs_owned_metadata_fields(tmp_path):
+    from pageindex.filesystem import PageIndexFileSystem
+
+    source = tmp_path / "source.txt"
+    source.write_text("fixture text", encoding="utf-8")
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    # Caller-supplied "summary" metadata is expected to be rejected with a ValueError.
+    with pytest.raises(ValueError, match="PIFS-owned generated field"):
+        filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path="docs/source.txt",
+            folder_path="/documents",
+            external_id="doc_conflict",
+            title="Conflict document",
+            content=source.read_text(encoding="utf-8"),
+            metadata={"summary": "caller summary"},
+        )
+
+
+def test_batch_metadata_status_generates_into_unified_metadata(tmp_path):
+    from pageindex.filesystem import PageIndexFileSystem
+    from pageindex.filesystem.metadata_generation import MetadataGenerationResult
+
+    source = tmp_path / "source.txt"
+    source.write_text("fixture text", encoding="utf-8")
+
+    class SummaryGenerator:
+        def generate(self, document, *, fields):
+            return MetadataGenerationResult(values={"summary": "Batch generated summary."})
+    # The stub generator always returns a fixed summary, regardless of the requested fields.
+    filesystem = PageIndexFileSystem(
+        workspace=tmp_path / "workspace",
+        metadata_generator=SummaryGenerator(),
+    )
+    file_ref = filesystem.register_file(
+        storage_uri=source.as_uri(),
+        source_path="docs/source.txt",
+        folder_path="/documents",
+        external_id="doc_batch",
+        title="Batch document",
+        content=source.read_text(encoding="utf-8"),
+        metadata={"department": "ops"},
+        metadata_policy={
+            "batch": True,
+            "fields": {
+                "summary": True,
+                "doc_type": False,
+                "domain": False,
+                "topic": False,
+            },
+        },
+    )
+    # With "batch": True the summary must be absent after registration and marked "pending_submit".
+    before = filesystem.store.get_file(file_ref)
+    assert "summary" not in before.metadata
+    assert before.metadata_status["fields"]["summary"]["status"] == "pending_submit"
+    # batch_generate() is then expected to fill it in and leave the caller metadata intact.
+    result = filesystem.batch_generate()
+    after = filesystem.store.get_file(file_ref)
+
+    assert result["generated"] == 1
+    assert after.metadata["summary"] == "Batch generated summary."
+    assert after.metadata["department"] == "ops"
+    assert after.metadata_status["fields"]["summary"]["status"] == "generated"
+
+
+def test_find_maxdepth_zero_type_directory_returns_start_folder(tmp_path):
+    executor = _register_find_fixture(tmp_path)
+
+    rows = _data(executor.execute("find /documents -maxdepth 0 -type d"))
+    # Depth 0 with -type d is expected to yield just the start folder itself.
+    assert [row["path"] for row in rows] == ["/documents"]
+
+
+def test_find_directory_output_renders_root_without_double_slash(tmp_path):
+    executor = _register_find_fixture(tmp_path)
+    executor.json_output = False
+
+    output = executor.execute("find / -maxdepth 1 -type d")
+    # The root must render as "/", child folders carry a trailing "/", and "//" must never appear.
+    assert output.splitlines()[0] == "/ folders=1 files=0"
+    assert "//" not in output
+    assert "/documents/ folders=1 files=1" in output
+
+
+def test_find_maxdepth_combines_with_where_and_limit(tmp_path):
+    executor = _register_find_fixture(tmp_path)
+
+    rows = _data(
+        executor.execute(
+            """find /documents -maxdepth 2 -type f --where '{"department":"ops"}' --limit 1"""
+        )
+    )
+    # One "ops" row is expected, from either of the two folders within depth 2.
+    assert len(rows) == 1
+    assert rows[0]["metadata"]["department"] == "ops"
+    assert rows[0]["folder_path"] in {"/documents", "/documents/team"}
+
+
+def test_find_maxdepth_rejects_invalid_values_and_unsupported_options(tmp_path):
+    from pageindex.filesystem.commands import PIFSCommandError
+
+    executor = _register_find_fixture(tmp_path)
+    # A non-integer depth, a negative depth, and the -exec option must each raise.
+    with pytest.raises(PIFSCommandError, match="find -maxdepth requires an integer >= 0"):
+        executor.execute("find /documents -maxdepth nope -type f")
+    with pytest.raises(PIFSCommandError, match="find -maxdepth requires an integer >= 0"):
+        executor.execute("find /documents -maxdepth -1 -type f")
+    with pytest.raises(PIFSCommandError, match="Unsupported find option: -exec"):
+        executor.execute("find /documents -maxdepth 1 -type f -exec")
+
+
+def test_find_maxdepth_is_advertised_to_agents(tmp_path):
+    executor = _register_find_fixture(tmp_path)
+    # The option must appear in the command-surface text and in the capabilities mapping.
+    assert "-maxdepth N -type f|d" in executor.describe_available_command_surfaces()
+    assert executor.command_capabilities()["retrieval"]["lexical"]["find_maxdepth"] is True
+
+
+def test_where_path_error_points_to_folder_scope(tmp_path):
+    from pageindex.filesystem.commands import PIFSCommandError
+
+    executor = _register_find_fixture(tmp_path)
+    # Filtering on "path" via --where must raise, and the message must suggest the alternatives checked below.
+    with pytest.raises(PIFSCommandError) as exc_info:
+        executor.execute("""find --where '{"path":"/documents"}'""")
+
+    message = str(exc_info.value)
+    assert "Folder paths are positional PIFS paths" in message
+    assert "find /documents -type f" in message
+    assert "stat --schema" in message
diff --git a/tests/test_pifs_like_escape.py b/tests/test_pifs_like_escape.py
new file mode 100644
index 000000000..82e7ef9dd
--- /dev/null
+++ b/tests/test_pifs_like_escape.py
@@ -0,0 +1,115 @@
+from pathlib import Path
+
+
+def _register_file(  # test helper: write a small text fixture under tmp_path and register it with the given filesystem
+    filesystem,
+    tmp_path: Path,
+    filename: str,
+    *,
+    folder_path: str,
+    external_id: str,
+    metadata: dict[str, str] | None = None,
+) -> None:
+    source = tmp_path / filename
+    source.write_text(f"{external_id} fixture text", encoding="utf-8")  # the text embeds the external id, so each fixture's content differs
+    filesystem.register_file(
+        storage_uri=source.as_uri(),  # file:// URI of the fixture just written
+        source_path=f"docs/{filename}",
+        folder_path=folder_path,
+        external_id=external_id,
+        title=external_id,  # the title reuses the external id
+        content=source.read_text(encoding="utf-8"),
+        metadata=metadata or {},  # None (or an empty mapping) is passed on as an empty dict
+    )
+
+
+def test_descendant_folder_filter_treats_underscore_literally(tmp_path):  # recursive scoping under /proj_1 must not also pick up the sibling /projA1
+    from pageindex.filesystem import PageIndexFileSystem
+
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    _register_file(
+        filesystem,
+        tmp_path,
+        "literal.txt",
+        folder_path="/proj_1/docs",  # lives under the folder the queries below are scoped to
+        external_id="literal_underscore",
+    )
+    _register_file(
+        filesystem,
+        tmp_path,
+        "wildcard.txt",
+        folder_path="/projA1/docs",  # differs only where /proj_1 has "_"; presumably it would match if "_" acted as a LIKE wildcard
+        external_id="wildcard_neighbor",
+    )
+
+    recursive = filesystem.browse("/proj_1", recursive=True, limit=10)
+    folder_id = filesystem.folder_info("/proj_1")["folder_id"]
+    scoped_results = filesystem.search(
+        scope={"folder_id": folder_id, "recursive": True},  # scope the search to /proj_1 and its descendants
+        semantic=False,
+        limit=10,
+    )
+    ranked_folders = {  # index the find_folders rows by path for the per-folder assertions
+        folder["path"]: folder
+        for folder in filesystem.find_folders("/", max_depth=1, limit=10)
+    }
+
+    assert {folder["path"] for folder in recursive["folders"]} == {"/proj_1/docs"}
+    assert {file["external_id"] for file in recursive["files"]} == {"literal_underscore"}
+    assert {result.external_id for result in scoped_results} == {"literal_underscore"}
+    assert ranked_folders["/proj_1"]["matched_files"] == 1  # each top-level folder must count only its own single file
+    assert ranked_folders["/projA1"]["matched_files"] == 1
+    assert filesystem.store.count_files_in_folder("/proj_1", recursive=True) == 1
+
+
+def test_metadata_contains_treats_percent_and_underscore_literally(tmp_path):  # $contains must match "%" and "_" as plain characters
+    from pageindex.filesystem import PageIndexFileSystem
+
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    filesystem.metadata.register_schema({"fields": {"status": "string"}})  # declare the "status" field used by the filters below
+    _register_file(
+        filesystem,
+        tmp_path,
+        "percent.txt",
+        folder_path="/documents",
+        external_id="literal_percent",
+        metadata={"status": "100% done"},  # the only value containing the literal text "100% done"
+    )
+    _register_file(
+        filesystem,
+        tmp_path,
+        "percent-neighbor.txt",
+        folder_path="/documents",
+        external_id="percent_neighbor",
+        metadata={"status": "100X done"},  # decoy: same text with "X" in place of "%"
+    )
+    _register_file(
+        filesystem,
+        tmp_path,
+        "underscore.txt",
+        folder_path="/documents",
+        external_id="literal_underscore",
+        metadata={"status": "build_alpha"},  # the only value containing the literal text "build_alpha"
+    )
+    _register_file(
+        filesystem,
+        tmp_path,
+        "underscore-neighbor.txt",
+        folder_path="/documents",
+        external_id="underscore_neighbor",
+        metadata={"status": "buildXalpha"},  # decoy: same text with "X" in place of "_"
+    )
+
+    percent_results = filesystem.search(
+        metadata_filter={"status": {"$contains": "100% done"}},
+        semantic=False,
+        limit=10,
+    )
+    underscore_results = filesystem.search(
+        metadata_filter={"status": {"$contains": "build_alpha"}},
+        semantic=False,
+        limit=10,
+    )
+
+    assert {result.external_id for result in percent_results} == {"literal_percent"}  # the "X" decoy must be excluded
+    assert {result.external_id for result in underscore_results} == {"literal_underscore"}  # the "X" decoy must be excluded
diff --git a/tests/test_pifs_path_resolution.py b/tests/test_pifs_path_resolution.py
new file mode 100644
index 000000000..184fc53da
--- /dev/null
+++ b/tests/test_pifs_path_resolution.py
@@ -0,0 +1,71 @@
+import pytest
+
+
+def test_root_virtual_file_path_resolves_without_double_slash(tmp_path):  # a file registered in folder "/" must resolve via "/<title>"
+    from pageindex.filesystem import PageIndexFileSystem
+
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    file_ref = filesystem.register_file(
+        storage_uri="file:///tmp/root-source.txt",
+        source_path="sources/root-source.txt",
+        folder_path="/",  # root folder: joining it with the title must not yield "//Root Title"
+        external_id="doc_root_title",
+        title="Root Title",
+        content="root content",
+    )
+
+    assert filesystem.store.resolve_file_ref("/Root Title") == file_ref  # resolving must return the ref that register_file returned
+
+
+def test_ambiguous_virtual_file_path_raises_clear_error(tmp_path):  # a target that two different files could answer to must raise, not pick one
+    from pageindex.filesystem import PageIndexFileSystem
+
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    first_ref = filesystem.register_file(
+        storage_uri="file:///tmp/first.txt",
+        source_path="b/file.txt",  # with folder "/a" this spells "/a/b/file.txt" — presumably one way the target below can match
+        folder_path="/a",
+        external_id="doc_first",
+        title="First",
+        content="first content",
+    )
+    second_ref = filesystem.register_file(
+        storage_uri="file:///tmp/second.txt",
+        source_path="second-source.txt",
+        folder_path="/a/b",
+        external_id="doc_second",
+        title="file.txt",  # with folder "/a/b" this also spells "/a/b/file.txt"
+        content="second content",
+    )
+
+    with pytest.raises(KeyError, match="Ambiguous file target"):
+        filesystem.store.resolve_file_ref("/a/b/file.txt")
+
+    assert first_ref != second_ref  # sanity check: the two registrations produced distinct refs
+
+
+def test_duplicate_source_path_target_raises_clear_error(tmp_path):  # two files sharing one source_path must make that target ambiguous
+    from pageindex.filesystem import PageIndexFileSystem
+
+    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    first_ref = filesystem.register_file(
+        storage_uri="file:///tmp/first.txt",
+        source_path="shared/source.txt",  # same source_path as the second file
+        folder_path="/first",
+        external_id="doc_first",
+        title="First",
+        content="first content",
+    )
+    second_ref = filesystem.register_file(
+        storage_uri="file:///tmp/second.txt",
+        source_path="shared/source.txt",  # duplicate source_path, registered under a different folder
+        folder_path="/second",
+        external_id="doc_second",
+        title="Second",
+        content="second content",
+    )
+
+    with pytest.raises(KeyError, match="Ambiguous file target"):
+        filesystem.store.resolve_file_ref("/shared/source.txt")  # the shared source path written with a leading slash
+
+    assert first_ref != second_ref  # sanity check: the two registrations produced distinct refs
diff --git a/tests/test_pifs_register_side_effects.py b/tests/test_pifs_register_side_effects.py
new file mode 100644
index 000000000..867dd6bf4
--- /dev/null
+++ b/tests/test_pifs_register_side_effects.py
@@ -0,0 +1,60 @@
+from pathlib import Path
+
+import pytest
+
+
+class SummaryGenerator:  # stub passed as metadata_generator in the test below; ignores the document entirely
+    def generate(self, document, *, fields):
+        return {field: "Generated registration summary." for field in fields}  # same fixed text for every requested field
+
+
+class RecordingSummaryIndexer:  # test double passed as summary_projection_indexer; records every upsert it receives
+    def __init__(self):
+        self.upserted = []  # shallow copies of the records passed to upsert_summary, in call order
+
+    def upsert_summary(self, record):
+        self.upserted.append(dict(record))  # copy so later changes to the caller's top-level keys do not alter the log
+        return {"status": "ready"}
+
+
+def test_register_insert_failure_cleans_owned_artifacts_and_skips_projection(
+    tmp_path: Path, monkeypatch
+):  # a failing catalog insert must leave no artifact files behind and no summary upsert
+    from pageindex.filesystem import PageIndexFileSystem
+
+    workspace = tmp_path / "workspace"
+    source = tmp_path / "source.txt"
+    source.write_text("Plain text content for registration.", encoding="utf-8")
+    indexer = RecordingSummaryIndexer()
+    filesystem = PageIndexFileSystem(
+        workspace=workspace,
+        metadata_generator=SummaryGenerator(),
+        summary_projection_indexer=indexer,
+    )
+
+    def fail_insert(records):  # replacement for store.insert_files that always raises
+        raise RuntimeError("catalog insert failed")
+
+    monkeypatch.setattr(filesystem.store, "insert_files", fail_insert)  # monkeypatch undoes this after the test
+
+    with pytest.raises(RuntimeError, match="catalog insert failed"):  # the original error must propagate to the caller
+        filesystem.register_file(
+            storage_uri=source.as_uri(),
+            source_path="docs/source.txt",
+            folder_path="/documents",
+            external_id="doc_insert_failure",
+            title="Insert failure",
+            content=source.read_text(encoding="utf-8"),
+            metadata_policy={
+                "fields": {
+                    "summary": True,  # only the summary field is switched on
+                    "doc_type": False,
+                    "domain": False,
+                    "topic": False,
+                }
+            },
+        )
+
+    assert indexer.upserted == []  # no summary may have been sent to the indexer
+    assert list((workspace / "artifacts" / "raw").glob("*.json")) == []  # no raw JSON artifacts left behind
+    assert list((workspace / "artifacts" / "text").glob("*.txt")) == []  # no text artifacts left behind
diff --git a/tests/test_semantic_index.py b/tests/test_semantic_index.py
new file mode 100644
index 000000000..324ead76f
--- /dev/null
+++ b/tests/test_semantic_index.py
@@ -0,0 +1,145 @@
+import sys
+from pathlib import Path
+
+import pytest
+
+REPO_ROOT = Path(__file__).resolve().parents[1]  # the directory one level above the one containing this test file
+if str(REPO_ROOT) not in sys.path:  # avoid adding a duplicate entry
+    sys.path.insert(0, str(REPO_ROOT))  # front of the path, ahead of the `pageindex` import below — presumably so the checkout wins over any installed copy
+
+from pageindex.filesystem.semantic_index import (
+    SemanticIndexRecord,
+    SQLiteVecSemanticIndex,
+)
+
+
+def test_sqlite_vec_semantic_index_round_trip(tmp_path):  # store two vectors, then check the count, search order and filtered search
+    index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite")
+    index.reset(dimension=3, metadata={"field_mode": "summary"})  # 3 matches the length of the vectors upserted below
+
+    index.upsert_many(
+        [
+            SemanticIndexRecord(
+                file_ref="file_a",
+                external_id="doc_a",
+                source_type="github",
+                source_path="github/a.json",
+                title="Multipart upload limits",
+                text="multipart upload limits",
+                vector=[1.0, 0.0, 0.0],  # unit vector on the first axis
+                metadata={"topic": "uploads"},
+            ),
+            SemanticIndexRecord(
+                file_ref="file_b",
+                external_id="doc_b",
+                source_type="slack",
+                source_path="slack/b.json",
+                title="GPU cache issue",
+                text="gpu cache issue",
+                vector=[0.0, 1.0, 0.0],  # unit vector on the second axis
+                metadata={"topic": "runtime"},
+            ),
+        ]
+    )
+
+    assert index.info()["document_count"] == 2
+
+    results = index.search([0.9, 0.1, 0.0], limit=2)  # the query leans toward doc_a's vector
+    assert [item.external_id for item in results] == ["doc_a", "doc_b"]  # expected order: doc_a before doc_b
+
+    filtered = index.search(
+        [0.9, 0.1, 0.0],
+        limit=2,
+        filters={"source_type": "slack"},  # only doc_b has source_type "slack"
+    )
+    assert [item.external_id for item in filtered] == ["doc_b"]
+
+
+def test_summary_projection_indexes_unified_metadata_summary(tmp_path):  # an upserted summary record must be searchable with its metadata intact
+    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+
+    class FakeEmbedder:  # stand-in embedder: returns the same 3-d vector for every input text
+        def embed(self, texts):
+            return [[1.0, 0.0, 0.0] for _ in texts]
+
+    indexer = SummaryProjectionIndexer(
+        tmp_path / "projection",
+        embedder=FakeEmbedder(),
+        embedding_provider="test",
+        embedding_model="fake",
+        embedding_dimensions=3,  # matches the length of FakeEmbedder's vectors
+    )
+
+    result = indexer.upsert_summary(
+        {
+            "file_ref": "file_a",
+            "external_id": "doc_a",
+            "source_type": "documents",
+            "source_path": "docs/a.pdf",
+            "title": "A",
+            "metadata": {
+                "summary": "Unified metadata summary.",  # the summary sits inside metadata, not at the top level
+                "department": "ops",
+            },
+        }
+    )
+
+    assert result["status"] == "ready"
+    hits = indexer.index.search([1.0, 0.0, 0.0], limit=1)  # query with the exact vector FakeEmbedder produces
+    assert hits[0].external_id == "doc_a"
+    assert hits[0].metadata["summary"] == "Unified metadata summary."
+    assert hits[0].metadata["department"] == "ops"  # metadata other than the summary must be kept too
+
+
+def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path):  # a dimension mismatch must raise and leave the stored index untouched
+    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+
+    class FakeEmbedder:  # stand-in embedder: returns 4-d vectors, one more dimension than the stored index
+        def embed(self, texts):
+            return [[1.0, 0.0, 0.0, 0.0] for _ in texts]
+
+    index_dir = tmp_path / "projection"
+    index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")  # presumably the file SummaryProjectionIndexer opens inside index_dir — confirm
+    index.reset(
+        dimension=3,  # the pre-existing index is 3-dimensional
+        metadata={
+            "channel": "summary",
+            "embedding_provider": "test",
+            "embedding_model": "fake",
+            "embedding_dimensions": 3,
+        },
+    )
+    index.upsert_many(
+        [
+            SemanticIndexRecord(
+                file_ref="file_a",
+                external_id="doc_a",
+                source_type="documents",
+                source_path="docs/a.pdf",
+                title="A",
+                text="summary",
+                vector=[1.0, 0.0, 0.0],
+            )
+        ]
+    )
+
+    with pytest.raises(RuntimeError, match="summary projection index dimension mismatch"):  # the constructor itself must raise
+        SummaryProjectionIndexer(
+            index_dir,
+            embedder=FakeEmbedder(),
+            embedding_provider="test",
+            embedding_model="fake",
+            embedding_dimensions=4,  # conflicts with the 3-dimensional index seeded above
+        )
+
+    preserved = SQLiteVecSemanticIndex(index.db_path)  # reopen the same database file after the failure
+    assert preserved.info()["dimension"] == 3
+    assert preserved.info()["document_count"] == 1
+    assert preserved.search([1.0, 0.0, 0.0], limit=1)[0].external_id == "doc_a"  # the original record is still searchable
+
+
+def test_hash_embedding_provider_is_not_available():  # make_embedder must reject the "hash" provider name as unknown
+    from pageindex.filesystem.hybrid_projection import make_embedder
+
+    with pytest.raises(ValueError, match="unknown embedding provider: hash"):
+        make_embedder("hash", "unused", dimensions=256, timeout=1)  # the remaining arguments are placeholders; the call is expected to raise