diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 0674ae35d..3a9f10fdd 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -14,7 +14,7 @@ permissions: actions: read jobs: - analyze: + analyze-actions: name: Analyze (actions) runs-on: ubuntu-latest timeout-minutes: 30 @@ -32,3 +32,22 @@ jobs: uses: github/codeql-action/analyze@v4 with: category: /language:actions + + analyze-python: + name: Analyze (python) + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Initialize CodeQL + uses: github/codeql-action/init@v4 + with: + languages: python + build-mode: none + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v4 + with: + category: /language:python diff --git a/.gitignore b/.gitignore index 23d6b5655..685045e17 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,8 @@ __pycache__ .DS_Store .env* .venv/ +.claude/ +.codex/ logs/ +examples/pifs_workspace/ +examples/Benchmark/ diff --git a/examples/pifs_demo.py b/examples/pifs_demo.py new file mode 100644 index 000000000..7dcfd0d09 --- /dev/null +++ b/examples/pifs_demo.py @@ -0,0 +1,788 @@ +""" +PageIndex FileSystem (PIFS) agent demo. + +This mirrors examples/agentic_vectorless_rag_demo.py, but exposes a corpus +through the PageIndex FileSystem shell instead of direct PageIndex document +tools. The agent receives one read-only bash-like PIFS tool and must retrieve +evidence through commands such as ls, tree, find, grep, search-summary, +cat --structure, cat --page, and cat --node. + +The demo registers supported files under examples/documents. When a matching +examples/documents/results/*_structure.json file exists, it is loaded into the +PIFS workspace's PageIndexClient cache. Files without a cache exercise the +normal PageIndexClient.index() path during register(). 
+ +Requirements: + pip install openai-agents + +Example: + python examples/pifs_demo.py --stream-mode all --verbose +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import os +import re +import shutil +import sys +import time +from pathlib import Path +from typing import Any + +import PyPDF2 + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +# Keep the local demo quiet in offline environments. +os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "true") + +from pageindex import PageIndexClient +from pageindex.filesystem import MetadataGenerator, PageIndexFileSystem, PIFSCommandExecutor +from pageindex.filesystem.agent import run_pifs_agent + + +EXAMPLES_DIR = Path(__file__).parent +DOCUMENTS_DIR = EXAMPLES_DIR / "documents" +WORKSPACE = EXAMPLES_DIR / "pifs_workspace" +DEFAULT_MODEL = os.environ.get("PIFS_DEMO_MODEL", "gpt-5.4-mini") +DEFAULT_METADATA_PROVIDER = os.environ.get("PIFS_DEMO_METADATA_PROVIDER") or os.environ.get( + "PIFS_METADATA_PROVIDER", "openai" +) +DEFAULT_EMBEDDING_PROVIDER = os.environ.get("PIFS_DEMO_EMBEDDING_PROVIDER") or os.environ.get( + "PIFS_EMBEDDING_PROVIDER", "openai" +) +DEFAULT_QUESTION = ( + "Use the PIFS workspace to find the Federal Reserve annual report. " + "Which section covers supervision and regulation, and what page range " + "should I inspect? Cite the document and evidence you used." +) + +PIFS_DEMO_AGENT_PROMPT = """ +You are a PageIndex FileSystem retrieval agent for a local demo workspace. + +Use only the bash tool. It is a read-only PIFS virtual shell, not a real OS +shell. The workspace contains registered example PDFs. + +Retrieval strategy: +- Start with ls or tree to understand the workspace. +- Use concrete PIFS paths from ls/find output, such as /documents/report.pdf, + or stable file_ref/document ids. Do not invent temporary ref_N aliases. +- Folder paths such as /documents are positional command targets; do not put + folder paths inside --where. 
+- Use search-summary when available to find likely documents. + Quote multi-word queries and include a path, for example: + search-summary "Federal Reserve supervision regulation" /documents +- Use find --where only with JSON metadata DSL, for example: + find /documents --where '{"file_format":"pdf"}' +- Use grep -R only for lexical evidence; do not treat semantic candidates as + literal matches. +- Run one evidence command at a time. Do not chain large commands like + cat --structure, grep, and cat --page in one bash call. +- For PDFs, use cat --structure to inspect the PageIndex tree, then + cat --page for evidence, for example: + cat /documents/2023-annual-report.pdf --page 31-35 +- For page-range questions, use cat --structure to identify the full section + range. Then run cat --page on the smallest useful evidence range, usually the + section start page or first 1-2 pages, before the final answer. Do not print + a broad multi-page section unless the user asks to read the whole section. +- Do not use cat --all on PDFs. +- Answer only from PIFS tool output and cite file refs or document ids. +""" + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run a PIFS document retrieval agent demo.") + parser.add_argument("--workspace", type=Path, default=WORKSPACE) + parser.add_argument("--documents-dir", type=Path, default=DOCUMENTS_DIR) + parser.add_argument( + "--document", + action="append", + default=[], + help="Specific document filename or path to register. May be repeated.", + ) + parser.add_argument( + "--max-docs", + type=int, + default=0, + help="Limit number of cached example documents to register. 
0 means all.", + ) + parser.add_argument("--reset", action="store_true", help="Delete and rebuild the demo workspace.") + parser.add_argument( + "--prepare-only", + action="store_true", + help="Register documents and print PIFS smoke commands without running the agent.", + ) + parser.add_argument("--question", default=DEFAULT_QUESTION) + parser.add_argument("--model", default=DEFAULT_MODEL) + parser.add_argument( + "--metadata-provider", + default=DEFAULT_METADATA_PROVIDER, + help="Provider used for register-time metadata generation.", + ) + parser.add_argument( + "--metadata-model", + default=os.environ.get("PIFS_METADATA_MODEL", "gpt-5-nano"), + help="Model used for register-time metadata generation.", + ) + parser.add_argument("--stream-mode", default="all", choices=["off", "tools", "model", "all"]) + parser.add_argument("--verbose", action="store_true") + parser.add_argument("--max-turns", type=int, default=12) + parser.add_argument("--max-seconds", type=float, default=90) + parser.add_argument("--reasoning-effort", default=None) + parser.add_argument("--reasoning-summary", default="auto") + parser.add_argument( + "--embedding-provider", + default=DEFAULT_EMBEDDING_PROVIDER, + help="Provider used for register-time summary projection embeddings.", + ) + parser.add_argument( + "--embedding-model", + default=os.environ.get("PIFS_DEMO_EMBEDDING_MODEL", "text-embedding-3-small"), + help="Embedding model used for register-time summary projection.", + ) + parser.add_argument("--embedding-dimensions", type=int, default=256) + return parser.parse_args() + + +def require_runtime_environment(*, metadata_provider: str, embedding_provider: str) -> None: + metadata_provider = metadata_provider.lower() + embedding_provider = embedding_provider.lower() + missing: list[str] = [] + if not os.environ.get("OPENAI_API_KEY"): + missing.append("OPENAI_API_KEY for the OpenAI Agents SDK demo agent") + if metadata_provider == "openai" and not ( + os.environ.get("PIFS_METADATA_API_KEY") 
or os.environ.get("OPENAI_API_KEY") + ): + missing.append("PIFS_METADATA_API_KEY or OPENAI_API_KEY for metadata generation") + if embedding_provider == "openai" and not ( + os.environ.get("PIFS_EMBEDDING_API_KEY") or os.environ.get("OPENAI_API_KEY") + ): + missing.append("PIFS_EMBEDDING_API_KEY or OPENAI_API_KEY for summary embeddings") + if missing: + raise RuntimeError( + "Missing required environment variable(s): " + + "; ".join(missing) + + ". Source your .env or export the required key before running." + ) + + +SUPPORTED_DOCUMENT_SUFFIXES = {".pdf", ".md", ".markdown", ".txt", ".text"} + + +def discover_documents(documents_dir: Path) -> list[Path]: + return sorted( + path + for path in documents_dir.iterdir() + if path.is_file() and path.suffix.lower() in SUPPORTED_DOCUMENT_SUFFIXES + ) + + +def resolve_requested_documents(documents_dir: Path, requested: list[str]) -> list[Path]: + if not requested: + return discover_documents(documents_dir) + paths: list[Path] = [] + for item in requested: + path = Path(item).expanduser() + if not path.is_absolute(): + path = documents_dir / path + if not path.exists(): + raise FileNotFoundError(f"document not found: {path}") + paths.append(path) + return paths + + +def structure_path_for(document_path: Path, documents_dir: Path) -> Path | None: + path = documents_dir / "results" / f"{document_path.stem}_structure.json" + return path if path.exists() else None + + +def deterministic_doc_id(document_path: Path) -> str: + digest = hashlib.sha1(str(document_path.resolve()).encode("utf-8")).hexdigest()[:16] + return f"pifs_demo_{digest}" + + +def read_pdf_pages(document_path: Path) -> list[dict[str, Any]]: + pages: list[dict[str, Any]] = [] + with document_path.open("rb") as handle: + reader = PyPDF2.PdfReader(handle) + for page_num, page in enumerate(reader.pages, 1): + pages.append({"page": page_num, "content": page.extract_text() or ""}) + return pages + + +def load_structure_json(structure_path: Path) -> dict[str, Any]: + 
with structure_path.open("r", encoding="utf-8") as handle: + payload = json.load(handle) + if not isinstance(payload, dict) or not isinstance(payload.get("structure"), list): + raise ValueError(f"invalid PageIndex structure cache: {structure_path}") + return payload + + +def seed_pageindex_cache( + filesystem: PageIndexFileSystem, + document_path: Path, + *, + documents_dir: Path, +) -> str | None: + structure_path = structure_path_for(document_path, documents_dir) + if structure_path is None: + return None + + filesystem.pageindex_client_workspace.mkdir(parents=True, exist_ok=True) + meta_path = filesystem.pageindex_client_workspace / "_meta.json" + if not meta_path.exists(): + meta_path.write_text("{}", encoding="utf-8") + client = PageIndexClient(workspace=str(filesystem.pageindex_client_workspace)) + canonical_path = str(document_path.resolve()) + for doc_id, doc in client.documents.items(): + if Path(str(doc.get("path") or "")).resolve(strict=False) == Path(canonical_path): + return doc_id + + payload = load_structure_json(structure_path) + doc_id = deterministic_doc_id(document_path) + suffix = document_path.suffix.lower() + if suffix == ".pdf": + pages = read_pdf_pages(document_path) + client.documents[doc_id] = { + "id": doc_id, + "type": "pdf", + "path": canonical_path, + "doc_name": payload.get("doc_name") or document_path.name, + "doc_description": payload.get("doc_description") or "", + "page_count": len(pages), + "structure": payload["structure"], + "pages": pages, + } + elif suffix in {".md", ".markdown"}: + text = document_path.read_text(encoding="utf-8") + client.documents[doc_id] = { + "id": doc_id, + "type": "md", + "path": canonical_path, + "doc_name": payload.get("doc_name") or document_path.name, + "doc_description": payload.get("doc_description") or "", + "line_count": len(text.splitlines()), + "structure": payload["structure"], + } + else: + return None + client._save_doc(doc_id) + return doc_id + + +def content_type_for(path: Path) -> str: + 
suffix = path.suffix.lower() + if suffix == ".pdf": + return "application/pdf" + if suffix in {".md", ".markdown"}: + return "text/markdown" + return "text/plain" + + +def external_id_for(path: Path) -> str: + slug = "".join(ch.lower() if ch.isalnum() else "_" for ch in path.stem).strip("_") + slug = "_".join(part for part in slug.split("_") if part) + return f"example_{slug}" + + +def log_progress(message: str, *, indent: int = 0) -> None: + prefix = " " * indent + print(f"{prefix}{message}", flush=True) + + +def register_demo_metadata_schema(filesystem: PageIndexFileSystem) -> None: + filesystem.metadata.register_schema( + { + "fields": { + "source_collection": { + "type": "string", + "description": "Local example corpus collection.", + }, + "file_format": { + "type": "string", + "description": "Source file extension without the leading dot.", + }, + } + }, + source="demo", + ) + + +def backfill_registered_metadata_values(filesystem: PageIndexFileSystem, file_ref: str) -> None: + entry = filesystem.store.get_file(file_ref) + indexed_metadata = dict(entry.metadata or {}) + with filesystem.store.connect() as conn: + filesystem.store.replace_metadata_values(conn, file_ref, indexed_metadata) + + +def configure_summary_projection_backend( + filesystem: PageIndexFileSystem, + *, + embedding_provider: str, + embedding_model: str, + embedding_dimensions: int, +) -> None: + if not (filesystem.summary_projection_index_dir / "summary_only_vector.sqlite").exists(): + return + filesystem.configure_hybrid_projection_retrieval( + filesystem.summary_projection_index_dir, + embedding_provider=embedding_provider, + embedding_model=embedding_model, + embedding_dimensions=embedding_dimensions, + ) + + +def has_ready_register_outputs(filesystem: PageIndexFileSystem, external_id: str) -> bool: + try: + file_ref = filesystem.store.resolve_file_ref(external_id) + entry = filesystem.store.get_file(file_ref) + except KeyError: + return False + status = entry.metadata_status or {} + fields 
= status.get("fields") or {} + required = ("summary", "doc_type", "domain", "topic") + if any(fields.get(field, {}).get("status") != "generated" for field in required): + return False + summary_projection = (status.get("projection_indexes") or {}).get("summary") or {} + return summary_projection.get("status") == "ready" + + +def register_documents( + filesystem: PageIndexFileSystem, + documents: list[Path], + *, + documents_dir: Path, +) -> list[dict[str, Any]]: + registered: list[dict[str, Any]] = [] + total = len(documents) + for index, document_path in enumerate(documents, 1): + document_path = document_path.resolve() + external_id = external_id_for(document_path) + log_progress(f"[{index}/{total}] {document_path.name}") + log_progress("PageIndex tree cache: checking examples/documents/results", indent=1) + cache_started = time.perf_counter() + cached_doc_id = seed_pageindex_cache( + filesystem, + document_path, + documents_dir=documents_dir, + ) + cache_seconds = time.perf_counter() - cache_started + if cached_doc_id: + log_progress( + f"PageIndex tree cache: ready doc_id={cached_doc_id} ({cache_seconds:.2f}s)", + indent=1, + ) + else: + log_progress( + f"PageIndex tree cache: no cached structure; register() will index if supported ({cache_seconds:.2f}s)", + indent=1, + ) + if has_ready_register_outputs(filesystem, external_id): + file_ref = filesystem.store.resolve_file_ref(external_id) + backfill_registered_metadata_values(filesystem, file_ref) + log_progress( + f"PIFS register: cached file_ref={file_ref}; metadata and summary projection already ready", + indent=1, + ) + registered.append( + { + "file_ref": file_ref, + "external_id": external_id, + "path": str(document_path), + "status": "cached", + "pageindex_doc_id": cached_doc_id, + } + ) + continue + + log_progress( + "PIFS register: running register() -> metadata generation -> summary embedding -> sqlite upsert", + indent=1, + ) + register_started = time.perf_counter() + file_ref = filesystem.register( + 
storage_uri=document_path.as_uri(), + source_path=str(document_path), + folder_path="/documents", + external_id=external_id, + title=document_path.name, + content_type=content_type_for(document_path), + source_type="examples-documents", + metadata={ + "title": document_path.name, + "source_collection": "examples/documents", + "file_format": document_path.suffix.lower().lstrip("."), + }, + ) + register_seconds = time.perf_counter() - register_started + entry = filesystem.store.get_file(file_ref) + field_status = { + field: state.get("status") + for field, state in (entry.metadata_status.get("fields") or {}).items() + } + summary_projection = ( + entry.metadata_status.get("projection_indexes", {}).get("summary", {}) + ) + log_progress( + f"PIFS register: done file_ref={file_ref} ({register_seconds:.2f}s)", + indent=1, + ) + log_progress( + f"metadata: {entry.metadata_status.get('status', 'unknown')} fields={field_status}", + indent=1, + ) + log_progress( + "summary projection: " + f"{summary_projection.get('status', 'not_requested')} " + f"index={summary_projection.get('index_path', '')}", + indent=1, + ) + registered.append( + { + "file_ref": file_ref, + "external_id": external_id, + "path": str(document_path), + "status": entry.metadata_status.get("status", "unknown"), + "pageindex_tree_status": entry.pageindex_tree_status, + "pageindex_doc_id": entry.pageindex_doc_id, + } + ) + return registered + + +def print_section(title: str) -> None: + print("\n" + "#" * 78, flush=True) + print(f"# {title}", flush=True) + print("#" * 78, flush=True) + + +def print_step(title: str, detail: str = "") -> None: + print(f"\n>>> {title}", flush=True) + if detail: + print(f" {detail}", flush=True) + + +def sanitize_preview_text(text: str) -> str: + cleaned = str(text).replace("\r", "\n").replace("\f", "\n") + cleaned = "".join( + ch if ch == "\n" or ch == "\t" or ord(ch) >= 32 else " " + for ch in cleaned + ) + return "\n".join( + re.sub(r"[ \t]{2,}", " ", line).strip() + for line 
in cleaned.splitlines() + ) + + +def compact_lines(text: str, *, max_lines: int = 6, max_chars: int = 900) -> str: + lines = [line for line in sanitize_preview_text(text).splitlines() if line.strip()] + preview = "\n".join(lines[:max_lines]) + if len(preview) > max_chars: + preview = preview[:max_chars].rstrip() + "..." + omitted = len(lines) - min(len(lines), max_lines) + if omitted > 0: + preview += f"\n ... {omitted} more lines" + return preview + + +def find_structure_node(structure: Any, title_fragment: str) -> dict[str, Any] | None: + if isinstance(structure, list): + for item in structure: + found = find_structure_node(item, title_fragment) + if found: + return found + return None + if not isinstance(structure, dict): + return None + if title_fragment.lower() in str(structure.get("title", "")).lower(): + return structure + return find_structure_node(structure.get("nodes", []), title_fragment) + + +def page_range_for_node(node: dict[str, Any] | None) -> str: + if not node: + return "" + ranges: list[tuple[int, int]] = [] + + def collect(item: Any) -> None: + if not isinstance(item, dict): + return + start = item.get("start_index") + end = item.get("end_index") + if isinstance(start, int) and isinstance(end, int): + ranges.append((start, end)) + for child in item.get("nodes") or []: + collect(child) + + collect(node) + if not ranges: + return "" + start = min(item[0] for item in ranges) + end = max(item[1] for item in ranges) + return f"{start}-{end}" if start != end else str(start) + + +def opening_page_range_for_node(node: dict[str, Any] | None, *, max_pages: int = 2) -> str: + if not node: + return "" + ranges: list[tuple[int, int]] = [] + + def collect(item: Any) -> None: + if not isinstance(item, dict): + return + start = item.get("start_index") + end = item.get("end_index") + if isinstance(start, int) and isinstance(end, int): + ranges.append((start, end)) + for child in item.get("nodes") or []: + collect(child) + + collect(node) + if not ranges: + 
return "" + start = min(item[0] for item in ranges) + end = max(item[1] for item in ranges) + preview_end = min(end, start + max_pages - 1) + return f"{start}-{preview_end}" if start != preview_end else str(start) + + +def execute_json_command(executor: PIFSCommandExecutor, command: str) -> dict[str, Any]: + try: + return json.loads(executor.execute(command)) + except Exception as exc: + return {"ok": False, "error": str(exc), "data": None} + + +def show_capability( + *, + label: str, + command: str, + result: str, + raw: str = "", + verbose: bool = False, +) -> None: + print_step(label, command) + print(f" result: {result}", flush=True) + if verbose and raw: + print(" raw:", flush=True) + print(compact_lines(raw, max_lines=10, max_chars=1600), flush=True) + + +def show_registered_documents(registered: list[dict[str, Any]], *, verbose: bool = False) -> None: + print(f"\nRegistered {len(registered)} document(s):", flush=True) + for item in registered: + print( + " - " + f"{Path(str(item.get('path', ''))).name}: " + f"file_ref={item.get('file_ref')} | " + f"status={item.get('status')} | " + f"pageindex_doc_id={item.get('pageindex_doc_id')}", + flush=True, + ) + if verbose: + print("\nRaw registration records:", flush=True) + print(json.dumps(registered, ensure_ascii=False, indent=2), flush=True) + + +def run_smoke_commands( + filesystem: PageIndexFileSystem, + registered: list[dict[str, Any]], + *, + verbose: bool = False, +) -> None: + json_executor = PIFSCommandExecutor(filesystem, json_output=True) + shell_executor = PIFSCommandExecutor(filesystem, json_output=False) + + command = "tree / --depth 2" + tree = execute_json_command(json_executor, command) + folders = (tree.get("data") or {}).get("folders") or [] + documents_folder = next((item for item in folders if item.get("path") == "/documents"), {}) + show_capability( + label="Folder browse", + command=command, + result=f"/documents contains {documents_folder.get('file_count', len(registered))} files", + 
raw=shell_executor.execute(command) if verbose else "", + verbose=verbose, + ) + + command = "ls /documents" + listing = execute_json_command(json_executor, command) + files = (listing.get("data") or {}).get("files") or [] + file_titles = ", ".join(item.get("title", "") for item in files[:3]) + show_capability( + label="List registered files", + command=command, + result=f"{len(files)} files: {file_titles}", + raw=shell_executor.execute(command) if verbose else "", + verbose=verbose, + ) + + command = "stat --schema" + schema = execute_json_command(json_executor, command) + fields = sorted(((schema.get("data") or {}).get("fields") or {}).keys()) + show_capability( + label="Metadata schema", + command=command, + result=", ".join(fields), + raw=shell_executor.execute(command) if verbose else "", + verbose=verbose, + ) + + command = "find /documents --where '{\"source_collection\":\"examples/documents\"}' --limit 5" + found = execute_json_command(json_executor, command) + found_files = found.get("data") or [] + show_capability( + label="Metadata DSL filter", + command=command, + result=f"{len(found_files)} documents matched source_collection=examples/documents", + raw=shell_executor.execute(command) if verbose else "", + verbose=verbose, + ) + + command = 'search-summary "Federal Reserve annual report supervision regulation section page range" /documents' + summary = execute_json_command(json_executor, command) + summary_hits = ((summary.get("data") or {}).get("data") or []) + if summary_hits: + summary_result = f"{len(summary_hits)} summary-vector candidates; top={summary_hits[0].get('external_id')}" + else: + summary_result = "summary-vector command is available, but this tiny two-doc demo returned no candidates" + show_capability( + label="Semantic summary search", + command=command, + result=summary_result, + raw=shell_executor.execute(command) if verbose else "", + verbose=verbose, + ) + + first_target = f"/documents/{Path(str(registered[0]['path'])).name}" if 
registered else None + if not first_target: + return + + command = f"stat {first_target}" + stat = execute_json_command(json_executor, command) + stat_data = stat.get("data") or {} + show_capability( + label="File stat", + command=command, + result=( + f"{stat_data.get('title')} | tree={stat_data.get('pageindex_tree_status')} | " + f"metadata_status={(stat_data.get('metadata_status') or {}).get('status')}" + ), + raw=shell_executor.execute(command) if verbose else "", + verbose=verbose, + ) + + command = f"cat {first_target} --structure" + structure_payload = execute_json_command(json_executor, command) + structure_data = structure_payload.get("data") or {} + structure = structure_data.get("structure") or [] + supervision_node = find_structure_node(structure, "Supervision and Regulation") + supervision_range = page_range_for_node(supervision_node) + show_capability( + label="PageIndex document structure", + command=command, + result=( + "found section 'Supervision and Regulation'" + + (f" with page span {supervision_range}" if supervision_range else "") + ), + raw=shell_executor.execute(command) if verbose else "", + verbose=verbose, + ) + + evidence_range = opening_page_range_for_node(supervision_node) or "1-2" + command = f"cat {first_target} --page {evidence_range}" + page = execute_json_command(json_executor, command) + page_text = str((page.get("data") or {}).get("text") or "") + show_capability( + label="Page evidence", + command=command, + result=compact_lines(page_text, max_lines=3, max_chars=420), + raw=shell_executor.execute(command) if verbose else "", + verbose=verbose, + ) + + command = 'grep -R "Supervision and Regulation" /documents' + grep = execute_json_command(json_executor, command) + grep_hits = ((grep.get("data") or {}).get("data") or []) + show_capability( + label="Lexical grep", + command=command, + result=f"{len(grep_hits)} real text matches", + raw=shell_executor.execute(command) if verbose else "", + verbose=verbose, + ) + + +def main() -> 
None: + args = parse_args() + require_runtime_environment( + metadata_provider=args.metadata_provider, + embedding_provider=args.embedding_provider, + ) + workspace = args.workspace.expanduser() + documents_dir = args.documents_dir.expanduser() + if args.reset and workspace.exists(): + shutil.rmtree(workspace) + workspace.mkdir(parents=True, exist_ok=True) + + documents = resolve_requested_documents(documents_dir, args.document) + if args.max_docs > 0: + documents = documents[: args.max_docs] + if not documents: + raise RuntimeError(f"no cached example documents found under {documents_dir}") + + filesystem = PageIndexFileSystem( + workspace, + metadata_generator=MetadataGenerator( + provider=args.metadata_provider, + model=args.metadata_model, + ), + summary_projection_embedding_provider=args.embedding_provider, + summary_projection_embedding_model=args.embedding_model, + summary_projection_embedding_dimensions=args.embedding_dimensions, + ) + register_demo_metadata_schema(filesystem) + + print_section("STEP 1/3 Register Documents") + print(f"Workspace: {workspace}", flush=True) + print(f"Documents: {len(documents)}", flush=True) + registered = register_documents(filesystem, documents, documents_dir=documents_dir) + configure_summary_projection_backend( + filesystem, + embedding_provider=args.embedding_provider, + embedding_model=args.embedding_model, + embedding_dimensions=args.embedding_dimensions, + ) + show_registered_documents(registered, verbose=args.verbose) + + print_section("STEP 2/3 Explore PIFS Tool Surface") + run_smoke_commands(filesystem, registered, verbose=args.verbose) + + if args.prepare_only: + return + + print_section("STEP 3/3 Ask An Agent Using Only PIFS") + print(f"Question: {args.question}", flush=True) + answer = run_pifs_agent( + filesystem, + args.question, + model=args.model, + root="/", + system_prompt=PIFS_DEMO_AGENT_PROMPT, + max_turns=args.max_turns, + max_seconds=args.max_seconds, + verbose=args.verbose, + 
stream_mode=args.stream_mode, + reasoning_effort=args.reasoning_effort, + reasoning_summary=args.reasoning_summary, + ) + if answer: + print("\nFinal answer:", flush=True) + print(answer, flush=True) + + +if __name__ == "__main__": + main() diff --git a/pageindex/__init__.py b/pageindex/__init__.py index 658003bf5..c3fb0b0ae 100644 --- a/pageindex/__init__.py +++ b/pageindex/__init__.py @@ -1,4 +1,22 @@ -from .page_index import * -from .page_index_md import md_to_tree -from .retrieve import get_document, get_document_structure, get_page_content -from .client import PageIndexClient +import os + +os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "true") + +_OPTIONAL_CORE_IMPORTS = {"litellm", "openai", "PyPDF2", "pymupdf"} + +try: + from .page_index import * + from .page_index_md import md_to_tree + from .retrieve import get_document, get_document_structure, get_page_content + from .client import PageIndexClient +except ModuleNotFoundError as exc: + if exc.name not in _OPTIONAL_CORE_IMPORTS: + raise + + +def __getattr__(name: str): + if name == "PageIndexFileSystem": + from .filesystem import PageIndexFileSystem + + return PageIndexFileSystem + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/pageindex/filesystem/__init__.py b/pageindex/filesystem/__init__.py new file mode 100644 index 000000000..7908393d8 --- /dev/null +++ b/pageindex/filesystem/__init__.py @@ -0,0 +1,63 @@ +from importlib import import_module +from typing import TYPE_CHECKING + +from .commands import PIFSCommandExecutor +from .core import PageIndexFileSystem +from .metadata_generation import ( + MetadataGenerationBackend, + MetadataGenerationError, + MetadataGenerationInput, + MetadataGenerationResult, + MetadataGenerator, +) +from .types import OpenResult, SearchResult + +if TYPE_CHECKING: + from .hybrid_projection import HybridProjectionSearchBackend + from .projection_indexing import SummaryProjectionIndexer + from .semantic_index import ( + 
RebuildableSemanticIndex, + SemanticIndexRecord, + SemanticSearchResult, + SQLiteVecSemanticIndex, + ) + +_LAZY_EXPORTS = { + "HybridProjectionSearchBackend": (".hybrid_projection", "HybridProjectionSearchBackend"), + "RebuildableSemanticIndex": (".semantic_index", "RebuildableSemanticIndex"), + "SemanticIndexRecord": (".semantic_index", "SemanticIndexRecord"), + "SemanticSearchResult": (".semantic_index", "SemanticSearchResult"), + "SQLiteVecSemanticIndex": (".semantic_index", "SQLiteVecSemanticIndex"), + "SummaryProjectionIndexer": (".projection_indexing", "SummaryProjectionIndexer"), +} + +__all__ = [ + "OpenResult", + "HybridProjectionSearchBackend", + "MetadataGenerationBackend", + "MetadataGenerationError", + "MetadataGenerationInput", + "MetadataGenerationResult", + "MetadataGenerator", + "PIFSCommandExecutor", + "PageIndexFileSystem", + "RebuildableSemanticIndex", + "SearchResult", + "SemanticIndexRecord", + "SemanticSearchResult", + "SummaryProjectionIndexer", + "SQLiteVecSemanticIndex", +] + + +def __getattr__(name: str): + if name in _LAZY_EXPORTS: + module_name, attribute_name = _LAZY_EXPORTS[name] + value = getattr(import_module(module_name, __name__), attribute_name) + globals()[name] = value + return value + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__() -> list[str]: + return sorted(set(globals()) | set(__all__) | set(_LAZY_EXPORTS)) diff --git a/pageindex/filesystem/agent.py b/pageindex/filesystem/agent.py new file mode 100644 index 000000000..b1f162504 --- /dev/null +++ b/pageindex/filesystem/agent.py @@ -0,0 +1,661 @@ +from __future__ import annotations + +import asyncio +import concurrent.futures +import json +import os +import re +import sys +import time +from dataclasses import asdict, is_dataclass +from typing import Any, Mapping, TextIO + +from .commands import PIFSCommandError, PIFSCommandExecutor +from .core import PageIndexFileSystem + + +TRUTHY_ENV_VALUES = {"1", "true", "yes", "on"} 
+PIFS_AGENT_TRACING_ENV = "PAGEINDEX_PIFS_AGENT_TRACING" +PIFS_AGENT_RAW_REASONING_ENV = "PAGEINDEX_PIFS_AGENT_RAW_REASONING" + +AGENT_SYSTEM_PROMPT = """ +You are the PageIndex FileSystem Demo Agent, developed by the VectifyAI Team. +Your job is to answer questions about the caller's current PageIndex FileSystem +workspace. + +You can inspect the corpus only by calling the bash tool. The bash tool is a +read-only PageIndex virtual shell, not a real operating-system shell. + +If the user asks who you are, answer with this identity and mention that you can +help inspect and answer questions about the current PIFS workspace. If the user +asks a general question unrelated to the current workspace, do not answer it as +a general-purpose assistant; briefly say that you can only answer workspace- +related questions and invite them to ask about files, folders, metadata, or +document contents in the workspace. + +If the user asks what tools or capabilities you have, describe only the PIFS +virtual shell capabilities available inside this workspace: ls, tree, find, +stat, grep, cat, and semantic search commands such as search-summary when they +are available. Do not mention host runtime tools, SDK internals, or orchestration +helpers that are not part of the PIFS shell. + +If the user asks a workspace-related topic question without naming a specific +file, treat it as a retrieval task. Use available PIFS discovery commands to +look for relevant files and inspect evidence before answering. Ask the user to +clarify only after a reasonable search cannot identify relevant evidence. +Do not conclude that no relevant document exists from one failed grep. If grep +returns no matches for a workspace topic, verify with available semantic +candidate discovery such as search-summary, or inspect likely document +structure, before saying that the workspace lacks evidence. + +Follow the task prompt for command policy, retrieval strategy, and answer +format. 
If the caller needs stricter behavior, pass an explicit system_prompt. +""" + +BASH_TOOL_DESCRIPTION = """ +Run a command in the PageIndex FileSystem virtual shell. This is not a real +operating-system shell. By default the tool is read-only: use ls, tree, find, +grep, cat, stat, head, tail, sed, and any dynamically available semantic search +commands described in the workspace context. grep -R is lexical evidence search; +grep does not support regex alternation such as "a|b"; run multiple grep +commands or use search-summary for semantic candidate discovery instead. +semantic search commands such as search-summary return candidate documents and +do not guarantee literal text matches or final answer evidence. After choosing +a likely search-summary candidate, verify the relevant claim with cat before +answering. Use search-summary when the user asks for summary search, semantic +search, or vector search and the command is listed as available. Quote +multi-word semantic queries, for example: +search-summary "Federal Reserve" /documents. Do not write +search-summary Federal Reserve /documents. Errors are returned as text prefixed +with ERROR. Do not call +commands that are not listed as available. When evidence is required, inspect it +with cat or grep before answering. Prefer shell-like target-first cat syntax +with stable targets: cat --structure, cat --page 31-59, and +cat --node 0009. You may also use file_ref or document_id when a path is +ambiguous. Do not reconstruct paths from document titles; use exact targets +returned by PIFS commands and quote paths containing spaces. After structure +identifies a relevant section node, prefer +cat --node ; use cat --page when the user asks +for page-level evidence, no suitable node exists, or exact page text is needed. +cat --structure is paginated; request more with --offset if needed. 
Page +reads are limited to five pages at once, node reads to at most ten node ids, +and text cat --all returns only the first page of text lines. If a cat limit +error requires a smaller call, stop when the evidence is sufficient; otherwise +continue with another chunk before answering. +For questions about metadata fields, available summaries, or whether metadata +was provided, inspect stat --schema and stat before making claims. +Do not use stat as a general content/topic discovery step. For document Q&A, +prefer search-summary/find/grep for candidates, then cat --structure and +cat --node or cat --page for evidence. +""" + +AGENT_TOOL_POLICY = """ +Tool policy: +- The bash tool is a PageIndex virtual shell, not an operating-system shell. +- The default agent tool surface is read-only. +- Use only commands listed in the workspace capabilities. +- Folder paths such as /documents are positional command targets; never put folder paths in --where. +- Use --where only with metadata fields shown by stat --schema. +- grep -R performs lexical evidence search. +- grep does not support regex alternation such as "a|b"; run separate grep commands or use search-summary for semantic candidate discovery. +- Semantic search commands are candidate-discovery tools and do not guarantee literal text matches or final answer evidence. After selecting a likely search-summary candidate, verify the relevant facts with cat before answering. +- Do not use find | grep as an exhaustive search or as proof that no document exists; find output can be scoped or limited. Use metadata filters, search-summary, grep on a narrowed target, or cat on likely candidates instead. +- A single failed grep is not enough evidence to say there is no relevant document. If grep returns no matches for a workspace-topic question, verify with search-summary or another available semantic/vector candidate command, or inspect likely document structure, before answering no-evidence. 
+- If search-summary is available and the user asks for summary search, semantic search, vector search, or "用 summary 搜", use search-summary "" ; quote multi-word queries, for example search-summary "Federal Reserve" /documents; do not translate that request into find --where. +- Tool errors are returned as ERROR text; recover by trying an available command. +- Use cat or grep to gather evidence before making source-backed claims. +- Do not reconstruct a file path from a title. Use exact paths returned by PIFS commands, or use file_ref/document_id when available; quote paths that contain spaces. +- For broad topic, method, or "what solution" questions that are likely about the workspace, search for candidate documents before asking the user to choose a document. +- Use stat only for metadata/schema/status questions or to resolve ambiguous target identity. Do not run stat merely to understand what a document says. +- Prefer target-first cat syntax with stable targets: cat --structure, cat --page 31-59, cat --node . +- cat --structure returns at most 25 nodes; use --offset and --limit for more structure pages. +- cat --page accepts at most 5 pages at once. If a larger range is needed, first inspect cat --structure and then read a smaller page range or node. +- cat --node accepts at most 10 node ids at once. Prefer relevant nodes from structure when possible. +- When recovering from cat page/node/text limit errors, stop if the evidence is sufficient; if it is not sufficient, make another smaller call before answering. +- cat --all returns at most 100 text lines; use cat --range - for the next page. +- After cat --structure finds a relevant section/subsection with a node_id, prefer cat --node for content from that semantic unit. +- Use cat --page - when the user explicitly asks for pages/page ranges, when no suitable node_id exists, or when you need exact page text to verify page-level evidence. 
def should_use_openai_compatible_chat_model(base_url: str | None) -> bool:
    """Return True when ``base_url`` points at a non-official OpenAI endpoint.

    The URL is compared after trimming surrounding whitespace and trailing
    slashes. Scheme and host are case-insensitive, so the comparison against
    the official endpoints ignores case.

    Args:
        base_url: Value of the configured base URL, or ``None``.

    Returns:
        ``False`` for a missing, empty, or whitespace-only URL and for the
        official ``https://api.openai.com`` endpoints; ``True`` otherwise.
    """
    if not base_url:
        return False
    normalized = base_url.strip().rstrip("/")
    if not normalized:
        # A blank value (e.g. "  ") is not a custom endpoint.
        return False
    official_endpoints = {"https://api.openai.com", "https://api.openai.com/v1"}
    return normalized.lower() not in official_endpoints
def build_agent_model_settings(
    *,
    reasoning_effort: str | None = None,
    reasoning_summary: str | None = None,
) -> Any | None:
    """Build the agent ``ModelSettings`` for the requested reasoning options.

    Args:
        reasoning_effort: Requested effort; validated by
            ``normalize_reasoning_effort``.
        reasoning_summary: Requested summary mode; validated by
            ``normalize_reasoning_summary``.

    Returns:
        ``None`` when neither option yields a setting; otherwise a
        ``ModelSettings`` carrying a ``Reasoning`` object and low verbosity.

    Raises:
        ValueError: If either option is not an allowed choice.
    """
    effort = normalize_reasoning_effort(reasoning_effort)
    summary = normalize_reasoning_summary(reasoning_summary)
    # normalize_reasoning_summary() folds an explicit "none" into None, which
    # is indistinguishable from "not specified". Inspect the raw value so an
    # explicit opt-out is not silently replaced by the "auto" default below.
    summary_disabled = (
        reasoning_summary is not None
        and reasoning_summary.strip().lower() == "none"
    )
    if effort is None and summary is None:
        return None
    if effort not in {None, "none"} and summary is None and not summary_disabled:
        summary = "auto"

    # Imported lazily so the module stays importable without the agent SDK.
    from agents import ModelSettings
    from openai.types.shared import Reasoning

    reasoning_kwargs = {}
    if effort is not None:
        reasoning_kwargs["effort"] = effort
    if summary is not None:
        reasoning_kwargs["summary"] = summary
    return ModelSettings(reasoning=Reasoning(**reasoning_kwargs), verbosity="low")
def serialize_agent_final_output(value: Any) -> str:
    """Convert an agent's final output into a string.

    Args:
        value: The final output object returned by the agent run.

    Returns:
        ``""`` for ``None``; the string itself for ``str``; the result of
        ``model_dump_json()`` for instances exposing it; JSON text (non-ASCII
        preserved) for dataclass instances, dicts, and lists; ``str(value)``
        for anything else, including classes.
    """
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    # Classes expose the same attributes as their instances, but calling
    # model_dump_json()/asdict() on a class raises TypeError; only treat
    # real instances as structured output.
    is_instance = not isinstance(value, type)
    if is_instance and hasattr(value, "model_dump_json"):
        return value.model_dump_json()
    if is_instance and is_dataclass(value):
        return json.dumps(asdict(value), ensure_ascii=False)
    if isinstance(value, (dict, list)):
        return json.dumps(value, ensure_ascii=False)
    return str(value)
def build_agent_initial_context(
    filesystem: PageIndexFileSystem,
    *,
    root: str = "/",
    executor: PIFSCommandExecutor | None = None,
    query_context: str | None = None,
) -> str:
    """Render the workspace overview that is appended to the agent instructions.

    The overview contains the root path, the output of ``ls`` on that root, a
    JSON summary of the metadata schema (field count plus at most 50 sample
    fields), and the executor's description of available command surfaces.

    Args:
        filesystem: Workspace whose metadata schema is summarized.
        root: Folder listed as the top-level view.
        executor: Executor to reuse; a text-output executor is created when
            omitted.
        query_context: Passed to the executor created when ``executor`` is
            omitted.

    Returns:
        The overview as newline-joined text.
    """
    import shlex

    executor = executor or PIFSCommandExecutor(
        filesystem,
        json_output=False,
        query_context=query_context,
    )
    schema = filesystem._metadata_schema()
    schema_fields = schema.get("fields", {})
    schema_sample = dict(list(schema_fields.items())[:50])
    # Quote the root so a folder name containing spaces stays one argument,
    # mirroring how the CLI passthrough quotes its command tokens.
    listing = executor.execute(f"ls {shlex.quote(root)}")
    schema_summary = json.dumps(
        {
            "field_count": len(schema_fields),
            "sample_fields": schema_sample,
        },
        ensure_ascii=False,
    )
    return "\n".join(
        [
            f"Root path: {root}",
            "Top-level listing:",
            listing,
            "Metadata schema summary:",
            schema_summary,
            "Workspace retrieval capabilities:",
            executor.describe_available_command_surfaces(),
        ]
    )
"think": [], + "think_summary": [], + "tool_args": [], + } + + @property + def wants_model_stream(self) -> bool: + return self.stream_mode in {"model", "all"} + + @property + def wants_tool_stream(self) -> bool: + return self.stream_mode in {"tools", "all"} + + @property + def has_output_text(self) -> bool: + return bool(self._buffers["output"]) + + def handle_event(self, event: Any) -> None: + if getattr(event, "type", None) == "raw_response_event": + self._handle_raw_response_event(getattr(event, "data", None)) + elif getattr(event, "type", None) == "run_item_stream_event": + self._handle_run_item_event(event) + + def finish(self, final_output: Any = None) -> None: + if self.wants_model_stream and not self.has_output_text and final_output: + self._emit("output", str(final_output), "[llm final output stream]") + if self._printed_section is not None: + print(file=self.output, flush=True) + self._printed_section = None + if self.stream_log is not None: + for kind, parts in self._buffers.items(): + text = "".join(parts) + if text: + self.stream_log.append({"kind": kind, "text": text}) + + def _handle_raw_response_event(self, data: Any) -> None: + event_type = getattr(data, "type", "") + delta = getattr(data, "delta", None) + if not isinstance(delta, str) or not delta: + return + if event_type == "response.output_text.delta": + self._emit("output", delta, "[llm final output stream]") + elif event_type == "response.reasoning_text.delta": + if self.include_raw_reasoning: + self._emit("think", delta, "[llm reasoning text stream]") + elif event_type == "response.reasoning_summary_text.delta": + self._emit("think_summary", delta, "[llm reasoning summary stream]") + elif event_type == "response.function_call_arguments.delta": + self._buffers["tool_args"].append(delta) + + def _handle_run_item_event(self, event: Any) -> None: + name = getattr(event, "name", "") + item = getattr(event, "item", None) + item_type = getattr(item, "type", "") + if self.stream_log is not None and 
def emit_tool_result(
    self,
    *,
    ok: bool,
    output: str,
    seconds: float,
    force: bool = False,
    preview_chars: int = 1000,
) -> None:
    """Record and optionally print the outcome of one PIFS tool call.

    When a stream log is attached, a ``tool_result`` entry is appended with
    the status, rounded duration, output length, and a compact preview. The
    same preview is printed when ``force`` is set or tool streaming is on.

    Args:
        ok: Whether the command succeeded.
        output: Full text returned to the agent.
        seconds: Duration of the command.
        force: Print even when the stream mode excludes tool output.
        preview_chars: Character budget passed to the preview builder.
    """
    should_print = force or self.wants_tool_stream
    if self.stream_log is None and not should_print:
        return

    # Build the preview once; it feeds both the log entry and the terminal.
    preview = compact_tool_output_preview(output, preview_chars=preview_chars)
    if self.stream_log is not None:
        self.stream_log.append(
            {
                "kind": "tool_result",
                "ok": ok,
                "seconds": round(seconds, 4),
                "output_chars": len(output),
                "preview": preview,
            }
        )
    if not should_print:
        return
    self._start_section("tool_result", "[pifs -> llm result preview]")
    print(
        f"ok={str(ok).lower()} seconds={seconds:.4f} output_chars={len(output)}",
        file=self.output,
        flush=True,
    )
    print(preview, file=self.output, flush=True)
def run_pifs_agent(
    filesystem: PageIndexFileSystem,
    question: str,
    *,
    model: str,
    root: str = "/",
    system_prompt: str | None = None,
    max_turns: int = 20,
    max_seconds: float | None = 60,
    verbose: bool = False,
    stream_mode: str = "off",
    reasoning_effort: str | None = None,
    reasoning_summary: str | None = None,
    output_type: type[Any] | None = None,
    tool_log: list[dict[str, Any]] | None = None,
    agent_log: list[dict[str, Any]] | None = None,
) -> str:
    """Answer a single question with a one-shot PIFS agent session.

    All keyword options are forwarded unchanged to ``PIFSAgentSession``. The
    session is created with ``persist_conversation=False`` and is used for
    exactly one ``run`` call.

    Returns:
        The string returned by ``PIFSAgentSession.run(question)``.
    """
    session_options: dict[str, Any] = {
        "model": model,
        "root": root,
        "system_prompt": system_prompt,
        "max_turns": max_turns,
        "max_seconds": max_seconds,
        "verbose": verbose,
        "stream_mode": stream_mode,
        "reasoning_effort": reasoning_effort,
        "reasoning_summary": reasoning_summary,
        "output_type": output_type,
        "tool_log": tool_log,
        "agent_log": agent_log,
        # One-shot helper: no conversation memory is kept.
        "persist_conversation": False,
    }
    return PIFSAgentSession(filesystem, **session_options).run(question)
+ raise RuntimeError( + "openai-agents is required to run the PageIndex FileSystem agent" + ) from exc + raise + + set_tracing_disabled(should_disable_pifs_agent_tracing()) + self.executor = PIFSCommandExecutor(filesystem, json_output=False) + instructions = build_pifs_agent_instructions( + filesystem, + root=root, + system_prompt=system_prompt, + executor=self.executor, + ) + + @function_tool(description_override=BASH_TOOL_DESCRIPTION.strip()) + def bash(command: str) -> str: + """Run an allowed PageIndex FileSystem virtual shell command.""" + return self._run_bash(command) + + model_settings = build_agent_model_settings( + reasoning_effort=reasoning_effort, + reasoning_summary=reasoning_summary, + ) + base_url = os.environ.get("OPENAI_BASE_URL") + model_config = model + if should_use_openai_compatible_chat_model(base_url): + model_config = OpenAIChatCompletionsModel( + model=model, + openai_client=AsyncOpenAI( + api_key=os.environ.get("OPENAI_API_KEY"), + base_url=base_url, + ), + ) + + agent_kwargs: dict[str, Any] = { + "name": "PageIndexFileSystem", + "instructions": instructions, + "tools": [bash], + "model": model_config, + } + if model_settings is not None: + agent_kwargs["model_settings"] = model_settings + if output_type is not None: + agent_kwargs["output_type"] = output_type + self.agent = Agent(**agent_kwargs) + self.session = SQLiteSession("pifs-chat") if persist_conversation else None + + def run(self, question: str) -> str: + self.executor.query_context = extract_agent_question_text(question) + self.observer = PIFSAgentStreamObserver( + self.normalized_stream_mode, + stream_log=self.agent_log, + ) + + async def _run_streamed() -> str: + from agents import Runner + + streamed_run = Runner.run_streamed( + self.agent, + question, + max_turns=self.max_turns, + session=self.session, + ) + final_output = "" + try: + async for event in streamed_run.stream_events(): + self.observer.handle_event(event) + final_output = 
def _run_bash(self, command: str) -> str:
    """Execute one virtual-shell command on behalf of the agent's bash tool.

    The command is announced to the stream observer, executed, recorded in
    ``tool_log`` when one was supplied (with a 500-character preview), and
    reported back to the observer with its duration.

    Args:
        command: Command line issued by the model.

    Returns:
        The command output, or ``"ERROR: ..."`` text when the executor
        raises ``PIFSCommandError``.

    Raises:
        RuntimeError: If called before ``run()`` has installed an observer.
    """
    observer = self.observer
    if observer is None:
        # Explicit check instead of ``assert``, which is stripped under -O.
        raise RuntimeError("PIFS bash tool invoked outside of PIFSAgentSession.run()")

    # perf_counter is monotonic, so the duration cannot go negative or jump
    # when the system clock is adjusted.
    started = time.perf_counter()
    ok = True
    observer.emit_tool_call(command, force=self.verbose)
    try:
        output = self.executor.execute(command)
    except PIFSCommandError as exc:
        ok = False
        output = f"ERROR: {exc}"
    seconds = time.perf_counter() - started
    if self.tool_log is not None:
        self.tool_log.append(
            {
                "command": command,
                "ok": ok,
                "seconds": round(seconds, 4),
                "output_chars": len(output),
                "preview": output[:500],
            }
        )
    observer.emit_tool_result(
        ok=ok,
        output=output,
        seconds=seconds,
        force=self.verbose,
    )
    return output
def _read_config() -> dict[str, str]:
    """Load the persisted PIFS CLI configuration.

    Returns:
        The config mapping with keys and values coerced to ``str`` and
        ``None`` values dropped; an empty dict when the file does not exist.

    Raises:
        ValueError: If the file is not valid JSON or its top-level value is
            not an object.
    """
    path = _config_path()
    try:
        # Open directly instead of checking exists() first, so a file removed
        # between the check and the open cannot cause a crash.
        with path.open("r", encoding="utf-8") as handle:
            payload = json.load(handle)
    except (FileNotFoundError, NotADirectoryError):
        return {}
    except json.JSONDecodeError as exc:
        # Name the offending file; the raw decode error only gives an offset.
        raise ValueError(f"invalid PIFS config file: {path}") from exc
    if not isinstance(payload, dict):
        raise ValueError(f"invalid PIFS config file: {path}")
    return {str(key): str(value) for key, value in payload.items() if value is not None}
{env_path}") + load_dotenv(env_path, override=True) + return env_path + + env_override = os.environ.get("PIFS_ENV_FILE") + if env_override: + return _load_env_file(env_override) + + starts = [Path.cwd()] + if workspace: + starts.append(Path(workspace).expanduser()) + seen: set[Path] = set() + for start in starts: + current = start.resolve() if start.exists() else start.resolve(strict=False) + if current.is_file(): + current = current.parent + for parent in (current, *current.parents): + candidate = parent / ".env" + if candidate in seen: + continue + seen.add(candidate) + if candidate.exists(): + load_dotenv(candidate, override=False) + return candidate + return None + + +def _agent_model_default() -> str: + return ( + os.environ.get("PIFS_AGENT_MODEL") + or os.environ.get("PIFS_MODEL") + or DEFAULT_AGENT_MODEL + ) + + +def _add_agent_arguments( + parser: argparse.ArgumentParser, + *, + workspace_default: str | None, + default_stream_mode: str, +) -> None: + parser.add_argument("--workspace", default=workspace_default) + parser.add_argument("--env-file", default=None) + parser.add_argument("--model", default=_agent_model_default()) + parser.add_argument( + "--stream-mode", + default=default_stream_mode, + choices=AGENT_STREAM_MODE_CHOICES, + ) + parser.add_argument("--max-turns", type=int, default=20) + parser.add_argument("--max-seconds", type=float, default=60) + parser.add_argument( + "--reasoning-effort", + choices=REASONING_EFFORT_CHOICES, + default=None, + ) + parser.add_argument( + "--reasoning-summary", + choices=REASONING_SUMMARY_CHOICES, + default=None, + ) + + +def _parse_agent_command( + command_name: str, + argv: list[str], + *, + workspace_default: str | None, + default_stream_mode: str, +) -> argparse.Namespace: + parser = argparse.ArgumentParser( + prog=f"pifs {command_name}", + description=f"PageIndex FileSystem {command_name}", + ) + _add_agent_arguments( + parser, + workspace_default=workspace_default, + default_stream_mode=default_stream_mode, + 
def _sanitize_chat_question(raw: str) -> str:
    """Clean a line typed at the chat prompt.

    ANSI escape sequences are removed first. Backspace and DEL characters
    then erase the previously kept character, every other control character
    (including CR and LF) is dropped, and the result is stripped.
    """
    kept: list[str] = []
    for symbol in ANSI_ESCAPE_RE.sub("", raw):
        if symbol == "\b" or symbol == "\x7f":
            # Erase the last kept character; a no-op when nothing is kept.
            del kept[-1:]
        elif ord(symbol) >= 32:
            kept.append(symbol)
    return "".join(kept).strip()
def _run_chat(argv: list[str], *, workspace_default: str | None) -> int:
    """Run the interactive ``pifs chat`` loop.

    Questions are read from the ``pifs> `` prompt until EOF, Ctrl-C at the
    prompt, or one of ``EXIT_COMMANDS``. Each question is answered by the
    same ``PIFSAgentSession``; the answer is printed here only when the
    stream mode is ``off``.

    Args:
        argv: Arguments following the ``chat`` subcommand.
        workspace_default: Workspace used when ``--workspace`` is omitted.

    Returns:
        ``0`` when the loop ends.
    """
    args = _parse_agent_command(
        "chat",
        argv,
        workspace_default=workspace_default,
        default_stream_mode="all",
    )
    filesystem = _filesystem_from_workspace(args.workspace)
    session = PIFSAgentSession(filesystem, **_agent_kwargs(args))
    while True:
        try:
            question = _sanitize_chat_question(input("pifs> "))
        except EOFError:
            break
        except KeyboardInterrupt:
            print()
            break
        if not question:
            continue
        if question.lower() in EXIT_COMMANDS:
            break
        try:
            with _suppress_tty_input_echo():
                answer = session.run(question)
        except Exception as exc:
            # A failed turn (for example the per-question timeout) should not
            # end the whole session; report it and prompt again.
            print(f"ERROR: {exc}", file=sys.stderr)
            continue
        if args.stream_mode == "off":
            print(answer)
    return 0
def main(argv: list[str] | None = None) -> int:
    """Entry point of the PageIndex FileSystem command-line interface.

    ``set``, ``ask`` and ``chat`` are dispatched to their handlers; any other
    command is passed through to the PIFS command executor.

    Args:
        argv: Command-line arguments; defaults to ``sys.argv[1:]``.

    Returns:
        ``0`` on success, ``2`` when a ``PIFSCommandError`` is raised, and
        ``1`` for any other error. Argument errors exit through argparse.
    """
    argv = list(sys.argv[1:] if argv is None else argv)
    parser = argparse.ArgumentParser(description="PageIndex FileSystem CLI")
    parser.add_argument("--workspace", default=None)
    parser.add_argument("--env-file", default=None)
    parser.add_argument("--json", action="store_true", dest="json_output")
    parser.add_argument("command", nargs=argparse.REMAINDER)

    try:
        # Env-file and config loading can fail (missing env file, corrupt
        # config); keep them inside the handler so the user gets an ERROR
        # line and an exit code instead of a traceback.
        _load_env_file()
        args = parser.parse_args(argv)
        _load_env_file(args.env_file, workspace=args.workspace)
        args.workspace = _resolve_workspace(args.workspace)

        command_tokens = [token for token in args.command if token != "--"]
        json_output = args.json_output

        if not command_tokens:
            parser.error("a filesystem command is required")

        command_name = command_tokens[0]
        command_args = command_tokens[1:]
        if command_name == "set":
            return _run_set(command_args)
        if command_name == "ask":
            return _run_ask(command_args, workspace_default=args.workspace)
        if command_name == "chat":
            return _run_chat(command_args, workspace_default=args.workspace)

        # --json may also appear after the command name, where REMAINDER
        # captures it as a command token.
        if "--json" in command_tokens:
            command_tokens = [token for token in command_tokens if token != "--json"]
            json_output = True
        if not args.workspace:
            parser.error("--workspace is required unless PIFS_WORKSPACE is set or `pifs set workspace ` has been run")
        return _run_passthrough(
            command_tokens,
            workspace=args.workspace,
            json_output=json_output,
        )
    except PIFSCommandError as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        return 2
    except Exception as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        return 1
def __init__(
    self,
    filesystem: PageIndexFileSystem,
    *,
    json_output: bool = False,
    query_context: str | None = None,
):
    """Store the filesystem handle and the rendering/query options."""
    self.filesystem = filesystem
    self.json_output = json_output
    self.query_context = query_context


def allowed_commands(self) -> set[str]:
    """Return the command names that are usable in this workspace.

    The base commands are always present; a ``search-*`` command is added
    for every semantic channel the filesystem reports, and
    ``semantic-grep`` when at least one grep-capable channel exists.
    """
    result = set(self.BASE_ALLOWED_COMMANDS)
    active = set(self.filesystem.semantic_retrieval_channels())
    result |= {
        self.SEMANTIC_CHANNEL_COMMANDS[name]
        for name in SEMANTIC_RETRIEVAL_CHANNELS
        if name in active
    }
    if not active.isdisjoint(SEMANTIC_GREP_CHANNELS):
        result.add("semantic-grep")
    return result


def command_capabilities(self) -> dict[str, Any]:
    """Describe the allowed commands (sorted) and the retrieval capabilities."""
    names = sorted(self.allowed_commands())
    retrieval = self.filesystem.retrieval_capabilities()
    return {"allowed_commands": names, "retrieval": retrieval}
def execute(self, command: str) -> str:
    """Run a (possibly chained and piped) PIFS command line and return its text.

    Args:
        command: Raw command string typed by the user or the agent.

    Returns:
        Rendered output; chained commands are joined with newlines.

    Raises:
        PIFSCommandError: For empty input, too many chained commands, or any
            ``KeyError``/``ValueError``/``IndexError`` raised while running it.
    """
    try:
        if not command.strip():
            raise PIFSCommandError("Empty command")
        commands = self._split_chained_commands(command)
        if len(commands) > self.MAX_CHAINED_COMMANDS:
            raise PIFSCommandError(
                f"Command chain supports at most {self.MAX_CHAINED_COMMANDS} commands. "
                "Run fewer commands or narrow the request first; if you are unsure where "
                "to inspect, use cat --structure."
            )
        if len(commands) > 1:
            return "\n".join(self._execute_pipeline(part) for part in commands)
        return self._execute_pipeline(commands[0])
    except PIFSCommandError:
        raise
    except (KeyError, ValueError) as exc:
        raise PIFSCommandError(self._clean_error_message(exc)) from exc
    except IndexError as exc:
        # Several option parsers advance to the next token without a bounds
        # check, so a trailing flag such as ``ls --limit`` ends up here.
        # Report it as a command error like the other failure types.
        raise PIFSCommandError(
            "Malformed command: an option is probably missing its value "
            f"({command.strip()!r})"
        ) from exc


def _execute_pipeline(self, command: str) -> str:
    """Run one command and feed its output through any ``|`` filters.

    Raises:
        PIFSCommandError: If the pipeline has more than ``MAX_PIPE_COMMANDS``
            stages.
    """
    commands = self._split_piped_commands(command)
    if len(commands) > self.MAX_PIPE_COMMANDS:
        raise PIFSCommandError(
            f"Pipeline supports at most {self.MAX_PIPE_COMMANDS} commands. "
            "Use a smaller command and explicit limits; if you are unsure where "
            "to inspect, use cat --structure."
        )
    output = self._execute_single(commands[0])
    for pipe_command in commands[1:]:
        output = self._execute_pipe_filter(output, pipe_command)
    return output


def _execute_single(self, command: str) -> str:
    """Validate, tokenize, dispatch and render a single command.

    A ``--json`` token anywhere in the command forces JSON output for this
    call; otherwise the executor-level ``json_output`` setting applies.

    Raises:
        PIFSCommandError: On invalid syntax, an empty command, or a command
            that is not allowed in this workspace.
    """
    self._validate_raw_command(command)
    try:
        tokens = shlex.split(command)
    except ValueError as exc:
        raise PIFSCommandError(f"Invalid command syntax: {exc}") from exc
    if not tokens:
        raise PIFSCommandError("Empty command")
    self._validate_tokens(tokens)
    if "--json" in tokens:
        tokens = [token for token in tokens if token != "--json"]
        json_output = True
    else:
        json_output = self.json_output
    name = tokens[0]
    if name not in self.allowed_commands():
        raise PIFSCommandError(f"Unsupported command: {name}")
    # Hyphenated commands are mapped explicitly; the rest follow _cmd_NAME.
    method_name = self.COMMAND_METHODS.get(name, f"_cmd_{name}")
    data = getattr(self, method_name)(tokens[1:])
    return self._render(data, json_output=json_output, command_name=name)
+ ) + if name == "head": + return self._pipe_head_tail(input_text, tokens[1:], from_tail=False) + if name == "tail": + return self._pipe_head_tail(input_text, tokens[1:], from_tail=True) + if name == "grep": + return self._pipe_grep(input_text, tokens[1:]) + if name == "sed": + return self._pipe_sed(input_text, tokens[1:]) + raise PIFSCommandError(f"Unsupported pipe command: {name}") + + def _cmd_ls(self, args: list[str]) -> Any: + recursive = False + limit = self.MAX_LS_LIMIT + path = "/" + i = 0 + while i < len(args): + arg = args[i] + if arg in {"-R", "-r", "--recursive"}: + recursive = True + elif arg == "--limit": + i += 1 + limit = self._parse_bounded_int( + args[i], "ls --limit", max_value=self.MAX_LS_LIMIT + ) + elif arg.startswith("-"): + raise PIFSCommandError(f"Unsupported ls option: {arg}") + else: + path = arg + i += 1 + return self.filesystem.browse(path, recursive=recursive, limit=limit) + + def _cmd_tree(self, args: list[str]) -> Any: + path = "/" + limit = self.MAX_TREE_LIMIT + depth = 2 + i = 0 + while i < len(args): + arg = args[i] + if arg == "--limit": + i += 1 + limit = self._parse_bounded_int( + args[i], "tree --limit", max_value=self.MAX_TREE_LIMIT + ) + elif arg in {"--depth", "-L"}: + i += 1 + depth = self._parse_non_negative_int(args[i], "tree --depth") + elif arg.startswith("-"): + raise PIFSCommandError(f"Unsupported tree option: {arg}") + else: + path = arg + i += 1 + if depth < 1: + raise PIFSCommandError("tree --depth must be at least 1") + if depth > self.MAX_TREE_DEPTH: + depth = self.MAX_TREE_DEPTH + listing = self.filesystem.browse(path, recursive=True, limit=limit) + return {"path": path, "depth": depth, "limit": limit, **listing} + + def _cmd_find(self, args: list[str]) -> Any: + path = "/" + where = None + name = None + relation = None + limit = 10 + file_type = None + max_depth = None + i = 0 + while i < len(args): + arg = args[i] + if arg == "--where": + i += 1 + where = args[i] + elif arg == "--name": + i += 1 + name = 
args[i] + elif arg == "--relation": + i += 1 + relation = args[i] + elif arg == "--limit": + i += 1 + limit = self._parse_bounded_int( + args[i], "find --limit", max_value=self.MAX_FIND_LIMIT + ) + elif arg == "-type": + i += 1 + file_type = args[i] + elif arg == "-maxdepth": + i += 1 + max_depth = self._parse_find_maxdepth(args[i] if i < len(args) else None) + elif arg.startswith("-"): + raise PIFSCommandError(f"Unsupported find option: {arg}") + else: + path = arg + i += 1 + if file_type and file_type not in {"f", "d"}: + raise PIFSCommandError("find -type supports only f or d") + if name and relation: + raise PIFSCommandError("find supports only one of --name or --relation") + if file_type == "d": + if where: + return self.filesystem.find_folders( + path, + metadata_filter=where, + limit=limit, + max_depth=max_depth, + ) + folders = self.filesystem.browse( + path, + recursive=True, + limit=limit, + max_depth=max_depth, + )["folders"] + if max_depth is not None and limit != 0: + return [self.filesystem.folder_info(path), *folders][:limit] + return folders + scope = {"folder_path": path, "recursive": True} + if max_depth is not None: + if max_depth == 0: + return [] + scope["max_depth"] = max_depth + if relation: + if not self.filesystem.has_semantic_channel("relation"): + raise PIFSCommandError( + "find --relation requires a relation semantic index in this workspace" + ) + return self.filesystem.search_semantic_channel( + "relation", + self._semantic_retrieval_query(relation), + scope=scope, + metadata_filter=where, + limit=limit, + ) + if name and self.filesystem.has_semantic_channel("entity"): + return self.filesystem.search_semantic_channel( + "entity", + self._semantic_retrieval_query(name), + scope=scope, + metadata_filter=where, + limit=limit, + ) + return self.filesystem.search( + query=name, + scope=scope, + metadata_filter=where, + limit=limit, + semantic=False, + ) + + def _cmd_grep(self, args: list[str]) -> Any: + recursive = False + where = None + 
limit = 10 + positionals = [] + i = 0 + while i < len(args): + arg = args[i] + if arg in {"-R", "-r", "--recursive"}: + recursive = True + elif self._is_combined_grep_flag(arg): + recursive = recursive or "R" in arg or "r" in arg + elif arg in {"-n", "--line-number", "-i", "--ignore-case"}: + pass + elif arg == "--where": + i += 1 + where = args[i] + elif arg == "--limit": + i += 1 + limit = self._parse_bounded_int( + args[i], "grep --limit", max_value=self.MAX_GREP_LIMIT + ) + elif arg.startswith("-"): + raise PIFSCommandError(f"Unsupported grep option: {arg}") + else: + positionals.append(arg) + i += 1 + if not positionals: + raise PIFSCommandError("grep requires a query") + query = positionals[0] + self._reject_regex_alternation_query(query, "grep") + path = positionals[1] if len(positionals) > 1 else "/" + if self._is_folder(path): + normalized = self._normalize_folder_path(path) + if recursive: + limit_notice = self._recursive_grep_limit_notice(normalized, query) + if limit_notice: + return limit_notice + children = self.filesystem.browse(normalized, recursive=False, limit=1000)["folders"] + if children: + direct_results = self.filesystem.search( + query=query, + scope={"folder_path": normalized, "recursive": False}, + metadata_filter=where, + limit=limit, + semantic=False, + ) + if direct_results: + return { + "mode": "files", + "query": query, + "scope": normalized, + "data": self._grep_file_hits_from_results(direct_results, query), + } + if where is None: + direct_source_hits = self._grep_source_file_hits( + normalized, + query, + limit=limit, + direct_only=True, + ) + if direct_source_hits: + return { + "mode": "files", + "query": query, + "scope": normalized, + "data": direct_source_hits, + } + ranked = self._rank_child_folders( + query=query, + children=children, + metadata_filter=where, + limit=limit, + ) + if not ranked and where is None: + ranked = self._rank_child_folders_from_source( + query=query, + parent_path=normalized, + children=children, + 
limit=limit, + ) + return { + "mode": "folders", + "query": query, + "scope": normalized, + "data": ranked, + "hint": "narrow into one directory, then run grep -R again", + } + results = self.filesystem.search( + query=query, + scope={"folder_path": normalized, "recursive": recursive}, + metadata_filter=where, + limit=limit, + semantic=False, + ) + if not results and where is None: + source_hits = self._grep_source_file_hits(normalized, query, limit=limit) + return { + "mode": "files", + "query": query, + "scope": normalized, + "data": source_hits, + } + return { + "mode": "files", + "query": query, + "scope": normalized, + "data": self._grep_file_hits_from_results(results, query), + } + return { + "mode": "matches", + "query": query, + "target": path, + "data": self._grep_file_matches(path, query, limit=limit), + } + + def _cmd_cat(self, args: list[str]) -> Any: + if not args: + raise PIFSCommandError("cat requires a file target") + target = args[0] + if target.startswith("-"): + raise PIFSCommandError( + "cat syntax is target-first: cat --structure, " + "cat --page 31-59, or " + "cat --node 0009" + ) + location = "all" + structural_mode: str | None = None + node_ids: list[str] = [] + page_range: str | None = None + structure_offset = 0 + structure_limit = self.MAX_STRUCTURE_NODES + i = 1 + while i < len(args): + arg = args[i] + if arg == "--range": + i += 1 + if i >= len(args): + raise PIFSCommandError("cat --range requires a range") + location = args[i] + elif arg == "--all": + location = "all" + elif arg == "--structure": + structural_mode = "structure" + elif arg == "--offset": + i += 1 + if i >= len(args): + raise PIFSCommandError("cat --structure --offset requires a value") + structure_offset = self._parse_non_negative_int(args[i], "cat --structure --offset") + elif arg == "--limit": + i += 1 + if i >= len(args): + raise PIFSCommandError("cat --structure --limit requires a value") + structure_limit = self._parse_bounded_int( + args[i], + "cat --structure 
--limit", + max_value=self.MAX_STRUCTURE_NODES, + ) + elif arg == "--node": + i += 1 + if i >= len(args): + raise PIFSCommandError("cat --node requires a node id") + structural_mode = "node" + while i < len(args) and not args[i].startswith("-"): + node_ids.extend(self._parse_node_ids(args[i])) + i += 1 + i -= 1 + elif arg == "--page": + i += 1 + if i >= len(args): + raise PIFSCommandError("cat --page requires a page range") + structural_mode = "page" + page_range = args[i] + elif arg.startswith("-"): + raise PIFSCommandError(f"Unsupported cat option: {arg}") + else: + raise PIFSCommandError( + "cat accepts one file target. Use target-first syntax: " + "cat --structure, " + "cat --node 0002 0004, or " + "cat --page 31-33. " + f"Unexpected extra argument: {arg!r}. If the target path or title contains " + "spaces, quote the whole target, for example: cat \"/documents/report name.pdf\" " + "--structure. If a title-derived path is ambiguous, use the file_ref or " + "document_id instead." + ) + i += 1 + if structural_mode == "structure": + if structure_limit < 1: + raise PIFSCommandError( + "cat --structure --limit must be at least 1 and at most " + f"{self.MAX_STRUCTURE_NODES}." 
+ ) + data = self.filesystem.pageindex_structure( + target, + offset=structure_offset, + limit=structure_limit, + ) + self._attach_structure_next_command(data, target) + return data + if structural_mode == "node": + self._require_at_most( + len(node_ids), + "cat --node node count", + self.MAX_NODE_IDS, + ) + if not node_ids: + raise PIFSCommandError("cat --node requires a node id") + node_results = [ + self._bounded_node_result( + self.filesystem.pageindex_node(target, node_id), + target=target, + node_id=node_id, + ) + for node_id in node_ids + ] + if len(node_results) == 1: + return node_results[0] + return { + "mode": "nodes", + "target": target, + "available": all(result.get("available") is not False for result in node_results), + "node_ids": node_ids, + "nodes": node_results, + "text": "\n\n".join( + f"[node {result.get('node_id') or node_id}]\n{result.get('text', '')}" + for node_id, result in zip(node_ids, node_results) + ), + } + if structural_mode == "page": + if not page_range or not re.fullmatch(r"\d+(?:-\d+)?", page_range): + raise PIFSCommandError( + "cat --page requires one page selector like 31 or 31-59. 
" + "Use: cat --page " + ) + start, end = self._parse_numeric_range(page_range, "cat --page") + self._require_at_most( + end - start + 1, + "cat --page page count", + self.MAX_PAGE_SPAN, + ) + data = self.filesystem.pageindex_pages(target, page_range) + self._attach_page_next_command(data, target, start=start, end=end) + return data + return self._bounded_text_artifact(target, location) + + def _cmd_stat(self, args: list[str]) -> Any: + schema = False + field: str | None = None + targets: list[str] = [] + i = 0 + while i < len(args): + arg = args[i] + if arg == "--schema": + schema = True + elif arg == "--field": + i += 1 + if i >= len(args): + raise PIFSCommandError("stat --field requires a metadata field name") + field = args[i] + elif arg.startswith("-"): + raise PIFSCommandError(f"Unsupported stat option: {arg}") + else: + targets.append(arg) + i += 1 + if schema: + if field or targets: + raise PIFSCommandError("stat --schema cannot be combined with file targets or --field") + return self.filesystem._metadata_schema() + if field: + if not targets: + raise PIFSCommandError("stat --field requires at least one file target") + self._require_at_most( + len(targets), + "stat --field target count", + self.MAX_STAT_FIELD_TARGETS, + ) + self._validate_metadata_field_for_stat(field) + return { + "mode": "field_values", + "field": field, + "target_count": len(targets), + "max_targets": self.MAX_STAT_FIELD_TARGETS, + "data": [self._stat_field_row(field, target) for target in targets], + } + if not targets: + raise PIFSCommandError("stat requires a file target or --schema") + self._require_at_most( + len(targets), + "stat target count", + self.MAX_STAT_FIELD_TARGETS, + ) + if len(targets) == 1: + return {"target": targets[0], **self.filesystem._stat(targets[0])} + return { + "mode": "files", + "target_count": len(targets), + "data": [{"target": target, **self.filesystem._stat(target)} for target in targets], + } + + def _cmd_head(self, args: list[str]) -> Any: + count, 
target = self._parse_standalone_head_tail(args, default_count=10) + count = self._require_at_most(count, "head line count", self.MAX_TEXT_LINES) + opened = self.filesystem.cat_text_artifact(target, "all") + lines = opened.text.splitlines() + text = "\n".join(lines[:count]) + return {**self._jsonable(opened), "text": text, "end_line": min(count, len(lines))} + + def _cmd_tail(self, args: list[str]) -> Any: + count, target = self._parse_standalone_head_tail(args, default_count=10) + count = self._require_at_most(count, "tail line count", self.MAX_TEXT_LINES) + opened = self.filesystem.cat_text_artifact(target, "all") + lines = opened.text.splitlines() + selected = lines[-count:] if count else [] + start_line = max(1, len(lines) - len(selected) + 1) + return { + **self._jsonable(opened), + "text": "\n".join(selected), + "start_line": start_line, + "end_line": len(lines), + } + + def _cmd_sed(self, args: list[str]) -> Any: + if len(args) < 3 or args[0] != "-n": + raise PIFSCommandError("sed supports only: sed -n ',p' ") + match = re.fullmatch(r"(\d+),(\d+)p", args[1]) + if not match: + raise PIFSCommandError("sed supports only: sed -n ',p' ") + start, end = int(match.group(1)), int(match.group(2)) + if start < 1 or end < start: + raise PIFSCommandError("Invalid sed line range") + self._require_at_most(end - start + 1, "sed line count", self.MAX_TEXT_LINES) + return self.filesystem.cat_text_artifact( + args[2], + f"{start}-{end}", + ) + + def _cmd_search_summary(self, args: list[str]) -> Any: + return self._cmd_semantic_channel("summary", args) + + def _cmd_search_entity(self, args: list[str]) -> Any: + return self._cmd_semantic_channel("entity", args) + + def _cmd_search_relation(self, args: list[str]) -> Any: + return self._cmd_semantic_channel("relation", args) + + def _cmd_semantic_grep(self, args: list[str]) -> Any: + recursive = False + where = None + limit = 10 + positionals = [] + i = 0 + while i < len(args): + arg = args[i] + if arg in {"-R", "-r", 
"--recursive"}: + recursive = True + elif self._is_combined_grep_flag(arg): + recursive = recursive or "R" in arg or "r" in arg + elif arg in {"-n", "--line-number", "-i", "--ignore-case"}: + pass + elif arg == "--where": + i += 1 + where = args[i] + elif arg == "--limit": + i += 1 + limit = self._parse_bounded_int( + args[i], "semantic-grep --limit", max_value=self.MAX_SEMANTIC_LIMIT + ) + elif arg.startswith("-"): + raise PIFSCommandError(f"Unsupported semantic-grep option: {arg}") + else: + positionals.append(arg) + i += 1 + if not recursive: + raise PIFSCommandError("semantic-grep requires -R/--recursive") + channels = self._semantic_grep_channels() + if not channels: + raise PIFSCommandError( + "semantic-grep is not available; entity/relation semantic indexes are not configured" + ) + if not positionals: + raise PIFSCommandError("semantic-grep requires a query") + self._validate_search_positionals("semantic-grep", positionals) + query = positionals[0] + self._reject_regex_alternation_query(query, "semantic-grep") + path = positionals[1] if len(positionals) > 1 else "/" + if not self._is_folder(path): + raise PIFSCommandError("semantic-grep target must be a folder") + return self._semantic_recursive_grep( + self._normalize_folder_path(path), + query, + metadata_filter=where, + limit=limit, + channels=channels, + ) + + def _cmd_semantic_channel(self, channel: str, args: list[str]) -> Any: + if not self.filesystem.has_semantic_channel(channel): + raise PIFSCommandError( + f"search-{channel} is not available; {channel} semantic index is not configured" + ) + where = None + limit = 10 + positionals = [] + i = 0 + while i < len(args): + arg = args[i] + if arg == "--where": + i += 1 + where = args[i] + elif arg == "--limit": + i += 1 + limit = self._parse_bounded_int( + args[i], + f"search-{channel} --limit", + max_value=self.MAX_SEMANTIC_LIMIT, + ) + elif arg.startswith("-"): + raise PIFSCommandError(f"Unsupported search-{channel} option: {arg}") + else: + 
positionals.append(arg) + i += 1 + if not positionals: + raise PIFSCommandError(f"search-{channel} requires a query") + self._validate_search_positionals(f"search-{channel}", positionals) + query = positionals[0] + self._reject_regex_alternation_query(query, f"search-{channel}") + path = positionals[1] if len(positionals) > 1 else "/" + normalized = self._normalize_folder_path(path) + results = self.filesystem.search_semantic_channel( + channel, + self._semantic_retrieval_query(query), + scope={"folder_path": normalized, "recursive": True}, + metadata_filter=where, + limit=limit, + ) + return { + "mode": "files", + "query": query, + "scope": normalized, + "retrieval": f"{channel}_vector", + "data": self._semantic_channel_hits_from_results(channel, results, query), + } + + def _semantic_recursive_grep( + self, + folder_path: str, + query: str, + *, + metadata_filter: str | None, + limit: int, + channels: tuple[str, ...], + ) -> dict[str, Any]: + vector_query = str(query or "").strip() + candidate_debug: dict[str, Any] = {} + for channel in channels: + channel_results = self.filesystem.search_semantic_channel( + channel, + vector_query, + scope={"folder_path": folder_path, "recursive": True}, + metadata_filter=metadata_filter, + limit=self.SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT, + ) + matches = self._grep_file_hits_from_results( + channel_results, + query, + require_match=True, + limit=limit, + ) + candidate_debug[channel] = { + "candidates": len(channel_results), + "line_matches": len(matches), + "candidate_doc_ids": [ + getattr(result, "external_id", None) + for result in channel_results[:5] + ], + } + if matches: + return { + "mode": "files", + "query": query, + "scope": folder_path, + "retrieval": "semantic_grep_" + "_then_".join(channels), + "candidate_limit_per_channel": self.SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT, + "matched_channel": channel, + "candidate_debug": candidate_debug, + "data": matches, + } + return { + "mode": "files", + "query": query, + "scope": 
folder_path, + "retrieval": "semantic_grep_" + "_then_".join(channels), + "candidate_limit_per_channel": self.SEMANTIC_GREP_VECTOR_CANDIDATE_LIMIT, + "matched_channel": "", + "candidate_debug": candidate_debug, + "data": [], + } + + def _semantic_grep_channels(self) -> tuple[str, ...]: + available = set(self.filesystem.semantic_retrieval_channels()) + return tuple(channel for channel in SEMANTIC_GREP_CHANNELS if channel in available) + + def _bounded_text_artifact(self, target: str, location: str) -> dict[str, Any]: + if str(location).strip().lower() in {"all", "full", "*"}: + start, end = 1, self.MAX_TEXT_LINES + else: + start, end = self._parse_numeric_range(location, "cat --range") + self._require_at_most( + end - start + 1, + "cat --range line count", + self.MAX_TEXT_LINES, + ) + opened = self.filesystem.cat_text_artifact(target, f"{start}-{end}") + data = self._jsonable(opened) + total_lines = len(self.filesystem.store.read_text(opened.file_ref).splitlines()) + has_more = int(data.get("end_line") or end) < total_lines + pagination = { + "offset_line": start, + "limit": self.MAX_TEXT_LINES, + "returned_lines": max(0, int(data.get("end_line") or end) - start + 1), + "total_lines": total_lines, + "has_more": has_more, + "next_range": None, + "next_command": None, + } + if has_more: + next_start = int(data.get("end_line") or end) + 1 + next_end = min(total_lines, next_start + self.MAX_TEXT_LINES - 1) + next_range = f"{next_start}-{next_end}" + pagination["next_range"] = next_range + pagination["next_command"] = ( + f"cat {shlex.quote(target)} --range {shlex.quote(next_range)}" + ) + data["text"] = ( + str(data.get("text") or "").rstrip() + + "\n" + + self._pagination_footer( + "cat --all", + f"showing lines {start}-{data.get('end_line')} of {total_lines}", + str(pagination["next_command"]), + ) + ).strip() + data["pagination"] = pagination + return data + + def _bounded_node_result( + self, + data: dict[str, Any], + *, + target: str, + node_id: str, + ) -> 
dict[str, Any]: + if not isinstance(data, dict) or data.get("available") is False: + return data + text = str(data.get("text") or "") + lines = text.splitlines() + truncated_by_lines = len(lines) > self.MAX_NODE_TEXT_LINES + truncated_by_chars = len(text) > self.MAX_NODE_TEXT_CHARS + if not truncated_by_lines and not truncated_by_chars: + data["node_pagination"] = { + "limit_nodes": self.MAX_NODE_IDS, + "text_truncated": False, + } + return data + + selected = "\n".join(lines[: self.MAX_NODE_TEXT_LINES]) + if len(selected) > self.MAX_NODE_TEXT_CHARS: + selected = selected[: self.MAX_NODE_TEXT_CHARS].rstrip() + data["text"] = ( + selected.rstrip() + + "\n" + + self._pagination_footer( + "cat --node", + ( + f"node text limited to {self.MAX_NODE_TEXT_LINES} lines/" + f"{self.MAX_NODE_TEXT_CHARS} chars" + ), + f"cat {shlex.quote(target)} --structure", + ) + ).strip() + data["node_pagination"] = { + "limit_nodes": self.MAX_NODE_IDS, + "line_limit": self.MAX_NODE_TEXT_LINES, + "char_limit": self.MAX_NODE_TEXT_CHARS, + "original_lines": len(lines), + "original_chars": len(text), + "text_truncated": True, + "suggested_command": f"cat {shlex.quote(target)} --structure", + "node_id": node_id, + } + return data + + def _attach_structure_next_command(self, data: dict[str, Any], target: str) -> None: + pagination = data.get("structure_pagination") + if not isinstance(pagination, dict): + return + if pagination.get("has_more") and pagination.get("next_offset") is not None: + next_command = ( + f"cat {shlex.quote(target)} --structure " + f"--offset {pagination['next_offset']} --limit {pagination['limit']}" + ) + pagination["next_command"] = next_command + else: + pagination["next_command"] = None + + def _attach_page_next_command( + self, + data: dict[str, Any], + target: str, + *, + start: int, + end: int, + ) -> None: + page_count = end - start + 1 + next_command = None + if page_count == self.MAX_PAGE_SPAN: + next_start = end + 1 + next_end = next_start + self.MAX_PAGE_SPAN - 
1 + next_command = f"cat {shlex.quote(target)} --page {next_start}-{next_end}" + data["page_pagination"] = { + "start": start, + "end": end, + "returned_pages": page_count, + "limit": self.MAX_PAGE_SPAN, + "next_command": next_command, + } + + @staticmethod + def _pagination_footer(command: str, reason: str, next_command: str) -> str: + return ( + f"# output limited by {command}: {reason}. " + f"Next: {next_command}. If unsure, use cat --structure." + ) + + @staticmethod + def _parse_node_ids(value: str) -> list[str]: + return [part.strip() for part in value.split(",") if part.strip()] + + @staticmethod + def _reject_regex_alternation_query(query: str, command_name: str) -> None: + if "|" not in str(query): + return + raise PIFSCommandError( + f"{command_name} does not support regex alternation '|'. " + "Run multiple grep commands or multiple search-summary commands " + "with one phrase each." + ) + + @staticmethod + def _validate_search_positionals(command_name: str, positionals: list[str]) -> None: + if len(positionals) > 2: + raise PIFSCommandError( + f"{command_name} accepts one query and an optional folder path. " + f"Quote multi-word queries, for example: {command_name} " + '"Federal Reserve" /documents' + ) + if len(positionals) == 2 and not positionals[1].startswith("/"): + raise PIFSCommandError( + f"{command_name} target must be a PIFS folder path like /documents. 
" + f"If your query has spaces, quote it, for example: {command_name} " + '"Federal Reserve" /documents' + ) + + @staticmethod + def _parse_numeric_range(value: str, label: str) -> tuple[int, int]: + try: + if "-" in value: + left, right = value.split("-", 1) + start, end = int(left), int(right) + else: + start = end = int(value) + except ValueError as exc: + raise PIFSCommandError(f"{label} requires a numeric range") from exc + if start < 1 or end < start: + raise PIFSCommandError(f"Invalid {label} range: {value}") + return start, end + + def _validate_metadata_field_for_stat(self, field: str) -> None: + schema = self.filesystem._metadata_schema() + fields = schema.get("fields", {}) + if field not in fields: + available = ", ".join(sorted(fields)[:20]) or "(none)" + raise PIFSCommandError( + f"Unknown metadata field: {field}. Use stat --schema to inspect fields. " + f"Available fields include: {available}" + ) + + def _stat_field_row(self, field: str, target: str) -> dict[str, Any]: + info = self.filesystem._stat(target) + folder_paths = [ + folder.get("path", "") + for folder in info.get("folders", []) + if folder.get("path") + ] + row = dict(info) + row["target"] = target + row["folder_paths"] = folder_paths + metadata = info.get("metadata") or {} + raw_value = metadata.get(field) + row.update( + { + "field": field, + "present": field in metadata, + "value": raw_value if field in metadata else None, + "display_target": self._file_target_path(row), + } + ) + return row + + def _render(self, data: Any, *, json_output: bool, command_name: str) -> str: + jsonable = self._jsonable(data) + if json_output: + return json.dumps({"ok": True, "data": jsonable}, ensure_ascii=False) + return self._render_shell(command_name, jsonable) + + def _render_shell(self, command_name: str, data: Any) -> str: + if command_name == "cat": + return self._render_cat(data) + if command_name == "ls": + return self._render_listing(data) + if command_name == "tree": + return 
self._render_tree(data) + if command_name in {"grep", "semantic-grep"}: + return self._render_grep(data) + if command_name in {"search-summary", "search-entity", "search-relation"}: + return self._render_semantic_search(data) + if command_name == "find": + return self._render_find(data) + if command_name == "stat": + return self._render_stat(data) + if command_name in {"head", "tail", "sed"}: + return str(data.get("text", "")) if isinstance(data, dict) else str(data) + if isinstance(data, dict): + return "\n".join(f"{key}: {value}" for key, value in data.items()) + if isinstance(data, list): + return "\n".join(str(item) for item in data) + return str(data) + + def _render_cat(self, data: Any) -> str: + if not isinstance(data, dict): + return str(data) + if data.get("available") is False: + return f"# {data.get('message', 'PageIndex structural content is unavailable')}" + if data.get("mode") == "structure": + return json.dumps( + { + "structure": data.get("structure", []), + "pagination": data.get("structure_pagination", {}), + }, + ensure_ascii=False, + indent=2, + ) + return str(data.get("text", "")) + + def _render_listing(self, data: Any) -> str: + if not isinstance(data, dict): + return str(data) + lines: list[str] = [] + for folder in data.get("folders", []): + name = folder["path"] if folder.get("path", "").startswith("/") else folder["name"] + if not name.endswith("/"): + name = f"{name}/" + lines.append( + f"{name} folders={folder.get('children_count', 0)} files={folder.get('file_count', 0)}" + ) + files = data.get("files", []) + for file in files[: self.MAX_LS_RENDER_FILES]: + lines.append(self._file_row_text(file)) + if len(files) > self.MAX_LS_RENDER_FILES: + remaining = len(files) - self.MAX_LS_RENDER_FILES + lines.append( + f"# ... 
{remaining} more files omitted from ls output; use grep/find to search this folder" + ) + return "\n".join(lines) + + def _render_tree(self, data: Any) -> str: + if not isinstance(data, dict): + return str(data) + root = self._normalize_folder_path(data.get("path", "/")) + max_depth = int(data.get("depth", 2)) + lines = [root] + folders = [ + folder + for folder in data.get("folders", []) + if self._relative_depth(root, folder["path"]) <= max_depth + ] + for folder in folders: + depth = self._relative_depth(root, folder["path"]) + indent = " " * max(depth - 1, 0) + lines.append( + f"{indent}{folder['name']}/ folders={folder.get('children_count', 0)} " + f"files={folder.get('file_count', 0)}" + ) + if len(folders) < len(data.get("folders", [])): + lines.append(f"# truncated at depth={max_depth}") + return "\n".join(lines) + + def _render_grep(self, data: Any) -> str: + if not isinstance(data, dict): + return str(data) + mode = data.get("mode") + if mode == "folders": + lines = [f"# folder matches for: {data.get('query', '')}"] + for folder in data.get("data", []): + path = folder["path"] + if not path.endswith("/"): + path = f"{path}/" + lines.append( + f"{path} matched_files={folder.get('matched_files', 0)} " + f"files={folder.get('files', 0)}" + ) + lines.append(f"# {data.get('hint', 'narrow into one directory, then run grep -R again')}") + return "\n".join(lines) + if mode == "limited": + query = str(data.get("query") or "") + scope = str(data.get("scope") or "/") + suggested_commands = list(data.get("suggested_commands") or []) + lines = [ + f"# grep -R skipped for broad folder: {scope}", + ( + "# reason: recursive lexical grep is limited when a folder is deeper " + f"than {data.get('folder_depth_limit', self.GREP_RECURSIVE_FOLDER_DEPTH_LIMIT)} " + f"levels or has more than {data.get('file_count_limit', self.GREP_RECURSIVE_FOLDER_FILE_LIMIT)} files" + ), + ] + if suggested_commands: + lines.extend(f"# suggested: {command}" for command in suggested_commands) + 
lines.append("# also try: narrow with ls/tree/find --where") + else: + lines.append("# suggested: narrow with ls/tree/find --where") + if data.get("sample_deep_folder_path"): + lines.append(f"# deep descendant example: {data['sample_deep_folder_path']}/") + return "\n".join(lines) + if mode == "files": + if not data.get("data", []): + return f"# no matches for: {data.get('query', '')}" + return "\n".join( + self._grep_file_hit_text(item) + for item in data.get("data", []) + ) + if mode == "matches": + return "\n".join( + f"{self._file_target_path(item)}:{item['line']}: " + f"{self._compact_text(item['text'], max_chars=220)}" + for item in data.get("data", []) + ) + return str(data) + + def _render_semantic_search(self, data: Any) -> str: + if not isinstance(data, dict): + return str(data) + if data.get("mode") != "files": + return self._render_grep(data) + if not data.get("data", []): + return f"# no matches for: {data.get('query', '')}" + lines: list[str] = [] + for item in data.get("data", []): + lines.append(f"path: {item.get('path') or '-'}") + lines.append(f"summary: {self._one_line_value(item.get('summary') or '')}") + if "entity" in item: + lines.append(f"entity: {self._one_line_value(item.get('entity') or '')}") + if "relation" in item: + lines.append(f"relation: {self._one_line_value(item.get('relation') or '')}") + line_text = self._one_line_value(item.get("line_text") or "") + lines.append(f"line_text: {line_text or '-'}") + lines.append("") + return "\n".join(lines).rstrip() + + def _render_find(self, data: Any) -> str: + if not isinstance(data, list): + return str(data) + if data and isinstance(data[0], dict) and "path" in data[0] and "file_ref" not in data[0]: + return "\n".join( + ( + f"{self._folder_row_path(item['path'])} matched_files={item['matched_files']} " + f"files={item.get('file_count', 0)}" + if item.get("matched_files") + else f"{self._folder_row_path(item['path'])} folders={item.get('children_count', 0)} " + 
f"files={item.get('file_count', 0)}" + ) + for item in data + ) + return "\n".join(self._file_row_text(item) for item in data) + + def _folder_row_path(self, path: str) -> str: + normalized = self._normalize_folder_path(path) + return "/" if normalized == "/" else f"{normalized}/" + + def _render_stat(self, data: Any) -> str: + if not isinstance(data, dict): + return str(data) + if "fields" in data: + lines = ["metadata schema:"] + for name, field in sorted(data["fields"].items()): + lines.append(f"{name}: {field.get('type', 'string')}") + return "\n".join(lines) + if data.get("mode") == "field_values": + field = data.get("field", "") + lines = [] + for item in data.get("data", []): + lines.append(f"{item.get('display_target') or item.get('target')}:") + value = item.get("value") + if value is None: + lines.append(f"{field}: -") + else: + lines.append(f"{field}: {self._one_line_value(value)}") + return "\n\n".join(lines) + if data.get("mode") == "files": + return "\n\n".join(self._render_stat(item) for item in data.get("data", [])) + lines = [ + f"target: {data.get('target') or data.get('file_ref')}", + f"file_ref: {data.get('file_ref')}", + f"document_id: {data.get('external_id') or data.get('document_id') or '-'}", + f"source_path: {data.get('source_path') or '-'}", + f"storage_uri: {data.get('storage_uri') or '-'}", + ] + folders = data.get("folders") or [] + if folders: + lines.append("folders:") + lines.extend(f" {folder['path']}" for folder in folders) + metadata = data.get("metadata") or {} + if metadata: + lines.append("metadata:") + metadata_items = sorted(metadata.items())[: self.MAX_STAT_METADATA_FIELDS] + for key, value in metadata_items: + lines.append(f" {key}: {self._compact_value(value)}") + if len(metadata) > self.MAX_STAT_METADATA_FIELDS: + lines.append(f" ... 
{len(metadata) - self.MAX_STAT_METADATA_FIELDS} more fields") + metadata_status = data.get("metadata_status") or {} + if metadata_status: + lines.append(f"metadata_status: {metadata_status.get('status', '-')}") + pageindex_tree = metadata_status.get("pageindex_tree") or {} + if isinstance(pageindex_tree, dict) and pageindex_tree: + lines.append(f"pageindex_tree_status: {pageindex_tree.get('status', '-')}") + message = str(pageindex_tree.get("message") or "").strip() + error_type = str(pageindex_tree.get("error_type") or "").strip() + if error_type and message: + lines.append(f"pageindex_tree_error: {error_type}: {message}") + elif message or error_type: + lines.append(f"pageindex_tree_error: {message or error_type}") + summary_projection = ( + metadata_status.get("projection_indexes", {}).get("summary", {}) + ) + if summary_projection: + lines.append( + f"summary_projection_status: {summary_projection.get('status', '-')}" + ) + return "\n".join(lines) + + def _file_row_text(self, item: dict[str, Any]) -> str: + file_ref = item.get("file_ref") + doc_id = item.get("external_id") or item.get("document_id") or "-" + title = self._compact_text(item.get("title") or item.get("name") or "", max_chars=80) + source_path = item.get("source_path") or "-" + folder_paths = item.get("folder_paths") or self._folder_paths_for_file(file_ref) + folders = f" folders={','.join(folder_paths)}" if folder_paths else "" + target = self._file_target_path(item) + return f"{target} id={doc_id} file_ref={file_ref or '-'} title={title} source={source_path}{folders}".strip() + + def _grep_file_hit_text(self, item: dict[str, Any]) -> str: + doc_id = item.get("external_id") or "-" + line = item.get("line") or 1 + target = self._file_target_path(item) + return ( + f"{target}:{line}: id={doc_id} " + f"{self._compact_text(item.get('text') or '', max_chars=180)}" + ) + + def _file_target_path(self, item: dict[str, Any]) -> str: + file_ref = item.get("file_ref") + title = str(item.get("title") or 
item.get("name") or "").strip() + folder_paths = item.get("folder_paths") or [] + folder_path = item.get("folder_path") + if not folder_paths and folder_path: + folder_paths = [folder_path] + if not folder_paths: + folder_paths = self._folder_paths_for_file(file_ref) + if folder_paths and title: + folder = str(folder_paths[0] or "/").rstrip("/") + return f"{folder}/{title}" if folder else f"/{title}" + return str(item.get("source_path") or item.get("external_id") or file_ref or "-") + + def _stable_file_target_path(self, item: dict[str, Any]) -> str: + file_ref = str(item.get("file_ref") or "").strip() + source_path = str(item.get("source_path") or "").strip() + if source_path: + target = "/" + source_path.strip("/") + try: + if not file_ref or self.filesystem.store.resolve_file_ref(target) == file_ref: + return target + except KeyError: + pass + external_id = str(item.get("external_id") or "").strip() + if external_id: + return external_id + if file_ref: + return file_ref + return str(item.get("external_id") or item.get("file_ref") or "-") + + def _semantic_retrieval_query(self, query: str) -> str: + query = str(query or "").strip() + context = str(self.query_context or "").strip() + if context and query and query.lower() not in context.lower(): + return f"{context}\nSearch phrase: {query}" + return context or query + + def _recursive_grep_limit_notice(self, folder_path: str, query: str) -> dict[str, Any] | None: + stats = self.filesystem.store.folder_subtree_thresholds( + folder_path, + depth_limit=self.GREP_RECURSIVE_FOLDER_DEPTH_LIMIT, + file_limit=self.GREP_RECURSIVE_FOLDER_FILE_LIMIT, + ) + if not ( + stats["folder_depth_exceeds_limit"] + or stats["file_count_exceeds_limit"] + ): + return None + suggested_commands = self._semantic_alternative_commands(query, folder_path) + semantic_hint = ( + "Use " + "; ".join(suggested_commands) + " to discover candidates. 
def _semantic_alternative_commands(self, query: str, folder_path: str) -> list[str]:
    """List semantic search commands that can replace a limited ``grep -R``.

    ``semantic-grep -R`` is suggested first when any semantic grep channel
    is reported, followed by one command per retrieval channel the
    filesystem has. Query and folder are shell-quoted so each suggestion
    can be pasted back as-is.
    """
    arguments = f"{shlex.quote(query)} {shlex.quote(folder_path)}"
    suggestions: list[str] = []
    if self._semantic_grep_channels():
        suggestions.append(f"semantic-grep -R {arguments}")
    suggestions.extend(
        f"{self.SEMANTIC_CHANNEL_COMMANDS[channel]} {arguments}"
        for channel in SEMANTIC_RETRIEVAL_CHANNELS
        if self.filesystem.has_semantic_channel(channel)
    )
    return suggestions
ranked.sort(key=lambda item: (-item["matched_files"], item["path"])) + return ranked[:limit] + + def _grep_file_hits_from_results( + self, + results: list[Any], + query: str, + *, + require_match: bool = False, + limit: int | None = None, + ) -> list[dict[str, Any]]: + hits = [] + for result in results: + line, text = self._first_matching_line(result.file_ref, query) + if require_match and not text: + continue + hits.append( + { + "file_ref": result.file_ref, + "external_id": result.external_id, + "title": result.title, + "source_path": result.source_path, + "folder_paths": result.folder_paths, + "line": line, + "text": text or result.snippet, + } + ) + if limit is not None and len(hits) >= limit: + break + return hits + + def _semantic_channel_hits_from_results( + self, + channel: str, + results: list[Any], + query: str, + ) -> list[dict[str, Any]]: + hits = [] + for result in results: + metadata = result.metadata or {} + line, text = self._first_matching_line(result.file_ref, query) + line_text = "" + if text: + line_text = f"{line}: {self._compact_text(text, max_chars=220)}" + hit = { + "path": self._stable_file_target_path( + { + "file_ref": result.file_ref, + "title": result.title, + "folder_paths": result.folder_paths, + "source_path": result.source_path, + "external_id": result.external_id, + } + ), + "summary": metadata.get("summary") or "", + "line_text": line_text, + } + if channel in {"entity", "relation"}: + hit[channel] = metadata.get(channel) or "" + hits.append(hit) + return hits + + def _rank_child_folders_from_source( + self, + *, + query: str, + parent_path: str, + children: list[dict[str, Any]], + limit: int, + ) -> list[dict[str, Any]]: + source_dir = self._source_dir_for_folder(parent_path) + source_root = self._source_root() + if source_dir is None or source_root is None: + return [] + child_paths = {child["path"]: child for child in children} + counts: dict[str, int] = {} + for path in self._rg_candidate_files(query, source_dir, 
def _grep_source_file_hits(
    self,
    folder_path: str,
    query: str,
    *,
    limit: int,
    direct_only: bool = False,
) -> list[dict[str, Any]]:
    """Collect grep hits by scanning the stored source files under a folder.

    Candidate files come from ``_rg_candidate_files``; each is mapped back
    to its registered file row and kept only if a line actually matches.

    Args:
        folder_path: PIFS folder whose source directory is scanned.
        query: Text to match.
        limit: Maximum number of hits to return.
        direct_only: When true, keep only files whose own folder equals
            ``folder_path`` (no descendants).

    Returns:
        Hit rows, or an empty list when the source root or directory
        cannot be determined.
    """
    source_dir = self._source_dir_for_folder(folder_path)
    source_root = self._source_root()
    if source_dir is None or source_root is None:
        return []
    found: list[dict[str, Any]] = []
    # Over-fetch candidates because some are dropped by the checks below.
    candidate_budget = max(limit * 10, 50)
    for storage_path in self._rg_candidate_files(query, source_dir, max_files=candidate_budget):
        record = self._file_row_for_storage(storage_path)
        if not record:
            continue
        if direct_only:
            owning_folder = self._folder_path_for_source_path(record["source_path"])
            if owning_folder != folder_path:
                continue
        line_number, excerpt = self._first_matching_source_line(storage_path, query)
        if line_number is None:
            continue
        found.append(
            {
                "file_ref": record["file_ref"],
                "external_id": record["external_id"],
                "title": record["title"],
                "source_path": record["source_path"],
                "folder_paths": self._folder_paths_for_file(record["file_ref"]),
                "line": line_number,
                "text": excerpt or record["title"],
            }
        )
        if len(found) >= limit:
            break
    return found
enumerate(self.filesystem.store.read_text(file_ref).splitlines(), 1): + if self._line_matches(line, query): + matches.append( + { + "file_ref": file_ref, + "external_id": entry.external_id, + "title": entry.title, + "source_path": entry.source_path, + "folder_paths": self._folder_paths_for_file(file_ref), + "line": line_number, + "text": self._compact_text(line, max_chars=220), + } + ) + if len(matches) >= limit: + break + return matches + + def _first_matching_line(self, file_ref: str, query: str) -> tuple[int, str]: + for line_number, line in enumerate(self.filesystem.store.read_text(file_ref).splitlines(), 1): + if self._line_matches(line, query): + return line_number, self._compact_text(line, max_chars=220) + return 1, "" + + def _line_matches(self, line: str, query: str) -> bool: + haystack = line.lower() + needle = query.lower().strip() + if needle and needle in haystack: + return True + terms = [term for term in re.findall(r"[A-Za-z0-9_]+", needle) if term] + return bool(terms) and all(term in haystack for term in terms) + + @staticmethod + def _is_combined_grep_flag(arg: str) -> bool: + return bool(re.fullmatch(r"-[Rrni]+", arg)) and len(arg) > 2 + + def _rg_candidate_files(self, query: str, directory: Path, *, max_files: int) -> list[Path]: + if not directory.exists(): + return [] + terms = [term.lower() for term in re.findall(r"[A-Za-z0-9_]{3,}", query)] + if not terms: + return [] + primary = max(terms, key=len) + try: + completed = subprocess.run( + [ + "rg", + "-l", + "-i", + "-F", + primary, + str(directory), + "--glob", + "*.json", + "--no-messages", + ], + check=False, + capture_output=True, + text=True, + timeout=20, + ) + except (OSError, subprocess.TimeoutExpired): + return [] + candidates = [Path(line) for line in completed.stdout.splitlines() if line.strip()] + filtered = [] + for path in candidates[: max(max_files * 20, max_files)]: + try: + text = path.read_text(encoding="utf-8", errors="ignore").lower() + except OSError: + continue + if 
all(term in text for term in terms): + filtered.append(path) + if len(filtered) >= max_files: + break + return filtered + + def _first_matching_source_line(self, path: Path, query: str) -> tuple[int | None, str]: + try: + lines = path.read_text(encoding="utf-8", errors="ignore").splitlines() + except OSError: + return None, "" + for line_number, line in enumerate(lines, 1): + if self._line_matches(line, query): + return line_number, self._compact_text(line, max_chars=220) + return None, "" + + def _source_root(self) -> Path | None: + with self.filesystem.store.connect() as conn: + row = conn.execute( + """ + SELECT storage_uri, source_path + FROM files + WHERE deleted_at IS NULL + LIMIT 1 + """ + ).fetchone() + if row is None: + return None + storage_path = Path(row["storage_uri"]) + source_path = Path(row["source_path"]) + root = storage_path + for _ in range(len(source_path.parts)): + root = root.parent + return root + + def _source_dir_for_folder(self, folder_path: str) -> Path | None: + source_root = self._source_root() + if source_root is None: + return None + stripped = folder_path.strip("/") + return source_root / stripped if stripped else source_root + + @staticmethod + def _source_path_from_storage(path: Path, source_root: Path) -> str: + try: + return path.relative_to(source_root).as_posix() + except ValueError: + return path.name + + @staticmethod + def _matching_child_path( + parent_path: str, + folder_path: str, + child_paths: dict[str, dict[str, Any]], + ) -> str | None: + normalized_parent = parent_path.rstrip("/") + if normalized_parent == "": + normalized_parent = "/" + if normalized_parent == "/": + parts = [part for part in folder_path.strip("/").split("/") if part] + candidate = "/" + parts[0] if parts else "/" + return candidate if candidate in child_paths else None + prefix = normalized_parent + "/" + if not folder_path.startswith(prefix): + return None + remainder = folder_path[len(prefix):] + first = remainder.split("/", 1)[0] + candidate = 
prefix + first + return candidate if candidate in child_paths else None + + def _file_row_for_storage(self, path: Path) -> dict[str, Any] | None: + storage_uri = str(path) + with self.filesystem.store.connect() as conn: + row = conn.execute( + """ + SELECT file_ref, external_id, title, source_path + FROM files + WHERE storage_uri = ? AND deleted_at IS NULL + LIMIT 1 + """, + (storage_uri,), + ).fetchone() + if row is None: + return None + return { + "file_ref": row["file_ref"], + "external_id": row["external_id"], + "title": row["title"], + "source_path": row["source_path"], + } + + @staticmethod + def _folder_path_for_source_path(source_path: str) -> str: + parent = str(Path(source_path).parent).strip(".") + return "/" + parent.strip("/") if parent and parent != "." else "/" + + def _folder_paths_for_file(self, file_ref: str | None) -> list[str]: + if not file_ref: + return [] + try: + return [folder["path"] for folder in self.filesystem.store.folder_memberships(file_ref)] + except KeyError: + return [] + + def _is_folder(self, path: str) -> bool: + try: + self.filesystem.browse(path, recursive=False, limit=1) + return True + except KeyError: + return False + + @staticmethod + def _normalize_folder_path(path: str) -> str: + value = str(path or "/").strip() + if not value or value == "/": + return "/" + return "/" + value.strip("/") + + @classmethod + def _relative_depth(cls, root: str, path: str) -> int: + root = cls._normalize_folder_path(root).rstrip("/") + path = cls._normalize_folder_path(path).rstrip("/") + if root == "": + root = "/" + if root == "/": + rel = path.strip("/") + else: + rel = path[len(root):].strip("/") + return 0 if not rel else len(rel.split("/")) + + @classmethod + def _compact_value(cls, value: Any) -> str: + if isinstance(value, list): + rendered = ", ".join(cls._compact_text(str(item), max_chars=40) for item in value[:3]) + if len(value) > 3: + rendered += f", ... 
{len(value) - 3} more" + return rendered + if isinstance(value, dict): + return cls._compact_text(json.dumps(value, ensure_ascii=False, sort_keys=True), max_chars=120) + return cls._compact_text(str(value), max_chars=120) + + @staticmethod + def _one_line_value(value: Any) -> str: + if isinstance(value, (dict, list)): + value = json.dumps(value, ensure_ascii=False, sort_keys=True) + return re.sub(r"\s+", " ", str(value or "")).strip() + + @staticmethod + def _compact_text(text: str, *, max_chars: int) -> str: + collapsed = re.sub(r"\s+", " ", text or "").strip() + if len(collapsed) <= max_chars: + return collapsed + return collapsed[: max_chars - 3].rstrip() + "..." + + @staticmethod + def _clean_error_message(exc: BaseException) -> str: + message = str(exc) + if isinstance(exc, KeyError) and len(exc.args) == 1: + message = str(exc.args[0]) + return message or exc.__class__.__name__ + + @classmethod + def _jsonable(cls, value: Any) -> Any: + if is_dataclass(value): + return asdict(value) + if isinstance(value, list): + return [cls._jsonable(item) for item in value] + if isinstance(value, dict): + return {key: cls._jsonable(item) for key, item in value.items()} + return value + + @classmethod + def _validate_raw_command(cls, command: str) -> None: + if any(token in command for token in cls.FORBIDDEN_SUBSTRINGS): + raise PIFSCommandError("Only PageIndex FileSystem commands are allowed") + + @classmethod + def _validate_tokens(cls, tokens: list[str]) -> None: + if any(token in cls.FORBIDDEN_TOKENS for token in tokens): + raise PIFSCommandError("Only PageIndex FileSystem commands are allowed") + + @classmethod + def _split_chained_commands(cls, command: str) -> list[str]: + return cls._split_unquoted_operator(command, "&&", reject_single_amp=True) + + @classmethod + def _split_piped_commands(cls, command: str) -> list[str]: + return cls._split_unquoted_operator(command, "|") + + @classmethod + def _split_unquoted_operator( + cls, + command: str, + operator: str, + *, + 
reject_single_amp: bool = False, + ) -> list[str]: + cls._validate_raw_command(command) + parts: list[str] = [] + current: list[str] = [] + quote: str | None = None + escaped = False + i = 0 + while i < len(command): + char = command[i] + if escaped: + current.append(char) + escaped = False + i += 1 + continue + if char == "\\" and quote != "'": + current.append(char) + escaped = True + i += 1 + continue + if quote: + current.append(char) + if char == quote: + quote = None + i += 1 + continue + if char in {"'", '"'}: + quote = char + current.append(char) + i += 1 + continue + if command.startswith(operator, i): + part = "".join(current).strip() + if not part: + raise PIFSCommandError("Invalid command syntax") + parts.append(part) + current = [] + i += len(operator) + continue + if reject_single_amp and char == "&": + raise PIFSCommandError("Only PageIndex FileSystem commands are allowed") + current.append(char) + i += 1 + part = "".join(current).strip() + if quote: + raise PIFSCommandError("Invalid command syntax: No closing quotation") + if not part: + raise PIFSCommandError("Invalid command syntax") + parts.append(part) + return parts + + def _pipe_head_tail(self, input_text: str, args: list[str], *, from_tail: bool) -> str: + count = self._parse_head_tail_count(args) + count = self._require_at_most( + count, + "pipe head/tail line count", + self.MAX_TEXT_LINES, + ) + payload = self._try_json_loads(input_text) + if payload is not None: + return self._render_json_payload(self._slice_payload(payload, count, from_tail=from_tail)) + lines = input_text.splitlines() + selected = [] if count == 0 else lines[-count:] if from_tail else lines[:count] + return "\n".join(selected) + + def _pipe_grep(self, input_text: str, args: list[str]) -> str: + ignore_case = False + invert = False + regex = False + patterns: list[str] = [] + for arg in args: + if arg in {"-i", "--ignore-case"}: + ignore_case = True + elif arg in {"-v", "--invert-match"}: + invert = True + elif arg in 
{"-E", "--extended-regexp"}: + regex = True + elif arg.startswith("-"): + raise PIFSCommandError(f"Unsupported pipe grep option: {arg}") + else: + patterns.append(arg) + if len(patterns) != 1: + raise PIFSCommandError("pipe grep requires exactly one pattern") + pattern = patterns[0] + self._reject_regex_alternation_query(pattern, "pipe grep") + payload = self._try_json_loads(input_text) + if payload is not None: + return self._render_json_payload( + self._filter_payload( + payload, + pattern, + ignore_case=ignore_case, + invert=invert, + regex=regex, + ) + ) + filtered = [ + line + for line in input_text.splitlines() + if self._text_matches(line, pattern, ignore_case=ignore_case, invert=invert, regex=regex) + ] + return "\n".join(filtered) + + def _pipe_sed(self, input_text: str, args: list[str]) -> str: + if not args: + raise PIFSCommandError("pipe sed requires an expression") + if args[0] == "-n": + args = args[1:] + if len(args) != 1: + raise PIFSCommandError("pipe sed supports only -n ',p'") + match = re.fullmatch(r"(\d+)(?:,(\d+))?p", args[0]) + if not match: + raise PIFSCommandError("pipe sed supports only -n ',p'") + start = int(match.group(1)) + end = int(match.group(2) or match.group(1)) + if start < 1 or end < start: + raise PIFSCommandError("Invalid sed line range") + self._require_at_most(end - start + 1, "pipe sed line count", self.MAX_TEXT_LINES) + payload = self._try_json_loads(input_text) + if payload is not None: + return self._render_json_payload(self._slice_text_payload(payload, start, end)) + lines = input_text.splitlines() + return "\n".join(lines[start - 1 : end]) + + @staticmethod + def _parse_head_tail_count(args: list[str]) -> int: + count = 10 + i = 0 + while i < len(args): + arg = args[i] + if arg == "-n": + i += 1 + if i >= len(args): + raise PIFSCommandError("head/tail -n requires a count") + count = PIFSCommandExecutor._parse_non_negative_int(args[i], "head/tail count") + elif re.fullmatch(r"-\d+", arg): + count = 
PIFSCommandExecutor._parse_non_negative_int(arg[1:], "head/tail count") + elif arg.startswith("-"): + raise PIFSCommandError(f"Unsupported head/tail option: {arg}") + else: + count = PIFSCommandExecutor._parse_non_negative_int(arg, "head/tail count") + i += 1 + return count + + @staticmethod + def _parse_standalone_head_tail(args: list[str], *, default_count: int) -> tuple[int, str]: + count = default_count + target = "" + i = 0 + while i < len(args): + arg = args[i] + if arg == "-n": + i += 1 + if i >= len(args): + raise PIFSCommandError("head/tail -n requires a count") + count = PIFSCommandExecutor._parse_non_negative_int(args[i], "head/tail count") + elif re.fullmatch(r"-\d+", arg): + count = PIFSCommandExecutor._parse_non_negative_int(arg[1:], "head/tail count") + elif arg.startswith("-"): + raise PIFSCommandError(f"Unsupported head/tail option: {arg}") + else: + target = arg + i += 1 + if not target: + raise PIFSCommandError("head/tail requires a file target") + return count, target + + @staticmethod + def _parse_non_negative_int(value: str, label: str) -> int: + try: + parsed = int(value) + except ValueError as exc: + raise PIFSCommandError(f"{label} must be an integer") from exc + if parsed < 0: + raise PIFSCommandError(f"{label} must be non-negative") + return parsed + + @classmethod + def _parse_bounded_int(cls, value: str, label: str, *, max_value: int) -> int: + parsed = cls._parse_non_negative_int(value, label) + return cls._require_at_most(parsed, label, max_value) + + @classmethod + def _require_at_most(cls, value: int, label: str, max_value: int) -> int: + if value > max_value: + raise PIFSCommandError( + f"{label} supports at most {max_value}; requested {value}. " + "Split it into a smaller call. If the evidence is sufficient, " + "stop; if not, continue with additional chunks before " + "answering. If you are unsure where to inspect, use cat " + "--structure first." 
+ ) + return value + + @staticmethod + def _parse_find_maxdepth(value: str | None) -> int: + if value is None: + raise PIFSCommandError("find -maxdepth requires an integer >= 0") + try: + parsed = int(value) + except ValueError as exc: + raise PIFSCommandError("find -maxdepth requires an integer >= 0") from exc + if parsed < 0: + raise PIFSCommandError("find -maxdepth requires an integer >= 0") + return parsed + + @staticmethod + def _try_json_loads(input_text: str) -> Any | None: + try: + return json.loads(input_text) + except json.JSONDecodeError: + return None + + @staticmethod + def _render_json_payload(payload: Any) -> str: + return json.dumps(payload, ensure_ascii=False) + + @classmethod + def _slice_payload(cls, payload: Any, count: int, *, from_tail: bool) -> Any: + if isinstance(payload, list): + return payload[-count:] if from_tail and count else payload[:count] + if not isinstance(payload, dict): + return payload + sliced = dict(payload) + if "data" in sliced: + sliced["data"] = cls._slice_data(sliced["data"], count, from_tail=from_tail) + else: + sliced = cls._slice_mapping_lists(sliced, count, from_tail=from_tail) + return sliced + + @classmethod + def _slice_data(cls, data: Any, count: int, *, from_tail: bool) -> Any: + if isinstance(data, list): + return data[-count:] if from_tail and count else data[:count] + if isinstance(data, dict): + if isinstance(data.get("text"), str): + copied = dict(data) + lines = copied["text"].splitlines() + copied["text"] = "\n".join(lines[-count:] if from_tail and count else lines[:count]) + return copied + return cls._slice_mapping_lists(data, count, from_tail=from_tail) + return data + + @classmethod + def _slice_mapping_lists(cls, data: dict[str, Any], count: int, *, from_tail: bool) -> dict[str, Any]: + copied = dict(data) + for key, value in copied.items(): + if isinstance(value, list): + copied[key] = value[-count:] if from_tail and count else value[:count] + return copied + + @classmethod + def _filter_payload( + 
cls, + payload: Any, + pattern: str, + *, + ignore_case: bool, + invert: bool, + regex: bool, + ) -> Any: + if isinstance(payload, list): + return [ + item + for item in payload + if cls._json_matches(item, pattern, ignore_case=ignore_case, invert=invert, regex=regex) + ] + if not isinstance(payload, dict): + return payload + filtered = dict(payload) + if "data" in filtered: + filtered["data"] = cls._filter_data( + filtered["data"], + pattern, + ignore_case=ignore_case, + invert=invert, + regex=regex, + ) + else: + filtered = cls._filter_mapping_lists( + filtered, + pattern, + ignore_case=ignore_case, + invert=invert, + regex=regex, + ) + return filtered + + @classmethod + def _filter_data( + cls, + data: Any, + pattern: str, + *, + ignore_case: bool, + invert: bool, + regex: bool, + ) -> Any: + if isinstance(data, list): + return [ + item + for item in data + if cls._json_matches(item, pattern, ignore_case=ignore_case, invert=invert, regex=regex) + ] + if isinstance(data, dict): + return cls._filter_mapping_lists( + data, + pattern, + ignore_case=ignore_case, + invert=invert, + regex=regex, + ) + if isinstance(data, str): + return "\n".join( + line + for line in data.splitlines() + if cls._text_matches(line, pattern, ignore_case=ignore_case, invert=invert, regex=regex) + ) + return data + + @classmethod + def _filter_mapping_lists( + cls, + data: dict[str, Any], + pattern: str, + *, + ignore_case: bool, + invert: bool, + regex: bool, + ) -> dict[str, Any]: + filtered = dict(data) + for key, value in filtered.items(): + if isinstance(value, list): + filtered[key] = [ + item + for item in value + if cls._json_matches(item, pattern, ignore_case=ignore_case, invert=invert, regex=regex) + ] + return filtered + + @classmethod + def _json_matches( + cls, + value: Any, + pattern: str, + *, + ignore_case: bool, + invert: bool, + regex: bool, + ) -> bool: + text = json.dumps(value, ensure_ascii=False, sort_keys=True) + return cls._text_matches(text, pattern, 
ignore_case=ignore_case, invert=invert, regex=regex) + + @staticmethod + def _text_matches( + text: str, + pattern: str, + *, + ignore_case: bool, + invert: bool, + regex: bool, + ) -> bool: + flags = re.IGNORECASE if ignore_case else 0 + if regex: + try: + matched = re.search(pattern, text, flags) is not None + except re.error as exc: + raise PIFSCommandError(f"Invalid grep regex: {exc}") from exc + elif ignore_case: + matched = pattern.lower() in text.lower() + else: + matched = pattern in text + return not matched if invert else matched + + @classmethod + def _slice_text_payload(cls, payload: Any, start: int, end: int) -> Any: + if not isinstance(payload, dict): + return payload + sliced = dict(payload) + data = sliced.get("data") + if isinstance(data, dict) and isinstance(data.get("text"), str): + copied_data = dict(data) + lines = copied_data["text"].splitlines() + copied_data["text"] = "\n".join(lines[start - 1 : end]) + copied_data["start_line"] = start + copied_data["end_line"] = min(end, len(lines)) + sliced["data"] = copied_data + return sliced diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py new file mode 100644 index 000000000..72833b78a --- /dev/null +++ b/pageindex/filesystem/core.py @@ -0,0 +1,1949 @@ +from __future__ import annotations + +import json +import os +from pathlib import Path +from typing import TYPE_CHECKING, Any, Optional, Union +from urllib.parse import unquote, urlparse + +from .metadata import MetadataQueryEngine +from .metadata_generation import ( + MetadataGenerationBackend, + MetadataGenerationError, + MetadataGenerationInput, + MetadataGenerationResult, + MetadataGenerator, +) +from .semantic_folder_policy import ( + SEMANTIC_FOLDER_BASE_FIELDS, + SEMANTIC_FOLDER_ROOT, + SEMANTIC_FOLDER_SYSTEM_FIELDS, + canonical_semantic_folder_field_name, + is_semantic_folder_forbidden_field, + semantic_folder_allowed_extension_fields, +) +from .store import ( + SQLiteFileSystemStore, + fingerprint, + make_file_ref, + 
metadata_text, + normalize_path, +) +from .structural_read import ( + flatten_pageindex_structure_nodes, + first_node_location, + find_pageindex_node, + strip_pageindex_text_fields, +) +from .types import OpenResult, SearchResult + +if TYPE_CHECKING: + from ..client import PageIndexClient + from .projection_indexing import SummaryProjectionIndexer + +DEFAULT_METADATA_GENERATION_FIELDS = { + "summary": True, + "doc_type": True, + "domain": True, + "topic": True, + "entity": False, + "relation": False, +} + +DEFAULT_METADATA_FIELD_TYPES = { + "summary": "string", + "doc_type": "string", + "domain": "string", + "topic": "string", + "entity": "string", + "relation": "string", +} + +METADATA_STATUSES = { + "skipped", + "pending_submit", + "pending_generate", + "generated", + "failed", +} + +PROJECTION_INDEX_STATUSES = { + "not_indexed", + "pending_index", + "generated", + "ready", + "failed", +} + +SEMANTIC_RETRIEVAL_CHANNELS = ("summary", "entity", "relation") +SEMANTIC_GREP_CHANNELS = ("entity", "relation") +PAGEINDEX_DOCUMENT_SUFFIXES = {".pdf", ".md", ".markdown"} +PAGEINDEX_DOCUMENT_CONTENT_TYPES = { + "application/pdf", + "text/markdown", + "text/x-markdown", + "application/markdown", +} +TEXT_ARTIFACT_SUFFIXES = {".txt", ".text"} +TEXT_ARTIFACT_CONTENT_TYPES = {"text/plain"} + + +class PageIndexFileSystem: + def __init__( + self, + workspace: Union[str, Path], + *, + semantic_retrieval_backend: Any | None = None, + metadata_generator: MetadataGenerationBackend | None = None, + metadata_provider: str = "openai", + metadata_model: str | None = None, + metadata_base_url: str | None = None, + metadata_max_text_chars: int = 24000, + summary_projection_indexer: SummaryProjectionIndexer | None = None, + summary_projection_index: bool = True, + summary_projection_index_dir: Union[str, Path, None] = None, + summary_projection_embedding_provider: str = "openai", + summary_projection_embedding_model: str = "text-embedding-3-small", + summary_projection_embedding_dimensions: 
def register_file(
    self,
    *,
    storage_uri: str,
    source_path: str,
    folder_path: Optional[str] = None,
    metadata: Optional[dict[str, Any]] = None,
    external_id: Optional[str] = None,
    title: Optional[str] = None,
    content: str = "",
    content_type: str = "text/plain",
    source_type: Optional[str] = None,
    metadata_policy: Optional[dict[str, Any]] = None,
    metadata_status: Optional[str] = None,
) -> str:
    """Register one file by delegating to ``register_files``.

    The keyword arguments are packed into a single file description, passed
    to ``register_files`` as a one-element batch, and the first element of
    its result is returned.
    """
    description = dict(
        storage_uri=storage_uri,
        source_path=source_path,
        folder_path=folder_path,
        metadata=metadata,
        external_id=external_id,
        title=title,
        content=content,
        content_type=content_type,
        source_type=source_type,
        metadata_policy=metadata_policy,
        metadata_status=metadata_status,
    )
    registered = self.register_files([description])
    return registered[0]
def batch_generate(self, *, limit: int | None = None) -> dict[str, Any]:
    """Generate metadata for files whose metadata status is still pending.

    Each pending row is turned into a record, pushed through forced metadata
    generation, summary projection indexing and schema registration, then
    written back to the store and synced to its raw artifact.

    Returns:
        A dict with ``processed``, ``generated`` and ``failed`` counts plus
        the ``file_refs`` that were handled, in processing order.

    Raises:
        MetadataGenerationError: if no metadata generator is configured.
    """
    if self.metadata_generator is None:
        raise MetadataGenerationError(
            "metadata_generator is required to generate pending PIFS metadata"
        )

    pending = self.store.list_pending_metadata_status(limit=limit)
    handled: list[str] = []
    failures = 0
    for row in pending:
        record = self._record_from_file_entry(row)
        self._generate_register_metadata(record, force=True)
        self._complete_summary_projection_index(record)
        self._register_generation_policy_schema([record])
        self.store.update_file_metadata_status(
            record["file_ref"],
            metadata=record["metadata"],
            metadata_status=record["metadata_status"],
        )
        self._sync_owned_raw_artifact(record)
        handled.append(record["file_ref"])
        if record["metadata_status"]["status"] == "failed":
            failures += 1

    # Every handled record that did not end up "failed" counts as generated.
    return {
        "processed": len(pending),
        "generated": len(handled) - failures,
        "failed": failures,
        "file_refs": handled,
    }
MetadataGenerator( + provider=self.metadata_provider, + model=self.metadata_model, + base_url=self.metadata_base_url, + max_text_chars=self.metadata_max_text_chars, + ) + if self.summary_projection_index and self.summary_projection_indexer is None: + from .projection_indexing import SummaryProjectionIndexer + + self.summary_projection_indexer = SummaryProjectionIndexer.from_provider( + self.summary_projection_index_dir, + embedding_provider=self.summary_projection_embedding_provider, + embedding_model=self.summary_projection_embedding_model, + embedding_dimensions=self.summary_projection_embedding_dimensions, + embedding_timeout=self.summary_projection_embedding_timeout, + ) + if self.summary_projection_index and self.semantic_retrieval_backend is None: + self.configure_hybrid_projection_retrieval( + self.summary_projection_index_dir, + embedding_provider=self.summary_projection_embedding_provider, + embedding_model=self.summary_projection_embedding_model, + embedding_dimensions=self.summary_projection_embedding_dimensions, + embedding_timeout=self.summary_projection_embedding_timeout, + ) + + def configure_existing_projection_retrieval(self) -> bool: + """Attach semantic retrieval to already-built projection indexes. + + Register-time generation owns building the index files. Opening an + existing workspace should still expose the corresponding read commands, + such as search-summary, without forcing a re-register step. 
+ """ + if self.semantic_retrieval_backend is not None: + return bool(self.semantic_retrieval_channels()) + index_config = self._existing_projection_index_config() + if index_config is None: + return False + metadata = dict(index_config.get("metadata") or {}) + embedding_provider = str( + metadata.get("embedding_provider") + or self.summary_projection_embedding_provider + ) + embedding_model = str( + metadata.get("embedding_model") + or self.summary_projection_embedding_model + ) + embedding_dimensions = int( + metadata.get("embedding_dimensions") + or index_config.get("dimension") + or self.summary_projection_embedding_dimensions + ) + self.configure_hybrid_projection_retrieval( + self.summary_projection_index_dir, + embedding_provider=embedding_provider, + embedding_model=embedding_model, + embedding_dimensions=embedding_dimensions, + embedding_timeout=self.summary_projection_embedding_timeout, + ) + return bool(self.semantic_retrieval_channels()) + + def _existing_projection_index_config(self) -> dict[str, Any] | None: + from .hybrid_projection import INDEX_BY_CHANNEL + from .semantic_index import SQLiteVecSemanticIndex + + for channel in SEMANTIC_RETRIEVAL_CHANNELS: + index_name = INDEX_BY_CHANNEL.get(channel) + if not index_name: + continue + index_path = self.summary_projection_index_dir / f"{index_name}.sqlite" + if not index_path.exists(): + continue + try: + info = SQLiteVecSemanticIndex(index_path).info() + except Exception: + continue + if int(info.get("document_count") or 0) <= 0: + continue + metadata = dict(info.get("metadata") or {}) + if metadata.get("channel") and metadata.get("channel") != channel: + continue + return info + return None + + @staticmethod + def _register_uses_deferred_metadata(policy: Any) -> bool: + if not isinstance(policy, dict): + return False + return bool(policy.get("batch")) or policy.get("mode") == "batch" + + @classmethod + def default_metadata_policy(cls) -> dict[str, Any]: + return { + "fields": 
def find_folders(
    self,
    path: str = "/",
    metadata_filter: Optional[dict[str, Any] | str] = None,
    limit: int = 100,
    max_depth: int | None = None,
) -> list[dict[str, Any]]:
    """Look up folders under *path* through the store.

    The metadata filter is first run through ``self.metadata.parse_filter``;
    the parsed filter, ``limit`` and ``max_depth`` are then forwarded to
    ``self.store.find_folders``.
    """
    criteria = self.metadata.parse_filter(metadata_filter)
    options = {
        "metadata_filter": criteria,
        "limit": limit,
        "max_depth": max_depth,
    }
    return self.store.find_folders(path, **options)
def search(
    self,
    query: Union[str, list[str], None] = None,
    scope: Optional[dict[str, Any]] = None,
    metadata_filter: Optional[dict[str, Any] | str] = None,
    limit: int = 10,
    semantic: bool = True,
) -> list[SearchResult]:
    """Search registered files, trying semantic retrieval before the store.

    When *semantic* is true and ``_should_use_semantic_retrieval`` agrees,
    the semantic results are returned as long as they are non-empty.
    Otherwise the store search runs and each row is wrapped in a
    ``SearchResult`` together with its folder memberships and the folder
    path chosen by ``_preferred_folder_path``.
    """
    criteria = self.metadata.parse_filter(metadata_filter)
    lookup = {"scope": scope, "metadata_filter": criteria, "limit": limit}

    if semantic and self._should_use_semantic_retrieval(query, scope):
        hits = self._semantic_search(query, **lookup)
        if hits:
            return hits

    rows = self.store.search_files(query, **lookup)
    scope_path = self._scope_folder_path(scope)

    # Row columns copied verbatim into the result; the two folder fields are
    # computed per row below.
    passthrough = (
        "file_ref",
        "external_id",
        "title",
        "snippet",
        "metadata",
        "metadata_status",
        "source_path",
        "id",
        "document_id",
        "name",
        "description",
        "status",
        "pageNum",
        "createdAt",
        "folderId",
    )

    results = []
    for row in rows:
        memberships = self.store.folder_memberships(row["file_ref"])
        folder_paths = [membership["path"] for membership in memberships]
        chosen = self._preferred_folder_path(folder_paths, scope_path, row["folder_path"])
        copied = {column: row[column] for column in passthrough}
        results.append(
            SearchResult(folder_path=chosen, folder_paths=folder_paths, **copied)
        )
    return results
def semantic_retrieval_channels(self) -> tuple[str, ...]:
    """List the semantic channels the configured backend exposes.

    The backend is asked through a callable ``available_channels`` when it
    has one, otherwise through its ``semantic_tool_channels`` attribute.
    The answer is restricted to ``SEMANTIC_RETRIEVAL_CHANNELS`` and returned
    in that constant's order. Without a backend the result is empty.
    """
    backend = self.semantic_retrieval_backend
    if backend is None:
        return ()
    probe = getattr(backend, "available_channels", None)
    advertised = probe() if callable(probe) else getattr(backend, "semantic_tool_channels", ())
    supported = set(advertised or ())
    return tuple(filter(supported.__contains__, SEMANTIC_RETRIEVAL_CHANNELS))
def find(
    self,
    target: str,
    patterns: Union[str, list[str]],
    limit: int = 20,
) -> list[OpenResult]:
    """Return context windows around lines of *target* that match *patterns*.

    Matching is a case-insensitive substring test; a line matches when it
    contains any non-empty pattern. Each match is reported as the result of
    ``_open_lines`` for the matching line plus one line on either side,
    clipped to the bounds of the text.

    Args:
        target: File reference or path accepted by ``_resolve_target``.
        patterns: One pattern or a list of patterns; empty ones are ignored.
        limit: Maximum number of matches to return. A value of zero or less
            yields an empty list.

    Returns:
        At most *limit* results, in line order.
    """
    file_ref = self._resolve_target(target)
    candidates = [patterns] if isinstance(patterns, str) else list(patterns)
    needles = [candidate.lower() for candidate in candidates if candidate]
    # The limit used to be checked only after the first match was appended,
    # so a non-positive limit still returned one result.
    if not needles or limit <= 0:
        return []
    lines = self.store.read_text(file_ref).splitlines()
    total = len(lines)
    matches = []
    for number, line in enumerate(lines, 1):
        haystack = line.lower()
        if not any(needle in haystack for needle in needles):
            continue
        matches.append(self._open_lines(file_ref, max(1, number - 1), min(total, number + 1)))
        if len(matches) >= limit:
            break
    return matches
def cat_text_artifact(self, target: str, location: str = "all") -> OpenResult:
    """Read a text artifact, either whole or as a line range.

    The target is resolved, its entry fetched from the store, and
    ``_require_text_artifact_file`` is called with the command label
    ``"cat --all"``. A *location* of ``all``, ``full`` or ``*`` (ignoring
    case and surrounding whitespace) reads everything; any other value is
    handed to ``_parse_line_range`` to obtain the start and end lines.
    """
    file_ref = self._resolve_target(target)
    entry = self.store.get_file(file_ref)
    self._require_text_artifact_file(entry, "cat --all")

    wants_everything = str(location).strip().lower() in {"all", "full", "*"}
    if not wants_everything:
        first, last = self._parse_line_range(location)
        return self._open_lines(file_ref, first, last)
    return self._open_all(file_ref)
def pageindex_node(self, target: str, node_id: str) -> dict[str, Any]:
    """Return one node of a file's cached PageIndex structure with its text.

    The node's own ``text`` is used when present. Otherwise the page content
    at the node's first location is fetched from the PageIndex client and
    joined with blank lines between pages.

    Args:
        target: File reference or path accepted by ``_resolve_target``.
        node_id: Identifier of the node to look up in the cached structure.

    Returns:
        A ``mode="node"`` payload with the node (text fields stripped) and
        its text, or the result of ``_structural_unavailable`` when the
        document is not cached, the node is missing, the client reports an
        error for the page content, or no text can be found.
    """
    file_ref = self._resolve_target(target)
    entry = self.store.get_file(file_ref)
    self._require_pageindex_document_file(entry, "cat --node")
    client, doc_id = self._pageindex_client_doc_for_entry(entry)
    if doc_id is None:
        return self._structural_unavailable(
            "node",
            entry,
            node_id=node_id,
            message=(
                "PageIndex structure is not cached for this file in the "
                "PageIndexClient workspace."
            ),
        )
    client._ensure_doc_loaded(doc_id)
    doc = client.documents.get(doc_id, {})
    node = find_pageindex_node(doc.get("structure", []), node_id)
    if node is None:
        return self._structural_unavailable(
            "node",
            entry,
            node_id=node_id,
            message="PageIndex node was not found in the cached structure.",
        )
    text = str(node.get("text") or "")
    if not text:
        location = first_node_location(node)
        if location:
            content = self._client_json(client.get_page_content(doc_id, location))
            # Report a client-side error as-is instead of masking it behind
            # the generic "no text content" message below; this matches how
            # the page and structure readers treat error payloads.
            if isinstance(content, dict) and content.get("error"):
                return self._structural_unavailable(
                    "node",
                    entry,
                    node_id=node_id,
                    message=str(content["error"]),
                )
            if isinstance(content, list):
                text = "\n\n".join(str(page.get("content") or "") for page in content)
    if not text:
        return self._structural_unavailable(
            "node",
            entry,
            node_id=node_id,
            message="Cached PageIndex node has no text content.",
        )
    return {
        "mode": "node",
        "file_ref": file_ref,
        "external_id": entry.external_id,
        "source_path": entry.source_path,
        "status": entry.pageindex_tree_status,
        "available": True,
        "pageindex_doc_id": doc_id,
        "node_id": node_id,
        "node": strip_pageindex_text_fields(node),
        "text": text,
    }
+ ), + ) + page_entries = self._client_json(client.get_page_content(doc_id, pages)) + if isinstance(page_entries, dict) and page_entries.get("error"): + return self._structural_unavailable( + "page", + entry, + pages=pages, + message=str(page_entries["error"]), + ) + if not isinstance(page_entries, list) or not page_entries: + return self._structural_unavailable( + "page", + entry, + pages=pages, + message="Requested PageIndex page content is not cached for this file.", + ) + text = "\n\n".join(str(page.get("content") or "") for page in page_entries) + return { + "mode": "page", + "file_ref": file_ref, + "external_id": entry.external_id, + "source_path": entry.source_path, + "status": entry.pageindex_tree_status, + "available": True, + "pageindex_doc_id": doc_id, + "pages": pages, + "data": page_entries, + "text": text, + } + + def _stat(self, target: str) -> dict[str, Any]: + file_ref = self._resolve_target(target) + return self.store.file_info(file_ref) + + def _require_text_artifact_file(self, entry: Any, command: str) -> None: + if self._file_format(entry) == "text": + return + raise ValueError( + f"{command} is only supported for txt/text files; " + f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. " + "Use cat --structure, " + "cat --page, or " + "cat --node for PDF/Markdown PageIndex files." + ) + + def _require_pageindex_document_file(self, entry: Any, command: str) -> None: + if self._file_format(entry) in {"pdf", "markdown", "pageindex"}: + return + raise ValueError( + f"{command} is only supported for PDF/Markdown PageIndex files; " + f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. " + "Use cat --all for txt/text files." 
+ ) + + @classmethod + def _file_format(cls, entry: Any) -> str: + suffix = Path(str(entry.source_path or "")).suffix.lower() + content_type = cls._normalized_content_type(entry.content_type) + if suffix == ".pdf" or content_type == "application/pdf": + return "pdf" + if suffix in PAGEINDEX_DOCUMENT_SUFFIXES or content_type in PAGEINDEX_DOCUMENT_CONTENT_TYPES: + return "markdown" + if suffix in TEXT_ARTIFACT_SUFFIXES: + return "text" + if entry.pageindex_doc_id or entry.pageindex_tree_status != "not_built": + return "pageindex" + if content_type in TEXT_ARTIFACT_CONTENT_TYPES: + return "text" + return "unsupported" + + @classmethod + def _source_format(cls, source_path: Any, content_type: str | None) -> str: + suffix = Path(str(source_path or "")).suffix.lower() + normalized_content_type = cls._normalized_content_type(content_type) + if suffix == ".pdf" or normalized_content_type == "application/pdf": + return "pdf" + if ( + suffix in PAGEINDEX_DOCUMENT_SUFFIXES + or normalized_content_type in PAGEINDEX_DOCUMENT_CONTENT_TYPES + ): + return "markdown" + if suffix in TEXT_ARTIFACT_SUFFIXES: + return "text" + if normalized_content_type in TEXT_ARTIFACT_CONTENT_TYPES: + return "text" + return "unsupported" + + @staticmethod + def _normalized_content_type(content_type: str | None) -> str: + return str(content_type or "").split(";", 1)[0].strip().lower() + + @property + def pageindex_client_workspace(self) -> Path: + return self.workspace / "artifacts" / "pageindex_client" + + def _pageindex_client(self) -> PageIndexClient: + from ..client import PageIndexClient + + return PageIndexClient(workspace=str(self.pageindex_client_workspace)) + + def _pageindex_client_doc_for_entry(self, entry: Any) -> tuple[PageIndexClient, str | None]: + client = self._pageindex_client() + if not entry.pageindex_doc_id: + return client, None + if entry.pageindex_doc_id not in client.documents: + return client, None + return client, entry.pageindex_doc_id + + def 
_registration_pageindex_pointer( + self, + *, + storage_uri: str, + source_path: str, + content_type: str, + ) -> tuple[str | None, str, dict[str, Any] | None]: + if self._source_format(source_path, content_type) not in {"pdf", "markdown"}: + return None, "not_built", None + client = self._pageindex_client() + source = self._canonical_source_path(storage_uri=storage_uri, source_path=source_path) + cached_doc_id = self._find_cached_pageindex_doc_id(client, source) + if cached_doc_id: + return cached_doc_id, "built", None + if source is None: + return None, "failed", self._pageindex_tree_failure_record( + source="PageIndexFileSystem.registration", + error_type="UnresolvableSourcePath", + message=( + "PageIndex source path must resolve to a local file path for " + "PDF/Markdown registration." + ), + ) + try: + doc_id = client.index(source) + return doc_id, "built", None + except Exception as exc: + return None, "failed", self._pageindex_tree_failure_record( + source="PageIndexClient.index", + error_type=exc.__class__.__name__, + message=str(exc) or exc.__class__.__name__, + ) + + @staticmethod + def _pageindex_tree_failure_record( + *, + source: str, + error_type: str, + message: str, + ) -> dict[str, Any]: + return { + "status": "failed", + "owner": "pageindex", + "source": source, + "error_type": error_type, + "message": message, + } + + def _find_cached_pageindex_doc_id( + self, + client: PageIndexClient, + source_path: str | None, + ) -> str | None: + if source_path is None: + return None + for doc_id, doc in client.documents.items(): + if self._canonical_path(doc.get("path")) == source_path: + return doc_id + return None + + def _canonical_source_path(self, *, storage_uri: str, source_path: str) -> str | None: + parsed = urlparse(storage_uri) + if parsed.scheme == "file": + return self._canonical_path(unquote(parsed.path)) + if storage_uri and not parsed.scheme: + return self._canonical_path(storage_uri) + if Path(source_path).expanduser().is_absolute(): + return 
self._canonical_path(source_path) + return None + + @staticmethod + def _canonical_path(path: Any) -> str | None: + if not path: + return None + return str(Path(os.path.expanduser(str(path))).resolve(strict=False)) + + @staticmethod + def _client_json(payload: str) -> Any: + try: + return json.loads(payload) + except json.JSONDecodeError: + return {"error": f"Invalid PageIndexClient JSON response: {payload}"} + + def _metadata_schema(self) -> dict[str, Any]: + return self.metadata.export_schema() + + def _register_metadata_schema(self, schema: dict[str, Any]) -> None: + self.metadata.register_schema(schema) + + def _create_folder(self, path: str) -> str: + return self.create_folder(path) + + def _prepare_file_record(self, file: dict[str, Any]) -> dict[str, Any]: + storage_uri = file["storage_uri"] + raw_source_path = str(file["source_path"]) + source_path = raw_source_path.strip("/") + metadata = file.get("metadata") or {} + if not isinstance(metadata, dict): + raise ValueError("metadata must be a JSON object") + legacy_value_key = "derived_" + "metadata" + legacy_policy_key = "metadata_" + "generation_policy" + legacy_status_key = "metadata_" + "generation_status" + if legacy_value_key in file: + raise ValueError("legacy generated metadata map has been removed; put values in metadata") + if legacy_policy_key in file: + raise ValueError("legacy metadata policy key has been renamed to metadata_policy") + if legacy_status_key in file: + raise ValueError("legacy metadata status key has been renamed to metadata_status") + self._validate_register_metadata(metadata) + external_id = file.get("external_id") + content = file.get("content") or "" + content_type = file.get("content_type") or "text/plain" + ( + pageindex_doc_id, + pageindex_tree_status, + pageindex_tree_failure, + ) = self._registration_pageindex_pointer( + storage_uri=storage_uri, + source_path=raw_source_path, + content_type=content_type, + ) + artifact_content = self._registration_text_artifact_content( + 
source_path=raw_source_path, + content_type=content_type, + pageindex_doc_id=pageindex_doc_id, + pageindex_tree_status=pageindex_tree_status, + fallback_content=content, + ) + fts_content = file.get("fts_content", artifact_content) + source_type = file.get("source_type") or self._infer_source_type(source_path) + metadata_policy = self._normalize_metadata_policy( + file.get("metadata_policy"), + metadata=metadata, + ) + metadata_status = self._metadata_status_state( + metadata_policy, + metadata=metadata, + status=file.get("metadata_status"), + ) + self._attach_pageindex_tree_failure(metadata_status, pageindex_tree_failure) + indexed_metadata = SQLiteFileSystemStore.indexed_metadata_values(metadata) + searchable_metadata = dict(metadata) + folder_path = normalize_path(file.get("folder_path") or "/") + title = file.get("title") or metadata.get("title") or Path(source_path).stem + file_ref = make_file_ref(external_id or source_path) + text_artifact_path = file.get("text_artifact_path") + owns_text_artifact = text_artifact_path is None + if text_artifact_path is None: + text_artifact_path = self.store.write_text_artifact(file_ref, artifact_content) + raw_artifact_path = file.get("raw_artifact_path") + owns_raw_artifact = False + if raw_artifact_path is None and file.get("write_raw_artifact", True): + raw_artifact_path = self.store.raw_dir / f"{file_ref}.json" + owns_raw_artifact = True + descriptor = self._build_descriptor(title, metadata) + return { + "file_ref": file_ref, + "external_id": external_id, + "storage_uri": storage_uri, + "source_path": source_path, + "title": title, + "descriptor": descriptor, + "content_type": content_type, + "source_type": source_type, + "fingerprint": fingerprint(artifact_content), + "text_artifact_path": str(text_artifact_path), + "raw_artifact_path": str(raw_artifact_path) if raw_artifact_path is not None else None, + "pageindex_doc_id": pageindex_doc_id, + "pageindex_tree_status": pageindex_tree_status, + "metadata": metadata, + 
"metadata_json": json.dumps(metadata, ensure_ascii=False), + "metadata_status": metadata_status, + "metadata_status_json": json.dumps(metadata_status, ensure_ascii=False), + "indexed_metadata": indexed_metadata, + "metadata_text": metadata_text(searchable_metadata), + "folder_path": folder_path, + "content": fts_content, + "skip_fts": bool(file.get("skip_fts", False)), + "_pifs_owned_text_artifact": owns_text_artifact, + "_pifs_owned_raw_artifact": owns_raw_artifact, + } + + def _registration_text_artifact_content( + self, + *, + source_path: str, + content_type: str, + pageindex_doc_id: str | None, + pageindex_tree_status: str, + fallback_content: str, + ) -> str: + if self._source_format(source_path, content_type) not in {"pdf", "markdown"}: + return fallback_content + if pageindex_tree_status != "built" or not pageindex_doc_id: + return fallback_content + return self._pageindex_extracted_text(pageindex_doc_id) + + def _pageindex_extracted_text(self, doc_id: str) -> str: + client = self._pageindex_client() + if doc_id not in client.documents: + return "" + client._ensure_doc_loaded(doc_id) + doc = client.documents.get(doc_id) or {} + page_text = self._pageindex_pages_text(doc.get("pages")) + if page_text: + return page_text + return self._pageindex_structure_text(doc.get("structure", [])) + + @staticmethod + def _pageindex_pages_text(pages: Any) -> str: + if not isinstance(pages, list): + return "" + parts: list[str] = [] + for page in pages: + if not isinstance(page, dict): + continue + content = str(page.get("content") or "").strip() + if content: + parts.append(content) + return "\n\n".join(parts) + + @classmethod + def _pageindex_structure_text(cls, structure: Any) -> str: + parts: list[str] = [] + cls._collect_pageindex_node_text(structure, parts) + return "\n\n".join(parts) + + @classmethod + def _collect_pageindex_node_text(cls, node: Any, parts: list[str]) -> None: + if isinstance(node, list): + for item in node: + cls._collect_pageindex_node_text(item, 
def _record_from_file_entry(self, entry: Any) -> dict[str, Any]:
    """Rebuild a registration-style record dict from a stored file entry.

    The stored text is read first, then the metadata policy and status are
    re-derived from the entry's saved ``metadata_status`` and any saved
    ``pageindex_tree`` failure is re-attached.

    Args:
        entry: Stored file entry exposing the attributes copied below.

    Returns:
        A record dict with ``skip_fts`` fixed to ``False`` and ``content``
        set to the stored text.
    """
    stored_text = self.store.read_text(entry.file_ref)
    policy = self._normalize_metadata_policy(
        entry.metadata_status.get("policy", {}),
        metadata=entry.metadata,
    )
    status_state = self._metadata_status_state(
        policy,
        metadata=entry.metadata,
        status=entry.metadata_status.get("status"),
    )
    self._attach_pageindex_tree_failure(
        status_state,
        entry.metadata_status.get("pageindex_tree"),
    )

    # These keys are copied verbatim from the entry, in record key order.
    copied_attributes = (
        "file_ref",
        "external_id",
        "storage_uri",
        "source_path",
        "title",
        "descriptor",
        "content_type",
        "source_type",
        "fingerprint",
        "text_artifact_path",
        "raw_artifact_path",
        "pageindex_doc_id",
        "pageindex_tree_status",
    )
    record: dict[str, Any] = {name: getattr(entry, name) for name in copied_attributes}
    record["metadata"] = dict(entry.metadata)
    record["metadata_json"] = json.dumps(entry.metadata, ensure_ascii=False)
    record["metadata_status"] = status_state
    record["metadata_status_json"] = json.dumps(status_state, ensure_ascii=False)
    record["indexed_metadata"] = SQLiteFileSystemStore.indexed_metadata_values(entry.metadata)
    record["metadata_text"] = metadata_text(entry.metadata)
    record["folder_path"] = entry.folder_path
    record["content"] = stored_text
    record["skip_fts"] = False
    return record
self._apply_metadata_status_failures(record, fields, str(exc)) + return + failures = dict(result.failures) + for field in fields: + if field in result.values: + record["metadata"][field] = result.values[field] + status["fields"][field] = { + "requested": True, + "status": "generated", + "owner": "pifs", + "source": "llm", + } + else: + failures.setdefault(field, "metadata generator did not return field") + for field, reason in failures.items(): + status["fields"][field] = { + "requested": True, + "status": "failed", + "owner": "pifs", + "source": "llm", + "error": str(reason), + } + self._refresh_record_metadata_status(record) + + def _complete_summary_projection_index(self, record: dict[str, Any]) -> bool: + metadata_status = record["metadata_status"] + summary_index = metadata_status.get("projection_indexes", {}).get("summary") + if not summary_index or not summary_index.get("requested"): + return False + summary = str(record.get("metadata", {}).get("summary") or "").strip() + if not summary: + return False + if self.summary_projection_indexer is None: + self._refresh_record_metadata_status(record) + return True + try: + result = self.summary_projection_indexer.upsert_summary(record) + except Exception as exc: + summary_index["status"] = "failed" + summary_index["error"] = str(exc) + self._refresh_record_metadata_status(record) + return True + summary_index.clear() + summary_index.update({"requested": True, **result}) + if summary_index.get("status") != "ready": + summary_index["status"] = "ready" + self._refresh_record_metadata_status(record) + return True + + @staticmethod + def _unlink_artifact(path: Any) -> None: + try: + Path(path).unlink() + except FileNotFoundError: + return + + def _cleanup_failed_register_artifacts(self, records: list[dict[str, Any]]) -> None: + for record in records: + if record.get("_pifs_owned_text_artifact"): + self._unlink_artifact(record["text_artifact_path"]) + if record.get("_pifs_owned_raw_artifact") and 
record.get("raw_artifact_path"): + self._unlink_artifact(record["raw_artifact_path"]) + + @staticmethod + def _metadata_policy_is_batch(policy: dict[str, Any]) -> bool: + return bool(policy.get("batch")) or policy.get("mode") == "batch" + + @staticmethod + def _metadata_policy_requires_sync(policy: dict[str, Any]) -> bool: + return policy.get("batch") is False or policy.get("mode") == "sync" + + def _metadata_fields_to_generate(self, record: dict[str, Any]) -> list[str]: + fields: list[str] = [] + for name, state in record["metadata_status"].get("fields", {}).items(): + if not state.get("requested"): + continue + if state.get("status") == "generated" and name in record["metadata"]: + continue + fields.append(name) + return fields + + def _mark_requested_generation_status(self, record: dict[str, Any], status: str) -> None: + for name, field in record["metadata_status"].get("fields", {}).items(): + if field.get("requested") and field.get("status") != "generated": + record["metadata_status"]["fields"][name] = { + "requested": True, + "status": status, + "owner": "pifs", + "source": "llm", + } + self._refresh_record_metadata_status(record, explicit_status=status) + + def _apply_metadata_status_failures( + self, + record: dict[str, Any], + fields: list[str], + reason: str, + ) -> None: + for field in fields: + record["metadata_status"]["fields"][field] = { + "requested": True, + "status": "failed", + "owner": "pifs", + "source": "llm", + "error": reason, + } + self._refresh_record_metadata_status(record, explicit_status="failed") + + def _refresh_record_metadata_status( + self, + record: dict[str, Any], + *, + explicit_status: str | None = None, + ) -> None: + metadata_status = record["metadata_status"] + statuses = [ + field.get("status") + for field in metadata_status.get("fields", {}).values() + if field.get("requested") and field.get("status") + ] + metadata_status["status"] = explicit_status or self._aggregate_metadata_status(statuses) + 
def _open_lines(self, file_ref: str, start: int, end: int) -> OpenResult:
    """Return a 1-based, inclusive slice of a stored file's text lines.

    ``start`` is raised to at least 1; ``end`` is raised to at least the
    clamped start and then capped at the number of lines in the file.

    Args:
        file_ref: Reference of the file to read from the store.
        start: First requested line (1-based).
        end: Last requested line (inclusive).

    Returns:
        An ``OpenResult`` carrying the clamped bounds, the joined text of the
        selected lines, and identifying fields copied from the file entry.
    """
    entry = self.store.get_file(file_ref)
    all_lines = self.store.read_text(file_ref).splitlines()

    first = start if start > 1 else 1
    last = end if end > first else first
    if last > len(all_lines):
        last = len(all_lines)

    selected = all_lines[first - 1:last]
    return OpenResult(
        file_ref=file_ref,
        start_line=first,
        end_line=last,
        text="\n".join(selected),
        external_id=entry.external_id,
        folder_path=entry.folder_path,
        source_path=entry.source_path,
    )
node_id + if pages is not None: + result["pages"] = pages + return result + + @staticmethod + def _attach_pageindex_tree_failure( + metadata_status: dict[str, Any], + pageindex_tree_failure: Any, + ) -> None: + if isinstance(pageindex_tree_failure, dict) and pageindex_tree_failure: + metadata_status["pageindex_tree"] = dict(pageindex_tree_failure) + + @staticmethod + def _pageindex_tree_failure_message(metadata_status: Any) -> str | None: + if not isinstance(metadata_status, dict): + return None + pageindex_tree = metadata_status.get("pageindex_tree") + if not isinstance(pageindex_tree, dict): + return None + if pageindex_tree.get("status") != "failed": + return None + message = str(pageindex_tree.get("message") or "").strip() + error_type = str(pageindex_tree.get("error_type") or "").strip() + if error_type and message: + return f"{error_type}: {message}" + return message or error_type or None + + def _resolve_target(self, target: str) -> str: + return self.store.resolve_file_ref(target) + + def _should_use_semantic_retrieval( + self, + query: Union[str, list[str], None], + scope: Optional[dict[str, Any]], + ) -> bool: + if self.semantic_retrieval_backend is None: + return False + if not self._query_text(query): + return False + if not scope: + return True + return bool(scope.get("recursive", True)) + + def _semantic_search( + self, + query: Union[str, list[str], None], + *, + scope: Optional[dict[str, Any]], + metadata_filter: Optional[dict[str, Any]], + limit: int, + channel: str | None = None, + ) -> list[SearchResult]: + if self.semantic_retrieval_backend is None: + return [] + filters = self._semantic_filters_for_scope(scope) + fetch_limit = max(limit * 10, 50) + query_text = self._query_text(query) + if channel: + search_channel = getattr(self.semantic_retrieval_backend, "search_channel", None) + if search_channel is None: + return [] + candidates = search_channel( + channel, + query_text, + limit=fetch_limit, + filters=filters, + ) + else: + candidates = 
self.semantic_retrieval_backend.search( + query_text, + limit=fetch_limit, + filters=filters, + ) + results: list[SearchResult] = [] + seen: set[str] = set() + scope_path = self._scope_folder_path(scope) + for candidate in candidates: + try: + file_ref = self.store.resolve_file_ref(candidate.document_id) + except KeyError: + continue + if file_ref in seen: + continue + if not self.store.file_matches(file_ref, scope=scope, metadata_filter=metadata_filter): + continue + seen.add(file_ref) + entry = self.store.get_file(file_ref) + folder_paths = [ + folder["path"] + for folder in self.store.folder_memberships(file_ref) + ] + folder_path = self._preferred_folder_path(folder_paths, scope_path, entry.folder_path) + results.append( + SearchResult( + file_ref=file_ref, + external_id=entry.external_id, + title=entry.title, + snippet=candidate.snippet or entry.descriptor, + folder_path=folder_path, + folder_paths=folder_paths, + metadata=entry.metadata, + metadata_status=entry.metadata_status, + source_path=entry.source_path, + id=entry.external_id or file_ref, + document_id=entry.external_id, + name=entry.title, + description=entry.descriptor, + status=entry.pageindex_tree_status, + pageNum=None, + createdAt=None, + folderId=None, + ) + ) + if len(results) >= limit: + break + return results + + @staticmethod + def _build_descriptor(title: str, metadata: dict[str, Any]) -> str: + source = metadata.get("source_type") or metadata.get("repo") or metadata.get("channel") + return f"{title} ({source})" if source else title + + @staticmethod + def _validate_register_metadata(metadata: dict[str, Any]) -> None: + pifs_owned_fields = set(DEFAULT_METADATA_GENERATION_FIELDS) + conflicts = sorted(pifs_owned_fields.intersection(metadata)) + if conflicts: + raise ValueError( + "metadata contains PIFS-owned generated field(s): " + + ", ".join(conflicts) + + "; configure metadata_policy instead of passing generated fields" + ) + + def _register_generation_policy_schema(self, records: 
@classmethod
def _normalize_metadata_policy(
    cls,
    policy: Optional[dict[str, Any]],
    *,
    metadata: dict[str, Any],
) -> dict[str, Any]:
    """Normalize a caller-supplied metadata policy into a canonical dict.

    Args:
        policy: Optional policy mapping. Field declarations live under
            ``"fields"``; when that key is absent, every top-level key other
            than ``batch``, ``mode``, ``status`` and ``projection_indexes``
            is treated as a field declaration.
        metadata: Accepted for signature symmetry; not read in this body.

    Returns:
        A dict that always has ``fields`` and ``projection_indexes`` and
        carries ``field_statuses``, ``projection_index_statuses``, ``mode``,
        ``batch`` and ``status`` only when they were supplied or derived.

    Raises:
        ValueError: If ``policy`` or its fields value is not a dict, or a
            field declaration is neither a bool nor a dict. Status validation
            is delegated to ``_validate_metadata_status``.
    """
    # Start from the default field set; declarations below override entries.
    fields = dict(DEFAULT_METADATA_GENERATION_FIELDS)
    field_statuses: dict[str, str] = {}
    projection_indexes: dict[str, bool] | None = None
    projection_index_statuses: dict[str, str] = {}
    mode = None
    batch = None
    top_level_status = None
    if policy is not None:
        if not isinstance(policy, dict):
            raise ValueError("metadata_policy must be a JSON object")
        raw_fields = policy.get("fields")
        if raw_fields is None:
            # Shorthand form: field declarations sit next to the control keys.
            raw_fields = {
                name: declaration
                for name, declaration in policy.items()
                if name not in {"batch", "mode", "status", "projection_indexes"}
            }
        if not isinstance(raw_fields, dict):
            raise ValueError("metadata_policy fields must be a JSON object")
        for name, declaration in raw_fields.items():
            name = str(name)
            if isinstance(declaration, bool):
                fields[name] = declaration
                continue
            if isinstance(declaration, dict):
                # "enabled" wins over "requested"; both default to True.
                fields[name] = bool(
                    declaration.get("enabled", declaration.get("requested", True))
                )
                field_status = declaration.get("status")
                if field_status is not None:
                    cls._validate_metadata_status(str(field_status))
                    field_statuses[name] = str(field_status)
                continue
            raise ValueError(f"Invalid metadata generation policy for field: {name}")
        mode = policy.get("mode")
        # An explicit "batch" key takes precedence over mode == "batch".
        if "batch" in policy:
            batch = bool(policy["batch"])
        elif mode == "batch":
            batch = True
        top_level_status = policy.get("status")
        if top_level_status is not None:
            cls._validate_metadata_status(str(top_level_status))
        if "projection_indexes" in policy:
            projection_indexes, projection_index_statuses = (
                cls._normalize_projection_index_policy(policy["projection_indexes"])
            )
    normalized: dict[str, Any] = {
        "fields": fields,
        # Without an explicit projection policy, the summary index follows
        # whether the "summary" field is enabled.
        "projection_indexes": (
            projection_indexes
            if projection_indexes is not None
            else {"summary": bool(fields.get("summary", False))}
        ),
    }
    # Optional keys are emitted only when they carry information.
    if field_statuses:
        normalized["field_statuses"] = field_statuses
    if projection_index_statuses:
        normalized["projection_index_statuses"] = projection_index_statuses
    if mode:
        normalized["mode"] = str(mode)
    if batch is not None:
        normalized["batch"] = batch
    if top_level_status:
        normalized["status"] = str(top_level_status)
    return normalized
field_status = explicit_status + if field_status is None: + field_status = "generated" if name in metadata else "pending_generate" + fields[name] = { + "requested": True, + "status": field_status, + "owner": "pifs", + "source": "llm", + } + + requested_statuses = [ + item["status"] + for item in fields.values() + if item.get("requested") and item.get("status") + ] + aggregate_status = explicit_status or cls._aggregate_metadata_status(requested_statuses) + policy_summary = { + "fields": dict(policy["fields"]), + "projection_indexes": dict(policy.get("projection_indexes", {})), + } + if "mode" in policy: + policy_summary["mode"] = policy["mode"] + if "batch" in policy: + policy_summary["batch"] = policy["batch"] + state = { + "status": aggregate_status, + "policy": policy_summary, + "fields": fields, + "projection_indexes": {}, + } + projection_statuses = policy.get("projection_index_statuses", {}) + for name, requested in policy.get("projection_indexes", {}).items(): + if not requested: + continue + state["projection_indexes"][name] = { + "requested": True, + "status": projection_statuses.get(name, "not_indexed"), + "owner": "pifs", + "source": "index", + } + cls._refresh_projection_index_statuses(state, metadata) + return state + + @staticmethod + def _aggregate_metadata_status(statuses: list[str]) -> str: + if not statuses: + return "generated" + for status in ("failed", "pending_submit", "pending_generate"): + if status in statuses: + return status + return "generated" + + @staticmethod + def _validate_metadata_status(status: str) -> None: + if status not in METADATA_STATUSES: + raise ValueError(f"Unsupported metadata status: {status}") + + @classmethod + def _normalize_projection_index_policy( + cls, + projection_policy: Any, + ) -> tuple[dict[str, bool], dict[str, str]]: + if projection_policy is None: + return {}, {} + if not isinstance(projection_policy, dict): + raise ValueError("metadata_policy projection_indexes must be a JSON object") + 
projection_indexes: dict[str, bool] = {} + projection_index_statuses: dict[str, str] = {} + for name, declaration in projection_policy.items(): + name = str(name) + if isinstance(declaration, bool): + projection_indexes[name] = declaration + continue + if isinstance(declaration, dict): + projection_indexes[name] = bool( + declaration.get("enabled", declaration.get("requested", True)) + ) + status = declaration.get("status") + if status is not None: + status = str(status) + cls._validate_projection_index_status(status) + projection_index_statuses[name] = status + continue + raise ValueError(f"Invalid projection index policy for index: {name}") + return projection_indexes, projection_index_statuses + + @staticmethod + def _validate_projection_index_status(status: str) -> None: + if status not in PROJECTION_INDEX_STATUSES: + raise ValueError(f"Unsupported projection index status: {status}") + + @classmethod + def _refresh_projection_index_statuses( + cls, + metadata_status: dict[str, Any], + metadata: dict[str, Any], + ) -> None: + summary_index = metadata_status.get("projection_indexes", {}).get("summary") + if not summary_index or not summary_index.get("requested"): + return + if "summary" not in metadata: + return + if summary_index.get("status", "not_indexed") == "not_indexed": + summary_index["status"] = "pending_index" + + @staticmethod + def _infer_metadata_field_type(value: Any) -> str: + if isinstance(value, bool): + return "boolean" + if isinstance(value, (int, float)): + return "number" + return "string" + + @staticmethod + def _infer_source_type(source_path: str) -> Optional[str]: + parts = [part for part in Path(source_path).parts if part not in ("", ".")] + return parts[0] if parts else None + + @staticmethod + def _scope_folder_path(scope: Optional[dict[str, Any]]) -> Optional[str]: + if not scope: + return None + path = scope.get("folder_path") or scope.get("path") + return normalize_path(path) if path else None + + @classmethod + def 
@staticmethod
def _source_type_filter_from_path(path: str) -> str:
    """Derive a ``source_type`` filter value from a folder path.

    A leading segment equal to the semantic folder root is dropped. The next
    segment is inspected: if it has the form ``source_type=<value>``, the
    value is returned with ``-`` replaced by ``_``.

    Args:
        path: Folder path.

    Returns:
        The source type encoded in the path, or ``""`` when there is none.
    """
    segments = [segment for segment in path.strip("/").split("/") if segment]
    if segments and segments[0] == SEMANTIC_FOLDER_ROOT.strip("/"):
        segments = segments[1:]
    if not segments:
        return ""
    first_segment = segments[0]
    if first_segment.startswith("source_type="):
        return first_segment.split("=", 1)[1].replace("-", "_")
    # Every other path shape yields no filter. The former extra check on the
    # semantic-root prefix returned "" on both arms and has been removed.
    return ""
@classmethod
def _semantic_folder_projection_fields_from_path(cls, path: str) -> list[str]:
    """Collect canonical field names encoded as ``field=value`` path segments.

    The path is validated and normalized first, and its first segment is
    ignored. Segments without ``=`` and names that canonicalize to an empty
    value are skipped.

    Args:
        path: Semantic folder projection path.

    Returns:
        Canonical field names in path order.

    Raises:
        ValueError: Propagated from path validation.
    """
    checked_path = cls._validate_semantic_folder_projection_path(path)
    _first, *remaining_segments = checked_path.strip("/").split("/")
    canonical_names = (
        cls._canonical_semantic_folder_field_name(segment.partition("=")[0])
        for segment in remaining_segments
        if "=" in segment
    )
    return [name for name in canonical_names if name]
@staticmethod + def _canonical_semantic_folder_field_name(value: Any) -> str: + return canonical_semantic_folder_field_name(value) + + @staticmethod + def _semantic_folder_projection_document_id(membership: dict[str, Any]) -> str: + for key in ("file_key", "file_ref", "document_ref"): + value = str(membership.get(key) or "").strip() + if value: + return value + raise ValueError("Semantic Folder Projection membership is missing file_key or file_ref") + + @staticmethod + def _query_text(query: Union[str, list[str], None]) -> str: + if query is None: + return "" + if isinstance(query, list): + return " ".join(str(item) for item in query) + return str(query) + + @staticmethod + def _preferred_folder_path( + folder_paths: list[str], + scope_path: Optional[str], + fallback: str, + ) -> str: + if scope_path: + scoped = [ + path + for path in folder_paths + if path == scope_path or path.startswith(f"{scope_path.rstrip('/')}/") + ] + if scoped: + return sorted(scoped, key=lambda item: (len(item), item))[0] + non_root = [path for path in folder_paths if path != "/"] + if non_root: + return sorted(non_root, key=lambda item: (len(item), item))[0] + return fallback + + @staticmethod + def _parse_line_range(location: str) -> tuple[int, int]: + value = str(location).strip() + if "-" in value: + left, right = value.split("-", 1) + start, end = int(left), int(right) + else: + start = end = int(value) + if start < 1 or end < start: + raise ValueError(f"Invalid line range: {location}") + return start, end diff --git a/pageindex/filesystem/hybrid_projection.py b/pageindex/filesystem/hybrid_projection.py new file mode 100644 index 000000000..b49d49afa --- /dev/null +++ b/pageindex/filesystem/hybrid_projection.py @@ -0,0 +1,649 @@ +from __future__ import annotations + +import json +import os +import re +import sqlite3 +import struct +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from .semantic_index import SQLiteVecSemanticIndex, 
# Maps each retrieval channel to the base name of its projection index file;
# HybridProjectionSearchBackend opens "<index_dir>/<name>.sqlite" for each entry.
INDEX_BY_CHANNEL = {
    "metadata": "metadata_composite_vector",
    "summary": "summary_only_vector",
    "entity": "entity_vectors",
    "constraint": "constraint_vectors",
    "relation": "relation_vectors",
}
# Channels fused by HybridProjectionSearchBackend.search().
HYBRID_ENTITY_RELATION_CHANNELS = ("metadata", "entity", "constraint", "relation")
# Channels that may be queried individually through search_channel().
SEMANTIC_TOOL_CHANNELS = ("summary", "entity", "relation")
# Per-channel weights applied to the reciprocal-rank contribution during fusion.
HYBRID_ENTITY_RELATION_WEIGHTS = {
    "metadata": 0.25,
    "entity": 0.25,
    "relation": 0.30,
    "constraint": 0.20,
}


@dataclass(frozen=True)
class QueryProjection:
    """Heuristic decomposition of a user question into per-channel query parts."""

    # Identifier-like and keyword terms extracted from the question.
    entities: list[str]
    # "subject | predicate | question" strings used as the relation-channel query.
    relations: list[str]
    # Constraint phrases and numeric quantities extracted from the question.
    constraints: list[str]
    # Coarse label produced by infer_answer_type(); empty when not set.
    expected_answer_type: str = ""


@dataclass(frozen=True)
class HybridProjectionCandidate:
    """One ranked document candidate returned by the projection search backend."""

    # Taken from the hit's external_id, falling back to its file_ref.
    document_id: str
    # Fused (or single-channel reciprocal-rank) score; higher ranks first.
    score: float
    # One {"channel", "rank", "distance"} entry per channel that returned the document.
    sources: list[dict[str, Any]]
    source_type: str
    source_path: str
    title: str
    # Metadata dict copied from the index hit.
    metadata: dict[str, Any]
    # Short human-readable explanation of why the candidate was returned.
    snippet: str
class HybridProjectionSearchBackend:
    """Hybrid entity/relation/vector retrieval over rebuildable projection indexes.

    The SQLite catalog remains the source of truth. This backend only reads
    external sqlite-vec projection indexes and returns candidate document ids
    for the catalog to resolve and filter.
    """

    def __init__(
        self,
        index_dir: str | Path,
        *,
        embedder: Any,
        embedding_provider: str,
        embedding_model: str,
        embedding_dimensions: int = 256,
        embedding_cache_path: str | Path | None = None,
        per_channel_limit: int = 100,
        fetch_multiplier: int = 100,
    ) -> None:
        """Bind the backend to an index directory and an embedder.

        Args:
            index_dir: Directory holding one ``<name>.sqlite`` file per channel.
            embedder: Object exposing ``embed(texts)``; used via the cache.
            embedding_provider: Provider label stored in the cache key.
            embedding_model: Model name; combined with the dimensions for the
                cache key.
            embedding_dimensions: Requested embedding size.
            embedding_cache_path: Cache database path; defaults to
                ``<index_dir>/embedding_cache.sqlite``.
            per_channel_limit: Minimum number of hits requested per channel
                during hybrid search.
            fetch_multiplier: Forwarded unchanged to each index search call.
        """
        self.index_dir = Path(index_dir).expanduser()
        self.embedder = embedder
        self.embedding_provider = embedding_provider
        self.embedding_model = embedding_model
        self.embedding_dimensions = embedding_dimensions
        self.cache_model = embedding_cache_model_key(embedding_model, embedding_dimensions)
        self.embedding_cache = EmbeddingCache(
            Path(embedding_cache_path).expanduser()
            if embedding_cache_path is not None
            else self.index_dir / "embedding_cache.sqlite"
        )
        self.per_channel_limit = per_channel_limit
        self.fetch_multiplier = fetch_multiplier
        # One index handle per channel, including channels whose file may not exist yet.
        self.indexes = {
            channel: SQLiteVecSemanticIndex(self.index_dir / f"{index_name}.sqlite")
            for channel, index_name in INDEX_BY_CHANNEL.items()
        }

    @classmethod
    def from_provider(
        cls,
        index_dir: str | Path,
        *,
        embedding_provider: str = "openai",
        embedding_model: str = "text-embedding-3-small",
        embedding_dimensions: int = 256,
        embedding_timeout: float = 60,
        **kwargs: Any,
    ) -> "HybridProjectionSearchBackend":
        """Build a backend whose embedder is created by ``make_embedder``.

        Extra keyword arguments are forwarded to ``__init__``.
        """
        return cls(
            index_dir,
            embedder=make_embedder(
                embedding_provider,
                embedding_model,
                dimensions=embedding_dimensions,
                timeout=embedding_timeout,
            ),
            embedding_provider=embedding_provider,
            embedding_model=embedding_model,
            embedding_dimensions=embedding_dimensions,
            **kwargs,
        )

    def search(
        self,
        query: str,
        *,
        limit: int = 10,
        filters: dict[str, Any] | None = None,
    ) -> list[HybridProjectionCandidate]:
        """Run the fused multi-channel search and return up to ``limit`` candidates.

        Only hybrid channels that currently hold documents are queried. When
        none do, the summary channel is used alone if it has documents;
        otherwise the result is empty. A blank query returns an empty list.
        """
        query = normalize_text(query)
        if not query:
            return []
        projection = heuristic_query_projection(query)
        channels = tuple(
            channel
            for channel in HYBRID_ENTITY_RELATION_CHANNELS
            if self._channel_document_count(channel) > 0
        )
        if not channels:
            if self._channel_document_count("summary") > 0:
                return self.search_channel("summary", query, limit=limit, filters=filters)
            return []
        channel_hits = self._search_channels(
            query=query,
            projection=projection,
            # Over-fetch per channel so fusion has enough candidates to rank.
            limit=max(limit, self.per_channel_limit),
            filters=filters,
            channels=channels,
        )
        return aggregate_hybrid_entity_relation(channel_hits, projection)[:limit]

    def search_channel(
        self,
        channel: str,
        query: str,
        *,
        limit: int = 10,
        filters: dict[str, Any] | None = None,
    ) -> list[HybridProjectionCandidate]:
        """Search a single semantic channel.

        Raises:
            ValueError: if ``channel`` is not one of ``SEMANTIC_TOOL_CHANNELS``.

        Returns an empty list when the channel has no documents or the query
        is blank.
        """
        if channel not in SEMANTIC_TOOL_CHANNELS:
            raise ValueError(f"unsupported semantic channel: {channel}")
        if channel not in self.available_channels():
            return []
        query = normalize_text(query)
        if not query:
            return []
        projection = heuristic_query_projection(query)
        vector = self.embedding_cache.embed_texts(
            [query_text_for_channel(channel, query, projection)],
            provider=self.embedding_provider,
            model=self.cache_model,
            embedder=self.embedder,
            batch_size=1,
        )[0]
        results = self.indexes[channel].search(
            vector,
            limit=limit,
            filters=filters,
            fetch_multiplier=self.fetch_multiplier,
        )
        return rank_single_semantic_channel(channel, results)

    def available_channels(self) -> tuple[str, ...]:
        """Return the tool channels whose index currently holds documents."""
        return tuple(
            channel
            for channel in SEMANTIC_TOOL_CHANNELS
            if self._channel_document_count(channel) > 0
        )

    def info(self) -> dict[str, Any]:
        """Describe the backend configuration and the state of every channel index."""
        return {
            "index_dir": str(self.index_dir),
            "embedding_provider": self.embedding_provider,
            "embedding_model": self.embedding_model,
            "embedding_dimensions": self.embedding_dimensions,
            "strategy": "hybrid_entity_relation_vector",
            "available_channels": list(self.available_channels()),
            "channels": {
                channel: self._safe_channel_info(channel)
                for channel in self.indexes
            },
        }

    def _channel_document_count(self, channel: str) -> int:
        """Return the channel's document count, or 0 when the index is unavailable."""
        info = self._safe_channel_info(channel)
        if not info.get("available"):
            return 0
        return int(info.get("document_count") or 0)

    def _safe_channel_info(self, channel: str) -> dict[str, Any]:
        """Return index info for ``channel`` without raising on a missing or broken index.

        A missing file or a failing ``index.info()`` call is reported as
        ``available=False`` with an ``error`` message; otherwise the index's
        own info is returned with ``available`` set from its document count.
        """
        index = self.indexes[channel]
        if not index.db_path.exists():
            return {
                "db_path": str(index.db_path),
                "available": False,
                "document_count": 0,
                "error": "index file is missing",
            }
        try:
            info = index.info()
        except (OSError, sqlite3.Error, SemanticIndexError) as exc:
            return {
                "db_path": str(index.db_path),
                "available": False,
                "document_count": 0,
                "error": str(exc),
            }
        return {**info, "available": int(info.get("document_count") or 0) > 0}

    def _search_channels(
        self,
        *,
        query: str,
        projection: QueryProjection,
        limit: int,
        filters: dict[str, Any] | None,
        channels: tuple[str, ...],
    ) -> dict[str, list[SemanticSearchResult]]:
        """Embed one query text per channel and search each channel's index.

        Returns a mapping of channel name to that channel's raw search hits.
        """
        query_texts = {
            channel: query_text_for_channel(channel, query, projection)
            for channel in channels
        }
        vectors = self.embedding_cache.embed_texts(
            [query_texts[channel] for channel in channels],
            provider=self.embedding_provider,
            model=self.cache_model,
            embedder=self.embedder,
            batch_size=1,
        )
        # embed_texts returns vectors in input order, so zip pairs them with channels.
        return {
            channel: self.indexes[channel].search(
                vector,
                limit=limit,
                filters=filters,
                fetch_multiplier=self.fetch_multiplier,
            )
            for channel, vector in zip(channels, vectors)
        }
class EmbeddingCache:
    """SQLite-backed cache of embedding vectors keyed by provider, model and text hash."""

    def __init__(self, db_path: Path):
        """Create the cache database (and its parent directory) if missing."""
        self.db_path = db_path
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = self.connect()
        try:
            # "with conn" only scopes the transaction; the connection itself
            # must be closed explicitly or it leaks until garbage collection.
            with conn:
                conn.execute(
                    """
                    CREATE TABLE IF NOT EXISTS embedding_cache (
                        provider TEXT NOT NULL,
                        model TEXT NOT NULL,
                        text_hash TEXT NOT NULL,
                        dimension INTEGER NOT NULL,
                        vector_blob BLOB,
                        vector_json TEXT,
                        created_at TEXT DEFAULT CURRENT_TIMESTAMP,
                        PRIMARY KEY(provider, model, text_hash)
                    )
                    """
                )
        finally:
            conn.close()

    def connect(self) -> sqlite3.Connection:
        """Open a new connection with name-addressable rows; the caller closes it."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        return conn

    def embed_texts(
        self,
        texts: list[str],
        *,
        provider: str,
        model: str,
        embedder: Any,
        batch_size: int,
    ) -> list[list[float]]:
        """Return one vector per input text, embedding only what is not cached.

        Args:
            texts: Texts to embed; duplicates are embedded once.
            provider: Provider part of the cache key.
            model: Model part of the cache key.
            embedder: Object exposing ``embed(texts)``.
            batch_size: Maximum texts per embedder call (values below 1 act as 1).

        Returns:
            Vectors in the same order as ``texts``.

        Raises:
            ValueError: if the embedder returns a different number of vectors
                than texts it was given.
        """
        hashes = [SQLiteVecSemanticIndex.text_hash(text) for text in texts]
        if not hashes:
            return []

        cached: dict[str, list[float]] = {}
        conn = self.connect()
        try:
            for text_hash in sorted(set(hashes)):
                row = conn.execute(
                    """
                    SELECT vector_blob, vector_json
                    FROM embedding_cache
                    WHERE provider = ? AND model = ? AND text_hash = ?
                    """,
                    (provider, model, text_hash),
                ).fetchone()
                if row is not None:
                    cached[text_hash] = decode_vector(row["vector_blob"], row["vector_json"])
        finally:
            conn.close()

        # Keep only the first position of each uncached hash so repeated texts
        # cost a single embedding request.
        missing_positions: list[int] = []
        queued: set[str] = set()
        for index, text_hash in enumerate(hashes):
            if text_hash in cached or text_hash in queued:
                continue
            queued.add(text_hash)
            missing_positions.append(index)

        step = max(1, batch_size)
        for start in range(0, len(missing_positions), step):
            positions = missing_positions[start : start + step]
            batch_texts = [texts[index] for index in positions]
            vectors = embed_with_retry(embedder, batch_texts)
            if len(vectors) != len(positions):
                raise ValueError(
                    f"embedder returned {len(vectors)} vectors for {len(positions)} texts"
                )
            conn = self.connect()
            try:
                with conn:
                    conn.executemany(
                        """
                        INSERT OR REPLACE INTO embedding_cache(
                            provider, model, text_hash, dimension, vector_blob, vector_json
                        )
                        VALUES (?, ?, ?, ?, ?, '')
                        """,
                        [
                            (
                                provider,
                                model,
                                hashes[index],
                                len(vector),
                                encode_vector(vector),
                            )
                            for index, vector in zip(positions, vectors)
                        ],
                    )
            finally:
                conn.close()
            for index, vector in zip(positions, vectors):
                cached[hashes[index]] = vector
        return [cached[text_hash] for text_hash in hashes]
class EmbeddingClient:
    """Thin wrapper around the OpenAI embeddings endpoint.

    Credentials come from ``PIFS_EMBEDDING_API_KEY`` / ``OPENAI_API_KEY`` and
    the optional base URL from ``PIFS_EMBEDDING_BASE_URL`` / ``OPENAI_BASE_URL``.
    """

    def __init__(self, *, provider: str, model: str, dimensions: int, timeout: float):
        self.provider = provider.lower()
        self.model = model
        self.dimensions = dimensions
        if self.provider != "openai":
            raise ValueError(f"unknown embedding provider: {provider}")
        # Imported lazily so the module loads without the openai package.
        from openai import OpenAI

        env = os.environ
        api_key = env.get("PIFS_EMBEDDING_API_KEY") or env.get("OPENAI_API_KEY")
        base_url = env.get("PIFS_EMBEDDING_BASE_URL") or env.get("OPENAI_BASE_URL")
        if not api_key:
            raise ValueError(
                "PIFS_EMBEDDING_API_KEY or OPENAI_API_KEY is required for PIFS embeddings"
            )
        self.client = OpenAI(api_key=api_key, base_url=base_url or None, timeout=timeout)

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed ``texts`` and return the vectors ordered by the response's item index."""
        request: dict[str, Any] = {"model": self.model, "input": texts}
        # A non-positive dimension means "use the model default".
        if self.dimensions > 0:
            request["dimensions"] = self.dimensions
        response = self.client.embeddings.create(**request)
        ordered = sorted(response.data, key=lambda item: item.index)
        return [list(item.embedding) for item in ordered]


def make_embedder(provider: str, model: str, *, dimensions: int, timeout: float) -> Any:
    """Construct the embedder for ``provider`` (currently always ``EmbeddingClient``)."""
    return EmbeddingClient(
        provider=provider,
        model=model,
        dimensions=dimensions,
        timeout=timeout,
    )


def query_text_for_channel(channel: str, query: str, projection: QueryProjection) -> str:
    """Choose the text to embed for ``channel``.

    Metadata and summary use the raw query. Entity, constraint and relation
    use the matching projection part, falling back to the raw query when that
    part is empty. Unknown channels raise ``ValueError``.
    """
    if channel in {"metadata", "summary"}:
        return query
    if channel == "entity":
        projected = compact_join(projection.entities, limit=24)
    elif channel == "constraint":
        projected = compact_join(projection.constraints, limit=24)
    elif channel == "relation":
        projected = "\n".join(projection.relations)
    else:
        raise ValueError(f"unknown semantic channel: {channel}")
    return projected or query


def rank_single_semantic_channel(
    channel: str,
    results: list[SemanticSearchResult],
) -> list[HybridProjectionCandidate]:
    """Convert one channel's hits into candidates, keeping each document's best hit.

    The score is the reciprocal-rank value ``1 / (60 + rank)``, where ``rank``
    is the 1-based position in ``results`` (duplicates still consume a rank).
    """
    ranked: list[HybridProjectionCandidate] = []
    emitted: set[str] = set()
    for position, hit in enumerate(results, start=1):
        document_id = str(hit.external_id or hit.file_ref)
        if document_id in emitted:
            continue
        emitted.add(document_id)
        candidate = HybridProjectionCandidate(
            document_id=document_id,
            score=1 / (60 + position),
            sources=[{"channel": channel, "rank": position, "distance": hit.distance}],
            source_type=hit.source_type,
            source_path=hit.source_path,
            title=hit.title,
            metadata=hit.metadata,
            snippet=f"{channel}_vector rank={position}",
        )
        ranked.append(candidate)
    return ranked
def aggregate_hybrid_entity_relation(
    channel_hits: dict[str, list[SemanticSearchResult]],
    projection: QueryProjection,
) -> list[HybridProjectionCandidate]:
    """Fuse per-channel hits into one ranked candidate list.

    Each channel contributes ``weight * 1 / (60 + rank)`` for a document's
    first hit in that channel. An exact-match bonus is added per document,
    and candidates are ordered by score (descending), then best rank, then
    document id.
    """
    merged: dict[str, dict[str, Any]] = {}
    for channel, hits in channel_hits.items():
        weight = HYBRID_ENTITY_RELATION_WEIGHTS[channel]
        counted: set[str] = set()
        for position, hit in enumerate(hits, start=1):
            document_id = str(hit.external_id or hit.file_ref)
            if document_id in counted:
                continue
            counted.add(document_id)
            if document_id not in merged:
                # Descriptive fields come from the first channel that saw the document.
                merged[document_id] = {
                    "document_id": document_id,
                    "score": 0.0,
                    "sources": [],
                    "source_type": hit.source_type,
                    "source_path": hit.source_path,
                    "title": hit.title,
                    "metadata": hit.metadata,
                }
            entry = merged[document_id]
            entry["score"] += weight * (1 / (60 + position))
            entry["sources"].append(
                {"channel": channel, "rank": position, "distance": hit.distance}
            )

    candidates: list[HybridProjectionCandidate] = []
    for entry in merged.values():
        entry["score"] += exact_match_bonus(entry, projection)
        candidates.append(
            HybridProjectionCandidate(
                document_id=entry["document_id"],
                score=float(entry["score"]),
                sources=entry["sources"],
                source_type=entry["source_type"],
                source_path=entry["source_path"],
                title=entry["title"],
                metadata=entry["metadata"],
                snippet=hybrid_snippet(entry),
            )
        )

    def sort_key(candidate: HybridProjectionCandidate) -> tuple[float, int, str]:
        best_rank = min(source["rank"] for source in candidate.sources)
        return (-candidate.score, best_rank, candidate.document_id)

    return sorted(candidates, key=sort_key)


def exact_match_bonus(item: dict[str, Any], projection: QueryProjection) -> float:
    """Return a small score bonus for query terms found verbatim in the document.

    The first 8 entities and first 6 constraints are searched for, case
    insensitively, in the JSON dump of the title, source path and metadata.
    Terms shorter than three characters are ignored. Each match adds 0.004,
    capped at 0.02.
    """
    searchable = json.dumps(
        {
            "title": item.get("title", ""),
            "source_path": item.get("source_path", ""),
            "metadata": item.get("metadata", {}),
        },
        ensure_ascii=False,
    ).lower()
    terms = [*projection.entities[:8], *projection.constraints[:6]]
    needles = (str(term).lower().strip() for term in terms)
    matched = sum(1 for needle in needles if len(needle) >= 3 and needle in searchable)
    return min(0.02, matched * 0.004)


def hybrid_snippet(item: dict[str, Any]) -> str:
    """Summarize which channels (up to four) ranked the document, plus its topic."""
    leading_sources = item.get("sources", [])[:4]
    channels = ", ".join(
        f"{source['channel']}@{source['rank']}" for source in leading_sources
    )
    snippet = f"hybrid_entity_relation_vector {channels}"
    topic = str((item.get("metadata") or {}).get("topic") or "").strip()
    if not topic:
        return snippet
    return "; ".join([snippet, f"topic: {topic}"])
def heuristic_query_projection(question: str) -> QueryProjection:
    """Derive entities, one relation and constraints from a question without an LLM.

    Entities are identifier-like terms followed by keywords (at most 16 after
    de-duplication). Constraints are constraint phrases followed by numeric
    quantities (at most 12). The single relation is
    ``"<first entity or 'question'> | <predicate> | <question>"``.
    """
    entity_pool = [*identifier_terms(question), *keyword_terms(question)[:16]]
    entities = dedupe(entity_pool)[:16]
    constraint_pool = [*extract_constraint_terms(question), *numeric_terms(question)]
    constraints = dedupe(constraint_pool)[:12]
    predicate = infer_query_predicate(question)
    subject = entities[0] if entities else "question"
    relation = f"{subject} | {predicate} | {question}"
    return QueryProjection(
        entities=entities,
        relations=[relation],
        constraints=constraints,
        expected_answer_type=infer_answer_type(question),
    )


def compact_join(values: list[str], *, limit: int) -> str:
    """Join the first ``limit`` values with ``" | "``."""
    head = values[:limit]
    return " | ".join(head)


def identifier_terms(text: str) -> list[str]:
    """Collect identifier-looking tokens from ``text``.

    Four patterns are applied in order and their matches concatenated, so the
    same token can appear more than once: ticket-style ids (``ABC-123``),
    ``key=value`` / ``key: value`` pairs, hyphen/underscore/plus compounds,
    and tokens starting with two or more capitals.
    """
    patterns = (
        r"\b[A-Z]{2,12}-\d{2,}\b",
        r"\b[A-Za-z_][A-Za-z0-9_]{2,}\b\s*(?:=|:)\s*[A-Za-z0-9_.:/-]+",
        r"\b[A-Za-z][A-Za-z0-9_+-]+(?:[-_+][A-Za-z0-9]+)+\b",
        r"\b[A-Z]{2,}[A-Za-z0-9_-]*\b",
    )
    return [
        match.strip()
        for pattern in patterns
        for match in re.findall(pattern, text)
    ]


def keyword_terms(text: str) -> list[str]:
    """Return lower-cased, de-duplicated word tokens of 3+ characters, minus stopwords."""
    stopwords = {
        "about",
        "after",
        "also",
        "and",
        "are",
        "for",
        "from",
        "how",
        "into",
        "the",
        "this",
        "that",
        "what",
        "when",
        "where",
        "which",
        "with",
    }
    lowered_tokens = (
        token.lower() for token in re.findall(r"[A-Za-z][A-Za-z0-9_+-]{2,}", text)
    )
    return dedupe([token for token in lowered_tokens if token not in stopwords])


def extract_constraint_terms(text: str) -> list[str]:
    """Extract constraint phrases: modal/limit clauses and ``key=value`` pairs.

    Matching is case-insensitive; results are de-duplicated in first-seen order.
    """
    patterns = (
        r"\b(?:must|should|required|requires?|default(?:s)?|limit(?:s)?|maximum|minimum)\b[^.!?\n]{0,120}",
        r"\b[A-Za-z_][A-Za-z0-9_]{2,}\s*(?:=|:)\s*[A-Za-z0-9_.:/-]+",
    )
    found = [
        match.strip()
        for pattern in patterns
        for match in re.findall(pattern, text, flags=re.IGNORECASE)
    ]
    return dedupe(found)
def numeric_terms(text: str) -> list[str]:
    """Return numeric quantities with a recognized unit, e.g. ``"10 MB"`` or ``"50%"``.

    Matching is case-insensitive. Word-like units must end at a word boundary.
    ``%`` is matched without one, because ``\\b`` cannot match between ``%``
    and a following space or end of text, which would silently drop every
    ordinary percentage.
    """
    return re.findall(
        r"\b\d+(?:\.\d+)?\s*"
        r"(?:(?:MiB|GiB|MB|GB|ms|sec|seconds|minutes|hours|days|tokens?|req/s|rps)\b|%)",
        text,
        flags=re.IGNORECASE,
    )


def infer_query_predicate(question: str) -> str:
    """Map a question to a coarse predicate label by substring lookup.

    Rules are tried in order and the first one with any needle contained in
    the lower-cased question wins; ``"asks_about"`` is the fallback.
    """
    lowered = question.lower()
    rules = [
        ("asks_default", ["default", "defaults"]),
        ("asks_limit", ["limit", "maximum", "minimum", "size"]),
        ("asks_cause", ["caused", "cause", "why"]),
        ("asks_owner", ["who", "owner", "assigned"]),
        ("asks_deadline", ["when", "deadline", "date"]),
        ("asks_status", ["status", "state"]),
        ("asks_requirement", ["required", "requirement", "must"]),
    ]
    for predicate, needles in rules:
        if any(needle in lowered for needle in needles):
            return predicate
    return "asks_about"


def infer_answer_type(question: str) -> str:
    """Guess the expected answer type of a question; ``"fact"`` is the fallback."""
    lowered = question.lower()
    if "how many" in lowered or "limit" in lowered or "size" in lowered:
        return "number_or_limit"
    if lowered.startswith("who"):
        return "person_or_team"
    if lowered.startswith("when"):
        return "date_or_time"
    if "why" in lowered or "caused" in lowered:
        return "cause"
    return "fact"


def dedupe(values: Any) -> list[str]:
    """Return whitespace-normalized strings in first-seen order.

    Blank values are dropped and duplicates are detected case-insensitively;
    the first spelling encountered is the one kept.
    """
    seen: set[str] = set()
    result: list[str] = []
    for value in values:
        normalized = re.sub(r"\s+", " ", str(value)).strip()
        key = normalized.lower()
        if not normalized or key in seen:
            continue
        seen.add(key)
        result.append(normalized)
    return result


def normalize_text(text: str) -> str:
    """Collapse whitespace runs to single spaces and trim; ``None`` becomes ``""``."""
    return re.sub(r"\s+", " ", str(text or "")).strip()


def embedding_cache_model_key(model: str, dimensions: int) -> str:
    """Return the cache key for a model, qualified by dimensions when positive."""
    return f"{model}:dimensions={dimensions}" if dimensions > 0 else model


def embed_with_retry(embedder: Any, texts: list[str], *, max_attempts: int = 8) -> list[list[float]]:
    """Call ``embedder.embed(texts)``, retrying failures with exponential backoff.

    The wait before retry ``k`` is ``2 ** (k - 1)`` seconds, capped at 120.
    The last failure is re-raised unchanged. Every exception type is retried
    because the embedder is an opaque object supplied by the caller.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            return embedder.embed(texts)
        except Exception:
            if attempt >= max_attempts:
                raise
            time.sleep(min(120.0, 2.0 ** (attempt - 1)))
    # Only reachable when max_attempts < 1, i.e. the loop body never ran.
    raise RuntimeError("unreachable embedding retry state")
def encode_vector(vector: list[float]) -> bytes:
    """Pack ``vector`` as consecutive little-endian 32-bit floats."""
    layout = f"<{len(vector)}f"
    return struct.pack(layout, *vector)


def decode_vector(blob: bytes | None, vector_json: str | None) -> list[float]:
    """Decode a cached vector, preferring the binary blob over the JSON text.

    Raises:
        ValueError: if the blob length is not a multiple of four bytes, or
            neither column holds a usable vector.
    """
    if blob:
        count, leftover = divmod(len(blob), 4)
        if leftover:
            raise ValueError("invalid cached vector blob length")
        return list(struct.unpack(f"<{count}f", blob))
    if vector_json:
        parsed = json.loads(vector_json)
        if isinstance(parsed, list):
            return [float(component) for component in parsed]
    raise ValueError("cached embedding row does not contain a vector")
class MetadataQueryError(ValueError):
    """Raised for invalid metadata schemas, field names, or filter expressions."""


class MetadataQueryEngine:
    """Validate metadata schemas and JSON filter expressions against a store.

    The store must provide ``upsert_metadata_fields``, ``metadata_field_exists``
    and ``list_metadata_fields``.
    """

    FIELD_RE = re.compile(r"^[A-Za-z][A-Za-z0-9_]*$")
    OPERATORS = {"$eq", "$ne", "$in", "$gt", "$gte", "$lt", "$lte", "$contains"}
    LOGICAL_OPERATORS = {"$and", "$or"}
    # Names users commonly mistake for metadata fields when they mean a folder path.
    FOLDER_SCOPE_FIELD_HINTS = {"path", "folder", "folders", "folder_path", "folder_paths"}
    MAX_DEPTH = 5

    def __init__(self, store: Any):
        self.store = store

    def register_schema(self, schema: dict[str, Any], source: str = "manual") -> None:
        """Validate a schema and upsert its fields into the store.

        ``schema`` is either ``{"fields": {...}}`` or the fields mapping
        itself. Each declaration is a type string or a dict with ``type`` and
        optional ``description``; supported types are string, number and
        boolean.

        Raises:
            MetadataQueryError: on a malformed schema, field name, or type.
        """
        fields = []
        raw_fields = schema.get("fields", schema)
        if not isinstance(raw_fields, dict):
            raise MetadataQueryError("metadata schema must contain a fields object")
        for name, declaration in raw_fields.items():
            name = str(name)
            self.validate_field_name(name)
            if isinstance(declaration, str):
                field_type = declaration
                description = ""
            elif isinstance(declaration, dict):
                field_type = str(declaration.get("type", ""))
                description = str(declaration.get("description", ""))
            else:
                raise MetadataQueryError(f"Invalid schema declaration for field: {name}")
            if field_type not in {"string", "number", "boolean"}:
                raise MetadataQueryError(f"Unsupported metadata field type for {name}: {field_type}")
            fields.append(
                MetadataField(
                    name=name,
                    field_type=field_type,
                    description=description,
                    source=source,
                )
            )
        if fields:
            self.store.upsert_metadata_fields(fields)

    def parse_filter(self, value: str | dict[str, Any] | None) -> dict[str, Any] | None:
        """Parse and validate a filter given as a JSON string or dict.

        Returns ``None`` for ``None`` or an empty string.
        """
        if value is None or value == "":
            return None
        if isinstance(value, str):
            value = self.parse_dsl(value)
        if not isinstance(value, dict):
            raise MetadataQueryError("metadata_filter must be a JSON object")
        self.validate_filter(value)
        return value

    def parse_dsl(self, dsl: str) -> dict[str, Any]:
        """Decode a JSON-object filter string; raise ``MetadataQueryError`` otherwise."""
        try:
            parsed = json.loads(dsl)
        except json.JSONDecodeError as exc:
            raise MetadataQueryError(
                "metadata DSL must be a JSON object, for example "
                '\'{"$and":[{"repo":"redwood"},{"year":{"$gte":2024}}]}\''
            ) from exc
        if not isinstance(parsed, dict):
            raise MetadataQueryError("metadata DSL must be a JSON object")
        return parsed

    def validate_filter(self, metadata_filter: dict[str, Any], depth: int = 1) -> None:
        """Validate a filter tree: known fields, supported operators, bounded depth."""
        if depth > self.MAX_DEPTH:
            raise MetadataQueryError(f"metadata_filter nesting depth exceeds {self.MAX_DEPTH}")
        if not metadata_filter:
            return
        for key, condition in metadata_filter.items():
            if key in self.LOGICAL_OPERATORS:
                self._validate_logical(key, condition, depth)
                continue
            self.validate_field(key)
            self._validate_field_condition(key, condition)

    def _validate_logical(self, operator: str, condition: Any, depth: int) -> None:
        """Validate ``$and`` / ``$or``: a non-empty list of nested filter objects."""
        if not isinstance(condition, list) or not condition:
            raise MetadataQueryError(f"{operator} requires a non-empty list")
        for item in condition:
            if not isinstance(item, dict):
                raise MetadataQueryError(f"{operator} items must be metadata filter objects")
            self.validate_filter(item, depth + 1)

    def _validate_field_condition(self, field: str, condition: Any) -> None:
        """Validate the right-hand side of a field: a scalar or one ``{$op: value}``."""
        is_operator_form = isinstance(condition, dict) and any(
            str(key).startswith("$") for key in condition
        )
        if not is_operator_form:
            # Plain value means implicit equality.
            self._validate_scalar(condition, context=field)
            return
        if len(condition) != 1:
            raise MetadataQueryError(
                f"Field {field} condition must contain exactly one metadata operator"
            )
        operator, expected = next(iter(condition.items()))
        if operator not in self.OPERATORS:
            raise MetadataQueryError(f"Unsupported metadata operator: {operator}")
        if operator == "$in":
            if not isinstance(expected, list):
                raise MetadataQueryError(f"{field} $in requires a list")
            for item in expected:
                self._validate_scalar(item, context=f"{field} $in")
            return
        if operator == "$contains":
            self._validate_scalar(expected, context=f"{field} $contains")
            return
        if operator in {"$gt", "$gte", "$lt", "$lte"}:
            self._validate_range_value(expected, context=f"{field} {operator}")
            return
        self._validate_scalar(expected, context=f"{field} {operator}")

    def validate_field(self, field: str) -> None:
        """Require ``field`` to be well-formed and registered in the store."""
        self.validate_field_name(field)
        if not self.store.metadata_field_exists(field):
            if field in self.FOLDER_SCOPE_FIELD_HINTS:
                raise MetadataQueryError(
                    f"Unknown metadata field: {field}. Folder paths are positional PIFS paths, "
                    "not metadata fields; use `ls /documents` or `find /documents -type f`. "
                    "Use --where only with fields from `stat --schema`."
                )
            raise MetadataQueryError(f"Unknown metadata field: {field}")

    def validate_field_name(self, field: str) -> None:
        """Require ``field`` to be a string matching ``FIELD_RE`` in full.

        ``fullmatch`` is used because ``$`` in ``FIELD_RE`` also matches just
        before a trailing newline, so ``match`` would accept ``"name\\n"``.
        Non-string names are rejected with the same error instead of a
        ``TypeError``.
        """
        if not isinstance(field, str) or not self.FIELD_RE.fullmatch(field):
            raise MetadataQueryError(f"Invalid metadata field: {field}")

    def export_schema(self) -> dict[str, Any]:
        """Return the store's fields as ``{"fields": {name: {type, description}}}``."""
        fields = {}
        for field in self.store.list_metadata_fields():
            fields[field.name] = {
                "type": field.field_type,
                "description": field.description,
            }
        return {"fields": fields}

    @staticmethod
    def _validate_scalar(value: Any, *, context: str) -> None:
        """Accept only strings, numbers and booleans."""
        if isinstance(value, (bool, int, float, str)):
            return
        raise MetadataQueryError(f"{context} must be a string, number, or boolean")

    @staticmethod
    def _validate_range_value(value: Any, *, context: str) -> None:
        """Accept only strings and numbers; booleans have no ordering here."""
        if isinstance(value, bool) or not isinstance(value, (int, float, str)):
            raise MetadataQueryError(f"{context} must be a string or number")
# Names of the metadata fields that can be produced by generation.
GENERATED_METADATA_FIELDS = ("summary", "doc_type", "domain", "topic", "entity", "relation")


class MetadataGenerationError(RuntimeError):
    """Raised when metadata generation cannot run or is misconfigured."""

    pass


@dataclass(frozen=True)
class MetadataGenerationInput:
    """Everything a metadata generation backend receives about one document."""

    file_ref: str
    external_id: str | None
    title: str
    source_path: str
    content_type: str
    source_type: str | None
    # Document text; the default MetadataGenerator truncates it to max_text_chars.
    text: str
    # Existing source metadata for the document.
    metadata: dict[str, Any] = field(default_factory=dict)
    # NOTE(review): presumably the on-disk location of the extracted text; not read in this module.
    text_artifact_path: str | None = None


@dataclass(frozen=True)
class MetadataGenerationResult:
    """Outcome of one generation request."""

    # Generated values keyed by field name.
    values: dict[str, Any] = field(default_factory=dict)
    # NOTE(review): presumably error messages keyed by field name; the default
    # MetadataGenerator never populates it - confirm against callers.
    failures: dict[str, str] = field(default_factory=dict)


class MetadataGenerationBackend(Protocol):
    """Structural interface for objects that can generate metadata fields."""

    def generate(
        self,
        request: MetadataGenerationInput,
        *,
        fields: list[str],
    ) -> MetadataGenerationResult | dict[str, Any]:
        """Generate the requested ``fields`` for ``request``."""
        ...
class MetadataGenerator:
    """Default product generator for retrieval metadata.

    Lives under pageindex.filesystem rather than benchmark paths. It works on
    registered text today; callers can supply PageIndex-extracted text through
    the same MetadataGenerationInput without an API change. The provider is an
    instance parameter, not a provider-specific public class name.
    """

    def __init__(
        self,
        *,
        provider: str | None = None,
        model: str | None = None,
        base_url: str | None = None,
        max_text_chars: int = 24000,
    ):
        env = os.environ
        chosen_provider = provider or env.get("PIFS_METADATA_PROVIDER", "openai")
        self.provider = chosen_provider.lower()
        self.model = model or env.get("PIFS_METADATA_MODEL", "gpt-5-nano")
        if base_url is not None:
            # An explicit value, even an empty string, overrides the environment.
            self.base_url = base_url
        else:
            self.base_url = env.get("PIFS_METADATA_BASE_URL") or env.get("OPENAI_BASE_URL")
        self.max_text_chars = max_text_chars

    def generate(
        self,
        request: MetadataGenerationInput,
        *,
        fields: list[str],
    ) -> MetadataGenerationResult:
        """Generate ``fields`` for ``request`` with the configured provider.

        Raises:
            MetadataGenerationError: if the provider is not ``openai``.
        """
        if self.provider == "openai":
            return self._generate_openai(request, fields=fields)
        raise MetadataGenerationError(f"unsupported metadata provider: {self.provider}")

    def _generate_openai(
        self,
        request: MetadataGenerationInput,
        *,
        fields: list[str],
    ) -> MetadataGenerationResult:
        """Ask the chat completions API for strict-JSON metadata and parse the reply.

        Only requested fields present in the reply are returned.
        """
        api_key = os.environ.get("PIFS_METADATA_API_KEY") or os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise MetadataGenerationError(
                "PIFS_METADATA_API_KEY or OPENAI_API_KEY is required for PIFS metadata generation"
            )

        # Imported lazily so the module loads without the openai package.
        from openai import OpenAI

        system_prompt = (
            "Generate grounded retrieval metadata for one document. "
            "Use only the provided document text and ordinary source metadata. "
            "The summary must be a retrieval summary, not a title rewrite. "
            "Do not use filenames, paths, URLs, storage URIs, or outside knowledge. "
            "Return strict JSON matching the requested fields."
        )
        user_payload = {
            "requested_fields": fields,
            "document": {
                "title": request.title,
                "source_type": request.source_type,
                "content_type": request.content_type,
                "metadata": request.metadata,
                "text": request.text[: self.max_text_chars],
            },
        }
        client = OpenAI(api_key=api_key, base_url=self.base_url or None)
        response = client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": json.dumps(user_payload, ensure_ascii=False)},
            ],
            response_format=self._response_format(fields),
        )
        content = response.choices[0].message.content or "{}"
        values = json.loads(content)
        selected = {name: values[name] for name in fields if name in values}
        return MetadataGenerationResult(values=selected)

    @staticmethod
    def _response_format(fields: list[str]) -> dict[str, Any]:
        """Build the strict JSON-schema response format for ``fields``.

        Every supported field is a required string property.

        Raises:
            MetadataGenerationError: on the first unsupported field name.
        """
        string_fields = {"summary", "doc_type", "domain", "topic", "entity", "relation"}
        properties: dict[str, Any] = {}
        for name in fields:
            if name not in string_fields:
                raise MetadataGenerationError(
                    f"MetadataGenerator does not support generated metadata field: {name}"
                )
            properties[name] = {"type": "string"}
        schema = {
            "type": "object",
            "additionalProperties": False,
            "required": fields,
            "properties": properties,
        }
        return {
            "type": "json_schema",
            "json_schema": {
                "name": "pifs_metadata_generation",
                "strict": True,
                "schema": schema,
            },
        }
class SummaryProjectionIndexer:
    """Synchronous register-time summary projection indexer."""

    def __init__(
        self,
        index_dir: str | Path,
        *,
        embedder: Any,
        embedding_provider: str,
        embedding_model: str,
        embedding_dimensions: int = 256,
        embedding_cache_path: str | Path | None = None,
    ) -> None:
        """Prepare the index directory, embedding cache and summary index.

        Creates ``index_dir`` if needed and then creates or validates the
        summary index file (see ``_ensure_index``).

        Args:
            index_dir: Directory holding the summary index file.
            embedder: Object exposing ``embed(texts)``; used via the cache.
            embedding_provider: Provider label stored in the cache key.
            embedding_model: Model name; combined with the dimensions for the
                cache key.
            embedding_dimensions: Vector size the index must be built with.
            embedding_cache_path: Cache database path; defaults to
                ``<index_dir>/embedding_cache.sqlite``.
        """
        self.index_dir = Path(index_dir).expanduser()
        self.index_dir.mkdir(parents=True, exist_ok=True)
        self.embedder = embedder
        self.embedding_provider = embedding_provider
        self.embedding_model = embedding_model
        self.embedding_dimensions = embedding_dimensions
        self.cache_model = embedding_cache_model_key(embedding_model, embedding_dimensions)
        self.embedding_cache = EmbeddingCache(
            Path(embedding_cache_path).expanduser()
            if embedding_cache_path is not None
            else self.index_dir / "embedding_cache.sqlite"
        )
        self.index = SQLiteVecSemanticIndex(
            self.index_dir / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
        )
        self._ensure_index()

    @classmethod
    def from_provider(
        cls,
        index_dir: str | Path,
        *,
        embedding_provider: str = "openai",
        embedding_model: str = "text-embedding-3-small",
        embedding_dimensions: int = 256,
        embedding_timeout: float = 60,
        **kwargs: Any,
    ) -> "SummaryProjectionIndexer":
        """Build an indexer whose embedder is created by ``make_embedder``.

        Extra keyword arguments are forwarded to ``__init__``.
        """
        return cls(
            index_dir,
            embedder=make_embedder(
                embedding_provider,
                embedding_model,
                dimensions=embedding_dimensions,
                timeout=embedding_timeout,
            ),
            embedding_provider=embedding_provider,
            embedding_model=embedding_model,
            embedding_dimensions=embedding_dimensions,
            **kwargs,
        )

    def upsert_summary(self, record: dict[str, Any]) -> dict[str, Any]:
        """Embed a record's ``metadata["summary"]`` and upsert it into the index.

        Returns a ``{"status": "skipped", ...}`` dict when the summary is
        missing or blank; otherwise a ``{"status": "ready", ...}`` dict
        describing the index and embedding configuration. ``record`` must
        contain ``file_ref``.
        """
        summary = str((record.get("metadata") or {}).get("summary") or "").strip()
        if not summary:
            return {"status": "skipped", "reason": "missing_summary"}
        vector = self.embedding_cache.embed_texts(
            [summary],
            provider=self.embedding_provider,
            model=self.cache_model,
            embedder=self.embedder,
            batch_size=1,
        )[0]
        # Copy so the index record does not share the caller's metadata dict.
        metadata = dict(record.get("metadata") or {})
        count = self.index.upsert_many(
            [
                SemanticIndexRecord(
                    file_ref=str(record["file_ref"]),
                    vector=vector,
                    text=summary,
                    external_id=record.get("external_id"),
                    source_type=str(record.get("source_type") or ""),
                    source_path=str(record.get("source_path") or ""),
                    title=str(record.get("title") or ""),
                    metadata=metadata,
                )
            ]
        )
        return {
            "status": "ready",
            "indexed_rows": count,
            "index_path": str(self.index.db_path),
            "embedding_provider": self.embedding_provider,
            "embedding_model": self.embedding_model,
            "embedding_dimensions": self.embedding_dimensions,
        }

    def _ensure_index(self) -> None:
        """Create the index file if absent, else verify its vector dimension.

        An existing index is never reset here: an unreadable index or a
        dimension mismatch raises ``RuntimeError`` so existing data is kept.
        """
        if not self.index.db_path.exists():
            self.index.reset(
                dimension=self.embedding_dimensions,
                metadata=self._index_metadata(),
            )
            return
        try:
            existing_dimension = self.index.dimension()
        except Exception as exc:
            raise RuntimeError(
                "could not validate existing summary projection index config; "
                f"refusing to reset {self.index.db_path}. Move the existing index "
                "aside or rebuild it intentionally before changing embedding config."
            ) from exc
        if existing_dimension != self.embedding_dimensions:
            raise RuntimeError(
                "summary projection index dimension mismatch: "
                f"{self.index.db_path} was built with dimension {existing_dimension}, "
                f"but configured embedding_dimensions is {self.embedding_dimensions}. "
                "Use the matching embedding config, or rebuild the projection index "
                "at a new path after preserving the existing data."
            )

    def _index_metadata(self) -> dict[str, Any]:
        """Return the configuration recorded with a newly created index."""
        return {
            "channel": "summary",
            "embedding_provider": self.embedding_provider,
            "embedding_model": self.embedding_model,
            "embedding_dimensions": self.embedding_dimensions,
        }
def canonical_semantic_folder_field_name(value: Any) -> str:
    """Normalise a field name to lower snake_case.

    camelCase / PascalCase boundaries become underscores, every run of
    non-alphanumeric characters collapses to a single underscore, and leading
    or trailing underscores are dropped. Falsy or blank input yields ``""``.
    """
    stripped = str(value or "").strip()
    if stripped == "":
        return ""
    snake = stripped
    # First split "xWord" boundaries, then any remaining lower/digit -> upper step.
    for boundary_pattern in (r"(.)([A-Z][a-z]+)", r"([a-z0-9])([A-Z])"):
        snake = re.sub(boundary_pattern, r"\1_\2", snake)
    collapsed = re.sub(r"[^A-Za-z0-9]+", "_", snake)
    return collapsed.strip("_").casefold()


def compact_semantic_folder_field_name(value: Any) -> str:
    """Return the canonical name with every separator removed."""
    canonical_name = canonical_semantic_folder_field_name(value)
    return re.sub(r"[^a-z0-9]+", "", canonical_name)


def semantic_folder_field_identity_keys(value: Any) -> frozenset[str]:
    """Return the non-empty canonical and compact spellings of one field name."""
    spellings = (
        canonical_semantic_folder_field_name(value),
        compact_semantic_folder_field_name(value),
    )
    return frozenset(filter(None, spellings))


def semantic_folder_field_identity_set(fields: Iterable[Any]) -> frozenset[str]:
    """Return the union of identity keys over every name in ``fields``."""
    per_field_keys = (semantic_folder_field_identity_keys(item) for item in fields)
    return frozenset().union(*per_field_keys)
def is_semantic_folder_forbidden_field(value: Any) -> bool:
    """Tell whether any spelling of ``value`` matches a forbidden field."""
    identity_keys = semantic_folder_field_identity_keys(value)
    return not identity_keys.isdisjoint(SEMANTIC_FOLDER_FORBIDDEN_FIELD_IDENTITIES)


def semantic_folder_allowed_extension_fields(fields: Iterable[Any]) -> set[str]:
    """Return canonical names of the given fields that are not forbidden.

    Names that canonicalise to the empty string are dropped.
    """
    return {
        canonical_name
        for candidate in fields
        if (canonical_name := canonical_semantic_folder_field_name(candidate))
        and not is_semantic_folder_forbidden_field(candidate)
    }
def reset(self, *, dimension: int, metadata: dict[str, Any] | None = None) -> None:
    """Drop and recreate every table of the semantic index.

    Existing documents, vectors and config rows are discarded; the index is
    left empty and configured for vectors of length ``dimension``.

    Args:
        dimension: Length of the embedding vectors; must be positive.
        metadata: Optional JSON-serialisable mapping stored under the
            ``metadata`` key of the config table.

    Raises:
        SemanticIndexError: If ``dimension`` is not positive.
    """
    # Function-scope import keeps this method self-contained.
    from contextlib import closing

    if dimension <= 0:
        raise SemanticIndexError("semantic index dimension must be positive")
    # sqlite3's own context manager only commits or rolls back; closing() is
    # what actually releases the database handle when the block exits.
    with closing(self.connect()) as conn, conn:
        conn.executescript(
            """
            DROP TABLE IF EXISTS semantic_index_vec;
            DROP TABLE IF EXISTS semantic_index_docs;
            DROP TABLE IF EXISTS semantic_index_config;
            CREATE TABLE semantic_index_config (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL
            );
            CREATE TABLE semantic_index_docs (
                rowid INTEGER PRIMARY KEY,
                file_ref TEXT NOT NULL UNIQUE,
                external_id TEXT,
                source_type TEXT NOT NULL DEFAULT '',
                source_path TEXT NOT NULL DEFAULT '',
                title TEXT NOT NULL DEFAULT '',
                text_hash TEXT NOT NULL,
                text_chars INTEGER NOT NULL DEFAULT 0,
                metadata_json TEXT NOT NULL DEFAULT '{}',
                created_at TEXT DEFAULT CURRENT_TIMESTAMP,
                updated_at TEXT DEFAULT CURRENT_TIMESTAMP
            );
            CREATE INDEX idx_semantic_index_docs_file_ref
                ON semantic_index_docs(file_ref);
            CREATE INDEX idx_semantic_index_docs_external_id
                ON semantic_index_docs(external_id);
            CREATE INDEX idx_semantic_index_docs_source_type
                ON semantic_index_docs(source_type);
            """
        )
        # The vector width is part of the virtual table definition, so it
        # has to be interpolated rather than bound as a parameter.
        conn.execute(
            "CREATE VIRTUAL TABLE semantic_index_vec USING "
            f"vec0(source_type TEXT partition key, embedding float[{dimension}])"
        )
        config = {
            "dimension": str(dimension),
            "adapter": "sqlite-vec",
            "adapter_version": sqlite_vec.__version__,
            "metadata": json.dumps(metadata or {}, ensure_ascii=False, sort_keys=True),
        }
        conn.executemany(
            "INSERT INTO semantic_index_config(key, value) VALUES (?, ?)",
            sorted(config.items()),
        )
        conn.commit()
def search(
    self,
    vector: list[float],
    *,
    limit: int = 10,
    filters: dict[str, Any] | None = None,
    fetch_multiplier: int = 20,
) -> list[SemanticSearchResult]:
    """Return the nearest indexed documents to ``vector``.

    Args:
        vector: Query embedding; its length must equal the index dimension.
        limit: Maximum number of results. A non-positive limit returns ``[]``
            without querying the database.
        filters: Optional equality filters. ``source_type`` is pushed down to
            the vector table; every filter (including ``source_type``) is
            also checked against the row columns or stored metadata.
        fetch_multiplier: Over-fetch factor so post-filtering can still fill
            ``limit`` results; the fetch size is capped at 4096.

    Returns:
        Matching results ordered by ascending distance.

    Raises:
        SemanticIndexError: If ``vector`` has the wrong dimension.
    """
    # Function-scope import keeps this method self-contained.
    from contextlib import closing

    dimension = self.dimension()
    if len(vector) != dimension:
        raise SemanticIndexError(
            f"query vector dimension mismatch: expected {dimension}, got {len(vector)}"
        )
    if limit <= 0:
        # Nothing can be returned; avoid sending a zero/negative k to the KNN query.
        return []

    active_filters = filters or {}
    fetch_k = min(4096, max(limit, limit * max(fetch_multiplier, 1)))
    # Drop repeated source types (order preserved) so one partition is not
    # queried twice, which would yield duplicate hits.
    source_types = list(dict.fromkeys(_source_type_filters(active_filters)))
    query_blob = sqlite_vec.serialize_float32(vector)

    knn_select = """
        SELECT
            d.file_ref,
            d.external_id,
            d.source_type,
            d.source_path,
            d.title,
            d.text_hash,
            d.metadata_json,
            v.distance
        FROM semantic_index_vec v
        JOIN semantic_index_docs d ON d.rowid = v.rowid
        WHERE v.embedding MATCH ? AND k = ?
    """
    partition_clause = " AND v.source_type = ?"
    order_clause = " ORDER BY v.distance"

    # closing() releases the handle; sqlite3's own context manager would not.
    with closing(self.connect()) as conn:
        if source_types:
            rows = []
            for source_type in source_types:
                rows.extend(
                    conn.execute(
                        knn_select + partition_clause + order_clause,
                        (query_blob, fetch_k, source_type),
                    ).fetchall()
                )
            # Each partition is sorted on its own; merge into one global order.
            rows.sort(key=lambda row: float(row["distance"]))
        else:
            rows = conn.execute(
                knn_select + order_clause,
                (query_blob, fetch_k),
            ).fetchall()

    results: list[SemanticSearchResult] = []
    for row in rows:
        metadata = _json_obj(row["metadata_json"])
        if not _matches_filters(row, metadata, active_filters):
            continue
        results.append(
            SemanticSearchResult(
                file_ref=row["file_ref"],
                distance=float(row["distance"]),
                external_id=row["external_id"],
                source_type=row["source_type"],
                source_path=row["source_path"],
                title=row["title"],
                text_hash=row["text_hash"],
                metadata=metadata,
            )
        )
        if len(results) >= limit:
            break
    return results
def connect(self) -> sqlite3.Connection:
    """Open a connection to the index database with sqlite-vec loaded.

    Returns:
        A connection whose rows are ``sqlite3.Row`` objects. Extension
        loading is switched off again before the connection is returned.
        The caller is responsible for closing it.

    Raises:
        Exception: Whatever enabling or loading the extension raises; the
            connection is closed before the error propagates.
    """
    conn = sqlite3.connect(self.db_path)
    try:
        conn.row_factory = sqlite3.Row
        conn.enable_load_extension(True)
        try:
            sqlite_vec.load(conn)
        finally:
            # Never leave extension loading enabled, even when the load fails.
            conn.enable_load_extension(False)
    except Exception:
        # Do not leak a half-initialised handle to the caller or the GC.
        conn.close()
        raise
    return conn
def _json_obj(text: str | None) -> dict[str, Any]:
    """Decode ``text`` as a JSON object; anything else yields ``{}``."""
    if text:
        try:
            decoded = json.loads(text)
        except json.JSONDecodeError:
            decoded = None
        if isinstance(decoded, dict):
            return decoded
    return {}


def _matches_filters(
    row: sqlite3.Row,
    metadata: dict[str, Any],
    filters: dict[str, Any],
) -> bool:
    """Check that a row satisfies every filter.

    A filter key is looked up among the row's columns first and in
    ``metadata`` otherwise. Values are compared by their ``str()`` form; a
    list filter matches when the value equals any of its items.
    """

    def _accepts(key: str, expected: Any) -> bool:
        actual = metadata.get(key) if key not in row.keys() else row[key]
        if isinstance(expected, list):
            wanted = {str(item) for item in expected}
        else:
            wanted = {str(expected)}
        return str(actual) in wanted

    return all(_accepts(key, expected) for key, expected in filters.items())


def _source_type_filters(filters: dict[str, Any]) -> list[str]:
    """Return the non-empty ``source_type`` filter values as strings."""
    raw = filters.get("source_type")
    if raw is None:
        return []
    candidates = raw if isinstance(raw, list) else [raw]
    return [text for text in map(str, candidates) if text]
def connect(self) -> sqlite3.Connection:
    """Open a new connection to the database at ``self.db_path``.

    Rows come back as ``sqlite3.Row`` and foreign-key enforcement is switched
    on for this connection. The caller owns the returned connection.
    """
    connection = sqlite3.connect(self.db_path)
    connection.row_factory = sqlite3.Row
    # The pragma is issued on every new connection this method opens.
    connection.execute("PRAGMA foreign_keys = ON")
    return connection
PRIMARY KEY, + scope_path TEXT, + version INTEGER NOT NULL DEFAULT 1, + status TEXT NOT NULL DEFAULT 'active', + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + updated_at TEXT DEFAULT CURRENT_TIMESTAMP + ); + + CREATE TABLE IF NOT EXISTS metadata_fields ( + field_id TEXT PRIMARY KEY, + schema_id TEXT NOT NULL DEFAULT 'default', + name TEXT NOT NULL, + type TEXT NOT NULL, + description TEXT NOT NULL DEFAULT '', + indexed INTEGER NOT NULL DEFAULT 1, + faceted INTEGER NOT NULL DEFAULT 0, + sortable INTEGER NOT NULL DEFAULT 0, + source TEXT NOT NULL DEFAULT 'manual', + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + updated_at TEXT DEFAULT CURRENT_TIMESTAMP, + UNIQUE(schema_id, name), + FOREIGN KEY(schema_id) REFERENCES metadata_schema(schema_id) + ); + + CREATE TABLE IF NOT EXISTS metadata_values ( + file_ref TEXT NOT NULL, + field_id TEXT NOT NULL, + value_text TEXT, + value_number REAL, + value_bool INTEGER, + value_json TEXT, + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY(file_ref) REFERENCES files(file_ref) ON DELETE CASCADE, + FOREIGN KEY(field_id) REFERENCES metadata_fields(field_id) ON DELETE CASCADE + ); + + CREATE VIRTUAL TABLE IF NOT EXISTS file_fts + USING fts5(file_ref UNINDEXED, title, body, metadata_text); + + CREATE INDEX IF NOT EXISTS idx_files_external_id ON files(external_id); + CREATE INDEX IF NOT EXISTS idx_files_source_path ON files(source_path); + CREATE INDEX IF NOT EXISTS idx_files_source_type ON files(source_type); + CREATE INDEX IF NOT EXISTS idx_folders_path ON folders(path); + CREATE INDEX IF NOT EXISTS idx_folders_parent_id ON folders(parent_id); + CREATE INDEX IF NOT EXISTS idx_file_folders_folder ON file_folders(folder_id); + CREATE INDEX IF NOT EXISTS idx_metadata_fields_name ON metadata_fields(name); + CREATE INDEX IF NOT EXISTS idx_metadata_values_field_text ON metadata_values(field_id, value_text); + CREATE INDEX IF NOT EXISTS idx_metadata_values_field_number ON metadata_values(field_id, value_number); + """ + ) + 
@staticmethod
def _json_object(value: Any) -> dict[str, Any]:
    """Coerce ``value`` to a dict.

    Strings are parsed as JSON (an empty string counts as ``"{}"``); other
    values are taken as-is. Anything that is not a dict after that, including
    unparsable JSON, becomes ``{}``.
    """
    candidate = value
    if isinstance(value, str):
        try:
            candidate = json.loads(value or "{}")
        except json.JSONDecodeError:
            return {}
    if isinstance(candidate, dict):
        return candidate
    return {}
@staticmethod
def _file_insert_sql() -> str:
    """Build the ``INSERT OR REPLACE`` statement for the ``files`` table.

    Fifteen columns are bound as parameters, in the order produced by
    ``_file_insert_values``; ``deleted_at`` is written as NULL and
    ``updated_at`` as the current timestamp.
    """
    bound_columns = (
        "file_ref",
        "external_id",
        "storage_uri",
        "source_path",
        "title",
        "descriptor",
        "content_type",
        "source_type",
        "fingerprint",
        "text_artifact_path",
        "raw_artifact_path",
        "pageindex_doc_id",
        "pageindex_tree_status",
        "metadata_json",
        "metadata_status_json",
    )
    column_sql = ", ".join(bound_columns + ("deleted_at", "updated_at"))
    value_sql = ", ".join(("?",) * len(bound_columns) + ("NULL", "CURRENT_TIMESTAMP"))
    return f"""
        INSERT OR REPLACE INTO files ({column_sql})
        VALUES ({value_sql})
    """


@staticmethod
def _file_insert_values(record: dict[str, Any]) -> tuple[Any, ...]:
    """Return the bound parameters for ``_file_insert_sql`` from ``record``.

    ``pageindex_doc_id``, ``pageindex_tree_status`` and
    ``metadata_status_json`` fall back to ``None``, ``"not_built"`` and
    ``"{}"``; every other key must be present (``KeyError`` otherwise).
    """
    mandatory_keys = (
        "file_ref",
        "external_id",
        "storage_uri",
        "source_path",
        "title",
        "descriptor",
        "content_type",
        "source_type",
        "fingerprint",
        "text_artifact_path",
        "raw_artifact_path",
    )
    leading = tuple(record[key] for key in mandatory_keys)
    trailing = (
        record.get("pageindex_doc_id"),
        record.get("pageindex_tree_status", "not_built"),
        record["metadata_json"],
        record.get("metadata_status_json", "{}"),
    )
    return leading + trailing
def replace_metadata_values(
    self,
    conn: sqlite3.Connection,
    file_ref: str,
    metadata: dict[str, Any],
) -> None:
    """Replace all indexed metadata rows of one file.

    Existing ``metadata_values`` rows for ``file_ref`` are deleted first.
    Each metadata entry is then written only when its name passes
    ``_valid_field_name`` and resolves to a registered field id; one row is
    inserted per item yielded by ``_metadata_value_items``. The caller's
    connection is used as-is; nothing is committed here.
    """
    insert_sql = """
        INSERT INTO metadata_values(
            file_ref, field_id, value_text, value_number, value_bool, value_json
        ) VALUES (?, ?, ?, ?, ?, ?)
    """
    value_columns = ("value_text", "value_number", "value_bool", "value_json")

    conn.execute("DELETE FROM metadata_values WHERE file_ref = ?", (file_ref,))
    for field_name, raw_value in metadata.items():
        if self._valid_field_name(field_name):
            field_id = self._registered_field_id(conn, field_name)
            if field_id is not None:
                for value_item in self._metadata_value_items(raw_value):
                    row_params = (file_ref, field_id) + tuple(
                        value_item[column] for column in value_columns
                    )
                    conn.execute(insert_sql, row_params)
def metadata_field_exists(self, name: str) -> bool:
    """Tell whether ``name`` is registered in the default metadata schema.

    Args:
        name: Field name, matched exactly against ``metadata_fields.name``.

    Returns:
        ``True`` when a row with ``schema_id = 'default'`` and that name
        exists, ``False`` otherwise.
    """
    # Function-scope import keeps this method self-contained.
    from contextlib import closing

    # sqlite3's context manager only scopes a transaction and leaves the
    # handle open; this read-only lookup just needs the connection closed.
    with closing(self.connect()) as conn:
        row = conn.execute(
            "SELECT 1 FROM metadata_fields WHERE schema_id = 'default' AND name = ?",
            (name,),
        ).fetchone()
    return row is not None
int | None = None, + ) -> dict[str, Any]: + path = normalize_path(path) + if max_depth is not None and max_depth < 0: + raise ValueError("max_depth must be non-negative") + with self.connect() as conn: + folder = self._folder_by_path(conn, path) + if folder is None: + raise KeyError(f"Unknown folder path: {path}") + if recursive: + folder_depth_clause = "" + folder_depth_params: list[Any] = [] + if max_depth is not None: + if max_depth == 0: + folder_depth_clause = "AND 0" + else: + folder_depth_clause = ( + f"AND ({self._folder_depth_sql('fo.path')} - ?) <= ?" + ) + folder_depth_params = [self._folder_depth(path), max_depth] + folder_rows = conn.execute( + f""" + SELECT + fo.folder_id, + fo.parent_id, + fo.name, + fo.path, + fo.description, + fo.kind, + fo.metadata_json, + fo.created_at, + fo.updated_at, + ( + SELECT COUNT(DISTINCT child_ff.file_ref) + FROM file_folders child_ff + JOIN files child_file + ON child_file.file_ref = child_ff.file_ref + AND child_file.deleted_at IS NULL + WHERE child_ff.folder_id = fo.folder_id + ) AS file_count, + ( + SELECT COUNT(*) + FROM folders child_folder + WHERE child_folder.parent_id = fo.folder_id + ) AS children_count + FROM folders fo + WHERE fo.path != ? AND (fo.path LIKE ? ESCAPE '\\') + {folder_depth_clause} + ORDER BY fo.path + LIMIT ? 
+ """, + (path, self._descendant_like(path), *folder_depth_params, limit), + ).fetchall() + file_rows = self._file_rows_for_scope( + conn, + path, + True, + limit, + max_depth=max_depth, + ) + else: + folder_rows = conn.execute( + """ + SELECT + fo.folder_id, + fo.parent_id, + fo.name, + fo.path, + fo.description, + fo.kind, + fo.metadata_json, + fo.created_at, + fo.updated_at, + ( + SELECT COUNT(DISTINCT child_ff.file_ref) + FROM file_folders child_ff + JOIN files child_file + ON child_file.file_ref = child_ff.file_ref + AND child_file.deleted_at IS NULL + WHERE child_ff.folder_id = fo.folder_id + ) AS file_count, + ( + SELECT COUNT(*) + FROM folders child_folder + WHERE child_folder.parent_id = fo.folder_id + ) AS children_count + FROM folders fo + WHERE fo.parent_id = ? + ORDER BY fo.kind, fo.name + LIMIT ? + """, + (folder["folder_id"], limit), + ).fetchall() + file_rows = self._file_rows_for_scope(conn, path, False, limit) + return { + "folders": [self._folder_row_to_dict(row) for row in folder_rows], + "files": [self._file_summary(row) for row in file_rows], + } + + def folder_info(self, path: str = "/") -> dict[str, Any]: + path = normalize_path(path) + with self.connect() as conn: + row = conn.execute( + """ + SELECT + fo.folder_id, + fo.parent_id, + fo.name, + fo.path, + fo.description, + fo.kind, + fo.metadata_json, + fo.created_at, + fo.updated_at, + ( + SELECT COUNT(DISTINCT child_ff.file_ref) + FROM file_folders child_ff + JOIN files child_file + ON child_file.file_ref = child_ff.file_ref + AND child_file.deleted_at IS NULL + WHERE child_ff.folder_id = fo.folder_id + ) AS file_count, + ( + SELECT COUNT(*) + FROM folders child_folder + WHERE child_folder.parent_id = fo.folder_id + ) AS children_count + FROM folders fo + WHERE fo.path = ? 
+ """, + (path,), + ).fetchone() + if row is None: + raise KeyError(f"Unknown folder path: {path}") + return self._folder_row_to_dict(row) + + def find_folders( + self, + path: str = "/", + *, + metadata_filter: Optional[dict[str, Any]] = None, + limit: int = 100, + max_depth: int | None = None, + ) -> list[dict[str, Any]]: + path = normalize_path(path) + if max_depth is not None and max_depth < 0: + raise ValueError("max_depth must be non-negative") + metadata_sql, metadata_params = self._metadata_filter_sql(metadata_filter) + metadata_clause = f"AND {' AND '.join(metadata_sql)}" if metadata_sql else "" + folder_depth_clause = "" + folder_depth_params: list[Any] = [] + if max_depth is not None: + if max_depth == 0: + folder_depth_clause = "AND 0" + else: + folder_depth_clause = f"AND ({self._folder_depth_sql('fo.path')} - ?) <= ?" + folder_depth_params = [self._folder_depth(path), max_depth] + sql = f""" + SELECT * + FROM ( + SELECT + fo.folder_id, + fo.parent_id, + fo.name, + fo.path, + fo.description, + fo.kind, + fo.metadata_json, + fo.created_at, + fo.updated_at, + ( + SELECT COUNT(DISTINCT child_ff.file_ref) + FROM file_folders child_ff + JOIN files child_file + ON child_file.file_ref = child_ff.file_ref + AND child_file.deleted_at IS NULL + WHERE child_ff.folder_id = fo.folder_id + ) AS file_count, + ( + SELECT COUNT(*) + FROM folders child_folder + WHERE child_folder.parent_id = fo.folder_id + ) AS children_count, + ( + SELECT COUNT(DISTINCT f.file_ref) + FROM files f + JOIN file_folders matched_ff + ON matched_ff.file_ref = f.file_ref + JOIN folders matched_folder + ON matched_folder.folder_id = matched_ff.folder_id + WHERE f.deleted_at IS NULL + AND ( + matched_folder.folder_id = fo.folder_id + OR matched_folder.path LIKE {self._descendant_like_sql_expr("fo.path")} ESCAPE '\\' + ) + {metadata_clause} + ) AS matched_files + FROM folders fo + WHERE fo.path != ? AND fo.path LIKE ? 
ESCAPE '\\' + {folder_depth_clause} + ) + WHERE matched_files > 0 + ORDER BY path + LIMIT ? + """ + params = [ + *metadata_params, + path, + self._descendant_like(path), + *folder_depth_params, + limit, + ] + with self.connect() as conn: + folder = self._folder_by_path(conn, path) + if folder is None: + raise KeyError(f"Unknown folder path: {path}") + rows = conn.execute(sql, params).fetchall() + return [self._folder_row_to_dict(row) for row in rows] + + def search_files( + self, + query: str | list[str] | None, + *, + scope: Optional[dict[str, Any]] = None, + metadata_filter: Optional[dict[str, Any]] = None, + limit: int = 10, + ) -> list[dict[str, Any]]: + query_text = self._query_text(query) + match_queries = self._fts_match_queries(query_text) if query_text else [None] + results: list[dict[str, Any]] = [] + seen: set[str] = set() + for match_query in match_queries: + rows = self._search_once(match_query, scope, metadata_filter, max(limit * 25, limit)) + for row in rows: + if row["file_ref"] in seen: + continue + seen.add(row["file_ref"]) + results.append(self._search_row_to_dict(row)) + if len(results) >= limit: + return results + if results: + return results + return results + + def _search_once( + self, + match_query: str | None, + scope: Optional[dict[str, Any]], + metadata_filter: Optional[dict[str, Any]], + limit: int, + ) -> list[sqlite3.Row]: + joins = [] + selects = [ + "f.file_ref", + "f.external_id", + "f.source_path", + "f.title", + "f.descriptor", + "f.pageindex_tree_status", + "f.metadata_json", + "f.metadata_status_json", + "f.created_at", + """ + ( + SELECT display_folder.folder_id + FROM file_folders display_ff + JOIN folders display_folder + ON display_folder.folder_id = display_ff.folder_id + WHERE display_ff.file_ref = f.file_ref + ORDER BY display_folder.path + LIMIT 1 + ) AS folder_id + """, + """ + ( + SELECT display_folder.path + FROM file_folders display_ff + JOIN folders display_folder + ON display_folder.folder_id = 
display_ff.folder_id + WHERE display_ff.file_ref = f.file_ref + ORDER BY display_folder.path + LIMIT 1 + ) AS folder_path + """, + ] + where = ["f.deleted_at IS NULL"] + params: list[Any] = [] + if match_query: + joins.append("JOIN file_fts ON file_fts.file_ref = f.file_ref") + selects.append("snippet(file_fts, 2, '', '', '...', 16) AS snippet") + selects.append("bm25(file_fts) AS rank") + where.append("file_fts MATCH ?") + params.append(match_query) + order_by = "rank" + else: + selects.append("f.descriptor AS snippet") + selects.append("0 AS rank") + order_by = "f.created_at DESC, f.title" + scope_sql, scope_params = self._scope_sql(scope) + if scope_sql: + where.append(scope_sql) + params.extend(scope_params) + metadata_sql, metadata_params = self._metadata_filter_sql(metadata_filter) + where.extend(metadata_sql) + params.extend(metadata_params) + sql = f""" + SELECT {", ".join(selects)} + FROM files f + {" ".join(joins)} + WHERE {" AND ".join(where)} + ORDER BY {order_by} + LIMIT ? + """ + params.append(limit) + with self.connect() as conn: + return conn.execute(sql, params).fetchall() + + def _metadata_filter_sql(self, metadata_filter: Optional[dict[str, Any]]) -> tuple[list[str], list[Any]]: + if not metadata_filter: + return [], [] + clause, params = self._compile_metadata_filter(metadata_filter) + return [clause] if clause else [], params + + def _compile_metadata_filter(self, metadata_filter: dict[str, Any]) -> tuple[str, list[Any]]: + clauses = [] + params: list[Any] = [] + for key, condition in metadata_filter.items(): + if key in {"$and", "$or"}: + child_clauses = [] + child_params: list[Any] = [] + for item in condition: + child_clause, item_params = self._compile_metadata_filter(item) + if child_clause: + child_clauses.append(f"({child_clause})") + child_params.extend(item_params) + if child_clauses: + joiner = " AND " if key == "$and" else " OR " + clauses.append(joiner.join(child_clauses)) + params.extend(child_params) + continue + field_clause, 
field_params = self._compile_metadata_field_filter(key, condition) + clauses.append(field_clause) + params.extend(field_params) + return " AND ".join(f"({clause})" for clause in clauses), params + + def _compile_metadata_field_filter(self, field: str, condition: Any) -> tuple[str, list[Any]]: + if not isinstance(condition, dict) or not any(str(key).startswith("$") for key in condition): + condition = {"$eq": condition} + operator, expected = next(iter(condition.items())) + field_id = self.field_id(field) + if operator == "$eq": + return ( + """ + EXISTS ( + SELECT 1 FROM metadata_values mv + WHERE mv.file_ref = f.file_ref + AND mv.field_id = ? + AND mv.value_text = ? + ) + """, + [field_id, self._metadata_compare_text(expected)], + ) + if operator == "$ne": + return ( + """ + NOT EXISTS ( + SELECT 1 FROM metadata_values mv + WHERE mv.file_ref = f.file_ref + AND mv.field_id = ? + AND mv.value_text = ? + ) + """, + [field_id, self._metadata_compare_text(expected)], + ) + if operator == "$in": + values = [self._metadata_compare_text(item) for item in expected] + if not values: + return "0", [] + placeholders = ", ".join("?" for _ in values) + return ( + f""" + EXISTS ( + SELECT 1 FROM metadata_values mv + WHERE mv.file_ref = f.file_ref + AND mv.field_id = ? + AND mv.value_text IN ({placeholders}) + ) + """, + [field_id, *values], + ) + if operator == "$contains": + return ( + """ + EXISTS ( + SELECT 1 FROM metadata_values mv + WHERE mv.file_ref = f.file_ref + AND mv.field_id = ? + AND lower(mv.value_text) LIKE lower(?) ESCAPE '\\' + ) + """, + [field_id, self._contains_like(self._metadata_compare_text(expected))], + ) + if operator in {"$gt", "$gte", "$lt", "$lte"}: + comparator = { + "$gt": ">", + "$gte": ">=", + "$lt": "<", + "$lte": "<=", + }[operator] + if isinstance(expected, (int, float)) and not isinstance(expected, bool): + return ( + f""" + EXISTS ( + SELECT 1 FROM metadata_values mv + WHERE mv.file_ref = f.file_ref + AND mv.field_id = ? 
+ AND mv.value_number IS NOT NULL + AND mv.value_number {comparator} ? + ) + """, + [field_id, float(expected)], + ) + return ( + f""" + EXISTS ( + SELECT 1 FROM metadata_values mv + WHERE mv.file_ref = f.file_ref + AND mv.field_id = ? + AND mv.value_text {comparator} ? + ) + """, + [field_id, self._metadata_compare_text(expected)], + ) + raise ValueError(f"Unsupported metadata operator: {operator}") + + def get_file(self, file_ref: str) -> FileEntry: + with self.connect() as conn: + row = self._file_entry_row(conn, file_ref) + if row is None: + raise KeyError(f"Unknown file_ref: {file_ref}") + return self._file_entry(row) + + def list_pending_metadata_status(self, *, limit: int | None = None) -> list[FileEntry]: + sql = """ + SELECT + f.file_ref, + f.external_id, + f.storage_uri, + f.source_path, + f.title, + f.descriptor, + f.content_type, + f.source_type, + f.fingerprint, + f.text_artifact_path, + f.raw_artifact_path, + f.pageindex_doc_id, + f.pageindex_tree_status, + f.metadata_json, + f.metadata_status_json, + COALESCE(primary_folder.path, '/') AS folder_path + FROM files f + LEFT JOIN file_folders ff ON ff.file_ref = f.file_ref + LEFT JOIN folders primary_folder ON primary_folder.folder_id = ff.folder_id + WHERE f.deleted_at IS NULL + AND ( + f.metadata_status_json LIKE '%pending_generate%' + OR f.metadata_status_json LIKE '%pending_submit%' + ) + GROUP BY f.file_ref + ORDER BY f.created_at, f.file_ref + """ + params: list[Any] = [] + if limit is not None: + sql += " LIMIT ?" 
+ params.append(int(limit)) + with self.connect() as conn: + rows = conn.execute(sql, params).fetchall() + return [self._file_entry(row) for row in rows] + + def update_file_metadata_status( + self, + file_ref: str, + *, + metadata: dict[str, Any], + metadata_status: dict[str, Any], + ) -> None: + with self.connect() as conn: + row = self._file_entry_row(conn, file_ref) + if row is None: + raise KeyError(f"Unknown file_ref: {file_ref}") + metadata_text_value = metadata_text(metadata) + conn.execute( + """ + UPDATE files + SET metadata_json = ?, + metadata_status_json = ?, + updated_at = CURRENT_TIMESTAMP + WHERE file_ref = ? AND deleted_at IS NULL + """, + ( + json.dumps(metadata, ensure_ascii=False), + json.dumps(metadata_status, ensure_ascii=False), + file_ref, + ), + ) + self.replace_metadata_values( + conn, + file_ref, + self.indexed_metadata_values(metadata), + ) + conn.execute( + """ + UPDATE file_fts + SET metadata_text = ? + WHERE file_ref = ? + """, + (metadata_text_value, file_ref), + ) + + def resolve_file_ref(self, target: str) -> str: + with self.connect() as conn: + return self._resolve_file_ref(conn, target) + + def _resolve_file_ref(self, conn: sqlite3.Connection, target: str) -> str: + target = str(target).strip() + if not target: + raise KeyError("Empty file target") + row = conn.execute( + "SELECT file_ref FROM files WHERE file_ref = ? AND deleted_at IS NULL", + (target,), + ).fetchone() + if row: + return row["file_ref"] + row = conn.execute( + "SELECT file_ref FROM files WHERE external_id = ? AND deleted_at IS NULL", + (target,), + ).fetchone() + if row: + return row["file_ref"] + stripped = target.strip("/") + rows = conn.execute( + """ + SELECT + f.file_ref, + f.external_id, + f.title, + f.source_path, + COALESCE(MIN(fo.path), '/') AS folder_path + FROM files f + LEFT JOIN file_folders ff ON ff.file_ref = f.file_ref + LEFT JOIN folders fo ON fo.folder_id = ff.folder_id + WHERE f.source_path = ? 
AND f.deleted_at IS NULL + GROUP BY f.file_ref, f.external_id, f.title, f.source_path + ORDER BY f.file_ref + LIMIT 2 + """, + (stripped,), + ).fetchall() + if len(rows) > 1: + matches = "; ".join(self._virtual_match_summary(row) for row in rows) + raise KeyError(f"Ambiguous file target: {target}. Matches: {matches}") + if rows: + return rows[0]["file_ref"] + virtual_file_ref = self._resolve_virtual_file_ref(conn, target) + if virtual_file_ref: + return virtual_file_ref + raise KeyError(f"Unknown file target: {target}") + + def _resolve_virtual_file_ref(self, conn: sqlite3.Connection, target: str) -> str | None: + virtual_target = normalize_path(target) + rows = conn.execute( + """ + WITH virtual_matches AS ( + SELECT + f.file_ref, + f.external_id, + f.title, + f.source_path, + pf.path AS folder_path, + (CASE WHEN pf.path = '/' THEN '/' ELSE pf.path || '/' END) + || ltrim(f.title, '/') AS title_virtual_path, + (CASE WHEN pf.path = '/' THEN '/' ELSE pf.path || '/' END) + || ltrim(f.source_path, '/') AS source_virtual_path + FROM files f + JOIN file_folders ff ON ff.file_ref = f.file_ref + JOIN folders pf ON pf.folder_id = ff.folder_id + WHERE f.deleted_at IS NULL + ) + SELECT + file_ref, + external_id, + title, + source_path, + MIN(folder_path) AS folder_path + FROM virtual_matches + WHERE title_virtual_path = ? + OR source_virtual_path = ? + GROUP BY file_ref, external_id, title, source_path + ORDER BY file_ref + LIMIT 2 + """, + (virtual_target, virtual_target), + ).fetchall() + if not rows: + return None + if len(rows) > 1: + matches = "; ".join(self._virtual_match_summary(row) for row in rows) + raise KeyError(f"Ambiguous file target: {target}. 
Matches: {matches}") + return rows[0]["file_ref"] + + @staticmethod + def _virtual_match_summary(row: sqlite3.Row) -> str: + external_id = row["external_id"] or "-" + return ( + f"file_ref={row['file_ref']} external_id={external_id} " + f"folder={row['folder_path']} title={row['title']!r} " + f"source_path={row['source_path']!r}" + ) + + def ensure_folder( + self, + conn: sqlite3.Connection | None, + path: str, + *, + kind: str = "manual", + description: str = "", + metadata: dict[str, Any] | None = None, + ) -> str: + owns_connection = conn is None + if conn is None: + conn = self.connect() + try: + normalized = normalize_path(path) + metadata_json = json.dumps(metadata or {}, ensure_ascii=False) + if normalized == "/": + folder_id = self.folder_id("/") + existing = conn.execute( + "SELECT folder_id FROM folders WHERE path = '/'" + ).fetchone() + if existing is not None and not description and metadata_json == "{}": + if owns_connection: + conn.commit() + return folder_id + self._upsert_folder_row( + conn, + folder_id=folder_id, + parent_id=None, + name="/", + path="/", + kind=kind, + description=description, + metadata_json=metadata_json, + ) + if owns_connection: + conn.commit() + return folder_id + parent_id = self.ensure_folder(conn, str(Path(normalized).parent), kind=kind) + name = normalized.rsplit("/", 1)[-1] + folder_id = self.folder_id(normalized) + self._upsert_folder_row( + conn, + folder_id=folder_id, + parent_id=parent_id, + name=name, + path=normalized, + kind=kind, + description=description, + metadata_json=metadata_json, + ) + if owns_connection: + conn.commit() + return folder_id + finally: + if owns_connection: + conn.close() + + def _upsert_folder_row( + self, + conn: sqlite3.Connection, + *, + folder_id: str, + parent_id: str | None, + name: str, + path: str, + kind: str, + description: str, + metadata_json: str, + ) -> None: + columns = self._columns(conn, "folders") + insert_columns = ["folder_id", "parent_id", "name", "path", "description", 
"kind", "metadata_json"] + values: list[Any] = [folder_id, parent_id, name, path, description, kind, metadata_json] + if "source" in columns: + insert_columns.append("source") + values.append("system") + if "sort_order" in columns: + insert_columns.append("sort_order") + values.append(0) + placeholders = ", ".join("?" for _ in values) + update_assignments = [ + "parent_id = excluded.parent_id", + "name = excluded.name", + "kind = excluded.kind", + "updated_at = CURRENT_TIMESTAMP", + ] + if description: + update_assignments.append("description = excluded.description") + if metadata_json != "{}": + update_assignments.append("metadata_json = excluded.metadata_json") + conn.execute( + f""" + INSERT INTO folders({", ".join(insert_columns)}) + VALUES ({placeholders}) + ON CONFLICT(path) DO UPDATE SET + {", ".join(update_assignments)} + """, + values, + ) + + def _resolve_or_create_folder(self, conn: sqlite3.Connection, folder_path_or_id: str) -> str: + target = str(folder_path_or_id).strip() + if not target: + raise KeyError("Empty folder target") + row = conn.execute( + "SELECT folder_id FROM folders WHERE folder_id = ?", + (target,), + ).fetchone() + if row: + return row["folder_id"] + row = conn.execute( + "SELECT folder_id FROM folders WHERE path = ?", + (normalize_path(target),), + ).fetchone() + if row: + return row["folder_id"] + return self.ensure_folder(conn, target) + + def read_text(self, file_ref: str) -> str: + entry = self.get_file(file_ref) + return Path(entry.text_artifact_path).read_text(encoding="utf-8") + + def write_text_artifact(self, file_ref: str, content: str) -> Path: + path = self.text_dir / f"{file_ref}.txt" + path.write_text(content, encoding="utf-8") + return path + + def update_pageindex_pointer( + self, + file_ref: str, + *, + pageindex_doc_id: str | None, + pageindex_tree_status: str, + ) -> None: + with self.connect() as conn: + resolved = self._resolve_file_ref(conn, file_ref) + conn.execute( + """ + UPDATE files + SET pageindex_doc_id = 
?, + pageindex_tree_status = ?, + updated_at = CURRENT_TIMESTAMP + WHERE file_ref = ? AND deleted_at IS NULL + """, + (pageindex_doc_id, pageindex_tree_status, resolved), + ) + + def write_raw_artifact(self, file_ref: str, metadata: dict[str, Any]) -> Path: + path = self.raw_dir / f"{file_ref}.json" + path.write_text(json.dumps(metadata, ensure_ascii=False, indent=2), encoding="utf-8") + return path + + def file_info(self, target: str) -> dict[str, Any]: + file_ref = self.resolve_file_ref(target) + entry = self.get_file(file_ref) + info = self._file_entry_to_dict(entry) + info["folders"] = self.folder_memberships(file_ref) + return info + + def file_matches( + self, + file_ref: str, + *, + scope: Optional[dict[str, Any]] = None, + metadata_filter: Optional[dict[str, Any]] = None, + ) -> bool: + where = ["f.file_ref = ?", "f.deleted_at IS NULL"] + params: list[Any] = [file_ref] + scope_sql, scope_params = self._scope_sql(scope) + if scope_sql: + where.append(scope_sql) + params.extend(scope_params) + metadata_sql, metadata_params = self._metadata_filter_sql(metadata_filter) + where.extend(metadata_sql) + params.extend(metadata_params) + with self.connect() as conn: + row = conn.execute( + f""" + SELECT 1 + FROM files f + WHERE {" AND ".join(where)} + LIMIT 1 + """, + params, + ).fetchone() + return row is not None + + def folder_memberships(self, file_ref: str) -> list[dict[str, Any]]: + with self.connect() as conn: + rows = conn.execute( + """ + SELECT + fo.folder_id, + fo.parent_id, + fo.name, + fo.path, + fo.description, + fo.kind, + fo.metadata_json AS folder_metadata_json, + ff.metadata_json AS membership_metadata_json, + ff.created_at + FROM file_folders ff + JOIN folders fo ON fo.folder_id = ff.folder_id + WHERE ff.file_ref = ? 
+ ORDER BY fo.path + """, + (file_ref,), + ).fetchall() + return [ + { + "folder_id": row["folder_id"], + "id": row["folder_id"], + "parent_id": row["parent_id"], + "parent_folder_id": row["parent_id"], + "name": row["name"], + "path": row["path"], + "kind": row["kind"], + "description": row["description"], + "folder_metadata": json.loads(row["folder_metadata_json"] or "{}"), + "metadata": json.loads(row["membership_metadata_json"] or "{}"), + "created_at": row["created_at"], + } + for row in rows + ] + + def count_files_in_folder(self, path: str, *, recursive: bool = True) -> int: + path = normalize_path(path) + with self.connect() as conn: + folder = self._folder_by_path(conn, path) + if folder is None: + raise KeyError(f"Unknown folder path: {path}") + if recursive: + row = conn.execute( + """ + SELECT COUNT(DISTINCT f.file_ref) AS count + FROM files f + JOIN file_folders ff ON ff.file_ref = f.file_ref + JOIN folders fo ON fo.folder_id = ff.folder_id + WHERE f.deleted_at IS NULL + AND (fo.path = ? OR fo.path LIKE ? ESCAPE '\\') + """, + (path, self._descendant_like(path)), + ).fetchone() + else: + row = conn.execute( + """ + SELECT COUNT(DISTINCT f.file_ref) AS count + FROM files f + JOIN file_folders ff ON ff.file_ref = f.file_ref + JOIN folders fo ON fo.folder_id = ff.folder_id + WHERE f.deleted_at IS NULL + AND fo.path = ? + """, + (path,), + ).fetchone() + return int(row["count"] or 0) + + def folder_subtree_thresholds( + self, + path: str, + *, + depth_limit: int, + file_limit: int, + ) -> dict[str, Any]: + path = normalize_path(path) + with self.connect() as conn: + folder = self._folder_by_path(conn, path) + if folder is None: + raise KeyError(f"Unknown folder path: {path}") + base_depth = self._folder_depth(path) + deep_folder = conn.execute( + """ + SELECT path + FROM folders + WHERE path != ? + AND path LIKE ? 
ESCAPE '\\' + AND ( + CASE + WHEN TRIM(path, '/') = '' THEN 0 + ELSE LENGTH(TRIM(path, '/')) - LENGTH(REPLACE(TRIM(path, '/'), '/', '')) + 1 + END + ) - ? > ? + LIMIT 1 + """, + (path, self._descendant_like(path), base_depth, depth_limit), + ).fetchone() + file_rows = conn.execute( + """ + SELECT DISTINCT f.file_ref + FROM files f + JOIN file_folders ff ON ff.file_ref = f.file_ref + JOIN folders fo ON fo.folder_id = ff.folder_id + WHERE f.deleted_at IS NULL + AND (fo.path = ? OR fo.path LIKE ? ESCAPE '\\') + LIMIT ? + """, + (path, self._descendant_like(path), file_limit + 1), + ).fetchall() + return { + "depth_limit": depth_limit, + "file_limit": file_limit, + "folder_depth_exceeds_limit": deep_folder is not None, + "file_count_exceeds_limit": len(file_rows) > file_limit, + "sampled_file_count": len(file_rows), + "sample_deep_folder_path": deep_folder["path"] if deep_folder is not None else "", + } + + def _file_entry_row(self, conn: sqlite3.Connection, file_ref: str) -> sqlite3.Row | None: + return conn.execute( + """ + SELECT + f.file_ref, + f.external_id, + f.storage_uri, + f.source_path, + f.title, + f.descriptor, + f.content_type, + f.source_type, + f.fingerprint, + f.text_artifact_path, + f.raw_artifact_path, + f.pageindex_doc_id, + f.pageindex_tree_status, + f.metadata_json, + f.metadata_status_json, + COALESCE( + ( + SELECT display_folder.path + FROM file_folders display_ff + JOIN folders display_folder + ON display_folder.folder_id = display_ff.folder_id + WHERE display_ff.file_ref = f.file_ref + ORDER BY display_folder.path + LIMIT 1 + ), + '/' + ) AS folder_path + FROM files f + WHERE f.file_ref = ? 
AND f.deleted_at IS NULL + """, + (file_ref,), + ).fetchone() + + def _file_rows_for_scope( + self, + conn: sqlite3.Connection, + path: str, + recursive: bool, + limit: int, + max_depth: int | None = None, + ) -> list[sqlite3.Row]: + sql = """ + SELECT + f.file_ref, + f.external_id, + f.title, + f.descriptor, + f.source_path, + f.pageindex_tree_status, + f.metadata_json, + f.metadata_status_json, + f.created_at, + MIN(pf.folder_id) AS folder_id, + MIN(pf.path) AS folder_path + FROM files f + JOIN file_folders ff ON ff.file_ref = f.file_ref + JOIN folders pf ON pf.folder_id = ff.folder_id + WHERE f.deleted_at IS NULL + """ + params: list[Any] + if recursive: + sql += " AND (pf.path = ? OR pf.path LIKE ? ESCAPE '\\')" + params = [path, self._descendant_like(path)] + if max_depth is not None: + if max_depth <= 0: + sql += " AND 0" + else: + sql += f" AND ({self._folder_depth_sql('pf.path')} - ?) <= ?" + params.extend([self._folder_depth(path), max_depth - 1]) + else: + sql += " AND pf.path = ?" + params = [path] + sql += " GROUP BY f.file_ref ORDER BY f.created_at DESC, f.title LIMIT ?" + params.append(limit) + return conn.execute(sql, params).fetchall() + + def _scope_sql(self, scope: Optional[dict[str, Any]]) -> tuple[str, list[Any]]: + if not scope: + return "", [] + recursive = scope.get("recursive", True) + max_depth = scope.get("max_depth") + if max_depth is not None: + max_depth = int(max_depth) + if max_depth < 0: + raise ValueError("max_depth must be non-negative") + folder_id = scope.get("folder_id") + if folder_id: + if folder_id == "root": + folder_path = "/" + else: + if recursive: + if max_depth == 0: + return "0", [] + depth_clause = "" + depth_params: list[Any] = [] + if max_depth is not None: + depth_clause = ( + "AND " + f"({self._folder_depth_sql('scope_folder.path')} - " + f"{self._folder_depth_sql('base_folder.path')}) <= ?" 
+ ) + depth_params = [max_depth - 1] + return ( + f""" + EXISTS ( + SELECT 1 + FROM file_folders scope_ff + JOIN folders scope_folder + ON scope_folder.folder_id = scope_ff.folder_id + JOIN folders base_folder + ON base_folder.folder_id = ? + WHERE scope_ff.file_ref = f.file_ref + AND ( + scope_folder.folder_id = base_folder.folder_id + OR scope_folder.path LIKE {self._descendant_like_sql_expr("base_folder.path")} ESCAPE '\\' + ) + {depth_clause} + ) + """, + [folder_id, *depth_params], + ) + return ( + """ + EXISTS ( + SELECT 1 + FROM file_folders scope_ff + WHERE scope_ff.file_ref = f.file_ref + AND scope_ff.folder_id = ? + ) + """, + [folder_id], + ) + elif scope.get("folder_path") or scope.get("path"): + folder_path = normalize_path(scope.get("folder_path") or scope.get("path")) + else: + return "", [] + if recursive and max_depth == 0: + return "0", [] + path_clause = ( + "(scope_folder.path = ? OR scope_folder.path LIKE ? ESCAPE '\\')" + if recursive + else "scope_folder.path = ?" + ) + params = [folder_path, self._descendant_like(folder_path)] if recursive else [folder_path] + depth_clause = "" + if recursive and max_depth is not None: + depth_clause = f"AND ({self._folder_depth_sql('scope_folder.path')} - ?) <= ?" + params.extend([self._folder_depth(folder_path), max_depth - 1]) + return ( + f""" + EXISTS ( + SELECT 1 + FROM file_folders scope_ff + JOIN folders scope_folder + ON scope_folder.folder_id = scope_ff.folder_id + WHERE scope_ff.file_ref = f.file_ref + AND {path_clause} + {depth_clause} + ) + """, + params, + ) + + def _folder_by_path(self, conn: sqlite3.Connection, path: str) -> sqlite3.Row | None: + return conn.execute( + """ + SELECT + folder_id, + parent_id, + name, + path, + description, + kind, + metadata_json, + created_at, + updated_at + FROM folders + WHERE path = ? 
+ """, + (path,), + ).fetchone() + + @classmethod + def _descendant_like(cls, path: str) -> str: + return "/%" if path == "/" else f"{cls._like_escape(path)}/%" + + @staticmethod + def _descendant_like_sql_expr(path_expr: str) -> str: + escaped_expr = SQLiteFileSystemStore._like_escape_sql_expr(path_expr) + return f"CASE WHEN {path_expr} = '/' THEN '/%' ELSE {escaped_expr} || '/%' END" + + @staticmethod + def _contains_like(value: str) -> str: + return f"%{SQLiteFileSystemStore._like_escape(value)}%" + + @staticmethod + def _like_escape(value: str) -> str: + return ( + value.replace("\\", "\\\\") + .replace("%", "\\%") + .replace("_", "\\_") + ) + + @staticmethod + def _like_escape_sql_expr(value_expr: str) -> str: + return ( + f"replace(replace(replace({value_expr}, '\\', '\\\\'), " + "'%', '\\%'), '_', '\\_')" + ) + + @staticmethod + def _folder_depth(path: str) -> int: + stripped = normalize_path(path).strip("/") + return 0 if not stripped else len(stripped.split("/")) + + @staticmethod + def _folder_depth_sql(path_expr: str) -> str: + return ( + "(CASE " + f"WHEN TRIM({path_expr}, '/') = '' THEN 0 " + f"ELSE LENGTH(TRIM({path_expr}, '/')) " + f"- LENGTH(REPLACE(TRIM({path_expr}, '/'), '/', '')) + 1 " + "END)" + ) + + @classmethod + def _folder_row_to_dict(cls, row: sqlite3.Row) -> dict[str, Any]: + return { + "folder_id": row["folder_id"], + "id": row["folder_id"], + "parent_id": row["parent_id"], + "parent_folder_id": row["parent_id"], + "name": row["name"], + "description": cls._row_value(row, "description", ""), + "path": row["path"], + "kind": row["kind"], + "metadata": json.loads(cls._row_value(row, "metadata_json", "{}") or "{}"), + "created_at": cls._row_value(row, "created_at"), + "updated_at": cls._row_value(row, "updated_at"), + "file_count": cls._row_value(row, "file_count", 0), + "children_count": cls._row_value(row, "children_count", 0), + "matched_files": cls._row_value(row, "matched_files", 0), + } + + @classmethod + def _file_summary(cls, row: 
sqlite3.Row) -> dict[str, Any]: + external_id = row["external_id"] + return { + "file_ref": row["file_ref"], + "id": external_id or row["file_ref"], + "document_id": external_id, + "external_id": external_id, + "name": row["title"], + "title": row["title"], + "description": cls._row_value(row, "descriptor", row["title"]), + "status": cls._row_value(row, "pageindex_tree_status", "not_built"), + "pageNum": None, + "createdAt": cls._row_value(row, "created_at"), + "folderId": cls._row_value(row, "folder_id"), + "source_path": row["source_path"], + "folder_path": row["folder_path"], + "metadata": json.loads(row["metadata_json"] or "{}"), + "metadata_status": json.loads( + cls._row_value(row, "metadata_status_json", "{}") or "{}" + ), + } + + @classmethod + def _search_row_to_dict(cls, row: sqlite3.Row) -> dict[str, Any]: + external_id = row["external_id"] + return { + "file_ref": row["file_ref"], + "id": external_id or row["file_ref"], + "document_id": external_id, + "external_id": external_id, + "name": row["title"], + "title": row["title"], + "description": cls._row_value(row, "descriptor", row["title"]), + "status": cls._row_value(row, "pageindex_tree_status", "not_built"), + "pageNum": None, + "createdAt": cls._row_value(row, "created_at"), + "folderId": cls._row_value(row, "folder_id"), + "source_path": row["source_path"], + "snippet": row["snippet"] or row["title"], + "folder_path": row["folder_path"], + "metadata": json.loads(row["metadata_json"] or "{}"), + "metadata_status": json.loads( + cls._row_value(row, "metadata_status_json", "{}") or "{}" + ), + } + + @staticmethod + def _row_value(row: sqlite3.Row, key: str, default: Any = None) -> Any: + return row[key] if key in row.keys() else default + + @staticmethod + def _file_entry(row: sqlite3.Row) -> FileEntry: + return FileEntry( + file_ref=row["file_ref"], + external_id=row["external_id"], + storage_uri=row["storage_uri"], + source_path=row["source_path"], + title=row["title"], + 
descriptor=row["descriptor"], + content_type=row["content_type"], + source_type=row["source_type"], + fingerprint=row["fingerprint"], + text_artifact_path=row["text_artifact_path"], + raw_artifact_path=row["raw_artifact_path"], + pageindex_doc_id=row["pageindex_doc_id"], + pageindex_tree_status=row["pageindex_tree_status"], + metadata=json.loads(row["metadata_json"] or "{}"), + folder_path=row["folder_path"], + metadata_status=json.loads( + SQLiteFileSystemStore._row_value(row, "metadata_status_json", "{}") or "{}" + ), + ) + + @classmethod + def _file_entry_to_dict(cls, entry: FileEntry) -> dict[str, Any]: + return { + "file_ref": entry.file_ref, + "id": entry.external_id or entry.file_ref, + "document_id": entry.external_id, + "external_id": entry.external_id, + "name": entry.title, + "storage_uri": entry.storage_uri, + "source_path": entry.source_path, + "title": entry.title, + "description": entry.descriptor, + "status": entry.pageindex_tree_status, + "pageNum": None, + "descriptor": entry.descriptor, + "content_type": entry.content_type, + "source_type": entry.source_type, + "fingerprint": entry.fingerprint, + "text_artifact_path": entry.text_artifact_path, + "raw_artifact_path": entry.raw_artifact_path, + "pageindex_doc_id": entry.pageindex_doc_id, + "pageindex_tree_status": entry.pageindex_tree_status, + "metadata": entry.metadata, + "metadata_status": entry.metadata_status, + "folder_path": entry.folder_path, + } + + @staticmethod + def _query_text(query: str | list[str] | None) -> str: + if query is None: + return "" + if isinstance(query, list): + return " ".join(str(item) for item in query) + return str(query) + + @classmethod + def _fts_match_queries(cls, query: str) -> list[str]: + terms = cls._fts_terms(query) + if not terms: + return [] + queries = [" ".join(terms)] + if len(terms) > 1: + queries.append(" OR ".join(terms)) + return queries + + @staticmethod + def _fts_terms(query: str) -> list[str]: + stopwords = { + "a", + "an", + "and", + "are", + 
"as", + "at", + "be", + "by", + "did", + "do", + "does", + "for", + "from", + "how", + "in", + "is", + "it", + "of", + "on", + "or", + "that", + "the", + "to", + "was", + "were", + "what", + "when", + "where", + "which", + "who", + "why", + "with", + } + terms = re.findall(r"[A-Za-z0-9_]+", query.lower()) + unique_terms = [] + seen = set() + for term in terms: + if term in stopwords or term in seen: + continue + seen.add(term) + unique_terms.append(term) + return unique_terms + + @staticmethod + def _metadata_value_items(value: Any) -> list[dict[str, Any]]: + if value is None: + return [] + if isinstance(value, list): + items = [] + for item in value: + items.extend(SQLiteFileSystemStore._metadata_value_items(item)) + return items + value_json = json.dumps(value, ensure_ascii=False, sort_keys=True) + value_text = SQLiteFileSystemStore._metadata_compare_text(value) + return [ + { + "value_text": value_text, + "value_number": float(value) if isinstance(value, (int, float)) and not isinstance(value, bool) else None, + "value_bool": int(value) if isinstance(value, bool) else None, + "value_json": value_json, + } + ] + + @staticmethod + def _metadata_compare_text(value: Any) -> str: + if isinstance(value, bool): + return "true" if value else "false" + if isinstance(value, (dict, list)): + return json.dumps(value, ensure_ascii=False, sort_keys=True) + return "" if value is None else str(value) + + @staticmethod + def indexed_metadata_values(metadata: dict[str, Any]) -> dict[str, Any]: + return dict(metadata) + + @staticmethod + def _valid_field_name(name: str) -> bool: + return re.match(r"^[A-Za-z][A-Za-z0-9_]*$", str(name)) is not None + + @staticmethod + def folder_id(path: str) -> str: + normalized = normalize_path(path) + if normalized == "/": + return "folder_root" + digest = hashlib.sha1(normalized.encode("utf-8")).hexdigest()[:16] + return f"folder_{digest}" + + @staticmethod + def field_id(name: str) -> str: + digest = 
hashlib.sha1(name.encode("utf-8")).hexdigest()[:16] + return f"field_{digest}" + + +def normalize_path(path: str | Path | None) -> str: + if path is None: + return "/" + if str(path).strip().lower() == "root": + return "/" + parts = [part for part in str(path).replace("\\", "/").split("/") if part and part != "."] + return "/" + "/".join(parts) if parts else "/" + + +def make_file_ref(seed: str) -> str: + digest = hashlib.sha1(seed.encode("utf-8")).hexdigest()[:16] + return f"file_{digest}" + + +def fingerprint(content: str) -> str: + return hashlib.sha256(content.encode("utf-8")).hexdigest() + + +def metadata_text(metadata: dict[str, Any]) -> str: + values = [] + for value in metadata.values(): + if isinstance(value, list): + values.extend(str(item) for item in value) + elif isinstance(value, dict): + values.append(json.dumps(value, ensure_ascii=False, sort_keys=True)) + elif value is not None: + values.append(str(value)) + return " ".join(values) diff --git a/pageindex/filesystem/structural_read.py b/pageindex/filesystem/structural_read.py new file mode 100644 index 000000000..aca2bcdcd --- /dev/null +++ b/pageindex/filesystem/structural_read.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from copy import deepcopy +from typing import Any + + +def strip_pageindex_text_fields(value: Any) -> Any: + if isinstance(value, list): + return [strip_pageindex_text_fields(item) for item in value] + if isinstance(value, dict): + return { + key: strip_pageindex_text_fields(item) + for key, item in value.items() + if key != "text" + } + return value + + +def flatten_pageindex_structure_nodes(structure: Any) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + + def visit(value: Any, *, depth: int, parent_node_id: str | None) -> None: + if isinstance(value, list): + for item in value: + visit(item, depth=depth, parent_node_id=parent_node_id) + return + if not isinstance(value, dict): + return + + node_id = value.get("node_id") + child_values: list[Any] = [] + 
for child_key in ("nodes", "children"): + children = value.get(child_key) + if isinstance(children, list): + child_values.extend(children) + + row = { + key: strip_pageindex_text_fields(item) + for key, item in value.items() + if key not in {"text", "nodes", "children"} + } + row["depth"] = depth + row["children_count"] = len(child_values) + if parent_node_id: + row["parent_node_id"] = parent_node_id + rows.append(row) + + next_parent = str(node_id) if node_id is not None else parent_node_id + for child in child_values: + visit(child, depth=depth + 1, parent_node_id=next_parent) + + visit(structure, depth=0, parent_node_id=None) + return rows + + +def find_pageindex_node(structure: Any, node_id: str) -> dict[str, Any] | None: + if isinstance(structure, dict): + if str(structure.get("node_id", "")) == str(node_id): + return deepcopy(structure) + for child_key in ("nodes", "children"): + found = find_pageindex_node(structure.get(child_key), node_id) + if found is not None: + return found + if isinstance(structure, list): + for item in structure: + found = find_pageindex_node(item, node_id) + if found is not None: + return found + return None + + +def first_node_location(node: dict[str, Any]) -> str | None: + for key in ("line_num", "physical_index", "start_index"): + value = node.get(key) + if value is not None and value != "": + return str(value) + return None diff --git a/pageindex/filesystem/types.py b/pageindex/filesystem/types.py new file mode 100644 index 000000000..103d28dd6 --- /dev/null +++ b/pageindex/filesystem/types.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Optional + + +@dataclass(frozen=True) +class SearchResult: + file_ref: str + external_id: Optional[str] + title: str + snippet: str + folder_path: str + folder_paths: list[str] + metadata: dict[str, Any] + source_path: str = "" + id: Optional[str] = None + document_id: Optional[str] = None + name: str = "" + description: 
str = "" + status: str = "" + pageNum: Optional[int] = None + createdAt: Optional[str] = None + folderId: Optional[str] = None + metadata_status: dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class OpenResult: + file_ref: str + start_line: int + end_line: int + text: str + external_id: Optional[str] = None + folder_path: str = "" + source_path: str = "" + + +@dataclass(frozen=True) +class FolderEntry: + folder_id: str + parent_id: Optional[str] + name: str + path: str + kind: str + + +@dataclass(frozen=True) +class FileEntry: + file_ref: str + external_id: Optional[str] + storage_uri: str + source_path: str + title: str + descriptor: str + content_type: str + source_type: Optional[str] + fingerprint: str + text_artifact_path: str + raw_artifact_path: Optional[str] + pageindex_doc_id: Optional[str] + pageindex_tree_status: str + metadata: dict[str, Any] + folder_path: str + metadata_status: dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class MetadataField: + name: str + field_type: str + description: str = "" + indexed: bool = True + faceted: bool = False + sortable: bool = False + source: str = "manual" + + +@dataclass(frozen=True) +class CommandResult: + command: str + data: Any + text: str diff --git a/pifs b/pifs new file mode 100755 index 000000000..fb2dbc08e --- /dev/null +++ b/pifs @@ -0,0 +1,10 @@ +#!/bin/sh +set -eu + +SCRIPT_DIR=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd) + +if [ -x "$SCRIPT_DIR/.venv/bin/python" ]; then + exec "$SCRIPT_DIR/.venv/bin/python" -m pageindex.filesystem.cli "$@" +fi + +exec python3 -m pageindex.filesystem.cli "$@" diff --git a/requirements.txt b/requirements.txt index e6ad80531..f88e7cb05 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ litellm==1.83.7 -# openai-agents # optional: required for examples/agentic_vectorless_rag_demo.py +# openai-agents==0.17.2 # optional: required for pifs chat/ask and examples/agentic_vectorless_rag_demo.py 
pymupdf==1.26.4 PyPDF2==3.0.1 python-dotenv==1.2.2 pyyaml==6.0.2 +sqlite-vec>=0.1.9 diff --git a/tests/test_filesystem_store.py b/tests/test_filesystem_store.py new file mode 100644 index 000000000..7f425038f --- /dev/null +++ b/tests/test_filesystem_store.py @@ -0,0 +1,45 @@ +import json + + +def test_insert_files_does_not_disable_sqlite_synchronous(tmp_path): + from pageindex.filesystem.store import SQLiteFileSystemStore + + statements = [] + + class RecordingStore(SQLiteFileSystemStore): + def connect(self): + conn = super().connect() + conn.set_trace_callback(statements.append) + return conn + + store = RecordingStore(tmp_path / "workspace") + statements.clear() + + store.insert_files( + [ + { + "file_ref": "ref_report", + "external_id": "doc_report", + "storage_uri": "file:///tmp/report.pdf", + "source_path": "documents/report.pdf", + "folder_path": "/documents", + "title": "Report", + "descriptor": "documents/report.pdf", + "content_type": "application/pdf", + "source_type": "documents", + "fingerprint": "fingerprint", + "text_artifact_path": "artifacts/text/ref_report.txt", + "raw_artifact_path": None, + "metadata": {}, + "metadata_json": json.dumps({}), + "metadata_text": "", + "content": "", + "skip_fts": True, + } + ] + ) + + assert not any( + statement.upper().replace(" ", "") == "PRAGMASYNCHRONOUS=OFF" + for statement in statements + ) diff --git a/tests/test_import_surface.py b/tests/test_import_surface.py new file mode 100644 index 000000000..b4309cf05 --- /dev/null +++ b/tests/test_import_surface.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +import builtins +import importlib +import sys + + +def test_filesystem_import_works_without_eager_optional_dependencies(monkeypatch): + blocked_roots = {"litellm", "openai", "PyPDF2", "pymupdf", "sqlite_vec"} + real_import = builtins.__import__ + + def clear_pageindex_modules() -> None: + for name in list(sys.modules): + if name == "pageindex" or name.startswith("pageindex."): + sys.modules.pop(name, 
None) + + def import_without_optional_deps(name, globals=None, locals=None, fromlist=(), level=0): + root = name.split(".", 1)[0] + if root in blocked_roots: + raise ModuleNotFoundError(f"No module named '{root}'", name=root) + return real_import(name, globals, locals, fromlist, level) + + clear_pageindex_modules() + try: + with monkeypatch.context() as patch: + patch.setattr(builtins, "__import__", import_without_optional_deps) + + filesystem_module = importlib.import_module("pageindex.filesystem") + from pageindex import PageIndexFileSystem as TopLevelPageIndexFileSystem + from pageindex.filesystem import PageIndexFileSystem + + assert filesystem_module.PageIndexFileSystem is PageIndexFileSystem + assert TopLevelPageIndexFileSystem is PageIndexFileSystem + finally: + clear_pageindex_modules() diff --git a/tests/test_metadata_generation.py b/tests/test_metadata_generation.py new file mode 100644 index 000000000..3e64a4b9e --- /dev/null +++ b/tests/test_metadata_generation.py @@ -0,0 +1,30 @@ +import sys +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + + +def test_metadata_generator_uses_provider_parameter(): + from pageindex.filesystem.metadata_generation import ( + MetadataGenerationError, + MetadataGenerationInput, + MetadataGenerator, + ) + + generator = MetadataGenerator(provider="unsupported", model="unused") + request = MetadataGenerationInput( + file_ref="file_a", + external_id="doc_a", + title="A", + source_path="docs/a.txt", + content_type="text/plain", + source_type=None, + text="hello", + ) + + with pytest.raises(MetadataGenerationError, match="unsupported metadata provider: unsupported"): + generator.generate(request, fields=["summary"]) diff --git a/tests/test_pageindex_filesystem_scope.py b/tests/test_pageindex_filesystem_scope.py new file mode 100644 index 000000000..087473aab --- /dev/null +++ 
b/tests/test_pageindex_filesystem_scope.py @@ -0,0 +1,496 @@ +import json +from types import SimpleNamespace + +import pytest + + +def test_filesystem_lazy_exports_remain_public(): + import pageindex.filesystem as filesystem + from pageindex.filesystem import ( + HybridProjectionSearchBackend, + RebuildableSemanticIndex, + SemanticIndexRecord, + SemanticSearchResult, + SQLiteVecSemanticIndex, + SummaryProjectionIndexer, + ) + + for name in ( + "HybridProjectionSearchBackend", + "RebuildableSemanticIndex", + "SemanticIndexRecord", + "SemanticSearchResult", + "SQLiteVecSemanticIndex", + "SummaryProjectionIndexer", + ): + assert name in filesystem.__all__ + assert name in dir(filesystem) + + assert HybridProjectionSearchBackend.__name__ == "HybridProjectionSearchBackend" + assert RebuildableSemanticIndex.__name__ == "RebuildableSemanticIndex" + assert SemanticIndexRecord.__name__ == "SemanticIndexRecord" + assert SemanticSearchResult.__name__ == "SemanticSearchResult" + assert SQLiteVecSemanticIndex.__name__ == "SQLiteVecSemanticIndex" + assert SummaryProjectionIndexer.__name__ == "SummaryProjectionIndexer" + + +class SummaryBackend: + def __init__(self, document_id): + self.document_id = document_id + self.calls = [] + + def available_channels(self): + return ("summary",) + + def search_channel(self, channel, query, *, limit=10, filters=None): + self.calls.append((channel, query, filters)) + return [ + SimpleNamespace( + document_id=self.document_id, + snippet=f"summary candidate: {query}", + ) + ] + + +class ChannelBackend: + def __init__(self, document_id, channels=("summary", "entity", "relation")): + self.document_id = document_id + self.channels = channels + + def available_channels(self): + return self.channels + + def search_channel(self, channel, query, *, limit=10, filters=None): + return [ + SimpleNamespace( + document_id=self.document_id, + snippet=f"{channel} candidate: {query}", + ) + ] + + +def 
test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.metadata_generation import MetadataGenerationResult + + class SummaryGenerator: + def generate(self, document, *, fields): + return MetadataGenerationResult( + values={"summary": "Federal Reserve annual report summary"} + ) + + filesystem = PageIndexFileSystem( + workspace=tmp_path / "workspace", + metadata_generator=SummaryGenerator(), + ) + file_ref = filesystem.register_file( + storage_uri="file:///tmp/report.pdf", + source_path="examples/documents/report.pdf", + folder_path="/documents", + external_id="dsid_report", + title="report.pdf", + metadata={"source_type": "examples-documents"}, + content="Federal Reserve supervision and regulation annual report.", + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + } + }, + ) + backend = SummaryBackend("dsid_report") + filesystem.semantic_retrieval_backend = backend + executor = PIFSCommandExecutor(filesystem, json_output=True) + + result = json.loads( + executor.execute('search-summary "Federal Reserve annual report" /documents') + ) + + assert backend.calls[0][2] == {} + assert result["data"]["data"][0] == { + "path": "/examples/documents/report.pdf", + "summary": "Federal Reserve annual report summary", + "line_text": "1: Federal Reserve supervision and regulation annual report.", + } + assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == file_ref + + executor.json_output = False + rendered = executor.execute('search-summary "Federal Reserve annual report" /documents') + assert "path: /examples/documents/report.pdf" in rendered + assert "summary: Federal Reserve annual report summary" in rendered + assert "line_text: 1: Federal Reserve supervision and regulation annual report." 
in rendered + assert "id=dsid_report" not in rendered + assert "file_ref=" not in rendered + + +def test_semantic_search_path_is_unique_source_target_when_titles_collide(tmp_path): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.metadata_generation import MetadataGenerationResult + + class SummaryGenerator: + def generate(self, document, *, fields): + return MetadataGenerationResult( + values={"summary": f"summary for {document.external_id}"} + ) + + filesystem = PageIndexFileSystem( + workspace=tmp_path / "workspace", + metadata_generator=SummaryGenerator(), + ) + first_ref = filesystem.register_file( + storage_uri="file:///tmp/first.json", + source_path="slack/dsid_first.json", + folder_path="/documents", + external_id="dsid_first", + title="announcements", + content="first announcement mentions H200 reservations.", + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + } + }, + ) + filesystem.register_file( + storage_uri="file:///tmp/second.json", + source_path="slack/dsid_second.json", + folder_path="/documents", + external_id="dsid_second", + title="announcements", + content="second announcement mentions unrelated maintenance.", + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + } + }, + ) + filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first") + executor = PIFSCommandExecutor(filesystem, json_output=True) + + result = json.loads(executor.execute('search-summary "H200 reservations" /documents')) + + assert result["data"]["data"][0]["path"] == "/slack/dsid_first.json" + assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref + with pytest.raises(KeyError, match="Ambiguous file target"): + filesystem.store.resolve_file_ref("/documents/announcements") + + +def test_semantic_search_path_falls_back_when_source_target_is_ambiguous(tmp_path): + 
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.metadata_generation import MetadataGenerationResult + + class SummaryGenerator: + def generate(self, document, *, fields): + return MetadataGenerationResult( + values={"summary": f"summary for {document.external_id}"} + ) + + filesystem = PageIndexFileSystem( + workspace=tmp_path / "workspace", + metadata_generator=SummaryGenerator(), + ) + first_ref = filesystem.register_file( + storage_uri="file:///tmp/first.json", + source_path="shared/source.json", + folder_path="/documents", + external_id="dsid_first", + title="First", + content="first content", + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + } + }, + ) + filesystem.register_file( + storage_uri="file:///tmp/second.json", + source_path="shared/source.json", + folder_path="/documents", + external_id="dsid_second", + title="Second", + content="second content", + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + } + }, + ) + filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first") + executor = PIFSCommandExecutor(filesystem, json_output=True) + + result = json.loads(executor.execute('search-summary "first" /documents')) + + assert result["data"]["data"][0]["path"] == "dsid_first" + assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref + + +def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.metadata_generation import MetadataGenerationResult + + class MetadataGenerator: + def generate(self, document, *, fields): + values = { + "summary": "Risk and compliance summary", + "entity": "Federal Reserve; Disney", + "relation": "Federal Reserve affects Disney valuation", + } + return MetadataGenerationResult(values={field: 
values[field] for field in fields}) + + filesystem = PageIndexFileSystem( + workspace=tmp_path / "workspace", + metadata_generator=MetadataGenerator(), + ) + filesystem.register_file( + storage_uri="file:///tmp/market-note.pdf", + source_path="examples/documents/market-note.pdf", + folder_path="/documents", + external_id="dsid_market_note", + title="market-note.pdf", + content="Federal Reserve policy affects Disney valuation.", + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + "entity": True, + "relation": True, + } + }, + ) + filesystem.semantic_retrieval_backend = ChannelBackend("dsid_market_note") + executor = PIFSCommandExecutor(filesystem, json_output=True) + + entity = json.loads(executor.execute('search-entity "Federal Reserve" /documents')) + assert entity["data"]["data"][0] == { + "path": "/examples/documents/market-note.pdf", + "summary": "Risk and compliance summary", + "line_text": "1: Federal Reserve policy affects Disney valuation.", + "entity": "Federal Reserve; Disney", + } + + relation = json.loads(executor.execute('search-relation "Disney valuation" /documents')) + assert relation["data"]["data"][0] == { + "path": "/examples/documents/market-note.pdf", + "summary": "Risk and compliance summary", + "line_text": "1: Federal Reserve policy affects Disney valuation.", + "relation": "Federal Reserve affects Disney valuation", + } + + executor.json_output = False + rendered = executor.execute('search-entity "Federal Reserve" /documents') + assert "path: /examples/documents/market-note.pdf" in rendered + assert "summary: Risk and compliance summary" in rendered + assert "entity: Federal Reserve; Disney" in rendered + assert "file_ref=" not in rendered + + +def test_semantic_search_rejects_unquoted_multi_word_query(tmp_path): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.commands import PIFSCommandError + + filesystem = 
PageIndexFileSystem(workspace=tmp_path / "workspace") + filesystem.register_file( + storage_uri="file:///tmp/report.pdf", + source_path="examples/documents/report.pdf", + folder_path="/documents", + external_id="dsid_report", + title="Annual report", + content="Federal Reserve supervision and regulation annual report.", + ) + filesystem.semantic_retrieval_backend = SummaryBackend("dsid_report") + executor = PIFSCommandExecutor(filesystem, json_output=True) + + with pytest.raises(PIFSCommandError, match="Quote multi-word queries"): + executor.execute("search-summary Federal Reserve /documents") + + with pytest.raises(PIFSCommandError, match="quote it"): + executor.execute("search-summary Federal Reserve") + + with pytest.raises(PIFSCommandError, match="does not support regex alternation"): + executor.execute('search-summary "Federal|Reserve" /documents') + + +def test_semantic_search_scope_filters_explicit_source_type_facets(): + from pageindex.filesystem import PageIndexFileSystem + + assert PageIndexFileSystem._semantic_filters_for_scope( + {"folder_path": "/source_type=google-drive"} + ) == {"source_type": "google_drive"} + assert PageIndexFileSystem._semantic_filters_for_scope( + {"folder_path": "/semantic/source_type=google-drive"} + ) == {"source_type": "google_drive"} + assert PageIndexFileSystem._semantic_filters_for_scope( + {"folder_path": "/documents"} + ) == {} + + +def test_grep_source_file_requires_terms_on_same_line(tmp_path): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + + source_dir = tmp_path / "source" / "documents" + source_dir.mkdir(parents=True) + source = source_dir / "split.json" + source.write_text( + '{\n "first": "alpha evidence lives here",\n' + ' "second": "omega evidence lives there"\n}\n', + encoding="utf-8", + ) + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + filesystem.register_file( + storage_uri=str(source), + source_path="documents/split.json", + folder_path="/documents", + 
external_id="doc_split_terms", + title="Split source terms", + content="registered artifact without the searched tokens", + ) + executor = PIFSCommandExecutor(filesystem, json_output=True) + + result = json.loads(executor.execute('grep -R "alpha omega" /documents')) + + assert result["data"]["mode"] == "files" + assert result["data"]["data"] == [] + + matched = json.loads(executor.execute('grep -R "alpha evidence" /documents')) + + assert matched["data"]["data"][0]["external_id"] == "doc_split_terms" + assert matched["data"]["data"][0]["line"] == 2 + assert "alpha evidence" in matched["data"]["data"][0]["text"] + + +def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path, monkeypatch): + from pageindex.filesystem import PageIndexFileSystem + from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex + + workspace = tmp_path / "workspace" + index_dir = workspace / "artifacts" / "projection_indexes" + summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") + summary_index.reset( + dimension=3, + metadata={ + "channel": "summary", + "embedding_provider": "openai", + "embedding_model": "test-embedding", + "embedding_dimensions": 3, + }, + ) + summary_index.upsert_many( + [ + SemanticIndexRecord( + file_ref="file_a", + external_id="doc_a", + source_type="documents", + source_path="documents/a.pdf", + title="A", + text="summary", + vector=[1.0, 0.0, 0.0], + ) + ] + ) + filesystem = PageIndexFileSystem(workspace) + calls = [] + + def fake_configure(index_dir_arg, **kwargs): + calls.append((index_dir_arg, kwargs)) + filesystem.semantic_retrieval_backend = SummaryBackend("doc_a") + return filesystem.semantic_retrieval_backend + + monkeypatch.setattr( + filesystem, + "configure_hybrid_projection_retrieval", + fake_configure, + ) + + assert filesystem.configure_existing_projection_retrieval() is True + assert calls == [ + ( + filesystem.summary_projection_index_dir, + { + "embedding_provider": 
"openai", + "embedding_model": "test-embedding", + "embedding_dimensions": 3, + "embedding_timeout": 60, + }, + ) + ] + assert filesystem.semantic_retrieval_channels() == ("summary",) + + +def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path): + from pageindex.filesystem import PageIndexFileSystem + from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend + from pageindex.filesystem.metadata_generation import MetadataGenerationResult + from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + + class FixedEmbedder: + def embed(self, texts): + return [[1.0, 0.0, 0.0] for _ in texts] + + class SummaryGenerator: + def generate(self, document, *, fields): + return MetadataGenerationResult( + values={"summary": "vendor renewal risk matrix"} + ) + + source = tmp_path / "source.txt" + source.write_text("ordinary fixture body", encoding="utf-8") + index_dir = tmp_path / "workspace" / "artifacts" / "projection_indexes" + indexer = SummaryProjectionIndexer( + index_dir, + embedder=FixedEmbedder(), + embedding_provider="test", + embedding_model="fake", + embedding_dimensions=3, + ) + backend = HybridProjectionSearchBackend( + index_dir, + embedder=FixedEmbedder(), + embedding_provider="test", + embedding_model="fake", + embedding_dimensions=3, + ) + filesystem = PageIndexFileSystem( + workspace=tmp_path / "workspace", + metadata_generator=SummaryGenerator(), + summary_projection_indexer=indexer, + semantic_retrieval_backend=backend, + ) + filesystem.register_file( + storage_uri=source.as_uri(), + source_path="docs/source.txt", + folder_path="/documents", + external_id="doc_summary_only", + title="Operations note", + content=source.read_text(encoding="utf-8"), + metadata={"department": "ops"}, + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + } + }, + ) + + assert filesystem.search("purchase order exposure", semantic=False) == [] 
+ + results = filesystem.search("purchase order exposure", semantic=True) + + assert [result.external_id for result in results] == ["doc_summary_only"] + assert results[0].snippet == "summary_vector rank=1" diff --git a/tests/test_pageindex_structural_read.py b/tests/test_pageindex_structural_read.py new file mode 100644 index 000000000..3994aa413 --- /dev/null +++ b/tests/test_pageindex_structural_read.py @@ -0,0 +1,766 @@ +import json +import tempfile +from pathlib import Path + +import pytest + + +def write_pageindex_client_doc(workspace: Path, doc_id: str, doc: dict) -> None: + workspace.mkdir(parents=True, exist_ok=True) + (workspace / f"{doc_id}.json").write_text( + json.dumps(doc, ensure_ascii=False, indent=2), + encoding="utf-8", + ) + meta = { + doc_id: { + "type": doc.get("type", ""), + "doc_name": doc.get("doc_name", ""), + "doc_description": doc.get("doc_description", ""), + "path": doc.get("path", ""), + } + } + if doc.get("type") == "pdf": + meta[doc_id]["page_count"] = doc.get("page_count") + elif doc.get("type") == "md": + meta[doc_id]["line_count"] = doc.get("line_count") + (workspace / "_meta.json").write_text( + json.dumps(meta, ensure_ascii=False, indent=2), + encoding="utf-8", + ) + + +class RecordingMetadataGenerator: + values = { + "summary": "Generated retrieval summary.", + "doc_type": "technical_note", + "domain": "documentation", + "topic": "pageindex extraction", + } + + def __init__(self): + self.calls = [] + + def generate(self, request, *, fields): + self.calls.append((request, list(fields))) + return {field: self.values[field] for field in fields if field in self.values} + + +def test_pageindex_structure_options_report_failed_register_build(monkeypatch): + from pageindex import PageIndexClient + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + + with tempfile.TemporaryDirectory() as tmp: + source = Path(tmp) / "report.md" + source.write_text("# Report\n\nCached structure is not built yet.", 
encoding="utf-8") + filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") + + def fail_index(*args, **kwargs): + raise RuntimeError("index failed: extractor unavailable") + + monkeypatch.setattr(PageIndexClient, "index", fail_index) + filesystem.register_file( + storage_uri=source.as_uri(), + source_path="docs/report.md", + external_id="dsid_structural_missing", + title="Structural report", + content=source.read_text(encoding="utf-8"), + ) + executor = PIFSCommandExecutor(filesystem, json_output=True) + + structure = json.loads(executor.execute("cat dsid_structural_missing --structure")) + node = json.loads(executor.execute("cat dsid_structural_missing --node 0001")) + pages = json.loads(executor.execute("cat dsid_structural_missing --page 1-2")) + stat = json.loads(executor.execute("stat dsid_structural_missing")) + + assert structure["data"]["mode"] == "structure" + assert structure["data"]["available"] is False + assert structure["data"]["status"] == "failed" + assert "RuntimeError: index failed: extractor unavailable" in structure["data"]["message"] + assert stat["data"]["pageindex_tree_status"] == "failed" + assert stat["data"]["metadata_status"]["pageindex_tree"] == { + "status": "failed", + "owner": "pageindex", + "source": "PageIndexClient.index", + "error_type": "RuntimeError", + "message": "index failed: extractor unavailable", + } + + assert node["data"]["mode"] == "node" + assert node["data"]["available"] is False + assert node["data"]["node_id"] == "0001" + + assert pages["data"]["mode"] == "page" + assert pages["data"]["available"] is False + assert pages["data"]["pages"] == "1-2" + + assert "cp" not in executor.allowed_commands() + assert "mkdir" not in executor.allowed_commands() + + +def test_register_pdf_markdown_uses_pageindex_extracted_text_for_metadata_and_fts(monkeypatch): + from pageindex import PageIndexClient + from pageindex.filesystem import PageIndexFileSystem + + def fake_index(self, file_path, mode="auto"): + suffix = 
Path(file_path).suffix.lower() + doc_id = f"doc_{suffix.lstrip('.')}" + if suffix == ".pdf": + doc = { + "id": doc_id, + "type": "pdf", + "path": str(Path(file_path).resolve()), + "doc_name": "report.pdf", + "doc_description": "", + "page_count": 2, + "structure": [{"title": "Report", "node_id": "0001", "nodes": []}], + "pages": [ + {"page": 1, "content": "PageIndex PDF extracted alpha text."}, + {"page": 2, "content": "Second PageIndex PDF extracted beta text."}, + ], + } + else: + doc = { + "id": doc_id, + "type": "md", + "path": str(Path(file_path).resolve()), + "doc_name": "notes", + "doc_description": "", + "line_count": 3, + "structure": [ + { + "title": "Notes", + "node_id": "0001", + "line_num": 1, + "text": "# Notes\n\nPageIndex Markdown extracted gamma text.", + "nodes": [], + } + ], + } + write_pageindex_client_doc(self.workspace, doc_id, doc) + self.documents[doc_id] = doc + return doc_id + + monkeypatch.setattr(PageIndexClient, "index", fake_index) + with tempfile.TemporaryDirectory() as tmp: + source_pdf = Path(tmp) / "report.pdf" + source_md = Path(tmp) / "notes.md" + source_pdf.write_bytes(b"%PDF-1.4\n% test fixture\n") + source_md.write_text("# Notes\n\nCaller markdown content", encoding="utf-8") + generator = RecordingMetadataGenerator() + filesystem = PageIndexFileSystem( + workspace=Path(tmp) / "workspace", + metadata_generator=generator, + ) + + filesystem.register_file( + storage_uri=source_pdf.as_uri(), + source_path="docs/report.pdf", + external_id="dsid_pdf_extracted", + title="PDF extracted", + content="CALLER PDF CONTENT MUST NOT REACH GENERATOR", + ) + filesystem.register_file( + storage_uri=source_md.as_uri(), + source_path="docs/notes.md", + external_id="dsid_md_extracted", + title="Markdown extracted", + content="CALLER MD CONTENT MUST NOT REACH GENERATOR", + ) + + pdf_request = generator.calls[0][0] + md_request = generator.calls[1][0] + pdf_stat = filesystem.store.file_info("dsid_pdf_extracted") + md_stat = 
filesystem.store.file_info("dsid_md_extracted") + + assert "PageIndex PDF extracted alpha text" in pdf_request.text + assert "Second PageIndex PDF extracted beta text" in pdf_request.text + assert "CALLER PDF CONTENT" not in pdf_request.text + assert "PageIndex Markdown extracted gamma text" in md_request.text + assert "CALLER MD CONTENT" not in md_request.text + assert "PageIndex PDF extracted alpha text" in Path( + pdf_stat["text_artifact_path"] + ).read_text(encoding="utf-8") + assert "PageIndex Markdown extracted gamma text" in Path( + md_stat["text_artifact_path"] + ).read_text(encoding="utf-8") + assert [r.external_id for r in filesystem.search("alpha beta", limit=5)] == [ + "dsid_pdf_extracted" + ] + assert [r.external_id for r in filesystem.search("gamma", limit=5)] == [ + "dsid_md_extracted" + ] + assert filesystem.search("CALLER", limit=5) == [] + + +def test_register_text_metadata_generation_keeps_caller_content_without_pageindex(monkeypatch): + from pageindex import PageIndexClient + from pageindex.filesystem import PageIndexFileSystem + + def fail_index(*args, **kwargs): + raise AssertionError("PageIndexClient.index should not be called for text files") + + monkeypatch.setattr(PageIndexClient, "index", fail_index) + with tempfile.TemporaryDirectory() as tmp: + generator = RecordingMetadataGenerator() + filesystem = PageIndexFileSystem( + workspace=Path(tmp) / "workspace", + metadata_generator=generator, + ) + + filesystem.register_file( + storage_uri="file:///tmp/readme.txt", + source_path="docs/readme.txt", + external_id="dsid_text_generation", + title="Text generation", + content="Plain text caller content stays authoritative.", + content_type="text/plain", + ) + + stat = filesystem.store.file_info("dsid_text_generation") + + assert generator.calls[0][0].text == "Plain text caller content stays authoritative." 
+ assert stat["pageindex_doc_id"] is None + assert stat["pageindex_tree_status"] == "not_built" + assert Path(stat["text_artifact_path"]).read_text( + encoding="utf-8" + ) == "Plain text caller content stays authoritative." + + +def test_register_pdf_markdown_cache_miss_invokes_pageindex_client_index(monkeypatch): + from pageindex import PageIndexClient + from pageindex.filesystem import PageIndexFileSystem + + calls: list[str] = [] + + def fake_index(self, file_path, mode="auto"): + calls.append(str(file_path)) + doc_id = f"doc_{Path(file_path).suffix.lstrip('.')}" + doc_type = "pdf" if Path(file_path).suffix == ".pdf" else "md" + doc = { + "id": doc_id, + "type": doc_type, + "path": str(Path(file_path).resolve()), + "doc_name": Path(file_path).name, + "doc_description": "", + "structure": [{"title": Path(file_path).stem, "node_id": "0001", "nodes": []}], + } + if doc_type == "pdf": + doc["page_count"] = 1 + doc["pages"] = [{"page": 1, "content": "Page one text"}] + else: + doc["line_count"] = 1 + write_pageindex_client_doc(self.workspace, doc_id, doc) + self.documents[doc_id] = doc + return doc_id + + monkeypatch.setattr(PageIndexClient, "index", fake_index) + with tempfile.TemporaryDirectory() as tmp: + source_pdf = Path(tmp) / "report.pdf" + source_md = Path(tmp) / "notes.md" + source_pdf.write_bytes(b"%PDF-1.4\n% test fixture\n") + source_md.write_text("# Notes", encoding="utf-8") + filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") + + filesystem.register_file( + storage_uri=str(source_pdf), + source_path="docs/report.pdf", + external_id="dsid_pdf_build", + title="PDF build", + content="pdf text", + ) + filesystem.register_file( + storage_uri=source_md.as_uri(), + source_path="docs/notes.md", + external_id="dsid_md_build", + title="Markdown build", + content=source_md.read_text(encoding="utf-8"), + ) + + pdf_stat = filesystem.store.file_info("dsid_pdf_build") + md_stat = filesystem.store.file_info("dsid_md_build") + + assert calls == 
[str(source_pdf.resolve()), str(source_md.resolve())] + assert pdf_stat["pageindex_doc_id"] == "doc_pdf" + assert pdf_stat["pageindex_tree_status"] == "built" + assert md_stat["pageindex_doc_id"] == "doc_md" + assert md_stat["pageindex_tree_status"] == "built" + + +def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monkeypatch): + from pageindex import PageIndexClient + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.commands import PIFSCommandError + + with tempfile.TemporaryDirectory() as tmp: + source = Path(tmp) / "report.pdf" + source.write_bytes(b"%PDF-1.4\n% test fixture\n") + workspace = Path(tmp) / "workspace" + filesystem = PageIndexFileSystem(workspace=workspace) + write_pageindex_client_doc( + filesystem.pageindex_client_workspace, + "doc_cached_pdf", + { + "id": "doc_cached_pdf", + "type": "pdf", + "path": str(source.resolve()), + "doc_name": "report.pdf", + "doc_description": "", + "page_count": 2, + "structure": [ + { + "title": "Introduction", + "node_id": "0001", + "text": "Intro section text", + "nodes": [ + { + "title": "Findings", + "node_id": "0002", + "physical_index": 2, + "nodes": [], + } + ], + } + ], + "pages": [ + {"page": 1, "content": "Page one text"}, + {"page": 2, "content": "Page two text"}, + ], + }, + ) + + def fail_index(*args, **kwargs): + raise AssertionError("PageIndexClient.index should not be called on cache hit") + + monkeypatch.setattr(PageIndexClient, "index", fail_index) + filesystem.register_file( + storage_uri=source.as_uri(), + source_path="docs/report.pdf", + external_id="dsid_structural_cached", + title="Cached structural report", + content="text artifact remains available for grep, not cat --all", + ) + executor = PIFSCommandExecutor(filesystem, json_output=True) + + structure = json.loads(executor.execute("cat dsid_structural_cached --structure")) + pages = json.loads(executor.execute("cat dsid_structural_cached --page 1-2")) + stat = 
json.loads(executor.execute("stat dsid_structural_cached")) + + assert structure["data"]["available"] is True + assert structure["data"]["pageindex_doc_id"] == "doc_cached_pdf" + assert structure["data"]["structure"][0]["title"] == "Introduction" + assert structure["data"]["structure"][1]["title"] == "Findings" + assert structure["data"]["structure_pagination"]["limit"] == 25 + assert "text" not in structure["data"]["structure"][0] + assert "text" not in structure["data"]["structure"][1] + + assert pages["data"]["available"] is True + assert pages["data"]["text"] == "Page one text\n\nPage two text" + with pytest.raises(PIFSCommandError, match="target-first"): + executor.execute("cat --page 1-2 dsid_structural_cached") + with pytest.raises(PIFSCommandError, match="one file target"): + executor.execute("cat dsid_structural_cached --page 1 2") + + assert stat["data"]["pageindex_doc_id"] == "doc_cached_pdf" + assert stat["data"]["pageindex_tree_status"] == "built" + + +def test_cat_node_reads_pageindex_client_structure_without_custom_pifs_artifact(): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + + with tempfile.TemporaryDirectory() as tmp: + source = Path(tmp) / "notes.md" + source.write_text("# Notes\n\nBody", encoding="utf-8") + filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") + write_pageindex_client_doc( + filesystem.pageindex_client_workspace, + "doc_cached_md", + { + "id": "doc_cached_md", + "type": "md", + "path": str(source.resolve()), + "doc_name": "notes", + "doc_description": "", + "line_count": 3, + "structure": [ + { + "title": "Notes", + "node_id": "0001", + "line_num": 1, + "text": "# Notes\n\nBody", + "nodes": [], + } + ], + }, + ) + filesystem.register_file( + storage_uri=source.as_uri(), + source_path="docs/notes.md", + external_id="dsid_md_cached", + title="Cached markdown notes", + content=source.read_text(encoding="utf-8"), + ) + executor = PIFSCommandExecutor(filesystem, json_output=True) + + node 
= json.loads(executor.execute("cat dsid_md_cached --node 0001")) + + assert node["data"]["available"] is True + assert node["data"]["pageindex_doc_id"] == "doc_cached_md" + assert node["data"]["node"]["title"] == "Notes" + assert node["data"]["text"] == "# Notes\n\nBody" + assert "text" not in node["data"]["node"] + + +def test_cat_structure_page_node_and_text_outputs_are_hard_limited(): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.commands import PIFSCommandError + + with tempfile.TemporaryDirectory() as tmp: + source = Path(tmp) / "report.pdf" + source.write_bytes(b"%PDF-1.4\n% test fixture\n") + filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") + structure_nodes = [ + { + "title": f"Section {index}", + "node_id": f"{index:04d}", + "start_index": index, + "end_index": index, + "text": f"node {index} text", + "nodes": [], + } + for index in range(1, 31) + ] + write_pageindex_client_doc( + filesystem.pageindex_client_workspace, + "doc_limited_pdf", + { + "id": "doc_limited_pdf", + "type": "pdf", + "path": str(source.resolve()), + "doc_name": "report.pdf", + "doc_description": "", + "page_count": 10, + "structure": structure_nodes, + "pages": [ + {"page": index, "content": f"Page {index} text"} + for index in range(1, 11) + ], + }, + ) + filesystem.register_file( + storage_uri=source.as_uri(), + source_path="docs/report.pdf", + external_id="dsid_limited_pdf", + title="Limited structural report", + content="text artifact remains available for grep", + ) + text_content = "\n".join(f"line {index}" for index in range(1, 106)) + filesystem.register_file( + storage_uri="file:///tmp/long.txt", + source_path="docs/long.txt", + external_id="dsid_long_text", + title="Long text", + content=text_content, + ) + executor = PIFSCommandExecutor(filesystem, json_output=True) + + first_structure = json.loads(executor.execute("cat dsid_limited_pdf --structure")) + assert 
len(first_structure["data"]["structure"]) == 25 + assert first_structure["data"]["structure_pagination"]["has_more"] is True + assert first_structure["data"]["structure_pagination"]["next_offset"] == 25 + + second_structure = json.loads( + executor.execute("cat dsid_limited_pdf --structure --offset 25") + ) + assert len(second_structure["data"]["structure"]) == 5 + assert second_structure["data"]["structure"][0]["node_id"] == "0026" + + pages = json.loads(executor.execute("cat dsid_limited_pdf --page 1-5")) + assert pages["data"]["text"] == ( + "Page 1 text\n\nPage 2 text\n\nPage 3 text\n\nPage 4 text\n\nPage 5 text" + ) + assert pages["data"]["page_pagination"]["limit"] == 5 + with pytest.raises(PIFSCommandError, match="at most 5"): + executor.execute("cat dsid_limited_pdf --page 1-6") + with pytest.raises(PIFSCommandError, match="evidence is sufficient"): + executor.execute("cat dsid_limited_pdf --page 1-6") + + nodes = json.loads( + executor.execute( + "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 " + "0006 0007 0008 0009 0010" + ) + ) + assert nodes["data"]["node_ids"] == [ + "0001", + "0002", + "0003", + "0004", + "0005", + "0006", + "0007", + "0008", + "0009", + "0010", + ] + comma_nodes = json.loads( + executor.execute("cat dsid_limited_pdf --node 0001,0002") + ) + assert comma_nodes["data"]["node_ids"] == ["0001", "0002"] + with pytest.raises(PIFSCommandError, match="at most 10"): + executor.execute( + "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 " + "0006 0007 0008 0009 0010 0011" + ) + with pytest.raises(PIFSCommandError, match="continue with additional chunks"): + executor.execute( + "cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 " + "0006 0007 0008 0009 0010 0011" + ) + + with pytest.raises(PIFSCommandError, match="quote the whole target"): + executor.execute("cat dsid_limited_pdf 0001") + + text = json.loads(executor.execute("cat dsid_long_text --all")) + assert "line 100" in text["data"]["text"] + assert "line 101" not in 
text["data"]["text"] + assert text["data"]["pagination"]["has_more"] is True + assert text["data"]["pagination"]["next_range"] == "101-105" + with pytest.raises(PIFSCommandError, match="at most 100"): + executor.execute("cat dsid_long_text --range 1-101") + + +def test_tree_folder_behavior_is_preserved(): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + + with tempfile.TemporaryDirectory() as tmp: + filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") + filesystem.register_file( + storage_uri="file:///tmp/report.txt", + source_path="docs/report.txt", + folder_path="/docs/reports", + external_id="dsid_folder_tree", + title="Folder report", + content="folder tree behavior remains intact", + ) + executor = PIFSCommandExecutor(filesystem, json_output=True) + + folder_tree = json.loads(executor.execute("tree /docs --depth 2")) + + assert folder_tree["data"]["path"] == "/docs" + assert folder_tree["data"]["folders"][0]["path"] == "/docs/reports" + + +def test_tree_does_not_read_file_internal_pageindex_structure(): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.commands import PIFSCommandError + + with tempfile.TemporaryDirectory() as tmp: + source = Path(tmp) / "report.pdf" + source.write_bytes(b"%PDF-1.4\n% test fixture\n") + filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") + write_pageindex_client_doc( + filesystem.pageindex_client_workspace, + "doc_tree_is_folder_only", + { + "id": "doc_tree_is_folder_only", + "type": "pdf", + "path": str(source.resolve()), + "doc_name": "report.pdf", + "doc_description": "", + "page_count": 1, + "structure": [ + {"title": "Introduction", "node_id": "0001", "nodes": []} + ], + "pages": [{"page": 1, "content": "Page one text"}], + }, + ) + filesystem.register_file( + storage_uri=source.as_uri(), + source_path="docs/report.pdf", + external_id="dsid_tree_is_folder_only", + title="Cached structural report", + 
content="text artifact remains available", + ) + executor = PIFSCommandExecutor(filesystem, json_output=True) + + with pytest.raises(PIFSCommandError): + executor.execute("tree dsid_tree_is_folder_only") + + structure = json.loads(executor.execute("cat dsid_tree_is_folder_only --structure")) + assert structure["data"]["structure"][0]["title"] == "Introduction" + + +def test_cat_all_is_limited_to_text_files(): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.commands import PIFSCommandError + + with tempfile.TemporaryDirectory() as tmp: + filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") + filesystem.register_file( + storage_uri="file:///tmp/readme.txt", + source_path="docs/readme.txt", + external_id="dsid_text_file", + title="Text readme", + content="plain text body", + ) + filesystem.register_file( + storage_uri="file:///tmp/report.pdf", + source_path="docs/report.pdf", + external_id="dsid_pdf_file", + title="PDF report", + content="extracted text should not be served through cat --all", + ) + filesystem.register_file( + storage_uri="file:///tmp/notes.md", + source_path="docs/notes.md", + external_id="dsid_md_file", + title="Markdown notes", + content="markdown text should use PageIndex structure reads", + ) + filesystem.register_file( + storage_uri="file:///tmp/data.json", + source_path="docs/data.json", + external_id="dsid_json_file", + title="JSON record", + content='{"body":"json"}', + content_type="application/json", + ) + executor = PIFSCommandExecutor(filesystem, json_output=True) + + text = json.loads(executor.execute("cat dsid_text_file --all")) + assert text["data"]["text"] == "plain text body" + + with pytest.raises(PIFSCommandError, match="only supported for txt/text files"): + executor.execute("cat dsid_pdf_file --all") + with pytest.raises(ValueError, match="not supported for PDF/Markdown"): + filesystem.open("dsid_pdf_file") + with pytest.raises(PIFSCommandError, match="only 
supported for txt/text files"): + executor.execute("cat dsid_md_file --all") + with pytest.raises(ValueError, match="not supported for PDF/Markdown"): + filesystem.open("dsid_md_file") + with pytest.raises(PIFSCommandError, match="only supported for txt/text files"): + executor.execute("cat dsid_json_file --all") + opened_json = filesystem.open("dsid_json_file") + assert opened_json.text == '{"body":"json"}' + for command in ( + "head dsid_pdf_file", + "tail dsid_pdf_file", + "sed -n 1,1p dsid_pdf_file", + "head dsid_md_file", + "tail dsid_md_file", + "sed -n 1,1p dsid_md_file", + ): + with pytest.raises(PIFSCommandError, match="only supported for txt/text files"): + executor.execute(command) + + +def test_pageindex_structure_commands_are_limited_to_pdf_and_markdown(): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.commands import PIFSCommandError + + with tempfile.TemporaryDirectory() as tmp: + filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") + filesystem.register_file( + storage_uri="file:///tmp/readme.txt", + source_path="docs/readme.txt", + external_id="dsid_text_only", + title="Text readme", + content="plain text body", + ) + executor = PIFSCommandExecutor(filesystem, json_output=True) + + for command in ( + "cat dsid_text_only --structure", + "cat dsid_text_only --page 1", + "cat dsid_text_only --node 0001", + ): + with pytest.raises(PIFSCommandError, match="only supported for PDF/Markdown"): + executor.execute(command) + + +def test_existing_pageindex_status_allows_legacy_record_without_format_suffix(): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.commands import PIFSCommandError + + with tempfile.TemporaryDirectory() as tmp: + source = Path(tmp) / "uploaded" + source.write_text("# Uploaded\n\nBody", encoding="utf-8") + filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") + file_ref = filesystem.register_file( 
+ storage_uri=source.as_uri(), + source_path="uploads/uploaded", + external_id="dsid_legacy_pageindex", + title="Legacy PageIndex record", + content="text/plain is only a weak default here", + ) + write_pageindex_client_doc( + filesystem.pageindex_client_workspace, + "doc_legacy_pageindex", + { + "id": "doc_legacy_pageindex", + "type": "md", + "path": str(source.resolve()), + "doc_name": "uploaded", + "doc_description": "", + "line_count": 3, + "structure": [ + {"title": "Uploaded", "node_id": "0001", "text": "Body", "nodes": []} + ], + }, + ) + filesystem.store.update_pageindex_pointer( + file_ref, + pageindex_doc_id="doc_legacy_pageindex", + pageindex_tree_status="built", + ) + executor = PIFSCommandExecutor(filesystem, json_output=True) + + structure = json.loads(executor.execute("cat dsid_legacy_pageindex --structure")) + assert structure["data"]["structure"][0]["title"] == "Uploaded" + with pytest.raises(PIFSCommandError, match="only supported for txt/text files"): + executor.execute("cat dsid_legacy_pageindex --all") + + +def test_read_commands_do_not_link_pageindex_cache_when_pointer_is_missing(monkeypatch): + from pageindex import PageIndexClient + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + + with tempfile.TemporaryDirectory() as tmp: + source = Path(tmp) / "late.md" + source.write_text("# Late\n\nBody", encoding="utf-8") + filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") + + def fail_index(*args, **kwargs): + raise RuntimeError("index failed") + + monkeypatch.setattr(PageIndexClient, "index", fail_index) + filesystem.register_file( + storage_uri=source.as_uri(), + source_path="docs/late.md", + external_id="dsid_late_cache", + title="Late cache", + content=source.read_text(encoding="utf-8"), + ) + write_pageindex_client_doc( + filesystem.pageindex_client_workspace, + "doc_late_cache", + { + "id": "doc_late_cache", + "type": "md", + "path": str(source.resolve()), + "doc_name": "late", + "doc_description": 
"", + "line_count": 3, + "structure": [ + {"title": "Late", "node_id": "0001", "text": "Body", "nodes": []} + ], + }, + ) + executor = PIFSCommandExecutor(filesystem, json_output=True) + + structure = json.loads(executor.execute("cat dsid_late_cache --structure")) + stat = json.loads(executor.execute("stat dsid_late_cache")) + + assert structure["data"]["available"] is False + assert stat["data"]["pageindex_doc_id"] is None + assert stat["data"]["pageindex_tree_status"] == "failed" diff --git a/tests/test_pifs_agent_stream.py b/tests/test_pifs_agent_stream.py new file mode 100644 index 000000000..69f62edde --- /dev/null +++ b/tests/test_pifs_agent_stream.py @@ -0,0 +1,272 @@ +import io +import os +import threading +import unittest +from unittest.mock import patch +from types import SimpleNamespace + +from pydantic import BaseModel, ConfigDict + +from pageindex.filesystem import agent as agent_module +from pageindex.filesystem.agent import ( + AGENT_TOOL_POLICY, + AGENT_SYSTEM_PROMPT, + BASH_TOOL_DESCRIPTION, + PIFSAgentSession, + PIFSAgentStreamObserver, + build_agent_model_settings, + normalize_agent_stream_mode, + normalize_reasoning_effort, + normalize_reasoning_summary, + pifs_agent_raw_reasoning_enabled, + serialize_agent_final_output, + should_disable_pifs_agent_tracing, + should_use_openai_compatible_chat_model, +) + + +class StructuredAnswer(BaseModel): + model_config = ConfigDict(extra="forbid") + + answer: str + document_ids: list[str] + + +class PIFSAgentStreamTest(unittest.TestCase): + def raw_event(self, event_type, delta): + return SimpleNamespace( + type="raw_response_event", + data=SimpleNamespace(type=event_type, delta=delta), + ) + + def test_model_stream_prints_output_and_think_deltas(self): + output = io.StringIO() + stream_log = [] + observer = PIFSAgentStreamObserver("model", stream_log=stream_log, output=output) + + observer.handle_event(self.raw_event("response.reasoning_summary_text.delta", "look up folder")) + 
observer.handle_event(self.raw_event("response.output_text.delta", '{"answer":')) + observer.handle_event(self.raw_event("response.output_text.delta", '"done"}')) + observer.finish() + + printed = output.getvalue() + self.assertIn("[llm reasoning summary stream]", printed) + self.assertIn("look up folder", printed) + self.assertIn("[llm final output stream]", printed) + self.assertIn('{"answer":"done"}', printed.replace("\n", "")) + self.assertEqual( + stream_log, + [ + {"kind": "output", "text": '{"answer":"done"}'}, + {"kind": "think_summary", "text": "look up folder"}, + ], + ) + + def test_tools_mode_does_not_print_model_text(self): + output = io.StringIO() + stream_log = [] + observer = PIFSAgentStreamObserver("tools", stream_log=stream_log, output=output) + + observer.handle_event(self.raw_event("response.output_text.delta", "hidden from tools mode")) + observer.handle_event(self.raw_event("response.function_call_arguments.delta", '{"command":"ls /"}')) + observer.emit_tool_call("ls /") + observer.emit_tool_result(ok=True, output='{"ok": true}', seconds=0.001) + observer.finish() + + printed = output.getvalue() + self.assertNotIn("hidden from tools mode", printed) + self.assertIn("[llm -> pifs command]", printed) + self.assertIn("ls /", printed) + self.assertIn("[pifs -> llm result preview]", printed) + self.assertIn('{"ok": true}', printed) + self.assertEqual(stream_log[0], {"kind": "tool_call", "command": "ls /"}) + self.assertEqual(stream_log[1]["kind"], "tool_result") + self.assertEqual(stream_log[2], {"kind": "tool_args", "text": '{"command":"ls /"}'}) + + def test_empty_tool_command_is_not_printed_or_logged(self): + output = io.StringIO() + stream_log = [] + observer = PIFSAgentStreamObserver("tools", stream_log=stream_log, output=output) + + observer.emit_tool_call("") + observer.emit_tool_call(" ") + + self.assertEqual(output.getvalue(), "") + self.assertEqual(stream_log, []) + + def test_tool_result_preview_compacts_large_outputs(self): + output = 
io.StringIO() + observer = PIFSAgentStreamObserver("tools", output=output) + + observer.emit_tool_result( + ok=True, + output="\n".join(f"line {index}" for index in range(50)), + seconds=0.001, + ) + + printed = output.getvalue() + self.assertIn("[large PIFS result", printed) + self.assertIn("line 0", printed) + self.assertIn("more lines omitted from preview", printed) + self.assertNotIn("line 49", printed) + + def test_raw_reasoning_is_not_logged_by_default_but_summary_is(self): + output = io.StringIO() + stream_log = [] + previous = os.environ.pop("PAGEINDEX_PIFS_AGENT_RAW_REASONING", None) + try: + observer = PIFSAgentStreamObserver("model", stream_log=stream_log, output=output) + observer.handle_event(self.raw_event("response.reasoning_text.delta", "private chain")) + observer.handle_event( + self.raw_event("response.reasoning_summary_text.delta", "visible summary") + ) + observer.finish() + finally: + if previous is not None: + os.environ["PAGEINDEX_PIFS_AGENT_RAW_REASONING"] = previous + + printed = output.getvalue() + self.assertNotIn("private chain", printed) + self.assertIn("visible summary", printed) + self.assertEqual(stream_log, [{"kind": "think_summary", "text": "visible summary"}]) + + def test_raw_reasoning_requires_debug_env_flag(self): + self.assertFalse(pifs_agent_raw_reasoning_enabled({})) + self.assertTrue( + pifs_agent_raw_reasoning_enabled({"PAGEINDEX_PIFS_AGENT_RAW_REASONING": "on"}) + ) + self.assertTrue( + pifs_agent_raw_reasoning_enabled({"PAGEINDEX_PIFS_AGENT_RAW_REASONING": "TRUE"}) + ) + self.assertFalse( + pifs_agent_raw_reasoning_enabled({"PAGEINDEX_PIFS_AGENT_RAW_REASONING": "0"}) + ) + + def test_stream_mode_aliases(self): + self.assertEqual(normalize_agent_stream_mode("think"), "model") + self.assertEqual(normalize_agent_stream_mode("debug"), "all") + self.assertEqual(normalize_agent_stream_mode(""), "off") + with self.assertRaises(ValueError): + normalize_agent_stream_mode("nope") + + def 
test_reasoning_settings_enable_effort_and_summary(self): + settings = build_agent_model_settings( + reasoning_effort="medium", + reasoning_summary="detailed", + ) + + self.assertIsNotNone(settings) + self.assertEqual(settings.reasoning.effort, "medium") + self.assertEqual(settings.reasoning.summary, "detailed") + self.assertEqual(settings.verbosity, "low") + + def test_reasoning_effort_defaults_to_visible_summary(self): + settings = build_agent_model_settings(reasoning_effort="low") + + self.assertIsNotNone(settings) + self.assertEqual(settings.reasoning.effort, "low") + self.assertEqual(settings.reasoning.summary, "auto") + + def test_reasoning_and_base_url_normalization(self): + self.assertEqual(normalize_reasoning_effort("xhigh"), "xhigh") + self.assertIsNone(normalize_reasoning_summary("none")) + self.assertFalse(should_use_openai_compatible_chat_model(None)) + self.assertFalse(should_use_openai_compatible_chat_model("https://api.openai.com/v1/")) + self.assertTrue(should_use_openai_compatible_chat_model("https://example.test/v1")) + with self.assertRaises(ValueError): + normalize_reasoning_effort("maximum") + + def test_tracing_is_disabled_by_default_unless_env_enables_it(self): + self.assertTrue(should_disable_pifs_agent_tracing({})) + self.assertFalse( + should_disable_pifs_agent_tracing({"PAGEINDEX_PIFS_AGENT_TRACING": "1"}) + ) + self.assertFalse( + should_disable_pifs_agent_tracing({"PAGEINDEX_PIFS_AGENT_TRACING": "true"}) + ) + self.assertFalse( + should_disable_pifs_agent_tracing({"PAGEINDEX_PIFS_AGENT_TRACING": "on"}) + ) + self.assertTrue( + should_disable_pifs_agent_tracing({"PAGEINDEX_PIFS_AGENT_TRACING": "0"}) + ) + + def test_structured_agent_output_serializes_to_json(self): + output = serialize_agent_final_output( + StructuredAnswer(answer="done", document_ids=["dsid_1"]) + ) + + self.assertEqual(output, '{"answer":"done","document_ids":["dsid_1"]}') + + def test_prompt_tells_agent_when_to_choose_node_or_page(self): + self.assertIn("prefer cat 
--node ", AGENT_TOOL_POLICY) + self.assertIn("page-level evidence", AGENT_TOOL_POLICY) + self.assertIn("prefer\ncat --node ", BASH_TOOL_DESCRIPTION) + self.assertIn("stop if the evidence is sufficient", AGENT_TOOL_POLICY) + self.assertIn("continue with another chunk before answering", BASH_TOOL_DESCRIPTION) + self.assertIn("Do not reconstruct paths from document titles", BASH_TOOL_DESCRIPTION) + self.assertIn("file_ref/document_id", AGENT_TOOL_POLICY) + + def test_prompt_requires_stat_for_metadata_questions(self): + self.assertIn("stat --schema and stat ", AGENT_TOOL_POLICY) + self.assertIn("do not infer metadata presence or absence", AGENT_TOOL_POLICY) + self.assertIn("questions about metadata fields", BASH_TOOL_DESCRIPTION) + self.assertIn("Use stat only for metadata/schema/status questions", AGENT_TOOL_POLICY) + self.assertIn("Do not run stat merely to understand what a document says", AGENT_TOOL_POLICY) + self.assertIn("Do not use stat as a general content/topic discovery step", BASH_TOOL_DESCRIPTION) + + def test_prompt_routes_summary_search_to_search_summary(self): + self.assertIn("search-summary when the user asks for", BASH_TOOL_DESCRIPTION) + self.assertIn('use search-summary "" ', AGENT_TOOL_POLICY) + self.assertIn('search-summary "Federal Reserve" /documents', BASH_TOOL_DESCRIPTION) + self.assertIn("do not translate that request into find --where", AGENT_TOOL_POLICY) + self.assertIn("verify the relevant facts with cat", AGENT_TOOL_POLICY) + self.assertIn("verify the relevant claim with cat", BASH_TOOL_DESCRIPTION) + + def test_prompt_rejects_find_grep_as_exhaustive_search(self): + self.assertIn("Do not use find | grep as an exhaustive search", AGENT_TOOL_POLICY) + self.assertIn("find output can be scoped or limited", AGENT_TOOL_POLICY) + + def test_system_prompt_sets_workspace_identity_and_scope(self): + self.assertIn("PageIndex FileSystem Demo Agent", AGENT_SYSTEM_PROMPT) + self.assertIn("VectifyAI Team", AGENT_SYSTEM_PROMPT) + self.assertIn("current 
PageIndex FileSystem\nworkspace", AGENT_SYSTEM_PROMPT) + self.assertIn("unrelated to the current workspace", AGENT_SYSTEM_PROMPT) + self.assertIn("do not answer it as\na general-purpose assistant", AGENT_SYSTEM_PROMPT) + self.assertIn("workspace-related topic question", AGENT_SYSTEM_PROMPT) + self.assertIn("clarify only after a reasonable search", AGENT_SYSTEM_PROMPT) + self.assertIn("search for candidate documents before asking", AGENT_TOOL_POLICY) + self.assertIn("Do not conclude that no relevant document exists from one failed grep", AGENT_SYSTEM_PROMPT) + self.assertIn("A single failed grep is not enough evidence", AGENT_TOOL_POLICY) + + def test_threaded_runtime_error_is_not_retried_on_fresh_loop(self): + session = object.__new__(PIFSAgentSession) + session.executor = SimpleNamespace(query_context=None) + session.normalized_stream_mode = "off" + session.agent_log = [] + session.max_seconds = None + session.max_turns = 1 + session.session = None + session.agent = object() + + main_thread = threading.get_ident() + run_threads = [] + + def fail_asyncio_run(coro): + coro.close() + run_threads.append(threading.get_ident()) + raise RuntimeError("threaded agent failure") + + with ( + patch.object(agent_module.asyncio, "get_running_loop", return_value=object()), + patch.object(agent_module.asyncio, "run", side_effect=fail_asyncio_run), + ): + with self.assertRaisesRegex(RuntimeError, "threaded agent failure"): + session.run("Question: inspect workspace") + + self.assertEqual(len(run_threads), 1) + self.assertNotEqual(run_threads[0], main_thread) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_pifs_cli.py b/tests/test_pifs_cli.py new file mode 100644 index 000000000..491cbb93c --- /dev/null +++ b/tests/test_pifs_cli.py @@ -0,0 +1,339 @@ +import os +from pathlib import Path + + +class FakeFileSystem: + def __init__(self, workspace): + self.workspace = Path(workspace) + self.projection_retrieval_configured = False + + def 
configure_existing_projection_retrieval(self): + self.projection_retrieval_configured = True + return True + + +def test_cli_workspace_configures_existing_projection_retrieval(monkeypatch, tmp_path): + from pageindex.filesystem import cli + + workspace = tmp_path / "workspace" + + monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem) + + filesystem = cli._filesystem_from_workspace(str(workspace)) + + assert filesystem.workspace == workspace + assert filesystem.projection_retrieval_configured is True + + +def test_cli_passthrough_invokes_pifs_command_executor(monkeypatch, capsys, tmp_path): + from pageindex.filesystem import cli + + workspace = tmp_path / "workspace" + executor_instances = [] + + class FakeExecutor: + def __init__(self, filesystem, *, json_output=False): + self.filesystem = filesystem + self.json_output = json_output + self.commands = [] + executor_instances.append(self) + + def execute(self, command): + self.commands.append(command) + return f"executed:{command}" + + monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem) + monkeypatch.setattr(cli, "PIFSCommandExecutor", FakeExecutor) + + status = cli.main(["--workspace", str(workspace), "ls", "/documents", "--json"]) + + assert status == 0 + assert capsys.readouterr().out == "executed:ls /documents\n" + assert len(executor_instances) == 1 + assert executor_instances[0].filesystem.workspace == workspace + assert executor_instances[0].json_output is True + assert executor_instances[0].commands == ["ls /documents"] + + +def test_cli_set_workspace_persists_default(monkeypatch, capsys, tmp_path): + from pageindex.filesystem import cli + + config_path = tmp_path / "pifs.json" + workspace = tmp_path / "workspace" + monkeypatch.setenv("PIFS_CONFIG_FILE", str(config_path)) + + status = cli.main(["set", "workspace", str(workspace)]) + + assert status == 0 + output = capsys.readouterr().out + assert f"workspace: {workspace}" in output + assert f"config: {config_path}" in output + assert 
config_path.read_text(encoding="utf-8") == ( + '{\n "workspace": "' + str(workspace) + '"\n}\n' + ) + + +def test_cli_passthrough_uses_configured_workspace(monkeypatch, capsys, tmp_path): + from pageindex.filesystem import cli + + config_path = tmp_path / "pifs.json" + workspace = tmp_path / "workspace" + executor_instances = [] + monkeypatch.setenv("PIFS_CONFIG_FILE", str(config_path)) + monkeypatch.delenv("PIFS_WORKSPACE", raising=False) + + class FakeExecutor: + def __init__(self, filesystem, *, json_output=False): + self.filesystem = filesystem + self.json_output = json_output + self.commands = [] + executor_instances.append(self) + + def execute(self, command): + self.commands.append(command) + return f"executed:{command}" + + monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem) + monkeypatch.setattr(cli, "PIFSCommandExecutor", FakeExecutor) + + assert cli.main(["set", "workspace", str(workspace)]) == 0 + capsys.readouterr() + + status = cli.main(["ls", "/documents"]) + + assert status == 0 + assert capsys.readouterr().out == "executed:ls /documents\n" + assert executor_instances[0].filesystem.workspace == workspace + + +def test_cli_ask_invokes_agent_with_question(monkeypatch, capsys, tmp_path): + from pageindex.filesystem import cli + + workspace = tmp_path / "workspace" + agent_calls = [] + + def fake_run_pifs_agent(filesystem, question, **kwargs): + agent_calls.append((filesystem, question, kwargs)) + return "agent answer" + + monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem) + monkeypatch.setattr(cli, "run_pifs_agent", fake_run_pifs_agent) + + status = cli.main( + [ + "ask", + "--workspace", + str(workspace), + "--model", + "test-model", + "--stream-mode", + "off", + "--max-turns", + "7", + "--max-seconds", + "3.5", + "--reasoning-effort", + "low", + "--reasoning-summary", + "concise", + "What", + "is", + "inside?", + ] + ) + + assert status == 0 + assert capsys.readouterr().out == "agent answer\n" + filesystem, question, kwargs = 
agent_calls[0] + assert filesystem.workspace == workspace + assert question == "What is inside?" + assert kwargs == { + "model": "test-model", + "stream_mode": "off", + "max_turns": 7, + "max_seconds": 3.5, + "reasoning_effort": "low", + "reasoning_summary": "concise", + } + + +def test_cli_ask_loads_env_file_before_running_agent(monkeypatch, capsys, tmp_path): + from pageindex.filesystem import cli + + workspace = tmp_path / "workspace" + env_file = tmp_path / ".env" + env_file.write_text("OPENAI_API_KEY=from-dotenv\n", encoding="utf-8") + agent_keys = [] + + def fake_run_pifs_agent(filesystem, question, **kwargs): + agent_keys.append(os.environ.get("OPENAI_API_KEY")) + return "agent answer" + + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem) + monkeypatch.setattr(cli, "run_pifs_agent", fake_run_pifs_agent) + + status = cli.main( + [ + "ask", + "--workspace", + str(workspace), + "--env-file", + str(env_file), + "What", + "is", + "inside?", + ] + ) + + assert status == 0 + assert capsys.readouterr().out == "agent answer\n" + assert agent_keys == ["from-dotenv"] + + +def test_cli_chat_runs_one_question_and_exits(monkeypatch, capsys, tmp_path): + from pageindex.filesystem import cli + + workspace = tmp_path / "workspace" + inputs = iter(["", "Summarize the workspace", "exit"]) + session_instances = [] + session_questions = [] + + class FakeSession: + def __init__(self, filesystem, **kwargs): + self.filesystem = filesystem + self.kwargs = kwargs + session_instances.append(self) + + def run(self, question): + session_questions.append((self, question)) + return f"answer:{question}" + + monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem) + monkeypatch.setattr(cli, "PIFSAgentSession", FakeSession) + monkeypatch.setattr("builtins.input", lambda prompt="": next(inputs)) + + status = cli.main(["chat", "--workspace", str(workspace), "--model", "test-model"]) + + assert status == 0 + assert 
capsys.readouterr().out == "" + assert len(session_instances) == 1 + assert session_instances[0].filesystem.workspace == workspace + assert session_questions == [(session_instances[0], "Summarize the workspace")] + assert session_instances[0].kwargs["model"] == "test-model" + assert session_instances[0].kwargs["stream_mode"] == "all" + + +def test_cli_chat_sanitizes_control_input(monkeypatch, capsys, tmp_path): + from pageindex.filesystem import cli + + workspace = tmp_path / "workspace" + inputs = iter(["\x12", "he\x7fllo\x1b[A", "exit"]) + agent_calls = [] + + class FakeSession: + def __init__(self, filesystem, **kwargs): + pass + + def run(self, question): + agent_calls.append(question) + return f"answer:{question}" + + monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem) + monkeypatch.setattr(cli, "PIFSAgentSession", FakeSession) + monkeypatch.setattr("builtins.input", lambda prompt="": next(inputs)) + + status = cli.main(["chat", "--workspace", str(workspace), "--stream-mode", "off"]) + + assert status == 0 + assert agent_calls == ["hllo"] + assert capsys.readouterr().out == "answer:hllo\n" + + +def test_cli_ask_does_not_reprint_streamed_agent_output(monkeypatch, capsys, tmp_path): + from pageindex.filesystem import cli + + workspace = tmp_path / "workspace" + + def fake_run_pifs_agent(filesystem, question, **kwargs): + print("streamed answer") + return "returned answer" + + monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem) + monkeypatch.setattr(cli, "run_pifs_agent", fake_run_pifs_agent) + + status = cli.main( + [ + "ask", + "--workspace", + str(workspace), + "--stream-mode", + "all", + "What", + "is", + "inside?", + ] + ) + + assert status == 0 + assert capsys.readouterr().out == "streamed answer\n" + + +def test_cli_chat_stream_mode_can_be_overridden(monkeypatch, tmp_path): + from pageindex.filesystem import cli + + workspace = tmp_path / "workspace" + inputs = iter(["Summarize the workspace", "exit"]) + session_kwargs = [] + + 
class FakeSession: + def __init__(self, filesystem, **kwargs): + session_kwargs.append(kwargs) + + def run(self, question): + return f"answer:{question}" + + monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem) + monkeypatch.setattr(cli, "PIFSAgentSession", FakeSession) + monkeypatch.setattr("builtins.input", lambda prompt="": next(inputs)) + + status = cli.main( + [ + "chat", + "--workspace", + str(workspace), + "--stream-mode", + "tools", + ] + ) + + assert status == 0 + assert session_kwargs[0]["stream_mode"] == "tools" + + +def test_cli_chat_reuses_one_agent_session_for_multiple_questions(monkeypatch, capsys, tmp_path): + from pageindex.filesystem import cli + + workspace = tmp_path / "workspace" + inputs = iter(["first", "second", "exit"]) + sessions = [] + + class FakeSession: + def __init__(self, filesystem, **kwargs): + self.questions = [] + sessions.append(self) + + def run(self, question): + self.questions.append(question) + return f"answer:{question}" + + monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem) + monkeypatch.setattr(cli, "PIFSAgentSession", FakeSession) + monkeypatch.setattr("builtins.input", lambda prompt="": next(inputs)) + + status = cli.main(["chat", "--workspace", str(workspace), "--stream-mode", "off"]) + + assert status == 0 + assert len(sessions) == 1 + assert sessions[0].questions == ["first", "second"] + assert capsys.readouterr().out == "answer:first\nanswer:second\n" diff --git a/tests/test_pifs_find_maxdepth.py b/tests/test_pifs_find_maxdepth.py new file mode 100644 index 000000000..c1afe9145 --- /dev/null +++ b/tests/test_pifs_find_maxdepth.py @@ -0,0 +1,399 @@ +import json +from pathlib import Path + +import pytest + + +def _register_find_fixture(tmp_path: Path): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + + source_dir = tmp_path / "source" + source_dir.mkdir() + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + 
filesystem.metadata.register_schema({"fields": {"department": "string"}}) + + def add_file( + filename: str, + *, + folder_path: str, + external_id: str, + title: str, + domain: str, + ) -> None: + source = source_dir / filename + source.write_text(f"{title} fixture text", encoding="utf-8") + filesystem.register_file( + storage_uri=source.as_uri(), + source_path=f"docs/{filename}", + folder_path=folder_path, + external_id=external_id, + title=title, + content=source.read_text(encoding="utf-8"), + metadata={"department": domain}, + ) + + add_file( + "root.txt", + folder_path="/documents", + external_id="doc_root", + title="Root document", + domain="ops", + ) + add_file( + "child.txt", + folder_path="/documents/team", + external_id="doc_child", + title="Child document", + domain="ops", + ) + add_file( + "deep.txt", + folder_path="/documents/team/deep", + external_id="doc_deep", + title="Deep document", + domain="ops", + ) + add_file( + "other.txt", + folder_path="/documents/team", + external_id="doc_other", + title="Other document", + domain="finance", + ) + return PIFSCommandExecutor(filesystem, json_output=True) + + +def _data(output: str): + return json.loads(output)["data"] + + +def test_find_maxdepth_one_returns_direct_files_only(tmp_path): + executor = _register_find_fixture(tmp_path) + + rows = _data(executor.execute("find /documents -maxdepth 1 -type f")) + + assert [row["external_id"] for row in rows] == ["doc_root"] + + +def test_find_output_is_path_first_without_session_refs(tmp_path): + executor = _register_find_fixture(tmp_path) + executor.json_output = False + + output = executor.execute("find /documents -maxdepth 1 -type f") + + assert output.startswith("/documents/Root document id=doc_root file_ref=file_") + assert "ref_1" not in output + assert "title=Root document" in output + + +def test_stable_path_targets_work_without_session_refs(tmp_path): + executor = _register_find_fixture(tmp_path) + executor.json_output = False + + stat = 
executor.execute("stat '/documents/Root document'") + text = executor.execute("cat '/documents/Root document' --all") + + assert "target: /documents/Root document" in stat + assert "document_id: doc_root" in stat + assert "Root document fixture text" in text + + +def test_shell_limits_reject_context_expanding_counts(tmp_path): + from pageindex.filesystem.commands import PIFSCommandError + + executor = _register_find_fixture(tmp_path) + + for command, limit in ( + ("find /documents --limit 51", 50), + ("grep --limit 21 Root /documents", 20), + ("ls /documents --limit 101", 100), + ("tree /documents --limit 201", 200), + ("head -n 101 /documents/Root\\ document", 100), + ("tail -n 101 /documents/Root\\ document", 100), + ("sed -n 1,101p /documents/Root\\ document", 100), + ): + with pytest.raises(PIFSCommandError, match=f"at most {limit}"): + executor.execute(command) + + +def test_grep_rejects_regex_alternation_patterns(tmp_path): + from pageindex.filesystem.commands import PIFSCommandError + + executor = _register_find_fixture(tmp_path) + executor.json_output = False + + with pytest.raises(PIFSCommandError, match="does not support regex alternation"): + executor.execute('grep -R "Root|Child" /documents') + + with pytest.raises(PIFSCommandError, match="multiple grep commands"): + executor.execute('find /documents -type f | grep "Root|Child"') + + +def test_stat_shell_output_includes_unified_metadata_status(tmp_path): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.metadata_generation import MetadataGenerationResult + + source = tmp_path / "source.txt" + source.write_text("fixture text", encoding="utf-8") + + class SummaryGenerator: + def generate(self, document, *, fields): + return MetadataGenerationResult( + values={field: "Generated summary for retrieval." 
for field in fields} + ) + + filesystem = PageIndexFileSystem( + workspace=tmp_path / "workspace", + metadata_generator=SummaryGenerator(), + ) + filesystem.register_file( + storage_uri=source.as_uri(), + source_path="docs/source.txt", + folder_path="/documents", + external_id="doc_generated", + title="Generated metadata document", + content=source.read_text(encoding="utf-8"), + metadata={"department": "ops"}, + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + } + }, + ) + executor = PIFSCommandExecutor(filesystem, json_output=False) + + stat = executor.execute("stat /documents/'Generated metadata document'") + + assert "metadata:" in stat + assert " department: ops" in stat + assert " summary: Generated summary for retrieval." in stat + assert "metadata_status: generated" in stat + + +def test_stat_field_reads_one_metadata_field_across_multiple_targets(tmp_path): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.commands import PIFSCommandError + from pageindex.filesystem.metadata_generation import MetadataGenerationResult + + class SummaryGenerator: + def generate(self, document, *, fields): + return MetadataGenerationResult( + values={ + field: ( + f"Summary for {document.title}\n" + + "full summary token " * 80 + ) + for field in fields + } + ) + + filesystem = PageIndexFileSystem( + workspace=tmp_path / "workspace", + metadata_generator=SummaryGenerator(), + ) + for index in range(1, 3): + source = tmp_path / f"source{index}.txt" + source.write_text(f"fixture text {index}", encoding="utf-8") + filesystem.register_file( + storage_uri=source.as_uri(), + source_path=f"docs/source{index}.txt", + folder_path="/documents", + external_id=f"doc_summary_{index}", + title=f"Summary document {index}", + content=source.read_text(encoding="utf-8"), + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + 
} + }, + ) + executor = PIFSCommandExecutor(filesystem, json_output=False) + + output = executor.execute( + "stat --field summary /documents/'Summary document 1' /documents/'Summary document 2'" + ) + + assert "/documents/Summary document 1:" in output + assert "summary: Summary for Summary document 1" in output + assert "full summary token" in output + assert "[truncated]" not in output + assert "/documents/Summary document 2:" in output + assert "summary: Summary for Summary document 2" in output + + data = json.loads( + PIFSCommandExecutor(filesystem, json_output=True).execute( + "stat --field summary /documents/'Summary document 1' /documents/'Summary document 2'" + ) + )["data"] + assert data["mode"] == "field_values" + assert data["target_count"] == 2 + assert data["data"][0]["field"] == "summary" + assert data["data"][0]["value"].startswith("Summary for Summary document 1\n") + assert data["data"][0]["value"].count("full summary token") == 80 + + with pytest.raises(PIFSCommandError, match="Unknown metadata field"): + executor.execute("stat --field missing_field /documents/'Summary document 1'") + + +def test_stat_field_rejects_more_than_twenty_targets(tmp_path): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.commands import PIFSCommandError + + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + targets = [] + for index in range(21): + source = tmp_path / f"source{index}.txt" + source.write_text(f"fixture text {index}", encoding="utf-8") + filesystem.register_file( + storage_uri=source.as_uri(), + source_path=f"docs/source{index}.txt", + folder_path="/documents", + external_id=f"doc_{index}", + title=f"Document {index}", + content=source.read_text(encoding="utf-8"), + metadata={"department": "ops"}, + ) + targets.append(f"/documents/'Document {index}'") + executor = PIFSCommandExecutor(filesystem, json_output=False) + + with pytest.raises(PIFSCommandError, match="at most 20"): + 
executor.execute("stat --field department " + " ".join(targets)) + + +def test_register_rejects_pifs_owned_metadata_fields(tmp_path): + from pageindex.filesystem import PageIndexFileSystem + + source = tmp_path / "source.txt" + source.write_text("fixture text", encoding="utf-8") + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + + with pytest.raises(ValueError, match="PIFS-owned generated field"): + filesystem.register_file( + storage_uri=source.as_uri(), + source_path="docs/source.txt", + folder_path="/documents", + external_id="doc_conflict", + title="Conflict document", + content=source.read_text(encoding="utf-8"), + metadata={"summary": "caller summary"}, + ) + + +def test_batch_metadata_status_generates_into_unified_metadata(tmp_path): + from pageindex.filesystem import PageIndexFileSystem + from pageindex.filesystem.metadata_generation import MetadataGenerationResult + + source = tmp_path / "source.txt" + source.write_text("fixture text", encoding="utf-8") + + class SummaryGenerator: + def generate(self, document, *, fields): + return MetadataGenerationResult(values={"summary": "Batch generated summary."}) + + filesystem = PageIndexFileSystem( + workspace=tmp_path / "workspace", + metadata_generator=SummaryGenerator(), + ) + file_ref = filesystem.register_file( + storage_uri=source.as_uri(), + source_path="docs/source.txt", + folder_path="/documents", + external_id="doc_batch", + title="Batch document", + content=source.read_text(encoding="utf-8"), + metadata={"department": "ops"}, + metadata_policy={ + "batch": True, + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + }, + }, + ) + + before = filesystem.store.get_file(file_ref) + assert "summary" not in before.metadata + assert before.metadata_status["fields"]["summary"]["status"] == "pending_submit" + + result = filesystem.batch_generate() + after = filesystem.store.get_file(file_ref) + + assert result["generated"] == 1 + assert 
after.metadata["summary"] == "Batch generated summary." + assert after.metadata["department"] == "ops" + assert after.metadata_status["fields"]["summary"]["status"] == "generated" + + +def test_find_maxdepth_zero_type_directory_returns_start_folder(tmp_path): + executor = _register_find_fixture(tmp_path) + + rows = _data(executor.execute("find /documents -maxdepth 0 -type d")) + + assert [row["path"] for row in rows] == ["/documents"] + + +def test_find_directory_output_renders_root_without_double_slash(tmp_path): + executor = _register_find_fixture(tmp_path) + executor.json_output = False + + output = executor.execute("find / -maxdepth 1 -type d") + + assert output.splitlines()[0] == "/ folders=1 files=0" + assert "//" not in output + assert "/documents/ folders=1 files=1" in output + + +def test_find_maxdepth_combines_with_where_and_limit(tmp_path): + executor = _register_find_fixture(tmp_path) + + rows = _data( + executor.execute( + """find /documents -maxdepth 2 -type f --where '{"department":"ops"}' --limit 1""" + ) + ) + + assert len(rows) == 1 + assert rows[0]["metadata"]["department"] == "ops" + assert rows[0]["folder_path"] in {"/documents", "/documents/team"} + + +def test_find_maxdepth_rejects_invalid_values_and_unsupported_options(tmp_path): + from pageindex.filesystem.commands import PIFSCommandError + + executor = _register_find_fixture(tmp_path) + + with pytest.raises(PIFSCommandError, match="find -maxdepth requires an integer >= 0"): + executor.execute("find /documents -maxdepth nope -type f") + with pytest.raises(PIFSCommandError, match="find -maxdepth requires an integer >= 0"): + executor.execute("find /documents -maxdepth -1 -type f") + with pytest.raises(PIFSCommandError, match="Unsupported find option: -exec"): + executor.execute("find /documents -maxdepth 1 -type f -exec") + + +def test_find_maxdepth_is_advertised_to_agents(tmp_path): + executor = _register_find_fixture(tmp_path) + + assert "-maxdepth N -type f|d" in 
executor.describe_available_command_surfaces() + assert executor.command_capabilities()["retrieval"]["lexical"]["find_maxdepth"] is True + + +def test_where_path_error_points_to_folder_scope(tmp_path): + from pageindex.filesystem.commands import PIFSCommandError + + executor = _register_find_fixture(tmp_path) + + with pytest.raises(PIFSCommandError) as exc_info: + executor.execute("""find --where '{"path":"/documents"}'""") + + message = str(exc_info.value) + assert "Folder paths are positional PIFS paths" in message + assert "find /documents -type f" in message + assert "stat --schema" in message diff --git a/tests/test_pifs_like_escape.py b/tests/test_pifs_like_escape.py new file mode 100644 index 000000000..82e7ef9dd --- /dev/null +++ b/tests/test_pifs_like_escape.py @@ -0,0 +1,115 @@ +from pathlib import Path + + +def _register_file( + filesystem, + tmp_path: Path, + filename: str, + *, + folder_path: str, + external_id: str, + metadata: dict[str, str] | None = None, +) -> None: + source = tmp_path / filename + source.write_text(f"{external_id} fixture text", encoding="utf-8") + filesystem.register_file( + storage_uri=source.as_uri(), + source_path=f"docs/{filename}", + folder_path=folder_path, + external_id=external_id, + title=external_id, + content=source.read_text(encoding="utf-8"), + metadata=metadata or {}, + ) + + +def test_descendant_folder_filter_treats_underscore_literally(tmp_path): + from pageindex.filesystem import PageIndexFileSystem + + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + _register_file( + filesystem, + tmp_path, + "literal.txt", + folder_path="/proj_1/docs", + external_id="literal_underscore", + ) + _register_file( + filesystem, + tmp_path, + "wildcard.txt", + folder_path="/projA1/docs", + external_id="wildcard_neighbor", + ) + + recursive = filesystem.browse("/proj_1", recursive=True, limit=10) + folder_id = filesystem.folder_info("/proj_1")["folder_id"] + scoped_results = filesystem.search( + 
scope={"folder_id": folder_id, "recursive": True}, + semantic=False, + limit=10, + ) + ranked_folders = { + folder["path"]: folder + for folder in filesystem.find_folders("/", max_depth=1, limit=10) + } + + assert {folder["path"] for folder in recursive["folders"]} == {"/proj_1/docs"} + assert {file["external_id"] for file in recursive["files"]} == {"literal_underscore"} + assert {result.external_id for result in scoped_results} == {"literal_underscore"} + assert ranked_folders["/proj_1"]["matched_files"] == 1 + assert ranked_folders["/projA1"]["matched_files"] == 1 + assert filesystem.store.count_files_in_folder("/proj_1", recursive=True) == 1 + + +def test_metadata_contains_treats_percent_and_underscore_literally(tmp_path): + from pageindex.filesystem import PageIndexFileSystem + + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + filesystem.metadata.register_schema({"fields": {"status": "string"}}) + _register_file( + filesystem, + tmp_path, + "percent.txt", + folder_path="/documents", + external_id="literal_percent", + metadata={"status": "100% done"}, + ) + _register_file( + filesystem, + tmp_path, + "percent-neighbor.txt", + folder_path="/documents", + external_id="percent_neighbor", + metadata={"status": "100X done"}, + ) + _register_file( + filesystem, + tmp_path, + "underscore.txt", + folder_path="/documents", + external_id="literal_underscore", + metadata={"status": "build_alpha"}, + ) + _register_file( + filesystem, + tmp_path, + "underscore-neighbor.txt", + folder_path="/documents", + external_id="underscore_neighbor", + metadata={"status": "buildXalpha"}, + ) + + percent_results = filesystem.search( + metadata_filter={"status": {"$contains": "100% done"}}, + semantic=False, + limit=10, + ) + underscore_results = filesystem.search( + metadata_filter={"status": {"$contains": "build_alpha"}}, + semantic=False, + limit=10, + ) + + assert {result.external_id for result in percent_results} == {"literal_percent"} + assert 
{result.external_id for result in underscore_results} == {"literal_underscore"} diff --git a/tests/test_pifs_path_resolution.py b/tests/test_pifs_path_resolution.py new file mode 100644 index 000000000..184fc53da --- /dev/null +++ b/tests/test_pifs_path_resolution.py @@ -0,0 +1,71 @@ +import pytest + + +def test_root_virtual_file_path_resolves_without_double_slash(tmp_path): + from pageindex.filesystem import PageIndexFileSystem + + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + file_ref = filesystem.register_file( + storage_uri="file:///tmp/root-source.txt", + source_path="sources/root-source.txt", + folder_path="/", + external_id="doc_root_title", + title="Root Title", + content="root content", + ) + + assert filesystem.store.resolve_file_ref("/Root Title") == file_ref + + +def test_ambiguous_virtual_file_path_raises_clear_error(tmp_path): + from pageindex.filesystem import PageIndexFileSystem + + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + first_ref = filesystem.register_file( + storage_uri="file:///tmp/first.txt", + source_path="b/file.txt", + folder_path="/a", + external_id="doc_first", + title="First", + content="first content", + ) + second_ref = filesystem.register_file( + storage_uri="file:///tmp/second.txt", + source_path="second-source.txt", + folder_path="/a/b", + external_id="doc_second", + title="file.txt", + content="second content", + ) + + with pytest.raises(KeyError, match="Ambiguous file target"): + filesystem.store.resolve_file_ref("/a/b/file.txt") + + assert first_ref != second_ref + + +def test_duplicate_source_path_target_raises_clear_error(tmp_path): + from pageindex.filesystem import PageIndexFileSystem + + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + first_ref = filesystem.register_file( + storage_uri="file:///tmp/first.txt", + source_path="shared/source.txt", + folder_path="/first", + external_id="doc_first", + title="First", + content="first content", + ) + second_ref 
= filesystem.register_file( + storage_uri="file:///tmp/second.txt", + source_path="shared/source.txt", + folder_path="/second", + external_id="doc_second", + title="Second", + content="second content", + ) + + with pytest.raises(KeyError, match="Ambiguous file target"): + filesystem.store.resolve_file_ref("/shared/source.txt") + + assert first_ref != second_ref diff --git a/tests/test_pifs_register_side_effects.py b/tests/test_pifs_register_side_effects.py new file mode 100644 index 000000000..867dd6bf4 --- /dev/null +++ b/tests/test_pifs_register_side_effects.py @@ -0,0 +1,60 @@ +from pathlib import Path + +import pytest + + +class SummaryGenerator: + def generate(self, document, *, fields): + return {field: "Generated registration summary." for field in fields} + + +class RecordingSummaryIndexer: + def __init__(self): + self.upserted = [] + + def upsert_summary(self, record): + self.upserted.append(dict(record)) + return {"status": "ready"} + + +def test_register_insert_failure_cleans_owned_artifacts_and_skips_projection( + tmp_path: Path, monkeypatch +): + from pageindex.filesystem import PageIndexFileSystem + + workspace = tmp_path / "workspace" + source = tmp_path / "source.txt" + source.write_text("Plain text content for registration.", encoding="utf-8") + indexer = RecordingSummaryIndexer() + filesystem = PageIndexFileSystem( + workspace=workspace, + metadata_generator=SummaryGenerator(), + summary_projection_indexer=indexer, + ) + + def fail_insert(records): + raise RuntimeError("catalog insert failed") + + monkeypatch.setattr(filesystem.store, "insert_files", fail_insert) + + with pytest.raises(RuntimeError, match="catalog insert failed"): + filesystem.register_file( + storage_uri=source.as_uri(), + source_path="docs/source.txt", + folder_path="/documents", + external_id="doc_insert_failure", + title="Insert failure", + content=source.read_text(encoding="utf-8"), + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + 
"topic": False, + } + }, + ) + + assert indexer.upserted == [] + assert list((workspace / "artifacts" / "raw").glob("*.json")) == [] + assert list((workspace / "artifacts" / "text").glob("*.txt")) == [] diff --git a/tests/test_semantic_index.py b/tests/test_semantic_index.py new file mode 100644 index 000000000..324ead76f --- /dev/null +++ b/tests/test_semantic_index.py @@ -0,0 +1,145 @@ +import sys +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from pageindex.filesystem.semantic_index import ( + SemanticIndexRecord, + SQLiteVecSemanticIndex, +) + + +def test_sqlite_vec_semantic_index_round_trip(tmp_path): + index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite") + index.reset(dimension=3, metadata={"field_mode": "summary"}) + + index.upsert_many( + [ + SemanticIndexRecord( + file_ref="file_a", + external_id="doc_a", + source_type="github", + source_path="github/a.json", + title="Multipart upload limits", + text="multipart upload limits", + vector=[1.0, 0.0, 0.0], + metadata={"topic": "uploads"}, + ), + SemanticIndexRecord( + file_ref="file_b", + external_id="doc_b", + source_type="slack", + source_path="slack/b.json", + title="GPU cache issue", + text="gpu cache issue", + vector=[0.0, 1.0, 0.0], + metadata={"topic": "runtime"}, + ), + ] + ) + + assert index.info()["document_count"] == 2 + + results = index.search([0.9, 0.1, 0.0], limit=2) + assert [item.external_id for item in results] == ["doc_a", "doc_b"] + + filtered = index.search( + [0.9, 0.1, 0.0], + limit=2, + filters={"source_type": "slack"}, + ) + assert [item.external_id for item in filtered] == ["doc_b"] + + +def test_summary_projection_indexes_unified_metadata_summary(tmp_path): + from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + + class FakeEmbedder: + def embed(self, texts): + return [[1.0, 0.0, 0.0] for _ in texts] + + indexer = 
SummaryProjectionIndexer( + tmp_path / "projection", + embedder=FakeEmbedder(), + embedding_provider="test", + embedding_model="fake", + embedding_dimensions=3, + ) + + result = indexer.upsert_summary( + { + "file_ref": "file_a", + "external_id": "doc_a", + "source_type": "documents", + "source_path": "docs/a.pdf", + "title": "A", + "metadata": { + "summary": "Unified metadata summary.", + "department": "ops", + }, + } + ) + + assert result["status"] == "ready" + hits = indexer.index.search([1.0, 0.0, 0.0], limit=1) + assert hits[0].external_id == "doc_a" + assert hits[0].metadata["summary"] == "Unified metadata summary." + assert hits[0].metadata["department"] == "ops" + + +def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path): + from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + + class FakeEmbedder: + def embed(self, texts): + return [[1.0, 0.0, 0.0, 0.0] for _ in texts] + + index_dir = tmp_path / "projection" + index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") + index.reset( + dimension=3, + metadata={ + "channel": "summary", + "embedding_provider": "test", + "embedding_model": "fake", + "embedding_dimensions": 3, + }, + ) + index.upsert_many( + [ + SemanticIndexRecord( + file_ref="file_a", + external_id="doc_a", + source_type="documents", + source_path="docs/a.pdf", + title="A", + text="summary", + vector=[1.0, 0.0, 0.0], + ) + ] + ) + + with pytest.raises(RuntimeError, match="summary projection index dimension mismatch"): + SummaryProjectionIndexer( + index_dir, + embedder=FakeEmbedder(), + embedding_provider="test", + embedding_model="fake", + embedding_dimensions=4, + ) + + preserved = SQLiteVecSemanticIndex(index.db_path) + assert preserved.info()["dimension"] == 3 + assert preserved.info()["document_count"] == 1 + assert preserved.search([1.0, 0.0, 0.0], limit=1)[0].external_id == "doc_a" + + +def test_hash_embedding_provider_is_not_available(): + from 
pageindex.filesystem.hybrid_projection import make_embedder + + with pytest.raises(ValueError, match="unknown embedding provider: hash"): + make_embedder("hash", "unused", dimensions=256, timeout=1)