From 8dbb8fc58a54f950cf8a281ddf6ef5ff65f55acd Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 30 Apr 2026 17:26:20 +0200 Subject: [PATCH 01/14] Add CLI and agents file --- README.md | 73 +++++++++++++++ pyproject.toml | 3 +- src/semble/agents/semble-search.md | 29 ++++++ src/semble/mcp.py | 75 +++++++++++++++ tests/test_mcp.py | 146 ++++++++++++++++++++++++++++- 5 files changed, 324 insertions(+), 2 deletions(-) create mode 100644 src/semble/agents/semble-search.md diff --git a/README.md b/README.md index ff7f95a..454f8e8 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,29 @@ result.chunk.content # "def save_pretrained(self, path: PathLike, ..." - **MCP server**: drop-in tool for Claude Code, Cursor, Codex, OpenCode, and any other MCP-compatible agent. - **Local and remote**: pass a local path or a git URL. +## CLI + +Semble also ships as a standalone CLI for use outside of MCP — useful in scripts, sub-agents, or anywhere you want search results without an MCP session. + +```bash +# Search a local repo +semble search "authentication flow" ./my-project + +# Search a remote repo (cloned on demand) +semble search "save model to disk" https://github.com/MinishLab/model2vec + +# Find code similar to a known location (file_path and line from a prior search result) +semble find-related src/auth.py 42 ./my-project + +# Options +semble search "query" ./my-project --top-k 10 --mode bm25 +semble find-related src/auth.py 42 ./my-project --top-k 10 +``` + +`path` defaults to the current directory when omitted. + +If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. + ## MCP Server Semble can run as an MCP server so agents can search any codebase directly. Repos are cloned and indexed on demand, and indexes are cached for the lifetime of the session. @@ -119,6 +142,56 @@ Add to `~/.cursor/mcp.json` (or `.cursor/mcp.json` in your project): | `search` | Search a codebase with a natural-language or code query. Pass `repo` as a git URL or local path. | | `find_related` | Given a file path and line number, return chunks semantically similar to the code at that location. | +### Sub-agent support (Claude Code and Codex) + +Both Claude Code and Codex CLI lazy-load MCP tool schemas. Sub-agents spawned in either tool cannot call `mcp__semble__search` directly — the schema isn't available in their context until an explicit discovery step. The fix is to let sub-agents invoke semble through the CLI via Bash instead. + +Run this once in your project root to create the agent definition: + +```bash +semble init +# or, if semble is not on $PATH: +uvx --from "semble[mcp]" semble init +``` + +This writes `.claude/agents/semble-search.md` into your project. Commit it alongside your code so the whole team gets sub-agent search automatically. + +The file looks like this (you can also create it manually): + +```markdown +--- +name: semble-search +description: Code search agent for exploring any codebase. Use for finding code by intent, locating implementations, understanding how something works, or discovering related code. Prefer over Grep/Glob/Read for any semantic or exploratory question. +tools: Bash, Read +--- + +Use `semble search` to find code by describing what it does: + +​```bash +semble search "authentication flow" /path/to/repo +semble search "save model to disk" /path/to/repo --top-k 10 +​``` + +Use `semble find-related` to find code similar to a known location (pass `file_path` and `line` from a prior search result): + +​```bash +semble find-related src/auth.py 42 /path/to/repo +​``` + +Both commands default `path` to the current directory if omitted. Git URLs are also accepted as `path`. + +If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in place of `semble`. + +## Workflow + +1. Start with `semble search` to find relevant chunks. +2. Use `Read` to inspect a full file when the chunk alone isn't enough context. +3. Use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +4. Fall back to Bash `grep` only for exact string matches (variable names, import statements). +``` + +This agent definition uses `tools: Bash, Read` — no MCP — so it works in all sub-agent contexts without schema loading. The parent agent continues to use the MCP tool (with its cached index) for maximum performance. + ## How it works Semble splits each file into code-aware chunks using [Chonkie](https://github.com/chonkie-inc/chonkie), then scores every query against the chunks with two complementary retrievers: static [Model2Vec](https://github.com/MinishLab/model2vec) embeddings using the code-specialized [potion-code-16M](https://huggingface.co/minishlab/potion-code-16M) model for semantic similarity, and [BM25](https://github.com/xhluca/bm25s) for lexical matches on identifiers and API names. The two score lists are fused with Reciprocal Rank Fusion (RRF). diff --git a/pyproject.toml b/pyproject.toml index fd036be..e122037 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,7 +72,7 @@ where = ["src"] include = ["semble*"] [tool.setuptools.package-data] -semble = ["py.typed"] +semble = ["py.typed", "agents/*.md"] [tool.setuptools_scm] # can be empty if no extra settings are needed, presence enables setuptools_scm @@ -88,6 +88,7 @@ target-version = "py310" [tool.ruff.lint.per-file-ignores] "tests/**" = ["ANN"] "benchmarks/*.py" = ["T20"] +"src/semble/mcp.py" = ["T20", "E501"] [tool.ruff.lint] select = [ diff --git a/src/semble/agents/semble-search.md b/src/semble/agents/semble-search.md new file mode 100644 index 0000000..9934d85 --- /dev/null +++ b/src/semble/agents/semble-search.md @@ -0,0 +1,29 @@ +--- +name: semble-search +description: Code search agent for exploring any codebase. Use for finding code by intent, locating implementations, understanding how something works, or discovering related code. Prefer over Grep/Glob/Read for any semantic or exploratory question. +tools: Bash, Read +--- + +Use `semble search` to find code by describing what it does: + +```bash +semble search "authentication flow" /path/to/repo +semble search "save model to disk" /path/to/repo --top-k 10 +``` + +Use `semble find-related` to find code similar to a known location (pass `file_path` and `line` from a prior search result): + +```bash +semble find-related src/auth.py 42 /path/to/repo +``` + +Both commands default `path` to the current directory if omitted. Git URLs are also accepted as `path`. + +If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in place of `semble`. + +## Workflow + +1. Start with `semble search` to find relevant chunks. +2. Use `Read` to inspect a full file when the chunk alone isn't enough context. +3. Use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +4. Fall back to Bash `grep` only for exact string matches (variable names, import statements). diff --git a/src/semble/mcp.py b/src/semble/mcp.py index cc7ec48..24104b8 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -2,6 +2,7 @@ import asyncio import re +import sys from pathlib import Path from typing import Annotated, Literal @@ -192,8 +193,18 @@ def _format_results(header: str, results: list[SearchResult]) -> str: return "\n".join(lines) +_AGENT_FILE_PATH = Path(".claude") / "agents" / "semble-search.md" + + def main() -> None: """Entry point for the semble command-line tool.""" + if len(sys.argv) > 1 and sys.argv[1] in ("search", "find-related", "init"): + _cli_main() + else: + _mcp_main() + + +def _mcp_main() -> None: import argparse parser = argparse.ArgumentParser( @@ -209,3 +220,67 @@ def main() -> None: parser.add_argument("--ref", default=None, help="Branch or tag to check out (git URLs only).") args = parser.parse_args() asyncio.run(serve(args.path, ref=args.ref)) + + +def _run_init(*, force: bool = False) -> None: + """Write the sub-agent file into the current project.""" + from importlib.resources import files + + dest = _AGENT_FILE_PATH + if dest.exists() and not force: + print(f"{dest} already exists. Run with --force to overwrite.", file=sys.stderr) + sys.exit(1) + dest.parent.mkdir(parents=True, exist_ok=True) + content = files("semble").joinpath("agents/semble-search.md").read_text(encoding="utf-8") + dest.write_text(content, encoding="utf-8") + print(f"Created {dest}") + + +def _cli_main() -> None: + import argparse + + parser = argparse.ArgumentParser(prog="semble") + sub = parser.add_subparsers(dest="command") + + search_p = sub.add_parser("search", help="Search a codebase.") + search_p.add_argument("query", help="Natural language or code query.") + search_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") + search_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") + search_p.add_argument( + "-m", "--mode", default="hybrid", choices=["hybrid", "semantic", "bm25"], help="Search mode (default: hybrid)." + ) + + related_p = sub.add_parser("find-related", help="Find code similar to a specific location.") + related_p.add_argument("file_path", help="File path as shown in search results.") + related_p.add_argument("line", type=int, help="Line number (1-indexed).") + related_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") + related_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") + + init_p = sub.add_parser("init", help="Write .claude/agents/semble-search.md for sub-agent support.") + init_p.add_argument("--force", action="store_true", help="Overwrite if the file already exists.") + + args = parser.parse_args() + + if args.command == "init": + _run_init(force=args.force) + return + + index = SembleIndex.from_git(args.path) if _is_git_url(args.path) else SembleIndex.from_path(args.path) + + if args.command == "search": + results = index.search(args.query, top_k=args.top_k, mode=args.mode) + if not results: + print("No results found.") + else: + print(_format_results(f"Search results for: {args.query!r} (mode={args.mode})", results)) + + elif args.command == "find-related": + chunk = _resolve_chunk(index.chunks, args.file_path, args.line) + if chunk is None: + print(f"No chunk found at {args.file_path}:{args.line}.", file=sys.stderr) + sys.exit(1) + results = index.find_related(chunk, top_k=args.top_k) + if not results: + print(f"No related chunks found for {args.file_path}:{args.line}.") + else: + print(_format_results(f"Chunks related to {args.file_path}:{args.line}", results)) diff --git a/tests/test_mcp.py b/tests/test_mcp.py index a5651d6..7ecaace 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -1,14 +1,28 @@ import sys +from importlib.resources import files from pathlib import Path from typing import Any from unittest.mock import AsyncMock, MagicMock, patch import pytest -from semble.mcp import _format_results, _IndexCache, _is_git_url, _resolve_chunk, create_server, main, serve +from semble.mcp import ( + _AGENT_FILE_PATH, + _cli_main, + _format_results, + _IndexCache, + _is_git_url, + _resolve_chunk, + _run_init, + create_server, + main, + serve, +) from semble.types import Chunk, Encoder, SearchMode, SearchResult from tests.conftest import make_chunk +_AGENT_FILE = files("semble").joinpath("agents/semble-search.md").read_text(encoding="utf-8") + def _tool_text(result: Any) -> str: """Extract the text string from a FastMCP call_tool result.""" @@ -269,3 +283,133 @@ def test_main_calls_asyncio_run(argv: list[str], monkeypatch: pytest.MonkeyPatch mock_run.side_effect = lambda coro: coro.close() main() mock_run.assert_called_once() + + +@pytest.mark.parametrize( + "argv, expected_in_output", + [ + (["semble", "search", "query text", "/some/path"], ["query text", "0.9"]), + (["semble", "search", "nothing", "/some/path", "--top-k", "3", "--mode", "bm25"], ["No results found"]), + ], +) +def test_cli_search( + argv: list[str], + expected_in_output: list[str], + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """_cli_main search subcommand calls index.search and prints results.""" + chunk = make_chunk("def foo(): pass", "src/foo.py") + fake_index = MagicMock() + has_results = "No results" not in expected_in_output[0] + fake_index.search.return_value = ( + [SearchResult(chunk=chunk, score=0.9, source=SearchMode.HYBRID)] if has_results else [] + ) + monkeypatch.setattr(sys, "argv", argv) + with patch("semble.mcp.SembleIndex.from_path", return_value=fake_index): + _cli_main() + out = capsys.readouterr().out + for fragment in expected_in_output: + assert fragment in out + + +def test_cli_find_related( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """_cli_main find-related subcommand calls index.find_related and prints results.""" + chunk = make_chunk("class Bar: pass", "src/bar.py") + fake_index = MagicMock() + fake_index.chunks = [chunk] + fake_index.find_related.return_value = [SearchResult(chunk=chunk, score=0.8, source=SearchMode.SEMANTIC)] + monkeypatch.setattr(sys, "argv", ["semble", "find-related", "src/bar.py", "1", "/some/path"]) + with patch("semble.mcp.SembleIndex.from_path", return_value=fake_index): + _cli_main() + out = capsys.readouterr().out + assert "src/bar.py" in out + assert "0.800" in out + + +def test_cli_find_related_no_results( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """_cli_main find-related prints a message when the index returns no related chunks.""" + chunk = make_chunk("class Bar: pass", "src/bar.py") + fake_index = MagicMock() + fake_index.chunks = [chunk] + fake_index.find_related.return_value = [] + monkeypatch.setattr(sys, "argv", ["semble", "find-related", "src/bar.py", "1", "/some/path"]) + with patch("semble.mcp.SembleIndex.from_path", return_value=fake_index): + _cli_main() + assert "No related chunks found" in capsys.readouterr().out + + +def test_cli_find_related_unknown_chunk( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """_cli_main find-related exits with code 1 when chunk is not found.""" + fake_index = MagicMock() + fake_index.chunks = [] + monkeypatch.setattr(sys, "argv", ["semble", "find-related", "unknown.py", "1", "/some/path"]) + with patch("semble.mcp.SembleIndex.from_path", return_value=fake_index): + with pytest.raises(SystemExit) as exc_info: + _cli_main() + assert exc_info.value.code == 1 + assert "No chunk found" in capsys.readouterr().err + + +def test_init_creates_file(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: + """_run_init writes the agent file and prints its path.""" + monkeypatch.chdir(tmp_path) + _run_init() + dest = tmp_path / _AGENT_FILE_PATH + assert dest.exists() + assert dest.read_text(encoding="utf-8") == _AGENT_FILE + assert str(_AGENT_FILE_PATH) in capsys.readouterr().out + + +def test_init_refuses_overwrite_without_force( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + """_run_init exits with code 1 when the file exists and --force is not set.""" + monkeypatch.chdir(tmp_path) + _run_init() + with pytest.raises(SystemExit) as exc_info: + _run_init() + assert exc_info.value.code == 1 + assert "already exists" in capsys.readouterr().err + + +def test_init_overwrites_with_force(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + """_run_init overwrites an existing file when force=True.""" + monkeypatch.chdir(tmp_path) + dest = tmp_path / _AGENT_FILE_PATH + dest.parent.mkdir(parents=True, exist_ok=True) + dest.write_text("old content", encoding="utf-8") + _run_init(force=True) + assert dest.read_text(encoding="utf-8") == _AGENT_FILE + + +def test_init_via_cli(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: + """Semble init subcommand creates the agent file via _cli_main.""" + monkeypatch.chdir(tmp_path) + monkeypatch.setattr(sys, "argv", ["semble", "init"]) + _cli_main() + assert (tmp_path / _AGENT_FILE_PATH).exists() + assert str(_AGENT_FILE_PATH) in capsys.readouterr().out + + +def test_main_dispatches_to_cli( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """main() routes to _cli_main when first argument is a CLI subcommand.""" + chunk = make_chunk("def foo(): pass", "src/foo.py") + fake_index = MagicMock() + fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.9, source=SearchMode.HYBRID)] + monkeypatch.setattr(sys, "argv", ["semble", "search", "query text", "/some/path"]) + with patch("semble.mcp.SembleIndex.from_path", return_value=fake_index): + main() + assert "query text" in capsys.readouterr().out From 56342fb08265eace8ba4fe55b102a14750384c2b Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 30 Apr 2026 17:30:09 +0200 Subject: [PATCH 02/14] Add CLI and agents file --- README.md | 44 +++----------------------------------------- 1 file changed, 3 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index 454f8e8..69602d7 100644 --- a/README.md +++ b/README.md @@ -142,11 +142,9 @@ Add to `~/.cursor/mcp.json` (or `.cursor/mcp.json` in your project): | `search` | Search a codebase with a natural-language or code query. Pass `repo` as a git URL or local path. | | `find_related` | Given a file path and line number, return chunks semantically similar to the code at that location. | -### Sub-agent support (Claude Code and Codex) +### Sub-agent support -Both Claude Code and Codex CLI lazy-load MCP tool schemas. Sub-agents spawned in either tool cannot call `mcp__semble__search` directly — the schema isn't available in their context until an explicit discovery step. The fix is to let sub-agents invoke semble through the CLI via Bash instead. - -Run this once in your project root to create the agent definition: +Claude Code and Codex CLI lazy-load MCP tool schemas, so sub-agents cannot call `mcp__semble__search` directly. Run this once in your project root to drop in a Bash-based agent definition that works in all sub-agent contexts: ```bash semble init @@ -154,43 +152,7 @@ semble init uvx --from "semble[mcp]" semble init ``` -This writes `.claude/agents/semble-search.md` into your project. Commit it alongside your code so the whole team gets sub-agent search automatically. - -The file looks like this (you can also create it manually): - -```markdown ---- -name: semble-search -description: Code search agent for exploring any codebase. Use for finding code by intent, locating implementations, understanding how something works, or discovering related code. Prefer over Grep/Glob/Read for any semantic or exploratory question. -tools: Bash, Read ---- - -Use `semble search` to find code by describing what it does: - -​```bash -semble search "authentication flow" /path/to/repo -semble search "save model to disk" /path/to/repo --top-k 10 -​``` - -Use `semble find-related` to find code similar to a known location (pass `file_path` and `line` from a prior search result): - -​```bash -semble find-related src/auth.py 42 /path/to/repo -​``` - -Both commands default `path` to the current directory if omitted. Git URLs are also accepted as `path`. - -If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in place of `semble`. - -## Workflow - -1. Start with `semble search` to find relevant chunks. -2. Use `Read` to inspect a full file when the chunk alone isn't enough context. -3. Use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Fall back to Bash `grep` only for exact string matches (variable names, import statements). -``` - -This agent definition uses `tools: Bash, Read` — no MCP — so it works in all sub-agent contexts without schema loading. The parent agent continues to use the MCP tool (with its cached index) for maximum performance. +This writes [`.claude/agents/semble-search.md`](src/semble/agents/semble-search.md) into your project. Commit it so the whole team gets sub-agent search automatically. ## How it works From 9e87d1deac571e9c9a8c950755d78f216ae81502 Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 30 Apr 2026 17:31:09 +0200 Subject: [PATCH 03/14] Add CLI and agents file --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 69602d7..8f13da4 100644 --- a/README.md +++ b/README.md @@ -152,7 +152,7 @@ semble init uvx --from "semble[mcp]" semble init ``` -This writes [`.claude/agents/semble-search.md`](src/semble/agents/semble-search.md) into your project. Commit it so the whole team gets sub-agent search automatically. +This writes [`.claude/agents/semble-search.md`](src/semble/agents/semble-search.md) into your project. ## How it works From c2db24d970a456a6aa93a3bd8b4e62b5c0ab5042 Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 30 Apr 2026 17:53:12 +0200 Subject: [PATCH 04/14] Add CLI and agents file --- README.md | 13 ++- pyproject.toml | 5 +- src/semble/cli.py | 96 ++++++++++++++++++++ src/semble/mcp.py | 143 +----------------------------- src/semble/utils.py | 47 ++++++++++ tests/test_cli.py | 212 ++++++++++++++++++++++++++++++++++++++++++++ tests/test_mcp.py | 164 +--------------------------------- 7 files changed, 368 insertions(+), 312 deletions(-) create mode 100644 src/semble/cli.py create mode 100644 src/semble/utils.py create mode 100644 tests/test_cli.py diff --git a/README.md b/README.md index 8f13da4..c78bde8 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ [Quickstart](#quickstart) • [Main Features](#main-features) • +[CLI](#cli) • [MCP Server](#mcp-server) • [How it works](#how-it-works) • [Benchmarks](#benchmarks) @@ -67,7 +68,7 @@ result.chunk.content # "def save_pretrained(self, path: PathLike, ..." ## CLI -Semble also ships as a standalone CLI for use outside of MCP — useful in scripts, sub-agents, or anywhere you want search results without an MCP session. +Semble also ships as a standalone CLI for use outside of MCP. This is useful in scripts, sub-agents, or anywhere you want search results without an MCP session. ```bash # Search a local repo @@ -78,10 +79,6 @@ semble search "save model to disk" https://github.com/MinishLab/model2vec # Find code similar to a known location (file_path and line from a prior search result) semble find-related src/auth.py 42 ./my-project - -# Options -semble search "query" ./my-project --top-k 10 --mode bm25 -semble find-related src/auth.py 42 ./my-project --top-k 10 ``` `path` defaults to the current directory when omitted. @@ -144,7 +141,9 @@ Add to `~/.cursor/mcp.json` (or `.cursor/mcp.json` in your project): ### Sub-agent support -Claude Code and Codex CLI lazy-load MCP tool schemas, so sub-agents cannot call `mcp__semble__search` directly. Run this once in your project root to drop in a Bash-based agent definition that works in all sub-agent contexts: +Claude Code and Codex CLI lazy-load MCP tool schemas, so sub-agents cannot call `mcp__semble__search` directly. The fix is to have sub-agents invoke semble through the [CLI](#cli) via Bash instead. + +**Claude Code** — run this once in your project root and commit the result: ```bash semble init @@ -152,7 +151,7 @@ semble init uvx --from "semble[mcp]" semble init ``` -This writes [`.claude/agents/semble-search.md`](src/semble/agents/semble-search.md) into your project. +This writes [`.claude/agents/semble-search.md`](src/semble/agents/semble-search.md) into your project. Use `--force` to overwrite an existing file. ## How it works diff --git a/pyproject.toml b/pyproject.toml index e122037..bf3540b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,7 @@ dev = [ "Source" = "https://github.com/MinishLab/semble" [project.scripts] -semble = "semble.mcp:main" +semble = "semble.cli:main" [tool.setuptools] package-dir = {"" = "src"} @@ -88,7 +88,7 @@ target-version = "py310" [tool.ruff.lint.per-file-ignores] "tests/**" = ["ANN"] "benchmarks/*.py" = ["T20"] -"src/semble/mcp.py" = ["T20", "E501"] +"src/semble/cli.py" = ["T20", "E501"] [tool.ruff.lint] select = [ @@ -132,6 +132,7 @@ ignore_missing_imports = true [tool.pytest.ini_options] testpaths = ["tests"] addopts = ["--tb=short", "--strict-markers", "--cov=semble", "--cov-report=term-missing"] +markers = ["slow: end-to-end tests that run real indexing and CLI subprocesses (deselect with -m 'not slow')"] [tool.coverage.run] relative_files = true diff --git a/src/semble/cli.py b/src/semble/cli.py new file mode 100644 index 0000000..c6f95ae --- /dev/null +++ b/src/semble/cli.py @@ -0,0 +1,96 @@ +import argparse +import asyncio +import sys +from importlib.resources import files +from pathlib import Path + +from semble.index import SembleIndex +from semble.utils import _format_results, _is_git_url, _resolve_chunk + +_CLAUDE_FILE_PATH = Path(".claude") / "agents" / "semble-search.md" + + +def main() -> None: + """Entry point for the semble command-line tool.""" + if len(sys.argv) > 1 and sys.argv[1] in ("search", "find-related", "init"): + _cli_main() + else: + _mcp_main() + + +def _mcp_main() -> None: + from semble.mcp import serve + + parser = argparse.ArgumentParser( + prog="semble", + description="Instant local code search for agents.", + ) + parser.add_argument( + "path", + nargs="?", + default=None, + help="Local directory or git URL to pre-index at startup (optional).", + ) + parser.add_argument("--ref", default=None, help="Branch or tag to check out (git URLs only).") + args = parser.parse_args() + asyncio.run(serve(args.path, ref=args.ref)) + + +def _run_init(*, force: bool = False) -> None: + """Write the Claude Code sub-agent file into the current project.""" + dest = _CLAUDE_FILE_PATH + if dest.exists() and not force: + print(f"{dest} already exists. Run with --force to overwrite.", file=sys.stderr) + sys.exit(1) + dest.parent.mkdir(parents=True, exist_ok=True) + content = files("semble").joinpath("agents/semble-search.md").read_text(encoding="utf-8") + dest.write_text(content, encoding="utf-8") + print(f"Created {dest}") + + +def _cli_main() -> None: + parser = argparse.ArgumentParser(prog="semble") + sub = parser.add_subparsers(dest="command") + + search_p = sub.add_parser("search", help="Search a codebase.") + search_p.add_argument("query", help="Natural language or code query.") + search_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") + search_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") + search_p.add_argument( + "-m", "--mode", default="hybrid", choices=["hybrid", "semantic", "bm25"], help="Search mode (default: hybrid)." + ) + + related_p = sub.add_parser("find-related", help="Find code similar to a specific location.") + related_p.add_argument("file_path", help="File path as shown in search results.") + related_p.add_argument("line", type=int, help="Line number (1-indexed).") + related_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") + related_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") + + init_p = sub.add_parser("init", help="Write .claude/agents/semble-search.md for Claude Code sub-agent support.") + init_p.add_argument("--force", action="store_true", help="Overwrite if the file already exists.") + + args = parser.parse_args() + + if args.command == "init": + _run_init(force=args.force) + return + + index = SembleIndex.from_git(args.path) if _is_git_url(args.path) else SembleIndex.from_path(args.path) + + if args.command == "search": + results = index.search(args.query, top_k=args.top_k, mode=args.mode) + if not results: + print("No results found.") + else: + print(_format_results(f"Search results for: {args.query!r} (mode={args.mode})", results)) + + elif args.command == "find-related": + chunk = _resolve_chunk(index.chunks, args.file_path, args.line) + if chunk is None: + print(f"No chunk found at {args.file_path}:{args.line}.", file=sys.stderr) + sys.exit(1) + results = index.find_related(chunk, top_k=args.top_k) + if not results: + print(f"No related chunks found for {args.file_path}:{args.line}.") + else: + print(_format_results(f"Chunks related to {args.file_path}:{args.line}", results)) diff --git a/src/semble/mcp.py b/src/semble/mcp.py index 24104b8..1b5dc10 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -1,8 +1,6 @@ from __future__ import annotations import asyncio -import re -import sys from pathlib import Path from typing import Annotated, Literal @@ -11,7 +9,8 @@ from semble.index import SembleIndex from semble.index.dense import load_model -from semble.types import Chunk, Encoder, SearchResult +from semble.types import Encoder +from semble.utils import _format_results, _is_git_url, _resolve_chunk _REPO_DESCRIPTION = ( "Git URL (e.g. https://github.com/org/repo) or local path to index and search. " @@ -19,10 +18,6 @@ "The index is cached after the first call, so repeat queries are fast." ) -_GIT_URL_SCHEMES = ("https://", "http://", "ssh://", "git://", "git+ssh://", "file://") -# scp-like syntax: [user@]host:path, where host has no '/' before the ':'. -_SCP_GIT_URL_RE = re.compile(r"^[\w.-]+@[\w.-]+:(?!/)") - def create_server(cache: _IndexCache, default_source: str | None = None) -> FastMCP: """Build and return a configured FastMCP server backed by the given cache.""" @@ -143,8 +138,6 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex: try: return await asyncio.shield(task) except asyncio.CancelledError: # pragma: no cover - # If this waiter was cancelled but the task is still running, preserve it for - # other waiters. Only evict if the task itself was cancelled. if task.done(): self._tasks.pop(cache_key, None) raise @@ -152,135 +145,3 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex: # Build failed: evict so the next caller can retry. self._tasks.pop(cache_key, None) raise - - -def _resolve_chunk(chunks: list[Chunk], file_path: str, line: int) -> Chunk | None: - """Return the chunk that contains *line* in *file_path*, or None. - - MCP tool arguments are JSON primitives (strings and ints), so the agent - passes file_path + line rather than a Chunk object. This function - reconstructs the Chunk at the MCP boundary before calling into the library. - - :param chunks: All indexed chunks to search. - :param file_path: File path as stored in the index. - :param line: 1-indexed line number to resolve. - :return: The best-matching Chunk, or None if not found. - """ - fallback = None - for chunk in chunks: - if chunk.file_path == file_path and chunk.start_line <= line <= chunk.end_line: - if line < chunk.end_line: - return chunk - if fallback is None: # line == end_line: boundary; keep as fallback for end-of-file chunks - fallback = chunk - return fallback - - -def _is_git_url(path: str) -> bool: - """Return True if path looks like a remote git URL rather than a local path.""" - return path.startswith(_GIT_URL_SCHEMES) or _SCP_GIT_URL_RE.match(path) is not None - - -def _format_results(header: str, results: list[SearchResult]) -> str: - """Render SearchResult objects as numbered, fenced code blocks.""" - lines: list[str] = [header, ""] - for i, r in enumerate(results, 1): - lines.append(f"## {i}. {r.chunk.location} [score={r.score:.3f}]") - lines.append("```") - lines.append(r.chunk.content.strip()) - lines.append("```") - lines.append("") - return "\n".join(lines) - - -_AGENT_FILE_PATH = Path(".claude") / "agents" / "semble-search.md" - - -def main() -> None: - """Entry point for the semble command-line tool.""" - if len(sys.argv) > 1 and sys.argv[1] in ("search", "find-related", "init"): - _cli_main() - else: - _mcp_main() - - -def _mcp_main() -> None: - import argparse - - parser = argparse.ArgumentParser( - prog="semble", - description="Instant local code search for agents.", - ) - parser.add_argument( - "path", - nargs="?", - default=None, - help="Local directory or git URL to pre-index at startup (optional).", - ) - parser.add_argument("--ref", default=None, help="Branch or tag to check out (git URLs only).") - args = parser.parse_args() - asyncio.run(serve(args.path, ref=args.ref)) - - -def _run_init(*, force: bool = False) -> None: - """Write the sub-agent file into the current project.""" - from importlib.resources import files - - dest = _AGENT_FILE_PATH - if dest.exists() and not force: - print(f"{dest} already exists. Run with --force to overwrite.", file=sys.stderr) - sys.exit(1) - dest.parent.mkdir(parents=True, exist_ok=True) - content = files("semble").joinpath("agents/semble-search.md").read_text(encoding="utf-8") - dest.write_text(content, encoding="utf-8") - print(f"Created {dest}") - - -def _cli_main() -> None: - import argparse - - parser = argparse.ArgumentParser(prog="semble") - sub = parser.add_subparsers(dest="command") - - search_p = sub.add_parser("search", help="Search a codebase.") - search_p.add_argument("query", help="Natural language or code query.") - search_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") - search_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") - search_p.add_argument( - "-m", "--mode", default="hybrid", choices=["hybrid", "semantic", "bm25"], help="Search mode (default: hybrid)." - ) - - related_p = sub.add_parser("find-related", help="Find code similar to a specific location.") - related_p.add_argument("file_path", help="File path as shown in search results.") - related_p.add_argument("line", type=int, help="Line number (1-indexed).") - related_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") - related_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") - - init_p = sub.add_parser("init", help="Write .claude/agents/semble-search.md for sub-agent support.") - init_p.add_argument("--force", action="store_true", help="Overwrite if the file already exists.") - - args = parser.parse_args() - - if args.command == "init": - _run_init(force=args.force) - return - - index = SembleIndex.from_git(args.path) if _is_git_url(args.path) else SembleIndex.from_path(args.path) - - if args.command == "search": - results = index.search(args.query, top_k=args.top_k, mode=args.mode) - if not results: - print("No results found.") - else: - print(_format_results(f"Search results for: {args.query!r} (mode={args.mode})", results)) - - elif args.command == "find-related": - chunk = _resolve_chunk(index.chunks, args.file_path, args.line) - if chunk is None: - print(f"No chunk found at {args.file_path}:{args.line}.", file=sys.stderr) - sys.exit(1) - results = index.find_related(chunk, top_k=args.top_k) - if not results: - print(f"No related chunks found for {args.file_path}:{args.line}.") - else: - print(_format_results(f"Chunks related to {args.file_path}:{args.line}", results)) diff --git a/src/semble/utils.py b/src/semble/utils.py new file mode 100644 index 0000000..43301fa --- /dev/null +++ b/src/semble/utils.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +import re + +from semble.types import Chunk, SearchResult + +_GIT_URL_SCHEMES = ("https://", "http://", "ssh://", "git://", "git+ssh://", "file://") +_SCP_GIT_URL_RE = re.compile(r"^[\w.-]+@[\w.-]+:(?!/)") + + +def _is_git_url(path: str) -> bool: + """Return True if path looks like a remote git URL rather than a local path.""" + return path.startswith(_GIT_URL_SCHEMES) or _SCP_GIT_URL_RE.match(path) is not None + + +def _resolve_chunk(chunks: list[Chunk], file_path: str, line: int) -> Chunk | None: + """Return the chunk that contains *line* in *file_path*, or None. + + MCP tool arguments are JSON primitives (strings and ints), so the agent + passes file_path + line rather than a Chunk object. This function + reconstructs the Chunk at the MCP boundary before calling into the library. + + :param chunks: All indexed chunks to search. + :param file_path: File path as stored in the index. + :param line: 1-indexed line number to resolve. + :return: The best-matching Chunk, or None if not found. + """ + fallback = None + for chunk in chunks: + if chunk.file_path == file_path and chunk.start_line <= line <= chunk.end_line: + if line < chunk.end_line: + return chunk + if fallback is None: # line == end_line: boundary; keep as fallback for end-of-file chunks + fallback = chunk + return fallback + + +def _format_results(header: str, results: list[SearchResult]) -> str: + """Render SearchResult objects as numbered, fenced code blocks.""" + lines: list[str] = [header, ""] + for i, r in enumerate(results, 1): + lines.append(f"## {i}. {r.chunk.location} [score={r.score:.3f}]") + lines.append("```") + lines.append(r.chunk.content.strip()) + lines.append("```") + lines.append("") + return "\n".join(lines) diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..b210832 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,212 @@ +import subprocess +import sys +from importlib.resources import files +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from semble.cli import _CLAUDE_FILE_PATH, _cli_main, _run_init, main +from semble.types import SearchMode, SearchResult +from tests.conftest import make_chunk + +_CLAUDE_AGENT_FILE = files("semble").joinpath("agents/semble-search.md").read_text(encoding="utf-8") + + +@pytest.mark.parametrize( + "argv", + [ + ["semble", "/some/path", "--ref", "main"], + ["semble"], + ], +) +def test_main_calls_asyncio_run(argv: list[str], monkeypatch: pytest.MonkeyPatch) -> None: + """main() delegates to asyncio.run(serve(...)) when no CLI subcommand is given.""" + monkeypatch.setattr(sys, "argv", argv) + with patch("asyncio.run") as mock_run: + mock_run.side_effect = lambda coro: coro.close() + main() + mock_run.assert_called_once() + + +@pytest.mark.parametrize( + "argv, expected_in_output", + [ + (["semble", "search", "query text", "/some/path"], ["query text", "0.9"]), + (["semble", "search", "nothing", "/some/path", "--top-k", "3", "--mode", "bm25"], ["No results found"]), + ], +) +def test_cli_search( + argv: list[str], + expected_in_output: list[str], + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """_cli_main search subcommand calls index.search and prints results.""" + chunk = make_chunk("def foo(): pass", "src/foo.py") + fake_index = MagicMock() + has_results = "No results" not in expected_in_output[0] + fake_index.search.return_value = ( + [SearchResult(chunk=chunk, score=0.9, source=SearchMode.HYBRID)] if has_results else [] + ) + monkeypatch.setattr(sys, "argv", argv) + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + _cli_main() + out = capsys.readouterr().out + for fragment in expected_in_output: + assert fragment in out + + +def test_cli_find_related( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """_cli_main find-related subcommand calls index.find_related and prints results.""" + chunk = make_chunk("class Bar: pass", "src/bar.py") + fake_index = MagicMock() + fake_index.chunks = [chunk] + fake_index.find_related.return_value = [SearchResult(chunk=chunk, score=0.8, source=SearchMode.SEMANTIC)] + monkeypatch.setattr(sys, "argv", ["semble", "find-related", "src/bar.py", "1", "/some/path"]) + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + _cli_main() + out = capsys.readouterr().out + assert "src/bar.py" in out + assert "0.800" in out + + +def test_cli_find_related_no_results( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """_cli_main find-related prints a message when the index returns no related chunks.""" + chunk = make_chunk("class Bar: pass", "src/bar.py") + fake_index = MagicMock() + fake_index.chunks = [chunk] + fake_index.find_related.return_value = [] + monkeypatch.setattr(sys, "argv", ["semble", "find-related", "src/bar.py", "1", "/some/path"]) + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + _cli_main() + assert "No related chunks found" in capsys.readouterr().out + + +def test_cli_find_related_unknown_chunk( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """_cli_main find-related exits with code 1 when chunk is not found.""" + fake_index = MagicMock() + fake_index.chunks = [] + monkeypatch.setattr(sys, "argv", ["semble", "find-related", "unknown.py", "1", "/some/path"]) + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + with pytest.raises(SystemExit) as exc_info: + _cli_main() + assert exc_info.value.code == 1 + assert "No chunk found" in capsys.readouterr().err + + +def test_init_creates_file(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: + """_run_init writes the agent file and prints its path.""" + monkeypatch.chdir(tmp_path) + _run_init() + dest = tmp_path / _CLAUDE_FILE_PATH + assert dest.exists() + assert dest.read_text(encoding="utf-8") == _CLAUDE_AGENT_FILE + assert str(_CLAUDE_FILE_PATH) in capsys.readouterr().out + + +def test_init_refuses_overwrite_without_force( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + """_run_init exits with code 1 when the file exists and force=False.""" + monkeypatch.chdir(tmp_path) + _run_init() + with pytest.raises(SystemExit) as exc_info: + _run_init() + assert exc_info.value.code == 1 + assert "already exists" in capsys.readouterr().err + + +def test_init_overwrites_with_force(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + """_run_init overwrites an existing file when force=True.""" + monkeypatch.chdir(tmp_path) + dest = tmp_path / _CLAUDE_FILE_PATH + dest.parent.mkdir(parents=True, exist_ok=True) + dest.write_text("old content", encoding="utf-8") + _run_init(force=True) + assert dest.read_text(encoding="utf-8") == _CLAUDE_AGENT_FILE + + +def test_init_via_cli(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: + """Semble init creates the Claude agent file via _cli_main.""" + monkeypatch.chdir(tmp_path) + monkeypatch.setattr(sys, "argv", ["semble", "init"]) + _cli_main() + assert (tmp_path / _CLAUDE_FILE_PATH).exists() + assert str(_CLAUDE_FILE_PATH) in capsys.readouterr().out + + +def test_main_dispatches_to_cli( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """main() routes to _cli_main when first argument is a CLI subcommand.""" + chunk = make_chunk("def foo(): pass", "src/foo.py") + fake_index = MagicMock() + fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.9, source=SearchMode.HYBRID)] + monkeypatch.setattr(sys, "argv", ["semble", "search", "query text", "/some/path"]) + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + main() + assert "query text" in capsys.readouterr().out + + +def test_cli_works_without_mcp_installed( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """CLI subcommands succeed even when the mcp package is not installed.""" + chunk = make_chunk("def foo(): pass", "src/foo.py") + fake_index = MagicMock() + fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.9, source=SearchMode.HYBRID)] + monkeypatch.setattr(sys, "argv", ["semble", "search", "query", "/some/path"]) + monkeypatch.setitem(sys.modules, "mcp", None) + monkeypatch.setitem(sys.modules, "mcp.server.fastmcp", None) + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + _cli_main() + assert "query" in capsys.readouterr().out + + +def test_agent_file_tools_are_bash_only() -> None: + """The agent file must list only Bash and Read — no MCP tools that require schema loading.""" + frontmatter = _CLAUDE_AGENT_FILE.split("---")[1] + tools_line = next(line for line in frontmatter.splitlines() if line.startswith("tools:")) + tools = [t.strip() for t in tools_line.removeprefix("tools:").split(",")] + assert set(tools) == {"Bash", "Read"}, f"Unexpected tools in agent file: {tools}" + assert not any("mcp__" in t for t in tools) + + +@pytest.mark.slow +def test_cli_search_subprocess() -> None: + """Semble search runs end-to-end as a subprocess — the path a sub-agent takes via Bash.""" + repo_root = Path(__file__).parent.parent + result = subprocess.run( + ["uv", "run", "semble", "search", "MCP server entry point", str(repo_root)], + capture_output=True, + text=True, + cwd=repo_root, + ) + assert result.returncode == 0, result.stderr + assert "mcp.py" in result.stdout + + +@pytest.mark.slow +def test_cli_find_related_subprocess() -> None: + """Semble find-related runs end-to-end as a subprocess — the path a sub-agent takes via Bash.""" + repo_root = Path(__file__).parent.parent + result = subprocess.run( + ["uv", "run", "semble", "find-related", "src/semble/mcp.py", "30", str(repo_root)], + capture_output=True, + text=True, + cwd=repo_root, + ) + assert result.returncode == 0, result.stderr + assert result.stdout.strip() diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 7ecaace..43100ba 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -1,28 +1,14 @@ -import sys -from importlib.resources import files from pathlib import Path from typing import Any from unittest.mock import AsyncMock, MagicMock, patch import pytest -from semble.mcp import ( - _AGENT_FILE_PATH, - _cli_main, - _format_results, - _IndexCache, - _is_git_url, - _resolve_chunk, - _run_init, - create_server, - main, - serve, -) +from semble.mcp import _IndexCache, create_server, serve from semble.types import Chunk, Encoder, SearchMode, SearchResult +from semble.utils import _format_results, _is_git_url, _resolve_chunk from tests.conftest import make_chunk -_AGENT_FILE = files("semble").joinpath("agents/semble-search.md").read_text(encoding="utf-8") - def _tool_text(result: Any) -> str: """Extract the text string from a FastMCP call_tool result.""" @@ -267,149 +253,3 @@ async def test_serve_runs_stdio(tmp_path: Path, with_path: bool) -> None: await (serve(str(tmp_path)) if with_path else serve()) mock_run.assert_called_once() - - -@pytest.mark.parametrize( - "argv", - [ - ["semble", "/some/path", "--ref", "main"], - ["semble"], - ], -) -def test_main_calls_asyncio_run(argv: list[str], monkeypatch: pytest.MonkeyPatch) -> None: - """main() parses argv and delegates to asyncio.run(serve(...)).""" - monkeypatch.setattr(sys, "argv", argv) - with patch("semble.mcp.asyncio.run") as mock_run: - mock_run.side_effect = lambda coro: coro.close() - main() - mock_run.assert_called_once() - - -@pytest.mark.parametrize( - "argv, expected_in_output", - [ - (["semble", "search", "query text", "/some/path"], ["query text", "0.9"]), - (["semble", "search", "nothing", "/some/path", "--top-k", "3", "--mode", "bm25"], ["No results found"]), - ], -) -def test_cli_search( - argv: list[str], - expected_in_output: list[str], - monkeypatch: pytest.MonkeyPatch, - capsys: pytest.CaptureFixture[str], -) -> None: - """_cli_main search subcommand calls index.search and prints results.""" - chunk = make_chunk("def foo(): pass", "src/foo.py") - fake_index = MagicMock() - has_results = "No results" not in expected_in_output[0] - fake_index.search.return_value = ( - [SearchResult(chunk=chunk, score=0.9, source=SearchMode.HYBRID)] if has_results else [] - ) - monkeypatch.setattr(sys, "argv", argv) - with patch("semble.mcp.SembleIndex.from_path", return_value=fake_index): - _cli_main() - out = capsys.readouterr().out - for fragment in expected_in_output: - assert fragment in out - - -def test_cli_find_related( - monkeypatch: pytest.MonkeyPatch, - capsys: pytest.CaptureFixture[str], -) -> None: - """_cli_main find-related subcommand calls index.find_related and prints results.""" - chunk = make_chunk("class Bar: pass", "src/bar.py") - fake_index = MagicMock() - fake_index.chunks = [chunk] - fake_index.find_related.return_value = [SearchResult(chunk=chunk, score=0.8, source=SearchMode.SEMANTIC)] - monkeypatch.setattr(sys, "argv", ["semble", "find-related", "src/bar.py", "1", "/some/path"]) - with patch("semble.mcp.SembleIndex.from_path", return_value=fake_index): - _cli_main() - out = capsys.readouterr().out - assert "src/bar.py" in out - assert "0.800" in out - - -def test_cli_find_related_no_results( - monkeypatch: pytest.MonkeyPatch, - capsys: pytest.CaptureFixture[str], -) -> None: - """_cli_main find-related prints a message when the index returns no related chunks.""" - chunk = make_chunk("class Bar: pass", "src/bar.py") - fake_index = MagicMock() - fake_index.chunks = [chunk] - fake_index.find_related.return_value = [] - monkeypatch.setattr(sys, "argv", ["semble", "find-related", "src/bar.py", "1", "/some/path"]) - with patch("semble.mcp.SembleIndex.from_path", return_value=fake_index): - _cli_main() - assert "No related chunks found" in capsys.readouterr().out - - -def test_cli_find_related_unknown_chunk( - monkeypatch: pytest.MonkeyPatch, - capsys: pytest.CaptureFixture[str], -) -> None: - """_cli_main find-related exits with code 1 when chunk is not found.""" - fake_index = MagicMock() - fake_index.chunks = [] - monkeypatch.setattr(sys, "argv", ["semble", "find-related", "unknown.py", "1", "/some/path"]) - with patch("semble.mcp.SembleIndex.from_path", return_value=fake_index): - with pytest.raises(SystemExit) as exc_info: - _cli_main() - assert exc_info.value.code == 1 - assert "No chunk found" in capsys.readouterr().err - - -def test_init_creates_file(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: - """_run_init writes the agent file and prints its path.""" - monkeypatch.chdir(tmp_path) - _run_init() - dest = tmp_path / _AGENT_FILE_PATH - assert dest.exists() - assert dest.read_text(encoding="utf-8") == _AGENT_FILE - assert str(_AGENT_FILE_PATH) in capsys.readouterr().out - - -def test_init_refuses_overwrite_without_force( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] -) -> None: - """_run_init exits with code 1 when the file exists and --force is not set.""" - monkeypatch.chdir(tmp_path) - _run_init() - with pytest.raises(SystemExit) as exc_info: - _run_init() - assert exc_info.value.code == 1 - assert "already exists" in capsys.readouterr().err - - -def test_init_overwrites_with_force(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: - """_run_init overwrites an existing file when force=True.""" - monkeypatch.chdir(tmp_path) - dest = tmp_path / _AGENT_FILE_PATH - dest.parent.mkdir(parents=True, exist_ok=True) - dest.write_text("old content", encoding="utf-8") - _run_init(force=True) - assert dest.read_text(encoding="utf-8") == _AGENT_FILE - - -def test_init_via_cli(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: - """Semble init subcommand creates the agent file via _cli_main.""" - monkeypatch.chdir(tmp_path) - monkeypatch.setattr(sys, "argv", ["semble", "init"]) - _cli_main() - assert (tmp_path / _AGENT_FILE_PATH).exists() - assert str(_AGENT_FILE_PATH) in capsys.readouterr().out - - -def test_main_dispatches_to_cli( - monkeypatch: pytest.MonkeyPatch, - capsys: pytest.CaptureFixture[str], -) -> None: - """main() routes to _cli_main when first argument is a CLI subcommand.""" - chunk = make_chunk("def foo(): pass", "src/foo.py") - fake_index = MagicMock() - fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.9, source=SearchMode.HYBRID)] - monkeypatch.setattr(sys, "argv", ["semble", "search", "query text", "/some/path"]) - with patch("semble.mcp.SembleIndex.from_path", return_value=fake_index): - main() - assert "query text" in capsys.readouterr().out From 62e0d09aab21b207d4d64171ac08b28f3bab3892 Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 30 Apr 2026 17:56:07 +0200 Subject: [PATCH 05/14] Add CLI and agents file --- README.md | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c78bde8..c5c5cb2 100644 --- a/README.md +++ b/README.md @@ -141,7 +141,7 @@ Add to `~/.cursor/mcp.json` (or `.cursor/mcp.json` in your project): ### Sub-agent support -Claude Code and Codex CLI lazy-load MCP tool schemas, so sub-agents cannot call `mcp__semble__search` directly. The fix is to have sub-agents invoke semble through the [CLI](#cli) via Bash instead. +Claude Code and Codex CLI lazy-load MCP tool schemas, so sub-agents cannot call `mcp__semble__search` directly. The fix is to invoke semble through the [CLI](#cli) via Bash instead. **Claude Code** — run this once in your project root and commit the result: @@ -151,7 +151,29 @@ semble init uvx --from "semble[mcp]" semble init ``` -This writes [`.claude/agents/semble-search.md`](src/semble/agents/semble-search.md) into your project. Use `--force` to overwrite an existing file. +This writes [`.claude/agents/semble-search.md`](src/semble/agents/semble-search.md). Use `--force` to overwrite. + +**Other tools (Codex, etc.)** — append the following to your `AGENTS.md`: + +```markdown +## Code Search + +Use `semble search` to find code by describing what it does, instead of grep: + +​```bash +semble search "authentication flow" /path/to/repo +semble search "save model to disk" /path/to/repo --top-k 10 +​``` + +Use `semble find-related` to discover code similar to a known location: + +​```bash +semble find-related src/auth.py 42 /path/to/repo +​``` + +Both commands default `path` to the current directory. Git URLs are accepted. +If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. +``` ## How it works From d5bc16ff4ffa4bfb5a6e66c712dcecbc490ee7c2 Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 30 Apr 2026 17:57:47 +0200 Subject: [PATCH 06/14] Add CLI and agents file --- README.md | 6 +++--- src/semble/cli.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index c5c5cb2..6eb20ce 100644 --- a/README.md +++ b/README.md @@ -143,7 +143,7 @@ Add to `~/.cursor/mcp.json` (or `.cursor/mcp.json` in your project): Claude Code and Codex CLI lazy-load MCP tool schemas, so sub-agents cannot call `mcp__semble__search` directly. The fix is to invoke semble through the [CLI](#cli) via Bash instead. -**Claude Code** — run this once in your project root and commit the result: +**Claude Code**: run this once in your project root: ```bash semble init @@ -151,9 +151,9 @@ semble init uvx --from "semble[mcp]" semble init ``` -This writes [`.claude/agents/semble-search.md`](src/semble/agents/semble-search.md). Use `--force` to overwrite. +This writes [`.claude/agents/semble-search.md`](src/semble/agents/semble-search.md). -**Other tools (Codex, etc.)** — append the following to your `AGENTS.md`: +**Other tools (Codex, etc.)**: append the following to your `AGENTS.md`: ```markdown ## Code Search diff --git a/src/semble/cli.py b/src/semble/cli.py index c6f95ae..be375e5 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -12,15 +12,13 @@ def main() -> None: """Entry point for the semble command-line tool.""" - if len(sys.argv) > 1 and sys.argv[1] in ("search", "find-related", "init"): + if len(sys.argv) > 1 and sys.argv[1] in ("search", "find-related", "init", "-h", "--help"): _cli_main() else: _mcp_main() def _mcp_main() -> None: - from semble.mcp import serve - parser = argparse.ArgumentParser( prog="semble", description="Instant local code search for agents.", @@ -33,6 +31,8 @@ def _mcp_main() -> None: ) parser.add_argument("--ref", default=None, help="Branch or tag to check out (git URLs only).") args = parser.parse_args() + from semble.mcp import serve + asyncio.run(serve(args.path, ref=args.ref)) From 8b8549bb3431a1ac0348cc97b622465570d35093 Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 30 Apr 2026 18:00:52 +0200 Subject: [PATCH 07/14] Simplify --- src/semble/utils.py | 12 +++------ tests/test_cli.py | 65 +++++++++++++++++++++++++-------------------- 2 files changed, 39 insertions(+), 38 deletions(-) diff --git a/src/semble/utils.py b/src/semble/utils.py index 43301fa..89dc3f9 100644 --- a/src/semble/utils.py +++ b/src/semble/utils.py @@ -14,16 +14,10 @@ def _is_git_url(path: str) -> bool: def _resolve_chunk(chunks: list[Chunk], file_path: str, line: int) -> Chunk | None: - """Return the chunk that contains *line* in *file_path*, or None. + """Return the chunk containing *line* in *file_path*, or None. - MCP tool arguments are JSON primitives (strings and ints), so the agent - passes file_path + line rather than a Chunk object. This function - reconstructs the Chunk at the MCP boundary before calling into the library. - - :param chunks: All indexed chunks to search. - :param file_path: File path as stored in the index. - :param line: 1-indexed line number to resolve. - :return: The best-matching Chunk, or None if not found. + Reconstructs a Chunk from its JSON-primitive MCP tool arguments (file_path + line) + before calling into the library. """ fallback = None for chunk in chunks: diff --git a/tests/test_cli.py b/tests/test_cli.py index b210832..08a1ad1 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -12,6 +12,17 @@ _CLAUDE_AGENT_FILE = files("semble").joinpath("agents/semble-search.md").read_text(encoding="utf-8") +_SEARCH_CHUNK = make_chunk("def foo(): pass", "src/foo.py") +_SEARCH_RESULT = SearchResult(chunk=_SEARCH_CHUNK, score=0.9, source=SearchMode.HYBRID) + + +@pytest.fixture() +def fake_search_index() -> MagicMock: + """Fake index that returns one search result.""" + fake = MagicMock() + fake.search.return_value = [_SEARCH_RESULT] + return fake + @pytest.mark.parametrize( "argv", @@ -57,36 +68,36 @@ def test_cli_search( assert fragment in out -def test_cli_find_related( - monkeypatch: pytest.MonkeyPatch, - capsys: pytest.CaptureFixture[str], -) -> None: - """_cli_main find-related subcommand calls index.find_related and prints results.""" - chunk = make_chunk("class Bar: pass", "src/bar.py") - fake_index = MagicMock() - fake_index.chunks = [chunk] - fake_index.find_related.return_value = [SearchResult(chunk=chunk, score=0.8, source=SearchMode.SEMANTIC)] - monkeypatch.setattr(sys, "argv", ["semble", "find-related", "src/bar.py", "1", "/some/path"]) - with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): - _cli_main() - out = capsys.readouterr().out - assert "src/bar.py" in out - assert "0.800" in out +_BAR_CHUNK = make_chunk("class Bar: pass", "src/bar.py") -def test_cli_find_related_no_results( +@pytest.mark.parametrize( + ("find_related_return", "expected_fragments"), + [ + pytest.param( + [SearchResult(chunk=_BAR_CHUNK, score=0.8, source=SearchMode.SEMANTIC)], + ["src/bar.py", "0.800"], + id="with_results", + ), + pytest.param([], ["No related chunks found"], id="no_results"), + ], +) +def test_cli_find_related( + find_related_return: list[SearchResult], + expected_fragments: list[str], monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - """_cli_main find-related prints a message when the index returns no related chunks.""" - chunk = make_chunk("class Bar: pass", "src/bar.py") + """_cli_main find-related prints results or an empty-state message.""" fake_index = MagicMock() - fake_index.chunks = [chunk] - fake_index.find_related.return_value = [] + fake_index.chunks = [_BAR_CHUNK] + fake_index.find_related.return_value = find_related_return monkeypatch.setattr(sys, "argv", ["semble", "find-related", "src/bar.py", "1", "/some/path"]) with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): _cli_main() - assert "No related chunks found" in capsys.readouterr().out + out = capsys.readouterr().out + for fragment in expected_fragments: + assert fragment in out def test_cli_find_related_unknown_chunk( @@ -146,31 +157,27 @@ def test_init_via_cli(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: p def test_main_dispatches_to_cli( + fake_search_index: MagicMock, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: """main() routes to _cli_main when first argument is a CLI subcommand.""" - chunk = make_chunk("def foo(): pass", "src/foo.py") - fake_index = MagicMock() - fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.9, source=SearchMode.HYBRID)] monkeypatch.setattr(sys, "argv", ["semble", "search", "query text", "/some/path"]) - with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + with patch("semble.cli.SembleIndex.from_path", return_value=fake_search_index): main() assert "query text" in capsys.readouterr().out def test_cli_works_without_mcp_installed( + fake_search_index: MagicMock, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: """CLI subcommands succeed even when the mcp package is not installed.""" - chunk = make_chunk("def foo(): pass", "src/foo.py") - fake_index = MagicMock() - fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.9, source=SearchMode.HYBRID)] monkeypatch.setattr(sys, "argv", ["semble", "search", "query", "/some/path"]) monkeypatch.setitem(sys.modules, "mcp", None) monkeypatch.setitem(sys.modules, "mcp.server.fastmcp", None) - with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + with patch("semble.cli.SembleIndex.from_path", return_value=fake_search_index): _cli_main() assert "query" in capsys.readouterr().out From 1b9b4b85ec2ea2dbee248d783119d7d7d7725175 Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 30 Apr 2026 18:03:22 +0200 Subject: [PATCH 08/14] Simplify --- pyproject.toml | 1 - tests/test_cli.py | 94 +++++++++++++++-------------------------------- 2 files changed, 29 insertions(+), 66 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bf3540b..51723c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,7 +132,6 @@ ignore_missing_imports = true [tool.pytest.ini_options] testpaths = ["tests"] addopts = ["--tb=short", "--strict-markers", "--cov=semble", "--cov-report=term-missing"] -markers = ["slow: end-to-end tests that run real indexing and CLI subprocesses (deselect with -m 'not slow')"] [tool.coverage.run] relative_files = true diff --git a/tests/test_cli.py b/tests/test_cli.py index 08a1ad1..91712e4 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,4 +1,3 @@ -import subprocess import sys from importlib.resources import files from pathlib import Path @@ -12,17 +11,6 @@ _CLAUDE_AGENT_FILE = files("semble").joinpath("agents/semble-search.md").read_text(encoding="utf-8") -_SEARCH_CHUNK = make_chunk("def foo(): pass", "src/foo.py") -_SEARCH_RESULT = SearchResult(chunk=_SEARCH_CHUNK, score=0.9, source=SearchMode.HYBRID) - - -@pytest.fixture() -def fake_search_index() -> MagicMock: - """Fake index that returns one search result.""" - fake = MagicMock() - fake.search.return_value = [_SEARCH_RESULT] - return fake - @pytest.mark.parametrize( "argv", @@ -68,36 +56,36 @@ def test_cli_search( assert fragment in out -_BAR_CHUNK = make_chunk("class Bar: pass", "src/bar.py") - - -@pytest.mark.parametrize( - ("find_related_return", "expected_fragments"), - [ - pytest.param( - [SearchResult(chunk=_BAR_CHUNK, score=0.8, source=SearchMode.SEMANTIC)], - ["src/bar.py", "0.800"], - id="with_results", - ), - pytest.param([], ["No related chunks found"], id="no_results"), - ], -) def test_cli_find_related( - find_related_return: list[SearchResult], - expected_fragments: list[str], monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - """_cli_main find-related prints results or an empty-state message.""" + """_cli_main find-related prints results.""" + chunk = make_chunk("class Bar: pass", "src/bar.py") fake_index = MagicMock() - fake_index.chunks = [_BAR_CHUNK] - fake_index.find_related.return_value = find_related_return + fake_index.chunks = [chunk] + fake_index.find_related.return_value = [SearchResult(chunk=chunk, score=0.8, source=SearchMode.SEMANTIC)] monkeypatch.setattr(sys, "argv", ["semble", "find-related", "src/bar.py", "1", "/some/path"]) with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): _cli_main() out = capsys.readouterr().out - for fragment in expected_fragments: - assert fragment in out + assert "src/bar.py" in out + assert "0.800" in out + + +def test_cli_find_related_no_results( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """_cli_main find-related prints a message when the index returns no related chunks.""" + chunk = make_chunk("class Bar: pass", "src/bar.py") + fake_index = MagicMock() + fake_index.chunks = [chunk] + fake_index.find_related.return_value = [] + monkeypatch.setattr(sys, "argv", ["semble", "find-related", "src/bar.py", "1", "/some/path"]) + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + _cli_main() + assert "No related chunks found" in capsys.readouterr().out def test_cli_find_related_unknown_chunk( @@ -157,27 +145,31 @@ def test_init_via_cli(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: p def test_main_dispatches_to_cli( - fake_search_index: MagicMock, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: """main() routes to _cli_main when first argument is a CLI subcommand.""" + chunk = make_chunk("def foo(): pass", "src/foo.py") + fake_index = MagicMock() + fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.9, source=SearchMode.HYBRID)] monkeypatch.setattr(sys, "argv", ["semble", "search", "query text", "/some/path"]) - with patch("semble.cli.SembleIndex.from_path", return_value=fake_search_index): + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): main() assert "query text" in capsys.readouterr().out def test_cli_works_without_mcp_installed( - fake_search_index: MagicMock, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: """CLI subcommands succeed even when the mcp package is not installed.""" + chunk = make_chunk("def foo(): pass", "src/foo.py") + fake_index = MagicMock() + fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.9, source=SearchMode.HYBRID)] monkeypatch.setattr(sys, "argv", ["semble", "search", "query", "/some/path"]) monkeypatch.setitem(sys.modules, "mcp", None) monkeypatch.setitem(sys.modules, "mcp.server.fastmcp", None) - with patch("semble.cli.SembleIndex.from_path", return_value=fake_search_index): + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): _cli_main() assert "query" in capsys.readouterr().out @@ -189,31 +181,3 @@ def test_agent_file_tools_are_bash_only() -> None: tools = [t.strip() for t in tools_line.removeprefix("tools:").split(",")] assert set(tools) == {"Bash", "Read"}, f"Unexpected tools in agent file: {tools}" assert not any("mcp__" in t for t in tools) - - -@pytest.mark.slow -def test_cli_search_subprocess() -> None: - """Semble search runs end-to-end as a subprocess — the path a sub-agent takes via Bash.""" - repo_root = Path(__file__).parent.parent - result = subprocess.run( - ["uv", "run", "semble", "search", "MCP server entry point", str(repo_root)], - capture_output=True, - text=True, - cwd=repo_root, - ) - assert result.returncode == 0, result.stderr - assert "mcp.py" in result.stdout - - -@pytest.mark.slow -def test_cli_find_related_subprocess() -> None: - """Semble find-related runs end-to-end as a subprocess — the path a sub-agent takes via Bash.""" - repo_root = Path(__file__).parent.parent - result = subprocess.run( - ["uv", "run", "semble", "find-related", "src/semble/mcp.py", "30", str(repo_root)], - capture_output=True, - text=True, - cwd=repo_root, - ) - assert result.returncode == 0, result.stderr - assert result.stdout.strip() From 0afffbc74051b4094bfd26adc8fc249c7e00bf90 Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 30 Apr 2026 18:06:36 +0200 Subject: [PATCH 09/14] Simplify --- README.md | 10 +++++----- src/semble/agents/semble-search.md | 14 +++++++------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 6eb20ce..d92af03 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ semble search "save model to disk" https://github.com/MinishLab/model2vec semble find-related src/auth.py 42 ./my-project ``` -`path` defaults to the current directory when omitted. +`path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. @@ -161,17 +161,17 @@ This writes [`.claude/agents/semble-search.md`](src/semble/agents/semble-search. Use `semble search` to find code by describing what it does, instead of grep: ​```bash -semble search "authentication flow" /path/to/repo -semble search "save model to disk" /path/to/repo --top-k 10 +semble search "authentication flow" ./my-project +semble search "save model to disk" ./my-project --top-k 10 ​``` Use `semble find-related` to discover code similar to a known location: ​```bash -semble find-related src/auth.py 42 /path/to/repo +semble find-related src/auth.py 42 ./my-project ​``` -Both commands default `path` to the current directory. Git URLs are accepted. +`path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ``` diff --git a/src/semble/agents/semble-search.md b/src/semble/agents/semble-search.md index 9934d85..e7ab141 100644 --- a/src/semble/agents/semble-search.md +++ b/src/semble/agents/semble-search.md @@ -4,22 +4,22 @@ description: Code search agent for exploring any codebase. Use for finding code tools: Bash, Read --- -Use `semble search` to find code by describing what it does: +Use `semble search` to find code by describing what it does, instead of grep: ```bash -semble search "authentication flow" /path/to/repo -semble search "save model to disk" /path/to/repo --top-k 10 +semble search "authentication flow" ./my-project +semble search "save model to disk" ./my-project --top-k 10 ``` -Use `semble find-related` to find code similar to a known location (pass `file_path` and `line` from a prior search result): +Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ```bash -semble find-related src/auth.py 42 /path/to/repo +semble find-related src/auth.py 42 ./my-project ``` -Both commands default `path` to the current directory if omitted. Git URLs are also accepted as `path`. +`path` defaults to the current directory when omitted; git URLs are accepted. -If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in place of `semble`. +If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ## Workflow From effa3431a2a41769cd4e3f4404a5665406034b52 Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 30 Apr 2026 18:11:49 +0200 Subject: [PATCH 10/14] Simplify --- README.md | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index d92af03..0de4ca6 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,8 @@ [Quickstart](#quickstart) • [Main Features](#main-features) • -[CLI](#cli) • [MCP Server](#mcp-server) • +[CLI](#cli) • [How it works](#how-it-works) • [Benchmarks](#benchmarks) @@ -66,25 +66,6 @@ result.chunk.content # "def save_pretrained(self, path: PathLike, ..." - **MCP server**: drop-in tool for Claude Code, Cursor, Codex, OpenCode, and any other MCP-compatible agent. - **Local and remote**: pass a local path or a git URL. -## CLI - -Semble also ships as a standalone CLI for use outside of MCP. This is useful in scripts, sub-agents, or anywhere you want search results without an MCP session. - -```bash -# Search a local repo -semble search "authentication flow" ./my-project - -# Search a remote repo (cloned on demand) -semble search "save model to disk" https://github.com/MinishLab/model2vec - -# Find code similar to a known location (file_path and line from a prior search result) -semble find-related src/auth.py 42 ./my-project -``` - -`path` defaults to the current directory when omitted; git URLs are accepted. - -If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. - ## MCP Server Semble can run as an MCP server so agents can search any codebase directly. Repos are cloned and indexed on demand, and indexes are cached for the lifetime of the session. @@ -175,6 +156,25 @@ semble find-related src/auth.py 42 ./my-project If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. ``` +## CLI + +Semble also ships as a standalone CLI for use outside of MCP. This is useful in scripts, sub-agents, or anywhere you want search results without an MCP session. + +```bash +# Search a local repo +semble search "authentication flow" ./my-project + +# Search a remote repo (cloned on demand) +semble search "save model to disk" https://github.com/MinishLab/model2vec + +# Find code similar to a known location (file_path and line from a prior search result) +semble find-related src/auth.py 42 ./my-project +``` + +`path` defaults to the current directory when omitted; git URLs are accepted. + +If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. + ## How it works Semble splits each file into code-aware chunks using [Chonkie](https://github.com/chonkie-inc/chonkie), then scores every query against the chunks with two complementary retrievers: static [Model2Vec](https://github.com/MinishLab/model2vec) embeddings using the code-specialized [potion-code-16M](https://huggingface.co/minishlab/potion-code-16M) model for semantic similarity, and [BM25](https://github.com/xhluca/bm25s) for lexical matches on identifiers and API names. The two score lists are fused with Reciprocal Rank Fusion (RRF). From d2a04ac3127e6321edc3d71f09a8c3f855fe8c4b Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 30 Apr 2026 18:17:27 +0200 Subject: [PATCH 11/14] Simplify --- README.md | 10 ++++- tests/test_cli.py | 93 ++++++++++++++++++++++++++--------------------- 2 files changed, 60 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 0de4ca6..ebbe080 100644 --- a/README.md +++ b/README.md @@ -146,14 +146,22 @@ semble search "authentication flow" ./my-project semble search "save model to disk" ./my-project --top-k 10 ​``` -Use `semble find-related` to discover code similar to a known location: +Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ​```bash semble find-related src/auth.py 42 ./my-project ​``` `path` defaults to the current directory when omitted; git URLs are accepted. + If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. + +## Workflow + +1. Start with `semble search` to find relevant chunks. +2. Use `Read` to inspect a full file when the chunk alone isn't enough context. +3. Use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +4. Fall back to Bash `grep` only for exact string matches (variable names, import statements). ``` ## CLI diff --git a/tests/test_cli.py b/tests/test_cli.py index 91712e4..f9df115 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -56,51 +56,43 @@ def test_cli_search( assert fragment in out +@pytest.mark.parametrize( + ("scenario", "expected_stdout", "expected_stderr", "expected_exit_code"), + [ + ("with_results", ["src/bar.py", "0.800"], None, None), + ("no_results", ["No related chunks found"], None, None), + ("unknown_chunk", [], "No chunk found", 1), + ], +) def test_cli_find_related( + scenario: str, + expected_stdout: list[str], + expected_stderr: str | None, + expected_exit_code: int | None, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - """_cli_main find-related prints results.""" - chunk = make_chunk("class Bar: pass", "src/bar.py") - fake_index = MagicMock() - fake_index.chunks = [chunk] - fake_index.find_related.return_value = [SearchResult(chunk=chunk, score=0.8, source=SearchMode.SEMANTIC)] - monkeypatch.setattr(sys, "argv", ["semble", "find-related", "src/bar.py", "1", "/some/path"]) - with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): - _cli_main() - out = capsys.readouterr().out - assert "src/bar.py" in out - assert "0.800" in out - - -def test_cli_find_related_no_results( - monkeypatch: pytest.MonkeyPatch, - capsys: pytest.CaptureFixture[str], -) -> None: - """_cli_main find-related prints a message when the index returns no related chunks.""" + """_cli_main find-related prints results, empty states, and missing-chunk errors.""" chunk = make_chunk("class Bar: pass", "src/bar.py") fake_index = MagicMock() - fake_index.chunks = [chunk] - fake_index.find_related.return_value = [] - monkeypatch.setattr(sys, "argv", ["semble", "find-related", "src/bar.py", "1", "/some/path"]) - with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): - _cli_main() - assert "No related chunks found" in capsys.readouterr().out - - -def test_cli_find_related_unknown_chunk( - monkeypatch: pytest.MonkeyPatch, - capsys: pytest.CaptureFixture[str], -) -> None: - """_cli_main find-related exits with code 1 when chunk is not found.""" - fake_index = MagicMock() - fake_index.chunks = [] - monkeypatch.setattr(sys, "argv", ["semble", "find-related", "unknown.py", "1", "/some/path"]) + fake_index.chunks = [] if scenario == "unknown_chunk" else [chunk] + fake_index.find_related.return_value = ( + [SearchResult(chunk=chunk, score=0.8, source=SearchMode.SEMANTIC)] if scenario == "with_results" else [] + ) + file_path = "unknown.py" if scenario == "unknown_chunk" else "src/bar.py" + monkeypatch.setattr(sys, "argv", ["semble", "find-related", file_path, "1", "/some/path"]) with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): - with pytest.raises(SystemExit) as exc_info: + if expected_exit_code is None: _cli_main() - assert exc_info.value.code == 1 - assert "No chunk found" in capsys.readouterr().err + else: + with pytest.raises(SystemExit) as exc_info: + _cli_main() + assert exc_info.value.code == expected_exit_code + captured = capsys.readouterr() + for fragment in expected_stdout: + assert fragment in captured.out + if expected_stderr: + assert expected_stderr in captured.err def test_init_creates_file(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: @@ -158,20 +150,37 @@ def test_main_dispatches_to_cli( assert "query text" in capsys.readouterr().out -def test_cli_works_without_mcp_installed( +@pytest.mark.parametrize( + ("argv", "expected_stdout", "expect_system_exit"), + [ + (["semble", "--help"], "find-related", True), + (["semble", "search", "query", "/some/path"], "query", False), + ], +) +def test_cli_entrypoint_works_without_mcp_installed( + argv: list[str], + expected_stdout: str, + expect_system_exit: bool, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - """CLI subcommands succeed even when the mcp package is not installed.""" + """CLI entrypoint paths succeed even when the mcp package is not installed.""" chunk = make_chunk("def foo(): pass", "src/foo.py") fake_index = MagicMock() fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.9, source=SearchMode.HYBRID)] - monkeypatch.setattr(sys, "argv", ["semble", "search", "query", "/some/path"]) + monkeypatch.setattr(sys, "argv", argv) monkeypatch.setitem(sys.modules, "mcp", None) + monkeypatch.setitem(sys.modules, "mcp.server", None) monkeypatch.setitem(sys.modules, "mcp.server.fastmcp", None) + monkeypatch.setitem(sys.modules, "semble.mcp", None) with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): - _cli_main() - assert "query" in capsys.readouterr().out + if expect_system_exit: + with pytest.raises(SystemExit) as exc_info: + main() + assert exc_info.value.code == 0 + else: + main() + assert expected_stdout in capsys.readouterr().out def test_agent_file_tools_are_bash_only() -> None: From 069be6fb4cf7edefa6166dbab24280d5bad693df Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 30 Apr 2026 18:21:36 +0200 Subject: [PATCH 12/14] Simplify --- README.md | 12 ++++++++---- src/semble/agents/semble-search.md | 9 +++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index ebbe080..c47686b 100644 --- a/README.md +++ b/README.md @@ -139,10 +139,11 @@ This writes [`.claude/agents/semble-search.md`](src/semble/agents/semble-search. ```markdown ## Code Search -Use `semble search` to find code by describing what it does, instead of grep: +Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ​```bash semble search "authentication flow" ./my-project +semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ​``` @@ -159,9 +160,9 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ## Workflow 1. Start with `semble search` to find relevant chunks. -2. Use `Read` to inspect a full file when the chunk alone isn't enough context. -3. Use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Fall back to Bash `grep` only for exact string matches (variable names, import statements). +2. Inspect full files only when the returned chunk is not enough context. +3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. ``` ## CLI @@ -172,6 +173,9 @@ Semble also ships as a standalone CLI for use outside of MCP. This is useful in # Search a local repo semble search "authentication flow" ./my-project +# Search for a symbol or identifier +semble search "save_pretrained" ./my-project + # Search a remote repo (cloned on demand) semble search "save model to disk" https://github.com/MinishLab/model2vec diff --git a/src/semble/agents/semble-search.md b/src/semble/agents/semble-search.md index e7ab141..515d60e 100644 --- a/src/semble/agents/semble-search.md +++ b/src/semble/agents/semble-search.md @@ -4,10 +4,11 @@ description: Code search agent for exploring any codebase. Use for finding code tools: Bash, Read --- -Use `semble search` to find code by describing what it does, instead of grep: +Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash semble search "authentication flow" ./my-project +semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` @@ -24,6 +25,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ## Workflow 1. Start with `semble search` to find relevant chunks. -2. Use `Read` to inspect a full file when the chunk alone isn't enough context. -3. Use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Fall back to Bash `grep` only for exact string matches (variable names, import statements). +2. Inspect full files only when the returned chunk is not enough context. +3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. From 70d3bee5aca611f7de7f58bbb28a73b2ba1f932c Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 30 Apr 2026 20:51:25 +0200 Subject: [PATCH 13/14] Add frozenset --- src/semble/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/semble/cli.py b/src/semble/cli.py index be375e5..2f209f3 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -8,11 +8,12 @@ from semble.utils import _format_results, _is_git_url, _resolve_chunk _CLAUDE_FILE_PATH = Path(".claude") / "agents" / "semble-search.md" +_CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "init", "-h", "--help"}) def main() -> None: """Entry point for the semble command-line tool.""" - if len(sys.argv) > 1 and sys.argv[1] in ("search", "find-related", "init", "-h", "--help"): + if len(sys.argv) > 1 and sys.argv[1] in _CLI_DISPATCH_ARGS: _cli_main() else: _mcp_main() From d1445e088ab2797cd9e4f85000b4ceead21efb8f Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 30 Apr 2026 20:51:37 +0200 Subject: [PATCH 14/14] Bump version --- src/semble/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semble/version.py b/src/semble/version.py index b1c662b..be4dce4 100644 --- a/src/semble/version.py +++ b/src/semble/version.py @@ -1,2 +1,2 @@ -__version_triple__ = (0, 1, 0) +__version_triple__ = (0, 1, 1) __version__ = ".".join(map(str, __version_triple__))