diff --git a/README.md b/README.md index ff7f95a..c47686b 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ [Quickstart](#quickstart) • [Main Features](#main-features) • [MCP Server](#mcp-server) • +[CLI](#cli) • [How it works](#how-it-works) • [Benchmarks](#benchmarks) @@ -119,6 +120,73 @@ Add to `~/.cursor/mcp.json` (or `.cursor/mcp.json` in your project): | `search` | Search a codebase with a natural-language or code query. Pass `repo` as a git URL or local path. | | `find_related` | Given a file path and line number, return chunks semantically similar to the code at that location. | +### Sub-agent support + +Claude Code and Codex CLI lazy-load MCP tool schemas, so sub-agents cannot call `mcp__semble__search` directly. The fix is to invoke semble through the [CLI](#cli) via Bash instead. + +**Claude Code**: run this once in your project root: + +```bash +semble init +# or, if semble is not on $PATH: +uvx --from "semble[mcp]" semble init +``` + +This writes [`.claude/agents/semble-search.md`](src/semble/agents/semble-search.md). + +**Other tools (Codex, etc.)**: append the following to your `AGENTS.md`: + +```markdown +## Code Search + +Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: + +​```bash +semble search "authentication flow" ./my-project +semble search "save_pretrained" ./my-project +semble search "save model to disk" ./my-project --top-k 10 +​``` + +Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): + +​```bash +semble find-related src/auth.py 42 ./my-project +​``` + +`path` defaults to the current directory when omitted; git URLs are accepted. + +If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. + +## Workflow + +1. Start with `semble search` to find relevant chunks. +2. Inspect full files only when the returned chunk is not enough context. +3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +``` + +## CLI + +Semble also ships as a standalone CLI for use outside of MCP. This is useful in scripts, sub-agents, or anywhere you want search results without an MCP session. + +```bash +# Search a local repo +semble search "authentication flow" ./my-project + +# Search for a symbol or identifier +semble search "save_pretrained" ./my-project + +# Search a remote repo (cloned on demand) +semble search "save model to disk" https://github.com/MinishLab/model2vec + +# Find code similar to a known location (file_path and line from a prior search result) +semble find-related src/auth.py 42 ./my-project +``` + +`path` defaults to the current directory when omitted; git URLs are accepted. + +If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. + ## How it works Semble splits each file into code-aware chunks using [Chonkie](https://github.com/chonkie-inc/chonkie), then scores every query against the chunks with two complementary retrievers: static [Model2Vec](https://github.com/MinishLab/model2vec) embeddings using the code-specialized [potion-code-16M](https://huggingface.co/minishlab/potion-code-16M) model for semantic similarity, and [BM25](https://github.com/xhluca/bm25s) for lexical matches on identifiers and API names. The two score lists are fused with Reciprocal Rank Fusion (RRF). diff --git a/pyproject.toml b/pyproject.toml index fd036be..51723c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,7 @@ dev = [ "Source" = "https://github.com/MinishLab/semble" [project.scripts] -semble = "semble.mcp:main" +semble = "semble.cli:main" [tool.setuptools] package-dir = {"" = "src"} @@ -72,7 +72,7 @@ where = ["src"] include = ["semble*"] [tool.setuptools.package-data] -semble = ["py.typed"] +semble = ["py.typed", "agents/*.md"] [tool.setuptools_scm] # can be empty if no extra settings are needed, presence enables setuptools_scm @@ -88,6 +88,7 @@ target-version = "py310" [tool.ruff.lint.per-file-ignores] "tests/**" = ["ANN"] "benchmarks/*.py" = ["T20"] +"src/semble/cli.py" = ["T20", "E501"] [tool.ruff.lint] select = [ diff --git a/src/semble/agents/semble-search.md b/src/semble/agents/semble-search.md new file mode 100644 index 0000000..515d60e --- /dev/null +++ b/src/semble/agents/semble-search.md @@ -0,0 +1,30 @@ +--- +name: semble-search +description: Code search agent for exploring any codebase. Use for finding code by intent, locating implementations, understanding how something works, or discovering related code. Prefer over Grep/Glob/Read for any semantic or exploratory question. +tools: Bash, Read +--- + +Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: + +```bash +semble search "authentication flow" ./my-project +semble search "save_pretrained" ./my-project +semble search "save model to disk" ./my-project --top-k 10 +``` + +Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): + +```bash +semble find-related src/auth.py 42 ./my-project +``` + +`path` defaults to the current directory when omitted; git URLs are accepted. + +If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. + +## Workflow + +1. Start with `semble search` to find relevant chunks. +2. Inspect full files only when the returned chunk is not enough context. +3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/cli.py b/src/semble/cli.py new file mode 100644 index 0000000..2f209f3 --- /dev/null +++ b/src/semble/cli.py @@ -0,0 +1,97 @@ +import argparse +import asyncio +import sys +from importlib.resources import files +from pathlib import Path + +from semble.index import SembleIndex +from semble.utils import _format_results, _is_git_url, _resolve_chunk + +_CLAUDE_FILE_PATH = Path(".claude") / "agents" / "semble-search.md" +_CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "init", "-h", "--help"}) + + +def main() -> None: + """Entry point for the semble command-line tool.""" + if len(sys.argv) > 1 and sys.argv[1] in _CLI_DISPATCH_ARGS: + _cli_main() + else: + _mcp_main() + + +def _mcp_main() -> None: + parser = argparse.ArgumentParser( + prog="semble", + description="Instant local code search for agents.", + ) + parser.add_argument( + "path", + nargs="?", + default=None, + help="Local directory or git URL to pre-index at startup (optional).", + ) + parser.add_argument("--ref", default=None, help="Branch or tag to check out (git URLs only).") + args = parser.parse_args() + from semble.mcp import serve + + asyncio.run(serve(args.path, ref=args.ref)) + + +def _run_init(*, force: bool = False) -> None: + """Write the Claude Code sub-agent file into the current project.""" + dest = _CLAUDE_FILE_PATH + if dest.exists() and not force: + print(f"{dest} already exists. Run with --force to overwrite.", file=sys.stderr) + sys.exit(1) + dest.parent.mkdir(parents=True, exist_ok=True) + content = files("semble").joinpath("agents/semble-search.md").read_text(encoding="utf-8") + dest.write_text(content, encoding="utf-8") + print(f"Created {dest}") + + +def _cli_main() -> None: + parser = argparse.ArgumentParser(prog="semble") + sub = parser.add_subparsers(dest="command") + + search_p = sub.add_parser("search", help="Search a codebase.") + search_p.add_argument("query", help="Natural language or code query.") + search_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") + search_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") + search_p.add_argument( + "-m", "--mode", default="hybrid", choices=["hybrid", "semantic", "bm25"], help="Search mode (default: hybrid)." + ) + + related_p = sub.add_parser("find-related", help="Find code similar to a specific location.") + related_p.add_argument("file_path", help="File path as shown in search results.") + related_p.add_argument("line", type=int, help="Line number (1-indexed).") + related_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") + related_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") + + init_p = sub.add_parser("init", help="Write .claude/agents/semble-search.md for Claude Code sub-agent support.") + init_p.add_argument("--force", action="store_true", help="Overwrite if the file already exists.") + + args = parser.parse_args() + + if args.command == "init": + _run_init(force=args.force) + return + + index = SembleIndex.from_git(args.path) if _is_git_url(args.path) else SembleIndex.from_path(args.path) + + if args.command == "search": + results = index.search(args.query, top_k=args.top_k, mode=args.mode) + if not results: + print("No results found.") + else: + print(_format_results(f"Search results for: {args.query!r} (mode={args.mode})", results)) + + elif args.command == "find-related": + chunk = _resolve_chunk(index.chunks, args.file_path, args.line) + if chunk is None: + print(f"No chunk found at {args.file_path}:{args.line}.", file=sys.stderr) + sys.exit(1) + results = index.find_related(chunk, top_k=args.top_k) + if not results: + print(f"No related chunks found for {args.file_path}:{args.line}.") + else: + print(_format_results(f"Chunks related to {args.file_path}:{args.line}", results)) diff --git a/src/semble/mcp.py b/src/semble/mcp.py index cc7ec48..1b5dc10 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -import re from pathlib import Path from typing import Annotated, Literal @@ -10,7 +9,8 @@ from semble.index import SembleIndex from semble.index.dense import load_model -from semble.types import Chunk, Encoder, SearchResult +from semble.types import Encoder +from semble.utils import _format_results, _is_git_url, _resolve_chunk _REPO_DESCRIPTION = ( "Git URL (e.g. https://github.com/org/repo) or local path to index and search. " @@ -18,10 +18,6 @@ "The index is cached after the first call, so repeat queries are fast." ) -_GIT_URL_SCHEMES = ("https://", "http://", "ssh://", "git://", "git+ssh://", "file://") -# scp-like syntax: [user@]host:path, where host has no '/' before the ':'. -_SCP_GIT_URL_RE = re.compile(r"^[\w.-]+@[\w.-]+:(?!/)") - def create_server(cache: _IndexCache, default_source: str | None = None) -> FastMCP: """Build and return a configured FastMCP server backed by the given cache.""" @@ -142,8 +138,6 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex: try: return await asyncio.shield(task) except asyncio.CancelledError: # pragma: no cover - # If this waiter was cancelled but the task is still running, preserve it for - # other waiters. Only evict if the task itself was cancelled. if task.done(): self._tasks.pop(cache_key, None) raise @@ -151,61 +145,3 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex: # Build failed: evict so the next caller can retry. self._tasks.pop(cache_key, None) raise - - -def _resolve_chunk(chunks: list[Chunk], file_path: str, line: int) -> Chunk | None: - """Return the chunk that contains *line* in *file_path*, or None. - - MCP tool arguments are JSON primitives (strings and ints), so the agent - passes file_path + line rather than a Chunk object. This function - reconstructs the Chunk at the MCP boundary before calling into the library. - - :param chunks: All indexed chunks to search. - :param file_path: File path as stored in the index. - :param line: 1-indexed line number to resolve. - :return: The best-matching Chunk, or None if not found. - """ - fallback = None - for chunk in chunks: - if chunk.file_path == file_path and chunk.start_line <= line <= chunk.end_line: - if line < chunk.end_line: - return chunk - if fallback is None: # line == end_line: boundary; keep as fallback for end-of-file chunks - fallback = chunk - return fallback - - -def _is_git_url(path: str) -> bool: - """Return True if path looks like a remote git URL rather than a local path.""" - return path.startswith(_GIT_URL_SCHEMES) or _SCP_GIT_URL_RE.match(path) is not None - - -def _format_results(header: str, results: list[SearchResult]) -> str: - """Render SearchResult objects as numbered, fenced code blocks.""" - lines: list[str] = [header, ""] - for i, r in enumerate(results, 1): - lines.append(f"## {i}. {r.chunk.location} [score={r.score:.3f}]") - lines.append("```") - lines.append(r.chunk.content.strip()) - lines.append("```") - lines.append("") - return "\n".join(lines) - - -def main() -> None: - """Entry point for the semble command-line tool.""" - import argparse - - parser = argparse.ArgumentParser( - prog="semble", - description="Instant local code search for agents.", - ) - parser.add_argument( - "path", - nargs="?", - default=None, - help="Local directory or git URL to pre-index at startup (optional).", - ) - parser.add_argument("--ref", default=None, help="Branch or tag to check out (git URLs only).") - args = parser.parse_args() - asyncio.run(serve(args.path, ref=args.ref)) diff --git a/src/semble/utils.py b/src/semble/utils.py new file mode 100644 index 0000000..89dc3f9 --- /dev/null +++ b/src/semble/utils.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +import re + +from semble.types import Chunk, SearchResult + +_GIT_URL_SCHEMES = ("https://", "http://", "ssh://", "git://", "git+ssh://", "file://") +_SCP_GIT_URL_RE = re.compile(r"^[\w.-]+@[\w.-]+:(?!/)") + + +def _is_git_url(path: str) -> bool: + """Return True if path looks like a remote git URL rather than a local path.""" + return path.startswith(_GIT_URL_SCHEMES) or _SCP_GIT_URL_RE.match(path) is not None + + +def _resolve_chunk(chunks: list[Chunk], file_path: str, line: int) -> Chunk | None: + """Return the chunk containing *line* in *file_path*, or None. + + Reconstructs a Chunk from its JSON-primitive MCP tool arguments (file_path + line) + before calling into the library. + """ + fallback = None + for chunk in chunks: + if chunk.file_path == file_path and chunk.start_line <= line <= chunk.end_line: + if line < chunk.end_line: + return chunk + if fallback is None: # line == end_line: boundary; keep as fallback for end-of-file chunks + fallback = chunk + return fallback + + +def _format_results(header: str, results: list[SearchResult]) -> str: + """Render SearchResult objects as numbered, fenced code blocks.""" + lines: list[str] = [header, ""] + for i, r in enumerate(results, 1): + lines.append(f"## {i}. {r.chunk.location} [score={r.score:.3f}]") + lines.append("```") + lines.append(r.chunk.content.strip()) + lines.append("```") + lines.append("") + return "\n".join(lines) diff --git a/src/semble/version.py b/src/semble/version.py index b1c662b..be4dce4 100644 --- a/src/semble/version.py +++ b/src/semble/version.py @@ -1,2 +1,2 @@ -__version_triple__ = (0, 1, 0) +__version_triple__ = (0, 1, 1) __version__ = ".".join(map(str, __version_triple__)) diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..f9df115 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,192 @@ +import sys +from importlib.resources import files +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from semble.cli import _CLAUDE_FILE_PATH, _cli_main, _run_init, main +from semble.types import SearchMode, SearchResult +from tests.conftest import make_chunk + +_CLAUDE_AGENT_FILE = files("semble").joinpath("agents/semble-search.md").read_text(encoding="utf-8") + + +@pytest.mark.parametrize( + "argv", + [ + ["semble", "/some/path", "--ref", "main"], + ["semble"], + ], +) +def test_main_calls_asyncio_run(argv: list[str], monkeypatch: pytest.MonkeyPatch) -> None: + """main() delegates to asyncio.run(serve(...)) when no CLI subcommand is given.""" + monkeypatch.setattr(sys, "argv", argv) + with patch("asyncio.run") as mock_run: + mock_run.side_effect = lambda coro: coro.close() + main() + mock_run.assert_called_once() + + +@pytest.mark.parametrize( + "argv, expected_in_output", + [ + (["semble", "search", "query text", "/some/path"], ["query text", "0.9"]), + (["semble", "search", "nothing", "/some/path", "--top-k", "3", "--mode", "bm25"], ["No results found"]), + ], +) +def test_cli_search( + argv: list[str], + expected_in_output: list[str], + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """_cli_main search subcommand calls index.search and prints results.""" + chunk = make_chunk("def foo(): pass", "src/foo.py") + fake_index = MagicMock() + has_results = "No results" not in expected_in_output[0] + fake_index.search.return_value = ( + [SearchResult(chunk=chunk, score=0.9, source=SearchMode.HYBRID)] if has_results else [] + ) + monkeypatch.setattr(sys, "argv", argv) + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + _cli_main() + out = capsys.readouterr().out + for fragment in expected_in_output: + assert fragment in out + + +@pytest.mark.parametrize( + ("scenario", "expected_stdout", "expected_stderr", "expected_exit_code"), + [ + ("with_results", ["src/bar.py", "0.800"], None, None), + ("no_results", ["No related chunks found"], None, None), + ("unknown_chunk", [], "No chunk found", 1), + ], +) +def test_cli_find_related( + scenario: str, + expected_stdout: list[str], + expected_stderr: str | None, + expected_exit_code: int | None, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """_cli_main find-related prints results, empty states, and missing-chunk errors.""" + chunk = make_chunk("class Bar: pass", "src/bar.py") + fake_index = MagicMock() + fake_index.chunks = [] if scenario == "unknown_chunk" else [chunk] + fake_index.find_related.return_value = ( + [SearchResult(chunk=chunk, score=0.8, source=SearchMode.SEMANTIC)] if scenario == "with_results" else [] + ) + file_path = "unknown.py" if scenario == "unknown_chunk" else "src/bar.py" + monkeypatch.setattr(sys, "argv", ["semble", "find-related", file_path, "1", "/some/path"]) + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + if expected_exit_code is None: + _cli_main() + else: + with pytest.raises(SystemExit) as exc_info: + _cli_main() + assert exc_info.value.code == expected_exit_code + captured = capsys.readouterr() + for fragment in expected_stdout: + assert fragment in captured.out + if expected_stderr: + assert expected_stderr in captured.err + + +def test_init_creates_file(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: + """_run_init writes the agent file and prints its path.""" + monkeypatch.chdir(tmp_path) + _run_init() + dest = tmp_path / _CLAUDE_FILE_PATH + assert dest.exists() + assert dest.read_text(encoding="utf-8") == _CLAUDE_AGENT_FILE + assert str(_CLAUDE_FILE_PATH) in capsys.readouterr().out + + +def test_init_refuses_overwrite_without_force( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + """_run_init exits with code 1 when the file exists and force=False.""" + monkeypatch.chdir(tmp_path) + _run_init() + with pytest.raises(SystemExit) as exc_info: + _run_init() + assert exc_info.value.code == 1 + assert "already exists" in capsys.readouterr().err + + +def test_init_overwrites_with_force(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + """_run_init overwrites an existing file when force=True.""" + monkeypatch.chdir(tmp_path) + dest = tmp_path / _CLAUDE_FILE_PATH + dest.parent.mkdir(parents=True, exist_ok=True) + dest.write_text("old content", encoding="utf-8") + _run_init(force=True) + assert dest.read_text(encoding="utf-8") == _CLAUDE_AGENT_FILE + + +def test_init_via_cli(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: + """Semble init creates the Claude agent file via _cli_main.""" + monkeypatch.chdir(tmp_path) + monkeypatch.setattr(sys, "argv", ["semble", "init"]) + _cli_main() + assert (tmp_path / _CLAUDE_FILE_PATH).exists() + assert str(_CLAUDE_FILE_PATH) in capsys.readouterr().out + + +def test_main_dispatches_to_cli( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """main() routes to _cli_main when first argument is a CLI subcommand.""" + chunk = make_chunk("def foo(): pass", "src/foo.py") + fake_index = MagicMock() + fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.9, source=SearchMode.HYBRID)] + monkeypatch.setattr(sys, "argv", ["semble", "search", "query text", "/some/path"]) + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + main() + assert "query text" in capsys.readouterr().out + + +@pytest.mark.parametrize( + ("argv", "expected_stdout", "expect_system_exit"), + [ + (["semble", "--help"], "find-related", True), + (["semble", "search", "query", "/some/path"], "query", False), + ], +) +def test_cli_entrypoint_works_without_mcp_installed( + argv: list[str], + expected_stdout: str, + expect_system_exit: bool, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """CLI entrypoint paths succeed even when the mcp package is not installed.""" + chunk = make_chunk("def foo(): pass", "src/foo.py") + fake_index = MagicMock() + fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.9, source=SearchMode.HYBRID)] + monkeypatch.setattr(sys, "argv", argv) + monkeypatch.setitem(sys.modules, "mcp", None) + monkeypatch.setitem(sys.modules, "mcp.server", None) + monkeypatch.setitem(sys.modules, "mcp.server.fastmcp", None) + monkeypatch.setitem(sys.modules, "semble.mcp", None) + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + if expect_system_exit: + with pytest.raises(SystemExit) as exc_info: + main() + assert exc_info.value.code == 0 + else: + main() + assert expected_stdout in capsys.readouterr().out + + +def test_agent_file_tools_are_bash_only() -> None: + """The agent file must list only Bash and Read — no MCP tools that require schema loading.""" + frontmatter = _CLAUDE_AGENT_FILE.split("---")[1] + tools_line = next(line for line in frontmatter.splitlines() if line.startswith("tools:")) + tools = [t.strip() for t in tools_line.removeprefix("tools:").split(",")] + assert set(tools) == {"Bash", "Read"}, f"Unexpected tools in agent file: {tools}" + assert not any("mcp__" in t for t in tools) diff --git a/tests/test_mcp.py b/tests/test_mcp.py index a5651d6..43100ba 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -1,12 +1,12 @@ -import sys from pathlib import Path from typing import Any from unittest.mock import AsyncMock, MagicMock, patch import pytest -from semble.mcp import _format_results, _IndexCache, _is_git_url, _resolve_chunk, create_server, main, serve +from semble.mcp import _IndexCache, create_server, serve from semble.types import Chunk, Encoder, SearchMode, SearchResult +from semble.utils import _format_results, _is_git_url, _resolve_chunk from tests.conftest import make_chunk @@ -253,19 +253,3 @@ async def test_serve_runs_stdio(tmp_path: Path, with_path: bool) -> None: await (serve(str(tmp_path)) if with_path else serve()) mock_run.assert_called_once() - - -@pytest.mark.parametrize( - "argv", - [ - ["semble", "/some/path", "--ref", "main"], - ["semble"], - ], -) -def test_main_calls_asyncio_run(argv: list[str], monkeypatch: pytest.MonkeyPatch) -> None: - """main() parses argv and delegates to asyncio.run(serve(...)).""" - monkeypatch.setattr(sys, "argv", argv) - with patch("semble.mcp.asyncio.run") as mock_run: - mock_run.side_effect = lambda coro: coro.close() - main() - mock_run.assert_called_once()