From 3081f3db206a5aaceb8f01f5d150a324f89166af Mon Sep 17 00:00:00 2001 From: Arnav Goel Date: Wed, 10 Jun 2026 04:05:23 -0400 Subject: [PATCH] feat: multi-row table headers, .xls support, DECO benchmark; rename to excel_parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Renames the package ks_xlsx_parser → excel_parser (and rust ks_xlsx_core → excel_core) across source, docs, scripts, and site, plus parser improvements: - Header detection: extend find_header_span to multi-row header bands, gated on styling continuity so single-row headers stay one row. Measured on DECO (852 files, 1,480 GT tables): multi-row header F1 0.37→0.50 (exact 0%→24%, recall 0.23→0.33), single-row exact 84%→79%, table IoU unchanged. New unit tests in tests/test_header_detector.py. - .xls support: convert_xls_to_xlsx backend so legacy workbooks parse. - DECO structural benchmark (scripts/eval_deco.py): scores table-boundary IoU + header-row precision/recall/F1 vs Docling — the structural ground truth SpreadsheetBench lacks. Wired into download_corpora.sh + benchmarks README. Full test suite: 1137 passed. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/ci.yml | 10 + .gitignore | 6 +- CHANGELOG.md | 38 +- CONTRIBUTING.md | 24 +- Dockerfile.bench | 14 +- LICENSE | 2 +- Makefile | 8 +- README.md | 146 +- SECURITY.md | 4 +- docs/MAINTAINERS.md | 12 +- docs/PARSER_KNOWN_ISSUES.md | 2 +- docs/RELEASE_PROCESS.md | 30 +- docs/benchmark-local-setup.md | 6 +- docs/corpora.md | 2 +- docs/launch/ANNOUNCEMENTS.md | 44 +- docs/launch/MEDIUM_ARTICLE.md | 24 +- docs/launch/RELEASE_NOTES_v0.1.1.md | 30 +- docs/launch/RELEASE_NOTES_v0.2.0.md | 16 +- docs/launch/RELEASE_NOTES_v0.2.1.md | 38 +- docs/launch/SEO.md | 16 +- docs/recall-investigation.md | 10 +- docs/wiki/API-Reference.md | 16 +- docs/wiki/Architecture.md | 4 +- docs/wiki/Benchmark-vs-hucre.md | 30 +- docs/wiki/Data-Models.md | 4 +- docs/wiki/Home.md | 20 +- docs/wiki/Pipeline-Internals.md | 24 +- docs/wiki/Quick-Start.md | 14 +- docs/wiki/Web-API.md | 18 +- examples/demo.py | 8 +- examples/generate_examples.py | 2 +- examples/stress_test/stress_test_runner.py | 4 +- pyproject.toml | 18 +- rust/{ks_xlsx_core => excel_core}/Cargo.lock | 2 +- rust/{ks_xlsx_core => excel_core}/Cargo.toml | 6 +- rust/{ks_xlsx_core => excel_core}/README.md | 12 +- rust/{ks_xlsx_core => excel_core}/REMOVAL.md | 12 +- .../pyproject.toml | 6 +- .../src/formula.rs | 0 rust/{ks_xlsx_core => excel_core}/src/lib.rs | 4 +- scripts/append_bench_history.py | 2 +- scripts/download_corpora.sh | 31 + scripts/enrich_failures.py | 4 +- scripts/eval_deco.py | 560 ++++++++ scripts/eval_retrieval.py | 44 +- scripts/publish_wiki.sh | 2 +- scripts/run_bench.sh | 2 +- scripts/run_enterprise_metrics.py | 2 +- scripts/verify_wheel.py | 22 +- site/index.html | 152 +- site/robots.txt | 2 +- site/sitemap.xml | 28 +- .../__init__.py | 8 +- .../analysis/__init__.py | 0 src/excel_parser/analysis/header_detector.py | 187 +++ .../analysis/light_block_detector.py | 6 +- .../analysis/llm_artifacts.py | 14 +- .../analysis/pattern_splitter.py | 8 +- 
src/excel_parser/analysis/section_detector.py | 136 ++ .../analysis/table_assembler.py | 8 +- .../analysis/table_grouper.py | 8 +- .../analysis/template_extractor.py | 8 +- .../analysis/tree_builder.py | 8 +- .../annotation/__init__.py | 0 .../annotation/block_splitter.py | 6 +- .../annotation/cell_annotator.py | 6 +- src/{ks_xlsx_parser => excel_parser}/api.py | 26 +- .../charts/__init__.py | 0 .../charts/chart_extractor.py | 4 +- .../chunking/__init__.py | 0 src/excel_parser/chunking/chunker.py | 503 +++++++ .../chunking/segmenter.py | 190 ++- .../comparison/__init__.py | 0 .../comparison/template_comparator.py | 4 +- .../export/__init__.py | 0 .../export/model_exporter.py | 8 +- .../formula/__init__.py | 0 .../formula/dependency_builder.py | 8 +- .../formula/formula_parser.py | 8 +- .../models/__init__.py | 4 +- .../models/block.py | 0 .../models/cell.py | 0 .../models/chart.py | 0 .../models/common.py | 2 +- .../models/dependency.py | 0 .../models/shape.py | 0 .../models/sheet.py | 5 + .../models/table.py | 0 .../models/table_structure.py | 0 .../models/template.py | 0 .../models/tree.py | 0 .../models/workbook.py | 0 .../parsers/__init__.py | 0 .../parsers/calamine_core.py | 14 +- .../parsers/cell_parser.py | 4 +- .../parsers/sheet_parser.py | 19 +- .../parsers/table_parser.py | 4 +- .../parsers/workbook_parser.py | 71 +- src/excel_parser/parsers/xls_converter.py | 521 +++++++ .../pipeline.py | 54 +- src/{ks_xlsx_parser => excel_parser}/py.typed | 0 .../rendering/__init__.py | 0 .../rendering/html_renderer.py | 143 +- .../rendering/text_renderer.py | 133 +- .../storage/__init__.py | 0 .../storage/serializer.py | 4 +- .../utils/__init__.py | 0 .../utils/logging_config.py | 6 +- .../verification/__init__.py | 2 +- .../verification/stage_verifier.py | 34 +- src/ks_xlsx_parser/chunking/chunker.py | 275 ---- tests/benchmarks/README.md | 29 +- tests/benchmarks/__init__.py | 2 +- tests/benchmarks/_runner.py | 4 +- tests/benchmarks/adapters/ks_adapter.py | 12 +- 
tests/benchmarks/hucre_node/hucre_adapter.mjs | 2 +- tests/benchmarks/hucre_node/package.json | 4 +- tests/benchmarks/reports/COMPARISON.md | 76 + tests/benchmarks/vs_hucre.py | 4 +- tests/conftest.py | 416 +++++- tests/helpers/invariant_checker.py | 2 +- tests/test_array_formula_rendering.py | 4 +- tests/test_charts.py | 6 +- tests/test_corpus_robustness.py | 6 +- tests/test_eval_retrieval_classify.py | 43 + tests/test_formula_handling.py | 12 +- tests/test_formula_parser.py | 4 +- tests/test_formula_uncached_rendering.py | 4 +- tests/test_header_detector.py | 127 ++ tests/test_llm_artifacts.py | 6 +- tests/test_models.py | 2 +- tests/test_multi_table_layout.py | 254 +++- tests/test_parsers.py | 2 +- tests/test_pipeline.py | 2 +- tests/test_rendering.py | 28 +- tests/test_sections.py | 163 +++ tests/test_segmentation.py | 6 +- tests/test_stage_verification.py | 4 +- tests/test_structural_invariants.py | 6 +- tests/test_xls_support.py | 194 +++ uv.lock | 1235 ++++++++++++++++- 141 files changed, 5570 insertions(+), 1075 deletions(-) rename rust/{ks_xlsx_core => excel_core}/Cargo.lock (99%) rename rust/{ks_xlsx_core => excel_core}/Cargo.toml (54%) rename rust/{ks_xlsx_core => excel_core}/README.md (71%) rename rust/{ks_xlsx_core => excel_core}/REMOVAL.md (80%) rename rust/{ks_xlsx_core => excel_core}/pyproject.toml (68%) rename rust/{ks_xlsx_core => excel_core}/src/formula.rs (100%) rename rust/{ks_xlsx_core => excel_core}/src/lib.rs (97%) create mode 100644 scripts/eval_deco.py rename src/{ks_xlsx_parser => excel_parser}/__init__.py (71%) rename src/{ks_xlsx_parser => excel_parser}/analysis/__init__.py (100%) create mode 100644 src/excel_parser/analysis/header_detector.py rename src/{ks_xlsx_parser => excel_parser}/analysis/light_block_detector.py (96%) rename src/{ks_xlsx_parser => excel_parser}/analysis/llm_artifacts.py (98%) rename src/{ks_xlsx_parser => excel_parser}/analysis/pattern_splitter.py (96%) create mode 100644 
src/excel_parser/analysis/section_detector.py rename src/{ks_xlsx_parser => excel_parser}/analysis/table_assembler.py (96%) rename src/{ks_xlsx_parser => excel_parser}/analysis/table_grouper.py (97%) rename src/{ks_xlsx_parser => excel_parser}/analysis/template_extractor.py (95%) rename src/{ks_xlsx_parser => excel_parser}/analysis/tree_builder.py (96%) rename src/{ks_xlsx_parser => excel_parser}/annotation/__init__.py (100%) rename src/{ks_xlsx_parser => excel_parser}/annotation/block_splitter.py (97%) rename src/{ks_xlsx_parser => excel_parser}/annotation/cell_annotator.py (98%) rename src/{ks_xlsx_parser => excel_parser}/api.py (93%) rename src/{ks_xlsx_parser => excel_parser}/charts/__init__.py (100%) rename src/{ks_xlsx_parser => excel_parser}/charts/chart_extractor.py (99%) rename src/{ks_xlsx_parser => excel_parser}/chunking/__init__.py (100%) create mode 100644 src/excel_parser/chunking/chunker.py rename src/{ks_xlsx_parser => excel_parser}/chunking/segmenter.py (68%) rename src/{ks_xlsx_parser => excel_parser}/comparison/__init__.py (100%) rename src/{ks_xlsx_parser => excel_parser}/comparison/template_comparator.py (99%) rename src/{ks_xlsx_parser => excel_parser}/export/__init__.py (100%) rename src/{ks_xlsx_parser => excel_parser}/export/model_exporter.py (97%) rename src/{ks_xlsx_parser => excel_parser}/formula/__init__.py (100%) rename src/{ks_xlsx_parser => excel_parser}/formula/dependency_builder.py (93%) rename src/{ks_xlsx_parser => excel_parser}/formula/formula_parser.py (96%) rename src/{ks_xlsx_parser => excel_parser}/models/__init__.py (94%) rename src/{ks_xlsx_parser => excel_parser}/models/block.py (100%) rename src/{ks_xlsx_parser => excel_parser}/models/cell.py (100%) rename src/{ks_xlsx_parser => excel_parser}/models/chart.py (100%) rename src/{ks_xlsx_parser => excel_parser}/models/common.py (99%) rename src/{ks_xlsx_parser => excel_parser}/models/dependency.py (100%) rename src/{ks_xlsx_parser => excel_parser}/models/shape.py (100%) 
rename src/{ks_xlsx_parser => excel_parser}/models/sheet.py (95%) rename src/{ks_xlsx_parser => excel_parser}/models/table.py (100%) rename src/{ks_xlsx_parser => excel_parser}/models/table_structure.py (100%) rename src/{ks_xlsx_parser => excel_parser}/models/template.py (100%) rename src/{ks_xlsx_parser => excel_parser}/models/tree.py (100%) rename src/{ks_xlsx_parser => excel_parser}/models/workbook.py (100%) rename src/{ks_xlsx_parser => excel_parser}/parsers/__init__.py (100%) rename src/{ks_xlsx_parser => excel_parser}/parsers/calamine_core.py (86%) rename src/{ks_xlsx_parser => excel_parser}/parsers/cell_parser.py (99%) rename src/{ks_xlsx_parser => excel_parser}/parsers/sheet_parser.py (96%) rename src/{ks_xlsx_parser => excel_parser}/parsers/table_parser.py (97%) rename src/{ks_xlsx_parser => excel_parser}/parsers/workbook_parser.py (86%) create mode 100644 src/excel_parser/parsers/xls_converter.py rename src/{ks_xlsx_parser => excel_parser}/pipeline.py (86%) rename src/{ks_xlsx_parser => excel_parser}/py.typed (100%) rename src/{ks_xlsx_parser => excel_parser}/rendering/__init__.py (100%) rename src/{ks_xlsx_parser => excel_parser}/rendering/html_renderer.py (56%) rename src/{ks_xlsx_parser => excel_parser}/rendering/text_renderer.py (62%) rename src/{ks_xlsx_parser => excel_parser}/storage/__init__.py (100%) rename src/{ks_xlsx_parser => excel_parser}/storage/serializer.py (98%) rename src/{ks_xlsx_parser => excel_parser}/utils/__init__.py (100%) rename src/{ks_xlsx_parser => excel_parser}/utils/logging_config.py (91%) rename src/{ks_xlsx_parser => excel_parser}/verification/__init__.py (85%) rename src/{ks_xlsx_parser => excel_parser}/verification/stage_verifier.py (96%) delete mode 100644 src/ks_xlsx_parser/chunking/chunker.py create mode 100644 tests/benchmarks/reports/COMPARISON.md create mode 100644 tests/test_header_detector.py create mode 100644 tests/test_sections.py create mode 100644 tests/test_xls_support.py diff --git 
a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ba7b9e2..23b05ce 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,6 +35,16 @@ jobs: with: python-version: ${{ matrix.python-version }} + # Headless LibreOffice powers the full-fidelity legacy .xls → .xlsx path + # (formula text, charts). On Linux it's a cheap apt install, so the + # full-fidelity tests run here instead of being skipped. macOS runners + # skip it (the cask install is heavyweight); those tests self-skip. + - name: Install LibreOffice (Linux) + if: runner.os == 'Linux' + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends libreoffice-calc-nogui + - name: Install run: uv pip install --system -e ".[dev,api]" diff --git a/.gitignore b/.gitignore index d06734c..5a1baac 100644 --- a/.gitignore +++ b/.gitignore @@ -57,8 +57,10 @@ examples/stress_test/stress_results.json examples/stress_test/built_reference.json examples/stress_test/STRESS_TEST_RESULTS.md -# Local benchmark harness (private, not pushed) -tests/benchmarks/reports/ +# Local benchmark harness (private, not pushed) — run outputs stay private, +# except the curated comparison the README links to. +tests/benchmarks/reports/* +!tests/benchmarks/reports/COMPARISON.md tests/benchmarks/hucre_node/node_modules/ tests/benchmarks/hucre_node/.pnpm-store/ diff --git a/CHANGELOG.md b/CHANGELOG.md index ac45319..fb8eb36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -All notable changes to **ks-xlsx-parser** are documented here. +All notable changes to **excel-parser** are documented here. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
@@ -47,18 +47,18 @@ Template for a new release (copy this block, fill in, move Unreleased items in): ## [0.2.1] — 2026-05-19 -### ⚠️ BREAKING (Fixed — see also #ks-xlsx-parser channel report) +### ⚠️ BREAKING (Fixed — see also #excel-parser channel report) - Repository layout flattened on `src/` was leaking 13 generic top-level packages (`models`, `utils`, `parsers`, …) into installed wheels and silently dropping `pipeline.py` and `api.py` (setuptools `packages.find` only finds *packages*, not top-level modules). Users hitting - `from ks_xlsx_parser.pipeline import ...` on 0.2.0 from PyPI got + `from excel_parser.pipeline import ...` on 0.2.0 from PyPI got `ModuleNotFoundError`. **All modules now live under - `src/ks_xlsx_parser/`**; the wheel's `top_level.txt` contains only - `ks_xlsx_parser`. Imports inside the package switched from - `from pipeline import` to `from ks_xlsx_parser.pipeline import`. + `src/excel_parser/`**; the wheel's `top_level.txt` contains only + `excel_parser`. Imports inside the package switched from + `from pipeline import` to `from excel_parser.pipeline import`. Downstream code that imported the leaked generics - (`from models import …`) MUST migrate to `from ks_xlsx_parser.models …`. + (`from models import …`) MUST migrate to `from excel_parser.models …`. ### Added - `scripts/verify_wheel.py` — builds the wheel, installs it in a fresh @@ -89,9 +89,9 @@ Template for a new release (copy this block, fill in, move Unreleased items in): ### Changed - Dropped `PYTHONPATH=src` from Makefile benchmark targets — the package is now properly installable so callers don't need it. -- `pyproject.toml`: `packages.find` constrained to `ks_xlsx_parser*`, - `py.typed` declared as package data, `xlsx-parser-api` console script - updated to `ks_xlsx_parser.api:main`. +- `pyproject.toml`: `packages.find` constrained to `excel_parser*`, + `py.typed` declared as package data, `excel-parser-api` console script + updated to `excel_parser.api:main`. 
### ⚠️ BREAKING - Retired the in-tree `testBench/` corpus. The 1054-workbook stress dataset @@ -121,7 +121,7 @@ Template for a new release (copy this block, fill in, move Unreleased items in): **Benchmark + retrievability release.** Adds a head-to-head benchmark against [Docling](https://github.com/DS4SD/docling) on the [SpreadsheetBench](https://github.com/RUCKBReasoning/SpreadsheetBench) corpus (912 instances, 5,458 xlsx files) and fixes three rendering bugs that -were silently torpedoing RAG retrieval. ks-xlsx-parser parses **99.945%** of +were silently torpedoing RAG retrieval. excel-parser parses **99.945%** of SpreadsheetBench and **ties Docling at recall@1 / wins at recall@3 (+2.7 pp) and recall@5 (+1.8 pp)**, plus 36.9% citation-grade geometric recall (Docling 0%, structurally — no A1 anchors). @@ -190,14 +190,14 @@ and recall@5 (+1.8 pp)**, plus 36.9% citation-grade geometric recall (Docling text-match and geometric recall metrics. ### Performance -- ks-xlsx-parser is now ~5% faster on average parse time on SpreadsheetBench +- excel-parser is now ~5% faster on average parse time on SpreadsheetBench than Docling (251 ms vs 265 ms mean), while producing a richer output (formulas, dependency graph, charts, named ranges, etc.). ### Docs - `tests/benchmarks/README.md` — new — methodology + adapter design. - `tests/benchmarks/reports/COMPARISON.md` — new — head-to-head report. -- README — new "Benchmark — ks-xlsx-parser vs Docling on SpreadsheetBench" +- README — new "Benchmark — excel-parser vs Docling on SpreadsheetBench" section near the top with the headline table. ### Internal @@ -215,8 +215,8 @@ and recall@5 (+1.8 pp)**, plus 36.9% citation-grade geometric recall (Docling announcement: [`docs/launch/RELEASE_NOTES_v0.1.1.md`](docs/launch/RELEASE_NOTES_v0.1.1.md). ### Added -- Public Python package **`ks-xlsx-parser`** on PyPI; import as - `xlsx_parser` or the alias `ks_xlsx_parser`. 
+- Public Python package **`excel-parser`** on PyPI; import as + `excel_parser`. - `parse_workbook()` returning a `ParseResult` with `.workbook`, `.chunks`, and `.serializer` — full workbook graph (cells, formulas, merges, tables, charts, CF, DV, named ranges, dependency edges). @@ -233,7 +233,7 @@ announcement: [`docs/launch/RELEASE_NOTES_v0.1.1.md`](docs/launch/RELEASE_NOTES_ combo: 400, adversarial: 300). - `tests/test_testbench_roundtrip.py` — parallel round-trip gate; 1054/1054 passing in ~70 s. -- FastAPI web server (`xlsx-parser-api`) in the `[api]` extra. +- FastAPI web server (`excel-parser-api`) in the `[api]` extra. - GitHub Actions: `ci.yml` (test matrix on py3.10/3.11/3.12 × ubuntu/macos + dedicated testBench job) and `release.yml` (wheel + sdist + testBench zip, PyPI Trusted Publishing). @@ -278,7 +278,7 @@ announcement: [`docs/launch/RELEASE_NOTES_v0.1.1.md`](docs/launch/RELEASE_NOTES_ - Removed internal-only tooling: Ralph loop scripts, Cursor / Serena agent configs, iteration logs, Knowledge-Stack-internal framing in DESIGN.md. -- Rebranded from `arnav2/XLSXParser` to `knowledgestack/ks-xlsx-parser`; +- Rebranded from `arnav2/XLSXParser` to `knowledgestack/excel-parser`; transferred the repo into the `knowledgestack` org and made it public. - `uv.lock` regenerated after dropping the `[ralph]` extra and adding `pytest-timeout` / `ruff` / `mypy` to `[dev]`. @@ -289,5 +289,5 @@ Private-beta release used inside the Knowledge Stack ecosystem. Not published to PyPI. Superseded by 0.1.1. 
-[Unreleased]: https://github.com/knowledgestack/ks-xlsx-parser/compare/v0.1.1...HEAD -[0.1.1]: https://github.com/knowledgestack/ks-xlsx-parser/releases/tag/v0.1.1 +[Unreleased]: https://github.com/knowledgestack/excel-parser/compare/v0.1.1...HEAD +[0.1.1]: https://github.com/knowledgestack/excel-parser/releases/tag/v0.1.1 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c9bdabf..6da2cdb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,9 +1,9 @@ -# Contributing to ks-xlsx-parser +# Contributing to excel-parser **First: welcome.** 👋 If you got here and aren't sure what to do: - Jump into our [**Discord**](https://discord.gg/4uaGhJcx) — real-time help, roadmap chat, and the fastest way to pair on an idea with a maintainer. -- Or open a [Discussion](https://github.com/knowledgestack/ks-xlsx-parser/discussions) if async is your thing. +- Or open a [Discussion](https://github.com/knowledgestack/excel-parser/discussions) if async is your thing. We'd rather talk than have you leave. Every good-first-issue, every weird `.xlsx` fixture, every three-line doc patch is welcome. @@ -15,21 +15,21 @@ bug or send a small PR. If that's you, thank you. 1. **Run `make bench-robust` on SpreadsheetBench and report a file that breaks.** We actively want edge-case `.xlsx` fixtures — use the - [Parser edge case issue template](https://github.com/knowledgestack/ks-xlsx-parser/issues/new?template=parser_edge_case.yml). + [Parser edge case issue template](https://github.com/knowledgestack/excel-parser/issues/new?template=parser_edge_case.yml). 2. **Submit an adversarial workbook.** Attach a `.xlsx` (or a generator that builds one) to a Parser edge case issue. If the parser crashes on it, even better. 3. **Fix one of the flagged issues** in [`docs/PARSER_KNOWN_ISSUES.md`](docs/PARSER_KNOWN_ISSUES.md). 4. **Improve docs.** The README, the architecture diagram, the examples — if something confused you, it confuses everyone. -5. 
**Open a [Show & Tell](https://github.com/knowledgestack/ks-xlsx-parser/discussions/new?category=show-and-tell)** +5. **Open a [Show & Tell](https://github.com/knowledgestack/excel-parser/discussions/new?category=show-and-tell)** if you shipped something with the parser. Seriously, it helps us prioritise. ## Development setup ```bash -git clone https://github.com/knowledgestack/ks-xlsx-parser.git -cd ks-xlsx-parser +git clone https://github.com/knowledgestack/excel-parser.git +cd excel-parser make install # pip install -e ".[dev,api]" make test # fast, default suite make corpus-download # fetch SpreadsheetBench (5,458 real-world xlsx) @@ -58,14 +58,14 @@ fix with a one-paragraph explanation is almost always mergeable. ## Reporting issues -Use the [issue templates](https://github.com/knowledgestack/ks-xlsx-parser/issues/new/choose). +Use the [issue templates](https://github.com/knowledgestack/excel-parser/issues/new/choose). For security issues, please use the -[private advisory flow](https://github.com/knowledgestack/ks-xlsx-parser/security/advisories/new) +[private advisory flow](https://github.com/knowledgestack/excel-parser/security/advisories/new) — not a public issue. Helpful things to include: -- Output of `python -c "import xlsx_parser; print(xlsx_parser.__version__)"` +- Output of `python -c "import excel_parser; print(excel_parser.__version__)"` - Python version (`python --version`) - OS - Minimal `.xlsx` that reproduces the bug (or a generator that builds one) @@ -83,9 +83,9 @@ Helpful things to include: ## Community - **Discord**: — come hang out, the maintainers and regulars are active here. -- Discussions: -- Issues: -- Security: +- Discussions: +- Issues: +- Security: - Knowledge Stack org: By participating you agree to follow our [Code of Conduct](CODE_OF_CONDUCT.md). diff --git a/Dockerfile.bench b/Dockerfile.bench index cf2755f..24414e9 100644 --- a/Dockerfile.bench +++ b/Dockerfile.bench @@ -1,4 +1,4 @@ -# Benchmark image for ks-xlsx-parser. 
+# Benchmark image for excel-parser. # # Builds once, then on each run downloads SpreadsheetBench (if not cached), # parses the corpus, embeds chunks with a small sentence-transformer, and @@ -6,14 +6,14 @@ # tests/benchmarks/reports/ — mount that path as a volume to persist results. # # Usage: -# docker build -f Dockerfile.bench -t ks-xlsx-parser-bench . +# docker build -f Dockerfile.bench -t excel-parser-bench . # docker run --rm \ # -v "$PWD/tests/benchmarks/reports:/app/tests/benchmarks/reports" \ # -v "$PWD/data:/app/data" \ -# ks-xlsx-parser-bench +# excel-parser-bench # # # Quick sanity run on 20 instances: -# docker run --rm -e BENCH_SAMPLE=20 ks-xlsx-parser-bench +# docker run --rm -e BENCH_SAMPLE=20 excel-parser-bench FROM python:3.12-slim @@ -24,8 +24,12 @@ ENV PYTHONDONTWRITEBYTECODE=1 \ WORKDIR /app +# libreoffice-calc-nogui gives a headless `soffice` for full-fidelity legacy +# .xls → .xlsx conversion (preserves formula text, charts, shapes). Without it +# the parser falls back to the pure-Python xlrd path (values only). --no-install- +# recommends keeps the image lean (skips the X11/Java recommends). RUN apt-get update && apt-get install -y --no-install-recommends \ - curl unzip ca-certificates git \ + curl unzip ca-certificates git libreoffice-calc-nogui \ && rm -rf /var/lib/apt/lists/* # Install deps first to keep layers cacheable across code edits. 
diff --git a/LICENSE b/LICENSE index 05fb1f6..b5e58b1 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2025 XLSX Parser Contributors +Copyright (c) 2025 Excel Parser Contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Makefile b/Makefile index 5331c52..c5e3373 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ PYTHON ?= python PKG_VERSION := $(shell $(PYTHON) -c "import tomllib, pathlib; print(tomllib.loads(pathlib.Path('pyproject.toml').read_text())['project']['version'])") help: - @echo "ks-xlsx-parser — common targets" + @echo "excel-parser — common targets" @echo "" @echo " make install Install package and dev deps (editable)" @echo " make install-dev Alias for install (matches ks-backend)" @@ -44,7 +44,7 @@ format: $(PYTHON) -m ruff format src/ tests/ scripts/ typecheck: - $(PYTHON) -m mypy src/ks_xlsx_parser + $(PYTHON) -m mypy src/excel_parser # Build the wheel and prove it imports outside the editable source tree. # This is the regression guard for the v0.2.0 packaging bug (pipeline.py @@ -84,5 +84,5 @@ bench-track: $(PYTHON) scripts/triage_recall.py tests/benchmarks/reports/retrieval docker-bench: - docker build -f Dockerfile.bench -t ks-xlsx-parser-bench . - docker run --rm -v "$(PWD)/tests/benchmarks/reports:/app/tests/benchmarks/reports" ks-xlsx-parser-bench + docker build -f Dockerfile.bench -t excel-parser-bench . + docker run --rm -v "$(PWD)/tests/benchmarks/reports:/app/tests/benchmarks/reports" excel-parser-bench diff --git a/README.md b/README.md index f39f718..50ceef1 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ -ks-xlsx-parser +excel-parser

- Star on GitHub - Fork on GitHub - GitHub stargazers + Star on GitHub + Fork on GitHub + GitHub stargazers

@@ -13,23 +13,23 @@

📊 Make XLSX LLM Ready 🤖

- ks-xlsx-parser — the open-source Python library that parses Excel (.xlsx) files into citation-ready JSON for LLMs, RAG pipelines, and AI agents (LangChain, LangGraph, CrewAI, OpenAI Agents SDK, Claude, MCP). + excel-parser — the open-source Python library that parses Excel (.xlsx) files into citation-ready JSON for LLMs, RAG pipelines, and AI agents (LangChain, LangGraph, CrewAI, OpenAI Agents SDK, Claude, MCP).

- PyPI + PyPI Python 3.10+ MIT License SpreadsheetBench - CI + CI

Discord Knowledge Stack - Discussions - GitHub stars - Landing site + Discussions + GitHub stars + Landing site

@@ -46,7 +46,7 @@ > dependency graphs, and RAG-ready chunks — deterministic, fully tested, MIT.

- ks-xlsx-parser highlighting a financial model on the left and emitting typed, citation-linked chunks on the right + excel-parser highlighting a financial model on the left and emitting typed, citation-linked chunks on the right
Raw workbook on the left (financial_model.xlsx) → parser output on the right: 4 chunks, each tied back to an exact sheet!range, ready to cite in an LLM response.

@@ -54,7 +54,7 @@ Spreadsheets are still the #1 unstructured data source in the enterprise. Feeding a `.xlsx` directly to an LLM loses structure (rows, formulas, merges), loses provenance (which cell said what), and blows through context windows. -`ks-xlsx-parser` turns an Excel workbook into a token-counted, source-addressable +`excel-parser` turns an Excel workbook into a token-counted, source-addressable graph that drops straight into [LangChain](https://www.langchain.com/), [LangGraph](https://langchain-ai.github.io/langgraph/), [CrewAI](https://www.crewai.com/), the @@ -62,7 +62,7 @@ graph that drops straight into [LangChain](https://www.langchain.com/), [MCP](https://modelcontextprotocol.io/)-aware client (Claude Desktop, Cursor, Windsurf, Zed, …).

- Star the repo + Star the repo   Join our Discord

@@ -77,22 +77,22 @@ graph that drops straight into [LangChain](https://www.langchain.com/), --- -## 🏁 Benchmark — ks-xlsx-parser vs Docling on SpreadsheetBench +## 🏁 Benchmark — excel-parser vs Docling on SpreadsheetBench

SpreadsheetBench Parse success - Recall@3 vs Docling + Recall@3 vs Docling A1 anchors

-Apples-to-apples on [SpreadsheetBench v0.1](https://github.com/RUCKBReasoning/SpreadsheetBench): 912 real-world task instances curated from ExcelHome / Mr.Excel / r/excel. For each instance we parse the input `.xlsx`, embed every chunk with `BAAI/bge-small-en-v1.5`, then check whether the chunk containing the ground-truth answer is in the top-k by similarity to the question. +Apples-to-apples on [SpreadsheetBench v0.1](https://github.com/RUCKBReasoning/SpreadsheetBench): 912 real-world task instances curated from ExcelHome / Mr.Excel / r/excel. Both parsers are scored in the **same run on the same harness** — for each instance we parse the input `.xlsx`, embed every chunk with `BAAI/bge-small-en-v1.5`, then check whether the chunk containing the ground-truth answer is in the top-k by similarity to the question. Text-match recall is reported over the **scoreable** instances (the harness excludes instances whose answer cell is empty/uncached for *both* parsers equally). - + @@ -106,33 +106,33 @@ Apples-to-apples on [SpreadsheetBench v0.1](https://github.com/RUCKBReasoning/Sp - - - + + + - - - + + + - - - + + + - + - - - - + + + + @@ -145,8 +145,10 @@ Apples-to-apples on [SpreadsheetBench v0.1](https://github.com/RUCKBReasoning/Sp ### 💡 What the numbers mean -- **`ks-xlsx-parser` ties at recall@1 and wins recall@3 (+2.7 pp) and recall@5 (+1.8 pp).** Text-match recall is parser-agnostic — it asks whether *any* parser surfaced a chunk containing the answer string, after normalising commas, percent signs, ISO dates, and booleans on both sides. -- **`ks-xlsx-parser` wins citation-grade (geometric) recall outright (0.369 vs 0.000).** Docling produces markdown without per-chunk `sheet!range` anchors, so it can't render a citation that points at the exact source cells. This is the difference between "the answer is somewhere in the workbook" and "the answer is in `Revenue!C7`." 
+- **Text-match recall is a close race — call it honestly.** Docling edges `excel-parser` at recall@1 (0.708 vs 0.693, **+1.5 pp Docling**); `excel-parser` takes recall@3 (**+2.8 pp**) and recall@5 (**+1.9 pp**). Text-match is parser-agnostic — it asks whether *any* parser surfaced a chunk containing the answer string, after normalising commas, percent signs, ISO dates, and booleans on both sides. For retrieval *quality* alone, the two are roughly even. +- **The real separation is citation-grade (geometric) recall: `0.889` vs `0.000`.** `excel-parser` emits a `sheet!A1:Z99` range on every chunk, so a top-k hit can cite the exact source cells; Docling produces markdown with **no cell coordinates**, so it structurally cannot — this is a *capability gap*, not an extraction-quality failure. It's the difference between "the answer is somewhere in the workbook" and "the answer is in `Revenue!C7`." (In-scope geometric recall@5 is **0.960** — see [`COMPARISON.md`](tests/benchmarks/reports/COMPARISON.md).) +- **Parse time is mixed, and `excel-parser` is not uniformly faster.** Median per-file latency is comparable (~11–13 ms, `excel-parser` slightly ahead); on the **mean**, Docling is faster (238 ms vs 349 ms) because `excel-parser`'s large-table row-windowing renders big sheets more than once. Both parse 0 of 912 with errors. +- **Harness honesty.** The geometric and text numbers above use a corrected harness vs. earlier releases: (1) geometric overlap now credits a match when the dataset omits the sheet name (~62% of instances are single-sheet) instead of comparing a real sheet name against `""` — this raised `excel-parser`'s geometric recall from the previously-reported 0.369 (the parser always pointed correctly; the old metric under-counted), and leaves Docling at 0.000; (2) instances whose answer cell is empty/uncached are excluded from the **text** denominator for *both* parsers. Both fixes are parser-independent and covered by unit tests. 
- **`Marker` is excluded by design.** Its xlsx → HTML → PDF → layout-recognition pipeline clocks >30 min per workbook on CPU. The benchmark framework supports adding a Marker adapter when GPU is available — see [`tests/benchmarks/adapters/docling_adapter.py`](tests/benchmarks/adapters/docling_adapter.py) as a template. ### 🔁 Reproduce @@ -196,16 +198,16 @@ are all first-class ways to keep the lights on. **Jump into the community:** - 💬 **[Discord](https://discord.gg/4uaGhJcx)** — real-time help, roadmap conversations, show off what you're building. Drop in, say hi. -- 🗣 [GitHub Discussions](https://github.com/knowledgestack/ks-xlsx-parser/discussions) — async Q&A, RFCs, and long-form ideas. -- 🐞 [Issues](https://github.com/knowledgestack/ks-xlsx-parser/issues/new/choose) — report a bug, request a feature, or file a parser edge case. -- 🎯 [Show & Tell](https://github.com/knowledgestack/ks-xlsx-parser/discussions/new?category=show-and-tell) — tell us about your production use. -- 🔐 [Security](https://github.com/knowledgestack/ks-xlsx-parser/security/advisories/new) — private vulnerability disclosure. +- 🗣 [GitHub Discussions](https://github.com/knowledgestack/excel-parser/discussions) — async Q&A, RFCs, and long-form ideas. +- 🐞 [Issues](https://github.com/knowledgestack/excel-parser/issues/new/choose) — report a bug, request a feature, or file a parser edge case. +- 🎯 [Show & Tell](https://github.com/knowledgestack/excel-parser/discussions/new?category=show-and-tell) — tell us about your production use. +- 🔐 [Security](https://github.com/knowledgestack/excel-parser/security/advisories/new) — private vulnerability disclosure. - 🙌 [Contribute](CONTRIBUTING.md) — every PR is reviewed; `good-first-issue` labels live on Issues. -- 🧰 [Knowledge Stack org](https://github.com/knowledgestack) — see the rest of the ecosystem (ks-cookbook, ks-xlsx-parser, more on the way). 
+- 🧰 [Knowledge Stack org](https://github.com/knowledgestack) — see the rest of the ecosystem (ks-cookbook, excel-parser, more on the way). Not sure where to start? Run `make bench-robust` on SpreadsheetBench, find a file that breaks, open a -[Parser edge case](https://github.com/knowledgestack/ks-xlsx-parser/issues/new?template=parser_edge_case.yml). +[Parser edge case](https://github.com/knowledgestack/excel-parser/issues/new?template=parser_edge_case.yml). That's the fastest path to a merged PR. --- @@ -213,11 +215,11 @@ That's the fastest path to a merged PR. ## 🚀 30-second demo ```bash -pip install ks-xlsx-parser +pip install excel-parser ``` ```python -from ks_xlsx_parser import parse_workbook +from excel_parser import parse_workbook result = parse_workbook(path="q4_forecast.xlsx") @@ -244,8 +246,8 @@ That's it. Every chunk has: ## 🗺️ Table of Contents -- [🏁 Benchmark — vs Docling on SpreadsheetBench](#-benchmark--ks-xlsx-parser-vs-docling-on-spreadsheetbench) -- [🤔 Why a dedicated XLSX parser for LLMs?](#-why-a-dedicated-xlsx-parser-for-llms) +- [🏁 Benchmark — vs Docling on SpreadsheetBench](#-benchmark--excel-parser-vs-docling-on-spreadsheetbench) +- [🤔 Why a dedicated XLSX parser for LLMs?](#-why-a-dedicated-xlsx-parser-for-llms) - [🏗️ Architecture](#️-architecture) - [📦 Installation](#-installation) - [📚 Documentation](#-documentation) @@ -265,7 +267,7 @@ That's it. Every chunk has: Most Excel libraries answer one of two questions well: *"read a rectangle of values"* (pandas, openpyxl) or *"run Excel headless"* (xlwings, LibreOffice). -`ks-xlsx-parser` answers a third one: **"give me a structured, inspectable, +`excel-parser` answers a third one: **"give me a structured, inspectable, loss-minimising graph that an LLM or auditor can reason about."** | Output | Why an LLM cares | @@ -288,7 +290,7 @@ corpus, and everything is open source. 
The pipeline runs **8 deterministic stages**: parse → analyse → annotate → segment → render → serialise → verify → compare/export. Full diagram, stage-by-stage breakdown, and module map in [**docs/wiki/Architecture.md**](docs/wiki/Architecture.md). Stage internals in [**Pipeline Internals**](docs/wiki/Pipeline-Internals.md). > [!NOTE] -> The importable module is `xlsx_parser`; `ks_xlsx_parser` is a re-export +> The importable module is `excel_parser` (underscore, not hyphen), otherwise > matching the PyPI package name. The package is fully type-annotated > (`py.typed` is shipped). @@ -299,16 +301,16 @@ The pipeline runs **8 deterministic stages**: parse → analyse → annotate → Requires Python 3.10+. ```bash -pip install ks-xlsx-parser # core library -pip install "ks-xlsx-parser[api]" # + FastAPI web server -pip install "ks-xlsx-parser[dev]" # + test tooling +pip install excel-parser # core library +pip install "excel-parser[api]" # + FastAPI web server +pip install "excel-parser[dev]" # + test tooling ``` From source: ```bash -git clone https://github.com/knowledgestack/ks-xlsx-parser.git -cd ks-xlsx-parser +git clone https://github.com/knowledgestack/excel-parser.git +cd excel-parser make install # pip install -e ".[dev,api]" make test # default suite make corpus-download # fetch SpreadsheetBench (5,458 real-world xlsx) @@ -323,7 +325,7 @@ Runtime deps: `openpyxl`, `pydantic`, `lxml`, `xxhash`, `tiktoken`. ## 📚 Documentation All implementation detail lives under [`docs/wiki/`](docs/wiki/) (mirrored -to the [GitHub Wiki](https://github.com/knowledgestack/ks-xlsx-parser/wiki) +to the [GitHub Wiki](https://github.com/knowledgestack/excel-parser/wiki) on each release) so this README stays scannable: - 🚀 [**Quick Start**](docs/wiki/Quick-Start.md) — parse, iterate chunks, walk the dep graph, serialise, parse from bytes. Five short snippets, ~90 % of real usage. 
@@ -339,9 +341,9 @@ on each release) so this README stays scannable: ## ⚔️ How it compares -This is the **structural** capability matrix. For head-to-head retrieval numbers (recall@k, geometric, latency) on a 912-instance real-world corpus, see [🏁 Benchmark — ks-xlsx-parser vs Docling on SpreadsheetBench](#-benchmark--ks-xlsx-parser-vs-docling-on-spreadsheetbench) up top. +This is the **structural** capability matrix. For head-to-head retrieval numbers (recall@k, geometric, latency) on a 912-instance real-world corpus, see [🏁 Benchmark — excel-parser vs Docling on SpreadsheetBench](#-benchmark--excel-parser-vs-docling-on-spreadsheetbench) up top. -| | pandas / openpyxl | Docling | `ks-xlsx-parser` | +| | pandas / openpyxl | Docling | `excel-parser` | |---|:---:|:---:|:---:| | Reads values | ✅ | ✅ | ✅ | | Keeps **formulas** | ⚠️ raw string | ❌ | ✅ parsed + dependency graph | @@ -356,7 +358,7 @@ This is the **structural** capability matrix. For head-to-head retrieval numbers | Deterministic **content hashes** | ❌ | ❌ | ✅ xxhash64 per cell / block / chunk | | Streaming `.xlsx` > 100 MB | ⚠️ | ❌ | ✅ (chunked parse) | -Most tools give you a dataframe. `ks-xlsx-parser` gives you a **graph an LLM can cite**. +Most tools give you a dataframe. `excel-parser` gives you a **graph an LLM can cite**. --- @@ -365,7 +367,7 @@ Most tools give you a dataframe. `ks-xlsx-parser` gives you a **graph an LLM can > [**@productdevbook**](https://github.com/productdevbook). For an unbiased > head-to-head on the SpreadsheetBench corpus — perf numbers, > extraction-count parity, where each side wins — see the wiki: -> [**`ks-xlsx-parser` vs `hucre`**](docs/wiki/Benchmark-vs-hucre.md). +> [**`excel-parser` vs `hucre`**](docs/wiki/Benchmark-vs-hucre.md). --- @@ -385,7 +387,7 @@ Teams shipping agents, RAG pipelines, or auditing tools that ingest Excel. > [!IMPORTANT] > **Not a fit** if you need to *execute* Excel (recalculate, run VBA, pivot-refresh). 
-> Use xlwings or a headless Excel for that. `ks-xlsx-parser` reads; it doesn't run. +> Use xlwings or a headless Excel for that. `excel-parser` reads; it doesn't run. --- @@ -409,7 +411,7 @@ nothing is committed to the repo. ## 🚧 Limitations -- **`.xls` not supported** — only `.xlsx` and `.xlsm` (OOXML). Convert legacy files externally. +- **Legacy `.xls` (BIFF)** — supported via in-memory conversion to `.xlsx`. If **LibreOffice** is installed (auto-detected, headless), conversion is **full-fidelity**: formula text, cached values, charts, shapes, and styling are all preserved. Without LibreOffice, a pure-Python `xlrd` fallback preserves values, types, number formats, merges, and basic styling, but **not** formula text or charts (cached formula values still survive). Disable the LibreOffice path with `EXCEL_PARSER_DISABLE_SOFFICE=1`. - **Pivot tables** — detected but not fully parsed. - **Sparklines** — not extracted. - **VBA macros** — flagged but never executed or analysed. @@ -424,7 +426,7 @@ Full list in [`docs/PARSER_KNOWN_ISSUES.md`](docs/PARSER_KNOWN_ISSUES.md). ## 🧰 Knowledge Stack ecosystem -`ks-xlsx-parser` is one piece of the [**Knowledge Stack**](https://github.com/knowledgestack) +`excel-parser` is one piece of the [**Knowledge Stack**](https://github.com/knowledgestack) open-source family — *document intelligence for agents*, built so that engineering teams can focus on agents and we handle the messy parts of enterprise data. @@ -432,11 +434,11 @@ enterprise data. | Repo | What it does | |------|--------------| | [**ks-cookbook**](https://github.com/knowledgestack/ks-cookbook) | 32 production-style flagship agents + recipes for LangChain, LangGraph, CrewAI, Temporal, the OpenAI Agents SDK, and any [MCP](https://modelcontextprotocol.io/) client. | -| [**ks-xlsx-parser**](https://github.com/knowledgestack/ks-xlsx-parser) (this repo) | Turn `.xlsx` into LLM-ready JSON with citations and dependency graphs. 
| +| [**excel-parser**](https://github.com/knowledgestack/excel-parser) (this repo) | Turn `.xlsx` into LLM-ready JSON with citations and dependency graphs. | | [@knowledgestack](https://github.com/knowledgestack) | Follow the org for upcoming repos — parsers, extractors, and MCP servers for PDF, DOCX, PPTX, HTML, and more. | Building on top of the stack? Tell us about it in -[Show & Tell](https://github.com/knowledgestack/ks-xlsx-parser/discussions/new?category=show-and-tell) +[Show & Tell](https://github.com/knowledgestack/excel-parser/discussions/new?category=show-and-tell) or the [#showcase](https://discord.gg/4uaGhJcx) channel on Discord. --- @@ -446,12 +448,12 @@ or the [#showcase](https://discord.gg/4uaGhJcx) channel on Discord.

Discord Follow Knowledge Stack - Discussions + Discussions

- 💬 **[Join the Discord](https://discord.gg/4uaGhJcx)** — our main real-time channel. Roadmap, help, job postings, show-and-tell, and the occasional meme. - 🐙 **[Follow @knowledgestack](https://github.com/knowledgestack)** on GitHub for new releases across the ecosystem. -- 📣 Watch this repo (→ *Releases only*) to get pinged when `ks-xlsx-parser` ships an update. +- 📣 Watch this repo (→ *Releases only*) to get pinged when `excel-parser` ships an update. If you'd rather just peek first — run the benchmark suite against the public SpreadsheetBench corpus (`make corpus-download && make bench-robust`) @@ -465,7 +467,7 @@ We love contributions. Three paths, in order of speed-to-merge: 1. **Report a benchmark failure** — run `make bench-robust` on SpreadsheetBench, find a file that breaks, attach it to a - [Parser edge case issue](https://github.com/knowledgestack/ks-xlsx-parser/issues/new?template=parser_edge_case.yml). + [Parser edge case issue](https://github.com/knowledgestack/excel-parser/issues/new?template=parser_edge_case.yml). 2. **Submit an adversarial workbook** — open a Parser edge case issue with the file attached; we'll fold it into the suite. 3. **Fix a flagged issue** — see [`docs/PARSER_KNOWN_ISSUES.md`](docs/PARSER_KNOWN_ISSUES.md). @@ -475,7 +477,7 @@ See the [Code of Conduct](CODE_OF_CONDUCT.md) and [Security policy](SECURITY.md) before posting. If you don't have time to contribute but the project helped you, please -**[star the repo](https://github.com/knowledgestack/ks-xlsx-parser)**. That's +**[star the repo](https://github.com/knowledgestack/excel-parser)**. That's the main signal that keeps this maintained. --- @@ -485,7 +487,7 @@ the main signal that keeps this maintained.
What is the best Python library to parse Excel (.xlsx) for LLMs? -`ks-xlsx-parser` is purpose-built for it. Unlike pandas or openpyxl, it preserves formulas with a directed dependency graph, merged regions, tables, charts, and conditional formatting, and emits token-counted chunks with `source_uri` citations an LLM can quote. `pip install ks-xlsx-parser`. +`excel-parser` is purpose-built for it. Unlike pandas or openpyxl, it preserves formulas with a directed dependency graph, merged regions, tables, charts, and conditional formatting, and emits token-counted chunks with `source_uri` citations an LLM can quote. `pip install excel-parser`.
@@ -506,26 +508,26 @@ Same pattern — wrap `parse_workbook` in whatever tool abstraction your framewo
Can Claude Desktop, Cursor, Windsurf, or another MCP client read Excel files? -Yes — run the bundled FastAPI server (`pip install ks-xlsx-parser[api]; xlsx-parser-api`) and call `POST /parse`. A native MCP server is on the [Knowledge Stack](https://github.com/knowledgestack) roadmap. +Yes — run the bundled FastAPI server (`pip install excel-parser[api]; excel-parser-api`) and call `POST /parse`. A native MCP server is on the [Knowledge Stack](https://github.com/knowledgestack) roadmap.
How do I build a RAG pipeline over Excel spreadsheets? -Three steps: `pip install ks-xlsx-parser`, call `parse_workbook()` on each file, then `result.serializer.to_vector_store_entries()` to get `id + text + metadata` triples ready for Qdrant, pgvector, Weaviate, or Pinecone. Every entry has a `content_hash` for dedup and a `source_uri` the LLM cites in its answer. +Three steps: `pip install excel-parser`, call `parse_workbook()` on each file, then `result.serializer.to_vector_store_entries()` to get `id + text + metadata` triples ready for Qdrant, pgvector, Weaviate, or Pinecone. Every entry has a `content_hash` for dedup and a `source_uri` the LLM cites in its answer.
-How is ks-xlsx-parser different from openpyxl or pandas? +How is excel-parser different from openpyxl or pandas? -openpyxl and pandas give you a rectangle of values. `ks-xlsx-parser` gives you the full workbook graph: parsed formulas with dependency edges, merged regions, Excel ListObjects, all 7 chart types, every conditional-formatting rule type, and LLM chunks with citation URIs + token counts. It wraps openpyxl and uses lxml for the bits openpyxl loses. +openpyxl and pandas give you a rectangle of values. `excel-parser` gives you the full workbook graph: parsed formulas with dependency edges, merged regions, Excel ListObjects, all 7 chart types, every conditional-formatting rule type, and LLM chunks with citation URIs + token counts. It wraps openpyxl and uses lxml for the bits openpyxl loses.
-Does ks-xlsx-parser run Excel formulas or macros? +Does excel-parser run Excel formulas or macros? No. The library reads `.xlsx` files; it never executes them. VBA macros are flagged but never run. External links are recorded but never resolved. ZIP-bomb and cell-count limits make it safe for untrusted uploads. @@ -550,6 +552,6 @@ Search queries this library answers: *Python Excel parser for LLMs*, *XLSX to JS [MIT](LICENSE). Use it, fork it, ship it. Attribution appreciated but not required. -If you ship something built on top of `ks-xlsx-parser`, we'd love a -[Show & Tell](https://github.com/knowledgestack/ks-xlsx-parser/discussions/new?category=show-and-tell) +If you ship something built on top of `excel-parser`, we'd love a +[Show & Tell](https://github.com/knowledgestack/excel-parser/discussions/new?category=show-and-tell) post or a shoutout on [Discord](https://discord.gg/4uaGhJcx). diff --git a/SECURITY.md b/SECURITY.md index 853a9c9..b81fa04 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -13,7 +13,7 @@ We provide security fixes for the latest released version on PyPI. **Please do not report security vulnerabilities through public GitHub issues.** -Instead, report them via GitHub's [Private Vulnerability Reporting](https://github.com/knowledgestack/ks-xlsx-parser/security/advisories/new) +Instead, report them via GitHub's [Private Vulnerability Reporting](https://github.com/knowledgestack/excel-parser/security/advisories/new) feature. This lets us triage privately before disclosing. If you cannot use GitHub's private reporting, email the maintainers @@ -29,7 +29,7 @@ informed throughout the triage. 
## What counts as a vulnerability -ks-xlsx-parser processes untrusted `.xlsx` input, so we treat the following as +excel-parser processes untrusted `.xlsx` input, so we treat the following as in-scope: - Arbitrary code execution via a crafted workbook diff --git a/docs/MAINTAINERS.md b/docs/MAINTAINERS.md index 6378cb2..6eb8c73 100644 --- a/docs/MAINTAINERS.md +++ b/docs/MAINTAINERS.md @@ -62,7 +62,7 @@ Create categories (click *New Category* for each): - **📣 Announcements** (maintainer-posts only) — releases and project news - **💡 Ideas** (open) — before-it-becomes-an-issue feature brainstorms -- **🎯 Show and tell** (open) — projects built with ks-xlsx-parser +- **🎯 Show and tell** (open) — projects built with excel-parser - Attach the template in `.github/DISCUSSION_TEMPLATE/show-and-tell.yml` - **🙏 Q&A** (open, answerable) — usage and "does it handle X" questions - **🧪 Benchmark findings** (open) — edge cases that shouldn't be issues yet @@ -79,7 +79,7 @@ One-time PyPI setup: go to PyPI → *your project* → *Publishing* → *Add a n pending publisher* with: - Owner: `knowledgestack` -- Repository name: `ks-xlsx-parser` +- Repository name: `excel-parser` - Workflow name: `release.yml` - Environment name: `pypi` @@ -89,7 +89,7 @@ without a human click. ## Release checklist -1. Bump `version` in `pyproject.toml` and `src/xlsx_parser/__init__.py` (keep in sync). +1. Bump `version` in `pyproject.toml` and `src/excel_parser/__init__.py` (keep in sync). 2. Promote every entry from `## [Unreleased]` in [`../CHANGELOG.md`](../CHANGELOG.md) into a new `## [X.Y.Z] — YYYY-MM-DD` section; reset Unreleased to a stub line; update the compare-link footer at the bottom. 
@@ -131,9 +131,9 @@ Put a `.github/CODEOWNERS` with: * @knowledgestack/maintainers # Parser internals -/src/xlsx_parser/parsers/ @knowledgestack/maintainers -/src/xlsx_parser/formula/ @knowledgestack/maintainers -/src/xlsx_parser/analysis/ @knowledgestack/maintainers +/src/excel_parser/parsers/ @knowledgestack/maintainers +/src/excel_parser/formula/ @knowledgestack/maintainers +/src/excel_parser/analysis/ @knowledgestack/maintainers # Docs /docs/ @knowledgestack/maintainers diff --git a/docs/PARSER_KNOWN_ISSUES.md b/docs/PARSER_KNOWN_ISSUES.md index e1f5029..ea0a18a 100644 --- a/docs/PARSER_KNOWN_ISSUES.md +++ b/docs/PARSER_KNOWN_ISSUES.md @@ -47,7 +47,7 @@ parsing and serialization succeed end-to-end. **Root cause**: openpyxl's `data_only=True` reader does not always surface the most recently written cached value for complex dynamic-array or volatile formulas when the calc chain references across multiple sheets. This is an -openpyxl limitation, not an ks-xlsx-parser bug; calamine reads from the raw XML +openpyxl limitation, not an excel-parser bug; calamine reads from the raw XML and catches the newer values. **Potential fixes** (tracked): diff --git a/docs/RELEASE_PROCESS.md b/docs/RELEASE_PROCESS.md index 14c89a8..1e5d4ae 100644 --- a/docs/RELEASE_PROCESS.md +++ b/docs/RELEASE_PROCESS.md @@ -12,21 +12,21 @@ The release workflow's `pypi` job declares `environment: pypi`. That environment ```bash # Create the empty environment (idempotent) -gh api -X PUT repos/knowledgestack/ks-xlsx-parser/environments/pypi +gh api -X PUT repos/knowledgestack/excel-parser/environments/pypi ``` -Optional but recommended after creation: open https://github.com/knowledgestack/ks-xlsx-parser/settings/environments/pypi and add a **required reviewer** so a tag push needs explicit approval before the PyPI publish step runs. This is a safety net — once a tag is pushed the release workflow auto-fires; a reviewer gate gives you one last "are you sure?" before the irreversible PyPI publish. 
+Optional but recommended after creation: open https://github.com/knowledgestack/excel-parser/settings/environments/pypi and add a **required reviewer** so a tag push needs explicit approval before the PyPI publish step runs. This is a safety net — once a tag is pushed the release workflow auto-fires; a reviewer gate gives you one last "are you sure?" before the irreversible PyPI publish. ### 2. PyPI Trusted Publisher binding -This **cannot** be done via API or a PR — it requires logging into pypi.org as a maintainer of `ks-xlsx-parser`. +This **cannot** be done via API or a PR — it requires logging into pypi.org as a maintainer of `excel-parser`. -1. Go to https://pypi.org/manage/project/ks-xlsx-parser/settings/publishing/ +1. Go to https://pypi.org/manage/project/excel-parser/settings/publishing/ 2. Click **Add a new publisher** → **GitHub** 3. Fill in: - - **PyPI Project Name:** `ks-xlsx-parser` + - **PyPI Project Name:** `excel-parser` - **Owner:** `knowledgestack` - - **Repository:** `ks-xlsx-parser` + - **Repository:** `excel-parser` - **Workflow filename:** `release.yml` - **Environment name:** `pypi` 4. Save. @@ -35,7 +35,7 @@ Verify with: ```bash # Should list any publishers tied to the project (requires you to be logged in) -open "https://pypi.org/manage/project/ks-xlsx-parser/settings/publishing/" +open "https://pypi.org/manage/project/excel-parser/settings/publishing/" ``` If you're spinning up a new project that doesn't exist on PyPI yet, the publisher has to be configured as a **pending publisher** under your account first. Same form, accessible at https://pypi.org/manage/account/publishing/. @@ -45,7 +45,7 @@ If you're spinning up a new project that doesn't exist on PyPI yet, the publishe CI on a PR validates {ubuntu, macOS} × Python {3.10, 3.11, 3.12} before merge. 
Add a branch protection rule on `main` requiring those status checks to pass before a PR is mergeable: ```bash -gh api -X PUT repos/knowledgestack/ks-xlsx-parser/branches/main/protection \ +gh api -X PUT repos/knowledgestack/excel-parser/branches/main/protection \ -F required_status_checks[strict]=true \ -F 'required_status_checks[contexts][]=tests (ubuntu-latest / py3.10)' \ -F 'required_status_checks[contexts][]=tests (ubuntu-latest / py3.11)' \ @@ -58,7 +58,7 @@ gh api -X PUT repos/knowledgestack/ks-xlsx-parser/branches/main/protection \ -F restrictions= 2>/dev/null ``` -Or set in the UI: https://github.com/knowledgestack/ks-xlsx-parser/settings/branches +Or set in the UI: https://github.com/knowledgestack/excel-parser/settings/branches ## Per-release checklist @@ -67,7 +67,7 @@ For every new version `X.Y.Z`: 1. **Decide the version number.** Follow [SemVer](https://semver.org/). Breaking API change → major bump. New feature, no breakage → minor. Bugfix only → patch. 2. **Bump version in two places** (kept in sync to avoid drift): - `pyproject.toml` — `version = "X.Y.Z"` - - `src/ks_xlsx_parser/__init__.py` — `__version__ = "X.Y.Z"` + - `src/excel_parser/__init__.py` — `__version__ = "X.Y.Z"` 3. **Write the CHANGELOG entry** under a new `## [X.Y.Z] — YYYY-MM-DD` heading in [`CHANGELOG.md`](../CHANGELOG.md). Use the section labels documented at the top of that file (Added / Changed / Fixed / Performance / Docs / Internal / ⚠️ BREAKING). 4. **(Optional but recommended) Write hand-curated release notes** at `docs/launch/RELEASE_NOTES_vX.Y.Z.md`. If present, the release workflow picks it up automatically as the GitHub Release body; otherwise GitHub auto-generates from commits. 5. **Run `make test` locally** and verify all tests pass. @@ -78,10 +78,10 @@ For every new version `X.Y.Z`: git tag -a vX.Y.Z -m "vX.Y.Z — " git push origin vX.Y.Z ``` -8. 
**Watch the workflow.** https://github.com/knowledgestack/ks-xlsx-parser/actions — the `Release` workflow should run `build` → `github-release` → `pypi`. If the `pypi` job is gated on a reviewer, approve it in the Actions UI. +8. **Watch the workflow.** https://github.com/knowledgestack/excel-parser/actions — the `Release` workflow should run `build` → `github-release` → `pypi`. If the `pypi` job is gated on a reviewer, approve it in the Actions UI. 9. **Verify post-release:** - - PyPI: https://pypi.org/project/ks-xlsx-parser/X.Y.Z/ resolves and `pip install ks-xlsx-parser==X.Y.Z` works in a fresh venv. - - GitHub Release: https://github.com/knowledgestack/ks-xlsx-parser/releases/tag/vX.Y.Z shows the release notes + wheel + sdist. + - PyPI: https://pypi.org/project/excel-parser/X.Y.Z/ resolves and `pip install excel-parser==X.Y.Z` works in a fresh venv. + - GitHub Release: https://github.com/knowledgestack/excel-parser/releases/tag/vX.Y.Z shows the release notes + wheel + sdist. - The `[Unreleased]` heading at the top of `CHANGELOG.md` is reset to "Nothing yet" for the next cycle (manual; do this in a follow-up PR). ## Common failure modes @@ -100,7 +100,7 @@ If a published release has a critical bug: ```bash # Yank from PyPI (hides from `pip install` but doesn't delete — required for cache-poisoning safety) -# UI: https://pypi.org/manage/project/ks-xlsx-parser/release/X.Y.Z/ +# UI: https://pypi.org/manage/project/excel-parser/release/X.Y.Z/ # Tag a hotfix git checkout main @@ -109,7 +109,7 @@ git tag -a vX.Y.Z+1 -m "vX.Y.Z+1 — hotfix for " git push origin vX.Y.Z+1 ``` -Yanked versions remain installable if pinned explicitly; `pip install ks-xlsx-parser` without a version constraint skips them. This is the safe default for accidental release of broken code. +Yanked versions remain installable if pinned explicitly; `pip install excel-parser` without a version constraint skips them. This is the safe default for accidental release of broken code. 
## Why we use Trusted Publishing instead of an API token diff --git a/docs/benchmark-local-setup.md b/docs/benchmark-local-setup.md index ab39803..ee37e39 100644 --- a/docs/benchmark-local-setup.md +++ b/docs/benchmark-local-setup.md @@ -165,20 +165,20 @@ That's how "is recall improving?" gets answered. Goal: `recall_text@5 > 0.90`. When you want to make sure local results aren't drifting from CI: ```bash -docker build -f Dockerfile.bench -t ks-xlsx-parser-bench . +docker build -f Dockerfile.bench -t excel-parser-bench . # Quick sanity (60 instances, ~3 min after image load): docker run --rm \ -e BENCH_SAMPLE=60 \ -v "$PWD/tests/benchmarks/reports:/app/tests/benchmarks/reports" \ -v "$PWD/data:/app/data" \ - ks-xlsx-parser-bench + excel-parser-bench # Full corpus: docker run --rm \ -v "$PWD/tests/benchmarks/reports:/app/tests/benchmarks/reports" \ -v "$PWD/data:/app/data" \ - ks-xlsx-parser-bench + excel-parser-bench ``` The image pre-warms the embedding model at build time so the first diff --git a/docs/corpora.md b/docs/corpora.md index 0896e3e..59487fa 100644 --- a/docs/corpora.md +++ b/docs/corpora.md @@ -1,6 +1,6 @@ # Corpus & Benchmarks -ks-xlsx-parser benchmarks against public corpora that are downloaded on demand — +excel-parser benchmarks against public corpora that are downloaded on demand — nothing large is committed to the repo. ## Primary corpus — SpreadsheetBench v0.1 diff --git a/docs/launch/ANNOUNCEMENTS.md b/docs/launch/ANNOUNCEMENTS.md index 2dce3cd..fb53aa5 100644 --- a/docs/launch/ANNOUNCEMENTS.md +++ b/docs/launch/ANNOUNCEMENTS.md @@ -6,9 +6,9 @@ Copy-paste these. Tweak the tone to match the channel. 
## 🎮 Discord — `#announcements` -> **🚀 ks-xlsx-parser is now open source!** +> **🚀 excel-parser is now open source!** > -> We just shipped **ks-xlsx-parser v0.1.1** — the Knowledge Stack ETL layer +> We just shipped **excel-parser v0.1.1** — the Knowledge Stack ETL layer > that turns `.xlsx` into LLM-ready JSON with proper citations, dependency > graphs, and per-chunk token counts. > @@ -21,9 +21,9 @@ Copy-paste these. Tweak the tone to match the channel. > • A 1054-workbook stress corpus (`testBench/`) that we round-trip on every > CI run — 1054/1054 passing in ~70s. > -> **Install:** `pip install ks-xlsx-parser` -> **Repo:** -> **Release:** +> **Install:** `pip install excel-parser` +> **Repo:** +> **Release:** > > ⭐ Star the repo if this saves you time, and drop your edge-case workbooks > in <#edge-cases> (or just DM us). Every `.xlsx` that breaks the parser @@ -36,9 +36,9 @@ Copy-paste these. Tweak the tone to match the channel. ## 🎮 Discord — `#general` (shorter) -> We just open-sourced **ks-xlsx-parser** 🎉 +> We just open-sourced **excel-parser** 🎉 > Turn `.xlsx` into LLM-ready JSON with citations + dependency graphs. -> `pip install ks-xlsx-parser` · +> `pip install excel-parser` · > Break it, star it, hang out here 🙌 --- @@ -47,13 +47,13 @@ Copy-paste these. Tweak the tone to match the channel. > Spreadsheets are still the #1 unstructured data source in the enterprise. > -> We just open-sourced **ks-xlsx-parser** — turns `.xlsx` into +> We just open-sourced **excel-parser** — turns `.xlsx` into > citation-ready JSON your agents can actually reason about. > > 1054 stress-test workbooks. 100% pass rate. MIT. > -> `pip install ks-xlsx-parser` -> 🔗 github.com/knowledgestack/ks-xlsx-parser +> `pip install excel-parser` +> 🔗 github.com/knowledgestack/excel-parser Follow-up tweet (thread): @@ -61,13 +61,13 @@ Follow-up tweet (thread): > gives you prose. 
Neither gives you the *graph* an LLM needs: formulas, > merges, charts, dependency edges, and citation URIs per chunk. > -> That's the gap ks-xlsx-parser fills. +> That's the gap excel-parser fills. --- ## 💼 LinkedIn -> We just open-sourced ks-xlsx-parser — the Knowledge Stack ETL layer for +> We just open-sourced excel-parser — the Knowledge Stack ETL layer for > turning Excel workbooks into LLM-ready, citation-grounded JSON. > > Built for teams shipping agents and RAG pipelines that ingest real @@ -83,16 +83,16 @@ Follow-up tweet (thread): > > MIT licensed. Part of the Knowledge Stack ecosystem (https://github.com/knowledgestack). > -> `pip install ks-xlsx-parser` +> `pip install excel-parser` > -> ⭐ Star: https://github.com/knowledgestack/ks-xlsx-parser +> ⭐ Star: https://github.com/knowledgestack/excel-parser > 💬 Discord: https://discord.gg/4uaGhJcx --- ## 🧡 Hacker News — "Show HN" -**Title:** `Show HN: ks-xlsx-parser – turn .xlsx into citation-ready JSON for LLMs` +**Title:** `Show HN: excel-parser – turn .xlsx into citation-ready JSON for LLMs` **Body:** @@ -125,9 +125,9 @@ Follow-up tweet (thread): > only non-empty cells are `A1` and `XFD1048576` was iterating ~17B > empty cells before. Now 135ms. > -> Install: `pip install ks-xlsx-parser` +> Install: `pip install excel-parser` > -> Repo: https://github.com/knowledgestack/ks-xlsx-parser +> Repo: https://github.com/knowledgestack/excel-parser > Discord: https://discord.gg/4uaGhJcx > > Would love bug reports — especially `.xlsx` files that break it. @@ -136,7 +136,7 @@ Follow-up tweet (thread): ## 🧵 Reddit (r/MachineLearning, r/LangChain, r/Python) -**Title:** `[P] Open-sourced ks-xlsx-parser: turn .xlsx into citation-ready JSON for LLMs` +**Title:** `[P] Open-sourced excel-parser: turn .xlsx into citation-ready JSON for LLMs` **Body:** @@ -156,13 +156,13 @@ Follow-up tweet (thread): > run. 1054/1054 pass in ~70s. > - Python 3.10+, MIT, no macro execution, no external-link resolution. 
> -> `pip install ks-xlsx-parser` +> `pip install excel-parser` > -> Repo: https://github.com/knowledgestack/ks-xlsx-parser +> Repo: https://github.com/knowledgestack/excel-parser > > Would genuinely love `.xlsx` files that break it — every edge-case > report becomes a new fixture in the next release. We have a -> [Parser edge case](https://github.com/knowledgestack/ks-xlsx-parser/issues/new?template=parser_edge_case.yml) +> [Parser edge case](https://github.com/knowledgestack/excel-parser/issues/new?template=parser_edge_case.yml) > issue template specifically for that. > > (Part of the wider Knowledge Stack open-source family — @@ -173,7 +173,7 @@ Follow-up tweet (thread): ## 📢 Dev.to / Medium / blog post outline -**Title:** `Make XLSX LLM Ready: why we built ks-xlsx-parser` +**Title:** `Make XLSX LLM Ready: why we built excel-parser` **Outline** (use as prompt to yourself, then expand): diff --git a/docs/launch/MEDIUM_ARTICLE.md b/docs/launch/MEDIUM_ARTICLE.md index 1b76ccb..117c5f5 100644 --- a/docs/launch/MEDIUM_ARTICLE.md +++ b/docs/launch/MEDIUM_ARTICLE.md @@ -12,11 +12,11 @@ ## TL;DR -- We open-sourced [**`ks-xlsx-parser`**](https://github.com/knowledgestack/ks-xlsx-parser), an MIT-licensed library that turns `.xlsx` workbooks into **citation-ready JSON** for LLM agents and RAG pipelines. +- We open-sourced [**`excel-parser`**](https://github.com/knowledgestack/excel-parser), an MIT-licensed library that turns `.xlsx` workbooks into **citation-ready JSON** for LLM agents and RAG pipelines. - Every output chunk carries a `source_uri` like `file.xlsx#Sheet!A1:F18`, a token count, an HTML + pipe-text rendering, and a deterministic content hash. - It preserves what every other parser drops on the floor: **formulas, merges, charts, conditional formatting, data validation, and a directed dependency graph** with cycle detection. - Ships with a **1054-workbook stress corpus** that runs in CI on every commit (**1054/1054 passing, ~70 s**). 
-- `pip install ks-xlsx-parser`. +- `pip install excel-parser`. --- @@ -36,7 +36,7 @@ And your options collapse: None of those give the LLM a **graph it can cite**. And citation is the load-bearing requirement: once compliance/finance/legal is in the loop, "the answer is somewhere in `Q4_forecast.xlsx`" is not an acceptable output. -That's the gap we built `ks-xlsx-parser` to fill. +That's the gap we built `excel-parser` to fill. --- @@ -178,7 +178,7 @@ Here's what parsing a financial model looks like in practice. **Input:** `q4_forecast.xlsx`, 13 sheets, 21k cells, multiple tables per sheet, charts, conditional formatting, named ranges. ```python -from ks_xlsx_parser import parse_workbook +from excel_parser import parse_workbook result = parse_workbook(path="q4_forecast.xlsx") @@ -211,7 +211,7 @@ COGS Margin | 45.0% | % | % of revenue **Walk the dependency graph:** ```python -from ks_xlsx_parser.models import CellCoord +from excel_parser.models import CellCoord upstream = result.workbook.dependency_graph.get_upstream( "Revenue", CellCoord(row=10, col=3), max_depth=3 @@ -241,16 +241,16 @@ Spreadsheets are a great attack surface. We're explicit: - **ZIP-bomb protection.** Incoming bytes are size-checked before openpyxl sees them. - **Cell-count ceiling.** Per-sheet `max_cells_per_sheet` (default 2M) truncates rather than OOMs. -You can safely point `ks-xlsx-parser` at untrusted uploads. We do. +You can safely point `excel-parser` at untrusted uploads. We do. --- ## 8. Where this fits in the ecosystem -`ks-xlsx-parser` is the first library in the [**Knowledge Stack**](https://github.com/knowledgestack) open-source family — document intelligence for agents, so engineering teams can focus on agents and we handle the messy parts of enterprise data. 
+`excel-parser` is the first library in the [**Knowledge Stack**](https://github.com/knowledgestack) open-source family — document intelligence for agents, so engineering teams can focus on agents and we handle the messy parts of enterprise data. - [**ks-cookbook**](https://github.com/knowledgestack/ks-cookbook) — 32 production-style flagship agents + recipes for LangChain, LangGraph, CrewAI, Temporal, and the OpenAI Agents SDK. -- [**ks-xlsx-parser**](https://github.com/knowledgestack/ks-xlsx-parser) — this library. +- [**excel-parser**](https://github.com/knowledgestack/excel-parser) — this library. - Next up: PDF, DOCX, PPTX parsers with the same citation model, plus an MCP server so Claude Desktop / Cursor / Windsurf / Zed can call the parsers without glue code. --- @@ -259,9 +259,9 @@ You can safely point `ks-xlsx-parser` at untrusted uploads. We do. If you got this far, please: -1. ⭐ **[Star the repo](https://github.com/knowledgestack/ks-xlsx-parser).** It's the single biggest signal that keeps maintainers paid. +1. ⭐ **[Star the repo](https://github.com/knowledgestack/excel-parser).** It's the single biggest signal that keeps maintainers paid. 2. 💬 **[Join the Discord](https://discord.gg/4uaGhJcx).** We hang out there. Ask questions, float ideas, show off what you built. -3. 🧪 **Run `make testbench` and send us a workbook that breaks it.** Every edge-case report becomes a fixture in the next release. There's even a [Parser edge case issue template](https://github.com/knowledgestack/ks-xlsx-parser/issues/new?template=parser_edge_case.yml) specifically for this. +3. 🧪 **Run `make testbench` and send us a workbook that breaks it.** Every edge-case report becomes a fixture in the next release. There's even a [Parser edge case issue template](https://github.com/knowledgestack/excel-parser/issues/new?template=parser_edge_case.yml) specifically for this. We'll ship more parsers. We'll ship an MCP server. 
We'll ship a native agent runtime that knows how to ground its citations. But none of that matters if nobody's telling us which `.xlsx` files break the parser first. @@ -269,6 +269,6 @@ Drop by Discord. Tell us what you're building. --- -*`ks-xlsx-parser` is MIT-licensed. Use it, fork it, ship it. If you build something on top of it, we'd love a [Show & Tell](https://github.com/knowledgestack/ks-xlsx-parser/discussions/new?category=show-and-tell) — or a shoutout in Discord.* +*`excel-parser` is MIT-licensed. Use it, fork it, ship it. If you build something on top of it, we'd love a [Show & Tell](https://github.com/knowledgestack/excel-parser/discussions/new?category=show-and-tell) — or a shoutout in Discord.* -**`pip install ks-xlsx-parser`** · [GitHub](https://github.com/knowledgestack/ks-xlsx-parser) · [Discord](https://discord.gg/4uaGhJcx) · [Knowledge Stack](https://github.com/knowledgestack) +**`pip install excel-parser`** · [GitHub](https://github.com/knowledgestack/excel-parser) · [Discord](https://discord.gg/4uaGhJcx) · [Knowledge Stack](https://github.com/knowledgestack) diff --git a/docs/launch/RELEASE_NOTES_v0.1.1.md b/docs/launch/RELEASE_NOTES_v0.1.1.md index dc9ccd4..3234637 100644 --- a/docs/launch/RELEASE_NOTES_v0.1.1.md +++ b/docs/launch/RELEASE_NOTES_v0.1.1.md @@ -1,6 +1,6 @@ -# ks-xlsx-parser v0.1.1 — Make XLSX LLM Ready 🚀 +# excel-parser v0.1.1 — Make XLSX LLM Ready 🚀 -**First public release** of `ks-xlsx-parser`, an open-source (MIT) ETL layer +**First public release** of `excel-parser`, an open-source (MIT) ETL layer that turns `.xlsx` workbooks into structured, citation-ready JSON your agents and RAG pipelines can actually reason about. @@ -20,7 +20,7 @@ ecosystem. Now open for the rest of the world. cocktails (400) + adversarial files (300) + 57 real-world and curated-stress workbooks. Round-trip gate in CI, **1054/1054 passing in ~70 s**. 
Ship fixtures in the - [`testBench-v0.1.1.zip`](https://github.com/knowledgestack/ks-xlsx-parser/releases/tag/v0.1.1) + [`testBench-v0.1.1.zip`](https://github.com/knowledgestack/excel-parser/releases/tag/v0.1.1) asset attached to this release. - ⚡ **Parser perf fixes** — real-world workbooks that used to hang now finish in under a second. @@ -41,11 +41,11 @@ ecosystem. Now open for the rest of the world. ## 30-second demo ```bash -pip install ks-xlsx-parser +pip install excel-parser ``` ```python -from ks_xlsx_parser import parse_workbook +from excel_parser import parse_workbook result = parse_workbook(path="q4_forecast.xlsx") @@ -58,27 +58,27 @@ for chunk in result.chunks: ## Install ```bash -pip install ks-xlsx-parser # core library -pip install ks-xlsx-parser[api] # + FastAPI web server -pip install ks-xlsx-parser[dev] # + test tooling +pip install excel-parser # core library +pip install excel-parser[api] # + FastAPI web server +pip install excel-parser[dev] # + test tooling ``` Python 3.10+, tested on Ubuntu and macOS. ## Artifacts attached -- `ks_xlsx_parser-0.1.1-py3-none-any.whl` — wheel, published to - [PyPI](https://pypi.org/project/ks-xlsx-parser/) -- `ks_xlsx_parser-0.1.1.tar.gz` — sdist +- `excel_parser-0.1.1-py3-none-any.whl` — wheel, published to + [PyPI](https://pypi.org/project/excel-parser/) +- `excel_parser-0.1.1.tar.gz` — sdist - `testBench-v0.1.1.zip` — 1053-workbook stress corpus (17 MB). Drop into any parser for a stiff regression test. ## Community - 💬 **Discord**: <https://discord.gg/4uaGhJcx> -- 🗣 **Discussions**: <https://github.com/knowledgestack/ks-xlsx-parser/discussions> -- 🐞 **Issues**: <https://github.com/knowledgestack/ks-xlsx-parser/issues> -- ⭐ **Star the repo**: <https://github.com/knowledgestack/ks-xlsx-parser> +- 🗣 **Discussions**: <https://github.com/knowledgestack/excel-parser/discussions> +- 🐞 **Issues**: <https://github.com/knowledgestack/excel-parser/issues> +- ⭐ **Star the repo**: <https://github.com/knowledgestack/excel-parser> - 🧰 **Knowledge Stack org**: <https://github.com/knowledgestack> ## What's next @@ -92,7 +92,7 @@ Python 3.10+, tested on Ubuntu and macOS. Bug reports, edge-case workbooks, and PRs welcome — especially `.xlsx` files that break the parser. See -[`CONTRIBUTING.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/CONTRIBUTING.md). 
+[`CONTRIBUTING.md`](https://github.com/knowledgestack/excel-parser/blob/main/CONTRIBUTING.md). **Thanks to every team that filed an edge case during the private beta.** diff --git a/docs/launch/RELEASE_NOTES_v0.2.0.md b/docs/launch/RELEASE_NOTES_v0.2.0.md index de2eb4a..5ed29ae 100644 --- a/docs/launch/RELEASE_NOTES_v0.2.0.md +++ b/docs/launch/RELEASE_NOTES_v0.2.0.md @@ -1,6 +1,6 @@ -# ks-xlsx-parser v0.2.0 — Benchmark + Retrievability 📊 +# excel-parser v0.2.0 — Benchmark + Retrievability 📊 -**Headline:** ks-xlsx-parser now has a head-to-head benchmark against [Docling](https://github.com/DS4SD/docling) on the [SpreadsheetBench](https://github.com/RUCKBReasoning/SpreadsheetBench) corpus (912 task instances, 5,458 xlsx files). ks **parses 99.945%** of the corpus and **ties Docling at recall@1 / wins at recall@3 (+2.7 pp) and recall@5 (+1.8 pp)** on apples-to-apples retrieval, with **36.9% citation-grade geometric recall** that Docling structurally cannot achieve. +**Headline:** excel-parser now has a head-to-head benchmark against [Docling](https://github.com/DS4SD/docling) on the [SpreadsheetBench](https://github.com/RUCKBReasoning/SpreadsheetBench) corpus (912 task instances, 5,458 xlsx files). excel-parser **parses 99.945%** of the corpus and **ties Docling at recall@1 / wins at recall@3 (+2.7 pp) and recall@5 (+1.8 pp)** on apples-to-apples retrieval, with **36.9% citation-grade geometric recall** that Docling structurally cannot achieve. Plus three quiet RAG-breaking rendering bugs in 0.1.1 are gone. @@ -10,7 +10,7 @@ Plus three quiet RAG-breaking rendering bugs in 0.1.1 are gone. 
A reproducible, parser-agnostic benchmark over real-world workbooks scraped from ExcelHome / Mr.Excel / r/excel: -| Metric | **ks-xlsx-parser** | Docling 2.93 | Δ | +| Metric | **excel-parser** | Docling 2.93 | Δ | |---|---:|---:|---:| | Parse success (5,458 files) | **99.945%** | not run at scale | — | | Recall@1 (text-match) | 0.580 | 0.579 | **+0.1 pp (tied)** | @@ -23,7 +23,7 @@ A reproducible, parser-agnostic benchmark over real-world workbooks scraped from Marker is intentionally absent — its xlsx → HTML → PDF → layout-model pipeline clocks >30 min per workbook on CPU. The harness supports adding a Marker adapter (`tests/benchmarks/adapters/docling_adapter.py` as a template); the speed wall is the obstacle. -Full methodology, capability matrix, and caveats: [`tests/benchmarks/reports/COMPARISON.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/tests/benchmarks/reports/COMPARISON.md). +Full methodology, capability matrix, and caveats: [`tests/benchmarks/reports/COMPARISON.md`](https://github.com/knowledgestack/excel-parser/blob/main/tests/benchmarks/reports/COMPARISON.md). ### 🔧 Three rendering bugs that were silently torpedoing retrieval @@ -51,9 +51,9 @@ Cells using `GradientFill` (rare but real — caught by SpreadsheetBench instanc ## Reproduce ```bash -pip install -U ks-xlsx-parser==0.2.0 # or -git clone https://github.com/knowledgestack/ks-xlsx-parser -cd ks-xlsx-parser +pip install -U excel-parser==0.2.0 # or +git clone https://github.com/knowledgestack/excel-parser +cd excel-parser make corpus-download # one-time, ~100 MB make bench # ~30 min for both benchmarks open tests/benchmarks/reports/COMPARISON.md @@ -63,7 +63,7 @@ open tests/benchmarks/reports/COMPARISON.md No breaking API changes. The only behavioral change is that **`render_text` on numeric cells now contains the raw value instead of the Excel-display-formatted string** (e.g. `1272` instead of `1,272.00`). 
If you were relying on display formatting in retrieval keys or downstream regex parsing, switch to the cell's `display_value` field on the `ChunkDTO`. For everything else, drop-in. -Full changelog: [`CHANGELOG.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/CHANGELOG.md#020--2026-05-11). +Full changelog: [`CHANGELOG.md`](https://github.com/knowledgestack/excel-parser/blob/main/CHANGELOG.md#020--2026-05-11). ## Thanks diff --git a/docs/launch/RELEASE_NOTES_v0.2.1.md b/docs/launch/RELEASE_NOTES_v0.2.1.md index af7a8a8..1cc155f 100644 --- a/docs/launch/RELEASE_NOTES_v0.2.1.md +++ b/docs/launch/RELEASE_NOTES_v0.2.1.md @@ -1,12 +1,12 @@ -# ks-xlsx-parser v0.2.1 — Packaging hotfix 📦 +# excel-parser v0.2.1 — Packaging hotfix 📦 -**Headline:** v0.2.0 shipped a broken wheel. `from ks_xlsx_parser.pipeline import parse_workbook` (and every public entry point that depends on it) raised `ModuleNotFoundError` for everyone who installed from PyPI. **v0.2.1 fixes it.** +**Headline:** v0.2.0 shipped a broken wheel. `from excel_parser.pipeline import parse_workbook` (and every public entry point that depends on it) raised `ModuleNotFoundError` for everyone who installed from PyPI. **v0.2.1 fixes it.** If you're on 0.2.0: ```bash -pip install --upgrade ks-xlsx-parser # or: uv pip install --upgrade ks-xlsx-parser -python -c "from ks_xlsx_parser.pipeline import parse_workbook; print('ok')" +pip install --upgrade excel-parser # or: uv pip install --upgrade excel-parser +python -c "from excel_parser.pipeline import parse_workbook; print('ok')" ``` ## What was broken @@ -15,19 +15,19 @@ The source tree was laid out flat under `src/` — `pipeline.py` and `api.py` as ``` analysis annotation charts chunking comparison export formula -ks_xlsx_parser models parsers rendering storage utils verification +excel_parser models parsers rendering storage utils verification ``` — 14 top-level entries, 13 of them generic. Two consequences: -1. 
`from ks_xlsx_parser.pipeline import ...` failed because `pipeline.py` was never copied into the wheel. -2. Anyone with an unrelated `models` / `utils` / `parsers` package in `site-packages` had it shadowed by ks-xlsx-parser's internal ones. +1. `from excel_parser.pipeline import ...` failed because `pipeline.py` was never copied into the wheel. +2. Anyone with an unrelated `models` / `utils` / `parsers` package in `site-packages` had it shadowed by excel-parser's internal ones. CI couldn't catch this because the test matrix used an editable install — `src/` lives on `sys.path` and the wheel-packaging defect is invisible. ## What changed in 0.2.1 -* **Proper nested package.** Everything now lives under `src/ks_xlsx_parser/`. The wheel's `top_level.txt` contains only `ks_xlsx_parser`. Imports inside the package switched from `from pipeline import …` to relative / fully-qualified `ks_xlsx_parser.pipeline`. +* **Proper nested package.** Everything now lives under `src/excel_parser/`. The wheel's `top_level.txt` contains only `excel_parser`. Imports inside the package switched from `from pipeline import …` to relative / fully-qualified `excel_parser.pipeline`. * **`scripts/verify_wheel.py`** — builds the wheel, installs it in a clean venv, asserts the public import surface resolves and there's no namespace pollution. Wired into both `ci.yml` (new `wheel-check` job — required) and `release.yml` (a `Verify wheel` step that runs before PyPI publish). The class of bug that produced 0.2.0 cannot recur. * **CI overhaul** — separate `test` / `wheel-check` / `lint` / `typecheck` jobs; `uv`-backed installs (faster); `make install-dev` alias. * **Docker benchmark + accuracy tracking** — `Dockerfile.bench` reproduces the SpreadsheetBench retrieval benchmark; a new `.github/workflows/benchmark.yml` runs a 60-instance sample on every PR touching the parser and the full 912-corpus weekly. `scripts/append_bench_history.py` keeps a commit-over-commit history file. 
(Goal: text recall@5 > 0.90.) @@ -45,24 +45,24 @@ from parsers.workbook_parser import … must update to: ```python -from ks_xlsx_parser.models import WorkbookDTO -from ks_xlsx_parser.parsers.workbook_parser import … +from excel_parser.models import WorkbookDTO +from excel_parser.parsers.workbook_parser import … ``` -If you only used the documented public surface (`from ks_xlsx_parser import parse_workbook` and `ks_xlsx_parser.pipeline.parse_workbook`), nothing changes — those names still resolve, and now they actually *load*. +If you only used the documented public surface (`from excel_parser import parse_workbook` and `excel_parser.pipeline.parse_workbook`), nothing changes — those names still resolve, and now they actually *load*. ## Upgrade ```bash -pip install --upgrade ks-xlsx-parser +pip install --upgrade excel-parser # or -uv pip install --upgrade ks-xlsx-parser +uv pip install --upgrade excel-parser ``` Then: ```python -from ks_xlsx_parser import parse_workbook +from excel_parser import parse_workbook result = parse_workbook(path="report.xlsx") print(result.workbook.total_cells) for chunk in result.chunks: @@ -73,8 +73,8 @@ for chunk in result.chunks: ``` 1041 passed, 11 deselected # full test suite -verifying ks_xlsx_parser-0.2.1-py3-none-any.whl -wheel contents OK (61 entries, top-level: ks_xlsx_parser) +verifying excel_parser-0.2.1-py3-none-any.whl +wheel contents OK (61 entries, top-level: excel_parser) clean-venv import OK wheel verification PASSED ``` @@ -87,6 +87,6 @@ Frank for the bug report — the channel screenshot was the right level of detai ## See also -* [CHANGELOG.md](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/CHANGELOG.md) — full diff log. -* [`docs/recall-investigation.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/docs/recall-investigation.md) — diagnosis framework for the recall→0.90 roadmap. 
-* [`docs/benchmark-local-setup.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/docs/benchmark-local-setup.md) — reproduce the benchmark on your laptop. +* [CHANGELOG.md](https://github.com/knowledgestack/excel-parser/blob/main/CHANGELOG.md) — full diff log. +* [`docs/recall-investigation.md`](https://github.com/knowledgestack/excel-parser/blob/main/docs/recall-investigation.md) — diagnosis framework for the recall→0.90 roadmap. +* [`docs/benchmark-local-setup.md`](https://github.com/knowledgestack/excel-parser/blob/main/docs/benchmark-local-setup.md) — reproduce the benchmark on your laptop. diff --git a/docs/launch/SEO.md b/docs/launch/SEO.md index 587deee..965ec3a 100644 --- a/docs/launch/SEO.md +++ b/docs/launch/SEO.md @@ -1,4 +1,4 @@ -# SEO + GEO playbook for ks-xlsx-parser +# SEO + GEO playbook for excel-parser What we built into the repo, the site, and the README; plus the manual submission steps you still need to do to actually rank. @@ -106,7 +106,7 @@ can pull them into rich snippets and LLMs can quote them. ### 1. Google Search Console — submit the site + sitemap 1. Go to . -2. Add a property: **URL prefix** → `https://knowledgestack.github.io/ks-xlsx-parser/`. +2. Add a property: **URL prefix** → `https://knowledgestack.github.io/excel-parser/`. 3. Verify via HTML tag — copy the `` it gives you and paste it into `site/index.html` right after the ``. Commit + push; the Pages workflow redeploys in ~1 min. 4. Once verified, submit **Sitemaps → `sitemap.xml`**. Coverage reports typically populate in 24–72 h. @@ -149,7 +149,7 @@ Search Stack Overflow for: - "how to extract formulas from xlsx python" - "excel with langchain" -…and answer with a short paragraph + `pip install ks-xlsx-parser` + +…and answer with a short paragraph + `pip install excel-parser` + minimal code snippet + link to the repo. Stack Overflow answers rank really well in both Google and LLM contexts. @@ -165,7 +165,7 @@ really well in both Google and LLM contexts. 
Once we cut `v0.1.1`, the release workflow will publish to PyPI with proper long-description rendering. Configure the trusted publisher at <https://pypi.org/manage/account/publishing/> — Owner `knowledgestack`, -Repo `ks-xlsx-parser`, Workflow `release.yml`, Environment `pypi`. +Repo `excel-parser`, Workflow `release.yml`, Environment `pypi`. ### 8. First-post amplification loop @@ -197,11 +197,11 @@ In order: Target: position 1–3 for `xlsx parser python` within 90 days. - **`pypistats`** — daily downloads; trailing 30-day is the KPI. - **GitHub stars velocity** — first 100 stars is the hardest; track via - `gh api repos/knowledgestack/ks-xlsx-parser --jq .stargazers_count` + `gh api repos/knowledgestack/excel-parser --jq .stargazers_count` daily for the first month. -- **"ks-xlsx-parser" site: queries** in Search Console — how many - pages rank for our brand + a qualifier (e.g. "ks-xlsx-parser - langchain", "ks-xlsx-parser rag"). Growing brand-tail is a sign LLMs +- **"excel-parser" site: queries** in Search Console — how many + pages rank for our brand + a qualifier (e.g. "excel-parser + langchain", "excel-parser rag"). Growing brand-tail is a sign LLMs are learning to cite us. - **Perplexity + ChatGPT spot-checks** — weekly manual queries for the target phrases. Track whether we appear in the citation list. 
diff --git a/docs/recall-investigation.md b/docs/recall-investigation.md index 80e4358..46ba9e7 100644 --- a/docs/recall-investigation.md +++ b/docs/recall-investigation.md @@ -1,8 +1,8 @@ -# Retrieval-recall investigation — getting ks-xlsx-parser to >0.90 +# Retrieval-recall investigation — getting excel-parser to >0.90 ## Where we are (v0.2.0 on SpreadsheetBench, 912 instances) -| Metric | ks-xlsx-parser | docling 2.93 | +| Metric | excel-parser | docling 2.93 | |------------------------|----------------|--------------| | Parse success | 99.945% | not run at scale | | Recall@1 (text-match) | 0.580 | 0.579 | @@ -138,16 +138,16 @@ that `block.cell_range` ⊆ `bounding_box(block.cells)`. ```bash # Build once -docker build -f Dockerfile.bench -t ks-xlsx-parser-bench . +docker build -f Dockerfile.bench -t excel-parser-bench . # Quick smoke (60 instances, < 2 min) -docker run --rm -e BENCH_SAMPLE=60 ks-xlsx-parser-bench +docker run --rm -e BENCH_SAMPLE=60 excel-parser-bench # Full corpus, persist reports + corpus cache docker run --rm \ -v "$PWD/tests/benchmarks/reports:/app/tests/benchmarks/reports" \ -v "$PWD/data:/app/data" \ - ks-xlsx-parser-bench + excel-parser-bench ``` The `Benchmark` GitHub workflow: diff --git a/docs/wiki/API-Reference.md b/docs/wiki/API-Reference.md index caa1292..26bb37e 100644 --- a/docs/wiki/API-Reference.md +++ b/docs/wiki/API-Reference.md @@ -1,9 +1,9 @@ # API Reference -The public surface re-exported from both `xlsx_parser` and `ks_xlsx_parser`: +The public surface re-exported from `excel_parser`: ```python -from ks_xlsx_parser import ( +from excel_parser import ( parse_workbook, # single file → ParseResult compare_workbooks, # N files → GeneralizedTemplate export_importer, # template → generated Python class @@ -79,7 +79,7 @@ def compare_workbooks( **Example:** ```python -from ks_xlsx_parser import compare_workbooks +from excel_parser import compare_workbooks template = compare_workbooks( ["report_q1.xlsx", 
"report_q2.xlsx", "report_q3.xlsx"], @@ -110,7 +110,7 @@ def export_importer( **Example:** ```python -from ks_xlsx_parser import compare_workbooks, export_importer +from excel_parser import compare_workbooks, export_importer template = compare_workbooks(["q1.xlsx", "q2.xlsx", "q3.xlsx"]) export_importer(template, "quarterly_importer.py", @@ -126,7 +126,7 @@ the template. Step-by-step debugging of the parse pipeline. ```python -from ks_xlsx_parser import StageVerifier, ExcellentStage +from excel_parser import StageVerifier, ExcellentStage verifier = StageVerifier(path="workbook.xlsx") report = verifier.run() @@ -150,15 +150,15 @@ print(report.to_markdown()) # human-readable summary ## CLI -The package also installs an `xlsx-parser-api` console entry point that +The package also installs an `excel-parser-api` console entry point that launches the FastAPI web server — see the [Web API](Web-API) page. ## Import paths Two module names point at the same package: -- `from xlsx_parser import ...` — original import path. -- `from ks_xlsx_parser import ...` — alias matching the PyPI +- `from excel_parser import ...` — original import path. +- `from excel_parser import ...` — alias matching the PyPI distribution name (dashes normalised to underscores). Use whichever reads better. Both will always work. diff --git a/docs/wiki/Architecture.md b/docs/wiki/Architecture.md index 2371a9c..0e35e84 100644 --- a/docs/wiki/Architecture.md +++ b/docs/wiki/Architecture.md @@ -1,6 +1,6 @@ # Architecture -`ks-xlsx-parser` runs an 8-stage pipeline: **parse → analyse → annotate → segment → render → serialise → verify → compare/export**. The whole graph is deterministic and side-effect-free — you can run the same workbook through it 1,000 times and get the same chunk IDs and hashes. +`excel-parser` runs an 8-stage pipeline: **parse → analyse → annotate → segment → render → serialise → verify → compare/export**. 
The whole graph is deterministic and side-effect-free — you can run the same workbook through it 1,000 times and get the same chunk IDs and hashes. ```mermaid %%{init: {'theme':'base', 'themeVariables': { @@ -55,7 +55,7 @@ flowchart TD class VER,CMP,EXP aux ``` -> The importable module is `xlsx_parser`; `ks_xlsx_parser` is a re-export matching the PyPI package name. The package is fully type-annotated (`py.typed` is shipped). +> The importable module is `excel_parser`, matching the PyPI distribution name `excel-parser` (dashes normalised to underscores). The package is fully type-annotated (`py.typed` is shipped). ## The 8 stages diff --git a/docs/wiki/Benchmark-vs-hucre.md b/docs/wiki/Benchmark-vs-hucre.md index 246b1b9..03afcdf 100644 --- a/docs/wiki/Benchmark-vs-hucre.md +++ b/docs/wiki/Benchmark-vs-hucre.md @@ -1,11 +1,11 @@ -# `ks-xlsx-parser` vs [`hucre`](https://github.com/productdevbook/hucre) +# `excel-parser` vs [`hucre`](https://github.com/productdevbook/hucre) An honest, reproducible head-to-head against [**`hucre`**](https://github.com/productdevbook/hucre) — an excellent zero-dependency TypeScript spreadsheet I/O engine by [**@productdevbook**](https://github.com/productdevbook). Hucre reads **and writes** xlsx/csv/ods, runs in Node/Deno/Bun/browsers/Cloudflare Workers, and ships in ~18 KB gzipped. It's a different *category* of tool than -`ks-xlsx-parser` — they're an I/O engine, we're a semantic extractor — but +`excel-parser` — they're an I/O engine, we're a semantic extractor — but since xlsx reading overlaps, it's worth putting both on the same corpus and publishing what we find. We built the comparison as much to learn from hucre as to measure ourselves. @@ -17,10 +17,10 @@ as to measure ourselves. - **hucre is faster on raw throughput**: ~3× at P50 in our fast mode, ~25–100× at P95 on data-heavy files. - **We extract more**: formula dependency graph, chart type/series, pivots, RAG chunks with token counts + citation URIs, content hashes. 
Hucre extracts **sparklines** and round-trips charts — we don't. - **We agree on every feature both parsers extract** to exact parity (tables, merges, CF rules, DV rules, hyperlinks, comments) or near-exact (formulas: 0.05% drift). -- Accuracy is the primary constraint of `ks-xlsx-parser`: **1631-test pytest suite**, cross-validated against [`calamine`](https://github.com/tafia/calamine), zero regressions required on every perf change. +- Accuracy is the primary constraint of `excel-parser`: **1631-test pytest suite**, cross-validated against [`calamine`](https://github.com/tafia/calamine), zero regressions required on every perf change. Pick hucre for edge-runtime / browser / CF-Worker I/O. -Pick `ks-xlsx-parser` for Python LLM / RAG / auditing pipelines. +Pick `excel-parser` for Python LLM / RAG / auditing pipelines. --- @@ -29,14 +29,14 @@ Pick `ks-xlsx-parser` for Python LLM / RAG / auditing pipelines. > *This page reflects the v0.1.x benchmark run on a curated stress corpus that > shipped with earlier releases. Current head benchmarks SpreadsheetBench > (5,458 real-world workbooks); see -> [COMPARISON.md](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/tests/benchmarks/reports/COMPARISON.md).* +> [COMPARISON.md](https://github.com/knowledgestack/excel-parser/blob/main/tests/benchmarks/reports/COMPARISON.md).* Same machine, same run, same OS page cache. `parse_workbook(mode="fast")` is the apples-to-apples configuration for hucre's read-only path (it skips LLM-specific chunking + template/tree extraction but still extracts every metadata feature hucre extracts). -| metric | `hucre` 0.3.0 | `ks-xlsx-parser` **full** | `ks-xlsx-parser` **fast** | +| metric | `hucre` 0.3.0 | `excel-parser` **full** | `excel-parser` **fast** | |---|---:|---:|---:| | P50 parse time | **1.3 ms** | 5.0 ms | **3.9 ms** | | P95 parse time | **3.5 ms** | 368 ms | 206 ms | @@ -60,7 +60,7 @@ throughput is your bottleneck, use it. 
## Where `hucre` wins -| | `hucre` | `ks-xlsx-parser` | +| | `hucre` | `excel-parser` | |---|:---:|:---:| | **Writes** xlsx/csv/ods (round-trip) | ✅ | ❌ read-only | | **CSV / ODS / HTML** input | ✅ | ❌ xlsx / xlsm only | @@ -74,9 +74,9 @@ throughput is your bottleneck, use it. --- -## Where `ks-xlsx-parser` wins +## Where `excel-parser` wins -| | `ks-xlsx-parser` | `hucre` | +| | `excel-parser` | `hucre` | |---|:---:|:---:| | **Formula dependency graph** (topological, cycle detection via Tarjan's SCC) | ✅ | ❌ formula stored as string only | | **Chart type + series extraction** (7 types: bar, line, pie, scatter, area, radar, bubble) | ✅ | ❌ round-trip preservation only | @@ -95,7 +95,7 @@ throughput is your bottleneck, use it. On every feature **both** parsers extract, the drift is zero or near-zero: -| feature | `hucre` | `ks-xlsx-parser` | drift | +| feature | `hucre` | `excel-parser` | drift | |---|---:|---:|:---:| | formulas | 46,411 | 46,433 | 0.05% | | tables | 523 | 523 | **0** | @@ -111,7 +111,7 @@ we parse 16 formulas that hucre misses — we surface this in the drift report, not hide it. The cell-count difference on adversarial merge-heavy files (we emit ~50% -more rows) is a **methodology difference**: `ks-xlsx-parser` counts every +more rows) is a **methodology difference**: `excel-parser` counts every addressable cell in a merged region; hucre counts the master cell only. Both are defensible; document in the drift report generated by the benchmark harness. @@ -120,7 +120,7 @@ benchmark harness. ## Our accuracy commitment -Every perf change in `ks-xlsx-parser` has to pass, in order: +Every perf change in `excel-parser` has to pass, in order: 1. The **1631-test pytest suite** (unit + integration + corpus-slice) 2. **Cross-validation** against [`calamine`](https://github.com/tafia/calamine) — the Rust reference parser — on a golden fixture set @@ -140,12 +140,12 @@ tool. 
## Reproducing these numbers -The benchmark harness lives at [`tests/benchmarks/`](https://github.com/knowledgestack/ks-xlsx-parser/tree/main/tests/benchmarks). -Full details in [`tests/benchmarks/README`](https://github.com/knowledgestack/ks-xlsx-parser/tree/main/tests/benchmarks) +The benchmark harness lives at [`tests/benchmarks/`](https://github.com/knowledgestack/excel-parser/tree/main/tests/benchmarks). +Full details in [`tests/benchmarks/README`](https://github.com/knowledgestack/excel-parser/tree/main/tests/benchmarks) but the short version: ```bash -# From the repo root, in the ks-xlsx-parser venv +# From the repo root, in the excel-parser venv cd tests/benchmarks/hucre_node && pnpm install --frozen-lockfile cd ../../.. diff --git a/docs/wiki/Data-Models.md b/docs/wiki/Data-Models.md index d434934..b66a2b8 100644 --- a/docs/wiki/Data-Models.md +++ b/docs/wiki/Data-Models.md @@ -5,7 +5,7 @@ JSON-serialisable, validated on construction, shipping with `py.typed` so your editor gives you autocomplete and type errors. For the canonical machine-readable spec, see -[`docs/WORKBOOK_GRAPH_SPEC.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/docs/WORKBOOK_GRAPH_SPEC.md). +[`docs/WORKBOOK_GRAPH_SPEC.md`](https://github.com/knowledgestack/excel-parser/blob/main/docs/WORKBOOK_GRAPH_SPEC.md). ## High-level map @@ -183,4 +183,4 @@ Full shape is recursive but roughly: ``` For the exact field-by-field breakdown including every optional field, -see [`docs/WORKBOOK_GRAPH_SPEC.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/docs/WORKBOOK_GRAPH_SPEC.md). +see [`docs/WORKBOOK_GRAPH_SPEC.md`](https://github.com/knowledgestack/excel-parser/blob/main/docs/WORKBOOK_GRAPH_SPEC.md). diff --git a/docs/wiki/Home.md b/docs/wiki/Home.md index 997a9a9..9a99a76 100644 --- a/docs/wiki/Home.md +++ b/docs/wiki/Home.md @@ -1,4 +1,4 @@ -# ks-xlsx-parser Wiki +# excel-parser Wiki Welcome! 
This wiki holds the implementation detail we'd rather keep out of the front-page README so it stays scannable. The code-heavy stuff lives here. @@ -27,28 +27,28 @@ the front-page README so it stays scannable. The code-heavy stuff lives here. ## Related docs in the main repo -- [`README.md`](https://github.com/knowledgestack/ks-xlsx-parser#readme) — +- [`README.md`](https://github.com/knowledgestack/excel-parser#readme) — hero page, architecture diagram, comparison table, community links. -- [`docs/WORKBOOK_GRAPH_SPEC.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/docs/WORKBOOK_GRAPH_SPEC.md) — +- [`docs/WORKBOOK_GRAPH_SPEC.md`](https://github.com/knowledgestack/excel-parser/blob/main/docs/WORKBOOK_GRAPH_SPEC.md) — the canonical specification for the extraction output. -- [`docs/PARSER_KNOWN_ISSUES.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/docs/PARSER_KNOWN_ISSUES.md) — +- [`docs/PARSER_KNOWN_ISSUES.md`](https://github.com/knowledgestack/excel-parser/blob/main/docs/PARSER_KNOWN_ISSUES.md) — known edge cases and how we handle them. -- [`docs/corpora.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/docs/corpora.md) — +- [`docs/corpora.md`](https://github.com/knowledgestack/excel-parser/blob/main/docs/corpora.md) — public benchmark corpora (SpreadsheetBench, EUSES, Enron). -- [`CONTRIBUTING.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/CONTRIBUTING.md) — +- [`CONTRIBUTING.md`](https://github.com/knowledgestack/excel-parser/blob/main/CONTRIBUTING.md) — dev loop, PR checklist, community channels. -- [`CHANGELOG.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/CHANGELOG.md) — +- [`CHANGELOG.md`](https://github.com/knowledgestack/excel-parser/blob/main/CHANGELOG.md) — release history. ## Community - 💬 [Discord](https://discord.gg/4uaGhJcx) — fastest way to get a real answer from a human. 
-- 🗣 [GitHub Discussions](https://github.com/knowledgestack/ks-xlsx-parser/discussions) — +- 🗣 [GitHub Discussions](https://github.com/knowledgestack/excel-parser/discussions) — async Q&A and RFCs. -- 🐞 [Issues](https://github.com/knowledgestack/ks-xlsx-parser/issues/new/choose) — +- 🐞 [Issues](https://github.com/knowledgestack/excel-parser/issues/new/choose) — bugs, feature requests, parser edge cases. Something in the wiki out of date or confusing? Open a PR against -[`docs/wiki/`](https://github.com/knowledgestack/ks-xlsx-parser/tree/main/docs/wiki) +[`docs/wiki/`](https://github.com/knowledgestack/excel-parser/tree/main/docs/wiki) — the wiki is rebuilt from that directory on every release. diff --git a/docs/wiki/Pipeline-Internals.md b/docs/wiki/Pipeline-Internals.md index eb5dcf5..8f89790 100644 --- a/docs/wiki/Pipeline-Internals.md +++ b/docs/wiki/Pipeline-Internals.md @@ -10,19 +10,19 @@ extending the parser or hunting a regression. .xlsx bytes │ ▼ -1. Parse ── src/xlsx_parser/parsers/ openpyxl + lxml → WorkbookDTO -2. Analyse ── src/xlsx_parser/formula/ tokenise, resolve refs - src/xlsx_parser/analysis/ build dependency graph -3. Annotate ── src/xlsx_parser/annotation/ semantic roles, KPIs -4. Segment ── src/xlsx_parser/chunking/ sheets → logical blocks -5. Render ── src/xlsx_parser/rendering/ HTML + pipe-text -6. Serialise ── src/xlsx_parser/storage/ to_json, DB rows, vectors -7. Verify ── src/xlsx_parser/verification/ stage-level assertions -8. Compare/Export── src/xlsx_parser/comparison/ multi-workbook templates - src/xlsx_parser/export/ generated importer classes +1. Parse ── src/excel_parser/parsers/ openpyxl + lxml → WorkbookDTO +2. Analyse ── src/excel_parser/formula/ tokenise, resolve refs + src/excel_parser/analysis/ build dependency graph +3. Annotate ── src/excel_parser/annotation/ semantic roles, KPIs +4. Segment ── src/excel_parser/chunking/ sheets → logical blocks +5. Render ── src/excel_parser/rendering/ HTML + pipe-text +6. 
Serialise ── src/excel_parser/storage/ to_json, DB rows, vectors +7. Verify ── src/excel_parser/verification/ stage-level assertions +8. Compare/Export── src/excel_parser/comparison/ multi-workbook templates + src/excel_parser/export/ generated importer classes ``` -The entry point is `src/xlsx_parser/pipeline.py`. Each stage is an +The entry point is `src/excel_parser/pipeline.py`. Each stage is an independent module you can unit-test in isolation. ## 1. Parse @@ -41,7 +41,7 @@ as a ZIP and parsing the raw OOXML XML with `lxml` (see rather than `ws.iter_rows()`, because the latter walks the full bounding box — a single `XFD1048576` cell otherwise forces a ~17 B empty-cell walk. See -[`CHANGELOG.md`](https://github.com/knowledgestack/ks-xlsx-parser/blob/main/CHANGELOG.md#performance). +[`CHANGELOG.md`](https://github.com/knowledgestack/excel-parser/blob/main/CHANGELOG.md#performance). ## 2. Analyse diff --git a/docs/wiki/Quick-Start.md b/docs/wiki/Quick-Start.md index 0324f26..d593cba 100644 --- a/docs/wiki/Quick-Start.md +++ b/docs/wiki/Quick-Start.md @@ -7,9 +7,9 @@ five short snippets. Each one is runnable standalone against a real ## Install ```bash -pip install ks-xlsx-parser # core library -pip install "ks-xlsx-parser[api]" # + FastAPI web server -pip install "ks-xlsx-parser[dev]" # + test tooling +pip install excel-parser # core library +pip install "excel-parser[api]" # + FastAPI web server +pip install "excel-parser[dev]" # + test tooling ``` Python 3.10+, tested on Ubuntu and macOS. @@ -17,7 +17,7 @@ Python 3.10+, tested on Ubuntu and macOS. ## 1. Parse a workbook ```python -from ks_xlsx_parser import parse_workbook +from excel_parser import parse_workbook result = parse_workbook(path="workbook.xlsx") @@ -54,7 +54,7 @@ Each chunk carries: ## 3. 
Walk the formula dependency graph ```python -from ks_xlsx_parser.models import CellCoord +from excel_parser.models import CellCoord upstream_edges = result.workbook.dependency_graph.get_upstream( sheet="Sheet1", @@ -100,7 +100,7 @@ vectors = ser.to_vector_store_entries() ## 5. Parse from bytes (typical server path) ```python -from ks_xlsx_parser import parse_workbook +from excel_parser import parse_workbook with open("workbook.xlsx", "rb") as f: content = f.read() @@ -137,7 +137,7 @@ def load_spreadsheet(path: str) -> list[dict]: def cite_cell(path: str, sheet: str, a1: str) -> dict: """Fetch one cell with its full context (value, formula, upstream deps).""" - from ks_xlsx_parser.models import CellCoord + from excel_parser.models import CellCoord from openpyxl.utils import coordinate_to_tuple row, col = coordinate_to_tuple(a1) diff --git a/docs/wiki/Web-API.md b/docs/wiki/Web-API.md index 1edd4b2..7f45879 100644 --- a/docs/wiki/Web-API.md +++ b/docs/wiki/Web-API.md @@ -1,12 +1,12 @@ # Web API -`ks-xlsx-parser` ships a FastAPI application with a drag-and-drop UI, so +`excel-parser` ships a FastAPI application with a drag-and-drop UI, so any service that can hit HTTP can use the parser without a Python dep. ## Install ```bash -pip install "ks-xlsx-parser[api]" +pip install "excel-parser[api]" ``` The `[api]` extra pulls in `fastapi`, `uvicorn[standard]`, and @@ -16,10 +16,10 @@ The `[api]` extra pulls in `fastapi`, `uvicorn[standard]`, and ```bash # console entry point, listens on :8080 -xlsx-parser-api +excel-parser-api # or directly with uvicorn -uvicorn xlsx_parser.api:app --reload --port 8080 +uvicorn excel_parser.api:app --reload --port 8080 ``` Open <http://localhost:8080> for the drag-and-drop UI. @@ -106,7 +106,7 @@ Returns `{"status": "ok"}`. Point your load balancer here. 
```python from fastapi import FastAPI -from ks_xlsx_parser.api import app as xlsx_app +from excel_parser.api import app as xlsx_app app = FastAPI() app.mount("/xlsx", xlsx_app) @@ -120,9 +120,9 @@ The API respects a handful of environment variables: | Variable | Default | Purpose | |---|---|---| -| `XLSX_PARSER_MAX_CELLS` | `2000000` | Per-sheet cell cap passed to `parse_workbook`. | -| `XLSX_PARSER_MAX_FILE_MB` | `100` | Reject uploads larger than this before parsing. | -| `XLSX_PARSER_PORT` | `8080` | Port the console entry point listens on. | +| `EXCEL_PARSER_MAX_CELLS` | `2000000` | Per-sheet cell cap passed to `parse_workbook`. | +| `EXCEL_PARSER_MAX_FILE_MB` | `100` | Reject uploads larger than this before parsing. | +| `EXCEL_PARSER_PORT` | `8080` | Port the console entry point listens on. | Production users typically front it with Nginx or Caddy for TLS + auth. @@ -131,4 +131,4 @@ Production users typically front it with Nginx or Caddy for TLS + auth. An MCP server wrapping the same parse surface is on the roadmap so that Claude Desktop, Cursor, Windsurf, and Zed can call it without any glue code. Track progress or vote on the -[roadmap discussion](https://github.com/knowledgestack/ks-xlsx-parser/discussions). +[roadmap discussion](https://github.com/knowledgestack/excel-parser/discussions). diff --git a/examples/demo.py b/examples/demo.py index de59151..4ab460f 100644 --- a/examples/demo.py +++ b/examples/demo.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Demo script showing the xlsx_parser in action on example workbooks. +Demo script showing the excel_parser in action on example workbooks. 
Run: python examples/demo.py """ @@ -12,8 +12,8 @@ # Add src to path for development sys.path.insert(0, str(Path(__file__).parent.parent / "src")) -from ks_xlsx_parser.pipeline import parse_workbook -from ks_xlsx_parser.utils.logging_config import configure_logging +from excel_parser.pipeline import parse_workbook +from excel_parser.utils.logging_config import configure_logging EXAMPLES_DIR = Path(__file__).parent / "fixtures" @@ -136,7 +136,7 @@ def demo_engineering_calcs(): print(f" Named Ranges: {[nr.name for nr in wb.named_ranges]}") # Show dependency chain for Design Moment (C15) - from ks_xlsx_parser.models import CellCoord + from excel_parser.models import CellCoord upstream = wb.dependency_graph.get_upstream( "Beam Design", CellCoord(row=15, col=3), max_depth=3 ) diff --git a/examples/generate_examples.py b/examples/generate_examples.py index 245caa2..ef8fb6f 100644 --- a/examples/generate_examples.py +++ b/examples/generate_examples.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Generate example Excel workbooks for demonstrating the ks_xlsx_parser. +Generate example Excel workbooks for demonstrating the excel_parser. Creates several representative workbooks in the examples/ folder that showcase the parser's capabilities across different Excel features. diff --git a/examples/stress_test/stress_test_runner.py b/examples/stress_test/stress_test_runner.py index 98128fa..cc61511 100644 --- a/examples/stress_test/stress_test_runner.py +++ b/examples/stress_test/stress_test_runner.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Stress Test Runner for XLSX Parser Pipeline. +Stress Test Runner for Excel Parser Pipeline. Builds progressively complex Excel files, runs them through the parser, and documents any failures, errors, or unexpected behavior. 
Runs in a loop @@ -18,7 +18,7 @@ PROJECT_ROOT = Path(__file__).parent.parent.parent sys.path.insert(0, str(PROJECT_ROOT / "src")) -from ks_xlsx_parser.pipeline import parse_workbook +from excel_parser.pipeline import parse_workbook STRESS_DIR = Path(__file__).parent diff --git a/pyproject.toml b/pyproject.toml index e5b6deb..25f2cf8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,14 +3,14 @@ requires = ["setuptools>=68.0", "wheel"] build-backend = "setuptools.build_meta" [project] -name = "ks-xlsx-parser" +name = "excel-parser" version = "0.2.1" description = "Production-grade Excel Workflow Parser for RAG + auditability systems" readme = "README.md" license = {text = "MIT"} requires-python = ">=3.10" authors = [ - {name = "XLSX Parser Contributors"} + {name = "Excel Parser Contributors"} ] keywords = [ "excel", @@ -38,15 +38,16 @@ dependencies = [ "lxml>=4.9.0", "xxhash>=3.0.0", "tiktoken>=0.5.0", + "xlrd>=2.0.0", # legacy .xls (BIFF) reader; converted to .xlsx in-memory ] [project.urls] -Homepage = "https://github.com/knowledgestack/ks-xlsx-parser" -Repository = "https://github.com/knowledgestack/ks-xlsx-parser" -Documentation = "https://github.com/knowledgestack/ks-xlsx-parser#readme" +Homepage = "https://github.com/knowledgestack/excel-parser" +Repository = "https://github.com/knowledgestack/excel-parser" +Documentation = "https://github.com/knowledgestack/excel-parser#readme" [project.scripts] -xlsx-parser-api = "ks_xlsx_parser.api:main" +excel-parser-api = "excel_parser.api:main" [project.optional-dependencies] api = [ @@ -63,6 +64,7 @@ dev = [ "requests>=2.28.0", "ruff>=0.6.0", "mypy>=1.0", + "xlwt>=1.3.0", # writes legacy .xls fixtures for the converter tests ] # Retrieval-recall benchmark (scripts/eval_retrieval.py). Heavy — only the # benchmark Docker image and `make bench-retrieval` need these. 
@@ -86,10 +88,10 @@ addopts = "-m 'not corpus'" [tool.setuptools.packages.find] where = ["src"] -include = ["ks_xlsx_parser*"] +include = ["excel_parser*"] [tool.setuptools.package-data] -ks_xlsx_parser = ["py.typed"] +excel_parser = ["py.typed"] [tool.ruff] line-length = 110 diff --git a/rust/ks_xlsx_core/Cargo.lock b/rust/excel_core/Cargo.lock similarity index 99% rename from rust/ks_xlsx_core/Cargo.lock rename to rust/excel_core/Cargo.lock index 90bc1a7..c25bf61 100644 --- a/rust/ks_xlsx_core/Cargo.lock +++ b/rust/excel_core/Cargo.lock @@ -170,7 +170,7 @@ dependencies = [ ] [[package]] -name = "ks_xlsx_core" +name = "excel_core" version = "0.1.0" dependencies = [ "calamine", diff --git a/rust/ks_xlsx_core/Cargo.toml b/rust/excel_core/Cargo.toml similarity index 54% rename from rust/ks_xlsx_core/Cargo.toml rename to rust/excel_core/Cargo.toml index a6b22ef..b4e7656 100644 --- a/rust/ks_xlsx_core/Cargo.toml +++ b/rust/excel_core/Cargo.toml @@ -1,11 +1,11 @@ [package] -name = "ks_xlsx_core" +name = "excel_core" version = "0.1.0" edition = "2021" -description = "Rust OOXML fast-path for ks-xlsx-parser: cell values + formulas via calamine, exposed to Python via PyO3. REMOVABLE: see REMOVAL.md when python-calamine upstreams formula support." +description = "Rust OOXML fast-path for excel-parser: cell values + formulas via calamine, exposed to Python via PyO3. REMOVABLE: see REMOVAL.md when python-calamine upstreams formula support." [lib] -name = "ks_xlsx_core" +name = "excel_core" crate-type = ["cdylib"] [dependencies] diff --git a/rust/ks_xlsx_core/README.md b/rust/excel_core/README.md similarity index 71% rename from rust/ks_xlsx_core/README.md rename to rust/excel_core/README.md index 8919ea6..a97718a 100644 --- a/rust/ks_xlsx_core/README.md +++ b/rust/excel_core/README.md @@ -1,6 +1,6 @@ -# ks_xlsx_core +# excel_core -Rust + PyO3 fast-path for `ks-xlsx-parser`. Wraps the `calamine` Rust crate +Rust + PyO3 fast-path for `excel-parser`. 
Wraps the `calamine` Rust crate and exposes cell values **and formulas** to Python. This exists because `python-calamine` (0.6.2) returns cached values only — @@ -14,7 +14,7 @@ path when upstream Python bindings gain formula support. From the repo root, inside the venv: ``` -cd rust/ks_xlsx_core +cd rust/excel_core maturin develop --release ``` @@ -26,11 +26,11 @@ numbers will be misleading. Exactly one function: ```python -import ks_xlsx_core +import excel_core -sheets: list[SheetData] = ks_xlsx_core.read_workbook("workbook.xlsx") +sheets: list[SheetData] = excel_core.read_workbook("workbook.xlsx") # SheetData: {name: str, cells: list[(row, col, value, formula, dtype)]} ``` -All consumers in `ks-xlsx-parser` go through `src/parsers/calamine_core.py`, +All consumers in `excel-parser` go through `src/parsers/calamine_core.py`, never through this module directly. diff --git a/rust/ks_xlsx_core/REMOVAL.md b/rust/excel_core/REMOVAL.md similarity index 80% rename from rust/ks_xlsx_core/REMOVAL.md rename to rust/excel_core/REMOVAL.md index e86df7f..fb9e197 100644 --- a/rust/ks_xlsx_core/REMOVAL.md +++ b/rust/excel_core/REMOVAL.md @@ -1,4 +1,4 @@ -# Removing `ks_xlsx_core` +# Removing `excel_core` This Rust crate exists for one reason: **python-calamine (the Python binding) does not expose formula strings as of 0.6.2**, only cached values via @@ -19,18 +19,18 @@ this crate is redundant. The isolation point is a single Python adapter module. When the dependency flips, these are the only changes required: -1. **Delete this directory**: `rm -rf rust/ks_xlsx_core`. +1. **Delete this directory**: `rm -rf rust/excel_core`. 2. **Drop the maturin build from CI** (wherever it's wired — today it's - invoked as `maturin develop` inside `rust/ks_xlsx_core`). + invoked as `maturin develop` inside `rust/excel_core`). 3. **Rewrite the adapter** at `src/parsers/calamine_core.py` to call `python_calamine` directly for both values and formulas. 
The public surface of that module must not change — the rest of the parser only depends on `read_sheet_cells(path_or_bytes, sheet_name) -> SheetCells`. -4. **Remove the optional-dep entry** `ks-xlsx-core` from `pyproject.toml`. +4. **Remove the optional-dep entry** `excel-core` from `pyproject.toml`. 5. **Run** `pytest tests/` to confirm nothing regresses. -No other code in `src/` imports `ks_xlsx_core` directly — that's by design. -Grep for `ks_xlsx_core` before removing to confirm the only references are +No other code in `src/` imports `excel_core` directly — that's by design. +Grep for `excel_core` before removing to confirm the only references are inside `src/parsers/calamine_core.py` and the Rust crate itself. ## Why not just use `maturin` on pypi's `python-calamine` source? diff --git a/rust/ks_xlsx_core/pyproject.toml b/rust/excel_core/pyproject.toml similarity index 68% rename from rust/ks_xlsx_core/pyproject.toml rename to rust/excel_core/pyproject.toml index 9f28c7b..0eccf14 100644 --- a/rust/ks_xlsx_core/pyproject.toml +++ b/rust/excel_core/pyproject.toml @@ -3,9 +3,9 @@ requires = ["maturin>=1.0,<2.0"] build-backend = "maturin" [project] -name = "ks-xlsx-core" +name = "excel-core" version = "0.1.0" -description = "Rust fast-path for ks-xlsx-parser. Removable when python-calamine upstreams formula support." +description = "Rust fast-path for excel-parser. Removable when python-calamine upstreams formula support." 
requires-python = ">=3.10" readme = "README.md" license = {text = "MIT"} @@ -15,6 +15,6 @@ classifiers = [ ] [tool.maturin] -module-name = "ks_xlsx_core" +module-name = "excel_core" bindings = "pyo3" features = ["pyo3/extension-module"] diff --git a/rust/ks_xlsx_core/src/formula.rs b/rust/excel_core/src/formula.rs similarity index 100% rename from rust/ks_xlsx_core/src/formula.rs rename to rust/excel_core/src/formula.rs diff --git a/rust/ks_xlsx_core/src/lib.rs b/rust/excel_core/src/lib.rs similarity index 97% rename from rust/ks_xlsx_core/src/lib.rs rename to rust/excel_core/src/lib.rs index 3b81032..d47bb07 100644 --- a/rust/ks_xlsx_core/src/lib.rs +++ b/rust/excel_core/src/lib.rs @@ -1,4 +1,4 @@ -//! ks_xlsx_core — PyO3 bindings exposing `calamine`'s cell values **and** +//! excel_core — PyO3 bindings exposing `calamine`'s cell values **and** //! formula strings to Python. //! //! Rationale lives in REMOVAL.md. Scope is intentionally minimal: one entry @@ -141,7 +141,7 @@ fn read_workbook(py: Python<'_>, path: &str) -> PyResult<PyObject> { } #[pymodule] -fn ks_xlsx_core(m: &Bound<'_, PyModule>) -> PyResult<()> { +fn excel_core(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(read_workbook, m)?)?; m.add_function(wrap_pyfunction!(formula::scan_formula, m)?)?; m.add("__version__", env!("CARGO_PKG_VERSION"))?; diff --git a/scripts/append_bench_history.py b/scripts/append_bench_history.py index 47cf539..512c35f 100755 --- a/scripts/append_bench_history.py +++ b/scripts/append_bench_history.py @@ -47,7 +47,7 @@ def main(argv: list[str] | None = None) -> int: ap = argparse.ArgumentParser(description=__doc__) ap.add_argument("--reports-dir", type=Path, default=ROOT / "tests" / "benchmarks" / "reports" / "retrieval") - ap.add_argument("--parser", default="ks-xlsx-parser", + ap.add_argument("--parser", default="excel-parser", help="which parser's metrics to record") args = ap.parse_args(argv) diff --git a/scripts/download_corpora.sh 
b/scripts/download_corpora.sh index 2ed68ef..2798466 100755 --- a/scripts/download_corpora.sh +++ b/scripts/download_corpora.sh @@ -96,12 +96,43 @@ fetch_targz() { echo "✓ $name: $count xlsx files" } +fetch_zip_verbatim() { + # Download a .zip and extract it into $CORPUS_DIR/<name>/ preserving the + # archive's directory structure (unlike fetch_zip, which flattens to *.xlsx). + local name="$1" + local url="$2" + local dest="$CORPUS_DIR/$name" + + if [ -d "$dest" ]; then + echo "✓ $name already present, skipping" + return + fi + + echo "→ Downloading $name ..." + local zip_path="$TMP_DIR/$name.zip" + curl -L --fail --retry 3 --connect-timeout 20 -o "$zip_path" "$url" + + mkdir -p "$dest" + unzip -q "$zip_path" -d "$dest" + + local count + count="$(find "$dest" -type f -name '*.xlsx' | wc -l | tr -d ' ')" + echo "✓ $name: $count xlsx files" +} + # SpreadsheetBench (RUC-KB 2024): 912 task instances × ~6 files each (input + answer # across 3 test cases) = ~5,458 real-world xlsx files curated from ExcelHome / # Mr.Excel / r/excel. dataset.json contains (instruction, answer_sheet, # answer_position) tuples we use for retrieval-recall@k evaluation. fetch_targz "spreadsheetbench" "https://raw.githubusercontent.com/RUCKBReasoning/SpreadsheetBench/main/data/spreadsheetbench_912_v0.1.tar.gz" +# DECO (Dresden Enron COrpus, ICDAR 2019): 852 real .xlsx under completed/ with a +# hidden `Range_Annotations_Data` sheet giving per-sheet table regions + header/ +# data/derived row ranges — the only public xlsx corpus with cell-level table + +# header ground truth. Drives scripts/eval_deco.py (table-IoU + header-row F1 vs +# Docling). not_applicable/ holds table-free sheets (no annotations). 
+fetch_zip_verbatim "deco" "https://github.com/ddenron/deco_dataset/raw/master/annotated_files/annotated.zip" + # EUSES (mostly .xls, but keep any .xlsx present) fetch_zip "euses" "https://zenodo.org/records/581673/files/EUSES.zip" diff --git a/scripts/enrich_failures.py b/scripts/enrich_failures.py index 5f931e0..07f724c 100755 --- a/scripts/enrich_failures.py +++ b/scripts/enrich_failures.py @@ -7,7 +7,7 @@ A1 range covers the ground truth — those don't show up in failures.ndjson at all. -This script re-parses each instance's input.xlsx with both ks-xlsx-parser +This script re-parses each instance's input.xlsx with both excel-parser and openpyxl, then emits one row per FAILED instance (text-miss OR geometric-miss) with diagnostic columns chosen so post-hoc clustering is easy: @@ -92,7 +92,7 @@ def chunk_bbox(chunks) -> tuple[int, int, int, int] | None: def enrich(run_dir: Path, corpus: Path, out_path: Path) -> None: from openpyxl import load_workbook - from ks_xlsx_parser.pipeline import parse_workbook + from excel_parser.pipeline import parse_workbook # Load dataset.json once — we need question text + the original # answer_sheet attribution for instances where the rank scoring diff --git a/scripts/eval_deco.py b/scripts/eval_deco.py new file mode 100644 index 0000000..c2d71f6 --- /dev/null +++ b/scripts/eval_deco.py @@ -0,0 +1,560 @@ +""" +DECO structural benchmark — table-boundary + header-row detection, ks vs docling. + +DECO (Dresden Enron COrpus) ships 852 real spreadsheets where annotators marked, +for every sheet, the *table regions* and, inside each table, the *header* / *data* +/ *derived* row ranges. Those annotations live in a hidden worksheet named +``Range_Annotations_Data`` with columns: + + Sheet.Name, Sheet.Index, Annotation.Label, Annotation.Name, + Annotation.Range, Annotation.Parent, ... + +A ``Table`` row's parent is the worksheet; a ``Header``/``Data``/... row's parent +is the ``Annotation.Name`` of its table. 
That gives us ground truth for two things +SpreadsheetBench cannot score: + + 1. Table-boundary detection — where does each table start/end on the sheet. + 2. Header-row detection — which row(s) are the column header of a table. + +We score each parser against that GT: + + * ks — full: table-region IoU, fragmentation, plus header-row P/R/F1 using + the SHIPPED ``find_header_span`` (the single source of truth the + renderer + segmenter use). + * docling — xlsx → markdown collapses to ~one table per sheet and exposes no A1 + coordinates, so it cannot be scored on localisation. We score the one + axis it *can* be measured on: tables-detected-per-sheet vs GT count + (its multi-table "collapse" rate). + +Run one parser per process (memory isolation), then a report pass: + + PYTHONPATH=src uv run python scripts/eval_deco.py --parser ks --out RUN + PYTHONPATH=src uv run python scripts/eval_deco.py --parser docling --out RUN + PYTHONPATH=src uv run python scripts/eval_deco.py --report --out RUN +""" +from __future__ import annotations + +import argparse +import gc +import glob +import json +import os +import re +import sys +from collections import defaultdict +from pathlib import Path + +import openpyxl + +# ───────────────────────────────────────────────────────────── A1 helpers +_A1 = re.compile(r"\$?([A-Za-z]+)\$?(\d+)") + + +def _col_to_num(letters: str) -> int: + n = 0 + for ch in letters.upper(): + n = n * 26 + (ord(ch) - ord("A") + 1) + return n + + +def parse_a1_range(spec: str) -> tuple[int, int, int, int] | None: + """'$A$5:$S$44' or 'A5' → (r0, c0, r1, c1), 1-based inclusive. 
None if bad.""" + cells = _A1.findall(spec or "") + if not cells: + return None + (c_lo, r_lo) = cells[0] + (c_hi, r_hi) = cells[-1] + r0, c0 = int(r_lo), _col_to_num(c_lo) + r1, c1 = int(r_hi), _col_to_num(c_hi) + return (min(r0, r1), min(c0, c1), max(r0, r1), max(c0, c1)) + + +def _area(b: tuple[int, int, int, int]) -> int: + return (b[2] - b[0] + 1) * (b[3] - b[1] + 1) + + +def _inter(a: tuple[int, int, int, int], b: tuple[int, int, int, int]) -> int: + r0, c0 = max(a[0], b[0]), max(a[1], b[1]) + r1, c1 = min(a[2], b[2]), min(a[3], b[3]) + if r1 < r0 or c1 < c0: + return 0 + return (r1 - r0 + 1) * (c1 - c0 + 1) + + +def iou(a: tuple[int, int, int, int], b: tuple[int, int, int, int]) -> float: + i = _inter(a, b) + if i == 0: + return 0.0 + return i / (_area(a) + _area(b) - i) + + +def rows_of(span: tuple[int, int, int, int]) -> set[int]: + return set(range(span[0], span[2] + 1)) + + +# ───────────────────────────────────────────────────────────── GT loader +ANNOTATION_SHEETS = {"Range_Annotations_Data", "Annotation_Status_Data"} + + +def load_gt(path: str) -> dict[str, list[dict]]: + """Return {sheet_name: [ {range, headers:[ranges], name}, ... 
]}.""" + try: + wb = openpyxl.load_workbook(path, read_only=True, data_only=True) + except Exception: + return {} + if "Range_Annotations_Data" not in wb.sheetnames: + wb.close() + return {} + ws = wb["Range_Annotations_Data"] + rows = list(ws.iter_rows(values_only=True)) + wb.close() + if not rows: + return {} + # tables keyed by their Annotation.Name; children attach by parent name + tables: dict[str, dict] = {} + children: list[tuple[str, str, tuple]] = [] # (label, parent, range) + for r in rows[1:]: + if not r or len(r) < 6 or not r[2]: + continue + sheet_name, _idx, label, name, rng, parent = ( + r[0], r[1], r[2], r[3], r[4], r[5], + ) + box = parse_a1_range(rng or "") + if box is None: + continue + if label == "Table": + tables[name] = {"sheet": sheet_name, "range": box, "headers": [], "name": name} + else: + children.append((label, parent, box)) + for label, parent, box in children: + if label == "Header" and parent in tables: + tables[parent]["headers"].append(box) + out: dict[str, list[dict]] = defaultdict(list) + for t in tables.values(): + out[t["sheet"]].append(t) + return out + + +# ───────────────────────────────────────────────────────────── ks scorer +def score_ks(path: str, gt: dict[str, list[dict]]) -> dict: + from excel_parser.analysis.header_detector import find_header_span + from excel_parser.models.common import CellCoord, CellRange + from excel_parser.pipeline import parse_workbook + + result = parse_workbook(path) + sheets = {s.sheet_name: s for s in result.workbook.sheets} + # ks candidate table regions per sheet (chunks that carry a cell_range) + regions: dict[str, list[tuple[int, int, int, int]]] = defaultdict(list) + for ch in result.chunks: + if ch.cell_range is None: + continue + cr = ch.cell_range + regions[ch.sheet_name].append( + (cr.top_left.row, cr.top_left.col, cr.bottom_right.row, cr.bottom_right.col) + ) + + def header_rows_for(span: tuple[int, int, int, int], sheet_name: str) -> set[int]: + sheet = sheets.get(sheet_name) + if 
sheet is None: + return set() + cr = CellRange( + top_left=CellCoord(row=span[0], col=span[1]), + bottom_right=CellCoord(row=span[2], col=span[3]), + ) + hs = find_header_span(sheet, cr) + return set() if hs is None else set(range(hs.top, hs.bottom + 1)) + + tbl_records = [] + for sheet_name, gtables in gt.items(): + cand = regions.get(sheet_name, []) + for t in gtables: + gbox = t["range"] + best_iou = max((iou(gbox, c) for c in cand), default=0.0) + best_region = max(cand, key=lambda c: iou(gbox, c), default=None) if cand else None + frags = sum(1 for c in cand if _inter(gbox, c) > 0) + gt_hrows = set() + for h in t["headers"]: + gt_hrows |= rows_of(h) + # end-to-end: header span ks computes on its best-matched region + ks_h_e2e = header_rows_for(best_region, sheet_name) if best_region else set() + # isolated: header detector accuracy on the GT region itself + ks_h_iso = header_rows_for(gbox, sheet_name) + tbl_records.append({ + "sheet": sheet_name, + "gt_range": gbox, + "best_iou": round(best_iou, 4), + "frags": frags, + "has_gt_header": bool(gt_hrows), + "gt_header_multirow": bool(gt_hrows) and (len({r for r in gt_hrows}) > 1), + "gt_hrows": sorted(gt_hrows), + "ks_hrows_e2e": sorted(ks_h_e2e), + "ks_hrows_iso": sorted(ks_h_iso), + }) + del result + gc.collect() + return {"file": os.path.basename(path), "parser": "ks", "tables": tbl_records} + + +# ───────────────────────────────────────────────────────────── docling scorer +_DOC_CONV = None + + +def _docling_converter(): + global _DOC_CONV + if _DOC_CONV is None: + from docling.document_converter import DocumentConverter + + _DOC_CONV = DocumentConverter() + return _DOC_CONV + + +def score_docling(path: str, gt: dict[str, list[dict]]) -> dict: + conv = _docling_converter() + real_sheets = set(gt.keys()) # GT sheets are the real (non-annotation) sheets + try: + doc = conv.convert(path).document + except Exception as exc: # noqa: BLE001 + return {"file": os.path.basename(path), "parser": "docling", + "error": 
f"{type(exc).__name__}: {exc}", "per_sheet": []} + # map tables → sheet name via provenance page_no (1 page per sheet, in order) + page_names = {} + try: + for pno, page in (doc.pages or {}).items(): + page_names[pno] = getattr(page, "name", None) + except Exception: + page_names = {} + # fallback: page order maps to workbook sheet order + sheet_order = list(_sheet_order(path)) + docling_counts: dict[str, int] = defaultdict(int) + for t in doc.tables: + sname = None + try: + prov = t.prov[0] if t.prov else None + if prov is not None: + pno = prov.page_no + sname = page_names.get(pno) + if sname is None and 1 <= pno <= len(sheet_order): + sname = sheet_order[pno - 1] + except Exception: + sname = None + if sname is None: + sname = "__unknown__" + docling_counts[sname] += 1 + per_sheet = [] + for sheet_name, gtables in gt.items(): + per_sheet.append({ + "sheet": sheet_name, + "gt_tables": len(gtables), + "docling_tables": docling_counts.get(sheet_name, 0), + }) + # if mapping failed entirely, also record total docling tables on real sheets + total_docling = sum(v for k, v in docling_counts.items() + if k in real_sheets or k == "__unknown__") + del doc + gc.collect() + return {"file": os.path.basename(path), "parser": "docling", + "per_sheet": per_sheet, "total_docling_tables_unmapped": total_docling} + + +def _sheet_order(path: str) -> list[str]: + # Full workbook sheet order (incl. annotation sheets) so docling's 1-based + # page_no aligns exactly with the sheet at that position. Annotation sheets + # are dropped at scoring time because they never appear in the GT. 
+ try: + wb = openpyxl.load_workbook(path, read_only=True) + names = list(wb.sheetnames) + wb.close() + return names + except Exception: + return [] + + +# ───────────────────────────────────────────────────────────── runner +def iter_files(corpus: str, sample: int | None) -> list[str]: + files = sorted(glob.glob(os.path.join(corpus, "*.xlsx"))) + if sample: + # deterministic stride sample for reproducibility + step = max(1, len(files) // sample) + files = files[::step][:sample] + return files + + +class _Timeout(Exception): + pass + + +def run_parser(parser: str, files: list[str], out_dir: Path, timeout: int = 60) -> None: + import signal + + def _on_alarm(signum, frame): + raise _Timeout() + + signal.signal(signal.SIGALRM, _on_alarm) + + out_path = out_dir / f"{parser}.ndjson" + scorer = score_ks if parser == "ks" else score_docling + n = timeouts = errors = 0 + with out_path.open("w") as fh: + for path in files: + gt = load_gt(path) + if not gt: + continue + signal.alarm(timeout) + try: + rec = scorer(path, gt) + except _Timeout: + timeouts += 1 + rec = {"file": os.path.basename(path), "parser": parser, + "error": f"timeout>{timeout}s"} + except Exception as exc: # noqa: BLE001 + errors += 1 + rec = {"file": os.path.basename(path), "parser": parser, + "error": f"{type(exc).__name__}: {exc}"} + finally: + signal.alarm(0) + fh.write(json.dumps(rec) + "\n") + fh.flush() + n += 1 + if n % 25 == 0: + print(f" {parser}: {n}/{len(files)} scored " + f"({timeouts} timeout, {errors} err)", file=sys.stderr, flush=True) + print(f"✓ {parser}: wrote {n} records ({timeouts} timeout, {errors} err) → {out_path}", + file=sys.stderr) + + +# ───────────────────────────────────────────────────────────── report +def _prf(pred: set[int], gt: set[int]) -> tuple[int, int, int]: + """Return (true_pos, pred_size, gt_size).""" + return (len(pred & gt), len(pred), len(gt)) + + +def report(out_dir: Path) -> None: + ks_recs = _read_ndjson(out_dir / "ks.ndjson") + doc_recs = 
_read_ndjson(out_dir / "docling.ndjson") + + # ── ks table + header aggregates + n_tables = 0 + iou_sum = 0.0 + det50 = 0 + det30 = 0 + frag_sum = 0 + frag_vals: list[int] = [] + over_seg = 0 # GT table split across >1 ks region + # header (only tables with a GT header) + h_tables = 0 + h_multirow = 0 + tp_e2e = pred_e2e = gtsz_e2e = 0 + tp_iso = pred_iso = gtsz_iso = 0 + h_detected_e2e = 0 # ks emitted a non-empty header + h_exact_iso = 0 + ks_timeouts = sum(1 for r in ks_recs if "timeout" in (r.get("error") or "")) + ks_errors = sum(1 for r in ks_recs if r.get("error") and "timeout" not in r["error"]) + # per-file macro accumulators (dampen any single huge file, e.g. the 130-table UAMPS sheet) + macro_iou: list[float] = [] + macro_hf1: list[float] = [] + for rec in ks_recs: + f_iou: list[float] = [] + f_tp = f_pred = f_gt = 0 + for t in rec.get("tables", []): + f_iou.append(t["best_iou"]) + if t["has_gt_header"]: + e = set(t["ks_hrows_e2e"]) + g = set(t["gt_hrows"]) + f_tp += len(e & g) + f_pred += len(e) + f_gt += len(g) + if f_iou: + macro_iou.append(sum(f_iou) / len(f_iou)) + p = f_tp / f_pred if f_pred else 0.0 + r = f_tp / f_gt if f_gt else 0.0 + macro_hf1.append(2 * p * r / (p + r) if (p + r) else 0.0) + for rec in ks_recs: + for t in rec.get("tables", []): + n_tables += 1 + iou_sum += t["best_iou"] + det50 += t["best_iou"] >= 0.5 + det30 += t["best_iou"] >= 0.3 + frag_sum += t["frags"] + frag_vals.append(t["frags"]) + over_seg += t["frags"] > 1 + if t["has_gt_header"]: + h_tables += 1 + gt_h = set(t["gt_hrows"]) + if len(gt_h) > 1: + h_multirow += 1 + e2e = set(t["ks_hrows_e2e"]) + iso = set(t["ks_hrows_iso"]) + a, b, c = _prf(e2e, gt_h) + tp_e2e += a + pred_e2e += b + gtsz_e2e += c + h_detected_e2e += bool(e2e) + a, b, c = _prf(iso, gt_h) + tp_iso += a + pred_iso += b + gtsz_iso += c + h_exact_iso += (iso == gt_h) + + def f1(tp, pred, gt): + p = tp / pred if pred else 0.0 + r = tp / gt if gt else 0.0 + f = 2 * p * r / (p + r) if (p + r) else 0.0 + return 
p, r, f + + p_e2e, r_e2e, f_e2e = f1(tp_e2e, pred_e2e, gtsz_e2e) + p_iso, r_iso, f_iso = f1(tp_iso, pred_iso, gtsz_iso) + + def _median(xs: list) -> float: + if not xs: + return 0.0 + s = sorted(xs) + m = len(s) // 2 + return float(s[m]) if len(s) % 2 else (s[m - 1] + s[m]) / 2 + + # ── docling tables-per-sheet + d_sheets = 0 + d_abs_err = 0 + d_tables = 0 + d_table_vals: list[int] = [] + d_over = 0 # docling emitted MORE tables than GT (over-segment) + d_under = 0 # docling emitted FEWER (collapse/miss) + d_exact = 0 + d_errors = 0 + for rec in doc_recs: + if rec.get("error"): + d_errors += 1 + continue + for s in rec.get("per_sheet", []): + d_sheets += 1 + gt_n = s["gt_tables"] + dn = s["docling_tables"] + d_tables += dn + d_table_vals.append(dn) + d_abs_err += abs(dn - gt_n) + if dn > gt_n: + d_over += 1 + elif dn < gt_n: + d_under += 1 + else: + d_exact += 1 + + lines = [] + lines.append("# DECO structural benchmark — ks vs docling\n") + lines.append(f"Corpus: DECO `completed/` · GT tables scored: **{n_tables}** " + f"(across {len(ks_recs)} files)\n") + lines.append(f"ks parse: {ks_timeouts} files timed out, {ks_errors} errored " + f"(excluded from metrics below).\n") + lines.append("## Table-boundary detection (ks — needs A1 localisation)\n") + lines.append("| metric | value |") + lines.append("|---|---|") + lines.append(f"| mean best-IoU vs GT table | {iou_sum / n_tables:.3f} |" if n_tables else "| mean best-IoU | n/a |") + lines.append(f"| detected @ IoU≥0.5 | {det50}/{n_tables} ({100*det50/n_tables:.1f}%) |") + lines.append(f"| detected @ IoU≥0.3 | {det30}/{n_tables} ({100*det30/n_tables:.1f}%) |") + lines.append(f"| mean best-IoU, macro by file | {sum(macro_iou)/len(macro_iou):.3f} |" if macro_iou else "") + lines.append(f"| ks regions overlapping one GT table (fragmentation) | mean {frag_sum / n_tables:.2f}, median {_median(frag_vals):.0f} |") + lines.append(f"| GT tables split across >1 ks region | {over_seg}/{n_tables} ({100*over_seg/n_tables:.1f}%) |") 
+ lines.append("") + lines.append("## Header-row detection (ks — shipped `find_header_span`)\n") + lines.append(f"GT tables with a header: **{h_tables}** · of which multi-row: " + f"**{h_multirow}** ({100*h_multirow/h_tables:.1f}%)\n" if h_tables else "No GT headers.\n") + lines.append("| metric | precision | recall | F1 |") + lines.append("|---|---|---|---|") + lines.append(f"| end-to-end (header on ks's own region) | {p_e2e:.3f} | {r_e2e:.3f} | {f_e2e:.3f} |") + lines.append(f"| isolated (header on GT region) | {p_iso:.3f} | {r_iso:.3f} | {f_iso:.3f} |") + if macro_hf1: + lines.append(f"| end-to-end, macro F1 by file | | | {sum(macro_hf1)/len(macro_hf1):.3f} |") + lines.append("") + lines.append(f"- ks emitted a non-empty header for **{h_detected_e2e}/{h_tables}** " + f"({100*h_detected_e2e/h_tables:.1f}%) GT-headered tables (end-to-end).") + lines.append(f"- exact header-row match (isolated): **{h_exact_iso}/{h_tables}** " + f"({100*h_exact_iso/h_tables:.1f}%).") + lines.append("") + lines.append("## Tables-per-sheet (docling — its only measurable axis)\n") + lines.append("docling emits no A1 coordinates, so it can't be scored on IoU or " + "header rows. 
The one axis it exposes is how many table objects it " + "produces per sheet vs the GT count.\n") + if d_sheets: + lines.append("| metric | value |") + lines.append("|---|---|") + lines.append(f"| GT sheets scored | {d_sheets} |") + lines.append(f"| docling tables per sheet | mean {d_tables / d_sheets:.2f}, median {_median(d_table_vals):.0f} |") + lines.append(f"| mean \\|docling − GT\\| tables per sheet | {d_abs_err / d_sheets:.2f} |") + lines.append(f"| sheets where docling = GT count | {d_exact}/{d_sheets} ({100*d_exact/d_sheets:.1f}%) |") + lines.append(f"| sheets where docling **over**-segments (>GT) | {d_over}/{d_sheets} ({100*d_over/d_sheets:.1f}%) |") + lines.append(f"| sheets where docling **under**-counts (<GT) | {d_under}/{d_sheets} ({100*d_under/d_sheets:.1f}%) |") + lines.append(f"| docling convert errors | {d_errors} files |") + else: + lines.append("_No docling records (run `--parser docling` first)._") + lines.append("") + + (out_dir / "summary.md").write_text("\n".join(lines)) + summary = { + "n_tables": n_tables, + "ks": { + "mean_best_iou": iou_sum / n_tables if n_tables else None, + "detect_at_0.5": det50 / n_tables if n_tables else None, + "detect_at_0.3": det30 / n_tables if n_tables else None, + "mean_fragments": frag_sum / n_tables if n_tables else None, + "over_segmented_frac": over_seg / n_tables if n_tables else None, + "header_tables": h_tables, + "header_multirow": h_multirow, + "header_e2e": {"precision": p_e2e, "recall": r_e2e, "f1": f_e2e}, + "header_iso": {"precision": p_iso, "recall": r_iso, "f1": f_iso}, + "header_detected_e2e_frac": h_detected_e2e / h_tables if h_tables else None, + "header_exact_iso_frac": h_exact_iso / h_tables if h_tables else None, + }, + "docling": { + "sheets": d_sheets, + "mean_tables_per_sheet": d_tables / d_sheets if d_sheets else None, + "mean_abs_table_count_err": d_abs_err / d_sheets if d_sheets else None, + "exact_frac": d_exact / d_sheets if d_sheets else None, + "over_segment_frac": d_over / 
d_sheets if d_sheets else None, + "under_count_frac": d_under / d_sheets if d_sheets else None, + "convert_errors": d_errors, + }, + } + (out_dir / "summary.json").write_text(json.dumps(summary, indent=2)) + print("\n".join(lines)) + print(f"\n✓ report → {out_dir}/summary.md") + + +def _read_ndjson(path: Path) -> list[dict]: + if not path.exists(): + return [] + out = [] + with path.open() as fh: + for line in fh: + line = line.strip() + if line: + out.append(json.loads(line)) + return out + + +# ───────────────────────────────────────────────────────────── main +def main() -> int: + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--corpus", default="data/corpora/deco/completed") + ap.add_argument("--parser", choices=["ks", "docling"]) + ap.add_argument("--report", action="store_true") + ap.add_argument("--sample", type=int, default=None) + ap.add_argument("--timeout", type=int, default=60, help="per-file timeout (s)") + ap.add_argument("--out", required=True, help="output run directory") + args = ap.parse_args() + + out_dir = Path(args.out) + out_dir.mkdir(parents=True, exist_ok=True) + + if args.report: + report(out_dir) + return 0 + if not args.parser: + ap.error("specify --parser ks|docling or --report") + + files = iter_files(args.corpus, args.sample) + print(f"{args.parser}: {len(files)} candidate files from {args.corpus}", file=sys.stderr) + run_parser(args.parser, files, out_dir, timeout=args.timeout) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/eval_retrieval.py b/scripts/eval_retrieval.py index d40b729..77c3f1f 100644 --- a/scripts/eval_retrieval.py +++ b/scripts/eval_retrieval.py @@ -42,7 +42,7 @@ REPO_ROOT = Path(__file__).resolve().parent.parent # Keep ``import scripts.X`` style imports working when invoked as # ``python scripts/eval_retrieval.py``. We no longer need ``src`` on the -# path — ks_xlsx_parser is a properly-installed package now. 
+# path — excel_parser is a properly-installed package now. sys.path.insert(0, str(REPO_ROOT)) @@ -168,8 +168,14 @@ class Chunk: chunk_id: str = "" def overlaps(self, sheet: str, range_box: tuple[int, int, int, int]) -> bool: - """True if this chunk's range overlaps the given (r0,c0,r1,c1) on `sheet`.""" - if self.sheet is not None and self.sheet != sheet: + """True if this chunk's range overlaps the given (r0,c0,r1,c1) on `sheet`. + + When ``sheet`` is unspecified (None/empty) the match is geometry-only: + ~62% of SpreadsheetBench instances omit the sheet name (single-sheet + workbooks), and a real chunk's sheet name can never equal "". Requiring + equality there silently denied geometric credit to correct chunks. + """ + if sheet and self.sheet is not None and self.sheet != sheet: return False if self.top_left is None or self.bottom_right is None: # Parser didn't surface a range — fall back to text match @@ -281,7 +287,7 @@ def parse_position_spec( def extract_chunks_ks(path: Path) -> list[Chunk]: - from ks_xlsx_parser.pipeline import parse_workbook + from excel_parser.pipeline import parse_workbook result = parse_workbook(path=str(path)) out: list[Chunk] = [] @@ -289,7 +295,7 @@ def extract_chunks_ks(path: Path) -> list[Chunk]: tl = parse_a1(c.top_left_cell) if c.top_left_cell else None br = parse_a1(c.bottom_right_cell) if c.bottom_right_cell else None out.append(Chunk( - parser="ks-xlsx-parser", + parser="excel-parser", sheet=c.sheet_name, top_left=tl, bottom_right=br, @@ -807,12 +813,24 @@ def aggregate( r for r in recs if not exec_map.get(str(r.instance_id)) ] - def _recall_at(k: int, key: str, subset: list[InstanceResult]) -> float: + def _recall_at( + k: int, + key: str, + subset: list[InstanceResult], + require_values: bool = False, + ) -> float: hits = 0 denom = 0 for r in subset: if r.error: continue + # Text-match recall is undefined for instances whose answer + # region holds no scoreable values (empty / uncached-formula + # answer.xlsx cells). 
The bucket histogram already excludes + # these via `had_answer_values`; the recall denominator must be + # consistent and not count an unscoreable instance as a miss. + if require_values and not r.extra.get("had_answer_values", True): + continue rank = getattr(r, key) if rank is None: denom += 1 # parser produced chunks but missed the answer @@ -861,9 +879,9 @@ def _recall_at(k: int, key: str, subset: list[InstanceResult]) -> float: "recall_geometric@1": _recall_at(1, "rank_of_first_overlap", recs), "recall_geometric@3": _recall_at(3, "rank_of_first_overlap", recs), "recall_geometric@5": _recall_at(5, "rank_of_first_overlap", recs), - "recall_text@1": _recall_at(1, "rank_of_text_match", recs), - "recall_text@3": _recall_at(3, "rank_of_text_match", recs), - "recall_text@5": _recall_at(5, "rank_of_text_match", recs), + "recall_text@1": _recall_at(1, "rank_of_text_match", recs, require_values=True), + "recall_text@3": _recall_at(3, "rank_of_text_match", recs, require_values=True), + "recall_text@5": _recall_at(5, "rank_of_text_match", recs, require_values=True), # Recall over the subset of instances the parser can possibly # satisfy (execution-required questions excluded). This is the # metric the recall-90 roadmap gates on; see cluster-05 doc. 
@@ -874,11 +892,11 @@ def _recall_at(k: int, key: str, subset: list[InstanceResult]) -> float: "recall_geometric@5_in_scope": _recall_at(5, "rank_of_first_overlap", in_scope_recs), "recall_text@1_in_scope": - _recall_at(1, "rank_of_text_match", in_scope_recs), + _recall_at(1, "rank_of_text_match", in_scope_recs, require_values=True), "recall_text@3_in_scope": - _recall_at(3, "rank_of_text_match", in_scope_recs), + _recall_at(3, "rank_of_text_match", in_scope_recs, require_values=True), "recall_text@5_in_scope": - _recall_at(5, "rank_of_text_match", in_scope_recs), + _recall_at(5, "rank_of_text_match", in_scope_recs, require_values=True), "table_integrity_clean": n_clean, "table_integrity_fragmented": n_frag, "table_fragmentation_rate": round(frag_rate, 4), @@ -943,7 +961,7 @@ def main(argv: list[str] | None = None) -> int: selected = {p.strip() for p in args.parsers.split(",")} parser_fns: dict[str, Any] = {} if "ks" in selected: - parser_fns["ks-xlsx-parser"] = extract_chunks_ks + parser_fns["excel-parser"] = extract_chunks_ks if "docling" in selected: parser_fns["docling"] = extract_chunks_docling if not parser_fns: diff --git a/scripts/publish_wiki.sh b/scripts/publish_wiki.sh index 4db26cb..a40fb96 100755 --- a/scripts/publish_wiki.sh +++ b/scripts/publish_wiki.sh @@ -16,7 +16,7 @@ set -euo pipefail -REPO="${REPO:-knowledgestack/ks-xlsx-parser}" +REPO="${REPO:-knowledgestack/excel-parser}" SRC_DIR="$(cd "$(dirname "$0")/.." && pwd)/docs/wiki" TMP_DIR="$(mktemp -d)" DRY_RUN=0 diff --git a/scripts/run_bench.sh b/scripts/run_bench.sh index 05a77fb..00b506a 100755 --- a/scripts/run_bench.sh +++ b/scripts/run_bench.sh @@ -2,7 +2,7 @@ # Entrypoint for the benchmark Docker image (Dockerfile.bench). 
# # Ensures the SpreadsheetBench corpus is present, runs the retrieval-recall -# benchmark for ks-xlsx-parser, appends the result to history.jsonl, and +# benchmark for excel-parser, appends the result to history.jsonl, and # prints a failure-bucket triage so accuracy can be tracked over time. # # Env vars: diff --git a/scripts/run_enterprise_metrics.py b/scripts/run_enterprise_metrics.py index 6366fbd..eb25e98 100644 --- a/scripts/run_enterprise_metrics.py +++ b/scripts/run_enterprise_metrics.py @@ -8,7 +8,7 @@ import json from scripts.generate_enterprise_fixtures import generate_all -from xlsx_parser import parse_workbook +from excel_parser import parse_workbook class EnterpriseScorecard: diff --git a/scripts/verify_wheel.py b/scripts/verify_wheel.py index 26801f3..33028b0 100755 --- a/scripts/verify_wheel.py +++ b/scripts/verify_wheel.py @@ -4,7 +4,7 @@ This is the regression guard for the v0.2.0 packaging bug: ``pipeline.py`` and ``api.py`` were top-level modules under ``src/`` and ``setuptools`` ``packages.find`` only picks up *packages*, so they were silently dropped -from the wheel — ``from ks_xlsx_parser.pipeline import ...`` failed for +from the wheel — ``from excel_parser.pipeline import ...`` failed for every installed user. The flat layout also leaked 13 generic top-level packages (``models``, ``utils``, ``parsers`` ...) into ``site-packages``. @@ -23,13 +23,13 @@ ROOT = Path(__file__).resolve().parent.parent # Imports a real downstream consumer relies on. Keep in sync with the -# public surface in ks_xlsx_parser/__init__.py. +# public surface in excel_parser/__init__.py. 
SMOKE_IMPORTS = [ - "from ks_xlsx_parser import parse_workbook, ParseResult", - "from ks_xlsx_parser.pipeline import parse_workbook", - "from ks_xlsx_parser.verification import StageVerifier", - "from ks_xlsx_parser.analysis.table_assembler import TableAssembler", - "from ks_xlsx_parser.models.workbook import WorkbookDTO", + "from excel_parser import parse_workbook, ParseResult", + "from excel_parser.pipeline import parse_workbook", + "from excel_parser.verification import StageVerifier", + "from excel_parser.analysis.table_assembler import TableAssembler", + "from excel_parser.models.workbook import WorkbookDTO", ] @@ -47,16 +47,16 @@ def check_wheel_contents(wheel: Path) -> None: top_level = next((n for n in names if n.endswith("top_level.txt")), None) if top_level: packages = zf.read(top_level).decode().split() - if packages != ["ks_xlsx_parser"]: + if packages != ["excel_parser"]: sys.exit( f"ERROR: wheel exposes top-level packages {packages}; " - "expected only ['ks_xlsx_parser']. The flat src/ layout leaked." + "expected only ['excel_parser']. The flat src/ layout leaked." 
) - required = ["ks_xlsx_parser/pipeline.py", "ks_xlsx_parser/api.py"] + required = ["excel_parser/pipeline.py", "excel_parser/api.py"] for req in required: if not any(n == req for n in names): sys.exit(f"ERROR: wheel is missing {req}") - print(f"wheel contents OK ({len(names)} entries, top-level: ks_xlsx_parser)") + print(f"wheel contents OK ({len(names)} entries, top-level: excel_parser)") def check_install_and_import(wheel: Path) -> None: diff --git a/site/index.html b/site/index.html index 37fca39..3aaf815 100644 --- a/site/index.html +++ b/site/index.html @@ -5,32 +5,32 @@ <meta name="viewport" content="width=device-width, initial-scale=1" /> <!-- Primary SEO --> - <title>ks-xlsx-parser — Python Excel (XLSX) Parser for LLMs, RAG, LangChain, LangGraph, CrewAI & Claude + excel-parser — Python Excel (XLSX) Parser for LLMs, RAG, LangChain, LangGraph, CrewAI & Claude - - - + + + - + - + - - + + - + - +
Metric🟢 ks-xlsx-parser🟢 excel-parser ⚪ Docling 2.93 Δ
🎯 Recall@1
text-match
0.5800.579tied0.6930.708Docling +1.5 pp
🎯 Recall@3
text-match
0.6970.670+2.7 pp0.8480.820+2.8 pp
🎯 Recall@5
text-match
0.7040.686+1.8 pp0.8590.840+1.9 pp
📍 Geometric Recall@5
chunk's sheet!A1:Z99 overlaps the ground-truth range
0.3690.889 0.000 citation-grade only
⚡ Mean parse time
per file
251 ms265 ms~5% faster⚡ Parse time
per file
349 ms mean
11 ms median
238 ms mean
13 ms median
mixed
🧱 Parser errors
across 912 instances