diff --git a/.factory-plugin/marketplace.json b/.factory-plugin/marketplace.json
index 4282159..98338f6 100644
--- a/.factory-plugin/marketplace.json
+++ b/.factory-plugin/marketplace.json
@@ -53,6 +53,24 @@
       "description": "Pull request lifecycle skills: create PRs with consistent conventions and follow up on them until merge-ready",
       "source": "./plugins/code-review",
       "category": "productivity"
+    },
+    {
+      "name": "kreuzberg",
+      "description": "Local document extraction: text, tables, metadata, images from 91+ formats with optional OCR.",
+      "source": "./plugins/kreuzberg",
+      "category": "document-intelligence"
+    },
+    {
+      "name": "kreuzcrawl",
+      "description": "Web crawling and scraping with HTML→Markdown and headless-Chrome fallback.",
+      "source": "./plugins/kreuzcrawl",
+      "category": "web-scraping"
+    },
+    {
+      "name": "kreuzberg-cloud",
+      "description": "Managed extraction via api.kreuzberg.dev with webhooks, uploads, and usage tracking.",
+      "source": "./plugins/kreuzberg-cloud",
+      "category": "document-intelligence"
     }
   ]
 }
diff --git a/plugins/kreuzberg-cloud/.factory-plugin/plugin.json b/plugins/kreuzberg-cloud/.factory-plugin/plugin.json
new file mode 100644
index 0000000..03b8e43
--- /dev/null
+++ b/plugins/kreuzberg-cloud/.factory-plugin/plugin.json
@@ -0,0 +1,23 @@
+{
+  "name": "kreuzberg-cloud",
+  "version": "0.1.0",
+  "description": "Offload document extraction to api.kreuzberg.dev — managed extraction with webhooks, presigned uploads, and usage tracking.",
+  "author": {
+    "name": "Kreuzberg, Inc.",
+    "email": "support@kreuzberg.dev",
+    "url": "https://kreuzberg.dev"
+  },
+  "homepage": "https://kreuzberg.dev",
+  "repository": "https://github.com/kreuzberg-dev/plugins",
+  "license": "MIT",
+  "category": "document-intelligence",
+  "keywords": [
+    "cloud",
+    "document-intelligence",
+    "extraction",
+    "webhooks"
+  ],
+  "brandColor": "#0EA5E9",
+  "icon": "./assets/icon.svg",
+  "logo": "./assets/logo.png"
+}
diff --git a/plugins/kreuzberg-cloud/README.md b/plugins/kreuzberg-cloud/README.md
new file mode 100644
index 0000000..cadb366
--- /dev/null
+++ b/plugins/kreuzberg-cloud/README.md
@@ -0,0 +1,112 @@
+# kreuzberg-cloud
+
+Offload document extraction to `api.kreuzberg.dev` — managed extraction with webhook delivery, presigned uploads for large files, sandbox keys, and per-project usage tracking.
+
+<!-- TODO: screenshot -->
+
+## Install
+
+### From the marketplace (recommended)
+
+Listing in the official Claude marketplace is pending review.
+
+Until it is approved, install from the self-hosted marketplace:
+
+```text
+/plugin marketplace add kreuzberg-dev/plugins
+/plugin install kreuzberg-cloud@kreuzberg
+```
+
+### v0.1.0 — skills only
+
+The plugin v0.1.0 ships **skills and documentation only; no MCP server**. The `kreuzberg-cloud` CLI binary with MCP wiring lands in plugin v0.2.0. Agents call the HTTP REST API directly via curl or one of the official SDKs:
+
+- **TypeScript/Node.js**: `@kreuzberg/cloud` ([npm](https://www.npmjs.com/package/@kreuzberg/cloud))
+- **Python**: `kreuzberg-cloud-sdk` ([PyPI](https://pypi.org/project/kreuzberg-cloud-sdk/))
+
+### API key requirement
+
+Set the `KREUZBERG_API_KEY` environment variable or write `~/.kreuzberg/cloud.toml`:
+
+```toml
+api_key = "sk_live_..."
+```
+
+If neither is set, the plugin's SessionStart hook displays a reminder. For evaluation without signup, use sandbox keys (see the `sandbox-keys` skill).
+
+## Skills shipped
+
+| Skill | Trigger |
+|-------|---------|
+| **kreuzberg-cloud** | Offload document extraction to api.kreuzberg.dev. Use when the user wants managed extraction with webhook delivery, presigned uploads for large files, sandbox keys, or per-project usage tracking — instead of running the local kreuzberg CLI. Covers authentication, the 12 REST endpoints, request/response shapes, error model, and SDK options. |
+| **offloading-extraction** | Use when the user wants to extract a document via the cloud rather than the local kreuzberg CLI. Covers POST /v1/extract — JSON vs multipart bodies, URL crawls, options block, webhook attachment, and the async response shape. |
+| **tracking-cloud-jobs** | Use when an extraction job has been submitted and the result needs to be retrieved. Covers GET /v1/jobs/{id}, polling cadence with exponential backoff, terminal status detection, and webhook delivery (signature verification, retry semantics). |
+| **presigned-uploads** | Use when the user has files larger than ~50 MB to extract via the cloud, or when base64-encoding the body would be wasteful. Covers the three-step presign / PUT / confirm flow against POST /v1/uploads/presign and POST /v1/uploads/confirm. |
+| **managing-cloud-usage** | Use when the user asks about quota, billing visibility, or processed-page counts. Covers GET /v1/usage — query params, response shape, when to report usage proactively to the user. |
+| **sandbox-keys** | Use when the user wants to try Kreuzberg Cloud without signing up, or needs an ephemeral key for evaluation, demos, or CI integration tests. Covers POST /v1/sandbox/key — the no-auth endpoint, quota, TTL, and cleanup expectations. |
+
+## MCP tools
+
+MCP wiring lands in v0.2.0. Until then, the v0.1.0 skills document the REST API directly with curl, TypeScript SDK, and Python SDK examples.
+
+## Configuration
+
+### Environment variable
+
+```bash
+export KREUZBERG_API_KEY="sk_live_..."
+```
+
+### Config file
+
+Create `~/.kreuzberg/cloud.toml`:
+
+```toml
+api_key = "sk_live_..."
+base_url = "https://api.kreuzberg.dev"  # optional
+```
+
+Precedence: CLI argument > environment variable > config file.
+
+## Examples
+
+Submit a document for extraction via curl:
+
+```text
+curl -X POST https://api.kreuzberg.dev/v1/extract \
+  -H "Authorization: Bearer $KREUZBERG_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"urls":[{"url":"https://example.com/document.pdf"}]}'
+```
+
+Poll a job for completion with the TypeScript SDK:
+
+```text
+import { KreuzbergCloud } from "@kreuzberg/cloud";
+const client = new KreuzbergCloud({ apiKey: process.env.KREUZBERG_API_KEY });
+const job = await client.getJob(jobId);
+console.log(job.status);  // e.g. "pending" | "processing" | "completed" | "failed"
+```
+
+Check quota with the Python SDK:
+
+```text
+import os
+from kreuzberg_cloud import KreuzbergCloud
+usage = KreuzbergCloud(api_key=os.environ["KREUZBERG_API_KEY"]).get_usage()
+print(f"Pages processed: {usage.total_pages}, Quota: {usage.quota_limit}")
+```
+
+## Versioning
+
+The plugin version tracks the marketplace `VERSION` file. See [CHANGELOG.md](../../CHANGELOG.md) for release notes.
+
+## License
+
+MIT.
+
+## See also
+
+- **Marketplace**: [kreuzberg-dev/plugins](https://github.com/kreuzberg-dev/plugins)
+- **Upstream**: [kreuzberg-dev/kreuzberg-cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud)
+- **Sibling plugins**: [kreuzberg](../kreuzberg/README.md), [kreuzcrawl](../kreuzcrawl/README.md)
diff --git a/plugins/kreuzberg-cloud/assets/icon.svg b/plugins/kreuzberg-cloud/assets/icon.svg
new file mode 100644
index 0000000..23eeacc
--- /dev/null
+++ b/plugins/kreuzberg-cloud/assets/icon.svg
@@ -0,0 +1,9 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 256" width="256" height="256">
+  <rect width="256" height="256" rx="48" fill="#0EA5E9"/>
+  <path d="M72 104 L152 104 L192 144 L192 216 C192 220.4 188.4 224 184 224 L72 224 C67.6 224 64 220.4 64 216 L64 112 C64 107.6 67.6 104 72 104 Z" fill="white"/>
+  <path d="M152 104 L152 136 C152 140.4 155.6 144 160 144 L192 144 Z" fill="#0EA5E9"/>
+  <rect x="84" y="160" width="72" height="10" rx="5" fill="#0EA5E9"/>
+  <rect x="84" y="180" width="80" height="10" rx="5" fill="#0EA5E9"/>
+  <rect x="84" y="200" width="56" height="10" rx="5" fill="#0EA5E9"/>
+  <path d="M76 88 C76 70 90 56 108 56 C118 56 127 60 133 67 C138 60 146 56 156 56 C172 56 184 68 184 84 C184 86 184 88 183 90 L77 90 C76 89 76 88 76 88 Z" fill="white"/>
+</svg>
diff --git a/plugins/kreuzberg-cloud/assets/logo.png b/plugins/kreuzberg-cloud/assets/logo.png
new file mode 100644
index 0000000..8cb2e11
Binary files /dev/null and b/plugins/kreuzberg-cloud/assets/logo.png differ
diff --git a/plugins/kreuzberg-cloud/skills/kreuzberg-cloud/SKILL.md b/plugins/kreuzberg-cloud/skills/kreuzberg-cloud/SKILL.md
new file mode 100644
index 0000000..b1975d5
--- /dev/null
+++ b/plugins/kreuzberg-cloud/skills/kreuzberg-cloud/SKILL.md
@@ -0,0 +1,345 @@
+---
+name: kreuzberg-cloud
+description: >-
+  Offload document extraction to api.kreuzberg.dev. Use when the user wants
+  managed extraction with webhook delivery, presigned uploads for large
+  files, sandbox keys, or per-project usage tracking — instead of running
+  the local kreuzberg CLI. Covers authentication, the 12 REST endpoints,
+  request/response shapes, error model, and SDK options.
+license: MIT
+metadata:
+  author: kreuzberg-dev
+  version: "0.1.0"
+  repository: https://github.com/kreuzberg-dev/kreuzberg-cloud
+---
+
+# Kreuzberg Cloud
+
+Kreuzberg Cloud is the managed extraction API hosted at
+`https://api.kreuzberg.dev`. It exposes the same Rust extraction engine as
+the local `kreuzberg` CLI, with two extras: jobs are asynchronous (webhook
+or polling delivery) and large files go through presigned uploads instead
+of in-band base64.
+
+Use this skill when writing code that:
+
+- Hits `api.kreuzberg.dev` directly via HTTP.
+- Uses the `@kreuzberg/cloud` (npm) or `kreuzberg-cloud-sdk` (PyPI) SDKs.
+- Configures webhooks, sandbox keys, or usage queries.
+
+## v0.1.0 limitation
+
+The `kreuzberg-cloud` plugin v0.1.0 ships **skills only — no MCP server**.
+The `kreuzberg-cloud` CLI binary that hosts the MCP server lands in plugin
+v0.2.0. Until then, prefer one of:
+
+1. The TypeScript SDK (`@kreuzberg/cloud`) — ESM, tree-shakable, generated
+   from the OpenAPI 3.1 spec.
+2. The Python SDK (`kreuzberg-cloud-sdk`) — sync + async, `from_sandbox()`
+   helper for evaluation.
+3. Raw `curl` — every example below shows the curl form first.
+
+## When cloud vs local
+
+| Situation | Use |
+|---|---|
+| You already have the `kreuzberg` CLI installed and the file is on disk | Local (`kreuzberg` plugin) |
+| File is on a remote URL or in S3 / GCS | Cloud |
+| Need OCR for languages the local Tesseract install doesn't have | Cloud |
+| File is larger than ~50 MB | Cloud (presigned uploads) |
+| Want webhook delivery rather than blocking the caller | Cloud |
+| Batch of mixed documents with shared options | Either; cloud parallelizes server-side |
+| No network access, air-gapped environment | Local |
+| Evaluating before committing to install | Cloud sandbox key |
+
+## Getting an API key
+
+Three options, in order of preference for production:
+
+1. **Production key** — sign up at <https://kreuzberg.dev/cloud>, mint a key
+   from the dashboard. Format: `sk_live_*`.
+2. **Sandbox key** — no signup, 24-hour TTL, 50-page quota, rate-limited to
+   10 keys per IP per 24 hours. Format: `sk_sandbox_*`. See the
+   `sandbox-keys` skill.
+3. **Local `~/.kreuzberg/cloud.toml`** — for shell sessions, put the key in:
+
+   ```toml
+   # ~/.kreuzberg/cloud.toml
+   api_key = "sk_live_..."
+   ```
+
+The plugin's `SessionStart` hook checks `KREUZBERG_API_KEY` env var first,
+then `~/.kreuzberg/cloud.toml`, and emits a setup reminder if neither is
+present.
+
+## Authentication
+
+Every `/v1/` request — except `POST /v1/sandbox/key` — uses a Bearer token:
+
+```bash
+curl https://api.kreuzberg.dev/v1/usage \
+  -H "Authorization: Bearer $KREUZBERG_API_KEY"
+```
+
+Both `sk_live_*` and `sk_sandbox_*` go in the same header. The server
+resolves project context from the key.
+
+## Base URL and versioning
+
+- Base: `https://api.kreuzberg.dev`
+- Path prefix: `/v1/` for all extraction, jobs, documents, sandbox, uploads,
+  and usage endpoints. Health endpoints (`/healthz`, `/readyz`) are unversioned.
+- The OpenAPI 3.1 spec is published at
+  <https://api.kreuzberg.dev/openapi.json>; full reference at
+  <https://docs.kreuzberg.cloud>.
+
+## The 12 endpoints
+
+Twelve operations across seven tag groups:
+
+### health (2)
+
+| Method | Path | Purpose |
+|---|---|---|
+| GET | `/healthz` | Liveness — returns 200 if the process is up. |
+| GET | `/readyz` | Readiness — returns 200 only when downstream deps are healthy. |
+
+Neither requires auth. Use `/readyz` for uptime monitors and `/healthz`
+for load-balancer health checks.
+
+### extract (1)
+
+| Method | Path | Purpose |
+|---|---|---|
+| POST | `/v1/extract` | Submit one or more documents (or URLs) for extraction. |
+
+Accepts `application/json` (base64 documents) or `multipart/form-data`
+(binary file parts). Returns `202 Accepted` with `job_ids` (extraction
+jobs) and `crawl_job_ids` (URL-crawl jobs). Pair with `GET /v1/jobs/{id}`
+to retrieve results — or supply a `webhook` block to receive them
+asynchronously. See the `offloading-extraction` skill.
+
+### jobs (1)
+
+| Method | Path | Purpose |
+|---|---|---|
+| GET | `/v1/jobs/{id}` | Get the current status and (if terminal) result of a job. |
+
+Accepts both extraction job IDs and crawl job IDs. Response shape varies:
+extraction jobs return `JobResponse`, crawl jobs return `CrawlJobResponse`.
+See the `tracking-cloud-jobs` skill.
+
+### documents (4)
+
+| Method | Path | Purpose |
+|---|---|---|
+| GET | `/v1/documents/{document_id}` | Latest version of a document with its extraction result. |
+| POST | `/v1/documents/{document_id}/diff` | Compute a diff between two versions (sync, with async fallback). |
+| GET | `/v1/documents/{document_id}/diff/{diff_job_id}` | Poll the status of an async diff job. |
+| GET | `/v1/documents/{document_id}/versions` | List all versions of a document (paginated). |
+
+For applications that re-process the same document over time. Each
+extraction returns a `document_id` that's stable across versions.
+
+### uploads (2)
+
+| Method | Path | Purpose |
+|---|---|---|
+| POST | `/v1/uploads/presign` | Generate per-file presigned PUT URLs. |
+| POST | `/v1/uploads/confirm` | Confirm the uploads and start processing. |
+
+Three-step flow for files larger than ~50 MB: presign → PUT to storage →
+confirm. See the `presigned-uploads` skill.
+
+### sandbox (1)
+
+| Method | Path | Purpose |
+|---|---|---|
+| POST | `/v1/sandbox/key` | Mint an ephemeral sandbox API key. |
+
+24-hour TTL, 50-page quota, 10 keys per IP per 24 hours. No auth required.
+See the `sandbox-keys` skill.
+
+### usage (1)
+
+| Method | Path | Purpose |
+|---|---|---|
+| GET | `/v1/usage` | Per-project usage statistics and remaining quota. |
+
+Accepts optional `start` and `end` ISO-8601 query params. Defaults to the
+current calendar month. See the `managing-cloud-usage` skill.
+
+## Key request / response shapes
+
+### `ExtractJsonRequest`
+
+```json
+{
+  "documents": [
+    {
+      "filename": "invoice.pdf",
+      "mime_type": "application/pdf",
+      "data": "<base64>"
+    }
+  ],
+  "urls": [
+    { "url": "https://example.com/docs" }
+  ],
+  "options": {
+    "extraction_config": {
+      "output_format": "markdown",
+      "ocr": { "backend": "tesseract", "language": "eng" }
+    }
+  },
+  "crawl_config": {
+    "max_depth": 2,
+    "max_pages": 50,
+    "stay_on_domain": true
+  },
+  "webhook": {
+    "url": "https://example.com/webhook",
+    "secret": "shared-hmac-secret",
+    "metadata": { "request_id": "abc123" }
+  }
+}
+```
+
+Either `documents` or `urls` is required (or both). `webhook`, `options`,
+and `crawl_config` are optional.
+
+### `ExtractResponse` (202)
+
+```json
+{
+  "job_ids": ["550e8400-e29b-41d4-a716-446655440000"],
+  "crawl_job_ids": ["660e9400-f39c-51e5-b827-557766551111"],
+  "status": "pending"
+}
+```
+
+### `JobResponse` (200)
+
+```json
+{
+  "id": "550e8400-e29b-41d4-a716-446655440000",
+  "filename": "invoice.pdf",
+  "status": "completed",
+  "created_at": "2025-12-21T10:00:00Z",
+  "processing_time_ms": 1234,
+  "result": {
+    "content": "Invoice total: $1,234.56",
+    "mime_type": "text/markdown",
+    "tables": [],
+    "images": [],
+    "metadata": { "title": "Invoice #12345" }
+  }
+}
+```
+
+### `JobStatus` enum
+
+```text
+awaiting_upload | pending | processing | chunking | aggregating
+                | completed | partial_success | failed | cancelled
+```
+
+Terminal states: `completed`, `partial_success`, `failed`, `cancelled`.
+Stop polling when any of those appears.
+
+### `UsageResponse` (200)
+
+```json
+{
+  "period_start": "2026-05-01",
+  "period_end": "2026-06-01",
+  "total_pages": 5432,
+  "total_documents": 87,
+  "total_failed": 2,
+  "quota_limit": 100000,
+  "quota_remaining": 94568,
+  "by_mime_type": {
+    "application/pdf": { "documents": 65, "pages": 3200, "failed": 1 }
+  }
+}
+```
+
+## Error model
+
+All errors are JSON with at least an `error` string field. Status codes
+follow REST conventions:
+
+| Status | Meaning | Typical cause |
+|---|---|---|
+| `400` | Bad request | Missing required field, malformed body, invalid UUID. |
+| `401` | Unauthorized | Missing or invalid `Authorization` header. |
+| `404` | Not found | Job / document ID doesn't exist in this project. |
+| `429` | Rate limited | Sandbox-key IP throttle or per-key quota. |
+| `500` | Server error | Database failure, worker crash — retry with backoff. |
+| `503` | Service unavailable | Downstream dep unhealthy — retry. |
+
+The SDKs surface these as typed exceptions: `AuthError`, `ValidationError`,
+`NotFoundError`, `RateLimitError` (carries `retry_after`), `ServerError`,
+`TimeoutError`, all extending `KreuzbergCloudError` (Python) /
+`KreuzbergError` (TypeScript).
+
+## Concrete examples
+
+### Sandbox onboarding (no signup)
+
+```bash
+# Mint an ephemeral key.
+curl -X POST https://api.kreuzberg.dev/v1/sandbox/key
+# → { "api_key": "sk_sandbox_...", "expires_at": "...", "pages_remaining": 50 }
+```
+
+```ts
+import { KreuzbergCloud } from "@kreuzberg/cloud";
+const client = await KreuzbergCloud.fromSandbox();
+const result = await client.extractAndWait({
+  file: new Blob(["Hello world"], { type: "text/plain" }),
+});
+console.log(result.result?.content);
+```
+
+```python
+from kreuzberg_cloud import AsyncKreuzbergCloud  # run the block below inside an async function
+async with await AsyncKreuzbergCloud.from_sandbox() as client:
+    job = await client.extract_and_wait(file=b"hello world")
+    print(job.status, job.result and job.result.content)
+```
+
+### Single-file extract → poll
+
+```bash
+# 1. Submit.
+JOB_ID=$(curl -sX POST https://api.kreuzberg.dev/v1/extract \
+  -H "Authorization: Bearer $KREUZBERG_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "documents": [
+      {"filename": "invoice.pdf", "mime_type": "application/pdf",
+       "data": "'"$(base64 < invoice.pdf | tr -d '\n')"'"}
+    ],
+    "options": {"extraction_config": {"output_format": "markdown"}}
+  }' | jq -r '.job_ids[0]')
+
+# 2. Poll; repeat this request until "status" is a terminal state.
+curl -s https://api.kreuzberg.dev/v1/jobs/$JOB_ID \
+  -H "Authorization: Bearer $KREUZBERG_API_KEY"
+```
+
+## Other skills
+
+- `offloading-extraction` — full `POST /v1/extract` workflow with options.
+- `tracking-cloud-jobs` — polling cadence, webhook signatures.
+- `presigned-uploads` — three-step flow for files >50 MB.
+- `managing-cloud-usage` — quota and per-MIME breakdown.
+- `sandbox-keys` — when to recommend sandbox over production keys.
+
+## References
+
+- API docs: <https://docs.kreuzberg.cloud>
+- OpenAPI spec: <https://api.kreuzberg.dev/openapi.json>
+- TypeScript SDK: <https://www.npmjs.com/package/@kreuzberg/cloud>
+- Python SDK: <https://pypi.org/project/kreuzberg-cloud-sdk/>
+- Pricing and signup: <https://kreuzberg.dev/cloud>
diff --git a/plugins/kreuzberg-cloud/skills/managing-cloud-usage/SKILL.md b/plugins/kreuzberg-cloud/skills/managing-cloud-usage/SKILL.md
new file mode 100644
index 0000000..804b26f
--- /dev/null
+++ b/plugins/kreuzberg-cloud/skills/managing-cloud-usage/SKILL.md
@@ -0,0 +1,101 @@
+---
+name: managing-cloud-usage
+description: Use when the user asks about quota, billing visibility, or processed-page counts. Covers GET /v1/usage — query params, response shape, when to report usage proactively to the user.
+---
+
+# Managing cloud usage
+
+`GET /v1/usage` is the only endpoint for quota and billing visibility.
+It returns aggregate counters for the queried period plus the remaining
+quota for the project.
+
+## Endpoint
+
+```text
+GET https://api.kreuzberg.dev/v1/usage
+Authorization: Bearer $KREUZBERG_API_KEY
+```
+
+### Query parameters
+
+| Param | Format | Default |
+|---|---|---|
+| `start` | ISO-8601 date (e.g. `2026-03-01`) | First day of current month. |
+| `end` | ISO-8601 date (e.g. `2026-04-01`) | First day of next month. |
+
+Both are optional. Omit both for the current calendar month.
+
+## Response (200)
+
+```json
+{
+  "period_start": "2026-05-01",
+  "period_end": "2026-06-01",
+  "total_pages": 5432,
+  "total_documents": 87,
+  "total_failed": 2,
+  "quota_limit": 100000,
+  "quota_remaining": 94568,
+  "by_mime_type": {
+    "application/pdf": { "documents": 65, "pages": 3200, "failed": 1 },
+    "image/png":       { "documents": 15, "pages": 1800, "failed": 0 },
+    "text/plain":      { "documents":  7, "pages":  432, "failed": 1 }
+  }
+}
+```
+
+### Reading the response
+
+- `total_pages` — pages billed in the period. The unit of cost.
+- `total_documents` — files submitted, regardless of page count.
+- `total_failed` — extractions that ended in `failed` status. Failed
+  jobs do not consume quota.
+- `quota_limit` / `quota_remaining` — total and remaining pages on the
+  current plan.
+- `by_mime_type` — per-MIME breakdown. Useful for identifying which
+  document types drive cost.
+
+## Examples
+
+### Current-month usage
+
+```bash
+curl -s https://api.kreuzberg.dev/v1/usage \
+  -H "Authorization: Bearer $KREUZBERG_API_KEY" | jq .
+```
+
+### Specific date range
+
+```bash
+curl -s "https://api.kreuzberg.dev/v1/usage?start=2026-01-01&end=2026-02-01" \
+  -H "Authorization: Bearer $KREUZBERG_API_KEY" | jq .
+```
+
+### Quota remaining as a percentage
+
+```bash
+curl -s https://api.kreuzberg.dev/v1/usage \
+  -H "Authorization: Bearer $KREUZBERG_API_KEY" \
+  | jq '.quota_remaining * 100 / .quota_limit'
+```
+
+## When to report usage to the user
+
+Pull usage proactively when:
+
+- A batch job submits more than ~100 documents — report `quota_remaining`
+  after submit so the user can see the impact.
+- The user asks "how much have I used?" or any quota-shaped question.
+- A `429` response includes a quota-exhausted error — surface the usage
+  shape so the user can decide whether to upgrade.
+- After a long-running crawl finishes, since page count is hard to
+  estimate up front.
+
+Don't report usage on every routine extraction — it's noise.
+
+## Errors
+
+| Status | Cause |
+|---|---|
+| `400` | `start` or `end` not ISO-8601, or `end <= start`. |
+| `401` | Bad API key. |
diff --git a/plugins/kreuzberg-cloud/skills/offloading-extraction/SKILL.md b/plugins/kreuzberg-cloud/skills/offloading-extraction/SKILL.md
new file mode 100644
index 0000000..9fb3027
--- /dev/null
+++ b/plugins/kreuzberg-cloud/skills/offloading-extraction/SKILL.md
@@ -0,0 +1,200 @@
+---
+name: offloading-extraction
+description: Use when the user wants to extract a document via the cloud rather than the local kreuzberg CLI. Covers POST /v1/extract — JSON vs multipart bodies, URL crawls, options block, webhook attachment, and the async response shape.
+---
+
+# Offloading extraction
+
+`POST /v1/extract` is the single submit endpoint. It returns `202 Accepted`
+with `job_ids` (extraction) and `crawl_job_ids` (URL crawls) — never the
+extraction result inline. Pair every submit with either a poll loop
+(`tracking-cloud-jobs` skill) or a webhook.
+
+## When to reach for this
+
+- File is on a remote URL.
+- File is on disk but the local `kreuzberg` CLI is not installed.
+- You want server-side parallelism for a batch.
+- The user wants webhook-delivered results to skip blocking.
+- Exception: if the file is larger than ~50 MB, use `presigned-uploads`
+  instead — the base64 JSON body is too big.
+
+## Endpoint
+
+```text
+POST https://api.kreuzberg.dev/v1/extract
+Authorization: Bearer $KREUZBERG_API_KEY
+Content-Type: application/json | multipart/form-data
+```
+
+Returns `202 Accepted` with `ExtractResponse`.
+
+## Three submission shapes
+
+### 1. Base64 JSON (small files, <5 MB recommended)
+
+```bash
+curl -X POST https://api.kreuzberg.dev/v1/extract \
+  -H "Authorization: Bearer $KREUZBERG_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d @- <<JSON
+{
+  "documents": [
+    {
+      "filename": "invoice.pdf",
+      "mime_type": "application/pdf",
+      "data": "$(base64 < invoice.pdf | tr -d '\n')"
+    }
+  ],
+  "options": {
+    "extraction_config": {
+      "output_format": "markdown",
+      "ocr": { "backend": "tesseract", "language": "eng" }
+    }
+  }
+}
+JSON
+```
+
+### 2. Multipart (binary, recommended for anything over ~1 MB)
+
+```bash
+curl -X POST https://api.kreuzberg.dev/v1/extract \
+  -H "Authorization: Bearer $KREUZBERG_API_KEY" \
+  -F "file=@invoice.pdf;type=application/pdf" \
+  -F 'options={"extraction_config":{"output_format":"markdown"}};type=application/json'
+```
+
+Add a `webhook` part as a JSON string:
+
+```bash
+  -F 'webhook={"url":"https://hooks.example.com/x","secret":"shh"};type=application/json'
+```
+
+### 3. URL crawl
+
+```bash
+curl -X POST https://api.kreuzberg.dev/v1/extract \
+  -H "Authorization: Bearer $KREUZBERG_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "urls": [{"url": "https://example.com/docs"}],
+    "crawl_config": {"max_depth": 2, "max_pages": 50, "stay_on_domain": true},
+    "webhook": {"url": "https://hooks.example.com/x"}
+  }'
+```
+
+URL crawls return `crawl_job_ids` instead of (or alongside) `job_ids`.
+
+## Response (202)
+
+```json
+{
+  "job_ids": ["550e8400-e29b-41d4-a716-446655440000"],
+  "crawl_job_ids": [],
+  "status": "pending"
+}
+```
+
+`status` is always `pending` at submit time; the per-job status is
+retrieved via `GET /v1/jobs/{id}`.
+
+## The `options` block
+
+Shape mirrors the local `ExtractionConfig`:
+
+```json
+{
+  "extraction_config": {
+    "output_format": "markdown",
+    "ocr": { "backend": "tesseract", "language": "eng+deu" },
+    "extract_tables": true,
+    "extract_images": false,
+    "chunking": { "max_chars": 4000, "overlap": 200 }
+  }
+}
+```
+
+Supported `output_format` values: `markdown`, `text`, `json`, `djot`,
+`html`. Default is `markdown`.
+
+## The `webhook` block
+
+```json
+{
+  "url": "https://hooks.example.com/x",
+  "secret": "shared-secret-32-bytes-min",
+  "metadata": { "request_id": "abc123", "user_id": "u_42" }
+}
+```
+
+`secret` is the HMAC key used to sign the webhook payload — see
+`tracking-cloud-jobs` for verification. `metadata` is echoed back in the
+delivered payload, useful for correlating server-side requests.
+
+## TypeScript SDK
+
+```ts
+import { KreuzbergCloud } from "@kreuzberg/cloud";
+import { readFile } from "node:fs/promises";
+
+const client = new KreuzbergCloud({ apiKey: process.env.KREUZBERG_API_KEY! });
+
+const data = await readFile("invoice.pdf");
+const job = await client.extract({
+  file: { name: "invoice.pdf", data, mimeType: "application/pdf" },
+  options: { extractionConfig: { outputFormat: "markdown" } },
+});
+console.log(job.id, job.status);
+```
+
+For submit + wait in one call:
+
+```ts
+const result = await client.extractAndWait({
+  file: { name: "invoice.pdf", data },
+});
+console.log(result.result?.content);
+```
+
+## Python SDK
+
+```python
+import os
+from pathlib import Path
+from kreuzberg_cloud import KreuzbergCloud
+with KreuzbergCloud(api_key=os.environ["KREUZBERG_API_KEY"]) as client:
+    job = client.extract(file=Path("invoice.pdf"))
+    print(job.id, job.status)
+```
+
+Submit + wait:
+
+```python
+job = client.extract_and_wait(file=Path("invoice.pdf"))
+print(job.result.content if job.result else job.status)
+```
+
+## Batch submission
+
+JSON: pass multiple entries in `documents`. Multipart: repeat the `file`
+part. SDKs expose `extractBatch` / `extract_batch` helpers that fan out
+correctly per platform (parallel HTTP for the async Python client,
+sequential for the sync one).
+
+## Errors
+
+| Status | Cause | Fix |
+|---|---|---|
+| `400` | Empty `documents` and `urls` | Provide at least one. |
+| `400` | Bad MIME type | Use a real RFC 6838 type, e.g. `application/pdf`. |
+| `401` | Missing Bearer | Set `Authorization` header. |
+| `413` | Request body too large | Switch to presigned uploads. |
+| `429` | Quota or rate limit | Backoff; check `quota_remaining` via `/v1/usage`. |
+
+## Next step
+
+After every submit, hand off to the `tracking-cloud-jobs` skill — cloud
+extraction is asynchronous and the result is delivered via either polling
+or webhook callback. Never assume a result is ready immediately after the
+`202` response.
diff --git a/plugins/kreuzberg-cloud/skills/presigned-uploads/SKILL.md b/plugins/kreuzberg-cloud/skills/presigned-uploads/SKILL.md
new file mode 100644
index 0000000..7d5164f
--- /dev/null
+++ b/plugins/kreuzberg-cloud/skills/presigned-uploads/SKILL.md
@@ -0,0 +1,158 @@
+---
+name: presigned-uploads
+description: Use when the user has files larger than ~50 MB to extract via the cloud, or when base64-encoding the body would be wasteful. Covers the three-step presign / PUT / confirm flow against POST /v1/uploads/presign and POST /v1/uploads/confirm.
+---
+
+# Presigned uploads
+
+For files larger than about 50 MB, skip the base64-in-JSON body of
+`POST /v1/extract` and use the three-step presigned-upload flow instead.
+The client uploads bytes directly to object storage, then tells the API to
+start processing.
+
+## When to reach for this
+
+- Single file > 50 MB.
+- Batch with aggregate body size > 100 MB.
+- Bandwidth-constrained environments where double-encoding
+  (base64 + TLS + worker) wastes throughput.
+- File already lives in S3 / GCS and you can stream rather than buffer.
+
+## The three steps
+
+```text
+1. POST /v1/uploads/presign  → batch_id + per-file presigned PUT URLs
+2. PUT <upload_url>          → upload each file's bytes directly
+3. POST /v1/uploads/confirm  → start extraction, returns job_ids
+```
+
+Step 1 returns one `upload_url` per document. Step 3 cannot run until
+every PUT in step 2 succeeds.
+
+## Step 1 — presign
+
+```bash
+curl -X POST https://api.kreuzberg.dev/v1/uploads/presign \
+  -H "Authorization: Bearer $KREUZBERG_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "documents": [
+      {"filename": "scan.pdf", "mime_type": "application/pdf"},
+      {"filename": "report.docx", "mime_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"}
+    ],
+    "config": {"output_format": "markdown"},
+    "webhook": {"url": "https://hooks.example.com/x"}
+  }'
+```
+
+### Response
+
+```json
+{
+  "batch_id": "batch_550e8400-e29b-41d4-a716",
+  "uploads": [
+    {
+      "job_id": "550e8400-...",
+      "upload_url": "https://storage.googleapis.com/kreuzberg-dev-uploads/...",
+      "object_key": "projects/abc123/uploads/550e8400-...",
+      "method": "PUT",
+      "expires_in_secs": 3600
+    },
+    {
+      "job_id": "660e9400-...",
+      "upload_url": "https://storage.googleapis.com/kreuzberg-dev-uploads/...",
+      "object_key": "projects/abc123/uploads/660e9400-...",
+      "method": "PUT",
+      "expires_in_secs": 3600
+    }
+  ]
+}
+```
+
+Keep the `batch_id` — you need it for step 3. URLs expire in 3600 seconds
+(1 hour); upload before then.
+
+## Step 2 — PUT to each upload URL
+
+The presigned URL is signed by Google Cloud Storage; PUT directly to it,
+**without** an `Authorization` header. Set `Content-Type` to match the
+`mime_type` declared in step 1:
+
+```bash
+curl -X PUT "<upload_url>" \
+  -H "Content-Type: application/pdf" \
+  --data-binary @scan.pdf
+```
+
+A successful upload returns `200 OK` with no body. Do this for every
+entry in `uploads` before moving on.
+
+## Step 3 — confirm
+
+```bash
+curl -X POST https://api.kreuzberg.dev/v1/uploads/confirm \
+  -H "Authorization: Bearer $KREUZBERG_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"batch_id": "batch_550e8400-e29b-41d4-a716"}'
+```
+
+### Response (202)
+
+```json
+{
+  "job_ids": ["550e8400-...", "660e9400-..."],
+  "status": "processing"
+}
+```
+
+These are the same `job_id` values returned in step 1's `uploads` array.
+From here, the flow is identical to `offloading-extraction` — poll
+`GET /v1/jobs/{id}` or wait for the webhook.
+
+## End-to-end curl example
+
+```bash
+#!/usr/bin/env bash
+set -euo pipefail
+API="https://api.kreuzberg.dev"
+KEY="$KREUZBERG_API_KEY"
+FILE="scan.pdf"
+
+# 1. Presign
+resp=$(curl -fsS -X POST "$API/v1/uploads/presign" \
+  -H "Authorization: Bearer $KEY" \
+  -H "Content-Type: application/json" \
+  -d "$(jq -n --arg f "$FILE" '{documents: [{filename: $f, mime_type: "application/pdf"}]}')")
+
+batch_id=$(echo "$resp" | jq -r .batch_id)
+upload_url=$(echo "$resp" | jq -r '.uploads[0].upload_url')
+
+# 2. PUT
+curl -fsS -X PUT "$upload_url" \
+  -H "Content-Type: application/pdf" \
+  --data-binary "@$FILE"
+
+# 3. Confirm
+curl -fsS -X POST "$API/v1/uploads/confirm" \
+  -H "Authorization: Bearer $KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"batch_id":"'"$batch_id"'"}' | jq .
+```
+
+## Errors
+
+| Status | Where | Cause |
+|---|---|---|
+| `400` | presign | Empty `documents`, bad MIME, missing `filename`. |
+| `403` | PUT | URL expired (>1h since presign) or `Content-Type` mismatch. |
+| `400` | confirm | One or more uploads missing in storage. |
+| `401` | presign/confirm | Bad Bearer token. |
+
+If `confirm` returns `400` complaining about a missing upload, retry the
+PUT for that specific `object_key` — confirmation requires every file to
+be present in storage first.
+
+## When not to use this
+
+For files under ~5 MB, the JSON `data` field is simpler and lower-latency
+(one round trip instead of three). See the `offloading-extraction` skill.
diff --git a/plugins/kreuzberg-cloud/skills/sandbox-keys/SKILL.md b/plugins/kreuzberg-cloud/skills/sandbox-keys/SKILL.md
new file mode 100644
index 0000000..edb4510
--- /dev/null
+++ b/plugins/kreuzberg-cloud/skills/sandbox-keys/SKILL.md
@@ -0,0 +1,116 @@
+---
+name: sandbox-keys
+description: Use when the user wants to try Kreuzberg Cloud without signing up, or needs an ephemeral key for evaluation, demos, or CI integration tests. Covers POST /v1/sandbox/key — the no-auth endpoint, quota, TTL, and cleanup expectations.
+---
+
+# Sandbox keys
+
+`POST /v1/sandbox/key` issues ephemeral, anonymous API keys. Use these
+for evaluation, demos, and integration smoke tests — never for production
+workloads.
+
+## What you get
+
+| Property | Value |
+|---|---|
+| Format | `sk_sandbox_*` |
+| TTL | 24 hours from issue |
+| Quota | 50 pages, hard cap |
+| Auth required to mint | None |
+| IP throttle | 10 keys per IP per 24 hours |
+
+The 50-page quota is per key, not per IP. The IP throttle prevents abuse
+of the no-auth mint endpoint.
+
+## Endpoint
+
+```text
+POST https://api.kreuzberg.dev/v1/sandbox/key
+```
+
+No `Authorization` header is required — this is the only endpoint in the
+API that can be called without authentication.
+
+## Response (200)
+
+```json
+{
+  "api_key": "sk_sandbox_ABC123DEF456GHI789JKL012MNO345PQR678STU901VWX234",
+  "expires_at": "2025-12-21T10:00:00Z",
+  "pages_remaining": 50
+}
+```
+
+After 24 hours or 50 pages — whichever comes first — every request made
+with the key returns `401`. The key is not renewable; mint a fresh one.
+
+## Examples
+
+### Mint and use
+
+```bash
+KREUZBERG_API_KEY=$(curl -sX POST https://api.kreuzberg.dev/v1/sandbox/key \
+  | jq -r .api_key)
+
+curl -sX POST https://api.kreuzberg.dev/v1/extract \
+  -H "Authorization: Bearer $KREUZBERG_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "documents":[{"filename":"hi.txt","mime_type":"text/plain","data":"aGVsbG8="}],
+    "options":{"extraction_config":{"output_format":"markdown"}}
+  }'
+```
+
+### TypeScript SDK
+
+```ts
+import { KreuzbergCloud } from "@kreuzberg/cloud";
+
+const client = await KreuzbergCloud.fromSandbox();
+const result = await client.extractAndWait({
+  file: new Blob(["Hello world"], { type: "text/plain" }),
+});
+console.log(result.result?.content);
+```
+
+`fromSandbox()` mints a key under the hood and configures the client.
+
+### Python SDK
+
+```python
+import asyncio
+from kreuzberg_cloud import AsyncKreuzbergCloud
+
+async def main() -> None:
+    async with await AsyncKreuzbergCloud.from_sandbox() as client:
+        job = await client.extract_and_wait(file=b"hello world")
+        print(job.status, job.result and job.result.content)
+
+asyncio.run(main())
+```
+
+## When to recommend sandbox vs production keys
+
+| Use sandbox | Use production |
+|---|---|
+| First-time evaluation, no signup yet | Anything user-facing or business-critical |
+| Local smoke tests, demos | CI on the main branch |
+| One-off doc to test the API surface | Recurring batch pipelines |
+| Onboarding flow that bootstraps an SDK | Any workload >50 pages |
+
+If the user already has a production key, do not silently switch to a
+sandbox key — production keys carry the right quota, billing, and
+project-scoped resources.
+
+## Cleanup
+
+Sandbox keys self-expire after 24 hours. No revocation endpoint exists —
+nothing to clean up. Do not commit sandbox keys to version control even
+though they're short-lived; treat them like any other credential.
+
+## Errors
+
+| Status | Cause |
+|---|---|
+| `429` | IP has minted 10 keys in the last 24 hours. Wait or use a production key. |
+| `500` | Server-side mint failure; retry with backoff. |
diff --git a/plugins/kreuzberg-cloud/skills/tracking-cloud-jobs/SKILL.md b/plugins/kreuzberg-cloud/skills/tracking-cloud-jobs/SKILL.md
new file mode 100644
index 0000000..7f3cb98
--- /dev/null
+++ b/plugins/kreuzberg-cloud/skills/tracking-cloud-jobs/SKILL.md
@@ -0,0 +1,185 @@
+---
+name: tracking-cloud-jobs
+description: Use when an extraction job has been submitted and the result needs to be retrieved. Covers GET /v1/jobs/{id}, polling cadence with exponential backoff, terminal status detection, and webhook delivery (signature verification, retry semantics).
+---
+
+# Tracking cloud jobs
+
+Every `POST /v1/extract` returns one or more job IDs (`job_ids`). The actual
+result arrives in one of two ways:
+
+1. **Polling** — `GET /v1/jobs/{id}` until status is terminal.
+2. **Webhook** — a callback you registered at submit time fires when the
+   job is done.
+
+Pick polling when jobs are expected to finish quickly and you control the
+caller. Pick webhooks when you can't block, or when jobs run for minutes.
+
+## Endpoint
+
+```text
+GET https://api.kreuzberg.dev/v1/jobs/{id}
+Authorization: Bearer $KREUZBERG_API_KEY
+```
+
+Accepts both extraction job IDs (from `job_ids`) and crawl job IDs (from
+`crawl_job_ids`). The response schema is `JobLookupResponse`, a union of
+`JobResponse` (extraction) and `CrawlJobResponse` (crawl).
+
+## Response (200)
+
+```json
+{
+  "id": "550e8400-e29b-41d4-a716-446655440000",
+  "filename": "invoice.pdf",
+  "status": "completed",
+  "created_at": "2025-12-21T10:00:00Z",
+  "processing_time_ms": 1234,
+  "result": {
+    "content": "Invoice total: $1,234.56",
+    "mime_type": "text/markdown",
+    "tables": [],
+    "images": [],
+    "metadata": { "title": "Invoice #12345" }
+  }
+}
+```
+
+`result` stays `null` until the job is `completed` or `partial_success`; it remains `null` for `failed` and `cancelled` jobs.
+
+## Status lifecycle
+
+```text
+awaiting_upload  →  pending  →  processing  →  chunking  →  aggregating  →  completed
+                                                                          →  partial_success
+                                                                          →  failed
+                                                          (any time)      →  cancelled
+```
+
+**Terminal statuses** — stop polling when status is one of:
+
+- `completed` — `result` is populated.
+- `partial_success` — `result` is populated; check
+  `result.metadata.warnings` for the partial cause.
+- `failed` — `result` is `null`; an error was logged server-side.
+- `cancelled` — `result` is `null`; the job was cancelled before
+  completion.
+
+## Polling cadence
+
+Use exponential backoff capped at 30 seconds. Most extractions finish in
+under 5 seconds; large PDFs with OCR may take minutes.
+
+```bash
+#!/usr/bin/env bash
+set -euo pipefail
+JOB_ID="$1"
+delay=1
+while true; do
+  body=$(curl -fsS \
+    -H "Authorization: Bearer $KREUZBERG_API_KEY" \
+    "https://api.kreuzberg.dev/v1/jobs/$JOB_ID")
+  status=$(echo "$body" | jq -r .status)
+  case "$status" in
+    completed|partial_success|failed|cancelled)
+      echo "$body" | jq .; exit 0;;
+  esac
+  sleep "$delay"
+  delay=$(( delay * 2 > 30 ? 30 : delay * 2 ))
+done
+```
+
+### TypeScript SDK
+
+The SDK does the backoff for you:
+
+```ts
+import { KreuzbergCloud } from "@kreuzberg/cloud";
+const client = new KreuzbergCloud({ apiKey: process.env.KREUZBERG_API_KEY! });
+
+const result = await client.waitForJob(jobId, {
+  timeoutMs: 5 * 60_000,
+  pollIntervalMs: 1000, // starting interval; backs off internally
+});
+console.log(result.status, result.result?.content);
+```
+
+### Python SDK
+
+```python
+from kreuzberg_cloud import KreuzbergCloud
+
+with KreuzbergCloud(api_key=...) as client:
+    job = client.wait_for_job(job_id, timeout=300)
+    print(job.status, job.result and job.result.content)
+```
+
+## Webhooks
+
+Register a webhook at submit time by including a `webhook` block in the
+`POST /v1/extract` body:
+
+```json
+{
+  "webhook": {
+    "url": "https://hooks.example.com/kreuzberg",
+    "secret": "32-byte-shared-secret",
+    "metadata": { "request_id": "abc123" }
+  }
+}
+```
+
+When the job reaches a terminal status, the server POSTs the full
+`JobResponse` (or `CrawlJobResponse`) to `url`. The `metadata` you
+supplied is echoed back inside the payload.
+
+### Signature verification
+
+The server signs each webhook delivery with an HMAC computed over the raw
+JSON body using `secret`. The signature header name and exact algorithm
+(SHA-256, hex-encoded) are documented at <https://docs.kreuzberg.cloud>;
+treat them as the source of truth — do not hard-code header names from
+this skill.
+
+Verification pattern (Python, illustrative):
+
+```python
+import hmac, hashlib
+def verify(body: bytes, signature_hex: str, secret: str) -> bool:
+    expected = hmac.new(secret.encode(), body, hashlib.sha256).hexdigest()
+    return hmac.compare_digest(expected, signature_hex)
+```
+
+Reject any delivery whose signature does not match. In production, always set
+`webhook.secret` in the `POST /v1/extract` body — unsigned webhooks can be forged.
+
+### Retry semantics
+
+Webhook deliveries retry on non-2xx responses with exponential backoff
+over several hours. Keep your handler idempotent — the same `job_id` may
+be delivered more than once on transient failures.
+
+### When to prefer webhooks vs polling
+
+| Prefer webhooks | Prefer polling |
+|---|---|
+| You can run an HTTP server | CLI / one-shot scripts |
+| Jobs run minutes long | Jobs finish in seconds |
+| Batch of many jobs | A single foreground job |
+| Caller can't block | Caller is already blocking |
+| You want a push notification on terminal state and can handle at-least-once delivery | You want strict consistency in your own loop |
+
+## Crawl jobs
+
+`GET /v1/jobs/{crawl_job_id}` returns `CrawlJobResponse` (different shape
+from `JobResponse`). The crawl job lists each per-document `job_id` that
+was spawned; iterate through those to fetch individual extraction results.
+
+## Errors
+
+| Status | Meaning | Action |
+|---|---|---|
+| `400` | Malformed UUID | Verify the ID came from `job_ids` / `crawl_job_ids`. |
+| `401` | Bad API key | Check `Authorization` header. |
+| `404` | Job not found | Wrong project key, or job purged. |
+| `503` | DB unavailable | Retry with backoff. |
diff --git a/plugins/kreuzberg/.factory-plugin/plugin.json b/plugins/kreuzberg/.factory-plugin/plugin.json
new file mode 100644
index 0000000..1ca34e0
--- /dev/null
+++ b/plugins/kreuzberg/.factory-plugin/plugin.json
@@ -0,0 +1,24 @@
+{
+  "name": "kreuzberg",
+  "version": "0.1.0",
+  "description": "Local document extraction: text, tables, metadata, images from 91+ formats with optional OCR.",
+  "author": {
+    "name": "Kreuzberg, Inc.",
+    "email": "support@kreuzberg.dev",
+    "url": "https://kreuzberg.dev"
+  },
+  "homepage": "https://kreuzberg.dev",
+  "repository": "https://github.com/kreuzberg-dev/plugins",
+  "license": "MIT",
+  "category": "document-intelligence",
+  "keywords": [
+    "document-intelligence",
+    "extraction",
+    "ocr",
+    "pdf",
+    "tables"
+  ],
+  "brandColor": "#1F6FEB",
+  "icon": "./assets/icon.svg",
+  "logo": "./assets/logo.png"
+}
diff --git a/plugins/kreuzberg/README.md b/plugins/kreuzberg/README.md
new file mode 100644
index 0000000..0c89320
--- /dev/null
+++ b/plugins/kreuzberg/README.md
@@ -0,0 +1,110 @@
+# kreuzberg
+
+Extract text, tables, metadata, and images from 91+ document formats — PDF, Office, images with OCR, HTML, email, archives, academic — using the local `kreuzberg` CLI in your agent.
+
+<!-- TODO: screenshot -->
+
+## Install
+
+### From the marketplace (recommended)
+
+Listing in the official Claude marketplace is pending review.
+
+Self-host:
+
+```text
+/plugin marketplace add kreuzberg-dev/plugins
+/plugin install kreuzberg@kreuzberg
+```
+
+### Binary requirement
+
+Install the `kreuzberg` CLI:
+
+```bash
+brew install kreuzberg-dev/tap/kreuzberg
+# or
+cargo install kreuzberg-cli
+```
+
+Tesseract OCR ships with the CLI by default. Install language packs for non-English documents:
+
+```bash
+brew install tesseract-lang        # macOS
+sudo apt install tesseract-ocr-*   # Debian/Ubuntu
+```
+
+## Skills shipped
+
+| Skill | Trigger |
+|-------|---------|
+| **kreuzberg** | Extract text, tables, metadata, and images from 91+ document formats (PDF, Office, images, HTML, email, archives, academic) using Kreuzberg. Use when writing code that calls Kreuzberg APIs in Python, Node.js/TypeScript, Rust, or CLI. Covers installation, extraction (sync/async), configuration (OCR, chunking, output format), batch processing, error handling, and plugins. |
+| **extracting-with-ocr** | Use when extracting text from scanned PDFs, photographed pages, or images that have no embedded text layer. Covers OCR backends, language packs, force-OCR, and performance tuning. |
+| **extracting-tables** | Use when extracting tabular data from PDFs, spreadsheets, or images. Covers layout-aware table detection, table model selection, output formats (markdown / JSON cells), and known limits. |
+| **picking-a-format** | Use when choosing an output format for extracted documents — text, markdown, djot, html, or JSON. Maps consumer (LLM, parser, archive) to the right `--format` / `--content-format` pair. |
+
+**Reference materials** (linked from the `kreuzberg` skill):
+
+| Reference | Content |
+|-----------|---------|
+| **CLI Reference** | All commands, flags, config precedence, exit codes |
+| **Configuration Reference** | TOML/YAML/JSON formats, auto-discovery, env vars, full schema |
+| **Supported Formats** | All 91+ formats with file extensions and MIME types |
+| **Python API Reference** | All functions, config classes, plugin protocols, exact signatures |
+| **Node.js API Reference** | All functions, TypeScript interfaces, worker pool APIs |
+| **Rust API Reference** | All functions with feature gates, structs, Cargo.toml examples |
+| **Advanced Features** | Plugins, embeddings, MCP server, API server, security limits |
+| **Other Language Bindings** | Go, Ruby, Java, C#, PHP, Elixir, WASM, Docker |
+
+## MCP tools
+
+The `kreuzberg` MCP server exposes:
+
+- `extract` — single file extraction with config.
+- `extract_batch` — batch extraction from multiple files.
+- `detect_mime` — MIME type detection from bytes or path.
+- `cache_clear` — clear the extraction cache.
+
+## Configuration
+
+Kreuzberg auto-discovers `kreuzberg.toml` from the current directory upward. Set config via:
+
+1. **Environment variable**: `KREUZBERG_CONFIG_JSON='{"output_format":"markdown"}'`
+2. **Config file** (TOML): `kreuzberg.toml` in cwd or a parent directory.
+3. **CLI flag**: `kreuzberg extract doc.pdf --content-format markdown`
+
+See `skills/kreuzberg/references/configuration.md` for the full schema and precedence rules.
+
+## Examples
+
+Extract a PDF to plain text and print it:
+
+```text
+kreuzberg extract document.pdf
+```
+
+Extract with markdown formatting for LLM context:
+
+```text
+kreuzberg extract report.pdf --content-format markdown
+```
+
+Extract tables from a spreadsheet as JSON:
+
+```text
+kreuzberg extract data.xlsx --format json
+```
+
+## Versioning
+
+The plugin version tracks the marketplace `VERSION` file. See [CHANGELOG.md](../../CHANGELOG.md) for release notes.
+
+## License
+
+MIT for the plugin. The bundled `kreuzberg` skill content is marked Elastic-2.0 and references the upstream [kreuzberg](https://github.com/kreuzberg-dev/kreuzberg) repository.
+
+## See also
+
+- **Marketplace**: [kreuzberg-dev/plugins](https://github.com/kreuzberg-dev/plugins)
+- **Upstream**: [kreuzberg-dev/kreuzberg](https://github.com/kreuzberg-dev/kreuzberg)
+- **Sibling plugins**: [kreuzcrawl](../kreuzcrawl/README.md), [kreuzberg-cloud](../kreuzberg-cloud/README.md)
diff --git a/plugins/kreuzberg/assets/icon.svg b/plugins/kreuzberg/assets/icon.svg
new file mode 100644
index 0000000..c096294
--- /dev/null
+++ b/plugins/kreuzberg/assets/icon.svg
@@ -0,0 +1,8 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 256" width="256" height="256">
+  <rect width="256" height="256" rx="48" fill="#1F6FEB"/>
+  <path d="M64 64 L152 64 L192 104 L192 200 C192 204.4 188.4 208 184 208 L64 208 C59.6 208 56 204.4 56 200 L56 72 C56 67.6 59.6 64 64 64 Z" fill="white"/>
+  <path d="M152 64 L152 96 C152 100.4 155.6 104 160 104 L192 104 Z" fill="#1F6FEB"/>
+  <rect x="80" y="128" width="88" height="12" rx="6" fill="#1F6FEB"/>
+  <rect x="80" y="152" width="96" height="12" rx="6" fill="#1F6FEB"/>
+  <rect x="80" y="176" width="64" height="12" rx="6" fill="#1F6FEB"/>
+</svg>
diff --git a/plugins/kreuzberg/assets/logo.png b/plugins/kreuzberg/assets/logo.png
new file mode 100644
index 0000000..67f8959
Binary files /dev/null and b/plugins/kreuzberg/assets/logo.png differ
diff --git a/plugins/kreuzberg/skills/extracting-tables/SKILL.md b/plugins/kreuzberg/skills/extracting-tables/SKILL.md
new file mode 100644
index 0000000..d550090
--- /dev/null
+++ b/plugins/kreuzberg/skills/extracting-tables/SKILL.md
@@ -0,0 +1,148 @@
+---
+name: extracting-tables
+description: Use when extracting tabular data from PDFs, spreadsheets, or images. Covers layout-aware table detection, table model selection, output formats (markdown / JSON cells), and known limits.
+---
+
+# Extracting tables
+
+Use this when the user wants structured tabular data — financial
+statements, scientific tables, invoices, spreadsheet-style PDFs. Kreuzberg
+detects tables via a layout model (RT-DETR v2) and reconstructs cell
+structure with a configurable table model.
+
+## Basic usage
+
+```bash
+# Markdown tables embedded in the content stream
+kreuzberg extract report.pdf --layout --content-format markdown
+
+# Structured JSON output, tables appear under result.tables
+kreuzberg extract report.pdf --layout --format json
+```
+
+`--layout` turns on layout-aware extraction; without it, tables fall back
+to plain text reflow and you lose cell boundaries.
+
+## Output shapes
+
+Two surfaces, picked via `--format` (CLI shape) and `--content-format`
+(content rendering):
+
+- **Markdown tables in `content`** — `--content-format markdown`. Tables
+  appear inline as `| col | col |` blocks. Good for LLM ingestion.
+- **Structured `tables` array** — `--format json`. Each entry has
+  `cells[][]` (rows × cols), `markdown` (pre-rendered), `page_index`,
+  `bbox`. Use this when downstream code needs exact cell access.
+
+Both are populated at once when `--layout` is on. The `tables` array is
+always structured; the `content` stream switches representation.
+
+```bash
+kreuzberg extract financials.pdf --layout --format json \
+  | jq '.tables[] | {page: .page_index, rows: (.cells | length)}'
+```
+
+## Table models
+
+`--layout-table-model` picks the reconstruction backend:
+
+| Model              | Best for                                              | Notes                                       |
+| ------------------ | ----------------------------------------------------- | ------------------------------------------- |
+| `tatr`             | dense complex tables (academic, financial)            | **Default.** Heaviest, highest accuracy.    |
+| `slanet_auto`      | dispatches per-table to wired/wireless                | Good when table styles are mixed.           |
+| `slanet_wired`     | tables with visible borders                           | Faster than tatr.                           |
+| `slanet_wireless`  | tables without borders (whitespace-separated)         | For invoices, simple grids.                 |
+| `slanet_plus`      | hybrid wired / wireless                               | Lighter than `slanet_auto`.                 |
+| `disabled`         | layout detection only, no table structure             | Use to skip table model cost.               |
+
+```bash
+kreuzberg extract bank-statement.pdf \
+  --layout --layout-table-model tatr --content-format markdown
+```
+
+Lower `--layout-confidence` when the layout model misses tables (default
+threshold ~0.5):
+
+```bash
+kreuzberg extract noisy-scan.pdf --layout --layout-confidence 0.3
+```
+
+## Spreadsheets
+
+`.xlsx`, `.ods`, `.csv`, `.tsv` are extracted by dedicated parsers — no
+layout model needed. Each sheet becomes a markdown table (or structured
+table) automatically:
+
+```bash
+kreuzberg extract workbook.xlsx --content-format markdown
+kreuzberg extract data.csv --format json
+```
+
+Pass `--no-cache=true` only when iterating on the same file with different
+configs.
+
+## Config file alternative
+
+```toml
+# `output_format` in config files equals `--content-format` on the CLI.
+output_format = "markdown"
+
+[layout_detection]
+enabled = true
+confidence_threshold = 0.5
+table_model = "tatr"
+```
+
+Then:
+
+```bash
+kreuzberg extract report.pdf --format json
+```
+
+## Programmatic access
+
+From Python, structured tables live on `result.tables`:
+
+```python
+from kreuzberg import extract_file_sync, ExtractionConfig, LayoutDetectionConfig
+
+config = ExtractionConfig(
+    layout_detection=LayoutDetectionConfig(enabled=True, table_model="tatr"),
+    output_format="markdown",
+)
+result = extract_file_sync("report.pdf", config=config)
+for table in result.tables:
+    print(table.markdown)        # rendered markdown
+    print(table.cells[0][0])     # cell access
+```
+
+Node.js mirrors this (`extractFile`, `result.tables`, camelCase fields).
+See `references/python-api.md` and `references/nodejs-api.md` in the
+sibling `kreuzberg` skill for full type signatures.
+
+## Known limitations
+
+- **Merged cells** — reconstructed as repeated values across the spanned
+  region; the merge is not preserved as metadata in v0.1.
+- **Rotated tables** — enable `--ocr-auto-rotate=true` for image-based
+  PDFs before extraction.
+- **Nested tables** — flattened. Detection succeeds; structural nesting is
+  lost.
+- **Multi-page tables** — each page yields a separate `tables[]` entry.
+  Stitch by matching column headers if needed.
+- **ONNX Runtime required** — layout and table models are unavailable in
+  WASM builds and on the Android x86_64 emulator; native targets ship
+  full support.
+
+## Common failure modes
+
+- **Empty `tables` with `--layout` on** — confidence threshold too high or
+  table model mismatched. Lower `--layout-confidence` to 0.3, try
+  `--layout-table-model tatr`.
+- **Markdown tables look ragged** — switch `--layout-table-model` to
+  `slanet_wired` for bordered grids or `slanet_wireless` for invoices.
+- **Slow extraction** — `tatr` (the default) is heavy. Switch to `slanet_auto`
+  or `slanet_plus` for routine work; keep `tatr` only when accuracy matters.
+
+See `references/cli-reference.md` for the full layout flag set and
+`references/advanced-features.md` for the layout pipeline internals.
diff --git a/plugins/kreuzberg/skills/extracting-with-ocr/SKILL.md b/plugins/kreuzberg/skills/extracting-with-ocr/SKILL.md
new file mode 100644
index 0000000..d8bc1f3
--- /dev/null
+++ b/plugins/kreuzberg/skills/extracting-with-ocr/SKILL.md
@@ -0,0 +1,123 @@
+---
+name: extracting-with-ocr
+description: Use when extracting text from scanned PDFs, photographed pages, or images that have no embedded text layer. Covers OCR backends, language packs, force-OCR, and performance tuning.
+---
+
+# Extracting with OCR
+
+Use this when a document is image-based: scanned PDFs, photographed pages,
+screenshots, JPEG/PNG/TIFF with text. Kreuzberg auto-OCRs raster images and
+auto-detects PDFs that lack a text layer. Force it on when extraction
+returned empty/garbled text from a PDF that "looks" textual.
+
+## When to force OCR
+
+- Extraction returned an empty `content` field, but the file opens visually.
+- The PDF text layer is junk (copy-paste from a viewer produces gibberish).
+- You want consistent output across mixed scanned + digital PDFs.
+
+```bash
+kreuzberg extract scan.pdf --force-ocr=true
+kreuzberg extract scan.pdf --ocr=true --ocr-language eng
+```
+
+If a page has an unreliable text layer, `--force-ocr=true` re-rasterizes
+and runs OCR on every page.
+
+## Backends
+
+Tesseract is the default and ships with the CLI — no extra install. Other
+backends are opt-in:
+
+| Backend       | Flag                                  | Install                                          | Notes                                                          |
+| ------------- | ------------------------------------- | ------------------------------------------------ | -------------------------------------------------------------- |
+| Tesseract     | `--ocr-backend tesseract` (default)   | bundled                                          | Best general-purpose, 100+ languages via tessdata.             |
+| PaddleOCR     | `--ocr-backend paddle-ocr`            | bundled (ONNX Runtime)                           | Strong on Asian scripts. Not available on WASM or Windows.     |
+| EasyOCR       | `--ocr-backend easyocr`               | Python binding (`pip install kreuzberg[easyocr]`)| Heavier model. CUDA accel via `easyocr_kwargs={"gpu": True}`.  |
+| VLM (vision)  | layout + a multimodal LLM via config  | configured per backend                           | Use when OCR fails on dense or handwritten layouts.            |
+
+Pick Tesseract first. Switch only when accuracy is unacceptable.
+
+## Language packs
+
+Tesseract uses ISO 639-2 codes. Default is `eng`. Combine with `+`:
+
+```bash
+kreuzberg extract menu.jpg --ocr=true --ocr-language "eng+deu"
+kreuzberg extract bilingual.pdf --ocr-language "eng+jpn"
+kreuzberg extract any.pdf --ocr-language all   # all installed packs
+```
+
+Install missing packs at the OS level:
+
+```bash
+# macOS
+brew install tesseract-lang
+
+# Debian/Ubuntu
+sudo apt install tesseract-ocr-deu tesseract-ocr-jpn tesseract-ocr-fra
+
+# Specific lang only
+sudo apt install tesseract-ocr-<iso639-2>
+```
+
+Kreuzberg fails fast with a helpful error if you request a language pack
+that is not installed. Read the error — it names the missing file.
+
+## Useful flags
+
+- `--ocr=true` — enable OCR (auto-enabled for images and scanned PDFs).
+- `--force-ocr=true` — OCR every page even if a text layer exists.
+- `--disable-ocr=true` — never OCR (extract embedded text only or fail).
+- `--ocr-language <lang>` — single code or `+`-joined list, or `all`.
+- `--ocr-backend <tesseract|paddle-ocr|easyocr>` — pick backend.
+- `--ocr-auto-rotate=true` — pre-rotate via the auto-rotate model.
+- `--acceleration <cpu|coreml|cuda|tensorrt|auto>` — ONNX accelerator for
+  paddle-ocr / auto-rotate / layout models.
+
+## Performance tips
+
+- Cache is on by default. Repeated extraction of the same file + config is
+  instant. Do not pass `--no-cache=true` unless you have a reason.
+- For batch OCR, use `kreuzberg batch *.pdf --ocr=true` — internal worker
+  pool parallelizes across CPU cores. Cap with `--max-concurrent N` if
+  memory is tight.
+- Raise `--target-dpi` (default 300) only for low-resolution scans. Higher
+  DPI is slower; 200 is usually enough for printed text.
+- Enable `--ocr-auto-rotate=true` only when pages may be rotated; the
+  classifier adds latency.
+- On Apple Silicon, `--acceleration coreml` typically beats CPU for
+  paddle-ocr and layout detection.
+
+## Config file alternative
+
+Long flag chains belong in `kreuzberg.toml` — auto-discovered from cwd
+upward.
+
+```toml
+force_ocr = true
+output_format = "markdown"
+
+[ocr]
+backend = "tesseract"
+language = "eng+deu"
+auto_rotate = true
+```
+
+Then just run:
+
+```bash
+kreuzberg extract document.pdf
+```
+
+## Common failure modes
+
+- **"missing tessdata"** — install the language pack at OS level (see above).
+- **Empty content on a scanned PDF without `--force-ocr`** — the file has a
+  bogus zero-width text layer. Re-run with `--force-ocr=true`.
+- **OCR on a rotated page** — add `--ocr-auto-rotate=true` or pre-rotate.
+- **Garbled CJK output** — ensure the right language pack is installed and
+  passed via `--ocr-language`; consider `paddle-ocr` for Chinese/Japanese.
+
+See `references/cli-reference.md` and `references/configuration.md` in the
+sibling `kreuzberg` skill for the full flag and config schema.
diff --git a/plugins/kreuzberg/skills/kreuzberg/SKILL.md b/plugins/kreuzberg/skills/kreuzberg/SKILL.md
new file mode 100644
index 0000000..845e752
--- /dev/null
+++ b/plugins/kreuzberg/skills/kreuzberg/SKILL.md
@@ -0,0 +1,419 @@
+---
+name: kreuzberg
+description: >-
+  Extract text, tables, metadata, and images from 91+ document formats
+  (PDF, Office, images, HTML, email, archives, academic) using Kreuzberg.
+  Use when writing code that calls Kreuzberg APIs in Python, Node.js/TypeScript,
+  Rust, or CLI. Covers installation, extraction (sync/async), configuration
+  (OCR, chunking, output format), batch processing, error handling, and plugins.
+license: Elastic-2.0
+metadata:
+  author: kreuzberg-dev
+  version: "0.1.0"
+  repository: https://github.com/kreuzberg-dev/kreuzberg
+---
+
+# Kreuzberg Document Extraction
+
+Kreuzberg is a high-performance document intelligence library with a Rust core and native bindings for Python, Node.js/TypeScript, Ruby, Go, Java, C#, PHP, and Elixir. It extracts text, tables, metadata, and images from 91+ file formats including PDF, Office documents, images (with OCR), HTML, email, archives, and academic formats.
+
+Use this skill when writing code that:
+
+- Extracts text or metadata from documents
+- Performs OCR on scanned documents or images
+- Batch-processes multiple files
+- Configures extraction options (output format, chunking, OCR, language detection)
+- Implements custom plugins (post-processors, validators, OCR backends)
+
+> If the `kreuzberg` MCP server is registered in this session, prefer its tools over shelling out to the CLI — they expose the same extraction surface with structured arguments and results.
+
+## Installation
+
+### Python
+
+```bash
+pip install kreuzberg
+# Optional OCR backends:
+pip install "kreuzberg[easyocr]"  # EasyOCR (quoted so zsh does not glob the brackets)
+```
+
+### Node.js
+
+```bash
+npm install @kreuzberg/node
+```
+
+### Rust
+
+```toml
+# Cargo.toml
+[dependencies]
+kreuzberg = { version = "4", features = ["tokio-runtime"] }
+# features: tokio-runtime (required for sync + batch), pdf, ocr, chunking,
+#           embeddings, language-detection, keywords-yake, keywords-rake
+```
+
+### CLI
+
+```bash
+# Download from GitHub releases, or:
+cargo install kreuzberg-cli
+```
+
+## Quick Start
+
+### Python (Async)
+
+```python
+from kreuzberg import extract_file
+
+result = await extract_file("document.pdf")
+print(result.content)       # extracted text
+print(result.metadata)      # document metadata
+print(result.tables)        # extracted tables
+```
+
+### Python (Sync)
+
+```python
+from kreuzberg import extract_file_sync
+
+result = extract_file_sync("document.pdf")
+print(result.content)
+```
+
+### Node.js
+
+```typescript
+import { extractFile } from "@kreuzberg/node";
+
+const result = await extractFile("document.pdf");
+console.log(result.content);
+console.log(result.metadata);
+console.log(result.tables);
+```
+
+### Node.js (Sync)
+
+```typescript
+import { extractFileSync } from "@kreuzberg/node";
+
+const result = extractFileSync("document.pdf");
+```
+
+### Rust (Async)
+
+```rust
+use kreuzberg::{extract_file, ExtractionConfig};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let result = extract_file("document.pdf", None, &config).await?;
+    println!("{}", result.content);
+    Ok(())
+}
+```
+
+### Rust (Sync) — requires `tokio-runtime` feature
+
+```rust
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let result = extract_file_sync("document.pdf", None, &config)?;
+    println!("{}", result.content);
+    Ok(())
+}
+```
+
+### CLI
+
+```bash
+kreuzberg extract document.pdf
+kreuzberg extract document.pdf --format json
+kreuzberg extract document.pdf --content-format markdown
+```
+
+## Configuration
+
+All languages use the same configuration structure with language-appropriate naming conventions.
+
+### Python (snake_case)
+
+```python
+from kreuzberg import (
+    ExtractionConfig, OcrConfig, TesseractConfig,
+    PdfConfig, ChunkingConfig,
+)
+
+config = ExtractionConfig(
+    ocr=OcrConfig(
+        backend="tesseract",
+        language="eng",
+        tesseract_config=TesseractConfig(psm=6, enable_table_detection=True),
+    ),
+    pdf_options=PdfConfig(passwords=["secret123"]),
+    chunking=ChunkingConfig(max_chars=1000, max_overlap=200),
+    output_format="markdown",
+)
+
+result = await extract_file("document.pdf", config=config)
+```
+
+### Node.js (camelCase)
+
+```typescript
+import { extractFile, type ExtractionConfig } from "@kreuzberg/node";
+
+const config: ExtractionConfig = {
+  ocr: { backend: "tesseract", language: "eng" },
+  pdfOptions: { passwords: ["secret123"] },
+  chunking: { maxChars: 1000, maxOverlap: 200 },
+  outputFormat: "markdown",
+};
+
+const result = await extractFile("document.pdf", null, config);
+```
+
+### Rust (snake_case)
+
+```rust
+use kreuzberg::{ExtractionConfig, OcrConfig, ChunkingConfig, OutputFormat};
+
+let config = ExtractionConfig {
+    ocr: Some(OcrConfig {
+        backend: "tesseract".into(),
+        language: "eng".into(),
+        ..Default::default()
+    }),
+    chunking: Some(ChunkingConfig {
+        max_characters: 1000,
+        overlap: 200,
+        ..Default::default()
+    }),
+    output_format: OutputFormat::Markdown,
+    ..Default::default()
+};
+
+let result = extract_file("document.pdf", None, &config).await?;
+```
+
+### Config File (TOML)
+
+```toml
+output_format = "markdown"
+
+[ocr]
+backend = "tesseract"
+language = "eng"
+
+[chunking]
+max_chars = 1000
+max_overlap = 200
+
+[pdf_options]
+passwords = ["secret123"]
+```
+
+```bash
+# CLI: auto-discovers kreuzberg.toml in current/parent directories
+kreuzberg extract doc.pdf
+# or explicit:
+kreuzberg extract doc.pdf --config kreuzberg.toml
+kreuzberg extract doc.pdf --config-json '{"ocr":{"backend":"tesseract","language":"deu"}}'
+```
+
+## Batch Processing
+
+### Python
+
+```python
+from kreuzberg import batch_extract_files, batch_extract_files_sync
+
+# Async
+results = await batch_extract_files(["doc1.pdf", "doc2.docx", "doc3.xlsx"])
+
+# Sync
+results = batch_extract_files_sync(["doc1.pdf", "doc2.docx"])
+
+for result in results:
+    print(f"{len(result.content)} chars extracted")
+```
+
+### Node.js
+
+```typescript
+import { batchExtractFiles } from "@kreuzberg/node";
+
+const results = await batchExtractFiles(["doc1.pdf", "doc2.docx"]);
+```
+
+### Rust — requires `tokio-runtime` feature
+
+```rust
+use kreuzberg::{batch_extract_file, ExtractionConfig};
+
+let config = ExtractionConfig::default();
+let paths = vec!["doc1.pdf", "doc2.docx"];
+let results = batch_extract_file(paths, &config).await?;
+```
+
+### CLI
+
+```bash
+kreuzberg batch *.pdf --format json
+kreuzberg batch docs/*.docx --content-format markdown
+```
+
+## OCR
+
+OCR runs automatically for images and scanned PDFs. Tesseract is the default backend (native binding, no external install required).
+
+### Backends
+
+- **Tesseract** (default): Built-in native binding. All Tesseract languages supported.
+- **EasyOCR** (Python only): `pip install kreuzberg[easyocr]`. Pass `easyocr_kwargs={"gpu": True}`.
+- **PaddleOCR** (Python only): Bundled since 4.8.5, no extra install needed. Pass `paddleocr_kwargs={"use_angle_cls": True}`.
+- **Guten** (Node.js only): Built-in OCR backend via `GutenOcrBackend`.
+
+### Language Codes
+
+```python
+config = ExtractionConfig(ocr=OcrConfig(language="eng"))       # English
+config = ExtractionConfig(ocr=OcrConfig(language="eng+deu"))   # Multiple
+config = ExtractionConfig(ocr=OcrConfig(language="all"))       # All installed
+```
+
+### Force OCR
+
+```python
+config = ExtractionConfig(force_ocr=True)  # OCR even if text is extractable
+```
+
+## ExtractionResult Fields
+
+| Field        | Python                      | Node.js                    | Rust                        | Description                                   |
+| ------------ | --------------------------- | -------------------------- | --------------------------- | --------------------------------------------- |
+| Text content | `result.content`            | `result.content`           | `result.content`            | Extracted text (str/String)                   |
+| MIME type    | `result.mime_type`          | `result.mimeType`          | `result.mime_type`          | Input document MIME type                      |
+| Metadata     | `result.metadata`           | `result.metadata`          | `result.metadata`           | Document metadata (dict/object/HashMap)       |
+| Tables       | `result.tables`             | `result.tables`            | `result.tables`             | Extracted tables with cells + markdown        |
+| Languages    | `result.detected_languages` | `result.detectedLanguages` | `result.detected_languages` | Detected languages (if enabled)               |
+| Chunks       | `result.chunks`             | `result.chunks`            | `result.chunks`             | Text chunks (if chunking enabled)             |
+| Images       | `result.images`             | `result.images`            | `result.images`             | Extracted images (if enabled)                 |
+| Elements     | `result.elements`           | `result.elements`          | `result.elements`           | Semantic elements (if element_based format)   |
+| Pages        | `result.pages`              | `result.pages`             | `result.pages`              | Per-page content (if page extraction enabled) |
+| Keywords     | `result.keywords`           | `result.keywords`          | `result.keywords`           | Extracted keywords (if enabled)               |
+
+## Error Handling
+
+### Python
+
+```python
+from kreuzberg import (
+    extract_file_sync, KreuzbergError, ParsingError,
+    OCRError, ValidationError, MissingDependencyError,
+)
+
+try:
+    result = extract_file_sync("file.pdf")
+except ParsingError as e:
+    print(f"Failed to parse: {e}")
+except OCRError as e:
+    print(f"OCR failed: {e}")
+except ValidationError as e:
+    print(f"Invalid input: {e}")
+except MissingDependencyError as e:
+    print(f"Missing dependency: {e}")
+except KreuzbergError as e:
+    print(f"Extraction failed: {e}")
+```
+
+### Node.js
+
+```typescript
+import {
+  extractFile,
+  KreuzbergError,
+  ParsingError,
+  OcrError,
+  ValidationError,
+  MissingDependencyError,
+} from "@kreuzberg/node";
+
+try {
+  const result = await extractFile("file.pdf");
+} catch (e) {
+  if (e instanceof ParsingError) {
+    /* ... */
+  } else if (e instanceof OcrError) {
+    /* ... */
+  } else if (e instanceof ValidationError) {
+    /* ... */
+  } else if (e instanceof KreuzbergError) {
+    /* ... */
+  }
+}
+```
+
+### Rust
+
+```rust
+use kreuzberg::{extract_file, ExtractionConfig, KreuzbergError};
+
+let config = ExtractionConfig::default();
+match extract_file("file.pdf", None, &config).await {
+    Ok(result) => println!("{}", result.content),
+    Err(KreuzbergError::Parsing(msg)) => eprintln!("Parse error: {msg}"),
+    Err(KreuzbergError::Ocr(msg)) => eprintln!("OCR error: {msg}"),
+    Err(e) => eprintln!("Error: {e}"),
+}
+```
+
+## Common Pitfalls
+
+1. **Python ChunkingConfig fields**: Use `max_chars` and `max_overlap`, NOT `max_characters` or `overlap`.
+2. **Rust extract_file signature**: Third argument is `&ExtractionConfig` (a reference), not `Option`. Use `&ExtractionConfig::default()` for defaults.
+3. **Rust feature gates**: `extract_file_sync`, `batch_extract_file`, and `batch_extract_file_sync` all require `features = ["tokio-runtime"]` in Cargo.toml.
+4. **Rust async context**: `extract_file` is async. Use `#[tokio::main]` or call from an async context.
+5. **CLI --format vs --content-format**: `--format` controls CLI output (text/json). `--content-format` controls content format (plain/markdown/djot/html). The older `--output-format` is a deprecated alias that still works but prints a warning — prefer `--content-format`.
+6. **Node.js extractFile signature**: `extractFile(path, mimeType?, config?)` — mimeType is the second arg (pass `null` to skip).
+7. **Python detect_mime_type**: The function for detecting from bytes is `detect_mime_type(data)`. For paths use `detect_mime_type_from_path(path)`.
+8. **Config file field names**: Use snake_case in TOML/YAML/JSON config files (e.g., `max_chars`, `max_overlap`, `pdf_options`).
+
+## Supported Formats (Summary)
+
+| Category          | Extensions                                                                                                                                                  |
+| ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| **PDF**           | `.pdf`                                                                                                                                                      |
+| **Word**          | `.docx`, `.odt`                                                                                                                                             |
+| **Spreadsheets**  | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods`                                                                                         |
+| **Presentations** | `.pptx`, `.ppt`, `.ppsx`                                                                                                                                    |
+| **eBooks**        | `.epub`, `.fb2`                                                                                                                                             |
+| **Images**        | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif`, `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.jbig2`, `.jb2`, `.pnm`, `.pbm`, `.pgm`, `.ppm`, `.svg` |
+| **Markup**        | `.html`, `.htm`, `.xhtml`, `.xml`                                                                                                                           |
+| **Data**          | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv`                                                                                                           |
+| **Text**          | `.txt`, `.md`, `.markdown`, `.djot`, `.rst`, `.org`, `.rtf`                                                                                                 |
+| **Email**         | `.eml`, `.msg`                                                                                                                                              |
+| **Archives**      | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z`                                                                                                                        |
+| **Academic**      | `.bib`, `.biblatex`, `.ris`, `.nbib`, `.enw`, `.csl`, `.tex`, `.latex`, `.typ`, `.jats`, `.ipynb`, `.docbook`, `.opml`, `.pod`, `.mdoc`, `.troff`           |
+
+See [references/supported-formats.md](references/supported-formats.md) for the complete format reference with MIME types.
+
+## Additional Resources
+
+Detailed reference files for specific topics:
+
+- **[Python API Reference](references/python-api.md)** — All functions, config classes, plugin protocols, exact signatures
+- **[Node.js API Reference](references/nodejs-api.md)** — All functions, TypeScript interfaces, worker pool APIs
+- **[Rust API Reference](references/rust-api.md)** — All functions with feature gates, structs, Cargo.toml examples
+- **[CLI Reference](references/cli-reference.md)** — All commands, flags, config precedence, exit codes
+- **[Configuration Reference](references/configuration.md)** — TOML/YAML/JSON formats, auto-discovery, env vars, full schema
+- **[Supported Formats](references/supported-formats.md)** — All 91+ formats with file extensions and MIME types
+- **[Advanced Features](references/advanced-features.md)** — Plugins, embeddings, MCP server, API server, security limits
+- **[Other Language Bindings](references/other-bindings.md)** — Go, Ruby, Java, C#, PHP, Elixir, WASM, Docker
+
+Full documentation: <https://docs.kreuzberg.dev>
+GitHub: <https://github.com/kreuzberg-dev/kreuzberg>
diff --git a/plugins/kreuzberg/skills/kreuzberg/references/advanced-features.md b/plugins/kreuzberg/skills/kreuzberg/references/advanced-features.md
new file mode 100644
index 0000000..283846d
--- /dev/null
+++ b/plugins/kreuzberg/skills/kreuzberg/references/advanced-features.md
@@ -0,0 +1,967 @@
+# Advanced Features Reference
+
+Kreuzberg provides powerful advanced features for customization, semantic processing, and integration with external systems.
+
+## Plugin System
+
+The plugin system allows you to extend Kreuzberg's extraction pipeline with custom post-processors, validators, and OCR backends. Plugins run within the extraction pipeline and have direct access to extraction results.
+
+### Custom Post-Processors
+
+Post-processors enrich extraction results after document parsing. They run non-destructively—if a post-processor fails, the extraction succeeds anyway (errors are logged).
+
+=== "Python"
+
+    ```python
+    from kreuzberg import register_post_processor, ExtractionResult
+
+    class MetadataEnricher:
+        def name(self) -> str:
+            return "metadata_enricher"
+
+        def process(self, result: ExtractionResult) -> ExtractionResult:
+            result.metadata["processed_by"] = "metadata_enricher"
+            result.metadata["char_count"] = len(result.content)
+            return result
+
+        def processing_stage(self) -> str:
+            # "early", "middle", or "late"
+            return "middle"
+
+        def initialize(self) -> None:
+            print("Initializing metadata enricher")
+
+        def shutdown(self) -> None:
+            print("Shutting down metadata enricher")
+
+    register_post_processor(MetadataEnricher())
+
+    # Now use extraction with the registered processor
+    from kreuzberg import extract_file_sync
+    result = extract_file_sync("document.pdf")
+    print(result.metadata["char_count"])
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import { registerPostProcessor, extractFile, ExtractionResult } from '@kreuzberg/node';
+
+    const enricher = {
+        name(): string {
+            return "metadata_enricher";
+        },
+
+        async process(result: ExtractionResult): Promise<ExtractionResult> {
+            result.metadata.processed_by = "metadata_enricher";
+            result.metadata.char_count = result.content.length;
+            return result;
+        },
+
+        processingStage(): "early" | "middle" | "late" {  // optional
+            return "middle";
+        },
+
+        async initialize(): Promise<void> {  // optional
+            console.log("Initializing metadata enricher");
+        },
+
+        async shutdown(): Promise<void> {  // optional
+            console.log("Shutting down metadata enricher");
+        }
+    };
+
+    registerPostProcessor(enricher);
+
+    // Now use extraction with the registered processor
+    const result = await extractFile("document.pdf");
+    console.log(result.metadata.char_count);
+    ```
+
+### Custom Validators
+
+Validators perform quality checks on extraction results. Unlike post-processors, validator failures cause the entire extraction to fail. Use validators to enforce quality standards.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import register_validator, extract_file_sync, ExtractionResult, ValidationError
+
+    class MinimumContentValidator:
+        def name(self) -> str:
+            return "min_content_validator"
+
+        def validate(self, result: ExtractionResult) -> None:
+            if len(result.content) < 100:
+                raise ValidationError("Extracted content too short (< 100 chars)")
+
+        def priority(self) -> int:
+            # Higher priority runs first (0-1000, default 50)
+            return 100
+
+        def should_validate(self, result: ExtractionResult) -> bool:
+            # Only validate PDFs
+            return "pdf" in result.mime_type.lower()
+
+        def initialize(self) -> None:
+            pass
+
+        def shutdown(self) -> None:
+            pass
+
+    register_validator(MinimumContentValidator())
+
+    # Extraction will fail if content < 100 chars
+    result = extract_file_sync("document.pdf")
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import { registerValidator, extractFile, ExtractionResult } from '@kreuzberg/node';
+
+    const validator = {
+        name(): string {
+            return "min_content_validator";
+        },
+
+        async validate(result: ExtractionResult): Promise<void> {
+            if (result.content.length < 100) {
+                throw new Error("Extracted content too short (< 100 chars)");
+            }
+        },
+
+        priority(): number {  // optional
+            return 100;
+        },
+
+        shouldValidate(result: ExtractionResult): boolean {  // optional
+            return result.mimeType.toLowerCase().includes("pdf");
+        },
+
+        async initialize(): Promise<void> {},  // optional
+
+        async shutdown(): Promise<void> {}  // optional
+    };
+
+    registerValidator(validator);
+
+    // Extraction will fail if content < 100 chars
+    const result = await extractFile("document.pdf");
+    ```
+
+### Custom OCR Backends
+
+Implement custom OCR engines by registering an OCR backend. This allows integration with proprietary or specialized OCR solutions.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import register_ocr_backend
+
+    class CustomOcrBackend:
+        def name(self) -> str:
+            return "custom_ocr"
+
+        def supported_languages(self) -> list[str]:
+            return ["eng", "deu", "fra", "spa"]
+
+        def process_image(self, image_bytes: bytes, language: str) -> dict:
+            # image_bytes: raw image data
+            # language: ISO 639-3 code (e.g., "eng", "deu")
+
+            # Call your OCR engine here
+            # text = my_ocr_engine.recognize(image_bytes, language)
+
+            return {
+                "content": "Extracted text from image",
+                "metadata": {"confidence": 0.95, "language": language},
+                "tables": []
+            }
+
+        def process_file(self, path: str, language: str) -> dict:
+            # Optional: custom file processing
+            # Called when extracting OCR from a file path
+            with open(path, "rb") as f:
+                image_bytes = f.read()
+            return self.process_image(image_bytes, language)
+
+        def initialize(self) -> None:
+            # Load models, initialize engine
+            pass
+
+        def shutdown(self) -> None:
+            # Clean up resources
+            pass
+
+        def version(self) -> str:
+            return "1.0.0"
+
+    register_ocr_backend(CustomOcrBackend())
+
+    # Use in extraction config
+    from kreuzberg import ExtractionConfig, OcrConfig, extract_file_sync
+
+    config = ExtractionConfig(
+        ocr=OcrConfig(backend="custom_ocr", language="eng")
+    )
+    result = extract_file_sync("scanned.pdf", config=config)
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import { registerOcrBackend, ExtractionConfig, extractFile } from '@kreuzberg/node';
+
+    const backend = {
+        name(): string {
+            return "custom_ocr";
+        },
+
+        supportedLanguages(): string[] {
+            return ["eng", "deu", "fra", "spa"];
+        },
+
+        async processImage(
+            imageBytes: Uint8Array | string,
+            language: string
+        ): Promise<{
+            content: string;
+            mime_type: string;
+            metadata: Record<string, unknown>;
+            tables: unknown[];
+        }> {
+            const buffer = typeof imageBytes === "string"
+                ? Buffer.from(imageBytes, "base64")
+                : Buffer.from(imageBytes);
+
+            // Call your OCR engine
+            // const text = await myOcrEngine.recognize(buffer, language);
+
+            return {
+                content: "Extracted text from image",
+                mime_type: "text/plain",
+                metadata: { confidence: 0.95, language },
+                tables: []
+            };
+        },
+
+        async initialize(): Promise<void> {
+            // Optional: load models, initialize engine
+        },
+
+        async shutdown(): Promise<void> {
+            // Optional: clean up resources
+        }
+    };
+
+    registerOcrBackend(backend);
+
+    // Use in extraction config
+    const config: ExtractionConfig = {
+        ocr: { backend: "custom_ocr", language: "eng" }
+    };
+    const result = await extractFile("scanned.pdf", null, config);
+    ```
+
+## Per-File Configuration in Batch Operations
+
+Use `FileExtractionConfig` to override extraction settings for individual files within a batch. This is useful for mixed-format batches where different documents need different OCR, output, or processing settings.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import (
+        batch_extract_files_sync,
+        ExtractionConfig, FileExtractionConfig, OcrConfig,
+    )
+
+    config = ExtractionConfig(output_format="markdown")
+    paths = ["report.pdf", "scan.tiff"]
+    file_configs = [
+        None,  # use batch defaults
+        FileExtractionConfig(
+            force_ocr=True,
+            ocr=OcrConfig(backend="tesseract", language="deu"),
+        ),
+    ]
+    results = batch_extract_files_sync(paths, config, file_configs=file_configs)
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import { batchExtractFilesSync } from '@kreuzberg/node';
+
+    const results = batchExtractFilesSync(
+      ['report.pdf', 'scan.tiff'],
+      { outputFormat: 'markdown' },
+      [null, { forceOcr: true, ocr: { backend: 'tesseract', language: 'deu' } }],
+    );
+    ```
+
+All `ExtractionConfig` fields except batch-level concerns (`max_concurrent_extractions`, `use_cache`, `acceleration`, `security_limits`) can be overridden. `None`/`null` fields inherit from the batch default.
+
+## Embeddings
+
+Generate vector embeddings for text chunks using ONNX-based models. Embeddings enable semantic search, clustering, and similarity operations on extracted content.
+
+**Requirements:** ONNX Runtime 1.22.x or later
+
+=== "Python"
+
+    ```python
+    from kreuzberg import (
+        ExtractionConfig, ChunkingConfig, EmbeddingConfig,
+        EmbeddingModelType, list_embedding_presets,
+        get_embedding_preset, extract_file_sync
+    )
+
+    # List available embedding presets
+    presets = list_embedding_presets()
+    print(f"Available presets: {presets}")  # ['balanced', 'compact', 'large']
+
+    # Get details about a preset
+    preset_info = get_embedding_preset("balanced")
+    print(f"Model: {preset_info.model_name}")
+    print(f"Dimensions: {preset_info.dimensions}")
+    print(f"Recommended chunk size: {preset_info.chunk_size}")
+
+    # Method 1: Use preset (recommended)
+    config = ExtractionConfig(
+        chunking=ChunkingConfig(
+            max_chars=512,
+            max_overlap=100,
+            embedding=EmbeddingConfig(
+                model=EmbeddingModelType.preset("balanced"),
+                normalize=True,
+                batch_size=32
+            )
+        )
+    )
+
+    # Method 2: Use specific fastembed model
+    config = ExtractionConfig(
+        chunking=ChunkingConfig(
+            embedding=EmbeddingConfig(
+                model=EmbeddingModelType.fastembed(
+                    model="BAAI/bge-small-en-v1.5",
+                    dimensions=384
+                ),
+                normalize=True
+            )
+        )
+    )
+
+    # Method 3: Use custom ONNX model from HuggingFace
+    config = ExtractionConfig(
+        chunking=ChunkingConfig(
+            embedding=EmbeddingConfig(
+                model=EmbeddingModelType.custom(
+                    model_id="sentence-transformers/all-MiniLM-L6-v2",
+                    dimensions=384
+                ),
+                cache_dir="/path/to/model/cache"
+            )
+        )
+    )
+
+    result = extract_file_sync("document.pdf", config=config)
+
+    # Access embeddings in chunks
+    for chunk in result.chunks:
+        embedding = chunk.embedding  # list[float] or None
+        print(f"Chunk: {chunk.content[:50]}...")
+        print(f"Embedding dimensions: {len(embedding) if embedding else 0}")
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import {
+        ExtractionConfig, ChunkingConfig,
+        listEmbeddingPresets, getEmbeddingPreset,
+        extractFile
+    } from '@kreuzberg/node';
+
+    // List available embedding presets
+    const presets = listEmbeddingPresets();
+    console.log(`Available presets: ${presets}`);  // ['balanced', 'compact', 'large']
+
+    // Get details about a preset
+    const preset = getEmbeddingPreset("balanced");
+    console.log(`Model: ${preset.modelName}`);
+    console.log(`Dimensions: ${preset.dimensions}`);
+    console.log(`Recommended chunk size: ${preset.chunkSize}`);
+
+    // Method 1: Use preset (recommended)
+    const config: ExtractionConfig = {
+        chunking: {
+            maxChars: 512,
+            maxOverlap: 100,
+            embedding: {
+                model: { type: 'preset', name: 'balanced' },
+                normalize: true,
+                batchSize: 32
+            }
+        }
+    };
+
+    // Method 2: Use specific fastembed model
+    const config2: ExtractionConfig = {
+        chunking: {
+            embedding: {
+                model: {
+                    type: 'fastembed',
+                    model: 'BAAI/bge-small-en-v1.5',
+                    dimensions: 384
+                },
+                normalize: true
+            }
+        }
+    };
+
+    // Method 3: Use custom ONNX model
+    const config3: ExtractionConfig = {
+        chunking: {
+            embedding: {
+                model: {
+                    type: 'custom',
+                    modelId: 'sentence-transformers/all-MiniLM-L6-v2',
+                    dimensions: 384
+                },
+                cacheDir: '/path/to/model/cache'
+            }
+        }
+    };
+
+    const result = await extractFile("document.pdf", null, config);
+
+    // Access embeddings in chunks
+    if (result.chunks) {
+        for (const chunk of result.chunks) {
+            const embedding = chunk.embedding;  // number[] | null
+            console.log(`Chunk: ${chunk.content.substring(0, 50)}...`);
+            console.log(`Embedding dimensions: ${embedding?.length ?? 0}`);
+        }
+    }
+    ```
+
+## Keyword Extraction
+
+Extract important keywords and phrases from documents using YAKE (Yet Another Keyword Extractor) or RAKE (Rapid Automatic Keyword Extraction) algorithms.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import (
+        ExtractionConfig, KeywordConfig, KeywordAlgorithm,
+        YakeParams, RakeParams, extract_file_sync
+    )
+
+    # YAKE algorithm (unsupervised, good for general use)
+    config = ExtractionConfig(
+        keywords=KeywordConfig(
+            algorithm=KeywordAlgorithm.Yake,
+            max_keywords=15,
+            min_score=0.1,
+            ngram_range=(1, 3),
+            language="en",
+            yake_params=YakeParams(window_size=2)
+        )
+    )
+
+    # RAKE algorithm (co-occurrence based)
+    config = ExtractionConfig(
+        keywords=KeywordConfig(
+            algorithm=KeywordAlgorithm.Rake,
+            max_keywords=10,
+            min_score=0.0,
+            language="en",
+            rake_params=RakeParams(
+                min_word_length=3,
+                max_words_per_phrase=3
+            )
+        )
+    )
+
+    result = extract_file_sync("document.pdf", config=config)
+
+    # Access extracted keywords
+    if result.keywords:
+        for keyword in result.keywords:
+            print(f"Text: {keyword.text}")
+            print(f"Score: {keyword.score}")
+            print(f"Algorithm: {keyword.algorithm}")
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import {
+        ExtractionConfig, KeywordConfig,
+        extractFile
+    } from '@kreuzberg/node';
+
+    // YAKE algorithm
+    const config: ExtractionConfig = {
+        keywords: {
+            algorithm: "yake",
+            maxKeywords: 15,
+            minScore: 0.1,
+            ngramRange: [1, 3],
+            language: "en",
+            yakeParams: {
+                windowSize: 2
+            }
+        }
+    };
+
+    // RAKE algorithm
+    const config2: ExtractionConfig = {
+        keywords: {
+            algorithm: "rake",
+            maxKeywords: 10,
+            minScore: 0.0,
+            language: "en",
+            rakeParams: {
+                minWordLength: 3,
+                maxWordsPerPhrase: 3
+            }
+        }
+    };
+
+    const result = await extractFile("document.pdf", null, config);
+
+    // Access extracted keywords
+    if (result.keywords) {
+        for (const keyword of result.keywords) {
+            console.log(`Text: ${keyword.text}`);
+            console.log(`Score: ${keyword.score}`);
+            console.log(`Algorithm: ${keyword.algorithm}`);
+        }
+    }
+    ```
+
+## Language Detection
+
+Automatically detect the language(s) present in a document. Detected languages are reported as ISO 639-1 codes.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import (
+        ExtractionConfig, LanguageDetectionConfig,
+        extract_file_sync
+    )
+
+    # Enable language detection
+    config = ExtractionConfig(
+        language_detection=LanguageDetectionConfig(
+            enabled=True,
+            min_confidence=0.8,
+            detect_multiple=False
+        )
+    )
+
+    result = extract_file_sync("multilingual.pdf", config=config)
+
+    # Access detected languages
+    if result.detected_languages:
+        for lang_code in result.detected_languages:
+            print(f"Detected language: {lang_code}")  # e.g., "en", "de", "fr"
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import {
+        ExtractionConfig, LanguageDetectionConfig,
+        extractFile
+    } from '@kreuzberg/node';
+
+    const config: ExtractionConfig = {
+        languageDetection: {
+            enabled: true,
+            minConfidence: 0.8,
+            detectMultiple: false
+        }
+    };
+
+    const result = await extractFile("multilingual.pdf", null, config);
+
+    // Access detected languages
+    if (result.detectedLanguages) {
+        for (const langCode of result.detectedLanguages) {
+            console.log(`Detected language: ${langCode}`);  // e.g., "en", "de", "fr"
+        }
+    }
+    ```
+
+## Token Reduction
+
+Reduce the number of tokens in extracted content for cost optimization when working with LLM APIs. Higher modes are more aggressive but may lose more information.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import (
+        ExtractionConfig, TokenReductionConfig,
+        extract_file_sync
+    )
+
+    # Light token reduction
+    config = ExtractionConfig(
+        token_reduction=TokenReductionConfig(
+            mode="light",
+            preserve_important_words=True
+        )
+    )
+
+    # Moderate reduction
+    config = ExtractionConfig(
+        token_reduction=TokenReductionConfig(
+            mode="moderate",
+            preserve_important_words=True
+        )
+    )
+
+    # Aggressive reduction
+    config = ExtractionConfig(
+        token_reduction=TokenReductionConfig(
+            mode="aggressive",
+            preserve_important_words=True
+        )
+    )
+
+    # Maximum reduction
+    config = ExtractionConfig(
+        token_reduction=TokenReductionConfig(
+            mode="maximum",
+            preserve_important_words=True
+        )
+    )
+
+    result = extract_file_sync("document.pdf", config=config)
+    print(f"Reduced content length: {len(result.content)}")
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import {
+        ExtractionConfig, TokenReductionConfig,
+        extractFile
+    } from '@kreuzberg/node';
+
+    const config: ExtractionConfig = {
+        tokenReduction: {
+            mode: "moderate",
+            preserveImportantWords: true
+        }
+    };
+
+    const result = await extractFile("document.pdf", null, config);
+    console.log(`Reduced content length: ${result.content.length}`);
+    ```
+
+**Token Reduction Modes:**
+
+- `off`: No reduction (default)
+- `light`: Remove extra whitespace and redundant punctuation
+- `moderate`: Also remove common filler words and some formatting
+- `aggressive`: Also remove longer stopwords and collapse similar phrases
+- `maximum`: Maximum reduction while preserving semantic content
+
+## Page Extraction
+
+Extract and track per-page content separately. Useful for multi-page documents where you need page-level granularity.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import (
+        ExtractionConfig, PageConfig,
+        extract_file_sync
+    )
+
+    config = ExtractionConfig(
+        pages=PageConfig(
+            extract_pages=True,
+            insert_page_markers=True,
+            marker_format="\n\n<!-- PAGE {page_num} -->\n\n"
+        )
+    )
+
+    result = extract_file_sync("multi_page.pdf", config=config)
+
+    # Access per-page content
+    if result.pages:
+        for page in result.pages:
+            print(f"Page {page.page_number}:")
+            print(f"Content: {page.content[:100]}...")
+            print(f"Tables: {len(page.tables)}")
+            print(f"Images: {len(page.images)}")
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import {
+        ExtractionConfig, PageExtractionConfig,
+        extractFile
+    } from '@kreuzberg/node';
+
+    const config: ExtractionConfig = {
+        pages: {
+            extractPages: true,
+            insertPageMarkers: true,
+            markerFormat: "\n\n<!-- PAGE {page_num} -->\n\n"
+        }
+    };
+
+    const result = await extractFile("multi_page.pdf", null, config);
+
+    // Access per-page content
+    if (result.pages) {
+        for (const page of result.pages) {
+            console.log(`Page ${page.pageNumber}:`);
+            console.log(`Content: ${page.content.substring(0, 100)}...`);
+            console.log(`Tables: ${page.tables.length}`);
+            console.log(`Images: ${page.images.length}`);
+        }
+    }
+    ```
+
+## Element-Based Output
+
+Extract semantic elements instead of unified content. This format is compatible with the Unstructured library and provides structured access to different content types (titles, headings, text, tables, images, etc.).
+
+=== "Python"
+
+    ```python
+    from kreuzberg import ExtractionConfig, ResultFormat, extract_file_sync
+
+    config = ExtractionConfig(
+        result_format="element_based"
+    )
+
+    result = extract_file_sync("document.pdf", config=config)
+
+    # Access semantic elements
+    if result.elements:
+        for element in result.elements:
+            print(f"Type: {element.element_type}")  # title, heading, narrative_text, etc.
+            print(f"Text: {element.text}")
+            if element.metadata.get("page_number"):
+                print(f"Page: {element.metadata['page_number']}")
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import { ExtractionConfig, extractFile } from '@kreuzberg/node';
+
+    const config: ExtractionConfig = {
+        resultFormat: "element_based"
+    };
+
+    const result = await extractFile("document.pdf", null, config);
+
+    // Access semantic elements
+    if (result.elements) {
+        for (const element of result.elements) {
+            console.log(`Type: ${element.elementType}`);
+            console.log(`Text: ${element.text}`);
+            if (element.metadata.pageNumber) {
+                console.log(`Page: ${element.metadata.pageNumber}`);
+            }
+        }
+    }
+    ```
+
+**Element Types:**
+
+- `title`: Document or section title
+- `heading`: Section headings
+- `narrative_text`: Regular paragraph text
+- `list_item`: Items in bullet/numbered lists
+- `table`: Table structures
+- `image`: Images or figures
+- `page_break`: Page boundaries
+- `code_block`: Code snippets
+- `block_quote`: Quoted text
+- `footer`: Footer content
+- `header`: Header content
+
+## Djot Content
+
+Output extracted content in Djot markup format (a lighter alternative to Markdown with enhanced structure).
+
+=== "Python"
+
+    ```python
+    from kreuzberg import ExtractionConfig, OutputFormat, extract_file_sync
+
+    config = ExtractionConfig(
+        output_format="djot"
+    )
+
+    result = extract_file_sync("document.pdf", config=config)
+    print(result.content)  # Djot-formatted content
+
+    # Access structured Djot content
+    if result.djot_content:
+        print(f"Plain text: {result.djot_content['plain_text']}")
+        print(f"Blocks: {result.djot_content['blocks']}")
+        print(f"Links: {result.djot_content['links']}")
+        print(f"Images: {result.djot_content['images']}")
+        print(f"Footnotes: {result.djot_content['footnotes']}")
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import { ExtractionConfig, extractFile } from '@kreuzberg/node';
+
+    const config: ExtractionConfig = {
+        outputFormat: "djot"
+    };
+
+    const result = await extractFile("document.pdf", null, config);
+    console.log(result.content);  // Djot-formatted content
+
+    // Access structured Djot content (if available)
+    if (result.djotContent) {
+        console.log(`Plain text: ${result.djotContent.plain_text}`);
+        console.log(`Blocks: ${result.djotContent.blocks}`);
+        console.log(`Links: ${result.djotContent.links}`);
+        console.log(`Images: ${result.djotContent.images}`);
+        console.log(`Footnotes: ${result.djotContent.footnotes}`);
+    }
+    ```
+
+## API Server
+
+Run Kreuzberg as an HTTP API server for integration with external services.
+
+    # Start server on default port 8000
+    kreuzberg serve
+
+    # Custom host and port
+    kreuzberg serve --host 0.0.0.0 --port 9000
+
+    # Bind explicitly to localhost on port 8000
+    kreuzberg serve --host localhost --port 8000
+
+**API Endpoints:**
+
+- `POST /extract` - Extract from uploaded file
+- `POST /batch` - Batch extraction
+- `POST /detect` - Detect MIME type
+
+**Example:**
+
+    curl -X POST -F "file=@document.pdf" http://localhost:8000/extract
+
+## MCP Server
+
+Run Kreuzberg as a Model Context Protocol server for integration with Claude and other AI models.
+
+    # Start MCP server with stdio transport
+    kreuzberg mcp --transport stdio
+
+    # Start MCP server with HTTP transport
+    kreuzberg mcp --transport http --host 127.0.0.1 --port 8001
+
+The MCP server exposes extraction functions to AI models, allowing them to process documents directly.
+
+## Security Limits
+
+Set resource limits to prevent abuse and to bound memory use and input size.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import ExtractionConfig, extract_file_sync
+
+    config = ExtractionConfig(
+        security_limits={
+            "max_file_size": 100_000_000,      # 100 MB
+            "max_archive_files": 1000,
+            "max_text_length": 10_000_000,     # 10 MB of text
+            "max_pages": 10000,
+            "max_concurrent_extractions": 4
+        }
+    )
+
+    result = extract_file_sync("document.pdf", config=config)
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import { ExtractionConfig, extractFile } from '@kreuzberg/node';
+
+    const config: ExtractionConfig = {
+        securityLimits: {
+            max_file_size: 100_000_000,        // 100 MB
+            max_archive_files: 1000,
+            max_text_length: 10_000_000,       // 10 MB of text
+            max_pages: 10000,
+            max_concurrent_extractions: 4
+        }
+    };
+
+    const result = await extractFile("document.pdf", null, config);
+    ```
+
+**Common Limits:**
+
+- `max_file_size`: Maximum input file size in bytes
+- `max_archive_files`: Maximum files in archives (zip, tar, etc.)
+- `max_text_length`: Maximum extracted text length
+- `max_pages`: Maximum number of pages to process
+- `max_concurrent_extractions`: Maximum concurrent extraction operations
+
+## Caching
+
+Extraction results are cached by default to improve performance on repeated extractions of identical documents. Control caching behavior through configuration.
+
+=== "Python"
+
+    ```python
+    from kreuzberg import ExtractionConfig, extract_file_sync
+
+    # Enable caching (default)
+    config = ExtractionConfig(use_cache=True)
+    result = extract_file_sync("document.pdf", config=config)
+
+    # Disable caching for a specific extraction
+    config = ExtractionConfig(use_cache=False)
+    result = extract_file_sync("document.pdf", config=config)
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    import { ExtractionConfig, extractFile } from '@kreuzberg/node';
+
+    // Enable caching (default)
+    const config: ExtractionConfig = { useCache: true };
+    const result = await extractFile("document.pdf", null, config);
+
+    // Disable caching
+    const config2: ExtractionConfig = { useCache: false };
+    const result2 = await extractFile("document.pdf", null, config2);
+    ```
+
+**CLI Cache Management:**
+
+    # View cache statistics
+    kreuzberg cache stats
+
+    # Clear all cached results
+    kreuzberg cache clear
+
+Caching is transparent and automatic—same input produces cached output instantly on subsequent extractions.
diff --git a/plugins/kreuzberg/skills/kreuzberg/references/cli-reference.md b/plugins/kreuzberg/skills/kreuzberg/references/cli-reference.md
new file mode 100644
index 0000000..bf5b6cc
--- /dev/null
+++ b/plugins/kreuzberg/skills/kreuzberg/references/cli-reference.md
@@ -0,0 +1,449 @@
+# Kreuzberg CLI Reference
+
+Comprehensive command-line interface for the Kreuzberg document intelligence library.
+
+## Installation
+
+Install from crates.io:
+
+```bash
+cargo install kreuzberg-cli
+```
+
+Or download pre-built binaries from [GitHub Releases](https://github.com/kreuzberg-dev/kreuzberg/releases).
+
+## Commands
+
+### extract
+
+Extract text and structure from a single document.
+
+```bash
+kreuzberg extract <path> [FLAGS]
+```
+
+## Positional Arguments
+
+- `<path>` — Path to the document file
+
+## Flags
+
+- `-c, --config <path>` — Path to config file (TOML, YAML, or JSON). Auto-discovers `kreuzberg.{toml,yaml,json}` in current and parent directories if omitted.
+- `--config-json <json>` — Inline JSON configuration (merged after config file, before CLI flags).
+- `--config-json-base64 <base64>` — Base64-encoded JSON configuration.
+- `-m, --mime-type <type>` — MIME type hint (auto-detected if not provided).
+- `-f, --format <text|json>` — CLI output format (default: `text`). Controls how results display, not extraction content format.
+- `--content-format <plain|markdown|djot|html>` — Extraction content format (default: `plain`). Controls format of extracted content. (Note: `--output-format` is a deprecated alias.)
+- `--ocr <bool>` — Enable OCR processing.
+- `--ocr-backend <BACKEND>` — OCR backend: `tesseract`, `paddle-ocr`, `easyocr`.
+- `--ocr-language <LANG>` — OCR language code.
+- `--ocr-auto-rotate <bool>` — Auto-rotate images before OCR.
+- `--force-ocr <bool>` — Force OCR even if text extraction succeeds.
+- `--disable-ocr <bool>` — Disable OCR entirely (even for images).
+- `--no-cache <bool>` — Disable caching.
+- `--chunk <bool>` — Enable text chunking.
+- `--chunk-size <n>` — Chunk size in characters.
+- `--chunk-overlap <n>` — Chunk overlap in characters.
+- `--chunking-tokenizer <model>` — Tokenizer model for token-based sizing.
+- `--include-structure <bool>` — Include hierarchical document structure.
+- `--quality <bool>` — Enable quality processing.
+- `--detect-language <bool>` — Enable language detection.
+- `--layout` — Enable layout detection (RT-DETR v2). Use `--layout false` to disable.
+- `--layout-confidence <float>` — Layout confidence threshold (0.0-1.0).
+- `--layout-table-model <model>` — Table structure model: `tatr`, `slanet_wired`, `slanet_wireless`, `slanet_plus`, `slanet_auto`, `disabled`.
+- `--acceleration <provider>` — ONNX execution provider: `auto`, `cpu`, `coreml`, `cuda`, `tensorrt`.
+- `--extract-pages <bool>` — Extract pages as separate array.
+- `--page-markers <bool>` — Insert page marker comments.
+- `--extract-images <bool>` — Enable image extraction.
+- `--target-dpi <n>` — Target DPI for images (36-2400).
+- `--pdf-password <pass>` — Password for encrypted PDFs (repeatable).
+- `--pdf-extract-images <bool>` — Extract images from PDF pages.
+- `--pdf-extract-metadata <bool>` — Extract PDF metadata.
+- `--token-reduction <level>` — Token reduction: `off`, `light`, `moderate`, `aggressive`, `maximum`.
+- `--msg-codepage <n>` — Windows codepage fallback for MSG files.
+- `--max-concurrent <n>` — Max parallel extractions in batch mode.
+- `--max-threads <n>` — Cap all internal thread pools.
+- `--cache-namespace <name>` — Cache namespace for tenant isolation.
+- `--cache-ttl-secs <n>` — Per-request cache TTL in seconds.
+
+## Examples
+
+```bash
+# Extract with default settings
+kreuzberg extract document.pdf
+
+# Extract with OCR enabled
+kreuzberg extract scanned.pdf --ocr=true
+
+# Extract with specific content format
+kreuzberg extract doc.docx --content-format markdown
+
+# Extract with inline JSON config
+kreuzberg extract file.pdf --config-json '{"ocr":{"backend":"tesseract"}}'
+
+# Extract with base64-encoded config
+kreuzberg extract file.pdf --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
+
+# Extract and output as JSON
+kreuzberg extract doc.pdf --format json
+
+# Extract with chunking
+kreuzberg extract large-doc.pdf --chunk true --chunk-size 2000 --chunk-overlap 200
+
+# Layout-aware markdown extraction
+kreuzberg extract document.pdf --layout --content-format markdown
+
+# With custom confidence threshold
+kreuzberg extract document.pdf --layout-confidence 0.7 --content-format markdown
+```
+
+### batch
+
+Batch extract from multiple documents in parallel.
+
+```bash
+kreuzberg batch <paths...> [FLAGS]
+```
+
+## Positional Arguments
+
+- `<paths...>` — One or more document file paths
+
+## Flags
+
+- `-c, --config <path>` — Path to config file (TOML, YAML, or JSON). Auto-discovers `kreuzberg.{toml,yaml,json}` in current and parent directories if omitted.
+- `--config-json <json>` — Inline JSON configuration (merged after config file, before CLI flags).
+- `--config-json-base64 <base64>` — Base64-encoded JSON configuration.
+- `-f, --format <text|json>` — CLI output format (default: `json`). Controls how results display, not extraction content format.
+- Most extraction override flags from `extract` are also supported (e.g., `--content-format`, `--ocr`, `--layout`, `--force-ocr`, `--no-cache`, `--quality`, `--acceleration`, etc.), except `--mime-type` and `--detect-language`. See the `extract` command flags for the full list.
+
+## Notes
+
+- Batch command defaults to JSON output format (unlike `extract` which defaults to text).
+- Does not support `--mime-type` or `--detect-language` flags.
+
+## Examples
+
+```bash
+# Batch extract multiple PDFs
+kreuzberg batch document1.pdf document2.pdf document3.pdf
+
+# Batch extract with glob patterns (shell expansion)
+kreuzberg batch *.pdf
+
+# Batch extract with a custom content format
+kreuzberg batch doc1.pdf doc2.pdf --content-format markdown
+
+# Batch extract with OCR
+kreuzberg batch scanned*.pdf --ocr=true
+
+# Batch extract with text output format
+kreuzberg batch files*.docx --format text
+```
+
+### detect
+
+Identify MIME type of a file.
+
+```bash
+kreuzberg detect <path> [FLAGS]
+```
+
+## Positional Arguments
+
+- `<path>` — Path to the file
+
+## Flags
+
+- `-f, --format <text|json>` — Output format (default: `text`)
+
+## Examples
+
+```bash
+# Detect MIME type (text output)
+kreuzberg detect unknown-file.bin
+
+# Detect MIME type (JSON output)
+kreuzberg detect file.xyz --format json
+```
+
+### version
+
+Display version information.
+
+```bash
+kreuzberg version [FLAGS]
+```
+
+## Flags
+
+- `-f, --format <text|json>` — Output format (default: `text`)
+
+## Examples
+
+```bash
+# Show version as text
+kreuzberg version
+
+# Show version as JSON
+kreuzberg version --format json
+```
+
+### cache
+
+Manage extraction cache.
+
+#### cache stats
+
+Display cache statistics.
+
+```bash
+kreuzberg cache stats [FLAGS]
+```
+
+## Flags
+
+- `--cache-dir <path>` — Cache directory (default: `.kreuzberg` in current directory)
+- `-f, --format <text|json>` — Output format (default: `text`)
+
+## Examples
+
+```bash
+# Show cache stats
+kreuzberg cache stats
+
+# Show cache stats as JSON
+kreuzberg cache stats --format json
+
+# Show stats for specific cache directory
+kreuzberg cache stats --cache-dir /tmp/my-cache
+```
+
+#### cache clear
+
+Clear all cached extractions.
+
+```bash
+kreuzberg cache clear [FLAGS]
+```
+
+## Flags
+
+- `--cache-dir <path>` — Cache directory (default: `.kreuzberg` in current directory)
+- `-f, --format <text|json>` — Output format (default: `text`)
+
+## Examples
+
+```bash
+# Clear cache
+kreuzberg cache clear
+
+# Clear specific cache directory
+kreuzberg cache clear --cache-dir /tmp/my-cache
+```
+
+### serve
+
+Start the API server (requires `api` feature).
+
+```bash
+kreuzberg serve [FLAGS]
+```
+
+## Flags
+
+- `-H, --host <host>` — Host to bind to (e.g., `127.0.0.1` or `0.0.0.0`). CLI arg overrides config file and environment variables.
+- `-p, --port <port>` — Port to bind to. CLI arg overrides config file and environment variables.
+- `-c, --config <path>` — Path to config file (TOML, YAML, or JSON). Auto-discovers `kreuzberg.{toml,yaml,json}` in current and parent directories if omitted.
+
+## Configuration Precedence
+
+1. CLI arguments (`--host`, `--port`)
+2. Environment variables (`KREUZBERG_HOST`, `KREUZBERG_PORT`)
+3. Config file (`[server]` section)
+4. Built-in defaults (`127.0.0.1:8000`)
+
+## Examples
+
+```bash
+# Start server with defaults
+kreuzberg serve
+
+# Start server on specific host and port
+kreuzberg serve --host 0.0.0.0 --port 3000
+
+# Start server with config file
+kreuzberg serve --config kreuzberg.toml
+
+# Start server (environment variables override defaults)
+KREUZBERG_HOST=192.168.1.100 KREUZBERG_PORT=8080 kreuzberg serve
+```
+
+### mcp
+
+Start the Model Context Protocol (MCP) server (requires `mcp` feature).
+
+```bash
+kreuzberg mcp [FLAGS]
+```
+
+## Flags
+
+- `-c, --config <path>` — Path to config file (TOML, YAML, or JSON). Auto-discovers `kreuzberg.{toml,yaml,json}` in current and parent directories if omitted.
+- `--transport <stdio|http>` — Transport mode (default: `stdio`)
+- `--host <host>` — HTTP host for http transport (default: `127.0.0.1`)
+- `--port <port>` — HTTP port for http transport (default: `8001`)
+
+## Examples
+
+```bash
+# Start MCP server with stdio transport
+kreuzberg mcp
+
+# Start MCP server with HTTP transport
+kreuzberg mcp --transport http
+
+# Start MCP server on custom HTTP host/port
+kreuzberg mcp --transport http --host 0.0.0.0 --port 9000
+
+# Start MCP server with config file
+kreuzberg mcp --config kreuzberg.toml
+```
+
+## Configuration
+
+### File Format
+
+Configuration files support three formats with automatic detection:
+
+- **TOML** — `.toml` extension (recommended)
+- **YAML** — `.yaml` or `.yml` extension
+- **JSON** — `.json` extension
+
+### Configuration Precedence
+
+Settings are applied in order from highest to lowest priority:
+
+1. **Individual CLI flags** (e.g., `--ocr=true`, `--content-format markdown`)
+2. **Inline JSON config** (`--config-json` or `--config-json-base64`)
+3. **Config file** (explicit `--config path.toml` or auto-discovered)
+4. **Default values** (built-in library defaults)
+
+### Auto-Discovery
+
+When no config file is specified, Kreuzberg searches for configuration in this order:
+
+1. `kreuzberg.toml` in current directory
+2. `kreuzberg.yaml` in current directory
+3. `kreuzberg.json` in current directory
+4. Parent directories (same search pattern, up to filesystem root)
+
+### Example Configuration
+
+```toml
+# Top-level extraction options
+use_cache = true
+enable_quality_processing = true
+force_ocr = false
+output_format = "markdown"
+
+# OCR settings
+[ocr]
+backend = "tesseract"
+language = "eng"
+
+# Chunking settings
+[chunking]
+max_chars = 2000
+max_overlap = 200
+
+# Language detection
+[language_detection]
+enabled = true
+
+# Server configuration (for serve command)
+[server]
+host = "127.0.0.1"
+port = 8000
+```
+
+## Exit Codes
+
+- `0` — Success
+- Non-zero — Error (see stderr for details)
+
+## Error Handling
+
+The CLI validates input and provides clear error messages:
+
+- **File not found** — Verify path exists and is readable
+- **Invalid MIME type** — Ensure file is accessible and format is supported
+- **Invalid JSON** — Check `--config-json` syntax
+- **Invalid config file** — Verify TOML/YAML/JSON format
+- **Invalid chunk parameters** — Ensure chunk-size > 0 and overlap < chunk-size
+
+## Environment Variables
+
+- `RUST_LOG` — Set logging level (e.g., `RUST_LOG=debug`)
+- `KREUZBERG_HOST` — Server bind host (used by `serve` command)
+- `KREUZBERG_PORT` — Server bind port (used by `serve` command)
+
+## Common Patterns
+
+### Extract with Custom Configuration
+
+```bash
+kreuzberg extract document.pdf \
+  --content-format markdown \
+  --ocr=true \
+  --quality true
+```
+
+### Batch Process with Config File
+
+```bash
+kreuzberg batch *.pdf --config extraction-config.toml
+```
+
+### CI/CD Integration
+
+```bash
+# Extract to JSON for downstream processing
+kreuzberg extract file.pdf --format json | jq '.content'
+
+# Batch process with error handling
+kreuzberg batch docs/*.pdf --format json || exit 1
+```
+
+### Performance Tuning
+
+```bash
+# Disable cache for temporary processing
+kreuzberg extract file.pdf --no-cache=true
+
+# Enable chunking for large documents
+kreuzberg extract large-file.pdf \
+  --chunk true \
+  --chunk-size 5000 \
+  --chunk-overlap 500
+```
+
+## Debugging
+
+Enable detailed logging:
+
+```bash
+RUST_LOG=debug kreuzberg extract document.pdf
+```
+
+Check cache statistics:
+
+```bash
+kreuzberg cache stats --format json
+```
+
+Detect file MIME type:
+
+```bash
+kreuzberg detect unknown-file --format json
+```
diff --git a/plugins/kreuzberg/skills/kreuzberg/references/configuration.md b/plugins/kreuzberg/skills/kreuzberg/references/configuration.md
new file mode 100644
index 0000000..3e34845
--- /dev/null
+++ b/plugins/kreuzberg/skills/kreuzberg/references/configuration.md
@@ -0,0 +1,417 @@
+# Configuration Reference
+
+Kreuzberg uses a hierarchical configuration system supporting multiple formats and auto-discovery mechanisms. This reference covers all available configuration options, field names across programming languages, and loading strategies.
+
+## Supported Formats
+
+Kreuzberg configurations can be defined in three formats:
+
+- **TOML** (recommended): `kreuzberg.toml`
+- **YAML**: `kreuzberg.yaml`
+- **JSON**: `kreuzberg.json`
+
+All formats support the same schema and configuration options.
+
+## Auto-Discovery
+
+When no configuration file is explicitly specified, Kreuzberg searches for configuration files in the following order:
+
+1. Current working directory: `kreuzberg.toml`, `kreuzberg.yaml`, `kreuzberg.json`
+2. Parent directories (recursively up the tree, same file name pattern)
+
+The first matching configuration file is loaded.
+
+## Programmatic Loading
+
+### Python
+
+```python
+from kreuzberg import ExtractionConfig
+
+# Load from explicit path
+config = ExtractionConfig.from_file("kreuzberg.toml")
+
+# Auto-discover configuration
+config = ExtractionConfig.discover()
+```
+
+### Node.js / TypeScript
+
+```typescript
+import { ExtractionConfig } from "@kreuzberg/node";
+
+// Load from explicit path
+const config = ExtractionConfig.fromFile("kreuzberg.toml");
+
+// Auto-discover configuration
+const discoveredConfig = ExtractionConfig.discover();
+```
+
+### CLI
+
+```bash
+# Explicit configuration file
+kreuzberg extract --config kreuzberg.toml document.pdf
+
+# Auto-discovery (searches default locations)
+kreuzberg extract document.pdf
+```
+
+## Configuration Schema
+
+The complete TOML schema with all available sections and options:
+
+### Top-Level Options
+
+```toml
+use_cache = true
+enable_quality_processing = true
+force_ocr = false
+output_format = "markdown"
+result_format = "text"
+max_concurrent_extractions = 4
+```
+
+| Option                       | Type    | Default      | Description                                                                         |
+| ---------------------------- | ------- | ------------ | ----------------------------------------------------------------------------------- |
+| `use_cache`                  | boolean | `true`       | Enable caching of extraction results                                                |
+| `enable_quality_processing`  | boolean | `true`       | Enable post-processing for output quality                                           |
+| `force_ocr`                  | boolean | `false`      | Force OCR processing even for searchable PDFs                                       |
+| `disable_ocr`                | boolean | `false`      | Disable OCR entirely — image files return empty content instead of errors (v4.7.0+) |
+| `output_format`              | string  | `"markdown"` | Output format (markdown, html, text)                                                |
+| `result_format`              | string  | `"text"`     | Result format for structured output                                                 |
+| `max_concurrent_extractions` | integer | `4`          | Maximum concurrent document extractions                                             |
+
+### OCR Configuration
+
+```toml
+[ocr]
+backend = "tesseract"
+language = "eng"
+```
+
+| Option     | Type   | Default       | Description                                   |
+| ---------- | ------ | ------------- | --------------------------------------------- |
+| `backend`  | string | `"tesseract"` | OCR backend (currently tesseract)             |
+| `language` | string | `"eng"`       | ISO 639-3 language code (eng, deu, fra, etc.) |
+
+#### Tesseract Configuration
+
+```toml
+[ocr.tesseract_config]
+psm = 3
+oem = 3
+min_confidence = 0.0
+output_format = "text"
+enable_table_detection = false
+table_min_confidence = 0.5
+table_column_threshold = 50
+table_row_threshold_ratio = 0.5
+use_cache = true
+```
+
+| Option                      | Type    | Default  | Description                                |
+| --------------------------- | ------- | -------- | ------------------------------------------ |
+| `psm`                       | integer | `3`      | Page Segmentation Mode (0-13)              |
+| `oem`                       | integer | `3`      | OCR Engine Mode (0-3)                      |
+| `min_confidence`            | float   | `0.0`    | Minimum OCR confidence threshold (0.0-1.0) |
+| `output_format`             | string  | `"text"` | Output format from OCR                     |
+| `enable_table_detection`    | boolean | `false`  | Enable table detection during OCR          |
+| `table_min_confidence`      | float   | `0.5`    | Minimum confidence for table cells         |
+| `table_column_threshold`    | integer | `50`     | Pixel threshold for column detection       |
+| `table_row_threshold_ratio` | float   | `0.5`    | Row height ratio threshold                 |
+| `use_cache`                 | boolean | `true`   | Cache OCR results                          |
+
+#### Tesseract Preprocessing
+
+```toml
+[ocr.tesseract_config.preprocessing]
+target_dpi = 300
+auto_rotate = true
+deskew = true
+denoise = true
+contrast_enhance = true
+binarization_method = "otsu"
+invert_colors = false
+```
+
+| Option                | Type    | Default  | Description                                    |
+| --------------------- | ------- | -------- | ---------------------------------------------- |
+| `target_dpi`          | integer | `300`    | Target DPI for preprocessing                   |
+| `auto_rotate`         | boolean | `true`   | Automatically detect and correct page rotation |
+| `deskew`              | boolean | `true`   | Correct skewed pages                           |
+| `denoise`             | boolean | `true`   | Remove noise from images                       |
+| `contrast_enhance`    | boolean | `true`   | Enhance image contrast                         |
+| `binarization_method` | string  | `"otsu"` | Method for image binarization                  |
+| `invert_colors`       | boolean | `false`  | Invert image colors if needed                  |
+
+### PDF Options
+
+```toml
+[pdf_options]
+extract_images = true
+extract_metadata = true
+
+[pdf_options.hierarchy]
+enabled = true
+k_clusters = 6
+include_bbox = true
+ocr_coverage_threshold = 0.5
+```
+
+| Option                             | Type    | Default | Description                                    |
+| ---------------------------------- | ------- | ------- | ---------------------------------------------- |
+| `extract_images`                   | boolean | `true`  | Extract images from PDF documents              |
+| `extract_metadata`                 | boolean | `true`  | Extract PDF metadata                           |
+| `hierarchy.enabled`                | boolean | `true`  | Enable PDF hierarchy extraction (v4.0.0+)      |
+| `hierarchy.k_clusters`             | integer | `6`     | Number of clusters for hierarchy detection     |
+| `hierarchy.include_bbox`           | boolean | `true`  | Include bounding boxes in hierarchy            |
+| `hierarchy.ocr_coverage_threshold` | float   | `0.5`   | OCR coverage threshold for hierarchy (0.0-1.0) |
+
+### Image Processing
+
+```toml
+[images]
+extract_images = true
+target_dpi = 300
+max_image_dimension = 4096
+auto_adjust_dpi = true
+min_dpi = 72
+max_dpi = 600
+```
+
+| Option                | Type    | Default | Description                                  |
+| --------------------- | ------- | ------- | -------------------------------------------- |
+| `extract_images`      | boolean | `true`  | Extract images from documents                |
+| `target_dpi`          | integer | `300`   | Target DPI for image processing              |
+| `max_image_dimension` | integer | `4096`  | Maximum image dimension in pixels            |
+| `auto_adjust_dpi`     | boolean | `true`  | Automatically adjust DPI based on image size |
+| `min_dpi`             | integer | `72`    | Minimum DPI threshold                        |
+| `max_dpi`             | integer | `600`   | Maximum DPI threshold                        |
+
+### Chunking Configuration
+
+```toml
+[chunking]
+max_chars = 1000
+max_overlap = 200
+
+[chunking.embedding]
+batch_size = 32
+normalize = true
+show_download_progress = true
+cache_dir = "~/.cache/kreuzberg/embeddings"
+
+[chunking.embedding.model]
+type = "preset"
+name = "balanced"
+```
+
+| Option                             | Type    | Default                           | Description                                                |
+| ---------------------------------- | ------- | --------------------------------- | ---------------------------------------------------------- |
+| `max_chars`                        | integer | `1000`                            | Maximum characters per chunk                               |
+| `max_overlap`                      | integer | `200`                             | Overlap between consecutive chunks                         |
+| `embedding.batch_size`             | integer | `32`                              | Batch size for embedding generation                        |
+| `embedding.normalize`              | boolean | `true`                            | Normalize embeddings to unit length                        |
+| `embedding.show_download_progress` | boolean | `true`                            | Show progress when downloading models                      |
+| `embedding.cache_dir`              | string  | `"~/.cache/kreuzberg/embeddings"` | Directory for caching embeddings                           |
+| `embedding.model.type`             | string  | `"preset"`                        | Model type: preset, fastembed, or custom                   |
+| `embedding.model.name`             | string  | `"balanced"`                      | Preset model name (balanced, fast, accurate, multilingual) |
+| `embedding.model.model`            | string  |                                   | FastEmbed model identifier                                 |
+| `embedding.model.model_id`         | string  |                                   | Custom HuggingFace model ID                                |
+| `embedding.model.dimensions`       | integer |                                   | Embedding dimensions                                       |
+
+### Keywords Configuration
+
+```toml
+[keywords]
+algorithm = "yake"
+max_keywords = 10
+min_score = 0.0
+ngram_range = [1, 3]
+language = "en"
+```
+
+| Option         | Type    | Default  | Description                                 |
+| -------------- | ------- | -------- | ------------------------------------------- |
+| `algorithm`    | string  | `"yake"` | Keyword extraction algorithm (yake or rake) |
+| `max_keywords` | integer | `10`     | Maximum keywords to extract                 |
+| `min_score`    | float   | `0.0`    | Minimum relevance score for keywords        |
+| `ngram_range`  | array   | `[1, 3]` | N-gram size range [min, max]                |
+| `language`     | string  | `"en"`   | Language code for keyword extraction        |
+
+### Token Reduction
+
+```toml
+[token_reduction]
+mode = "off"
+preserve_important_words = true
+```
+
+| Option                     | Type    | Default | Description                               |
+| -------------------------- | ------- | ------- | ----------------------------------------- |
+| `mode`                     | string  | `"off"` | Mode: off, aggressive, moderate, minimal  |
+| `preserve_important_words` | boolean | `true`  | Preserve important words during reduction |
+
+### Language Detection
+
+```toml
+[language_detection]
+enabled = true
+min_confidence = 0.8
+detect_multiple = false
+```
+
+| Option            | Type    | Default | Description                                |
+| ----------------- | ------- | ------- | ------------------------------------------ |
+| `enabled`         | boolean | `true`  | Enable automatic language detection        |
+| `min_confidence`  | float   | `0.8`   | Minimum confidence threshold for detection |
+| `detect_multiple` | boolean | `false` | Detect multiple languages in document      |
+
+### Post-Processor
+
+```toml
+[postprocessor]
+enabled = true
+```
+
+| Option    | Type    | Default | Description                                 |
+| --------- | ------- | ------- | ------------------------------------------- |
+| `enabled` | boolean | `true`  | Enable post-processing of extracted content |
+
+## FileExtractionConfig (Per-File Overrides)
+
+`FileExtractionConfig` is passed as an optional parameter to `batch_extract_file` / `batch_extract_bytes` (and their sync variants) to override settings per file in a batch. All fields are optional; a field left as `None` uses the batch default. The separate `_with_configs` functions were removed in v4.5.0.
+
+**Overridable fields:** `enable_quality_processing`, `ocr`, `force_ocr`, `chunking`, `images`, `pdf_options`, `token_reduction`, `language_detection`, `pages`, `keywords`, `postprocessor`, `html_options`, `result_format`, `output_format`, `include_document_structure`, `layout`.
+
+**Batch-level only (not overridable):** `max_concurrent_extractions`, `use_cache`, `acceleration`, `security_limits`.
+
+**Merge semantics:** For each file, `FileExtractionConfig` fields are overlaid on the batch `ExtractionConfig`. `None` falls through to batch default; `Some(value)` replaces the batch default for that file.
+
+```toml
+# FileExtractionConfig cannot be specified in config files —
+# it is a programmatic API for per-file overrides at runtime.
+```
+
+## Naming Conventions
+
+Kreuzberg uses consistent naming conventions across different contexts:
+
+| Context              | Convention | Example                                       |
+| -------------------- | ---------- | --------------------------------------------- |
+| Python               | snake_case | `max_chars`, `pdf_options`, `use_cache`       |
+| Node.js / TypeScript | camelCase  | `maxChars`, `pdfOptions`, `useCache`          |
+| Rust                 | snake_case | `max_chars`, `pdf_options`, `use_cache`       |
+| TOML / YAML / JSON   | snake_case | `max_chars`, `pdf_options`, `use_cache`       |
+| CLI flags            | kebab-case | `--max-chars`, `--pdf-options`, `--use-cache` |
+
+When switching between languages, apply the appropriate conversion:
+
+- Python → Node.js: `snake_case` to `camelCase`
+- CLI → Python: `kebab-case` to `snake_case`
+- TOML → Python: No conversion needed (both use `snake_case`)
+
+## Environment Variables
+
+The following environment variables can override configuration:
+
+| Variable         | Purpose                             | Example     |
+| ---------------- | ----------------------------------- | ----------- |
+| `KREUZBERG_HOST` | Server bind address (serve command) | `127.0.0.1` |
+| `KREUZBERG_PORT` | Server port (serve command)         | `8080`      |
+
+## Configuration Merging
+
+Configuration sources are merged in priority order (highest to lowest):
+
+1. **CLI flags** (highest priority)
+2. **Inline JSON configuration** (programmatic)
+3. **Configuration file** (lowest priority)
+
+Higher-priority sources override lower-priority ones. For example, a CLI flag `--max-chars 2000` overrides `max_chars = 1000` in the configuration file.
+
+## Example Configurations
+
+### Minimal Configuration
+
+```toml
+use_cache = true
+enable_quality_processing = true
+
+[ocr]
+backend = "tesseract"
+language = "eng"
+```
+
+### High-Quality PDF Extraction
+
+```toml
+use_cache = true
+enable_quality_processing = true
+force_ocr = false
+
+[ocr]
+backend = "tesseract"
+language = "eng"
+
+[ocr.tesseract_config]
+psm = 3
+oem = 3
+enable_table_detection = true
+table_min_confidence = 0.7
+
+[pdf_options]
+extract_images = true
+extract_metadata = true
+
+[pdf_options.hierarchy]
+enabled = true
+k_clusters = 6
+
+[images]
+extract_images = true
+target_dpi = 300
+```
+
+### Semantic Search Configuration
+
+```toml
+[chunking]
+max_chars = 800
+max_overlap = 150
+
+[chunking.embedding]
+batch_size = 32
+normalize = true
+cache_dir = "~/.cache/kreuzberg/embeddings"
+
+[chunking.embedding.model]
+type = "preset"
+name = "accurate"
+
+[keywords]
+algorithm = "yake"
+max_keywords = 15
+```
+
+## Field Name Reference
+
+Critical field names to use in configuration files:
+
+- `max_chars` (NOT `max_characters`)
+- `max_overlap` (NOT `overlap`)
+- `table_min_confidence`
+- `table_column_threshold`
+- `table_row_threshold_ratio`
+- `ocr_coverage_threshold`
+- `k_clusters`
+- `include_bbox`
+- `enable_table_detection`
+- `auto_rotate`
+- `auto_adjust_dpi`
+- `show_download_progress`
+- `min_confidence`
+- `detect_multiple`
+
+Always verify field names against the source configuration file when adding new options.
diff --git a/plugins/kreuzberg/skills/kreuzberg/references/nodejs-api.md b/plugins/kreuzberg/skills/kreuzberg/references/nodejs-api.md
new file mode 100644
index 0000000..586456c
--- /dev/null
+++ b/plugins/kreuzberg/skills/kreuzberg/references/nodejs-api.md
@@ -0,0 +1,1380 @@
+# Node.js/TypeScript API Reference
+
+## Overview
+
+**Package**: `@kreuzberg/node` — A high-performance TypeScript SDK built on a Rust core for document intelligence and content extraction.
+
+Supports both **ESM** (`import`) and **CommonJS** (`require`):
+
+```typescript
+// ESM
+import { extractFile, batchExtractFiles } from "@kreuzberg/node";
+
+// CommonJS
+const { extractFile, batchExtractFiles } = require("@kreuzberg/node");
+```
+
+**Current Version**: 4.2.14
+
+---
+
+## Core Extraction Functions
+
+All extraction functions return `ExtractionResult` containing extracted content, metadata, tables, and optional chunks/images.
+
+### Single File Extraction
+
+#### `extractFile(filePath, mimeType?, config?): Promise<ExtractionResult>`
+
+Extract content from a single file asynchronously.
+
+```typescript
+import { extractFile } from "@kreuzberg/node";
+
+// Auto-detect MIME type from file extension
+const result = await extractFile("document.pdf");
+console.log(result.content);
+
+// Explicit MIME type
+const result2 = await extractFile("document.pdf", "application/pdf");
+
+// With configuration
+const result3 = await extractFile("document.pdf", null, {
+  chunking: {
+    maxChars: 1000,
+    maxOverlap: 200,
+  },
+});
+```
+
+**Parameters**:
+
+- `filePath: string` — Path to the file to extract
+- `mimeType?: string | null` — Optional MIME type hint (auto-detect if null)
+- `config?: ExtractionConfig` — Optional extraction configuration
+
+**Returns**: `Promise<ExtractionResult>`
+
+**Throws**: `ParsingError`, `OcrError`, `ValidationError`, `KreuzbergError`
+
+#### `extractFileSync(filePath, mimeType?, config?): ExtractionResult`
+
+Extract content from a single file synchronously.
+
+```typescript
+import { extractFileSync } from "@kreuzberg/node";
+
+const result = extractFileSync("document.pdf");
+console.log(result.content);
+```
+
+**Parameters**: Same as `extractFile()`
+
+**Returns**: `ExtractionResult`
+
+---
+
+### Raw Bytes Extraction
+
+#### `extractBytes(data, mimeType, config?): Promise<ExtractionResult>`
+
+Extract content from raw bytes (Buffer or Uint8Array) asynchronously.
+
+```typescript
+import { extractBytes } from "@kreuzberg/node";
+import { readFile } from "fs/promises";
+
+const data = await readFile("document.pdf");
+const result = await extractBytes(data, "application/pdf");
+console.log(result.content);
+```
+
+**Parameters**:
+
+- `data: Buffer | Uint8Array` — Raw file content
+- `mimeType: string` — MIME type (required)
+- `config?: ExtractionConfig` — Optional configuration
+
+**Returns**: `Promise<ExtractionResult>`
+
+#### `extractBytesSync(data, mimeType, config?): ExtractionResult`
+
+Extract content from raw bytes synchronously.
+
+```typescript
+import { extractBytesSync } from "@kreuzberg/node";
+import { readFileSync } from "fs";
+
+const data = readFileSync("document.pdf");
+const result = extractBytesSync(data, "application/pdf");
+```
+
+**Parameters**: Same as `extractBytes()`
+
+**Returns**: `ExtractionResult`
+
+---
+
+### Batch Extraction (Recommended)
+
+For processing multiple documents, batch APIs provide superior performance and memory management.
+
+#### `batchExtractFiles(paths, config?): Promise<ExtractionResult[]>`
+
+Extract content from multiple files in parallel (asynchronous).
+
+```typescript
+import { batchExtractFiles } from "@kreuzberg/node";
+
+const files = ["doc1.pdf", "doc2.docx", "doc3.xlsx"];
+const results = await batchExtractFiles(files);
+
+results.forEach((result, i) => {
+  console.log(`${files[i]}: ${result.content.substring(0, 100)}...`);
+});
+```
+
+**Parameters**:
+
+- `paths: string[]` — Array of file paths
+- `config?: ExtractionConfig` — Configuration (applied to all files)
+
+**Returns**: `Promise<ExtractionResult[]>` — Results in same order as input
+
+#### `batchExtractFilesSync(paths, config?): ExtractionResult[]`
+
+Extract content from multiple files synchronously.
+
+```typescript
+import { batchExtractFilesSync } from "@kreuzberg/node";
+
+const files = ["doc1.pdf", "doc2.docx", "doc3.xlsx"];
+const results = batchExtractFilesSync(files);
+```
+
+**Parameters**: Same as `batchExtractFiles()`
+
+**Returns**: `ExtractionResult[]`
+
+#### `batchExtractBytes(dataList, mimeTypes, config?): Promise<ExtractionResult[]>`
+
+Extract content from multiple byte arrays in parallel (asynchronous).
+
+```typescript
+import { batchExtractBytes } from "@kreuzberg/node";
+import { readFile } from "fs/promises";
+
+const files = ["doc1.pdf", "doc2.docx"];
+const dataList = await Promise.all(files.map((f) => readFile(f)));
+const mimeTypes = [
+  "application/pdf",
+  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+];
+
+const results = await batchExtractBytes(dataList, mimeTypes);
+```
+
+**Parameters**:
+
+- `dataList: Uint8Array[]` — Array of file contents
+- `mimeTypes: string[]` — MIME types, one per item (the array length must match `dataList`)
+- `config?: ExtractionConfig` — Configuration (applied to all items)
+
+**Returns**: `Promise<ExtractionResult[]>`
+
+#### `batchExtractBytesSync(dataList, mimeTypes, config?): ExtractionResult[]`
+
+Extract content from multiple byte arrays synchronously.
+
+```typescript
+import { batchExtractBytesSync } from "@kreuzberg/node";
+import { readFileSync } from "fs";
+
+const dataList = ["doc1.pdf", "doc2.docx"].map((f) => readFileSync(f));
+const mimeTypes = [
+  "application/pdf",
+  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+];
+
+const results = batchExtractBytesSync(dataList, mimeTypes);
+```
+
+**Parameters**: Same as `batchExtractBytes()`
+
+**Returns**: `ExtractionResult[]`
+
+#### `batchExtractFilesWithConfigs(paths, fileConfigs, config?): Promise<ExtractionResult[]>`
+
+Extract multiple files with per-file configuration overrides (asynchronous).
+
+```typescript
+const results = await batchExtractFilesWithConfigs(
+  ["report.pdf", "scanned.pdf"],
+  [null, { forceOcr: true, ocr: { backend: "tesseract", language: "deu" } }],
+);
+```
+
+**Parameters**:
+
+- `paths: string[]` — File paths
+- `fileConfigs: (FileExtractionConfig | null)[]` — Per-file configs (null = use batch defaults)
+- `config?: ExtractionConfig` — Batch-level configuration
+
+#### `batchExtractFilesWithConfigsSync(paths, fileConfigs, config?): ExtractionResult[]`
+
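+Synchronous variant of `batchExtractFilesWithConfigs()`: takes the same parameters and returns `ExtractionResult[]` directly instead of a promise.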
+
+#### `batchExtractBytesWithConfigs(dataList, mimeTypes, fileConfigs, config?): Promise<ExtractionResult[]>`
+
+Extract multiple byte arrays with per-file overrides (asynchronous).
+
+#### `batchExtractBytesWithConfigsSync(dataList, mimeTypes, fileConfigs, config?): ExtractionResult[]`
+
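+Synchronous variant of `batchExtractBytesWithConfigs()`: takes the same parameters and returns `ExtractionResult[]` directly instead of a promise.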
+
+---
+
+## Worker Pool APIs
+
+Worker pools enable concurrent extraction using Node.js worker threads for CPU-bound processing.
+
+### `createWorkerPool(size?): WorkerPool`
+
+Create a worker pool for concurrent extraction.
+
+```typescript
+import { createWorkerPool } from "@kreuzberg/node";
+
+// Create pool with default size (number of CPU cores)
+const pool = createWorkerPool();
+
+// Create pool with specific size
+const pool4 = createWorkerPool(4);
+```
+
+**Parameters**:
+
+- `size?: number` — Number of workers (defaults to CPU core count)
+
+**Returns**: `WorkerPool` — Opaque handle for use with worker extraction functions
+
+### `extractFileInWorker(pool, filePath, mimeType?, config?): Promise<ExtractionResult>`
+
+Extract a single file using a worker from the pool.
+
+```typescript
+import { createWorkerPool, extractFileInWorker, closeWorkerPool } from "@kreuzberg/node";
+
+const pool = createWorkerPool(4);
+
+try {
+  const files = ["doc1.pdf", "doc2.docx", "doc3.xlsx"];
+  const results = await Promise.all(files.map((f) => extractFileInWorker(pool, f)));
+
+  results.forEach((r, i) => {
+    console.log(`${files[i]}: ${r.content.substring(0, 100)}...`);
+  });
+} finally {
+  await closeWorkerPool(pool);
+}
+```
+
+**Parameters**:
+
+- `pool: WorkerPool` — Worker pool instance
+- `filePath: string` — File path
+- `mimeType?: string | null` — Optional MIME type
+- `config?: ExtractionConfig` — Optional configuration
+
+**Returns**: `Promise<ExtractionResult>`
+
+### `batchExtractFilesInWorker(pool, paths, config?): Promise<ExtractionResult[]>`
+
+Extract multiple files using the worker pool for concurrent processing.
+
+```typescript
+import { createWorkerPool, batchExtractFilesInWorker, closeWorkerPool } from "@kreuzberg/node";
+
+const pool = createWorkerPool(4);
+
+try {
+  const files = ["invoice1.pdf", "invoice2.pdf", "invoice3.pdf"];
+  const results = await batchExtractFilesInWorker(pool, files, {
+    ocr: { backend: "tesseract", language: "eng" },
+  });
+
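+  // extractAmount() is a placeholder for your own parsing helper; it is not provided by this example's imports.
+  const total = results.reduce((sum, r) => sum + extractAmount(r.content), 0);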
+  console.log(`Total: $${total}`);
+} finally {
+  await closeWorkerPool(pool);
+}
+```
+
+**Parameters**:
+
+- `pool: WorkerPool` — Worker pool instance
+- `paths: string[]` — File paths
+- `config?: ExtractionConfig` — Configuration (applied to all files)
+
+**Returns**: `Promise<ExtractionResult[]>`
+
+### `getWorkerPoolStats(pool): WorkerPoolStats`
+
+Get statistics about a worker pool.
+
+```typescript
+import { createWorkerPool, getWorkerPoolStats } from "@kreuzberg/node";
+
+const pool = createWorkerPool(4);
+const stats = getWorkerPoolStats(pool);
+
+console.log(`Pool size: ${stats.size}`);
+console.log(`Active workers: ${stats.activeWorkers}`);
+console.log(`Queued tasks: ${stats.queuedTasks}`);
+```
+
+**Parameters**:
+
+- `pool: WorkerPool` — Worker pool instance
+
+**Returns**: `WorkerPoolStats`
+
+### `closeWorkerPool(pool): Promise<void>`
+
+Close a worker pool and shut down all worker threads.
+
+```typescript
+import { createWorkerPool, closeWorkerPool } from "@kreuzberg/node";
+
+const pool = createWorkerPool(4);
+
+try {
+  // Use pool
+} finally {
+  await closeWorkerPool(pool);
+}
+```
+
+**Parameters**:
+
+- `pool: WorkerPool` — Worker pool instance to close
+
+**Returns**: `Promise<void>`
+
+---
+
+## Configuration Interface
+
+### `ExtractionConfig`
+
+Main configuration object controlling extraction behavior.
+
+```typescript
+interface ExtractionConfig {
+  // Caching and processing
+  useCache?: boolean; // Default: true
+  enableQualityProcessing?: boolean; // Default: false
+
+  // OCR configuration
+  ocr?: OcrConfig; // OCR settings
+  forceOcr?: boolean; // Default: false
+
+  // Document processing
+  chunking?: ChunkingConfig; // Break into chunks
+  images?: ImageExtractionConfig; // Image extraction
+  pdfOptions?: PdfConfig; // PDF-specific options
+  tokenReduction?: TokenReductionConfig; // Token optimization
+  languageDetection?: LanguageDetectionConfig; // Language detection
+  postprocessor?: PostProcessorConfig; // Post-processing
+  htmlOptions?: HtmlConversionOptions; // HTML conversion
+  keywords?: KeywordConfig; // Keyword extraction
+  pages?: PageExtractionConfig; // Page extraction
+
+  // Output control
+  maxConcurrentExtractions?: number; // Default: 4
+  outputFormat?: "plain" | "markdown" | "djot" | "html"; // Default: 'plain'
+  resultFormat?: "unified" | "element_based"; // Default: 'unified'
+}
+```
+
+### `FileExtractionConfig`
+
+Per-file overrides for batch operations. All fields optional (omitted = use batch default).
+
+```typescript
+interface FileExtractionConfig {
+  enableQualityProcessing?: boolean;
+  ocr?: OcrConfig;
+  forceOcr?: boolean;
+  chunking?: ChunkingConfig;
+  images?: ImageExtractionConfig;
+  pdfOptions?: PdfConfig;
+  tokenReduction?: TokenReductionConfig;
+  languageDetection?: LanguageDetectionConfig;
+  pages?: PageExtractionConfig;
+  keywords?: KeywordConfig;
+  postprocessor?: PostProcessorConfig;
+  outputFormat?: "plain" | "markdown" | "djot" | "html";
+  resultFormat?: "unified" | "element_based";
+  includeDocumentStructure?: boolean;
+}
+```
+
+Excluded (batch-level only): `maxConcurrentExtractions`, `useCache`, `securityLimits`.
+
+### `ChunkingConfig`
+
+Configuration for breaking documents into chunks (useful for RAG and vector databases).
+
+```typescript
+interface ChunkingConfig {
+  maxChars?: number; // Max characters per chunk (default: 4096)
+  maxOverlap?: number; // Overlap between chunks (default: 512)
+  chunkSize?: number; // Alternative unit (mutually exclusive with maxChars)
+  chunkOverlap?: number; // Alternative unit (mutually exclusive with maxOverlap)
+  preset?: string; // Named preset ('default', 'aggressive', 'minimal')
+  embedding?: Record<string, unknown>; // Embedding config
+  enabled?: boolean; // Enable chunking (default: true when config provided)
+}
+```
+
+**Key Point**: Use `maxChars` and `maxOverlap`, NOT `maxCharacters` or `overlap`.
+
+### `OcrConfig`
+
+Configuration for optical character recognition.
+
+```typescript
+interface OcrConfig {
+  backend: string; // OCR backend name (e.g., 'tesseract')
+  language?: string; // Language code (e.g., 'eng', 'deu')
+  tesseractConfig?: TesseractConfig;
+}
+
+interface TesseractConfig {
+  psm?: number; // Page Segmentation Mode (0-13)
+  enableTableDetection?: boolean;
+  tesseditCharWhitelist?: string; // Character whitelist
+}
+```
+
+### `ImageExtractionConfig`
+
+Configuration for extracting and optimizing images.
+
+```typescript
+interface ImageExtractionConfig {
+  extractImages?: boolean; // Default: true
+  targetDpi?: number; // Target DPI (default: 150)
+  maxImageDimension?: number; // Max width/height in pixels (default: 2000)
+  autoAdjustDpi?: boolean; // Auto-adjust DPI (default: true)
+  minDpi?: number; // Minimum DPI (default: 72)
+  maxDpi?: number; // Maximum DPI (default: 300)
+}
+```
+
+### `PdfConfig`
+
+PDF-specific extraction options.
+
+```typescript
+interface PdfConfig {
+  extractImages?: boolean; // Default: true
+  passwords?: string[]; // Passwords for encrypted PDFs
+  extractMetadata?: boolean; // Default: true
+  hierarchy?: HierarchyConfig; // Hierarchy extraction
+}
+```
+
+### `LanguageDetectionConfig`
+
+Configuration for automatic language detection.
+
+```typescript
+interface LanguageDetectionConfig {
+  enabled?: boolean; // Default: true
+  minConfidence?: number; // Threshold 0.0-1.0 (default: 0.5)
+  detectMultiple?: boolean; // Detect multiple languages (default: false)
+}
+```
+
+### `TokenReductionConfig`
+
+Configuration for optimizing token usage.
+
+```typescript
+interface TokenReductionConfig {
+  mode?: string; // 'aggressive' or 'conservative' (default: 'conservative')
+  preserveImportantWords?: boolean; // Default: true
+}
+```
+
+### `KeywordConfig`
+
+Configuration for keyword extraction.
+
+```typescript
+interface KeywordConfig {
+  algorithm?: "yake" | "rake"; // Default: 'yake'
+  maxKeywords?: number; // Maximum keywords (default: 10)
+  minScore?: number; // Minimum relevance score (default: 0.1)
+  ngramRange?: [number, number]; // N-gram range (default: [1, 3])
+  language?: string; // Language code (default: 'en')
+  yakeParams?: YakeParams;
+  rakeParams?: RakeParams;
+}
+```
+
+### `PageExtractionConfig`
+
+Configuration for page-level content tracking.
+
+```typescript
+interface PageExtractionConfig {
+  extractPages?: boolean; // Extract as separate pages array
+  insertPageMarkers?: boolean; // Insert page markers in content
+  markerFormat?: string; // Marker format with {page_num} placeholder
+}
+```
+
+### `HtmlConversionOptions`
+
+Configuration for HTML to Markdown conversion.
+
+```typescript
+interface HtmlConversionOptions {
+  headingStyle?: "atx" | "underlined" | "atx_closed";
+  listIndentType?: "spaces" | "tabs";
+  listIndentWidth?: number;
+  bullets?: string;
+  strongEmSymbol?: string;
+  escapeAsterisks?: boolean;
+  escapeUnderscores?: boolean;
+  escapeMisc?: boolean;
+  escapeAscii?: boolean;
+  codeLanguage?: string;
+  autolinks?: boolean;
+  defaultTitle?: boolean;
+  brInTables?: boolean;
+  hocrSpatialTables?: boolean;
+  highlightStyle?: "double_equal" | "html" | "bold" | "none";
+  extractMetadata?: boolean;
+  whitespaceMode?: "normalized" | "strict";
+  stripNewlines?: boolean;
+  wrap?: boolean;
+  wrapWidth?: number;
+  convertAsInline?: boolean;
+  subSymbol?: string;
+  supSymbol?: string;
+  newlineStyle?: "spaces" | "backslash";
+  codeBlockStyle?: "indented" | "backticks" | "tildes";
+  keepInlineImagesIn?: string[];
+  encoding?: string;
+  debug?: boolean;
+  stripTags?: string[];
+  preserveTags?: string[];
+  preprocessing?: HtmlPreprocessingOptions;
+}
+```
+
+---
+
+## Result Types
+
+### `ExtractionResult`
+
+Complete extraction result from document processing.
+
+```typescript
+interface ExtractionResult {
+  // Main content
+  content: string;
+
+  // Document type
+  mimeType: string;
+
+  // Metadata (format-specific)
+  metadata: Metadata;
+
+  // Extracted structures
+  tables: Table[];
+
+  // Optional processed data
+  detectedLanguages: string[] | null;
+  chunks: Chunk[] | null; // From chunking config
+  images: ExtractedImage[] | null; // From image extraction
+  elements?: Element[] | null; // From element_based result format
+  pages?: PageContent[] | null; // From page extraction
+  extractedKeywords?: ExtractedKeyword[] | null; // Extracted keywords with scores
+  qualityScore?: number | null; // Overall extraction quality (0.0-1.0)
+  processingWarnings?: ProcessingWarning[]; // Non-fatal warnings from pipeline
+}
+```
+
+### `Table`
+
+Extracted table data with cell structure.
+
+```typescript
+interface Table {
+  cells: string[][]; // 2D array of cell contents (rows × columns)
+  markdown: string; // Markdown representation
+  pageNumber: number; // 1-indexed page number
+}
+```
+
+### `Chunk`
+
+Text chunk for RAG or vector database indexing.
+
+```typescript
+interface Chunk {
+  content: string;
+  embedding?: number[] | null; // Vector embedding if computed
+  metadata: ChunkMetadata;
+}
+
+interface ChunkMetadata {
+  byteStart: number; // UTF-8 byte offset in original text
+  byteEnd: number; // UTF-8 byte offset
+  tokenCount?: number | null;
+  chunkIndex: number; // Zero-based index
+  totalChunks: number; // Total number of chunks
+  firstPage?: number | null; // 1-indexed, if page tracking enabled
+  lastPage?: number | null;
+}
+```
+
+### `ExtractedImage`
+
+Image extracted from document.
+
+```typescript
+interface ExtractedImage {
+  data: Uint8Array; // Raw image bytes
+  format: string; // Format (e.g., 'png', 'jpeg', 'tiff')
+  imageIndex: number; // Sequential index (0-indexed)
+  pageNumber?: number | null;
+  width?: number | null;
+  height?: number | null;
+  colorspace?: string | null;
+  bitsPerComponent?: number | null;
+  isMask: boolean;
+  description?: string | null;
+  ocrResult?: ExtractionResult | null; // OCR result if processed
+}
+```
+
+### `PageContent`
+
+Per-page content when page extraction is enabled.
+
+```typescript
+interface PageContent {
+  pageNumber: number; // 1-indexed
+  content: string; // Page text content
+  tables: Table[]; // Tables on this page
+  images: ExtractedImage[]; // Images on this page
+}
+```
+
+### `ExtractedKeyword`
+
+Extracted keyword with relevance score and position information.
+
+```typescript
+interface ExtractedKeyword {
+  text: string; // Keyword text
+  score: number; // Relevance score (0.0-1.0)
+  algorithm: string; // Algorithm used ("tfidf", "textrank", "yake", etc.)
+  positions?: number[] | null; // Character positions in content (if available)
+}
+```
+
+### `ProcessingWarning`
+
+Non-fatal warning encountered during document processing.
+
+```typescript
+interface ProcessingWarning {
+  source: string; // Component that generated the warning
+  message: string; // Warning message describing the issue
+}
+```
+
+### `Metadata`
+
+Extraction result metadata (format-specific).
+
+```typescript
+interface Metadata {
+  // Common fields
+  language?: string | null;
+  date?: string | null;
+  subject?: string | null;
+  format_type?:
+    | "pdf"
+    | "excel"
+    | "email"
+    | "pptx"
+    | "archive"
+    | "image"
+    | "xml"
+    | "text"
+    | "html"
+    | "ocr";
+
+  // PDF metadata
+  title?: string | null;
+  author?: string | null;
+  creator?: string | null;
+  producer?: string | null;
+  creation_date?: string | null;
+  modification_date?: string | null;
+  page_count?: number;
+
+  // Excel metadata
+  sheet_count?: number;
+  sheet_names?: string[];
+
+  // Email metadata
+  from_email?: string | null;
+  from_name?: string | null;
+  to_emails?: string[];
+  cc_emails?: string[];
+  bcc_emails?: string[];
+  message_id?: string | null;
+  attachments?: string[];
+
+  // Image metadata
+  width?: number;
+  height?: number;
+  exif?: Record<string, string>;
+
+  // OCR metadata
+  psm?: number;
+  output_format?: string;
+  table_count?: number;
+
+  // HTML metadata
+  canonical_url?: string | null;
+  html_language?: string | null;
+  text_direction?: "ltr" | "rtl" | "auto" | null;
+  open_graph?: Record<string, string>;
+  twitter_card?: Record<string, string>;
+  meta_tags?: Record<string, string>;
+  html_headers?: HeaderMetadata[];
+  html_links?: LinkMetadata[];
+  html_images?: HtmlImageMetadata[];
+  structured_data?: StructuredData[];
+
+  // Text metadata
+  line_count?: number;
+  word_count?: number;
+  character_count?: number;
+  headers?: string[] | null;
+  links?: [string, string][] | null;
+  code_blocks?: [string, string][] | null;
+
+  // Page structure
+  page_structure?: PageStructure | null;
+
+  // Additional typed fields
+  category?: string | null;
+  tags?: string[];
+  document_version?: string | null;
+  abstract_text?: string | null;
+
+  // Custom fields from postprocessors
+  [key: string]: unknown;
+}
+```
+
+---
+
+## Error Handling
+
+### Error Classes
+
+```typescript
+import {
+  KreuzbergError,
+  ParsingError,
+  OcrError, // Note: PascalCase "OcrError", not "OCRError"
+  ValidationError,
+  MissingDependencyError,
+  CacheError,
+  ImageProcessingError,
+  PluginError,
+  ErrorCode,
+} from "@kreuzberg/node";
+```
+
+**Error Hierarchy**:
+
+- `KreuzbergError` — Base class for all Kreuzberg errors
+  - `ParsingError` — Document format invalid or corrupted
+  - `OcrError` — OCR processing failed
+  - `ValidationError` — Extraction validation failed
+  - `MissingDependencyError` — Required dependency unavailable
+  - `CacheError` — Cache operation failed
+  - `ImageProcessingError` — Image extraction or processing failed
+  - `PluginError` — Plugin registration or execution failed
+
+### Error Diagnostics
+
+```typescript
+import {
+  classifyError,
+  getErrorCodeDescription,
+  getErrorCodeName,
+  getLastErrorCode,
+  getLastPanicContext,
+} from "@kreuzberg/node";
+
+try {
+  const result = await extractFile("document.pdf");
+} catch (error) {
+  const classification = classifyError(error.message);
+  console.log(`Error code: ${getErrorCodeName(classification.code)}`);
+  console.log(`Description: ${getErrorCodeDescription(classification.code)}`);
+  console.log(`Confidence: ${classification.confidence}`);
+}
+```
+
+### `ErrorCode` Enum
+
+```typescript
+enum ErrorCode {
+  Success = 0,
+  GenericError = 1,
+  Panic = 2,
+  InvalidArgument = 3,
+  IoError = 4,
+  ParsingError = 5,
+  OcrError = 6,
+  MissingDependency = 7,
+}
+```
+
+---
+
+## Plugin System
+
+### Post-Processors
+
+Custom post-processors can enrich extraction results without failing the extraction if they encounter errors.
+
+#### `registerPostProcessor(processor): void`
+
+Register a custom post-processor.
+
+```typescript
+import { registerPostProcessor, extractFile } from "@kreuzberg/node";
+
+const processor = {
+  name() {
+    return "my_processor";
+  },
+
+  async process(result) {
+    // Enrich result with custom metadata
+    result.metadata["custom_field"] = "value";
+    return result;
+  },
+
+  processingStage() {
+    return "late"; // 'early', 'middle', or 'late'
+  },
+
+  async initialize() {
+    // Called once when registered
+  },
+
+  async shutdown() {
+    // Called when unregistered
+  },
+};
+
+registerPostProcessor(processor);
+const result = await extractFile("document.pdf");
+```
+
+#### `unregisterPostProcessor(name): void`
+
+Remove a registered post-processor.
+
+```typescript
+import { unregisterPostProcessor } from "@kreuzberg/node";
+
+unregisterPostProcessor("my_processor");
+```
+
+#### `listPostProcessors(): string[]`
+
+List all registered post-processor names.
+
+```typescript
+import { listPostProcessors } from "@kreuzberg/node";
+
+const processors = listPostProcessors();
+console.log("Registered processors:", processors);
+```
+
+#### `clearPostProcessors(): void`
+
+Unregister all post-processors.
+
+```typescript
+import { clearPostProcessors } from "@kreuzberg/node";
+
+clearPostProcessors();
+```
+
+### Validators
+
+Custom validators check extraction results and fail the extraction if validation fails (unlike post-processors).
+
+#### `registerValidator(validator): void`
+
+Register a custom validator.
+
+```typescript
+import { registerValidator, extractFile } from "@kreuzberg/node";
+
+const validator = {
+  name() {
+    return "content_length_validator";
+  },
+
+  validate(result) {
+    if (result.content.length < 10) {
+      throw new Error("Content too short");
+    }
+  },
+
+  priority() {
+    return 100; // Higher = runs first
+  },
+
+  shouldValidate(result) {
+    return result.mimeType === "application/pdf"; // Conditional validation
+  },
+
+  async initialize() {
+    // Called once when registered
+  },
+
+  async shutdown() {
+    // Called when unregistered
+  },
+};
+
+registerValidator(validator);
+const result = await extractFile("document.pdf");
+```
+
+#### `unregisterValidator(name): void`
+
+Remove a registered validator.
+
+```typescript
+import { unregisterValidator } from "@kreuzberg/node";
+
+unregisterValidator("content_length_validator");
+```
+
+#### `listValidators(): string[]`
+
+List all registered validator names.
+
+```typescript
+import { listValidators } from "@kreuzberg/node";
+
+const validators = listValidators();
+```
+
+#### `clearValidators(): void`
+
+Unregister all validators.
+
+```typescript
+import { clearValidators } from "@kreuzberg/node";
+
+clearValidators();
+```
+
+### OCR Backends
+
+Custom OCR backends can be registered to handle image text extraction.
+
+#### `registerOcrBackend(backend): void`
+
+Register a custom OCR backend.
+
+```typescript
+import { registerOcrBackend, extractFile } from "@kreuzberg/node";
+
+const backend = {
+  name() {
+    return "my-ocr";
+  },
+
+  supportedLanguages() {
+    return ["eng", "deu", "fra"];
+  },
+
+  async processImage(imageBytes, language) {
+    // imageBytes: Uint8Array or Base64 string
+    const buffer =
+      typeof imageBytes === "string" ? Buffer.from(imageBytes, "base64") : Buffer.from(imageBytes);
+
+    // Process and extract text
+    return {
+      content: "extracted text",
+      mime_type: "text/plain",
+      metadata: { confidence: 0.95, language },
+      tables: [],
+    };
+  },
+
+  async initialize() {
+    // Load models, setup resources
+  },
+
+  async shutdown() {
+    // Cleanup resources
+  },
+};
+
+registerOcrBackend(backend);
+```
+
+#### `GutenOcrBackend`
+
+Built-in OCR backend implementation using Guten-OCR.
+
+```typescript
+import { GutenOcrBackend, registerOcrBackend, extractFile } from "@kreuzberg/node";
+
+const backend = new GutenOcrBackend();
+await backend.initialize();
+registerOcrBackend(backend);
+
+const result = await extractFile("scanned.pdf", null, {
+  ocr: { backend: "guten-ocr", language: "eng" },
+});
+```
+
+#### `unregisterOcrBackend(name): void`
+
+Remove a registered OCR backend.
+
+```typescript
+import { unregisterOcrBackend } from "@kreuzberg/node";
+
+unregisterOcrBackend("my-ocr");
+```
+
+#### `listOcrBackends(): string[]`
+
+List all registered OCR backend names.
+
+```typescript
+import { listOcrBackends } from "@kreuzberg/node";
+
+const backends = listOcrBackends();
+```
+
+#### `clearOcrBackends(): void`
+
+Unregister all OCR backends.
+
+```typescript
+import { clearOcrBackends } from "@kreuzberg/node";
+
+clearOcrBackends();
+```
+
+---
+
+## MIME Type Utilities
+
+### `detectMimeType(data): string | null`
+
+Detect MIME type from file content (magic bytes).
+
+```typescript
+import { detectMimeType } from "@kreuzberg/node";
+import { readFileSync } from "fs";
+
+const data = readFileSync("document");
+const mimeType = detectMimeType(data);
+console.log(`Detected MIME type: ${mimeType}`);
+```
+
+### `detectMimeTypeFromPath(filePath): string | null`
+
+Detect MIME type from file extension.
+
+```typescript
+import { detectMimeTypeFromPath } from "@kreuzberg/node";
+
+const mimeType = detectMimeTypeFromPath("document.pdf");
+console.log(`MIME type: ${mimeType}`); // 'application/pdf'
+```
+
+### `getExtensionsForMime(mimeType): string[]`
+
+Get file extensions for a MIME type.
+
+```typescript
+import { getExtensionsForMime } from "@kreuzberg/node";
+
+const extensions = getExtensionsForMime("application/pdf");
+console.log(`Extensions: ${extensions}`); // ['.pdf']
+```
+
+### `validateMimeType(mimeType): boolean`
+
+Check if a MIME type is valid.
+
+```typescript
+import { validateMimeType } from "@kreuzberg/node";
+
+if (validateMimeType("application/pdf")) {
+  console.log("Valid MIME type");
+}
+```
+
+---
+
+## Configuration Loading
+
+### `ExtractionConfig.fromFile(filePath): ExtractionConfig`
+
+Load extraction configuration from a file (JSON, YAML, or TOML).
+
+```typescript
+import { ExtractionConfig, extractFile } from "@kreuzberg/node";
+
+const config = ExtractionConfig.fromFile("./kreuzberg.toml");
+const result = await extractFile("document.pdf", null, config);
+```
+
+### `ExtractionConfig.discover(): ExtractionConfig | null`
+
+Auto-discover extraction configuration file in current and parent directories.
+
+```typescript
+import { ExtractionConfig, extractFile } from "@kreuzberg/node";
+
+// Searches for kreuzberg.{toml,yaml,json} in current directory and parents
+const config = ExtractionConfig.discover();
+if (config) {
+  const result = await extractFile("document.pdf", null, config);
+}
+```
+
+---
+
+## Embeddings
+
+### `getEmbeddingPreset(name): EmbeddingPreset | null`
+
+Get a named embedding model preset.
+
+```typescript
+import { getEmbeddingPreset } from "@kreuzberg/node";
+
+const preset = getEmbeddingPreset("balanced");
+if (preset) {
+  console.log(`Model: ${preset.modelName}`);
+  console.log(`Dimensions: ${preset.dimensions}`);
+}
+```
+
+### `listEmbeddingPresets(): string[]`
+
+List all available embedding presets.
+
+```typescript
+import { listEmbeddingPresets } from "@kreuzberg/node";
+
+const presets = listEmbeddingPresets();
+console.log("Available presets:", presets);
+```
+
+### `EmbeddingPreset`
+
+Type definition for embedding model presets.
+
+```typescript
+interface EmbeddingPreset {
+  name: string; // Preset name (e.g., "fast", "balanced", "quality", "multilingual")
+  chunkSize: number; // Recommended chunk size in characters
+  overlap: number; // Recommended overlap in characters
+  modelName: string; // Model identifier (e.g., "AllMiniLML6V2Q", "BGEBaseENV15")
+  dimensions: number; // Embedding vector dimensions
+  description: string; // Human-readable description
+}
+```
+
+---
+
+## Plugin Protocols
+
+### `PostProcessorProtocol`
+
+Interface for custom post-processors.
+
+```typescript
+interface PostProcessorProtocol {
+  name(): string;
+
+  process(result: ExtractionResult): ExtractionResult | Promise<ExtractionResult>;
+
+  processingStage?(): ProcessingStage; // 'early' | 'middle' | 'late'
+
+  initialize?(): void | Promise<void>;
+
+  shutdown?(): void | Promise<void>;
+}
+```
+
+### `ValidatorProtocol`
+
+Interface for custom validators.
+
+```typescript
+interface ValidatorProtocol {
+  name(): string;
+
+  validate(result: ExtractionResult): void | Promise<void>;
+
+  priority?(): number; // Higher = runs first
+
+  shouldValidate?(result: ExtractionResult): boolean;
+
+  initialize?(): void | Promise<void>;
+
+  shutdown?(): void | Promise<void>;
+}
+```
+
+### `OcrBackendProtocol`
+
+Interface for custom OCR backends.
+
+```typescript
+interface OcrBackendProtocol {
+  name(): string;
+
+  supportedLanguages(): string[];
+
+  processImage(
+    imageBytes: Uint8Array | string,
+    language: string,
+  ): Promise<{
+    content: string;
+    mime_type: string;
+    metadata: Record<string, unknown>;
+    tables: unknown[];
+  }>;
+
+  initialize?(): void | Promise<void>;
+
+  shutdown?(): void | Promise<void>;
+}
+```
+
+---
+
+## Supported Document Formats
+
+- **Documents**: PDF, DOCX, PPTX, XLSX, DOC, PPT
+- **Text**: Markdown, Plain Text, XML, JSON, YAML, TOML
+- **Web**: HTML (converted to Markdown)
+- **Email**: EML, MSG
+- **Images**: PNG, JPEG, TIFF (with OCR support)
+- **Archives**: ZIP, TAR, GZIP (file listing)
+
+---
+
+## Registry Functions
+
+### Document Extractors
+
+```typescript
+import {
+  listDocumentExtractors,
+  unregisterDocumentExtractor,
+  clearDocumentExtractors,
+} from "@kreuzberg/node";
+
+// List registered extractors
+const extractors = listDocumentExtractors();
+
+// Unregister a specific extractor
+unregisterDocumentExtractor("pdf");
+
+// Clear all extractors
+clearDocumentExtractors();
+```
+
+---
+
+## Type Exports
+
+All types are exported from `@kreuzberg/node`:
+
+```typescript
+export type {
+  Chunk,
+  ChunkingConfig,
+  ExtractionConfig,
+  ExtractionResult,
+  ExtractedImage,
+  KeywordConfig,
+  LanguageDetectionConfig,
+  OcrBackendProtocol,
+  OcrConfig,
+  PageContent,
+  PageExtractionConfig,
+  PdfConfig,
+  PostProcessorProtocol,
+  Table,
+  TokenReductionConfig,
+  ValidatorProtocol,
+  WorkerPool,
+  WorkerPoolStats,
+  EmbeddingPreset,
+  // ... and many more
+};
+```
+
+---
+
+## Best Practices
+
+1. **Use batch APIs for multiple documents**: `batchExtractFiles()` provides superior performance vs. calling `extractFile()` in a loop.
+
+2. **Enable chunking for RAG/vector DB**: Set `chunking` config to automatically break documents into overlapping chunks.
+
+3. **Use worker pools for high-concurrency scenarios**: Distribute CPU-bound work across multiple threads for 4+ concurrent extractions.
+
+4. **Configure language detection**: Enable automatic language detection for multilingual documents.
+
+5. **Register validators early**: Set up validators before calling extraction functions to catch quality issues immediately.
+
+6. **Use specific MIME types**: Provide explicit MIME types when available to avoid detection overhead.
+
+7. **Clean up resources**: Always call `closeWorkerPool()` when done to prevent resource leaks.
+
+8. **Handle extraction errors gracefully**: Catch specific error types (`ParsingError`, `OcrError`, etc.) for appropriate error handling.
+
+---
+
+## Version
+
+**Package Version**: 4.2.14
diff --git a/plugins/kreuzberg/skills/kreuzberg/references/other-bindings.md b/plugins/kreuzberg/skills/kreuzberg/references/other-bindings.md
new file mode 100644
index 0000000..515b8c8
--- /dev/null
+++ b/plugins/kreuzberg/skills/kreuzberg/references/other-bindings.md
@@ -0,0 +1,212 @@
+# Language Bindings Reference
+
+Kreuzberg provides native bindings for multiple programming languages, each with precompiled binaries for x86_64 and aarch64 on Linux and macOS. This reference covers installation and basic usage for each binding.
+
+## Go
+
+**Installation:**
+
+```bash
+go get github.com/kreuzberg-dev/kreuzberg/packages/go/v4
+```
+
+**Basic Extraction:**
+
+```go
+package main
+
+import (
+    "context"
+    "fmt"
+    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4/kreuzberg"
+)
+
+func main() {
+    ctx := context.Background()
+    result, err := kreuzberg.ExtractFile(ctx, "document.pdf", nil)
+    if err != nil {
+        panic(err)
+    }
+    fmt.Println(result.Content)
+}
+```
+
+See the [Go binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/go) for complete API reference.
+
+## Ruby
+
+**Installation:**
+
+```bash
+gem install kreuzberg
+```
+
+Or in your Gemfile:
+
+```ruby
+gem 'kreuzberg'
+```
+
+**Basic Extraction:**
+
+```ruby
+require 'kreuzberg'
+
+result = Kreuzberg.extract_file_sync('document.pdf')
+puts result.content
+```
+
+See the [Ruby binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/ruby) for complete API reference.
+
+## Java
+
+**Installation:**
+Add to your Maven `pom.xml`:
+
+```xml
+<dependency>
+    <groupId>dev.kreuzberg</groupId>
+    <artifactId>kreuzberg</artifactId>
+    <version>4.2.x</version>
+</dependency>
+```
+
+**Basic Extraction:**
+
+```java
+import dev.kreuzberg.Kreuzberg;
+import dev.kreuzberg.ExtractionResult;
+
+public class Example {
+    public static void main(String[] args) throws Exception {
+        ExtractionResult result = Kreuzberg.extractFile("document.pdf");
+        System.out.println(result.getContent());
+    }
+}
+```
+
+See the [Java binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/java) for complete API reference.
+
+## C#
+
+**Installation:**
+
+```bash
+dotnet add package Kreuzberg
+```
+
+**Basic Extraction:**
+
+```csharp
+using Kreuzberg;
+
+var result = KreuzbergClient.ExtractFileSync("document.pdf");
+Console.WriteLine(result.Content);
+```
+
+See the [C# binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/csharp) for complete API reference.
+
+## PHP
+
+**Installation:**
+
+```bash
+composer require kreuzberg/kreuzberg
+```
+
+**Basic Extraction:**
+
+```php
+<?php
+require 'vendor/autoload.php';
+
+use Kreuzberg\Kreuzberg;
+
+$kreuzberg = new Kreuzberg();
+$result = $kreuzberg->extractFile('document.pdf');
+echo $result->content;
+```
+
+See the [PHP binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/php) for complete API reference.
+
+## Elixir
+
+**Installation:**
+Add to your `mix.exs` dependencies:
+
+```elixir
+def deps do
+  [
+    kreuzberg: "~> 4.2"
+  ]
+end
+```
+
+**Basic Extraction:**
+
+```elixir
+{:ok, result} = Kreuzberg.extract_file("document.pdf")
+IO.puts(result.content)
+```
+
+See the [Elixir binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/elixir) for complete API reference.
+
+## WebAssembly (WASM)
+
+**Installation:**
+
+```bash
+npm install @kreuzberg/wasm
+```
+
+**Basic Extraction:**
+
+```typescript
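+import fs from "node:fs"; // Node.js only; in browsers, obtain the bytes from a File or fetch() response instead
+import { extractBytes } from "@kreuzberg/wasm";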
+
+const fileData = await fs.promises.readFile("document.pdf");
+const result = await extractBytes(fileData, "application/pdf");
+console.log(result.content);
+```
+
+Supports browsers, Deno, and Cloudflare Workers.
+
+See the [WASM binding documentation](https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/typescript) for complete API reference.
+
+## Docker
+
+**Installation:**
+Pull the official image from GitHub Container Registry:
+
+```bash
+docker pull ghcr.io/kreuzberg-dev/kreuzberg
+```
+
+**API Server Mode:**
+
+```bash
+docker run -p 8000:8000 ghcr.io/kreuzberg-dev/kreuzberg serve --host 0.0.0.0
+```
+
+**CLI Mode:**
+
+```bash
+docker run -v $(pwd):/data ghcr.io/kreuzberg-dev/kreuzberg extract /data/document.pdf
+```
+
+**MCP Server Mode:**
+
+```bash
+docker run -i ghcr.io/kreuzberg-dev/kreuzberg mcp
+```
+
+Image sizes:
+
+- Core image: 1.0-1.3GB
+- Full image: ~1.5-2.1GB
+
+See the [Docker guide](https://docs.kreuzberg.dev/guides/docker/) for deployment details.
+
+## Platform Support
+
+All language bindings include precompiled binaries for x86_64 and aarch64 on Linux and macOS. Windows support varies by binding. Refer to the main [README](https://github.com/kreuzberg-dev/kreuzberg) for platform compatibility matrix.
diff --git a/plugins/kreuzberg/skills/kreuzberg/references/python-api.md b/plugins/kreuzberg/skills/kreuzberg/references/python-api.md
new file mode 100644
index 0000000..bf6adaf
--- /dev/null
+++ b/plugins/kreuzberg/skills/kreuzberg/references/python-api.md
@@ -0,0 +1,1440 @@
+# Kreuzberg Python API Reference
+
+Comprehensive documentation for the Kreuzberg Python API. All extraction logic and heavy lifting are implemented in high-performance Rust; the Python layer adds OCR backends (EasyOCR, PaddleOCR) and custom post-processor support.
+
+## Extraction Functions
+
+### Synchronous File Extraction
+
+```python
+def extract_file_sync(
+    file_path: str | Path,
+    mime_type: str | None = None,
+    config: ExtractionConfig | None = None,
+    *,
+    easyocr_kwargs: dict[str, Any] | None = None,
+    paddleocr_kwargs: dict[str, Any] | None = None,
+) -> ExtractionResult
+```
+
+Extract content from a file (synchronous).
+
+**Parameters:**
+
+- `file_path` (str | Path): Path to the file
+- `mime_type` (str | None): Optional MIME type hint (auto-detected if None)
+- `config` (ExtractionConfig | None): Extraction configuration (uses defaults if None)
+- `easyocr_kwargs` (dict | None): EasyOCR initialization options (languages, use_gpu, beam_width, etc.)
+- `paddleocr_kwargs` (dict | None): PaddleOCR initialization options (lang, use_angle_cls, show_log, etc.)
+
+**Returns:** ExtractionResult with content, metadata, and tables
+
+**Example:**
+
+```python
+from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig, TesseractConfig
+
+# Basic usage
+result = extract_file_sync("document.pdf")
+
+# With Tesseract configuration
+config = ExtractionConfig(
+    ocr=OcrConfig(
+        backend="tesseract",
+        language="eng",
+        tesseract_config=TesseractConfig(psm=6, enable_table_detection=True),
+    )
+)
+result = extract_file_sync("invoice.pdf", config=config)
+
+# With EasyOCR custom options
+config = ExtractionConfig(ocr=OcrConfig(backend="easyocr", language="eng"))
+result = extract_file_sync("scanned.pdf", config=config, easyocr_kwargs={"use_gpu": True})
+```
+
+### Asynchronous File Extraction
+
+```python
+async def extract_file(
+    file_path: str | Path,
+    mime_type: str | None = None,
+    config: ExtractionConfig | None = None,
+    *,
+    easyocr_kwargs: dict[str, Any] | None = None,
+    paddleocr_kwargs: dict[str, Any] | None = None,
+) -> ExtractionResult
+```
+
+Extract content from a file (asynchronous). Same parameters and behavior as `extract_file_sync`.
+
+### Synchronous Bytes Extraction
+
+```python
+def extract_bytes_sync(
+    data: bytes | bytearray,
+    mime_type: str,
+    config: ExtractionConfig | None = None,
+    *,
+    easyocr_kwargs: dict[str, Any] | None = None,
+    paddleocr_kwargs: dict[str, Any] | None = None,
+) -> ExtractionResult
+```
+
+Extract content from bytes (synchronous).
+
+**Parameters:**
+
+- `data` (bytes | bytearray): File content as bytes or bytearray
+- `mime_type` (str): MIME type of the data (required for format detection)
+- `config` (ExtractionConfig | None): Extraction configuration
+- `easyocr_kwargs` (dict | None): EasyOCR initialization options
+- `paddleocr_kwargs` (dict | None): PaddleOCR initialization options
+
+**Returns:** ExtractionResult with content, metadata, and tables
+
+### Asynchronous Bytes Extraction
+
+```python
+async def extract_bytes(
+    data: bytes | bytearray,
+    mime_type: str,
+    config: ExtractionConfig | None = None,
+    *,
+    easyocr_kwargs: dict[str, Any] | None = None,
+    paddleocr_kwargs: dict[str, Any] | None = None,
+) -> ExtractionResult
+```
+
+Extract content from bytes (asynchronous). Same parameters and behavior as `extract_bytes_sync`.
+
+### Batch File Extraction
+
+```python
+async def batch_extract_files(
+    paths: list[str | Path],
+    config: ExtractionConfig | None = None,
+    *,
+    easyocr_kwargs: dict[str, Any] | None = None,
+    paddleocr_kwargs: dict[str, Any] | None = None,
+) -> list[ExtractionResult]
+```
+
+Extract content from multiple files in parallel (asynchronous).
+
+**Parameters:**
+
+- `paths` (list[str | Path]): List of file paths
+- `config` (ExtractionConfig | None): Extraction configuration
+- `easyocr_kwargs` (dict | None): EasyOCR initialization options
+- `paddleocr_kwargs` (dict | None): PaddleOCR initialization options
+
+**Returns:** List of ExtractionResults (one per file)
+
+### Batch File Extraction (Synchronous)
+
+```python
+def batch_extract_files_sync(
+    paths: list[str | Path],
+    config: ExtractionConfig | None = None,
+    *,
+    easyocr_kwargs: dict[str, Any] | None = None,
+    paddleocr_kwargs: dict[str, Any] | None = None,
+) -> list[ExtractionResult]
+```
+
+Extract content from multiple files in parallel (synchronous).
+
+### Batch Bytes Extraction
+
+```python
+async def batch_extract_bytes(
+    data_list: list[bytes | bytearray],
+    mime_types: list[str],
+    config: ExtractionConfig | None = None,
+    *,
+    easyocr_kwargs: dict[str, Any] | None = None,
+    paddleocr_kwargs: dict[str, Any] | None = None,
+) -> list[ExtractionResult]
+```
+
+Extract content from multiple byte arrays in parallel (asynchronous).
+
+**Parameters:**
+
+- `data_list` (list[bytes | bytearray]): List of file contents as bytes/bytearray
+- `mime_types` (list[str]): List of MIME types (one per data item)
+- `config` (ExtractionConfig | None): Extraction configuration
+- `easyocr_kwargs` (dict | None): EasyOCR initialization options
+- `paddleocr_kwargs` (dict | None): PaddleOCR initialization options
+
+**Returns:** List of ExtractionResults (one per data item)
+
+### Batch Bytes Extraction (Synchronous)
+
+```python
+def batch_extract_bytes_sync(
+    data_list: list[bytes | bytearray],
+    mime_types: list[str],
+    config: ExtractionConfig | None = None,
+    *,
+    easyocr_kwargs: dict[str, Any] | None = None,
+    paddleocr_kwargs: dict[str, Any] | None = None,
+) -> list[ExtractionResult]
+```
+
+Extract content from multiple byte arrays in parallel (synchronous).
+
+### Per-File Config in Batch Functions
+
+As of v4.5.0, per-file configuration overrides are passed as an optional `file_configs` parameter on the unified batch functions:
+
+```python
+def batch_extract_files_sync(
+    paths: list[str | Path],
+    config: ExtractionConfig | None = None,
+    *,
+    file_configs: list[FileExtractionConfig | None] | None = None,
+    easyocr_kwargs: dict[str, Any] | None = None,
+) -> list[ExtractionResult]
+```
+
+The `file_configs` list must have the same length as `paths`. Each element is either a `FileExtractionConfig` override or `None` to use batch defaults. The same parameter is available on `batch_extract_files`, `batch_extract_bytes_sync`, and `batch_extract_bytes`.
+
+> **Note:** The separate `batch_extract_files_with_configs_sync` / `batch_extract_files_with_configs` / `batch_extract_bytes_with_configs_sync` / `batch_extract_bytes_with_configs` functions were removed in v4.5.0.
+
+## Configuration Classes
+
+### ExtractionConfig
+
+Main extraction configuration for document processing. All attributes are optional and use sensible defaults when not specified.
+
+**Attributes:**
+
+| Field                        | Type                            | Default       | Description                                                                               |
+| ---------------------------- | ------------------------------- | ------------- | ----------------------------------------------------------------------------------------- |
+| `use_cache`                  | bool                            | True          | Enable caching of extraction results to improve performance on repeated extractions       |
+| `enable_quality_processing`  | bool                            | True          | Enable quality post-processing to clean and normalize extracted text                      |
+| `ocr`                        | OcrConfig \| None               | None          | OCR configuration for extracting text from images. None = OCR disabled                    |
+| `force_ocr`                  | bool                            | False         | Force OCR processing even for searchable PDFs that contain extractable text               |
+| `chunking`                   | ChunkingConfig \| None          | None          | Text chunking configuration for dividing content into manageable chunks. None = disabled  |
+| `images`                     | ImageExtractionConfig \| None   | None          | Image extraction configuration for extracting images FROM documents. None = no extraction |
+| `pdf_options`                | PdfConfig \| None               | None          | PDF-specific options like password handling and metadata extraction                       |
+| `token_reduction`            | TokenReductionConfig \| None    | None          | Token reduction configuration for reducing token count in extracted content               |
+| `language_detection`         | LanguageDetectionConfig \| None | None          | Language detection configuration for identifying document language(s)                     |
+| `keywords`                   | KeywordConfig \| None           | None          | Keyword extraction configuration for identifying important terms and phrases              |
+| `postprocessor`              | PostProcessorConfig \| None     | None          | Post-processor configuration for custom text processing                                   |
+| `max_concurrent_extractions` | int \| None                     | num_cpus \* 2 | Maximum concurrent extractions in batch operations                                        |
+| `html_options`               | HtmlConversionOptions \| None   | None          | HTML conversion options for converting documents to markdown                              |
+| `pages`                      | PageConfig \| None              | None          | Page extraction configuration for tracking page boundaries                                |
+| `security_limits`            | dict[str, int] \| None          | None          | Security limits configuration                                                             |
+| `result_format`              | str                             | "unified"     | Result format: "unified" or "element_based"                                               |
+| `output_format`              | str                             | "plain"       | Output content format: "plain", "markdown", "djot", or "html"                             |
+
+**Example:**
+
+```python
+from kreuzberg import ExtractionConfig, ChunkingConfig, OcrConfig
+
+# Basic extraction with defaults
+config = ExtractionConfig()
+
+# Enable chunking with 512-char chunks and 100-char overlap
+config = ExtractionConfig(chunking=ChunkingConfig(max_chars=512, max_overlap=100))
+
+# Enable OCR with Tesseract
+config = ExtractionConfig(ocr=OcrConfig(backend="tesseract", language="eng"))
+
+# Multiple options
+config = ExtractionConfig(
+    use_cache=True,
+    enable_quality_processing=True,
+    output_format="markdown",
+    result_format="unified"
+)
+```
+
+### FileExtractionConfig
+
+Per-file extraction overrides for batch operations. All fields are optional (`None` = use batch default).
+
+**Key fields:** `enable_quality_processing`, `ocr`, `force_ocr`, `chunking`, `images`, `pdf_options`, `token_reduction`, `language_detection`, `pages`, `keywords`, `postprocessor`, `html_options`, `result_format`, `output_format`, `include_document_structure`, `layout`.
+
+Excluded (batch-level only): `max_concurrent_extractions`, `use_cache`, `acceleration`, `security_limits`.
+
+```python
+per_file = FileExtractionConfig(
+    force_ocr=True,
+    ocr=OcrConfig(backend="tesseract", language="deu"),
+)
+```
+
+### OcrConfig
+
+OCR configuration for extracting text from images.
+
+**Attributes:**
+
+| Field              | Type                    | Default     | Description                                                                                           |
+| ------------------ | ----------------------- | ----------- | ----------------------------------------------------------------------------------------------------- |
+| `backend`          | str                     | "tesseract" | OCR backend: "tesseract", "easyocr", or "paddleocr"                                                   |
+| `language`         | str                     | "eng"       | Language code (ISO 639-3 three-letter: "eng", "deu", "fra" or ISO 639-1 two-letter: "en", "de", "fr") |
+| `tesseract_config` | TesseractConfig \| None | None        | Tesseract-specific configuration (only used when backend="tesseract")                                 |
+
+**Example:**
+
+```python
+from kreuzberg import OcrConfig
+
+# Tesseract with German language
+config = OcrConfig(backend="tesseract", language="deu")
+
+# EasyOCR for faster recognition
+config = OcrConfig(backend="easyocr", language="eng")
+
+# PaddleOCR for production deployments
+config = OcrConfig(backend="paddleocr", language="chi_sim")
+```
+
+### TesseractConfig
+
+Detailed Tesseract OCR configuration for advanced tuning. Fine-tune Tesseract OCR behavior for specific document types and quality levels.
+
+**Attributes:**
+
+| Field                                | Type                             | Default    | Description                                                                               |
+| ------------------------------------ | -------------------------------- | ---------- | ----------------------------------------------------------------------------------------- |
+| `language`                           | str                              | "eng"      | OCR language (ISO 639-3 three-letter code)                                                |
+| `psm`                                | int                              | 3          | Page Segmentation Mode: 0 (detection only), 3 (auto), 6 (uniform block), 11 (sparse text) |
+| `output_format`                      | str                              | "markdown" | Output format for OCR results                                                             |
+| `oem`                                | int                              | 3          | OCR Engine Mode: 0 (legacy), 1 (LSTM), 2 (both), 3 (auto)                                 |
+| `min_confidence`                     | float                            | 0.0        | Minimum confidence threshold (0.0-1.0) for accepting OCR results                          |
+| `preprocessing`                      | ImagePreprocessingConfig \| None | None       | Image preprocessing configuration before OCR                                              |
+| `enable_table_detection`             | bool                             | True       | Enable automatic table detection and extraction                                           |
+| `table_min_confidence`               | float                            | 0.0        | Minimum confidence for table detection (0.0-1.0)                                          |
+| `table_column_threshold`             | int                              | 50         | Minimum pixel width between columns                                                       |
+| `table_row_threshold_ratio`          | float                            | 0.5        | Minimum row height ratio                                                                  |
+| `use_cache`                          | bool                             | True       | Cache OCR results for improved performance                                                |
+| `classify_use_pre_adapted_templates` | bool                             | True       | Use pre-adapted character templates                                                       |
+| `language_model_ngram_on`            | bool                             | False      | Enable language model n-gram processing                                                   |
+| `tessedit_dont_blkrej_good_wds`      | bool                             | True       | Don't block-reject good words                                                             |
+| `tessedit_dont_rowrej_good_wds`      | bool                             | True       | Don't row-reject good words                                                               |
+| `tessedit_enable_dict_correction`    | bool                             | True       | Enable dictionary-based spelling correction                                               |
+| `tessedit_char_whitelist`            | str                              | ""         | Whitelist of characters to recognize (empty = all)                                        |
+| `tessedit_char_blacklist`            | str                              | ""         | Blacklist of characters to ignore                                                         |
+| `tessedit_use_primary_params_model`  | bool                             | True       | Use primary parameters model                                                              |
+| `textord_space_size_is_variable`     | bool                             | True       | Allow variable space sizes                                                                |
+| `thresholding_method`                | bool                             | False      | Thresholding method for binarization                                                      |
+
+**Example:**
+
+```python
+from kreuzberg import TesseractConfig, ImagePreprocessingConfig
+
+# General document OCR
+config = TesseractConfig(psm=3, oem=3)
+
+# Invoice/form OCR with table detection
+config = TesseractConfig(psm=6, oem=2, enable_table_detection=True, min_confidence=0.6)
+
+# High-precision technical document OCR
+config = TesseractConfig(
+    psm=3,
+    oem=2,
+    preprocessing=ImagePreprocessingConfig(denoise=True, contrast_enhance=True, auto_rotate=True),
+    min_confidence=0.7,
+    tessedit_enable_dict_correction=True,
+)
+
+# Numeric-only OCR (for receipts, barcodes)
+config = TesseractConfig(psm=6, tessedit_char_whitelist="0123456789.-,", min_confidence=0.8)
+
+# Multiple language document
+config = TesseractConfig(language="eng+fra+deu", psm=3, oem=2)
+```
+
+### ChunkingConfig
+
+Text chunking configuration for dividing content into chunks. Chunking is useful for preparing content for embedding, indexing, or processing with length-limited systems (like LLM context windows).
+
+**Attributes:**
+
+| Field         | Type                    | Default | Description                                                                                                                            |
+| ------------- | ----------------------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------- |
+| `max_chars`   | int                     | 1000    | Maximum number of characters per chunk. Chunks larger than this will be split intelligently at sentence/paragraph boundaries           |
+| `max_overlap` | int                     | 200     | Overlap between consecutive chunks in characters. Creates context bridges between chunks for smoother processing                       |
+| `embedding`   | EmbeddingConfig \| None | None    | Configuration for generating embeddings for each chunk using ONNX models. None = no embeddings                                         |
+| `preset`      | str \| None             | None    | Use a preset chunking configuration (overrides individual settings if provided). Use list_embedding_presets() to see available presets |
+
+**IMPORTANT:** The fields are `max_chars` and `max_overlap` (NOT `max_characters` or `overlap`).
+
+**Example:**
+
+```python
+from kreuzberg import ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType
+
+# Basic chunking with defaults
+config = ExtractionConfig(chunking=ChunkingConfig())
+
+# Custom chunk size with overlap
+config = ExtractionConfig(chunking=ChunkingConfig(max_chars=512, max_overlap=100))
+
+# Chunking with embeddings
+config = ExtractionConfig(
+    chunking=ChunkingConfig(
+        max_chars=512,
+        embedding=EmbeddingConfig(model=EmbeddingModelType.preset("balanced"))
+    )
+)
+
+# Using preset configuration
+config = ExtractionConfig(chunking=ChunkingConfig(preset="semantic"))
+```
+
+### PdfConfig
+
+PDF-specific extraction configuration.
+
+**Attributes:**
+
+| Field              | Type                    | Default | Description                                                                                         |
+| ------------------ | ----------------------- | ------- | --------------------------------------------------------------------------------------------------- |
+| `extract_images`   | bool                    | False   | Extract images from PDF documents                                                                   |
+| `passwords`        | list[str] \| None       | None    | List of passwords to try when opening encrypted PDFs. Try each password in order until one succeeds |
+| `extract_metadata` | bool                    | True    | Extract PDF metadata (title, author, creation date, etc.)                                           |
+| `hierarchy`        | HierarchyConfig \| None | None    | Document hierarchy detection configuration. None = no hierarchy detection                           |
+
+**Example:**
+
+```python
+from kreuzberg import ExtractionConfig, PdfConfig, HierarchyConfig
+
+# Basic PDF configuration
+config = ExtractionConfig(pdf_options=PdfConfig())
+
+# Extract metadata and images from PDF
+config = ExtractionConfig(pdf_options=PdfConfig(extract_images=True, extract_metadata=True))
+
+# Handle encrypted PDFs
+config = ExtractionConfig(pdf_options=PdfConfig(passwords=["password123", "fallback_password"]))
+
+# Enable hierarchy detection
+config = ExtractionConfig(pdf_options=PdfConfig(hierarchy=HierarchyConfig(k_clusters=6)))
+```
+
+### ImageExtractionConfig
+
+Configuration for extracting images FROM documents. This is NOT for preprocessing images before OCR.
+
+**Attributes:**
+
+| Field                 | Type | Default | Description                                                                          |
+| --------------------- | ---- | ------- | ------------------------------------------------------------------------------------ |
+| `extract_images`      | bool | True    | Enable image extraction from documents                                               |
+| `target_dpi`          | int  | 300     | Target DPI for image normalization. Images are resampled to this DPI for consistency |
+| `max_image_dimension` | int  | 4096    | Maximum width or height for extracted images. Larger images are downscaled to fit    |
+| `auto_adjust_dpi`     | bool | True    | Automatically adjust DPI based on image content quality                              |
+| `min_dpi`             | int  | 72      | Minimum DPI threshold. Images with lower DPI are upscaled                            |
+| `max_dpi`             | int  | 600     | Maximum DPI threshold. Images with higher DPI are downscaled                         |
+
+**Example:**
+
+```python
+from kreuzberg import ExtractionConfig, ImageExtractionConfig
+
+# Basic image extraction
+config = ExtractionConfig(images=ImageExtractionConfig())
+
+# Extract images with custom DPI settings
+config = ExtractionConfig(
+    images=ImageExtractionConfig(target_dpi=150, max_image_dimension=2048, auto_adjust_dpi=False)
+)
+```
+
+### EmbeddingConfig
+
+Embedding generation configuration for text chunks. Configures embedding generation using ONNX models via fastembed-rs.
+
+**Attributes:**
+
+| Field                    | Type               | Default           | Description                                                                                 |
+| ------------------------ | ------------------ | ----------------- | ------------------------------------------------------------------------------------------- |
+| `model`                  | EmbeddingModelType | Preset "balanced" | The embedding model to use (preset, fastembed, or custom)                                   |
+| `normalize`              | bool               | True              | Whether to normalize embedding vectors to unit length (recommended for cosine similarity)   |
+| `batch_size`             | int                | 32                | Number of texts to process simultaneously. Higher values use more memory but may be faster  |
+| `show_download_progress` | bool               | False             | Display progress during embedding model download                                            |
+| `cache_dir`              | str \| None        | None              | Custom directory for caching downloaded models (defaults to ~/.cache/kreuzberg/embeddings/) |
+
+**Example:**
+
+```python
+from kreuzberg import EmbeddingConfig, EmbeddingModelType
+
+# Basic preset embedding (recommended)
+config = EmbeddingConfig()
+
+# Specific preset with settings
+config = EmbeddingConfig(
+    model=EmbeddingModelType.preset("balanced"),
+    normalize=True,
+    batch_size=64
+)
+
+# Custom ONNX model
+config = EmbeddingConfig(
+    model=EmbeddingModelType.custom(model_id="sentence-transformers/all-MiniLM-L6-v2", dimensions=384)
+)
+
+# With custom cache directory
+config = EmbeddingConfig(cache_dir="/path/to/model/cache")
+```
+
+### EmbeddingModelType
+
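+Selects which embedding model to use: a named preset, a specific fastembed model, or a custom ONNX model. Instances are created through the static methods below.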
+
+**Static Methods:**
+
+```python
+@staticmethod
+def preset(name: str) -> EmbeddingModelType
+```
+
+Use a preset configuration (recommended for most use cases). Available presets: balanced, compact, large.
+
+```python
+@staticmethod
+def fastembed(model: str, dimensions: int) -> EmbeddingModelType
+```
+
+Use a specific fastembed model by name.
+
+```python
+@staticmethod
+def custom(model_id: str, dimensions: int) -> EmbeddingModelType
+```
+
+Use a custom ONNX model from HuggingFace (e.g., sentence-transformers/\*).
+
+**Example:**
+
+```python
+from kreuzberg import EmbeddingModelType, list_embedding_presets
+
+# Using the balanced preset (recommended for general use)
+model = EmbeddingModelType.preset("balanced")
+
+# Using a specific fast embedding model
+model = EmbeddingModelType.fastembed(model="BAAI/bge-small-en-v1.5", dimensions=384)
+
+# Using a custom HuggingFace model
+model = EmbeddingModelType.custom(
+    model_id="sentence-transformers/all-MiniLM-L6-v2",
+    dimensions=384
+)
+
+# Listing available presets
+presets = list_embedding_presets()
+print(f"Available presets: {presets}")
+```
+
+### TokenReductionConfig
+
+Configuration for reducing token count in extracted content. Reduces token count to lower costs when working with LLM APIs.
+
+**Attributes:**
+
+| Field                      | Type | Default | Description                                                                                      |
+| -------------------------- | ---- | ------- | ------------------------------------------------------------------------------------------------ |
+| `mode`                     | str  | "off"   | Token reduction mode: "off", "light", "moderate", "aggressive", or "maximum"                     |
+| `preserve_important_words` | bool | True    | Preserve capitalized words, technical terms, and proper nouns even in aggressive reduction modes |
+
+**Example:**
+
+```python
+from kreuzberg import ExtractionConfig, TokenReductionConfig
+
+# Moderate token reduction
+config = ExtractionConfig(
+    token_reduction=TokenReductionConfig(mode="moderate", preserve_important_words=True)
+)
+
+# Maximum reduction for large batches
+config = ExtractionConfig(
+    token_reduction=TokenReductionConfig(mode="maximum", preserve_important_words=True)
+)
+
+# No reduction (default)
+config = ExtractionConfig(
+    token_reduction=TokenReductionConfig(mode="off")
+)
+```
+
+### LanguageDetectionConfig
+
+Configuration for detecting document language(s).
+
+**Attributes:**
+
+| Field             | Type  | Default | Description                                                                                         |
+| ----------------- | ----- | ------- | --------------------------------------------------------------------------------------------------- |
+| `enabled`         | bool  | True    | Enable language detection for extracted content                                                     |
+| `min_confidence`  | float | 0.8     | Minimum confidence threshold (0.0-1.0) for language detection                                       |
+| `detect_multiple` | bool  | False   | Detect multiple languages in the document. When False, only the most confident language is returned |
+
+**Example:**
+
+```python
+from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file_sync
+
+# Basic language detection
+config = ExtractionConfig(language_detection=LanguageDetectionConfig())
+
+# Detect multiple languages with lower confidence threshold
+config = ExtractionConfig(
+    language_detection=LanguageDetectionConfig(detect_multiple=True, min_confidence=0.6)
+)
+
+# Access detected languages in result
+result = extract_file_sync("multilingual.pdf", config=config)
+print(f"Languages: {result.detected_languages}")
+```
+
+### KeywordConfig
+
+Keyword extraction configuration.
+
+**Attributes:**
+
+| Field          | Type               | Default | Description                                                                   |
+| -------------- | ------------------ | ------- | ----------------------------------------------------------------------------- |
+| `algorithm`    | KeywordAlgorithm   | -       | Keyword extraction algorithm (KeywordAlgorithm.Yake or KeywordAlgorithm.Rake) |
+| `max_keywords` | int                | 10      | Maximum number of keywords to extract                                         |
+| `min_score`    | float              | 0.0     | Minimum score threshold                                                       |
+| `ngram_range`  | tuple[int, int]    | (1, 3)  | N-gram range for keyword extraction                                           |
+| `language`     | str \| None        | "en"    | Optional language hint                                                        |
+| `yake_params`  | YakeParams \| None | None    | YAKE-specific tuning parameters                                               |
+| `rake_params`  | RakeParams \| None | None    | RAKE-specific tuning parameters                                               |
+
+### PageConfig
+
+Page extraction and tracking configuration.
+
+**Attributes:**
+
+| Field                 | Type | Default                                | Description                                  |
+| --------------------- | ---- | -------------------------------------- | -------------------------------------------- |
+| `extract_pages`       | bool | False                                  | Enable page tracking and per-page extraction |
+| `insert_page_markers` | bool | False                                  | Insert page markers into content             |
+| `marker_format`       | str  | `"\n\n<!-- PAGE {page_num} -->\n\n"`   | Marker template containing {page_num}        |
+
+**Example:**
+
+```python
+from kreuzberg import ExtractionConfig, PageConfig
+
+config = ExtractionConfig(pages=PageConfig(extract_pages=True))
+```
+
+### PostProcessorConfig
+
+Configuration for post-processors in the extraction pipeline.
+
+**Attributes:**
+
+| Field                 | Type              | Default | Description                                                 |
+| --------------------- | ----------------- | ------- | ----------------------------------------------------------- |
+| `enabled`             | bool              | True    | Enable post-processors in the extraction pipeline           |
+| `enabled_processors`  | list[str] \| None | None    | Whitelist of processor names to run. None = run all enabled |
+| `disabled_processors` | list[str] \| None | None    | Blacklist of processor names to skip. None = none disabled  |
+
+**Example:**
+
+```python
+from kreuzberg import ExtractionConfig, PostProcessorConfig
+
+# Basic post-processing with defaults
+config = ExtractionConfig(postprocessor=PostProcessorConfig())
+
+# Enable only specific processors
+config = ExtractionConfig(
+    postprocessor=PostProcessorConfig(
+        enabled=True,
+        enabled_processors=["normalize_whitespace", "fix_encoding"]
+    )
+)
+
+# Disable specific processors
+config = ExtractionConfig(
+    postprocessor=PostProcessorConfig(
+        enabled=True,
+        disabled_processors=["experimental_cleanup"]
+    )
+)
+
+# Disable all post-processing
+config = ExtractionConfig(postprocessor=PostProcessorConfig(enabled=False))
+```
+
+### ImagePreprocessingConfig
+
+Configuration for preprocessing images before OCR. This is NOT for extracting images from documents.
+
+**Attributes:**
+
+| Field                 | Type | Default | Description                                       |
+| --------------------- | ---- | ------- | ------------------------------------------------- |
+| `target_dpi`          | int  | 300     | Target DPI for image normalization before OCR     |
+| `auto_rotate`         | bool | True    | Automatically detect and correct image rotation   |
+| `deskew`              | bool | True    | Correct skewed images to improve OCR accuracy     |
+| `denoise`             | bool | False   | Apply denoising filters to reduce noise in images |
+| `contrast_enhance`    | bool | False   | Enhance contrast to improve text readability      |
+| `binarization_method` | str  | "otsu"  | Method for converting images to black and white   |
+| `invert_colors`       | bool | False   | Invert colors (white text on black background)    |
+
+**Example:**
+
+```python
+from kreuzberg import TesseractConfig, ImagePreprocessingConfig
+
+# Basic preprocessing for OCR
+config = TesseractConfig(preprocessing=ImagePreprocessingConfig())
+
+# Aggressive preprocessing for low-quality scans
+config = TesseractConfig(
+    preprocessing=ImagePreprocessingConfig(
+        target_dpi=300,
+        denoise=True,
+        contrast_enhance=True,
+        auto_rotate=True,
+        deskew=True
+    )
+)
+```
+
+## ExtractionResult
+
+Result object returned by extraction functions.
+
+**Attributes:**
+
+| Field                 | Type                           | Description                                                                      |
+| --------------------- | ------------------------------ | -------------------------------------------------------------------------------- |
+| `content`             | str                            | Main extracted text content in the specified output_format                       |
+| `mime_type`           | str                            | MIME type of the processed document                                              |
+| `metadata`            | Metadata                       | Extracted document metadata (title, author, created_at, format_type, etc.)       |
+| `tables`              | list[ExtractedTable]           | Extracted tables from the document                                               |
+| `detected_languages`  | list[str] \| None              | Detected language codes (e.g., ["en", "de"]) if language detection is enabled    |
+| `chunks`              | list[Chunk] \| None            | Text chunks if chunking is enabled (each chunk has content, embedding, metadata) |
+| `images`              | list[ExtractedImage] \| None   | Extracted images if image extraction is enabled                                  |
+| `pages`               | list[PageContent] \| None      | Per-page content and metadata if page extraction is enabled                      |
+| `elements`            | list[Element] \| None          | Semantic elements if result_format="element_based"                               |
+| `output_format`       | str \| None                    | Format of the content field (plain, markdown, djot, html)                        |
+| `result_format`       | str \| None                    | Result format used (unified or element_based)                                    |
+| `extracted_keywords`  | list[ExtractedKeyword] \| None | Extracted keywords with relevance scores if keyword extraction enabled           |
+| `quality_score`       | float \| None                  | Overall quality score for the extraction result (0.0-1.0)                        |
+| `processing_warnings` | list[ProcessingWarning]        | Non-fatal warnings encountered during extraction pipeline                        |
+
+**Methods:**
+
+```python
+def get_page_count(self) -> int
+```
+
+Get the total number of pages in the document.
+
+```python
+def get_chunk_count(self) -> int
+```
+
+Get the total number of chunks if chunking is enabled.
+
+```python
+def get_detected_language(self) -> str | None
+```
+
+Get the most confident detected language code.
+
+```python
+def get_metadata_field(self, field_name: str) -> Any | None
+```
+
+Get a specific metadata field by name.
+
+**Example:**
+
+```python
+from kreuzberg import extract_file_sync, ExtractionConfig, ChunkingConfig
+
+config = ExtractionConfig(
+    chunking=ChunkingConfig(max_chars=512),
+    output_format="markdown"
+)
+result = extract_file_sync("document.pdf", config=config)
+
+print(f"Content preview: {result.content[:200]}")
+print(f"MIME type: {result.mime_type}")
+print(f"Page count: {result.get_page_count()}")
+print(f"Chunk count: {result.get_chunk_count()}")
+print(f"Detected language: {result.get_detected_language()}")
+
+if result.tables:
+    print(f"Found {len(result.tables)} tables")
+
+if result.chunks:
+    first_chunk = result.chunks[0]
+    print(f"First chunk: {first_chunk.content[:100]}")
+    if first_chunk.embedding:
+        print(f"Embedding dimensions: {len(first_chunk.embedding)}")
+```
+
+## Error Classes
+
+All exceptions inherit from `KreuzbergError`, the base exception class.
+
+### KreuzbergError
+
+Base exception class for all Kreuzberg errors.
+
+```python
+class KreuzbergError(Exception):
+    """Base exception for all Kreuzberg errors."""
+```
+
+### ParsingError
+
+Raised when document parsing fails.
+
+```python
+class ParsingError(KreuzbergError):
+    """Document parsing failed (corrupt, malformed, etc.)."""
+```
+
+### OCRError
+
+Raised when OCR processing fails.
+
+```python
+class OCRError(KreuzbergError):
+    """OCR operation failed."""
+```
+
+### ValidationError
+
+Raised when input validation fails, such as invalid parameters, violated constraints, or format mismatches.
+
+```python
+class ValidationError(KreuzbergError):
+    """Validation failed (invalid parameters, constraints, format mismatches)."""
+```
+
+### MissingDependencyError
+
+Raised when required dependencies are not available.
+
+```python
+class MissingDependencyError(KreuzbergError):
+    """Required dependency not available (easyocr, paddleocr, tesseract, etc.)."""
+
+    @staticmethod
+    def create_for_package(dependency_group: str, functionality: str, package_name: str) -> MissingDependencyError
+```
+
+**Example:**
+
+```python
+from kreuzberg import extract_file_sync, MissingDependencyError, OCRError, ParsingError
+
+try:
+    result = extract_file_sync("document.pdf")
+except ParsingError as e:
+    print(f"Failed to parse document: {e}")
+except OCRError as e:
+    print(f"OCR failed: {e}")
+except MissingDependencyError as e:
+    print(f"Missing dependency: {e}")
+```
+
+## Utility Functions
+
+### MIME Type Detection
+
+```python
+def detect_mime_type(data: bytes | bytearray) -> str
+```
+
+Detect MIME type from file bytes using magic number detection.
+
+**Parameters:**
+
+- `data` (bytes | bytearray): File content as bytes or bytearray
+
+**Returns:** Detected MIME type (e.g., "application/pdf", "image/png")
+
+```python
+def detect_mime_type_from_path(path: str | Path) -> str
+```
+
+Detect the MIME type of the file at the given path by reading its contents.
+
+**Parameters:**
+
+- `path` (str | Path): Path to the file
+
+**Returns:** Detected MIME type
+
+**Raises:**
+
+- `OSError`: If file cannot be read (file not found, permission denied, etc.)
+- `RuntimeError`: If MIME type detection fails
+
+**Example:**
+
+```python
+from kreuzberg import detect_mime_type, detect_mime_type_from_path
+
+# From bytes
+pdf_bytes = b"%PDF-1.4\n"
+mime_type = detect_mime_type(pdf_bytes)
+
+# From path
+mime_type = detect_mime_type_from_path("document.pdf")
+```
+
+### MIME Type Validation
+
+```python
+def validate_mime_type(mime_type: str) -> str
+```
+
+Validate a MIME type string and return the canonical form.
+
+```python
+def get_extensions_for_mime(mime_type: str) -> list[str]
+```
+
+Get file extensions associated with a MIME type.
+
+**Example:**
+
+```python
+from kreuzberg import validate_mime_type, get_extensions_for_mime
+
+canonical = validate_mime_type("application/pdf")
+extensions = get_extensions_for_mime("application/pdf")  # Returns ["pdf"]
+```
+
+### Configuration Loading
+
+```python
+def load_extraction_config_from_file(path: str | Path) -> ExtractionConfig
+```
+
+Load extraction configuration from a specific file.
+
+**Parameters:**
+
+- `path` (str | Path): Path to the configuration file (.toml, .yaml, or .json)
+
+**Returns:** ExtractionConfig parsed from the file
+
+**Raises:**
+
+- `FileNotFoundError`: If the configuration file does not exist
+- `RuntimeError`: If the file cannot be read or parsed
+- `ValueError`: If the file format is invalid or unsupported
+
+```python
+def discover_extraction_config() -> ExtractionConfig | None
+```
+
+Discover extraction configuration from the environment (deprecated).
+
+Attempts to locate a Kreuzberg configuration file using:
+
+1. KREUZBERG_CONFIG_PATH environment variable
+2. Search for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in current and parent directories
+
+**Returns:** ExtractionConfig if found, None otherwise
+
+**Note:** Deprecated in favor of `load_extraction_config_from_file` for more predictable behavior.
+
+**Example:**
+
+```python
+from kreuzberg import load_extraction_config_from_file, extract_file_sync
+
+# Load from specific file
+config = load_extraction_config_from_file("kreuzberg.toml")
+result = extract_file_sync("document.pdf", config=config)
+
+# Auto-discover configuration
+import os
+os.environ["KREUZBERG_CONFIG_PATH"] = "config/kreuzberg.yaml"
+# discover_extraction_config() will now pick up this file (deprecated; prefer load_extraction_config_from_file)
+```
+
+## Plugin System
+
+### Registering Post-Processors
+
+```python
+def register_post_processor(processor: Any) -> None
+```
+
+Register a Python PostProcessor with the Rust core. Once registered, the processor will be called automatically after extraction to enrich results.
+
+**Required Methods:**
+
+- `name() -> str`: Return processor name (must be non-empty)
+- `process(result: ExtractionResult) -> ExtractionResult`: Process and enrich the extraction result
+- `processing_stage() -> str`: Return "early", "middle", or "late"
+
+**Optional Methods:**
+
+- `initialize() -> None`: Called when processor is registered
+- `shutdown() -> None`: Called when processor is unregistered
+
+**Example:**
+
+```python
+from kreuzberg import register_post_processor, ExtractionResult
+
+class EntityExtractor:
+    def name(self) -> str:
+        return "entity_extraction"
+
+    def processing_stage(self) -> str:
+        return "early"
+
+    def process(self, result: ExtractionResult) -> ExtractionResult:
+        entities = {"PERSON": ["John Doe"], "ORG": ["Microsoft"]}
+        result.metadata["entities"] = entities
+        return result
+
+register_post_processor(EntityExtractor())
+```
+
+### Registering OCR Backends
+
+```python
+def register_ocr_backend(backend: Any) -> None
+```
+
+Register a Python OCR backend with the Rust core.
+
+**Required Methods:**
+
+- `name() -> str`: Return backend name (must be non-empty)
+- `supported_languages() -> list[str]`: Return list of supported language codes
+- `process_image(image_bytes: bytes, language: str) -> OcrResult`: Process image and return OCR result
+- `process_file(path: str, language: str) -> OcrResult`: Process file and return OCR result
+- `initialize() -> None`: Called when backend is registered
+- `shutdown() -> None`: Called when backend is unregistered
+- `version() -> str`: Return backend version string
+
+**Example:**
+
+```python
+from kreuzberg import register_ocr_backend
+
+class MyOcrBackend:
+    def name(self) -> str:
+        return "my-ocr"
+
+    def supported_languages(self) -> list[str]:
+        return ["eng", "deu", "fra"]
+
+    def process_image(self, image_bytes: bytes, language: str) -> dict:
+        return {
+            "content": "extracted text",
+            "metadata": {"confidence": 0.95},
+            "tables": []
+        }
+
+register_ocr_backend(MyOcrBackend())
+```
+
+### Registering Validators
+
+```python
+def register_validator(validator: Any) -> None
+```
+
+Register a Python Validator with the Rust core. Validators are called automatically after extraction to validate results.
+
+**Required Methods:**
+
+- `name() -> str`: Return validator name (must be non-empty)
+- `validate(result: ExtractionResult) -> None`: Validate the extraction result (raise error to fail)
+
+**Optional Methods:**
+
+- `should_validate(result: ExtractionResult) -> bool`: Check if validator should run (defaults to True)
+- `priority() -> int`: Return priority (defaults to 50, higher runs first)
+
+**Example:**
+
+```python
+from kreuzberg import register_validator, ValidationError, ExtractionResult
+
+class MinLengthValidator:
+    def name(self) -> str:
+        return "min_length_validator"
+
+    def priority(self) -> int:
+        return 100
+
+    def validate(self, result: ExtractionResult) -> None:
+        if len(result.content) < 100:
+            raise ValidationError("Content too short")
+
+register_validator(MinLengthValidator())
+```
+
+### Plugin Management Functions
+
+```python
+def list_post_processors() -> list[str]
+```
+
+List names of all registered post-processors.
+
+```python
+def list_validators() -> list[str]
+```
+
+List names of all registered validators.
+
+```python
+def list_ocr_backends() -> list[str]
+```
+
+List names of all available OCR backends.
+
+```python
+def unregister_post_processor(name: str) -> None
+```
+
+Unregister a post-processor by name.
+
+```python
+def unregister_validator(name: str) -> None
+```
+
+Unregister a validator by name.
+
+```python
+def unregister_ocr_backend(name: str) -> None
+```
+
+Unregister an OCR backend by name.
+
+```python
+def clear_post_processors() -> None
+```
+
+Clear all registered post-processors.
+
+```python
+def clear_validators() -> None
+```
+
+Clear all registered validators.
+
+```python
+def clear_ocr_backends() -> None
+```
+
+Clear all registered OCR backends.
+
+## Format Enums
+
+### OutputFormat
+
+Output format for extraction results.
+
+```python
+class OutputFormat(str, Enum):
+    PLAIN = "plain"        # Plain text format
+    MARKDOWN = "markdown"  # Markdown format
+    DJOT = "djot"          # Djot lightweight markup format
+    HTML = "html"          # HTML format
+```
+
+### ResultFormat
+
+Result format controlling extraction output structure.
+
+```python
+class ResultFormat(str, Enum):
+    UNIFIED = "unified"              # All content in `content` field
+    ELEMENT_BASED = "element_based"  # Unstructured-compatible output with semantic elements
+```
+
+## Error Handling
+
+### Error Code Functions
+
+```python
+def get_last_error_code() -> int
+```
+
+Get the last error code from the FFI layer.
+
+**Returns:**
+
+- 0 (SUCCESS): No error occurred
+- 1 (GENERIC_ERROR): Generic unspecified error
+- 2 (PANIC): A panic occurred in the Rust core
+- 3 (INVALID_ARGUMENT): Invalid argument provided
+- 4 (IO_ERROR): I/O operation failed
+- 5 (PARSING_ERROR): Document parsing failed
+- 6 (OCR_ERROR): OCR operation failed
+- 7 (MISSING_DEPENDENCY): Required dependency not available
+
+```python
+def get_error_details() -> dict[str, Any]
+```
+
+Get detailed error information from the FFI layer.
+
+**Returns:** dict with keys:
+
+- `message` (str): Human-readable error message
+- `error_code` (int): Numeric error code (0-7)
+- `error_type` (str): Error type name (e.g., "validation", "ocr")
+- `source_file` (str | None): Source file path if available
+- `source_function` (str | None): Function name if available
+- `source_line` (int): Line number (0 if unknown)
+- `context_info` (str | None): Additional context if available
+- `is_panic` (bool): Whether error came from a panic
+
+```python
+def classify_error(message: str) -> int
+```
+
+Classify an error message into a Kreuzberg error code.
+
+**Parameters:**
+
+- `message` (str): The error message to classify
+
+**Returns:** int error code (0-7) representing the classification
+
+```python
+def error_code_name(code: int) -> str
+```
+
+Get the human-readable name of an error code.
+
+**Parameters:**
+
+- `code` (int): Numeric error code (0-7)
+
+**Returns:** Human-readable error code name (e.g., "validation", "ocr")
+
+**Example:**
+
+```python
+from kreuzberg import extract_file_sync, get_error_details, get_last_error_code, error_code_name, classify_error
+
+try:
+    result = extract_file_sync("document.pdf")
+except Exception as e:
+    code = get_last_error_code()
+    if code:
+        print(f"Error code: {code} ({error_code_name(code)})")
+
+    details = get_error_details()
+    print(f"Error: {details['message']}")
+    print(f"Type: {details['error_type']}")
+
+    classified = classify_error(str(e))
+    print(f"Classified as: {error_code_name(classified)}")
+```
+
+## Validation Functions
+
+### Parameter Validation
+
+```python
+def validate_chunking_params(max_chars: int, max_overlap: int) -> bool
+```
+
+Validate chunking parameters.
+
+```python
+def validate_confidence(confidence: float) -> bool
+```
+
+Validate confidence value (0.0-1.0).
+
+```python
+def validate_dpi(dpi: int) -> bool
+```
+
+Validate DPI value.
+
+```python
+def validate_tesseract_psm(psm: int) -> bool
+```
+
+Validate Tesseract Page Segmentation Mode.
+
+```python
+def validate_tesseract_oem(oem: int) -> bool
+```
+
+Validate Tesseract OCR Engine Mode.
+
+```python
+def validate_ocr_backend(backend: str) -> bool
+```
+
+Validate OCR backend name.
+
+```python
+def validate_language_code(code: str) -> bool
+```
+
+Validate language code format.
+
+```python
+def validate_token_reduction_level(level: str) -> bool
+```
+
+Validate token reduction level.
+
+```python
+def validate_output_format(output_format: str) -> bool
+```
+
+Validate output format string.
+
+```python
+def validate_binarization_method(method: str) -> bool
+```
+
+Validate binarization method for image preprocessing.
+
+### Getting Valid Values
+
+```python
+def get_valid_binarization_methods() -> list[str]
+```
+
+Get list of valid binarization methods.
+
+```python
+def get_valid_language_codes() -> list[str]
+```
+
+Get list of valid language codes.
+
+```python
+def get_valid_ocr_backends() -> list[str]
+```
+
+Get list of valid OCR backend names.
+
+```python
+def get_valid_token_reduction_levels() -> list[str]
+```
+
+Get list of valid token reduction levels.
+
+```python
+def list_embedding_presets() -> list[str]
+```
+
+List available embedding presets.
+
+```python
+def get_embedding_preset(name: str) -> EmbeddingPreset | None
+```
+
+Get details about a specific embedding preset.
+
+**Example:**
+
+```python
+from kreuzberg import (
+    validate_dpi,
+    get_valid_binarization_methods,
+    list_embedding_presets,
+    get_embedding_preset
+)
+
+# Validate parameters
+if not validate_dpi(300):
+    print("Invalid DPI")
+
+# List valid values
+binarization_methods = get_valid_binarization_methods()
+presets = list_embedding_presets()
+
+# Get preset details
+preset = get_embedding_preset("balanced")
+if preset:
+    print(f"Balanced preset: {preset.description}")
+    print(f"Dimensions: {preset.dimensions}")
+    print(f"Recommended chunk size: {preset.chunk_size}")
+```
+
+## Configuration Utilities
+
+### Config Manipulation
+
+```python
+def config_to_json(config: ExtractionConfig) -> str
+```
+
+Convert ExtractionConfig to JSON string.
+
+```python
+def config_get_field(config: ExtractionConfig, field_name: str) -> Any | None
+```
+
+Get a specific field value from ExtractionConfig.
+
+```python
+def config_merge(base: ExtractionConfig, override: ExtractionConfig) -> None
+```
+
+Merge override config into base config (mutates base).
+
+**Example:**
+
+```python
+from kreuzberg import ExtractionConfig, config_to_json, config_get_field, config_merge
+
+config = ExtractionConfig(use_cache=True, enable_quality_processing=False)
+
+# Convert to JSON
+json_str = config_to_json(config)
+print(json_str)
+
+# Get field
+use_cache = config_get_field(config, "use_cache")
+print(f"use_cache: {use_cache}")
+
+# Merge configs
+override = ExtractionConfig(use_cache=False)
+config_merge(config, override)
+```
+
+## Version Information
+
+```python
+__version__: str
+```
+
+Current version of the kreuzberg package.
+
+**Example:**
+
+```python
+from kreuzberg import __version__
+
+print(f"Kreuzberg version: {__version__}")
+```
diff --git a/plugins/kreuzberg/skills/kreuzberg/references/rust-api.md b/plugins/kreuzberg/skills/kreuzberg/references/rust-api.md
new file mode 100644
index 0000000..ca86179
--- /dev/null
+++ b/plugins/kreuzberg/skills/kreuzberg/references/rust-api.md
@@ -0,0 +1,866 @@
+# Kreuzberg Rust API Reference
+
+Complete API reference for the Kreuzberg document extraction library in Rust.
+
+## Setup
+
+Add to your `Cargo.toml`:
+
+```toml
+[dependencies]
+kreuzberg = { version = "4", features = [
+    "tokio-runtime",
+    "pdf",
+    "ocr",
+    "chunking",
+    "embeddings",
+    "language-detection",
+    "keywords-yake",
+    "keywords-rake",
+    "api",
+    "mcp"
+] }
+tokio = { version = "1", features = ["full"] }
+```
+
+### Core Features
+
+- **tokio-runtime**: Enables async/sync extraction (default). Required for `extract_file_sync`, `batch_extract_file`, `batch_extract_bytes`, and `batch_extract_file_sync`
+- **pdf**: PDF extraction with PDFium
+- **ocr**: Tesseract-based OCR for scanned documents
+- **chunking**: Text chunking for RAG pipelines
+- **embeddings**: Vector embeddings generation
+- **language-detection**: Detect document language
+- **keywords-yake** / **keywords-rake**: Extract keywords using YAKE or RAKE
+- **api**: HTTP API with Axum
+- **mcp**: Model Context Protocol support
+
+---
+
+## Core Extraction Functions
+
+### `extract_file` (async)
+
+Extract content from a file path.
+
+```rust
+pub async fn extract_file(
+    path: impl AsRef<Path>,
+    mime_type: Option<&str>,
+    config: &ExtractionConfig,
+) -> Result<ExtractionResult>
+```
+
+**Always available.** Requires async context (`#[tokio::main]`, `tokio::spawn`, etc.).
+
+```rust
+use kreuzberg::{extract_file, ExtractionConfig};
+use std::path::Path;
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let result = extract_file("document.pdf", None, &config).await?;
+    println!("Content: {}", result.content);
+    Ok(())
+}
+```
+
+### `extract_bytes` (async)
+
+Extract content from byte data.
+
+```rust
+pub async fn extract_bytes(
+    data: &[u8],
+    mime_type: &str,
+    config: &ExtractionConfig,
+) -> Result<ExtractionResult>
+```
+
+**Always available.** Requires async context.
+
+```rust
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let pdf_bytes = std::fs::read("document.pdf")?;
+    let result = extract_bytes(&pdf_bytes, "application/pdf", &config).await?;
+    Ok(())
+}
+```
+
+### `extract_file_sync` (sync)
+
+Synchronous wrapper around `extract_file`.
+
+```rust
+pub fn extract_file_sync(
+    path: impl AsRef<Path>,
+    mime_type: Option<&str>,
+    config: &ExtractionConfig,
+) -> Result<ExtractionResult>
+```
+
+**Requires tokio-runtime feature.** Blocks the current thread using a global Tokio runtime.
+
+```rust
+use kreuzberg::{extract_file_sync, ExtractionConfig};
+
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let result = extract_file_sync("document.pdf", None, &config)?;
+    println!("Content: {}", result.content);
+    Ok(())
+}
+```
+
+### `extract_bytes_sync` (sync)
+
+Synchronous wrapper around `extract_bytes`.
+
+```rust
+pub fn extract_bytes_sync(
+    content: &[u8],
+    mime_type: &str,
+    config: &ExtractionConfig,
+) -> Result<ExtractionResult>
+```
+
+**Always available.** Works in sync and async contexts.
+
+```rust
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let bytes = b"Hello, world!";
+    let result = extract_bytes_sync(bytes, "text/plain", &config)?;
+    Ok(())
+}
+```
+
+### `batch_extract_file` (async, parallel)
+
+Extract multiple files concurrently.
+
+```rust
+pub async fn batch_extract_file(
+    paths: Vec<impl AsRef<Path>>,
+    config: &ExtractionConfig,
+) -> Result<Vec<ExtractionResult>>
+```
+
+**Requires tokio-runtime feature.** Processes files in parallel with automatic concurrency management (defaults to `num_cpus * 1.5`).
+
+```rust
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let paths = vec!["doc1.pdf", "doc2.pdf", "doc3.pdf"];
+    let results = batch_extract_file(paths, &config).await?;
+    println!("Processed {} files", results.len());
+    Ok(())
+}
+```
+
+### `batch_extract_bytes` (async, parallel)
+
+Extract multiple byte arrays concurrently.
+
+```rust
+pub async fn batch_extract_bytes(
+    contents: Vec<(Vec<u8>, String)>,
+    config: &ExtractionConfig,
+) -> Result<Vec<ExtractionResult>>
+```
+
+**Requires tokio-runtime feature.** Each tuple is `(bytes, mime_type)`.
+
+```rust
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let contents = vec![
+        (b"PDF content".to_vec(), "application/pdf".to_string()),
+        (b"Text content".to_vec(), "text/plain".to_string()),
+    ];
+    let results = batch_extract_bytes(contents, &config).await?;
+    Ok(())
+}
+```
+
+### `batch_extract_file_sync` (sync, parallel)
+
+Synchronous wrapper for batch file extraction.
+
+```rust
+pub fn batch_extract_file_sync(
+    paths: Vec<impl AsRef<Path>>,
+    config: &ExtractionConfig,
+) -> Result<Vec<ExtractionResult>>
+```
+
+**Requires tokio-runtime feature.** Uses global runtime for concurrency.
+
+```rust
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let paths = vec!["doc1.pdf", "doc2.pdf"];
+    let results = batch_extract_file_sync(paths, &config)?;
+    Ok(())
+}
+```
+
+### `batch_extract_bytes_sync` (sync, parallel)
+
+Synchronous wrapper for batch byte extraction.
+
+```rust
+pub fn batch_extract_bytes_sync(
+    contents: Vec<(Vec<u8>, String)>,
+    config: &ExtractionConfig,
+) -> Result<Vec<ExtractionResult>>
+```
+
+**Always available.** Each tuple is `(bytes, mime_type)`.
+
+```rust
+fn main() -> kreuzberg::Result<()> {
+    let config = ExtractionConfig::default();
+    let contents = vec![
+        (b"content 1".to_vec(), "text/plain".to_string()),
+        (b"content 2".to_vec(), "text/plain".to_string()),
+    ];
+    let results = batch_extract_bytes_sync(contents, &config)?;
+    Ok(())
+}
+```
+
+### `FileExtractionConfig`
+
+Per-file overrides for batch operations, passed as an optional parameter to `batch_extract_file` / `batch_extract_bytes` (and their sync variants). Every field is an `Option<T>`; `None` means the batch-level default from `ExtractionConfig` is used.
+
+> **Note (v4.5.0):** The separate `batch_extract_file_with_configs` / `batch_extract_bytes_with_configs` functions have been removed. Per-file configs are now an optional parameter on the unified batch functions.
+
+```rust
+pub struct FileExtractionConfig {
+    pub enable_quality_processing: Option<bool>,
+    pub ocr: Option<OcrConfig>,
+    pub force_ocr: Option<bool>,
+    pub chunking: Option<ChunkingConfig>,
+    pub images: Option<ImageExtractionConfig>,
+    pub pdf_options: Option<PdfConfig>,
+    pub token_reduction: Option<TokenReductionConfig>,
+    pub language_detection: Option<LanguageDetectionConfig>,
+    pub pages: Option<PageConfig>,
+    pub postprocessor: Option<PostProcessorConfig>,
+    pub output_format: Option<OutputFormat>,
+    pub include_document_structure: Option<bool>,
+}
+```
+
+The following batch-level fields are not part of `FileExtractionConfig` and cannot be overridden per file: `max_concurrent_extractions`, `use_cache`, `acceleration`, `security_limits`.
+
+---
+
+## Configuration
+
+### `ExtractionConfig`
+
+Main configuration struct for all extraction operations.
+
+```rust
+pub struct ExtractionConfig {
+    /// Enable caching (default: true)
+    pub use_cache: bool,
+
+    /// Enable quality post-processing (default: true)
+    pub enable_quality_processing: bool,
+
+    /// OCR configuration (None = OCR disabled)
+    pub ocr: Option<OcrConfig>,
+
+    /// Force OCR even for searchable PDFs (default: false)
+    pub force_ocr: bool,
+
+    /// Text chunking configuration (None = disabled)
+    pub chunking: Option<ChunkingConfig>,
+
+    /// Image extraction configuration (None = disabled)
+    pub images: Option<ImageExtractionConfig>,
+
+    /// PDF-specific options (requires pdf feature)
+    #[cfg(feature = "pdf")]
+    pub pdf_options: Option<PdfConfig>,
+
+    /// Token reduction configuration (None = disabled)
+    pub token_reduction: Option<TokenReductionConfig>,
+
+    /// Language detection configuration (None = disabled)
+    pub language_detection: Option<LanguageDetectionConfig>,
+
+    /// Page extraction configuration (None = disabled)
+    pub pages: Option<PageConfig>,
+
+    /// Keyword extraction configuration (requires keywords-yake or keywords-rake)
+    #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
+    pub keywords: Option<KeywordConfig>,
+
+    /// Post-processor configuration (None = use defaults)
+    pub postprocessor: Option<PostProcessorConfig>,
+
+    /// HTML to Markdown conversion options (requires html feature)
+    #[cfg(feature = "html")]
+    pub html_options: Option<ConversionOptions>,
+
+    /// Maximum concurrent extractions in batch (None = num_cpus * 1.5)
+    pub max_concurrent_extractions: Option<usize>,
+
+    /// Result structure format (default: Unified)
+    /// Uses types::OutputFormat (Unified | ElementBased)
+    pub result_format: types::OutputFormat,
+
+    /// Security limits for archives (requires archives feature)
+    #[cfg(feature = "archives")]
+    pub security_limits: Option<SecurityLimits>,
+
+    /// Content output format (default: Plain)
+    /// Uses config::OutputFormat (Plain | Markdown | Djot | Html)
+    pub output_format: OutputFormat,
+}
+```
+
+#### Creating Configs
+
+```rust
+use kreuzberg::{ExtractionConfig, OcrConfig, ChunkingConfig, OutputFormat};
+
+// Default configuration
+let config = ExtractionConfig::default();
+
+// With OCR
+let config = ExtractionConfig {
+    ocr: Some(OcrConfig {
+        backend: "tesseract".to_string(),
+        ..Default::default()
+    }),
+    ..Default::default()
+};
+
+// With chunking
+let config = ExtractionConfig {
+    chunking: Some(ChunkingConfig {
+        max_characters: 512,
+        overlap: 50,
+        ..Default::default()
+    }),
+    output_format: OutputFormat::Markdown,
+    ..Default::default()
+};
+```
+
+---
+
+## Output Formats
+
+There are two separate enums both named `OutputFormat` in different modules:
+
+### Content `OutputFormat` (`core::config::formats::OutputFormat`)
+
+Controls the format of the `content` field text. Used by `ExtractionConfig::output_format`.
+
+```rust
+pub enum OutputFormat {
+    /// Plain text (default)
+    Plain,
+    /// Markdown formatted
+    Markdown,
+    /// Djot markup format
+    Djot,
+    /// HTML format
+    Html,
+}
+```
+
+### Result `OutputFormat` (`types::extraction::OutputFormat`)
+
+Controls the result structure. Used by `ExtractionConfig::result_format`.
+
+```rust
+pub enum OutputFormat {
+    /// Unified format with all content in `content` field (default)
+    Unified,
+    /// Element-based format with semantic element extraction
+    ElementBased,
+}
+```
+
+```rust
+use kreuzberg::{ExtractionConfig, OutputFormat};
+
+let config = ExtractionConfig {
+    output_format: OutputFormat::Markdown,  // content format (Plain/Markdown/Djot/Html)
+    // result_format uses types::OutputFormat (Unified/ElementBased) — defaults to Unified
+    ..Default::default()
+};
+```
+
+---
+
+## Extraction Result
+
+### `ExtractionResult`
+
+Result returned by all extraction functions.
+
+```rust
+pub struct ExtractionResult {
+    /// Main extracted content
+    pub content: String,
+
+    /// Document MIME type
+    pub mime_type: Cow<'static, str>,
+
+    /// Metadata about extraction
+    pub metadata: Metadata,
+
+    /// Extracted tables (HTML/Markdown)
+    pub tables: Vec<Table>,
+
+    /// Detected languages (if language-detection enabled)
+    pub detected_languages: Option<Vec<String>>,
+
+    /// Text chunks (if chunking enabled)
+    pub chunks: Option<Vec<Chunk>>,
+
+    /// Extracted images (if image extraction enabled)
+    pub images: Option<Vec<ExtractedImage>>,
+
+    /// Per-page content (if page extraction enabled)
+    pub pages: Option<Vec<PageContent>>,
+
+    /// Semantic elements (if element-based format enabled)
+    pub elements: Option<Vec<Element>>,
+
+    /// Djot document structure (if extracting Djot)
+    pub djot_content: Option<DjotContent>,
+
+    /// Extracted keywords with relevance scores (if keyword extraction enabled)
+    pub extracted_keywords: Option<Vec<ExtractedKeyword>>,
+
+    /// Quality score for extraction result (0.0-1.0)
+    pub quality_score: Option<f64>,
+
+    /// Non-fatal warnings during processing pipeline
+    pub processing_warnings: Vec<ProcessingWarning>,
+}
+```
+
+### `ExtractedKeyword`
+
+Extracted keyword with relevance score and position information.
+
+```rust
+pub struct ExtractedKeyword {
+    /// Keyword text
+    pub text: String,
+
+    /// Relevance score (0.0-1.0)
+    pub score: f32,
+
+    /// Algorithm used for extraction (e.g. "yake", "rake")
+    pub algorithm: String,
+
+    /// Character positions in content (if available)
+    pub positions: Option<Vec<usize>>,
+}
+```
+
+### `ProcessingWarning`
+
+Non-fatal warning encountered during document processing.
+
+```rust
+pub struct ProcessingWarning {
+    /// Component that generated the warning
+    pub source: String,
+
+    /// Warning message describing the issue
+    pub message: String,
+}
+```
+
+### `Chunk`
+
+Text chunk with optional embedding.
+
+```rust
+pub struct Chunk {
+    /// Chunk text content
+    pub content: String,
+
+    /// Optional embedding vector
+    pub embedding: Option<Vec<f32>>,
+
+    /// Chunk metadata
+    pub metadata: ChunkMetadata,
+}
+
+pub struct ChunkMetadata {
+    pub byte_start: usize,
+    pub byte_end: usize,
+    pub token_count: Option<usize>,
+    pub chunk_index: usize,
+    pub total_chunks: usize,
+    pub first_page: Option<usize>,
+    pub last_page: Option<usize>,
+}
+```
+
+### `ExtractedImage`
+
+Image extracted from document.
+
+```rust
+pub struct ExtractedImage {
+    /// Raw image bytes
+    pub data: Bytes,
+
+    /// Format: "jpeg", "png", "webp", etc.
+    pub format: Cow<'static, str>,
+
+    /// Zero-indexed position
+    pub image_index: usize,
+
+    /// Page number (1-indexed)
+    pub page_number: Option<usize>,
+
+    /// Image dimensions
+    pub width: Option<u32>,
+    pub height: Option<u32>,
+
+    /// Colorspace: "RGB", "CMYK", "Gray"
+    pub colorspace: Option<String>,
+
+    /// Bits per component
+    pub bits_per_component: Option<u32>,
+
+    /// Whether this is a mask image
+    pub is_mask: bool,
+
+    /// Image description
+    pub description: Option<String>,
+
+    /// Nested OCR result (if OCRed)
+    pub ocr_result: Option<Box<ExtractionResult>>,
+}
+```
+
+---
+
+## Error Handling
+
+### `KreuzbergError` enum
+
+```rust
+pub enum KreuzbergError {
+    /// File system errors (always bubble up)
+    Io(std::io::Error),
+
+    /// Document parsing errors
+    Parsing {
+        message: String,
+        source: Option<Box<dyn std::error::Error + Send + Sync>>,
+    },
+
+    /// OCR processing errors
+    Ocr {
+        message: String,
+        source: Option<Box<dyn std::error::Error + Send + Sync>>,
+    },
+
+    /// Configuration/input validation errors
+    Validation {
+        message: String,
+        source: Option<Box<dyn std::error::Error + Send + Sync>>,
+    },
+
+    /// Cache operation errors
+    Cache {
+        message: String,
+        source: Option<Box<dyn std::error::Error + Send + Sync>>,
+    },
+
+    /// Image processing errors
+    ImageProcessing {
+        message: String,
+        source: Option<Box<dyn std::error::Error + Send + Sync>>,
+    },
+
+    /// Serialization errors (JSON, MessagePack)
+    Serialization {
+        message: String,
+        source: Option<Box<dyn std::error::Error + Send + Sync>>,
+    },
+
+    /// Missing system dependency (e.g. Tesseract)
+    MissingDependency(String),
+
+    /// Plugin-specific errors
+    Plugin {
+        message: String,
+        plugin_name: String,
+    },
+
+    /// Mutex/RwLock poisoning
+    LockPoisoned(String),
+
+    /// Unsupported MIME type or format
+    UnsupportedFormat(String),
+
+    /// Other errors
+    Other(String),
+}
+```
+
+#### Error Constructors
+
+```rust
+use kreuzberg::KreuzbergError;
+
+// Create errors
+let err = KreuzbergError::parsing("invalid PDF");
+let err = KreuzbergError::ocr("Tesseract failed");
+let err = KreuzbergError::validation("config invalid");
+let err = KreuzbergError::unsupported_format("application/unknown");
+let err = KreuzbergError::missing_dependency("tesseract");
+
+// With source
+let source = std::io::Error::new(std::io::ErrorKind::NotFound, "file missing");
+let err = KreuzbergError::parsing_with_source("corrupt PDF", source);
+```
+
+#### Handling Errors
+
+```rust
+use kreuzberg::{extract_file, ExtractionConfig};
+
+match extract_file("doc.pdf", None, &ExtractionConfig::default()).await {
+    Ok(result) => println!("Success: {}", result.content),
+    Err(kreuzberg::KreuzbergError::Io(e)) => {
+        println!("File error: {}", e);
+    }
+    Err(kreuzberg::KreuzbergError::UnsupportedFormat(fmt)) => {
+        println!("Unsupported: {}", fmt);
+    }
+    Err(e) => println!("Other error: {}", e),
+}
+```
+
+---
+
+## MIME Type Detection
+
+### `detect_mime_type`
+
+Detect MIME type from file path.
+
+```rust
+pub fn detect_mime_type(path: impl AsRef<Path>) -> Result<String>
+```
+
+```rust
+use kreuzberg::detect_mime_type;
+
+let mime = detect_mime_type("document.pdf")?;
+assert_eq!(mime, "application/pdf");
+```
+
+### `detect_mime_type_from_bytes`
+
+Detect MIME type from byte data.
+
+```rust
+pub fn detect_mime_type_from_bytes(data: &[u8]) -> Result<String>
+```
+
+### `validate_mime_type`
+
+Check if a MIME type is supported.
+
+```rust
+pub fn validate_mime_type(mime_type: &str) -> Result<()>
+```
+
+```rust
+use kreuzberg::validate_mime_type;
+
+validate_mime_type("application/pdf")?;  // OK
+validate_mime_type("application/unknown")?;  // Error
+```
+
+### `get_extensions_for_mime`
+
+Get file extensions for a MIME type.
+
+```rust
+pub fn get_extensions_for_mime(mime_type: &str) -> Vec<String>
+```
+
+```rust
+use kreuzberg::get_extensions_for_mime;
+
+let exts = get_extensions_for_mime("application/pdf");
+// ["pdf"]
+
+let exts = get_extensions_for_mime("text/plain");
+// ["txt", "text"]
+```
+
+### MIME Type Constants
+
+```rust
+use kreuzberg::{
+    PDF_MIME_TYPE,
+    PLAIN_TEXT_MIME_TYPE,
+    HTML_MIME_TYPE,
+    MARKDOWN_MIME_TYPE,
+    JSON_MIME_TYPE,
+    XML_MIME_TYPE,
+    DOCX_MIME_TYPE,
+    POWER_POINT_MIME_TYPE,
+    EXCEL_MIME_TYPE,
+};
+
+assert_eq!(PDF_MIME_TYPE, "application/pdf");
+assert_eq!(PLAIN_TEXT_MIME_TYPE, "text/plain");
+```
+
+---
+
+## Plugin Registry
+
+Access the registries for document extractors, OCR backends, post-processors, and validators.
+
+### `get_document_extractor_registry`
+
+Get all available document extractors.
+
+```rust
+pub fn get_document_extractor_registry() -> Arc<RwLock<DocumentExtractorRegistry>>
+```
+
+### `get_ocr_backend_registry`
+
+Get all available OCR backends.
+
+```rust
+pub fn get_ocr_backend_registry() -> Arc<RwLock<OcrBackendRegistry>>
+```
+
+### `get_post_processor_registry`
+
+Get all available post-processors.
+
+```rust
+pub fn get_post_processor_registry() -> Arc<RwLock<PostProcessorRegistry>>
+```
+
+### `get_validator_registry`
+
+Get all available validators.
+
+```rust
+pub fn get_validator_registry() -> Arc<RwLock<ValidatorRegistry>>
+```
+
+---
+
+## Complete Example
+
+```rust
+use kreuzberg::{
+    extract_file, ExtractionConfig, OutputFormat,
+    ChunkingConfig, OcrConfig, LanguageDetectionConfig,
+};
+
+#[tokio::main]
+async fn main() -> kreuzberg::Result<()> {
+    // Configure extraction
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Markdown,
+        chunking: Some(ChunkingConfig {
+            max_characters: 512,
+            overlap: 50,
+            ..Default::default()
+        }),
+        language_detection: Some(LanguageDetectionConfig::default()),
+        ocr: Some(OcrConfig {
+            backend: "tesseract".to_string(),
+            ..Default::default()
+        }),
+        force_ocr: false,
+        ..Default::default()
+    };
+
+    // Extract from file
+    let result = extract_file("document.pdf", None, &config).await?;
+
+    // Use results
+    println!("Content:\n{}", result.content);
+    println!("MIME: {}", result.mime_type);
+
+    if let Some(langs) = result.detected_languages {
+        println!("Languages: {:?}", langs);
+    }
+
+    if let Some(chunks) = result.chunks {
+        println!("Chunks: {}", chunks.len());
+        for chunk in chunks {
+            println!("  - {}", chunk.content.chars().take(50).collect::<String>());
+        }
+    }
+
+    if let Some(images) = result.images {
+        println!("Images: {}", images.len());
+    }
+
+    if let Some(pages) = result.pages {
+        println!("Pages: {}", pages.len());
+    }
+
+    Ok(())
+}
+```
+
+---
+
+## Result Type Alias
+
+```rust
+pub type Result<T> = std::result::Result<T, KreuzbergError>;
+```
+
+All fallible operations return `Result<T>` where errors are `KreuzbergError`.
+
+---
+
+## Feature Flags Summary
+
+| Feature            | Availability | Dependencies                                   |
+| ------------------ | ------------ | ---------------------------------------------- |
+| tokio-runtime      | Default      | Tokio runtime for async/sync                   |
+| pdf                | Default      | PDFium                                         |
+| ocr                | Optional     | Tesseract                                      |
+| chunking           | Optional     | text-splitter                                  |
+| embeddings         | Optional     | FastEmbed, requires tokio-runtime              |
+| language-detection | Optional     | whatlang                                       |
+| keywords-yake      | Optional     | yake-rust                                      |
+| keywords-rake      | Optional     | rake                                           |
+| api                | Optional     | Axum, requires tokio-runtime                   |
+| mcp                | Optional     | Model Context Protocol, requires tokio-runtime |
+
+---
+
+## Version
+
+This reference is for Kreuzberg 4.x.
diff --git a/plugins/kreuzberg/skills/kreuzberg/references/supported-formats.md b/plugins/kreuzberg/skills/kreuzberg/references/supported-formats.md
new file mode 100644
index 0000000..e50a66f
--- /dev/null
+++ b/plugins/kreuzberg/skills/kreuzberg/references/supported-formats.md
@@ -0,0 +1,223 @@
+# Supported Formats Reference
+
+Kreuzberg supports 91+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction. All formats support text and metadata extraction. Additional capabilities like OCR and table detection are noted per format.
+
+## Office Documents
+
+### Word Processing
+
+| Format             | Extensions               | MIME Type                                                                 | Capabilities                                                    |
+| ------------------ | ------------------------ | ------------------------------------------------------------------------- | --------------------------------------------------------------- |
+| Microsoft Word     | `.docx`                  | `application/vnd.openxmlformats-officedocument.wordprocessingml.document` | Full text extraction, tables, embedded images, metadata, styles |
+| Word Macro-Enabled | `.docm`                  | `application/vnd.ms-word.document.macroEnabled.12`                        | Macro-enabled document extraction, metadata                     |
+| Word Template      | `.dotx`, `.dotm`, `.dot` | Various Word template MIME types                                          | Template document extraction, metadata                          |
+| OpenDocument Text  | `.odt`                   | `application/vnd.oasis.opendocument.text`                                 | Full text extraction, tables, embedded images, metadata, styles |
+
+### Spreadsheets
+
+| Format                   | Extensions | MIME Type                                                              | Capabilities                                             |
+| ------------------------ | ---------- | ---------------------------------------------------------------------- | -------------------------------------------------------- |
+| Excel Workbook           | `.xlsx`    | `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet`    | Sheet data, cell values, formulas, cell metadata, charts |
+| Excel Macro-Enabled      | `.xlsm`    | `application/vnd.ms-excel.sheet.macroEnabled.12`                       | Sheet data, formulas, macros (text only), metadata       |
+| Excel Binary             | `.xlsb`    | `application/vnd.ms-excel.sheet.binary.macroEnabled.12`                | Binary sheet data extraction, metadata                   |
+| Excel Legacy             | `.xls`     | `application/vnd.ms-excel`                                             | Legacy sheet data extraction, metadata                   |
+| Excel Add-in             | `.xla`     | `application/vnd.ms-excel`                                             | Add-in data extraction                                   |
+| Excel Macro Add-in       | `.xlam`    | `application/vnd.ms-excel.addin.macroEnabled.12`                       | Macro add-in metadata                                    |
+| Excel Template           | `.xltm`    | `application/vnd.ms-excel.template.macroEnabled.12`                    | Template data and metadata                               |
+| Excel Template (XML)     | `.xltx`    | `application/vnd.openxmlformats-officedocument.spreadsheetml.template` | XML template data and metadata                           |
+| Excel Template (Legacy)  | `.xlt`     | `application/vnd.ms-excel`                                             | Legacy template data extraction                          |
+| OpenDocument Spreadsheet | `.ods`     | `application/vnd.oasis.opendocument.spreadsheet`                       | Sheet data, cell values, formulas, metadata              |
+
+### Presentations
+
+| Format                  | Extensions               | MIME Type                                                                   | Capabilities                                         |
+| ----------------------- | ------------------------ | --------------------------------------------------------------------------- | ---------------------------------------------------- |
+| PowerPoint Presentation | `.pptx`                  | `application/vnd.openxmlformats-officedocument.presentationml.presentation` | Slide text, speaker notes, embedded images, metadata |
+| PowerPoint Legacy       | `.ppt`                   | `application/vnd.ms-powerpoint`                                             | Legacy slide text extraction, metadata               |
+| PowerPoint Slideshow    | `.ppsx`                  | `application/vnd.openxmlformats-officedocument.presentationml.slideshow`    | Slideshow content, speaker notes, metadata           |
+| PowerPoint Template     | `.potx`, `.potm`, `.pot` | Various PowerPoint template MIME types                                      | Template slide extraction, metadata                  |
+
+### PDF
+
+| Format                   | Extensions | MIME Type         | Capabilities                                                                                       |
+| ------------------------ | ---------- | ----------------- | -------------------------------------------------------------------------------------------------- |
+| Portable Document Format | `.pdf`     | `application/pdf` | Text extraction, tables, embedded images, metadata, OCR (when needed), password protection support |
+
+### eBooks
+
+| Format      | Extensions | MIME Type                       | Capabilities                                           |
+| ----------- | ---------- | ------------------------------- | ------------------------------------------------------ |
+| EPUB        | `.epub`    | `application/epub+zip`          | Chapter text, metadata, embedded resources, navigation |
+| FictionBook | `.fb2`     | `application/x-fictionbook+xml` | Book content, metadata, chapter structure              |
+
+### Database
+
+| Format | Extensions | MIME Type           | Capabilities                                          |
+| ------ | ---------- | ------------------- | ----------------------------------------------------- |
+| dBASE  | `.dbf`     | `application/x-dbf` | Table data extraction as markdown, field type support |
+
+### Hangul
+
+| Format                | Extensions      | MIME Type                                       | Capabilities                            |
+| --------------------- | --------------- | ----------------------------------------------- | --------------------------------------- |
+| Hangul Word Processor | `.hwp`, `.hwpx` | `application/x-hwp`, `application/haansofthwpx` | Korean document format, text extraction |
+
+## Images (OCR-Enabled)
+
+### Raster Images
+
+| Format | Extensions      | MIME Type    | Capabilities                                                                 |
+| ------ | --------------- | ------------ | ---------------------------------------------------------------------------- |
+| PNG    | `.png`          | `image/png`  | OCR text extraction, table detection, EXIF metadata, dimensions, color space |
+| JPEG   | `.jpg`, `.jpeg` | `image/jpeg` | OCR text extraction, table detection, EXIF metadata, color profile           |
+| GIF    | `.gif`          | `image/gif`  | OCR text extraction, animation metadata, dimensions                          |
+| WebP   | `.webp`         | `image/webp` | OCR text extraction, metadata, lossy/lossless detection                      |
+| Bitmap | `.bmp`          | `image/bmp`  | OCR text extraction, dimensions, color depth                                 |
+| TIFF   | `.tiff`, `.tif` | `image/tiff` | OCR text extraction, multi-page support, EXIF metadata, compression info     |
+
+### Advanced Image Formats
+
+| Format             | Extensions                     | MIME Type                 | Capabilities                                                                     |
+| ------------------ | ------------------------------ | ------------------------- | -------------------------------------------------------------------------------- |
+| JPEG 2000          | `.jp2`                         | `image/jp2`               | OCR via pure Rust decoder (hayro-jpeg2000), table detection, resolution metadata |
+| JPEG 2000 Extended | `.jpx`                         | `image/jpx`               | Advanced JPEG 2000 features, high-resolution content, metadata                   |
+| JPEG 2000 Compound | `.jpm`                         | `image/jpm`               | Compound image support, mixed content                                            |
+| Motion JPEG 2000   | `.mj2`                         | `video/mj2`               | JPEG 2000 video/sequence metadata                                                |
+| JBIG2              | `.jbig2`, `.jb2`               | `image/jbig2`             | Bi-level image OCR, high compression, technical documents                        |
+| Portable PixMap    | `.pnm`, `.pbm`, `.pgm`, `.ppm` | `image/x-portable-pixmap` | OCR for plain image formats, raw pixel data                                      |
+
+### Vector Graphics
+
+| Format                   | Extensions | MIME Type       | Capabilities                                                              |
+| ------------------------ | ---------- | --------------- | ------------------------------------------------------------------------- |
+| Scalable Vector Graphics | `.svg`     | `image/svg+xml` | DOM parsing, embedded text extraction, graphics metadata, vector elements |
+
+## Web & Data
+
+### Markup & Structured Text
+
+| Format           | Extensions      | MIME Type               | Capabilities                                                                       |
+| ---------------- | --------------- | ----------------------- | ---------------------------------------------------------------------------------- |
+| HyperText Markup | `.html`, `.htm` | `text/html`             | DOM parsing, text extraction, metadata (Open Graph, Twitter Card), link extraction |
+| XHTML            | `.xhtml`        | `application/xhtml+xml` | XHTML parsing, metadata extraction, semantic structure                             |
+| XML              | `.xml`          | `application/xml`       | DOM parsing, namespace handling, text extraction, structure analysis               |
+
+### Structured Data Formats
+
+| Format | Extensions      | MIME Type                   | Capabilities                                               |
+| ------ | --------------- | --------------------------- | ---------------------------------------------------------- |
+| JSON   | `.json`         | `application/json`          | Schema detection, nested structure parsing, validation     |
+| YAML   | `.yaml`, `.yml` | `application/x-yaml`        | Hierarchical data parsing, custom tags, nested structures  |
+| TOML   | `.toml`         | `application/toml`          | Configuration parsing, table structures, type preservation |
+| CSV    | `.csv`          | `text/csv`                  | Delimiter detection, header inference, type detection      |
+| TSV    | `.tsv`          | `text/tab-separated-values` | Tab-separated value parsing, header detection              |
+
+### Text & Markup Languages
+
+| Format           | Extensions         | MIME Type         | Capabilities                                      |
+| ---------------- | ------------------ | ----------------- | ------------------------------------------------- |
+| Plain Text       | `.txt`             | `text/plain`      | Raw text extraction, encoding detection           |
+| Markdown         | `.md`, `.markdown` | `text/markdown`   | CommonMark parsing, GFM extensions, front matter  |
+| Djot             | `.djot`            | `text/djot`       | Djot format parsing, semantic structure           |
+| reStructuredText | `.rst`             | `text/x-rst`      | RST parsing, directive handling, role extraction  |
+| Org Mode         | `.org`             | `text/org`        | Org mode structure, outline parsing, metadata     |
+| Rich Text Format | `.rtf`             | `application/rtf` | Text with formatting extraction, font information |
+
+## Email & Archives
+
+### Email Formats
+
+| Format            | Extensions | MIME Type                    | Capabilities                                                                           |
+| ----------------- | ---------- | ---------------------------- | -------------------------------------------------------------------------------------- |
+| Email Message     | `.eml`     | `message/rfc822`             | Headers (from, to, subject, date), body (HTML/plain text), attachments, threading info |
+| Microsoft Outlook | `.msg`     | `application/vnd.ms-outlook` | Outlook headers, body content, attachments, recipient metadata                         |
+
+### Archive Formats
+
+| Format      | Extensions | MIME Type                     | Capabilities                                               |
+| ----------- | ---------- | ----------------------------- | ---------------------------------------------------------- |
+| ZIP Archive | `.zip`     | `application/zip`             | File listing, nested archive support, compression metadata |
+| Tar Archive | `.tar`     | `application/x-tar`           | File listing, permission metadata, nested archives         |
+| Gzip Tar    | `.tgz`     | `application/gzip`            | Compressed archive listing, metadata                       |
+| Gzip        | `.gz`      | `application/gzip`            | Compressed file metadata                                   |
+| 7-Zip       | `.7z`      | `application/x-7z-compressed` | File listing, compression info, nested archives            |
+
+## Academic & Scientific
+
+### Citation Formats
+
+| Format                  | Extensions  | MIME Type                                | Capabilities                                      |
+| ----------------------- | ----------- | ---------------------------------------- | ------------------------------------------------- |
+| BibTeX                  | `.bib`      | `text/bibtex`                            | Structured parsing, entry types, field extraction |
+| BibLaTeX                | `.biblatex` | `text/bibtex`                            | Extended BibTeX format, advanced field support    |
+| RIS                     | `.ris`      | `application/x-research-info-systems`    | Structured RIS format parsing, type detection     |
+| NIH RIS                 | `.nbib`     | `application/x-research-info-systems`    | NIH/PubMed format, structured citation data       |
+| EndNote                 | `.enw`      | `application/x-endnote`                  | EndNote XML format, citation metadata             |
+| Citation Style Language | `.csl`      | `application/vnd.citationstyles.csl+xml` | CSL JSON/XML parsing, style definitions           |
+
+### Scientific & Technical Formats
+
+| Format           | Extensions       | MIME Type                  | Capabilities                                                |
+| ---------------- | ---------------- | -------------------------- | ----------------------------------------------------------- |
+| LaTeX            | `.tex`, `.latex` | `application/x-latex`      | LaTeX source parsing, commands, document structure          |
+| Typst            | `.typ`           | `text/plain`               | Typst markup parsing, document structure                    |
+| JATS XML         | `.jats`          | `application/xml`          | PubMed JATS parsing, article structure, metadata            |
+| Jupyter Notebook | `.ipynb`         | `application/x-ipynb+json` | Cell extraction (code + markdown), output parsing, metadata |
+| DocBook          | `.docbook`       | `application/docbook+xml`  | DocBook XML parsing, semantic structure                     |
+
+### Documentation Formats
+
+| Format      | Extensions | MIME Type                | Capabilities                                    |
+| ----------- | ---------- | ------------------------ | ----------------------------------------------- |
+| OPML        | `.opml`    | `application/x-opml+xml` | Outline parsing, hierarchy extraction, metadata |
+| Perl POD    | `.pod`     | `text/x-pod`             | Perl documentation parsing, section extraction  |
+| Manual Page | `.mdoc`    | `text/plain`             | UNIX manual page parsing, section structure     |
+| Troff/Groff | `.troff`   | `text/troff`             | Typesetting markup parsing, document structure  |
+
+## Format Capabilities Summary
+
+### Text Extraction
+
+All 91+ formats support full or partial text extraction. Document structure and encoding are automatically detected.
+
+### Metadata Support
+
+Comprehensive metadata extraction includes:
+
+- Document properties (title, author, subject, creation date, modification date)
+- Format-specific metadata (page count, dimensions, encoding, language)
+- EXIF data (for images)
+- Document statistics (word count, character count)
+
+### OCR (Optical Character Recognition)
+
+OCR is available for image formats (and for PDFs when needed):
+
+- **Raster Images**: PNG, JPEG, GIF, WebP, BMP, TIFF
+- **Advanced Formats**: JPEG 2000, JBIG2, PNM/PBM/PGM/PPM
+- **Configurable Backends**: Tesseract (all languages), EasyOCR, PaddleOCR (Python), Guten (Node.js)
+
+### Table Detection
+
+Smart table detection and reconstruction available for:
+
+- PDF documents (native tables and scanned content with OCR)
+- Office documents (Excel, Word)
+- Images (via OCR backends)
+- HTML/XML (from markup structure)
+
+### Archive & Nested Document Support
+
+Archives and nested formats support file listing and sequential extraction:
+
+- ZIP, TAR, TGZ, 7Z archives
+- Email attachments
+- Nested archives within archives
+
+## Getting Started
+
+For language-specific examples and detailed API documentation, see the [API Reference](https://docs.kreuzberg.dev/reference/api-python/).
+
+For OCR configuration and backend selection, see the [OCR Backends Guide](https://docs.kreuzberg.dev/guides/ocr/).
+
+For comprehensive format details and format detection, see the [Complete Format Reference](https://docs.kreuzberg.dev/reference/formats/).
diff --git a/plugins/kreuzberg/skills/picking-a-format/SKILL.md b/plugins/kreuzberg/skills/picking-a-format/SKILL.md
new file mode 100644
index 0000000..082e53f
--- /dev/null
+++ b/plugins/kreuzberg/skills/picking-a-format/SKILL.md
@@ -0,0 +1,94 @@
+---
+name: picking-a-format
+description: Use when choosing an output format for extracted documents — text, markdown, djot, html, or JSON. Maps consumer (LLM, parser, archive) to the right `--format` / `--content-format` pair.
+---
+
+# Picking a format
+
+Kreuzberg has two orthogonal format knobs, plus a separate token-reduction
+knob. Get them right up front and the downstream code stays simple.
+
+| Knob                | What it controls                                  | Values                                 | Default          |
+| ------------------- | ------------------------------------------------- | -------------------------------------- | ---------------- |
+| `--format`          | How the CLI prints the result                     | `text`, `json`                         | `text` (`extract`), `json` (`batch`) |
+| `--content-format`  | How extracted content is rendered inside `result` | `plain`, `markdown`, `djot`, `html`    | `plain`          |
+| `--token-reduction` | Strip whitespace / boilerplate for LLM contexts   | `off`, `light`, `moderate`, `aggressive`, `maximum` | `off`          |
+
+`--format json` always returns the full `ExtractionResult` (content +
+metadata + tables + images). `--format text` prints just `content`.
+`--content-format` is what shows up inside that `content` field.
+
+## Decision tree
+
+```text
+Who consumes the output?
+├── LLM (Claude, GPT, Gemini, local) — embed/prompt context
+│       --format text --content-format markdown
+├── Vector store / RAG indexer
+│       --format json --content-format markdown
+│       (markdown preserves structure for chunking)
+├── Downstream parser that expects machine-readable JSON
+│       --format json --content-format plain
+│       (cleanest text + structured metadata)
+├── Human review / archival
+│       --format text --content-format markdown
+├── HTML re-rendering / web display
+│       --format json --content-format html
+├── Lossless intermediate for pandoc / academic tooling
+│       --format json --content-format djot
+└── Token-budget-constrained pipeline
+        --format text --content-format plain
+        (drops markup; add --token-reduction moderate for further savings)
+```
+
+## Examples
+
+Feed a PDF directly into an LLM:
+
+```bash
+kreuzberg extract paper.pdf --content-format markdown
+```
+
+Index a corpus into a RAG store with tables and headings preserved:
+
+```bash
+kreuzberg batch docs/*.pdf --format json --content-format markdown \
+  | jq -c '.[] | {path: .metadata.path, content: .content, tables: .tables}'
+```
+
+Strip a file to bare text for a token-tight summarizer:
+
+```bash
+kreuzberg extract long.pdf \
+  --content-format plain \
+  --token-reduction moderate
+```
+
+Pull metadata only, ignore content:
+
+```bash
+kreuzberg extract file.pdf --format json | jq '.metadata'
+```
+
+## When in doubt
+
+- **Default to `markdown`** as the content format. It is the best
+  compromise across LLMs, RAG, and human review, and Kreuzberg has the
+  most faithful renderer for it.
+- Reach for `plain` only when downstream cannot tolerate any markup.
+- Reach for `djot` only if you're already in a djot/pandoc pipeline.
+- Reach for `html` only when re-rendering for the web.
+
+## Token-reduction (orthogonal)
+
+`--token-reduction` collapses whitespace, strips repeated headers/footers,
+and trims boilerplate. It composes with any `--content-format`:
+
+- `off` (default), `light`, `moderate`, `aggressive`, `maximum`.
+
+Use `moderate` as a safe starting point for LLM context windows. `maximum`
+is lossy — verify before relying on it.
+
+See `references/cli-reference.md` for the full flag set and
+`references/configuration.md` for the equivalent `output_format` and
+`token_reduction` keys in `kreuzberg.toml`.
diff --git a/plugins/kreuzcrawl/.factory-plugin/plugin.json b/plugins/kreuzcrawl/.factory-plugin/plugin.json
new file mode 100644
index 0000000..a65e62c
--- /dev/null
+++ b/plugins/kreuzcrawl/.factory-plugin/plugin.json
@@ -0,0 +1,23 @@
+{
+  "name": "kreuzcrawl",
+  "version": "0.1.0",
+  "description": "Web crawling and scraping with HTML→Markdown and headless-Chrome fallback.",
+  "author": {
+    "name": "Kreuzberg, Inc.",
+    "email": "support@kreuzberg.dev",
+    "url": "https://kreuzberg.dev"
+  },
+  "homepage": "https://kreuzberg.dev",
+  "repository": "https://github.com/kreuzberg-dev/plugins",
+  "license": "MIT",
+  "category": "web-scraping",
+  "keywords": [
+    "web-scraping",
+    "crawling",
+    "html-to-markdown",
+    "headless-chrome"
+  ],
+  "brandColor": "#7C3AED",
+  "icon": "./assets/icon.svg",
+  "logo": "./assets/logo.png"
+}
diff --git a/plugins/kreuzcrawl/README.md b/plugins/kreuzcrawl/README.md
new file mode 100644
index 0000000..71826cd
--- /dev/null
+++ b/plugins/kreuzcrawl/README.md
@@ -0,0 +1,102 @@
+# kreuzcrawl
+
+Crawl, scrape, and convert websites to Markdown using the local `kreuzcrawl` CLI in your agent.
+
+<!-- TODO: screenshot -->
+
+## Install
+
+### From the marketplace (recommended)
+
+Pending review for official Claude marketplace.
+
+Self-host:
+
+```text
+/plugin marketplace add kreuzberg-dev/plugins
+/plugin install kreuzcrawl@kreuzberg
+```
+
+### Binary requirement
+
+Install the `kreuzcrawl` CLI:
+
+```bash
+brew install kreuzberg-dev/tap/kreuzcrawl
+# or
+cargo install kreuzcrawl-cli
+```
+
+Headless fallback requires Chrome/Chromium on your system. The CLI launches it on demand; you can skip installing Chrome/Chromium if you only plan to use `--browser-mode never`.
+
+## Skills shipped
+
+| Skill | Trigger |
+|-------|---------|
+| **kreuzcrawl** | Crawl, scrape, and convert websites to Markdown using the local kreuzcrawl CLI and its MCP server. Use when the user wants to fetch a page, follow links across a domain, enumerate URLs, or drive a real browser. Covers installation, the subcommands (scrape, crawl, map, interact, mcp, serve), output formats (JSON + Markdown), browser fallback, and when to prefer the MCP server over shelling out. |
+| **crawling-a-site** | Use when the user wants to follow links across a domain and capture every reachable page as Markdown. Covers `kreuzcrawl crawl` with depth, page caps, concurrency, rate limiting, domain scoping, robots, and output selection. |
+| **scraping-html-to-markdown** | Use when the user wants a single page rendered as clean Markdown plus structured metadata. Covers `kreuzcrawl scrape <url>`, JSON vs Markdown output, what metadata is returned, and how to handle JS-heavy pages. |
+| **headless-fallback** | Use when a static fetch returns nothing useful and the page needs a real browser. Covers `--browser-mode auto\|always\|never`, external CDP via `--browser-endpoint`, symptoms of JS-only pages and WAF blocks, and the performance cost. |
+
+## MCP tools
+
+The `kreuzcrawl` MCP server exposes:
+
+- `scrape` — fetch and convert a single URL to Markdown or JSON.
+- `crawl` — follow links across a domain, bounded by depth and page count.
+- `map` — enumerate URLs from sitemaps and link extraction.
+- `interact` — drive a headless browser with click, type, scroll actions.
+
+## Configuration
+
+Pass flags or use inline JSON via `--config`:
+
+```bash
+kreuzcrawl scrape https://example.com \
+  --format markdown \
+  --browser-mode auto \
+  --timeout 30000
+```
+
+For complex configs, use JSON:
+
+```bash
+kreuzcrawl crawl https://example.com \
+  --config '{"depth":3,"max_pages":200,"concurrent":8,"respect_robots_txt":true}'
+```
+
+See the `kreuzcrawl` and `crawling-a-site` skills for the full flag surface.
+
+## Examples
+
+Fetch a single page and print Markdown:
+
+```text
+kreuzcrawl scrape https://example.com/article --format markdown
+```
+
+Crawl a site to depth 3 with a page cap and bounded concurrency:
+
+```text
+kreuzcrawl crawl https://example.com --depth 3 --max-pages 200 --concurrent 8 --stay-on-domain --format markdown
+```
+
+Enumerate URLs from a sitemap:
+
+```text
+kreuzcrawl map https://example.com --limit 500
+```
+
+## Versioning
+
+The plugin version tracks the marketplace `VERSION` file. See [CHANGELOG.md](../../CHANGELOG.md) for release notes.
+
+## License
+
+MIT.
+
+## See also
+
+- **Marketplace**: [kreuzberg-dev/plugins](https://github.com/kreuzberg-dev/plugins)
+- **Upstream**: [kreuzberg-dev/kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl)
+- **Sibling plugins**: [kreuzberg](../kreuzberg/README.md), [kreuzberg-cloud](../kreuzberg-cloud/README.md)
diff --git a/plugins/kreuzcrawl/assets/icon.svg b/plugins/kreuzcrawl/assets/icon.svg
new file mode 100644
index 0000000..4910078
--- /dev/null
+++ b/plugins/kreuzcrawl/assets/icon.svg
@@ -0,0 +1,14 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 256" width="256" height="256">
+  <rect width="256" height="256" rx="48" fill="#7C3AED"/>
+  <g stroke="white" stroke-width="12" stroke-linecap="round" fill="none">
+    <line x1="128" y1="64"  x2="64"  y2="128"/>
+    <line x1="128" y1="64"  x2="192" y2="128"/>
+    <line x1="64"  y1="128" x2="128" y2="192"/>
+    <line x1="192" y1="128" x2="128" y2="192"/>
+    <line x1="128" y1="64"  x2="128" y2="192"/>
+  </g>
+  <circle cx="128" cy="64"  r="20" fill="white"/>
+  <circle cx="64"  cy="128" r="20" fill="white"/>
+  <circle cx="192" cy="128" r="20" fill="white"/>
+  <circle cx="128" cy="192" r="20" fill="white"/>
+</svg>
diff --git a/plugins/kreuzcrawl/assets/logo.png b/plugins/kreuzcrawl/assets/logo.png
new file mode 100644
index 0000000..be845f9
Binary files /dev/null and b/plugins/kreuzcrawl/assets/logo.png differ
diff --git a/plugins/kreuzcrawl/skills/crawling-a-site/SKILL.md b/plugins/kreuzcrawl/skills/crawling-a-site/SKILL.md
new file mode 100644
index 0000000..ec93c21
--- /dev/null
+++ b/plugins/kreuzcrawl/skills/crawling-a-site/SKILL.md
@@ -0,0 +1,142 @@
+---
+name: crawling-a-site
+description: >-
+  Use when the user wants to follow links across a domain and capture every
+  reachable page as Markdown. Covers `kreuzcrawl crawl` with depth, page
+  caps, concurrency, rate limiting, domain scoping, robots, and output
+  selection.
+---
+
+# Crawling a site
+
+Reach for `kreuzcrawl crawl` when one URL is not enough — the user wants
+the docs site, the blog, the marketing pages, or the whole domain.
+
+## Quick recipe
+
+```bash
+kreuzcrawl crawl https://example.com \
+  --depth 3 \
+  --max-pages 200 \
+  --concurrent 8 \
+  --rate-limit 250 \
+  --stay-on-domain \
+  --respect-robots-txt \
+  --format markdown
+```
+
+Defaults you should usually override:
+
+- `--depth 2` is shallow — set it explicitly.
+- `--max-pages` is unbounded by default; cap it for any unknown site.
+- `--concurrent 10` is aggressive for small hosts; drop to 4-8 for
+  third-party sites.
+
+## Flag surface
+
+| Flag                    | Default | Purpose                                                    |
+| ----------------------- | ------- | ---------------------------------------------------------- |
+| `--depth`, `-d`         | `2`     | Maximum hop count from the seed URL.                       |
+| `--max-pages`, `-n`     | —       | Hard cap on pages fetched. Set this on any unknown site.   |
+| `--concurrent`, `-c`    | `10`    | Parallel in-flight requests.                               |
+| `--rate-limit`          | `200`   | Milliseconds between requests to the same origin.          |
+| `--stay-on-domain`      | off     | Skip links that leave the seed domain.                     |
+| `--respect-robots-txt`  | off     | Honour `robots.txt`. Pass it for any third-party host.     |
+| `--proxy`               | —       | HTTP, HTTPS, or SOCKS5 proxy URL.                          |
+| `--user-agent`          | —       | Override the request UA. Be honest.                        |
+| `--timeout`             | `30000` | Per-request timeout in ms.                                 |
+| `--browser-mode`        | `auto`  | `auto`, `always`, `never` — see the headless-fallback skill. |
+| `--browser-endpoint`    | —       | External CDP `ws://` or `wss://` URL.                      |
+| `--format`              | `json`  | `json` or `markdown`.                                      |
+| `--config`              | —       | Inline JSON or `@file.json` for the full `CrawlConfig`.    |
+
+Multiple seed URLs are accepted positionally — the engine fans out with
+`batch_crawl` and aggregates results.
+
+## When to pick which flags
+
+### Docs sites you own
+
+```bash
+kreuzcrawl crawl https://docs.example.com \
+  --depth 5 --max-pages 1000 --concurrent 16 --rate-limit 100 \
+  --stay-on-domain --format markdown > docs.md
+```
+
+Higher concurrency and a shorter `--rate-limit` delay are fine on
+infrastructure you control.
+
+### Third-party sites
+
+```bash
+kreuzcrawl crawl https://blog.unknown.example \
+  --depth 2 --max-pages 50 --concurrent 4 --rate-limit 500 \
+  --stay-on-domain --respect-robots-txt --format markdown
+```
+
+Stay shallow, cap pages, throttle hard, obey robots.
+
+### Multi-seed batch
+
+```bash
+kreuzcrawl crawl \
+  https://example.com/blog \
+  https://example.com/docs \
+  https://example.com/pricing \
+  --depth 2 --max-pages 100 --stay-on-domain --format json
+```
+
+JSON output for batch is an array of `{ seed_url, result }` entries — each
+`result` is a full crawl payload or `{ error: ... }`.
+
+## Output
+
+### Markdown mode
+
+```text
+---
+URL: https://example.com/page-one
+---
+# Page One
+
+… markdown content …
+
+---
+URL: https://example.com/page-two
+---
+…
+```
+
+### JSON mode
+
+Top-level `CrawlResult` with `pages: [...]`. Each page carries the rendered
+Markdown plus metadata, links, images, JSON-LD, and HTTP response info. Read
+`result.pages[i].markdown.content` for the Markdown string.
+
+## Politeness checklist
+
+- Pass `--respect-robots-txt` on every third-party crawl.
+- Cap `--max-pages` — a runaway BFS can issue tens of thousands of requests.
+- Bump `--rate-limit` for hosts that show signs of stress (5xx, slowdowns).
+- Identify yourself via `--user-agent "kreuzcrawl (contact@example.com)"`.
+
+## Common pitfalls
+
+- **No pages returned.** The seed page may be JS-only — the engine falls
+  back to headless automatically in `--browser-mode auto`, but `never` mode
+  will silently produce an empty crawl. Re-run with `--browser-mode always`
+  or check the headless-fallback skill.
+- **Crawl leaves the domain.** Pass `--stay-on-domain`. Combine with
+  `"allow_subdomains": true` in the `--config` JSON to include subdomains.
+- **Slow crawl.** The default rate limit is 200 ms per origin — multiple
+  seed URLs on the same host still share the bucket. Spread seeds across
+  hosts or raise `--concurrent` for unrelated origins.
+- **Memory growth.** Each page carries full Markdown plus structured data.
+  Stream JSON output to a file rather than holding it in memory; set
+  `--max-pages` aggressively if downstream cannot keep up.
+
+## When to reach for `map` instead
+
+If the user only needs the list of URLs (sitemap analysis, link planning,
+seeding another tool), use `kreuzcrawl map <url>` — it skips rendering and
+returns a flat `MapResult` with hundreds of URLs in seconds.
diff --git a/plugins/kreuzcrawl/skills/headless-fallback/SKILL.md b/plugins/kreuzcrawl/skills/headless-fallback/SKILL.md
new file mode 100644
index 0000000..0f3bb8f
--- /dev/null
+++ b/plugins/kreuzcrawl/skills/headless-fallback/SKILL.md
@@ -0,0 +1,140 @@
+---
+name: headless-fallback
+description: >-
+  Use when a static fetch returns nothing useful and the page needs a real
+  browser. Covers `--browser-mode auto|always|never`, external CDP via
+  `--browser-endpoint`, symptoms of JS-only pages and WAF blocks, and the
+  performance cost.
+---
+
+# Headless fallback
+
+Some pages are unscrapable without a real browser — SPA shells, infinite
+scroll, Cloudflare interstitials, JS-rendered article bodies. Kreuzcrawl
+ships with an optional headless-Chrome backend driven by chromiumoxide.
+
+## Modes
+
+```text
+--browser-mode auto    # default — try static first, fall back to browser on JS/WAF
+--browser-mode always  # skip static, go straight to browser
+--browser-mode never   # static only, fail closed
+```
+
+### `auto` (default)
+
+The engine fetches statically, then inspects the response. It launches
+headless Chrome and re-fetches when it sees:
+
+- WAF responses from one of the 8 recognised vendors (Cloudflare, Akamai,
+  AWS WAF, Imperva, DataDome, PerimeterX, Sucuri, F5).
+- SPA shells: `<noscript>` warnings, near-empty `<body>` with heavy JS.
+- Heuristic JS-render-required signals.
+
+This is the right default. The browser only spins up when needed.
+
+### `always`
+
+Skip the static probe entirely. Use when:
+
+- The user already told you the page needs JS.
+- You are scraping a site you know is a React/Vue/Svelte SPA.
+- You need `<script>`-emitted state that never lands in static HTML.
+
+```bash
+kreuzcrawl scrape https://spa.example.com --browser-mode always --format markdown
+```
+
+### `never`
+
+Static only — the browser path is disabled. Use when:
+
+- You are in a hot loop where a stray Chrome launch would blow the budget.
+- You are running in a sandbox without a Chrome binary.
+- The user explicitly wants only static fetches.
+
+In `never` mode, JS-only pages return empty/stub content. Inspect
+`markdown.content` and `markdown.warnings` before treating the result as
+final.
+
+## Symptoms that point to headless
+
+Watch for these signs in `--browser-mode never`, or whenever you suspect
+the auto detector missed a signal:
+
+- `markdown.content` is short, nav-only, or just a loading message.
+- `status_code` is 200 but `metadata.headings` is empty on a page that
+  clearly has headings.
+- `markdown.warnings` mentions JS-render-required or WAF detection.
+- 403/406/503 with WAF response headers (`server: cloudflare`,
+  `cf-mitigated`, `x-amz-cf-id`, `set-cookie: __cf_bm=…`).
+
+Re-run with `--browser-mode always`. If that succeeds, leave it set for
+that host.
+
+## External CDP endpoint
+
+Point at an already-running Chrome (Browserless, Steel, your own) instead
+of launching locally:
+
+```bash
+kreuzcrawl scrape https://example.com \
+  --browser-mode always \
+  --browser-endpoint ws://browser.internal:9222/devtools/browser/<id> \
+  --format markdown
+```
+
+The endpoint must be a WebSocket URL — `ws://` or `wss://`. The CLI
+rejects anything else with a clear error.
+
+Use external CDP when:
+
+- You are running in containers or CI without a local Chrome.
+- You want a shared, warm browser pool across many crawl jobs.
+- You need browser-side residential proxies or stealth configuration the
+  local Chrome cannot provide.
+
+## Performance cost
+
+Headless Chrome is expensive relative to a static fetch:
+
+- Cold start: 1-3 seconds the first time it launches.
+- Per-page overhead: 500 ms-2 s for `NetworkIdle` wait, plus the page's own
+  JS load time.
+- Memory: each tab takes 100-300 MB; long crawls should bound
+  `--concurrent`.
+
+Mitigations:
+
+- Stay in `--browser-mode auto` — the engine only pays the cost when it
+  needs to.
+- Use `--browser-endpoint` to share one warm browser across jobs.
+- Drop `--concurrent` when you know the crawl will route through Chrome.
+
+## Wait strategies
+
+Pass via `--config` JSON when you need control:
+
+```bash
+kreuzcrawl scrape https://example.com --browser-mode always \
+  --config '{"browser":{"wait_strategy":{"type":"Selector","selector":".article-body"}}}'
+```
+
+Supported strategies:
+
+- `NetworkIdle` (default) — wait until the network goes quiet.
+- `Selector` — wait until a CSS selector resolves.
+- `Fixed` — wait a fixed duration.
+
+`extra_wait` adds milliseconds on top of the wait strategy if the page
+keeps loading content after the primary signal.
+
+## Persistent profiles
+
+```bash
+kreuzcrawl scrape https://app.example.com --browser-mode always \
+  --config '{"browser_profile":"prod","save_browser_profile":true}'
+```
+
+Profile names are validated against path traversal. Use them to keep
+cookies, localStorage, and login state across runs without re-authenticating.
diff --git a/plugins/kreuzcrawl/skills/kreuzcrawl/SKILL.md b/plugins/kreuzcrawl/skills/kreuzcrawl/SKILL.md
new file mode 100644
index 0000000..b2f9556
--- /dev/null
+++ b/plugins/kreuzcrawl/skills/kreuzcrawl/SKILL.md
@@ -0,0 +1,191 @@
+---
+name: kreuzcrawl
+description: >-
+  Crawl, scrape, and convert websites to Markdown using the local kreuzcrawl
+  CLI and its MCP server. Use when the user wants to fetch a page, follow
+  links across a domain, enumerate URLs, or drive a real browser. Covers
+  installation, the subcommands (scrape, crawl, map, interact, mcp, serve),
+  output formats (JSON + Markdown), browser fallback, and when to prefer the
+  MCP server over shelling out.
+license: MIT
+metadata:
+  author: kreuzberg-dev
+  version: "0.1.0"
+  repository: https://github.com/kreuzberg-dev/kreuzcrawl
+---
+
+# Kreuzcrawl
+
+Kreuzcrawl is a Rust-native web crawler and scraper. It fetches static HTML
+with `reqwest`, falls back to headless Chrome when a page needs JS or trips a
+WAF, and converts every result to clean Markdown via the built-in
+HTML→Markdown engine.
+
+Use this skill when the user wants to:
+
+- Scrape a single URL to Markdown plus structured metadata.
+- Crawl a site following links bounded by depth, page count, and concurrency.
+- Enumerate URLs from sitemaps without paying for rendering.
+- Drive a real browser (click, type, scroll) and capture the resulting DOM.
+- Run the same operations from another agent harness via MCP tools.
+
+## Installation
+
+The plugin shells out to a `kreuzcrawl` binary on `PATH`. Install one of:
+
+```bash
+brew install kreuzberg-dev/tap/kreuzcrawl
+cargo install kreuzcrawl-cli
+```
+
+Verify:
+
+```bash
+kreuzcrawl --version
+```
+
+Headless fallback needs Chrome/Chromium reachable locally (`chromiumoxide`
+launches it on demand). You can skip installing Chrome if you only plan to
+use `--browser-mode never`.
+
+## Command map
+
+```text
+kreuzcrawl scrape <url>     # single page → JSON or Markdown
+kreuzcrawl crawl <url...>   # follow links, BFS, depth-bounded
+kreuzcrawl map <url>        # enumerate URLs via sitemaps + link extraction
+kreuzcrawl interact <url>   # browser actions: click, type, scroll
+kreuzcrawl mcp              # MCP server (stdio) — auto-registered
+kreuzcrawl serve            # REST API server (optional `api` feature)
+```
+
+Batch behaviour is built into `crawl`: pass multiple seed URLs and the engine
+fans out via `batch_crawl` internally.
+
+### Shared flags
+
+| Flag                    | Default  | Notes                                              |
+| ----------------------- | -------- | -------------------------------------------------- |
+| `--format`              | `json`   | `json` or `markdown`.                              |
+| `--timeout`             | `30000`  | Request timeout in milliseconds.                   |
+| `--browser-mode`        | `auto`   | `auto`, `always`, or `never`.                      |
+| `--browser-endpoint`    | —        | Optional CDP `ws://` or `wss://` URL.              |
+| `--respect-robots-txt`  | off      | Pass to obey `robots.txt`.                         |
+| `--config <json>`       | —        | Inline JSON or `@file.json` to override defaults.  |
+
+The `--config` flag accepts the full `CrawlConfig` schema. Anything you set
+explicitly on the CLI overrides the corresponding JSON field.
+
+## Scrape a single page
+
+```bash
+kreuzcrawl scrape https://example.com --format markdown
+```
+
+JSON output (default) carries the rendered Markdown, page metadata
+(`PageMetadata`), links by category, images, feeds, JSON-LD blocks, and
+HTTP response metadata. Use Markdown output when piping into a file the user
+will read.
+
+See the `scraping-html-to-markdown` skill for the full flag surface.
+
+## Crawl a site
+
+```bash
+kreuzcrawl crawl https://example.com \
+  --depth 3 --max-pages 200 --concurrent 8 --rate-limit 250 \
+  --stay-on-domain --respect-robots-txt --format markdown
+```
+
+Crawling is BFS by default, bounded by `--depth`, `--max-pages`, and
+`--concurrent`. Per-domain politeness is enforced by `--rate-limit`
+(milliseconds between requests to the same origin).
+
+See the `crawling-a-site` skill for the recommended defaults and the full
+flag surface.
+
+## Map URLs
+
+```bash
+kreuzcrawl map https://example.com --limit 500 --search docs --format markdown
+```
+
+`map` reads `sitemap.xml` (and nested sitemaps), then falls back to link
+extraction from the seed page. It does not render pages — use it to plan a
+crawl or to feed URLs into another tool.
+
+## Browser interaction
+
+```bash
+kreuzcrawl interact https://example.com \
+  --actions '[{"type":"click","selector":"#load-more"},
+              {"type":"wait","duration_ms":500}]'
+```
+
+Action types include `click`, `type`, `select`, `scroll`, `wait`,
+`wait_for_selector`, and `screenshot`. The result wraps the final HTML
+under `interaction.final_html`.
+
+## MCP server
+
+When this plugin is installed in a Claude Code / Codex / Cursor / Gemini /
+opencode harness, the MCP server is auto-registered:
+
+```text
+kreuzcrawl mcp --transport stdio
+```
+
+Prefer MCP tools over shelling out when both are available:
+
+- Typed schemas surface argument errors before the call.
+- Results stream back as structured tool output instead of stdout text.
+- No `--format` juggling — the harness pulls whatever shape it needs.
+
+Fall back to the CLI when you need to script a pipeline, capture stderr, or
+chain with shell tools.
+
+## Headless fallback
+
+In `--browser-mode auto` (default), the engine:
+
+1. Fetches statically via `reqwest`.
+2. Detects WAF blocks (8 vendors) and JS-only shells.
+3. Re-fetches through headless Chrome with a real fingerprint when needed.
+
+Force the browser path with `--browser-mode always` when you already know
+the page needs JS. Use `--browser-mode never` for hot loops where the cost
+of a stray Chrome launch is unacceptable.
+
+Point `--browser-endpoint ws://host:9222/devtools/browser/<id>` at an
+already-running Chrome to skip the local launch.
+
+See the `headless-fallback` skill for symptoms, costs, and external-CDP
+patterns.
+
+## Output formats
+
+| Mode       | Use when                                                |
+| ---------- | ------------------------------------------------------- |
+| `json`     | Downstream consumer needs metadata, links, images, etc. |
+| `markdown` | Human reader or LLM-context payload.                    |
+
+Markdown output skips metadata. If you need both, run with `--format json`
+and read `result.markdown.content`.
+
+## Robots, rate limits, ethics
+
+- `--respect-robots-txt` is off by default; pass it for any crawl on a host
+  you do not own.
+- The default `--rate-limit 200` already produces a polite cadence; raise it
+  for shared hosts.
+- Identify the crawler honestly via `--user-agent`. Do not impersonate a
+  browser unless the operator has approved it.
+
+## Cross-references
+
+- `skills/crawling-a-site/SKILL.md` — multi-page crawl with depth, page
+  caps, concurrency, rate limits, and domain scoping.
+- `skills/scraping-html-to-markdown/SKILL.md` — single-page rendering, the
+  Markdown output shape, and common pitfalls.
+- `skills/headless-fallback/SKILL.md` — when and how to force the browser
+  backend.
diff --git a/plugins/kreuzcrawl/skills/scraping-html-to-markdown/SKILL.md b/plugins/kreuzcrawl/skills/scraping-html-to-markdown/SKILL.md
new file mode 100644
index 0000000..62dfb4d
--- /dev/null
+++ b/plugins/kreuzcrawl/skills/scraping-html-to-markdown/SKILL.md
@@ -0,0 +1,124 @@
+---
+name: scraping-html-to-markdown
+description: >-
+  Use when the user wants a single page rendered as clean Markdown plus
+  structured metadata. Covers `kreuzcrawl scrape <url>`, JSON vs Markdown
+  output, what metadata is returned, and how to handle JS-heavy pages.
+---
+
+# Scraping HTML to Markdown
+
+`kreuzcrawl scrape <url>` is the right tool when the user has a single
+page in mind. It returns Markdown plus a full structured payload (metadata,
+links, images, JSON-LD, HTTP response info).
+
+## Quick recipe
+
+```bash
+kreuzcrawl scrape https://example.com/article --format markdown
+```
+
+JSON form (default) when downstream needs metadata:
+
+```bash
+kreuzcrawl scrape https://example.com/article --format json
+```
+
+## Flag surface
+
+| Flag                    | Default | Purpose                                                |
+| ----------------------- | ------- | ------------------------------------------------------ |
+| `--format`              | `json`  | `json` or `markdown`.                                  |
+| `--timeout`             | `30000` | Per-request timeout in ms.                             |
+| `--proxy`               | —       | HTTP, HTTPS, or SOCKS5 proxy URL.                      |
+| `--user-agent`          | —       | Override request UA.                                   |
+| `--respect-robots-txt`  | off     | Honour `robots.txt`.                                   |
+| `--browser-mode`        | `auto`  | `auto`, `always`, `never` — see headless-fallback skill. |
+| `--browser-endpoint`    | —       | External CDP `ws://` or `wss://` URL.                  |
+| `--config`              | —       | Inline JSON or `@file.json` for full `CrawlConfig`.    |
+
+## Output shape
+
+### Markdown mode
+
+Prints the rendered Markdown only. Use when piping to a file the user will
+read, or when the result becomes LLM context downstream.
+
+### JSON mode
+
+Top-level `PageResult` with:
+
+- `url`, `final_url` (after redirects), `status_code`.
+- `markdown`: `{ content, fit_content, warnings }` — `fit_content` is a
+  pruned LLM-optimised variant.
+- `metadata`: Open Graph, Twitter Card, Dublin Core, article tags, JSON-LD,
+  headings (H1–H6), feeds, favicons, hreflang.
+- `links`: arrays for `Internal`, `External`, `Anchor`, and `Document`.
+- `images`: `<img>`, `<picture>`, `srcset`, `og:image`.
+- `tables`: structured table data preserved separately from Markdown.
+- `response`: HTTP headers, content type, charset, body size.
+
+Read `result.markdown.content` for the Markdown string when scripting.
+
+## Common pitfalls
+
+### Empty or stub content
+
+Static fetch returned a JS shell. Symptoms in JSON output:
+
+- `markdown.content` is short or only contains nav/footer chrome.
+- `markdown.warnings` mentions JS-render-required.
+- `metadata.headings` is empty when the page clearly has headings.
+
+Re-run with `--browser-mode always` and see the headless-fallback skill.
+
+### WAF block
+
+`auto` mode detects 8 WAF vendors and retries through headless Chrome
+automatically. If you forced `--browser-mode never`, the WAF response will
+fall through. Check `status_code` — 403/406/503 with WAF headers
+(`server: cloudflare`, `x-amz-cf-id`, etc.) is the giveaway.
+
+### Robots.txt blocking the fetch
+
+If `--respect-robots-txt` is set and the path is disallowed, the scrape
+returns an error rather than partial content. Drop the flag only on hosts
+you own or have authorisation for.
+
+### Wrong charset
+
+Most pages declare UTF-8. Pages that lie about their charset surface as
+mojibake in `markdown.content`. Override via `--config '{"force_encoding":"latin-1"}'`
+or similar.
+
+## Examples
+
+### Scrape an article for downstream LLM context
+
+```bash
+kreuzcrawl scrape https://blog.example.com/post-123 --format markdown \
+  > /tmp/article.md
+```
+
+### Scrape with proxy and custom UA
+
+```bash
+kreuzcrawl scrape https://example.com \
+  --proxy http://proxy.internal:3128 \
+  --user-agent "kreuzcrawl (research@example.com)" \
+  --format json
+```
+
+### Extract just the OG metadata
+
+```bash
+kreuzcrawl scrape https://example.com --format json \
+  | jq '.metadata | {title: .og.title, description: .og.description, image: .og.image}'
+```
+
+## When to reach for crawl or interact instead
+
+- The user wants the whole site, not one page → `crawling-a-site` skill.
+- The user needs to click, type, or scroll before extracting → use
+  `kreuzcrawl interact` with the action list.
+- The user only wants the list of URLs → `kreuzcrawl map`.
diff --git a/skills/crawling-a-site b/skills/crawling-a-site
new file mode 120000
index 0000000..6ebe897
--- /dev/null
+++ b/skills/crawling-a-site
@@ -0,0 +1 @@
+../plugins/kreuzcrawl/skills/crawling-a-site
\ No newline at end of file
diff --git a/skills/extracting-tables b/skills/extracting-tables
new file mode 120000
index 0000000..99b47e4
--- /dev/null
+++ b/skills/extracting-tables
@@ -0,0 +1 @@
+../plugins/kreuzberg/skills/extracting-tables
\ No newline at end of file
diff --git a/skills/extracting-with-ocr b/skills/extracting-with-ocr
new file mode 120000
index 0000000..4c1391b
--- /dev/null
+++ b/skills/extracting-with-ocr
@@ -0,0 +1 @@
+../plugins/kreuzberg/skills/extracting-with-ocr
\ No newline at end of file
diff --git a/skills/headless-fallback b/skills/headless-fallback
new file mode 120000
index 0000000..c1ee785
--- /dev/null
+++ b/skills/headless-fallback
@@ -0,0 +1 @@
+../plugins/kreuzcrawl/skills/headless-fallback
\ No newline at end of file
diff --git a/skills/kreuzberg b/skills/kreuzberg
new file mode 120000
index 0000000..c5dcf92
--- /dev/null
+++ b/skills/kreuzberg
@@ -0,0 +1 @@
+../plugins/kreuzberg/skills/kreuzberg
\ No newline at end of file
diff --git a/skills/kreuzberg-cloud b/skills/kreuzberg-cloud
new file mode 120000
index 0000000..6e4f116
--- /dev/null
+++ b/skills/kreuzberg-cloud
@@ -0,0 +1 @@
+../plugins/kreuzberg-cloud/skills/kreuzberg-cloud
\ No newline at end of file
diff --git a/skills/kreuzcrawl b/skills/kreuzcrawl
new file mode 120000
index 0000000..a602475
--- /dev/null
+++ b/skills/kreuzcrawl
@@ -0,0 +1 @@
+../plugins/kreuzcrawl/skills/kreuzcrawl
\ No newline at end of file
diff --git a/skills/managing-cloud-usage b/skills/managing-cloud-usage
new file mode 120000
index 0000000..2f2b717
--- /dev/null
+++ b/skills/managing-cloud-usage
@@ -0,0 +1 @@
+../plugins/kreuzberg-cloud/skills/managing-cloud-usage
\ No newline at end of file
diff --git a/skills/offloading-extraction b/skills/offloading-extraction
new file mode 120000
index 0000000..a897630
--- /dev/null
+++ b/skills/offloading-extraction
@@ -0,0 +1 @@
+../plugins/kreuzberg-cloud/skills/offloading-extraction
\ No newline at end of file
diff --git a/skills/picking-a-format b/skills/picking-a-format
new file mode 120000
index 0000000..db19732
--- /dev/null
+++ b/skills/picking-a-format
@@ -0,0 +1 @@
+../plugins/kreuzberg/skills/picking-a-format
\ No newline at end of file
diff --git a/skills/presigned-uploads b/skills/presigned-uploads
new file mode 120000
index 0000000..c5e787e
--- /dev/null
+++ b/skills/presigned-uploads
@@ -0,0 +1 @@
+../plugins/kreuzberg-cloud/skills/presigned-uploads
\ No newline at end of file
diff --git a/skills/sandbox-keys b/skills/sandbox-keys
new file mode 120000
index 0000000..a5bd475
--- /dev/null
+++ b/skills/sandbox-keys
@@ -0,0 +1 @@
+../plugins/kreuzberg-cloud/skills/sandbox-keys
\ No newline at end of file
diff --git a/skills/scraping-html-to-markdown b/skills/scraping-html-to-markdown
new file mode 120000
index 0000000..e233871
--- /dev/null
+++ b/skills/scraping-html-to-markdown
@@ -0,0 +1 @@
+../plugins/kreuzcrawl/skills/scraping-html-to-markdown
\ No newline at end of file
diff --git a/skills/tracking-cloud-jobs b/skills/tracking-cloud-jobs
new file mode 120000
index 0000000..617b9d2
--- /dev/null
+++ b/skills/tracking-cloud-jobs
@@ -0,0 +1 @@
+../plugins/kreuzberg-cloud/skills/tracking-cloud-jobs
\ No newline at end of file