From f391d9049069c4a158fe00e8eb30266014f06e17 Mon Sep 17 00:00:00 2001
From: lajeti <liberian.ajeti@gmail.com>
Date: Wed, 1 Jul 2026 01:17:48 +0200
Subject: [PATCH 1/2] feat(text): extract text from form XObjects
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Text extraction only processed a page's top-level content stream, so
pages that draw all their text inside form XObjects (common in
tax/accounting and reporting PDFs, e.g. IRS Form 8879-PE) extracted as
empty strings — indistinguishable from a scanned image.

TextExtractor now handles the `Do` operator: on a /Subtype /Form
XObject it recurses into the form's content stream using the form's
own /Resources, with the form's /Matrix concatenated onto the CTM.
Image XObjects resolve to null and are skipped.

- Add ResourceResolver/FormXObject abstraction (fonts + XObjects),
  scoped per content stream; PDFPage builds and memoizes resolvers by
  Resources-dict identity (matching _resourceCache/_annotationCache).
- TextState gains captureState/restoreState that snapshot the full
  state and graphics-stack depth, so unbalanced q/Q inside a form
  cannot corrupt the rest of the page (lenient malformed-PDF handling).
- Guard nested/cyclic forms with a depth cap.

Tests: unit coverage for nested extraction, form-scoped fonts, state
isolation, /Matrix application, cycle safety, and back-compat; an
integration fixture (form-xobject-text.pdf). The rtl-placed-ltr-text
fixture is regenerated to drop a redundant duplicate text layer that
conflicted with now-correct form recursion; its RTL content stream
(the test subject) is preserved byte-for-byte.

Plan: .agents/plans/046-form-xobject-text-extraction.md

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../plans/046-form-xobject-text-extraction.md | 197 ++++++++++++++++++
 fixtures/text/form-xobject-text.pdf           | Bin 0 -> 803 bytes
 fixtures/text/rtl-placed-ltr-text.pdf         | Bin 5879 -> 5443 bytes
 src/api/pdf-page.ts                           | 132 +++++++++++-
 src/integration/text/text-extraction.test.ts  |  14 ++
 src/text/index.ts                             |   7 +-
 src/text/text-extractor.test.ts               | 157 ++++++++++++++
 src/text/text-extractor.ts                    | 134 +++++++++++-
 src/text/text-state.ts                        |  68 ++++++
 9 files changed, 694 insertions(+), 15 deletions(-)
 create mode 100644 .agents/plans/046-form-xobject-text-extraction.md
 create mode 100644 fixtures/text/form-xobject-text.pdf
 create mode 100644 src/text/text-extractor.test.ts

diff --git a/.agents/plans/046-form-xobject-text-extraction.md b/.agents/plans/046-form-xobject-text-extraction.md
new file mode 100644
index 0000000..400a670
--- /dev/null
+++ b/.agents/plans/046-form-xobject-text-extraction.md
@@ -0,0 +1,197 @@
+# 046: Form XObject Text Extraction
+
+## Problem Statement
+
+Text extraction (plan [035](./035-text-extraction.md)) only processes a page's
+top-level content stream. Many real-world PDFs — especially those produced by
+tax/accounting software (e.g. IRS Form 8879-PE), reporting tools, and design
+tools — draw all or part of their text inside **form XObjects** that the page
+content stream merely paints with the `Do` operator:
+
+```
+q /Fm0 Do Q          % page content: paints form XObjects, no text operators
+```
+
+The text (`BT ... Tj ... ET`) lives inside `/Fm0`, which carries its **own**
+`/Resources/Font` dictionary. Because `TextExtractor` had no `Do` handler, these
+pages extracted as empty strings, which is indistinguishable from a scanned
+image to a caller — a correctness gap, not a layout nicety.
+
+This is a follow-up to plan 035 (Tier 3 text features per GOALS.md), closing the
+gap between "page has no top-level text operators" and "page has no text."
+
+## Scope
+
+### In Scope
+
+- Recurse into form XObjects (`Subtype /Form`) invoked via `Do`
+- Resolve fonts and nested XObjects against each form's own `/Resources`
+- Apply the form's `/Matrix` to nested text positions
+- Isolate the caller's graphics/text state across a form invocation, tolerating
+  malformed (unbalanced) `q`/`Q` inside the form
+- Guard against cyclic form references
+- Reuse the existing line-grouping, span, and search pipeline unchanged
+
+### Out of Scope
+
+- Image XObjects (resolve to `null`; no OCR — consistent with plan 035)
+- Tiling patterns and Type 3 font glyph procedures (separate content streams)
+- Annotation appearance streams (separate feature — plan 037)
+- Deduplicating overlapping visible + invisible ("ActualText"-style) text layers
+- Marked content / tagged-PDF logical structure
+
+## Dependencies
+
+- **Content stream parser** — `src/content/parsing/content-stream-parser.ts`
+  (reused as-is to parse form content)
+- **Font layer** — `src/fonts/` (`parseFont`, ToUnicode) — reused per form
+- **TextExtractor / TextState** — `src/text/` (extended, not replaced)
+- **COS accessors** — `PdfDict.getDict/getArray/getName`, `PdfStream.getDecodedData`
+
+No new external dependencies.
+
+## Desired API
+
+No public API change. The existing entry points transparently gain form
+coverage:
+
+```typescript
+const pdf = await PDF.load(bytes);
+const page = pdf.getPage(1);
+
+// Previously returned "" for form-drawn pages; now returns the real text.
+const { text, lines } = page.extractText();
+
+// findText (page- and document-wide) benefits automatically since it
+// delegates to extractText().
+const matches = page.findText(/\{\{\s*\w+\s*\}\}/g);
+```
+
+## Architecture
+
+### Components
+
+```
+PDFPage.extractText()
+        │
+        ├─► createResourceResolver(pageResources)  ──► ResourceResolver
+        │        ├─ createFontResolver   (per-Resources font cache)
+        │        └─ createXObjectResolver(per-Resources, lazy, memoized)
+        ▼
+TextExtractor (constructed with the page-level resolver)
+        │
+        ├─► ContentStreamParser (existing)
+        │
+        ├─► TextState (existing; + captureState/restoreState)
+        │
+        └─► on `Do`: runForm()
+                 ├─ snapshot state + push form /Matrix onto CTM
+                 ├─ swap active ResourceResolver to the form's
+                 ├─ recurse over the form's content (depth-guarded)
+                 └─ restore snapshot + resolver
+```
+
+### Key abstraction: `ResourceResolver`
+
+A form's resources are scoped to the form, so font/XObject lookup cannot be a
+single page-wide callback. `ResourceResolver` bundles the two lookups for one
+content stream:
+
+```typescript
+interface ResourceResolver {
+  resolveFont: (name: string) => PdfFont | null;
+  resolveXObject: (name: string) => FormXObject | null; // null for images
+}
+
+interface FormXObject {
+  bytes: Uint8Array; // decoded content
+  matrix?: readonly [number, number, number, number, number, number];
+  resources: ResourceResolver; // the form's own
+}
+```
+
+`TextExtractor` tracks the _active_ resolver and swaps it while inside a form.
+`PDFPage` builds resolvers from COS dictionaries and memoizes them by
+dictionary identity (`_resourceResolverCache`), matching the existing
+`_resourceCache` / `_annotationCache` pattern on the class.
+
+### State isolation
+
+Per PDF spec §8.10.1, painting a form behaves as if wrapped in `q`/`Q` with the
+form's `/Matrix` concatenated onto the CTM. `TextState` gains
+`captureState()` / `restoreState()` that snapshot the full text+graphics state
+_and the graphics-stack depth_, so a form with unbalanced `q`/`Q` (lenient
+handling per the project's malformed-PDF principle) cannot corrupt the rest of
+the page.
+
+### Cycle safety
+
+A `formDepth` counter in the extractor caps nesting at `MAX_FORM_DEPTH` (16), so
+a form that paints itself terminates instead of recursing forever. Identity
+memoization of resolvers keeps each nested level from rebuilding font caches.
+
+## Test Plan
+
+### Unit (`src/text/text-extractor.test.ts`)
+
+- Extract text nested one level inside a form invoked by `Do`
+- Unresolvable / image XObject (`Do` is a no-op)
+- Form uses its **own** font resources (prove via a font that shifts codes)
+- State isolation: form with stray `Q` operators leaves later page text intact
+- `/Matrix` translation offsets nested text position
+- Cyclic self-referential form terminates without throwing
+- No `resolveXObject` provided → `Do` ignored (back-compat)
+
+### Integration (`src/integration/text/text-extraction.test.ts`)
+
+- New fixture `fixtures/text/form-xobject-text.pdf`: a page whose only text is
+  drawn via a form XObject with its own font → `extractText().text` contains it
+
+### Regression
+
+- `rtl-placed-ltr-text` fixture regenerated to drop a redundant duplicate text
+  layer (a clean-LTR form copy that real design-tool exports don't carry and
+  that conflicted with now-correct form recursion); the RTL content stream —
+  the actual test subject — is preserved byte-for-byte
+
+### Full suite
+
+- `bun run test:run` (all files), `bun run typecheck`, `bun run lint` green
+
+## Open Questions
+
+1. **Overlapping visible + invisible text** — When a PDF carries both a visible
+   layer and an invisible logical-order layer for the same words, extraction now
+   surfaces both. Real-world dedup (by position + content) is deferred; it is a
+   broader feature than form recursion. _Current approach_: extract everything,
+   matching pdf.js behavior.
+
+2. **Render mode 3 (invisible) text** — Kept in output, as before, because it is
+   the canonical layer for searchable/scanned PDFs. Not changed here.
+
+## Risks
+
+- **Double-counting** in the rare visible+invisible duplicate-layer case (see
+  Open Question 1). Mitigated by it being uncommon in generated PDFs; flagged
+  for a future dedup pass.
+- **Performance** — Each distinct form's fonts are parsed once and memoized, so
+  a repeated `Do` skips re-resolution; its content is still re-parsed each time.
+
+## Implementation Phases
+
+### Phase 1: Resolver abstraction
+
+- Add `ResourceResolver` / `FormXObject` to `text-extractor.ts`
+- Refactor `PDFPage.createFontResolver` → `createResourceResolver` +
+  `createFontResolver(dict)` + `createXObjectResolver(dict)` + `readMatrix`
+
+### Phase 2: Extractor recursion
+
+- Track active resolver + `formDepth` in `TextExtractor`
+- Add `Do` handler → `runForm()` (snapshot, matrix, swap, recurse, restore)
+- Add `TextState.captureState` / `restoreState`
+
+### Phase 3: Tests & fixtures
+
+- Unit tests, integration fixture, regenerate `rtl-placed-ltr-text.pdf`
+- Verify full suite, typecheck, lint
diff --git a/fixtures/text/form-xobject-text.pdf b/fixtures/text/form-xobject-text.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..4a020df90ce10e72021222db8bea7513ec42df72
GIT binary patch
literal 803
zcmah{U2obz5PbKqm=~n>p}9MQ?IJ~yLV{H(sYt<6BB4GU_@G8`*V;!#`s=&v4<O+o
zmSy?oW_M;*+k3iCzo@GuP>^toO>lh;k-h#l5O><^ty@Dp>9wg45@Cv)o50wdiF|%r
z(D1}Vkyn`anf{ZyxV`mIXN7!oS;#V_A!?%|zMH(zBX_`D5<gZUl3_^wrlxZHa!H2R
z>C0rXF-u?nr#p#=RY{9qlD3{YRv5HsXPwWTgGXc6{ufm3u=1tRyWkH5H@kQE!6hx)
z=4OemxEWMuLd^CHUym+z<&JvLw`M4a(T7n|a@*ukh|goR4hJ#Mu=O&^sJ%isK(<0c
zvyl>6j!!Jv+R4npe@3>!IJ*qN0&hdq@}~ssn{DgLG&kSO_OJ29QbUaG(&dGv&3G>C
zt*we9aN3STX;y(m63F9&IEwlaRybip)VHuFEc$wZDMd5q=ZqH)F0sC(Snqg|q;=~s
lU+Q9O%6i9V#Xp0D_Q^S3&vv&fuS;KhkEINHz40Us{sZ4k*w6p~

literal 0
HcmV?d00001

diff --git a/fixtures/text/rtl-placed-ltr-text.pdf b/fixtures/text/rtl-placed-ltr-text.pdf
index ca421574c6c9286b0b9a5d515ae3ac5b714080f4..2c7790bd6c12b9256aa1aa13e111dbb11bfd98bf 100644
GIT binary patch
delta 201
zcmeyadss_AHNeG9*HF)dOZD)g!;23$PZT&ic?CZohpB>rLXd);-DE{ZWgb%tAd9QG
zq$o8pcXK==H{a&70&OgUW>7U;sd*{+Nm*QGlT$=@F`7@77mLU-GBsB)00D(O1uihd
zz}UhJL(J5`)Doetq$n{nC$)&n#zsLuII}8M!CXN<C_leM!4PIsaY<rPNkvg=8keb&
Nu_c$Ps;j>n7XV8>HE{p{

delta 578
zcmZ{i&q~8U5XLEZTERb5PemP+f(kX8G;LZaJ(RYmLd6Ct6c0(W)-6dkBpaw+3ci4p
zyn#MKZ{n>U^=UkKb<;LjEY4}znfboo&hG5|JGYa)tT%J=g^IGX=h@5bX|nTKM-Ye9
zWG;tFMI<s50mBtktIJZiZ4OC)QPs(aU(+BoY$*J8O}<Vtni@|8+$V;Eq?Nupn;xe&
zw{nS$)Co;~?~&NkcVZ8zR%0XRVIf$Q70}Ac&E(@~oM0?Tq!w`p{0<bXtvJ<m6eG(j
z=(k7{!zu>dLZ>a}69=dlgbrBDW<CUzgW(VkedY$@K7@z<8R$dQ(9xQTf?*uX7#8(t
z(5LPIh%E#HVhK4QGz=VO0Vj^f{#U^Z7V7q?MXgY5IOJd(reFgBGUp=T7z5WxNQi4K
zcNo2;52o_=<HoL*Y+4kVIYe6Yo_<Z0qVZn0m&bX)aexbGp5T00Dg8ZDmC{;YLDj??
fCbQf(s7-tn|A7vDAW(`9_cN$iD4=Y%(Qcw2<2s=J

diff --git a/src/api/pdf-page.ts b/src/api/pdf-page.ts
index 628c15c..40b64bd 100644
--- a/src/api/pdf-page.ts
+++ b/src/api/pdf-page.ts
@@ -117,6 +117,7 @@ import {
   showText,
 } from "#src/helpers/operators";
 import * as operatorHelpers from "#src/helpers/operators";
+import type { RefResolver } from "#src/helpers/types";
 import type { PDFImage } from "#src/images/pdf-image";
 import { PdfArray } from "#src/objects/pdf-array";
 import { PdfDict } from "#src/objects/pdf-dict";
@@ -126,7 +127,7 @@ import { PdfRef } from "#src/objects/pdf-ref";
 import { PdfStream } from "#src/objects/pdf-stream";
 import { PdfString } from "#src/objects/pdf-string";
 import { getPlainText, groupCharsIntoLines } from "#src/text/line-grouper";
-import { TextExtractor } from "#src/text/text-extractor";
+import { type FormXObject, type ResourceResolver, TextExtractor } from "#src/text/text-extractor";
 import { searchPage } from "#src/text/text-search";
 import type { ExtractTextOptions, FindTextOptions, PageText, TextMatch } from "#src/text/types";
 
@@ -2817,11 +2818,14 @@ export class PDFPage {
     // Get content stream bytes
     const contentBytes = this.getContentBytes();
 
-    // Create font resolver
-    const resolveFont = this.createFontResolver();
+    // Build a resource resolver for fonts and form XObjects
+    const resources = this.createResourceResolver(this.resolveInheritedResources());
 
     // Extract characters
-    const extractor = new TextExtractor({ resolveFont });
+    const extractor = new TextExtractor({
+      resolveFont: resources.resolveFont,
+      resolveXObject: resources.resolveXObject,
+    });
     const chars = extractor.extract(contentBytes);
 
     // Group into lines and spans
@@ -2959,16 +2963,45 @@ export class PDFPage {
   }
 
   /**
-   * Create a font resolver function for text extraction.
+   * Memoized resource resolvers, keyed by Resources dictionary identity.
+   * Shared across nested form XObjects to avoid rebuilding font caches and to
+   * break cyclic XObject references.
    */
-  private createFontResolver(): (name: string) => PdfFont | null {
-    // Get the page's Font resources (may be a ref or inherited from parent)
-    const resourcesDict = this.resolveInheritedResources();
+  private readonly _resourceResolverCache = new Map<PdfDict, ResourceResolver>();
 
+  /**
+   * Build a resource resolver (fonts + form XObjects) for a Resources dict.
+   *
+   * Form XObjects carry their own Resources, so resolvers are scoped per
+   * Resources dictionary. Resolvers are memoized by dictionary identity, both
+   * to avoid rebuilding font caches for repeated XObjects and so that cyclic
+   * resource references resolve to the same instance. (The XObject resolver
+   * recurses lazily, so building one resolver never builds another.)
+   */
+  private createResourceResolver(resourcesDict: PdfDict | null): ResourceResolver {
     if (!resourcesDict) {
-      return () => null;
+      return { resolveFont: () => null, resolveXObject: () => null };
     }
 
+    const cached = this._resourceResolverCache.get(resourcesDict);
+
+    if (cached) {
+      return cached;
+    }
+
+    const resolver: ResourceResolver = {
+      resolveFont: this.createFontResolver(resourcesDict),
+      resolveXObject: this.createXObjectResolver(resourcesDict),
+    };
+    this._resourceResolverCache.set(resourcesDict, resolver);
+
+    return resolver;
+  }
+
+  /**
+   * Create a font resolver function for a given Resources dictionary.
+   */
+  private createFontResolver(resourcesDict: PdfDict): (name: string) => PdfFont | null {
     const font = resourcesDict.getDict("Font", this.ctx.resolve.bind(this.ctx));
 
     if (!font) {
@@ -3015,4 +3048,85 @@ export class PDFPage {
       return fontCache.get(name) ?? null;
     };
   }
+
+  /**
+   * Create a form-XObject resolver for a Resources dictionary's /XObject map.
+   *
+   * Only stream entries with Subtype /Form produce a FormXObject; anything else
+   * resolves to null. Each name is resolved once and the result is cached.
+   */
+  private createXObjectResolver(resourcesDict: PdfDict): (name: string) => FormXObject | null {
+    const resolve = this.ctx.resolve.bind(this.ctx);
+    const xobjects = resourcesDict.getDict("XObject", resolve);
+
+    if (!xobjects) {
+      return () => null;
+    }
+
+    const cache = new Map<string, FormXObject | null>();
+
+    return (name: string): FormXObject | null => {
+      const existing = cache.get(name);
+
+      if (existing !== undefined) {
+        return existing;
+      }
+
+      let result: FormXObject | null = null;
+      const entry = xobjects.get(name, resolve);
+
+      if (entry instanceof PdfStream && entry.getName("Subtype", resolve)?.value === "Form") {
+        let bytes: Uint8Array;
+
+        try {
+          bytes = entry.getDecodedData();
+        } catch {
+          // Decoding failed: use empty content so extraction continues.
+          bytes = new Uint8Array(0);
+        }
+
+        // Prefer the form's own Resources; when the form has none, reuse the
+        // dictionary this resolver was built from (lenient handling).
+        const formResources = entry.getDict("Resources", resolve) ?? resourcesDict;
+
+        result = {
+          bytes,
+          matrix: this.readMatrix(entry, resolve),
+          resources: this.createResourceResolver(formResources),
+        };
+      }
+
+      cache.set(name, result);
+
+      return result;
+    };
+  }
+
+  /**
+   * Read /Matrix as six numbers; undefined if absent, mis-sized or non-numeric.
+   */
+  private readMatrix(
+    dict: PdfDict,
+    resolve: RefResolver,
+  ): [number, number, number, number, number, number] | undefined {
+    const array = dict.getArray("Matrix", resolve);
+
+    if (!array || array.length !== 6) {
+      return undefined;
+    }
+
+    const values: number[] = [];
+
+    for (let i = 0; i < 6; i++) {
+      const value = array.at(i, resolve);
+
+      if (value?.type !== "number") {
+        return undefined;
+      }
+
+      values.push(value.value);
+    }
+
+    return [values[0], values[1], values[2], values[3], values[4], values[5]];
+  }
 }
diff --git a/src/integration/text/text-extraction.test.ts b/src/integration/text/text-extraction.test.ts
index 65ff72b..6b68fdd 100644
--- a/src/integration/text/text-extraction.test.ts
+++ b/src/integration/text/text-extraction.test.ts
@@ -70,6 +70,20 @@ describe("Text Extraction Integration", () => {
     });
   });
 
+  describe("form XObjects", () => {
+    it("extracts text nested inside a form XObject", async () => {
+      // The page draws all of its text via a form XObject (/Fm0 Do) that
+      // carries its own font resources, so extraction must recurse into it.
+      const bytes = await loadFixture("text", "form-xobject-text.pdf");
+      const pdf = await PDF.load(bytes);
+      const page = pdf.getPage(0);
+
+      const pageText = page!.extractText();
+
+      expect(pageText.text).toContain("FormXObjectText");
+    });
+  });
+
   describe("document-wide extractText", () => {
     it("extracts text from all pages", async () => {
       const bytes = await loadFixture("text", "openoffice-test-document.pdf");
diff --git a/src/text/index.ts b/src/text/index.ts
index e7f2703..4ae390b 100644
--- a/src/text/index.ts
+++ b/src/text/index.ts
@@ -6,7 +6,12 @@
  */
 
 export { getPlainText, groupCharsIntoLines, type LineGrouperOptions } from "./line-grouper";
-export { TextExtractor, type TextExtractorOptions } from "./text-extractor";
+export {
+  type FormXObject,
+  type ResourceResolver,
+  TextExtractor,
+  type TextExtractorOptions,
+} from "./text-extractor";
 export { searchPage, searchPages } from "./text-search";
 export { TextState } from "./text-state";
 export * from "./types";
diff --git a/src/text/text-extractor.test.ts b/src/text/text-extractor.test.ts
new file mode 100644
index 0000000..6bbb670
--- /dev/null
+++ b/src/text/text-extractor.test.ts
@@ -0,0 +1,157 @@
+import type { PdfFont } from "#src/fonts/pdf-font";
+import { describe, expect, it } from "vitest";
+
+import { type FormXObject, type ResourceResolver, TextExtractor } from "./text-extractor";
+
+/**
+ * Minimal font stub: each byte code maps to its ASCII character and a fixed
+ * advance width. Enough to exercise the extractor's text-showing path.
+ */
+const stubFont = {
+  subtype: "Type1",
+  baseFontName: "StubFont",
+  descriptor: null,
+  getWidth: () => 500,
+  toUnicode: (code: number) => String.fromCharCode(code),
+} as unknown as PdfFont;
+
+function bytes(content: string): Uint8Array {
+  return new TextEncoder().encode(content);
+}
+
+function fontOnly(): ResourceResolver {
+  return { resolveFont: () => stubFont, resolveXObject: () => null };
+}
+
+function extract(content: string, options: Partial<ResourceResolver> = {}): string {
+  const extractor = new TextExtractor({
+    resolveFont: options.resolveFont ?? (() => stubFont),
+    resolveXObject: options.resolveXObject ?? (() => null),
+  });
+
+  return extractor
+    .extract(bytes(content))
+    .map(c => c.char)
+    .join("");
+}
+
+describe("TextExtractor form XObjects", () => {
+  it("extracts text nested inside a form XObject invoked with Do", () => {
+    const form: FormXObject = {
+      bytes: bytes("BT /F1 12 Tf (Hello) Tj ET"),
+      resources: fontOnly(),
+    };
+
+    const text = extract("/Fm0 Do", {
+      resolveXObject: name => (name === "Fm0" ? form : null),
+    });
+
+    expect(text).toBe("Hello");
+  });
+
+  it("ignores Do when the XObject cannot be resolved (e.g. an image)", () => {
+    const text = extract("/Im0 Do", { resolveXObject: () => null });
+
+    expect(text).toBe("");
+  });
+
+  it("uses the form's own resources to resolve fonts", () => {
+    const formFont = {
+      subtype: "Type1",
+      baseFontName: "FormFont",
+      descriptor: null,
+      getWidth: () => 500,
+      // Shift every code by one so we can prove the form's font was used.
+      toUnicode: (code: number) => String.fromCharCode(code + 1),
+    } as unknown as PdfFont;
+
+    const form: FormXObject = {
+      bytes: bytes("BT /FF 12 Tf (AB) Tj ET"),
+      resources: { resolveFont: () => formFont, resolveXObject: () => null },
+    };
+
+    const extractor = new TextExtractor({
+      resolveFont: () => stubFont,
+      resolveXObject: name => (name === "Fm0" ? form : null),
+    });
+
+    const text = extractor
+      .extract(bytes("/Fm0 Do"))
+      .map(c => c.char)
+      .join("");
+
+    // "AB" shifted by the form's font becomes "BC".
+    expect(text).toBe("BC");
+  });
+
+  it("isolates the caller's state from imbalanced q/Q inside the form", () => {
+    // The form pops more graphics states than it pushes; this must not corrupt
+    // the text drawn on the page after the form returns.
+    const form: FormXObject = {
+      bytes: bytes("Q Q Q BT /F1 12 Tf (X) Tj ET"),
+      resources: fontOnly(),
+    };
+
+    const extractor = new TextExtractor({
+      resolveFont: () => stubFont,
+      resolveXObject: () => form,
+    });
+
+    const chars = extractor.extract(
+      bytes("q BT /F1 12 Tf (A) Tj ET Q /Fm0 Do q BT /F1 12 Tf (B) Tj ET Q"),
+    );
+
+    expect(chars.map(c => c.char).join("")).toBe("AXB");
+    // "A" and "B" are drawn identically before and after the form, so state
+    // leaked by the form's stray Q operators would move "B" away from "A".
+    const b = chars[chars.length - 1];
+    expect(b.bbox.x).toBeCloseTo(chars[0].bbox.x);
+    expect(b.bbox.y).toBeCloseTo(chars[0].bbox.y);
+  });
+
+  it("applies the form /Matrix to nested text position", () => {
+    const form: FormXObject = {
+      bytes: bytes("BT /F1 10 Tf (A) Tj ET"),
+      matrix: [1, 0, 0, 1, 100, 200],
+      resources: fontOnly(),
+    };
+
+    const extractor = new TextExtractor({
+      resolveFont: () => stubFont,
+      resolveXObject: () => form,
+    });
+
+    const withMatrix = extractor.extract(bytes("/Fm0 Do"));
+
+    const plain = new TextExtractor({
+      resolveFont: () => stubFont,
+      resolveXObject: () => ({ ...form, matrix: undefined }),
+    }).extract(bytes("/Fm0 Do"));
+
+    expect(withMatrix[0].bbox.x).toBeCloseTo(plain[0].bbox.x + 100);
+    expect(withMatrix[0].bbox.y).toBeCloseTo(plain[0].bbox.y + 200);
+  });
+
+  it("stops recursing on cyclic form references without throwing", () => {
+    // A form that paints itself would recurse forever without a depth guard.
+    const self: FormXObject = {
+      bytes: bytes("BT /F1 12 Tf (Z) Tj ET /Fm0 Do"),
+      resources: fontOnly(),
+    };
+
+    const extractor = new TextExtractor({
+      resolveFont: () => stubFont,
+      resolveXObject: () => self,
+    });
+
+    expect(() => extractor.extract(bytes("/Fm0 Do"))).not.toThrow();
+  });
+
+  it("ignores Do when no XObject resolver is provided", () => {
+    const extractor = new TextExtractor({ resolveFont: () => stubFont });
+
+    const chars = extractor.extract(bytes("/Fm0 Do"));
+
+    expect(chars).toEqual([]);
+  });
+});
diff --git a/src/text/text-extractor.ts b/src/text/text-extractor.ts
index 4a9c41f..bfad415 100644
--- a/src/text/text-extractor.ts
+++ b/src/text/text-extractor.ts
@@ -16,6 +16,41 @@ import type { PdfFont } from "#src/fonts/pdf-font";
 import { TextState } from "./text-state";
 import type { ExtractedChar } from "./types";
 
+/** Maximum form XObject nesting depth, to guard against cyclic references. */
+const MAX_FORM_DEPTH = 16;
+
+/**
+ * Resolves the named resources of a content stream (fonts and XObjects).
+ *
+ * Each form XObject carries its own resource dictionary, so a resolver is
+ * scoped to a single content stream.
+ */
+export interface ResourceResolver {
+  /**
+   * Resolve a font name to a PdfFont object.
+   * Font names are keys in the /Resources/Font dictionary (e.g., "F1", "TT0").
+   */
+  resolveFont: (name: string) => PdfFont | null;
+
+  /**
+   * Resolve an XObject name (key in /Resources/XObject) to a form XObject.
+   * Returns null for image XObjects or names that cannot be resolved.
+   */
+  resolveXObject: (name: string) => FormXObject | null;
+}
+
+/**
+ * A form XObject whose content stream should be processed inline.
+ */
+export interface FormXObject {
+  /** Decoded content stream bytes of the form. */
+  bytes: Uint8Array;
+  /** Optional /Matrix mapping form space into the current coordinate space. */
+  matrix?: readonly [number, number, number, number, number, number];
+  /** Resources scoped to the form's own content stream. */
+  resources: ResourceResolver;
+}
+
 /**
  * Options for text extraction.
  */
@@ -25,18 +60,32 @@ export interface TextExtractorOptions {
    * Font names are keys in the /Resources/Font dictionary (e.g., "F1", "TT0").
    */
   resolveFont: (name: string) => PdfFont | null;
+
+  /**
+   * Resolve an XObject name to a form XObject so its text can be extracted.
+   * Optional — when omitted, `Do` operators are ignored.
+   */
+  resolveXObject?: (name: string) => FormXObject | null;
 }
 
 /**
  * Extracts text from PDF content streams.
  */
 export class TextExtractor {
-  private readonly resolveFont: (name: string) => PdfFont | null;
   private readonly state: TextState;
   private readonly chars: ExtractedChar[] = [];
 
+  /** Resources for the content stream currently being processed. */
+  private resources: ResourceResolver;
+
+  /** Current form XObject nesting depth. */
+  private formDepth = 0;
+
   constructor(options: TextExtractorOptions) {
-    this.resolveFont = options.resolveFont;
+    this.resources = {
+      resolveFont: options.resolveFont,
+      resolveXObject: options.resolveXObject ?? (() => null),
+    };
     this.state = new TextState();
   }
 
@@ -47,14 +96,21 @@ export class TextExtractor {
    * @returns Array of extracted characters with positions
    */
   extract(contentBytes: Uint8Array): ExtractedChar[] {
+    this.runContent(contentBytes);
+
+    return this.chars;
+  }
+
+  /**
+   * Parse and process a content stream's operations with the active resources.
+   */
+  private runContent(contentBytes: Uint8Array): void {
     const parser = new ContentStreamParser(contentBytes);
     const { operations } = parser.parse();
 
     for (const op of operations) {
       this.processOperation(op);
     }
-
-    return this.chars;
   }
 
   /**
@@ -169,6 +225,74 @@ export class TextExtractor {
         this.state.moveToNextLine();
         this.handleTj([operands[2]]);
         break;
+
+      // XObject invocation
+      case "Do":
+        this.handleDo(operands);
+        break;
+    }
+  }
+
+  /**
+   * Handle the Do (paint XObject) operator.
+   *
+   * The first operand names the XObject, which is looked up in the resources of
+   * the stream currently being processed. A missing name or a null lookup (e.g.
+   * an image) makes this a no-op; otherwise the form's content is run inline.
+   */
+  private handleDo(operands: ContentToken[]): void {
+    const name = this.getName(operands[0]);
+
+    if (!name) {
+      return;
+    }
+
+    const form = this.resources.resolveXObject(name);
+
+    if (!form) {
+      return;
+    }
+
+    this.runForm(form);
+  }
+
+  /**
+   * Process a form XObject's content stream inline.
+   *
+   * Per the PDF spec (8.10.1), invoking a form is equivalent to wrapping its
+   * content in q/Q with the form's /Matrix concatenated onto the CTM. State and
+   * resources are snapshotted first and restored in `finally`, even on a throw.
+   * Forms nested deeper than MAX_FORM_DEPTH are skipped. NOTE(review): the cap
+   * bounds depth, not fan-out; a form painting itself k times runs up to k^16.
+   */
+  private runForm(form: FormXObject): void {
+    if (this.formDepth >= MAX_FORM_DEPTH) {
+      return;
+    }
+
+    this.formDepth += 1;
+
+    const snapshot = this.state.captureState();
+    const previousResources = this.resources;
+
+    if (form.matrix) {
+      this.state.concatMatrix(
+        form.matrix[0],
+        form.matrix[1],
+        form.matrix[2],
+        form.matrix[3],
+        form.matrix[4],
+        form.matrix[5],
+      );
+    }
+
+    this.resources = form.resources;
+
+    try {
+      this.runContent(form.bytes);
+    } finally {
+      this.resources = previousResources;
+      this.state.restoreState(snapshot);
+      this.formDepth -= 1;
+    }
+  }
 
@@ -194,7 +318,7 @@ export class TextExtractor {
     const fontSize = this.getNumber(operands[1]);
 
     if (fontName) {
-      const font = this.resolveFont(fontName);
+      const font = this.resources.resolveFont(fontName);
       this.state.font = font;
     }
 
diff --git a/src/text/text-state.ts b/src/text/text-state.ts
index 82f4911..5bcfba6 100644
--- a/src/text/text-state.ts
+++ b/src/text/text-state.ts
@@ -37,6 +37,26 @@ export interface TextStateParams {
   renderMode: number;
 }
 
+/**
+ * Snapshot of the current matrices and text parameters, used to isolate nested
+ * content (e.g. form XObjects) from the caller. Only the depth of the graphics
+ * stack is recorded, not its entries.
+ */
+export interface StateSnapshot {
+  ctm: Matrix;
+  tm: Matrix;
+  tlm: Matrix;
+  font: PdfFont | null;
+  fontSize: number;
+  charSpacing: number;
+  wordSpacing: number;
+  horizontalScale: number;
+  leading: number;
+  rise: number;
+  renderMode: number;
+  graphicsStackDepth: number;
+}
+
 /**
  * Tracks all text rendering state during content stream processing.
  */
@@ -332,6 +352,54 @@ export class TextState {
     };
   }
 
+  /**
+   * Capture a snapshot of the current rendering state.
+   *
+   * The three matrices are cloned, while the font reference and the scalar
+   * text parameters are copied as-is. The graphics-state stack is recorded by
+   * its length only, so that restoreState can truncate it back to that depth.
+   */
+  captureState(): StateSnapshot {
+    return {
+      ctm: this.ctm.clone(),
+      tm: this.tm.clone(),
+      tlm: this.tlm.clone(),
+      font: this.font,
+      fontSize: this.fontSize,
+      charSpacing: this.charSpacing,
+      wordSpacing: this.wordSpacing,
+      horizontalScale: this.horizontalScale,
+      leading: this.leading,
+      rise: this.rise,
+      renderMode: this.renderMode,
+      graphicsStackDepth: this.graphicsStateStack.length,
+    };
+  }
+
+  /**
+   * Restore a snapshot captured by {@link captureState}.
+   *
+   * Entries pushed since the snapshot (extra q / missing Q) are truncated away.
+   * NOTE(review): entries popped below the snapshot depth are not re-created.
+   */
+  restoreState(snapshot: StateSnapshot): void {
+    this.ctm = snapshot.ctm;
+    this.tm = snapshot.tm;
+    this.tlm = snapshot.tlm;
+    this.font = snapshot.font;
+    this.fontSize = snapshot.fontSize;
+    this.charSpacing = snapshot.charSpacing;
+    this.wordSpacing = snapshot.wordSpacing;
+    this.horizontalScale = snapshot.horizontalScale;
+    this.leading = snapshot.leading;
+    this.rise = snapshot.rise;
+    this.renderMode = snapshot.renderMode;
+
+    if (this.graphicsStateStack.length > snapshot.graphicsStackDepth) {
+      this.graphicsStateStack.length = snapshot.graphicsStackDepth;
+    }
+  }
+
   /**
    * Clone the current text state.
    */

From b681eb23fdc78a3f916799a20f9f9fefd44ec63b Mon Sep 17 00:00:00 2001
From: lajeti <liberian.ajeti@gmail.com>
Date: Fri, 3 Jul 2026 13:14:02 +0200
Subject: [PATCH 2/2] release: v0.4.2

---
 package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/package.json b/package.json
index de8917f..10cbe49 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@libpdf/core",
-  "version": "0.4.0",
+  "version": "0.4.2",
   "description": "A modern PDF library for TypeScript - parsing and generation",
   "keywords": [
     "digital-signature",