From 639498a4305d631d1b864c3d61f88a610a5493d3 Mon Sep 17 00:00:00 2001 From: Lucas Smith Date: Thu, 2 Jul 2026 14:21:53 +1000 Subject: [PATCH] fix: handle malformed objects more gracefully --- src/parser/document-parser.test.ts | 110 +++++++++++++++++++++- src/parser/document-parser.ts | 104 ++++++++++++-------- src/parser/indirect-object-parser.test.ts | 98 ++++++++++++++++++- src/parser/indirect-object-parser.ts | 85 +++++++++++++++-- src/parser/object-parser.test.ts | 60 ++++++++++++ src/parser/object-parser.ts | 6 ++ src/parser/object-stream-parser.test.ts | 31 ++++++ src/parser/object-stream-parser.ts | 24 ++++- 8 files changed, 464 insertions(+), 54 deletions(-) diff --git a/src/parser/document-parser.test.ts b/src/parser/document-parser.test.ts index b86d276..605a721 100644 --- a/src/parser/document-parser.test.ts +++ b/src/parser/document-parser.test.ts @@ -16,6 +16,7 @@ function createMinimalPdf(options: { garbageBeforeHeader?: string; objects?: Array<{ objNum: number; content: string }>; xrefEntries?: Array<{ objNum: number; offset: number; gen?: number; free?: boolean }>; + extraXrefEntries?: Array<{ objNum: number; offset: number; gen?: number; free?: boolean }>; trailer?: Record; }): Uint8Array { const parts: string[] = []; @@ -30,9 +31,12 @@ function createMinimalPdf(options: { parts.push(`%PDF-${version}\n`); parts.push("%\x80\x81\x82\x83\n"); // Binary marker - // Track offsets for xref + // Track offsets for xref (byte lengths, not string lengths — the binary + // marker encodes to multiple UTF-8 bytes per char) + const byteLength = (s: string): number => new TextEncoder().encode(s).length; + const offsets: Array<{ objNum: number; offset: number; gen: number; free: boolean }> = []; - let currentOffset = parts.join("").length; + let currentOffset = byteLength(parts.join("")); // Objects const objects = options.objects ?? [ @@ -44,7 +48,7 @@ function createMinimalPdf(options: { offsets.push({ objNum: obj.objNum, offset: currentOffset, gen: 0, free: false }); const objStr = `${obj.objNum} 0 obj\n${obj.content}\nendobj\n`; parts.push(objStr); - currentOffset += objStr.length; + currentOffset += byteLength(objStr); } // Use provided xref entries or build from objects @@ -53,6 +57,10 @@ function createMinimalPdf(options: { ...offsets, ]; + if (options.extraXrefEntries) { + xrefEntries.push(...options.extraXrefEntries); + } + // XRef table const xrefOffset = currentOffset; parts.push("xref\n"); @@ -779,6 +787,102 @@ describe("DocumentParser", () => { }); }); + describe("malformed object recovery", () => { + it("recovers dict with missing value (lenient default)", () => { + const bytes = createMinimalPdf({ + objects: [ + { objNum: 1, content: "<< /Type /Catalog /Pages 2 0 R >>" }, + { objNum: 2, content: "<< /Type /Pages /Kids [] /Count 0 >>" }, + { objNum: 3, content: "<< /S /GoTo /D >>" }, + ], + }); + const scanner = new Scanner(bytes); + const doc = new DocumentParser(scanner).parse(); + + const obj = doc.getObject(PdfRef.of(3, 0)); + + expect(obj).toBeInstanceOf(PdfDict); + expect((obj as PdfDict).getName("S")?.value).toBe("GoTo"); + expect((obj as PdfDict).has("D")).toBe(false); + expect(doc.warnings.some(w => w.includes("Missing value for key D"))).toBe(true); + }); + + it("throws on dict with missing value in strict mode", () => { + const bytes = createMinimalPdf({ + objects: [ + { objNum: 1, content: "<< /Type /Catalog /Pages 2 0 R >>" }, + { objNum: 2, content: "<< /Type /Pages /Kids [] /Count 0 >>" }, + { objNum: 3, content: "<< /S /GoTo /D >>" }, + ], + }); + const scanner = new Scanner(bytes); + const doc = new DocumentParser(scanner, { lenient: false }).parse(); + + expect(() => doc.getObject(PdfRef.of(3, 0))).toThrow("Missing value for key D"); + }); + + it("recovers stream with wrong /Length via endstream scan", () => { + const bytes = createMinimalPdf({ + objects: [ + { objNum: 1, content: "<< /Type /Catalog /Pages 2 0 R >>" }, + { objNum: 2, content: "<< /Type /Pages /Kids [] /Count 0 >>" }, + { objNum: 3, content: "<< /Length 999 >>\nstream\nHello\nendstream" }, + ], + }); + const scanner = new Scanner(bytes); + const doc = new DocumentParser(scanner).parse(); + + const obj = doc.getObject(PdfRef.of(3, 0)); + + expect(obj).toBeInstanceOf(PdfStream); + expect(new TextDecoder().decode((obj as PdfStream).data)).toBe("Hello"); + expect(doc.warnings.some(w => w.includes("/Length 999"))).toBe(true); + }); + + it("returns null for unparseable object instead of throwing (lenient)", () => { + const bytes = createMinimalPdf({ + // Object 3 points at garbage (way past EOF) + extraXrefEntries: [{ objNum: 3, offset: 999999 }], + }); + const scanner = new Scanner(bytes); + const doc = new DocumentParser(scanner).parse(); + + const obj = doc.getObject(PdfRef.of(3, 0)); + + expect(obj).toBeNull(); + expect(doc.warnings.some(w => w.includes("Failed to parse object 3 0"))).toBe(true); + + // Rest of the document still works + expect(doc.getCatalog()).not.toBeNull(); + expect(doc.getPageCount()).toBe(0); + }); + + it("caches parse failures and does not re-warn on repeated lookups", () => { + const bytes = createMinimalPdf({ + extraXrefEntries: [{ objNum: 3, offset: 999999 }], + }); + const scanner = new Scanner(bytes); + const doc = new DocumentParser(scanner).parse(); + + expect(doc.getObject(PdfRef.of(3, 0))).toBeNull(); + + const warningCount = doc.warnings.length; + + expect(doc.getObject(PdfRef.of(3, 0))).toBeNull(); + expect(doc.warnings.length).toBe(warningCount); + }); + + it("throws for unparseable object in strict mode", () => { + const bytes = createMinimalPdf({ + extraXrefEntries: [{ objNum: 3, offset: 999999 }], + }); + const scanner = new Scanner(bytes); + const doc = new DocumentParser(scanner, { lenient: false }).parse(); + + expect(() => doc.getObject(PdfRef.of(3, 0))).toThrow(); + }); + }); + describe("error handling", () => { it("throws in strict mode for invalid xref", async () => { const malformedPdf = new TextEncoder().encode( diff --git a/src/parser/document-parser.ts b/src/parser/document-parser.ts index a67111b..9027a4d 100644 --- a/src/parser/document-parser.ts +++ b/src/parser/document-parser.ts @@ -429,8 +429,17 @@ export class DocumentParser { trailer: PdfDict, recoveredViaBruteForce: boolean, ): ParsedDocument { - // Object cache: "objNum genNum" -> PdfObject - const cache = new Map(); + // Object cache: "objNum genNum" -> PdfObject (null = known-unparseable) + const cache = new Map(); + + const lenient = this.options.lenient ?? true; + + const recoveryOptions = { + recoveryMode: lenient, + onWarning: (message: string, position: number) => { + this.warnings.push(`Object parse warning at offset ${position}: ${message}`); + }, + }; // Object stream cache: streamObjNum -> ObjectStreamParser const objectStreamCache = new Map(); @@ -644,8 +653,7 @@ export class DocumentParser { // Check cache if (cache.has(key)) { - // biome-ignore lint/style/noNonNullAssertion: checked with .has(...) - return cache.get(key)!; + return cache.get(key) ?? null; } // Look up in xref @@ -657,59 +665,77 @@ export class DocumentParser { let obj: PdfObject | null = null; - switch (entry.type) { - case "free": - return null; + try { + switch (entry.type) { + case "free": + return null; - case "uncompressed": { - const parser = new IndirectObjectParser(this.scanner, lengthResolver); + case "uncompressed": { + const parser = new IndirectObjectParser(this.scanner, lengthResolver, recoveryOptions); - const result = parser.parseObjectAt(entry.offset); + const result = parser.parseObjectAt(entry.offset); - // Verify generation matches - if (result.genNum !== ref.generation) { - this.warnings.push( - `Generation mismatch for object ${ref.objectNumber}: expected ${ref.generation}, got ${result.genNum}`, - ); - } + // Verify generation matches + if (result.genNum !== ref.generation) { + this.warnings.push( + `Generation mismatch for object ${ref.objectNumber}: expected ${ref.generation}, got ${result.genNum}`, + ); + } + + obj = result.value; - obj = result.value; + // Decrypt the object + if (securityHandler?.isAuthenticated) { + obj = decryptObject(obj, ref.objectNumber, ref.generation); + } - // Decrypt the object - if (securityHandler?.isAuthenticated) { - obj = decryptObject(obj, ref.objectNumber, ref.generation); + break; } - break; - } + case "compressed": { + // Get or create object stream parser + let streamParser = objectStreamCache.get(entry.streamObjNum); - case "compressed": { - // Get or create object stream parser - let streamParser = objectStreamCache.get(entry.streamObjNum); + if (!streamParser) { + // Load the object stream + const streamRef = PdfRef.of(entry.streamObjNum, 0); + const streamObj = getObject(streamRef); - if (!streamParser) { - // Load the object stream - const streamRef = PdfRef.of(entry.streamObjNum, 0); - const streamObj = getObject(streamRef); + if (!streamObj || !(streamObj instanceof PdfStream)) { + this.warnings.push(`Object stream ${entry.streamObjNum} not found or invalid`); - if (!streamObj || !(streamObj instanceof PdfStream)) { - this.warnings.push(`Object stream ${entry.streamObjNum} not found or invalid`); + return null; + } - return null; + streamParser = new ObjectStreamParser(streamObj, recoveryOptions); + + objectStreamCache.set(entry.streamObjNum, streamParser); } - streamParser = new ObjectStreamParser(streamObj); + obj = streamParser.getObject(entry.indexInStream); + + // Objects in object streams don't need individual decryption + // because the stream itself was decrypted - objectStreamCache.set(entry.streamObjNum, streamParser); + break; } + } + } catch (error) { + // Error boundary: a single malformed object must not crash reads + // or saves. In lenient mode, record a warning and treat the object + // as missing (like pdf.js and PDFBox). + if (!lenient) { + throw error; + } - obj = streamParser.getObject(entry.indexInStream); + const message = error instanceof Error ? error.message : String(error); - // Objects in object streams don't need individual decryption - // because the stream itself was decrypted + this.warnings.push(`Failed to parse object ${key}: ${message}`); - break; - } + // Cache the failure so repeated lookups don't re-parse and re-warn + cache.set(key, null); + + return null; } // Cache the result diff --git a/src/parser/indirect-object-parser.test.ts b/src/parser/indirect-object-parser.test.ts index 6b1fc41..f21cd4d 100644 --- a/src/parser/indirect-object-parser.test.ts +++ b/src/parser/indirect-object-parser.test.ts @@ -8,7 +8,7 @@ import { PdfStream } from "#src/objects/pdf-stream"; import { PdfString } from "#src/objects/pdf-string"; import { describe, expect, it } from "vitest"; -import { IndirectObjectParser } from "./indirect-object-parser"; +import { IndirectObjectParser, type IndirectObjectParserOptions } from "./indirect-object-parser"; /** * Helper to create an IndirectObjectParser from a string. @@ -16,11 +16,12 @@ import { IndirectObjectParser } from "./indirect-object-parser"; function parser( input: string, lengthResolver?: (ref: PdfRef) => number | null, + options?: IndirectObjectParserOptions, ): IndirectObjectParser { const bytes = new TextEncoder().encode(input); const scanner = new Scanner(bytes); - return new IndirectObjectParser(scanner, lengthResolver); + return new IndirectObjectParser(scanner, lengthResolver, options); } describe("IndirectObjectParser", () => { @@ -317,6 +318,99 @@ endobj`); expect(() => p.parseObject()).toThrow(/obj/i); }); + it("recovers stream with /Length too small via endstream scan", () => { + const warnings: string[] = []; + const p = parser( + `1 0 obj +<< /Length 3 >> +stream +Hello World +endstream +endobj`, + undefined, + { onWarning: msg => warnings.push(msg) }, + ); + const result = p.parseObject(); + + const stream = result.value as PdfStream; + expect(new TextDecoder().decode(stream.data)).toBe("Hello World"); + expect(warnings.length).toBe(1); + expect(warnings[0]).toContain("/Length 3"); + }); + + it("recovers stream with /Length too large via endstream scan", () => { + const warnings: string[] = []; + const p = parser( + `1 0 obj +<< /Length 9999 >> +stream +Hello +endstream +endobj`, + undefined, + { onWarning: msg => warnings.push(msg) }, + ); + const result = p.parseObject(); + + const stream = result.value as PdfStream; + expect(new TextDecoder().decode(stream.data)).toBe("Hello"); + expect(warnings.length).toBe(1); + }); + + it("accepts correct /Length without warnings", () => { + const warnings: string[] = []; + const p = parser( + `1 0 obj +<< /Length 5 >> +stream +Hello +endstream +endobj`, + undefined, + { onWarning: msg => warnings.push(msg) }, + ); + const result = p.parseObject(); + + const stream = result.value as PdfStream; + expect(new TextDecoder().decode(stream.data)).toBe("Hello"); + expect(warnings.length).toBe(0); + }); + + it("throws when stream has no endstream at all", () => { + const p = parser(`1 0 obj +<< /Length 9999 >> +stream +Hello and then the file just ends`); + + expect(() => p.parseObject()).toThrow(/no endstream found/); + }); + + it("recovers dict with missing value in recovery mode", () => { + const warnings: string[] = []; + const p = parser( + `1 0 obj +<< /S /GoTo /D >> +endobj`, + undefined, + { recoveryMode: true, onWarning: msg => warnings.push(msg) }, + ); + const result = p.parseObject(); + + const dict = result.value as PdfDict; + expect(dict.getName("S")?.value).toBe("GoTo"); + expect(dict.has("D")).toBe(false); + expect(warnings.length).toBe(1); + expect(warnings[0]).toContain("Missing value for key D"); + }); + + it("throws on dict with missing value without recovery mode", () => { + const p = parser(`1 0 obj +<< /S /GoTo /D >> +endobj`); + + expect(() => p.parseObject()).toThrow("Missing value for key D"); + }); + it("recovers stream with missing /Length via endstream scan", () => { const p = parser(`1 0 obj << /Type /XObject >> diff --git a/src/parser/indirect-object-parser.ts b/src/parser/indirect-object-parser.ts index 411ee41..828572e 100644 --- a/src/parser/indirect-object-parser.ts +++ b/src/parser/indirect-object-parser.ts @@ -1,4 +1,4 @@ -import { CR, LF, SPACE, TAB } from "#src/helpers/chars"; +import { CR, isWhitespace, LF } from "#src/helpers/chars"; import type { Scanner } from "#src/io/scanner"; import type { PdfDict } from "#src/objects/pdf-dict"; import type { PdfObject } from "#src/objects/pdf-object"; @@ -6,7 +6,7 @@ import type { PdfRef } from "#src/objects/pdf-ref"; import { PdfStream } from "#src/objects/pdf-stream"; import { ObjectParseError } from "./errors"; -import { ObjectParser } from "./object-parser"; +import { ObjectParser, type WarningCallback } from "./object-parser"; import { TokenReader } from "./token-reader"; /** @@ -24,6 +24,20 @@ export interface IndirectObject { */ export type LengthResolver = (ref: PdfRef) => number | null; +/** + * Options for IndirectObjectParser. + */ +export interface IndirectObjectParserOptions { + /** + * Enable recovery mode for lenient parsing of object contents. + * Malformed dicts/arrays produce partial results instead of throwing. + */ + recoveryMode?: boolean; + + /** Callback for warnings emitted during lenient parsing. */ + onWarning?: WarningCallback; +} + /** * Parser for indirect object definitions. * @@ -34,8 +48,13 @@ export class IndirectObjectParser { constructor( private scanner: Scanner, private lengthResolver?: LengthResolver, + private options: IndirectObjectParserOptions = {}, ) {} + private warn(message: string, position: number): void { + this.options.onWarning?.(message, position); + } + /** * Parse indirect object at current scanner position. */ @@ -69,6 +88,10 @@ export class IndirectObjectParser { // Parse the object value const objectParser = new ObjectParser(reader); + + objectParser.recoveryMode = this.options.recoveryMode ?? false; + objectParser.onWarning = this.options.onWarning ?? null; + const result = objectParser.parseObject(); if (result === null) { @@ -134,16 +157,30 @@ export class IndirectObjectParser { // Try to resolve /Length from the dict. If that fails (e.g. indirect // ref during brute-force recovery with no resolver), fall back to // scanning for the "endstream" keyword to determine the length. - let length: number; + let length = -1; try { length = this.resolveLength(dict); } catch { - length = this.findEndStream(startPos); + length = -1; + } - if (length < 0) { - throw new ObjectParseError("Stream missing /Length and no endstream found"); - } + // Validate /Length: the "endstream" keyword must follow the data. + // Malformed PDFs often carry a wrong /Length — recover by scanning + // for endstream instead of trusting the declared value. + if (length >= 0 && !this.isEndstreamAt(startPos + length)) { + this.warn( + `Stream /Length ${length} does not point at endstream, scanning for actual end`, + startPos, + ); + + length = -1; + } + + length = length < 0 ? this.findEndStream(startPos) : length; + + if (length < 0) { + throw new ObjectParseError("Stream has no valid /Length and no endstream found"); } // Read exactly `length` bytes. @@ -162,6 +199,38 @@ export class IndirectObjectParser { return new PdfStream(dict, data); } + /** + * Check whether the "endstream" keyword appears at the given position, + * allowing optional whitespace before it. + */ + private isEndstreamAt(pos: number): boolean { + const bytes = this.scanner.bytes; + + if (pos > bytes.length) { + return false; + } + + let i = pos; + + while (i < bytes.length && isWhitespace(bytes[i])) { + i++; + } + + const keyword = "endstream"; + + if (i + keyword.length > bytes.length) { + return false; + } + + for (let j = 0; j < keyword.length; j++) { + if (bytes[i + j] !== keyword.charCodeAt(j)) { + return false; + } + } + + return true; + } + /** * Expect and consume a keyword at current scanner position. */ @@ -190,7 +259,7 @@ export class IndirectObjectParser { if (byte === -1) { break; } - if (byte === SPACE || byte === TAB || byte === LF || byte === CR) { + if (isWhitespace(byte)) { this.scanner.advance(); } else { break; diff --git a/src/parser/object-parser.test.ts b/src/parser/object-parser.test.ts index 73051f4..5e5c7b1 100644 --- a/src/parser/object-parser.test.ts +++ b/src/parser/object-parser.test.ts @@ -363,6 +363,66 @@ describe("ObjectParser", () => { expect(dict.get("Type")).toBe(PdfName.of("Page")); expect(warnings.length).toBeGreaterThan(0); }); + + it("throws on missing value for key in normal mode", () => { + const p = parser("<< /S /GoTo /D >>"); + + expect(() => p.parseObject()).toThrow("Missing value for key D"); + }); + + it("drops key with missing value in recovery mode", () => { + const p = parser("<< /S /GoTo /D >>"); + const warnings: string[] = []; + + p.recoveryMode = true; + p.onWarning = msg => warnings.push(msg); + + const result = p.parseObject(); + const dict = result!.object as PdfDict; + + expect(dict.get("S")).toBe(PdfName.of("GoTo")); + expect(dict.has("D")).toBe(false); + expect(warnings.length).toBe(1); + expect(warnings[0]).toContain("Missing value for key D"); + }); + }); + + describe("unknown keywords", () => { + it("throws on unknown keyword in normal mode", () => { + const p = parser("garbage"); + + expect(() => p.parseObject()).toThrow("Unexpected keyword: garbage"); + }); + + it("treats unknown keyword as null in recovery mode", () => { + const p = parser("garbage"); + const warnings: string[] = []; + + p.recoveryMode = true; + p.onWarning = msg => warnings.push(msg); + + const result = p.parseObject(); + + expect(result!.object).toBe(PdfNull.instance); + expect(warnings.length).toBe(1); + expect(warnings[0]).toContain("garbage"); + }); + + it("recovers dict with unknown keyword value", () => { + const p = parser("<< /Type /Page /Foo bogus /Count 2 >>"); + const warnings: string[] = []; + + p.recoveryMode = true; + p.onWarning = msg => warnings.push(msg); + + const result = p.parseObject(); + const dict = result!.object as PdfDict; + + expect(dict.get("Type")).toBe(PdfName.of("Page")); + expect(dict.get("Foo")).toBe(PdfNull.instance); + expect(dict.getNumber("Count")?.value).toBe(2); + expect(warnings.length).toBe(1); + }); }); describe("stream detection", () => { diff --git a/src/parser/object-parser.ts b/src/parser/object-parser.ts index b15c873..c29fab7 100644 --- a/src/parser/object-parser.ts +++ b/src/parser/object-parser.ts @@ -157,6 +157,12 @@ export class ObjectParser { return { object: PdfBool.FALSE, hasStream: false }; default: + if (this.recoveryMode) { + this.warn(`Unexpected keyword: ${value}, treating as null`); + + return { object: PdfNull.instance, hasStream: false }; + } + throw new ObjectParseError(`Unexpected keyword: ${value}`); } } diff --git a/src/parser/object-stream-parser.test.ts b/src/parser/object-stream-parser.test.ts index 043493f..ab19987 100644 --- a/src/parser/object-stream-parser.test.ts +++ b/src/parser/object-stream-parser.test.ts @@ -272,6 +272,37 @@ describe("ObjectStreamParser", () => { }); }); + describe("recovery mode", () => { + it("throws on malformed object without recovery mode", () => { + const stream = createObjectStream([{ objNum: 1, text: "<< /S /GoTo /D >>" }]); + + const parser = new ObjectStreamParser(stream); + + expect(() => parser.getObject(0)).toThrow("Missing value for key D"); + }); + + it("recovers malformed object with recovery mode", () => { + const warnings: string[] = []; + const stream = createObjectStream([ + { objNum: 1, text: "<< /S /GoTo /D >>" }, + { objNum: 2, text: "<< /Type /Page >>" }, + ]); + + const parser = new ObjectStreamParser(stream, { + recoveryMode: true, + onWarning: msg => warnings.push(msg), + }); + + const first = parser.getObject(0) as PdfDict; + expect(first.getName("S")?.value).toBe("GoTo"); + expect(first.has("D")).toBe(false); + expect(warnings.length).toBe(1); + + const second = parser.getObject(1) as PdfDict; + expect(second.getName("Type")?.value).toBe("Page"); + }); + }); + describe("edge cases", () => { it("handles empty stream (n=0)", async () => { const stream = new PdfStream( diff --git a/src/parser/object-stream-parser.ts b/src/parser/object-stream-parser.ts index 4e30396..e5897cc 100644 --- a/src/parser/object-stream-parser.ts +++ b/src/parser/object-stream-parser.ts @@ -3,7 +3,7 @@ import type { PdfObject } from "#src/objects/pdf-object"; import type { PdfStream } from "#src/objects/pdf-stream"; import { ObjectParseError } from "./errors"; -import { ObjectParser } from "./object-parser"; +import { ObjectParser, type WarningCallback } from "./object-parser"; import { TokenReader } from "./token-reader"; /** @@ -16,6 +16,20 @@ interface ObjectStreamEntry { offset: number; } +/** + * Options for ObjectStreamParser. + */ +export interface ObjectStreamParserOptions { + /** + * Enable recovery mode for lenient parsing of contained objects. + * Malformed objects produce partial results instead of throwing. + */ + recoveryMode?: boolean; + + /** Callback for warnings emitted during lenient parsing. */ + onWarning?: WarningCallback; +} + /** * Parser for PDF object streams (/Type /ObjStm). * @@ -43,7 +57,10 @@ export class ObjectStreamParser { private readonly first: number; private readonly n: number; - constructor(private stream: PdfStream) { + constructor( + private stream: PdfStream, + private options: ObjectStreamParserOptions = {}, + ) { // Validate stream type const type = stream.getName("Type"); @@ -156,6 +173,9 @@ export class ObjectStreamParser { const reader = new TokenReader(scanner); const parser = new ObjectParser(reader); + parser.recoveryMode = this.options.recoveryMode ?? false; + parser.onWarning = this.options.onWarning ?? null; + const result = parser.parseObject(); return result?.object ?? null;