Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 107 additions & 3 deletions src/parser/document-parser.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ function createMinimalPdf(options: {
garbageBeforeHeader?: string;
objects?: Array<{ objNum: number; content: string }>;
xrefEntries?: Array<{ objNum: number; offset: number; gen?: number; free?: boolean }>;
extraXrefEntries?: Array<{ objNum: number; offset: number; gen?: number; free?: boolean }>;
trailer?: Record<string, string>;
}): Uint8Array {
const parts: string[] = [];
Expand All @@ -30,9 +31,12 @@ function createMinimalPdf(options: {
parts.push(`%PDF-${version}\n`);
parts.push("%\x80\x81\x82\x83\n"); // Binary marker

// Track offsets for xref
// Track offsets for xref (byte lengths, not string lengths — the binary
// marker encodes to multiple UTF-8 bytes per char)
const byteLength = (s: string): number => new TextEncoder().encode(s).length;

const offsets: Array<{ objNum: number; offset: number; gen: number; free: boolean }> = [];
let currentOffset = parts.join("").length;
let currentOffset = byteLength(parts.join(""));

// Objects
const objects = options.objects ?? [
Expand All @@ -44,7 +48,7 @@ function createMinimalPdf(options: {
offsets.push({ objNum: obj.objNum, offset: currentOffset, gen: 0, free: false });
const objStr = `${obj.objNum} 0 obj\n${obj.content}\nendobj\n`;
parts.push(objStr);
currentOffset += objStr.length;
currentOffset += byteLength(objStr);
}

// Use provided xref entries or build from objects
Expand All @@ -53,6 +57,10 @@ function createMinimalPdf(options: {
...offsets,
];

if (options.extraXrefEntries) {
xrefEntries.push(...options.extraXrefEntries);
}

// XRef table
const xrefOffset = currentOffset;
parts.push("xref\n");
Expand Down Expand Up @@ -779,6 +787,102 @@ describe("DocumentParser", () => {
});
});

describe("malformed object recovery", () => {
  // --- Local fixtures -----------------------------------------------------
  // Each helper builds fresh inputs on every call so tests never share state.

  /** Reference to the object under test (object 3, generation 0). */
  const thirdRef = () => PdfRef.of(3, 0);

  /** Minimal catalog + empty page tree, plus object 3 with the given body. */
  const pdfWithThirdObject = (content: string): Uint8Array =>
    createMinimalPdf({
      objects: [
        { objNum: 1, content: "<< /Type /Catalog /Pages 2 0 R >>" },
        { objNum: 2, content: "<< /Type /Pages /Kids [] /Count 0 >>" },
        { objNum: 3, content },
      ],
    });

  /** Default document whose xref additionally maps object 3 to an offset far beyond EOF. */
  const pdfWithDanglingThirdEntry = (): Uint8Array =>
    createMinimalPdf({
      extraXrefEntries: [{ objNum: 3, offset: 999999 }],
    });

  /** Parse with default options (the tests below treat this as lenient mode). */
  const parseLenient = (bytes: Uint8Array) => new DocumentParser(new Scanner(bytes)).parse();

  /** Parse with leniency explicitly switched off. */
  const parseStrict = (bytes: Uint8Array) =>
    new DocumentParser(new Scanner(bytes), { lenient: false }).parse();

  /** True when at least one recorded warning contains `fragment`. */
  const hasWarning = (warnings: readonly string[], fragment: string): boolean =>
    warnings.some(entry => entry.includes(fragment));

  // --- Dictionaries with a dangling key -----------------------------------

  it("recovers dict with missing value (lenient default)", () => {
    const doc = parseLenient(pdfWithThirdObject("<< /S /GoTo /D >>"));

    const recovered = doc.getObject(thirdRef());

    expect(recovered).toBeInstanceOf(PdfDict);

    const dict = recovered as PdfDict;

    // The well-formed entry survives; the valueless key is dropped.
    expect(dict.getName("S")?.value).toBe("GoTo");
    expect(dict.has("D")).toBe(false);
    expect(hasWarning(doc.warnings, "Missing value for key D")).toBe(true);
  });

  it("throws on dict with missing value in strict mode", () => {
    const doc = parseStrict(pdfWithThirdObject("<< /S /GoTo /D >>"));

    const lookup = () => doc.getObject(thirdRef());

    expect(lookup).toThrow("Missing value for key D");
  });

  // --- Streams with a bogus /Length ---------------------------------------

  it("recovers stream with wrong /Length via endstream scan", () => {
    const doc = parseLenient(pdfWithThirdObject("<< /Length 999 >>\nstream\nHello\nendstream"));

    const recovered = doc.getObject(thirdRef());

    expect(recovered).toBeInstanceOf(PdfStream);

    const payload = new TextDecoder().decode((recovered as PdfStream).data);

    expect(payload).toBe("Hello");
    expect(hasWarning(doc.warnings, "/Length 999")).toBe(true);
  });

  // --- XRef entries that point at nothing parseable -----------------------

  it("returns null for unparseable object instead of throwing (lenient)", () => {
    const doc = parseLenient(pdfWithDanglingThirdEntry());

    const missing = doc.getObject(thirdRef());

    expect(missing).toBeNull();
    expect(hasWarning(doc.warnings, "Failed to parse object 3 0")).toBe(true);

    // One broken entry must not take the rest of the document down with it.
    expect(doc.getCatalog()).not.toBeNull();
    expect(doc.getPageCount()).toBe(0);
  });

  it("caches parse failures and does not re-warn on repeated lookups", () => {
    const doc = parseLenient(pdfWithDanglingThirdEntry());

    // First lookup performs the failing parse.
    expect(doc.getObject(thirdRef())).toBeNull();

    const warningsAfterFirstLookup = doc.warnings.length;

    // Second lookup must not add any further warnings.
    expect(doc.getObject(thirdRef())).toBeNull();
    expect(doc.warnings.length).toBe(warningsAfterFirstLookup);
  });

  it("throws for unparseable object in strict mode", () => {
    const doc = parseStrict(pdfWithDanglingThirdEntry());

    const lookup = () => doc.getObject(thirdRef());

    expect(lookup).toThrow();
  });
});

describe("error handling", () => {
it("throws in strict mode for invalid xref", async () => {
const malformedPdf = new TextEncoder().encode(
Expand Down
104 changes: 65 additions & 39 deletions src/parser/document-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -429,8 +429,17 @@ export class DocumentParser {
trailer: PdfDict,
recoveredViaBruteForce: boolean,
): ParsedDocument {
// Object cache: "objNum genNum" -> PdfObject
const cache = new Map<string, PdfObject>();
// Object cache: "objNum genNum" -> PdfObject (null = known-unparseable)
const cache = new Map<string, PdfObject | null>();

const lenient = this.options.lenient ?? true;

const recoveryOptions = {
recoveryMode: lenient,
onWarning: (message: string, position: number) => {
this.warnings.push(`Object parse warning at offset ${position}: ${message}`);
},
};

// Object stream cache: streamObjNum -> ObjectStreamParser
const objectStreamCache = new Map<number, ObjectStreamParser>();
Expand Down Expand Up @@ -644,8 +653,7 @@ export class DocumentParser {

// Check cache
if (cache.has(key)) {
// biome-ignore lint/style/noNonNullAssertion: checked with .has(...)
return cache.get(key)!;
return cache.get(key) ?? null;
}

// Look up in xref
Expand All @@ -657,59 +665,77 @@ export class DocumentParser {

let obj: PdfObject | null = null;

switch (entry.type) {
case "free":
return null;
try {
switch (entry.type) {
case "free":
return null;

case "uncompressed": {
const parser = new IndirectObjectParser(this.scanner, lengthResolver);
case "uncompressed": {
const parser = new IndirectObjectParser(this.scanner, lengthResolver, recoveryOptions);

const result = parser.parseObjectAt(entry.offset);
const result = parser.parseObjectAt(entry.offset);

// Verify generation matches
if (result.genNum !== ref.generation) {
this.warnings.push(
`Generation mismatch for object ${ref.objectNumber}: expected ${ref.generation}, got ${result.genNum}`,
);
}
// Verify generation matches
if (result.genNum !== ref.generation) {
this.warnings.push(
`Generation mismatch for object ${ref.objectNumber}: expected ${ref.generation}, got ${result.genNum}`,
);
}

obj = result.value;

obj = result.value;
// Decrypt the object
if (securityHandler?.isAuthenticated) {
obj = decryptObject(obj, ref.objectNumber, ref.generation);
}

// Decrypt the object
if (securityHandler?.isAuthenticated) {
obj = decryptObject(obj, ref.objectNumber, ref.generation);
break;
}

break;
}
case "compressed": {
// Get or create object stream parser
let streamParser = objectStreamCache.get(entry.streamObjNum);

case "compressed": {
// Get or create object stream parser
let streamParser = objectStreamCache.get(entry.streamObjNum);
if (!streamParser) {
// Load the object stream
const streamRef = PdfRef.of(entry.streamObjNum, 0);
const streamObj = getObject(streamRef);

if (!streamParser) {
// Load the object stream
const streamRef = PdfRef.of(entry.streamObjNum, 0);
const streamObj = getObject(streamRef);
if (!streamObj || !(streamObj instanceof PdfStream)) {
this.warnings.push(`Object stream ${entry.streamObjNum} not found or invalid`);

if (!streamObj || !(streamObj instanceof PdfStream)) {
this.warnings.push(`Object stream ${entry.streamObjNum} not found or invalid`);
return null;
}

return null;
streamParser = new ObjectStreamParser(streamObj, recoveryOptions);

objectStreamCache.set(entry.streamObjNum, streamParser);
}

streamParser = new ObjectStreamParser(streamObj);
obj = streamParser.getObject(entry.indexInStream);

// Objects in object streams don't need individual decryption
// because the stream itself was decrypted

objectStreamCache.set(entry.streamObjNum, streamParser);
break;
}
}
} catch (error) {
// Error boundary: a single malformed object must not crash reads
// or saves. In lenient mode, record a warning and treat the object
// as missing (like pdf.js and PDFBox).
if (!lenient) {
throw error;
}

obj = streamParser.getObject(entry.indexInStream);
const message = error instanceof Error ? error.message : String(error);

// Objects in object streams don't need individual decryption
// because the stream itself was decrypted
this.warnings.push(`Failed to parse object ${key}: ${message}`);

break;
}
// Cache the failure so repeated lookups don't re-parse and re-warn
cache.set(key, null);

return null;
}

// Cache the result
Expand Down
Loading
Loading