Skip to content

Commit edc3d2e

Browse files
Merge pull request #8946 from BitGo/claude/compassionate-euler-DJa99
Add PDF keycard parsing functionality
2 parents ddfc0e8 + a9ec49c commit edc3d2e

9 files changed

Lines changed: 450 additions & 3 deletions

File tree

modules/key-card/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
"@bitgo/sdk-core": "^37.3.0",
3838
"@bitgo/statics": "^58.43.0",
3939
"jspdf": ">=4.2.0",
40+
"pdfjs-dist": "^4.0.0",
4041
"qrcode": "^1.5.1"
4142
},
4243
"devDependencies": {
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import { buildLinesFromPDFNodes, KeycardEntry, parseKeycardFromLines, PDFTextNode } from './parseKeycard';
2+
3+
/**
 * Extracts keycard entries from a PDF file (browser only).
 *
 * Before calling this function, configure the pdfjs worker:
 *   import { GlobalWorkerOptions } from 'pdfjs-dist';
 *   GlobalWorkerOptions.workerSrc = '<url to pdf.worker.min.js>';
 *
 * pdfjs-dist is loaded via dynamic import so this module can be safely
 * imported in Node.js environments without triggering browser-only globals.
 *
 * Every text item of every page is converted into a positioned PDFTextNode
 * (whitespace-collapsed text, x/y taken from the item's transform, width from
 * the item), the nodes are grouped into lines, and the lines are parsed into
 * keycard entries.
 *
 * @param file - The PDF file to read.
 * @returns The reconstructed text lines and the keycard entries parsed from them.
 */
export async function extractKeycardEntriesFromPDF(file: File): Promise<{
  lines: string[];
  entries: KeycardEntry[];
}> {
  const pdfjsLib = await import('pdfjs-dist');
  const arrayBuffer = await file.arrayBuffer();
  const loadingTask = pdfjsLib.getDocument({ data: arrayBuffer });
  const nodes: PDFTextNode[] = [];

  try {
    const pdfDocument = await loadingTask.promise;

    for (let pageNumber = 1; pageNumber <= pdfDocument.numPages; pageNumber++) {
      const page = await pdfDocument.getPage(pageNumber);
      const textContent = await page.getTextContent();

      for (const item of textContent.items) {
        // Skip items that carry no string or no positional transform.
        if (!('str' in item) || !Array.isArray(item.transform)) {
          continue;
        }

        const text = item.str.replace(/\s+/g, ' ').trim();
        if (!text) {
          continue;
        }

        // Indices 4 and 5 of the transform are used as the item's x/y position.
        const x = Number(item.transform[4] ?? 0);
        const y = Number(item.transform[5] ?? 0);
        const width = 'width' in item ? Number(item.width ?? 0) : 0;

        nodes.push({ text, x, y, page: pageNumber, width });
      }
    }
  } finally {
    // Release the document and its worker whether extraction succeeded or
    // threw; otherwise every call leaks the parsed document.
    await loadingTask.destroy();
  }

  const lines = buildLinesFromPDFNodes(nodes);
  return {
    lines,
    entries: parseKeycardFromLines(lines),
  };
}

modules/key-card/src/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@ import { generateParamsForKeyCreation } from './generateParamsForKeyCreation';
55
import { GenerateKeycardParams, GenerateLightningQrDataParams, GenerateQrDataBaseParams } from './types';
66

77
export * from './drawKeycard';
8+
export * from './extractKeycardFromPDF';
89
export * from './faq';
910
export * from './generateQrData';
11+
export * from './parseKeycard';
1012
export * from './utils';
1113
export * from './types';
1214

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
/**
 * A single positioned text fragment taken from a PDF page.
 *
 * - `text`: the fragment's text.
 * - `x` / `y`: the fragment's position; buildLinesFromPDFNodes orders larger
 *   `y` values first and smaller `x` values first within a line.
 * - `page`: page number the fragment belongs to; lines never span pages.
 * - `width`: horizontal extent; `x + width` is used as the fragment's right
 *   edge when deciding whether to insert a space before the next fragment.
 */
export type PDFTextNode = {
2+
text: string;
3+
x: number;
4+
y: number;
5+
page: number;
6+
width: number;
7+
};
8+
9+
/**
 * One parsed keycard section.
 *
 * - `label`: "<section letter>: <section title>" as built by parseKeycardFromLines.
 * - `value`: the section's captured data lines joined into one normalized string.
 */
export type KeycardEntry = {
10+
label: string;
11+
value: string;
12+
};
13+
14+
// Matches a section header line: one letter A-D (case-insensitive), optional
// whitespace, one separator out of ":", ".", ")" or "-", then the title.
// Group 1 is the letter; group 2 is the title without surrounding whitespace.
const sectionHeaderRegex = /^([A-D])\s*[:.)-]\s*(.+?)\s*$/i;
15+
// Matches a line starting with "data:" (case-insensitive). Group 1 is whatever
// follows the colon on the same line and may be empty.
const dataLineRegex = /^data\s*:\s*(.*)$/i;
16+
// Matches a line consisting solely of the heading "BitGo KeyCard FAQ"
// (case-insensitive, any amount of whitespace between the words).
const faqHeaderRegex = /^BitGo\s+KeyCard\s+FAQ$/i;
17+
18+
// PDF coordinate tolerance in points. Nodes within this distance on the Y-axis
19+
// are treated as belonging to the same line; nodes further apart are separate lines.
20+
const PDF_LINE_Y_TOLERANCE = 2;
21+
// Horizontal gap in points above which a space is inserted between adjacent nodes.
22+
const PDF_NODE_GAP_THRESHOLD = 2;
23+
24+
/**
 * Collapses every run of whitespace in `input` to a single space and drops
 * leading and trailing whitespace.
 */
function sanitizeText(input: string): string {
  const words = input.split(/\s+/).filter((word) => word.length > 0);
  return words.join(' ');
}
27+
28+
/**
 * Turns the newline-joined captured lines of a section into one continuous value.
 *
 * "Part N" page-continuation labels are removed in two steps:
 *   1. Lines that consist only of such a label are dropped.
 *   2. Labels that ended up inside a line (because the label shared a
 *      y-coordinate with real content, e.g. "...X88bPart 2 lFPMd...") are cut
 *      out together with the whitespace around them.
 * The remaining lines are concatenated without any separator because section
 * values are continuous strings (base64 / xpub) that merely wrap across PDF lines.
 */
function normalizeSectionValue(rawValue: string): string {
  const standaloneLabel = /^Part\s+\d+$/i;
  const embeddedLabel = /\s*Part\s+\d+\s*/gi;

  let joined = '';
  for (const row of rawValue.split('\n')) {
    if (standaloneLabel.test(row.trim())) {
      continue;
    }
    joined += row;
  }

  return joined.replace(embeddedLabel, '').trim();
}
43+
44+
/**
 * Counts the non-overlapping occurrences of `char` in `input`.
 *
 * @param input - The text to scan.
 * @param char - The substring to count (callers pass a single character).
 * @returns The number of occurrences; 0 when `char` is empty, because an
 *   empty needle has no meaningful count (splitting on it would yield
 *   `input.length - 1`, i.e. -1 for empty input).
 */
function countChar(input: string, char: string): number {
  if (char.length === 0) {
    return 0;
  }

  let count = 0;
  let position = input.indexOf(char);
  while (position !== -1) {
    count += 1;
    position = input.indexOf(char, position + char.length);
  }
  return count;
}
47+
48+
/**
 * Reports whether a section title contains the phrase
 * "encrypted wallet password", ignoring letter case.
 */
function isEncryptedWalletPasswordSectionTitle(title: string): boolean {
  const lowered = title.toLowerCase();
  return lowered.indexOf('encrypted wallet password') !== -1;
}
51+
52+
/**
53+
* Reconstructs logical text lines from an unordered set of PDF text nodes.
54+
*
55+
* PDF text extraction returns individual positioned fragments. This function
56+
* sorts them by page then Y-coordinate (top-to-bottom), groups fragments
57+
* within PDF_LINE_Y_TOLERANCE points of each other onto the same line, and
58+
* inserts a space between fragments that are separated by more than
59+
* PDF_NODE_GAP_THRESHOLD points horizontally.
60+
*/
61+
export function buildLinesFromPDFNodes(nodes: PDFTextNode[]): string[] {
62+
const sortedNodes = [...nodes].sort((a, b) => {
63+
if (a.page !== b.page) {
64+
return a.page - b.page;
65+
}
66+
const yDiff = Math.abs(a.y - b.y);
67+
if (yDiff > PDF_LINE_Y_TOLERANCE) {
68+
return b.y - a.y;
69+
}
70+
return a.x - b.x;
71+
});
72+
73+
const lines: string[] = [];
74+
let currentLineNodes: PDFTextNode[] = [];
75+
let currentPage = -1;
76+
let currentY = Number.NaN;
77+
78+
function flushLine() {
79+
if (currentLineNodes.length === 0) {
80+
return;
81+
}
82+
83+
const sortedLineNodes = [...currentLineNodes].sort((a, b) => a.x - b.x);
84+
let line = '';
85+
let previousRightEdge: number | null = null;
86+
for (const node of sortedLineNodes) {
87+
const piece = sanitizeText(node.text);
88+
if (!piece) {
89+
continue;
90+
}
91+
92+
if (previousRightEdge !== null && node.x - previousRightEdge > PDF_NODE_GAP_THRESHOLD) {
93+
line += ' ';
94+
}
95+
line += piece;
96+
previousRightEdge = node.x + node.width;
97+
}
98+
99+
const normalizedLine = line.trim();
100+
if (normalizedLine) {
101+
lines.push(normalizedLine);
102+
}
103+
}
104+
105+
for (const node of sortedNodes) {
106+
const pageChanged = node.page !== currentPage;
107+
const lineChanged = Number.isNaN(currentY) || Math.abs(node.y - currentY) > PDF_LINE_Y_TOLERANCE;
108+
if (pageChanged || lineChanged) {
109+
flushLine();
110+
currentLineNodes = [node];
111+
currentPage = node.page;
112+
currentY = node.y;
113+
continue;
114+
}
115+
116+
currentLineNodes.push(node);
117+
}
118+
119+
flushLine();
120+
return lines;
121+
}
122+
123+
/**
 * Parses keycard sections out of reconstructed text lines.
 *
 * A section begins at a line matching sectionHeaderRegex (letter A-D, a
 * separator, and a non-empty title). Inside a section, a line matching
 * dataLineRegex ("data: ...") starts value capture; any text after the colon
 * is the first captured chunk. Every following line is appended until the
 * next section header, a line matching faqHeaderRegex, or (only for sections
 * whose title contains "encrypted wallet password") the point where the
 * curly braces of the JSON object balance.
 *
 * @param lines - Text lines in reading order, e.g. from buildLinesFromPDFNodes.
 * @returns One entry per section that captured at least one chunk, in order
 *   of appearance. `label` is "<letter>: <title>"; `value` is the captured
 *   chunks passed through normalizeSectionValue.
 */
export function parseKeycardFromLines(lines: string[]): KeycardEntry[] {
  const sections: Array<{
    section: string;
    title: string;
    values: string[];
    isCapturingData: boolean;
    openCurlyCount: number;
  }> = [];
  let currentSectionIndex = -1;

  for (const line of lines) {
    const labelMatch = line.match(sectionHeaderRegex);
    if (labelMatch) {
      const section = labelMatch[1]?.toUpperCase();
      const title = sanitizeText(labelMatch[2] ?? '');
      if (section && title) {
        sections.push({
          section,
          title,
          values: [],
          isCapturingData: false,
          openCurlyCount: 0,
        });
        currentSectionIndex = sections.length - 1;
        continue;
      }
    }

    // Ignore everything that precedes the first section header.
    if (currentSectionIndex < 0) {
      continue;
    }

    const currentSection = sections[currentSectionIndex];
    if (!currentSection) {
      continue;
    }

    const dataLineMatch = line.match(dataLineRegex);
    if (dataLineMatch) {
      currentSection.isCapturingData = true;
      const inlineValue = sanitizeText(dataLineMatch[1] ?? '');
      if (inlineValue) {
        currentSection.values.push(inlineValue);
        currentSection.openCurlyCount += countChar(inlineValue, '{') - countChar(inlineValue, '}');

        // The JSON object may open and close on the "data:" line itself. Stop
        // capturing right away in that case; otherwise the next line (footer
        // or FAQ text) would be appended before the balance check below runs.
        if (
          isEncryptedWalletPasswordSectionTitle(currentSection.title) &&
          inlineValue.includes('{') &&
          currentSection.openCurlyCount <= 0
        ) {
          currentSection.isCapturingData = false;
        }
      }
      continue;
    }

    if (currentSection.isCapturingData) {
      if (faqHeaderRegex.test(line)) {
        currentSection.isCapturingData = false;
        continue;
      }

      currentSection.values.push(line);

      // For encrypted wallet password, data is a single JSON object. Stop as
      // soon as the object closes so footer/FAQ content is not appended.
      if (isEncryptedWalletPasswordSectionTitle(currentSection.title)) {
        currentSection.openCurlyCount += countChar(line, '{') - countChar(line, '}');
        if (currentSection.openCurlyCount <= 0) {
          currentSection.isCapturingData = false;
        }
      }
    }
  }

  return sections
    .filter(({ section, values }) => ['A', 'B', 'C', 'D'].includes(section) && values.length > 0)
    .map(({ section, title, values }) => ({
      label: `${section}: ${title}`,
      value: normalizeSectionValue(values.join('\n')),
    }));
}
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import * as assert from 'assert';
2+
import { parseKeycardFromLines } from '../../src/parseKeycard';
3+
4+
describe('parseKeycardFromLines', function () {
  // The encrypted-JSON fixture up to and including the opening quote of "ct";
  // each test appends its own ciphertext payload and the closing `"}`.
  const jsonUpToCt =
    '{"iv":"abc123","v":1,"iter":10000,"ks":256,"ts":64,"mode":"ccm","adata":"","cipher":"aes","salt":"xyz789","ct":"';

  const boxALines = ['A: Box A – User Key', 'data: xpub661MyMwAqRbcF...'];
  const boxBLines = ['B: Box B – Backup Key', 'data: xpub661MyMwAqRbcG...'];
  const boxCLines = ['C: Box C – BitGo Key', 'data: xpub661MyMwAqRbcH...'];
  const boxDHeader = 'D: Box D – Encrypted Wallet Password';

  /** Parses `lines`, asserts that a "D:" entry exists, and returns its value. */
  function sectionDValue(lines: string[]): string {
    const sectionD = parseKeycardFromLines(lines).find((entry) => entry.label.startsWith('D:'));
    assert.ok(sectionD, 'section D should be present');
    return sectionD.value;
  }

  it('happy path – clean JSON in one line', function () {
    const encryptedJson = `${jsonUpToCt}AAABBBCCC"}`;

    const value = sectionDValue([...boxALines, ...boxBLines, ...boxCLines, boxDHeader, `data: ${encryptedJson}`]);

    assert.strictEqual(value, encryptedJson);
  });

  it('Part N on its own line – strips standalone label', function () {
    const part1 = `${jsonUpToCt}AAABBB`;
    const part2 = 'CCCDDDEEE"}';

    const value = sectionDValue([
      ...boxALines,
      ...boxBLines,
      ...boxCLines,
      boxDHeader,
      `data: ${part1}`,
      'Part 2',
      part2,
    ]);

    assert.strictEqual(value, `${part1}${part2}`);
  });

  it('multiple embedded Part N labels – strips all page-break labels across a long ct value', function () {
    const seg1 = 'AAABBBCCC';
    const seg2 = 'DDDEEEFFF';
    const seg3 = 'GGGHHH';
    const mergedLine = `${jsonUpToCt}${seg1}Part 2 ${seg2}Part 3 ${seg3}"}`;

    // Only boxes A and D are present in this fixture.
    const value = sectionDValue([...boxALines, boxDHeader, `data: ${mergedLine}`]);

    assert.strictEqual(value, `${jsonUpToCt}${seg1}${seg2}${seg3}"}`);
  });

  it('Part N embedded mid-line – strips label fused into base64 content', function () {
    const ctPrefix = 'AAABBBCCC';
    const ctSuffix = 'DDDEEEFFF';
    const mergedLine = `${jsonUpToCt}${ctPrefix}Part 2 ${ctSuffix}"}`;

    const value = sectionDValue([...boxALines, ...boxBLines, ...boxCLines, boxDHeader, `data: ${mergedLine}`]);

    assert.strictEqual(value, `${jsonUpToCt}${ctPrefix}${ctSuffix}"}`);
  });
});

modules/web-demo/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
"@bitgo/statics": "^58.43.0",
6969
"bitgo": "^51.2.0",
7070
"lodash": "^4.18.0",
71+
"pdfjs-dist": "^4.0.0",
7172
"react": "^18.0.0",
7273
"react-dom": "^18.0.0",
7374
"react-json-view": "^1.21.3",

0 commit comments

Comments
 (0)