From 40a9fdd2f1eb43c2b1311e035b0ac828a66daa4e Mon Sep 17 00:00:00 2001 From: rssprivacy-commits Date: Mon, 15 Jun 2026 12:03:44 +0800 Subject: [PATCH] =?UTF-8?q?fix(make-pdf):=20correct=20CJK=20rendering=20?= =?UTF-8?q?=E2=80=94=20URL=20sentinel=20leak,=20JP-first=20fonts,=20CJK=20?= =?UTF-8?q?quotes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three defects surfaced rendering Simplified-Chinese documents; refined after an independent two-model code audit (which caught a regression in the first pass of the quote fix). 1. Bare URLs leaked internal `SMARTPANTS_PRESERVED_N` sentinels into output AND left the `<a>` tag
unclosed (everything after became one hyperlink). URL_RE's `\S+` (NUL is non-whitespace) swallowed the adjacent tag placeholders; single-pass restore could not un-nest them. Fix: stop the URL match at the NUL boundary; additionally strip any stray NUL from input at smartypants() entry so text cannot forge a placeholder or create NUL-adjacency nesting. 2. The CJK font stack listed Hiragino (Japanese) before any Chinese font, so Simplified-Chinese text rendered in Japanese glyph variants (直/骨/角/没). Fix: PingFang SC / Noto Sans CJK SC / Source Han Sans SC / Microsoft YaHei first; JP fonts demoted to last resort. (Trade-off: true Japanese documents now prefer SC glyphs for shared Han; acceptable for an SC-primary tool. A lang-attribute-based selector would be the fuller fix.) 3. A quote directly after a CJK colon or opening bracket (:(【「『〈《) is now treated as opening. Sentence/clause-ending punctuation (,。、;!?) is deliberately excluded — a quote after those is usually a CLOSING quote (Chinese puts the period inside: 。"), and including them flipped closing quotes to opening. Verified: pdffonts PingFang-only; pdftotext no sentinel leak, correct opening AND closing quotes (他说:"你好。" closes correctly); visual render no anchor bleed. make-pdf/test: 91 pass / 0 fail. Co-Authored-By: Claude Opus 4.8 (1M context) --- make-pdf/src/print-css.ts | 4 ++-- make-pdf/src/smartypants.ts | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/make-pdf/src/print-css.ts b/make-pdf/src/print-css.ts index 2366f42b99..e0b4ec7edf 100644 --- a/make-pdf/src/print-css.ts +++ b/make-pdf/src/print-css.ts @@ -35,8 +35,8 @@ // Metric-compatible sans stack: Helvetica (macOS), Liberation Sans (Linux, // ships via fonts-liberation), Arial (Windows). Shared by every text surface. const SANS_STACK = `Helvetica, "Liberation Sans", Arial`; -// CJK fallback families, appended to the body stack only. 
-const CJK_STACK = `"Hiragino Kaku Gothic ProN", "Noto Sans CJK JP", "Microsoft YaHei"`; +// CJK fallback families (Simplified-Chinese first), appended to the body stack only. +const CJK_STACK = `"PingFang SC", "Heiti SC", "Noto Sans CJK SC", "Source Han Sans SC", "Microsoft YaHei", "Hiragino Kaku Gothic ProN", "Noto Sans CJK JP"`; // Color-emoji families: Apple (macOS), Segoe (Windows), Noto (Linux). const EMOJI_FAMILIES = `"Apple Color Emoji", "Segoe UI Emoji", "Noto Color Emoji"`; diff --git a/make-pdf/src/smartypants.ts b/make-pdf/src/smartypants.ts index 2dfe097e09..5f794a2aa7 100644 --- a/make-pdf/src/smartypants.ts +++ b/make-pdf/src/smartypants.ts @@ -20,7 +20,7 @@ const CODE_ZONE_RE = /<(pre|code|script|style)\b[^>]*>[\s\S]*?<\/\1>/gi; const TAG_RE = /<[^>]+>/g; -const URL_RE = /\bhttps?:\/\/\S+/g; +const URL_RE = /\bhttps?:\/\/[^\s\u0000]+/g; /** * Apply smartypants to an HTML string. Zones that should not be touched: @@ -44,7 +44,7 @@ export function smartypants(html: string): string { }); }; - let s = html; + let s = html.replace(/\u0000/g, ""); // drop stray input NUL (can't forge a placeholder) s = carve(s, CODE_ZONE_RE); s = carve(s, TAG_RE); s = carve(s, URL_RE); @@ -89,11 +89,11 @@ function transformText(text: string): string { // Double quotes: open if preceded by whitespace/bol, close if preceded // by word char or punctuation. - s = s.replace(/(^|[\s\(\[\{\-])"/g, "$1\u201c"); // opening " + s = s.replace(/(^|[\s\(\[\{\-:(【「『〈《])"/g, "$1\u201c"); // opening " s = s.replace(/"/g, "\u201d"); // remaining " are closing // Single quotes (after apostrophe pass): - s = s.replace(/(^|[\s\(\[\{\-])'/g, "$1\u2018"); // opening ' + s = s.replace(/(^|[\s\(\[\{\-:(【「『〈《])'/g, "$1\u2018"); // opening ' s = s.replace(/'/g, "\u2019"); // remaining ' are closing return s;