-
Notifications
You must be signed in to change notification settings - Fork 66.8k
Fix translation pipeline corrupting HTML tags and bold headings in non-English pages #43890
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -884,6 +884,34 @@ export function correctTranslatedContentStrings( | |
| } | ||
| } | ||
|
|
||
| // Unescape HTML entity-encoded tags (`&lt;tag&gt;` → `<tag>`) that Crowdin | ||
| // introduces when the English source uses inline raw HTML — e.g. | ||
| // `<code><a href="...">label</a></code>` inside table `<td>` cells. | ||
| // Without this fix, those tags render as literal `<code>` text on translated | ||
| // pages rather than as styled code elements. | ||
| // Only unescape tag names present as raw HTML in the English source to avoid | ||
| // incorrectly expanding intentional `&lt;` entity sequences. | ||
| if (englishContent && content.includes('<')) { | ||
| const englishTagNames = new Set( | ||
| [...englishContent.matchAll(/<([a-z][a-z0-9]*)/gi)].map((m) => m[1].toLowerCase()), | ||
| ) | ||
| if (englishTagNames.size > 0) { | ||
| content = content.replace( | ||
| /<(\/?[a-z][a-z0-9]*)(\s[^<>]*?)?>/gi, | ||
| (match, tag: string, attrs = '') => { | ||
| const baseName = tag.replace(/^\//, '').toLowerCase() | ||
| return englishTagNames.has(baseName) ? `<${tag}${attrs}>` : match | ||
| }, | ||
| ) | ||
|
Comment on lines
+894
to
+905
|
||
| } | ||
| } | ||
|
|
||
| // Remove bare code-fence wrapping from bold heading lines. Translation pipelines | ||
| // sometimes wrap `**heading**` lines in bare (no-language) fenced code blocks, | ||
| // causing them to render as code instead of bold text. Strip the fences and | ||
| // restore the heading as plain Markdown. | ||
| content = content.replace(/^```\s*\n(\*\*[^\n]+\*\*)\s*\n```/gm, '$1') | ||
|
|
||
| // Collapsed Markdown table rows — restore linebreaks between `|` cells. | ||
| content = content.replaceAll(' | | ', ' |\n| ') | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1355,6 +1355,80 @@ describe('correctTranslatedContentStrings', () => { | |
| expect(fix('{{%raw %}', 'es')).toBe('{% raw %}') | ||
| expect(fix('{{% raw %}', 'es')).toBe('{% raw %}') | ||
| }) | ||
|
|
||
| test('unescapes entity-encoded HTML tags when English source has matching raw HTML', () => { | ||
| const english = | ||
| '<td><code><a href="https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2404-Readme.md">ubuntu-latest</a></code></td>' | ||
|
|
||
| expect(fix('<code>ubuntu-latest</code>', 'ko', english)).toBe( | ||
| '<code>ubuntu-latest</code>', | ||
| ) | ||
| expect( | ||
| fix('<a href="https://example.com">ubuntu-latest</a>', 'ko', english), | ||
| ).toBe('<a href="https://example.com">ubuntu-latest</a>') | ||
| expect( | ||
| fix( | ||
| '<code><a href="https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2404-Readme.md">ubuntu-latest</a></code>', | ||
| 'ko', | ||
| english, | ||
| ), | ||
| ).toBe( | ||
| '<code><a href="https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2404-Readme.md">ubuntu-latest</a></code>', | ||
| ) | ||
| }) | ||
|
Comment on lines
+1359
to
+1378
|
||
|
|
||
| test('does not unescape entity-encoded tags absent from English source', () => { | ||
| const english = '<p>Simple paragraph without code elements</p>' | ||
| const input = '<code>text</code>' | ||
| expect(fix(input, 'ko', english)).toBe(input) | ||
| }) | ||
|
|
||
| test('does not unescape entity-encoded tags when no English content provided', () => { | ||
| const input = '<code>ubuntu-latest</code>' | ||
| expect(fix(input, 'ko')).toBe(input) | ||
| }) | ||
|
|
||
| test('removes bare code-fence wrapping from bold heading lines', () => { | ||
| const input = '```\n**다음은 작업을 다운로드하는 데 필요합니다.**\n```' | ||
| expect(fix(input, 'ko')).toBe('**다음은 작업을 다운로드하는 데 필요합니다.**') | ||
| }) | ||
|
|
||
| test('removes bare code-fence wrapping from bold headings between real code blocks', () => { | ||
| const input = [ | ||
| '```shell copy', | ||
| 'github.com', | ||
| 'api.github.com', | ||
| '```', | ||
| '', | ||
| '```', | ||
| '**다음은 작업을 다운로드하는 데 필요합니다.**', | ||
| '```', | ||
| '', | ||
| '```shell copy', | ||
| 'codeload.github.com', | ||
| '```', | ||
| ].join('\n') | ||
|
|
||
| const expected = [ | ||
| '```shell copy', | ||
| 'github.com', | ||
| 'api.github.com', | ||
| '```', | ||
| '', | ||
| '**다음은 작업을 다운로드하는 데 필요합니다.**', | ||
| '', | ||
| '```shell copy', | ||
| 'codeload.github.com', | ||
| '```', | ||
| ].join('\n') | ||
|
|
||
| expect(fix(input, 'ko')).toBe(expected) | ||
| }) | ||
|
|
||
| test('does not strip language-specified code fences with bold content', () => { | ||
| const input = '```shell\n**not a heading**\n```' | ||
| expect(fix(input, 'ko')).toBe(input) | ||
| }) | ||
| }) | ||
|
|
||
| // ─── EDGE CASES ──────────────────────────────────────────────────── | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `attrs` capture in the entity-tag regex (`(\s[^<>]*?)?`) will stop at the first `&gt;` it sees. If an attribute value itself contains an encoded `&gt;`/`&lt;` (e.g. `title="a &gt; b"`), the match can terminate early and the replacement will corrupt the tag/content. Consider switching to an attribute pattern that respects quoted strings, or using an HTML parser to unescape tags safely.