From f7e791c24d5b5948561ecc205b89c3954bd3cb88 Mon Sep 17 00:00:00 2001 From: Quratulain-bilal Date: Sat, 23 May 2026 15:41:33 +0500 Subject: [PATCH 1/2] fix(cli): pin UTF-8 encoding on init-options and .extensionignore I/O MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ``Path.read_text`` / ``Path.write_text`` default to the system locale codec, which is cp1252 / gb2312 / cp932 on Windows. Two user-facing file paths in spec-kit were calling them without an explicit ``encoding=`` argument: - ``src/specify_cli/__init__.py:400,412`` — ``save_init_options`` / ``load_init_options`` for ``.specify/init-options.json``. A peer machine with a different default locale (or a UTF-8 Unix CI runner reading a file written on a cp1252 Windows host) cannot decode the file, raising ``UnicodeDecodeError``. ``UnicodeDecodeError`` is a subclass of ``ValueError`` — not ``OSError`` / ``json.JSONDecodeError`` — so the existing fall-back ``except`` tuple in ``load_init_options`` also misses it and the error propagates raw to the CLI. - ``src/specify_cli/extensions.py:764`` — ``.extensionignore`` pattern reader. The very next line already normalises backslashes "so Windows-authored files work", proving the codebase expects Windows authors to write this file. Multibyte UTF-8 patterns (Chinese filenames, accented directory names) silently mojibake when the host locale is not UTF-8, so the patterns fail to match and unintended files are shipped with the extension. The sibling integration-catalog reader at ``src/specify_cli/integrations/catalog.py:150,156,193,202,374`` already pins ``encoding="utf-8"`` everywhere. PR #2280 fixed the symmetric PowerShell-template BOM bug. This change brings the two remaining drifted paths in line with that precedent. Regression tests: - ``tests/test_presets.py::TestInitOptions`` — parametrized non-ASCII round-trip (CJK, Latin-1, Greek, emoji) plus a corrupted-file case that asserts the existing "fall back to {}" contract still holds when a peer file contains bytes invalid as UTF-8. - ``tests/test_extensions.py::TestExtensionIgnore`` — Japanese (``ドキュメント/``) and Latin-1 (``café/``) ignore patterns correctly exclude their directories during install. --- src/specify_cli/__init__.py | 21 +++++++++++++--- src/specify_cli/extensions.py | 8 +++++- tests/test_extensions.py | 46 +++++++++++++++++++++++++++++++++-- tests/test_presets.py | 39 +++++++++++++++++++++++++++++ 4 files changed, 108 insertions(+), 6 deletions(-) diff --git a/src/specify_cli/__init__.py b/src/specify_cli/__init__.py index c0bdbaabe3..559e6df5f8 100644 --- a/src/specify_cli/__init__.py +++ b/src/specify_cli/__init__.py @@ -397,7 +397,15 @@ def save_init_options(project_path: Path, options: dict[str, Any]) -> None: """ dest = project_path / INIT_OPTIONS_FILE dest.parent.mkdir(parents=True, exist_ok=True) - dest.write_text(json.dumps(options, indent=2, sort_keys=True)) + # Pin UTF-8 explicitly: ``Path.write_text`` defaults to the system + # locale codec, which is cp1252 / gb2312 / cp932 on Windows. A + # locale-encoded write succeeds locally but produces a file a peer + # machine (different locale) or Unix CI cannot decode. The sibling + # integration-catalog writer in ``integrations/catalog.py`` already + # pins ``encoding="utf-8"`` for the same reason. + dest.write_text( + json.dumps(options, indent=2, sort_keys=True), encoding="utf-8" + ) def load_init_options(project_path: Path) -> dict[str, Any]: @@ -409,8 +417,15 @@ def load_init_options(project_path: Path) -> dict[str, Any]: if not path.exists(): return {} try: - return json.loads(path.read_text()) - except (json.JSONDecodeError, OSError): + # Match the explicit UTF-8 used by ``save_init_options``; without + # it ``read_text`` falls back to the system codec on Windows and + # raises ``UnicodeDecodeError`` on any file a peer wrote with + # non-ASCII content. ``UnicodeDecodeError`` is a subclass of + # ``ValueError``, not ``OSError`` / ``json.JSONDecodeError``, so + # it must be listed explicitly here to preserve the existing + # "fall back to empty dict" contract. + return json.loads(path.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError, UnicodeDecodeError): return {} diff --git a/src/specify_cli/extensions.py b/src/specify_cli/extensions.py index 871503f0ae..ba8020279b 100644 --- a/src/specify_cli/extensions.py +++ b/src/specify_cli/extensions.py @@ -761,7 +761,13 @@ def _load_extensionignore(source_dir: Path) -> Optional[Callable[[str, List[str] if not ignore_file.exists(): return None - lines: List[str] = ignore_file.read_text().splitlines() + # Pin UTF-8 explicitly: ``Path.read_text`` defaults to the system + # locale codec on Windows (cp1252 / gb2312 / cp932), which silently + # corrupts multibyte patterns when the file is shared across + # machines with different locales. The next line already + # normalises backslashes "so Windows-authored files work" — the + # codebase already expects Windows authors to write this file. + lines: List[str] = ignore_file.read_text(encoding="utf-8").splitlines() # Normalise backslashes in patterns so Windows-authored files work normalised: List[str] = [] diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 153388a541..9cdf7b3ea2 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -3147,9 +3147,13 @@ def _make_extension(self, temp_dir, valid_manifest_data, extra_files=None, ignor else: p.write_text(content) - # Write .extensionignore + # Write .extensionignore. Pinned to UTF-8 so non-ASCII patterns + # in tests (see ``test_extensionignore_utf8_patterns``) survive + # the round-trip on Windows runners with non-UTF-8 default locales. if ignore_content is not None: - (ext_dir / ".extensionignore").write_text(ignore_content) + (ext_dir / ".extensionignore").write_text( + ignore_content, encoding="utf-8" + ) return ext_dir @@ -3379,6 +3383,44 @@ def test_extensionignore_windows_backslash_patterns(self, temp_dir, valid_manife assert (dest / "docs" / "guide.md").exists() assert not (dest / "docs" / "internal" / "draft.md").exists() + def test_extensionignore_utf8_patterns(self, temp_dir, valid_manifest_data): + """Non-ASCII patterns in .extensionignore work on every locale. + + ``Path.read_text`` defaults to the system locale codec on Windows + (cp1252 / gb2312 / cp932). Without an explicit ``encoding="utf-8"``, + a pattern like ``ドキュメント/`` written by a UTF-8 host becomes + mojibake on a cp1252 host and silently fails to match — leaking + files the author intended to exclude. The existing + ``test_extensionignore_windows_backslash_patterns`` already shows + the codebase treats this as a Windows-author-friendly file; UTF-8 + is part of that same contract. + """ + ext_dir = self._make_extension( + temp_dir, + valid_manifest_data, + extra_files={ + "ドキュメント/private.md": "secret", + "ドキュメント/public.md": "public", + "docs/guide.md": "# Guide", + "café/résumé.txt": "draft", + }, + ignore_content="ドキュメント/\ncafé/\n", + ) + + proj_dir = temp_dir / "project" + proj_dir.mkdir() + (proj_dir / ".specify").mkdir() + + manager = ExtensionManager(proj_dir) + manager.install_from_directory(ext_dir, "0.1.0", register_commands=False) + + dest = proj_dir / ".specify" / "extensions" / "test-ext" + # Multibyte patterns excluded. + assert not (dest / "ドキュメント").exists() + assert not (dest / "café").exists() + # ASCII path with no matching pattern is unaffected. + assert (dest / "docs" / "guide.md").exists() + def test_extensionignore_star_does_not_cross_directories(self, temp_dir, valid_manifest_data): """'*' should NOT match across directory boundaries (gitignore semantics).""" ext_dir = self._make_extension( diff --git a/tests/test_presets.py b/tests/test_presets.py index f1c0e95f4e..d3d61b4c5a 100644 --- a/tests/test_presets.py +++ b/tests/test_presets.py @@ -2269,6 +2269,45 @@ def test_load_returns_empty_on_invalid_json(self, project_dir): assert load_init_options(project_dir) == {} + @pytest.mark.parametrize( + "value", + ["名前-プロジェクト", "café-résumé", "Ωmega-Δelta", "🚀-launch"], + ) + def test_save_load_round_trip_preserves_non_ascii(self, project_dir, value): + """Non-ASCII values round-trip via explicit UTF-8 encoding. + + ``Path.write_text`` / ``Path.read_text`` default to the system + locale codec on Windows (cp1252 / gb2312 / cp932). Without + ``encoding="utf-8"`` pinned on both ends, a project name like + ``café`` written on a UTF-8 host becomes garbled or unreadable on + a cp1252 host (and vice versa). Pin UTF-8 explicitly so init + options round-trip across machines and CI. + """ + from specify_cli import save_init_options, load_init_options + + save_init_options(project_dir, {"ai": "claude", "project_name": value}) + + loaded = load_init_options(project_dir) + assert loaded["project_name"] == value + + def test_load_returns_empty_on_locale_corrupted_file(self, project_dir): + """A file written in a non-UTF-8 codec falls back to {}, not crash. + + Simulates a file produced by an old client (or by a peer machine + with a different default locale) that contains bytes invalid as + UTF-8. ``load_init_options`` should fall back to ``{}`` per the + existing contract — never propagate a raw ``UnicodeDecodeError`` + to the CLI surface. + """ + from specify_cli import load_init_options + + opts_file = project_dir / ".specify" / "init-options.json" + opts_file.parent.mkdir(parents=True, exist_ok=True) + # 0xE9 is 'é' in cp1252 but an invalid lead byte in UTF-8. + opts_file.write_bytes(b'{"project_name": "caf\xe9"}') + + assert load_init_options(project_dir) == {} + class TestPresetSkills: """Tests for preset skill registration and unregistration. From c0a8d166e7334ef98773a253664fac6c07f3e19a Mon Sep 17 00:00:00 2001 From: Quratulain-bilal Date: Sun, 31 May 2026 01:53:47 +0500 Subject: [PATCH 2/2] fix(cli): wrap .extensionignore decode error and tighten UTF-8 contract MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses Copilot review feedback on this PR. Three issues, three fixes: 1. ``save_init_options`` now writes JSON with ``ensure_ascii=False``. Without that flag, ``json.dumps`` emits ASCII-only ``\uXXXX`` escapes, which means the ``encoding="utf-8"`` pin on the surrounding ``Path.write_text`` makes no observable difference for any value we currently write. Flipping ``ensure_ascii`` makes the non-ASCII bytes hit the file directly, so the encoding pin becomes the thing that decides between cp1252 garbage and clean UTF-8 on Windows. The comment above the call now describes the real reason instead of the previously-misleading rationale Copilot flagged. 2. ``test_save_load_round_trip_preserves_non_ascii`` was a no-op under the old ``ensure_ascii=True`` writer (Copilot's second comment). Added ``test_save_writes_real_utf8_bytes`` that asserts the on-disk bytes contain the UTF-8 encoding of ``café`` (``0xC3 0xA9``), not its JSON escape form ``é``. Removing either ``ensure_ascii=False`` or ``encoding="utf-8"`` from the writer now breaks this test — the contract is pinned. 3. ``.extensionignore`` reader wraps ``UnicodeDecodeError`` as ``ValidationError`` with a pointer to the offending byte (Copilot's third comment). Mirrors ``ExtensionManifest._load_yaml``'s existing handler for ``extension.yml``. Adds ``test_extensionignore_invalid_utf8_raises_validation_error`` asserting installation aborts with the wrapped error instead of a raw Python traceback. --- src/specify_cli/__init__.py | 35 +++++++++++++++++++++--------- src/specify_cli/extensions.py | 17 ++++++++++++++- tests/test_extensions.py | 29 +++++++++++++++++++++++++ tests/test_presets.py | 40 +++++++++++++++++++++++++++++++++++ 4 files changed, 110 insertions(+), 11 deletions(-) diff --git a/src/specify_cli/__init__.py b/src/specify_cli/__init__.py index 559e6df5f8..bd681a109b 100644 --- a/src/specify_cli/__init__.py +++ b/src/specify_cli/__init__.py @@ -397,14 +397,27 @@ def save_init_options(project_path: Path, options: dict[str, Any]) -> None: """ dest = project_path / INIT_OPTIONS_FILE dest.parent.mkdir(parents=True, exist_ok=True) - # Pin UTF-8 explicitly: ``Path.write_text`` defaults to the system - # locale codec, which is cp1252 / gb2312 / cp932 on Windows. A - # locale-encoded write succeeds locally but produces a file a peer - # machine (different locale) or Unix CI cannot decode. The sibling - # integration-catalog writer in ``integrations/catalog.py`` already - # pins ``encoding="utf-8"`` for the same reason. + # Write JSON as real UTF-8 instead of ``\uXXXX`` escape sequences + # (``ensure_ascii=False``) and pin the file encoding to match. + # + # The default ``json.dumps`` output is ASCII-only — any non-ASCII + # character is encoded as a ``\uXXXX`` escape — so without the + # ``ensure_ascii=False`` flip below the encoding pin alone would be + # a no-op for any payload we plausibly write today. We pair the two + # so the on-disk bytes match a human's expectation of "this file is + # UTF-8" (greppable, readable in editors that don't decode JSON + # escapes, friendly to peers running ``cat`` or ``Get-Content``) and + # so the encoding pin is a real contract instead of a future hedge. + # + # ``Path.write_text`` without ``encoding=`` falls back to the system + # locale codec (cp1252 / gb2312 / cp932 on Windows), which would + # mis-encode non-ASCII bytes locally and produce a file a peer with + # a different locale couldn't decode. The sibling integration- + # catalog writer in ``integrations/catalog.py`` pins + # ``encoding="utf-8"`` for the same reason. dest.write_text( - json.dumps(options, indent=2, sort_keys=True), encoding="utf-8" + json.dumps(options, indent=2, sort_keys=True, ensure_ascii=False), + encoding="utf-8", ) @@ -419,11 +432,13 @@ def load_init_options(project_path: Path) -> dict[str, Any]: try: # Match the explicit UTF-8 used by ``save_init_options``; without # it ``read_text`` falls back to the system codec on Windows and - # raises ``UnicodeDecodeError`` on any file a peer wrote with - # non-ASCII content. ``UnicodeDecodeError`` is a subclass of + # raises ``UnicodeDecodeError`` on any file containing the + # multi-byte UTF-8 sequences ``save_init_options`` now writes + # directly. ``UnicodeDecodeError`` is a subclass of # ``ValueError``, not ``OSError`` / ``json.JSONDecodeError``, so # it must be listed explicitly here to preserve the existing - # "fall back to empty dict" contract. + # "fall back to empty dict" contract for corrupted / foreign- + # codec files. return json.loads(path.read_text(encoding="utf-8")) except (json.JSONDecodeError, OSError, UnicodeDecodeError): return {} diff --git a/src/specify_cli/extensions.py b/src/specify_cli/extensions.py index ba8020279b..f92bea45b4 100644 --- a/src/specify_cli/extensions.py +++ b/src/specify_cli/extensions.py @@ -767,7 +767,22 @@ def _load_extensionignore(source_dir: Path) -> Optional[Callable[[str, List[str] # machines with different locales. The next line already # normalises backslashes "so Windows-authored files work" — the # codebase already expects Windows authors to write this file. - lines: List[str] = ignore_file.read_text(encoding="utf-8").splitlines() + # + # A file that is not valid UTF-8 is a user-authoring mistake, so + # surface it as ``ValidationError`` with a pointer to the offending + # byte — the same pattern ``ExtensionManifest._load_yaml`` uses + # for ``extension.yml`` (see ``UnicodeDecodeError`` handler in + # this module). Without the wrap, the raw ``UnicodeDecodeError`` + # would abort installation with a Python traceback instead of a + # clear message naming the file. + try: + raw = ignore_file.read_text(encoding="utf-8") + except UnicodeDecodeError as e: + raise ValidationError( + f".extensionignore is not valid UTF-8: {ignore_file} " + f"({e.reason} at byte {e.start})" + ) + lines: List[str] = raw.splitlines() # Normalise backslashes in patterns so Windows-authored files work normalised: List[str] = [] diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 9cdf7b3ea2..00306beab1 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -3421,6 +3421,35 @@ def test_extensionignore_utf8_patterns(self, temp_dir, valid_manifest_data): # ASCII path with no matching pattern is unaffected. assert (dest / "docs" / "guide.md").exists() + def test_extensionignore_invalid_utf8_raises_validation_error( + self, temp_dir, valid_manifest_data + ): + """A non-UTF-8 ``.extensionignore`` surfaces as ``ValidationError``. + + Pinning ``encoding="utf-8"`` on the reader means an + ``.extensionignore`` written in some other codec (cp1252, etc.) + now triggers ``UnicodeDecodeError`` instead of silently + mojibake-ing patterns. Wrap that exception as ``ValidationError`` + with a pointer to the offending byte — the same pattern + ``ExtensionManifest._load_yaml`` uses for ``extension.yml`` — + so installation aborts with a user-friendly message instead of a + raw Python traceback. + """ + ext_dir = self._make_extension(temp_dir, valid_manifest_data) + # Write an .extensionignore whose bytes are not valid UTF-8. + # 0xE9 is 'é' in cp1252 but an invalid lead byte in UTF-8. + (ext_dir / ".extensionignore").write_bytes(b"caf\xe9/\n") + + proj_dir = temp_dir / "project" + proj_dir.mkdir() + (proj_dir / ".specify").mkdir() + + manager = ExtensionManager(proj_dir) + with pytest.raises( + ValidationError, match=r"\.extensionignore is not valid UTF-8" + ): + manager.install_from_directory(ext_dir, "0.1.0", register_commands=False) + def test_extensionignore_star_does_not_cross_directories(self, temp_dir, valid_manifest_data): """'*' should NOT match across directory boundaries (gitignore semantics).""" ext_dir = self._make_extension( diff --git a/tests/test_presets.py b/tests/test_presets.py index d3d61b4c5a..a5d581a40c 100644 --- a/tests/test_presets.py +++ b/tests/test_presets.py @@ -2282,6 +2282,13 @@ def test_save_load_round_trip_preserves_non_ascii(self, project_dir, value): ``café`` written on a UTF-8 host becomes garbled or unreadable on a cp1252 host (and vice versa). Pin UTF-8 explicitly so init options round-trip across machines and CI. + + Note: this test only meaningfully exercises the encoding pin + because ``save_init_options`` now writes JSON with + ``ensure_ascii=False`` — otherwise ``json.dumps`` would output + ASCII-only ``\\uXXXX`` escapes and the encoding pin would be a + no-op for any value here. ``test_save_writes_real_utf8_bytes`` + below asserts that contract directly. """ from specify_cli import save_init_options, load_init_options @@ -2290,6 +2297,39 @@ def test_save_load_round_trip_preserves_non_ascii(self, project_dir, value): loaded = load_init_options(project_dir) assert loaded["project_name"] == value + def test_save_writes_real_utf8_bytes(self, project_dir): + """The on-disk file contains real UTF-8 bytes, not ``\\uXXXX`` escapes. + + Pinning ``encoding="utf-8"`` on ``write_text`` only makes a + difference when the serialiser actually emits non-ASCII + characters. With ``ensure_ascii=False`` on ``json.dumps`` the + non-ASCII bytes hit the file, so the encoding pin is the thing + that decides between cp1252 garbage and clean UTF-8 on Windows. + + This test pins that behaviour: the on-disk bytes are valid UTF-8 + and contain the multi-byte encoding of ``café``, not its + ``\\u00e9`` escape form. Reviewers can verify that removing + ``ensure_ascii=False`` or ``encoding="utf-8"`` from the writer + breaks this test, which is what Copilot's review pointed out the + original round-trip test failed to do. + """ + from specify_cli import save_init_options + + save_init_options(project_dir, {"project_name": "café"}) + + opts_file = project_dir / ".specify" / "init-options.json" + raw = opts_file.read_bytes() + # 'café' in UTF-8 ends with bytes 0xC3 0xA9 ('é'). The cp1252 + # encoding of 'é' is the single byte 0xE9. The JSON-escape form + # would be the 6-byte literal '\\u00e9'. We assert the UTF-8 form + # is present so the test pins the actual contract. + assert b"caf\xc3\xa9" in raw, ( + "Expected UTF-8 bytes for 'café' in the on-disk file, " + f"got: {raw!r}" + ) + # And the whole file decodes cleanly as UTF-8. + raw.decode("utf-8") + def test_load_returns_empty_on_locale_corrupted_file(self, project_dir): """A file written in a non-UTF-8 codec falls back to {}, not crash.