diff --git a/src/specify_cli/__init__.py b/src/specify_cli/__init__.py index c0bdbaabe3..bd681a109b 100644 --- a/src/specify_cli/__init__.py +++ b/src/specify_cli/__init__.py @@ -397,7 +397,28 @@ def save_init_options(project_path: Path, options: dict[str, Any]) -> None: """ dest = project_path / INIT_OPTIONS_FILE dest.parent.mkdir(parents=True, exist_ok=True) - dest.write_text(json.dumps(options, indent=2, sort_keys=True)) + # Write JSON as real UTF-8 instead of ``\uXXXX`` escape sequences + # (``ensure_ascii=False``) and pin the file encoding to match. + # + # The default ``json.dumps`` output is ASCII-only — any non-ASCII + # character is encoded as a ``\uXXXX`` escape — so without the + # ``ensure_ascii=False`` flip below the encoding pin alone would be + # a no-op for any payload we plausibly write today. We pair the two + # so the on-disk bytes match a human's expectation of "this file is + # UTF-8" (greppable, readable in editors that don't decode JSON + # escapes, friendly to peers running ``cat`` or ``Get-Content``) and + # so the encoding pin is a real contract instead of a future hedge. + # + # ``Path.write_text`` without ``encoding=`` falls back to the system + # locale codec (cp1252 / gb2312 / cp932 on Windows), which would + # mis-encode non-ASCII bytes locally and produce a file a peer with + # a different locale couldn't decode. The sibling integration- + # catalog writer in ``integrations/catalog.py`` pins + # ``encoding="utf-8"`` for the same reason. + dest.write_text( + json.dumps(options, indent=2, sort_keys=True, ensure_ascii=False), + encoding="utf-8", + ) def load_init_options(project_path: Path) -> dict[str, Any]: @@ -409,8 +430,17 @@ def load_init_options(project_path: Path) -> dict[str, Any]: if not path.exists(): return {} try: - return json.loads(path.read_text()) - except (json.JSONDecodeError, OSError): + # Match the explicit UTF-8 used by ``save_init_options``; without + # it ``read_text`` falls back to the system codec on Windows and + # raises ``UnicodeDecodeError`` on any file containing the + # multi-byte UTF-8 sequences ``save_init_options`` now writes + # directly. ``UnicodeDecodeError`` is a subclass of + # ``ValueError``, not ``OSError`` / ``json.JSONDecodeError``, so + # it must be listed explicitly here to preserve the existing + # "fall back to empty dict" contract for corrupted / foreign- + # codec files. + return json.loads(path.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError, UnicodeDecodeError): return {} diff --git a/src/specify_cli/extensions.py b/src/specify_cli/extensions.py index 871503f0ae..f92bea45b4 100644 --- a/src/specify_cli/extensions.py +++ b/src/specify_cli/extensions.py @@ -761,7 +761,28 @@ def _load_extensionignore(source_dir: Path) -> Optional[Callable[[str, List[str] if not ignore_file.exists(): return None - lines: List[str] = ignore_file.read_text().splitlines() + # Pin UTF-8 explicitly: ``Path.read_text`` defaults to the system + # locale codec on Windows (cp1252 / gb2312 / cp932), which silently + # corrupts multibyte patterns when the file is shared across + # machines with different locales. The next line already + # normalises backslashes "so Windows-authored files work" — the + # codebase already expects Windows authors to write this file. + # + # A file that is not valid UTF-8 is a user-authoring mistake, so + # surface it as ``ValidationError`` with a pointer to the offending + # byte — the same pattern ``ExtensionManifest._load_yaml`` uses + # for ``extension.yml`` (see ``UnicodeDecodeError`` handler in + # this module). Without the wrap, the raw ``UnicodeDecodeError`` + # would abort installation with a Python traceback instead of a + # clear message naming the file. + try: + raw = ignore_file.read_text(encoding="utf-8") + except UnicodeDecodeError as e: + raise ValidationError( + f".extensionignore is not valid UTF-8: {ignore_file} " + f"({e.reason} at byte {e.start})" + ) + lines: List[str] = raw.splitlines() # Normalise backslashes in patterns so Windows-authored files work normalised: List[str] = [] diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 153388a541..00306beab1 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -3147,9 +3147,13 @@ def _make_extension(self, temp_dir, valid_manifest_data, extra_files=None, ignor else: p.write_text(content) - # Write .extensionignore + # Write .extensionignore. Pinned to UTF-8 so non-ASCII patterns + # in tests (see ``test_extensionignore_utf8_patterns``) survive + # the round-trip on Windows runners with non-UTF-8 default locales. if ignore_content is not None: - (ext_dir / ".extensionignore").write_text(ignore_content) + (ext_dir / ".extensionignore").write_text( + ignore_content, encoding="utf-8" + ) return ext_dir @@ -3379,6 +3383,73 @@ def test_extensionignore_windows_backslash_patterns(self, temp_dir, valid_manife assert (dest / "docs" / "guide.md").exists() assert not (dest / "docs" / "internal" / "draft.md").exists() + def test_extensionignore_utf8_patterns(self, temp_dir, valid_manifest_data): + """Non-ASCII patterns in .extensionignore work on every locale. + + ``Path.read_text`` defaults to the system locale codec on Windows + (cp1252 / gb2312 / cp932). Without an explicit ``encoding="utf-8"``, + a pattern like ``ドキュメント/`` written by a UTF-8 host becomes + mojibake on a cp1252 host and silently fails to match — leaking + files the author intended to exclude. The existing + ``test_extensionignore_windows_backslash_patterns`` already shows + the codebase treats this as a Windows-author-friendly file; UTF-8 + is part of that same contract. + """ + ext_dir = self._make_extension( + temp_dir, + valid_manifest_data, + extra_files={ + "ドキュメント/private.md": "secret", + "ドキュメント/public.md": "public", + "docs/guide.md": "# Guide", + "café/résumé.txt": "draft", + }, + ignore_content="ドキュメント/\ncafé/\n", + ) + + proj_dir = temp_dir / "project" + proj_dir.mkdir() + (proj_dir / ".specify").mkdir() + + manager = ExtensionManager(proj_dir) + manager.install_from_directory(ext_dir, "0.1.0", register_commands=False) + + dest = proj_dir / ".specify" / "extensions" / "test-ext" + # Multibyte patterns excluded. + assert not (dest / "ドキュメント").exists() + assert not (dest / "café").exists() + # ASCII path with no matching pattern is unaffected. + assert (dest / "docs" / "guide.md").exists() + + def test_extensionignore_invalid_utf8_raises_validation_error( + self, temp_dir, valid_manifest_data + ): + """A non-UTF-8 ``.extensionignore`` surfaces as ``ValidationError``. + + Pinning ``encoding="utf-8"`` on the reader means an + ``.extensionignore`` written in some other codec (cp1252, etc.) + now triggers ``UnicodeDecodeError`` instead of silently + mojibake-ing patterns. Wrap that exception as ``ValidationError`` + with a pointer to the offending byte — the same pattern + ``ExtensionManifest._load_yaml`` uses for ``extension.yml`` — + so installation aborts with a user-friendly message instead of a + raw Python traceback. + """ + ext_dir = self._make_extension(temp_dir, valid_manifest_data) + # Write an .extensionignore whose bytes are not valid UTF-8. + # 0xE9 is 'é' in cp1252 but an invalid lead byte in UTF-8. + (ext_dir / ".extensionignore").write_bytes(b"caf\xe9/\n") + + proj_dir = temp_dir / "project" + proj_dir.mkdir() + (proj_dir / ".specify").mkdir() + + manager = ExtensionManager(proj_dir) + with pytest.raises( + ValidationError, match=r"\.extensionignore is not valid UTF-8" + ): + manager.install_from_directory(ext_dir, "0.1.0", register_commands=False) + def test_extensionignore_star_does_not_cross_directories(self, temp_dir, valid_manifest_data): """'*' should NOT match across directory boundaries (gitignore semantics).""" ext_dir = self._make_extension( diff --git a/tests/test_presets.py b/tests/test_presets.py index f1c0e95f4e..a5d581a40c 100644 --- a/tests/test_presets.py +++ b/tests/test_presets.py @@ -2269,6 +2269,85 @@ def test_load_returns_empty_on_invalid_json(self, project_dir): assert load_init_options(project_dir) == {} + @pytest.mark.parametrize( + "value", + ["名前-プロジェクト", "café-résumé", "Ωmega-Δelta", "🚀-launch"], + ) + def test_save_load_round_trip_preserves_non_ascii(self, project_dir, value): + """Non-ASCII values round-trip via explicit UTF-8 encoding. + + ``Path.write_text`` / ``Path.read_text`` default to the system + locale codec on Windows (cp1252 / gb2312 / cp932). Without + ``encoding="utf-8"`` pinned on both ends, a project name like + ``café`` written on a UTF-8 host becomes garbled or unreadable on + a cp1252 host (and vice versa). Pin UTF-8 explicitly so init + options round-trip across machines and CI. + + Note: this test only meaningfully exercises the encoding pin + because ``save_init_options`` now writes JSON with + ``ensure_ascii=False`` — otherwise ``json.dumps`` would output + ASCII-only ``\\uXXXX`` escapes and the encoding pin would be a + no-op for any value here. ``test_save_writes_real_utf8_bytes`` + below asserts that contract directly. + """ + from specify_cli import save_init_options, load_init_options + + save_init_options(project_dir, {"ai": "claude", "project_name": value}) + + loaded = load_init_options(project_dir) + assert loaded["project_name"] == value + + def test_save_writes_real_utf8_bytes(self, project_dir): + """The on-disk file contains real UTF-8 bytes, not ``\\uXXXX`` escapes. + + Pinning ``encoding="utf-8"`` on ``write_text`` only makes a + difference when the serialiser actually emits non-ASCII + characters. With ``ensure_ascii=False`` on ``json.dumps`` the + non-ASCII bytes hit the file, so the encoding pin is the thing + that decides between cp1252 garbage and clean UTF-8 on Windows. + + This test pins that behaviour: the on-disk bytes are valid UTF-8 + and contain the multi-byte encoding of ``café``, not its + ``\\u00e9`` escape form. Reviewers can verify that removing + ``ensure_ascii=False`` or ``encoding="utf-8"`` from the writer + breaks this test, which is what Copilot's review pointed out the + original round-trip test failed to do. + """ + from specify_cli import save_init_options + + save_init_options(project_dir, {"project_name": "café"}) + + opts_file = project_dir / ".specify" / "init-options.json" + raw = opts_file.read_bytes() + # 'café' in UTF-8 ends with bytes 0xC3 0xA9 ('é'). The cp1252 + # encoding of 'é' is the single byte 0xE9. The JSON-escape form + # would be the 6-byte literal '\\u00e9'. We assert the UTF-8 form + # is present so the test pins the actual contract. + assert b"caf\xc3\xa9" in raw, ( + "Expected UTF-8 bytes for 'café' in the on-disk file, " + f"got: {raw!r}" + ) + # And the whole file decodes cleanly as UTF-8. + raw.decode("utf-8") + + def test_load_returns_empty_on_locale_corrupted_file(self, project_dir): + """A file written in a non-UTF-8 codec falls back to {}, not crash. + + Simulates a file produced by an old client (or by a peer machine + with a different default locale) that contains bytes invalid as + UTF-8. ``load_init_options`` should fall back to ``{}`` per the + existing contract — never propagate a raw ``UnicodeDecodeError`` + to the CLI surface. + """ + from specify_cli import load_init_options + + opts_file = project_dir / ".specify" / "init-options.json" + opts_file.parent.mkdir(parents=True, exist_ok=True) + # 0xE9 is 'é' in cp1252 but an invalid lead byte in UTF-8. + opts_file.write_bytes(b'{"project_name": "caf\xe9"}') + + assert load_init_options(project_dir) == {} + class TestPresetSkills: """Tests for preset skill registration and unregistration.