Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# CHANGELOG

## 0.2.0 beta 2 - 2026 June + LATER
- **Fixed:** The test-suite adapter now writes UTF-8 JSON consistently on Windows.
- **Fixed:** Classic strings now validate YINI escape sequences directly, including invalid octal and Unicode escapes.
- **Fixed:** Literal control characters are rejected in single-line strings while multiline triple-quoted strings still preserve valid formatting.

## 0.2.0 beta 1 - 2026 June
- **Fixed:** Improved error reporting when a YINI file has broken syntax, including unfinished block comments such as `/* comment`.
- **Fixed:** `#!` lines outside the first line are now safely ignored as comment-like lines.
Expand Down
204 changes: 199 additions & 5 deletions src/yini_parser/core/value_decoders.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,26 @@
# src/yini_parser/core/value_decoders.py
from typing import NoReturn

from ..api.errors import YiniParseError

"""
- Parsers read raw/source text and recognize its structure as tokens.
- Decoders convert tokens into their runtime values.
"""

_CLASSIC_SIMPLE_ESCAPES = {
"\\": "\\",
'"': '"',
"'": "'",
"a": "\a",
"b": "\b",
"f": "\f",
"n": "\n",
"r": "\r",
"t": "\t",
"v": "\v",
}


def decode_string_token(
token_text: str,
Expand Down Expand Up @@ -46,10 +61,17 @@ def decode_string_token(
if prefix in {"C", "c"}:
return _decode_classic_string(
inner,
allow_line_breaks=True,
line=line,
column=column,
)

_validate_string_content(
inner,
allow_line_breaks=True,
line=line,
column=column,
)
return inner

# Single-quoted or double-quoted string.
Expand All @@ -58,12 +80,19 @@ def decode_string_token(

# Raw and unprefixed strings: return as-is.
if prefix in {"", "R", "r"}:
_validate_string_content(
inner,
allow_line_breaks=False,
line=line,
column=column,
)
return inner

# Classic strings: decode escapes.
if prefix in {"C", "c"}:
return _decode_classic_string(
inner,
allow_line_breaks=False,
line=line,
column=column,
)
Expand Down Expand Up @@ -141,17 +170,182 @@ def parse_number_literal(text, line=None, column=None):
# Fixed-width numeric escapes: escape letter -> (digit count, numeric base).
_CLASSIC_DIGIT_ESCAPES = {
    "o": (3, 8),
    "x": (2, 16),
    "u": (4, 16),
    "U": (8, 16),
}


def _decode_classic_string(
    inner: str,
    *,
    allow_line_breaks: bool,
    line: int | None = None,
    column: int | None = None,
) -> str:
    """Decode the escape sequences in the body of a Classic string.

    The text is scanned left to right. Characters other than a backslash are
    checked with ``_validate_string_content`` and copied through. A backslash
    must be followed by either a simple escape from
    ``_CLASSIC_SIMPLE_ESCAPES`` or one of the fixed-width numeric escapes
    (``o`` + 3 octal digits, ``x`` + 2, ``u`` + 4, ``U`` + 8 hex digits).

    Args:
        inner: String content without its surrounding quotes.
        allow_line_breaks: Forwarded to ``_validate_string_content`` for
            every unescaped character.
        line: Source line passed through to any raised error.
        column: Source column passed through to any raised error.

    Returns:
        The decoded string.

    Raises:
        YiniParseError: For a trailing backslash, an unknown escape letter,
            a malformed numeric escape, or a disallowed literal control
            character.
    """
    # Escapes are decoded by hand instead of via the "unicode_escape" codec:
    # that codec decodes bytes as Latin-1 and accepts Python-only escapes.
    result: list[str] = []
    index = 0
    size = len(inner)

    while index < size:
        char = inner[index]

        if char != "\\":
            _validate_string_content(
                char,
                allow_line_breaks=allow_line_breaks,
                line=line,
                column=column,
            )
            result.append(char)
            index += 1
            continue

        if index + 1 >= size:
            _raise_invalid_escape(
                "Invalid string escape sequence: trailing backslash.",
                line=line,
                column=column,
            )

        escape = inner[index + 1]

        if escape in _CLASSIC_SIMPLE_ESCAPES:
            result.append(_CLASSIC_SIMPLE_ESCAPES[escape])
            index += 2
            continue

        if escape in _CLASSIC_DIGIT_ESCAPES:
            length, base = _CLASSIC_DIGIT_ESCAPES[escape]
            result.append(
                _decode_digits_escape(
                    inner,
                    start=index + 2,
                    length=length,
                    base=base,
                    prefix=f"\\{escape}",
                    line=line,
                    column=column,
                )
            )
            # Skip the backslash, the escape letter and the digits.
            index += 2 + length
            continue

        _raise_invalid_escape(
            f"Invalid string escape sequence: \\{escape}.",
            line=line,
            column=column,
        )

    return "".join(result)


def _validate_string_content(
text: str,
*,
allow_line_breaks: bool,
line: int | None = None,
column: int | None = None,
) -> None:
for char in text:
if ord(char) >= 0x20:
continue

if allow_line_breaks and char in {"\n", "\r", "\t"}:
continue

raise YiniParseError(
f"Invalid string escape sequence: {exc.reason}.",
"Invalid string literal: literal control characters are not allowed.",
line=line,
column=column,
) from None
)


def _decode_digits_escape(
    text: str,
    *,
    start: int,
    length: int,
    base: int,
    prefix: str,
    line: int | None = None,
    column: int | None = None,
) -> str:
    """Decode one fixed-width numeric escape into a single character.

    Args:
        text: The string content being decoded.
        start: Index in ``text`` of the first digit.
        length: Exact number of digits the escape requires.
        base: Numeric base of the digits (8 or 16).
        prefix: The escape introducer (for example ``\\x``), used in error
            messages only.
        line: Source line attached to any raised error.
        column: Source column attached to any raised error.

    Returns:
        The character whose code point the digits spell.

    Raises:
        YiniParseError: If fewer than ``length`` digits are available, a
            digit is not valid for ``base``, or the value is above U+10FFFF.
        ValueError: If ``base`` is not supported by ``_digits_match_base``.
    """
    digits = text[start : start + length]

    if len(digits) != length or not _digits_match_base(digits, base):
        _raise_invalid_escape(
            f"Invalid string escape sequence: {prefix} requires {length} "
            f"base-{base} digit(s).",
            line=line,
            column=column,
        )

    # The digits were validated above, so int() cannot fail here.
    code_point = int(digits, base)

    # Check the range explicitly instead of relying on chr() to fail: for
    # values that do not fit a C int (possible with 8 hex digits), chr() can
    # raise OverflowError rather than ValueError on some CPython versions,
    # which would escape as a non-parse error.
    # NOTE(review): surrogate code points (U+D800-U+DFFF) are still accepted;
    # confirm against the YINI spec whether they should be rejected.
    if code_point > 0x10FFFF:
        _raise_invalid_escape(
            f"Invalid string escape sequence: {prefix}{digits}.",
            line=line,
            column=column,
        )

    return chr(code_point)


def _digits_match_base(digits: str, base: int) -> bool:
if base == 8:
return all("0" <= digit <= "7" for digit in digits)

if base == 16:
return all(digit in "0123456789abcdefABCDEF" for digit in digits)

raise ValueError(f"Unsupported escape base: {base}")


def _raise_invalid_escape(
    message: str,
    *,
    line: int | None = None,
    column: int | None = None,
) -> NoReturn:
    """Raise ``YiniParseError`` with ``message`` and the given position.

    Never returns; the ``NoReturn`` annotation lets callers use it as the
    terminating statement of a branch.
    """
    position = {"line": line, "column": column}
    raise YiniParseError(message, **position)


def _parse_duodecimal(
Expand Down
40 changes: 40 additions & 0 deletions tests/test_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from __future__ import annotations

import json
import subprocess
import sys
from pathlib import Path


def test_adapter_writes_utf8_json_for_unicode_strings(tmp_path: Path) -> None:
    """The adapter's stdout is UTF-8 JSON, including non-ASCII characters.

    Runs the adapter as a subprocess on a small YINI file containing curly
    quotes, decodes the captured stdout bytes as UTF-8 and compares the
    parsed JSON with the expected structure.
    """
    # Resolve the adapter from this file's location (tests/ sits next to
    # tools/), so the test does not depend on the directory pytest was
    # launched from.
    repo_root = Path(__file__).resolve().parent.parent
    adapter_path = repo_root / "tools" / "yini_parser_adapter.py"

    input_path = tmp_path / "unicode.yini"
    input_path.write_text(
        """
@yini

^ Strings
quote = "She said “hello” and left."
""".lstrip(),
        encoding="utf-8",
    )

    completed = subprocess.run(
        [
            sys.executable,
            str(adapter_path),
            "--input",
            str(input_path),
            "--mode",
            "lenient",
        ],
        check=True,
        capture_output=True,
        # Run from the repository root, as the original relative path implied.
        cwd=repo_root,
    )

    # Decode the raw bytes explicitly: the point of the test is the encoding.
    stdout = completed.stdout.decode("utf-8")

    assert json.loads(stdout) == {
        "Strings": {
            "quote": "She said “hello” and left.",
        },
    }
35 changes: 35 additions & 0 deletions tests/test_values.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# tests/test_values.py
from __future__ import annotations

import pytest

from yini_parser.api.errors import YiniParseError
from yini_parser.api.load import loads


Expand Down Expand Up @@ -90,6 +93,38 @@ def test_parses_basic_strings() -> None:
}


def test_parses_classic_octal_escape() -> None:
    """An octal escape in a Classic string decodes to its character."""
    source = r"""
^ App
letter = C"\o141"
""".lstrip()

    # Octal 141 is the code point of the letter "a".
    expected = {"App": {"letter": "a"}}

    assert loads(source) == expected


def test_rejects_invalid_classic_octal_escape() -> None:
    """An octal escape containing a non-octal digit is a parse error."""
    # The digit 8 is not valid in base 8.
    source = r"""
^ App
bad = C"\o378"
""".lstrip()

    expect_parse_error = pytest.raises(YiniParseError)

    with expect_parse_error:
        loads(source)


def test_rejects_literal_control_character_in_string() -> None:
    """A literal tab inside a single-line quoted string is a parse error."""
    # Not a raw string: the source text carries a real tab character.
    source = '^ App\nbad = "alpha\tbeta"\n'

    expect_parse_error = pytest.raises(YiniParseError)

    with expect_parse_error:
        loads(source)


def test_parses_lists() -> None:
text = """
^ App
Expand Down
15 changes: 15 additions & 0 deletions tools/yini_parser_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,20 @@
from typing import NoReturn


def _configure_stdio() -> None:
    """
    Switch stdout and stderr to UTF-8 where the stream supports it.

    The yini-test-suite reads adapter output as UTF-8 JSON. On some Windows
    setups, Python may encode piped stdout/stderr with the active locale
    unless the encoding is made explicit.
    """

    for stream in (sys.stdout, sys.stderr):
        # Streams without a reconfigure() method are left untouched.
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8")


def _ensure_local_src_on_path() -> None:
"""
Allows this adapter to run directly from the repository root without
Expand Down Expand Up @@ -48,6 +62,7 @@ def _parse_args() -> argparse.Namespace:


def main() -> int:
_configure_stdio()
_ensure_local_src_on_path()

from yini_parser.api.errors import YiniParseError
Expand Down
Loading