diff --git a/.github/scripts/msvc_build.cmd b/.github/scripts/msvc_build.cmd new file mode 100644 index 00000000..0b0cebf2 --- /dev/null +++ b/.github/scripts/msvc_build.cmd @@ -0,0 +1,58 @@ +@echo off +setlocal EnableExtensions + +if "%~4"=="" ( + echo Usage: msvc_build.cmd ROOT OUT_DIR TARGET ARCH + echo TARGET is testTransforms or kanzi + exit /b 2 +) + +set "ROOT=%~f1" +set "OUT_DIR=%~f2" +set "TARGET=%~3" +set "ARCH=%~4" +set "OBJ_DIR=%OUT_DIR%\obj-%TARGET%-%ARCH%" + +for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath`) do set "VSINSTALL=%%i" +if "%VSINSTALL%"=="" ( + echo Visual Studio installation not found + exit /b 1 +) + +call "%VSINSTALL%\Common7\Tools\VsDevCmd.bat" -arch=%ARCH% -host_arch=x64 +if errorlevel 1 exit /b %errorlevel% + +if not exist "%OUT_DIR%" mkdir "%OUT_DIR%" +if not exist "%OBJ_DIR%" mkdir "%OBJ_DIR%" + +pushd "%ROOT%" +if errorlevel 1 exit /b %errorlevel% + +git ls-files "src/*.cpp" "src/*/*.cpp" | findstr /v /i /c:"src/test/" /c:"src/app/" > "%OUT_DIR%\kanzi_lib_sources.rsp" +if errorlevel 1 exit /b %errorlevel% + +git ls-files "src/*.cpp" "src/*/*.cpp" | findstr /v /i /c:"src/test/" > "%OUT_DIR%\kanzi_cli_sources.rsp" +if errorlevel 1 exit /b %errorlevel% + +if /i "%TARGET%"=="testTransforms" goto build_testTransforms +if /i "%TARGET%"=="kanzi" goto build_kanzi + +popd +echo Unknown target: %TARGET% +exit /b 2 + +:build_testTransforms +( + echo src\test\TestTransforms.cpp + type "%OUT_DIR%\kanzi_lib_sources.rsp" +) > "%OUT_DIR%\test_transform_sources.rsp" +cl /nologo /EHsc /MT /O2 /DNDEBUG /D_CRT_SECURE_NO_WARNINGS /DTestTransforms_main=main /GR- /std:c++17 /Zc:__cplusplus /I "%ROOT%\src" /Fo"%OBJ_DIR%\\" /Fe"%OUT_DIR%\testTransforms.exe" @"%OUT_DIR%\test_transform_sources.rsp" +set "RC=%ERRORLEVEL%" +popd +exit /b %RC% + +:build_kanzi +cl /nologo /EHsc /MT /O2 /DNDEBUG /D_CRT_SECURE_NO_WARNINGS /GR- /std:c++17 /Zc:__cplusplus /I "%ROOT%\src" /Fo"%OBJ_DIR%\\" /Fe"%OUT_DIR%\kanzi.exe" @"%OUT_DIR%\kanzi_cli_sources.rsp" +set "RC=%ERRORLEVEL%" +popd +exit /b %RC% diff --git a/.github/scripts/stream_roundtrip.py b/.github/scripts/stream_roundtrip.py new file mode 100644 index 00000000..c58633a4 --- /dev/null +++ b/.github/scripts/stream_roundtrip.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +import argparse +import hashlib +import random +import subprocess +import tempfile +from pathlib import Path + + +SAMPLE_SIZE = 4 * 1024 * 1024 +CHUNK_SIZE = 256 * 1024 +LEVELS = tuple(range(1, 10)) + + +def sha256(path): + digest = hashlib.sha256() + + with path.open("rb") as source: + for chunk in iter(lambda: source.read(1024 * 1024), b""): + digest.update(chunk) + + return digest.hexdigest() + + +def write_random(path): + rng = random.Random(0x4B414E5A49) + remaining = SAMPLE_SIZE + + with path.open("wb") as target: + while remaining > 0: + size = min(CHUNK_SIZE, remaining) + target.write(bytes(rng.getrandbits(8) for _ in range(size))) + remaining -= size + + +def write_structured(path): + line = ( + b"kanzi stream transform check 0123456789 " + b"abcdefghijklmnopqrstuvwxyz repeated fields ROLZ RANK SRT " + + b"A" * 96 + + b"\n" + ) + chunk = (line * ((CHUNK_SIZE // len(line)) + 1))[:CHUNK_SIZE] + remaining = SAMPLE_SIZE + + with path.open("wb") as target: + while remaining > 0: + target.write(chunk[: min(CHUNK_SIZE, remaining)]) + remaining -= CHUNK_SIZE + + +def write_mixed(path): + rng = random.Random(0x53545245414D) + chunks = SAMPLE_SIZE // CHUNK_SIZE + + with path.open("wb") as target: + for block in range(chunks): + mode = block % 4 + + if mode == 0: + data = bytes((idx + block) & 255 for idx in range(CHUNK_SIZE)) + elif mode == 1: + data = bytes(65 + (((idx // 257) + block) % 26) for idx in range(CHUNK_SIZE)) + elif mode == 2: + data = bytes(rng.getrandbits(8) for _ in range(CHUNK_SIZE)) + else: + data = b"\0" * CHUNK_SIZE + + target.write(data) + + +def run(command): + print("+ " + " ".join(str(part) for part in command), flush=True) + subprocess.run(command, check=True) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("kanzi", type=Path) + args = parser.parse_args() + + kanzi = args.kanzi.resolve() + + if not kanzi.is_file(): + raise SystemExit(f"Missing Kanzi executable: {kanzi}") + + with tempfile.TemporaryDirectory(prefix="kanzi-stream-") as tmp_dir: + root = Path(tmp_dir) + samples = { + "random": root / "random-4m.bin", + "structured": root / "structured-4m.txt", + "mixed": root / "mixed-4m.bin", + } + writers = { + "random": write_random, + "structured": write_structured, + "mixed": write_mixed, + } + + for name, writer in writers.items(): + writer(samples[name]) + + for name, source in samples.items(): + source_hash = sha256(source) + + for level in LEVELS: + compressed = root / f"{name}-l{level}.knz" + decoded = root / f"{name}-l{level}.out" + run([str(kanzi), "-c", "-i", str(source), "-o", str(compressed), "-f", "-b", "1m", "-l", str(level), "-x32", "-j", "1", "-v", "1"]) + run([str(kanzi), "-d", "-i", str(compressed), "-o", str(decoded), "-f", "-j", "1", "-v", "1"]) + decoded_hash = sha256(decoded) + + if decoded_hash != source_hash: + raise SystemExit(f"SHA-256 mismatch for {name} level {level}") + + print( + f"{name} level {level}: {source.stat().st_size} => " + f"{compressed.stat().st_size}, SHA-256 match", + flush=True, + ) + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/transform_regression.py b/.github/scripts/transform_regression.py new file mode 100644 index 00000000..a5bb019b --- /dev/null +++ b/.github/scripts/transform_regression.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +import argparse +import subprocess +import tempfile +from pathlib import Path + + +COMMANDS = ( + ("ROLZ", ("-type=ROLZ", "-noperf")), + ("SRT", ("-type=SRT", "-noperf")), + ("RANK", ("-type=RANK", "-noperf")), + ("all", ("-type=all", "-noperf")), +) + + +def print_tail(path, lines=80): + try: + content = path.read_text(errors="replace").splitlines() + except OSError as exc: + print(f"Could not read log {path}: {exc}") + return + + tail = content[-lines:] + + if tail: + print(f"--- tail of {path.name} ---") + print("\n".join(tail)) + print("--- end tail ---") + + +def run_case(executable, name, args, timeout): + log_file = Path(tempfile.gettempdir()) / f"kanzi-transform-{name}.log" + command = [str(executable), *args] + print("+ " + " ".join(command), flush=True) + + with log_file.open("w", encoding="utf-8", errors="replace") as log: + try: + result = subprocess.run( + command, + stdout=log, + stderr=subprocess.STDOUT, + timeout=timeout, + check=False, + ) + return result.returncode, log_file + except subprocess.TimeoutExpired: + log.write(f"\nTIMEOUT after {timeout} seconds\n") + return 124, log_file + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("executable", type=Path) + parser.add_argument("--expect", choices=("pass", "fail"), required=True) + parser.add_argument("--timeout", type=int, default=300) + args = parser.parse_args() + + executable = args.executable.resolve() + + if not executable.is_file(): + raise SystemExit(f"Missing test executable: {executable}") + + failures = [] + + for name, command_args in COMMANDS: + code, log_file = run_case(executable, name, command_args, args.timeout) + print(f"{name}: exit {code}, log {log_file}", flush=True) + + if code != 0: + failures.append((name, code, log_file)) + print_tail(log_file) + + if args.expect == "pass": + if failures: + failed = ", ".join(f"{name}={code}" for name, code, _ in failures) + raise SystemExit(f"Expected all transform cases to pass, got {failed}") + + print("All transform regression cases passed", flush=True) + return + + if not failures: + raise SystemExit("Expected the baseline to reproduce at least one transform failure") + + failed = ", ".join(f"{name}={code}" for name, code, _ in failures) + print(f"Baseline reproduced transform failure(s): {failed}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/kanzi-quality.yml b/.github/workflows/kanzi-quality.yml new file mode 100644 index 00000000..efc48f6b --- /dev/null +++ b/.github/workflows/kanzi-quality.yml @@ -0,0 +1,97 @@ +name: Kanzi quality + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + workflow_dispatch: + inputs: + baseline_ref: + description: Known-bad ref used by the MSVC contrast job + required: false + default: f583957239707a6913a2a9994527bb1d3f623aa7 + +env: + KANZI_REGRESSION_BASE: f583957239707a6913a2a9994527bb1d3f623aa7 + +jobs: + windows-regression-contrast: + name: windows-2025-vs2026 / msvc x64 baseline contrast + runs-on: windows-2025-vs2026 + steps: + - uses: actions/checkout@v6.0.1 + with: + fetch-depth: 0 + - name: Select known-bad baseline + shell: pwsh + run: | + $baseline = "${{ github.event.inputs.baseline_ref }}" + if ([string]::IsNullOrWhiteSpace($baseline)) { + $baseline = "${{ env.KANZI_REGRESSION_BASE }}" + } + "BASELINE_REF=$baseline" >> $env:GITHUB_ENV + - name: Build baseline and current testTransforms + shell: cmd + run: | + git worktree add "%RUNNER_TEMP%\baseline" "%BASELINE_REF%" + call .github\scripts\msvc_build.cmd "%RUNNER_TEMP%\baseline" "%RUNNER_TEMP%\baseline-msvc-x64" testTransforms x64 + if errorlevel 1 exit /b %errorlevel% + call .github\scripts\msvc_build.cmd "%GITHUB_WORKSPACE%" "%RUNNER_TEMP%\head-msvc-x64" testTransforms x64 + if errorlevel 1 exit /b %errorlevel% + - name: Confirm baseline reproduces transform failure + shell: cmd + run: python .github\scripts\transform_regression.py --expect fail "%RUNNER_TEMP%\baseline-msvc-x64\testTransforms.exe" + - name: Confirm current head fixes transform failure + shell: cmd + run: python .github\scripts\transform_regression.py --expect pass "%RUNNER_TEMP%\head-msvc-x64\testTransforms.exe" + + unix-transforms: + name: ${{ matrix.os }} / ${{ matrix.cxx }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + include: + - os: ubuntu-latest + cc: gcc + cxx: g++ + - os: ubuntu-latest + cc: clang + cxx: clang++ + - os: macos-latest + cc: clang + cxx: clang++ + steps: + - uses: actions/checkout@v6.0.1 + - name: Build testTransforms and kanzi + run: | + cd src + make clean + make testTransforms kanzi CC=${{ matrix.cc }} CXX=${{ matrix.cxx }} + - name: Run transform tests + run: python3 .github/scripts/transform_regression.py --expect pass bin/testTransforms + - name: Run stream round trips + run: python3 .github/scripts/stream_roundtrip.py bin/kanzi + + windows-transforms: + name: windows-2025-vs2026 / msvc ${{ matrix.arch }} + runs-on: windows-2025-vs2026 + strategy: + fail-fast: false + matrix: + arch: [ x64, x86 ] + steps: + - uses: actions/checkout@v6.0.1 + - name: Build and run transform tests + shell: cmd + run: | + call .github\scripts\msvc_build.cmd "%GITHUB_WORKSPACE%" "%RUNNER_TEMP%\head-msvc-${{ matrix.arch }}" testTransforms ${{ matrix.arch }} + if errorlevel 1 exit /b %errorlevel% + python .github\scripts\transform_regression.py --expect pass "%RUNNER_TEMP%\head-msvc-${{ matrix.arch }}\testTransforms.exe" + - name: Build and run stream round trips + shell: cmd + run: | + call .github\scripts\msvc_build.cmd "%GITHUB_WORKSPACE%" "%RUNNER_TEMP%\head-msvc-${{ matrix.arch }}" kanzi ${{ matrix.arch }} + if errorlevel 1 exit /b %errorlevel% + python .github\scripts\stream_roundtrip.py "%RUNNER_TEMP%\head-msvc-${{ matrix.arch }}\kanzi.exe" diff --git a/src/test/TestTransforms.cpp b/src/test/TestTransforms.cpp index 0814caa1..a7507275 100644 --- a/src/test/TestTransforms.cpp +++ b/src/test/TestTransforms.cpp @@ -455,7 +455,7 @@ int testTransformsCorrectness(const string& name) cout << endl << "Test " << ii << endl; int size = 80000; // Declare size, will be updated in conditions - kanzi::byte values[1024 * 1024] = { kanzi::byte(0xAA) }; + vector values(1024 * 1024, kanzi::byte(0xAA)); if (name == "ALIAS") mod = 15 + 12 * ii; @@ -469,15 +469,15 @@ int testTransformsCorrectness(const string& name) (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3, (kanzi::byte)3 }; - memcpy(values, &arr[0], size); + memcpy(&values[0], &arr[0], size); } else if (ii < 10) { size = ii; - memset(values, ii, size); + memset(&values[0], ii, size); } else if (ii == 10) { size = 255; - memset(values, ii, size); + memset(&values[0], ii, size); values[127] = kanzi::byte(255); } else if (ii == 11) { @@ -488,12 +488,12 @@ int testTransformsCorrectness(const string& name) for (int i = 1; i < 80000; i++) arr[i] = kanzi::byte(8); - memcpy(values, &arr[0], size); + memcpy(&values[0], &arr[0], size); } else if (ii == 12) { size = 8; kanzi::byte arr[8] = { (kanzi::byte)0, (kanzi::byte)0, (kanzi::byte)1, (kanzi::byte)1, (kanzi::byte)2, (kanzi::byte)2, (kanzi::byte)3, (kanzi::byte)3 }; - memcpy(values, &arr[0], size); + memcpy(&values[0], &arr[0], size); } else if (ii == 13) { // For RLT @@ -506,7 +506,7 @@ int testTransformsCorrectness(const string& name) } arr[1] = kanzi::byte(255); // force RLT escape to be first symbol - memcpy(values, &arr[0], size); + memcpy(&values[0], &arr[0], size); } else if (ii == 14) { // Lots of zeros @@ -522,7 +522,7 @@ int testTransformsCorrectness(const string& name) arr[i] = kanzi::byte(val); } - memcpy(values, &arr[0], size); + memcpy(&values[0], &arr[0], size); } else if (ii == 15) { // Lots of zeros @@ -538,7 +538,7 @@ int testTransformsCorrectness(const string& name) arr[i] = kanzi::byte(val); } - memcpy(values, &arr[0], size); + memcpy(&values[0], &arr[0], size); } else if (ii == 16) { // Totally random @@ -549,7 +549,7 @@ int testTransformsCorrectness(const string& name) for (int j = 20; j < 512; j++) arr[j] = kanzi::byte(rand() % mod); - memcpy(values, &arr[0], size); + memcpy(&values[0], &arr[0], size); } else if (ii < 25) { size = 2048; @@ -566,7 +566,7 @@ int testTransformsCorrectness(const string& name) arr[j + k] = arr[j + k - step]; } - memcpy(values, &arr[0], size); + memcpy(&values[0], &arr[0], size); } else if (ii == 50) { cout << "Large random data" << endl; @@ -576,7 +576,7 @@ int testTransformsCorrectness(const string& name) for (int i = 0; i < size; i++) arr[i] = kanzi::byte(rand() % 256); - memcpy(values, arr, size); + memcpy(&values[0], arr, size); delete[] arr; } else { @@ -601,7 +601,7 @@ int testTransformsCorrectness(const string& name) idx += len; } - memcpy(values, &arr[0], size); + memcpy(&values[0], &arr[0], size); } Context ctx; @@ -665,7 +665,6 @@ int testTransformsCorrectness(const string& name) cout << endl << "Encoding error" << endl; res = 1; - ff = nullptr; goto End; } @@ -815,6 +814,7 @@ int testTransformsSpeed(const string& name) if ((iba1._index != size) || (iba2._index >= iba1._index)) { cout << endl << "No compression (ratio > 1.0), skip reverse" << endl; + delete ff; continue; } diff --git a/src/transform/ROLZCodec.cpp b/src/transform/ROLZCodec.cpp index 4710029d..6edc24dc 100644 --- a/src/transform/ROLZCodec.cpp +++ b/src/transform/ROLZCodec.cpp @@ -451,14 +451,17 @@ bool ROLZCodec1::inverse(SliceArray& input, SliceArray _posChecks = 1 << _logPosChecks; _maskChecks = uint8(_posChecks - 1); - kanzi::byte* arena = new kanzi::byte[sizeChunk + sizeChunk / 5 + 2 * sizeChunk / 4]; - SliceArray litBuf(&arena[0], sizeChunk); - SliceArray mIdxBuf(&arena[sizeChunk], sizeChunk / 4); - SliceArray tkBuf(&arena[sizeChunk + sizeChunk / 4], sizeChunk / 4); - SliceArray lenBuf(&arena[sizeChunk + sizeChunk / 2], sizeChunk / 5); + const int litBufSize = sizeChunk; + const int mIdxBufSize = sizeChunk / 4; + const int tkBufSize = sizeChunk / 4; + const int lenBufSize = sizeChunk / 5; + kanzi::byte* arena = new kanzi::byte[litBufSize + mIdxBufSize + tkBufSize + lenBufSize]; + SliceArray litBuf(&arena[0], litBufSize); + SliceArray mIdxBuf(&arena[litBufSize], mIdxBufSize); + SliceArray tkBuf(&arena[litBufSize + mIdxBufSize], tkBufSize); + SliceArray lenBuf(&arena[litBufSize + mIdxBufSize + tkBufSize], lenBufSize); memset(&_counters[0], 0, sizeof(_counters)); bool success = true; - const int litBufSize = litBuf._length; // Main loop while (startChunk < dstEnd) { @@ -475,7 +478,12 @@ bool ROLZCodec1::inverse(SliceArray& input, SliceArray try { // Decode literal, length and match index buffers - ifixedbuf buffer(reinterpret_cast(&src[srcIdx]), max(min(count - srcIdx, sizeChunk + 16), 65536)); + if (srcIdx >= count) { + success = false; + goto End; + } + + ifixedbuf buffer(reinterpret_cast(&src[srcIdx]), size_t(count - srcIdx)); istream is(&buffer); DefaultInputBitStream ibs(is, 65536); const int litLen = int(ibs.readBits(32));