From d33d823bc4f758bcc7e35392db52e00910f75c7a Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Wed, 17 Jun 2026 22:55:13 -0400 Subject: [PATCH 1/7] Backport Zstd streaming decompression, multi-frame, and 32-bit fix (#707, #782, #757) Backport three Zstandard improvements from main to the 0.15.x branch, adapted to use the v0.15.x Buffer API from compat_ext: - Add streaming decompression when frame content size is unknown (#707) - Fix negative size issue on 32-bit platforms (#782) - Support decompression of multiple concatenated frames (#757) Co-Authored-By: Claude Sonnet 4.6 --- docs/release.rst | 6 + numcodecs/tests/test_pyzstd.py | 75 ++++++++++++ numcodecs/tests/test_zstd.py | 102 ++++++++++++++++ numcodecs/zstd.pyx | 207 +++++++++++++++++++++++++++++++-- pyproject.toml | 1 + 5 files changed, 381 insertions(+), 10 deletions(-) create mode 100644 numcodecs/tests/test_pyzstd.py diff --git a/docs/release.rst b/docs/release.rst index c5ba3919..e865fd1a 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -21,6 +21,12 @@ Improvements ~~~~~~~~~~~~ * Raise a custom `UnknownCodecError` when trying to retrieve an unavailable codec. By :user:`Cas Wognum `. +* Add streaming decompression for Zstandard when the frame content size is unknown. + By :user:`Mark Kittisopikul `, :issue:`707` +* Fix Zstd decompression negative size issue on 32-bit platforms. + By :user:`Mark Kittisopikul `, :issue:`782` +* Allow Zstandard to decompress multiple concatenated frames. 
+ By :user:`Mark Kittisopikul `, :issue:`757` Fixes ~~~~~ diff --git a/numcodecs/tests/test_pyzstd.py b/numcodecs/tests/test_pyzstd.py new file mode 100644 index 00000000..e6df84a7 --- /dev/null +++ b/numcodecs/tests/test_pyzstd.py @@ -0,0 +1,75 @@ +# Check Zstd against pyzstd package + +import numpy as np +import pytest +import pyzstd +from numcodecs.zstd import Zstd + +test_data = [ + b"Hello World!", + np.arange(113).tobytes(), + np.arange(10, 15).tobytes(), + np.random.randint(3, 50, size=(53,), dtype=np.uint16).tobytes(), +] + + +@pytest.mark.parametrize("input", test_data) +def test_pyzstd_simple(input): + """ + Test if Zstd.[decode, encode] can perform the inverse operation to + pyzstd.[compress, decompress] in the simple case. + """ + z = Zstd() + assert z.decode(pyzstd.compress(input)) == input + assert pyzstd.decompress(z.encode(input)) == input + + +@pytest.mark.parametrize("input", test_data) +def test_pyzstd_simple_multiple_frames_decode(input): + """ + Test decompression of two concatenated frames of known sizes + + numcodecs.zstd.Zstd currently fails because it only assesses the size of the + first frame. Rather, it should keep iterating through all the frames until + the end of the input buffer. + """ + z = Zstd() + assert pyzstd.decompress(pyzstd.compress(input) * 2) == input * 2 + assert z.decode(pyzstd.compress(input) * 2) == input * 2 + + +@pytest.mark.parametrize("input", test_data) +def test_pyzstd_simple_multiple_frames_encode(input): + """ + Test if pyzstd can decompress two concatenated frames from Zstd.encode + """ + z = Zstd() + assert pyzstd.decompress(z.encode(input) * 2) == input * 2 + + +@pytest.mark.parametrize("input", test_data) +def test_pyzstd_streaming(input): + """ + Test if Zstd can decode a single frame and concatenated frames in streaming + mode where the decompressed size is not recorded in the frame header. 
+ """ + pyzstd_c = pyzstd.ZstdCompressor() + pyzstd_d = pyzstd.ZstdDecompressor() + pyzstd_e = pyzstd.EndlessZstdDecompressor() + z = Zstd() + + d_bytes = input + pyzstd_c.compress(d_bytes) + c_bytes = pyzstd_c.flush() + assert z.decode(c_bytes) == d_bytes + assert pyzstd_d.decompress(z.encode(d_bytes)) == d_bytes + + # Test multiple streaming frames + assert z.decode(c_bytes * 2) == pyzstd_e.decompress(c_bytes * 2) + assert z.decode(c_bytes * 3) == pyzstd_e.decompress(c_bytes * 3) + assert z.decode(c_bytes * 4) == pyzstd_e.decompress(c_bytes * 4) + assert z.decode(c_bytes * 5) == pyzstd_e.decompress(c_bytes * 5) + assert z.decode(c_bytes * 7) == pyzstd_e.decompress(c_bytes * 7) + assert z.decode(c_bytes * 11) == pyzstd_e.decompress(c_bytes * 11) + assert z.decode(c_bytes * 13) == pyzstd_e.decompress(c_bytes * 13) + assert z.decode(c_bytes * 99) == pyzstd_e.decompress(c_bytes * 99) diff --git a/numcodecs/tests/test_zstd.py b/numcodecs/tests/test_zstd.py index de42d9e1..afa87de6 100644 --- a/numcodecs/tests/test_zstd.py +++ b/numcodecs/tests/test_zstd.py @@ -1,4 +1,5 @@ import itertools +import subprocess import numpy as np import pytest @@ -90,3 +91,104 @@ def test_native_functions(): assert Zstd.default_level() == 3 assert Zstd.min_level() == -131072 assert Zstd.max_level() == 22 + + +def test_streaming_decompression(): + # Test input frames with unknown frame content size + codec = Zstd() + + # If the zstd command line interface is available, check the bytes + cli = zstd_cli_available() + if cli: + view_zstd_streaming_bytes() + + # Encode bytes directly that were the result of streaming compression + bytes_val = b'(\xb5/\xfd\x00Xa\x00\x00Hello World!' + dec = codec.decode(bytes_val) + dec_expected = b'Hello World!' 
+ assert dec == dec_expected + if cli: + assert bytes_val == generate_zstd_streaming_bytes(dec_expected) + assert dec_expected == generate_zstd_streaming_bytes(bytes_val, decompress=True) + + # Two consecutive frames given as input + bytes2 = bytes(bytearray(bytes_val * 2)) + dec2 = codec.decode(bytes2) + dec2_expected = b'Hello World!Hello World!' + assert dec2 == dec2_expected + if cli: + assert dec2_expected == generate_zstd_streaming_bytes(bytes2, decompress=True) + + # Single long frame that decompresses to a large output + bytes3 = b'(\xb5/\xfd\x00X$\x02\x00\xa4\x03ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz\x01\x00:\xfc\xdfs\x05\x05L\x00\x00\x08s\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08k\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08c\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08[\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08S\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08K\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08C\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08u\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08m\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08e\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08]\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08U\x01\x00\xfc\xff9\x10\x02L\x00\x00\x08M\x01\x00\xfc\xff9\x10\x02M\x00\x00\x08E\x01\x00\xfc\x7f\x1d\x08\x01' + dec3 = codec.decode(bytes3) + dec3_expected = b'ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz' * 1024 * 32 + assert dec3 == dec3_expected + if cli: + assert bytes3 == generate_zstd_streaming_bytes(dec3_expected) + assert dec3_expected == generate_zstd_streaming_bytes(bytes3, decompress=True) + + # Garbage input results in an error + bytes4 = bytes(bytearray([0, 0, 0, 0, 0, 0, 0, 0])) + with pytest.raises(RuntimeError, match='Zstd decompression error: invalid input data'): + codec.decode(bytes4) + + +def test_multi_frame(): + codec = Zstd() + + hello_world = codec.encode(b"Hello world!") + assert codec.decode(hello_world) == b"Hello world!" + assert codec.decode(hello_world * 2) == b"Hello world!Hello world!" 
+ + hola = codec.encode(b"Hola ") + mundo = codec.encode(b"Mundo!") + assert codec.decode(hola) == b"Hola " + assert codec.decode(mundo) == b"Mundo!" + assert codec.decode(hola + mundo) == b"Hola Mundo!" + + bytes_val = b'(\xb5/\xfd\x00Xa\x00\x00Hello World!' + dec = codec.decode(bytes_val) + dec_expected = b'Hello World!' + assert dec == dec_expected + cli = zstd_cli_available() + if cli: + assert bytes_val == generate_zstd_streaming_bytes(dec_expected) + assert dec_expected == generate_zstd_streaming_bytes(bytes_val, decompress=True) + + # Concatenate frames of known sizes and unknown sizes + # unknown size frame at the end + assert codec.decode(hola + mundo + bytes_val) == b"Hola Mundo!Hello World!" + # unknown size frame at the beginning + assert codec.decode(bytes_val + hola + mundo) == b"Hello World!Hola Mundo!" + # unknown size frame in the middle + assert codec.decode(hola + bytes_val + mundo) == b"Hola Hello World!Mundo!" + + +def generate_zstd_streaming_bytes(input: bytes, *, decompress: bool = False) -> bytes: + """ + Use the zstd command line interface to compress or decompress bytes in streaming mode. 
+ """ + if decompress: + args = ["-d"] + else: + args = [] + + p = subprocess.run(["zstd", "--no-check", *args], input=input, capture_output=True) + return p.stdout + + +def view_zstd_streaming_bytes(): + bytes_val = generate_zstd_streaming_bytes(b"Hello world!") + print(f" bytes_val = {bytes_val}") + + bytes3 = generate_zstd_streaming_bytes( + b"ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz" * 1024 * 32 + ) + print(f" bytes3 = {bytes3}") + + +def zstd_cli_available() -> bool: + return not subprocess.run( + ["zstd", "-V"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL + ).returncode diff --git a/numcodecs/zstd.pyx b/numcodecs/zstd.pyx index efd12fa2..30337642 100644 --- a/numcodecs/zstd.pyx +++ b/numcodecs/zstd.pyx @@ -14,6 +14,10 @@ from .compat_ext import Buffer from .compat import ensure_contiguous_ndarray from .abc import Codec +from libc.stdlib cimport malloc, realloc, free + +cdef extern from "stdint.h": + cdef size_t SIZE_MAX cdef extern from "zstd.h": @@ -22,6 +26,23 @@ cdef extern from "zstd.h": struct ZSTD_CCtx_s: pass ctypedef ZSTD_CCtx_s ZSTD_CCtx + + struct ZSTD_DStream_s: + pass + ctypedef ZSTD_DStream_s ZSTD_DStream + + struct ZSTD_inBuffer_s: + const void* src + size_t size + size_t pos + ctypedef ZSTD_inBuffer_s ZSTD_inBuffer + + struct ZSTD_outBuffer_s: + void* dst + size_t size + size_t pos + ctypedef ZSTD_outBuffer_s ZSTD_outBuffer + cdef enum ZSTD_cParameter: ZSTD_c_compressionLevel=100 ZSTD_c_checksumFlag=201 @@ -43,10 +64,21 @@ cdef extern from "zstd.h": const void* src, size_t compressedSize) nogil - cdef long ZSTD_CONTENTSIZE_UNKNOWN - cdef long ZSTD_CONTENTSIZE_ERROR + size_t ZSTD_decompressStream(ZSTD_DStream* zds, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input) nogil + + size_t ZSTD_DStreamOutSize() nogil + ZSTD_DStream* ZSTD_createDStream() nogil + size_t ZSTD_freeDStream(ZSTD_DStream* zds) nogil + size_t ZSTD_initDStream(ZSTD_DStream* zds) nogil + + cdef unsigned long long ZSTD_CONTENTSIZE_UNKNOWN + cdef unsigned 
long long ZSTD_CONTENTSIZE_ERROR + unsigned long long ZSTD_getFrameContentSize(const void* src, size_t srcSize) nogil + size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) nogil int ZSTD_minCLevel() nogil int ZSTD_maxCLevel() nogil @@ -56,7 +88,7 @@ cdef extern from "zstd.h": unsigned ZSTD_isError(size_t code) nogil - const char* ZSTD_getErrorName(size_t code) + const char* ZSTD_getErrorName(size_t code) nogil VERSION_NUMBER = ZSTD_versionNumber() @@ -156,7 +188,10 @@ def decompress(source, dest=None): source : bytes-like Compressed data. Can be any object supporting the buffer protocol. dest : array-like, optional - Object to decompress into. + Object to decompress into. If the content size is unknown, the + length of dest must match the decompressed size. If the content size + is unknown and dest is not provided, streaming decompression will be + used. Returns ------- @@ -170,6 +205,8 @@ def decompress(source, dest=None): Buffer source_buffer Buffer dest_buffer = None size_t source_size, dest_size, decompressed_size + size_t dest_nbytes + unsigned long long content_size # setup source buffer source_buffer = Buffer(source, PyBUF_ANY_CONTIGUOUS) @@ -177,11 +214,24 @@ def decompress(source, dest=None): source_size = source_buffer.nbytes try: + # determine uncompressed size using unsigned long long for full range + try: + content_size = findTotalContentSize(source_ptr, source_size) + except RuntimeError: + raise RuntimeError('Zstd decompression error: invalid input data') - # determine uncompressed size - dest_size = ZSTD_getFrameContentSize(source_ptr, source_size) - if dest_size == 0 or dest_size == ZSTD_CONTENTSIZE_UNKNOWN or dest_size == ZSTD_CONTENTSIZE_ERROR: + if content_size == ZSTD_CONTENTSIZE_UNKNOWN and dest is None: + return stream_decompress(source_ptr, source_size) + elif content_size == ZSTD_CONTENTSIZE_UNKNOWN: + # dest is not None + # set dest_size based on dest + pass + elif content_size == ZSTD_CONTENTSIZE_ERROR or content_size == 
0: raise RuntimeError('Zstd decompression error: invalid input data') + elif content_size > (SIZE_MAX): + raise RuntimeError('Zstd decompression error: content size too large for platform') + + dest_size = content_size # setup destination buffer if dest is None: @@ -192,16 +242,21 @@ def decompress(source, dest=None): arr = ensure_contiguous_ndarray(dest) dest_buffer = Buffer(arr, PyBUF_ANY_CONTIGUOUS | PyBUF_WRITEABLE) dest_ptr = dest_buffer.ptr - if dest_buffer.nbytes < dest_size: + dest_nbytes = dest_buffer.nbytes + + if content_size == ZSTD_CONTENTSIZE_UNKNOWN: + dest_size = dest_nbytes + + # validate output buffer + if dest_nbytes < dest_size: raise ValueError('destination buffer too small; expected at least %s, ' - 'got %s' % (dest_size, dest_buffer.nbytes)) + 'got %s' % (dest_size, dest_nbytes)) # perform decompression with nogil: decompressed_size = ZSTD_decompress(dest_ptr, dest_size, source_ptr, source_size) finally: - # release buffers source_buffer.release() if dest_buffer is not None: @@ -218,6 +273,138 @@ def decompress(source, dest=None): return dest +cdef stream_decompress(const char* source_ptr, size_t source_size): + """Decompress data of unknown size + + Parameters + ---------- + source_ptr : const char* + Pointer to compressed data buffer + source_size : size_t + Size of compressed data buffer + + Returns + ------- + dest : bytes + Object containing decompressed data. 
+ """ + + cdef: + void *dest_ptr + void *new_dst + size_t dest_size + size_t DEST_GROWTH_SIZE, status + ZSTD_inBuffer input + ZSTD_outBuffer output + ZSTD_DStream *zds + + # Recommended size for output buffer, guaranteed to flush at least + # one complete block in all circumstances + DEST_GROWTH_SIZE = ZSTD_DStreamOutSize(); + + # unknown content size, guess it is twice the size of the source + dest_size = source_size * 2 + + if dest_size < DEST_GROWTH_SIZE: + # minimum dest_size is DEST_GROWTH_SIZE + dest_size = DEST_GROWTH_SIZE + + dest_ptr = malloc(dest_size) + zds = ZSTD_createDStream() + + try: + + with nogil: + + status = ZSTD_initDStream(zds) + if ZSTD_isError(status): + error = ZSTD_getErrorName(status) + ZSTD_freeDStream(zds); + raise RuntimeError('Zstd stream decompression error on ZSTD_initDStream: %s' % error) + + input = ZSTD_inBuffer(source_ptr, source_size, 0) + output = ZSTD_outBuffer(dest_ptr, dest_size, 0) + + # Initialize to 1 to force a loop iteration + status = 1 + while(status > 0 or input.pos < input.size): + # Possible returned values of ZSTD_decompressStream: + # 0: frame is completely decoded and fully flushed + # error (<0) + # >0: suggested next input size + status = ZSTD_decompressStream(zds, &output, &input) + + if ZSTD_isError(status): + error = ZSTD_getErrorName(status) + raise RuntimeError('Zstd stream decompression error on ZSTD_decompressStream: %s' % error) + + # There is more to decompress, grow the buffer + if status > 0 and output.pos == output.size: + new_size = output.size + DEST_GROWTH_SIZE + + if new_size < output.size or new_size < DEST_GROWTH_SIZE: + raise RuntimeError('Zstd stream decompression error: output buffer overflow') + + new_dst = realloc(output.dst, new_size) + + if new_dst == NULL: + # output.dst freed in finally block + raise RuntimeError('Zstd stream decompression error on realloc: could not expand output buffer') + + output.dst = new_dst + output.size = new_size + + # Copy the output to a bytes object + 
dest = PyBytes_FromStringAndSize(output.dst, output.pos) + + finally: + ZSTD_freeDStream(zds) + free(output.dst) + + return dest + + +cdef unsigned long long findTotalContentSize(const char* source_ptr, size_t source_size): + """Find the total uncompressed content size of all frames in the source buffer + + Parameters + ---------- + source_ptr : Pointer to the beginning of the buffer + source_size : Size of the buffer containing the frame sizes to sum + + Returns + ------- + total_content_size: Sum of the content size of all frames within the source buffer + If any of the frame sizes is unknown, return ZSTD_CONTENTSIZE_UNKNOWN. + If any of the frames causes ZSTD_getFrameContentSize to error, return ZSTD_CONTENTSIZE_ERROR. + """ + cdef: + unsigned long long frame_content_size = 0 + unsigned long long total_content_size = 0 + size_t frame_compressed_size = 0 + size_t offset = 0 + + while offset < source_size: + frame_compressed_size = ZSTD_findFrameCompressedSize(source_ptr + offset, source_size - offset); + + if ZSTD_isError(frame_compressed_size): + error = ZSTD_getErrorName(frame_compressed_size) + raise RuntimeError('Could not set determine zstd frame size: %s' % error) + + frame_content_size = ZSTD_getFrameContentSize(source_ptr + offset, frame_compressed_size); + + if frame_content_size == ZSTD_CONTENTSIZE_ERROR: + return ZSTD_CONTENTSIZE_ERROR + + if frame_content_size == ZSTD_CONTENTSIZE_UNKNOWN: + return ZSTD_CONTENTSIZE_UNKNOWN + + total_content_size += frame_content_size + offset += frame_compressed_size + + return total_content_size + + class Zstd(Codec): """Codec providing compression using Zstandard. 
diff --git a/pyproject.toml b/pyproject.toml index 29ef0bf0..f2c3e846 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,7 @@ test = [ "coverage", "pytest", "pytest-cov", + "pyzstd", ] test_extras = [ "importlib_metadata", From 15803df58f33b4d621810b1b3aa9fd506400871c Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Thu, 18 Jun 2026 23:56:06 -0400 Subject: [PATCH 2/7] Wrap parametrize itertools.product in tuple pytest 9.1 raises PytestRemovedIn10Warning for non-Collection iterables passed to parametrize, which becomes an error under filterwarnings=error and aborts test collection. Wrap in tuple() to match main. Co-Authored-By: Claude Opus 4.8 --- numcodecs/tests/test_checksum32.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numcodecs/tests/test_checksum32.py b/numcodecs/tests/test_checksum32.py index 9bdc25cb..8af40945 100644 --- a/numcodecs/tests/test_checksum32.py +++ b/numcodecs/tests/test_checksum32.py @@ -54,12 +54,12 @@ ) -@pytest.mark.parametrize(("codec", "arr"), itertools.product(codecs, arrays)) +@pytest.mark.parametrize(("codec", "arr"), tuple(itertools.product(codecs, arrays))) def test_encode_decode(codec, arr): check_encode_decode(arr, codec) -@pytest.mark.parametrize(("codec", "arr"), itertools.product(codecs, arrays)) +@pytest.mark.parametrize(("codec", "arr"), tuple(itertools.product(codecs, arrays))) def test_errors(codec, arr): enc = codec.encode(arr) with pytest.raises(RuntimeError): From c4360702acf5df1dce46c7a1d3b6a23c5048add3 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 19 Jun 2026 00:12:39 -0400 Subject: [PATCH 3/7] Cap zarr to <3.1 in CI The numcodecs.zarr3 module in this branch targets the zarr 3.0.x API. zarr 3.1+ moved these codecs into zarr itself and changed internal APIs (e.g. the new dtype system), breaking test_zarr3.py. The previous `--pre zarr>=3.0.0b2` install resolved to zarr 3.2.1, causing 26 failures on Python 3.12/3.13. 
Pin the test dependency to a compatible zarr release. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/ci.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a2a62c44..cfeab3cc 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -56,8 +56,9 @@ jobs: # Since zarr v3 requires numpy >= 1.25, on Python 3.11 leave it out # so we can have some tests of our minimum version of numpy (1.24) if: matrix.python-version != '3.11' - # TODO: remove --pre option when zarr v3 is out - run: python -m pip install --pre zarr>=3.0.0b2 + # numcodecs.zarr3 in this branch targets the zarr 3.0.x API; zarr 3.1+ + # moved these codecs into zarr itself and changed internal APIs. + run: python -m pip install "zarr>=3.0,<3.1" - name: List installed packages run: python -m pip list From 64eade51d9ed4a95f00001036fc947559c19f96f Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 19 Jun 2026 00:26:21 -0400 Subject: [PATCH 4/7] Swap macos-13 runner for macos-15-large in CI macos-13 runners are stalling. main already moved to macos-15-large (still an Intel runner, preserving x86 coverage). Update the matrix and the clang-install conditional to match. 
Co-Authored-By: Claude Opus 4.8 --- .github/workflows/ci.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index cfeab3cc..25e43f16 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -13,8 +13,8 @@ jobs: fail-fast: false matrix: python-version: ["3.11", "3.12", "3.13"] - # macos-13 is an intel runner, macos-14 is a arm64 runner - platform: [ubuntu-latest, windows-latest, macos-13, macos-14] + # macos-15-large is an intel runner, macos-14 is an arm64 runner + platform: [ubuntu-latest, windows-latest, macos-15-large, macos-14] defaults: run: @@ -38,7 +38,7 @@ jobs: run: conda install -y c-compiler cxx-compiler - name: Install clang - if: matrix.platform == 'macos-13' + if: matrix.platform == 'macos-15-large' run: conda install -y 'clang>=12.0.1,<17' - name: Show conda environment info From 7ae0f4b3af1f2aad650f4519f04986ac26b96dfa Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Fri, 19 Jun 2026 07:19:02 -0400 Subject: [PATCH 5/7] Strip duplicate LC_RPATH from extensions on macOS The conda-forge compilers inject -Wl,-rpath,$PREFIX/lib three times per link, producing duplicate LC_RPATH load commands. macOS 15's dyld rejects duplicate LC_RPATH at load time, so the freshly switched macos-15-large runners failed every test with an ImportError on dlopen. macos-14's older dyld tolerated the duplicates. Add a macOS-only post-build step that uses install_name_tool to remove the redundant LC_RPATH entries from numcodecs/*.so. 
Co-Authored-By: Claude Opus 4.8 --- .github/workflows/ci.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 25e43f16..ed0bb548 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -52,6 +52,22 @@ jobs: export DISABLE_NUMCODECS_AVX2="" python -m pip install -v -e .[test,test_extras,msgpack,crc32c,pcodec,zfpy] + - name: Deduplicate LC_RPATH entries + # The conda-forge compilers inject -Wl,-rpath,$PREFIX/lib multiple + # times, producing duplicate LC_RPATH load commands. macOS 15's dyld + # rejects duplicate LC_RPATH at load time, so strip the extras. + if: runner.os == 'macOS' + run: | + rpaths() { otool -l "$1" | awk '/ LC_RPATH$/{getline; getline; print $2}'; } + for so in numcodecs/*.so; do + for rp in $(rpaths "$so" | sort | uniq -d); do + while [ "$(rpaths "$so" | grep -cx "$rp")" -gt 1 ]; do + echo "Removing duplicate LC_RPATH '$rp' from $so" + install_name_tool -delete_rpath "$rp" "$so" + done + done + done + - name: Install zarr-python # Since zarr v3 requires numpy >= 1.25, on Python 3.11 leave it out # so we can have some tests of our minimum version of numpy (1.24) From dd3c1e346e8d66cf6d2dbe746127108e252040f4 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Sat, 20 Jun 2026 17:25:38 -0400 Subject: [PATCH 6/7] Fix docs, pre-commit, wheel, and coverage CI for the backport branch - .readthedocs.yaml: bump build image to ubuntu-24.04 (ubuntu-20.04 was removed by Read the Docs) and cap docs zarr to <3.1, matching the 0.15.x compatibility range. - .pre-commit-config.yaml: pin the mypy hook's zarr to >=3.0.0rc1,<3.1 so type checking runs against the zarr 3.0 API that numcodecs.zarr3 targets (the unpinned hook pulled zarr 3.2, whose ZDType broke mypy). - zarr3.py: correct two `# type: ignore[arg-type]` to `[call-overload]` for np.dtype(astype) under current numpy. - wheel.yaml: swap the stalled macos-13 intel runner for macos-15-large, matching ci.yaml. 
- codecov.yml / pyproject.toml: exclude test files from coverage (ignore: tests/** and [tool.coverage.run] omit), matching main. - test_zarr3.py: cover AsType.evolve_from_array_spec when decode_dtype is unset, bringing zarr3.py to 100%. Co-Authored-By: Claude Opus 4.8 --- .github/codecov.yml | 2 ++ .github/workflows/wheel.yaml | 4 ++-- .pre-commit-config.yaml | 2 +- .readthedocs.yaml | 4 ++-- numcodecs/tests/test_pyzstd.py | 1 + numcodecs/tests/test_zarr3.py | 2 ++ numcodecs/zarr3.py | 4 ++-- pyproject.toml | 3 +++ 8 files changed, 15 insertions(+), 7 deletions(-) diff --git a/.github/codecov.yml b/.github/codecov.yml index e9b99c82..c910cfa5 100644 --- a/.github/codecov.yml +++ b/.github/codecov.yml @@ -7,6 +7,8 @@ coverage: patch: default: target: 100 + ignore: + - "tests/**" comment: layout: "diff, files" behavior: default diff --git a/.github/workflows/wheel.yaml b/.github/workflows/wheel.yaml index 6746ffb8..8227cfa9 100644 --- a/.github/workflows/wheel.yaml +++ b/.github/workflows/wheel.yaml @@ -13,8 +13,8 @@ jobs: strategy: fail-fast: false matrix: - # macos-13 is an intel runner, macos-14 is a arm64 runner - os: [ubuntu-latest, windows-latest, macos-13, macos-14] + # macos-15-large is an intel runner, macos-14 is an arm64 runner + os: [ubuntu-latest, windows-latest, macos-15-large, macos-14] env: CIBW_TEST_COMMAND: python -c "import numcodecs" CIBW_BUILD: "cp311-* cp312-* cp313-*" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5728b2fb..88ba122b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,4 +30,4 @@ repos: hooks: - id: mypy args: [--config-file, pyproject.toml] - additional_dependencies: [numpy, pytest, crc32c, zfpy, 'zarr>=3.0.0rc1'] + additional_dependencies: [numpy, pytest, crc32c, zfpy, 'zarr>=3.0.0rc1,<3.1'] diff --git a/.readthedocs.yaml b/.readthedocs.yaml index fa14bf34..5d81f8d3 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -4,12 +4,12 @@ submodules: include: all build: - os: 
ubuntu-20.04 + os: ubuntu-24.04 tools: python: "3.12" jobs: post_install: - - python -m pip install --pre 'zarr>=3.0.0b2' + - python -m pip install --pre 'zarr>=3.0.0b2,<3.1' sphinx: configuration: docs/conf.py diff --git a/numcodecs/tests/test_pyzstd.py b/numcodecs/tests/test_pyzstd.py index e6df84a7..7ee6084b 100644 --- a/numcodecs/tests/test_pyzstd.py +++ b/numcodecs/tests/test_pyzstd.py @@ -3,6 +3,7 @@ import numpy as np import pytest import pyzstd + from numcodecs.zstd import Zstd test_data = [ diff --git a/numcodecs/tests/test_zarr3.py b/numcodecs/tests/test_zarr3.py index 0d8ecc74..eaf8634e 100644 --- a/numcodecs/tests/test_zarr3.py +++ b/numcodecs/tests/test_zarr3.py @@ -92,12 +92,14 @@ def test_generic_compressor( (numcodecs.zarr3.FixedScaleOffset, {"offset": 0, "scale": 25.5}), (numcodecs.zarr3.FixedScaleOffset, {"offset": 0, "scale": 51, "astype": "uint16"}), (numcodecs.zarr3.AsType, {"encode_dtype": "float32", "decode_dtype": "float32"}), + (numcodecs.zarr3.AsType, {"encode_dtype": "float32"}), ], ids=[ "delta", "fixedscaleoffset", "fixedscaleoffset2", "astype", + "astype_no_decode_dtype", ], ) def test_generic_filter( diff --git a/numcodecs/zarr3.py b/numcodecs/zarr3.py index f1743ffb..43684c3d 100644 --- a/numcodecs/zarr3.py +++ b/numcodecs/zarr3.py @@ -286,7 +286,7 @@ def __init__(self, **codec_config: dict[str, JSON]) -> None: def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: if astype := self.codec_config.get("astype"): - return replace(chunk_spec, dtype=np.dtype(astype)) # type: ignore[arg-type] + return replace(chunk_spec, dtype=np.dtype(astype)) # type: ignore[call-overload] return chunk_spec @@ -304,7 +304,7 @@ def __init__(self, **codec_config: JSON) -> None: def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: if astype := self.codec_config.get("astype"): - return replace(chunk_spec, dtype=np.dtype(astype)) # type: ignore[arg-type] + return replace(chunk_spec, dtype=np.dtype(astype)) # type: ignore[call-overload] 
return chunk_spec def evolve_from_array_spec(self, array_spec: ArraySpec) -> FixedScaleOffset: diff --git a/pyproject.toml b/pyproject.toml index f2c3e846..24346e37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -118,6 +118,9 @@ write_to = "numcodecs/version.py" skip = "./.git,fixture" ignore-words-list = "ba, compiletime, hist, nd, unparseable" +[tool.coverage.run] +omit = ["tests/*"] + [tool.coverage.report] exclude_lines = [ "pragma: no cover", From 02dae089b4d6dd60bec70c6e4823433ad3c77108 Mon Sep 17 00:00:00 2001 From: Mark Kittisopikul Date: Sat, 20 Jun 2026 17:57:54 -0400 Subject: [PATCH 7/7] Fix coverage-ignore and docs zarr pin for this branch's layout The previous commit copied main's config verbatim, but this branch differs from main in two ways that broke CI: - Tests live at numcodecs/tests/ here (main moved them to top-level tests/), so codecov `ignore: tests/**` and coverage `omit = ["tests/*"]` matched nothing. Point both at numcodecs/tests so the subprocess/skipped-test lines stop counting against the 100% project target. - Read the Docs strips the quotes around post_install commands, so the `<` in 'zarr>=3.0.0b2,<3.1' was parsed as a shell input redirection ("cannot open 3.1"). Use 'zarr~=3.0.0' instead, which pins to zarr 3.0.x with no shell metacharacters. 
Co-Authored-By: Claude Opus 4.8 --- .github/codecov.yml | 2 +- .readthedocs.yaml | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/codecov.yml b/.github/codecov.yml index c910cfa5..e9b2ef7c 100644 --- a/.github/codecov.yml +++ b/.github/codecov.yml @@ -8,7 +8,7 @@ coverage: default: target: 100 ignore: - - "tests/**" + - "numcodecs/tests/**" comment: layout: "diff, files" behavior: default diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 5d81f8d3..5e124a8f 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -9,7 +9,7 @@ build: python: "3.12" jobs: post_install: - - python -m pip install --pre 'zarr>=3.0.0b2,<3.1' + - python -m pip install --pre 'zarr~=3.0.0' sphinx: configuration: docs/conf.py diff --git a/pyproject.toml b/pyproject.toml index 24346e37..b11105ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -119,7 +119,7 @@ skip = "./.git,fixture" ignore-words-list = "ba, compiletime, hist, nd, unparseable" [tool.coverage.run] -omit = ["tests/*"] +omit = ["numcodecs/tests/*"] [tool.coverage.report] exclude_lines = [