Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 57 additions & 3 deletions cdm/zarr/src/main/java/ucar/nc2/iosp/zarr/ZArray.java
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
/*
* Copyright (c) 2021-2026 University Corporation for Atmospheric Research/Unidata
* See LICENSE for license information.
*/

package ucar.nc2.iosp.zarr;

import com.fasterxml.jackson.core.JsonParser;
Expand All @@ -16,6 +21,7 @@
import java.io.IOException;
import java.nio.ByteOrder;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
Expand All @@ -36,6 +42,10 @@ public enum Order {
// maps zarr datatypes to CDM datatypes
private static Map<String, DataType> dTypeMap;

// regex for matching numpy byteorder marks
// see https://numpy.org/doc/stable/reference/generated/numpy.dtype.byteorder.html#numpy-dtype-byteorder
private static final Pattern BYTE_ORDER_PATTERN = Pattern.compile("[><|=]");

static {
dTypeMap = new HashMap<>();
dTypeMap.put("b1", DataType.BOOLEAN);
Expand Down Expand Up @@ -71,6 +81,8 @@ public enum Order {
private final Order order;
private final List<Filter> filters;
private final String separator;
private final int elementSize; // size of a single element on disk, in bytes
private final boolean unicodeString; // true for numpy U dtype fixed-length strings

public ZArray(int[] shape, int[] chunks, Object fill_value, String dtype, Filter compressor, String order,
List<Filter> filters, String separator) throws ZarrFormatException {
Expand All @@ -80,6 +92,8 @@ public ZArray(int[] shape, int[] chunks, Object fill_value, String dtype, Filter
this.dtype = dtype;
this.datatype = parseDataType(this.dtype);
this.byteOrder = parseByteOrder(this.dtype);
this.elementSize = parseElementSize(this.dtype);
this.unicodeString = stripByteOrder(this.dtype).charAt(0) == 'U';
this.compressor = compressor;
this.filters = filters;
this.order = parseOrder(order);
Expand Down Expand Up @@ -126,17 +140,57 @@ public ByteOrder getByteOrder() {
return this.byteOrder;
}

/**
 * Returns the number of bytes that one element of this array occupies in its on-disk form.
 *
 * @return the element size in bytes, as derived from the array's dtype
 */
public int getElementSize() {
  return elementSize;
}

/**
 * Reports whether the dtype of this array is a numpy {@code U} type.
 *
 * @return {@code true} when the dtype's type character is {@code 'U'}, {@code false} otherwise
 */
boolean isUnicodeString() {
  return unicodeString;
}

/**
 * Removes every character matched by {@code BYTE_ORDER_PATTERN} from a dtype string.
 *
 * @param dtype the dtype string to clean
 * @return {@code dtype} with all matching characters deleted
 */
private static String stripByteOrder(String dtype) {
  final String withoutByteOrder = BYTE_ORDER_PATTERN.matcher(dtype).replaceAll("");
  return withoutByteOrder;
}

/**
 * Maps a Zarr dtype string to the corresponding CDM {@link DataType}.
 *
 * <p>
 * Byte order marks are removed first. Dtypes whose type character is {@code S} or {@code U} are handled
 * separately from the lookup table because their trailing integer is a character count rather than a
 * byte size. See https://github.com/Unidata/netcdf-java/issues/1534
 *
 * @param dtype the dtype string, with or without a byte order mark
 * @return {@code CHAR} for {@code S1}, {@code STRING} for any other {@code S}/{@code U} dtype, otherwise
 *         the entry of {@code dTypeMap} for the stripped dtype
 * @throws ZarrFormatException if the dtype is empty once byte order marks are removed, if the length of an
 *         {@code S}/{@code U} dtype cannot be parsed, or if the dtype is not present in {@code dTypeMap}
 */
private static DataType parseDataType(String dtype) throws ZarrFormatException {
  // stripByteOrder removes '>', '<' and '|' (and '='), so no separate replace calls are needed
  final String stripped = stripByteOrder(dtype);
  if (stripped.isEmpty()) {
    // nothing left to inspect: report a malformed dtype rather than failing inside charAt(0)
    throw new ZarrFormatException(ZarrKeys.DTYPE, dtype);
  }

  final char typeChar = stripped.charAt(0);
  if (typeChar == 'S' || typeChar == 'U') {
    final int nChars = parseLength(stripped);
    // a single byte char maps to CDM CHAR, otherwise it is a fixed-length String
    return (typeChar == 'S' && nChars == 1) ? DataType.CHAR : DataType.STRING;
  }

  final DataType dataType = dTypeMap.get(stripped);
  if (dataType == null) {
    throw new ZarrFormatException(ZarrKeys.DTYPE, stripped);
  }
  return dataType;
}

/**
 * Computes the size in bytes of one element for the given dtype.
 *
 * <p>
 * The integer following the type character is used as the size, except for {@code U} dtypes where it is
 * multiplied by 4.
 *
 * @param dtype the dtype string, with or without a byte order mark
 * @return the element size in bytes
 * @throws ZarrFormatException if the dtype is empty once byte order marks are removed, if its length
 *         cannot be parsed, or if the resulting size does not fit in an {@code int}
 */
private static int parseElementSize(String dtype) throws ZarrFormatException {
  final String stripped = stripByteOrder(dtype);
  if (stripped.isEmpty()) {
    // guard charAt(0) below: an empty dtype is a format error, not an index error
    throw new ZarrFormatException(ZarrKeys.DTYPE, dtype);
  }

  final int length = parseLength(stripped);
  if (stripped.charAt(0) != 'U') {
    return length;
  }
  try {
    // multiplyExact traps the int overflow that a plain 4 * length would hide
    return Math.multiplyExact(4, length);
  } catch (ArithmeticException ex) {
    throw new ZarrFormatException(ZarrKeys.DTYPE, dtype);
  }
}

/**
 * Parses the integer that follows the leading type character of a dtype string.
 *
 * @param dtype the dtype string with byte order marks already removed, e.g. {@code S4}
 * @return the non-negative integer following the first character
 * @throws ZarrFormatException if the dtype is empty, if the text after the first character is not an
 *         integer, or if that integer is negative
 */
private static int parseLength(String dtype) throws ZarrFormatException {
  if (dtype.isEmpty()) {
    // there is no type character to skip, so substring(1) would be out of range
    throw new ZarrFormatException(ZarrKeys.DTYPE, dtype);
  }

  final int length;
  try {
    length = Integer.parseInt(dtype.substring(1));
  } catch (NumberFormatException ex) {
    throw new ZarrFormatException(ZarrKeys.DTYPE, dtype);
  }

  // Integer.parseInt accepts a leading '-', but a negative length is never a valid size
  if (length < 0) {
    throw new ZarrFormatException(ZarrKeys.DTYPE, dtype);
  }
  return length;
}

private static ByteOrder parseByteOrder(String dtype) throws ZarrFormatException {
if (dtype.startsWith(">")) {
return ByteOrder.BIG_ENDIAN;
Expand Down
20 changes: 17 additions & 3 deletions cdm/zarr/src/main/java/ucar/nc2/iosp/zarr/ZarrHeader.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2025 University Corporation for Atmospheric Research/Unidata
* Copyright (c) 2021-2026 University Corporation for Atmospheric Research/Unidata
* See LICENSE for license information.
*/

Expand Down Expand Up @@ -303,7 +303,8 @@ private void makeVariable(RandomAccessDirectoryItem item, long dataOffset, ZArra

// create VInfo
VInfo vinfo = new VInfo(chunks, zarray.getFillValue(), zarray.getCompressor(), zarray.getByteOrder(),
zarray.getOrder(), zarray.getSeparator(), zarray.getFilters(), dataOffset, initializedChunks);
zarray.getOrder(), zarray.getSeparator(), zarray.getFilters(), dataOffset, initializedChunks,
zarray.getElementSize(), zarray.isUnicodeString());
var.setSPobject(vinfo);

// Include some info from .zarray file in attributes for display when showing variable detail.
Expand Down Expand Up @@ -421,9 +422,12 @@ class VInfo {
private final List<Filter> filters;
private final long offset;
private final Map<Integer, Long> initializedChunks;
private final int elementSize;
private final boolean unicodeString;

VInfo(int[] chunks, Object fillValue, Filter compressor, ByteOrder byteOrder, ZArray.Order order, String separator,
List<Filter> filters, long offset, Map<Integer, Long> initializedChunks) {
List<Filter> filters, long offset, Map<Integer, Long> initializedChunks, int elementSize,
boolean unicodeString) {
this.chunks = chunks;
this.fillValue = fillValue;
this.byteOrder = byteOrder;
Expand All @@ -433,6 +437,8 @@ class VInfo {
this.filters = filters;
this.offset = offset;
this.initializedChunks = initializedChunks;
this.elementSize = elementSize;
this.unicodeString = unicodeString;
}

public int[] getChunks() {
Expand Down Expand Up @@ -471,6 +477,14 @@ public Map<Integer, Long> getInitializedChunks() {
return this.initializedChunks;
}

/**
 * Returns the element size that was supplied when this object was constructed.
 *
 * @return the stored element size
 */
int getElementSize() {
  return elementSize;
}

/**
 * Returns the unicode-string flag that was supplied when this object was constructed.
 *
 * @return the stored flag
 */
boolean isUnicodeString() {
  return unicodeString;
}

}

}
66 changes: 63 additions & 3 deletions cdm/zarr/src/main/java/ucar/nc2/iosp/zarr/ZarrIosp.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 University Corporation for Atmospheric Research/Unidata
* Copyright (c) 2021-2026 University Corporation for Atmospheric Research/Unidata
* See LICENSE for license information.
*/

Expand All @@ -20,6 +20,10 @@

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

/**
* IOSP for reading/writing Zarr/NCZarr formats
Expand Down Expand Up @@ -83,8 +87,14 @@ public Array readData(Variable v2, Section section) {
Object fillValue = getFillValue(vinfo, dataType);

// create layout object
Layout layout = new ZarrLayoutBB(v2, section, this.raf);
Object data = IospHelper.readDataFill((LayoutBB) layout, dataType, fillValue);
LayoutBB layout = new ZarrLayoutBB(v2, section, this.raf);
final Object data;
if (dataType == DataType.STRING) {
// fixed-length string types (S/U) need custom decoding (not handled by the generic IospHelper string reader).
data = readStringData(layout, vinfo, fillValue);
} else {
data = IospHelper.readDataFill(layout, dataType, fillValue);
}

Array array = Array.factory(dataType, section.getShape(), data);
if (vinfo.getOrder() == ZArray.Order.F) {
Expand All @@ -99,6 +109,56 @@ public Array readData(Variable v2, Section section) {
return array;
}

/**
 * Decodes fixed-length string elements ('S' or 'U' dtypes) chunk by chunk from the given layout.
 *
 * <p>
 * The result array is pre-filled with {@code fillValue} when it is a {@code String}. Chunks whose byte
 * buffer has nothing remaining are skipped, so their elements keep that initial value. Unicode ('U')
 * elements are decoded as UTF-32 in the byte order reported by {@code vinfo}; all other elements are
 * decoded as UTF-8.
 *
 * <p>
 * See https://github.com/Unidata/netcdf-java/issues/1534
 *
 * @param layout the layout supplying the chunks, the element size and the total element count
 * @param vinfo supplies the unicode flag and byte order used to choose the charset
 * @param fillValue initial value for every element; ignored unless it is a {@code String}
 * @return the decoded strings, indexed by destination element
 */
private static String[] readStringData(LayoutBB layout, ZarrHeader.VInfo vinfo, Object fillValue) {
  final int total = (int) layout.getTotalNelems();
  final int bytesPerElem = layout.getElemSize();
  final String[] result = new String[total];
  if (fillValue instanceof String) {
    java.util.Arrays.fill(result, (String) fillValue);
  }

  // pick the decoder once, it is the same for every chunk
  final Charset charset;
  if (!vinfo.isUnicodeString()) {
    charset = StandardCharsets.UTF_8;
  } else if (vinfo.getByteOrder() == ByteOrder.BIG_ENDIAN) {
    charset = Charset.forName("UTF-32BE");
  } else {
    charset = Charset.forName("UTF-32LE");
  }

  while (layout.hasNext()) {
    final LayoutBB.Chunk chunk = layout.next();
    final ByteBuffer buffer = chunk.getByteBuffer();
    if (buffer.hasRemaining()) {
      // jump to the first wanted element, then read consecutive fixed-width records
      buffer.position(chunk.getSrcElem() * bytesPerElem);
      int dest = (int) chunk.getDestElem();
      final byte[] record = new byte[bytesPerElem];
      for (int i = 0; i < chunk.getNelems(); i++) {
        buffer.get(record);
        result[dest] = decodeFixedLengthString(record, charset);
        dest++;
      }
    }
    // otherwise the chunk is empty and the pre-filled values are left in place
  }
  return result;
}

/**
 * Decodes one fixed-width record into a String and drops its trailing NUL padding.
 *
 * @param raw the bytes of a single element
 * @param charset the charset used to decode {@code raw}
 * @return the decoded text without trailing {@code '\0'} characters; NULs elsewhere are kept
 */
private static String decodeFixedLengthString(byte[] raw, Charset charset) {
  final String decoded = new String(raw, charset);
  // NumPy pads fixed-length strings with NULs, so walk back to the last non-NUL character
  int keep = decoded.length();
  for (; keep > 0; keep--) {
    if (decoded.charAt(keep - 1) != '\0') {
      break;
    }
  }
  return decoded.substring(0, keep);
}

private Object getFillValue(ZarrHeader.VInfo vinfo, DataType dataType) {

// Watch for floating point fill values encoded as Strings
Expand Down
7 changes: 5 additions & 2 deletions cdm/zarr/src/main/java/ucar/nc2/iosp/zarr/ZarrLayoutBB.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2025 University Corporation for Atmospheric Research/Unidata
* Copyright (c) 2021-2026 University Corporation for Atmospheric Research/Unidata
* See LICENSE for license information.
*/

Expand Down Expand Up @@ -80,7 +80,10 @@ public ZarrLayoutBB(Variable v2, Section wantSection, RandomAccessFile raf) {
this.want = wantSection;
}

this.elemSize = v2.getDataType().getSize();
// Use the on-disk element byte width from the .zarray metadata. For most types this matches
// DataType.getSize(), but for fixed-length string types (S/U) it captures the true element
// width (N bytes for S, 4*N bytes for U).
this.elemSize = vinfo.getElementSize();

// create delegate and chunk iterator
ZarrLayoutBB.DataChunkIterator iter = new ZarrLayoutBB.DataChunkIterator();
Expand Down
55 changes: 30 additions & 25 deletions cdm/zarr/src/test/data/scripts/make_zarr_dtype_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,15 @@


import zarr
store = zarr.DirectoryStore('../test_dtypes.zarr')
# Zarr-Python 3 API. The store is written using the Zarr format 2
# specification by passing zarr_format=2 to the top level API.
store = zarr.storage.LocalStore('../test_dtypes.zarr')


# In[ ]:


root_grp = zarr.group(store, overwrite=True)
root_grp = zarr.group(store, overwrite=True, zarr_format=2)
# create a group for byte-order independent data types
unordered_group = root_grp.create_group('unordered_group', overwrite=True)

Expand All @@ -82,66 +84,69 @@


# add data to unordered group
b = unordered_group.create_dataset('boolean_data', shape=(4,5), chunks=(2,5), dtype='|b1', overwrite=True, compressor=None)
b = unordered_group.create_array('boolean_data', shape=(4,5), chunks=(2,5), dtype='|b1', overwrite=True, compressors=None)
b[:] = boolean_data
byte = unordered_group.create_dataset('byte_data', shape=(10,8), chunks=(5,4), dtype='|i1', overwrite=True, compressor=None)
byte = unordered_group.create_array('byte_data', shape=(10,8), chunks=(5,4), dtype='|i1', overwrite=True, compressors=None)
byte[:] = bdata
ubyte = unordered_group.create_dataset('ubyte_data', shape=(10,8), chunks=(5,4), dtype='|u1', overwrite=True, compressor=None)
ubyte = unordered_group.create_array('ubyte_data', shape=(10,8), chunks=(5,4), dtype='|u1', overwrite=True, compressors=None)
ubyte[:] = bdata


# In[ ]:


# add data to big endian group
shorts = big_endian.create_dataset('short_data', shape=(4,5), chunks=(2,5), dtype='>i2', overwrite=True, compressor=None)
shorts = big_endian.create_array('short_data', shape=(4,5), chunks=(2,5), dtype='>i2', overwrite=True, compressors=None)
shorts[:] = be_short_data
ushorts = big_endian.create_dataset('ushort_data', shape=(4,5), chunks=(2,5), dtype='>u2', overwrite=True, compressor=None)
ushorts = big_endian.create_array('ushort_data', shape=(4,5), chunks=(2,5), dtype='>u2', overwrite=True, compressors=None)
ushorts[:] = be_short_data
ints = big_endian.create_dataset('int_data', shape=(4,5), chunks=(2,5), dtype='>i4', overwrite=True, compressor=None)
ints = big_endian.create_array('int_data', shape=(4,5), chunks=(2,5), dtype='>i4', overwrite=True, compressors=None)
ints[:] = be_int_data
uints = big_endian.create_dataset('uint_data', shape=(4,5), chunks=(2,5), dtype='>u4', overwrite=True, compressor=None)
uints = big_endian.create_array('uint_data', shape=(4,5), chunks=(2,5), dtype='>u4', overwrite=True, compressors=None)
uints[:] = be_int_data
longs = big_endian.create_dataset('long_data', shape=(5,4), chunks=(5,2), dtype='>i8', overwrite=True, compressor=None)
longs = big_endian.create_array('long_data', shape=(5,4), chunks=(5,2), dtype='>i8', overwrite=True, compressors=None)
longs[:] = be_long_data
ulongs = big_endian.create_dataset('ulong_data', shape=(5,4), chunks=(5,2), dtype='>u8', overwrite=True, compressor=None)
ulongs = big_endian.create_array('ulong_data', shape=(5,4), chunks=(5,2), dtype='>u8', overwrite=True, compressors=None)
ulongs[:] = be_long_data
floats = big_endian.create_dataset('float_data', shape=(4,5), chunks=(2,5), dtype='>f4', overwrite=True, compressor=None)
floats = big_endian.create_array('float_data', shape=(4,5), chunks=(2,5), dtype='>f4', overwrite=True, compressors=None)
floats[:] = be_float_data
doubles = big_endian.create_dataset('double_data', shape=(5,4), chunks=(5,2), dtype='>f8', overwrite=True, compressor=None)
doubles = big_endian.create_array('double_data', shape=(5,4), chunks=(5,2), dtype='>f8', overwrite=True, compressors=None)
doubles[:] = be_double_data


# In[ ]:


# add data to little endian group
shorts = little_endian.create_dataset('short_data', shape=(4,5), chunks=(2,5), dtype='<i2', overwrite=True, compressor=None)
shorts = little_endian.create_array('short_data', shape=(4,5), chunks=(2,5), dtype='<i2', overwrite=True, compressors=None)
shorts[:] = le_short_data
ushorts = little_endian.create_dataset('ushort_data', shape=(4,5), chunks=(2,5), dtype='<u2', overwrite=True, compressor=None)
ushorts = little_endian.create_array('ushort_data', shape=(4,5), chunks=(2,5), dtype='<u2', overwrite=True, compressors=None)
ushorts[:] = le_short_data
ints = little_endian.create_dataset('int_data', shape=(4,5), chunks=(2,5), dtype='<i4', overwrite=True, compressor=None)
ints = little_endian.create_array('int_data', shape=(4,5), chunks=(2,5), dtype='<i4', overwrite=True, compressors=None)
ints[:] = le_int_data
uints = little_endian.create_dataset('uint_data', shape=(4,5), chunks=(2,5), dtype='<u4', overwrite=True, compressor=None)
uints = little_endian.create_array('uint_data', shape=(4,5), chunks=(2,5), dtype='<u4', overwrite=True, compressors=None)
uints[:] = le_int_data
longs = little_endian.create_dataset('long_data', shape=(5,4), chunks=(5,2), dtype='<i8', overwrite=True, compressor=None)
longs = little_endian.create_array('long_data', shape=(5,4), chunks=(5,2), dtype='<i8', overwrite=True, compressors=None)
longs[:] = le_long_data
ulongs = little_endian.create_dataset('ulong_data', shape=(5,4), chunks=(5,2), dtype='<u8', overwrite=True, compressor=None)
ulongs = little_endian.create_array('ulong_data', shape=(5,4), chunks=(5,2), dtype='<u8', overwrite=True, compressors=None)
ulongs[:] = le_long_data
floats = little_endian.create_dataset('float_data', shape=(4,5), chunks=(2,5), dtype='<f4', overwrite=True, compressor=None)
floats = little_endian.create_array('float_data', shape=(4,5), chunks=(2,5), dtype='<f4', overwrite=True, compressors=None)
floats[:] = le_float_data
doubles = little_endian.create_dataset('double_data', shape=(5,4), chunks=(5,2), dtype='<f8', overwrite=True, compressor=None)
doubles = little_endian.create_array('double_data', shape=(5,4), chunks=(5,2), dtype='<f8', overwrite=True, compressors=None)
doubles[:] = le_double_data


# In[ ]:


# add string data
chars = string_group.create_dataset('char_data', shape=(10,12), chunks=(5,3), dtype='S1', overwrite=True, compressor=None)
chars = string_group.create_array('char_data', shape=(10,12), chunks=(5,3), dtype='S1', overwrite=True, compressors=None)
chars[:] = charar
strs = string_group.create_dataset('str_data', shape=(10,12), chunks=(5,6), dtype='S4', overwrite=True, compressor=None)
strs = string_group.create_array('str_data', shape=(10,12), chunks=(5,6), dtype='S4', overwrite=True, compressors=None)
strs[:] = charar
unicode = string_group.create_dataset('unicode_data', shape=(10,12), chunks=(5,6), dtype='U4', overwrite=True, compressor=None)
strs2 = string_group.create_array('str_data_2', shape=(10,12), chunks=(5,6), dtype='S2', overwrite=True, compressors=None)
strs2[:] = charar
unicode = string_group.create_array('unicode_data', shape=(10,12), chunks=(5,6), dtype='U4', overwrite=True, compressors=None)
unicode[:] = charar

unicode2 = string_group.create_array('unicode_data_2', shape=(10,12), chunks=(5,6), dtype='U2', overwrite=True, compressors=None)
unicode2[:] = charar
1 change: 1 addition & 0 deletions cdm/zarr/src/test/data/test_dtypes.zarr/.zattrs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
2 changes: 1 addition & 1 deletion cdm/zarr/src/test/data/test_dtypes.zarr/.zgroup
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{
"zarr_format": 2
"zarr_format": 2
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
Loading