Unidata · lesserwhirls · Jun 17, 2026 · Jun 17, 2026
@@ -1,3 +1,8 @@
+/*
+ * Copyright (c) 2021-2026 University Corporation for Atmospheric Research/Unidata
+ * See LICENSE for license information.
+ */
+
 package ucar.nc2.iosp.zarr;
 
 import com.fasterxml.jackson.core.JsonParser;
@@ -16,6 +21,7 @@
 import java.io.IOException;
 import java.nio.ByteOrder;
 import java.util.*;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import java.util.stream.StreamSupport;
@@ -36,6 +42,10 @@ public enum Order {
   // maps zarr datatypes to CDM datatypes
   private static Map<String, DataType> dTypeMap;
 
+  // matches any single numpy byteorder mark ('>', '<', '|' or '='); used to remove every such mark from a dtype
+  // see https://numpy.org/doc/stable/reference/generated/numpy.dtype.byteorder.html#numpy-dtype-byteorder
+  private static final Pattern BYTE_ORDER_PATTERN = Pattern.compile("[><|=]");
+
   static {
     dTypeMap = new HashMap<>();
     dTypeMap.put("b1", DataType.BOOLEAN);
@@ -71,6 +81,8 @@ public enum Order {
   private final Order order;
   private final List<Filter> filters;
   private final String separator;
+  private final int elementSize; // size of a single element on disk, in bytes
+  private final boolean unicodeString; // true for numpy U dtype fixed-length strings
 
   public ZArray(int[] shape, int[] chunks, Object fill_value, String dtype, Filter compressor, String order,
       List<Filter> filters, String separator) throws ZarrFormatException {
@@ -80,6 +92,8 @@ public ZArray(int[] shape, int[] chunks, Object fill_value, String dtype, Filter
     this.dtype = dtype;
     this.datatype = parseDataType(this.dtype);
     this.byteOrder = parseByteOrder(this.dtype);
+    this.elementSize = parseElementSize(this.dtype);
+    this.unicodeString = stripByteOrder(this.dtype).charAt(0) == 'U';
     this.compressor = compressor;
     this.filters = filters;
     this.order = parseOrder(order);
@@ -126,17 +140,57 @@ public ByteOrder getByteOrder() {
     return this.byteOrder;
   }
 
+  /**
+   * Returns the number of bytes a single element of this array occupies on disk.
+   *
+   * @return the element size in bytes, as parsed from the dtype
+   */
+  public int getElementSize() {
+    return elementSize;
+  }
+
+  /**
+   * Reports whether this array stores numpy U dtype (fixed-length unicode string) elements.
+   *
+   * @return true if the dtype's type character is 'U'
+   */
+  boolean isUnicodeString() {
+    return unicodeString;
+  }
+
+  private static String stripByteOrder(String dtype) {
+    return BYTE_ORDER_PATTERN.matcher(dtype).replaceAll("");
+  }
+
   private static DataType parseDataType(String dtype) throws ZarrFormatException {
-    dtype = dtype.replace(">", "");
-    dtype = dtype.replace("<", "");
-    dtype = dtype.replace("|", "");
+    // Maps a zarr dtype string to a CDM DataType; throws ZarrFormatException if it is not recognized.
+    dtype = stripByteOrder(dtype);
+    // Nothing left once the byte-order marks are removed (e.g. "" or "<"): report a bad dtype
+    // instead of letting charAt(0) fail with an unchecked StringIndexOutOfBoundsException.
+    if (dtype.isEmpty()) {
+      throw new ZarrFormatException(ZarrKeys.DTYPE, dtype);
+    }
+    final char typeChar = dtype.charAt(0);
+    // S (fixed-length byte strings) and U (fixed-length unicode strings) do not follow the
+    // usual [type char][type size in bytes] pattern: the trailing integer is a fixed character
+    // count, not a byte size. See https://github.com/Unidata/netcdf-java/issues/1534
+    if (typeChar == 'S' || typeChar == 'U') {
+      final int nChars = parseLength(dtype);
+      // a single byte char maps to CDM CHAR, otherwise it is a fixed-length String
+      return (typeChar == 'S' && nChars == 1) ? DataType.CHAR : DataType.STRING;
+    }
     DataType dataType = dTypeMap.get(dtype);
     if (dataType == null) {
       throw new ZarrFormatException(ZarrKeys.DTYPE, dtype);
     }
     return dataType;
   }
 
+  /**
+   * Computes the on-disk size, in bytes, of one element described by a dtype string.
+   *
+   * @param dtype the raw dtype string, byte-order marks allowed
+   * @return the parsed length, multiplied by 4 when the type character is 'U'
+   * @throws ZarrFormatException if the length cannot be parsed or the U byte size overflows an int
+   */
+  private static int parseElementSize(String dtype) throws ZarrFormatException {
+    dtype = stripByteOrder(dtype);
+    // parse first: parseLength turns an empty dtype into a ZarrFormatException, so charAt(0) is safe below
+    final int length = parseLength(dtype);
+    if (dtype.charAt(0) != 'U') {
+      return length;
+    }
+    // U elements take 4 bytes per character; reject lengths whose byte size would overflow an int
+    if (length > Integer.MAX_VALUE / 4) {
+      throw new ZarrFormatException(ZarrKeys.DTYPE, dtype);
+    }
+    return 4 * length;
+  }
+
+  private static int parseLength(String dtype) throws ZarrFormatException {
+    try {
+      return Integer.parseInt(dtype.substring(1));
+    } catch (NumberFormatException | IndexOutOfBoundsException ex) {
+      throw new ZarrFormatException(ZarrKeys.DTYPE, dtype);
+    }
+  }
+
   private static ByteOrder parseByteOrder(String dtype) throws ZarrFormatException {
     if (dtype.startsWith(">")) {
       return ByteOrder.BIG_ENDIAN;

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2025 University Corporation for Atmospheric Research/Unidata
+ * Copyright (c) 2021-2026 University Corporation for Atmospheric Research/Unidata
  * See LICENSE for license information.
  */
 
@@ -303,7 +303,8 @@ private void makeVariable(RandomAccessDirectoryItem item, long dataOffset, ZArra
 
     // create VInfo
     VInfo vinfo = new VInfo(chunks, zarray.getFillValue(), zarray.getCompressor(), zarray.getByteOrder(),
-        zarray.getOrder(), zarray.getSeparator(), zarray.getFilters(), dataOffset, initializedChunks);
+        zarray.getOrder(), zarray.getSeparator(), zarray.getFilters(), dataOffset, initializedChunks,
+        zarray.getElementSize(), zarray.isUnicodeString());
     var.setSPobject(vinfo);
 
     // Include some info from .zarray file in attributes for display when showing variable detail.
@@ -421,9 +422,12 @@ class VInfo {
     private final List<Filter> filters;
     private final long offset;
     private final Map<Integer, Long> initializedChunks;
+    private final int elementSize;
+    private final boolean unicodeString;
 
     VInfo(int[] chunks, Object fillValue, Filter compressor, ByteOrder byteOrder, ZArray.Order order, String separator,
-        List<Filter> filters, long offset, Map<Integer, Long> initializedChunks) {
+        List<Filter> filters, long offset, Map<Integer, Long> initializedChunks, int elementSize,
+        boolean unicodeString) {
       this.chunks = chunks;
       this.fillValue = fillValue;
       this.byteOrder = byteOrder;
@@ -433,6 +437,8 @@ class VInfo {
       this.filters = filters;
       this.offset = offset;
       this.initializedChunks = initializedChunks;
+      this.elementSize = elementSize;
+      this.unicodeString = unicodeString;
     }
 
     public int[] getChunks() {
@@ -471,6 +477,14 @@ public Map<Integer, Long> getInitializedChunks() {
       return this.initializedChunks;
     }
 
+    /** Returns the element size handed to the constructor. */
+    int getElementSize() {
+      return elementSize;
+    }
+
+    /** Returns the unicode-string flag handed to the constructor. */
+    boolean isUnicodeString() {
+      return unicodeString;
+    }
+
   }
 
 }
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 University Corporation for Atmospheric Research/Unidata
+ * Copyright (c) 2021-2026 University Corporation for Atmospheric Research/Unidata
  * See LICENSE for license information.
  */
 
@@ -20,6 +20,10 @@
 
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 
 /**
  * IOSP for reading/writing Zarr/NCZarr formats
@@ -83,8 +87,14 @@ public Array readData(Variable v2, Section section) {
     Object fillValue = getFillValue(vinfo, dataType);
 
     // create layout object
-    Layout layout = new ZarrLayoutBB(v2, section, this.raf);
-    Object data = IospHelper.readDataFill((LayoutBB) layout, dataType, fillValue);
+    LayoutBB layout = new ZarrLayoutBB(v2, section, this.raf);
+    final Object data;
+    if (dataType == DataType.STRING) {
+      // fixed-length string types (S/U) need custom decoding (not handled by the generic IospHelper string reader).
+      data = readStringData(layout, vinfo, fillValue);
+    } else {
+      data = IospHelper.readDataFill(layout, dataType, fillValue);
+    }
 
     Array array = Array.factory(dataType, section.getShape(), data);
     if (vinfo.getOrder() == ZArray.Order.F) {
@@ -99,6 +109,56 @@ public Array readData(Variable v2, Section section) {
     return array;
   }
 
+  /**
+   * Read fixed-length string data ('S' or 'U' dtypes) from the layout.
+   *
+   * <p>
+   * Each element occupies {@code layout.getElemSize()} bytes. Unicode elements are decoded as UTF-32 in
+   * the byte order reported by {@code vinfo}, all others as UTF-8, and trailing NUL padding is removed.
+   * See https://github.com/Unidata/netcdf-java/issues/1534
+   *
+   * @param layout chunk iterator over the requested section
+   * @param vinfo variable info supplying the unicode flag and the byte order
+   * @param fillValue value for elements of empty chunks; only applied when it is a String
+   * @return the decoded strings, one per requested element
+   */
+  private static String[] readStringData(LayoutBB layout, ZarrHeader.VInfo vinfo, Object fillValue) {
+    final int nelems = (int) layout.getTotalNelems();
+    final int recSize = layout.getElemSize();
+    final String[] pa = new String[nelems];
+    // NOTE(review): with a non-String fill value, elements of empty chunks stay null - confirm callers cope
+    if (fillValue instanceof String) {
+      java.util.Arrays.fill(pa, (String) fillValue);
+    }
+
+    final Charset charset;
+    if (vinfo.isUnicodeString()) {
+      charset =
+          vinfo.getByteOrder() == ByteOrder.BIG_ENDIAN ? Charset.forName("UTF-32BE") : Charset.forName("UTF-32LE");
+    } else {
+      charset = StandardCharsets.UTF_8;
+    }
+
+    // every element has the same width, so one scratch buffer is reused for all chunks
+    final byte[] raw = new byte[recSize];
+    while (layout.hasNext()) {
+      LayoutBB.Chunk chunk = layout.next();
+      ByteBuffer bb = chunk.getByteBuffer();
+      // if chunk is empty, use fill value
+      if (!bb.hasRemaining()) {
+        continue;
+      }
+      // the source offset is counted in elements, the buffer position in bytes
+      bb.position(chunk.getSrcElem() * recSize);
+      int pos = (int) chunk.getDestElem();
+      for (int i = 0; i < chunk.getNelems(); i++) {
+        bb.get(raw);
+        pa[pos++] = decodeFixedLengthString(raw, charset);
+      }
+    }
+    return pa;
+  }
+
+  private static String decodeFixedLengthString(byte[] raw, Charset charset) {
+    String s = new String(raw, charset);
+    // NumPy fixed-length strings are null-padded, so strip trailing NUL characters
+    int end = s.length();
+    while (end > 0 && s.charAt(end - 1) == '\0') {
+      end--;
+    }
+    return s.substring(0, end);
+  }
+
   private Object getFillValue(ZarrHeader.VInfo vinfo, DataType dataType) {
 
     // Watch for floating point fill values encoded as Strings

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2025 University Corporation for Atmospheric Research/Unidata
+ * Copyright (c) 2021-2026 University Corporation for Atmospheric Research/Unidata
  * See LICENSE for license information.
  */
 
@@ -80,7 +80,10 @@ public ZarrLayoutBB(Variable v2, Section wantSection, RandomAccessFile raf) {
       this.want = wantSection;
     }
 
-    this.elemSize = v2.getDataType().getSize();
+    // Use the on-disk element byte width from the .zarray metadata. For most types this matches
+    // DataType.getSize(), but for fixed-length string types (S/U) it captures the true element
+    // width (N bytes for S, 4*N bytes for U).
+    this.elemSize = vinfo.getElementSize();
 
     // create delegate and chunk iterator
     ZarrLayoutBB.DataChunkIterator iter = new ZarrLayoutBB.DataChunkIterator();

@@ -57,13 +57,15 @@
 
 
 import zarr
-store = zarr.DirectoryStore('../test_dtypes.zarr')
+# Zarr-Python 3 API. The store is written in Zarr format 2 because
+# zarr_format=2 is passed to zarr.group() when the root group is created.
+store = zarr.storage.LocalStore('../test_dtypes.zarr')
 
 
 # In[ ]:
 
 
-root_grp = zarr.group(store, overwrite=True)
+root_grp = zarr.group(store, overwrite=True, zarr_format=2)
 # create a group for byte-order independent data types
 unordered_group = root_grp.create_group('unordered_group', overwrite=True)
 
@@ -82,66 +84,69 @@
 
 
 # add data to unordered group
-b = unordered_group.create_dataset('boolean_data', shape=(4,5), chunks=(2,5), dtype='|b1', overwrite=True, compressor=None)
+b = unordered_group.create_array('boolean_data', shape=(4,5), chunks=(2,5), dtype='|b1', overwrite=True, compressors=None)
 b[:] = boolean_data
-byte = unordered_group.create_dataset('byte_data', shape=(10,8), chunks=(5,4), dtype='|i1', overwrite=True, compressor=None)
+byte = unordered_group.create_array('byte_data', shape=(10,8), chunks=(5,4), dtype='|i1', overwrite=True, compressors=None)
 byte[:] = bdata
-ubyte = unordered_group.create_dataset('ubyte_data', shape=(10,8), chunks=(5,4), dtype='|u1', overwrite=True, compressor=None)
+ubyte = unordered_group.create_array('ubyte_data', shape=(10,8), chunks=(5,4), dtype='|u1', overwrite=True, compressors=None)
 ubyte[:] = bdata
 
 
 # In[ ]:
 
 
 # add data to big endian group
-shorts = big_endian.create_dataset('short_data', shape=(4,5), chunks=(2,5), dtype='>i2', overwrite=True, compressor=None)
+shorts = big_endian.create_array('short_data', shape=(4,5), chunks=(2,5), dtype='>i2', overwrite=True, compressors=None)
 shorts[:] = be_short_data
-ushorts = big_endian.create_dataset('ushort_data', shape=(4,5), chunks=(2,5), dtype='>u2', overwrite=True, compressor=None)
+ushorts = big_endian.create_array('ushort_data', shape=(4,5), chunks=(2,5), dtype='>u2', overwrite=True, compressors=None)
 ushorts[:] = be_short_data
-ints = big_endian.create_dataset('int_data', shape=(4,5), chunks=(2,5), dtype='>i4', overwrite=True, compressor=None)
+ints = big_endian.create_array('int_data', shape=(4,5), chunks=(2,5), dtype='>i4', overwrite=True, compressors=None)
 ints[:] = be_int_data
-uints = big_endian.create_dataset('uint_data', shape=(4,5), chunks=(2,5), dtype='>u4', overwrite=True, compressor=None)
+uints = big_endian.create_array('uint_data', shape=(4,5), chunks=(2,5), dtype='>u4', overwrite=True, compressors=None)
 uints[:] = be_int_data
-longs = big_endian.create_dataset('long_data', shape=(5,4), chunks=(5,2), dtype='>i8', overwrite=True, compressor=None)
+longs = big_endian.create_array('long_data', shape=(5,4), chunks=(5,2), dtype='>i8', overwrite=True, compressors=None)
 longs[:] = be_long_data
-ulongs = big_endian.create_dataset('ulong_data', shape=(5,4), chunks=(5,2), dtype='>u8', overwrite=True, compressor=None)
+ulongs = big_endian.create_array('ulong_data', shape=(5,4), chunks=(5,2), dtype='>u8', overwrite=True, compressors=None)
 ulongs[:] = be_long_data
-floats = big_endian.create_dataset('float_data', shape=(4,5), chunks=(2,5), dtype='>f4', overwrite=True, compressor=None)
+floats = big_endian.create_array('float_data', shape=(4,5), chunks=(2,5), dtype='>f4', overwrite=True, compressors=None)
 floats[:] = be_float_data
-doubles = big_endian.create_dataset('double_data', shape=(5,4), chunks=(5,2), dtype='>f8', overwrite=True, compressor=None)
+doubles = big_endian.create_array('double_data', shape=(5,4), chunks=(5,2), dtype='>f8', overwrite=True, compressors=None)
 doubles[:] = be_double_data
 
 
 # In[ ]:
 
 
 # add data to little endian group
-shorts = little_endian.create_dataset('short_data', shape=(4,5), chunks=(2,5), dtype='<i2', overwrite=True, compressor=None)
+shorts = little_endian.create_array('short_data', shape=(4,5), chunks=(2,5), dtype='<i2', overwrite=True, compressors=None)
 shorts[:] = le_short_data
-ushorts = little_endian.create_dataset('ushort_data', shape=(4,5), chunks=(2,5), dtype='<u2', overwrite=True, compressor=None)
+ushorts = little_endian.create_array('ushort_data', shape=(4,5), chunks=(2,5), dtype='<u2', overwrite=True, compressors=None)
 ushorts[:] = le_short_data
-ints = little_endian.create_dataset('int_data', shape=(4,5), chunks=(2,5), dtype='<i4', overwrite=True, compressor=None)
+ints = little_endian.create_array('int_data', shape=(4,5), chunks=(2,5), dtype='<i4', overwrite=True, compressors=None)
 ints[:] = le_int_data
-uints = little_endian.create_dataset('uint_data', shape=(4,5), chunks=(2,5), dtype='<u4', overwrite=True, compressor=None)
+uints = little_endian.create_array('uint_data', shape=(4,5), chunks=(2,5), dtype='<u4', overwrite=True, compressors=None)
 uints[:] = le_int_data
-longs = little_endian.create_dataset('long_data', shape=(5,4), chunks=(5,2), dtype='<i8', overwrite=True, compressor=None)
+longs = little_endian.create_array('long_data', shape=(5,4), chunks=(5,2), dtype='<i8', overwrite=True, compressors=None)
 longs[:] = le_long_data
-ulongs = little_endian.create_dataset('ulong_data', shape=(5,4), chunks=(5,2), dtype='<u8', overwrite=True, compressor=None)
+ulongs = little_endian.create_array('ulong_data', shape=(5,4), chunks=(5,2), dtype='<u8', overwrite=True, compressors=None)
 ulongs[:] = le_long_data
-floats = little_endian.create_dataset('float_data', shape=(4,5), chunks=(2,5), dtype='<f4', overwrite=True, compressor=None)
+floats = little_endian.create_array('float_data', shape=(4,5), chunks=(2,5), dtype='<f4', overwrite=True, compressors=None)
 floats[:] = le_float_data
-doubles = little_endian.create_dataset('double_data', shape=(5,4), chunks=(5,2), dtype='<f8', overwrite=True, compressor=None)
+doubles = little_endian.create_array('double_data', shape=(5,4), chunks=(5,2), dtype='<f8', overwrite=True, compressors=None)
 doubles[:] = le_double_data
 
 
 # In[ ]:
 
 
 # add string data
-chars = string_group.create_dataset('char_data', shape=(10,12), chunks=(5,3), dtype='S1', overwrite=True, compressor=None)
+chars = string_group.create_array('char_data', shape=(10,12), chunks=(5,3), dtype='S1', overwrite=True, compressors=None)
 chars[:] = charar
-strs = string_group.create_dataset('str_data', shape=(10,12), chunks=(5,6), dtype='S4', overwrite=True, compressor=None)
+strs = string_group.create_array('str_data', shape=(10,12), chunks=(5,6), dtype='S4', overwrite=True, compressors=None)
 strs[:] = charar
-unicode = string_group.create_dataset('unicode_data', shape=(10,12), chunks=(5,6), dtype='U4', overwrite=True, compressor=None)
+strs2 = string_group.create_array('str_data_2', shape=(10,12), chunks=(5,6), dtype='S2', overwrite=True, compressors=None)
+strs2[:] = charar
+unicode = string_group.create_array('unicode_data', shape=(10,12), chunks=(5,6), dtype='U4', overwrite=True, compressors=None)
 unicode[:] = charar
-
+unicode2 = string_group.create_array('unicode_data_2', shape=(10,12), chunks=(5,6), dtype='U2', overwrite=True, compressors=None)
+unicode2[:] = charar
@@ -0,0 +1 @@
+{}
@@ -1,3 +1,3 @@
 {
-    "zarr_format": 2
+  "zarr_format": 2
 }
@@ -0,0 +1 @@
+{}