JuliaStrings · stevengj · Mar 19, 2025 · Mar 19, 2025
diff --git a/.gitignore b/.gitignore
@@ -28,6 +28,7 @@
 /test/case
 /test/iscase
 /test/custom
+/test/maxdecomposition
 /tmp/
 /mingw_static/
 /mingw_shared/

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -81,12 +81,15 @@ if(UTF8PROC_ENABLE_TESTING)
   target_link_libraries(printproperty utf8proc)
   add_executable(valid test/tests.h test/tests.c utf8proc.h test/valid.c)
   target_link_libraries(valid utf8proc)
+  add_executable(maxdecomposition test/tests.h test/tests.c utf8proc.h test/maxdecomposition.c)  # shared test helpers + the test's own source
+  target_link_libraries(maxdecomposition utf8proc)
   add_test(utf8proc.testcase case)
   add_test(utf8proc.testcustom custom)
   add_test(utf8proc.testiterate iterate)
   add_test(utf8proc.testmisc misc)
   add_test(utf8proc.testprintproperty printproperty)
   add_test(utf8proc.testvalid valid)
+  add_test(utf8proc.testmaxdecomposition maxdecomposition)  # exits non-zero when the measured maximum differs from the expected one
 
   if (NOT WIN32)
     # no wcwidth function on Windows

diff --git a/Makefile b/Makefile
@@ -59,7 +59,7 @@ clean:
 ifneq ($(OS),Darwin)
 	rm -f libutf8proc.so.$(MAJOR)
 endif
-	rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc test/iscase
+	rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc test/iscase test/maxdecomposition
 	rm -rf MANIFEST.new tmp
 	$(MAKE) -C bench clean
 	$(MAKE) -C data clean
@@ -171,6 +171,9 @@ test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 	$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/misc.c test/tests.o utf8proc.o -o $@
 
+test/maxdecomposition: test/maxdecomposition.c test/tests.o utf8proc.o utf8proc.h test/tests.h # test source does not read UNICODE_VERSION, so no -D flag
+	$(CC) $(UCFLAGS) $(LDFLAGS) test/maxdecomposition.c test/tests.o utf8proc.o -o $@
+
 # make release tarball from master branch
 dist:
 	git archive master --prefix=utf8proc-$(VERSION)/ -o utf8proc-$(VERSION).tar.gz
@@ -186,7 +189,7 @@ distcheck: dist
 	make -C utf8proc-$(VERSION) check
 	rm -rf utf8proc-$(VERSION)
 
-check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
+check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/maxdecomposition test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
 	$(MAKE) -C bench
 	test/normtest data/NormalizationTest.txt
 	test/graphemetest data/GraphemeBreakTest.txt
@@ -197,3 +200,4 @@ check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercas
 	test/case
 	test/iscase data/Lowercase.txt data/Uppercase.txt
 	test/custom
+	test/maxdecomposition
diff --git a/test/maxdecomposition.c b/test/maxdecomposition.c
@@ -0,0 +1,28 @@
+#include "tests.h"
+
+/* Check the maximum decomposed size returned by utf8proc_decompose_char with UTF8PROC_DECOMPOSE
+   for any single code point, in order to give a buffer-size hint in the documentation.
+   The documentation hint will need to be updated if this changes, which is why the test
+   fails whenever the measured maximum differs from the expected one. */
+
+int main(void)
+{
+    utf8proc_int32_t dst[128];
+    const utf8proc_ssize_t dst_len = (utf8proc_ssize_t) (sizeof(dst) / sizeof(dst[0]));
+    utf8proc_ssize_t maxsize = 0, expected_maxsize = 4;
+    int success;
+
+    /* U+10FFFF is the last Unicode code point, so the scan stops there. */
+    for (utf8proc_int32_t c = 0; c <= 0x10FFFF; ++c) {
+        utf8proc_ssize_t sz = utf8proc_decompose_char(c, dst, dst_len, UTF8PROC_DECOMPOSE, NULL);
+        /* Negative (error) results can never exceed maxsize, so they are skipped implicitly. */
+        maxsize = sz > maxsize ? sz : maxsize;
+    }
+
+    /* Report on stdout on success and on stderr on failure; the exit status is 0 only on success. */
+    success = expected_maxsize == maxsize;
+    fprintf(success ? stdout : stderr,
+            "%s: maximum decomposed size = %d chars\n",
+            success ? "SUCCEEDED" : "FAILED", (int) maxsize);
+    return !success;
+}
diff --git a/utf8proc.h b/utf8proc.h
@@ -517,7 +517,7 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
  * @param dst the destination buffer.
  * @param bufsize the size of the destination buffer.
  * @param options one or more of the following flags:
- * - @ref UTF8PROC_REJECTNA  - return an error `codepoint` is unassigned
+ * - @ref UTF8PROC_REJECTNA  - return an error if `codepoint` is unassigned
  * - @ref UTF8PROC_IGNORE    - strip "default ignorable" codepoints
  * - @ref UTF8PROC_CASEFOLD  - apply Unicode casefolding
  * - @ref UTF8PROC_COMPAT    - replace certain codepoints with their
@@ -532,6 +532,11 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
  * option is used.  If the string is being processed in order, this can be initialized to 0 for
  * the beginning of the string, and is thereafter updated automatically.  Otherwise, this parameter is ignored.
  *
+ * In the current version of utf8proc, the largest number of codepoints written to the destination buffer
+ * with the @ref UTF8PROC_DECOMPOSE option is 4 (or double that with @ref UTF8PROC_CHARBOUND), so a buffer
+ * of that many elements is a good default size.  However, this maximum may increase in future Unicode
+ * versions, so you should always check the return value as described below.
+ *
  * @return
  * In case of success, the number of codepoints written is returned; in case
  * of an error, a negative error code is returned (utf8proc_errmsg()).