From 7d5d272dda439a8ee3597ff9a90e2dd6a04506ab Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson"
Date: Wed, 19 Mar 2025 12:47:07 -0400
Subject: [PATCH 1/2] check max size of utf8proc_decompose_char buffer

---
Review notes (free-form text after the "---" separator; not part of the
commit message or of the diff):
  * test/maxdecomposition.c calls utf8proc_decompose_char() for every c in
    0 .. 0x110000 with UTF8PROC_DECOMPOSE and a 128-element buffer, keeps the
    largest return value, and exits non-zero unless that maximum equals 4.
  * 0x110000 is one past the last Unicode code point (0x10FFFF). Presumably
    the call returns a negative error code there, which the running maximum
    ignores -- TODO confirm against utf8proc_decompose_char().
  * The bound of 4 is measured with UTF8PROC_DECOMPOSE alone; this test does
    not measure any other option combination (e.g. UTF8PROC_COMPAT).
  * The new Makefile rule passes -DUNICODE_VERSION like the test/misc rule;
    the test source below does not reference it (tests.h might -- verify).
  * utf8proc.h: documents the 4-element bound and fixes the missing "if" in
    the UTF8PROC_REJECTNA description.

 .gitignore | 1 +
 Makefile | 8 ++++++--
 test/maxdecomposition.c | 22 ++++++++++++++++++++++
 utf8proc.h | 7 ++++++-
 4 files changed, 35 insertions(+), 3 deletions(-)
 create mode 100644 test/maxdecomposition.c

diff --git a/.gitignore b/.gitignore
index 3a866ba..716378f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,6 +28,7 @@
 /test/case
 /test/iscase
 /test/custom
+/test/maxdecomposition
 /tmp/
 /mingw_static/
 /mingw_shared/
diff --git a/Makefile b/Makefile
index 94f76e3..3f6bfc9 100644
--- a/Makefile
+++ b/Makefile
@@ -59,7 +59,7 @@ clean:
 ifneq ($(OS),Darwin)
 	rm -f libutf8proc.so.$(MAJOR)
 endif
-	rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc test/iscase
+	rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc test/iscase test/maxdecomposition
 	rm -rf MANIFEST.new tmp
 	$(MAKE) -C bench clean
 	$(MAKE) -C data clean
@@ -171,6 +171,9 @@ test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 	$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/misc.c test/tests.o utf8proc.o -o $@
 
+test/maxdecomposition: test/maxdecomposition.c test/tests.o utf8proc.o utf8proc.h test/tests.h
+	$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/maxdecomposition.c test/tests.o utf8proc.o -o $@
+
 # make release tarball from master branch
 dist:
 	git archive master --prefix=utf8proc-$(VERSION)/ -o utf8proc-$(VERSION).tar.gz
@@ -186,7 +189,7 @@ distcheck: dist
 	make -C utf8proc-$(VERSION) check
 	rm -rf utf8proc-$(VERSION)
 
-check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
+check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/maxdecomposition test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
 	$(MAKE) -C bench
 	test/normtest data/NormalizationTest.txt
 	test/graphemetest data/GraphemeBreakTest.txt
@@ -197,3 +200,4 @@ check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercas
 	test/case
 	test/iscase data/Lowercase.txt data/Uppercase.txt
 	test/custom
+	test/maxdecomposition
diff --git a/test/maxdecomposition.c b/test/maxdecomposition.c
new file mode 100644
index 0000000..967d902
--- /dev/null
+++ b/test/maxdecomposition.c
@@ -0,0 +1,22 @@
+#include "tests.h"
+
+/* Check the maximum decomposed size returned by utf8proc_decompose_char with UTF8PROC_DECOMPOSE,
+   in order to give a hint in the documentation. The documentation hint will need to be updated if this changes. */
+
+int main(void)
+{
+    utf8proc_int32_t dst[128];
+    utf8proc_ssize_t maxsize = 0, expected_maxsize = 4;
+    int success;
+
+    for (utf8proc_int32_t c = 0; c <= 0x110000; ++c) {
+        utf8proc_ssize_t sz = utf8proc_decompose_char(c, dst, 128, UTF8PROC_DECOMPOSE, NULL);
+        maxsize = sz > maxsize ? sz : maxsize;
+    }
+
+    success = expected_maxsize == maxsize;
+    fprintf(success ? stdout : stderr,
+            "%s: maximum decomposed size = %d chars\n",
+            success ? "SUCCEEDED" : "FAILED", (int) maxsize);
+    return !success;
+}
diff --git a/utf8proc.h b/utf8proc.h
index 039da76..55fcfb5 100644
--- a/utf8proc.h
+++ b/utf8proc.h
@@ -517,7 +517,7 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
  * @param dst the destination buffer.
  * @param bufsize the size of the destination buffer.
  * @param options one or more of the following flags:
- * - @ref UTF8PROC_REJECTNA - return an error `codepoint` is unassigned
+ * - @ref UTF8PROC_REJECTNA - return an error if `codepoint` is unassigned
  * - @ref UTF8PROC_IGNORE - strip "default ignorable" codepoints
  * - @ref UTF8PROC_CASEFOLD - apply Unicode casefolding
  * - @ref UTF8PROC_COMPAT - replace certain codepoints with their
@@ -532,6 +532,11 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
  * option is used. If the string is being processed in order, this can be initialized to 0 for
  * the beginning of the string, and is thereafter updated automatically. Otherwise, this parameter is ignored.
  *
+ * In the current version of utf8proc, the maximum destination buffer with the @ref UTF8PROC_DECOMPOSE
+ * option is 4 elements (or double that with @ref UTF8PROC_CHARBOUND), so this is a good default size.
+ * However, this may increase in future Unicode versions, so you should always check the return value
+ * as described below.
+ *
  * @return
  * In case of success, the number of codepoints written is returned; in case
  * of an error, a negative error code is returned (utf8proc_errmsg()).
From 41f5707e4fb0904d8be4ed170a007046538b7096 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson"
Date: Wed, 19 Mar 2025 12:53:15 -0400
Subject: [PATCH 2/2] cmake rule for maxdecomposition test

---
Review notes (free-form text after the "---" separator; not part of the
commit message or of the diff):
  * CMake counterpart of the Makefile wiring in patch 1/2: builds the
    maxdecomposition executable from test/maxdecomposition.c and test/tests.c,
    links it against utf8proc, and registers it as
    utf8proc.testmaxdecomposition inside the UTF8PROC_ENABLE_TESTING block.

 CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e6f99d1..e4b247c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -81,12 +81,15 @@ if(UTF8PROC_ENABLE_TESTING)
   target_link_libraries(printproperty utf8proc)
   add_executable(valid test/tests.h test/tests.c utf8proc.h test/valid.c)
   target_link_libraries(valid utf8proc)
+  add_executable(maxdecomposition test/tests.h test/tests.c utf8proc.h test/maxdecomposition.c)
+  target_link_libraries(maxdecomposition utf8proc)
   add_test(utf8proc.testcase case)
   add_test(utf8proc.testcustom custom)
   add_test(utf8proc.testiterate iterate)
   add_test(utf8proc.testmisc misc)
   add_test(utf8proc.testprintproperty printproperty)
   add_test(utf8proc.testvalid valid)
+  add_test(utf8proc.testmaxdecomposition maxdecomposition)
 
   if (NOT WIN32)
     # no wcwidth function on Windows