Skip to content

check max size of utf8proc_decompose_char buffer #291

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
/test/case
/test/iscase
/test/custom
/test/maxdecomposition
/tmp/
/mingw_static/
/mingw_shared/
Expand Down
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,15 @@ if(UTF8PROC_ENABLE_TESTING)
target_link_libraries(printproperty utf8proc)
add_executable(valid test/tests.h test/tests.c utf8proc.h test/valid.c)
target_link_libraries(valid utf8proc)
add_executable(maxdecomposition test/tests.h test/tests.c utf8proc.h test/maxdecomposition.c)
target_link_libraries(maxdecomposition utf8proc)
add_test(utf8proc.testcase case)
add_test(utf8proc.testcustom custom)
add_test(utf8proc.testiterate iterate)
add_test(utf8proc.testmisc misc)
add_test(utf8proc.testprintproperty printproperty)
add_test(utf8proc.testvalid valid)
add_test(utf8proc.testmaxdecomposition maxdecomposition)

if (NOT WIN32)
# no wcwidth function on Windows
Expand Down
8 changes: 6 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ clean:
ifneq ($(OS),Darwin)
rm -f libutf8proc.so.$(MAJOR)
endif
rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc test/iscase
rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc test/iscase test/maxdecomposition
rm -rf MANIFEST.new tmp
$(MAKE) -C bench clean
$(MAKE) -C data clean
Expand Down Expand Up @@ -171,6 +171,9 @@ test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/misc.c test/tests.o utf8proc.o -o $@

# Build the maxdecomposition test program. Unlike test/misc, this test does not
# reference UNICODE_VERSION, so no -DUNICODE_VERSION define (and no perl call) is needed.
test/maxdecomposition: test/maxdecomposition.c test/tests.o utf8proc.o utf8proc.h test/tests.h
	$(CC) $(UCFLAGS) $(LDFLAGS) test/maxdecomposition.c test/tests.o utf8proc.o -o $@

# make release tarball from master branch
dist:
git archive master --prefix=utf8proc-$(VERSION)/ -o utf8proc-$(VERSION).tar.gz
Expand All @@ -186,7 +189,7 @@ distcheck: dist
make -C utf8proc-$(VERSION) check
rm -rf utf8proc-$(VERSION)

check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/maxdecomposition test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
$(MAKE) -C bench
test/normtest data/NormalizationTest.txt
test/graphemetest data/GraphemeBreakTest.txt
Expand All @@ -197,3 +200,4 @@ check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercas
test/case
test/iscase data/Lowercase.txt data/Uppercase.txt
test/custom
test/maxdecomposition
22 changes: 22 additions & 0 deletions test/maxdecomposition.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#include "tests.h"

/* Check the maximum decomposed size returned by utf8proc_decompose_char with UTF8PROC_DECOMPOSE,
in order to give a hint in the documentation. The documentation hint will need to be updated if this changes. */

int main(void)
{
utf8proc_int32_t dst[128];
utf8proc_ssize_t maxsize = 0, expected_maxsize = 4;
int success;

for (utf8proc_int32_t c = 0; c <= 0x110000; ++c) {
utf8proc_ssize_t sz = utf8proc_decompose_char(c, dst, 128, UTF8PROC_DECOMPOSE, NULL);
maxsize = sz > maxsize ? sz : maxsize;
}

success = expected_maxsize == maxsize;
fprintf(success ? stdout : stderr,
"%s: maximum decomposed size = %d chars\n",
success ? "SUCCEEDED" : "FAILED", (int) maxsize);
return !success;
}
7 changes: 6 additions & 1 deletion utf8proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,7 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
* @param dst the destination buffer.
* @param bufsize the size of the destination buffer.
* @param options one or more of the following flags:
* - @ref UTF8PROC_REJECTNA - return an error `codepoint` is unassigned
* - @ref UTF8PROC_REJECTNA - return an error if `codepoint` is unassigned
* - @ref UTF8PROC_IGNORE - strip "default ignorable" codepoints
* - @ref UTF8PROC_CASEFOLD - apply Unicode casefolding
* - @ref UTF8PROC_COMPAT - replace certain codepoints with their
Expand All @@ -532,6 +532,11 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
* option is used. If the string is being processed in order, this can be initialized to 0 for
* the beginning of the string, and is thereafter updated automatically. Otherwise, this parameter is ignored.
*
* In the current version of utf8proc, the maximum destination buffer size needed with the
* @ref UTF8PROC_DECOMPOSE option is 4 elements (or double that with @ref UTF8PROC_CHARBOUND),
* so this is a good default size. However, this may increase in future Unicode versions, so you
* should always check the return value as described below.
*
* @return
* In case of success, the number of codepoints written is returned; in case
* of an error, a negative error code is returned (utf8proc_errmsg()).
Expand Down
Loading