Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ endif()
if(UTF8PROC_ENABLE_TESTING)
enable_testing()
file(MAKE_DIRECTORY data)
set(UNICODE_VERSION 17.0.0)
set(UNICODE_VERSION 18.0.0)
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/NormalizationTest.txt ${CMAKE_BINARY_DIR}/data/NormalizationTest.txt SHOW_PROGRESS)
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/auxiliary/GraphemeBreakTest.txt ${CMAKE_BINARY_DIR}/data/GraphemeBreakTest.txt SHOW_PROGRESS)
add_executable(case test/tests.h test/tests.c utf8proc.h test/case.c)
Expand Down
2 changes: 1 addition & 1 deletion data/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ utf8proc_data.c.new: data_generator.jl $(RAWDATA)
$(JULIA) --project=. data_generator.jl > $@

# Unicode data version (must also update utf8proc_unicode_version function)
UNICODE_VERSION=17.0.0
UNICODE_VERSION=18.0.0

UnicodeData.txt:
$(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
Expand Down
8 changes: 4 additions & 4 deletions data/Manifest.toml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion test/graphemetest.c
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ int main(int argc, char **argv)
}
fclose(f);
printf("Passed tests after %zd lines!\n", lineno);
lineno = 0; /* no line numbers for subsequent tests */

printf("Performing regression tests...\n");

Expand All @@ -125,7 +126,7 @@ int main(int argc, char **argv)
checkline("/ 0915 0300 094d 0300 094d 0924 / 0915 /", true);
checkline("/ 0915 0300 0300 / 0924 / 0915 /", true);
checkline("/ 0915 0300 094d 0300 / 0078 /", true);
checkline("/ 0300 094d 0300 / 0924 / 0915 /", true);
checkline("/ 0300 094d 0300 0924 / 0915 /", true);

check(utf8proc_grapheme_break(0x03b1, 0x03b2), "failed 03b1 / 03b2 test");
check(!utf8proc_grapheme_break(0x03b1, 0x0302), "failed 03b1 0302 test");
Expand Down
17 changes: 5 additions & 12 deletions utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
}

UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
return "17.0.0";
return "18.0.0";
}

UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
Expand Down Expand Up @@ -294,7 +294,7 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int tic
int state_bc, state_icb; /* boundclass and indic_conjunct_break state */
if (*state == 0) { /* state initialization */
state_bc = lbc;
state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE;
state_icb = licb;
}
else { /* lbc and licb are already encoded in *state */
state_bc = *state & 0xff; // 1st byte of state is bound class
Expand All @@ -305,16 +305,9 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int tic
!(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
&& ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT); // GB9c

// Special support for GB9c. Don't break between two consonants
// separated by 1+ linker characters and 0+ extend characters in any order.
// After a consonant, we enter LINKER state after at least one linker.
if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
|| state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
|| state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND)
state_icb = ticb;
else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER)
state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ?
UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb;
// Special support for GB9c. Don't break between a linker (followed by 0+ extend chars) and a consonant.
// We enter LINKER state after a linker and stay in it for extend chars.
state_icb = (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER && ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND) ? state_icb : ticb;

// Special support for GB 12/13 made possible by GB999. After two RI
// class codepoints we want to force a break. Do this by resetting the
Expand Down
Loading
Loading