JuliaStrings · stevengj · Jun 23, 2026 · Jun 23, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -79,7 +79,7 @@ endif()
 if(UTF8PROC_ENABLE_TESTING)
   enable_testing()
   file(MAKE_DIRECTORY data)
-  set(UNICODE_VERSION 17.0.0)
+  set(UNICODE_VERSION 18.0.0)
   file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/NormalizationTest.txt ${CMAKE_BINARY_DIR}/data/NormalizationTest.txt SHOW_PROGRESS)
   file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/auxiliary/GraphemeBreakTest.txt ${CMAKE_BINARY_DIR}/data/GraphemeBreakTest.txt SHOW_PROGRESS)
   add_executable(case test/tests.h test/tests.c utf8proc.h test/case.c)

diff --git a/data/Makefile b/data/Makefile
@@ -21,7 +21,7 @@ utf8proc_data.c.new: data_generator.jl $(RAWDATA)
 	$(JULIA) --project=. data_generator.jl > $@
 
 # Unicode data version (must also update utf8proc_unicode_version function)
-UNICODE_VERSION=17.0.0
+UNICODE_VERSION=18.0.0
 
 UnicodeData.txt:
 	$(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt

diff --git a/data/Manifest.toml b/data/Manifest.toml
diff --git a/test/graphemetest.c b/test/graphemetest.c
@@ -100,6 +100,7 @@ int main(int argc, char **argv)
     }
     fclose(f);
     printf("Passed tests after %zd lines!\n", lineno);
+    lineno = 0; /* no line numbers for subsequent tests */
 
     printf("Performing regression tests...\n");
 
@@ -125,7 +126,7 @@ int main(int argc, char **argv)
     checkline("/ 0915 0300 094d 0300 094d 0924 / 0915 /", true);
     checkline("/ 0915 0300 0300 / 0924 / 0915 /", true);
     checkline("/ 0915 0300 094d 0300 / 0078 /", true);
-    checkline("/ 0300 094d 0300 / 0924 / 0915 /", true);
+    checkline("/ 0300 094d 0300 0924 / 0915 /", true);
 
     check(utf8proc_grapheme_break(0x03b1, 0x03b2), "failed 03b1 / 03b2 test");
     check(!utf8proc_grapheme_break(0x03b1, 0x0302), "failed 03b1 0302 test");

diff --git a/utf8proc.c b/utf8proc.c
@@ -101,7 +101,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
 }
 
 UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
-  return "17.0.0";
+  return "18.0.0";
 }
 
 UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
@@ -294,7 +294,7 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int tic
     int state_bc, state_icb; /* boundclass and indic_conjunct_break state */
     if (*state == 0) { /* state initialization */
       state_bc = lbc;
-      state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE;
+      state_icb = licb;
     }
     else { /* lbc and licb are already encoded in *state */
       state_bc = *state & 0xff;  // 1st byte of state is bound class
@@ -305,16 +305,9 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int tic
        !(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
         && ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT); // GB9c
 
-    // Special support for GB9c.  Don't break between two consonants
-    // separated 1+ linker characters and 0+ extend characters in any order.
-    // After a consonant, we enter LINKER state after at least one linker.
-    if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
-        || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
-        || state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND)
-      state_icb = ticb;
-    else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER)
-      state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ?
-                  UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb;
+    // Special support for GB9c.  Don't break before a consonant that follows a linker plus 0+ extend chars.
+    // We enter LINKER state at a linker and remain in it across any extend chars that follow.
+    state_icb = (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER && ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND) ? state_icb : ticb;
 
     // Special support for GB 12/13 made possible by GB999. After two RI
     // class codepoints we want to force a break. Do this by resetting the