Skip to content

Commit d8deb45

Browse files
encukou, sethmlarson, ch4n3-yoon, StanFromIreland, picnixz
committed
[3.12] gh-149079: Fix O(n^2) canonical ordering in unicodedata.normalize() (GH-149080)
Replace the insertion sort used for canonical ordering of combining characters with a hybrid approach: insertion sort for short runs (< 20) and counting sort for longer runs, reducing worst-case complexity from O(n^2) to O(n). This prevents denial of service via crafted Unicode strings with many combining characters in alternating CCC order. (cherry picked from commit 991224b) Co-authored-by: Seth Larson <seth@python.org> Co-authored-by: ch4n3-yoon <ch4n3.yoon@gmail.com> Co-authored-by: Seokchan Yoon <13852925+ch4n3-yoon@users.noreply.github.com> Co-authored-by: Stan Ulbrych <stan@python.org> Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> Co-authored-by: Maurycy Pawłowski-Wieroński <maurycy@maurycy.com>
1 parent 7c999be commit d8deb45

3 files changed

Lines changed: 151 additions & 26 deletions

File tree

Lib/test/test_unicodedata.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,34 @@ def test_issue10254(self):
203203
b = 'C\u0338' * 20 + '\xC7'
204204
self.assertEqual(self.db.normalize('NFC', a), b)
205205

206+
def test_long_combining_mark_run(self):
207+
# gh-149079: avoid quadratic canonical ordering.
208+
payload = "a" + ("\u0300\u0327" * 32)
209+
nfd = "a" + ("\u0327" * 32) + ("\u0300" * 32)
210+
nfc = "\u00e0" + ("\u0327" * 32) + ("\u0300" * 31)
211+
212+
self.assertEqual(self.db.normalize("NFD", payload), nfd)
213+
self.assertEqual(self.db.normalize("NFKD", payload), nfd)
214+
self.assertEqual(self.db.normalize("NFC", payload), nfc)
215+
self.assertEqual(self.db.normalize("NFKC", payload), nfc)
216+
217+
def test_combining_mark_run_fast_paths(self):
218+
# gh-149079: cover short runs and already-sorted long runs.
219+
short_payload = "a" + ("\u0300\u0327" * 9) + "\u0300"
220+
short_nfd = "a" + ("\u0327" * 9) + ("\u0300" * 10)
221+
short_nfc = "\u00e0" + ("\u0327" * 9) + ("\u0300" * 9)
222+
long_sorted = "a" + ("\u0327" * 30) + ("\u0300" * 30)
223+
long_sorted_nfc = "\u00e0" + ("\u0327" * 30) + ("\u0300" * 29)
224+
225+
self.assertEqual(self.db.normalize("NFD", short_payload), short_nfd)
226+
self.assertEqual(self.db.normalize("NFKD", short_payload), short_nfd)
227+
self.assertEqual(self.db.normalize("NFC", short_payload), short_nfc)
228+
self.assertEqual(self.db.normalize("NFKC", short_payload), short_nfc)
229+
self.assertEqual(self.db.normalize("NFD", long_sorted), long_sorted)
230+
self.assertEqual(self.db.normalize("NFKD", long_sorted), long_sorted)
231+
self.assertEqual(self.db.normalize("NFC", long_sorted), long_sorted_nfc)
232+
self.assertEqual(self.db.normalize("NFKC", long_sorted), long_sorted_nfc)
233+
206234
def test_issue29456(self):
207235
# Fix #29456
208236
u1176_str_a = '\u1100\u1176\u11a8'
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fix a potential denial of service in :func:`unicodedata.normalize`. The
2+
canonical ordering step of Unicode normalization used a quadratic-time insertion
3+
sort for reordering combining characters, which could be exploited with
4+
crafted input containing many combining characters in non-canonical order.
5+
The insertion sort is now replaced with a linear-time counting sort for long runs.

Modules/unicodedata.c

Lines changed: 118 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -490,19 +490,80 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
490490
#define NCount (VCount*TCount)
491491
#define SCount (LCount*NCount)
492492

493+
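/* Runs of combining characters shorter than this threshold are sorted with
   insertion sort, which is usually cheaper for small runs; runs of this
   length or longer are sorted with counting sort. */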
494+
#define CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD 20
495+
496+
static void
497+
canonical_ordering_sort_insertion(int kind, void *data,
498+
Py_ssize_t start, Py_ssize_t end)
499+
{
500+
for (Py_ssize_t i = start + 1; i < end; i++) {
501+
Py_UCS4 code = PyUnicode_READ(kind, data, i);
502+
unsigned char combining = _getrecord_ex(code)->combining;
503+
Py_ssize_t j = i;
504+
505+
while (j > start) {
506+
Py_UCS4 previous = PyUnicode_READ(kind, data, j - 1);
507+
if (_getrecord_ex(previous)->combining <= combining) {
508+
break;
509+
}
510+
PyUnicode_WRITE(kind, data, j, previous);
511+
j--;
512+
}
513+
if (j != i) {
514+
PyUnicode_WRITE(kind, data, j, code);
515+
}
516+
}
517+
}
518+
519+
static void
520+
canonical_ordering_sort_counting(int kind, void *data,
521+
Py_ssize_t start, Py_ssize_t end,
522+
Py_UCS4 *sortbuf)
523+
{
524+
Py_ssize_t counts[256] = {0};
525+
Py_ssize_t run_length = end - start;
526+
Py_ssize_t total = 0;
527+
528+
for (Py_ssize_t i = start; i < end; i++) {
529+
Py_UCS4 code = PyUnicode_READ(kind, data, i);
530+
unsigned char combining = _getrecord_ex(code)->combining;
531+
counts[combining]++;
532+
}
533+
534+
for (size_t i = 0; i < Py_ARRAY_LENGTH(counts); i++) {
535+
Py_ssize_t count = counts[i];
536+
counts[i] = total;
537+
total += count;
538+
}
539+
540+
/* Reuse counts[] as the next output slot for each CCC. */
541+
for (Py_ssize_t i = start; i < end; i++) {
542+
Py_UCS4 code = PyUnicode_READ(kind, data, i);
543+
unsigned char combining = _getrecord_ex(code)->combining;
544+
sortbuf[counts[combining]++] = code;
545+
}
546+
for (Py_ssize_t i = 0; i < run_length; i++) {
547+
PyUnicode_WRITE(kind, data, start + i, sortbuf[i]);
548+
}
549+
}
550+
493551
static PyObject*
494552
nfd_nfkd(PyObject *self, PyObject *input, int k)
495553
{
496554
PyObject *result;
497555
Py_UCS4 *output;
498556
Py_ssize_t i, o, osize;
499-
int kind;
500-
const void *data;
557+
int input_kind, result_kind;
558+
const void *input_data;
559+
void *result_data;
501560
/* Longest decomposition in Unicode 3.2: U+FDFA */
502561
Py_UCS4 stack[20];
503562
Py_ssize_t space, isize;
504563
int index, prefix, count, stackptr;
505564
unsigned char prev, cur;
565+
Py_UCS4 *sortbuf = NULL;
566+
Py_ssize_t sortbuflen = 0;
506567

507568
stackptr = 0;
508569
isize = PyUnicode_GET_LENGTH(input);
@@ -522,11 +583,11 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
522583
return NULL;
523584
}
524585
i = o = 0;
525-
kind = PyUnicode_KIND(input);
526-
data = PyUnicode_DATA(input);
586+
input_kind = PyUnicode_KIND(input);
587+
input_data = PyUnicode_DATA(input);
527588

528589
while (i < isize) {
529-
stack[stackptr++] = PyUnicode_READ(kind, data, i++);
590+
stack[stackptr++] = PyUnicode_READ(input_kind, input_data, i++);
530591
while(stackptr) {
531592
Py_UCS4 code = stack[--stackptr];
532593
/* Hangul Decomposition adds three characters in
@@ -591,35 +652,66 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
591652
PyMem_Free(output);
592653
if (!result)
593654
return NULL;
655+
594656
/* result is guaranteed to be ready, as it is compact. */
595-
kind = PyUnicode_KIND(result);
596-
data = PyUnicode_DATA(result);
657+
result_kind = PyUnicode_KIND(result);
658+
result_data = PyUnicode_DATA(result);
597659

598-
/* Sort canonically. */
660+
/* Sort each consecutive combining-character run canonically. */
599661
i = 0;
600-
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
601-
for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
602-
cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
603-
if (prev == 0 || cur == 0 || prev <= cur) {
604-
prev = cur;
662+
while (i < o) {
663+
Py_ssize_t run_length, run_start;
664+
int needs_sort = 0;
665+
666+
Py_UCS4 ch = PyUnicode_READ(result_kind, result_data, i);
667+
prev = _getrecord_ex(ch)->combining;
668+
if (prev == 0) {
669+
i++;
605670
continue;
606671
}
607-
/* Non-canonical order. Need to switch *i with previous. */
608-
o = i - 1;
609-
while (1) {
610-
Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
611-
PyUnicode_WRITE(kind, data, o+1,
612-
PyUnicode_READ(kind, data, o));
613-
PyUnicode_WRITE(kind, data, o, tmp);
614-
o--;
615-
if (o < 0)
616-
break;
617-
prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
618-
if (prev == 0 || prev <= cur)
672+
673+
run_start = i++;
674+
while (i < o) {
675+
Py_UCS4 ch = PyUnicode_READ(result_kind, result_data, i);
676+
cur = _getrecord_ex(ch)->combining;
677+
if (cur == 0) {
619678
break;
679+
}
680+
if (prev > cur) {
681+
needs_sort = 1;
682+
}
683+
prev = cur;
684+
i++;
685+
}
686+
if (!needs_sort) {
687+
continue;
688+
}
689+
690+
run_length = i - run_start;
691+
if (run_length < CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD) {
692+
canonical_ordering_sort_insertion(result_kind, result_data,
693+
run_start, i);
694+
continue;
620695
}
621-
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
696+
697+
if (run_length > sortbuflen) {
698+
Py_UCS4 *new_sortbuf = PyMem_Resize(sortbuf,
699+
Py_UCS4,
700+
run_length);
701+
if (new_sortbuf == NULL) {
702+
PyErr_NoMemory();
703+
PyMem_Free(sortbuf);
704+
Py_DECREF(result);
705+
return NULL;
706+
}
707+
sortbuf = new_sortbuf;
708+
sortbuflen = run_length;
709+
}
710+
711+
canonical_ordering_sort_counting(result_kind, result_data,
712+
run_start, i, sortbuf);
622713
}
714+
PyMem_Free(sortbuf);
623715
return result;
624716
}
625717

0 commit comments

Comments
 (0)