gh-150878: Speed up json.dumps(ensure_ascii=False) for long strings

gaborbernat · gaborbernat · commit 0d5d951251ec · 2026-06-03T11:49:35.000-07:00
escape_size() sizes the ensure_ascii=False encoder output one character at a
time; a character needs escaping only when c == '"' || c == '\\' || c &lt; 0x20,
and non-ASCII is kept verbatim. For the one-byte representation, detect the
no-escape case eight bytes at a time and return the verbatim size directly; a
length guard keeps short strings on the original per-character loop. Strings
with characters above U+00FF keep the current path.

Output is byte-identical, verified against test_json and a 199-case dumps
differential in both ensure_ascii modes. dumps of long 1-byte strings runs up
to 5.8x faster (4.2x for Latin-1 text); short keys and non-Latin-1 strings are
unaffected.
diff --git a/Misc/NEWS.d/next/Library/2026-06-03-11-49-35.gh-issue-150878.ZCL1T0.rst b/Misc/NEWS.d/next/Library/2026-06-03-11-49-35.gh-issue-150878.ZCL1T0.rst
@@ -0,0 +1,4 @@
+Speed up :func:`json.dumps` with ``ensure_ascii=False`` for strings made up of
+long runs of characters that need no escaping, by scanning eight bytes at a
+time. Short strings, strings that need escaping, and strings with characters
+above U+00FF are unaffected. Patch by Bernát Gábor.
diff --git a/Modules/_json.c b/Modules/_json.c
@@ -281,6 +281,36 @@ escape_size(const void *input, int kind, Py_ssize_t input_chars)
     Py_ssize_t i;
     Py_ssize_t output_size;
 
+    /* SWAR no-escape fast path (1-byte): needs-escape is c == '"' || c == '\\'
+       || c < 0x20; non-ASCII (Latin-1 >= 0x80) is kept verbatim here.  A length
+       guard keeps short strings on the original per-character loop. */
+    if (kind == PyUnicode_1BYTE_KIND && input_chars >= 16
+            && input_chars < PY_SSIZE_T_MAX - 2) {
+        const Py_UCS1 *p = (const Py_UCS1 *)input;
+        const uint64_t ones = 0x0101010101010101ULL;
+        const uint64_t high = 0x8080808080808080ULL;
+        const uint64_t bq = 0x22ULL * ones, bs = 0x5cULL * ones, bc = 0xE0ULL * ones;
+        Py_ssize_t j = 0;
+        int needs_escape = 0;
+        for (; j + 8 <= input_chars; j += 8) {
+            uint64_t w;
+            memcpy(&w, p + j, 8);
+            uint64_t mq = w ^ bq; mq = (mq - ones) & ~mq & high;
+            uint64_t ms = w ^ bs; ms = (ms - ones) & ~ms & high;
+            uint64_t vc = w & bc; uint64_t mlo = (vc - ones) & ~vc & high;
+            if (mq | ms | mlo) { needs_escape = 1; break; }
+        }
+        if (!needs_escape) {
+            for (; j < input_chars; j++) {
+                Py_UCS1 c = p[j];
+                if (c == '"' || c == '\\' || c < 0x20) { needs_escape = 1; break; }
+            }
+        }
+        if (!needs_escape) {
+            return input_chars + 2;
+        }
+    }
+
     /* Compute the output size */
     for (i = 0, output_size = 2; i < input_chars; i++) {
         Py_UCS4 c = PyUnicode_READ(kind, input, i);