From 35937b922e5811a90321aeaff7c8176a5f39ffd8 Mon Sep 17 00:00:00 2001 From: He-Pin Date: Sun, 10 May 2026 19:40:19 +0800 Subject: [PATCH] perf: add ASCII-safe substr fast path Motivation: std.substr on long ASCII strings repeatedly pays for codepoint-offset scans even when parser-time analysis can prove the literal is printable ASCII and JSON-render safe. Modification: Mark ASCII JSON-safe literals longer than 1024 characters with the existing _asciiSafe flag using a single platform CharSWAR scan, propagate the flag through string concatenation when both operands carry it, and let std.length/std.substr use direct UTF-16 length/substring only for proven-safe values. Add UnicodeHandlingTests coverage for long ASCII length/substr boundaries and concat propagation. Result: Focused JVM JMH improves go_suite/substr from 0.056 ms/op to 0.046-0.047 ms/op with split_resolve unchanged and realistic2 in the same noise range. Scala Native hyperfine is neutral against master on the same case. References: Extracted from ideas in databricks/sjsonnet#776, especially commit a190a800 (ASCII fast paths and asciiSafe propagation), narrowed to avoid the broader join/parseInt changes. 
--- sjsonnet/src/sjsonnet/Parser.scala | 5 +++- sjsonnet/src/sjsonnet/Val.scala | 8 ++++-- .../src/sjsonnet/stdlib/StringModule.scala | 27 ++++++++++++++----- .../src/sjsonnet/UnicodeHandlingTests.scala | 10 +++++++ 4 files changed, 41 insertions(+), 9 deletions(-) diff --git a/sjsonnet/src/sjsonnet/Parser.scala b/sjsonnet/src/sjsonnet/Parser.scala index e4ad6bbd0..209c6d6e2 100644 --- a/sjsonnet/src/sjsonnet/Parser.scala +++ b/sjsonnet/src/sjsonnet/Parser.scala @@ -748,7 +748,10 @@ class Parser( // cost more than the potential memory savings for strings that are unlikely // to repeat (e.g., 600KB text block literals) val unique = if (s.length > 1024) s else internedStrings.getOrElseUpdate(s, s) - Val.Str(pos, unique) + val result = Val.Str(pos, unique) + if (unique.length > 1024 && CharSWAR.isAsciiJsonSafe(unique)) + result._asciiSafe = true + result } // Any `expr` that isn't naively left-recursive diff --git a/sjsonnet/src/sjsonnet/Val.scala b/sjsonnet/src/sjsonnet/Val.scala index 65be07915..74429f8ea 100644 --- a/sjsonnet/src/sjsonnet/Val.scala +++ b/sjsonnet/src/sjsonnet/Val.scala @@ -433,11 +433,15 @@ object Val { if (ls != null && ls.isEmpty) return right if (rs != null && rs.isEmpty) return left // Small string eagerness: both flat and combined length <= 128 - if (ls != null && rs != null && ls.length + rs.length <= 128) - return new Str(pos, ls + rs) + if (ls != null && rs != null && ls.length + rs.length <= 128) { + val result = new Str(pos, ls + rs) + if (left._asciiSafe && right._asciiSafe) result._asciiSafe = true + return result + } // Rope node: O(1) val node = new Str(pos, null) node._children = Array(left, right) + if (left._asciiSafe && right._asciiSafe) node._asciiSafe = true node } } diff --git a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala index d1ac0eca5..7c5dc0a09 100644 --- a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala +++ b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala @@ -82,11 
+82,14 @@ object StringModule extends AbstractFunctionModule { Val.cachedNum( pos, (x.value match { - case Val.Str(_, s) => s.codePointCount(0, s.length) - case a: Val.Arr => a.length - case o: Val.Obj => o.visibleKeyNames.length - case o: Val.Func => o.params.names.length - case x => Error.fail("Cannot get length of " + x.prettyName) + case v: Val.Str => + val s = v.str + if (v._asciiSafe) s.length + else s.codePointCount(0, s.length) + case a: Val.Arr => a.length + case o: Val.Obj => o.visibleKeyNames.length + case o: Val.Func => o.params.names.length + case x => Error.fail("Cannot get length of " + x.prettyName) }).toDouble ) } @@ -126,7 +129,9 @@ object StringModule extends AbstractFunctionModule { */ private object Substr extends Val.Builtin3("substr", "str", "from", "len") { def evalRhs(_s: Eval, from: Eval, len: Eval, ev: EvalScope, pos: Position): Val = { - val str = _s.value.asString + val srcVal = _s.value + val str = srcVal.asString + val srcAsciiSafe = srcVal.isInstanceOf[Val.Str] && srcVal.asInstanceOf[Val.Str]._asciiSafe val offset = from.value match { case v: Val.Num => v.asPositiveInt case _ => Error.fail("Expected a number for offset in substr, got " + from.value.prettyName) @@ -138,6 +143,16 @@ object StringModule extends AbstractFunctionModule { if (length <= 0) { Val.Str(pos, "") + } else if (srcAsciiSafe) { + val strLen = str.length + val safeOffset = math.min(offset, strLen) + val safeLength = math.min(length, strLen - safeOffset) + if (safeLength <= 0) Val.Str(pos, "") + else { + val result = Val.Str(pos, str.substring(safeOffset, safeOffset + safeLength)) + result._asciiSafe = true + result + } } else { val requestedEnd = offset.toLong + length.toLong if ( diff --git a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala index a1197f700..d15b7b547 100644 --- a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala +++ b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala @@ -13,6 
+13,7 @@ object UnicodeHandlingTests extends TestSuite { test("stringLength") { eval("std.length('๐ŸŒ')") ==> ujson.Num(1) eval("std.length('Hello ๐ŸŒ')") ==> ujson.Num(7) + eval("std.length('ASCII only')") ==> ujson.Num(10) // Jsonnet strings are defined over codepoints, not grapheme clusters, so the // following "family" emoji has a length of 7 (because it has 7 codepoints): eval("std.length('๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ')") ==> ujson.Num(7) @@ -53,10 +54,19 @@ object UnicodeHandlingTests extends TestSuite { eval("std.substr('A๐ŸŒB', 0, 1)") ==> ujson.Str("A") eval("std.substr('A๐ŸŒB', 1, 1)") ==> ujson.Str("๐ŸŒ") eval("std.substr('A๐ŸŒB', 2, 1)") ==> ujson.Str("B") + eval("std.substr('ASCII only', 6, 4)") ==> ujson.Str("only") eval("std.substr('Hello ๐ŸŒ World', 6, 100)") ==> ujson.Str("๐ŸŒ World") eval("std.substr('๐ŸŒ', 1, 5)") ==> ujson.Str("") // Beyond string length } + test("longAsciiLengthAndSubstr") { + val longAscii = "a" * 1030 + eval(s"std.length('$longAscii')") ==> ujson.Num(1030) + eval(s"std.substr('$longAscii', 1028, 20)") ==> ujson.Str("aa") + eval(s"std.substr('$longAscii', 1031, 20)") ==> ujson.Str("") + eval(s"std.substr('$longAscii' + '$longAscii', 2058, 10)") ==> ujson.Str("aa") + } + test("stringSlice") { eval("'A๐ŸŒB'[0:1]") ==> ujson.Str("A") eval("'A๐ŸŒB'[1:2]") ==> ujson.Str("๐ŸŒ")