From 35937b922e5811a90321aeaff7c8176a5f39ffd8 Mon Sep 17 00:00:00 2001
From: He-Pin <hepin1989@gmail.com>
Date: Sun, 10 May 2026 19:40:19 +0800
Subject: [PATCH] perf: add ASCII-safe substr fast path

Motivation:
std.substr on long ASCII strings repeatedly pays codepoint-offset scans even when parser-time analysis can prove the literal is printable ASCII and JSON-render safe.

Modification:
Mark long (more than 1024 chars) ASCII JSON-safe string literals with the existing _asciiSafe flag using a single platform CharSWAR scan at parse time, propagate the flag through string concatenation when both operands carry it, and let std.length/std.substr use direct UTF-16 length/substring only for proven-safe values (substr results produced on that path are flagged as well). Add UnicodeHandlingTests coverage for long ASCII length/substr boundaries and concat propagation.

Result:
Focused JVM JMH improves go_suite/substr from 0.056 ms/op to 0.046-0.047 ms/op with split_resolve unchanged and realistic2 in the same noise range. Scala Native hyperfine is neutral against master on the same case.

References:
Extracted from ideas in databricks/sjsonnet#776, especially commit a190a800 (ASCII fast paths and asciiSafe propagation), narrowed to avoid the broader join/parseInt changes.
---
 sjsonnet/src/sjsonnet/Parser.scala            |  5 +++-
 sjsonnet/src/sjsonnet/Val.scala               |  8 ++++--
 .../src/sjsonnet/stdlib/StringModule.scala    | 27 ++++++++++++++-----
 .../src/sjsonnet/UnicodeHandlingTests.scala   | 10 +++++++
 4 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/sjsonnet/src/sjsonnet/Parser.scala b/sjsonnet/src/sjsonnet/Parser.scala
index e4ad6bbd0..209c6d6e2 100644
--- a/sjsonnet/src/sjsonnet/Parser.scala
+++ b/sjsonnet/src/sjsonnet/Parser.scala
@@ -748,7 +748,10 @@ class Parser(
     // cost more than the potential memory savings for strings that are unlikely
     // to repeat (e.g., 600KB text block literals)
     val unique = if (s.length > 1024) s else internedStrings.getOrElseUpdate(s, s)
-    Val.Str(pos, unique)
+    val result = Val.Str(pos, unique)
+    if (unique.length > 1024 && CharSWAR.isAsciiJsonSafe(unique))
+      result._asciiSafe = true
+    result
   }
 
   // Any `expr` that isn't naively left-recursive
diff --git a/sjsonnet/src/sjsonnet/Val.scala b/sjsonnet/src/sjsonnet/Val.scala
index 65be07915..74429f8ea 100644
--- a/sjsonnet/src/sjsonnet/Val.scala
+++ b/sjsonnet/src/sjsonnet/Val.scala
@@ -433,11 +433,15 @@ object Val {
       if (ls != null && ls.isEmpty) return right
       if (rs != null && rs.isEmpty) return left
       // Small string eagerness: both flat and combined length <= 128
-      if (ls != null && rs != null && ls.length + rs.length <= 128)
-        return new Str(pos, ls + rs)
+      if (ls != null && rs != null && ls.length + rs.length <= 128) {
+        val result = new Str(pos, ls + rs)
+        if (left._asciiSafe && right._asciiSafe) result._asciiSafe = true
+        return result
+      }
       // Rope node: O(1)
       val node = new Str(pos, null)
       node._children = Array(left, right)
+      if (left._asciiSafe && right._asciiSafe) node._asciiSafe = true
       node
     }
   }
diff --git a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala
index d1ac0eca5..7c5dc0a09 100644
--- a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala
+++ b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala
@@ -82,11 +82,14 @@ object StringModule extends AbstractFunctionModule {
       Val.cachedNum(
         pos,
         (x.value match {
-          case Val.Str(_, s) => s.codePointCount(0, s.length)
-          case a: Val.Arr    => a.length
-          case o: Val.Obj    => o.visibleKeyNames.length
-          case o: Val.Func   => o.params.names.length
-          case x             => Error.fail("Cannot get length of " + x.prettyName)
+          case v: Val.Str =>
+            val s = v.str
+            if (v._asciiSafe) s.length
+            else s.codePointCount(0, s.length)
+          case a: Val.Arr  => a.length
+          case o: Val.Obj  => o.visibleKeyNames.length
+          case o: Val.Func => o.params.names.length
+          case x           => Error.fail("Cannot get length of " + x.prettyName)
         }).toDouble
       )
   }
@@ -126,7 +129,9 @@ object StringModule extends AbstractFunctionModule {
    */
   private object Substr extends Val.Builtin3("substr", "str", "from", "len") {
     def evalRhs(_s: Eval, from: Eval, len: Eval, ev: EvalScope, pos: Position): Val = {
-      val str = _s.value.asString
+      val srcVal = _s.value
+      val str = srcVal.asString
+      val srcAsciiSafe = srcVal.isInstanceOf[Val.Str] && srcVal.asInstanceOf[Val.Str]._asciiSafe
       val offset = from.value match {
         case v: Val.Num => v.asPositiveInt
         case _ => Error.fail("Expected a number for offset in substr, got " + from.value.prettyName)
@@ -138,6 +143,16 @@ object StringModule extends AbstractFunctionModule {
 
       if (length <= 0) {
         Val.Str(pos, "")
+      } else if (srcAsciiSafe) {
+        val strLen = str.length
+        val safeOffset = math.min(offset, strLen)
+        val safeLength = math.min(length, strLen - safeOffset)
+        if (safeLength <= 0) Val.Str(pos, "")
+        else {
+          val result = Val.Str(pos, str.substring(safeOffset, safeOffset + safeLength))
+          result._asciiSafe = true
+          result
+        }
       } else {
         val requestedEnd = offset.toLong + length.toLong
         if (
diff --git a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala
index a1197f700..d15b7b547 100644
--- a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala
+++ b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala
@@ -13,6 +13,7 @@ object UnicodeHandlingTests extends TestSuite {
     test("stringLength") {
       eval("std.length('🌍')") ==> ujson.Num(1)
       eval("std.length('Hello 🌍')") ==> ujson.Num(7)
+      eval("std.length('ASCII only')") ==> ujson.Num(10)
       // Jsonnet strings are defined over codepoints, not grapheme clusters, so the
       // following "family" emoji has a length of 7 (because it has 7 codepoints):
       eval("std.length('👨‍👩‍👧‍👦')") ==> ujson.Num(7)
@@ -53,10 +54,19 @@ object UnicodeHandlingTests extends TestSuite {
       eval("std.substr('A🌍B', 0, 1)") ==> ujson.Str("A")
       eval("std.substr('A🌍B', 1, 1)") ==> ujson.Str("🌍")
       eval("std.substr('A🌍B', 2, 1)") ==> ujson.Str("B")
+      eval("std.substr('ASCII only', 6, 4)") ==> ujson.Str("only")
       eval("std.substr('Hello 🌍 World', 6, 100)") ==> ujson.Str("🌍 World")
       eval("std.substr('🌍', 1, 5)") ==> ujson.Str("") // Beyond string length
     }
 
+    test("longAsciiLengthAndSubstr") {
+      val longAscii = "a" * 1030
+      eval(s"std.length('$longAscii')") ==> ujson.Num(1030)
+      eval(s"std.substr('$longAscii', 1028, 20)") ==> ujson.Str("aa")
+      eval(s"std.substr('$longAscii', 1031, 20)") ==> ujson.Str("")
+      eval(s"std.substr('$longAscii' + '$longAscii', 2058, 10)") ==> ujson.Str("aa")
+    }
+
     test("stringSlice") {
       eval("'A🌍B'[0:1]") ==> ujson.Str("A")
       eval("'A🌍B'[1:2]") ==> ujson.Str("🌍")