diff --git a/sjsonnet/src/sjsonnet/Parser.scala b/sjsonnet/src/sjsonnet/Parser.scala index e4ad6bbd..209c6d6e 100644 --- a/sjsonnet/src/sjsonnet/Parser.scala +++ b/sjsonnet/src/sjsonnet/Parser.scala @@ -748,7 +748,10 @@ class Parser( // cost more than the potential memory savings for strings that are unlikely // to repeat (e.g., 600KB text block literals) val unique = if (s.length > 1024) s else internedStrings.getOrElseUpdate(s, s) - Val.Str(pos, unique) + val result = Val.Str(pos, unique) + if (unique.length > 1024 && CharSWAR.isAsciiJsonSafe(unique)) + result._asciiSafe = true + result } // Any `expr` that isn't naively left-recursive diff --git a/sjsonnet/src/sjsonnet/Val.scala b/sjsonnet/src/sjsonnet/Val.scala index 65be0791..74429f8e 100644 --- a/sjsonnet/src/sjsonnet/Val.scala +++ b/sjsonnet/src/sjsonnet/Val.scala @@ -433,11 +433,15 @@ object Val { if (ls != null && ls.isEmpty) return right if (rs != null && rs.isEmpty) return left // Small string eagerness: both flat and combined length <= 128 - if (ls != null && rs != null && ls.length + rs.length <= 128) - return new Str(pos, ls + rs) + if (ls != null && rs != null && ls.length + rs.length <= 128) { + val result = new Str(pos, ls + rs) + if (left._asciiSafe && right._asciiSafe) result._asciiSafe = true + return result + } // Rope node: O(1) val node = new Str(pos, null) node._children = Array(left, right) + if (left._asciiSafe && right._asciiSafe) node._asciiSafe = true node } } diff --git a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala index d1ac0eca..7c5dc0a0 100644 --- a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala +++ b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala @@ -82,11 +82,14 @@ object StringModule extends AbstractFunctionModule { Val.cachedNum( pos, (x.value match { - case Val.Str(_, s) => s.codePointCount(0, s.length) - case a: Val.Arr => a.length - case o: Val.Obj => o.visibleKeyNames.length - case o: Val.Func => o.params.names.length - case x => Error.fail("Cannot get length of " + x.prettyName) + case v: Val.Str => + val s = v.str + if (v._asciiSafe) s.length + else s.codePointCount(0, s.length) + case a: Val.Arr => a.length + case o: Val.Obj => o.visibleKeyNames.length + case o: Val.Func => o.params.names.length + case x => Error.fail("Cannot get length of " + x.prettyName) }).toDouble ) } @@ -126,7 +129,9 @@ object StringModule extends AbstractFunctionModule { */ private object Substr extends Val.Builtin3("substr", "str", "from", "len") { def evalRhs(_s: Eval, from: Eval, len: Eval, ev: EvalScope, pos: Position): Val = { - val str = _s.value.asString + val srcVal = _s.value + val str = srcVal.asString + val srcAsciiSafe = srcVal.isInstanceOf[Val.Str] && srcVal.asInstanceOf[Val.Str]._asciiSafe val offset = from.value match { case v: Val.Num => v.asPositiveInt case _ => Error.fail("Expected a number for offset in substr, got " + from.value.prettyName) @@ -138,6 +143,16 @@ object StringModule extends AbstractFunctionModule { if (length <= 0) { Val.Str(pos, "") + } else if (srcAsciiSafe) { + val strLen = str.length + val safeOffset = math.min(offset, strLen) + val safeLength = math.min(length, strLen - safeOffset) + if (safeLength <= 0) Val.Str(pos, "") + else { + val result = Val.Str(pos, str.substring(safeOffset, safeOffset + safeLength)) + result._asciiSafe = true + result + } } else { val requestedEnd = offset.toLong + length.toLong if ( diff --git a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala index a1197f70..d15b7b54 100644 --- a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala +++ b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala @@ -13,6 +13,7 @@ object UnicodeHandlingTests extends TestSuite { test("stringLength") { eval("std.length('๐ŸŒ')") ==> ujson.Num(1) eval("std.length('Hello ๐ŸŒ')") ==> ujson.Num(7) + eval("std.length('ASCII only')") ==> ujson.Num(10) // Jsonnet strings are defined over codepoints, not grapheme clusters, so the // following "family" emoji has a length of 7 (because it has 7 codepoints): eval("std.length('๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ')") ==> ujson.Num(7) @@ -53,10 +54,19 @@ object UnicodeHandlingTests extends TestSuite { eval("std.substr('A๐ŸŒB', 0, 1)") ==> ujson.Str("A") eval("std.substr('A๐ŸŒB', 1, 1)") ==> ujson.Str("๐ŸŒ") eval("std.substr('A๐ŸŒB', 2, 1)") ==> ujson.Str("B") + eval("std.substr('ASCII only', 6, 4)") ==> ujson.Str("only") eval("std.substr('Hello ๐ŸŒ World', 6, 100)") ==> ujson.Str("๐ŸŒ World") eval("std.substr('๐ŸŒ', 1, 5)") ==> ujson.Str("") // Beyond string length } + test("longAsciiLengthAndSubstr") { + val longAscii = "a" * 1030 + eval(s"std.length('$longAscii')") ==> ujson.Num(1030) + eval(s"std.substr('$longAscii', 1028, 20)") ==> ujson.Str("aa") + eval(s"std.substr('$longAscii', 1031, 20)") ==> ujson.Str("") + eval(s"std.substr('$longAscii' + '$longAscii', 2058, 10)") ==> ujson.Str("aa") + } + test("stringSlice") { eval("'A๐ŸŒB'[0:1]") ==> ujson.Str("A") eval("'A๐ŸŒB'[1:2]") ==> ujson.Str("๐ŸŒ")