From 677aa06fddfbb637f39ab8b6e5ea2c8d731e481e Mon Sep 17 00:00:00 2001 From: Jorge Moraleda Date: Tue, 31 Mar 2026 16:16:35 -0400 Subject: [PATCH 1/5] Support arbitrary multi-character delimiter strings --- .../jinjava/tree/parse/ExpressionToken.java | 9 +- .../hubspot/jinjava/tree/parse/NoteToken.java | 7 +- .../tree/parse/StringTokenScannerSymbols.java | 232 +++++++++++++ .../hubspot/jinjava/tree/parse/TagToken.java | 7 +- .../jinjava/tree/parse/TokenScanner.java | 324 ++++++++++++++++++ .../tree/parse/TokenScannerSymbols.java | 61 ++++ .../parse/StringTokenScannerSymbolsTest.java | 258 ++++++++++++++ 7 files changed, 893 insertions(+), 5 deletions(-) create mode 100644 src/main/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbols.java create mode 100644 src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/ExpressionToken.java b/src/main/java/com/hubspot/jinjava/tree/parse/ExpressionToken.java index d8d9996d5..1c0d679c0 100644 --- a/src/main/java/com/hubspot/jinjava/tree/parse/ExpressionToken.java +++ b/src/main/java/com/hubspot/jinjava/tree/parse/ExpressionToken.java @@ -54,7 +54,14 @@ public int getType() { @Override protected void parse() { - this.expr = WhitespaceUtils.unwrap(image, "{{", "}}"); + // Use the symbols-derived delimiter strings instead of the hardcoded "{{" / "}}" + // so that custom delimiters (e.g. "\VAR{" / "}") are stripped correctly. 
+ this.expr = + WhitespaceUtils.unwrap( + image, + getSymbols().getExpressionStart(), + getSymbols().getExpressionEnd() + ); this.expr = handleTrim(expr); this.expr = StringUtils.trimToEmpty(this.expr); } diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/NoteToken.java b/src/main/java/com/hubspot/jinjava/tree/parse/NoteToken.java index 3f5360e67..450f9ccbd 100644 --- a/src/main/java/com/hubspot/jinjava/tree/parse/NoteToken.java +++ b/src/main/java/com/hubspot/jinjava/tree/parse/NoteToken.java @@ -48,8 +48,11 @@ public int getType() { */ @Override protected void parse() { - if (image.length() > 4) { // {# #} - handleTrim(image.substring(2, image.length() - 2)); + int startLen = getSymbols().getCommentStartLength(); + int endLen = getSymbols().getCommentEndLength(); + + if (image.length() > startLen + endLen) { + handleTrim(image.substring(startLen, image.length() - endLen)); } content = ""; } diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbols.java b/src/main/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbols.java new file mode 100644 index 000000000..2ef47b719 --- /dev/null +++ b/src/main/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbols.java @@ -0,0 +1,232 @@ +/********************************************************************** + * Copyright (c) 2014 HubSpot Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ **********************************************************************/ +package com.hubspot.jinjava.tree.parse; + +/** + * A {@link TokenScannerSymbols} implementation that supports arbitrary multi-character + * delimiter strings, addressing + * issue #195. + * + *

Unlike {@link DefaultTokenScannerSymbols}, which is constrained to single-character + * prefixes and postfixes, this class allows any non-empty string for each of the six + * delimiter roles. The delimiters do not need to share a common prefix character. + * + *

{@link TokenScanner} detects this class via {@link #isStringBased()} and activates + * a string-matching scan path. {@link ExpressionToken}, {@link TagToken}, and + * {@link NoteToken} use the length accessors on {@link TokenScannerSymbols} (e.g. + * {@link #getExpressionStartLength()}) to strip delimiters correctly regardless of length. + * + *

The single-character abstract methods inherited from {@link TokenScannerSymbols} + * return private Unicode Private-Use-Area sentinel values. These are used only as + * token-kind discriminators inside {@link Token#newToken} and must never be used for + * scanning template text. + * + *

Example

+ *
{@code
+ * JinjavaConfig config = JinjavaConfig.newBuilder()
+ *     .withTokenScannerSymbols(StringTokenScannerSymbols.builder()
+ *         .withVariableStartString("\\VAR{")
+ *         .withVariableEndString("}")
+ *         .withBlockStartString("\\BLOCK{")
+ *         .withBlockEndString("}")
+ *         .withCommentStartString("\\#{")
+ *         .withCommentEndString("}")
+ *         .build())
+ *     .build();
+ * }
+ */ +public class StringTokenScannerSymbols extends TokenScannerSymbols { + + private static final long serialVersionUID = 1L; + + // ── Internal sentinel chars ──────────────────────────────────────────────── + // Unicode Private Use Area values — guaranteed never to appear in real template + // text, so Token.newToken()'s if-chain dispatches to the right Token subclass. + static final char SENTINEL_FIXED = '\uE000'; + static final char SENTINEL_NOTE = '\uE001'; + static final char SENTINEL_TAG = '\uE002'; + static final char SENTINEL_EXPR_START = '\uE003'; + static final char SENTINEL_EXPR_END = '\uE004'; + static final char SENTINEL_PREFIX = '\uE005'; // unused for scanning + static final char SENTINEL_POSTFIX = '\uE006'; // unused for scanning + static final char SENTINEL_NEWLINE = '\n'; // real newline for line tracking + static final char SENTINEL_TRIM = '-'; // real trim char + + // ── The configured string delimiters ────────────────────────────────────── + private final String variableStartString; + private final String variableEndString; + private final String blockStartString; + private final String blockEndString; + private final String commentStartString; + private final String commentEndString; + + private StringTokenScannerSymbols(Builder builder) { + this.variableStartString = builder.variableStartString; + this.variableEndString = builder.variableEndString; + this.blockStartString = builder.blockStartString; + this.blockEndString = builder.blockEndString; + this.commentStartString = builder.commentStartString; + this.commentEndString = builder.commentEndString; + } + + // ── Abstract char contract — returns sentinels only ─────────────────────── + + @Override + public char getPrefixChar() { + return SENTINEL_PREFIX; + } + + @Override + public char getPostfixChar() { + return SENTINEL_POSTFIX; + } + + @Override + public char getFixedChar() { + return SENTINEL_FIXED; + } + + @Override + public char getNoteChar() { + return SENTINEL_NOTE; + } + + 
@Override + public char getTagChar() { + return SENTINEL_TAG; + } + + @Override + public char getExprStartChar() { + return SENTINEL_EXPR_START; + } + + @Override + public char getExprEndChar() { + return SENTINEL_EXPR_END; + } + + @Override + public char getNewlineChar() { + return SENTINEL_NEWLINE; + } + + @Override + public char getTrimChar() { + return SENTINEL_TRIM; + } + + // ── String-level getters: MUST override the base-class lazy cache ────────── + // The base class builds these from the char methods above, which would produce + // garbage sentinel strings. We override them to return the real delimiters so + // that ExpressionToken, TagToken, and NoteToken strip content correctly. + + @Override + public String getExpressionStart() { + return variableStartString; + } + + @Override + public String getExpressionEnd() { + return variableEndString; + } + + @Override + public String getExpressionStartWithTag() { + return blockStartString; + } + + @Override + public String getExpressionEndWithTag() { + return blockEndString; + } + + @Override + public String getOpeningComment() { + return commentStartString; + } + + @Override + public String getClosingComment() { + return commentEndString; + } + + // ── isStringBased flag ──────────────────────────────────────────────────── + + @Override + public boolean isStringBased() { + return true; + } + + // ── Builder ──────────────────────────────────────────────────────────────── + + public static Builder builder() { + return new Builder(); + } + + public static final class Builder { + + // Defaults mirror the standard Jinja2 delimiters, so building with no + // overrides behaves identically to DefaultTokenScannerSymbols. 
+ private String variableStartString = "{{"; + private String variableEndString = "}}"; + private String blockStartString = "{%"; + private String blockEndString = "%}"; + private String commentStartString = "{#"; + private String commentEndString = "#}"; + + public Builder withVariableStartString(String s) { + this.variableStartString = requireNonEmpty(s, "variableStartString"); + return this; + } + + public Builder withVariableEndString(String s) { + this.variableEndString = requireNonEmpty(s, "variableEndString"); + return this; + } + + public Builder withBlockStartString(String s) { + this.blockStartString = requireNonEmpty(s, "blockStartString"); + return this; + } + + public Builder withBlockEndString(String s) { + this.blockEndString = requireNonEmpty(s, "blockEndString"); + return this; + } + + public Builder withCommentStartString(String s) { + this.commentStartString = requireNonEmpty(s, "commentStartString"); + return this; + } + + public Builder withCommentEndString(String s) { + this.commentEndString = requireNonEmpty(s, "commentEndString"); + return this; + } + + public StringTokenScannerSymbols build() { + return new StringTokenScannerSymbols(this); + } + + private static String requireNonEmpty(String value, String name) { + if (value == null || value.isEmpty()) { + throw new IllegalArgumentException(name + " must not be null or empty"); + } + return value; + } + } +} diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/TagToken.java b/src/main/java/com/hubspot/jinjava/tree/parse/TagToken.java index a737dd96c..0c500c145 100644 --- a/src/main/java/com/hubspot/jinjava/tree/parse/TagToken.java +++ b/src/main/java/com/hubspot/jinjava/tree/parse/TagToken.java @@ -54,7 +54,10 @@ public int getType() { */ @Override protected void parse() { - if (image.length() < 4) { + int startLen = getSymbols().getTagStartLength(); + int endLen = getSymbols().getTagEndLength(); + + if (image.length() < startLen + endLen) { throw new TemplateSyntaxException( image, 
"Malformed tag token", @@ -63,7 +66,7 @@ protected void parse() { ); } - content = image.substring(2, image.length() - 2); + content = image.substring(startLen, image.length() - endLen); content = handleTrim(content); int nameStart = -1, pos = 0, len = content.length(); diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java index 7e53b295a..b99ec8b5f 100644 --- a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java +++ b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java @@ -42,6 +42,19 @@ public class TokenScanner extends AbstractIterator { private final TokenScannerSymbols symbols; private final WhitespaceControlParser whitespaceControlParser; + // String-based path state — only populated when symbols.isStringBased() == true. + private final boolean stringBased; + private final char[] varStart; + private final char[] varEnd; + private final char[] blkStart; + private final char[] blkEnd; + private final char[] cmtStart; + private final char[] cmtEnd; + + // Remembers the position where the current opening delimiter began so that the + // emitted block/comment token image starts from the opener, not the content. + private int blockOpenerStart = 0; + public TokenScanner(String input, JinjavaConfig config) { this.config = config; @@ -58,15 +71,326 @@ public TokenScanner(String input, JinjavaConfig config) { inQuote = 0; currLine = 1; lastNewlinePos = 0; + blockOpenerStart = 0; symbols = config.getTokenScannerSymbols(); + stringBased = symbols.isStringBased(); whitespaceControlParser = config.getLegacyOverrides().isParseWhitespaceControlStrictly() ? 
WhitespaceControlParser.STRICT : WhitespaceControlParser.LENIENT; + + if (stringBased) { + varStart = symbols.getExpressionStart().toCharArray(); + varEnd = symbols.getExpressionEnd().toCharArray(); + blkStart = symbols.getExpressionStartWithTag().toCharArray(); + blkEnd = symbols.getExpressionEndWithTag().toCharArray(); + cmtStart = symbols.getOpeningComment().toCharArray(); + cmtEnd = symbols.getClosingComment().toCharArray(); + } else { + varStart = varEnd = blkStart = blkEnd = cmtStart = cmtEnd = null; + } } + // ── Dispatch ─────────────────────────────────────────────────────────────── + private Token getNextToken() { + return stringBased ? getNextTokenStringBased() : getNextTokenCharBased(); + } + + // ── String-based scanning path ───────────────────────────────────────────── + // + // Design: + // + // tokenStart — start of the next text region to buffer (updated after + // each emitted token to point just past the emitted content). + // blockOpenerStart — position of the opening delimiter character; the emitted + // block/comment token image begins here so that the token's + // parse() method can strip the correct number of delimiter + // characters from both ends. + // lastStart/tokenLength — the slice passed to Token.newToken(). + // + // Two-phase emission: + // 1. Opener detected → flush any buffered plain text as a TEXT token (using + // is[tokenStart..openerPos)). Record blockOpenerStart = openerPos. Advance + // tokenStart and currPost past the opener into the block content. + // 2. Closer detected → emit the full delimited image (is[blockOpenerStart.. + // closerEnd)) as the appropriate token type. Advance tokenStart = currPost + // = closerEnd so the next iteration starts after the closer. + + private Token getNextTokenStringBased() { + while (currPost < length) { + char c = is[currPost]; + + // Track newlines for accurate line/column numbers. 
+ if (c == '\n') { + currLine++; + lastNewlinePos = currPost + 1; + } + + // ── State: inside a comment ──────────────────────────────────────────── + if (inComment > 0) { + if (regionMatches(currPost, cmtEnd)) { + // Emit from the opener start to the end of the closer. + lastStart = blockOpenerStart; + tokenLength = currPost + cmtEnd.length - blockOpenerStart; + tokenStart = currPost + cmtEnd.length; + currPost = tokenStart; + inComment = 0; + int kind = tokenKind; + tokenKind = symbols.getFixed(); + return emitStringToken(kind); + } + currPost++; + continue; + } + + // ── State: inside a block (variable expression or tag) ──────────────── + if (inBlock > 0) { + // Bounds-safe backslash skip outside quoted strings. + if (c == '\\') { + currPost += (currPost + 1 < length) ? 2 : 1; + continue; + } + // Inside a quoted string: handle escape sequences so a delimiter + // character that appears as \" or \' does not prematurely close the block. + if (inQuote != 0) { + if (c == inQuote) { + inQuote = 0; + } + currPost++; + continue; + } + if (c == '\'' || c == '"') { + inQuote = c; + currPost++; + continue; + } + + // Check for the closing delimiter matching the current block type. + char[] closeDelim = closingDelimFor(tokenKind); + if (closeDelim != null && regionMatches(currPost, closeDelim)) { + // Emit from the opener start to the end of the closer. + lastStart = blockOpenerStart; + tokenLength = currPost + closeDelim.length - blockOpenerStart; + tokenStart = currPost + closeDelim.length; + currPost = tokenStart; + inBlock = 0; + int kind = tokenKind; + tokenKind = symbols.getFixed(); + return emitStringToken(kind); + } + currPost++; + continue; + } + + // ── State: plain text — look for any opening delimiter ──────────────── + if (inRaw == 0) { + // Variable opener e.g. 
"{{" or "\VAR{" + if (regionMatches(currPost, varStart)) { + Token pending = flushTextBefore(currPost); + blockOpenerStart = currPost; + tokenStart = currPost + varStart.length; + currPost = tokenStart; + tokenKind = symbols.getExprStart(); + inBlock = 1; + if (pending != null) { + return pending; + } + continue; + } + // Block opener e.g. "{%" or "\BLOCK{" + if (regionMatches(currPost, blkStart)) { + Token pending = flushTextBefore(currPost); + blockOpenerStart = currPost; + tokenStart = currPost + blkStart.length; + currPost = tokenStart; + tokenKind = symbols.getTag(); + inBlock = 1; + if (pending != null) { + return pending; + } + continue; + } + // Comment opener e.g. "{#" or "\#{" + if (regionMatches(currPost, cmtStart)) { + Token pending = flushTextBefore(currPost); + blockOpenerStart = currPost; + tokenStart = currPost + cmtStart.length; + currPost = tokenStart; + tokenKind = symbols.getNote(); + inComment = 1; + if (pending != null) { + return pending; + } + continue; + } + } else { + // In raw mode: only exit on a block opener immediately followed + // (after optional whitespace) by "endraw". + if (regionMatches(currPost, blkStart)) { + int contentStart = currPost + blkStart.length; + int pos = contentStart; + while (pos < length && Character.isWhitespace(is[pos])) { + pos++; + } + if (charArrayRegionMatches(is, pos, "endraw")) { + Token pending = flushTextBefore(currPost); + blockOpenerStart = currPost; + tokenStart = contentStart; + currPost = tokenStart; + tokenKind = symbols.getTag(); + inBlock = 1; + if (pending != null) { + return pending; + } + continue; + } + } + } + + currPost++; + } + + // End of input: flush any remaining buffered content. + if (currPost > tokenStart) { + return getEndTokenStringBased(); + } + return null; + } + + /** + * If {@code is[tokenStart..upTo)} contains un-emitted plain text, captures it + * as a TEXT token and returns it. Returns {@code null} for zero-length regions. + * + *

The caller MUST set {@code tokenStart} (and other state) after calling this, + * regardless of whether a token was returned. This method does NOT update + * {@code tokenStart} — that would produce the wrong value since the caller needs + * to set it to just past the opening delimiter. + */ + private Token flushTextBefore(int upTo) { + int textLen = upTo - tokenStart; + if (textLen <= 0) { + return null; + } + lastStart = tokenStart; + tokenLength = textLen; + return emitStringToken(symbols.getFixed()); + } + + /** Returns the closing delimiter for the currently open block kind. */ + private char[] closingDelimFor(int currentKind) { + if (currentKind == symbols.getExprStart()) { + return varEnd; + } + if (currentKind == symbols.getTag()) { + return blkEnd; + } + if (currentKind == symbols.getNote()) { + return cmtEnd; + } + return null; + } + + /** + * Constructs a token from {@code lastStart}/{@code tokenLength}, then applies + * trimBlocks and raw-mode post-processing identical to the char-based path. + */ + private Token emitStringToken(int kind) { + Token t = Token.newToken( + kind, + symbols, + whitespaceControlParser, + String.valueOf(is, lastStart, tokenLength), + currLine, + lastStart - lastNewlinePos + 1 + ); + + if ( + (t instanceof TagToken || t instanceof NoteToken) && + config.isTrimBlocks() && + currPost < length && + is[currPost] == '\n' + ) { + lastNewlinePos = currPost + 1; + ++currPost; + ++tokenStart; + } + + if (t instanceof TagToken) { + TagToken tt = (TagToken) t; + if ("raw".equals(tt.getTagName())) { + inRaw = 1; + return tt; + } else if ("endraw".equals(tt.getTagName())) { + inRaw = 0; + return tt; + } + } + + if (inRaw > 0 && t.getType() != symbols.getFixed()) { + return Token.newToken( + symbols.getFixed(), + symbols, + whitespaceControlParser, + t.image, + currLine, + lastStart - lastNewlinePos + 1 + ); + } + + return t; + } + + /** + * Emits whatever remains at end-of-input. + * + *

FIX (infinite loop): advances {@code tokenStart = currPost} so that the + * next call to {@code getNextTokenStringBased()} finds {@code currPost == tokenStart} + * and returns {@code null} (end of data) instead of re-emitting the same slice. + */ + private Token getEndTokenStringBased() { + tokenLength = currPost - tokenStart; + lastStart = tokenStart; + tokenStart = currPost; // ← prevents re-emission on subsequent calls + int type = symbols.getFixed(); + if (inComment > 0) { + type = symbols.getNote(); + } else if (inBlock > 0) { + return new UnclosedToken( + String.valueOf(is, lastStart, tokenLength), + currLine, + lastStart - lastNewlinePos + 1, + symbols, + whitespaceControlParser + ); + } + return Token.newToken( + type, + symbols, + whitespaceControlParser, + String.valueOf(is, lastStart, tokenLength), + currLine, + lastStart - lastNewlinePos + 1 + ); + } + + /** Returns true if {@code is[pos..]} starts with {@code pattern}. */ + private boolean regionMatches(int pos, char[] pattern) { + if (pos + pattern.length > length) { + return false; + } + for (int i = 0; i < pattern.length; i++) { + if (is[pos + i] != pattern[i]) { + return false; + } + } + return true; + } + + // ── Original char-based scanning path (completely unchanged) ────────────── + + private Token getNextTokenCharBased() { char c; while (currPost < length) { c = is[currPost++]; diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScannerSymbols.java b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScannerSymbols.java index 771dbda41..d8b5f5aa8 100644 --- a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScannerSymbols.java +++ b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScannerSymbols.java @@ -129,4 +129,65 @@ public static boolean isNoteTagOrExprChar(TokenScannerSymbols symbols, char c) { c == symbols.getNote() || c == symbols.getTag() || c == symbols.getExprStartChar() ); } + + // ── New API ──────────────────────────────────────────────────────────────── + + /** + * 
Returns {@code true} if this instance uses arbitrary string delimiters that + * require the string-matching scan path in {@link TokenScanner}. + * + *

The default returns {@code false}, so all existing subclasses are unaffected. + * {@link StringTokenScannerSymbols} overrides this to return {@code true}. + */ + public boolean isStringBased() { + return false; + } + + /** + * Length of the variable/expression opening delimiter (e.g. 2 for {@code "{{"}), + * used by {@link ExpressionToken#parse()} instead of the hardcoded constant 2. + */ + public int getExpressionStartLength() { + return getExpressionStart().length(); + } + + /** + * Length of the variable/expression closing delimiter (e.g. 2 for {@code "}}"}), + * used by {@link ExpressionToken#parse()} instead of the hardcoded constant 2. + */ + public int getExpressionEndLength() { + return getExpressionEnd().length(); + } + + /** + * Length of the block/tag opening delimiter (e.g. 2 for {@code "{%"}), + * used by {@link TagToken#parse()} instead of the hardcoded constant 2. + */ + public int getTagStartLength() { + return getExpressionStartWithTag().length(); + } + + /** + * Length of the block/tag closing delimiter (e.g. 2 for {@code "%}"}), + * used by {@link TagToken#parse()} instead of the hardcoded constant 2. + */ + public int getTagEndLength() { + return getExpressionEndWithTag().length(); + } + + /** + * Length of the comment opening delimiter (e.g. 2 for {@code "{#"}), + * used by {@link NoteToken#parse()} instead of the hardcoded constant 2. + */ + public int getCommentStartLength() { + return getOpeningComment().length(); + } + + /** + * Length of the comment closing delimiter (e.g. 2 for {@code "#}"}), + * used by {@link NoteToken#parse()} instead of the hardcoded constant 2. 
+ */ + public int getCommentEndLength() { + return getClosingComment().length(); + } } diff --git a/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java b/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java new file mode 100644 index 000000000..e2f5c7bc2 --- /dev/null +++ b/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java @@ -0,0 +1,258 @@ +package com.hubspot.jinjava.tree.parse; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; +import com.hubspot.jinjava.BaseJinjavaTest; +import com.hubspot.jinjava.Jinjava; +import com.hubspot.jinjava.JinjavaConfig; +import com.hubspot.jinjava.lib.filter.JoinFilterTest.User; +import java.util.HashMap; +import org.junit.Before; +import org.junit.Test; + +public class StringTokenScannerSymbolsTest { + + // ── Shared symbol configurations ─────────────────────────────────────────── + + /** LaTeX-style delimiters as used in the original issue #195 example. */ + private static final StringTokenScannerSymbols LATEX_SYMBOLS = StringTokenScannerSymbols + .builder() + .withVariableStartString("\\VAR{") + .withVariableEndString("}") + .withBlockStartString("\\BLOCK{") + .withBlockEndString("}") + .withCommentStartString("\\#{") + .withCommentEndString("}") + .build(); + + /** Angle-bracket style — same delimiters as the existing CustomTokenScannerSymbolsTest. 
*/ + private static final StringTokenScannerSymbols ANGLE_SYMBOLS = StringTokenScannerSymbols + .builder() + .withVariableStartString("<<") + .withVariableEndString(">>") + .withBlockStartString("<%") + .withBlockEndString("%>") + .withCommentStartString("<#") + .withCommentEndString("#>") + .build(); + + private Jinjava latexJinjava; + private Jinjava angleJinjava; + + @Before + public void setup() { + latexJinjava = + new Jinjava( + BaseJinjavaTest.newConfigBuilder().withTokenScannerSymbols(LATEX_SYMBOLS).build() + ); + latexJinjava + .getGlobalContext() + .put("numbers", Lists.newArrayList(1L, 2L, 3L, 4L, 5L)); + + angleJinjava = + new Jinjava( + BaseJinjavaTest.newConfigBuilder().withTokenScannerSymbols(ANGLE_SYMBOLS).build() + ); + angleJinjava + .getGlobalContext() + .put("numbers", Lists.newArrayList(1L, 2L, 3L, 4L, 5L)); + } + + // ── Plain text ───────────────────────────────────────────────────────────── + + @Test + public void itRendersPlainText() { + String template = "jinjava interpreter works correctly"; + assertThat(latexJinjava.render(template, new HashMap<>())).isEqualTo(template); + assertThat(angleJinjava.render(template, new HashMap<>())).isEqualTo(template); + } + + // ── Variable expressions ─────────────────────────────────────────────────── + + @Test + public void itRendersVariablesWithLatexSymbols() { + assertThat(latexJinjava.render("\\VAR{ name }", ImmutableMap.of("name", "World"))) + .isEqualTo("World"); + } + + @Test + public void itRendersVariablesWithAngleSymbols() { + assertThat(angleJinjava.render("<< name >>", ImmutableMap.of("name", "World"))) + .isEqualTo("World"); + } + + // ── Default delimiters pass through as literal text ──────────────────────── + + @Test + public void itPassesThroughDefaultCurlyBracesAsLiteralText() { + // With custom delimiters, {{ }} must be treated as plain text, not expressions. 
+ assertThat( + latexJinjava.render( + "{{ not a variable }} \\VAR{ name }", + ImmutableMap.of("name", "Jorge") + ) + ) + .isEqualTo("{{ not a variable }} Jorge"); + + assertThat( + angleJinjava.render( + "{{ not a variable }} << name >>", + ImmutableMap.of("name", "Jorge") + ) + ) + .isEqualTo("{{ not a variable }} Jorge"); + } + + // ── Block tags ───────────────────────────────────────────────────────────── + + @Test + public void itRendersIfBlockWithLatexSymbols() { + assertThat( + latexJinjava.render( + "\\BLOCK{ if show }hello\\BLOCK{ endif }", + ImmutableMap.of("show", true) + ) + ) + .isEqualTo("hello"); + + assertThat( + latexJinjava.render( + "\\BLOCK{ if show }hello\\BLOCK{ endif }", + ImmutableMap.of("show", false) + ) + ) + .isEqualTo(""); + } + + @Test + public void itRendersSetBlockWithAngleSymbols() { + assertThat( + angleJinjava.render( + "<% set d=d | default(\"some random value\") %><< d >>", + new HashMap<>() + ) + ) + .isEqualTo("some random value"); + } + + // ── Comments ─────────────────────────────────────────────────────────────── + + @Test + public void itStripsCommentsWithLatexSymbols() { + assertThat(latexJinjava.render("before\\#{ this is ignored }after", new HashMap<>())) + .isEqualTo("beforeafter"); + } + + @Test + public void itStripsCommentsWithAngleSymbols() { + assertThat(angleJinjava.render("before<# this is ignored #>after", new HashMap<>())) + .isEqualTo("beforeafter"); + } + + // ── Filters ──────────────────────────────────────────────────────────────── + + @Test + public void itRendersFiltersWithLatexSymbols() { + assertThat(latexJinjava.render("\\VAR{ [1, 2, 3, 3]|union(null) }", new HashMap<>())) + .isEqualTo("[1, 2, 3]"); + assertThat( + latexJinjava.render("\\VAR{ numbers|select('equalto', 3) }", new HashMap<>()) + ) + .isEqualTo("[3]"); + } + + @Test + public void itRendersFiltersWithAngleSymbols() { + assertThat(angleJinjava.render("<< [1, 2, 3, 3]|union(null) >>", new HashMap<>())) + .isEqualTo("[1, 2, 3]"); + 
assertThat(angleJinjava.render("<< numbers|select('equalto', 3) >>", new HashMap<>())) + .isEqualTo("[3]"); + } + + @Test + public void itRendersMapFilterWithLatexSymbols() { + assertThat( + latexJinjava.render( + "\\VAR{ users|map(attribute='username')|join(', ') }", + ImmutableMap.of( + "users", + (Object) Lists.newArrayList(new User("foo"), new User("bar")) + ) + ) + ) + .isEqualTo("foo, bar"); + } + + @Test + public void itRendersMapFilterWithAngleSymbols() { + assertThat( + angleJinjava.render( + "<< users|map(attribute='username')|join(', ') >>", + ImmutableMap.of( + "users", + (Object) Lists.newArrayList(new User("foo"), new User("bar")) + ) + ) + ) + .isEqualTo("foo, bar"); + } + + // ── Delimiter characters inside string literals in expressions ───────────── + + @Test + public void itHandlesClosingDelimiterInsideQuotedString() { + // The "}" inside the default string must not prematurely close \VAR{ + assertThat(latexJinjava.render("\\VAR{ name | default(\"}\") }", new HashMap<>())) + .isEqualTo("}"); + } + + @Test + public void itHandlesClosingDelimiterInsideQuotedStringAngle() { + // ">>" inside a quoted string must not close the << expression + assertThat(angleJinjava.render("<< name | default(\">>\") >>", new HashMap<>())) + .isEqualTo(">>"); + } + + // ── Builder defaults produce same behaviour as DefaultTokenScannerSymbols ── + + @Test + public void defaultBuilderBehavesLikeDefaultSymbols() { + Jinjava defaultJinjava = new Jinjava(); + Jinjava stringBasedDefaultJinjava = new Jinjava( + JinjavaConfig + .newBuilder() + .withTokenScannerSymbols(StringTokenScannerSymbols.builder().build()) + .build() + ); + String template = "{{ greeting }}, {{ name }}!"; + ImmutableMap ctx = ImmutableMap.of( + "greeting", + "Hello", + "name", + "World" + ); + assertThat(stringBasedDefaultJinjava.render(template, ctx)) + .isEqualTo(defaultJinjava.render(template, ctx)); + } + + // ── Builder validation ───────────────────────────────────────────────────── + + @Test + 
public void builderRejectsEmptyDelimiter() { + assertThatThrownBy(() -> + StringTokenScannerSymbols.builder().withVariableStartString("").build() + ) + .isInstanceOf(IllegalArgumentException.class); + } + + @Test + public void builderRejectsNullDelimiter() { + assertThatThrownBy(() -> + StringTokenScannerSymbols.builder().withBlockEndString(null).build() + ) + .isInstanceOf(IllegalArgumentException.class); + } +} From 612724309cb924edd267c27f71bf61ab38248ccf Mon Sep 17 00:00:00 2001 From: Jorge Moraleda Date: Tue, 31 Mar 2026 17:41:48 -0400 Subject: [PATCH 2/5] Support single line logic for blocks and comments using a prefix --- .../tree/parse/StringTokenScannerSymbols.java | 37 ++ .../jinjava/tree/parse/TokenScanner.java | 477 ++++++++++++------ .../tree/parse/TokenScannerSymbols.java | 25 + .../parse/StringTokenScannerSymbolsTest.java | 88 ++++ 4 files changed, 484 insertions(+), 143 deletions(-) diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbols.java b/src/main/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbols.java index 2ef47b719..242abd241 100644 --- a/src/main/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbols.java +++ b/src/main/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbols.java @@ -72,6 +72,9 @@ public class StringTokenScannerSymbols extends TokenScannerSymbols { private final String blockEndString; private final String commentStartString; private final String commentEndString; + // Optional; null means disabled. 
+ private final String lineStatementPrefix; + private final String lineCommentPrefix; private StringTokenScannerSymbols(Builder builder) { this.variableStartString = builder.variableStartString; @@ -80,6 +83,8 @@ private StringTokenScannerSymbols(Builder builder) { this.blockEndString = builder.blockEndString; this.commentStartString = builder.commentStartString; this.commentEndString = builder.commentEndString; + this.lineStatementPrefix = builder.lineStatementPrefix; + this.lineCommentPrefix = builder.lineCommentPrefix; } // ── Abstract char contract — returns sentinels only ─────────────────────── @@ -164,6 +169,16 @@ public String getClosingComment() { return commentEndString; } + @Override + public String getLineStatementPrefix() { + return lineStatementPrefix; + } + + @Override + public String getLineCommentPrefix() { + return lineCommentPrefix; + } + // ── isStringBased flag ──────────────────────────────────────────────────── @Override @@ -187,6 +202,8 @@ public static final class Builder { private String blockEndString = "%}"; private String commentStartString = "{#"; private String commentEndString = "#}"; + private String lineStatementPrefix = null; // disabled by default + private String lineCommentPrefix = null; // disabled by default public Builder withVariableStartString(String s) { this.variableStartString = requireNonEmpty(s, "variableStartString"); @@ -218,6 +235,26 @@ public Builder withCommentEndString(String s) { return this; } + /** + * Sets the line statement prefix (e.g. {@code "%%"}). A line beginning with + * this prefix is treated as a block tag, equivalent to wrapping its content + * in the configured block delimiters. Pass {@code null} to disable (default). + */ + public Builder withLineStatementPrefix(String s) { + this.lineStatementPrefix = s; + return this; + } + + /** + * Sets the line comment prefix (e.g. {@code "%#"}). A line beginning with + * this prefix is stripped entirely from the output. 
Pass {@code null} to + * disable (default). + */ + public Builder withLineCommentPrefix(String s) { + this.lineCommentPrefix = s; + return this; + } + public StringTokenScannerSymbols build() { return new StringTokenScannerSymbols(this); } diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java index b99ec8b5f..fc203ef21 100644 --- a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java +++ b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java @@ -51,8 +51,13 @@ public class TokenScanner extends AbstractIterator { private final char[] cmtStart; private final char[] cmtEnd; - // Remembers the position where the current opening delimiter began so that the - // emitted block/comment token image starts from the opener, not the content. + // Optional line-oriented prefixes; null when not configured. + private final char[] lineStmtPrefix; + private final char[] lineCommentPrefix; + + // Remembers where the current opening delimiter began so the emitted block/comment + // token image starts from the opener (not the content), letting parse() strip the + // correct number of delimiter characters from both ends. private int blockOpenerStart = 0; public TokenScanner(String input, JinjavaConfig config) { @@ -87,8 +92,16 @@ public TokenScanner(String input, JinjavaConfig config) { blkEnd = symbols.getExpressionEndWithTag().toCharArray(); cmtStart = symbols.getOpeningComment().toCharArray(); cmtEnd = symbols.getClosingComment().toCharArray(); + + String lsp = symbols.getLineStatementPrefix(); + lineStmtPrefix = (lsp != null && !lsp.isEmpty()) ? lsp.toCharArray() : null; + + String lcp = symbols.getLineCommentPrefix(); + lineCommentPrefix = (lcp != null && !lcp.isEmpty()) ? 
lcp.toCharArray() : null; } else { varStart = varEnd = blkStart = blkEnd = cmtStart = cmtEnd = null; + lineStmtPrefix = null; + lineCommentPrefix = null; } } @@ -100,173 +113,365 @@ private Token getNextToken() { // ── String-based scanning path ───────────────────────────────────────────── // - // Design: - // - // tokenStart — start of the next text region to buffer (updated after - // each emitted token to point just past the emitted content). - // blockOpenerStart — position of the opening delimiter character; the emitted - // block/comment token image begins here so that the token's - // parse() method can strip the correct number of delimiter - // characters from both ends. - // lastStart/tokenLength — the slice passed to Token.newToken(). + // tokenStart — start of the next text region to buffer. + // blockOpenerStart — position of the current opening delimiter; the emitted + // block/comment token image begins here. + // lastStart / tokenLength — the slice passed to Token.newToken(). // // Two-phase emission: - // 1. Opener detected → flush any buffered plain text as a TEXT token (using - // is[tokenStart..openerPos)). Record blockOpenerStart = openerPos. Advance - // tokenStart and currPost past the opener into the block content. - // 2. Closer detected → emit the full delimited image (is[blockOpenerStart.. - // closerEnd)) as the appropriate token type. Advance tokenStart = currPost - // = closerEnd so the next iteration starts after the closer. + // 1. Opener detected → flush buffered plain text as TEXT, record + // blockOpenerStart, advance tokenStart/currPost past the opener into + // the block content, set inBlock/inComment. + // 2. Closer detected → emit is[blockOpenerStart .. closerEnd) as the + // appropriate token type; advance tokenStart = currPost = closerEnd. + + // Sentinel returned by scan helpers to mean "a delimiter was matched and + // scanner state was updated — loop again without advancing currPost". 
+ // Any non-null return from a helper that is NOT this sentinel is a real token. + private static final Token DELIMITER_MATCHED = new TextToken( + "", + 0, + 0, + new DefaultTokenScannerSymbols() + ); private Token getNextTokenStringBased() { while (currPost < length) { char c = is[currPost]; - // Track newlines for accurate line/column numbers. if (c == '\n') { currLine++; lastNewlinePos = currPost + 1; } - // ── State: inside a comment ──────────────────────────────────────────── if (inComment > 0) { - if (regionMatches(currPost, cmtEnd)) { - // Emit from the opener start to the end of the closer. - lastStart = blockOpenerStart; - tokenLength = currPost + cmtEnd.length - blockOpenerStart; - tokenStart = currPost + cmtEnd.length; - currPost = tokenStart; - inComment = 0; - int kind = tokenKind; - tokenKind = symbols.getFixed(); - return emitStringToken(kind); + Token t = scanInsideComment(); + if (t != null) { + return t; } - currPost++; - continue; + continue; // scanInsideComment advanced currPost } - // ── State: inside a block (variable expression or tag) ──────────────── if (inBlock > 0) { - // Bounds-safe backslash skip outside quoted strings. - if (c == '\\') { - currPost += (currPost + 1 < length) ? 2 : 1; - continue; + Token t = scanInsideBlock(c); + if (t == DELIMITER_MATCHED) { + continue; // closer not yet found, currPost already advanced } - // Inside a quoted string: handle escape sequences so a delimiter - // character that appears as \" or \' does not prematurely close the block. - if (inQuote != 0) { - if (c == inQuote) { - inQuote = 0; - } - currPost++; - continue; - } - if (c == '\'' || c == '"') { - inQuote = c; - currPost++; - continue; - } - - // Check for the closing delimiter matching the current block type. - char[] closeDelim = closingDelimFor(tokenKind); - if (closeDelim != null && regionMatches(currPost, closeDelim)) { - // Emit from the opener start to the end of the closer. 
- lastStart = blockOpenerStart; - tokenLength = currPost + closeDelim.length - blockOpenerStart; - tokenStart = currPost + closeDelim.length; - currPost = tokenStart; - inBlock = 0; - int kind = tokenKind; - tokenKind = symbols.getFixed(); - return emitStringToken(kind); + if (t != null) { + return t; } - currPost++; continue; } - // ── State: plain text — look for any opening delimiter ──────────────── if (inRaw == 0) { - // Variable opener e.g. "{{" or "\VAR{" - if (regionMatches(currPost, varStart)) { - Token pending = flushTextBefore(currPost); - blockOpenerStart = currPost; - tokenStart = currPost + varStart.length; - currPost = tokenStart; - tokenKind = symbols.getExprStart(); - inBlock = 1; - if (pending != null) { - return pending; - } - continue; + Token t = scanPlainText(c); + if (t == DELIMITER_MATCHED) { + continue; // opener matched, state updated, no pending text } - // Block opener e.g. "{%" or "\BLOCK{" - if (regionMatches(currPost, blkStart)) { - Token pending = flushTextBefore(currPost); - blockOpenerStart = currPost; - tokenStart = currPost + blkStart.length; - currPost = tokenStart; - tokenKind = symbols.getTag(); - inBlock = 1; - if (pending != null) { - return pending; - } - continue; + if (t != null) { + return t; // pending text flushed, or line-statement token } - // Comment opener e.g. "{#" or "\#{" - if (regionMatches(currPost, cmtStart)) { - Token pending = flushTextBefore(currPost); - blockOpenerStart = currPost; - tokenStart = currPost + cmtStart.length; - currPost = tokenStart; - tokenKind = symbols.getNote(); - inComment = 1; - if (pending != null) { - return pending; - } + // null means nothing matched — fall through to advance + } else { + Token t = scanRawMode(); + if (t == DELIMITER_MATCHED) { continue; } - } else { - // In raw mode: only exit on a block opener immediately followed - // (after optional whitespace) by "endraw". 
- if (regionMatches(currPost, blkStart)) { - int contentStart = currPost + blkStart.length; - int pos = contentStart; - while (pos < length && Character.isWhitespace(is[pos])) { - pos++; - } - if (charArrayRegionMatches(is, pos, "endraw")) { - Token pending = flushTextBefore(currPost); - blockOpenerStart = currPost; - tokenStart = contentStart; - currPost = tokenStart; - tokenKind = symbols.getTag(); - inBlock = 1; - if (pending != null) { - return pending; - } - continue; - } + if (t != null) { + return t; } } currPost++; } - // End of input: flush any remaining buffered content. if (currPost > tokenStart) { return getEndTokenStringBased(); } return null; } + /** Scans one character while inside a comment block; advances {@code currPost}. */ + private Token scanInsideComment() { + if (regionMatches(currPost, cmtEnd)) { + lastStart = blockOpenerStart; + tokenLength = currPost + cmtEnd.length - blockOpenerStart; + tokenStart = currPost + cmtEnd.length; + currPost = tokenStart; + inComment = 0; + int kind = tokenKind; + tokenKind = symbols.getFixed(); + return emitStringToken(kind); + } + currPost++; + return null; + } + + /** + * Scans one character while inside a variable or tag block; advances + * {@code currPost}. Returns a real token when the closer is found, or + * {@link #DELIMITER_MATCHED} (meaning "keep looping") otherwise. + */ + private Token scanInsideBlock(char c) { + if (inQuote != 0) { + // Inside a quoted string: a backslash escapes the next character so a + // delimiter or quote character following it does not prematurely close + // the block or the string. + if (c == '\\') { + currPost += (currPost + 1 < length) ? 2 : 1; + return DELIMITER_MATCHED; + } + if (c == inQuote) { + inQuote = 0; + } + currPost++; + return DELIMITER_MATCHED; + } + // Outside a quoted string: a backslash escapes the next character. + if (c == '\\') { + currPost += (currPost + 1 < length) ? 
2 : 1; + return DELIMITER_MATCHED; + } + if (c == '\'' || c == '"') { + inQuote = c; + currPost++; + return DELIMITER_MATCHED; + } + // Check for the closing delimiter matching the current block type. + char[] closeDelim = closingDelimFor(tokenKind); + if (closeDelim != null && regionMatches(currPost, closeDelim)) { + lastStart = blockOpenerStart; + tokenLength = currPost + closeDelim.length - blockOpenerStart; + tokenStart = currPost + closeDelim.length; + currPost = tokenStart; + inBlock = 0; + int kind = tokenKind; + tokenKind = symbols.getFixed(); + return emitStringToken(kind); + } + currPost++; + return DELIMITER_MATCHED; + } + + /** + * Scans for openers while in normal (non-raw) plain-text mode. + * Returns a real token when one is ready to emit, {@link #DELIMITER_MATCHED} + * when an opener was matched with no pending text, or {@code null} when + * nothing matched (caller should advance {@code currPost}). + */ + private Token scanPlainText(char c) { + // ── Line statement prefix (e.g. "%% if foo") ────────────────────────── + if ( + lineStmtPrefix != null && + isStartOfLine(currPost) && + regionMatches(currPost, lineStmtPrefix) + ) { + return handleLineStatement(); + } + // ── Line comment prefix (e.g. "%# this is ignored") ─────────────────── + if ( + lineCommentPrefix != null && + isStartOfLine(currPost) && + regionMatches(currPost, lineCommentPrefix) + ) { + return handleLineComment(); + } + // ── Variable opener e.g. "{{" or "\VAR{" ────────────────────────────── + if (regionMatches(currPost, varStart)) { + return openBlock(varStart, symbols.getExprStart(), false); + } + // ── Block opener e.g. "{%" or "\BLOCK{" ─────────────────────────────── + if (regionMatches(currPost, blkStart)) { + return openBlock(blkStart, symbols.getTag(), false); + } + // ── Comment opener e.g. 
"{#" or "\#{" ───────────────────────────────── + if (regionMatches(currPost, cmtStart)) { + return openBlock(cmtStart, symbols.getNote(), true); + } + return null; // nothing matched + } + + /** + * Scans for the endraw block opener while in raw mode. + * Returns a real token, {@link #DELIMITER_MATCHED}, or {@code null}. + */ + private Token scanRawMode() { + if (regionMatches(currPost, blkStart)) { + int contentStart = currPost + blkStart.length; + int pos = contentStart; + while (pos < length && Character.isWhitespace(is[pos])) { + pos++; + } + if (charArrayRegionMatches(is, pos, "endraw")) { + Token pending = flushTextBefore(currPost); + blockOpenerStart = currPost; + tokenStart = contentStart; + currPost = tokenStart; + tokenKind = symbols.getTag(); + inBlock = 1; + if (pending != null) { + return pending; + } + return DELIMITER_MATCHED; + } + } + return null; + } + + /** + * Opens a variable or tag block (sets {@code inBlock}) or a comment block + * (sets {@code inComment}). Flushes any pending text first. + * Returns the pending text token if one exists, {@link #DELIMITER_MATCHED} otherwise. + */ + private Token openBlock(char[] opener, int kind, boolean isComment) { + Token pending = flushTextBefore(currPost); + blockOpenerStart = currPost; + tokenStart = currPost + opener.length; + currPost = tokenStart; + tokenKind = kind; + if (isComment) { + inComment = 1; + } else { + inBlock = 1; + } + return (pending != null) ? pending : DELIMITER_MATCHED; + } + + /** + * Handles a line statement prefix: consumes the line, builds a synthetic block + * tag token, and returns appropriately (stashing the tag if text was pending). 
+ */ + private Token handleLineStatement() { + Token pending = flushTextBefore(lineIndentStart(currPost)); + + int contentStart = currPost + lineStmtPrefix.length; + while (contentStart < length && is[contentStart] == ' ') { + contentStart++; + } + int contentEnd = contentStart; + while (contentEnd < length && is[contentEnd] != '\n') { + contentEnd++; + } + String inner = String.valueOf(is, contentStart, contentEnd - contentStart).trim(); + String syntheticImage = + symbols.getExpressionStartWithTag() + + " " + + inner + + " " + + symbols.getExpressionEndWithTag(); + + int next = contentEnd; + if (next < length && is[next] == '\n') { + next++; + currLine++; + lastNewlinePos = next; + } + tokenStart = next; + currPost = next; + + Token stmtToken = Token.newToken( + symbols.getTag(), + symbols, + whitespaceControlParser, + syntheticImage, + currLine, + 1 + ); + if (pending != null) { + pendingToken = stmtToken; + return pending; + } + return stmtToken; + } + + /** + * Handles a line comment prefix: consumes the entire line (including newline) + * and returns any pending text token, or {@link #DELIMITER_MATCHED} if none. + */ + private Token handleLineComment() { + Token pending = flushTextBefore(lineIndentStart(currPost)); + + int end = currPost + lineCommentPrefix.length; + while (end < length && is[end] != '\n') { + end++; + } + int next = end; + if (next < length && is[next] == '\n') { + next++; + currLine++; + lastNewlinePos = next; + } + tokenStart = next; + currPost = next; + + // The comment itself produces no token. Return pending text if any, + // otherwise DELIMITER_MATCHED so the caller loops without advancing currPost. + return (pending != null) ? pending : DELIMITER_MATCHED; + } + + /** + * Returns the position of the first character of the indentation on the line + * containing {@code pos} — i.e. the position just after the preceding newline + * (or 0 if at the start of input). 
This is used to exclude leading horizontal + * whitespace from the text token flushed before a line prefix match, so that + * indented line statements and line comments don't leave whitespace in the output. + */ + private int lineIndentStart(int pos) { + // Walk back past the horizontal whitespace that isStartOfLine already accepted. + int p = pos - 1; + while (p >= 0 && (is[p] == ' ' || is[p] == '\t')) { + p--; + } + // p is now at the newline before the indentation, or at -1. + return p + 1; + } + + // ── One-slot stash for the synthetic tag after a line-statement ───────── + // When a line-statement prefix is found and there is pending text to flush + // first, we return the text token immediately and stash the synthetic tag + // here so computeNext() picks it up on the very next call. + private Token pendingToken = null; + + @Override + protected Token computeNext() { + // Drain any stashed token first. + if (pendingToken != null) { + Token t = pendingToken; + pendingToken = null; + return t; + } + + Token t = getNextToken(); + if (t == null) { + return endOfData(); + } + return t; + } + + // ── Helpers ─────────────────────────────────────────────────────────────── + + /** + * Returns true when {@code pos} is at the start of a line — i.e. it is either + * the very first character of the input, or only spaces/tabs separate it from + * the preceding newline or the start of the input (independent of lstripBlocks). + */ + private boolean isStartOfLine(int pos) { + if (pos == 0) { + return true; + } + // Walk backwards past any horizontal whitespace (spaces/tabs). + int p = pos - 1; + while (p >= 0 && (is[p] == ' ' || is[p] == '\t')) { + p--; + } + // True if we hit the beginning of the input or a newline. + return p < 0 || is[p] == '\n'; + } + /** * If {@code is[tokenStart..upTo)} contains un-emitted plain text, captures it * as a TEXT token and returns it. Returns {@code null} for zero-length regions. - * - * 

<p>The caller MUST set {@code tokenStart} (and other state) after calling this, - * regardless of whether a token was returned. This method does NOT update - * {@code tokenStart} — that would produce the wrong value since the caller needs - * to set it to just past the opening delimiter. + * Does NOT update {@code tokenStart} — the caller sets it after returning. */ private Token flushTextBefore(int upTo) { int textLen = upTo - tokenStart; @@ -344,15 +549,12 @@ private Token emitStringToken(int kind) { /** * Emits whatever remains at end-of-input. - * - * 

FIX (infinite loop): advances {@code tokenStart = currPost} so that the - * next call to {@code getNextTokenStringBased()} finds {@code currPost == tokenStart} - * and returns {@code null} (end of data) instead of re-emitting the same slice. + * Advances {@code tokenStart = currPost} so subsequent calls return null. */ private Token getEndTokenStringBased() { tokenLength = currPost - tokenStart; lastStart = tokenStart; - tokenStart = currPost; // ← prevents re-emission on subsequent calls + tokenStart = currPost; int type = symbols.getFixed(); if (inComment > 0) { type = symbols.getNote(); @@ -635,15 +837,4 @@ private boolean matchToken(char kind) { return kind == tokenKind; } } - - @Override - protected Token computeNext() { - Token t = getNextToken(); - - if (t == null) { - return endOfData(); - } - - return t; - } } diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScannerSymbols.java b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScannerSymbols.java index d8b5f5aa8..638220853 100644 --- a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScannerSymbols.java +++ b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScannerSymbols.java @@ -190,4 +190,29 @@ public int getCommentStartLength() { public int getCommentEndLength() { return getClosingComment().length(); } + + /** + * Optional line statement prefix (e.g. {@code "%%"}). When non-null, any line + * that begins with this prefix (after optional horizontal whitespace) is treated + * as a block tag statement, equivalent to wrapping its content in the block + * delimiters. Returns {@code null} by default (feature disabled). + * + *

<p>Only used by {@link StringTokenScannerSymbols}; has no effect in the + * char-based path. + */ + public String getLineStatementPrefix() { + return null; + } + + /** + * Optional line comment prefix (e.g. {@code "%#"}). When non-null, any line + * that begins with this prefix (after optional horizontal whitespace) is stripped + * entirely from the output. Returns {@code null} by default (feature disabled). + * + * 

Only used by {@link StringTokenScannerSymbols}; has no effect in the + * char-based path. + */ + public String getLineCommentPrefix() { + return null; + } } diff --git a/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java b/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java index e2f5c7bc2..50affae8c 100644 --- a/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java +++ b/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java @@ -255,4 +255,92 @@ public void builderRejectsNullDelimiter() { ) .isInstanceOf(IllegalArgumentException.class); } + + // ── Line statement prefix ────────────────────────────────────────────────── + + @Test + public void itRendersLineStatementPrefix() { + Jinjava j = jinjavaWith( + StringTokenScannerSymbols.builder().withLineStatementPrefix("%%").build() + ); + // "%% if show" is equivalent to "{% if show %}" + String template = "%% if show\nhello\n%% endif"; + assertThat(j.render(template, ImmutableMap.of("show", true))).isEqualTo("hello\n"); + assertThat(j.render(template, ImmutableMap.of("show", false))).isEqualTo(""); + } + + @Test + public void itRendersLineStatementPrefixWithLeadingWhitespace() { + Jinjava j = jinjavaWith( + StringTokenScannerSymbols.builder().withLineStatementPrefix("%%").build() + ); + // Leading spaces before the prefix are allowed + String template = " %% if show\nhello\n %% endif"; + assertThat(j.render(template, ImmutableMap.of("show", true))).isEqualTo("hello\n"); + } + + @Test + public void itRendersLineStatementMixedWithBlockDelimiters() { + Jinjava j = jinjavaWith( + StringTokenScannerSymbols + .builder() + .withVariableStartString("<<") + .withVariableEndString(">>") + .withBlockStartString("<%") + .withBlockEndString("%>") + .withCommentStartString("<#") + .withCommentEndString("#>") + .withLineStatementPrefix("%%") + .build() + ); + String template = "%% set x = 42\n<< x >>"; + 
assertThat(j.render(template, new HashMap<>())).isEqualTo("42"); + } + + // ── Line comment prefix ──────────────────────────────────────────────────── + + @Test + public void itStripsLineCommentPrefix() { + Jinjava j = jinjavaWith( + StringTokenScannerSymbols.builder().withLineCommentPrefix("%#").build() + ); + String template = "before\n%# this whole line is a comment\nafter"; + assertThat(j.render(template, new HashMap<>())).isEqualTo("before\nafter"); + } + + @Test + public void itStripsLineCommentWithLeadingWhitespace() { + Jinjava j = jinjavaWith( + StringTokenScannerSymbols.builder().withLineCommentPrefix("%#").build() + ); + String template = "before\n %# indented comment\nafter"; + assertThat(j.render(template, new HashMap<>())).isEqualTo("before\nafter"); + } + + @Test + public void itHandlesBothLinePrefixesTogether() { + Jinjava j = jinjavaWith( + StringTokenScannerSymbols + .builder() + .withVariableStartString("<<") + .withVariableEndString(">>") + .withBlockStartString("<%") + .withBlockEndString("%>") + .withCommentStartString("<#") + .withCommentEndString("#>") + .withLineStatementPrefix("%%") + .withLineCommentPrefix("%#") + .build() + ); + String template = "%# this is stripped\n%% set x = 7\n<< x >>"; + assertThat(j.render(template, new HashMap<>())).isEqualTo("7"); + } + + // ── Helper ──────────────────────────────────────────────────────────────── + + private Jinjava jinjavaWith(StringTokenScannerSymbols symbols) { + return new Jinjava( + BaseJinjavaTest.newConfigBuilder().withTokenScannerSymbols(symbols).build() + ); + } } From 5017d432daa63d6c6d947d3b4eb436e73682e186 Mon Sep 17 00:00:00 2001 From: Jorge Moraleda Date: Sun, 5 Apr 2026 23:46:26 -0400 Subject: [PATCH 3/5] Support for trim-modifier in single-line logic --- .../jinjava/tree/parse/TokenScanner.java | 71 +++++--- .../parse/StringTokenScannerSymbolsTest.java | 162 +++++++++++++++++- 2 files changed, 201 insertions(+), 32 deletions(-) diff --git 
a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java index fc203ef21..1a0543652 100644 --- a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java +++ b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java @@ -352,13 +352,17 @@ private Token handleLineStatement() { while (contentEnd < length && is[contentEnd] != '\n') { contentEnd++; } - String inner = String.valueOf(is, contentStart, contentEnd - contentStart).trim(); - String syntheticImage = - symbols.getExpressionStartWithTag() + - " " + - inner + - " " + - symbols.getExpressionEndWithTag(); + // Do NOT trim inner here — TagToken.parse() calls handleTrim() which detects + // a leading '-' for left-trim whitespace control and a trailing '-' for + // right-trim. Trimming here would strip those control characters before + // TagToken ever sees them. + // Also do not insert a space before the content when it starts with the + // trim char '-', as that space would prevent handleTrim from detecting it. + String inner = String.valueOf(is, contentStart, contentEnd - contentStart); + String prefix = (inner.length() > 0 && inner.charAt(0) == symbols.getTrimChar()) + ? symbols.getExpressionStartWithTag() + : symbols.getExpressionStartWithTag() + " "; + String syntheticImage = prefix + inner + " " + symbols.getExpressionEndWithTag(); int next = contentEnd; if (next < length && is[next] == '\n') { @@ -385,39 +389,60 @@ private Token handleLineStatement() { } /** - * Handles a line comment prefix: consumes the entire line (including newline) - * and returns any pending text token, or {@link #DELIMITER_MATCHED} if none. + * Handles a line comment prefix. + * + *

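<p>Matches Python Jinja2 semantics exactly: + * <ul> + * <li>{@code %# comment} — strips the line's indentation, the prefix and the + * comment text, but leaves the trailing newline in place, so the line + * renders as a blank line.</li> + * <li>{@code %#- comment} — strips the same content and also consumes the + * trailing newline, leaving no blank line.</li> + * </ul> + * + * <p>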

Neither form affects the newline that ended the preceding line. */ private Token handleLineComment() { + int afterPrefix = currPost + lineCommentPrefix.length; + boolean hasTrimModifier = + afterPrefix < length && is[afterPrefix] == symbols.getTrimChar(); + + // Flush buffered text up to (but not including) the current line's indentation. + // The preceding newline is always preserved regardless of the trim modifier. Token pending = flushTextBefore(lineIndentStart(currPost)); - int end = currPost + lineCommentPrefix.length; + // Advance past the comment content to the end of the line. + int end = afterPrefix; while (end < length && is[end] != '\n') { end++; } - int next = end; - if (next < length && is[next] == '\n') { - next++; - currLine++; - lastNewlinePos = next; + + if (hasTrimModifier) { + // %#- : strip trailing \n too, leaving no blank line. + int next = end; + if (next < length && is[next] == '\n') { + next++; + currLine++; + lastNewlinePos = next; + } + tokenStart = next; + currPost = next; + } else { + // %# : leave the trailing \n in place so it renders as a blank line. + tokenStart = end; + currPost = end; } - tokenStart = next; - currPost = next; - // The comment itself produces no token. Return pending text if any, - // otherwise DELIMITER_MATCHED so the caller loops without advancing currPost. return (pending != null) ? pending : DELIMITER_MATCHED; } /** * Returns the position of the first character of the indentation on the line * containing {@code pos} — i.e. the position just after the preceding newline - * (or 0 if at the start of input). This is used to exclude leading horizontal - * whitespace from the text token flushed before a line prefix match, so that - * indented line statements and line comments don't leave whitespace in the output. + * (or 0 if at the start of input). Used to exclude leading horizontal whitespace + * from the text token flushed before a line prefix match. 
*/ private int lineIndentStart(int pos) { - // Walk back past the horizontal whitespace that isStartOfLine already accepted. int p = pos - 1; while (p >= 0 && (is[p] == ' ' || is[p] == '\t')) { p--; diff --git a/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java b/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java index 50affae8c..347a70d3f 100644 --- a/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java +++ b/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java @@ -2,16 +2,15 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; - +import java.util.HashMap; +import org.junit.Before; +import org.junit.Test; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; import com.hubspot.jinjava.BaseJinjavaTest; import com.hubspot.jinjava.Jinjava; import com.hubspot.jinjava.JinjavaConfig; import com.hubspot.jinjava.lib.filter.JoinFilterTest.User; -import java.util.HashMap; -import org.junit.Before; -import org.junit.Test; public class StringTokenScannerSymbolsTest { @@ -238,7 +237,86 @@ public void defaultBuilderBehavesLikeDefaultSymbols() { .isEqualTo(defaultJinjava.render(template, ctx)); } - // ── Builder validation ───────────────────────────────────────────────────── + // ── trimBlocks and lstripBlocks ──────────────────────────────────────────── + // + // trimBlocks is handled in TokenScanner.emitStringToken(): when a TagToken or + // NoteToken is emitted and trimBlocks=true, the immediately following newline + // is consumed. This is equally true in the string-based path. + // + // lstripBlocks is handled in TreeParser, which operates on the token stream + // produced by TokenScanner. It strips leading horizontal whitespace from any + // TextNode that immediately precedes a TagNode. 
Since TreeParser is path-agnostic, + // lstripBlocks works identically for both char-based and string-based scanning. + + @Test + public void itRespectsTrimBlocksWithAngleSymbols() { + Jinjava j = new Jinjava( + BaseJinjavaTest + .newConfigBuilder() + .withTokenScannerSymbols(ANGLE_SYMBOLS) + .withTrimBlocks(true) + .build() + ); + // Without trimBlocks the newline after <% if show %> would appear in output. + // With trimBlocks=true it is consumed by the scanner, so output is "hello\n". + String result = j.render( + "<% if show %>\nhello\n<% endif %>", + ImmutableMap.of("show", true) + ); + assertThat(result).isEqualTo("hello\n"); + } + + @Test + public void itRespectsTrimBlocksWithLatexSymbols() { + Jinjava j = new Jinjava( + BaseJinjavaTest + .newConfigBuilder() + .withTokenScannerSymbols(LATEX_SYMBOLS) + .withTrimBlocks(true) + .build() + ); + String result = j.render( + "\\BLOCK{ if show }\nhello\n\\BLOCK{ endif }", + ImmutableMap.of("show", true) + ); + assertThat(result).isEqualTo("hello\n"); + } + + @Test + public void itRespectsLstripBlocksWithAngleSymbols() { + Jinjava j = new Jinjava( + BaseJinjavaTest + .newConfigBuilder() + .withTokenScannerSymbols(ANGLE_SYMBOLS) + .withLstripBlocks(true) + .withTrimBlocks(true) + .build() + ); + // Leading spaces before the tag are stripped by lstripBlocks (TreeParser). + // The newline after the tag is consumed by trimBlocks (TokenScanner).
+ String result = j.render( + " <% if show %>\nhello\n <% endif %>", + ImmutableMap.of("show", true) + ); + assertThat(result).isEqualTo("hello\n"); + } + + @Test + public void itRespectsLstripBlocksWithLatexSymbols() { + Jinjava j = new Jinjava( + BaseJinjavaTest + .newConfigBuilder() + .withTokenScannerSymbols(LATEX_SYMBOLS) + .withLstripBlocks(true) + .withTrimBlocks(true) + .build() + ); + String result = j.render( + " \\BLOCK{ if show }\nhello\n \\BLOCK{ endif }", + ImmutableMap.of("show", true) + ); + assertThat(result).isEqualTo("hello\n"); + } @Test public void builderRejectsEmptyDelimiter() { @@ -269,6 +347,27 @@ public void itRendersLineStatementPrefix() { assertThat(j.render(template, ImmutableMap.of("show", false))).isEqualTo(""); } + @Test + public void itRendersLineStatementPrefixWithWhitespaceControl() { + Jinjava j = new Jinjava( + BaseJinjavaTest + .newConfigBuilder() + .withTokenScannerSymbols( + StringTokenScannerSymbols.builder().withLineStatementPrefix("%%").build() + ) + .withTrimBlocks(true) + .withLstripBlocks(true) + .build() + ); + // "%%- for" strips the newline before the line (leftTrim). + // trimBlocks consumes the newline after each tag line. + // Expected: the \n after "before|" is stripped and c| is repeated 3 times; the + // \n after each c| (from the body line) is stripped by the leftTrim on + // %%- endfor, giving "before|c|c|c|after".
+ String template = "before|\n%%- for _ in range(3)\nc|\n%%- endfor\nafter"; + assertThat(j.render(template, ImmutableMap.of())).isEqualTo("before|c|c|c|after"); + } + @Test public void itRendersLineStatementPrefixWithLeadingWhitespace() { Jinjava j = jinjavaWith( @@ -298,14 +397,20 @@ public void itRendersLineStatementMixedWithBlockDelimiters() { } // ── Line comment prefix ──────────────────────────────────────────────────── + // + // Semantics: + // %# (plain): comment content stripped, trailing \n KEPT → blank line where comment was + // %#- (trim): comment content AND trailing \n stripped → no blank line + // Neither form affects the newline that ended the preceding line. @Test - public void itStripsLineCommentPrefix() { + public void itStripsLineCommentPrefixLeavingBlankLine() { Jinjava j = jinjavaWith( StringTokenScannerSymbols.builder().withLineCommentPrefix("%#").build() ); + // %# keeps its trailing \n → "before\n" + "\n" + "after" = "before\n\nafter" String template = "before\n%# this whole line is a comment\nafter"; - assertThat(j.render(template, new HashMap<>())).isEqualTo("before\nafter"); + assertThat(j.render(template, new HashMap<>())).isEqualTo("before\n\nafter"); } @Test @@ -313,8 +418,45 @@ public void itStripsLineCommentWithLeadingWhitespace() { Jinjava j = jinjavaWith( StringTokenScannerSymbols.builder().withLineCommentPrefix("%#").build() ); + // Indentation before %# is stripped, trailing \n is kept → still a blank line String template = "before\n %# indented comment\nafter"; - assertThat(j.render(template, new HashMap<>())).isEqualTo("before\nafter"); + assertThat(j.render(template, new HashMap<>())).isEqualTo("before\n\nafter"); + } + + @Test + public void itStripsLineCommentWithTrimModifier() { + Jinjava j = jinjavaWith( + StringTokenScannerSymbols.builder().withLineCommentPrefix("%#").build() + ); + // %# keeps trailing \n → blank line: "before\n\nafter" + assertThat(j.render("before\n%# comment\nafter", new HashMap<>())) + 
.isEqualTo("before\n\nafter"); + // %#- strips trailing \n → no blank line: "before\nafter" + assertThat(j.render("before\n%#- comment\nafter", new HashMap<>())) + .isEqualTo("before\nafter"); + } + + @Test + public void itStripsLineCommentWithoutLeavingBlankLine() { + // %#- strips both content and trailing \n → no blank line. + // "\\begin{document}\n" (preceding \n kept) + "\\section*{...}" (directly) + Jinjava j = new Jinjava( + BaseJinjavaTest + .newConfigBuilder() + .withTokenScannerSymbols( + StringTokenScannerSymbols + .builder() + .withVariableStartString("\\VAR{") + .withVariableEndString("}") + .withLineCommentPrefix("%#") + .build() + ) + .build() + ); + String template = + "\\begin{document}\n%#-\\VAR{reportHeader}\n\\section*{\\VAR{title}}"; + String result = j.render(template, ImmutableMap.of("title", "My Report")); + assertThat(result).isEqualTo("\\begin{document}\n\\section*{My Report}"); } @Test @@ -333,7 +475,9 @@ public void itHandlesBothLinePrefixesTogether() { .build() ); String template = "%# this is stripped\n%% set x = 7\n<< x >>"; - assertThat(j.render(template, new HashMap<>())).isEqualTo("7"); + // %# keeps its trailing \n → blank line, then %% set produces nothing, + // then << x >> renders as 7. 
Result: "\n7" + assertThat(j.render(template, new HashMap<>())).isEqualTo("\n7"); } // ── Helper ──────────────────────────────────────────────────────────────── From f5ec0f940bd273bb659ad6d7f53c5e1df99452ad Mon Sep 17 00:00:00 2001 From: Jorge Moraleda Date: Sat, 11 Apr 2026 00:32:47 -0400 Subject: [PATCH 4/5] Bugfix in single-line-logic trimming to match jinja output --- .../jinjava/tree/parse/TokenScanner.java | 136 ++++++++++++++---- .../parse/StringTokenScannerSymbolsTest.java | 37 +++-- 2 files changed, 133 insertions(+), 40 deletions(-) diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java index 1a0543652..41ef8471b 100644 --- a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java +++ b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java @@ -269,11 +269,8 @@ private Token scanPlainText(char c) { return handleLineStatement(); } // ── Line comment prefix (e.g. "%# this is ignored") ─────────────────── - if ( - lineCommentPrefix != null && - isStartOfLine(currPost) && - regionMatches(currPost, lineCommentPrefix) - ) { + // Line comments match anywhere on a line, not just at the start. + if (lineCommentPrefix != null && regionMatches(currPost, lineCommentPrefix)) { return handleLineComment(); } // ── Variable opener e.g. "{{" or "\VAR{" ────────────────────────────── @@ -370,6 +367,34 @@ private Token handleLineStatement() { currLine++; lastNewlinePos = next; } + + // When lstrip_blocks is active, Python Jinja2 also consumes any blank lines + // that follow a line statement (lines containing only horizontal whitespace). + // This prevents blank lines between consecutive line statements from + // appearing in the output. + if (config.isLstripBlocks()) { + while (next < length) { + // Scan forward past any horizontal whitespace on this line. 
+ int lineEnd = next; + while ( + lineEnd < length && + is[lineEnd] != '\n' && + (is[lineEnd] == ' ' || is[lineEnd] == '\t') + ) { + lineEnd++; + } + // If we hit a newline (blank or whitespace-only line), consume it. + if (lineEnd < length && is[lineEnd] == '\n') { + next = lineEnd + 1; + currLine++; + lastNewlinePos = next; + } else { + // Hit real content or end of input — stop consuming. + break; + } + } + } + tokenStart = next; currPost = next; @@ -391,25 +416,46 @@ private Token handleLineStatement() { /** * Handles a line comment prefix. * - *

Matches Python Jinja2 semantics exactly: + *

Line comments match anywhere on a line (not just at the start). + * For mid-line comments, everything from the prefix to end of line is + * stripped; the text before the prefix on the same line is kept. + * + *

Confirmed Python Jinja2 semantics: *

- * - *

Neither form affects the newline that ended the preceding line. */ private Token handleLineComment() { + boolean startOfLine = isStartOfLine(currPost); int afterPrefix = currPost + lineCommentPrefix.length; boolean hasTrimModifier = afterPrefix < length && is[afterPrefix] == symbols.getTrimChar(); - // Flush buffered text up to (but not including) the current line's indentation. - // The preceding newline is always preserved regardless of the trim modifier. - Token pending = flushTextBefore(lineIndentStart(currPost)); + int flushUpTo; + if (!startOfLine) { + // Mid-line comment: flush up to the %# prefix, stripping trailing + // horizontal whitespace before it (Python strips spaces/tabs before + // mid-line comments, e.g. "hello %# comment" → "hello"). + int p = currPost - 1; + while (p >= tokenStart && (is[p] == ' ' || is[p] == '\t')) { + p--; + } + flushUpTo = p + 1; + } else if (hasTrimModifier) { + // Start-of-line %#-: strip preceding blank lines and the real-content \n. + flushUpTo = lineIndentStartSkippingBlanks(currPost); + } else { + // Start-of-line %#: strip only the current line's indentation. + flushUpTo = lineIndentStart(currPost); + } + + Token pending = flushTextBefore(flushUpTo); // Advance past the comment content to the end of the line. int end = afterPrefix; @@ -417,21 +463,9 @@ private Token handleLineComment() { end++; } - if (hasTrimModifier) { - // %#- : strip trailing \n too, leaving no blank line. - int next = end; - if (next < length && is[next] == '\n') { - next++; - currLine++; - lastNewlinePos = next; - } - tokenStart = next; - currPost = next; - } else { - // %# : leave the trailing \n in place so it renders as a blank line. - tokenStart = end; - currPost = end; - } + // Both %# and %#- keep the trailing \n — it appears in the output. + tokenStart = end; + currPost = end; return (pending != null) ? 
pending : DELIMITER_MATCHED; } @@ -451,6 +485,46 @@ private int lineIndentStart(int pos) { return p + 1; } + /** + * Returns the flush boundary for a {@code %#-} line comment. + * + *

Python Jinja2 semantics for {@code %#-}: strip back through any preceding + * blank lines AND the {@code \n} that ends the last real-content line, so that + * the comment's own kept {@code \n} becomes the sole separator. Stops at + * {@code tokenStart} so that {@code \n}s produced by preceding line statements + * or plain {@code %#} comments are not consumed. + * + *

Examples (| marks the flush boundary): + *

+   *   "A\n\n%#-"   →  flush "A|"      → output "A" + comment's \n
+   *   "%% set\n%#-" → flush nothing    → output comment's \n  (tokenStart guard)
+   * 
+ */ + private int lineIndentStartSkippingBlanks(int pos) { + int p = pos - 1; + while (p >= tokenStart) { + // Skip trailing horizontal whitespace on this line (going backwards). + while (p >= tokenStart && (is[p] == ' ' || is[p] == '\t')) { + p--; + } + if (p < tokenStart) { + break; + } + if (is[p] == '\n') { + // Blank line — consume this \n and keep scanning backwards. + p--; + } else { + // Real content at position p. The \n ending this line is at p+1. + // Return p+1 so flushTextBefore(p+1) flushes up to but NOT including + // that \n, stripping it from the output. + return p + 1; + } + } + // Reached tokenStart without finding real content — all blank lines were + // preceded by a line statement or plain comment. Preserve them. + return tokenStart; + } + // ── One-slot stash for the synthetic tag after a line-statement ───────── // When a line-statement prefix is found and there is pending text to flush // first, we return the text token immediately and stash the synthetic tag diff --git a/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java b/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java index 347a70d3f..a03fe836e 100644 --- a/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java +++ b/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java @@ -2,15 +2,16 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -import java.util.HashMap; -import org.junit.Before; -import org.junit.Test; + import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; import com.hubspot.jinjava.BaseJinjavaTest; import com.hubspot.jinjava.Jinjava; import com.hubspot.jinjava.JinjavaConfig; import com.hubspot.jinjava.lib.filter.JoinFilterTest.User; +import java.util.HashMap; +import org.junit.Before; +import org.junit.Test; public class StringTokenScannerSymbolsTest { @@ 
-398,6 +399,20 @@ public void itRendersLineStatementMixedWithBlockDelimiters() { // ── Line comment prefix ──────────────────────────────────────────────────── // + // Ground truth confirmed by running both Python Jinja2 and Jinjava against: + // [START] + // %% set x = 1 + // [A] + // %# plain comment + // [B] + // %#- trim comment + // [C] + // %% set y = 2 + // [D] + // [END] + // + // Python output: [START]\n[A]\n\n[B]\n[C]\n[D]\n[END] + // // Semantics: // %# (plain): comment content stripped, trailing \n KEPT → blank line where comment was // %#- (trim): comment content AND trailing \n stripped → no blank line @@ -408,7 +423,7 @@ public void itStripsLineCommentPrefixLeavingBlankLine() { Jinjava j = jinjavaWith( StringTokenScannerSymbols.builder().withLineCommentPrefix("%#").build() ); - // %# keeps its trailing \n → "before\n" + "\n" + "after" = "before\n\nafter" + // %# keeps its trailing \n → "before\n" + "\n" (comment's own \n) + "after" String template = "before\n%# this whole line is a comment\nafter"; assertThat(j.render(template, new HashMap<>())).isEqualTo("before\n\nafter"); } @@ -418,7 +433,7 @@ public void itStripsLineCommentWithLeadingWhitespace() { Jinjava j = jinjavaWith( StringTokenScannerSymbols.builder().withLineCommentPrefix("%#").build() ); - // Indentation before %# is stripped, trailing \n is kept → still a blank line + // Indentation before %# is stripped, trailing \n is kept → blank line String template = "before\n %# indented comment\nafter"; assertThat(j.render(template, new HashMap<>())).isEqualTo("before\n\nafter"); } @@ -428,18 +443,22 @@ public void itStripsLineCommentWithTrimModifier() { Jinjava j = jinjavaWith( StringTokenScannerSymbols.builder().withLineCommentPrefix("%#").build() ); - // %# keeps trailing \n → blank line: "before\n\nafter" + // %# keeps trailing \n (blank line left in output) assertThat(j.render("before\n%# comment\nafter", new HashMap<>())) .isEqualTo("before\n\nafter"); - // %#- strips trailing \n → no blank 
line: "before\nafter" + // %#- also keeps trailing \n — the '-' is LEFT-trim only (strips preceding blanks) + // With no preceding blank line it strips the \n that ended the previous line assertThat(j.render("before\n%#- comment\nafter", new HashMap<>())) .isEqualTo("before\nafter"); + // %#- with a preceding blank line: strips the blank, keeps own trailing \n + assertThat(j.render("before\n\n%#- comment\nafter", new HashMap<>())) + .isEqualTo("before\nafter"); } @Test public void itStripsLineCommentWithoutLeavingBlankLine() { - // %#- strips both content and trailing \n → no blank line. - // "\\begin{document}\n" (preceding \n kept) + "\\section*{...}" (directly) + // %#- with real content before (no blank): strips the preceding \n, + // keeps comment's own \n. "\\begin{document}" + "\n" (comment's \n) + "\\section*{...}" Jinjava j = new Jinjava( BaseJinjavaTest .newConfigBuilder() From 962686ea16b1a59d9c3c6989899b44b954a63c69 Mon Sep 17 00:00:00 2001 From: Jorge Moraleda Date: Wed, 1 Apr 2026 22:29:58 -0400 Subject: [PATCH 5/5] Scanner treats backslash as escape character inside quoted strings only --- .../com/hubspot/jinjava/LegacyOverrides.java | 18 ++ .../jinjava/tree/parse/TokenScanner.java | 21 +- .../tree/parse/BackslashHandlingTest.java | 238 ++++++++++++++++++ 3 files changed, 271 insertions(+), 6 deletions(-) create mode 100644 src/test/java/com/hubspot/jinjava/tree/parse/BackslashHandlingTest.java diff --git a/src/main/java/com/hubspot/jinjava/LegacyOverrides.java b/src/main/java/com/hubspot/jinjava/LegacyOverrides.java index bd3732455..b158ef918 100644 --- a/src/main/java/com/hubspot/jinjava/LegacyOverrides.java +++ b/src/main/java/com/hubspot/jinjava/LegacyOverrides.java @@ -32,6 +32,7 @@ public interface LegacyOverrides extends WithLegacyOverrides { .withAllowAdjacentTextNodes(true) .withUseTrimmingForNotesAndExpressions(true) .withKeepNullableLoopValues(true) + .withHandleBackslashInQuotesOnly(true) .build(); @Value.Default @@ -79,6 +80,23 @@ default boolean
isKeepNullableLoopValues() { return false; } + /** + * When {@code true}, the token scanner treats backslash as an escape character + * only inside quoted string literals, leaving bare backslashes outside quotes + * untouched for the expression parser (JUEL) to handle. This matches the + * behaviour of Python's Jinja2, where the template scanner is not responsible + * for backslash interpretation at all. + * + *

When {@code false} (the default), the scanner consumes a backslash and + * the following character unconditionally, regardless of quote context. This + * is the legacy Jinjava behaviour, which prevents closing delimiters from + * being recognized after a backslash but diverges from Jinja2. + */ + @Value.Default + default boolean isHandleBackslashInQuotesOnly() { + return false; + } + class Builder extends ImmutableLegacyOverrides.Builder {} static Builder newBuilder() { diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java index 41ef8471b..936030da9 100644 --- a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java +++ b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java @@ -55,6 +55,11 @@ public class TokenScanner extends AbstractIterator { private final char[] lineStmtPrefix; private final char[] lineCommentPrefix; + // When true, backslash is treated as an escape character only inside quoted + // string literals, matching Jinja2 behaviour. When false (legacy default), + // the scanner consumes backslash + next char unconditionally. + private final boolean backslashInQuotesOnly; + // Remembers where the current opening delimiter began so the emitted block/comment // token image starts from the opener (not the content), letting parse() strip the // correct number of delimiter characters from both ends. @@ -84,6 +89,7 @@ public TokenScanner(String input, JinjavaConfig config) { config.getLegacyOverrides().isParseWhitespaceControlStrictly() ? 
WhitespaceControlParser.STRICT : WhitespaceControlParser.LENIENT; + backslashInQuotesOnly = config.getLegacyOverrides().isHandleBackslashInQuotesOnly(); if (stringBased) { varStart = symbols.getExpressionStart().toCharArray(); @@ -214,9 +220,7 @@ private Token scanInsideComment() { */ private Token scanInsideBlock(char c) { if (inQuote != 0) { - // Inside a quoted string: a backslash escapes the next character so a - // delimiter or quote character following it does not prematurely close - // the block or the string. + // Inside a quoted string: a backslash always escapes the next character. if (c == '\\') { currPost += (currPost + 1 < length) ? 2 : 1; return DELIMITER_MATCHED; @@ -227,8 +231,9 @@ private Token scanInsideBlock(char c) { currPost++; return DELIMITER_MATCHED; } - // Outside a quoted string: a backslash escapes the next character. - if (c == '\\') { + // Outside a quoted string: only consume the backslash in legacy mode + // (backslashInQuotesOnly is false); otherwise leave it for the expression parser. + if (c == '\\' && !backslashInQuotesOnly) { currPost += (currPost + 1 < length) ?
2 : 1; return DELIMITER_MATCHED; } @@ -700,10 +705,14 @@ private Token getNextTokenCharBased() { } if (inBlock > 0) { - if (c == '\\') { + if (c == '\\' && !backslashInQuotesOnly) { ++currPost; continue; } else if (inQuote != 0) { + if (c == '\\') { + ++currPost; + continue; + } if (inQuote == c) { inQuote = 0; } diff --git a/src/test/java/com/hubspot/jinjava/tree/parse/BackslashHandlingTest.java b/src/test/java/com/hubspot/jinjava/tree/parse/BackslashHandlingTest.java new file mode 100644 index 000000000..de5c9cc7b --- /dev/null +++ b/src/test/java/com/hubspot/jinjava/tree/parse/BackslashHandlingTest.java @@ -0,0 +1,238 @@ +package com.hubspot.jinjava.tree.parse; + +import static org.assertj.core.api.Assertions.assertThat; + +import com.google.common.collect.ImmutableMap; +import com.hubspot.jinjava.Jinjava; +import com.hubspot.jinjava.JinjavaConfig; +import com.hubspot.jinjava.LegacyOverrides; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import org.junit.Test; + +/** + * Tests for backslash handling inside block/variable/comment delimiters, + * covering both the char-based (DefaultTokenScannerSymbols) and string-based + * (StringTokenScannerSymbols) scanning paths, with the + * {@link LegacyOverrides#isHandleBackslashInQuotesOnly()} flag both off (legacy) + * and on (Jinja2-compatible). + */ +public class BackslashHandlingTest { + + // ── Jinjava instances ────────────────────────────────────────────────────── + + /** Char-based scanner, legacy backslash behaviour (flag = false). */ + private static Jinjava charLegacy() { + return new Jinjava( + JinjavaConfig + .newBuilder() + .withLegacyOverrides(LegacyOverrides.newBuilder().build()) + .build() + ); + } + + /** Char-based scanner, Jinja2-compatible backslash behaviour (flag = true). 
*/ + private static Jinjava charNew() { + return new Jinjava( + JinjavaConfig + .newBuilder() + .withLegacyOverrides( + LegacyOverrides.newBuilder().withHandleBackslashInQuotesOnly(true).build() + ) + .build() + ); + } + + /** String-based scanner, legacy backslash behaviour (flag = false). */ + private static Jinjava stringLegacy() { + return new Jinjava( + JinjavaConfig + .newBuilder() + .withTokenScannerSymbols(StringTokenScannerSymbols.builder().build()) + .withLegacyOverrides(LegacyOverrides.newBuilder().build()) + .build() + ); + } + + /** String-based scanner, Jinja2-compatible backslash behaviour (flag = true). */ + private static Jinjava stringNew() { + return new Jinjava( + JinjavaConfig + .newBuilder() + .withTokenScannerSymbols(StringTokenScannerSymbols.builder().build()) + .withLegacyOverrides( + LegacyOverrides.newBuilder().withHandleBackslashInQuotesOnly(true).build() + ) + .build() + ); + } + + // ── Backslash inside a quoted string ────────────────────────────────────── + // + // Both legacy and new behaviour must handle escaped quotes inside strings + // correctly — \" should not close the string. 
+ + @Test + public void charLegacy_escapedQuoteInsideString() { + assertThat(charLegacy().render("{{ \"he said \\\"hi\\\"\" }}", new HashMap<>())) + .isEqualTo("he said \"hi\""); + } + + @Test + public void charNew_escapedQuoteInsideString() { + assertThat(charNew().render("{{ \"he said \\\"hi\\\"\" }}", new HashMap<>())) + .isEqualTo("he said \"hi\""); + } + + @Test + public void stringLegacy_escapedQuoteInsideString() { + assertThat(stringLegacy().render("{{ \"he said \\\"hi\\\"\" }}", new HashMap<>())) + .isEqualTo("he said \"hi\""); + } + + @Test + public void stringNew_escapedQuoteInsideString() { + assertThat(stringNew().render("{{ \"he said \\\"hi\\\"\" }}", new HashMap<>())) + .isEqualTo("he said \"hi\""); + } + + // ── Backslash outside a quoted string ───────────────────────────────────── + // + // Template under test: "prefix {{ x \}} suffix }}" + // + // We test the scanner token structure directly rather than going through + // render(), because the expression "x \..." is always a JUEL lexical error + // regardless of mode. What differs between modes is which token boundaries + // the scanner produces — and that is what we assert on. + // + // Legacy (backslashInQuotesOnly = false): + // Scanner consumes '\' and skips the following '}'. The first '}}' is not + // recognized as a closer. The block runs until the second '}}', so the + // token sequence is: + // TEXT "prefix " | EXPR "{{ x \}} suffix }}" + // + // New (backslashInQuotesOnly = true): + // Scanner leaves '\' untouched. The first '}}' is recognized as the closer. 
+ // The token sequence is: + // TEXT "prefix " | EXPR "{{ x \}}" | TEXT " suffix }}" + + private static final String BACKSLASH_TEMPLATE = "prefix {{ x \\}} suffix }}"; + + @Test + public void charLegacy_backslashConsumesOneDelimiterChar_blockRunsToSecondCloser() { + List tokens = scanAll( + new TokenScanner(BACKSLASH_TEMPLATE, charLegacy().getGlobalConfig()) + ); + assertThat(tokens).hasSize(2); + assertThat(tokens.get(0)).isInstanceOf(TextToken.class); + assertThat(tokens.get(0).image).isEqualTo("prefix "); + assertThat(tokens.get(1)).isInstanceOf(ExpressionToken.class); + assertThat(tokens.get(1).image).isEqualTo("{{ x \\}} suffix }}"); + } + + @Test + public void charNew_backslashIgnored_blockClosesAtFirstDelimiter() { + List tokens = scanAll( + new TokenScanner(BACKSLASH_TEMPLATE, charNew().getGlobalConfig()) + ); + assertThat(tokens).hasSize(3); + assertThat(tokens.get(0)).isInstanceOf(TextToken.class); + assertThat(tokens.get(0).image).isEqualTo("prefix "); + assertThat(tokens.get(1)).isInstanceOf(ExpressionToken.class); + assertThat(tokens.get(1).image).isEqualTo("{{ x \\}}"); + assertThat(tokens.get(2)).isInstanceOf(TextToken.class); + assertThat(tokens.get(2).image).isEqualTo(" suffix }}"); + } + + @Test + public void stringLegacy_backslashConsumesOneDelimiterChar_blockRunsToSecondCloser() { + List tokens = scanAll( + new TokenScanner(BACKSLASH_TEMPLATE, stringLegacy().getGlobalConfig()) + ); + assertThat(tokens).hasSize(2); + assertThat(tokens.get(0)).isInstanceOf(TextToken.class); + assertThat(tokens.get(0).image).isEqualTo("prefix "); + assertThat(tokens.get(1)).isInstanceOf(ExpressionToken.class); + assertThat(tokens.get(1).image).isEqualTo("{{ x \\}} suffix }}"); + } + + @Test + public void stringNew_backslashIgnored_blockClosesAtFirstDelimiter() { + List tokens = scanAll( + new TokenScanner(BACKSLASH_TEMPLATE, stringNew().getGlobalConfig()) + ); + assertThat(tokens).hasSize(3); + assertThat(tokens.get(0)).isInstanceOf(TextToken.class); + 
assertThat(tokens.get(0).image).isEqualTo("prefix "); + assertThat(tokens.get(1)).isInstanceOf(ExpressionToken.class); + assertThat(tokens.get(1).image).isEqualTo("{{ x \\}}"); + assertThat(tokens.get(2)).isInstanceOf(TextToken.class); + assertThat(tokens.get(2).image).isEqualTo(" suffix }}"); + } + + private static List scanAll(TokenScanner scanner) { + List tokens = new ArrayList<>(); + scanner.forEachRemaining(tokens::add); + return tokens; + } + + // ── Backslash in a plain variable expression ─────────────────────────────── + // + // The most common real-world case: a Windows path or similar string passed + // directly as a variable value. The backslash is in the *value*, not the + // template, so scanner behaviour is irrelevant — both modes should render + // identically. + + @Test + public void backslashInVariableValueIsUnaffectedByFlag_char() { + ImmutableMap ctx = ImmutableMap.of("path", "C:\\Users\\foo"); + assertThat(charLegacy().render("{{ path }}", ctx)).isEqualTo("C:\\Users\\foo"); + assertThat(charNew().render("{{ path }}", ctx)).isEqualTo("C:\\Users\\foo"); + } + + @Test + public void backslashInVariableValueIsUnaffectedByFlag_string() { + ImmutableMap ctx = ImmutableMap.of("path", "C:\\Users\\foo"); + assertThat(stringLegacy().render("{{ path }}", ctx)).isEqualTo("C:\\Users\\foo"); + assertThat(stringNew().render("{{ path }}", ctx)).isEqualTo("C:\\Users\\foo"); + } + + // ── New behaviour: simple expressions are unaffected ────────────────────── + // + // Expressions with no backslash should behave identically under both modes. 
+ + @Test + public void charNew_simpleExpressionUnchanged() { + assertThat(charNew().render("{{ greeting }}", ImmutableMap.of("greeting", "hello"))) + .isEqualTo("hello"); + } + + @Test + public void stringNew_simpleExpressionUnchanged() { + assertThat(stringNew().render("{{ greeting }}", ImmutableMap.of("greeting", "hello"))) + .isEqualTo("hello"); + } + + // ── LegacyOverrides preset assertions ───────────────────────────────────── + // + // handleBackslashInQuotesOnly is enabled by the ALL preset. It is NOT included + // in THREE_POINT_0 or NONE because existing templates may rely on the legacy + // behaviour of \} preventing delimiter recognition. Inclusion in those presets + // can be reconsidered in a future major version. + + @Test + public void allPresetDoesNotEnableNewBackslashHandling() { + assertThat(LegacyOverrides.ALL.isHandleBackslashInQuotesOnly()).isTrue(); + } + + @Test + public void threePointZeroPresetDoesNotEnableNewBackslashHandling() { + assertThat(LegacyOverrides.THREE_POINT_0.isHandleBackslashInQuotesOnly()).isFalse(); + } + + @Test + public void nonePresetKeepsLegacyBackslashHandling() { + assertThat(LegacyOverrides.NONE.isHandleBackslashInQuotesOnly()).isFalse(); + } +}