From 04f8fdf208ccdc0b2c428e2cbecc1baefecf2ff8 Mon Sep 17 00:00:00 2001 From: lmasroca Date: Sun, 28 Jun 2026 19:18:50 -0300 Subject: [PATCH 1/9] Added regex check function that takes external flags for testing. --- .../core/parser/RegexTestTemplate.kt | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/core/src/test/kotlin/org/evomaster/core/parser/RegexTestTemplate.kt b/core/src/test/kotlin/org/evomaster/core/parser/RegexTestTemplate.kt index c5aadb2fc3..257abff8f6 100644 --- a/core/src/test/kotlin/org/evomaster/core/parser/RegexTestTemplate.kt +++ b/core/src/test/kotlin/org/evomaster/core/parser/RegexTestTemplate.kt @@ -3,6 +3,7 @@ package org.evomaster.core.parser import org.evomaster.core.search.gene.Gene import org.evomaster.core.search.gene.regex.RegexGene import org.evomaster.core.search.service.Randomness +import org.evomaster.core.utils.RegexFlags import org.junit.jupiter.api.Assertions import java.lang.AssertionError import java.lang.IllegalStateException @@ -20,6 +21,25 @@ abstract class RegexTestTemplate { return check(regex, regex) } + protected fun checkSameAsJava(regex: String, externalRegexFlags: RegexFlags = RegexFlags()) : RegexGene { + val randomness = Randomness().apply { updateSeed(42) } + + val gene = RegexHandler.createGeneForJVM(regex, externalRegexFlags) + + for(seed in 1..100L) { + + gene.randomize(randomness, false) + + val instance = gene.getValueAsRawString() + + val pattern = Pattern.compile(regex, externalRegexFlags.toJavaFlagBitmask()) + val matcher = pattern.matcher(instance) + Assertions.assertTrue(matcher.find(), "String not matching:\n$regex\n$instance") + } + + return gene + } + protected fun check(regex: String, javaRegex: String) : RegexGene { val randomness = Randomness().apply { updateSeed(42) } From cc167825eb7a7f7e84d05802e5c2b8e1bde80c98 Mon Sep 17 00:00:00 2001 From: lmasroca Date: Sun, 28 Jun 2026 19:24:48 -0300 Subject: [PATCH 2/9] Refactor: moved and renamed parseFlagToken. 
--- .../core/parser/GeneRegexJavaVisitor.kt | 23 ++----------------- .../org/evomaster/core/utils/RegexFlags.kt | 23 +++++++++++++++++-- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt index 926f33d005..33760148a6 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt @@ -56,25 +56,6 @@ class GeneRegexJavaVisitor(externalRegexFlags: RegexFlags = RegexFlags()) : Rege */ private var currentFlags = externalRegexFlags - /** - * Parses a FLAG_GROUP_OPEN or FLAG_SCOPE_OPEN token text like "(?i:", "(?iu:", "(?-i:", "(?i-u:", "(?iu)", etc. - * into a [ParsedFlagExpression] that can be applied to the current flags. - */ - private fun parseFlagToken(tokenText: String): ParsedFlagExpression { - // strip "(?" from start and ":" (or ")") from end - val inner = tokenText.drop(2).dropLast(1) - - val (enableStr, disableStr) = if ('-' in inner) - inner.split('-', limit = 2).let { it[0] to it[1] } - else Pair(inner, "") - - return ParsedFlagExpression( - RegexFlags.fromString(enableStr), - RegexFlags.fromString(disableStr) - ) - } - - override fun visitPattern(ctx: RegexJavaParser.PatternContext): VisitResult { val res = ctx.disjunction().accept(this) @@ -129,7 +110,7 @@ class GeneRegexJavaVisitor(externalRegexFlags: RegexFlags = RegexFlags()) : Rege val previous = currentFlags val merged = currentFlags.merge( - parseFlagToken(term.FLAG_SCOPE_OPEN().text) + ParsedFlagExpression.fromFlagToken(term.FLAG_SCOPE_OPEN().text) ) merged.validate() @@ -300,7 +281,7 @@ class GeneRegexJavaVisitor(externalRegexFlags: RegexFlags = RegexFlags()) : Rege val previous = currentFlags val merged = currentFlags.merge( - parseFlagToken(ctx.FLAG_GROUP_OPEN().text) + ParsedFlagExpression.fromFlagToken(ctx.FLAG_GROUP_OPEN().text) ) 
merged.validate() diff --git a/core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt b/core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt index 6f977cc781..a767714b62 100644 --- a/core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt +++ b/core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt @@ -25,6 +25,26 @@ data class ParsedFlagExpression( enable -> true else -> current } + + companion object { + /** + * Parses a FLAG_GROUP_OPEN or FLAG_SCOPE_OPEN token text like "(?i:", "(?iu:", "(?-i:", "(?i-u:", "(?iu)", etc. + * into a [ParsedFlagExpression] that can be applied to the current flags. + */ + fun fromFlagToken(tokenText: String): ParsedFlagExpression { + // strip "(?" from start and ":" (or ")") from end + val inner = tokenText.drop(2).dropLast(1) + + val (enableStr, disableStr) = if ('-' in inner) + inner.split('-', limit = 2).let { it[0] to it[1] } + else Pair(inner, "") + + return ParsedFlagExpression( + RegexFlags.fromString(enableStr), + RegexFlags.fromString(disableStr) + ) + } + } } private val validFlagCharacters = setOf('i', 'u', 's', 'm', 'd', 'U', 'x') @@ -90,8 +110,7 @@ data class RegexFlags( unicodeCharacterClass || comments)) { return "" - } - else { + } else { val sb = StringBuilder() sb.append("(?") From 40b99fa9a1cabb76b0ed3d01e9be8cce124db002 Mon Sep 17 00:00:00 2001 From: lmasroca Date: Sun, 28 Jun 2026 19:26:32 -0300 Subject: [PATCH 3/9] Refactor: moved validFlagCharacters. 
--- core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt b/core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt index a767714b62..56256468ac 100644 --- a/core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt +++ b/core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt @@ -47,8 +47,6 @@ data class ParsedFlagExpression( } } - private val validFlagCharacters = setOf('i', 'u', 's', 'm', 'd', 'U', 'x') - data class RegexFlags( // currently implemented val caseInsensitive: Boolean = false, // i @@ -63,6 +61,8 @@ data class RegexFlags( ) { companion object { + val validFlagCharacters = setOf('i', 'u', 's', 'm', 'd', 'U', 'x') + /** * Parses a string of flag characters (e.g. "iu", "sm") into a [RegexFlags] instance. * Valid characters are: i, u, s, m, d, U, x. From 45cddff63ea6e40f9649f63d2d85c51084e03a82 Mon Sep 17 00:00:00 2001 From: lmasroca Date: Sun, 28 Jun 2026 19:27:46 -0300 Subject: [PATCH 4/9] Added RegexFlags.toJavaFlagBitmask and isLineTerminator methods. 
--- .../org/evomaster/core/utils/RegexFlags.kt | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt b/core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt index 56256468ac..753c3545ec 100644 --- a/core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt +++ b/core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt @@ -141,6 +141,18 @@ data class RegexFlags( } } + fun toJavaFlagBitmask(): Int { + var flags = 0 + if (caseInsensitive) flags = flags or Pattern.CASE_INSENSITIVE + if (unicodeCase) flags = flags or Pattern.UNICODE_CASE + if (dotAll) flags = flags or Pattern.DOTALL + if (multiline) flags = flags or Pattern.MULTILINE + if (unixLines) flags = flags or Pattern.UNIX_LINES + if (unicodeCharacterClass) flags = flags or Pattern.UNICODE_CHARACTER_CLASS + if (comments) flags = flags or Pattern.COMMENTS + return flags + } + /** * Merges this [RegexFlags] with a [ParsedFlagExpression], returning a new [RegexFlags] with the * enabled flags turned on and the disabled flags turned off. @@ -161,6 +173,15 @@ data class RegexFlags( if (comments) throw IllegalStateException("Regex flag 'x' (COMMENTS) is not yet supported") } + /** + * Checks if the provided character is a line terminator according to the flag behavior. + */ + fun isLineTerminator(c: Char) = if (unixLines) { + c == '\n' + } else { + c == '\n' || c == '\r' || c == '\u0085' || c == '\u2028' || c == '\u2029' + } + /** * Checks if the provided character has a case variant according to the flag behavior, checking both caseInsensitive * and unicodeCase flag values. From 5470ba4b227e6c4fde174fda1e6cf4085aac542b Mon Sep 17 00:00:00 2001 From: lmasroca Date: Sun, 28 Jun 2026 19:30:50 -0300 Subject: [PATCH 5/9] Implemented java identity escapes, replacing syntax escapes. 
--- .../org/evomaster/core/parser/RegexJava.g4 | 17 +---------------- .../core/parser/GeneRegexJavaVisitor.kt | 12 ++++-------- 2 files changed, 5 insertions(+), 24 deletions(-) diff --git a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 index 7af80c6166..0f9d358f03 100644 --- a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 +++ b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 @@ -129,7 +129,6 @@ quoteChar | E ; -//TODO CharacterEscape : SLASH ControlEscape | SLASH 'c' ControlLetter @@ -138,8 +137,7 @@ CharacterEscape | SLASH OctalEscapeSequence | SLASH ('p' | 'P') BRACE_open PCharacterClassEscapeLabel BRACE_close // this is only implemented in Java at the moment // as on JS this is allowed only while certain flags are enabled - - //| IdentityEscape + | SLASH ~[a-zA-Z0-9] // identity escape ; // Instead of listing all unicode scripts, blocks, etc. the parser allows anything @@ -164,13 +162,6 @@ fragment ControlLetter : [?-_a-z] ; - -//TODO -//fragment IdentityEscape :: -//SourceCharacter but not IdentifierPart -// -// - //TODO //DecimalEscape // //[lookahead ∉ DecimalDigit] @@ -261,7 +252,6 @@ classEscape atomEscape : CharacterClassEscape | CharacterEscape - | SyntaxEscapes | BackReference | NamedBackReference ; @@ -284,11 +274,6 @@ CharacterClassEscape : SLASH [dDsSwWvVhH] ; - -SyntaxEscapes - : SLASH [^$\\.*+?()[\]{}|/\-,:<>=!] 
- ; - CARET : '^'; DOLLAR : '$'; SLASH : '\\'; diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt index 33760148a6..f9c9f0c905 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt @@ -28,13 +28,9 @@ class GeneRegexJavaVisitor(externalRegexFlags: RegexFlags = RegexFlags()) : Rege ) /** - * These are the Java regex syntax characters, all of these can be escaped to be treated as literals. + * None of these can be escaped to be treated as literals. Some may be part of legal escape sequences. */ - private val allowedSyntaxEscapes = setOf( - '^', '$', '\\', '.', '*', '+', '?', - '(', ')', '[', ']', '{', '}', '|', - '/', '-', ',' ,':', '<', '>', '=', '!' - ) + private val notIdentityEscapes = ('a'..'z').toList() + ('A'..'Z').toList() + ('0'..'9').toList() /** * Capture groups in order of appearance (1-based index -> list index 0). @@ -470,7 +466,7 @@ class GeneRegexJavaVisitor(externalRegexFlags: RegexFlags = RegexFlags()) : Rege } else { // This case handles the escaped syntax characters, like "\." and "\+", etc. cases // where '.' and '+', etc. 
should be treated as regular chars - assert(startText[0] == '\\' && startText[1] in allowedSyntaxEscapes) + assert(startText[0] == '\\' && startText[1] !in notIdentityEscapes) start = startText[1] end = start } @@ -647,7 +643,7 @@ class GeneRegexJavaVisitor(externalRegexFlags: RegexFlags = RegexFlags()) : Rege currentFlags ) } - in allowedSyntaxEscapes -> PatternCharacterBlockGene(txt, txt.substring(1), currentFlags) + !in notIdentityEscapes -> PatternCharacterBlockGene(txt, txt.substring(1), currentFlags) else -> CharacterClassEscapeRxGene(txt.substring(1), currentFlags) }) } From 37dfdf1b05fd771a5a81ab5ae3528b7c67473aae Mon Sep 17 00:00:00 2001 From: lmasroca Date: Sun, 28 Jun 2026 19:34:04 -0300 Subject: [PATCH 6/9] Added regex preprocessor to handle java COMMENTS flag behavior. --- .../org/evomaster/core/parser/RegexHandler.kt | 106 +++++++++++++++++- .../org/evomaster/core/utils/RegexFlags.kt | 1 - 2 files changed, 105 insertions(+), 2 deletions(-) diff --git a/core/src/main/kotlin/org/evomaster/core/parser/RegexHandler.kt b/core/src/main/kotlin/org/evomaster/core/parser/RegexHandler.kt index 64b0adfa37..b581d691d4 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/RegexHandler.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/RegexHandler.kt @@ -3,6 +3,7 @@ package org.evomaster.core.parser import org.antlr.v4.runtime.* import org.antlr.v4.runtime.misc.ParseCancellationException import org.evomaster.core.search.gene.regex.RegexGene +import org.evomaster.core.utils.ParsedFlagExpression import org.evomaster.core.utils.RegexFlags import org.evomaster.core.utils.RegexWithExternalFlags @@ -33,7 +34,9 @@ object RegexHandler { return cacheJVM[key]!!.copy() as RegexGene } - val stream = CharStreams.fromString(regex) + val preprocessedRegex = preprocessCommentsForJavaRegex(regex, externalRegexFlags) + + val stream = CharStreams.fromString(preprocessedRegex) val lexer = RegexJavaLexer(stream) val tokenStream = prepareLexer(lexer) val parser = 
RegexJavaParser(tokenStream) @@ -48,6 +51,107 @@ object RegexHandler { return gene } + /** + * This function handles comments and whitespace for Java regex, striping them when the "x" flag is on. + */ + private fun preprocessCommentsForJavaRegex(regex: String, externalRegexFlags: RegexFlags): String { + val result = StringBuilder(regex.length) + val scopeStack = ArrayDeque() // stack of flags per level + var currentFlags = externalRegexFlags + var i = 0 + + while (i < regex.length) { + val c = regex[i] + when { + // backslash escape + c == '\\' && i + 1 < regex.length -> { + when { + regex[i+1] == 'Q' -> { + // \Q...\E quote block, copy everything + result.append('\\'); result.append('Q') + i += 2 + while (i < regex.length) { + if (regex[i] == '\\' && i+1 < regex.length && regex[i+1] == 'E') { + result.append('\\'); result.append('E') + i += 2; break + } + result.append(regex[i++]) + } + } + else -> { + // regular escape, copy both + result.append(c); result.append(regex[i+1]) + i += 2 + } + } + } + + // opening paren: check for flag group or scope + c == '(' && i+1 < regex.length && regex[i+1] == '?' -> { + // scan forward to find the flag content + val flagStart = i + 2 + var j = flagStart + // lookahead to end of group/scope/other, set j to that position + while (j < regex.length && regex[j] != ':' && regex[j] != ')' && regex[j] != '(') j++ + + // check if regex[i..j] forms valid flag scope/group + if (j < regex.length && (regex[j] == ':' || regex[j] == ')') && j > i+2 + && regex.substring(i+2, j).all{ it in RegexFlags.validFlagCharacters || it == '-' }) { + // valid flag group/scope + if(regex[j] == ':') { + // flag group (?flags:...): parse flags and push scope + val flagToken = regex.substring(i, j+1) // e.g. 
"(?iu:" + val newFlags = currentFlags.merge(ParsedFlagExpression.fromFlagToken(flagToken)) + scopeStack.addLast(currentFlags) + currentFlags = newFlags + result.append(regex.substring(i, j+1)) + i = j + 1 + } else { + // flag scope (?flags), update currentFlags + val flagToken = regex.substring(i, j+1) // e.g. "(?iu)" + currentFlags = currentFlags.merge(ParsedFlagExpression.fromFlagToken(flagToken)) + result.append(regex.substring(i, j+1)) + i = j + 1 + } + } else { + // not a flag group/scope: push current flags unchanged + scopeStack.addLast(currentFlags) + result.append(c); i++ + } + } + + c == '(' -> { + scopeStack.addLast(currentFlags) + result.append(c); i++ + } + + c == ')' -> { + currentFlags = scopeStack.removeLastOrNull() ?: externalRegexFlags + result.append(c); i++ + } + + // comment + c == '#' && currentFlags.comments -> { + i++ + while (i < regex.length && !currentFlags.isLineTerminator(regex[i])) i++ + // consume line terminator + if (i < regex.length) { + // \r\n is a 2-character line terminator + if (regex[i] == '\r' && i+1 < regex.length && regex[i+1] == '\n') i += 2 + else i++ + } + } + + // whitespace, skip when comments flag is on + c.isWhitespace() && currentFlags.comments -> i++ + + // else copy + else -> { result.append(c); i++ } + } + } + return result.toString() + } + /** * Given a ECMA262 regex string, generate RegexGene for it. * Based on RegexEcma262.g4 file. 
diff --git a/core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt b/core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt index 753c3545ec..8a8b091ce7 100644 --- a/core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt +++ b/core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt @@ -170,7 +170,6 @@ data class RegexFlags( if (multiline) throw IllegalStateException("Regex flag 'm' (MULTILINE) is not yet supported") if (unixLines) throw IllegalStateException("Regex flag 'd' (UNIX_LINES) is not yet supported") if (unicodeCharacterClass) throw IllegalStateException("Regex flag 'U' (UNICODE_CHARACTER_CLASS) is not yet supported") - if (comments) throw IllegalStateException("Regex flag 'x' (COMMENTS) is not yet supported") } /** From 558946d76d3f72dffffb0bd83c58be16865e3f29 Mon Sep 17 00:00:00 2001 From: lmasroca Date: Sun, 28 Jun 2026 19:36:55 -0300 Subject: [PATCH 7/9] Added some test cases for java COMMENTS regex flag. --- .../core/parser/GeneRegexJavaVisitorTest.kt | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt b/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt index 489657b294..0cc7afa72a 100644 --- a/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt +++ b/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt @@ -1,6 +1,7 @@ package org.evomaster.core.parser import org.evomaster.core.search.gene.regex.RegexGene +import org.evomaster.core.utils.RegexFlags import org.junit.jupiter.api.Test /** @@ -274,4 +275,26 @@ class GeneRegexJavaVisitorTest : GeneRegexEcma262VisitorTest() { checkSameAsJava("[a[b[c[d&&[\\w]]]][0-7&&\\d&&[0-5]&&1-5]]") checkSameAsJava("&&") } + + @Test + fun testCommentsFlag(){ + val commentsOn = RegexFlags(comments=true) + checkSameAsJava("a b c", commentsOn) + checkSameAsJava("a b c #comment\n after comment", commentsOn) + checkSameAsJava("[also within char 
classes#comments too\n]", commentsOn) + checkSameAsJava("a#comment\nb#noNewLine", commentsOn) + checkSameAsJava("a#c1\n#c2\nb", commentsOn) + checkSameAsJava("(a|b|#comment\nc)", commentsOn) + checkSameAsJava("(?-x)( #notAComment)") + checkSameAsJava("(?-x)( #notAComment)", commentsOn) + checkCanSample("(?x)(a|b|#comment\nc)", listOf("a", "b", "c"), 100) + checkSameAsJava("a\\ b +", commentsOn) + checkSameAsJava("\\#a{1,3 #comment\n} ", commentsOn) + checkSameAsJava(" ", commentsOn) + checkCanSample("(?x)a|#comment", listOf("a", ""), 100) + checkSameAsJava("a(?x:b c(?-x: d )e f)g") + checkSameAsJava("\\Q#not a comment\\E", commentsOn) + checkSameAsJava("a b(?-x: c d(?x: e f)g h)i j", commentsOn) + checkSameAsJava("a b(?-x: c d(?x: e f (?-x) #no (?x: a b))g h)i j", commentsOn) + } } \ No newline at end of file From a9120ed71d28a8880a9d07baad2bc12c9261b5c9 Mon Sep 17 00:00:00 2001 From: lmasroca Date: Sun, 28 Jun 2026 19:40:48 -0300 Subject: [PATCH 8/9] Comments. --- .../main/kotlin/org/evomaster/core/parser/RegexHandler.kt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/main/kotlin/org/evomaster/core/parser/RegexHandler.kt b/core/src/main/kotlin/org/evomaster/core/parser/RegexHandler.kt index b581d691d4..2a6418bf42 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/RegexHandler.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/RegexHandler.kt @@ -130,11 +130,12 @@ object RegexHandler { result.append(c); i++ } - // comment + // comment (when COMMENTS flag is on) c == '#' && currentFlags.comments -> { i++ + // advance index until line terminator (eg: "#...\n") without copying while (i < regex.length && !currentFlags.isLineTerminator(regex[i])) i++ - // consume line terminator + // consume line terminator too: if (i < regex.length) { // \r\n is a 2-character line terminator if (regex[i] == '\r' && i+1 < regex.length && regex[i+1] == '\n') i += 2 @@ -142,7 +143,7 @@ object RegexHandler { } } - // whitespace, skip when 
comments flag is on + // whitespace, skip copying when comments flag is on c.isWhitespace() && currentFlags.comments -> i++ // else copy From 5c4dc7af7b325f5fb70b19912f4f6698fb409d86 Mon Sep 17 00:00:00 2001 From: lmasroca Date: Wed, 1 Jul 2026 19:07:23 -0300 Subject: [PATCH 9/9] Requested changes. --- .../kotlin/org/evomaster/core/parser/RegexHandler.kt | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/core/src/main/kotlin/org/evomaster/core/parser/RegexHandler.kt b/core/src/main/kotlin/org/evomaster/core/parser/RegexHandler.kt index 2a6418bf42..0835531389 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/RegexHandler.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/RegexHandler.kt @@ -53,6 +53,17 @@ object RegexHandler { /** * This function handles comments and whitespace for Java regex, striping them when the "x" flag is on. + * + * This cannot be handled at the ANTLR level because the flag can be enabled and disabled + * mid-pattern via inline flag groups like `(?x:...)` and `(?-x:...)`, which are only known + * at parse time. Lexer modes cannot react to parser-level flag state. + * + * The visitor cannot handle this either, as some constructs (character classes, quantifier bounds, etc.) + * are tokenised before the visitor sees them. + * + * This function therefore performs a linear scan of the raw string before ANTLR, tracking + * flag state across inline scopes, producing a cleaned string that requires no special + * handling in the lexer or visitor. */ private fun preprocessCommentsForJavaRegex(regex: String, externalRegexFlags: RegexFlags): String { val result = StringBuilder(regex.length)