Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 1 addition & 16 deletions core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ quoteChar
| E
;

//TODO
CharacterEscape
: SLASH ControlEscape
| SLASH 'c' ControlLetter
Expand All @@ -138,8 +137,7 @@ CharacterEscape
| SLASH OctalEscapeSequence
| SLASH ('p' | 'P') BRACE_open PCharacterClassEscapeLabel BRACE_close // this is only implemented in Java at the moment
// as on JS this is allowed only while certain flags are enabled

//| IdentityEscape
| SLASH ~[a-zA-Z0-9] // identity escape
;

// Instead of listing all unicode scripts, blocks, etc. the parser allows anything
Expand All @@ -164,13 +162,6 @@ fragment ControlLetter
: [?-_a-z]
;


//TODO
//fragment IdentityEscape ::
//SourceCharacter but not IdentifierPart
//<ZWJ>
//<ZWNJ>

//TODO
//DecimalEscape
// //[lookahead ∉ DecimalDigit]
Expand Down Expand Up @@ -261,7 +252,6 @@ classEscape
atomEscape
: CharacterClassEscape
| CharacterEscape
| SyntaxEscapes
| BackReference
| NamedBackReference
;
Expand All @@ -284,11 +274,6 @@ CharacterClassEscape
: SLASH [dDsSwWvVhH]
;


SyntaxEscapes
: SLASH [^$\\.*+?()[\]{}|/\-,:<>=!]
;

CARET : '^';
DOLLAR : '$';
SLASH : '\\';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,9 @@ class GeneRegexJavaVisitor(externalRegexFlags: RegexFlags = RegexFlags()) : Rege
)

/**
* These are the Java regex syntax characters, all of these can be escaped to be treated as literals.
* None of these can be escaped to be treated as literals. Some may be part of legal escape sequences.
*/
private val allowedSyntaxEscapes = setOf(
'^', '$', '\\', '.', '*', '+', '?',
'(', ')', '[', ']', '{', '}', '|',
'/', '-', ',' ,':', '<', '>', '=', '!'
)
private val notIdentityEscapes = ('a'..'z').toList() + ('A'..'Z').toList() + ('0'..'9').toList()

/**
* Capture groups in order of appearance (1-based index -> list index 0).
Expand Down Expand Up @@ -64,24 +60,6 @@ class GeneRegexJavaVisitor(externalRegexFlags: RegexFlags = RegexFlags()) : Rege
*/
private var currentFlags = externalRegexFlags

/**
 * Parses a FLAG_GROUP_OPEN or FLAG_SCOPE_OPEN token text like "(?i:", "(?iu:", "(?-i:", "(?i-u:", "(?iu)", etc.
 * into a [ParsedFlagExpression] that can be applied to the current flags.
 *
 * The parsing itself lives in [ParsedFlagExpression.fromFlagToken]; this wrapper only exists so that
 * existing call sites inside the visitor keep working without carrying a second copy of the same logic.
 *
 * @param tokenText the full token text, including the leading "(?" and the trailing ":" or ")"
 * @return the flags to enable and the flags to disable, as described by the token
 */
private fun parseFlagToken(tokenText: String): ParsedFlagExpression =
    ParsedFlagExpression.fromFlagToken(tokenText)

/**
* Builds DisjunctionListRxGenes from a disjunction context, returns null if disjunction is unsatisfiable.
*/
Expand Down Expand Up @@ -181,7 +159,7 @@ class GeneRegexJavaVisitor(externalRegexFlags: RegexFlags = RegexFlags()) : Rege
val previous = currentFlags

val merged = currentFlags.merge(
parseFlagToken(term.FLAG_SCOPE_OPEN().text)
ParsedFlagExpression.fromFlagToken(term.FLAG_SCOPE_OPEN().text)
)

merged.validate()
Expand Down Expand Up @@ -367,7 +345,7 @@ class GeneRegexJavaVisitor(externalRegexFlags: RegexFlags = RegexFlags()) : Rege
val previous = currentFlags

val merged = currentFlags.merge(
parseFlagToken(ctx.FLAG_GROUP_OPEN().text)
ParsedFlagExpression.fromFlagToken(ctx.FLAG_GROUP_OPEN().text)
)

merged.validate()
Expand Down Expand Up @@ -546,7 +524,7 @@ class GeneRegexJavaVisitor(externalRegexFlags: RegexFlags = RegexFlags()) : Rege
} else {
// This case handles the escaped syntax characters, like "\." and "\+", etc. cases
// where '.' and '+', etc. should be treated as regular chars
assert(startText[0] == '\\' && startText[1] in allowedSyntaxEscapes)
assert(startText[0] == '\\' && startText[1] !in notIdentityEscapes)
start = startText[1]
end = start
}
Expand Down Expand Up @@ -722,7 +700,7 @@ class GeneRegexJavaVisitor(externalRegexFlags: RegexFlags = RegexFlags()) : Rege
currentFlags
)
}
in allowedSyntaxEscapes -> PatternCharacterBlockGene(txt, txt.substring(1), currentFlags)
!in notIdentityEscapes -> PatternCharacterBlockGene(txt, txt.substring(1), currentFlags)
else -> CharacterClassEscapeRxGene(txt.substring(1), currentFlags)
})
}
Expand Down
118 changes: 117 additions & 1 deletion core/src/main/kotlin/org/evomaster/core/parser/RegexHandler.kt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package org.evomaster.core.parser
import org.antlr.v4.runtime.*
import org.antlr.v4.runtime.misc.ParseCancellationException
import org.evomaster.core.search.gene.regex.RegexGene
import org.evomaster.core.utils.ParsedFlagExpression
import org.evomaster.core.utils.RegexFlags
import org.evomaster.core.utils.RegexWithExternalFlags

Expand Down Expand Up @@ -33,7 +34,9 @@ object RegexHandler {
return cacheJVM[key]!!.copy() as RegexGene
}

val stream = CharStreams.fromString(regex)
val preprocessedRegex = preprocessCommentsForJavaRegex(regex, externalRegexFlags)

val stream = CharStreams.fromString(preprocessedRegex)
val lexer = RegexJavaLexer(stream)
val tokenStream = prepareLexer(lexer)
val parser = RegexJavaParser(tokenStream)
Expand All @@ -48,6 +51,119 @@ object RegexHandler {
return gene
}

/**
 * This function handles comments and whitespace for Java regex, stripping them when the "x" flag is on.
 *
 * This cannot be handled at the ANTLR level because the flag can be enabled and disabled
 * mid-pattern via inline flag groups like `(?x:...)` and `(?-x:...)`, which are only known
 * at parse time. Lexer modes cannot react to parser-level flag state.
 *
 * The visitor cannot handle this either, as some constructs (character classes, quantifier bounds, etc.)
 * are tokenised before the visitor sees them.
 *
 * This function therefore performs a linear scan of the raw string before ANTLR, tracking
 * flag state across inline scopes, producing a cleaned string that requires no special
 * handling in the lexer or visitor.
 *
 * Scanning rules:
 * - Escapes (`\x`) and quote blocks (`\Q...\E`) are copied verbatim and never inspected.
 * - Parentheses open/close a flag scope only when they appear outside a character class; inside
 *   `[...]` they are plain literals and must not touch the scope stack.
 * - `#` starts a comment (dropped up to and including the line terminator) while the flag is on.
 * - Only ASCII whitespace (tab, LF, VT, FF, CR, space) is dropped while the flag is on, which is the
 *   set `java.util.regex.Pattern` skips in COMMENTS mode; other Unicode whitespace stays literal.
 *
 * Known limitation: character-class tracking is a simple bracket-depth counter, so a literal `]`
 * placed directly after `[` or `[^` (e.g. `[]a]`) closes the class early for tracking purposes.
 *
 * @param regex the raw Java regex, as written by the user
 * @param externalRegexFlags flags that are active before the first character of [regex]
 * @return [regex] with comments and ignorable whitespace removed wherever the "x" flag is active
 */
private fun preprocessCommentsForJavaRegex(regex: String, externalRegexFlags: RegexFlags): String {
    val result = StringBuilder(regex.length)
    val scopeStack = ArrayDeque<RegexFlags>() // flags to restore when the matching ')' is reached
    var currentFlags = externalRegexFlags
    var classDepth = 0 // nesting level of '[' ... ']' ; parentheses are literals while > 0
    var i = 0

    while (i < regex.length) {
        val c = regex[i]
        when {
            // backslash escape
            c == '\\' && i + 1 < regex.length -> {
                if (regex[i + 1] == 'Q') {
                    // \Q...\E quote block: copy everything up to and including \E,
                    // or up to the end of the pattern when the block is never closed
                    val closing = regex.indexOf("\\E", i + 2)
                    val stop = if (closing < 0) regex.length else closing + 2
                    result.append(regex, i, stop)
                    i = stop
                } else {
                    // regular escape, copy both characters untouched
                    result.append(c).append(regex[i + 1])
                    i += 2
                }
            }

            // "(?" outside a character class: may be an inline flag construct
            c == '(' && classDepth == 0 && i + 1 < regex.length && regex[i + 1] == '?' -> {
                val flagStart = i + 2
                var j = flagStart
                // advance j to the first character that can terminate (or rule out) a flag construct
                while (j < regex.length && regex[j] != ':' && regex[j] != ')' && regex[j] != '(') j++

                val isFlagConstruct = j < regex.length &&
                    j > flagStart &&
                    (regex[j] == ':' || regex[j] == ')') &&
                    regex.substring(flagStart, j).all { it in RegexFlags.validFlagCharacters || it == '-' }

                if (isFlagConstruct) {
                    val flagToken = regex.substring(i, j + 1) // e.g. "(?iu:" or "(?iu)"
                    val newFlags = currentFlags.merge(ParsedFlagExpression.fromFlagToken(flagToken))
                    // "(?flags:" opens a group, so the previous flags must come back at its ')'.
                    // "(?flags)" has already consumed its own ')', so nothing is pushed.
                    if (regex[j] == ':') scopeStack.addLast(currentFlags)
                    currentFlags = newFlags
                    result.append(flagToken)
                    i = j + 1
                } else {
                    // some other "(?" construct (lookaround, named group, ...): flags are unchanged
                    scopeStack.addLast(currentFlags)
                    result.append(c); i++
                }
            }

            // plain group opening outside a character class
            c == '(' && classDepth == 0 -> {
                scopeStack.addLast(currentFlags)
                result.append(c); i++
            }

            // group closing outside a character class: restore the flags of the enclosing scope
            c == ')' && classDepth == 0 -> {
                currentFlags = scopeStack.removeLastOrNull() ?: externalRegexFlags
                result.append(c); i++
            }

            // character class boundaries (classes can nest, e.g. "[a[bc]]")
            c == '[' -> {
                classDepth++
                result.append(c); i++
            }
            c == ']' && classDepth > 0 -> {
                classDepth--
                result.append(c); i++
            }

            // comment (when COMMENTS flag is on)
            c == '#' && currentFlags.comments -> {
                i++
                // advance index until line terminator (eg: "#...\n") without copying
                while (i < regex.length && !currentFlags.isLineTerminator(regex[i])) i++
                // consume line terminator too; \r\n is a 2-character line terminator
                if (i < regex.length) {
                    i += if (regex[i] == '\r' && i + 1 < regex.length && regex[i + 1] == '\n') 2 else 1
                }
            }

            // ASCII whitespace ('\t'..'\r' covers \t, \n, \u000B, \f, \r), skipped when comments flag is on
            (c == ' ' || c in '\t'..'\r') && currentFlags.comments -> i++

            // else copy
            else -> { result.append(c); i++ }
        }
    }
    return result.toString()
}

/**
* Given a ECMA262 regex string, generate RegexGene for it.
* Based on RegexEcma262.g4 file.
Expand Down
49 changes: 44 additions & 5 deletions core/src/main/kotlin/org/evomaster/core/utils/RegexFlags.kt
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,27 @@ data class ParsedFlagExpression(
enable -> true
else -> current
}
}

private val validFlagCharacters = setOf('i', 'u', 's', 'm', 'd', 'U', 'x')
companion object {
    /**
     * Builds a [ParsedFlagExpression] from the text of an inline-flag opener such as
     * "(?i:", "(?iu:", "(?-i:", "(?i-u:" or "(?iu)".
     *
     * The two leading characters and the single trailing character of [tokenText] are discarded.
     * What remains is split at its first '-': the part before it lists the flags to enable and
     * the part after it lists the flags to disable (empty when there is no '-').
     */
    fun fromFlagToken(tokenText: String): ParsedFlagExpression {
        val withoutOpener = tokenText.drop(2)
        val flagLetters = withoutOpener.dropLast(1)

        val toEnable = flagLetters.substringBefore('-')
        val toDisable = flagLetters.substringAfter('-', "")

        return ParsedFlagExpression(
            RegexFlags.fromString(toEnable),
            RegexFlags.fromString(toDisable)
        )
    }
}
}

data class RegexFlags(
// currently implemented
Expand All @@ -43,6 +61,8 @@ data class RegexFlags(
) {

companion object {
val validFlagCharacters = setOf('i', 'u', 's', 'm', 'd', 'U', 'x')

/**
* Parses a string of flag characters (e.g. "iu", "sm") into a [RegexFlags] instance.
* Valid characters are: i, u, s, m, d, U, x.
Expand Down Expand Up @@ -90,8 +110,7 @@ data class RegexFlags(
unicodeCharacterClass ||
comments)) {
return ""
}
else {
} else {
val sb = StringBuilder()
sb.append("(?")

Expand Down Expand Up @@ -122,6 +141,18 @@ data class RegexFlags(
}
}

/**
 * Combines every flag that is switched on into a single int, OR-ing together the matching
 * `Pattern` constants. Returns 0 when no flag is set.
 */
fun toJavaFlagBitmask(): Int =
    listOf(
        caseInsensitive to Pattern.CASE_INSENSITIVE,
        unicodeCase to Pattern.UNICODE_CASE,
        dotAll to Pattern.DOTALL,
        multiline to Pattern.MULTILINE,
        unixLines to Pattern.UNIX_LINES,
        unicodeCharacterClass to Pattern.UNICODE_CHARACTER_CLASS,
        comments to Pattern.COMMENTS
    )
        .filter { (enabled, _) -> enabled }
        .fold(0) { mask, (_, bit) -> mask or bit }

/**
* Merges this [RegexFlags] with a [ParsedFlagExpression], returning a new [RegexFlags] with the
* enabled flags turned on and the disabled flags turned off.
Expand All @@ -139,9 +170,17 @@ data class RegexFlags(
if (multiline) throw IllegalStateException("Regex flag 'm' (MULTILINE) is not yet supported")
if (unixLines) throw IllegalStateException("Regex flag 'd' (UNIX_LINES) is not yet supported")
if (unicodeCharacterClass) throw IllegalStateException("Regex flag 'U' (UNICODE_CHARACTER_CLASS) is not yet supported")
if (comments) throw IllegalStateException("Regex flag 'x' (COMMENTS) is not yet supported")
}

/**
 * Tells whether [c] ends a line under the current flags.
 *
 * With `unixLines` on, only '\n' counts. Otherwise '\n', '\r', '\u0085', '\u2028' and '\u2029'
 * are all treated as line terminators.
 */
fun isLineTerminator(c: Char): Boolean = when {
    unixLines -> c == '\n'
    else -> when (c) {
        '\n', '\r', '\u0085', '\u2028', '\u2029' -> true
        else -> false
    }
}

/**
* Checks if the provided character has a case variant according to the flag behavior, checking both caseInsensitive
* and unicodeCase flag values.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.evomaster.core.parser

import org.evomaster.core.search.gene.regex.RegexGene
import org.evomaster.core.utils.RegexFlags
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.assertThrows

Expand Down Expand Up @@ -399,4 +400,26 @@ class GeneRegexJavaVisitorTest : GeneRegexEcma262VisitorTest() {
assertThrows<IllegalStateException> { checkSameAsJava("a([b&&c])d") }
assertThrows<IllegalStateException> { checkSameAsJava("abc|\\k<name>") }
}

@Test
fun testCommentsFlag() {
    val xFlagOn = RegexFlags(comments = true)

    // flag supplied externally: plain whitespace, trailing comments, character classes, groups
    listOf(
        "a b c",
        "a b c #comment\n after comment",
        "[also within char classes#comments too\n]",
        "a#comment\nb#noNewLine",
        "a#c1\n#c2\nb",
        "(a|b|#comment\nc)"
    ).forEach { pattern -> checkSameAsJava(pattern, xFlagOn) }

    // "(?-x)" inside the pattern, with and without the external flag
    checkSameAsJava("(?-x)( #notAComment)")
    checkSameAsJava("(?-x)( #notAComment)", xFlagOn)

    // flag switched on from inside the pattern
    checkCanSample("(?x)(a|b|#comment\nc)", listOf("a", "b", "c"), 100)

    // escaped space / escaped '#', a comment inside quantifier bounds, whitespace-only pattern
    listOf(
        "a\\ b +",
        "\\#a{1,3 #comment\n} ",
        " "
    ).forEach { pattern -> checkSameAsJava(pattern, xFlagOn) }

    // comment as the last alternative
    checkCanSample("(?x)a|#comment", listOf("a", ""), 100)

    // nested inline scopes toggling the flag, without any external flag
    checkSameAsJava("a(?x:b c(?-x: d )e f)g")

    // quote block and nested inline scopes, with the external flag on
    listOf(
        "\\Q#not a comment\\E",
        "a b(?-x: c d(?x: e f)g h)i j",
        "a b(?-x: c d(?x: e f (?-x) #no (?x: a b))g h)i j"
    ).forEach { pattern -> checkSameAsJava(pattern, xFlagOn) }
}
}
Loading
Loading