Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
9d8e23e
Inline terminal matching and defer parse node allocation
JanJakes Apr 24, 2026
81db9d2
Use per-branch FIRST sets to skip unreachable branches
JanJakes Apr 24, 2026
dc0db55
Short-circuit nullable-fallback and inline single-branch fragments
JanJakes Apr 24, 2026
0e38795
Strip epsilon markers and cache grammar refs on the parser
JanJakes Apr 24, 2026
658245d
Return fragment results as children arrays, skip the intermediate node
JanJakes Apr 24, 2026
2bf90e8
Append end-of-input sentinel token to drop range checks
JanJakes Apr 24, 2026
9de4be2
Embed branch symbol sequences directly in the per-token selector
JanJakes Apr 24, 2026
b3931d0
Compare selectStatement by rule id instead of by name
JanJakes Apr 24, 2026
0169a66
Re-align grammar and parser whitespace after recent changes
JanJakes Apr 24, 2026
0f7c1f9
Deduplicate selector entries while embedding branch sequences
JanJakes Apr 24, 2026
25e04ed
Add direct-return fast path for single-candidate rules
JanJakes Apr 24, 2026
96184de
Mark WP_Parser_Node as final
JanJakes Apr 24, 2026
b6029fd
Speed up the lexer with cheaper byte checks
JanJakes Apr 28, 2026
9f75802
Skip parent constructor in WP_MySQL_Token
JanJakes Apr 28, 2026
d5f155e
Use ! empty() in WP_Parser_Node::has_child()
JanJakes Apr 28, 2026
4d7970a
Inline leading-whitespace skip in lexer's token loops
JanJakes Apr 29, 2026
a1e0e6c
Catch identifier and keyword tokens at the top of the chain
JanJakes Apr 29, 2026
17e06b3
Add a single-byte operator dispatch table
JanJakes Apr 29, 2026
49acebd
Document non-obvious lexer dispatch conditions
JanJakes May 4, 2026
242acf6
Remove single-byte operator arms shadowed by the dispatch table
JanJakes May 4, 2026
aa0feda
Unroll whitespace check in '--' line-comment dispatch
JanJakes May 4, 2026
873bed5
Drop 'final' from WP_Parser_Node
JanJakes May 4, 2026
8c11f76
Maintain end-of-input sentinel in reset_tokens()
JanJakes May 4, 2026
6256807
Restore lookahead_is_match_possible as a native-parser-bridge view
JanJakes May 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
217 changes: 143 additions & 74 deletions packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -2111,6 +2111,13 @@ class WP_MySQL_Lexer {
*/
private $sql;

/**
* Byte length of the SQL payload.
*
* @var int
*/
private $sql_length;

/**
* The version of the MySQL server that the SQL payload is intended for.
*
Expand Down Expand Up @@ -2189,6 +2196,7 @@ public function __construct(
array $sql_modes = array()
) {
$this->sql = $sql;
$this->sql_length = strlen( $sql );
$this->mysql_version = $mysql_version;

foreach ( $sql_modes as $sql_mode ) {
Expand Down Expand Up @@ -2227,6 +2235,9 @@ public function next_token(): bool {
return false;
}

// Skip leading whitespace inline for optimal performance.
$this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read );

do {
$this->token_starts_at = $this->bytes_already_read;
$this->token_type = $this->read_next_token();
Expand Down Expand Up @@ -2284,10 +2295,51 @@ public function get_token(): ?WP_MySQL_Token {
* @return WP_MySQL_Token[] An array of token objects representing the remaining tokens.
*/
public function remaining_tokens(): array {
$tokens = array();
while ( true === $this->next_token() ) {
$token = $this->get_token();
$tokens[] = $token;
$tokens = array();
$no_backslash_escapes_sql_mode_set = $this->is_sql_mode_active(
self::SQL_MODE_NO_BACKSLASH_ESCAPES
);

while ( true ) {
// Bail on EOF, or on a null token type once at least one byte has
// been consumed (read_next_token() hit invalid input mid-stream).
if (
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (
// Break on file end
if (

Copy link
Copy Markdown
Member Author

@JanJakes JanJakes May 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in f9172e1.

self::EOF === $this->token_type
|| ( null === $this->token_type && $this->bytes_already_read > 0 )
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't EOF cover that?

Copy link
Copy Markdown
Member Author

@JanJakes JanJakes May 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in f9172e1. EOF and the second arm catch different cases: self::EOF is set when read_next_token() finds no byte left to read at the start of a token — the `?? null` lookup past the end of the input yields PHP null, not a literal NUL ("\0") character (clean end-of-input). The null === $this->token_type && $this->bytes_already_read > 0 arm catches the case where read_next_token() returned null mid-stream because of invalid input. The > 0 guard keeps the very first iteration alive — at that point $this->token_type is still null because nothing has been read yet, not because we've failed.

) {
$this->token_type = null;
break;
}

// Skip leading whitespace inline for optimal performance.
$this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read );

do {
$this->token_starts_at = $this->bytes_already_read;
$this->token_type = $this->read_next_token();
} while (
self::WHITESPACE === $this->token_type
|| self::COMMENT === $this->token_type
|| self::MYSQL_COMMENT_START === $this->token_type
|| self::MYSQL_COMMENT_END === $this->token_type
);

if ( null === $this->token_type ) {
break;
}

$tokens[] = new WP_MySQL_Token(
$this->token_type,
$this->token_starts_at,
$this->bytes_already_read - $this->token_starts_at,
$this->sql,
$no_backslash_escapes_sql_mode_set
);

if ( self::EOF === $this->token_type ) {
$this->token_type = null;
break;
}
}
return $tokens;
}
Expand Down Expand Up @@ -2354,20 +2406,60 @@ private function read_next_token(): ?int {
$byte = $this->sql[ $this->bytes_already_read ] ?? null;
$next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null;

if ( "'" === $byte || '"' === $byte || '`' === $byte ) {
// A map for a single-byte symbol fast path.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice

static $single_byte_ops = array(
'(' => self::OPEN_PAR_SYMBOL,
')' => self::CLOSE_PAR_SYMBOL,
',' => self::COMMA_SYMBOL,
';' => self::SEMICOLON_SYMBOL,
'+' => self::PLUS_OPERATOR,
'~' => self::BITWISE_NOT_OPERATOR,
'%' => self::MOD_OPERATOR,
'^' => self::BITWISE_XOR_OPERATOR,
'?' => self::PARAM_MARKER,
'{' => self::OPEN_CURLY_SYMBOL,
'}' => self::CLOSE_CURLY_SYMBOL,
'=' => self::EQUAL_OPERATOR,
);

// Fast path for keywords and identifiers.
// `$byte > "\x7F"` catches UTF-8 multi-byte starters (U+0080-U+FFFF).
// `"'" !== $next_byte` defers x'..', n'..' and similar special
// literals to their dedicated branches below; only single quotes
// form those, regardless of SQL mode.
if (
(
( $byte >= 'a' && $byte <= 'z' )
|| ( $byte >= 'A' && $byte <= 'Z' )
|| $byte > "\x7F"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd leave a comment on why \x7F is special here

Copy link
Copy Markdown
Member Author

@JanJakes JanJakes May 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in f9172e1.

)
&& "'" !== $next_byte
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why just ' and not "? Would any quotes-related sql mode/session options have impact here?

Copy link
Copy Markdown
Member Author

@JanJakes JanJakes May 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in f9172e1.

) {
$started_at = $this->bytes_already_read;
$type = $this->read_identifier();
if ( self::IDENTIFIER === $type ) {
// When preceded by a dot, it is always an identifier.
if ( $started_at > 0 && '.' === $this->sql[ $started_at - 1 ] ) {
$type = self::IDENTIFIER;
} else {
$type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() );
}
}
} elseif ( null !== $byte && isset( $single_byte_ops[ $byte ] ) ) {
// Fast path for single-byte symbols.
$this->bytes_already_read += 1;
$type = $single_byte_ops[ $byte ];
} elseif ( "'" === $byte || '"' === $byte || '`' === $byte ) {
$type = $this->read_quoted_text();
} elseif ( null !== $byte && strspn( $byte, self::DIGIT_MASK ) > 0 ) {
} elseif ( null !== $byte && $byte >= '0' && $byte <= '9' ) {
$type = $this->read_number();
} elseif ( '.' === $byte ) {
if ( null !== $next_byte && strspn( $next_byte, self::DIGIT_MASK ) > 0 ) {
if ( null !== $next_byte && $next_byte >= '0' && $next_byte <= '9' ) {
$type = $this->read_number();
} else {
$this->bytes_already_read += 1;
$type = self::DOT_SYMBOL;
}
} elseif ( '=' === $byte ) {
$this->bytes_already_read += 1;
$type = self::EQUAL_OPERATOR;
} elseif ( ':' === $byte ) {
$this->bytes_already_read += 1; // Consume the ':'.
if ( '=' === $next_byte ) {
Expand Down Expand Up @@ -2414,14 +2506,17 @@ private function read_next_token(): ?int {
} else {
$type = self::LOGICAL_NOT_OPERATOR;
}
} elseif ( '+' === $byte ) {
$this->bytes_already_read += 1;
$type = self::PLUS_OPERATOR;
} elseif ( '-' === $byte ) {
$third_byte = $this->sql[ $this->bytes_already_read + 2 ] ?? null;
if (
'-' === $next_byte
&& $this->bytes_already_read + 2 < strlen( $this->sql )
&& strspn( $this->sql[ $this->bytes_already_read + 2 ], self::WHITESPACE_MASK ) > 0
&& (
' ' === $third_byte
|| "\t" === $third_byte
|| "\n" === $third_byte
|| "\r" === $third_byte
|| "\f" === $third_byte
)
) {
$type = $this->read_line_comment();
} elseif ( '>' === $next_byte ) {
Expand Down Expand Up @@ -2466,9 +2561,6 @@ private function read_next_token(): ?int {
$this->bytes_already_read += 1;
$type = self::DIV_OPERATOR;
}
} elseif ( '%' === $byte ) {
$this->bytes_already_read += 1;
$type = self::MOD_OPERATOR;
} elseif ( '&' === $byte ) {
$this->bytes_already_read += 1; // Consume the '&'.
if ( '&' === $next_byte ) {
Expand All @@ -2477,9 +2569,6 @@ private function read_next_token(): ?int {
} else {
$type = self::BITWISE_AND_OPERATOR;
}
} elseif ( '^' === $byte ) {
$this->bytes_already_read += 1;
$type = self::BITWISE_XOR_OPERATOR;
} elseif ( '|' === $byte ) {
$this->bytes_already_read += 1; // Consume the '|'.
if ( '|' === $next_byte ) {
Expand All @@ -2490,27 +2579,6 @@ private function read_next_token(): ?int {
} else {
$type = self::BITWISE_OR_OPERATOR;
}
} elseif ( '~' === $byte ) {
$this->bytes_already_read += 1;
$type = self::BITWISE_NOT_OPERATOR;
} elseif ( ',' === $byte ) {
$this->bytes_already_read += 1;
$type = self::COMMA_SYMBOL;
} elseif ( ';' === $byte ) {
$this->bytes_already_read += 1;
$type = self::SEMICOLON_SYMBOL;
} elseif ( '(' === $byte ) {
$this->bytes_already_read += 1;
$type = self::OPEN_PAR_SYMBOL;
} elseif ( ')' === $byte ) {
$this->bytes_already_read += 1;
$type = self::CLOSE_PAR_SYMBOL;
} elseif ( '{' === $byte ) {
$this->bytes_already_read += 1;
$type = self::OPEN_CURLY_SYMBOL;
} elseif ( '}' === $byte ) {
$this->bytes_already_read += 1;
$type = self::CLOSE_CURLY_SYMBOL;
} elseif ( '@' === $byte ) {
$this->bytes_already_read += 1; // Consume the '@'.

Expand All @@ -2534,9 +2602,6 @@ private function read_next_token(): ?int {
$type = self::AT_SIGN_SYMBOL;
}
}
} elseif ( '?' === $byte ) {
$this->bytes_already_read += 1;
$type = self::PARAM_MARKER;
} elseif ( '\\' === $byte ) {
$this->bytes_already_read += 1; // Consume the '\'.
if ( 'N' === $next_byte ) {
Expand All @@ -2547,7 +2612,13 @@ private function read_next_token(): ?int {
}
} elseif ( '#' === $byte ) {
$type = $this->read_line_comment();
} elseif ( null !== $byte && strspn( $byte, self::WHITESPACE_MASK ) > 0 ) {
} elseif (
' ' === $byte
Copy link
Copy Markdown
Collaborator

@adamziel adamziel May 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would array + isset() be faster?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Marginally faster, but this branch rarely fires. next_token() and remaining_tokens() inline-skip whitespace before calling read_next_token() (commit f5b8932), so this arm only handles whitespace that directly follows a comment token: the do/while loop re-enters read_next_token() after a comment without repeating the inline skip. Keeping the === chain for consistency with the rest of the dispatch.

|| "\t" === $byte
|| "\n" === $byte
|| "\r" === $byte
|| "\f" === $byte
) {
$this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read );
$type = self::WHITESPACE;
} elseif ( ( 'x' === $byte || 'X' === $byte || 'b' === $byte || 'B' === $byte ) && "'" === $next_byte ) {
Expand All @@ -2561,13 +2632,9 @@ private function read_next_token(): ?int {
} elseif ( null === $byte ) {
$type = self::EOF;
} else {
$started_at = $this->bytes_already_read;
$type = $this->read_identifier();
$type = $this->read_identifier();
if ( self::IDENTIFIER === $type ) {
// When preceded by a dot, it is always an identifier.
if ( $started_at > 0 && '.' === $this->sql[ $started_at - 1 ] ) {
$type = self::IDENTIFIER;
} elseif ( '_' === $byte && isset( self::UNDERSCORE_CHARSETS[ strtolower( $this->get_current_token_bytes() ) ] ) ) {
if ( '_' === $byte && isset( self::UNDERSCORE_CHARSETS[ strtolower( $this->get_current_token_bytes() ) ] ) ) {
$type = self::UNDERSCORE_CHARSET;
} else {
$type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() );
Expand Down Expand Up @@ -2675,7 +2742,7 @@ private function read_number(): ?int {
'0' === $byte
&& 'x' === $next_byte
&& null !== $third_byte
&& strspn( $third_byte, self::HEX_DIGIT_MASK ) > 0
&& false !== strpos( self::HEX_DIGIT_MASK, $third_byte )
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

clever

)
// HEX number in the form of x'N' or X'N'.
|| ( ( 'x' === $byte || 'X' === $byte ) && "'" === $next_byte )
Expand All @@ -2685,7 +2752,7 @@ private function read_number(): ?int {
$this->bytes_already_read += strspn( $this->sql, self::HEX_DIGIT_MASK, $this->bytes_already_read );
if ( $is_quoted ) {
if (
$this->bytes_already_read >= strlen( $this->sql )
$this->bytes_already_read >= $this->sql_length
|| "'" !== $this->sql[ $this->bytes_already_read ]
) {
return null; // Invalid input.
Expand All @@ -2708,7 +2775,7 @@ private function read_number(): ?int {
$this->bytes_already_read += strspn( $this->sql, '01', $this->bytes_already_read );
if ( $is_quoted ) {
if (
$this->bytes_already_read >= strlen( $this->sql )
$this->bytes_already_read >= $this->sql_length
|| "'" !== $this->sql[ $this->bytes_already_read ]
) {
return null; // Invalid input.
Expand Down Expand Up @@ -2737,11 +2804,12 @@ private function read_number(): ?int {
( 'e' === $byte || 'E' === $byte )
&& null !== $next_byte
&& (
strspn( $next_byte, self::DIGIT_MASK ) > 0
( $next_byte >= '0' && $next_byte <= '9' )
|| (
( '+' === $next_byte || '-' === $next_byte )
&& $this->bytes_already_read + 2 < strlen( $this->sql )
&& strspn( $this->sql[ $this->bytes_already_read + 2 ], self::DIGIT_MASK ) > 0
&& $this->bytes_already_read + 2 < $this->sql_length
&& $this->sql[ $this->bytes_already_read + 2 ] >= '0'
&& $this->sql[ $this->bytes_already_read + 2 ] <= '9'
)
);
if ( $has_exponent ) {
Expand Down Expand Up @@ -2838,12 +2906,11 @@ private function read_quoted_text(): ?int {
// in which case the escape sequence is consumed and the loop continues.
$at = $this->bytes_already_read;
while ( true ) {
$at += strcspn( $this->sql, $quote, $at );

// Unclosed string - unexpected EOF.
if ( ( $this->sql[ $at ] ?? null ) !== $quote ) {
$quote_at = strpos( $this->sql, $quote, $at );
if ( false === $quote_at ) {
return null; // Invalid input.
}
$at = $quote_at;

/*
* By default, quotes can be escaped with a "\".
Expand All @@ -2853,9 +2920,17 @@ private function read_quoted_text(): ?int {
* The quote is escaped only when the number of preceding backslashes
* is odd - "\" is an escape sequence, "\\" is an escaped backslash,
* "\\\" is an escaped backslash and an escape sequence, and so on.
*
* The `($at - $i - 1) >= 0` guard prevents PHP's negative-string-
* offset wraparound (PHP 7.1+) when the closing-quote candidate
* sits at the very start of the input. The `?? null` is an extra
* belt-and-suspenders guard against positive out-of-range offsets.
*/
if ( ! $no_backslash_escapes ) {
for ( $i = 0; ( $at - $i - 1 ) >= 0 && '\\' === $this->sql[ $at - $i - 1 ]; $i += 1 );
$i = 0;
while ( ( $at - $i - 1 ) >= 0 && '\\' === ( $this->sql[ $at - $i - 1 ] ?? null ) ) {
$i += 1;
}
if ( 1 === $i % 2 ) {
$at += 1;
continue;
Expand Down Expand Up @@ -2920,17 +2995,11 @@ private function read_mysql_comment(): int {
}

private function read_comment_content(): void {
while ( true ) {
$this->bytes_already_read += strcspn( $this->sql, '*', $this->bytes_already_read );
$this->bytes_already_read += 1; // Consume the '*'.
$byte = $this->sql[ $this->bytes_already_read ] ?? null;
if ( null === $byte ) {
break;
}
if ( '/' === $byte ) {
$this->bytes_already_read += 1; // Consume the '/'.
break;
}
$comment_end = strpos( $this->sql, '*/', $this->bytes_already_read );
if ( false === $comment_end ) {
$this->bytes_already_read = $this->sql_length;
} else {
$this->bytes_already_read = $comment_end + 2;
}
}

Expand Down
6 changes: 5 additions & 1 deletion packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ class WP_MySQL_Parser extends WP_Parser {
* @param array<WP_Parser_Token> $tokens The parser tokens.
*/
public function reset_tokens( array $tokens ): void {
$this->token_count = count( $tokens );
// Maintain the end-of-input sentinel that parse_recursive() relies on.
// See WP_Parser::__construct for the invariants.
$tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' );
$this->tokens = $tokens;
$this->position = 0;
$this->current_ast = null;
Expand All @@ -40,7 +44,7 @@ public function reset_tokens( array $tokens ): void {
* @return bool Whether a query was successfully parsed.
*/
public function next_query(): bool {
if ( $this->position >= count( $this->tokens ) ) {
if ( $this->position >= $this->token_count ) {
return false;
}
$this->current_ast = $this->parse();
Expand Down
Loading
Loading