diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php index 10ecd90a..e578fefb 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-lexer.php @@ -2111,6 +2111,13 @@ class WP_MySQL_Lexer { */ private $sql; + /** + * Byte length of the SQL payload. + * + * @var int + */ + private $sql_length; + /** * The version of the MySQL server that the SQL payload is intended for. * @@ -2189,6 +2196,7 @@ public function __construct( array $sql_modes = array() ) { $this->sql = $sql; + $this->sql_length = strlen( $sql ); $this->mysql_version = $mysql_version; foreach ( $sql_modes as $sql_mode ) { @@ -2227,6 +2235,9 @@ public function next_token(): bool { return false; } + // Skip leading whitespace inline for optimal performance. + $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); + do { $this->token_starts_at = $this->bytes_already_read; $this->token_type = $this->read_next_token(); @@ -2284,10 +2295,51 @@ public function get_token(): ?WP_MySQL_Token { * @return WP_MySQL_Token[] An array of token objects representing the remaining tokens. */ public function remaining_tokens(): array { - $tokens = array(); - while ( true === $this->next_token() ) { - $token = $this->get_token(); - $tokens[] = $token; + $tokens = array(); + $no_backslash_escapes_sql_mode_set = $this->is_sql_mode_active( + self::SQL_MODE_NO_BACKSLASH_ESCAPES + ); + + while ( true ) { + // Bail on EOF, or on a null token type once at least one byte has + // been consumed (read_next_token() hit invalid input mid-stream). + if ( + self::EOF === $this->token_type + || ( null === $this->token_type && $this->bytes_already_read > 0 ) + ) { + $this->token_type = null; + break; + } + + // Skip leading whitespace inline for optimal performance. 
+ $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); + + do { + $this->token_starts_at = $this->bytes_already_read; + $this->token_type = $this->read_next_token(); + } while ( + self::WHITESPACE === $this->token_type + || self::COMMENT === $this->token_type + || self::MYSQL_COMMENT_START === $this->token_type + || self::MYSQL_COMMENT_END === $this->token_type + ); + + if ( null === $this->token_type ) { + break; + } + + $tokens[] = new WP_MySQL_Token( + $this->token_type, + $this->token_starts_at, + $this->bytes_already_read - $this->token_starts_at, + $this->sql, + $no_backslash_escapes_sql_mode_set + ); + + if ( self::EOF === $this->token_type ) { + $this->token_type = null; + break; + } } return $tokens; } @@ -2354,20 +2406,60 @@ private function read_next_token(): ?int { $byte = $this->sql[ $this->bytes_already_read ] ?? null; $next_byte = $this->sql[ $this->bytes_already_read + 1 ] ?? null; - if ( "'" === $byte || '"' === $byte || '`' === $byte ) { + // A map for a single-byte symbol fast path. + static $single_byte_ops = array( + '(' => self::OPEN_PAR_SYMBOL, + ')' => self::CLOSE_PAR_SYMBOL, + ',' => self::COMMA_SYMBOL, + ';' => self::SEMICOLON_SYMBOL, + '+' => self::PLUS_OPERATOR, + '~' => self::BITWISE_NOT_OPERATOR, + '%' => self::MOD_OPERATOR, + '^' => self::BITWISE_XOR_OPERATOR, + '?' => self::PARAM_MARKER, + '{' => self::OPEN_CURLY_SYMBOL, + '}' => self::CLOSE_CURLY_SYMBOL, + '=' => self::EQUAL_OPERATOR, + ); + + // Fast path for keywords and identifiers. + // `$byte > "\x7F"` catches UTF-8 multi-byte starters (U+0080-U+FFFF). + // `"'" !== $next_byte` defers x'..', n'..' and similar special + // literals to their dedicated branches below; only single quotes + // form those, regardless of SQL mode. 
+ if ( + ( + ( $byte >= 'a' && $byte <= 'z' ) + || ( $byte >= 'A' && $byte <= 'Z' ) + || $byte > "\x7F" + ) + && "'" !== $next_byte + ) { + $started_at = $this->bytes_already_read; + $type = $this->read_identifier(); + if ( self::IDENTIFIER === $type ) { + // When preceded by a dot, it is always an identifier. + if ( $started_at > 0 && '.' === $this->sql[ $started_at - 1 ] ) { + $type = self::IDENTIFIER; + } else { + $type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() ); + } + } + } elseif ( null !== $byte && isset( $single_byte_ops[ $byte ] ) ) { + // Fast path for single-byte symbols. + $this->bytes_already_read += 1; + $type = $single_byte_ops[ $byte ]; + } elseif ( "'" === $byte || '"' === $byte || '`' === $byte ) { $type = $this->read_quoted_text(); - } elseif ( null !== $byte && strspn( $byte, self::DIGIT_MASK ) > 0 ) { + } elseif ( null !== $byte && $byte >= '0' && $byte <= '9' ) { $type = $this->read_number(); } elseif ( '.' === $byte ) { - if ( null !== $next_byte && strspn( $next_byte, self::DIGIT_MASK ) > 0 ) { + if ( null !== $next_byte && $next_byte >= '0' && $next_byte <= '9' ) { $type = $this->read_number(); } else { $this->bytes_already_read += 1; $type = self::DOT_SYMBOL; } - } elseif ( '=' === $byte ) { - $this->bytes_already_read += 1; - $type = self::EQUAL_OPERATOR; } elseif ( ':' === $byte ) { $this->bytes_already_read += 1; // Consume the ':'. if ( '=' === $next_byte ) { @@ -2414,14 +2506,17 @@ private function read_next_token(): ?int { } else { $type = self::LOGICAL_NOT_OPERATOR; } - } elseif ( '+' === $byte ) { - $this->bytes_already_read += 1; - $type = self::PLUS_OPERATOR; } elseif ( '-' === $byte ) { + $third_byte = $this->sql[ $this->bytes_already_read + 2 ] ?? 
null; if ( '-' === $next_byte - && $this->bytes_already_read + 2 < strlen( $this->sql ) - && strspn( $this->sql[ $this->bytes_already_read + 2 ], self::WHITESPACE_MASK ) > 0 + && ( + ' ' === $third_byte + || "\t" === $third_byte + || "\n" === $third_byte + || "\r" === $third_byte + || "\f" === $third_byte + ) ) { $type = $this->read_line_comment(); } elseif ( '>' === $next_byte ) { @@ -2466,9 +2561,6 @@ private function read_next_token(): ?int { $this->bytes_already_read += 1; $type = self::DIV_OPERATOR; } - } elseif ( '%' === $byte ) { - $this->bytes_already_read += 1; - $type = self::MOD_OPERATOR; } elseif ( '&' === $byte ) { $this->bytes_already_read += 1; // Consume the '&'. if ( '&' === $next_byte ) { @@ -2477,9 +2569,6 @@ private function read_next_token(): ?int { } else { $type = self::BITWISE_AND_OPERATOR; } - } elseif ( '^' === $byte ) { - $this->bytes_already_read += 1; - $type = self::BITWISE_XOR_OPERATOR; } elseif ( '|' === $byte ) { $this->bytes_already_read += 1; // Consume the '|'. if ( '|' === $next_byte ) { @@ -2490,27 +2579,6 @@ private function read_next_token(): ?int { } else { $type = self::BITWISE_OR_OPERATOR; } - } elseif ( '~' === $byte ) { - $this->bytes_already_read += 1; - $type = self::BITWISE_NOT_OPERATOR; - } elseif ( ',' === $byte ) { - $this->bytes_already_read += 1; - $type = self::COMMA_SYMBOL; - } elseif ( ';' === $byte ) { - $this->bytes_already_read += 1; - $type = self::SEMICOLON_SYMBOL; - } elseif ( '(' === $byte ) { - $this->bytes_already_read += 1; - $type = self::OPEN_PAR_SYMBOL; - } elseif ( ')' === $byte ) { - $this->bytes_already_read += 1; - $type = self::CLOSE_PAR_SYMBOL; - } elseif ( '{' === $byte ) { - $this->bytes_already_read += 1; - $type = self::OPEN_CURLY_SYMBOL; - } elseif ( '}' === $byte ) { - $this->bytes_already_read += 1; - $type = self::CLOSE_CURLY_SYMBOL; } elseif ( '@' === $byte ) { $this->bytes_already_read += 1; // Consume the '@'. 
@@ -2534,9 +2602,6 @@ private function read_next_token(): ?int { $type = self::AT_SIGN_SYMBOL; } } - } elseif ( '?' === $byte ) { - $this->bytes_already_read += 1; - $type = self::PARAM_MARKER; } elseif ( '\\' === $byte ) { $this->bytes_already_read += 1; // Consume the '\'. if ( 'N' === $next_byte ) { @@ -2547,7 +2612,13 @@ private function read_next_token(): ?int { } } elseif ( '#' === $byte ) { $type = $this->read_line_comment(); - } elseif ( null !== $byte && strspn( $byte, self::WHITESPACE_MASK ) > 0 ) { + } elseif ( + ' ' === $byte + || "\t" === $byte + || "\n" === $byte + || "\r" === $byte + || "\f" === $byte + ) { $this->bytes_already_read += strspn( $this->sql, self::WHITESPACE_MASK, $this->bytes_already_read ); $type = self::WHITESPACE; } elseif ( ( 'x' === $byte || 'X' === $byte || 'b' === $byte || 'B' === $byte ) && "'" === $next_byte ) { @@ -2561,13 +2632,9 @@ private function read_next_token(): ?int { } elseif ( null === $byte ) { $type = self::EOF; } else { - $started_at = $this->bytes_already_read; - $type = $this->read_identifier(); + $type = $this->read_identifier(); if ( self::IDENTIFIER === $type ) { - // When preceded by a dot, it is always an identifier. - if ( $started_at > 0 && '.' === $this->sql[ $started_at - 1 ] ) { - $type = self::IDENTIFIER; - } elseif ( '_' === $byte && isset( self::UNDERSCORE_CHARSETS[ strtolower( $this->get_current_token_bytes() ) ] ) ) { + if ( '_' === $byte && isset( self::UNDERSCORE_CHARSETS[ strtolower( $this->get_current_token_bytes() ) ] ) ) { $type = self::UNDERSCORE_CHARSET; } else { $type = $this->determine_identifier_or_keyword_type( $this->get_current_token_bytes() ); @@ -2675,7 +2742,7 @@ private function read_number(): ?int { '0' === $byte && 'x' === $next_byte && null !== $third_byte - && strspn( $third_byte, self::HEX_DIGIT_MASK ) > 0 + && false !== strpos( self::HEX_DIGIT_MASK, $third_byte ) ) // HEX number in the form of x'N' or X'N'. 
|| ( ( 'x' === $byte || 'X' === $byte ) && "'" === $next_byte ) @@ -2685,7 +2752,7 @@ private function read_number(): ?int { $this->bytes_already_read += strspn( $this->sql, self::HEX_DIGIT_MASK, $this->bytes_already_read ); if ( $is_quoted ) { if ( - $this->bytes_already_read >= strlen( $this->sql ) + $this->bytes_already_read >= $this->sql_length || "'" !== $this->sql[ $this->bytes_already_read ] ) { return null; // Invalid input. @@ -2708,7 +2775,7 @@ private function read_number(): ?int { $this->bytes_already_read += strspn( $this->sql, '01', $this->bytes_already_read ); if ( $is_quoted ) { if ( - $this->bytes_already_read >= strlen( $this->sql ) + $this->bytes_already_read >= $this->sql_length || "'" !== $this->sql[ $this->bytes_already_read ] ) { return null; // Invalid input. @@ -2737,11 +2804,12 @@ private function read_number(): ?int { ( 'e' === $byte || 'E' === $byte ) && null !== $next_byte && ( - strspn( $next_byte, self::DIGIT_MASK ) > 0 + ( $next_byte >= '0' && $next_byte <= '9' ) || ( ( '+' === $next_byte || '-' === $next_byte ) - && $this->bytes_already_read + 2 < strlen( $this->sql ) - && strspn( $this->sql[ $this->bytes_already_read + 2 ], self::DIGIT_MASK ) > 0 + && $this->bytes_already_read + 2 < $this->sql_length + && $this->sql[ $this->bytes_already_read + 2 ] >= '0' + && $this->sql[ $this->bytes_already_read + 2 ] <= '9' ) ); if ( $has_exponent ) { @@ -2838,12 +2906,11 @@ private function read_quoted_text(): ?int { // in which case the escape sequence is consumed and the loop continues. $at = $this->bytes_already_read; while ( true ) { - $at += strcspn( $this->sql, $quote, $at ); - - // Unclosed string - unexpected EOF. - if ( ( $this->sql[ $at ] ?? null ) !== $quote ) { + $quote_at = strpos( $this->sql, $quote, $at ); + if ( false === $quote_at ) { return null; // Invalid input. } + $at = $quote_at; /* * By default, quotes can be escaped with a "\". 
@@ -2853,9 +2920,17 @@ private function read_quoted_text(): ?int { * The quote is escaped only when the number of preceding backslashes * is odd - "\" is an escape sequence, "\\" is an escaped backslash, * "\\\" is an escaped backslash and an escape sequence, and so on. + * + * The `($at - $i - 1) >= 0` guard prevents PHP's negative-string- + * offset wraparound (PHP 7.1+) when the closing-quote candidate + * sits at the very start of the input. The `?? null` covers + * positive out-of-range indexes belt-and-suspenders. */ if ( ! $no_backslash_escapes ) { - for ( $i = 0; ( $at - $i - 1 ) >= 0 && '\\' === $this->sql[ $at - $i - 1 ]; $i += 1 ); + $i = 0; + while ( ( $at - $i - 1 ) >= 0 && '\\' === ( $this->sql[ $at - $i - 1 ] ?? null ) ) { + $i += 1; + } if ( 1 === $i % 2 ) { $at += 1; continue; @@ -2920,17 +2995,11 @@ private function read_mysql_comment(): int { } private function read_comment_content(): void { - while ( true ) { - $this->bytes_already_read += strcspn( $this->sql, '*', $this->bytes_already_read ); - $this->bytes_already_read += 1; // Consume the '*'. - $byte = $this->sql[ $this->bytes_already_read ] ?? null; - if ( null === $byte ) { - break; - } - if ( '/' === $byte ) { - $this->bytes_already_read += 1; // Consume the '/'. - break; - } + $comment_end = strpos( $this->sql, '*/', $this->bytes_already_read ); + if ( false === $comment_end ) { + $this->bytes_already_read = $this->sql_length; + } else { + $this->bytes_already_read = $comment_end + 2; } } diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php index 69282b9c..bbae3efd 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-parser.php @@ -14,6 +14,10 @@ class WP_MySQL_Parser extends WP_Parser { * @param array $tokens The parser tokens. 
*/ public function reset_tokens( array $tokens ): void { + $this->token_count = count( $tokens ); + // Maintain the end-of-input sentinel that parse_recursive() relies on. + // See WP_Parser::__construct for the invariants. + $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); $this->tokens = $tokens; $this->position = 0; $this->current_ast = null; @@ -40,7 +44,7 @@ public function reset_tokens( array $tokens ): void { * @return bool Whether a query was successfully parsed. */ public function next_query(): bool { - if ( $this->position >= count( $this->tokens ) ) { + if ( $this->position >= $this->token_count ) { return false; } $this->current_ast = $this->parse(); diff --git a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php index 1fb25ab4..0840bc2f 100644 --- a/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php +++ b/packages/mysql-on-sqlite/src/mysql/class-wp-mysql-token.php @@ -30,7 +30,11 @@ public function __construct( string $input, bool $sql_mode_no_backslash_escapes_enabled ) { - parent::__construct( $id, $start, $length, $input ); + $this->id = $id; + $this->start = $start; + $this->length = $length; + $this->input = $input; + $this->sql_mode_no_backslash_escapes_enabled = $sql_mode_no_backslash_escapes_enabled; } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php index 9bf30b97..6e54dc91 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-grammar.php @@ -28,12 +28,75 @@ class WP_Parser_Grammar { */ public $rules; public $rule_names; - public $fragment_ids; + public $fragment_ids = array(); + + /** + * Per-rule branch selector keyed by the next token id. 
+ * + * When set, `$branches_for_token[$rule_id][$token_id]` is the ordered list + * of candidate branch symbol sequences (drawn from `$rules[$rule_id]`) + * that can possibly match when the current token has the given id. + * Nullable branches appear in every entry. + * + * If an entry does not exist for the current token, `$nullable_branches` + * is consulted. If neither has an entry for this rule, the rule cannot + * match and the parser returns immediately. + * + * @var array<int, array<int, int[][]>> + */ + public $branches_for_token = array(); + + /** + * Per-rule marker indicating the rule has at least one nullable branch. + * + * @var array + */ + public $nullable_branches = array(); + + /** + * Per-rule flag indicating every (rule, token) selector entry points + * to exactly one branch. The parser uses this to skip the outer + * foreach when a single candidate is the only possibility. + * + * @var array + */ + public $single_candidate_rules = array(); + + /** + * Backward-compatible view of `$branches_for_token` for the native (Rust) + * parser bridge. + * + * Trunk's `mysql-rust-bridge.php` exports this property to the native + * extension, which uses it for early-bailout the same way the previous + * pure-PHP parser did: `lookahead_is_match_possible[$rule_id][$tid]` + * means "this rule can possibly match when the next token is `$tid`". + * `EMPTY_RULE_ID` (0) marks the rule as having a nullable branch. + * + * The performance branch replaced this map with the more precise + * `$branches_for_token` + `$nullable_branches` pair on the parser hot + * path. The view is kept around so the native extension keeps working + * without a Rust-side change. It is not consulted by the pure-PHP parser. + * + * @var array<int, array<int, true>> + */ public $lookahead_is_match_possible = array(); + public $lowest_non_terminal_id; public $highest_terminal_id; public $native_grammar; + /** + * Memoized rule-id lookups, keyed by rule name. 
+ * + * `get_rule_id()` is a linear `array_search` over `$rule_names` and + * costs a few microseconds per call on the MySQL grammar. The parser + * looks up its start rule and the `selectStatement` rule on a hot path, + * so the results are memoized via `get_or_cache_rule_id()`. + * + * @var array + */ + private $cached_rule_ids = array(); + public function __construct( array $rules ) { $this->inflate( $rules ); } @@ -46,6 +109,25 @@ public function get_rule_id( $rule_name ) { return array_search( $rule_name, $this->rule_names, true ); } + /** + * Return the rule id for a given rule name, memoizing the result. + * + * Equivalent to `get_rule_id()` but caches the lookup so repeated + * queries for the same rule name (typically the start rule and a few + * grammar-specific rules consulted on the parser hot path) avoid + * the linear scan over `$rule_names`. Returns `false` for unknown + * rule names, mirroring `get_rule_id()`. + * + * @param string $rule_name + * @return int|false + */ + public function get_or_cache_rule_id( $rule_name ) { + if ( ! array_key_exists( $rule_name, $this->cached_rule_ids ) ) { + $this->cached_rule_ids[ $rule_name ] = $this->get_rule_id( $rule_name ); + } + return $this->cached_rule_ids[ $rule_name ]; + } + /** * Inflate the grammar to an internal representation optimized for parsing. * @@ -57,8 +139,8 @@ private function inflate( $grammar ) { $this->highest_terminal_id = $this->lowest_non_terminal_id - 1; foreach ( $grammar['rules_names'] as $rule_index => $rule_name ) { - $this->rule_names[ $rule_index + $grammar['rules_offset'] ] = $rule_name; - $this->rules[ $rule_index + $grammar['rules_offset'] ] = array(); + $rule_id = $rule_index + $grammar['rules_offset']; + $this->rule_names[ $rule_id ] = $rule_name; /** * Treat all intermediate rules as fragments to inline before returning @@ -76,7 +158,7 @@ private function inflate( $grammar ) { * They are prefixed with a "%" to be distinguished from the original rules. 
*/ if ( '%' === $rule_name[0] ) { - $this->fragment_ids[ $rule_index + $grammar['rules_offset'] ] = true; + $this->fragment_ids[ $rule_id ] = true; } } @@ -86,55 +168,309 @@ private function inflate( $grammar ) { $this->rules[ $rule_id ] = $branches; } - /** - * Compute a rule => [token => true] lookup table for each rule - * that starts with a terminal OR with another rule that already - * has a lookahead mapping. - * - * This is similar to left-factoring the grammar, even if not quite - * the same. - * - * This enables us to quickly bail out from checking branches that - * cannot possibly match the current token. This increased the parser - * speed by a whopping 80%! - * - * @TODO: Explore these possible next steps: - * - * * Compute a rule => [token => branch[]] list lookup table and only - * process the branches that have a chance of matching the current token. - * * Actually left-factor the grammar as much as possible. This, however, - * could inflate the serialized grammar size. - */ - // 5 iterations seem to give us all the speed gains we can get from this. - for ( $i = 0; $i < 5; $i++ ) { - foreach ( $grammar['grammar'] as $rule_index => $branches ) { - $rule_id = $rule_index + $grammar['rules_offset']; - if ( isset( $this->lookahead_is_match_possible[ $rule_id ] ) ) { + $this->inline_single_branch_fragments(); + $this->strip_epsilon_markers(); + $this->build_branch_selectors(); + } + + /** + * Remove explicit `EMPTY_RULE_ID` markers from branches. + * + * The epsilon marker is a zero-width, always-matching symbol used in the + * grammar to express optional productions. At parse time it would still + * be walked and "continued" over for no effect, so stripping it ahead of + * time removes a per-symbol branch in the hot loop. + * + * A pure-epsilon branch (`[EMPTY_RULE_ID]`) becomes an empty branch (`[]`) + * which the parser already handles: the inner symbol loop does nothing and + * the rule returns a successful empty match. 
+ */ + private function strip_epsilon_markers() { + foreach ( $this->rules as $rule_id => $branches ) { + foreach ( $branches as $i => $branch ) { + if ( in_array( self::EMPTY_RULE_ID, $branch, true ) ) { + $this->rules[ $rule_id ][ $i ] = array_values( + array_filter( + $branch, + static function ( $s ) { + return self::EMPTY_RULE_ID !== $s; + } + ) + ); + } + } + } + } + + /** + * Inline single-branch fragment rules into their call sites. + * + * The grammar contains many single-branch fragment rules that exist only + * to factor shared sub-sequences out of larger productions. At runtime + * the parser would descend into each such fragment via a recursive call + * just to walk the same symbol sequence and splice the results back into + * the parent. Expanding them in-place at build time eliminates that call + * chain without changing the resulting AST because fragment children are + * already flattened into the parent node. + * + * Fragments with two or more alternatives (e.g., `%EOF_zero_or_one`) are + * left intact because they represent real choices that must be evaluated + * against the current token. + */ + private function inline_single_branch_fragments() { + $rules = $this->rules; + $fragment_ids = $this->fragment_ids; + $low_nt = $this->lowest_non_terminal_id; + + // Precompute the set of single-branch fragments that are candidates + // for inlining. + $inlinable = array(); + foreach ( $fragment_ids as $rule_id => $_ ) { + if ( isset( $rules[ $rule_id ] ) && 1 === count( $rules[ $rule_id ] ) ) { + $inlinable[ $rule_id ] = true; + } + } + + // Depth-first expansion memoized per rule, with cycle detection. + $expanded = array(); + $visiting = array(); + $expand_branch = function ( array $branch ) use ( &$expand_branch, &$expanded, &$visiting, $rules, $low_nt, $inlinable ) { + $out = array(); + foreach ( $branch as $sym ) { + if ( $sym < $low_nt ) { + $out[] = $sym; + continue; + } + if ( ! 
isset( $inlinable[ $sym ] ) ) { + $out[] = $sym; continue; } - $rule_lookup = array(); - $first_symbol_can_be_expanded_to_all_terminals = true; + if ( isset( $visiting[ $sym ] ) ) { + // Cycle: leave the reference in place. + $out[] = $sym; + continue; + } + if ( ! isset( $expanded[ $sym ] ) ) { + $visiting[ $sym ] = true; + $expanded[ $sym ] = $expand_branch( $rules[ $sym ][0] ); + unset( $visiting[ $sym ] ); + } + foreach ( $expanded[ $sym ] as $s ) { + $out[] = $s; + } + } + return $out; + }; + + // Rewrite every rule's branches with fragments inlined. + foreach ( $this->rules as $rule_id => $branches ) { + $new_branches = array(); + foreach ( $branches as $branch ) { + $new_branches[] = $expand_branch( $branch ); + } + $this->rules[ $rule_id ] = $new_branches; + } + } + + /** + * Compute FIRST and NULLABLE sets for every non-terminal, then denormalize + * them into a per-rule map of `token_id => branch_index[]` so the parser + * can jump straight to the branches that can possibly match the current + * token. + * + * This replaces the previous coarse "can any branch match this token?" + * lookahead. On the MySQL corpus the fine-grained selector skips ~60% + * of the branch attempts that the parser used to try and fail. + */ + private function build_branch_selectors() { + $rules = $this->rules; + $low_nt = $this->lowest_non_terminal_id; + $empty_rule = self::EMPTY_RULE_ID; + $rule_ids = array_keys( $rules ); + $nullable = array(); + $first_sets = array(); + + foreach ( $rule_ids as $rule_id ) { + $nullable[ $rule_id ] = false; + $first_sets[ $rule_id ] = array(); + } + + // Iterate to fixpoint. FIRST and NULLABLE set monotonically grow. 
+ do { + $changed = false; + foreach ( $rule_ids as $rule_id ) { + $branches = $rules[ $rule_id ]; foreach ( $branches as $branch ) { - $terminals = false; - $branch_starts_with_terminal = $branch[0] < $this->lowest_non_terminal_id; - if ( $branch_starts_with_terminal ) { - $terminals = array( $branch[0] ); - } elseif ( isset( $this->lookahead_is_match_possible[ $branch[0] ] ) ) { - $terminals = array_keys( $this->lookahead_is_match_possible[ $branch[0] ] ); + $branch_nullable = true; + foreach ( $branch as $symbol ) { + if ( $empty_rule === $symbol ) { + // ε: contributes nothing to FIRST, stays nullable. + continue; + } + if ( $symbol < $low_nt ) { + // Terminal. + if ( ! isset( $first_sets[ $rule_id ][ $symbol ] ) ) { + $first_sets[ $rule_id ][ $symbol ] = true; + $changed = true; + } + $branch_nullable = false; + break; + } + // Non-terminal. + foreach ( $first_sets[ $symbol ] as $tid => $_ ) { + if ( ! isset( $first_sets[ $rule_id ][ $tid ] ) ) { + $first_sets[ $rule_id ][ $tid ] = true; + $changed = true; + } + } + if ( ! $nullable[ $symbol ] ) { + $branch_nullable = false; + break; + } + } + if ( $branch_nullable && ! $nullable[ $rule_id ] ) { + $nullable[ $rule_id ] = true; + $changed = true; } + } + } + } while ( $changed ); - if ( false === $terminals ) { - $first_symbol_can_be_expanded_to_all_terminals = false; + // Build per-(rule, token) branch indices. + foreach ( $rule_ids as $rule_id ) { + $branches = $rules[ $rule_id ]; + $selector = array(); + $nullable_branch_ids = array(); + foreach ( $branches as $idx => $branch ) { + $branch_first = array(); + $branch_nullable = true; + foreach ( $branch as $symbol ) { + if ( $empty_rule === $symbol ) { + continue; + } + if ( $symbol < $low_nt ) { + $branch_first[ $symbol ] = true; + $branch_nullable = false; + break; + } + foreach ( $first_sets[ $symbol ] as $tid => $_ ) { + $branch_first[ $tid ] = true; + } + if ( ! 
$nullable[ $symbol ] ) { + $branch_nullable = false; break; } - foreach ( $terminals as $terminal ) { - $rule_lookup[ $terminal ] = true; + } + foreach ( $branch_first as $tid => $_ ) { + $selector[ $tid ][] = $idx; + } + if ( $branch_nullable ) { + $nullable_branch_ids[] = $idx; + } + } + + // Nullable branches also match when the current token is not in + // any branch's FIRST set. Fold them into every populated entry + // so the runtime lookup is a single array access. + if ( $nullable_branch_ids ) { + $merged = array(); + foreach ( $selector as $tid => $idx_list ) { + $merged[ $tid ] = self::merge_sorted( $idx_list, $nullable_branch_ids ); + } + $selector = $merged; + $this->nullable_branches[ $rule_id ] = true; + } + if ( $selector ) { + // Expand branch indexes to the branch symbol sequences so + // the parser can foreach candidate branches without an extra + // $branches[$idx] indirection on every attempt. Many tokens + // inside the same rule end up pointing to the same branch-id + // list, so deduplicate by signature and let copy-on-write + // share one sequences array across all of them. + // + // Trade-off vs trunk: storing branch sequences inline (rather + // than just branch indexes plus the trunk lookahead bitmap) + // costs ~+16 MiB of grammar memory after dedup but eliminates + // the per-attempt $rules[$rule_id][$idx] indirection in the + // parser hot loop. The dedup itself is what keeps the cost at + // ~+16 MiB; without it the embedded table would be ~40 MB. 
+ $by_signature = array(); + $all_single_candidates = true; + foreach ( $selector as $tid => $idx_list ) { + if ( 1 !== count( $idx_list ) ) { + $all_single_candidates = false; + } + $sig = implode( ',', $idx_list ); + if ( isset( $by_signature[ $sig ] ) ) { + $selector[ $tid ] = $by_signature[ $sig ]; + } else { + $seqs = array(); + foreach ( $idx_list as $idx ) { + $seqs[] = $branches[ $idx ]; + } + $by_signature[ $sig ] = $seqs; + $selector[ $tid ] = $seqs; } } - if ( $first_symbol_can_be_expanded_to_all_terminals ) { - $this->lookahead_is_match_possible[ $rule_id ] = $rule_lookup; + $this->branches_for_token[ $rule_id ] = $selector; + if ( $all_single_candidates ) { + $this->single_candidate_rules[ $rule_id ] = true; } } } + + // Build the backward-compat lookahead view for the native parser + // bridge. See $lookahead_is_match_possible. + foreach ( $this->branches_for_token as $rule_id => $sel ) { + $entry = array(); + foreach ( $sel as $tid => $_ ) { + $entry[ $tid ] = true; + } + if ( isset( $this->nullable_branches[ $rule_id ] ) ) { + $entry[ self::EMPTY_RULE_ID ] = true; + } + $this->lookahead_is_match_possible[ $rule_id ] = $entry; + } + foreach ( $this->nullable_branches as $rule_id => $_ ) { + if ( ! isset( $this->lookahead_is_match_possible[ $rule_id ] ) ) { + $this->lookahead_is_match_possible[ $rule_id ] = array( + self::EMPTY_RULE_ID => true, + ); + } + } + } + + /** + * Merge two ascending int arrays into one ascending int array without + * duplicates. Preserves original branch order as required by the parser. 
+ * + * @param int[] $a + * @param int[] $b + * @return int[] + */ + private static function merge_sorted( array $a, array $b ): array { + $i = 0; + $j = 0; + $na = count( $a ); + $nb = count( $b ); + $out = array(); + while ( $i < $na && $j < $nb ) { + if ( $a[ $i ] < $b[ $j ] ) { + $out[] = $a[ $i++ ]; + } elseif ( $a[ $i ] > $b[ $j ] ) { + $out[] = $b[ $j++ ]; + } else { + $out[] = $a[ $i ]; + ++$i; + ++$j; + } + } + while ( $i < $na ) { + $out[] = $a[ $i++ ]; + } + while ( $j < $nb ) { + $out[] = $b[ $j++ ]; + } + return $out; } } diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php index b61f38d5..096f17fc 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-node.php @@ -15,11 +15,12 @@ class WP_Parser_Node { */ public $rule_id; public $rule_name; - protected $children = array(); + protected $children; - public function __construct( $rule_id, $rule_name ) { + public function __construct( $rule_id, $rule_name, array $children = array() ) { $this->rule_id = $rule_id; $this->rule_name = $rule_name; + $this->children = $children; } public function append_child( $node ) { @@ -108,7 +109,7 @@ public function merge_fragment( $node ) { * @return bool True if this node has any child nodes or tokens, false otherwise. */ public function has_child(): bool { - return count( $this->children ) > 0; + return ! empty( $this->children ); } /** diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php index b7726189..4132ba38 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser-token.php @@ -35,7 +35,7 @@ class WP_Parser_Token { * * @var string */ - private $input; + protected $input; /** * Constructor. 
diff --git a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php index 4436892f..03c00280 100644 --- a/packages/mysql-on-sqlite/src/parser/class-wp-parser.php +++ b/packages/mysql-on-sqlite/src/parser/class-wp-parser.php @@ -11,85 +11,177 @@ class WP_Parser { protected $grammar; protected $tokens; + protected $token_count; protected $position; + // Grammar data cached as instance fields so the hot path avoids an extra + // property hop via $this->grammar on every recursive call. + private $rule_names; + private $fragment_ids; + private $branches_for_token; + private $nullable_branches; + private $highest_terminal_id; + private $select_statement_rule_id; + private $single_candidate_rules; + public function __construct( WP_Parser_Grammar $grammar, array $tokens ) { - $this->grammar = $grammar; - $this->tokens = $tokens; - $this->position = 0; + $this->grammar = $grammar; + $this->token_count = count( $tokens ); + // Append an end-of-input sentinel token whose id is EMPTY_RULE_ID + // (0). The hot path can then read $tokens[$pos]->id unconditionally + // when $pos is the current cursor, because the sentinel naturally + // fails to match any real grammar terminal while feeding the + // nullable-fallback branch of the selector check. + // + // Invariants the hot path relies on: + // - The sentinel id (0) cannot match any grammar terminal. + // strip_epsilon_markers() removes id 0 from every branch at + // grammar build time, so no $subrule_id in the inner loop ever + // equals 0 and ++$this->position can never advance past the + // sentinel. + // - The sentinel must never be appended to a node's children. It + // is only inspected via $tokens[$pos]->id; tokens are pushed + // into $children only on terminal-id equality, which the + // sentinel cannot satisfy.
+ // - WP_MySQL_Parser::next_query() bounds at $position < $token_count + // (set above, before the append), so the sentinel sits at index + // $token_count and is never fed into a parse round. + $tokens[] = new WP_Parser_Token( WP_Parser_Grammar::EMPTY_RULE_ID, 0, 0, '' ); + $this->tokens = $tokens; + $this->position = 0; + $this->rule_names = $grammar->rule_names; + $this->fragment_ids = $grammar->fragment_ids; + $this->branches_for_token = $grammar->branches_for_token; + $this->nullable_branches = $grammar->nullable_branches; + $this->highest_terminal_id = $grammar->highest_terminal_id; + $this->single_candidate_rules = $grammar->single_candidate_rules; + + // The INTO negative-lookahead only fires for selectStatement. Cache + // the rule id so the per-call check is an int compare instead of a + // string compare. + $this->select_statement_rule_id = $grammar->get_or_cache_rule_id( 'selectStatement' ); } public function parse() { // @TODO: Make the starting rule lookup non-grammar-specific. - $query_rule_id = $this->grammar->get_rule_id( 'query' ); - $ast = $this->parse_recursive( $query_rule_id ); + $ast = $this->parse_recursive( $this->grammar->get_or_cache_rule_id( 'query' ) ); return false === $ast ? null : $ast; } + /** + * Parse a single non-terminal rule at the current token position. + * + * Only called for non-terminal rule ids; terminals are matched inline in + * the branch loops below to avoid a function-call round trip per token. + * Returns a WP_Parser_Node, a children array (fragment), true (empty match) or false.
+ */ private function parse_recursive( $rule_id ) { - $is_terminal = $rule_id <= $this->grammar->highest_terminal_id; - if ( $is_terminal ) { - if ( $this->position >= count( $this->tokens ) ) { - return false; - } - - if ( WP_Parser_Grammar::EMPTY_RULE_ID === $rule_id ) { - return true; - } + $tokens = $this->tokens; + $position = $this->position; - if ( $this->tokens[ $this->position ]->id === $rule_id ) { - ++$this->position; - return $this->tokens[ $this->position - 1 ]; - } + // Narrow the set of branches worth trying using the precomputed FIRST + // sets. When no entry exists for the current token but the rule is + // nullable, the rule matches empty, so we return true immediately + // without entering any branch; if it is not nullable, we return false. + $tid = $tokens[ $position ]->id; + if ( isset( $this->branches_for_token[ $rule_id ][ $tid ] ) ) { + $candidate_branches = $this->branches_for_token[ $rule_id ][ $tid ]; + } elseif ( isset( $this->nullable_branches[ $rule_id ] ) ) { + return true; + } else { return false; } - $branches = $this->grammar->rules[ $rule_id ]; - if ( ! count( $branches ) ) { - return false; - } + $highest_terminal_id = $this->highest_terminal_id; + $is_fragment = isset( $this->fragment_ids[ $rule_id ] ); + $is_select_statement = $rule_id === $this->select_statement_rule_id; - // Bale out from processing the current branch if none of its rules can - // possibly match the current token. - if ( isset( $this->grammar->lookahead_is_match_possible[ $rule_id ] ) ) { - $token_id = $this->tokens[ $this->position ]->id; - if ( - ! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ $token_id ] ) && - ! isset( $this->grammar->lookahead_is_match_possible[ $rule_id ][ WP_Parser_Grammar::EMPTY_RULE_ID ] ) - ) { + // Fast path for rules where every (rule, token) selector entry + // points to exactly one branch - about 55% of nonterminal calls + // on the MySQL corpus.
Skip the outer foreach and the + // $branch_matches bookkeeping; every failure path just rewinds + // the position and returns false directly. + if ( isset( $this->single_candidate_rules[ $rule_id ] ) ) { + $branch = $candidate_branches[0]; + $children = array(); + foreach ( $branch as $subrule_id ) { + if ( $subrule_id <= $highest_terminal_id ) { + if ( $tokens[ $this->position ]->id === $subrule_id ) { + $children[] = $tokens[ $this->position ]; + ++$this->position; + continue; + } + $this->position = $position; + return false; + } + + $subnode = $this->parse_recursive( $subrule_id ); + if ( false === $subnode ) { + $this->position = $position; + return false; + } + if ( true === $subnode ) { + continue; + } + if ( is_array( $subnode ) ) { + foreach ( $subnode as $c ) { + $children[] = $c; + } + } else { + $children[] = $subnode; + } + } + + if ( $is_select_statement && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id ) { + $this->position = $position; return false; } + if ( ! $children ) { + return true; + } + if ( $is_fragment ) { + return $children; + } + return new WP_Parser_Node( $rule_id, $this->rule_names[ $rule_id ], $children ); } - $rule_name = $this->grammar->rule_names[ $rule_id ]; - $starting_position = $this->position; - foreach ( $branches as $branch ) { - $this->position = $starting_position; - $node = new WP_Parser_Node( $rule_id, $rule_name ); + $branch_matches = false; + $children = array(); + foreach ( $candidate_branches as $branch ) { + $this->position = $position; + $children = array(); $branch_matches = true; foreach ( $branch as $subrule_id ) { + if ( $subrule_id <= $highest_terminal_id ) { + // The sentinel at $tokens[$token_count] has id 0 so it + // cannot match any real terminal, making the range check + // unnecessary here.
+ if ( $tokens[ $this->position ]->id === $subrule_id ) { + $children[] = $tokens[ $this->position ]; + ++$this->position; + continue; + } + $branch_matches = false; + break; + } + $subnode = $this->parse_recursive( $subrule_id ); if ( false === $subnode ) { $branch_matches = false; break; - } elseif ( true === $subnode ) { - /* - * The subrule was matched without actually matching a token. - * This means a special empty "ε" (epsilon) rule was matched. - * An "ε" rule in a grammar matches an empty input of 0 bytes. - * It is used to represent optional grammar productions. - */ - continue; - } elseif ( is_array( $subnode ) && 0 === count( $subnode ) ) { - continue; } - if ( is_array( $subnode ) && ! count( $subnode ) ) { + if ( true === $subnode ) { continue; } - if ( isset( $this->grammar->fragment_ids[ $subrule_id ] ) ) { - $node->merge_fragment( $subnode ); + if ( is_array( $subnode ) ) { + // Fragment results are returned directly as a children + // array so the parser does not allocate a WP_Parser_Node + // that would immediately be unwrapped into the parent. + foreach ( $subnode as $c ) { + $children[] = $c; + } } else { - $node->append_child( $subnode ); + $children[] = $subnode; } } @@ -100,25 +192,36 @@ private function parse_recursive( $rule_id ) { // for right-associative rules, which could solve this. // See: https://github.com/mysql/mysql-workbench/blob/8.0.38/library/parsers/grammars/MySQLParser.g4#L994 // See: https://github.com/antlr/antlr4/issues/488 - $la = $this->tokens[ $this->position ] ?? null; - if ( $la && 'selectStatement' === $rule_name && WP_MySQL_Lexer::INTO_SYMBOL === $la->id ) { + if ( + $branch_matches + && $is_select_statement + && WP_MySQL_Lexer::INTO_SYMBOL === $tokens[ $this->position ]->id + ) { $branch_matches = false; } - if ( true === $branch_matches ) { + if ( $branch_matches ) { break; } } if ( ! $branch_matches ) { - $this->position = $starting_position; + $this->position = $position; return false; } - if ( !
$node->has_child() ) { + if ( ! $children ) { return true; } - return $node; + // Fragments exist only to group symbols for reuse; their "node" would + // get inlined into the parent on the very next step. Return the raw + // children array so the caller can splice it without allocating a + // throwaway WP_Parser_Node. + if ( $is_fragment ) { + return $children; + } + + return new WP_Parser_Node( $rule_id, $this->rule_names[ $rule_id ], $children ); } }