From 3ad9db38fe90aadae8e8c83acac7b61dcc229bd0 Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Mon, 20 Apr 2026 19:16:56 +0200 Subject: [PATCH 1/3] Optimize ripper bounds Basically a port of https://github.com/ruby/ruby/commit/c45f781771314a71856c9b348c640ba532f54349 into ruby It's quite effective at ~97% hit rate for me. Speeds it up from ~6.77x slower to only 4.07x slower. For the lexer `on_sp` it also gives a bit of an improvement: 1.04x slower to 1.10x faster I guess the class may be universally useful but for now I just made it nodoc. --- lib/prism/lex_compat.rb | 11 ++++- lib/prism/parse_result.rb | 4 +- lib/prism/translation/ripper.rb | 69 ++++++++++++++++++++++++++-- rbi/generated/prism/lex_compat.rbi | 8 ++++ rbi/generated/prism/parse_result.rbi | 4 +- sig/generated/prism/lex_compat.rbs | 6 +++ sig/generated/prism/parse_result.rbs | 4 +- 7 files changed, 91 insertions(+), 15 deletions(-) diff --git a/lib/prism/lex_compat.rb b/lib/prism/lex_compat.rb index e1b04fc6ce..7aacec037d 100644 --- a/lib/prism/lex_compat.rb +++ b/lib/prism/lex_compat.rb @@ -23,6 +23,12 @@ module Prism # def self.[]: (Integer value) -> State # end # end + # + # class LineAndColumnCache + # def initialize: (Source source) -> void + # + # def line_and_column: (Integer byte_offset) -> [Integer, Integer] + # end # end # end @@ -837,6 +843,8 @@ def post_process_tokens(tokens, source, data_loc, bom, eof_token) prev_token_state = Translation::Ripper::Lexer::State[Translation::Ripper::EXPR_BEG] prev_token_end = bom ? 3 : 0 + cache = Translation::Ripper::LineAndColumnCache.new(source) + tokens.each do |token| # Skip missing heredoc ends. 
next if token[1] == :on_heredoc_end && token[2] == "" @@ -851,8 +859,7 @@ def post_process_tokens(tokens, source, data_loc, bom, eof_token) if start_offset > prev_token_end sp_value = source.slice(prev_token_end, start_offset - prev_token_end) - sp_line = source.line(prev_token_end) - sp_column = source.column(prev_token_end) + sp_line, sp_column = cache.line_and_column(prev_token_end) # Ripper reports columns on line 1 without counting the BOM sp_column -= 3 if sp_line == 1 && bom continuation_index = sp_value.byteindex("\\") diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb index 4f7bcf07d6..e37a8cd843 100644 --- a/lib/prism/parse_result.rb +++ b/lib/prism/parse_result.rb @@ -223,9 +223,7 @@ def deep_freeze freeze end - private - - # Binary search through the offsets to find the line number for the given + # Binary search through the offsets to find the index for the given # byte offset. #-- #: (Integer byte_offset) -> Integer diff --git a/lib/prism/translation/ripper.rb b/lib/prism/translation/ripper.rb index b066f3e3ac..d5dd760e58 100644 --- a/lib/prism/translation/ripper.rb +++ b/lib/prism/translation/ripper.rb @@ -446,6 +446,64 @@ def self.sexp_raw(src, filename = "-", lineno = 1, raise_errors: false) autoload :SexpBuilder, "prism/translation/ripper/sexp" autoload :SexpBuilderPP, "prism/translation/ripper/sexp" + # Provides optimized access to line and column information. + # Ripper bounds are mostly accessed in a linear fashion, so + # we can try a linear scan first and fall back to binary search. + class LineAndColumnCache # :nodoc: + # How many lines to scan ahead of/behind the last hit before falling back to binary search. 
+ WINDOW = 8 + private_constant :WINDOW + + #: (Source source) -> void + def initialize(source) + @source = source + @offsets = source.offsets + @hint = 0 + end + + #: (Integer byte_offset) -> [Integer, Integer] + def line_and_column(byte_offset) + @hint = new_hint(byte_offset) || @source.find_line(byte_offset) + return [@hint + @source.start_line, byte_offset - @offsets[@hint]] + end + + private + + def new_hint(byte_offset) + if @offsets[@hint] <= byte_offset + # Same line? + if (@hint + 1 >= @offsets.size || @offsets[@hint + 1] > byte_offset) + return @hint + end + + # Scan forwards + limit = [@hint + WINDOW + 1, @offsets.size].min + idx = @hint + 1 + while idx < limit + if @offsets[idx] > byte_offset + return idx - 1 + end + if @offsets[idx] == byte_offset + return idx + end + idx += 1 + end + else + # Scan backwards + limit = @hint > WINDOW ? @hint - WINDOW : 0 + idx = @hint + while idx >= limit + 1 + if @offsets[idx - 1] <= byte_offset + return idx - 1 + end + idx -= 1 + end + end + + nil + end + end + # :stopdoc: # This is not part of the public API but used by some gems. @@ -489,6 +547,7 @@ def initialize(source, filename = "(ripper)", lineno = 1) @lineno = lineno @column = 0 @result = nil + @line_and_column_cache = nil end ########################################################################## @@ -4014,6 +4073,10 @@ def result @result ||= Prism.parse(source, partial_script: true, version: "current") end + def line_and_column_cache + @line_and_column_cache ||= LineAndColumnCache.new(result.source) + end + ########################################################################## # Helpers ########################################################################## @@ -4114,12 +4177,8 @@ def visit_write_value(node) # This method is responsible for updating lineno and column information # to reflect the current node. - # - # This method could be drastically improved with some caching on the start - # of every line, but for now it's good enough. 
def bounds(location) - @lineno = location.start_line - @column = location.start_column + @lineno, @column = line_and_column_cache.line_and_column(location.start_offset) end # :startdoc: diff --git a/rbi/generated/prism/lex_compat.rbi b/rbi/generated/prism/lex_compat.rbi index ca479b7225..b1f72a815b 100644 --- a/rbi/generated/prism/lex_compat.rbi +++ b/rbi/generated/prism/lex_compat.rbi @@ -20,6 +20,14 @@ module Prism def self.[](value); end end end + + class LineAndColumnCache + sig { params(source: Source).void } + def initialize(source); end + + sig { params(byte_offset: Integer).returns([Integer, Integer]) } + def line_and_column(byte_offset); end + end end end diff --git a/rbi/generated/prism/parse_result.rbi b/rbi/generated/prism/parse_result.rbi index 4d065b5be1..44fbf42c96 100644 --- a/rbi/generated/prism/parse_result.rbi +++ b/rbi/generated/prism/parse_result.rbi @@ -123,10 +123,10 @@ module Prism sig { void } def deep_freeze; end - # Binary search through the offsets to find the line number for the given + # Binary search through the offsets to find the index for the given # byte offset. 
sig { params(byte_offset: Integer).returns(Integer) } - private def find_line(byte_offset); end + def find_line(byte_offset); end end # A cache that can be used to quickly compute code unit offsets from byte diff --git a/sig/generated/prism/lex_compat.rbs b/sig/generated/prism/lex_compat.rbs index 707a96b9a8..1712955ff8 100644 --- a/sig/generated/prism/lex_compat.rbs +++ b/sig/generated/prism/lex_compat.rbs @@ -19,6 +19,12 @@ module Prism def self.[]: (Integer value) -> State end end + + class LineAndColumnCache + def initialize: (Source source) -> void + + def line_and_column: (Integer byte_offset) -> [ Integer, Integer ] + end end end diff --git a/sig/generated/prism/parse_result.rbs b/sig/generated/prism/parse_result.rbs index f005f17375..da9c7b9636 100644 --- a/sig/generated/prism/parse_result.rbs +++ b/sig/generated/prism/parse_result.rbs @@ -146,9 +146,7 @@ module Prism # : () -> void def deep_freeze: () -> void - private - - # Binary search through the offsets to find the line number for the given + # Binary search through the offsets to find the index for the given # byte offset. # -- # : (Integer byte_offset) -> Integer From 9e93bd6bd523e43786e574eaf352b24e594dac1a Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Mon, 20 Apr 2026 19:26:30 +0200 Subject: [PATCH 2/3] Freeze the parse result for the ripper translator It's a small, somewhat hacky performance boost. Locations are lazy; by freezing the result they don't have to be packed/unpacked redundantly. This gives about a 4% speed boost. 
Other changes are to not modify the frozen AST --- lib/prism/translation/ripper.rb | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/prism/translation/ripper.rb b/lib/prism/translation/ripper.rb index d5dd760e58..faf2865434 100644 --- a/lib/prism/translation/ripper.rb +++ b/lib/prism/translation/ripper.rb @@ -1007,7 +1007,7 @@ def visit_begin_node(node) on_stmts_add(on_stmts_new, on_void_stmt) else body = node.statements.body - body.unshift(nil) if void_stmt?(location, node.statements.body[0].location, allow_newline) + body = [nil, *body] if void_stmt?(location, node.statements.body[0].location, allow_newline) bounds(node.statements.location) visit_statements_node_body(body) @@ -1024,7 +1024,7 @@ def visit_begin_node(node) [nil] else body = else_clause_node.statements.body - body.unshift(nil) if void_stmt?(else_clause_node.else_keyword_loc, else_clause_node.statements.body[0].location, allow_newline) + body = [nil, *body] if void_stmt?(else_clause_node.else_keyword_loc, else_clause_node.statements.body[0].location, allow_newline) body end @@ -1046,7 +1046,7 @@ def visit_begin_node(node) on_bodystmt(visit_statements_node_body([nil]), nil, nil, nil) when StatementsNode body = [*node.body] - body.unshift(nil) if void_stmt?(location, body[0].location, allow_newline) + body = [nil, *body] if void_stmt?(location, body[0].location, allow_newline) stmts = visit_statements_node_body(body) bounds(node.body.first.location) @@ -1095,7 +1095,7 @@ def visit_block_node(node) braces ? 
stmts : on_bodystmt(stmts, nil, nil, nil) when StatementsNode stmts = node.body.body - stmts.unshift(nil) if void_stmt?(node.parameters&.location || node.opening_loc, node.body.location, false) + stmts = [nil, *stmts] if void_stmt?(node.parameters&.location || node.opening_loc, node.body.location, false) stmts = visit_statements_node_body(stmts) bounds(node.body.location) @@ -2022,7 +2022,7 @@ def visit_else_node(node) [nil] else body = node.statements.body - body.unshift(nil) if void_stmt?(node.else_keyword_loc, node.statements.body[0].location, false) + body = [nil, *body] if void_stmt?(node.else_keyword_loc, node.statements.body[0].location, false) body end @@ -2077,7 +2077,7 @@ def visit_ensure_node(node) [nil] else body = node.statements.body - body.unshift(nil) if void_stmt?(node.ensure_keyword_loc, body[0].location, false) + body = [nil, *body] if void_stmt?(node.ensure_keyword_loc, body[0].location, false) body end @@ -2860,7 +2860,7 @@ def visit_lambda_node(node) braces ? stmts : on_bodystmt(stmts, nil, nil, nil) when StatementsNode stmts = node.body.body - stmts.unshift(nil) if void_stmt?(node.parameters&.location || node.opening_loc, node.body.location, false) + stmts = [nil, *stmts] if void_stmt?(node.parameters&.location || node.opening_loc, node.body.location, false) stmts = visit_statements_node_body(stmts) bounds(node.body.location) @@ -3354,7 +3354,7 @@ def visit_pre_execution_node(node) # The top-level program node. def visit_program_node(node) body = node.statements.body - body << nil if body.empty? + body = [nil] if body.empty? statements = visit_statements_node_body(body) bounds(node.location) @@ -4070,7 +4070,7 @@ def visit_yield_node(node) # Lazily initialize the parse result. 
def result - @result ||= Prism.parse(source, partial_script: true, version: "current") + @result ||= Prism.parse(source, partial_script: true, version: "current", freeze: true) end def line_and_column_cache From d611aa9d1161a3da3a109972d18784304476f5cc Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Tue, 21 Apr 2026 19:38:06 +0200 Subject: [PATCH 3/3] Optimize ripper `visit_token` It was showing up in profiles. So: * Don't splat `KEYWORDS` (also did the same for `BINARY_OPERATORS`) * Use `start_with?` if possible Overall gives a ~5% speed boost --- lib/prism/translation/ripper.rb | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/lib/prism/translation/ripper.rb b/lib/prism/translation/ripper.rb index faf2865434..0f5608b1ec 100644 --- a/lib/prism/translation/ripper.rb +++ b/lib/prism/translation/ripper.rb @@ -347,7 +347,7 @@ def self.coerce_source(source) # :nodoc: "__ENCODING__", "__FILE__", "__LINE__" - ] + ].to_set # A list of all of the Ruby binary operators. BINARY_OPERATORS = [ @@ -372,7 +372,7 @@ def self.coerce_source(source) # :nodoc: :/, :*, :** - ] + ].to_set private_constant :KEYWORDS, :BINARY_OPERATORS @@ -1295,7 +1295,7 @@ def visit_call_node(node) bounds(node.location) on_unary(:!, receiver) end - when *BINARY_OPERATORS + when BINARY_OPERATORS receiver = visit(node.receiver) bounds(node.message_loc) @@ -4095,24 +4095,23 @@ def void_stmt?(left, right, allow_newline) # Visit the string content of a particular node. This method is used to # split into the various token types. def visit_token(token, allow_keywords = true) - case token - when "." + if token == "." on_period(token) - when "`" + elsif token == "`" on_backtick(token) - when *(allow_keywords ? 
KEYWORDS : []) + elsif allow_keywords && KEYWORDS.include?(token) on_kw(token) - when /^_/ + elsif token.start_with?("_") on_ident(token) - when /^[[:upper:]]\w*$/ + elsif token.match?(/^[[:upper:]]\w*$/) on_const(token) - when /^@@/ + elsif token.start_with?("@@") on_cvar(token) - when /^@/ + elsif token.start_with?("@") on_ivar(token) - when /^\$/ + elsif token.start_with?("$") on_gvar(token) - when /^[[:punct:]]/ + elsif token.match?(/^[[:punct:]]/) on_op(token) else on_ident(token)