From 684dff3e216bd3db708001b9312ef257bb3246bb Mon Sep 17 00:00:00 2001 From: Veesh Goldman Date: Thu, 18 Jun 2026 22:08:32 +0300 Subject: [PATCH] Perl: switch to the more accurate ts-parser-perl grammar The crates.io `tree-sitter-perl` is an independent, older Perl grammar by a different author; `ts-parser-perl` is the crate for github.com/tree-sitter-perl/tree-sitter-perl, which parses real-world Perl substantially more accurately (95.4% vs 40.2% clean parse across 8,342 real-world files; ~92% fewer ERROR/MISSING failures). For a symbol/import/call indexer, an ERROR node makes that subtree untraversable, so the more accurate grammar recovers symbols that are silently missed today. The two grammars use different node names, so this is not a drop-in dependency bump. Changes: - Cargo: package-rename so the `tree_sitter_perl` path is unchanged. - parser.rs PERL_QUERY: package_statement/subroutine_declaration_statement/ method_declaration_statement; `use constant` constants captured and gated on pragma text in Rust (the binding does not evaluate #eq? predicates); variable capture keeps the sigil ($counter). - perl_package_name: read the `package` name node. - imports/perl.rs: every use/no pragma is one `use_statement` (use/no keyword + `module` field); require is an `expression_statement` wrapping `require_expression`. Extend the statement range through the trailing `;` sibling so organize/remove leave no stray semicolon. - calls.rs: function_call_expression/method_call_expression; method-call callee resolves via the `method` field. All Perl tests pass (imports, outline, round-trip, organize golden corpus). Co-Authored-By: Claude Opus 4.8 (1M context) --- Cargo.lock | 23 ++++----- crates/aft/Cargo.toml | 5 +- crates/aft/src/calls.rs | 15 +++--- crates/aft/src/imports/perl.rs | 91 ++++++++++++++++++---------------- crates/aft/src/parser.rs | 46 +++++++++++++---- 5 files changed, 105 insertions(+), 75 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8bfe5de8..b3e178a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -84,7 +84,6 @@ dependencies = [ "tree-sitter-lua", "tree-sitter-md", "tree-sitter-pascal", - "tree-sitter-perl", "tree-sitter-php", "tree-sitter-python", "tree-sitter-r", @@ -97,6 +96,7 @@ dependencies = [ "tree-sitter-vue-next", "tree-sitter-yaml", "tree-sitter-zig", + "ts-parser-perl", "url", "which", ] @@ -3540,17 +3540,6 @@ dependencies = [ "tree-sitter-language", ] -[[package]] -name = "tree-sitter-perl" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e79d4ae71b42e595c24c354b59905ade8162bd400ebc53ab916ea8aec54da92d" -dependencies = [ - "cc", - "tree-sitter", - "tree-sitter-language", -] - [[package]] name = "tree-sitter-php" version = "0.24.2" @@ -3677,6 +3666,16 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "ts-parser-perl" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "720dcad2c9e8445465c98a7117574224bede0ee4168d081a37bfbad9699cd459" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "typenum" version = "1.19.0" diff --git a/crates/aft/Cargo.toml b/crates/aft/Cargo.toml index 4cc6409c..52e8827f 100644 --- a/crates/aft/Cargo.toml +++ b/crates/aft/Cargo.toml @@ -54,7 +54,10 @@ tree-sitter-kotlin-sg = "0.4" tree-sitter-swift = "0.7" tree-sitter-php = "0.24" tree-sitter-lua = "0.5" -tree-sitter-perl = "1.1" +# `ts-parser-perl` is the crate for github.com/tree-sitter-perl/tree-sitter-perl, +# a substantially more accurate Perl grammar. The `package =` rename keeps the +# `tree_sitter_perl` path working unchanged. +tree-sitter-perl = { package = "ts-parser-perl", version = "1.1" } tree-sitter-pascal = "0.10.2" tree-sitter-r = "1.2.0" tree-sitter-yaml = "0.7" diff --git a/crates/aft/src/calls.rs b/crates/aft/src/calls.rs index 19085d0d..eb6965b6 100644 --- a/crates/aft/src/calls.rs +++ b/crates/aft/src/calls.rs @@ -40,15 +40,7 @@ pub fn call_node_kinds(lang: LangId) -> Vec<&'static str> { "nullsafe_member_call_expression", "scoped_call_expression", ], - LangId::Perl => vec![ - "call_expression_recursive", - "call_expression_with_args_with_brackets", - "call_expression_with_bareword", - "call_expression_with_spaced_args", - "call_expression_with_sub", - "call_expression_with_variable", - "method_invocation", - ], + LangId::Perl => vec!["function_call_expression", "method_call_expression"], LangId::Lua => vec!["function_call"], LangId::C | LangId::Cpp | LangId::Zig => vec!["call_expression"], LangId::CSharp => vec!["invocation_expression"], @@ -206,6 +198,11 @@ fn callee_node<'a>(node: &tree_sitter::Node<'a>) -> Option "jsx_opening_element" | "jsx_self_closing_element" => node .child_by_field_name("name") .or_else(|| node.named_child(0)), + // Perl method call `$obj->method(...)`: the callee is the `method` + // field, not the leading invocant that `child(0)` would return. + "method_call_expression" => node + .child_by_field_name("method") + .or_else(|| node.child_by_field_name("function")), _ => node .child_by_field_name("function") .or_else(|| node.child(0)), diff --git a/crates/aft/src/imports/perl.rs b/crates/aft/src/imports/perl.rs index 12329ac8..3b25e888 100644 --- a/crates/aft/src/imports/perl.rs +++ b/crates/aft/src/imports/perl.rs @@ -41,19 +41,24 @@ pub(crate) fn parse_perl_imports(source: &str, tree: &Tree) -> ImportBlock { fn parse_perl_import_statement(source: &str, node: &Node) -> Option { match node.kind() { - "use_no_statement" => parse_perl_use_no_statement(source, node), - "use_parent_statement" => { - parse_perl_keyword_module_statement(source, node, PERL_USE_KIND, "parent") + // `use`/`no` pragmas — including `use parent ...` and `use constant ...` + // — all parse as a single `use_statement`. The leading `use`/`no` + // keyword carries the kind and the `module` field carries the pragma + // name ("parent", "constant", or an ordinary module), so no per-pragma + // special-casing is needed. + "use_statement" => parse_perl_use_statement(source, node), + // `require Foo;` parses as an expression statement wrapping a + // `require_expression`. + "expression_statement" => { + let require = find_direct_child(node, "require_expression")?; + let module_node = first_named_child(&require)?; + build_perl_import(source, node, &module_node, PERL_REQUIRE_KIND) } - "use_constant_statement" => { - parse_perl_keyword_module_statement(source, node, PERL_USE_KIND, "constant") - } - "require_statement" => parse_perl_require_statement(source, node), _ => None, } } -fn parse_perl_use_no_statement(source: &str, node: &Node) -> Option { +fn parse_perl_use_statement(source: &str, node: &Node) -> Option { let import_kind = if find_direct_child(node, PERL_USE_KIND).is_some() { PERL_USE_KIND } else if find_direct_child(node, PERL_NO_KIND).is_some() { @@ -61,25 +66,10 @@ fn parse_perl_use_no_statement(source: &str, node: &Node) -> Option Option { - let module_node = find_direct_child(node, module_child_kind)?; + let module_node = node.child_by_field_name("module")?; build_perl_import(source, node, &module_node, import_kind) } -fn parse_perl_require_statement(source: &str, node: &Node) -> Option { - let module_node = find_direct_child(node, "package_name")?; - build_perl_import(source, node, &module_node, PERL_REQUIRE_KIND) -} - fn build_perl_import( source: &str, statement: &Node, @@ -93,8 +83,16 @@ fn build_perl_import( let raw_args = raw_args_after_module(source, statement, module_node)?; let modifiers = perl_arg_modifiers(&raw_args); - let raw_text = source[statement.byte_range()].to_string(); - let byte_range = statement.byte_range(); + // `use_statement` spans its own terminating `;`, but an `expression_statement` + // (e.g. `require Foo;`) does not — the `;` is a following sibling. Extend the + // range through it so organize/remove replaces the whole statement and leaves + // no stray semicolon behind. + let end_byte = match statement.next_sibling() { + Some(next) if next.kind() == ";" => next.end_byte(), + _ => statement.end_byte(), + }; + let raw_text = source[statement.start_byte()..end_byte].to_string(); + let byte_range = statement.start_byte()..end_byte; let group = classify_group_perl(&module_path); Some(ImportStatement { @@ -158,6 +156,22 @@ fn find_direct_child<'tree>(node: &Node<'tree>, kind: &str) -> Option(node: &Node<'tree>) -> Option> { + let mut cursor = node.walk(); + if cursor.goto_first_child() { + loop { + let child = cursor.node(); + if child.is_named() { + return Some(child); + } + if !cursor.goto_next_sibling() { + break; + } + } + } + None +} + fn is_perl_import_kind(kind: &str) -> bool { matches!(kind, PERL_USE_KIND | PERL_REQUIRE_KIND | PERL_NO_KIND) } @@ -272,9 +286,11 @@ mod tests { } /// Grammar fixture: lock the exact tree-sitter-perl node kinds this parser - /// depends on. The current grammar represents plain `use` and `no` pragmas - /// as `use_no_statement`, specialized pragmas as `use_parent_statement` / - /// `use_constant_statement`, and runtime requires as `require_statement`. + /// depends on. This grammar represents every `use`/`no` pragma (plain or + /// specialized like `use parent` / `use constant`) as a single + /// `use_statement` with a `module` field and a leading `use`/`no` keyword + /// token, and runtime `require` as an `expression_statement` wrapping a + /// `require_expression`. #[test] fn perl_grammar_node_kinds_are_stable() { let src = "use Foo::Bar;\nuse Foo qw(a b);\nuse parent -norequire, 'Base';\nuse constant PI => 3.14;\nrequire Foo;\nno warnings;\nno strict 'refs';\n"; @@ -298,20 +314,11 @@ mod tests { for required in [ "source_file", - "use_no_statement", - "use_parent_statement", - "use_constant_statement", - "require_statement", - "package_name", - "word_list_qw", - "no_require", - "string_single_quoted", - "fat_comma", - "floating_point", - "parent", - "constant", + "use_statement", + "expression_statement", + "require_expression", + "package", "use", - "require", "no", ";", ] { diff --git a/crates/aft/src/parser.rs b/crates/aft/src/parser.rs index 976a7eea..98716184 100644 --- a/crates/aft/src/parser.rs +++ b/crates/aft/src/parser.rs @@ -598,17 +598,22 @@ const LUA_QUERY: &str = r#" const PERL_QUERY: &str = r#" ;; packages and subroutines (package_statement - (package_name) @package.name) @package.def -(function_definition - name: (identifier) @fn.name) @fn.def -(function_definition_without_sub - name: (identifier) @fn.name) @fn.def - -;; constants / lexical variables -(use_constant_statement - constant: (identifier) @var.name) @var.def + name: (package) @package.name) @package.def +(subroutine_declaration_statement + name: (bareword) @fn.name) @fn.def +(method_declaration_statement + name: (bareword) @fn.name) @fn.def + +;; constants: `use constant NAME => ...;` — the `constant` pragma is filtered in +;; Rust (the tree-sitter binding does not evaluate `#eq?` predicates). +(use_statement + module: (package) @const.pragma + (list_expression (autoquoted_bareword) @const.name)) @const.def + +;; lexical / package variables: capture the sigil-bearing variable node so the +;; symbol name matches `$counter`, not the bare `counter`. (variable_declaration - variable_name: (_) @var.name) @var.def + variable: (_) @var.name) @var.def "#; /// Supported language identifier. @@ -5829,7 +5834,7 @@ fn perl_package_name(source: &str, node: &Node) -> Option { loop { let child = cursor.node(); - if child.kind() == "package_name" { + if child.kind() == "package" { return Some(node_text(source, &child).to_string()); } if !cursor.goto_next_sibling() { @@ -5887,6 +5892,9 @@ fn extract_perl_symbols(source: &str, root: &Node, query: &Query) -> Result Result fn_def_node = Some(cap.node), "var.name" => var_name_node = Some(cap.node), "var.def" => var_def_node = Some(cap.node), + "const.pragma" => const_pragma_node = Some(cap.node), + "const.name" => const_name_node = Some(cap.node), + "const.def" => const_def_node = Some(cap.node), _ => {} } } + // `use constant NAME => ...;` defines a constant. Our grammar parses every + // `use`/`no` pragma as a generic `use_statement`, so gate on the pragma + // module text to avoid treating e.g. `use parent -norequire, ...` as a + // constant definition. + if let (Some(pragma_node), Some(name_node), Some(def_node)) = + (const_pragma_node, const_name_node, const_def_node) + { + if node_text(source, &pragma_node) == "constant" { + var_name_node = Some(name_node); + var_def_node = Some(def_node); + } + } + if let (Some(name_node), Some(def_node)) = (package_name_node, package_def_node) { push_captured_symbol( &mut symbols,