Skip to content

Commit 5f3ced9

Browse files
committed
fix: exclude entity name from structural_hash for rename detection (#20)
Small function renames were detected as Deleted + Added because the name token dominated the hash for small functions. Now structural_hash strips the name node's byte range, so renamed functions with identical bodies produce the same hash and match in Phase 2.
1 parent 858e4b0 commit 5f3ced9

File tree

3 files changed

+203
-3
lines changed

3 files changed

+203
-3
lines changed

crates/sem-core/src/parser/plugins/code/entity_extractor.rs

Lines changed: 122 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
use tree_sitter::{Node, Tree};
22

33
use crate::model::entity::{build_entity_id, SemanticEntity};
4-
use crate::utils::hash::{content_hash, structural_hash};
4+
use crate::utils::hash::{content_hash, structural_hash, structural_hash_excluding_range};
55
use super::languages::LanguageConfig;
66

77
pub fn extract_entities(
@@ -39,7 +39,7 @@ fn visit_node(
3939
if let Some((name, entity_type)) = extract_call_entity(node, config, source) {
4040
let content_str = node_text(node, source);
4141
let content = content_str.to_string();
42-
let struct_hash = structural_hash(node, source);
42+
let struct_hash = compute_structural_hash(node, source);
4343
let entity = SemanticEntity {
4444
id: build_entity_id(file_path, entity_type, &name, parent_id),
4545
file_path: file_path.to_string(),
@@ -92,7 +92,7 @@ fn visit_node(
9292
let content_str = node_text(node, source);
9393
let content = content_str.to_string();
9494

95-
let struct_hash = structural_hash(node, source);
95+
let struct_hash = compute_structural_hash(node, source);
9696
let entity = SemanticEntity {
9797
id: build_entity_id(file_path, entity_type, &name, parent_id),
9898
file_path: file_path.to_string(),
@@ -157,6 +157,125 @@ fn visit_node(
157157
}
158158
}
159159

160+
/// Compute the structural hash for an entity, excluding the name token so that
161+
/// renames of otherwise identical entities produce the same hash.
162+
fn compute_structural_hash(node: Node, source: &[u8]) -> String {
163+
match find_name_byte_range(node, source) {
164+
Some((start, end)) => structural_hash_excluding_range(node, source, start, end),
165+
None => structural_hash(node, source),
166+
}
167+
}
168+
169+
/// Find the byte range of the name node, mirroring extract_name() logic.
170+
/// Returns (start_byte, end_byte) of the name token to exclude from hashing.
171+
fn find_name_byte_range(node: Node, _source: &[u8]) -> Option<(usize, usize)> {
172+
// Try 'name' field first (works for most languages)
173+
if let Some(name_node) = node.child_by_field_name("name") {
174+
return Some((name_node.start_byte(), name_node.end_byte()));
175+
}
176+
177+
let node_type = node.kind();
178+
179+
// Variable/lexical declarations: name is inside variable_declarator
180+
if node_type == "lexical_declaration" || node_type == "variable_declaration" {
181+
let mut cursor = node.walk();
182+
for child in node.named_children(&mut cursor) {
183+
if child.kind() == "variable_declarator" {
184+
if let Some(decl_name) = child.child_by_field_name("name") {
185+
return Some((decl_name.start_byte(), decl_name.end_byte()));
186+
}
187+
}
188+
}
189+
}
190+
191+
// Decorated definitions (Python): look at the inner definition
192+
if node_type == "decorated_definition" {
193+
let mut cursor = node.walk();
194+
for child in node.named_children(&mut cursor) {
195+
if child.kind() == "function_definition" || child.kind() == "class_definition" {
196+
if let Some(inner_name) = child.child_by_field_name("name") {
197+
return Some((inner_name.start_byte(), inner_name.end_byte()));
198+
}
199+
}
200+
}
201+
}
202+
203+
// C/C++ function_definition: name is inside declarator
204+
if node_type == "function_definition" {
205+
if let Some(declarator) = node.child_by_field_name("declarator") {
206+
return find_declarator_name_range(declarator);
207+
}
208+
}
209+
210+
// C++ template_declaration
211+
if node_type == "template_declaration" {
212+
let mut cursor = node.walk();
213+
for child in node.named_children(&mut cursor) {
214+
if child.kind() != "template_parameter_list" {
215+
if let Some(name) = child.child_by_field_name("name") {
216+
return Some((name.start_byte(), name.end_byte()));
217+
}
218+
if let Some(declarator) = child.child_by_field_name("declarator") {
219+
return find_declarator_name_range(declarator);
220+
}
221+
}
222+
}
223+
}
224+
225+
// C declarations
226+
if node_type == "declaration" || node_type == "type_definition" {
227+
if let Some(declarator) = node.child_by_field_name("declarator") {
228+
return find_declarator_name_range(declarator);
229+
}
230+
}
231+
232+
// Fallback: first identifier child
233+
let mut cursor = node.walk();
234+
for child in node.named_children(&mut cursor) {
235+
if child.kind() == "identifier" || child.kind() == "type_identifier" {
236+
return Some((child.start_byte(), child.end_byte()));
237+
}
238+
}
239+
240+
None
241+
}
242+
243+
/// Find the byte range of the name within a C-style declarator chain.
244+
fn find_declarator_name_range(node: Node) -> Option<(usize, usize)> {
245+
match node.kind() {
246+
"identifier" | "type_identifier" | "field_identifier" => {
247+
Some((node.start_byte(), node.end_byte()))
248+
}
249+
"qualified_identifier" | "scoped_identifier" => {
250+
Some((node.start_byte(), node.end_byte()))
251+
}
252+
"pointer_declarator" | "function_declarator" | "array_declarator"
253+
| "parenthesized_declarator" => {
254+
if let Some(inner) = node.child_by_field_name("declarator") {
255+
find_declarator_name_range(inner)
256+
} else {
257+
let mut cursor = node.walk();
258+
let result = node
259+
.named_children(&mut cursor)
260+
.find(|c| c.kind() == "identifier" || c.kind() == "type_identifier")
261+
.map(|c| (c.start_byte(), c.end_byte()));
262+
result
263+
}
264+
}
265+
_ => {
266+
if let Some(name) = node.child_by_field_name("name") {
267+
return Some((name.start_byte(), name.end_byte()));
268+
}
269+
let mut cursor = node.walk();
270+
let result = node
271+
.named_children(&mut cursor)
272+
.find(|c| c.kind() == "identifier" || c.kind() == "type_identifier")
273+
.map(|c| (c.start_byte(), c.end_byte()));
274+
result
275+
}
276+
}
277+
}
278+
160279
fn extract_name(node: Node, source: &[u8]) -> Option<String> {
161280
// Try 'name' field first (works for most languages)
162281
if let Some(name_node) = node.child_by_field_name("name") {

crates/sem-core/src/parser/plugins/code/mod.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,33 @@ function outer() {
424424
assert!(names.contains(&"outer"), "got: {:?}", names);
425425
}
426426

427+
/// Two Python functions that differ only in their name must share a
/// structural hash while keeping distinct content hashes.
#[test]
fn test_renamed_function_same_structural_hash() {
    let plugin = CodeParserPlugin;

    let code_a = "def get_card():\n return db.query('cards')\n";
    let code_b = "def get_card_1():\n return db.query('cards')\n";
    let entities_a = plugin.extract_entities(code_a, "a.py");
    let entities_b = plugin.extract_entities(code_b, "b.py");

    assert_eq!(entities_a.len(), 1, "Should find one entity in a");
    assert_eq!(entities_b.len(), 1, "Should find one entity in b");

    // Safe to index: both lengths were just checked.
    let original = &entities_a[0];
    let renamed = &entities_b[0];
    assert_eq!(original.name, "get_card");
    assert_eq!(renamed.name, "get_card_1");

    // Only the name differs, so the structural hashes must agree.
    assert_eq!(
        original.structural_hash, renamed.structural_hash,
        "Renamed function with identical body should have same structural_hash"
    );

    // The raw content still contains the name, so the content hashes must not.
    assert_ne!(
        original.content_hash, renamed.content_hash,
        "Content hash should differ since raw content includes the name"
    );
}
453+
427454
#[test]
428455
fn test_hcl_entity_extraction() {
429456
let code = r#"

crates/sem-core/src/utils/hash.rs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,20 @@ pub fn structural_hash(node: Node, source: &[u8]) -> String {
2121
format!("{:016x}", hasher.finish())
2222
}
2323

24+
/// Compute a structural hash that excludes tokens within a given byte range.
25+
/// Used to strip the entity name from the hash so that renames of otherwise
26+
/// identical entities produce the same hash, enabling Phase 2 rename detection.
27+
pub fn structural_hash_excluding_range(
28+
node: Node,
29+
source: &[u8],
30+
exclude_start: usize,
31+
exclude_end: usize,
32+
) -> String {
33+
let mut hasher = Xxh3::new();
34+
hash_structural_tokens_excluding(node, source, &mut hasher, exclude_start, exclude_end);
35+
format!("{:016x}", hasher.finish())
36+
}
37+
2438
/// Recursively hash tokens from the AST, skipping comments.
2539
/// Hashes both node types (structure) and leaf text (content) so that
2640
/// structurally different ASTs with identical leaf tokens produce different hashes.
@@ -57,6 +71,46 @@ fn hash_structural_tokens(node: Node, source: &[u8], hasher: &mut Xxh3) {
5771
}
5872
}
5973

74+
/// Like `hash_structural_tokens` but skips any leaf node whose byte range
75+
/// overlaps the excluded range (the entity name).
76+
fn hash_structural_tokens_excluding(
77+
node: Node,
78+
source: &[u8],
79+
hasher: &mut Xxh3,
80+
exclude_start: usize,
81+
exclude_end: usize,
82+
) {
83+
let kind = node.kind();
84+
85+
if is_comment_node(kind) {
86+
return;
87+
}
88+
89+
if node.child_count() == 0 {
90+
let start = node.start_byte();
91+
let end = node.end_byte();
92+
// Skip leaf nodes that overlap the excluded range
93+
if start < exclude_end && end > exclude_start {
94+
return;
95+
}
96+
if start < end && end <= source.len() {
97+
let bytes = &source[start..end];
98+
let trimmed = trim_bytes(bytes);
99+
if !trimmed.is_empty() {
100+
hasher.write(trimmed);
101+
hasher.write(b" ");
102+
}
103+
}
104+
} else {
105+
hasher.write(kind.as_bytes());
106+
hasher.write(b":");
107+
let mut cursor = node.walk();
108+
for child in node.children(&mut cursor) {
109+
hash_structural_tokens_excluding(child, source, hasher, exclude_start, exclude_end);
110+
}
111+
}
112+
}
113+
60114
/// Trim leading/trailing ASCII whitespace from a byte slice without allocating.
61115
#[inline]
62116
fn trim_bytes(bytes: &[u8]) -> &[u8] {

0 commit comments

Comments
 (0)