fix: exclude entity name from structural_hash for rename detection (#20)

rs545837 · rs545837 · commit 5f3ced9b01f2 · 2026-03-09T20:37:56.000-07:00
Small function renames were detected as Deleted + Added because the name
token dominated the hash for small functions. Now structural_hash strips
the name node's byte range, so renamed functions with identical bodies
produce the same hash and match in Phase 2.
diff --git a/crates/sem-core/src/parser/plugins/code/entity_extractor.rs b/crates/sem-core/src/parser/plugins/code/entity_extractor.rs
@@ -1,7 +1,7 @@
 use tree_sitter::{Node, Tree};
 
 use crate::model::entity::{build_entity_id, SemanticEntity};
-use crate::utils::hash::{content_hash, structural_hash};
+use crate::utils::hash::{content_hash, structural_hash, structural_hash_excluding_range};
 use super::languages::LanguageConfig;
 
 pub fn extract_entities(
@@ -39,7 +39,7 @@ fn visit_node(
         if let Some((name, entity_type)) = extract_call_entity(node, config, source) {
             let content_str = node_text(node, source);
             let content = content_str.to_string();
-            let struct_hash = structural_hash(node, source);
+            let struct_hash = compute_structural_hash(node, source);
             let entity = SemanticEntity {
                 id: build_entity_id(file_path, entity_type, &name, parent_id),
                 file_path: file_path.to_string(),
@@ -92,7 +92,7 @@ fn visit_node(
                 let content_str = node_text(node, source);
                 let content = content_str.to_string();
 
-                let struct_hash = structural_hash(node, source);
+                let struct_hash = compute_structural_hash(node, source);
                 let entity = SemanticEntity {
                     id: build_entity_id(file_path, entity_type, &name, parent_id),
                     file_path: file_path.to_string(),
@@ -157,6 +157,125 @@ fn visit_node(
     }
 }
 
+/// Structural hash of an entity node with its name token left out, so a pure rename of
+/// the entity hashes identically; falls back to `structural_hash` when no name range is found.
+fn compute_structural_hash(node: Node, source: &[u8]) -> String {
+    match find_name_byte_range(node, source) {
+        Some((start, end)) => structural_hash_excluding_range(node, source, start, end),
+        None => structural_hash(node, source), // no name located: hash the whole node
+    }
+}
+
+/// Locate the entity's name node (intended to mirror `extract_name()`) and return its
+/// `(start_byte, end_byte)` range for exclusion from hashing; `None` if no name is found.
+fn find_name_byte_range(node: Node, _source: &[u8]) -> Option<(usize, usize)> {
+    // Try the node's own 'name' field first (works for most languages)
+    if let Some(name_node) = node.child_by_field_name("name") {
+        return Some((name_node.start_byte(), name_node.end_byte()));
+    }
+
+    let node_type = node.kind();
+
+    // Variable/lexical declarations: use the first variable_declarator that has a 'name' field
+    if node_type == "lexical_declaration" || node_type == "variable_declaration" {
+        let mut cursor = node.walk();
+        for child in node.named_children(&mut cursor) {
+            if child.kind() == "variable_declarator" {
+                if let Some(decl_name) = child.child_by_field_name("name") {
+                    return Some((decl_name.start_byte(), decl_name.end_byte()));
+                }
+            }
+        }
+    }
+
+    // Decorated definitions (Python): use the name of the wrapped function/class definition
+    if node_type == "decorated_definition" {
+        let mut cursor = node.walk();
+        for child in node.named_children(&mut cursor) {
+            if child.kind() == "function_definition" || child.kind() == "class_definition" {
+                if let Some(inner_name) = child.child_by_field_name("name") {
+                    return Some((inner_name.start_byte(), inner_name.end_byte()));
+                }
+            }
+        }
+    }
+
+    // C/C++ function_definition: name is inside declarator (result returned as-is, even None)
+    if node_type == "function_definition" {
+        if let Some(declarator) = node.child_by_field_name("declarator") {
+            return find_declarator_name_range(declarator);
+        }
+    }
+
+    // C++ template_declaration: inspect the templated item, skipping the parameter list
+    if node_type == "template_declaration" {
+        let mut cursor = node.walk();
+        for child in node.named_children(&mut cursor) {
+            if child.kind() != "template_parameter_list" {
+                if let Some(name) = child.child_by_field_name("name") {
+                    return Some((name.start_byte(), name.end_byte()));
+                }
+                if let Some(declarator) = child.child_by_field_name("declarator") {
+                    return find_declarator_name_range(declarator);
+                }
+            }
+        }
+    }
+
+    // C declarations / type definitions: name is inside the declarator chain
+    if node_type == "declaration" || node_type == "type_definition" {
+        if let Some(declarator) = node.child_by_field_name("declarator") {
+            return find_declarator_name_range(declarator);
+        }
+    }
+
+    // Fallback: first named child that is an identifier or type_identifier
+    let mut cursor = node.walk();
+    for child in node.named_children(&mut cursor) {
+        if child.kind() == "identifier" || child.kind() == "type_identifier" {
+            return Some((child.start_byte(), child.end_byte()));
+        }
+    }
+
+    None
+}
+
+/// Walk a C-style declarator chain down to its name and return that name's byte range, if any.
+fn find_declarator_name_range(node: Node) -> Option<(usize, usize)> {
+    match node.kind() {
+        "identifier" | "type_identifier" | "field_identifier" => {
+            Some((node.start_byte(), node.end_byte()))
+        }
+        "qualified_identifier" | "scoped_identifier" => { // whole qualified path is the range
+            Some((node.start_byte(), node.end_byte()))
+        }
+        "pointer_declarator" | "function_declarator" | "array_declarator"
+        | "parenthesized_declarator" => { // wrapper declarators: descend via 'declarator'
+            if let Some(inner) = node.child_by_field_name("declarator") {
+                find_declarator_name_range(inner)
+            } else {
+                let mut cursor = node.walk();
+                let result = node
+                    .named_children(&mut cursor)
+                    .find(|c| c.kind() == "identifier" || c.kind() == "type_identifier")
+                    .map(|c| (c.start_byte(), c.end_byte()));
+                result
+            }
+        }
+        _ => { // any other kind: try its 'name' field, then the first identifier-like child
+            if let Some(name) = node.child_by_field_name("name") {
+                return Some((name.start_byte(), name.end_byte()));
+            }
+            let mut cursor = node.walk();
+            let result = node
+                .named_children(&mut cursor)
+                .find(|c| c.kind() == "identifier" || c.kind() == "type_identifier")
+                .map(|c| (c.start_byte(), c.end_byte()));
+            result
+        }
+    }
+}
+
 fn extract_name(node: Node, source: &[u8]) -> Option<String> {
     // Try 'name' field first (works for most languages)
     if let Some(name_node) = node.child_by_field_name("name") {
diff --git a/crates/sem-core/src/parser/plugins/code/mod.rs b/crates/sem-core/src/parser/plugins/code/mod.rs
@@ -424,6 +424,33 @@ function outer() {
         assert!(names.contains(&"outer"), "got: {:?}", names);
     }
 
+    #[test]
+    fn test_renamed_function_same_structural_hash() {
+        let code_a = "def get_card():\n    return db.query('cards')\n";
+        let code_b = "def get_card_1():\n    return db.query('cards')\n";
+
+        let plugin = CodeParserPlugin;
+        let entities_a = plugin.extract_entities(code_a, "a.py");
+        let entities_b = plugin.extract_entities(code_b, "b.py");
+
+        assert_eq!(entities_a.len(), 1, "Should find one entity in a");
+        assert_eq!(entities_b.len(), 1, "Should find one entity in b");
+        assert_eq!(entities_a[0].name, "get_card");
+        assert_eq!(entities_b[0].name, "get_card_1");
+
+        // Only the function name differs, so the name-excluding structural hash must be equal
+        assert_eq!(
+            entities_a[0].structural_hash, entities_b[0].structural_hash,
+            "Renamed function with identical body should have same structural_hash"
+        );
+
+        // The content hash is expected to still differ, since the raw content includes the name
+        assert_ne!(
+            entities_a[0].content_hash, entities_b[0].content_hash,
+            "Content hash should differ since raw content includes the name"
+        );
+    }
+
     #[test]
     fn test_hcl_entity_extraction() {
         let code = r#"
diff --git a/crates/sem-core/src/utils/hash.rs b/crates/sem-core/src/utils/hash.rs
@@ -21,6 +21,20 @@ pub fn structural_hash(node: Node, source: &[u8]) -> String {
     format!("{:016x}", hasher.finish())
 }
 
+/// Compute a structural hash of `node` that skips every leaf token overlapping the byte range
+/// `exclude_start..exclude_end` (the entity name), so renames of otherwise identical entities
+/// hash the same for Phase 2 rename detection. Returns a 16-digit lowercase hex string.
+pub fn structural_hash_excluding_range(
+    node: Node,
+    source: &[u8],
+    exclude_start: usize,
+    exclude_end: usize,
+) -> String {
+    let mut hasher = Xxh3::new();
+    hash_structural_tokens_excluding(node, source, &mut hasher, exclude_start, exclude_end);
+    format!("{:016x}", hasher.finish())
+}
+
 /// Recursively hash tokens from the AST, skipping comments.
 /// Hashes both node types (structure) and leaf text (content) so that
 /// structurally different ASTs with identical leaf tokens produce different hashes.
@@ -57,6 +71,46 @@ fn hash_structural_tokens(node: Node, source: &[u8], hasher: &mut Xxh3) {
     }
 }
 
+/// Like `hash_structural_tokens`, but leaf nodes overlapping `exclude_start..exclude_end` (the
+/// entity name) are not hashed. Kinds of non-leaf nodes are still hashed, even inside the range.
+fn hash_structural_tokens_excluding(
+    node: Node,
+    source: &[u8],
+    hasher: &mut Xxh3,
+    exclude_start: usize,
+    exclude_end: usize,
+) {
+    let kind = node.kind();
+
+    if is_comment_node(kind) { // comment nodes contribute nothing to the hash
+        return;
+    }
+
+    if node.child_count() == 0 {
+        let start = node.start_byte();
+        let end = node.end_byte();
+        // Half-open overlap test against exclude_start..exclude_end; overlapping leaves are dropped
+        if start < exclude_end && end > exclude_start {
+            return;
+        }
+        if start < end && end <= source.len() {
+            let bytes = &source[start..end];
+            let trimmed = trim_bytes(bytes);
+            if !trimmed.is_empty() {
+                hasher.write(trimmed);
+                hasher.write(b" "); // separator written after each leaf token
+            }
+        }
+    } else {
+        hasher.write(kind.as_bytes()); // non-leaf: hash the node kind, then its children
+        hasher.write(b":");
+        let mut cursor = node.walk();
+        for child in node.children(&mut cursor) {
+            hash_structural_tokens_excluding(child, source, hasher, exclude_start, exclude_end);
+        }
+    }
+}
+
 /// Trim leading/trailing ASCII whitespace from a byte slice without allocating.
 #[inline]
 fn trim_bytes(bytes: &[u8]) -> &[u8] {