jupyter-ai-contrib · bhavana-nair · Jun 6, 2025 · Jun 12, 2025 · Jun 12, 2025 · Jun 14, 2025
diff --git a/jupyter_ai_personas/knowledge_graph/bulk_analyzer.py b/jupyter_ai_personas/knowledge_graph/bulk_analyzer.py
@@ -0,0 +1,209 @@
+import os
+import tree_sitter_python as tspython
+from tree_sitter import Language, Parser
+from neo4j import GraphDatabase
+import hashlib
+import boto3
+import json
+
+
+class BulkCodeAnalyzer:
+    def __init__(self, uri, auth, embd_name=None, embd_id=None):
+        self.driver = GraphDatabase.driver(uri, auth=auth)
+        self.PY_LANGUAGE = Language(tspython.language())
+        self.parser = Parser(self.PY_LANGUAGE)
+        self.embd_name = embd_name  # Bedrock
+        self.embd_id = embd_id  # amazon.titan-embed-text-v1
+        self.bedrock_client = boto3.client("bedrock-runtime") if embd_name else None
+
+    def analyze_folder(self, folder_path, clear_existing=False):
+        """Analyze all supported files in a folder and add to knowledge graph"""
+        if clear_existing:
+            with self.driver.session() as session:
+                session.run("MATCH (n) DETACH DELETE n")
+                print("Cleared existing graph")
+
+        # Supported file extensions
+        supported_extensions = {".py"}  # for 1st phase just py
+
+        all_files = []
+        for root, dirs, files in os.walk(folder_path):
+            for file in files:
+                file_ext = os.path.splitext(file)[1]
+                if file_ext in supported_extensions:
+                    all_files.append(os.path.join(root, file))
+
+        print(f"Found {len(all_files)} supported files")
+
+        with self.driver.session() as session:
+            for file_path in all_files:
+                print(f"Analyzing: {file_path}")
+                try:
+                    if file_path.endswith(".py"):
+                        self._analyze_file(file_path, session)
+                    else:
+                        self._analyze_non_python_file(file_path, session)
+                except Exception as e:
+                    print(f"Error analyzing {file_path}: {e}")
+
+    def _analyze_file(self, file_path, session):
+        with open(file_path, "r", encoding="utf-8") as f:
+            code = f.read()
+
+        tree = self.parser.parse(bytes(code, "utf8"))
+        self._extract_code_elements(tree.root_node, session, file_path)
+
+    def _extract_code_elements(self, node, session, file_path, current_class=None):
+        if node.type == "class_definition":
+            class_name = node.child_by_field_name("name").text.decode("utf8")
+            class_code = node.text.decode("utf8", errors="ignore")
+            embedding = self._get_embedding(class_code) if self.bedrock_client else None
+
+            session.run(
+                "MERGE (c:Class {name: $name}) SET c.file = $file, c.embedding = $embedding",
+                name=class_name,
+                file=file_path,
+                embedding=embedding,
+            )
+
+            superclasses = node.child_by_field_name("superclasses")
+            if superclasses:
+                for child in superclasses.children:
+                    if child.type == "identifier":
+                        parent = child.text.decode("utf8")
+                        session.run(
+                            "MERGE (parent:Class {name: $parent})", parent=parent
+                        )
+                        session.run(
+                            "MATCH (parent:Class {name: $parent}), (child:Class {name: $child}) "
+                            "MERGE (child)-[:INHERITS_FROM]->(parent)",
+                            parent=parent,
+                            child=class_name,
+                        )
+
+            for child in node.children:
+                self._extract_code_elements(child, session, file_path, class_name)
+
+        elif node.type == "function_definition":
+            func_name = node.child_by_field_name("name").text.decode("utf8")
+            func_code = node.text.decode("utf8", errors="ignore")
+
+            params_node = node.child_by_field_name("parameters")
+            params = []
+            if params_node:
+                for child in params_node.children:
+                    if child.type == "identifier":
+                        params.append(child.text.decode("utf8"))
+
+            code_hash = hashlib.md5(func_code.encode()).hexdigest()
+
+            # Generate embedding for function code
+            embedding = self._get_embedding(func_code) if self.bedrock_client else None
+
+            session.run(
+                "MERGE (f:Function {name: $name, file: $file}) "
+                "SET f.code = $code, f.code_hash = $hash, f.parameters = $params, f.line_start = $start, f.line_end = $end, f.embedding = $embedding",
+                name=func_name,
+                file=file_path,
+                code=func_code,
+                hash=code_hash,
+                params=params,
+                start=node.start_point[0],
+                end=node.end_point[0],
+                embedding=embedding,
+            )
+
+            if current_class:
+                session.run(
+                    "MATCH (c:Class {name: $class_name}), (f:Function {name: $func_name, file: $file}) "
+                    "MERGE (c)-[:CONTAINS]->(f)",
+                    class_name=current_class,
+                    func_name=func_name,
+                    file=file_path,
+                )
+
+            # Extract function calls
+            self._extract_function_calls(node, session, func_name, file_path)
+
+        else:
+            for child in node.children:
+                self._extract_code_elements(child, session, file_path, current_class)
+
+    def _analyze_non_python_file(self, file_path, session):
+        """Analyze non-Python files (basic content indexing)"""
+        try:
+            with open(file_path, "r", encoding="utf-8") as f:
+                content = f.read()
+
+            # Create a File node for non-Python files
+            embedding = (
+                self._get_embedding(content[:5000]) if self.bedrock_client else None
+            )
+
+            session.run(
+                "MERGE (f:File {path: $path}) SET f.content = $content, f.size = $size, f.type = $type, f.embedding = $embedding",
+                path=file_path,
+                content=content[:5000],
+                size=len(content),
+                type=os.path.splitext(file_path)[1],
+                embedding=embedding,
+            )
+
+        except Exception as e:
+            print(f"Error reading {file_path}: {e}")
+            # Create File node without content
+            session.run(
+                "MERGE (f:File {path: $path}) SET f.error = $error, f.type = $type",
+                path=file_path,
+                error=str(e),
+                type=os.path.splitext(file_path)[1],
+            )
+
+    def _extract_function_calls(self, func_node, session, caller_name, file_path):
+        """Extract function calls from a function body"""
+
+        def find_calls(node):
+            calls = []
+            if node.type == "call":
+                func_expr = node.child_by_field_name("function")
+                if func_expr and func_expr.type == "identifier":
+                    called_func = func_expr.text.decode("utf8")
+                    calls.append(called_func)
+                elif func_expr and func_expr.type == "attribute":
+                    # Handle method calls like obj.method()
+                    attr = func_expr.child_by_field_name("attribute")
+                    if attr:
+                        called_func = attr.text.decode("utf8")
+                        calls.append(called_func)
+
+            for child in node.children:
+                calls.extend(find_calls(child))
+            return calls
+
+        called_functions = find_calls(func_node)
+
+        for called_func in called_functions:
+            # Create CALLS relationship
+            session.run(
+                "MATCH (caller:Function {name: $caller, file: $file}) "
+                "MERGE (called:Function {name: $called}) "
+                "MERGE (caller)-[:CALLS]->(called)",
+                caller=caller_name,
+                called=called_func,
+                file=file_path,
+            )
+
+    def _get_embedding(self, text):
+        """Generate embedding using AWS Bedrock Titan model"""
+        try:
+            response = self.bedrock_client.invoke_model(
+                modelId=self.embd_id, body=json.dumps({"inputText": text})
+            )
+            return json.loads(response["body"].read())["embedding"]
+        except Exception as e:
+            print(f"Error generating embedding: {e}")
+            return None
+
+
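+# Example usage (take the password from the environment; never commit credentials):
+# analyzer = BulkCodeAnalyzer("neo4j://127.0.0.1:7687", ("neo4j", os.environ["NEO4J_PASSWORD"]))
+# analyzer.analyze_folder("source_code", clear_existing=True)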
diff --git a/jupyter_ai_personas/knowledge_graph/code_analysis_tool.py b/jupyter_ai_personas/knowledge_graph/code_analysis_tool.py
@@ -0,0 +1,123 @@
+from agno.tools import Toolkit
+from .bulk_analyzer import BulkCodeAnalyzer
+from neo4j import GraphDatabase
+import ast
+import os
+
+
+class CodeAnalysisTool(Toolkit):
+    def __init__(self):
+        super().__init__(name="code_analysis")
+        # Use environment variables for Neo4j credentials with defaults
+        neo4j_uri = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
+        neo4j_user = os.getenv("NEO4J_USER", "neo4j")
+        neo4j_password = os.getenv("NEO4J_PASSWORD")
+
+        if not neo4j_password:
+            raise ValueError("NEO4J_PASSWORD environment variable must be set")
+
+        self.driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))
+        self.register(self.get_class_info)
+        self.register(self.find_related_classes)
+        self.register(self.query_code)
+        self.register(self.get_function_code)
+
+    def get_class_info(self, class_name: str) -> str:
+        """Get detailed information about a class from the knowledge graph"""
+        try:
+            with self.driver.session() as session:
+                # Get class info
+                class_result = session.run(
+                    "MATCH (c:Class {name: $class_name}) RETURN c.file as file",
+                    class_name=class_name,
+                )
+                class_record = class_result.single()
+                if not class_record:
+                    return f"Class {class_name} not found in knowledge graph"
+
+                inherit_result = session.run(
+                    "MATCH (c:Class {name: $class_name})-[:INHERITS_FROM]->(parent:Class) "
+                    "RETURN parent.name as parent_name",
+                    class_name=class_name,
+                )
+                parents = [record["parent_name"] for record in inherit_result]
+
+                method_result = session.run(
+                    "MATCH (c:Class {name: $class_name})-[:CONTAINS]->(f:Function) "
+                    "RETURN f.name as method_name, f.parameters as params",
+                    class_name=class_name,
+                )
+                methods = [
+                    (record["method_name"], record["params"])
+                    for record in method_result
+                ]
+
+                info = f"Class {class_name}:\n"
+                info += f"  File: {class_record['file']}\n"
+                if parents:
+                    info += f"  Inherits from: {', '.join(parents)}\n"
+                info += f"  Methods:\n"
+                for method_name, params in methods:
+                    param_str = ", ".join(params) if params else ""
+                    info += f"    {method_name}({param_str})\n"
+
+                return info
+        except Exception as e:
+            return f"Error getting class info: {str(e)}"
+
+    def get_function_code(self, function_name: str, class_name: str = None) -> str:
+        """Get the source code of a function from the knowledge graph"""
+        try:
+            with self.driver.session() as session:
+                # Query function with code directly
+                if class_name:
+                    result = session.run(
+                        "MATCH (c:Class {name: $class_name})-[:CONTAINS]->(f:Function {name: $function_name}) "
+                        "RETURN f.code as code, f.file as file, f.line_start as line_start, f.line_end as line_end",
+                        class_name=class_name,
+                        function_name=function_name,
+                    )
+                else:
+                    result = session.run(
+                        "MATCH (f:Function {name: $function_name}) "
+                        "RETURN f.code as code, f.file as file, f.line_start as line_start, f.line_end as line_end",
+                        function_name=function_name,
+                    )
+
+                record = result.single()
+                if not record:
+                    return f"Function {function_name} not found"
+
+                # If code is stored directly on the function node
+                if record["code"]:
+                    return f"Function {function_name} code:\n{record['code']}"
+
+        except Exception as e:
+            return f"Error getting function code: {str(e)}"
+
+    def find_related_classes(self, class_name: str) -> str:
+        """Find all classes that inherit from the given class"""
+        try:
+            with self.driver.session() as session:
+                result = session.run(
+                    "MATCH (related:Class)-[:INHERITS_FROM*]->(c:Class {name: $class_name}) "
+                    "RETURN related.name as related_class",
+                    class_name=class_name,
+                )
+                related = [record["related_class"] for record in result]
+            if related:
+                return f"Classes that inherit from {class_name}: {', '.join(related)}"
+            else:
+                return f"No classes inherit from {class_name}"
+        except Exception as e:
+            return f"Error finding related classes: {str(e)}"
+
+    def query_code(self, query: str) -> str:
+        """Execute custom Cypher queries on the code knowledge graph"""
+        try:
+            with self.driver.session() as session:
+                result = session.run(query)
+                records = [dict(record) for record in result]
+                return str(records) if records else "No results found"
+        except Exception as e:
+            return f"Query error: {str(e)}"