Skip to content
Open
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
ec003b9
New persona: PR Reviewer
bhavana-nair Jun 6, 2025
dd50cea
added ci_tools, removed pr_tools
bhavana-nair Jun 12, 2025
3ac5afb
class name changed to PRReviewPersona
bhavana-nair Jun 12, 2025
19d3d40
GITHUB_ACCESS_TOKEN variable name change and handles errors gracefully
bhavana-nair Jun 14, 2025
b129166
Readme and comments to handle ThrottlingException error
bhavana-nair Jun 14, 2025
e4c5c98
commented out temp file deletion
bhavana-nair Jun 16, 2025
7670b3f
added unit tests
bhavana-nair Jun 18, 2025
6f827a9
Merge branch 'main' into pr-reviewer
bhavana-nair Jun 18, 2025
6cb0fd6
added all the dependencies
bhavana-nair Jun 18, 2025
220eeaf
python version in test
bhavana-nair Jun 18, 2025
f6ba3cf
test ci
bhavana-nair Jun 18, 2025
9705f40
test ci
bhavana-nair Jun 18, 2025
fabcdd6
fixing async methods in test
bhavana-nair Jun 23, 2025
0b321b6
stream_message to send_message, removed print statements, replaced ci…
bhavana-nair Jun 30, 2025
4ddc6dc
Create inline PR comments
bhavana-nair Jul 6, 2025
387b9b7
Unit test
bhavana-nair Jul 7, 2025
25f979b
Merge branch 'main' into pr-reviewer
bhavana-nair Jul 7, 2025
fd1961d
ruff format
bhavana-nair Jul 7, 2025
75caf02
Modified commenting tool to handle larger PRs smoothly.
bhavana-nair Jul 11, 2025
d2dac41
Merge branch 'main' into pr-reviewer
bhavana-nair Jul 11, 2025
d989cc3
Updating personas to use config_manager insted of config to make latt…
bhavana-nair Jul 11, 2025
f217837
Context awareness tool and KG implementation
bhavana-nair Jul 13, 2025
c909b81
Added code embedings in KG for semantic similarity search
bhavana-nair Jul 14, 2025
d360c74
Formatting
bhavana-nair Jul 16, 2025
6e8959e
innitial msg
bhavana-nair Jul 24, 2025
a2eed7d
Merge branch 'main' into context-awareness
bhavana-nair Jul 24, 2025
fe28119
Test fix
bhavana-nair Jul 24, 2025
b79f5d9
non-functional code cleanup
bhavana-nair Aug 1, 2025
81e9ee1
Merge branch 'main' into context-awareness
bhavana-nair Aug 1, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209 changes: 209 additions & 0 deletions jupyter_ai_personas/knowledge_graph/bulk_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
import os
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Documentation: Missing module-level documentation and usage examples. Please add:

  • Module purpose and functionality overview
  • Configuration options
  • Usage examples
  • Error handling documentation

Example:

"""
Bulk Analyzer module for processing multiple code artifacts in the knowledge graph.

This module provides tools for analyzing multiple code files or components simultaneously,
enabling efficient knowledge graph population.

Configuration:
    See config_example.yaml for detailed settings
"""

import tree_sitter_python as tspython
from tree_sitter import Language, Parser
from neo4j import GraphDatabase
import hashlib
import boto3
import json


class BulkCodeAnalyzer:
    """Parse Python sources with tree-sitter and store their structure in Neo4j.

    For every ``.py`` file found under a folder, the analyzer writes ``Class``
    and ``Function`` nodes plus ``INHERITS_FROM``, ``CONTAINS`` and ``CALLS``
    relationships. When ``embd_name`` is truthy, the source text of each class
    and function is additionally sent to a ``bedrock-runtime`` client and the
    returned embedding is stored on the node.

    The instance owns a Neo4j driver. Call :meth:`close` when finished, or use
    the analyzer as a context manager, so the driver is released.
    """

    def __init__(self, uri, auth, embd_name=None, embd_id=None):
        """Open the Neo4j driver and set up the tree-sitter Python parser.

        Args:
            uri: Neo4j connection URI passed to ``GraphDatabase.driver``.
            auth: Credentials passed as ``auth=`` to ``GraphDatabase.driver``.
            embd_name: Embedding provider name; any truthy value enables
                embeddings through a ``bedrock-runtime`` boto3 client.
            embd_id: Model id passed as ``modelId`` when requesting embeddings.
        """
        self.driver = GraphDatabase.driver(uri, auth=auth)
        self.PY_LANGUAGE = Language(tspython.language())
        self.parser = Parser(self.PY_LANGUAGE)
        self.embd_name = embd_name  # e.g. Bedrock
        self.embd_id = embd_id  # e.g. amazon.titan-embed-text-v1
        self.bedrock_client = boto3.client("bedrock-runtime") if embd_name else None

    def close(self):
        """Close the underlying Neo4j driver."""
        self.driver.close()

    def __enter__(self):
        """Return the analyzer itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never suppresses errors."""
        self.close()
        return False

    def analyze_folder(self, folder_path, clear_existing=False):
        """Analyze all supported files in a folder and add to knowledge graph.

        Args:
            folder_path: Root directory that is walked recursively.
            clear_existing: When true, every node and relationship in the
                database is deleted before analysis starts.

        A failure while analyzing one file is printed and the remaining files
        are still processed.
        """
        if clear_existing:
            with self.driver.session() as session:
                session.run("MATCH (n) DETACH DELETE n")
            print("Cleared existing graph")

        # Supported file extensions
        supported_extensions = {".py"}  # for 1st phase just py

        all_files = [
            os.path.join(root, file_name)
            for root, _dirs, files in os.walk(folder_path)
            for file_name in files
            if os.path.splitext(file_name)[1] in supported_extensions
        ]

        print(f"Found {len(all_files)} supported files")

        with self.driver.session() as session:
            for file_path in all_files:
                print(f"Analyzing: {file_path}")
                try:
                    if file_path.endswith(".py"):
                        self._analyze_file(file_path, session)
                    else:
                        # Unreachable while only ".py" is supported; kept for
                        # when more extensions are added above.
                        self._analyze_non_python_file(file_path, session)
                except Exception as e:
                    # Best effort: one unreadable/unparsable file must not
                    # abort the whole bulk run.
                    print(f"Error analyzing {file_path}: {e}")

    def _analyze_file(self, file_path, session):
        """Parse one Python file and record its classes and functions."""
        with open(file_path, "r", encoding="utf-8") as f:
            code = f.read()

        tree = self.parser.parse(bytes(code, "utf8"))
        self._extract_code_elements(tree.root_node, session, file_path)

    def _extract_code_elements(self, node, session, file_path, current_class=None):
        """Walk the syntax tree and record every class and function definition.

        Args:
            node: tree-sitter node to inspect.
            session: Open Neo4j session used for all writes.
            file_path: Path of the file being analyzed (stored on the nodes).
            current_class: Name of the enclosing class, if any; functions found
                below it get a ``CONTAINS`` relationship from that class.

        The walk does not descend into function bodies, so definitions nested
        inside a function are not recorded as separate nodes.
        """
        is_definition = node.type in ("class_definition", "function_definition")
        name_node = node.child_by_field_name("name") if is_definition else None

        if name_node is None:
            # Not a definition (or, defensively, a definition node without a
            # name child): just keep descending.
            for child in node.children:
                self._extract_code_elements(child, session, file_path, current_class)
            return

        name = name_node.text.decode("utf8")
        if node.type == "class_definition":
            self._record_class(node, name, session, file_path)
            for child in node.children:
                self._extract_code_elements(child, session, file_path, name)
        else:
            self._record_function(node, name, session, file_path, current_class)

    def _record_class(self, node, class_name, session, file_path):
        """Write a ``Class`` node and its ``INHERITS_FROM`` relationships."""
        class_code = node.text.decode("utf8", errors="ignore")
        embedding = self._get_embedding(class_code) if self.bedrock_client else None

        # NOTE(review): classes are merged on name only, so two classes with the
        # same name in different files share one node (the last file wins).
        session.run(
            "MERGE (c:Class {name: $name}) SET c.file = $file, c.embedding = $embedding",
            name=class_name,
            file=file_path,
            embedding=embedding,
        )

        superclasses = node.child_by_field_name("superclasses")
        if not superclasses:
            return
        for child in superclasses.children:
            # Only plain names are recorded; other base expressions are skipped.
            if child.type != "identifier":
                continue
            parent = child.text.decode("utf8")
            session.run("MERGE (parent:Class {name: $parent})", parent=parent)
            session.run(
                "MATCH (parent:Class {name: $parent}), (child:Class {name: $child}) "
                "MERGE (child)-[:INHERITS_FROM]->(parent)",
                parent=parent,
                child=class_name,
            )

    def _record_function(self, node, func_name, session, file_path, current_class):
        """Write a ``Function`` node, its class membership and its outgoing calls."""
        func_code = node.text.decode("utf8", errors="ignore")

        params = []
        params_node = node.child_by_field_name("parameters")
        if params_node:
            for child in params_node.children:
                param = self._parameter_name(child)
                if param is not None:
                    params.append(param)

        # MD5 is used only as a change fingerprint of the source text.
        code_hash = hashlib.md5(func_code.encode()).hexdigest()

        # Generate embedding for function code
        embedding = self._get_embedding(func_code) if self.bedrock_client else None

        session.run(
            "MERGE (f:Function {name: $name, file: $file}) "
            "SET f.code = $code, f.code_hash = $hash, f.parameters = $params, f.line_start = $start, f.line_end = $end, f.embedding = $embedding",
            name=func_name,
            file=file_path,
            code=func_code,
            hash=code_hash,
            params=params,
            start=node.start_point[0],
            end=node.end_point[0],
            embedding=embedding,
        )

        if current_class:
            session.run(
                "MATCH (c:Class {name: $class_name}), (f:Function {name: $func_name, file: $file}) "
                "MERGE (c)-[:CONTAINS]->(f)",
                class_name=current_class,
                func_name=func_name,
                file=file_path,
            )

        # Extract function calls
        self._extract_function_calls(node, session, func_name, file_path)

    @staticmethod
    def _parameter_name(param_node):
        """Return the parameter name carried by one child of a ``parameters`` node.

        Handles plain, typed, defaulted and splat (``*args`` / ``**kwargs``)
        parameters; splats keep their ``*`` / ``**`` prefix. Returns ``None``
        for punctuation and separators.
        """
        kind = param_node.type
        if kind == "identifier":
            return param_node.text.decode("utf8")
        if kind in ("default_parameter", "typed_default_parameter"):
            name_node = param_node.child_by_field_name("name")
            return name_node.text.decode("utf8") if name_node is not None else None
        if kind in ("list_splat_pattern", "dictionary_splat_pattern"):
            return param_node.text.decode("utf8")
        if kind == "typed_parameter":
            # The name (or splat pattern) is a child; the annotation is not
            # descended into, so names inside the type are never picked up.
            for child in param_node.children:
                name = BulkCodeAnalyzer._parameter_name(child)
                if name is not None:
                    return name
        return None

    def _analyze_non_python_file(self, file_path, session):
        """Analyze non-Python files (basic content indexing).

        Stores the first 5000 characters of the file, its full length and its
        extension on a ``File`` node. If anything fails, a ``File`` node holding
        the error text is written instead.
        """
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()

            # Create a File node for non-Python files
            embedding = (
                self._get_embedding(content[:5000]) if self.bedrock_client else None
            )

            session.run(
                "MERGE (f:File {path: $path}) SET f.content = $content, f.size = $size, f.type = $type, f.embedding = $embedding",
                path=file_path,
                content=content[:5000],
                size=len(content),
                type=os.path.splitext(file_path)[1],
                embedding=embedding,
            )

        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            # Create File node without content
            session.run(
                "MERGE (f:File {path: $path}) SET f.error = $error, f.type = $type",
                path=file_path,
                error=str(e),
                type=os.path.splitext(file_path)[1],
            )

    @staticmethod
    def _iter_called_names(node):
        """Yield the callee name of every call in ``node``'s subtree (pre-order).

        ``foo()`` yields ``foo``; ``obj.method()`` yields ``method``. Calls whose
        callee is any other kind of expression are skipped.
        """
        if node.type == "call":
            func_expr = node.child_by_field_name("function")
            if func_expr is not None:
                if func_expr.type == "identifier":
                    yield func_expr.text.decode("utf8")
                elif func_expr.type == "attribute":
                    # Handle method calls like obj.method()
                    attr = func_expr.child_by_field_name("attribute")
                    if attr is not None:
                        yield attr.text.decode("utf8")

        for child in node.children:
            yield from BulkCodeAnalyzer._iter_called_names(child)

    def _extract_function_calls(self, func_node, session, caller_name, file_path):
        """Extract function calls from a function body.

        Creates one ``CALLS`` relationship from the caller to each distinct
        callee name found under ``func_node``.
        """
        # The relationship is MERGEd, so repeated calls to the same name add
        # nothing; de-duplicate (keeping first-seen order) to skip the extra
        # round-trips.
        called_functions = list(dict.fromkeys(self._iter_called_names(func_node)))

        # NOTE(review): the callee is merged on name alone. When no Function
        # with that name exists yet, this creates a node with only a name,
        # which the later name+file MERGE for the real definition does not
        # match.
        for called_func in called_functions:
            # Create CALLS relationship
            session.run(
                "MATCH (caller:Function {name: $caller, file: $file}) "
                "MERGE (called:Function {name: $called}) "
                "MERGE (caller)-[:CALLS]->(called)",
                caller=caller_name,
                called=called_func,
                file=file_path,
            )

    def _get_embedding(self, text):
        """Generate an embedding for ``text`` through the Bedrock runtime client.

        Returns:
            The ``"embedding"`` value of the model response, or ``None`` when no
            client is configured or the request fails (the error is printed).
        """
        if self.bedrock_client is None:
            return None
        try:
            response = self.bedrock_client.invoke_model(
                modelId=self.embd_id, body=json.dumps({"inputText": text})
            )
            return json.loads(response["body"].read())["embedding"]
        except Exception as e:
            print(f"Error generating embedding: {e}")
            return None


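# Example usage (take the password from the environment; never hard-code credentials):
# analyzer = BulkCodeAnalyzer("neo4j://127.0.0.1:7687", ("neo4j", os.environ["NEO4J_PASSWORD"]))
# analyzer.analyze_folder("source_code", clear_existing=True)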
123 changes: 123 additions & 0 deletions jupyter_ai_personas/knowledge_graph/code_analysis_tool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
from agno.tools import Toolkit
from .bulk_analyzer import BulkCodeAnalyzer
from neo4j import GraphDatabase
import ast
import os


class CodeAnalysisTool(Toolkit):
    """Toolkit exposing read helpers over the code knowledge graph in Neo4j.

    Registers four tools: ``get_class_info``, ``find_related_classes``,
    ``query_code`` and ``get_function_code``. Every tool returns a plain string
    and reports failures as text instead of raising.
    """

    def __init__(self):
        """Connect to Neo4j using environment variables and register the tools.

        ``NEO4J_URI`` defaults to ``neo4j://localhost:7687`` and ``NEO4J_USER``
        to ``neo4j``.

        Raises:
            ValueError: If ``NEO4J_PASSWORD`` is unset or empty.
        """
        super().__init__(name="code_analysis")
        # Use environment variables for Neo4j credentials with defaults
        neo4j_uri = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
        neo4j_user = os.getenv("NEO4J_USER", "neo4j")
        neo4j_password = os.getenv("NEO4J_PASSWORD")

        if not neo4j_password:
            raise ValueError("NEO4J_PASSWORD environment variable must be set")

        self.driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))
        self.register(self.get_class_info)
        self.register(self.find_related_classes)
        self.register(self.query_code)
        self.register(self.get_function_code)

    def close(self):
        """Close the underlying Neo4j driver (not registered as a tool)."""
        self.driver.close()

    def get_class_info(self, class_name: str) -> str:
        """Get detailed information about a class from the knowledge graph

        Returns the class's file, its direct parent classes (if any) and its
        methods with their parameters, or a "not found" / error message.
        """
        try:
            with self.driver.session() as session:
                # Get class info
                class_result = session.run(
                    "MATCH (c:Class {name: $class_name}) RETURN c.file as file",
                    class_name=class_name,
                )
                class_record = class_result.single()
                if not class_record:
                    return f"Class {class_name} not found in knowledge graph"

                inherit_result = session.run(
                    "MATCH (c:Class {name: $class_name})-[:INHERITS_FROM]->(parent:Class) "
                    "RETURN parent.name as parent_name",
                    class_name=class_name,
                )
                parents = [record["parent_name"] for record in inherit_result]

                method_result = session.run(
                    "MATCH (c:Class {name: $class_name})-[:CONTAINS]->(f:Function) "
                    "RETURN f.name as method_name, f.parameters as params",
                    class_name=class_name,
                )
                methods = [
                    (record["method_name"], record["params"])
                    for record in method_result
                ]

            lines = [f"Class {class_name}:", f" File: {class_record['file']}"]
            if parents:
                lines.append(f" Inherits from: {', '.join(parents)}")
            lines.append(" Methods:")
            for method_name, params in methods:
                param_str = ", ".join(params) if params else ""
                lines.append(f" {method_name}({param_str})")

            # Every line, including the last, is newline-terminated.
            return "\n".join(lines) + "\n"
        except Exception as e:
            return f"Error getting class info: {str(e)}"

    def get_function_code(self, function_name: str, class_name: str = None) -> str:
        """Get the source code of a function from the knowledge graph

        Args:
            function_name: Name of the function to look up.
            class_name: Optional class name; when given, only functions that
                the class ``CONTAINS`` are considered.

        Returns the stored source text, or a message when the function is
        unknown or has no stored source.
        """
        try:
            with self.driver.session() as session:
                # Query function with code directly
                if class_name:
                    result = session.run(
                        "MATCH (c:Class {name: $class_name})-[:CONTAINS]->(f:Function {name: $function_name}) "
                        "RETURN f.code as code, f.file as file, f.line_start as line_start, f.line_end as line_end",
                        class_name=class_name,
                        function_name=function_name,
                    )
                else:
                    result = session.run(
                        "MATCH (f:Function {name: $function_name}) "
                        "RETURN f.code as code, f.file as file, f.line_start as line_start, f.line_end as line_end",
                        function_name=function_name,
                    )
                # Several nodes can share a name (same name in different
                # files, or a node that carries no code); read them all
                # instead of relying on single().
                records = list(result)

            if not records:
                return f"Function {function_name} not found"

            # Prefer the first match that actually has code stored on the node.
            for record in records:
                if record["code"]:
                    return f"Function {function_name} code:\n{record['code']}"

            return (
                f"Function {function_name} found but no source code is stored "
                "in the knowledge graph"
            )
        except Exception as e:
            return f"Error getting function code: {str(e)}"

    def find_related_classes(self, class_name: str) -> str:
        """Find all classes that inherit from the given class

        Follows ``INHERITS_FROM`` transitively, so indirect subclasses are
        included. Each subclass is listed once.
        """
        try:
            with self.driver.session() as session:
                # DISTINCT: the variable-length match returns one row per path,
                # so a subclass reachable by several paths would repeat.
                result = session.run(
                    "MATCH (related:Class)-[:INHERITS_FROM*]->(c:Class {name: $class_name}) "
                    "RETURN DISTINCT related.name as related_class",
                    class_name=class_name,
                )
                related = [record["related_class"] for record in result]

            if related:
                return f"Classes that inherit from {class_name}: {', '.join(related)}"
            return f"No classes inherit from {class_name}"
        except Exception as e:
            return f"Error finding related classes: {str(e)}"

    def query_code(self, query: str) -> str:
        """Execute custom Cypher queries on the code knowledge graph

        Returns ``str()`` of the list of result records (as dicts), or
        "No results found" when the query yields no rows.
        """
        # NOTE(review): `query` is executed verbatim, so a caller can also run
        # write or delete statements. If this tool is meant to be read-only,
        # restrict it (read-only session/transaction or a read-only database
        # user) — confirm the intended contract before changing behavior.
        try:
            with self.driver.session() as session:
                result = session.run(query)
                records = [dict(record) for record in result]
            return str(records) if records else "No results found"
        except Exception as e:
            return f"Query error: {str(e)}"
Loading
Loading