Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
ec003b9
New persona: PR Reviewer
bhavana-nair Jun 6, 2025
dd50cea
added ci_tools, removed pr_tools
bhavana-nair Jun 12, 2025
3ac5afb
class name changed to PRReviewPersona
bhavana-nair Jun 12, 2025
19d3d40
GITHUB_ACCESS_TOKEN variable name change and handles errors gracefully
bhavana-nair Jun 14, 2025
b129166
Readme and comments to handle ThrottlingException error
bhavana-nair Jun 14, 2025
e4c5c98
commented out temp file deletion
bhavana-nair Jun 16, 2025
7670b3f
added unit tests
bhavana-nair Jun 18, 2025
6f827a9
Merge branch 'main' into pr-reviewer
bhavana-nair Jun 18, 2025
6cb0fd6
added all the dependencies
bhavana-nair Jun 18, 2025
220eeaf
python version in test
bhavana-nair Jun 18, 2025
f6ba3cf
test ci
bhavana-nair Jun 18, 2025
9705f40
test ci
bhavana-nair Jun 18, 2025
fabcdd6
fixing async methods in test
bhavana-nair Jun 23, 2025
0b321b6
stream_message to send_message, removed print statements, replaced ci…
bhavana-nair Jun 30, 2025
4ddc6dc
Create inline PR comments
bhavana-nair Jul 6, 2025
387b9b7
Unit test
bhavana-nair Jul 7, 2025
25f979b
Merge branch 'main' into pr-reviewer
bhavana-nair Jul 7, 2025
fd1961d
ruff format
bhavana-nair Jul 7, 2025
75caf02
Modified commenting tool to handle larger PRs smoothly.
bhavana-nair Jul 11, 2025
d2dac41
Merge branch 'main' into pr-reviewer
bhavana-nair Jul 11, 2025
d989cc3
Updating personas to use config_manager instead of config to make latt…
bhavana-nair Jul 11, 2025
f217837
Context awareness tool and KG implementation
bhavana-nair Jul 13, 2025
c909b81
Added code embeddings in KG for semantic similarity search
bhavana-nair Jul 14, 2025
d759cf6
PR Fixing Persona integrated with KG access
bhavana-nair Jul 14, 2025
771afa6
Removed issue logic
bhavana-nair Jul 15, 2025
0984ef6
Tool utilization updated
bhavana-nair Jul 20, 2025
385382b
Fix: Added Neo4j connection context manager and improved documentation
bhavana-nair Jul 22, 2025
8877276
Fix: Improved function documentation and added type hints
bhavana-nair Jul 22, 2025
005d04d
Fix: Added function documentation and type hints
bhavana-nair Jul 22, 2025
04ffbd2
Fix: Added comprehensive documentation and nested context manager usage
bhavana-nair Jul 22, 2025
2614855
Fix: Added documentation, type hints, and example
bhavana-nair Jul 22, 2025
9d14f2f
Fix: Added comprehensive documentation, type hints, and error handling
bhavana-nair Jul 22, 2025
42eca8c
Fix: Added required base dependencies with version constraints
bhavana-nair Jul 22, 2025
b5b9652
Fix: Added version constraints and test dependencies
bhavana-nair Jul 22, 2025
bcf4b42
Fix: Added version constraints and test dependencies
bhavana-nair Jul 22, 2025
aa9ef08
Fix: Added dev dependencies for testing and code quality
bhavana-nair Jul 22, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed .jupyter_ystore.db
Binary file not shown.
49 changes: 0 additions & 49 deletions gfhgd.chat

This file was deleted.

334 changes: 334 additions & 0 deletions jupyter_ai_personas/knowledge_graph/bulk_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,334 @@
import os
import tree_sitter_python as tspython
from tree_sitter import Language, Parser
from neo4j import GraphDatabase
import hashlib
import boto3
import json
from contextlib import contextmanager
from typing import Optional

class Neo4jConnection:
    """Context manager for Neo4j database connections.

    Wraps a neo4j driver so that a session is opened on ``__enter__``
    and reliably closed on ``__exit__``, even when the managed block
    raises.

    Usage:
        analyzer = BulkCodeAnalyzer(uri, auth)
        with analyzer.db_connection() as session:
            session.run("MATCH (n) RETURN n")
    """
    def __init__(self, driver):
        # The driver is owned by the caller; this class only manages
        # the per-use session lifetime, never the driver itself.
        self.driver = driver
        # Initialize eagerly so __exit__ is safe even if __enter__
        # was never called (or failed before assigning the session).
        self.session = None

    def __enter__(self):
        self.session = self.driver.session()
        return self.session

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.session is not None:
            self.session.close()
            self.session = None
        # Returning None (falsy) propagates any in-flight exception.

class BulkCodeAnalyzer:
    """Analyzes Python source code and builds a knowledge graph in Neo4j.

    This class parses Python source files using tree-sitter, extracts code elements
    like classes and functions, and stores their relationships in a Neo4j graph database.
    Optionally generates embeddings using AWS Bedrock for semantic analysis.

    The instance owns a Neo4j driver; call :meth:`close` when done, or use
    the instance as a context manager so the driver is released automatically.

    Args:
        uri (str): Neo4j database URI
        auth (tuple): Neo4j authentication tuple (username, password)
        embd_name (str, optional): Name of the embedding service (e.g. "Bedrock")
        embd_id (str, optional): Model ID for embeddings (e.g. "amazon.titan-embed-text-v1")

    Example:
        >>> with BulkCodeAnalyzer(
        ...     "neo4j://localhost:7687",
        ...     ("neo4j", "password"),
        ...     embd_name="Bedrock",
        ...     embd_id="amazon.titan-embed-text-v1"
        ... ) as analyzer:
        ...     analyzer.analyze_folder("path/to/code")
    """
    def __init__(self, uri, auth, embd_name: Optional[str] = None, embd_id: Optional[str] = None):
        self.driver = GraphDatabase.driver(uri, auth=auth)
        self.PY_LANGUAGE = Language(tspython.language())
        self.parser = Parser(self.PY_LANGUAGE)
        self.embd_name = embd_name  # e.g. "Bedrock"
        self.embd_id = embd_id  # e.g. "amazon.titan-embed-text-v1"
        # Only build the Bedrock client when embeddings were requested;
        # all embedding call sites check this attribute for None.
        self.bedrock_client = boto3.client('bedrock-runtime') if embd_name else None

    def close(self) -> None:
        """Release the Neo4j driver and its underlying connection pool.

        Fix: the driver was previously never closed, leaking connections
        for the lifetime of the process.
        """
        self.driver.close()

    def __enter__(self):
        """Allow ``with BulkCodeAnalyzer(...) as analyzer:`` usage."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the driver on context exit; exceptions propagate."""
        self.close()

    @contextmanager
    def db_connection(self):
        """Context manager for Neo4j database sessions.

        Yields:
            neo4j.Session: Database session that will be automatically closed

        Example:
            >>> with analyzer.db_connection() as session:
            ...     session.run("MATCH (n) RETURN n")
        """
        conn = Neo4jConnection(self.driver)
        with conn as session:
            yield session

    def analyze_folder(self, folder_path: str, clear_existing: bool = False) -> None:
        """Analyze all supported files in a folder and add to knowledge graph.

        Walks through a directory tree and processes supported files to extract code
        elements and their relationships. Currently supports Python files.

        Args:
            folder_path (str): Path to the folder containing source code
            clear_existing (bool, optional): Whether to clear existing graph data. Defaults to False.

        Example:
            >>> analyzer = BulkCodeAnalyzer(uri, auth)
            >>> # Clear graph and analyze project
            >>> analyzer.analyze_folder("my_project", clear_existing=True)
            >>> # Add more files without clearing
            >>> analyzer.analyze_folder("another_project")
        """
        # Clear existing graph if requested
        if clear_existing:
            with self.db_connection() as session:
                session.run("MATCH (n) DETACH DELETE n")
                print("Cleared existing graph")

        # Supported file extensions
        supported_extensions = {'.py'}  # Phase 1: Python files only

        # Find all supported files
        all_files = []
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                file_ext = os.path.splitext(file)[1]
                if file_ext in supported_extensions:
                    all_files.append(os.path.join(root, file))

        print(f"Found {len(all_files)} supported files")

        # Process each file, sharing ONE session for the whole batch so we
        # don't churn connections per file. Errors in one file are logged
        # and do not abort the rest of the batch (best-effort bulk load).
        with self.db_connection() as session:
            for file_path in all_files:
                print(f"Analyzing: {file_path}")
                try:
                    if file_path.endswith('.py'):
                        self._analyze_file(file_path, session)
                    else:
                        # Unreachable while supported_extensions == {'.py'};
                        # kept as the dispatch point for later phases.
                        self._analyze_non_python_file(file_path, session)
                except Exception as e:
                    print(f"Error analyzing {file_path}: {e}")

    def _analyze_file(self, file_path: str, session) -> None:
        """Parse and analyze a Python source file.

        Uses tree-sitter to parse the file and extract code elements.

        Args:
            file_path (str): Path to the Python file to analyze
            session: Neo4j database session

        Example:
            >>> with analyzer.db_connection() as session:
            ...     analyzer._analyze_file("my_code.py", session)
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            code = f.read()

        tree = self.parser.parse(bytes(code, 'utf8'))
        self._extract_code_elements(tree.root_node, session, file_path)

    def _extract_code_elements(self, node, session, file_path: str, current_class: Optional[str] = None) -> None:
        """Recursively extract code elements from an AST node.

        Processes class definitions, function definitions, and their relationships.
        Builds graph nodes and relationships for code structure analysis.

        Args:
            node: tree-sitter AST node
            session: Neo4j database session
            file_path (str): Source file path
            current_class (str, optional): Name of current class being processed

        Example:
            >>> tree = parser.parse(bytes(code, 'utf8'))
            >>> with analyzer.db_connection() as session:
            ...     analyzer._extract_code_elements(tree.root_node, session, "code.py")
        """
        if node.type == 'class_definition':
            class_name = node.child_by_field_name("name").text.decode('utf8')
            class_code = node.text.decode('utf8', errors='ignore')
            embedding = self._get_embedding(class_code) if self.bedrock_client else None

            session.run(
                "MERGE (c:Class {name: $name}) SET c.file = $file, c.embedding = $embedding",
                name=class_name, file=file_path, embedding=embedding
            )

            # Record inheritance edges for each named base class.
            superclasses = node.child_by_field_name("superclasses")
            if superclasses:
                for child in superclasses.children:
                    if child.type == 'identifier':
                        parent = child.text.decode('utf8')
                        session.run("MERGE (parent:Class {name: $parent})", parent=parent)
                        session.run(
                            "MATCH (parent:Class {name: $parent}), (child:Class {name: $child}) "
                            "MERGE (child)-[:INHERITS_FROM]->(parent)",
                            parent=parent, child=class_name
                        )

            # Recurse into the class body so methods attach to this class.
            for child in node.children:
                self._extract_code_elements(child, session, file_path, class_name)

        elif node.type == 'function_definition':
            func_name = node.child_by_field_name("name").text.decode('utf8')
            func_code = node.text.decode('utf8', errors='ignore')

            # Collect simple positional parameter names (identifiers only;
            # typed/default/starred parameters are not captured here).
            params_node = node.child_by_field_name("parameters")
            params = []
            if params_node:
                for child in params_node.children:
                    if child.type == 'identifier':
                        params.append(child.text.decode('utf8'))

            # MD5 is used purely as a content fingerprint, not for security.
            code_hash = hashlib.md5(func_code.encode()).hexdigest()

            # Generate embedding for function code
            embedding = self._get_embedding(func_code) if self.bedrock_client else None

            session.run(
                "MERGE (f:Function {name: $name, file: $file}) "
                "SET f.code = $code, f.code_hash = $hash, f.parameters = $params, f.line_start = $start, f.line_end = $end, f.embedding = $embedding",
                name=func_name, file=file_path, code=func_code, hash=code_hash, params=params,
                start=node.start_point[0], end=node.end_point[0], embedding=embedding
            )

            if current_class:
                session.run(
                    "MATCH (c:Class {name: $class_name}), (f:Function {name: $func_name, file: $file}) "
                    "MERGE (c)-[:CONTAINS]->(f)",
                    class_name=current_class, func_name=func_name, file=file_path
                )

            # Extract function calls
            self._extract_function_calls(node, session, func_name, file_path)

        else:
            for child in node.children:
                self._extract_code_elements(child, session, file_path, current_class)

    def _analyze_non_python_file(self, file_path: str, session) -> None:
        """Analyze non-Python files with basic content indexing.

        Creates File nodes in the graph with basic metadata and content preview.
        Optionally generates embeddings for semantic analysis.

        Args:
            file_path (str): Path to the file to analyze
            session: Neo4j database session

        Note:
            - Stores only first 5000 characters of content
            - Files that can't be read still get nodes with error info

        Example:
            >>> with analyzer.db_connection() as session:
            ...     analyzer._analyze_non_python_file("README.md", session)
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Create a File node for non-Python files
            # Only use first 5000 chars to avoid overloading db
            content_preview = content[:5000]
            embedding = self._get_embedding(content_preview) if self.bedrock_client else None

            # Fix: reuse the caller-provided session (consistent with
            # _analyze_file) instead of opening a nested db_connection()
            # that shadowed the `session` parameter.
            session.run(
                "MERGE (f:File {path: $path}) SET f.content = $content, f.size = $size, f.type = $type, f.embedding = $embedding",
                path=file_path,
                content=content_preview,
                size=len(content),
                type=os.path.splitext(file_path)[1],
                embedding=embedding
            )

        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            # Create File node without content but with error info
            session.run(
                "MERGE (f:File {path: $path}) SET f.error = $error, f.type = $type",
                path=file_path,
                error=str(e),
                type=os.path.splitext(file_path)[1]
            )

    def _extract_function_calls(self, func_node, session, caller_name, file_path):
        """Extract function calls from a function body.

        Walks the function's subtree, collecting names of directly-called
        functions (``foo()``) and the attribute part of method calls
        (``obj.method()`` -> ``method``), then records a CALLS edge from
        the caller to each callee.

        Args:
            func_node: tree-sitter node for the function definition
            session: Neo4j database session
            caller_name: Name of the calling function
            file_path: Source file of the caller
        """
        def find_calls(node):
            calls = []
            if node.type == 'call':
                func_expr = node.child_by_field_name('function')
                if func_expr and func_expr.type == 'identifier':
                    called_func = func_expr.text.decode('utf8')
                    calls.append(called_func)
                elif func_expr and func_expr.type == 'attribute':
                    # Handle method calls like obj.method()
                    attr = func_expr.child_by_field_name('attribute')
                    if attr:
                        called_func = attr.text.decode('utf8')
                        calls.append(called_func)

            for child in node.children:
                calls.extend(find_calls(child))
            return calls

        called_functions = find_calls(func_node)

        for called_func in called_functions:
            # Create CALLS relationship. NOTE: the callee is matched by
            # name only (no file), so same-named functions in different
            # files share one node.
            session.run(
                "MATCH (caller:Function {name: $caller, file: $file}) "
                "MERGE (called:Function {name: $called}) "
                "MERGE (caller)-[:CALLS]->(called)",
                caller=caller_name, called=called_func, file=file_path
            )

    def _get_embedding(self, text: str) -> Optional[list]:
        """Generate embedding using AWS Bedrock Titan model.

        Uses AWS Bedrock service to create semantic embeddings for text content.
        Useful for semantic similarity search and content analysis.

        Args:
            text (str): Text content to generate embedding for

        Returns:
            Optional[list]: Embedding vector if successful, None if failed
                (errors are logged, never raised, so graph building can
                continue without embeddings)

        Example:
            >>> code = "def hello(): print('world')"
            >>> embedding = analyzer._get_embedding(code)
            >>> if embedding:
            ...     print(f"Got {len(embedding)}-dimensional embedding")
        """
        try:
            if not self.bedrock_client:
                return None

            response = self.bedrock_client.invoke_model(
                modelId=self.embd_id,
                body=json.dumps({"inputText": text})
            )
            embedding = json.loads(response['body'].read())['embedding']
            return embedding
        except Exception as e:
            print(f"Error generating embedding: {e}")
            return None


# analyzer = BulkCodeAnalyzer("neo4j://127.0.0.1:7687", ("neo4j", "<password>"))  # NOTE(review): never commit real credentials
# analyzer.analyze_folder("source_code", clear_existing=True)
Loading