Commits
7c17189
Created files for the data science persona
jonahjung22 Jul 17, 2025
2e85525
Rebuilt the framework, implementing an agent and autogluon tools
jonahjung22 Jul 18, 2025
e04033d
enhancing ml modeling capabilities
jonahjung22 Jul 22, 2025
8445498
modified toml
jonahjung22 Jul 22, 2025
00d44e4
improved autogluon tool data reading capabilities; agent greeting ha…
jonahjung22 Jul 22, 2025
bdb5966
Changed agent features, new test notebook, and autogluon data handling
jonahjung22 Jul 23, 2025
a7aa9c7
updated toml
jonahjung22 Jul 23, 2025
1133b34
Merge branch 'main' into pocketflow-ds
jonahjung22 Jul 23, 2025
97d63fc
Updated README file
jonahjung22 Jul 24, 2025
a484210
added greetings
jonahjung22 Jul 24, 2025
ae865d4
file added to wrong branch
jonahjung22 Jul 24, 2025
238d4da
modified file reading capabilities and data ingestion
jonahjung22 Jul 24, 2025
732553e
refined test files and persona main code
jonahjung22 Jul 28, 2025
a479d48
new dataset recommendation tool feature
jonahjung22 Jul 28, 2025
060bc1c
added test files
jonahjung22 Jul 29, 2025
e0de226
enhanced autogluon model training capabilities and improved dataset_rec…
jonahjung22 Jul 29, 2025
a8fd8d9
separated nodes and agent into separate files after improvements to t…
jonahjung22 Jul 29, 2025
c420dbd
removing lines
jonahjung22 Jul 29, 2025
4319942
enhanced features and removed unnecessary code
jonahjung22 Jul 29, 2025
4e25b7a
improved strategy implementation
jonahjung22 Jul 29, 2025
0c96efc
modified toml and test case
jonahjung22 Jul 29, 2025
466549c
minor changes for better prompting with train_ml decision
jonahjung22 Jul 29, 2025
90bc37e
adding PR fixes and code logic, calling the llm for domain type, and …
jonahjung22 Jul 30, 2025
8bf6646
updated README
jonahjung22 Jul 30, 2025
a4f9b86
autogluon tool domain extraction improvement
jonahjung22 Aug 1, 2025
2129aad
optimizing code for review
jonahjung22 Aug 2, 2025
a0909bb
fixing unit test dependency failure
jonahjung22 Aug 2, 2025
7185762
dependency change
jonahjung22 Aug 2, 2025
9d12ec1
Dependency fix
jonahjung22 Aug 2, 2025
ad17651
Dependency fix
jonahjung22 Aug 4, 2025
5d32d39
Dependency fix
jonahjung22 Aug 4, 2025
c828dff
Dependency fix
jonahjung22 Aug 4, 2025
7c15a81
dependency fix
jonahjung22 Aug 4, 2025
a9b2d16
removing unrelated data science persona files
jonahjung22 Aug 4, 2025
9176ab8
removed unnecessary comment, fixed logic of the autogluon and data re…
jonahjung22 Aug 7, 2025
412 changes: 412 additions & 0 deletions jupyter_ai_personas/data_science_persona/README.md

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions jupyter_ai_personas/data_science_persona/__init__.py
@@ -0,0 +1,11 @@
# Import the main persona
from .persona import DataSciencePersona

# Import PocketFlow classes for convenience
from .pocketflow import Node, Flow, BaseNode

# Import tools
from .file_reader_tool import NotebookReaderTool
from .autogluon_tool import AutoGluonTool

__all__ = ["DataSciencePersona", "Node", "Flow", "BaseNode", "NotebookReaderTool", "AutoGluonTool"]
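A minimal import sketch for orientation (hypothetical usage, not part of the diff; it exercises only names exported in __all__ above and the zero-argument constructor defined in file_reader_tool.py below):

from jupyter_ai_personas.data_science_persona import NotebookReaderTool

# The toolkit registers itself under the name "notebook_reader",
# exposing extract_rag_context as its single tool.
reader = NotebookReaderTool()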
497 changes: 497 additions & 0 deletions jupyter_ai_personas/data_science_persona/agent.py

Large diffs are not rendered by default.

614 changes: 614 additions & 0 deletions jupyter_ai_personas/data_science_persona/autogluon_tool.py

Large diffs are not rendered by default.

170 changes: 170 additions & 0 deletions jupyter_ai_personas/data_science_persona/file_reader_tool.py
@@ -0,0 +1,170 @@
import json
import os
from typing import Dict, Any, List
from agno.tools import Toolkit


class NotebookReaderTool(Toolkit):
"""Tool for reading and extracting complete content from Jupyter notebooks."""

def __init__(self):
super().__init__(name="notebook_reader")
self.register(self.extract_rag_context)

def extract_rag_context(self, notebook_path: str) -> str:
"""
Extract complete content from a Jupyter notebook for RAG context.

Args:
notebook_path: Path to the .ipynb notebook file

Returns:
str: Formatted string containing all notebook content including cells,
outputs, markdown, and metadata
"""
try:
if not os.path.exists(notebook_path):
return f"Error: Notebook file not found at {notebook_path}"

if not notebook_path.endswith('.ipynb'):
return f"Error: File must be a .ipynb notebook file, got {notebook_path}"

with open(notebook_path, 'r', encoding='utf-8') as f:
notebook = json.load(f)

# Extract notebook metadata
context = f"=== NOTEBOOK ANALYSIS ===\n"
context += f"File: {notebook_path}\n"
context += f"Kernel: {notebook.get('metadata', {}).get('kernelspec', {}).get('display_name', 'Unknown')}\n"
context += f"Language: {notebook.get('metadata', {}).get('kernelspec', {}).get('language', 'Unknown')}\n\n"

# Extract cells content
cells = notebook.get('cells', [])
context += f"=== NOTEBOOK CONTENT ({len(cells)} cells) ===\n\n"

for i, cell in enumerate(cells, 1):
cell_type = cell.get('cell_type', 'unknown')
context += f"--- Cell {i} ({cell_type.upper()}) ---\n"

# Get cell source
source = cell.get('source', [])
if isinstance(source, list):
source_text = ''.join(source)
else:
source_text = str(source)

context += f"SOURCE:\n{source_text}\n"

# Get cell outputs for code cells
if cell_type == 'code':
outputs = cell.get('outputs', [])
if outputs:
context += f"OUTPUTS:\n"
for j, output in enumerate(outputs):
output_type = output.get('output_type', 'unknown')
context += f" Output {j+1} ({output_type}):\n"

# Handle different output types
if output_type == 'stream':
text = ''.join(output.get('text', []))
context += f" {text}\n"
elif output_type == 'execute_result' or output_type == 'display_data':
data = output.get('data', {})
for mime_type, content in data.items():
if mime_type == 'text/plain':
if isinstance(content, list):
content = ''.join(content)
context += f" {content}\n"
elif mime_type == 'text/html':
context += f" [HTML OUTPUT]\n"
elif 'image' in mime_type:
context += f" [IMAGE: {mime_type}]\n"
elif output_type == 'error':
ename = output.get('ename', 'Error')
evalue = output.get('evalue', '')
context += f" ERROR: {ename}: {evalue}\n"

context += "\n"

# Extract imports and library usage
imports = self._extract_imports(notebook)
if imports:
context += f"=== DETECTED LIBRARIES ===\n"
for imp in imports:
context += f"- {imp}\n"
context += "\n"

# Extract data science context
ds_context = self._extract_data_science_context(notebook)
if ds_context:
context += f"=== DATA SCIENCE CONTEXT ===\n{ds_context}\n"

return context

except json.JSONDecodeError:
return f"Error: Invalid JSON in notebook file {notebook_path}"
except Exception as e:
return f"Error reading notebook {notebook_path}: {str(e)}"

def _extract_imports(self, notebook: Dict[str, Any]) -> List[str]:
"""Extract import statements from notebook cells."""
imports = []
cells = notebook.get('cells', [])

for cell in cells:
if cell.get('cell_type') == 'code':
source = cell.get('source', [])
if isinstance(source, list):
source_text = ''.join(source)
else:
source_text = str(source)

# Look for import statements
lines = source_text.split('\n')
for line in lines:
line = line.strip()
if line.startswith('import ') or line.startswith('from '):
imports.append(line)

return list(set(imports)) # Remove duplicates

def _extract_data_science_context(self, notebook: Dict[str, Any]) -> str:
"""Extract data science context from notebook content."""
context_items = []
cells = notebook.get('cells', [])

# Common data science patterns
ds_patterns = {
'pandas': ['pd.read_', 'DataFrame', '.head()', '.describe()', '.info()'],
'numpy': ['np.array', 'np.mean', 'np.std', 'numpy'],
'matplotlib': ['plt.', 'matplotlib', '.plot()', '.show()'],
'seaborn': ['sns.', 'seaborn'],
'sklearn': ['sklearn', 'fit()', 'predict()', 'score()'],
'analysis': ['correlation', 'regression', 'classification', 'clustering'],
'data_ops': ['merge', 'join', 'groupby', 'pivot', 'melt']
}

detected = {category: [] for category in ds_patterns.keys()}

for cell in cells:
if cell.get('cell_type') == 'code':
source = cell.get('source', [])
if isinstance(source, list):
source_text = ''.join(source)
else:
source_text = str(source)

for category, patterns in ds_patterns.items():
for pattern in patterns:
if pattern.lower() in source_text.lower():
detected[category].append(pattern)

# Build context description
active_categories = {k: list(set(v)) for k, v in detected.items() if v}

if active_categories:
context_items.append("Analysis stage indicators:")
for category, patterns in active_categories.items():
context_items.append(f" {category}: {', '.join(patterns[:3])}") # Limit to 3 examples

return '\n'.join(context_items) if context_items else ""
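A short end-to-end usage sketch (not part of the diff; the notebook path is hypothetical, and the section markers match the strings built in extract_rag_context above):

reader = NotebookReaderTool()
context = reader.extract_rag_context("examples/eda.ipynb")  # hypothetical path

# On success, the returned string contains, in order:
#   === NOTEBOOK ANALYSIS ===           file, kernel, and language metadata
#   === NOTEBOOK CONTENT (N cells) ===  per-cell SOURCE, plus OUTPUTS for code cells
#   === DETECTED LIBRARIES ===          de-duplicated import lines
#   === DATA SCIENCE CONTEXT ===        matched patterns (pandas, sklearn, ...)
# On any failure it returns an "Error: ..." string rather than raising.
print(context)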