diff --git a/jupyter-ai-personas b/jupyter-ai-personas deleted file mode 160000 index 4af5de3..0000000 --- a/jupyter-ai-personas +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 4af5de32685badcea70cb30f8abfde93bf2c2ed6 diff --git a/jupyter_ai_personas/context_retrieval_persona/README.md b/jupyter_ai_personas/context_retrieval_persona/README.md new file mode 100644 index 0000000..1382e95 --- /dev/null +++ b/jupyter_ai_personas/context_retrieval_persona/README.md @@ -0,0 +1,214 @@ +# Context Retrieval Persona + +## Overview + +The Context Retrieval Persona analyzes your data science notebooks and finds relevant resources from the Python Data Science Handbook using RAG (Retrieval-Augmented Generation). It employs a three-agent system to provide comprehensive analysis and actionable recommendations. + +## Features + +- **Intelligent Notebook Analysis**: Extracts libraries, analysis stage, domain, and objectives from your notebooks +- **Full Notebook RAG Search**: Returns complete relevant notebooks instead of fragments for comprehensive context +- **Handbook-Only Search**: Avoids redundant searching by focusing on external handbook content only +- **Multi-Agent Coordination**: NotebookAnalyzer, KnowledgeSearcher, and MarkdownGenerator working together +- **Comprehensive Markdown Reports**: Detailed reports with code examples, explanations, and next steps +- **Optimized Search**: 1-2 complete notebooks per query with clean terminal logging +- **Automatic Report Generation**: Creates `repo_context.md` with comprehensive analysis + +## Architecture + +### Three-Agent System + +1. **NotebookAnalyzer**: Extracts structured context from your notebook + + - Uses `extract_rag_context` tool to read notebook content + - Identifies libraries (pandas, numpy, sklearn, matplotlib, etc.) + - Determines analysis stage (data_loading, eda, preprocessing, modeling, evaluation, visualization) + - Outputs structured JSON with path, libraries, stage, domain, and objectives + +2. **KnowledgeSearcher**: Performs targeted handbook-only RAG searches + + - Generates 4-5 targeted search queries based on notebook analysis + - Uses `search_handbook_only` to find relevant complete notebooks + - Each search returns 1-2 most relevant notebooks (not fragments) + - Provides comprehensive handbook content to MarkdownGenerator + +3. 
**MarkdownGenerator**: Creates detailed markdown reports + - Synthesizes notebook analysis with RAG search results + - Includes substantial content from retrieved handbooks + - Creates cross-references between user's work and handbook examples + - Saves comprehensive reports as `repo_context.md` + +## Core Components + +### Context Retrieval Persona (`persona.py`) + +- Main persona class orchestrating the three-agent system +- Handles Jupyter AI integration and message processing +- Initializes AWS Bedrock models and agent coordination +- Manages greeting detection and team workflow + +### RAG Tool (`rag_tool.py`) + +Core RAG system with two main classes: + +- **RAG**: Loads handbook content into ChromaDB vectorstore using HuggingFace embeddings +- **RAGTool**: Agno toolkit providing `search_handbook_only()` function +- Returns complete notebooks (1-2 per search) instead of fragments +- Clean terminal logging showing retrieved notebook titles and stats + +### Notebook Reader Tool (`file_reader_tool.py`) + +- `NotebookReaderTool`: Provides `extract_rag_context` function +- Reads complete notebook content and metadata +- Extracts context for the NotebookAnalyzer agent + +## Installation & Setup + +### Prerequisites + +Install the context retrieval persona with its dependencies: + +```bash +pip install -e ".[context_retriever]" +``` + +This installs: + +- `agno` - Multi-agent framework +- `boto3` - AWS Bedrock integration +- `langchain` & `langchain-core` & `langchain-community` - RAG framework +- `sentence-transformers` - Embedding models +- `chromadb` - Vector database +- `nbformat` - Jupyter notebook reading + +### Setup Python Data Science Handbook + +```bash +# Clone the handbook repository +cd jupyter_ai_personas/context_retrieval_persona/ +git clone https://github.com/jakevdp/PythonDataScienceHandbook.git +``` + +### AWS Configuration + +Configure AWS credentials for Bedrock access: + +```bash +aws configure +# or set environment variables: +export AWS_ACCESS_KEY_ID=your_key +export AWS_SECRET_ACCESS_KEY=your_secret +export AWS_DEFAULT_REGION=us-east-1 +``` + +## Usage + +### Basic Usage + +In Jupyter AI chat, use the @ mention to activate the persona: + +``` +@ContextRetrievalPersona notebook: /path/to/your/notebook.ipynb +Analyze my machine learning workflow and find relevant handbook resources +``` + +### Workflow Example + +1. **User Request**: Provides notebook path and description +2. **NotebookAnalyzer**: Reads and analyzes notebook content +3. **KnowledgeSearcher**: Performs 4-5 targeted searches in handbook +4. **MarkdownGenerator**: Creates comprehensive `repo_context.md` report + +### Terminal Output + +During processing, you'll see clean RAG search logs: + +``` +🔍 RAG SEARCH: 'sklearn RandomForest classification' +📚 Found 2 relevant notebooks: + 1. 05.08-Random-Forests.ipynb (15 cells, 12450 chars) + 2. 
05.03-Hyperparameters-and-Model-Validation.ipynb (22 cells, 18920 chars) +``` + +### Generated Report Structure + +The `repo_context.md` file includes: + +- **Executive Summary**: Overview of findings and connections +- **Current Notebook Analysis**: Libraries, stage, domain, objectives from your notebook +- **Comprehensive Handbook Resources**: Full code examples and explanations from retrieved notebooks +- **Detailed Code Examples**: Complete implementations from handbook +- **Cross-References and Learning Paths**: Connections between your work and handbook content +- **Actionable Implementation Steps**: Specific next steps based on analysis + +## Technical Details + +### RAG Implementation + +- **Embedding Model**: `sentence-transformers/all-MiniLM-L6-v2` +- **Vector Store**: ChromaDB with persistent storage +- **Search Strategy**: Similarity search returning complete notebooks (not fragments) +- **Results per Search**: 2 most relevant complete notebooks +- **Cell-Based Chunking**: Uses notebook cells as natural document boundaries + +### Optimizations + +- **Handbook-Only Search**: Avoids redundant notebook content in RAG results +- **Complete Notebook Retrieval**: Returns full notebooks instead of fragments for better context +- **One-Time Loading**: Vector store loaded once per session with handbook_loaded flag +- **Clean Logging**: Minimal terminal output showing only essential search information +- **JSON Validation Fix**: Uses `capture_validation_error=None` to suppress nbformat warnings + +## File Structure + +``` +context_retrieval_persona/ +├── README.md # This documentation +├── persona.py # Main persona class with three-agent system +├── rag_tool.py # RAG and RAGTool classes for handbook search +├── file_reader_tool.py # NotebookReaderTool for content extraction +├── __init__.py # Package initialization +├── repo_context.md # Generated markdown reports +├── PythonDataScienceHandbook/ # Cloned handbook repository +│ └── notebooks/ # 100+ handbook notebooks +└── vector_stores/ # ChromaDB vector storage + └── rag/ # Renamed from simple_rag + ├── chroma.sqlite3 + └── [vector files] +``` + +## Troubleshooting + +### Common Issues + +1. **Missing Dependencies**: Install all required packages + + ```bash + pip install -e ".[context_retriever]" + ``` + +2. **Handbook Not Found**: Clone the handbook repository + + ```bash + cd jupyter_ai_personas/context_retrieval_persona/ + git clone https://github.com/jakevdp/PythonDataScienceHandbook.git + ``` + +3. **AWS/Bedrock Issues**: Configure AWS credentials + + ```bash + aws configure + ``` + +4. **JSON Validation Warnings**: These are now suppressed with `capture_validation_error=None` + +5. **Vector Store Loading**: First run builds the vector store (5-10 minutes), subsequent runs are fast + +## Contributing + +To extend the system: + +1. **Enhance RAG Search**: Modify `RAGTool` class in `rag_tool.py` +2. **Improve Context Extraction**: Update `NotebookReaderTool` in `file_reader_tool.py` +3. **Refine Agent Instructions**: Update agent prompts in `persona.py` +4. 
**Add New Analysis Capabilities**: Extend the three-agent system workflow diff --git a/jupyter_ai_personas/context_retrieval_persona/__init__.py b/jupyter_ai_personas/context_retrieval_persona/__init__.py new file mode 100644 index 0000000..879b745 --- /dev/null +++ b/jupyter_ai_personas/context_retrieval_persona/__init__.py @@ -0,0 +1 @@ +"""Context Retrieval Persona package for Jupyter AI.""" \ No newline at end of file diff --git a/jupyter_ai_personas/context_retrieval_persona/file_reader_tool.py b/jupyter_ai_personas/context_retrieval_persona/file_reader_tool.py new file mode 100644 index 0000000..7f81c36 --- /dev/null +++ b/jupyter_ai_personas/context_retrieval_persona/file_reader_tool.py @@ -0,0 +1,160 @@ +import json +import os +from typing import Dict, Any, List, Optional +from agno.tools import Toolkit + +class NotebookReaderTool(Toolkit): + """Tool for reading and extracting complete content from Jupyter notebooks.""" + + def __init__(self): + super().__init__(name="notebook_reader") + self.register(self.extract_rag_context) + + def extract_rag_context(self, notebook_path: str) -> str: + """ + Extract complete content from a Jupyter notebook for RAG context. + + Args: + notebook_path: Path to the .ipynb notebook file + + Returns: + str: Formatted string containing all notebook content including cells, + outputs, markdown, and metadata + """ + try: + if not os.path.exists(notebook_path): + return f"Error: Notebook file not found at {notebook_path}" + + if not notebook_path.endswith('.ipynb'): + return f"Error: File must be a .ipynb notebook file, got {notebook_path}" + + with open(notebook_path, 'r', encoding='utf-8') as f: + notebook = json.load(f) + + # Extract notebook metadata and cells + context = f"=== NOTEBOOK ANALYSIS ===\n" + context += f"File: {notebook_path}\n" + context += f"Kernel: {notebook.get('metadata', {}).get('kernelspec', {}).get('display_name', 'Unknown')}\n" + context += f"Language: {notebook.get('metadata', {}).get('kernelspec', {}).get('language', 'Unknown')}\n\n" + cells = notebook.get('cells', []) + context += f"=== NOTEBOOK CONTENT ({len(cells)} cells) ===\n\n" + + for i, cell in enumerate(cells, 1): + cell_type = cell.get('cell_type', 'unknown') + context += f"--- Cell {i} ({cell_type.upper()}) ---\n" + source = cell.get('source', []) + if isinstance(source, list): + source_text = ''.join(source) + else: + source_text = str(source) + + context += f"SOURCE:\n{source_text}\n" + + # Get cell outputs for code cells + if cell_type == 'code': + outputs = cell.get('outputs', []) + if outputs: + context += f"OUTPUTS:\n" + for j, output in enumerate(outputs): + output_type = output.get('output_type', 'unknown') + context += f" Output {j+1} ({output_type}):\n" + if output_type == 'stream': + text = ''.join(output.get('text', [])) + context += f" {text}\n" + elif output_type == 'execute_result' or output_type == 'display_data': + data = output.get('data', {}) + for mime_type, content in data.items(): + if mime_type == 'text/plain': + if isinstance(content, list): + content = ''.join(content) + context += f" {content}\n" + elif mime_type == 'text/html': + context += f" [HTML OUTPUT]\n" + elif 'image' in mime_type: + context += f" [IMAGE: {mime_type}]\n" + elif output_type == 'error': + ename = output.get('ename', 'Error') + evalue = output.get('evalue', '') + context += f" ERROR: {ename}: {evalue}\n" + + context += "\n" + + # Extract imports and library usage + imports = self._extract_imports(notebook) + if imports: + context += f"=== DETECTED LIBRARIES ===\n" + for imp 
in imports: + context += f"- {imp}\n" + context += "\n" + + # Extract data science context + ds_context = self._extract_data_science_context(notebook) + if ds_context: + context += f"=== DATA SCIENCE CONTEXT ===\n{ds_context}\n" + + return context + + except json.JSONDecodeError: + return f"Error: Invalid JSON in notebook file {notebook_path}" + except Exception as e: + return f"Error reading notebook {notebook_path}: {str(e)}" + + def _extract_imports(self, notebook: Dict[str, Any]) -> List[str]: + """Extract import statements from notebook cells.""" + imports = [] + cells = notebook.get('cells', []) + + for cell in cells: + if cell.get('cell_type') == 'code': + source = cell.get('source', []) + if isinstance(source, list): + source_text = ''.join(source) + else: + source_text = str(source) + + lines = source_text.split('\n') + for line in lines: + line = line.strip() + if line.startswith('import ') or line.startswith('from '): + imports.append(line) + + return list(set(imports)) + + def _extract_data_science_context(self, notebook: Dict[str, Any]) -> str: + """Extract data science context from notebook content.""" + context_items = [] + cells = notebook.get('cells', []) + + ds_patterns = { + 'pandas': ['pd.read_', 'DataFrame', '.head()', '.describe()', '.info()'], + 'numpy': ['np.array', 'np.mean', 'np.std', 'numpy'], + 'matplotlib': ['plt.', 'matplotlib', '.plot()', '.show()'], + 'seaborn': ['sns.', 'seaborn'], + 'sklearn': ['sklearn', 'fit()', 'predict()', 'score()'], + 'analysis': ['correlation', 'regression', 'classification', 'clustering'], + 'data_ops': ['merge', 'join', 'groupby', 'pivot', 'melt'] + } + + detected = {category: [] for category in ds_patterns.keys()} + + for cell in cells: + if cell.get('cell_type') == 'code': + source = cell.get('source', []) + if isinstance(source, list): + source_text = ''.join(source) + else: + source_text = str(source) + + for category, patterns in ds_patterns.items(): + for pattern in patterns: + if pattern.lower() in source_text.lower(): + detected[category].append(pattern) + + active_categories = {k: list(set(v)) for k, v in detected.items() if v} + + if active_categories: + context_items.append("Analysis stage indicators:") + for category, patterns in active_categories.items(): + context_items.append(f" {category}: {', '.join(patterns[:3])}") + + return '\n'.join(context_items) if context_items else "" \ No newline at end of file diff --git a/jupyter_ai_personas/context_retrieval_persona/persona.py b/jupyter_ai_personas/context_retrieval_persona/persona.py new file mode 100644 index 0000000..847ed20 --- /dev/null +++ b/jupyter_ai_personas/context_retrieval_persona/persona.py @@ -0,0 +1,251 @@ +from jupyter_ai.personas.base_persona import BasePersona, PersonaDefaults +from jupyterlab_chat.models import Message +from jupyter_ai.history import YChatHistory +from agno.agent import Agent +from agno.models.aws import AwsBedrock +from agno.team.team import Team +from agno.tools.file import FileTools +import boto3 +from langchain_core.messages import HumanMessage +from .file_reader_tool import NotebookReaderTool +from .rag_tool import create_rag_tools + +session = boto3.Session() + +class ContextRetrievalPersona(BasePersona): + """ + Context Retrieval Specialist that analyzes prompts and notebook content + to find relevant documentation and resources using RAG. 
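+    Coordinates three member agents (NotebookAnalyzer, KnowledgeSearcher, MarkdownGenerator) as an agno Team in coordinate mode and saves the resulting report as repo_context.md.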
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @property + def defaults(self): + return PersonaDefaults( + name="ContextRetrievalPersona", + avatar_path="/api/ai/static/jupyternaut.svg", + description="Context retrieval specialist for data science projects. Analyzes prompts and notebooks to find relevant resources using RAG.", + system_prompt="""I am a context retrieval specialist team that analyzes your data science work and finds relevant resources from the Python Data Science Handbook using RAG search. + + My team consists of: + - NotebookAnalyzer: Analyzes your current notebook content and context + - KnowledgeSearcher: Uses RAG to find relevant handbook examples and documentation + - MarkdownGenerator: Creates comprehensive reports with actionable recommendations + + I can help with: + - Finding relevant code examples for your current analysis stage + - Semantic search through the Python Data Science Handbook + - Context-aware recommendations based on your notebook content + - Best practices and patterns for data science workflows + + To use me: + - Provide your prompt or objective + - Include: notebook: /path/to/notebook.ipynb + - I'll create a comprehensive markdown report with relevant handbook content""", + ) + + def get_knowledge_tools(self): + """Get knowledge search tools - RAG if available, FileTools as fallback.""" + try: + return [create_rag_tools()] + except Exception: + return [FileTools()] + + def initialize_context_retrieval_team(self, system_prompt: str): + """Initialize the 3-agent context retrieval team.""" + model_id = self.config_manager.lm_provider_params["model_id"] + notebook_tools = [NotebookReaderTool()] + knowledge_tools = self.get_knowledge_tools() + + notebook_analyzer = Agent( + name="NotebookAnalyzer", + role="Notebook analysis specialist that extracts context and content for search", + model=AwsBedrock(id=model_id, session=session), + instructions=[ + "Use extract_rag_context tool to read notebook content - do NOT generate new code", + "Look for notebook path in user prompt (extract the actual file path)", + "If no path provided, use: /Users/jujonahj/jupyter-ai-personas/jupyter_ai_personas/data_science_persona/test_context_retrieval.ipynb", + "Extract notebook context including:", + "- Libraries being used (pandas, numpy, sklearn, matplotlib, etc.)", + "- Analysis stage: data_loading, eda, preprocessing, modeling, evaluation, visualization", + "- Data characteristics and problem domain", + "- Current objectives and next steps", + "CRITICAL: You MUST end your response with this EXACT format for KnowledgeSearcher:", + "", + "```json", + "NOTEBOOK_ANALYSIS: {", + " \"path\": \"/extracted/path/from/notebook.ipynb\",", + " \"name\": \"extracted_filename.ipynb\",", + " \"libraries\": [\"list\", \"of\", \"libraries\"],", + " \"stage\": \"analysis_stage_identified\",", + " \"domain\": \"problem_domain\",", + " \"objectives\": \"current_objectives\",", + " \"content_summary\": \"brief summary of notebook content\"", + "}", + "```", + "", + "This provides context for handbook searches without complex JSON nesting." + ], + tools=notebook_tools, + markdown=True, + show_tool_calls=True + ) + + knowledge_searcher = Agent( + name="KnowledgeSearcher", + role="Repository search specialist that finds relevant handbook content", + model=AwsBedrock(id=model_id, session=session), + instructions=[ + "1. Look for NOTEBOOK_ANALYSIS JSON in NotebookAnalyzer's response", + "2. 
Extract: libraries, stage, domain, objectives, content_summary", + "3. Generate 4-5 targeted searches based on this analysis to find relevant handbook content:", + " - Primary objective/task searches (e.g., 'classification', 'clustering', 'dimensionality reduction')", + " - Library-specific searches (e.g., 'sklearn RandomForest', 'pandas preprocessing', 'matplotlib visualization')", + " - Analysis stage searches (e.g., 'model evaluation', 'feature selection', 'data exploration')", + " - Problem domain/data type searches (e.g., 'time series', 'text analysis', 'image processing')", + "4. Use ONLY search_handbook_only(query='terms') for each search", + "5. CRITICAL: Provide ALL search results to MarkdownGenerator with key content from each notebook:"," - Complete list of all retrieved notebooks from all searches", + " - FULL code examples, algorithms, and implementations from each notebook", + " - Detailed explanations, theory, and methodology from handbook cells", + " - Best practices, tips, and advanced techniques mentioned", + " - Specific connections between each handbook topic and user's notebook analysis", + "6. Ensure MarkdownGenerator receives comprehensive handbook content to work with", + "IMPORTANT: Only search handbook - notebook content is already analyzed by NotebookAnalyzer!" + ], + tools=knowledge_tools, + markdown=True, + show_tool_calls=True + ) + + markdown_generator = Agent( + name="MarkdownGenerator", + role="Content synthesis specialist that creates markdown reports", + model=AwsBedrock(id=model_id, session=session), + instructions=[ + "Create comprehensive markdown reports using ALL available RAG search results from KnowledgeSearcher", + "CRITICAL: Extract and include substantial content from each RAG search result - don't just summarize", + "Structure with sections:", + "- Executive Summary", + "- Current Notebook Analysis (from NotebookAnalyzer)", + "- Comprehensive Handbook Resources (include FULL relevant code from each RAG result)", + "- Detailed Code Examples and Explanations (extensive quotes from handbook notebooks)", + "- Cross-References and Learning Paths", + "- Actionable Implementation Steps", + "", + "REQUIREMENTS:", + "- Include complete code blocks from handbook results, not just snippets", + "- Quote extensive explanations and context from handbook cells", + "- Show multiple approaches/techniques for each topic from different handbook sections", + "- Create detailed cross-references between user's notebook and handbook content", + "- Provide substantial educational content that users can learn from", + "", + "IMPORTANT: Name the markdown file: 'repo_context.md'" + ], + tools=[FileTools()], + markdown=True, + show_tool_calls=True + ) + + context_team = Team( + name="context-retrieval-team", + mode="coordinate", + members=[notebook_analyzer, knowledge_searcher, markdown_generator], + model=AwsBedrock(id=model_id, session=session), + instructions=[ + f"Context Retrieval Session: {system_prompt}", + "WORKFLOW:", + "1. NotebookAnalyzer: Extract context from user prompt + notebook", + "2. KnowledgeSearcher: Search handbook for relevant content", + "3. 
MarkdownGenerator: Create comprehensive markdown report", + "Focus on providing actionable recommendations" + ], + markdown=True, + show_members_responses=True, + enable_agentic_context=True, + add_datetime_to_instructions=True, + show_tool_calls=True + ) + return context_team + + def is_greeting(self, message_text: str) -> bool: + """Check if the message is a greeting or simple conversation.""" + greetings = {"hello", "hi", "hey", "help", "who are you"} + message_lower = message_text.lower().strip() + return any(greeting in message_lower for greeting in greetings) or \ + message_lower.startswith(("good ", "what", "how are")) + + async def process_message(self, message: Message): + """Process messages using the context retrieval team.""" + print(f"🚀 CONTEXT RETRIEVAL REQUEST: {message.body}") + message_text = message.body + + # Handle greetings and simple messages without RAG + if self.is_greeting(message_text): + greeting_response = """👋 Hello! I'm your Context Retrieval Persona. + +I help analyze your data science work and find relevant resources from the Python Data Science Handbook using RAG search. + +**How to use me:** +- Ask me questions about data science concepts, techniques, or problems +- Include `notebook: /path/to/your/notebook.ipynb` to analyze your current work +- I'll search the Python Data Science Handbook and create a comprehensive report + +**I can help with:** +- Finding relevant code examples for your analysis +- Semantic search through data science documentation +- Context-aware recommendations based on your notebook +- Best practices and patterns for data science workflows + +What would you like help with today?""" + + async def response_iterator(): + yield greeting_response + + await self.stream_message(response_iterator()) + return + + provider_name = self.config_manager.lm_provider.name + model_id = self.config_manager.lm_provider_params["model_id"] + + # Get chat history + history = YChatHistory(ychat=self.ychat, k=2) + messages = await history.aget_messages() + history_text = "" + if messages: + history_text = "\nPrevious conversation:\n" + for msg in messages: + role = "User" if isinstance(msg, HumanMessage) else "Assistant" + history_text += f"{role}: {msg.content}\n" + + system_prompt = f""" + Context Retrieval Session: + Model: {model_id} + Provider: {provider_name} + User Request: {message_text} + {history_text} + + Goal: Analyze notebook context and find relevant Python Data Science Handbook content. + """ + + context_team = self.initialize_context_retrieval_team(system_prompt) + + try: + response = context_team.run( + message_text, + stream=False, + stream_intermediate_steps=True, + show_full_reasoning=True, + ) + + response_content = response.content + + except Exception as e: + print(f"❌ Team execution error: {e}") + response_content = f"Error in context retrieval: {str(e)}\n\nPlease try again or check the logs for more details." 
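+        # Wrap the team's final response (or the error text) in an async iterator and stream it back to the chat as a single chunk.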
+ + async def response_iterator(): + yield response_content + + await self.stream_message(response_iterator()) \ No newline at end of file diff --git a/jupyter_ai_personas/context_retrieval_persona/rag_tool.py b/jupyter_ai_personas/context_retrieval_persona/rag_tool.py new file mode 100644 index 0000000..b7a11bb --- /dev/null +++ b/jupyter_ai_personas/context_retrieval_persona/rag_tool.py @@ -0,0 +1,128 @@ +import os +import json +from pathlib import Path +from typing import List, Dict, Any +import logging +import nbformat +from agno.tools import Toolkit + +from langchain.schema import Document +from langchain_community.embeddings import HuggingFaceEmbeddings +from langchain_community.vectorstores import Chroma + +os.environ["TOKENIZERS_PARALLELISM"] = "false" +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class RAG: + def __init__(self): + script_dir = Path(__file__).parent.absolute() + self.handbook_path = script_dir / "PythonDataScienceHandbook" / "notebooks" + self.persist_dir = script_dir / "vector_stores" / "rag" + self.persist_dir.mkdir(parents=True, exist_ok=True) + + self.embeddings = HuggingFaceEmbeddings( + model_name="sentence-transformers/all-MiniLM-L6-v2", + model_kwargs={'device': 'cpu'} + ) + self.vectorstore = None + + def load_content(self): + """Load handbook content into vectorstore for similarity search.""" + documents = [] + + for notebook_file in self.handbook_path.glob("*.ipynb"): + with open(notebook_file, 'r', encoding='utf-8') as f: + nb = nbformat.read(f, as_version=nbformat.NO_CONVERT, capture_validation_error=None) + + for cell_idx, cell in enumerate(nb.cells): + content = cell.get('source', '').strip() + if content: + documents.append(Document( + page_content=content, + metadata={ + 'source': notebook_file.name, + 'type': 'handbook', + 'cell_idx': cell_idx + } + )) + + self.vectorstore = Chroma.from_documents( + documents=documents, + embedding=self.embeddings, + persist_directory=str(self.persist_dir) + ) + + logger.info(f"Loaded {len(documents)} handbook cells") + + def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]: + """RAG similarity search - returns entire relevant notebooks.""" + docs = self.vectorstore.similarity_search(query, k=k*3) + + # Group by notebook file and get top notebooks + notebook_scores = {} + for doc in docs: + source = doc.metadata['source'] + if source not in notebook_scores: + notebook_scores[source] = 0 + notebook_scores[source] += 1 # Simple scoring by relevance count + + # Get top notebooks + top_notebooks = sorted(notebook_scores.items(), key=lambda x: x[1], reverse=True)[:2] + + results = [] + for notebook_name, _ in top_notebooks: + # Load entire notebook + notebook_path = self.handbook_path / notebook_name + with open(notebook_path, 'r', encoding='utf-8') as f: + nb = nbformat.read(f, as_version=nbformat.NO_CONVERT, capture_validation_error=None) + + # Combine all cells into one result + full_content = [] + for cell_idx, cell in enumerate(nb.cells): + content = cell.get('source', '').strip() + if content: + full_content.append(f"# Cell {cell_idx}\n{content}") + + results.append({ + 'content': '\n\n'.join(full_content), + 'source': notebook_name, + 'type': 'full_notebook', + 'cell_count': len([c for c in nb.cells if c.get('source', '').strip()]) + }) + + return results + +class RAGTool(Toolkit): + def __init__(self): + super().__init__(name="rag") + self.rag = None + self.handbook_loaded = False + + self.register(self.search_handbook_only) + + def search_handbook_only(self, 
query: str, k: int = 5) -> str: + """RAG similarity search in handbook only.""" + if not self.handbook_loaded: + logger.info("Loading handbook (one-time initialization)") + self.rag = RAG() + self.rag.load_content() + self.handbook_loaded = True + + results = self.rag.search(query, k=k) + + # Log RAG search results (titles only) + print(f"\n🔍 RAG SEARCH: '{query}'") + print(f"📚 Found {len(results)} relevant notebooks:") + for i, result in enumerate(results): + print(f" {i+1}. {result['source']} ({result['cell_count']} cells, {len(result['content'])} chars)") + print("=" * 60) + + return json.dumps({ + "query": query, + "total_results": len(results), + "results": results + }, indent=2) + +def create_rag_tools() -> RAGTool: + return RAGTool() \ No newline at end of file diff --git a/jupyter_ai_personas/context_retrieval_persona/test_tabular.ipynb b/jupyter_ai_personas/context_retrieval_persona/test_tabular.ipynb new file mode 100644 index 0000000..a23a7b1 --- /dev/null +++ b/jupyter_ai_personas/context_retrieval_persona/test_tabular.ipynb @@ -0,0 +1,141 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Sales Data Analysis Test Notebook\n", + "\n", + "This notebook demonstrates a simple data science workflow for testing the context retrieval persona." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import mean_squared_error, r2_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create sample sales data\n", + "np.random.seed(42)\n", + "n_samples = 1000\n", + "\n", + "data = {\n", + " 'advertising_spend': np.random.uniform(1000, 50000, n_samples),\n", + " 'sales_team_size': np.random.randint(5, 50, n_samples),\n", + " 'market_size': np.random.uniform(100000, 1000000, n_samples),\n", + " 'season': np.random.choice(['Q1', 'Q2', 'Q3', 'Q4'], n_samples)\n", + "}\n", + "\n", + "# Generate revenue with some realistic relationships\n", + "data['revenue'] = (\n", + " data['advertising_spend'] * 2.5 + \n", + " data['sales_team_size'] * 1000 + \n", + " data['market_size'] * 0.1 +\n", + " np.random.normal(0, 10000, n_samples)\n", + ")\n", + "\n", + "df = pd.DataFrame(data)\n", + "print(f\"Dataset shape: {df.shape}\")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare data for modeling\n", + "# One-hot encode categorical variables\n", + "df_encoded = pd.get_dummies(df, columns=['season'], prefix='season')\n", + "\n", + "# Define features and target\n", + "feature_columns = [col for col in df_encoded.columns if col != 'revenue']\n", + "X = df_encoded[feature_columns]\n", + "y = df_encoded['revenue']\n", + "\n", + "print(f\"Features: {X.columns.tolist()}\")\n", + "print(f\"Target: revenue\")\n", + "print(f\"Feature matrix shape: {X.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Split the data\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42\n", + ")\n", + "\n", + "print(f\"Training set size: {X_train.shape[0]}\")\n", + "print(f\"Test set size: 
{X_test.shape[0]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Train a simple linear regression model\n", + "model = LinearRegression()\n", + "model.fit(X_train, y_train)\n", + "\n", + "# Make predictions\n", + "y_train_pred = model.predict(X_train)\n", + "y_test_pred = model.predict(X_test)\n", + "\n", + "# Calculate metrics\n", + "train_mse = mean_squared_error(y_train, y_train_pred)\n", + "test_mse = mean_squared_error(y_test, y_test_pred)\n", + "train_r2 = r2_score(y_train, y_train_pred)\n", + "test_r2 = r2_score(y_test, y_test_pred)\n", + "\n", + "print(\"Model Performance:\")\n", + "print(f\"Training MSE: {train_mse:,.2f}\")\n", + "print(f\"Test MSE: {test_mse:,.2f}\")\n", + "print(f\"Training R²: {train_r2:.4f}\")\n", + "print(f\"Test R²: {test_r2:.4f}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/jupyter_ai_personas/context_retrieval_persona/test_time_series.ipynb b/jupyter_ai_personas/context_retrieval_persona/test_time_series.ipynb new file mode 100644 index 0000000..39edc77 --- /dev/null +++ b/jupyter_ai_personas/context_retrieval_persona/test_time_series.ipynb @@ -0,0 +1,279 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cell-0", + "metadata": {}, + "source": [ + "# Time Series Forecasting Test Notebook\n", + "\n", + "This notebook demonstrates a time series analysis workflow for testing the context retrieval persona with temporal data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-1", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from datetime import datetime, timedelta\n", + "from sklearn.metrics import mean_absolute_error, mean_squared_error\n", + "from statsmodels.tsa.seasonal import seasonal_decompose\n", + "from statsmodels.tsa.arima.model import ARIMA\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-2", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate synthetic time series data for e-commerce sales\n", + "np.random.seed(42)\n", + "start_date = datetime(2020, 1, 1)\n", + "end_date = datetime(2023, 12, 31)\n", + "date_range = pd.date_range(start=start_date, end=end_date, freq='D')\n", + "\n", + "# Create base trend\n", + "n_days = len(date_range)\n", + "trend = np.linspace(1000, 2000, n_days)\n", + "\n", + "# Add seasonal patterns (weekly and yearly)\n", + "weekly_pattern = 200 * np.sin(2 * np.pi * np.arange(n_days) / 7)\n", + "yearly_pattern = 300 * np.sin(2 * np.pi * np.arange(n_days) / 365.25)\n", + "\n", + "# Add random noise\n", + "noise = np.random.normal(0, 100, n_days)\n", + "\n", + "# Combine all components\n", + "sales = trend + weekly_pattern + yearly_pattern + noise\n", + "\n", + "# Create DataFrame\n", + "ts_data = pd.DataFrame({\n", + " 'date': date_range,\n", + " 'daily_sales': np.maximum(sales, 0) # Ensure non-negative sales\n", + "})\n", + "\n", + "ts_data.set_index('date', inplace=True)\n", + "print(f\"Time series shape: {ts_data.shape}\")\n", + "print(f\"Date range: {ts_data.index.min()} to {ts_data.index.max()}\")\n", + "ts_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-3", + "metadata": {}, + "outputs": [], + "source": [ + "# Basic time series visualization\n", + "plt.figure(figsize=(15, 8))\n", + "\n", + "plt.subplot(2, 2, 1)\n", + "plt.plot(ts_data.index, ts_data['daily_sales'])\n", + "plt.title('Daily Sales Over Time')\n", + "plt.ylabel('Sales ($)')\n", + "\n", + "plt.subplot(2, 2, 2)\n", + "monthly_sales = ts_data.resample('M').sum()\n", + "plt.plot(monthly_sales.index, monthly_sales['daily_sales'])\n", + "plt.title('Monthly Sales')\n", + "plt.ylabel('Monthly Sales ($)')\n", + "\n", + "plt.subplot(2, 2, 3)\n", + "ts_data['daily_sales'].hist(bins=50)\n", + "plt.title('Distribution of Daily Sales')\n", + "plt.xlabel('Sales ($)')\n", + "\n", + "plt.subplot(2, 2, 4)\n", + "weekly_avg = ts_data.groupby(ts_data.index.dayofweek)['daily_sales'].mean()\n", + "days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']\n", + "plt.bar(days, weekly_avg)\n", + "plt.title('Average Sales by Day of Week')\n", + "plt.ylabel('Average Sales ($)')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "print(f\"Daily sales statistics:\")\n", + "print(ts_data['daily_sales'].describe())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-4", + "metadata": {}, + "outputs": [], + "source": [ + "# Time series decomposition\n", + "print(\"Performing time series decomposition...\")\n", + "\n", + "# Decompose the time series\n", + "decomposition = seasonal_decompose(ts_data['daily_sales'], model='additive', period=365)\n", + "\n", + "# Plot decomposition\n", + "fig, axes = plt.subplots(4, 1, figsize=(15, 12))\n", + "\n", + "decomposition.observed.plot(ax=axes[0], title='Original Time 
Series')\n", + "decomposition.trend.plot(ax=axes[1], title='Trend Component')\n", + "decomposition.seasonal.plot(ax=axes[2], title='Seasonal Component')\n", + "decomposition.resid.plot(ax=axes[3], title='Residual Component')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "# Calculate component statistics\n", + "trend_strength = 1 - (decomposition.resid.var() / (decomposition.trend + decomposition.resid).var())\n", + "seasonal_strength = 1 - (decomposition.resid.var() / (decomposition.seasonal + decomposition.resid).var())\n", + "\n", + "print(f\"Trend strength: {trend_strength:.3f}\")\n", + "print(f\"Seasonal strength: {seasonal_strength:.3f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-5", + "metadata": {}, + "outputs": [], + "source": [ + "# Split data for time series forecasting\n", + "train_size = int(len(ts_data) * 0.8)\n", + "train_data = ts_data[:train_size]\n", + "test_data = ts_data[train_size:]\n", + "\n", + "print(f\"Training period: {train_data.index.min()} to {train_data.index.max()}\")\n", + "print(f\"Test period: {test_data.index.min()} to {test_data.index.max()}\")\n", + "print(f\"Training samples: {len(train_data)}\")\n", + "print(f\"Test samples: {len(test_data)}\")\n", + "\n", + "# Visualize train/test split\n", + "plt.figure(figsize=(15, 6))\n", + "plt.plot(train_data.index, train_data['daily_sales'], label='Training', color='blue')\n", + "plt.plot(test_data.index, test_data['daily_sales'], label='Test', color='red')\n", + "plt.axvline(x=train_data.index[-1], color='black', linestyle='--', alpha=0.7, label='Train/Test Split')\n", + "plt.title('Train/Test Split Visualization')\n", + "plt.ylabel('Daily Sales ($)')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-6", + "metadata": {}, + "outputs": [], + "source": [ + "# Fit ARIMA model for forecasting\n", + "print(\"Fitting ARIMA model...\")\n", + "\n", + "# Simple ARIMA model (could be improved with proper order selection)\n", + "model = ARIMA(train_data['daily_sales'], order=(1, 1, 1))\n", + "fitted_model = model.fit()\n", + "\n", + "# Generate forecasts\n", + "forecast_steps = len(test_data)\n", + "forecast = fitted_model.forecast(steps=forecast_steps)\n", + "forecast_ci = fitted_model.get_forecast(steps=forecast_steps).conf_int()\n", + "\n", + "print(f\"Model summary:\")\n", + "print(fitted_model.summary())\n", + "\n", + "# Calculate forecast errors\n", + "mae = mean_absolute_error(test_data['daily_sales'], forecast)\n", + "rmse = np.sqrt(mean_squared_error(test_data['daily_sales'], forecast))\n", + "mape = np.mean(np.abs((test_data['daily_sales'] - forecast) / test_data['daily_sales'])) * 100\n", + "\n", + "print(f\"\\nForecast Performance:\")\n", + "print(f\"MAE: ${mae:.2f}\")\n", + "print(f\"RMSE: ${rmse:.2f}\")\n", + "print(f\"MAPE: {mape:.2f}%\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-7", + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize forecasts\n", + "plt.figure(figsize=(15, 8))\n", + "\n", + "# Plot training data\n", + "plt.plot(train_data.index[-100:], train_data['daily_sales'][-100:], \n", + " label='Historical (last 100 days)', color='blue', alpha=0.7)\n", + "\n", + "# Plot actual test data\n", + "plt.plot(test_data.index, test_data['daily_sales'], \n", + " label='Actual', color='green', linewidth=2)\n", + "\n", + "# Plot forecasts\n", + "plt.plot(test_data.index, forecast, \n", + " label='ARIMA Forecast', color='red', linewidth=2)\n", 
+ "\n", + "# Plot confidence intervals\n", + "plt.fill_between(test_data.index, \n", + " forecast_ci.iloc[:, 0], \n", + " forecast_ci.iloc[:, 1], \n", + " color='red', alpha=0.2, label='95% Confidence Interval')\n", + "\n", + "plt.axvline(x=train_data.index[-1], color='black', linestyle='--', alpha=0.7, label='Forecast Start')\n", + "plt.title('Time Series Forecasting Results')\n", + "plt.ylabel('Daily Sales ($)')\n", + "plt.legend()\n", + "plt.grid(True, alpha=0.3)\n", + "plt.show()\n", + "\n", + "# Residual analysis\n", + "residuals = test_data['daily_sales'] - forecast\n", + "\n", + "plt.figure(figsize=(12, 4))\n", + "plt.subplot(1, 2, 1)\n", + "plt.plot(test_data.index, residuals)\n", + "plt.title('Forecast Residuals')\n", + "plt.ylabel('Residual')\n", + "\n", + "plt.subplot(1, 2, 2)\n", + "plt.hist(residuals, bins=30, alpha=0.7)\n", + "plt.title('Distribution of Residuals')\n", + "plt.xlabel('Residual')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index 0c403d7..b5ba5bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,18 @@ data_analytics = [ "seaborn" ] -all = ["jupyter-ai-personas[finance,emoji,software_team,data_analytics,pr_review]"] +context_retriever = [ + "agno", + "boto3", + "langchain", + "langchain-core", + "langchain-community", + "sentence-transformers", + "chromadb", + "nbformat" +] + +all = ["jupyter-ai-personas[finance,emoji,software_team,data_analytics,pr_review,context_retriever]"] [build-system] requires = ["hatchling"] @@ -74,3 +85,4 @@ emoji_persona = "jupyter_ai_personas.emoji_persona.persona:EmojiPersona" software_team_persona = "jupyter_ai_personas.software_team_persona.persona:SoftwareTeamPersona" data_analytics_persona = "jupyter_ai_personas.data_analytics_persona.persona:DataAnalyticsTeam" pr_review_persona = "jupyter_ai_personas.pr_review_persona.persona:PRReviewPersona" +context_retrieval_persona = "jupyter_ai_personas.context_retrieval_persona.persona:ContextRetrievalPersona"
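A minimal sketch for exercising the new handbook RAG search directly, assuming the `context_retriever` extras are installed and the PythonDataScienceHandbook repository has been cloned into the persona directory as the README describes; the import path and the `search_handbook_only` call follow `rag_tool.py` above, everything else is illustrative.

```python
# Smoke-test the handbook RAG search from rag_tool.py outside of Jupyter AI.
# Assumes: pip install -e ".[context_retriever]" and the handbook cloned into
# jupyter_ai_personas/context_retrieval_persona/PythonDataScienceHandbook/.
from jupyter_ai_personas.context_retrieval_persona.rag_tool import create_rag_tools

tools = create_rag_tools()

# The first call builds the ChromaDB vector store from the handbook notebooks
# (several minutes); later calls in the same process reuse it via handbook_loaded.
result_json = tools.search_handbook_only("sklearn random forest classification")

# The tool returns a JSON string with the query, total_results, and the full
# content of the 1-2 most relevant notebooks.
print(result_json[:500])
```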