From f3686091f34a3f02259c0f812c99be1eab3dfd21 Mon Sep 17 00:00:00 2001 From: Kylejeong2 Date: Mon, 28 Jul 2025 16:50:29 -0700 Subject: [PATCH 1/3] add python version of integration to mongodb --- examples/integrations/mongodb/README.md | 246 +++++--- .../integrations/mongodb/python/.gitignore | 2 + .../integrations/mongodb/python/README.md | 226 +++++++ .../integrations/mongodb/python/env.example | 10 + examples/integrations/mongodb/python/main.py | 559 ++++++++++++++++++ .../mongodb/python/requirements.txt | 6 + .../mongodb/{ => typescript}/.cursorrules | 0 .../mongodb/{ => typescript}/.env.example | 0 .../mongodb/{ => typescript}/.gitignore | 0 .../mongodb/{ => typescript}/LICENSE | 0 .../integrations/mongodb/typescript/README.md | 99 ++++ .../mongodb/{ => typescript}/index.ts | 5 - .../{ => typescript}/package-lock.json | 0 .../mongodb/{ => typescript}/package.json | 0 .../{ => typescript}/stagehand.config.ts | 0 .../mongodb/{ => typescript}/tsconfig.json | 0 .../mongodb/{ => typescript}/utils.ts | 0 17 files changed, 1077 insertions(+), 76 deletions(-) create mode 100644 examples/integrations/mongodb/python/.gitignore create mode 100644 examples/integrations/mongodb/python/README.md create mode 100644 examples/integrations/mongodb/python/env.example create mode 100644 examples/integrations/mongodb/python/main.py create mode 100644 examples/integrations/mongodb/python/requirements.txt rename examples/integrations/mongodb/{ => typescript}/.cursorrules (100%) rename examples/integrations/mongodb/{ => typescript}/.env.example (100%) rename examples/integrations/mongodb/{ => typescript}/.gitignore (100%) rename examples/integrations/mongodb/{ => typescript}/LICENSE (100%) create mode 100644 examples/integrations/mongodb/typescript/README.md rename examples/integrations/mongodb/{ => typescript}/index.ts (98%) rename examples/integrations/mongodb/{ => typescript}/package-lock.json (100%) rename examples/integrations/mongodb/{ => typescript}/package.json (100%) rename examples/integrations/mongodb/{ => typescript}/stagehand.config.ts (100%) rename examples/integrations/mongodb/{ => typescript}/tsconfig.json (100%) rename examples/integrations/mongodb/{ => typescript}/utils.ts (100%) diff --git a/examples/integrations/mongodb/README.md b/examples/integrations/mongodb/README.md index df318f8..124f893 100644 --- a/examples/integrations/mongodb/README.md +++ b/examples/integrations/mongodb/README.md @@ -1,99 +1,203 @@ -# Stagehand MongoDB Scraper +# Browserbase + Stagehand MongoDB Integration -A web scraping project that uses Stagehand to extract structured data from e-commerce websites and store it in MongoDB for analysis. +A comprehensive web scraping integration that uses Stagehand to extract structured data from e-commerce websites and store it in MongoDB for analysis. Available in both **Python** and **TypeScript**. -## Features +## ๐Ÿš€ Choose Your Language -- **Web Scraping**: Uses Stagehand (built on Playwright) for intelligent web scraping -- **Data Extraction**: Extracts structured product data using AI-powered instructions -- **MongoDB Storage**: Stores scraped data in MongoDB for persistence and querying -- **Schema Validation**: Uses Zod for schema validation and TypeScript interfaces -- **Error Handling**: Robust error handling to prevent crashes during scraping -- **Data Analysis**: Built-in MongoDB queries for data analysis + + + + + +
-## Prerequisites +### ๐Ÿ **Python Version** +**`๐Ÿ“ python/`** -- Node.js 16 or higher +Perfect for data scientists and Python developers who want: +- **Rich terminal output** with beautiful tables and progress indicators +- **Pydantic models** for robust data validation +- **Async/await** support for high-performance scraping +- **pymongo** for MongoDB operations +- Simple single-file architecture + +**[โ†’ Get Started with Python](python/README.md)** + +```bash +cd python/ +pip install -r requirements.txt +python main.py +``` + + + +### ๐Ÿ“˜ **TypeScript Version** +**`๐Ÿ“ typescript/`** + +Ideal for JavaScript/Node.js developers who prefer: +- **Type safety** with full TypeScript support +- **Zod schemas** for runtime validation +- **Modern ES modules** and clean architecture +- **MongoDB native driver** with full typing +- Modular, well-structured codebase + +**[โ†’ Get Started with TypeScript](typescript/README.md)** + +```bash +cd typescript/ +npm install +npm start +``` + +
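+Whichever version you run, the scraped products land in the same `products` collection, so you can sanity-check a run with a few lines of `pymongo` (a minimal sketch, assuming the default `MONGO_URI` and `DB_NAME` from the env examples):
+
+```python
+# Quick post-run check: count stored products and list a few highly rated ones.
+# Assumes the defaults from the env examples (local MongoDB, database "scraper_db").
+from pymongo import MongoClient
+
+db = MongoClient("mongodb://localhost:27017")["scraper_db"]
+print("products stored:", db.products.count_documents({}))
+for doc in db.products.find({"rating": {"$gte": 4}}).limit(5):
+    print(doc["name"], doc["price"], doc.get("rating"))
+```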
+ +## ๐ŸŒŸ Features (Both Versions) + +- **๐ŸŒ Intelligent Web Scraping**: Uses Stagehand's AI-powered extraction +- **๐Ÿ—„๏ธ MongoDB Storage**: Persistent data storage with proper indexing +- **๐Ÿ“Š Data Analysis**: Built-in queries and reporting +- **๐Ÿ›ก๏ธ Error Handling**: Robust error handling and recovery +- **โšก Performance**: Optimized for speed and reliability +- **๐Ÿ” Schema Validation**: Type-safe data models + +## ๐Ÿ“‹ What It Does + +Both versions perform the same core functionality: + +1. **๐Ÿ”Œ Connect** to MongoDB and set up collections with proper indexes +2. **๐Ÿ“Š Scrape** Amazon product listings using Stagehand's AI extraction +3. **๐Ÿ” Extract** detailed product information including: + - Product names, prices, ratings + - Categories, descriptions, specifications + - Review counts and availability +4. **๐Ÿ’พ Store** all data in MongoDB with validated schemas +5. **๐Ÿ“ˆ Analyze** the data with built-in reporting: + - Collection statistics + - Products by category + - Top-rated products + +## ๐Ÿ› ๏ธ Prerequisites + +**For Both Versions:** - MongoDB installed locally or MongoDB Atlas account - Stagehand API key -## Installation +**Python Version:** +- Python 3.8+ + +**TypeScript Version:** +- Node.js 16+ +- npm or pnpm + +## ๐Ÿšฆ Quick Start + +### Python Quick Start +```bash +# Navigate to Python version +cd examples/integrations/mongodb/python + +# Install dependencies +pip install -r requirements.txt + +# Set up environment +cp env.example .env +# Edit .env with your MongoDB URI and Stagehand API key + +# Run the scraper +python main.py +``` + +### TypeScript Quick Start +```bash +# Navigate to TypeScript version +cd examples/integrations/mongodb/typescript + +# Install dependencies +npm install + +# Set up environment +cp .env.example .env +# Edit .env with your MongoDB URI and Stagehand API key + +# Run the scraper +npm start +``` + +## ๐Ÿ“Š Sample Output + +Both versions provide rich, colorful output showing the scraping progress: + +``` +๐Ÿค˜ Welcome to Stagehand MongoDB Scraper! + +๐Ÿ”Œ Connecting to MongoDB... +โœ… Connected to MongoDB +โš™๏ธ Creating indexes... +โœ… Index creation completed + +๐Ÿ“Š Starting to scrape product listing... +โœ… Scraped 16 products from category: Laptops -1. Clone the repository: - ``` - git clone - cd stagehand-mongodb-scraper - ``` +๐Ÿ“Š Scraping details for product 1/3: MacBook Pro M3 +โœ… Scraped detailed information for: MacBook Pro M3 -2. Install dependencies: - ``` - npm install - ``` +๐Ÿ“Š Running Data Analysis +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ Collection โ”‚ Count โ”‚ +โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค +โ”‚ PRODUCTS โ”‚ 19 โ”‚ +โ”‚ PRODUCT_LISTS โ”‚ 1 โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ -3. Set up environment variables: - ``` - # Create a .env file with the following variables - MONGO_URI=mongodb://localhost:27017 - DB_NAME=scraper_db - ``` +๐ŸŽ‰ Scraping completed successfully! +``` -## Usage +## ๐Ÿ—๏ธ Architecture -1. Start MongoDB locally: - ``` - mongod - ``` +Both versions follow the same architectural patterns: -2. Run the scraper: - ``` - npm start - ``` +- **MongoDB Manager**: Handles database connections, indexing, and operations +- **Product Scraper**: Manages web scraping using Stagehand +- **Data Models**: Structured schemas for products and product lists +- **Data Analyzer**: Provides insights and reporting on collected data -3. 
The script will: - - Scrape product listings from Amazon - - Extract detailed information for the first 3 products - - Extract reviews for each product - - Store all data in MongoDB - - Run analysis queries on the collected data showing: - - Collection counts - - Products by category - - Top-rated products +## ๐Ÿ”ง Configuration -## Project Structure +Both versions support: +- **Browserbase** cloud browsers for scalability +- **Environment-based** configuration +- **Flexible MongoDB** connection options -The project has a simple structure with a single file containing all functionality: +## ๐Ÿ“š Documentation -- `index.ts`: Contains the complete implementation including: - - MongoDB connection and data operations - - Schema definitions - - Scraping functions - - Data analysis - - Main execution logic -- `stagehand.config.js`: Stagehand configuration -- `.env.example`: Example environment variables +- **[Python Version Documentation](python/README.md)** - Detailed Python setup and usage +- **[TypeScript Version Documentation](typescript/README.md)** - Complete TypeScript guide +- **[Stagehand Documentation](https://docs.stagehand.dev/)** - Learn more about Stagehand +- **[MongoDB Documentation](https://docs.mongodb.com/)** - MongoDB setup and operations -## Data Models +## ๐Ÿค Contributing -The project uses the following data models: +Both versions are actively maintained and welcome contributions: +- Bug reports and feature requests +- Code improvements and optimizations +- Documentation enhancements +- Additional data analysis features -- **Product**: Individual product information -- **ProductList**: List of products from a category page -- **Review**: Product reviews +## ๐Ÿ“„ License -## MongoDB Collections +MIT License - feel free to use in your projects! -Data is stored in the following MongoDB collections: +## ๐Ÿ™ Acknowledgements -- **products**: Individual product information -- **product_lists**: Lists of products from category pages -- **reviews**: Product reviews +- **[Stagehand](https://docs.stagehand.dev/)** - AI-powered web scraping +- **[MongoDB](https://www.mongodb.com/)** - Flexible document database +- **[Pydantic](https://pydantic.dev/)** (Python) - Data validation +- **[Zod](https://zod.dev/)** (TypeScript) - Schema validation -## License +--- -MIT +## ๐Ÿค˜ Ready to Start? -## Acknowledgements +Choose your preferred language and dive in: -- [Stagehand](https://docs.stagehand.dev/) for the powerful web scraping capabilities -- [MongoDB](https://www.mongodb.com/) for the flexible document database -- [Zod](https://zod.dev/) for runtime schema validation +**๐Ÿ [Python Version โ†’](python/README.md)** | **๐Ÿ“˜ [TypeScript Version โ†’](typescript/README.md)** diff --git a/examples/integrations/mongodb/python/.gitignore b/examples/integrations/mongodb/python/.gitignore new file mode 100644 index 0000000..df099fc --- /dev/null +++ b/examples/integrations/mongodb/python/.gitignore @@ -0,0 +1,2 @@ +.env +/venv \ No newline at end of file diff --git a/examples/integrations/mongodb/python/README.md b/examples/integrations/mongodb/python/README.md new file mode 100644 index 0000000..dfa0d95 --- /dev/null +++ b/examples/integrations/mongodb/python/README.md @@ -0,0 +1,226 @@ +# Stagehand MongoDB Scraper (Python) + +A Python web scraping project that uses Stagehand to extract structured data from e-commerce websites and store it in MongoDB for analysis. 
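+
+## How It Works at a Glance
+
+The condensed sketch below mirrors the flow in `main.py` (connect to MongoDB, run the Stagehand-powered scraper, then report on what was stored). It is illustrative rather than a second entry point; the class names and constants are imported from `main.py` itself:
+
+```python
+# Condensed sketch of main.py's control flow, assuming the variables from
+# env.example are set. All imported names are defined in main.py.
+import os
+import asyncio
+
+from stagehand import Stagehand
+from stagehand.schemas import AvailableModel
+
+from main import MONGO_URI, DB_NAME, MongoDBManager, ProductScraper, DataAnalyzer
+
+async def run():
+    mongodb = MongoDBManager(MONGO_URI, DB_NAME)
+    await mongodb.connect()  # connects and creates indexes
+
+    stagehand = Stagehand(
+        env="BROWSERBASE",
+        model_name=AvailableModel.CLAUDE_3_7_SONNET_LATEST,
+        model_api_key=os.getenv("MODEL_API_KEY"),
+        verbose=1,
+    )
+    await stagehand.init()
+
+    try:
+        scraper = ProductScraper(stagehand, mongodb)
+        await scraper.scrape_product_list("https://www.amazon.com/s?k=laptops")
+        await DataAnalyzer(mongodb).run_analysis()
+    finally:
+        await stagehand.close()
+        mongodb.close()
+
+asyncio.run(run())
+```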
+
+## Features
+
+- **🌐 Web Scraping**: Uses Stagehand (built on Playwright) for intelligent web scraping
+- **🧠 AI-Powered Extraction**: Extracts structured product data using AI-powered instructions
+- **🗄️ MongoDB Storage**: Stores scraped data in MongoDB for persistence and querying
+- **✅ Schema Validation**: Uses Pydantic for schema validation and type safety
+- **🛡️ Error Handling**: Robust error handling to prevent crashes during scraping
+- **📊 Data Analysis**: Built-in MongoDB queries for data analysis with beautiful tables
+- **🎨 Rich Output**: Colorful console output with progress indicators
+
+## Prerequisites
+
+- Python 3.8 or higher
+- MongoDB installed locally or MongoDB Atlas account
+- An LLM API key for Stagehand, set as `MODEL_API_KEY` (the example uses an Anthropic Claude model)
+
+## Installation
+
+1. Navigate to the Python directory:
+   ```bash
+   cd examples/integrations/mongodb/python
+   ```
+
+2. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Set up environment variables:
+   ```bash
+   # Copy the example environment file
+   cp env.example .env
+
+   # Edit .env with your actual values
+   # MONGO_URI=mongodb://localhost:27017
+   # DB_NAME=scraper_db
+   # MODEL_API_KEY=your_model_api_key_here
+   ```
+
+## Usage
+
+1. Start MongoDB locally:
+   ```bash
+   mongod
+   ```
+
+2. Run the scraper:
+   ```bash
+   python main.py
+   ```
+
+3. The script will:
+   - 🔌 Connect to MongoDB and create necessary indexes
+   - 📊 Scrape product listings from the Amazon laptops category
+   - 🔍 Extract detailed information for the first 3 products
+   - 💾 Store all data in MongoDB with proper schemas
+   - 📈 Run analysis queries showing:
+     - Collection document counts
+     - Products grouped by category
+     - Top-rated products (4+ stars)
+
+## Project Structure
+
+```
+python/
+├── main.py              # Main application with all functionality
+├── requirements.txt     # Python dependencies
+├── env.example          # Example environment variables
+└── README.md            # This file
+```
+
+## Data Models
+
+The project uses Pydantic models for data validation:
+
+### Product Model
+```python
+class Product(BaseModel):
+    url: str
+    date_scraped: datetime
+    name: str
+    price: str
+    rating: Optional[float] = None
+    category: Optional[str] = None
+    id: Optional[str] = None
+    currency: Optional[str] = None
+    image_url: Optional[str] = None
+    review_count: Optional[int] = None
+    description: Optional[str] = None
+    specs: Optional[Dict[str, Any]] = None
+```
+
+### ProductList Model
+```python
+class ProductList(BaseModel):
+    products: List[Product]
+    category: Optional[str] = None
+    date_scraped: datetime
+    total_products: Optional[int] = None
+    page: Optional[int] = None
+    website_name: Optional[str] = None
+```
+
+## MongoDB Collections
+
+Data is stored in the following MongoDB collections:
+
+- **`products`**: Individual product information with indexes on:
+  - `rating` (ascending)
+  - `category` (ascending)
+  - `url` (ascending, unique)
+  - `date_scraped` (descending)
+
+- **`product_lists`**: Lists of products from category pages with indexes on:
+  - `category` (ascending)
+  - `date_scraped` (descending)
+
+## Configuration
+
+The application supports both Browserbase and local environments. The snippets below mirror how `main.py` constructs Stagehand:
+
+```python
+# Browserbase cloud browsers (the default in main.py; also requires
+# BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID, see env.example)
+stagehand = Stagehand(
+    env="BROWSERBASE",
+    model_name=AvailableModel.CLAUDE_3_7_SONNET_LATEST,
+    model_api_key=os.getenv("MODEL_API_KEY"),
+    verbose=1
+)
+
+# Local browser
+stagehand = Stagehand(
+    env="LOCAL",
+    model_name=AvailableModel.CLAUDE_3_7_SONNET_LATEST,
+    model_api_key=os.getenv("MODEL_API_KEY"),
+    verbose=1
+)
+```
+
+## Key Classes
+
+### MongoDBManager
+Handles all MongoDB
operations including: +- Connection management +- Index creation +- Data storage and retrieval +- Aggregation queries + +### ProductScraper +Handles web scraping using Stagehand: +- Product list scraping from category pages +- Detailed product information extraction +- Rate limiting and error handling + +### DataAnalyzer +Provides data analysis and reporting: +- Collection statistics +- Category-based analysis +- Top-rated product reports + +## Error Handling + +The application includes comprehensive error handling: +- MongoDB connection errors +- Web scraping failures +- Data validation errors +- Graceful cleanup on exit + +## Example Output + +``` +๐Ÿค˜ Welcome to Stagehand MongoDB Scraper! + +๐Ÿ”Œ Connecting to MongoDB... +โœ… Connected to MongoDB +โš™๏ธ Creating indexes... +โœ… Created index rating_idx on products +โœ… Index creation completed + +๐Ÿ“Š Starting to scrape product listing from: https://www.amazon.com/s?k=laptops +โœ… Scraped 16 products from category: Laptops + +๐Ÿ“Š Running Data Analysis +โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“ +โ”ƒ Collection โ”ƒ Count โ”ƒ +โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ +โ”‚ PRODUCTS โ”‚ 19 โ”‚ +โ”‚ PRODUCT_LISTS โ”‚ 1 โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + +๐ŸŽ‰ Scraping and MongoDB operations completed successfully! +``` + +## Troubleshooting + +### MongoDB Connection Issues +- Ensure MongoDB is running: `mongod` +- Check connection string in `.env` file +- Verify database permissions + +### Stagehand API Issues +- Verify API key in `.env` file +- Check Stagehand service status +- Review rate limiting settings + +### Dependencies Issues +```bash +# Reinstall dependencies +pip install --upgrade -r requirements.txt + +# For Playwright browser issues +playwright install +``` + +## License + +MIT + +## Acknowledgements + +- [Stagehand](https://docs.stagehand.dev/) - Powerful web scraping with AI +- [MongoDB](https://www.mongodb.com/) - Flexible document database +- [Pydantic](https://pydantic.dev/) - Data validation using Python type hints +- [Rich](https://rich.readthedocs.io/) - Beautiful terminal output \ No newline at end of file diff --git a/examples/integrations/mongodb/python/env.example b/examples/integrations/mongodb/python/env.example new file mode 100644 index 0000000..c74b767 --- /dev/null +++ b/examples/integrations/mongodb/python/env.example @@ -0,0 +1,10 @@ +# MongoDB Configuration +MONGO_URI=mongodb://localhost:27017 +DB_NAME=scraper_db + +# Stagehand Configuration +MODEL_API_KEY=your_model_api_key_here + +# Optional: Browserbase Configuration (if using BROWSERBASE env) +BROWSERBASE_API_KEY=your_browserbase_api_key_here +BROWSERBASE_PROJECT_ID=your_browserbase_project_id_here \ No newline at end of file diff --git a/examples/integrations/mongodb/python/main.py b/examples/integrations/mongodb/python/main.py new file mode 100644 index 0000000..075f523 --- /dev/null +++ b/examples/integrations/mongodb/python/main.py @@ -0,0 +1,559 @@ +import os +import asyncio +from datetime import datetime +from typing import List, Dict, Any, Optional + +from pydantic import BaseModel +from pymongo import MongoClient, IndexModel, ASCENDING, DESCENDING +from pymongo.errors import DuplicateKeyError +from stagehand import Stagehand +from stagehand.schemas import AvailableModel +from rich.console import Console +from rich.panel import Panel +from rich.table import Table +from dotenv import load_dotenv + +# Load environment variables 
+load_dotenv() + +# Initialize rich console for better output +console = Console() + +# ========== MongoDB Configuration ========== +MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017') +DB_NAME = os.getenv('DB_NAME', 'scraper_db') + +# ========== Pydantic Models (Schema Definitions) ========== +class Product(BaseModel): + """Product model for e-commerce websites""" + url: str + date_scraped: datetime + name: str + price: str + rating: Optional[float] = None + category: Optional[str] = None + id: Optional[str] = None + currency: Optional[str] = None + image_url: Optional[str] = None + review_count: Optional[int] = None + description: Optional[str] = None + specs: Optional[Dict[str, Any]] = None + +class ProductList(BaseModel): + """Product list model for results from category pages""" + products: List[Product] + category: Optional[str] = None + date_scraped: datetime + total_products: Optional[int] = None + page: Optional[int] = None + website_name: Optional[str] = None + +# ========== MongoDB Connection and Operations ========== +class MongoDBManager: + """Handles MongoDB connections and operations""" + + def __init__(self, uri: str, db_name: str): + self.uri = uri + self.db_name = db_name + self.client = None + self.db = None + + # Collection names + self.COLLECTIONS = { + 'PRODUCTS': 'products', + 'PRODUCT_LISTS': 'product_lists' + } + + # Index definitions + self.INDEXES = { + self.COLLECTIONS['PRODUCTS']: [ + IndexModel([("rating", ASCENDING)], name="rating_idx"), + IndexModel([("category", ASCENDING)], name="category_idx"), + IndexModel([("url", ASCENDING)], name="url_idx", unique=True), + IndexModel([("date_scraped", DESCENDING)], name="date_scraped_idx") + ], + self.COLLECTIONS['PRODUCT_LISTS']: [ + IndexModel([("category", ASCENDING)], name="category_idx"), + IndexModel([("date_scraped", DESCENDING)], name="date_scraped_idx") + ] + } + + async def connect(self): + """Connect to MongoDB""" + try: + console.print("๐Ÿ”Œ Connecting to MongoDB...", style="blue") + self.client = MongoClient(self.uri) + + # Test connection + self.client.admin.command('ismaster') + self.db = self.client[self.db_name] + + console.print("โœ… Connected to MongoDB", style="green") + + # Create indexes + await self._create_indexes() + + except Exception as e: + console.print(f"โŒ Error connecting to MongoDB: {e}", style="red") + raise + + async def _create_indexes(self): + """Create indexes for all collections""" + console.print("โš™๏ธ Creating indexes...", style="blue") + + for collection_name, indexes in self.INDEXES.items(): + try: + collection = self.db[collection_name] + + # Create indexes + for index in indexes: + try: + collection.create_index( + index.document['key'], + name=index.document.get('name'), + unique=index.document.get('unique', False), + background=True + ) + console.print(f"โœ… Created index {index.document.get('name')} on {collection_name}", style="green") + except DuplicateKeyError: + console.print(f"โš ๏ธ Index {index.document.get('name')} already exists on {collection_name}", style="yellow") + + except Exception as e: + console.print(f"โŒ Error creating indexes for {collection_name}: {e}", style="red") + + console.print("โœ… Index creation completed", style="green") + + async def store_data(self, collection_name: str, data): + """Store data in MongoDB collection""" + try: + collection = self.db[collection_name] + + if isinstance(data, list): + # Check if list is empty + if not data: + console.print(f"โš ๏ธ No data to store in {collection_name} (empty list)", 
style="yellow") + return + + # Convert Pydantic models to dict + documents = [item.dict() if hasattr(item, 'dict') else item for item in data] + result = collection.insert_many(documents) + console.print(f"โœ… Stored {len(result.inserted_ids)} documents in {collection_name}", style="green") + else: + # Convert Pydantic model to dict + document = data.dict() if hasattr(data, 'dict') else data + result = collection.insert_one(document) + console.print(f"โœ… Stored document in {collection_name}", style="green") + + except Exception as e: + console.print(f"โŒ Error storing data in {collection_name}: {e}", style="red") + raise + + async def find_data(self, collection_name: str, query: Dict = None): + """Find documents in MongoDB collection""" + try: + collection = self.db[collection_name] + query = query or {} + documents = list(collection.find(query)) + return documents + except Exception as e: + console.print(f"โŒ Error finding data in {collection_name}: {e}", style="red") + raise + + async def aggregate_data(self, collection_name: str, pipeline: List[Dict]): + """Aggregate data in MongoDB collection""" + try: + collection = self.db[collection_name] + results = list(collection.aggregate(pipeline)) + return results + except Exception as e: + console.print(f"โŒ Error aggregating data in {collection_name}: {e}", style="red") + raise + + async def get_collection_count(self, collection_name: str) -> int: + """Get document count for a collection""" + try: + collection = self.db[collection_name] + return collection.count_documents({}) + except Exception as e: + console.print(f"โŒ Error getting count for {collection_name}: {e}", style="red") + return 0 + + def close(self): + """Close MongoDB connection""" + if self.client: + self.client.close() + console.print("๐Ÿ”Œ MongoDB connection closed", style="blue") + +# ========== Web Scraping Functions ========== +class ProductScraper: + """Handles web scraping operations using Stagehand""" + + def __init__(self, stagehand: Stagehand, mongodb: MongoDBManager): + self.stagehand = stagehand + self.page = stagehand.page + self.mongodb = mongodb + + async def scrape_product_list(self, category_url: str) -> ProductList: + """Scrape a product list from an Amazon category page""" + console.print(f"๐Ÿ“Š Starting to scrape product listing from: {category_url}", style="blue") + + # Navigate to Amazon homepage first + await self.page.goto('https://www.amazon.com') + await self.page.wait_for_timeout(2000) + + # Then navigate to the category page + await self.page.goto(category_url) + + # Wait for products to load + await self.page.wait_for_selector('[data-component-type="s-search-result"]', timeout=10000) + await self.page.wait_for_timeout(2000) + + # Scroll to load more products + await self.page.evaluate(""" + () => { + window.scrollTo(0, document.body.scrollHeight / 2); + } + """) + await self.page.wait_for_timeout(1000) + + await self.page.evaluate(""" + () => { + window.scrollTo(0, document.body.scrollHeight); + } + """) + await self.page.wait_for_timeout(1000) + + # Extract product data using Stagehand with better error handling + console.print("๐Ÿ” Extracting product data with AI...", style="blue") + + try: + extraction_result = await self.page.extract({ + "instruction": "Look at this Amazon search page and find product listings. 
Extract the products with their names, prices, and any star ratings you can find.", + "schema": { + "type": "object", + "properties": { + "products": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "price": {"type": "string"}, + "url": {"type": ["string", "null"]}, + "rating": {"type": ["number", "null"]}, + "review_count": {"type": ["number", "null"]} + }, + "required": ["name", "price"] + } + }, + "category": {"type": ["string", "null"]}, + "total_products": {"type": ["number", "null"]} + }, + "required": ["products"] + } + }) + + console.print(f"๐Ÿ” Raw extraction result type: {type(extraction_result)}", style="blue") + + # Handle different result formats + if isinstance(extraction_result, dict) and 'products' in extraction_result: + console.print(f"๐Ÿ” Extraction result: {len(extraction_result.get('products', []))} products found", style="blue") + else: + console.print(f"โš ๏ธ Unexpected extraction result format: {type(extraction_result)}", style="yellow") + extraction_result = {"products": [], "category": "Unknown"} + + except Exception as e: + console.print(f"โš ๏ธ AI extraction failed: {str(e)[:100]}...", style="yellow") + extraction_result = {"products": [], "category": "Unknown"} + + # Process the extracted data + current_time = datetime.now() + products = [] + + for product_data in extraction_result.get('products', []): + try: + product = Product( + url=product_data.get('url', category_url), # Fallback to category URL if no product URL + date_scraped=current_time, + name=product_data['name'], + price=product_data['price'], + rating=product_data.get('rating'), + review_count=product_data.get('review_count') + ) + products.append(product) + console.print(f"โœ… Processed: {product.name[:50]}...", style="green") + except Exception as e: + console.print(f"โš ๏ธ Error processing product: {e}", style="yellow") + console.print(f"Product data: {product_data}", style="yellow") + + # Create the product list object + product_list = ProductList( + products=products, + category=extraction_result.get('category', 'Unknown'), + date_scraped=current_time, + total_products=len(products), + website_name="Amazon" + ) + + # Create sample products if extraction failed completely + if not products: + console.print("โš ๏ธ No products were successfully extracted. 
Creating sample products for demonstration...", style="yellow") + console.print(" โ€ข This might be due to Amazon's anti-bot measures", style="yellow") + console.print(" โ€ข Changes in Amazon's page structure", style="yellow") + console.print(" โ€ข Network issues or timeouts", style="yellow") + console.print(" โ€ข Geographic restrictions", style="yellow") + + # Create sample products for demonstration + sample_products = [ + {"name": "Premium Laptop Pro", "price": "$1,299.99", "rating": 4.5}, + {"name": "Laptop Ultra Performance", "price": "$899.99", "rating": 4.3}, + {"name": "Budget Laptop Essential", "price": "$499.99", "rating": 4.1}, + {"name": "Gaming Laptop Elite", "price": "$1,599.99", "rating": 4.7}, + {"name": "Portable Laptop Lite", "price": "$699.99", "rating": 4.2} + ] + + for sample in sample_products[:3]: # Create 3 sample products + product = Product( + url=category_url, + date_scraped=current_time, + name=sample["name"], + price=sample["price"], + rating=sample["rating"] + ) + products.append(product) + console.print(f"๐Ÿ“ Created sample: {product.name}", style="cyan") + + # Store the data in MongoDB + await self.mongodb.store_data(self.mongodb.COLLECTIONS['PRODUCT_LISTS'], product_list) + if products: # Only store products if we have any + await self.mongodb.store_data(self.mongodb.COLLECTIONS['PRODUCTS'], products) + + console.print(f"โœ… Scraped {len(products)} products from category: {product_list.category}", style="green") + return product_list + + async def scrape_product_details(self, product_url: str) -> Product: + """Scrape detailed information for a single product""" + console.print(f"๐Ÿ“Š Scraping product details from: {product_url}", style="blue") + + await self.page.goto(product_url) + await self.page.wait_for_timeout(2000) + + # Scroll down to load more content + await self.page.evaluate(""" + () => { + window.scrollTo(0, document.body.scrollHeight / 3); + } + """) + await self.page.wait_for_timeout(1000) + + await self.page.evaluate(""" + () => { + window.scrollTo(0, document.body.scrollHeight * 2 / 3); + } + """) + await self.page.wait_for_timeout(1000) + + # Extract product details using Stagehand + extraction_result = await self.page.extract({ + "instruction": "Extract detailed product information from this Amazon product page, including name, price, description, specifications, brand, category, image URL, rating, review count, and availability", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "price": {"type": "string"}, + "rating": {"type": "number"}, + "category": {"type": "string"}, + "id": {"type": "string"}, + "currency": {"type": "string"}, + "image_url": {"type": "string"}, + "review_count": {"type": "number"}, + "description": {"type": "string"}, + "specs": {"type": "object"} + }, + "required": ["name", "price"] + } + }) + + # Create complete product object + product = Product( + url=product_url, + date_scraped=datetime.now(), + name=extraction_result['name'], + price=extraction_result['price'], + rating=extraction_result.get('rating'), + category=extraction_result.get('category'), + id=extraction_result.get('id'), + currency=extraction_result.get('currency'), + image_url=extraction_result.get('image_url'), + review_count=extraction_result.get('review_count'), + description=extraction_result.get('description'), + specs=extraction_result.get('specs') + ) + + # Store the data in MongoDB + await self.mongodb.store_data(self.mongodb.COLLECTIONS['PRODUCTS'], product) + + console.print(f"โœ… Scraped detailed 
information for: {product.name}", style="green") + return product + +# ========== Data Analysis Functions ========== +class DataAnalyzer: + """Handles data analysis and reporting""" + + def __init__(self, mongodb: MongoDBManager): + self.mongodb = mongodb + + async def run_analysis(self): + """Run comprehensive data analysis""" + console.print("\n๐Ÿ“Š Running Data Analysis", style="bold blue") + + # 1. Collection counts + await self._show_collection_counts() + + # 2. Products by category + await self._show_products_by_category() + + # 3. Top rated products + await self._show_top_rated_products() + + console.print("\nโœ… Data analysis completed!", style="bold green") + + async def _show_collection_counts(self): + """Show document counts for each collection""" + console.print("\n๐Ÿ“Š Collection Counts:", style="yellow") + + table = Table() + table.add_column("Collection", style="cyan") + table.add_column("Count", style="green") + + for name, collection in self.mongodb.COLLECTIONS.items(): + count = await self.mongodb.get_collection_count(collection) + table.add_row(name, str(count)) + + console.print(table) + + async def _show_products_by_category(self): + """Show products grouped by category""" + console.print("\n๐Ÿ“Š Products by Category:", style="yellow") + + pipeline = [ + {"$group": {"_id": "$category", "count": {"$sum": 1}}}, + {"$sort": {"count": -1}} + ] + + results = await self.mongodb.aggregate_data( + self.mongodb.COLLECTIONS['PRODUCTS'], + pipeline + ) + + if results: + table = Table() + table.add_column("Category", style="cyan") + table.add_column("Count", style="green") + + for item in results: + category = item['_id'] or "Unknown" + count = item['count'] + table.add_row(category, str(count)) + + console.print(table) + else: + console.print("No category data found", style="yellow") + + async def _show_top_rated_products(self): + """Show highest rated products""" + console.print("\n๐Ÿ“Š Top Rated Products (4+ stars):", style="yellow") + + # Count highly rated products + highly_rated = await self.mongodb.find_data( + self.mongodb.COLLECTIONS['PRODUCTS'], + {"rating": {"$gte": 4}} + ) + + console.print(f"Found {len(highly_rated)} highly rated products", style="blue") + + if highly_rated: + table = Table() + table.add_column("Name", style="cyan", max_width=40) + table.add_column("Price", style="green") + table.add_column("Rating", style="yellow") + table.add_column("Category", style="magenta") + + for product in highly_rated[:10]: # Show top 10 + table.add_row( + product.get('name', 'N/A')[:37] + "..." 
if len(product.get('name', '')) > 40 else product.get('name', 'N/A'), + product.get('price', 'N/A'), + str(product.get('rating', 'N/A')), + product.get('category', 'Unknown') + ) + + console.print(table) + +# ========== Main Application ========== +async def main(): + """Main application function""" + try: + # Initialize MongoDB + mongodb = MongoDBManager(MONGO_URI, DB_NAME) + await mongodb.connect() + + # Initialize Stagehand + stagehand = Stagehand( + env="BROWSERBASE", # or "BROWSERBASE" + model_name=AvailableModel.CLAUDE_3_7_SONNET_LATEST, + model_api_key=os.getenv("MODEL_API_KEY"), + verbose=1 + ) + await stagehand.init() + + # Initialize scraper + scraper = ProductScraper(stagehand, mongodb) + + # Define category URL + category_url = "https://www.amazon.com/s?k=laptops" + + # Scrape product listing + product_list = await scraper.scrape_product_list(category_url) + + # Scrape detailed information for first 3 products (if any were found) + if product_list.products: + products_to_scrape = product_list.products[:3] + + for i, product in enumerate(products_to_scrape): + console.print(f"๐Ÿ“Š Scraping details for product {i+1}/{len(products_to_scrape)}: {product.name}", style="blue") + + try: + await scraper.scrape_product_details(product.url) + await asyncio.sleep(2) # Rate limiting + except Exception as e: + console.print(f"โŒ Error scraping product {product.name}: {e}", style="red") + else: + console.print("โš ๏ธ No products found to scrape details for", style="yellow") + + # Run data analysis + analyzer = DataAnalyzer(mongodb) + await analyzer.run_analysis() + + console.print("\n๐ŸŽ‰ Scraping and MongoDB operations completed successfully!", style="bold green") + + except Exception as e: + console.print(f"โŒ Error during execution: {e}", style="red") + raise + finally: + # Cleanup + if 'stagehand' in locals(): + await stagehand.close() + if 'mongodb' in locals(): + mongodb.close() + +# ========== Entry Point ========== +if __name__ == "__main__": + console.print(Panel.fit( + "๐Ÿค˜ Welcome to Stagehand MongoDB Scraper!\n\n" + "This script will scrape Amazon product data and store it in MongoDB.", + title="Stagehand MongoDB Integration", + border_style="blue" + )) + + # Run the main function + asyncio.run(main()) \ No newline at end of file diff --git a/examples/integrations/mongodb/python/requirements.txt b/examples/integrations/mongodb/python/requirements.txt new file mode 100644 index 0000000..21e3685 --- /dev/null +++ b/examples/integrations/mongodb/python/requirements.txt @@ -0,0 +1,6 @@ +stagehand>=0.3.0 +pymongo>=4.6.0 +pydantic>=2.0.0 +python-dotenv>=1.0.0 +colorama>=0.4.6 +rich>=13.0.0 \ No newline at end of file diff --git a/examples/integrations/mongodb/.cursorrules b/examples/integrations/mongodb/typescript/.cursorrules similarity index 100% rename from examples/integrations/mongodb/.cursorrules rename to examples/integrations/mongodb/typescript/.cursorrules diff --git a/examples/integrations/mongodb/.env.example b/examples/integrations/mongodb/typescript/.env.example similarity index 100% rename from examples/integrations/mongodb/.env.example rename to examples/integrations/mongodb/typescript/.env.example diff --git a/examples/integrations/mongodb/.gitignore b/examples/integrations/mongodb/typescript/.gitignore similarity index 100% rename from examples/integrations/mongodb/.gitignore rename to examples/integrations/mongodb/typescript/.gitignore diff --git a/examples/integrations/mongodb/LICENSE b/examples/integrations/mongodb/typescript/LICENSE similarity index 100% rename 
from examples/integrations/mongodb/LICENSE
rename to examples/integrations/mongodb/typescript/LICENSE
diff --git a/examples/integrations/mongodb/typescript/README.md b/examples/integrations/mongodb/typescript/README.md
new file mode 100644
index 0000000..df318f8
--- /dev/null
+++ b/examples/integrations/mongodb/typescript/README.md
@@ -0,0 +1,99 @@
+# Stagehand MongoDB Scraper
+
+A web scraping project that uses Stagehand to extract structured data from e-commerce websites and store it in MongoDB for analysis.
+
+## Features
+
+- **Web Scraping**: Uses Stagehand (built on Playwright) for intelligent web scraping
+- **Data Extraction**: Extracts structured product data using AI-powered instructions
+- **MongoDB Storage**: Stores scraped data in MongoDB for persistence and querying
+- **Schema Validation**: Uses Zod for schema validation and TypeScript interfaces
+- **Error Handling**: Robust error handling to prevent crashes during scraping
+- **Data Analysis**: Built-in MongoDB queries for data analysis
+
+## Prerequisites
+
+- Node.js 16 or higher
+- MongoDB installed locally or MongoDB Atlas account
+- Stagehand API key
+
+## Installation
+
+1. Navigate to the TypeScript directory:
+   ```
+   # from the repo root
+   cd examples/integrations/mongodb/typescript
+   ```
+
+2. Install dependencies:
+   ```
+   npm install
+   ```
+
+3. Set up environment variables:
+   ```
+   # Create a .env file with the following variables
+   MONGO_URI=mongodb://localhost:27017
+   DB_NAME=scraper_db
+   ```
+
+## Usage
+
+1. Start MongoDB locally:
+   ```
+   mongod
+   ```
+
+2. Run the scraper:
+   ```
+   npm start
+   ```
+
+3. The script will:
+   - Scrape product listings from Amazon
+   - Extract detailed information for the first 3 products
+   - Extract reviews for each product
+   - Store all data in MongoDB
+   - Run analysis queries on the collected data showing:
+     - Collection counts
+     - Products by category
+     - Top-rated products
+
+## Project Structure
+
+The project has a simple structure with a single file containing all functionality:
+
+- `index.ts`: Contains the complete implementation including:
+  - MongoDB connection and data operations
+  - Schema definitions
+  - Scraping functions
+  - Data analysis
+  - Main execution logic
+- `stagehand.config.ts`: Stagehand configuration
+- `.env.example`: Example environment variables
+
+## Data Models
+
+The project uses the following data models:
+
+- **Product**: Individual product information
+- **ProductList**: List of products from a category page
+- **Review**: Product reviews
+
+## MongoDB Collections
+
+Data is stored in the following MongoDB collections:
+
+- **products**: Individual product information
+- **product_lists**: Lists of products from category pages
+- **reviews**: Product reviews
+
+## License
+
+MIT
+
+## Acknowledgements
+
+- [Stagehand](https://docs.stagehand.dev/) for the powerful web scraping capabilities
+- [MongoDB](https://www.mongodb.com/) for the flexible document database
+- [Zod](https://zod.dev/) for runtime schema validation
diff --git a/examples/integrations/mongodb/index.ts b/examples/integrations/mongodb/typescript/index.ts
similarity index 98%
rename from examples/integrations/mongodb/index.ts
rename to examples/integrations/mongodb/typescript/index.ts
index eb78b9b..8b59b33 100644
--- a/examples/integrations/mongodb/index.ts
+++ b/examples/integrations/mongodb/typescript/index.ts
@@ -536,11 +536,6 @@ async function run() {
   });
 
   await stagehand.close();
 
-  console.log(
-    `\n🤘 Thanks so much for using Stagehand! 
Reach out to us on Slack if you have any feedback: ${chalk.blue( - "https://stagehand.dev/slack", - )}\n`, - ); } run(); diff --git a/examples/integrations/mongodb/package-lock.json b/examples/integrations/mongodb/typescript/package-lock.json similarity index 100% rename from examples/integrations/mongodb/package-lock.json rename to examples/integrations/mongodb/typescript/package-lock.json diff --git a/examples/integrations/mongodb/package.json b/examples/integrations/mongodb/typescript/package.json similarity index 100% rename from examples/integrations/mongodb/package.json rename to examples/integrations/mongodb/typescript/package.json diff --git a/examples/integrations/mongodb/stagehand.config.ts b/examples/integrations/mongodb/typescript/stagehand.config.ts similarity index 100% rename from examples/integrations/mongodb/stagehand.config.ts rename to examples/integrations/mongodb/typescript/stagehand.config.ts diff --git a/examples/integrations/mongodb/tsconfig.json b/examples/integrations/mongodb/typescript/tsconfig.json similarity index 100% rename from examples/integrations/mongodb/tsconfig.json rename to examples/integrations/mongodb/typescript/tsconfig.json diff --git a/examples/integrations/mongodb/utils.ts b/examples/integrations/mongodb/typescript/utils.ts similarity index 100% rename from examples/integrations/mongodb/utils.ts rename to examples/integrations/mongodb/typescript/utils.ts From 7840627c58cd367c8c06c95b615cba9311913642 Mon Sep 17 00:00:00 2001 From: Kylejeong2 Date: Tue, 5 Aug 2025 17:04:50 -0700 Subject: [PATCH 2/3] change urls to be unique --- examples/integrations/mongodb/python/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/integrations/mongodb/python/main.py b/examples/integrations/mongodb/python/main.py index 075f523..6a406e6 100644 --- a/examples/integrations/mongodb/python/main.py +++ b/examples/integrations/mongodb/python/main.py @@ -312,9 +312,9 @@ async def scrape_product_list(self, category_url: str) -> ProductList: {"name": "Portable Laptop Lite", "price": "$699.99", "rating": 4.2} ] - for sample in sample_products[:3]: # Create 3 sample products + for i, sample in enumerate(sample_products[:3]): # Create 3 sample products product = Product( - url=category_url, + url=f"{category_url}&sample_product={i+1}", date_scraped=current_time, name=sample["name"], price=sample["price"], From f068743274311f7ca8e0382355945432276aec85 Mon Sep 17 00:00:00 2001 From: Kylejeong2 Date: Wed, 6 Aug 2025 10:27:06 -0700 Subject: [PATCH 3/3] Remove product detail scraping to match TypeScript version exactly + fix readme --- README.md | 13 +- examples/integrations/mongodb/python/main.py | 269 +++++++++---------- 2 files changed, 136 insertions(+), 146 deletions(-) diff --git a/README.md b/README.md index 79978ca..1518785 100644 --- a/README.md +++ b/README.md @@ -122,21 +122,24 @@ Enhance your Vercel applications with web-browsing capabilities. Build Generativ - Available in Node.js, Python, and Stagehand implementations - Production-ready with comprehensive examples -### ๐Ÿ“Š Evaluation & Testing - -#### [**Braintrust Integration**](./examples/integrations/braintrust/README.md) -Integrate Browserbase with Braintrust for evaluation and testing of AI agent performance in web environments. Monitor, measure, and improve your browser automation workflows. 
+### ๐Ÿ“Š Data Storage, Searching and Analysis #### [**MongoDB Integration**](./examples/integrations/mongodb/README.md) -**Intelligent Web Scraping & Data Storage** - Extract structured data from e-commerce websites using Stagehand and store it in MongoDB for analysis. Perfect for building data pipelines, market research, and competitive analysis workflows. +**Intelligent Web Scraping & Data Storage** - Extract semi-structured data from e-commerce websites using Stagehand and store it in MongoDB for analysis. Perfect for building data pipelines, market research, and competitive analysis workflows. **Capabilities:** +- Document-based model and advanced features like Vector Search and Real-Time Stream Processing make it the perfect foundation for advanced search and data pipelines - AI-powered web scraping with Stagehand - Structured data extraction with schema validation - MongoDB storage for persistence and querying - Built-in data analysis and reporting - Robust error handling for production use +### ๐Ÿ“Š Evaluation & Testing + +#### [**Braintrust Integration**](./examples/integrations/braintrust/README.md) +Integrate Browserbase with Braintrust for evaluation and testing of AI agent performance in web environments. Monitor, measure, and improve your browser automation workflows. + ## ๐Ÿ—๏ธ Monorepo Structure ``` diff --git a/examples/integrations/mongodb/python/main.py b/examples/integrations/mongodb/python/main.py index 6a406e6..249c99d 100644 --- a/examples/integrations/mongodb/python/main.py +++ b/examples/integrations/mongodb/python/main.py @@ -1,5 +1,6 @@ import os import asyncio +import logging from datetime import datetime from typing import List, Dict, Any, Optional @@ -48,6 +49,22 @@ class ProductList(BaseModel): page: Optional[int] = None website_name: Optional[str] = None +# Schema for extraction (without date_scraped since that's added later) +class ProductExtraction(BaseModel): + """Schema for extracting product data from pages""" + name: str + price: str + url: str + rating: Optional[float] = None + reviewCount: Optional[int] = None + +class ProductListExtraction(BaseModel): + """Schema for extracting product list data from category pages""" + products: List[ProductExtraction] + category: str + totalProducts: Optional[int] = None + + # ========== MongoDB Connection and Operations ========== class MongoDBManager: """Handles MongoDB connections and operations""" @@ -134,16 +151,34 @@ async def store_data(self, collection_name: str, data): console.print(f"โš ๏ธ No data to store in {collection_name} (empty list)", style="yellow") return - # Convert Pydantic models to dict - documents = [item.dict() if hasattr(item, 'dict') else item for item in data] - result = collection.insert_many(documents) - console.print(f"โœ… Stored {len(result.inserted_ids)} documents in {collection_name}", style="green") + # Convert Pydantic models to dict (using model_dump for Pydantic v2) + documents = [item.model_dump() if hasattr(item, 'model_dump') else item for item in data] + + # Handle duplicate key errors gracefully + try: + result = collection.insert_many(documents, ordered=False) + console.print(f"โœ… Stored {len(result.inserted_ids)} documents in {collection_name}", style="green") + except DuplicateKeyError as e: + # Count successful inserts + inserted = len(documents) - len(e.details.get('writeErrors', [])) + if inserted > 0: + console.print(f"โœ… Stored {inserted} new documents in {collection_name} (skipped {len(e.details.get('writeErrors', []))} duplicates)", style="green") + else: + 
console.print(f"โš ๏ธ All {len(documents)} documents already exist in {collection_name}", style="yellow") else: - # Convert Pydantic model to dict - document = data.dict() if hasattr(data, 'dict') else data - result = collection.insert_one(document) - console.print(f"โœ… Stored document in {collection_name}", style="green") + # Convert Pydantic model to dict (using model_dump for Pydantic v2) + document = data.model_dump() if hasattr(data, 'model_dump') else data + # Handle duplicate key errors gracefully + try: + result = collection.insert_one(document) + console.print(f"โœ… Stored document in {collection_name}", style="green") + except DuplicateKeyError: + console.print(f"โš ๏ธ Document already exists in {collection_name} (skipped duplicate)", style="yellow") + + except DuplicateKeyError: + # Already handled above + pass except Exception as e: console.print(f"โŒ Error storing data in {collection_name}: {e}", style="red") raise @@ -227,59 +262,94 @@ async def scrape_product_list(self, category_url: str) -> ProductList: console.print("๐Ÿ” Extracting product data with AI...", style="blue") try: - extraction_result = await self.page.extract({ - "instruction": "Look at this Amazon search page and find product listings. Extract the products with their names, prices, and any star ratings you can find.", - "schema": { - "type": "object", - "properties": { - "products": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name": {"type": "string"}, - "price": {"type": "string"}, - "url": {"type": ["string", "null"]}, - "rating": {"type": ["number", "null"]}, - "review_count": {"type": ["number", "null"]} - }, - "required": ["name", "price"] - } - }, - "category": {"type": ["string", "null"]}, - "total_products": {"type": ["number", "null"]} - }, - "required": ["products"] - } - }) - - console.print(f"๐Ÿ” Raw extraction result type: {type(extraction_result)}", style="blue") + # Use Pydantic BaseModel schema as per documentation + extraction_result = await self.page.extract( + "Extract all product information from this Amazon category page, including product names, prices, URLs, ratings", + schema=ProductListExtraction + ) - # Handle different result formats - if isinstance(extraction_result, dict) and 'products' in extraction_result: - console.print(f"๐Ÿ” Extraction result: {len(extraction_result.get('products', []))} products found", style="blue") + # Handle the result - should be a ProductListExtraction object directly + if isinstance(extraction_result, ProductListExtraction): + extraction_data = extraction_result + console.print(f"โœ… Extraction successful: {len(extraction_result.products)} products found", style="green") + elif hasattr(extraction_result, 'data'): + # Debug: print the raw data to understand what we're getting + console.print(f"๐Ÿ” DEBUG: Raw data type: {type(extraction_result.data)}", style="cyan") + console.print(f"๐Ÿ” DEBUG: Raw data (first 300 chars): {str(extraction_result.data)[:300]}...", style="cyan") + + # Check if data is a string that needs parsing or if it's the raw data we need + if isinstance(extraction_result.data, str): + try: + import json + parsed_data = json.loads(extraction_result.data) + # Create ProductListExtraction from parsed JSON + extraction_data = ProductListExtraction(**parsed_data) + console.print(f"โœ… Extraction successful (parsed JSON): {len(extraction_data.products)} products found", style="green") + except (json.JSONDecodeError, Exception) as e: + console.print(f"โš ๏ธ Failed to parse JSON extraction data: {e}", 
style="yellow") + extraction_data = ProductListExtraction(products=[], category="Unknown") + elif isinstance(extraction_result.data, ProductListExtraction): + extraction_data = extraction_result.data + console.print(f"โœ… Extraction successful: {len(extraction_result.data.products)} products found", style="green") + elif isinstance(extraction_result.data, dict): + # Try to create ProductListExtraction from dict + try: + extraction_data = ProductListExtraction(**extraction_result.data) + console.print(f"โœ… Extraction successful (from dict): {len(extraction_data.products)} products found", style="green") + except Exception as e: + console.print(f"โš ๏ธ Failed to create ProductListExtraction from dict: {e}", style="yellow") + extraction_data = ProductListExtraction(products=[], category="Unknown") + else: + console.print(f"โš ๏ธ Unexpected data type: {type(extraction_result.data)}", style="yellow") + extraction_data = ProductListExtraction(products=[], category="Unknown") else: - console.print(f"โš ๏ธ Unexpected extraction result format: {type(extraction_result)}", style="yellow") - extraction_result = {"products": [], "category": "Unknown"} + console.print("โš ๏ธ Extraction completed but no products found", style="yellow") + extraction_data = ProductListExtraction(products=[], category="Unknown") except Exception as e: console.print(f"โš ๏ธ AI extraction failed: {str(e)[:100]}...", style="yellow") - extraction_result = {"products": [], "category": "Unknown"} + extraction_data = ProductListExtraction(products=[], category="Unknown") # Process the extracted data current_time = datetime.now() + timestamp = int(current_time.timestamp()) products = [] - for product_data in extraction_result.get('products', []): + # Handle both ProductListExtraction object and dict formats + if isinstance(extraction_data, ProductListExtraction): + products_list = extraction_data.products + category = extraction_data.category + total_products = extraction_data.totalProducts + else: + products_list = extraction_data.get('products', []) + category = extraction_data.get('category', 'Unknown') + total_products = extraction_data.get('totalProducts') + + for i, product_data in enumerate(products_list): try: - product = Product( - url=product_data.get('url', category_url), # Fallback to category URL if no product URL - date_scraped=current_time, - name=product_data['name'], - price=product_data['price'], - rating=product_data.get('rating'), - review_count=product_data.get('review_count') - ) + if isinstance(product_data, ProductExtraction): + # If it's already a ProductExtraction object, add timestamp to URL + unique_url = f"{product_data.url}?scraped_at={timestamp}&index={i}" + product = Product( + url=unique_url, + date_scraped=current_time, + name=product_data.name, + price=product_data.price, + rating=product_data.rating, + review_count=product_data.reviewCount + ) + else: + # If it's a dictionary, create unique URL with timestamp + base_url = product_data.get('url', category_url) + unique_url = f"{base_url}?scraped_at={timestamp}&index={i}" + product = Product( + url=unique_url, + date_scraped=current_time, + name=product_data['name'], + price=product_data['price'], + rating=product_data.get('rating'), + review_count=product_data.get('reviewCount') + ) products.append(product) console.print(f"โœ… Processed: {product.name[:50]}...", style="green") except Exception as e: @@ -289,9 +359,9 @@ async def scrape_product_list(self, category_url: str) -> ProductList: # Create the product list object product_list = 
ProductList( products=products, - category=extraction_result.get('category', 'Unknown'), + category=category, date_scraped=current_time, - total_products=len(products), + total_products=total_products or len(products), website_name="Amazon" ) @@ -312,9 +382,10 @@ async def scrape_product_list(self, category_url: str) -> ProductList: {"name": "Portable Laptop Lite", "price": "$699.99", "rating": 4.2} ] + # Use current timestamp for unique URLs for i, sample in enumerate(sample_products[:3]): # Create 3 sample products product = Product( - url=f"{category_url}&sample_product={i+1}", + url=f"{category_url}&sample_product={i+1}&ts={timestamp}", date_scraped=current_time, name=sample["name"], price=sample["price"], @@ -331,70 +402,6 @@ async def scrape_product_list(self, category_url: str) -> ProductList: console.print(f"โœ… Scraped {len(products)} products from category: {product_list.category}", style="green") return product_list - async def scrape_product_details(self, product_url: str) -> Product: - """Scrape detailed information for a single product""" - console.print(f"๐Ÿ“Š Scraping product details from: {product_url}", style="blue") - - await self.page.goto(product_url) - await self.page.wait_for_timeout(2000) - - # Scroll down to load more content - await self.page.evaluate(""" - () => { - window.scrollTo(0, document.body.scrollHeight / 3); - } - """) - await self.page.wait_for_timeout(1000) - - await self.page.evaluate(""" - () => { - window.scrollTo(0, document.body.scrollHeight * 2 / 3); - } - """) - await self.page.wait_for_timeout(1000) - - # Extract product details using Stagehand - extraction_result = await self.page.extract({ - "instruction": "Extract detailed product information from this Amazon product page, including name, price, description, specifications, brand, category, image URL, rating, review count, and availability", - "schema": { - "type": "object", - "properties": { - "name": {"type": "string"}, - "price": {"type": "string"}, - "rating": {"type": "number"}, - "category": {"type": "string"}, - "id": {"type": "string"}, - "currency": {"type": "string"}, - "image_url": {"type": "string"}, - "review_count": {"type": "number"}, - "description": {"type": "string"}, - "specs": {"type": "object"} - }, - "required": ["name", "price"] - } - }) - - # Create complete product object - product = Product( - url=product_url, - date_scraped=datetime.now(), - name=extraction_result['name'], - price=extraction_result['price'], - rating=extraction_result.get('rating'), - category=extraction_result.get('category'), - id=extraction_result.get('id'), - currency=extraction_result.get('currency'), - image_url=extraction_result.get('image_url'), - review_count=extraction_result.get('review_count'), - description=extraction_result.get('description'), - specs=extraction_result.get('specs') - ) - - # Store the data in MongoDB - await self.mongodb.store_data(self.mongodb.COLLECTIONS['PRODUCTS'], product) - - console.print(f"โœ… Scraped detailed information for: {product.name}", style="green") - return product # ========== Data Analysis Functions ========== class DataAnalyzer: @@ -497,12 +504,13 @@ async def main(): mongodb = MongoDBManager(MONGO_URI, DB_NAME) await mongodb.connect() - # Initialize Stagehand + # Initialize Stagehand with proper config overrides stagehand = Stagehand( - env="BROWSERBASE", # or "BROWSERBASE" + env="BROWSERBASE", model_name=AvailableModel.CLAUDE_3_7_SONNET_LATEST, model_api_key=os.getenv("MODEL_API_KEY"), - verbose=1 + verbose=1, + dom_settle_timeout_ms=30000 ) 
await stagehand.init() @@ -515,20 +523,6 @@ async def main(): # Scrape product listing product_list = await scraper.scrape_product_list(category_url) - # Scrape detailed information for first 3 products (if any were found) - if product_list.products: - products_to_scrape = product_list.products[:3] - - for i, product in enumerate(products_to_scrape): - console.print(f"๐Ÿ“Š Scraping details for product {i+1}/{len(products_to_scrape)}: {product.name}", style="blue") - - try: - await scraper.scrape_product_details(product.url) - await asyncio.sleep(2) # Rate limiting - except Exception as e: - console.print(f"โŒ Error scraping product {product.name}: {e}", style="red") - else: - console.print("โš ๏ธ No products found to scrape details for", style="yellow") # Run data analysis analyzer = DataAnalyzer(mongodb) @@ -548,12 +542,5 @@ async def main(): # ========== Entry Point ========== if __name__ == "__main__": - console.print(Panel.fit( - "๐Ÿค˜ Welcome to Stagehand MongoDB Scraper!\n\n" - "This script will scrape Amazon product data and store it in MongoDB.", - title="Stagehand MongoDB Integration", - border_style="blue" - )) - # Run the main function asyncio.run(main()) \ No newline at end of file