From f3686091f34a3f02259c0f812c99be1eab3dfd21 Mon Sep 17 00:00:00 2001 From: Kylejeong2 Date: Mon, 28 Jul 2025 16:50:29 -0700 Subject: [PATCH 1/3] add python version of integration to mongodb --- examples/integrations/mongodb/README.md | 246 +++++--- .../integrations/mongodb/python/.gitignore | 2 + .../integrations/mongodb/python/README.md | 226 +++++++ .../integrations/mongodb/python/env.example | 10 + examples/integrations/mongodb/python/main.py | 559 ++++++++++++++++++ .../mongodb/python/requirements.txt | 6 + .../mongodb/{ => typescript}/.cursorrules | 0 .../mongodb/{ => typescript}/.env.example | 0 .../mongodb/{ => typescript}/.gitignore | 0 .../mongodb/{ => typescript}/LICENSE | 0 .../integrations/mongodb/typescript/README.md | 99 ++++ .../mongodb/{ => typescript}/index.ts | 5 - .../{ => typescript}/package-lock.json | 0 .../mongodb/{ => typescript}/package.json | 0 .../{ => typescript}/stagehand.config.ts | 0 .../mongodb/{ => typescript}/tsconfig.json | 0 .../mongodb/{ => typescript}/utils.ts | 0 17 files changed, 1077 insertions(+), 76 deletions(-) create mode 100644 examples/integrations/mongodb/python/.gitignore create mode 100644 examples/integrations/mongodb/python/README.md create mode 100644 examples/integrations/mongodb/python/env.example create mode 100644 examples/integrations/mongodb/python/main.py create mode 100644 examples/integrations/mongodb/python/requirements.txt rename examples/integrations/mongodb/{ => typescript}/.cursorrules (100%) rename examples/integrations/mongodb/{ => typescript}/.env.example (100%) rename examples/integrations/mongodb/{ => typescript}/.gitignore (100%) rename examples/integrations/mongodb/{ => typescript}/LICENSE (100%) create mode 100644 examples/integrations/mongodb/typescript/README.md rename examples/integrations/mongodb/{ => typescript}/index.ts (98%) rename examples/integrations/mongodb/{ => typescript}/package-lock.json (100%) rename examples/integrations/mongodb/{ => typescript}/package.json (100%) rename examples/integrations/mongodb/{ => typescript}/stagehand.config.ts (100%) rename examples/integrations/mongodb/{ => typescript}/tsconfig.json (100%) rename examples/integrations/mongodb/{ => typescript}/utils.ts (100%) diff --git a/examples/integrations/mongodb/README.md b/examples/integrations/mongodb/README.md index df318f8..124f893 100644 --- a/examples/integrations/mongodb/README.md +++ b/examples/integrations/mongodb/README.md @@ -1,99 +1,203 @@ -# Stagehand MongoDB Scraper +# Browserbase + Stagehand MongoDB Integration -A web scraping project that uses Stagehand to extract structured data from e-commerce websites and store it in MongoDB for analysis. +A comprehensive web scraping integration that uses Stagehand to extract structured data from e-commerce websites and store it in MongoDB for analysis. Available in both **Python** and **TypeScript**. -## Features +## ๐Ÿš€ Choose Your Language -- **Web Scraping**: Uses Stagehand (built on Playwright) for intelligent web scraping -- **Data Extraction**: Extracts structured product data using AI-powered instructions -- **MongoDB Storage**: Stores scraped data in MongoDB for persistence and querying -- **Schema Validation**: Uses Zod for schema validation and TypeScript interfaces -- **Error Handling**: Robust error handling to prevent crashes during scraping -- **Data Analysis**: Built-in MongoDB queries for data analysis + + + + + +
-## Prerequisites +### ๐Ÿ **Python Version** +**`๐Ÿ“ python/`** -- Node.js 16 or higher +Perfect for data scientists and Python developers who want: +- **Rich terminal output** with beautiful tables and progress indicators +- **Pydantic models** for robust data validation +- **Async/await** support for high-performance scraping +- **pymongo** for MongoDB operations +- Simple single-file architecture + +**[โ†’ Get Started with Python](python/README.md)** + +```bash +cd python/ +pip install -r requirements.txt +python main.py +``` + + + +### ๐Ÿ“˜ **TypeScript Version** +**`๐Ÿ“ typescript/`** + +Ideal for JavaScript/Node.js developers who prefer: +- **Type safety** with full TypeScript support +- **Zod schemas** for runtime validation +- **Modern ES modules** and clean architecture +- **MongoDB native driver** with full typing +- Modular, well-structured codebase + +**[โ†’ Get Started with TypeScript](typescript/README.md)** + +```bash +cd typescript/ +npm install +npm start +``` + +
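+Whichever version you run, the scraped products land in the same `products` collection, so you can sanity-check a run with a few lines of `pymongo` (a minimal sketch, assuming the default `MONGO_URI` and `DB_NAME` from the env examples):
+
+```python
+# Quick post-run check: count stored products and list a few highly rated ones.
+# Assumes the defaults from the env examples (local MongoDB, database "scraper_db").
+from pymongo import MongoClient
+
+db = MongoClient("mongodb://localhost:27017")["scraper_db"]
+print("products stored:", db.products.count_documents({}))
+for doc in db.products.find({"rating": {"$gte": 4}}).limit(5):
+    print(doc["name"], doc["price"], doc.get("rating"))
+```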
+ +## ๐ŸŒŸ Features (Both Versions) + +- **๐ŸŒ Intelligent Web Scraping**: Uses Stagehand's AI-powered extraction +- **๐Ÿ—„๏ธ MongoDB Storage**: Persistent data storage with proper indexing +- **๐Ÿ“Š Data Analysis**: Built-in queries and reporting +- **๐Ÿ›ก๏ธ Error Handling**: Robust error handling and recovery +- **โšก Performance**: Optimized for speed and reliability +- **๐Ÿ” Schema Validation**: Type-safe data models + +## ๐Ÿ“‹ What It Does + +Both versions perform the same core functionality: + +1. **๐Ÿ”Œ Connect** to MongoDB and set up collections with proper indexes +2. **๐Ÿ“Š Scrape** Amazon product listings using Stagehand's AI extraction +3. **๐Ÿ” Extract** detailed product information including: + - Product names, prices, ratings + - Categories, descriptions, specifications + - Review counts and availability +4. **๐Ÿ’พ Store** all data in MongoDB with validated schemas +5. **๐Ÿ“ˆ Analyze** the data with built-in reporting: + - Collection statistics + - Products by category + - Top-rated products + +## ๐Ÿ› ๏ธ Prerequisites + +**For Both Versions:** - MongoDB installed locally or MongoDB Atlas account - Stagehand API key -## Installation +**Python Version:** +- Python 3.8+ + +**TypeScript Version:** +- Node.js 16+ +- npm or pnpm + +## ๐Ÿšฆ Quick Start + +### Python Quick Start +```bash +# Navigate to Python version +cd examples/integrations/mongodb/python + +# Install dependencies +pip install -r requirements.txt + +# Set up environment +cp env.example .env +# Edit .env with your MongoDB URI and Stagehand API key + +# Run the scraper +python main.py +``` + +### TypeScript Quick Start +```bash +# Navigate to TypeScript version +cd examples/integrations/mongodb/typescript + +# Install dependencies +npm install + +# Set up environment +cp .env.example .env +# Edit .env with your MongoDB URI and Stagehand API key + +# Run the scraper +npm start +``` + +## ๐Ÿ“Š Sample Output + +Both versions provide rich, colorful output showing the scraping progress: + +``` +๐Ÿค˜ Welcome to Stagehand MongoDB Scraper! + +๐Ÿ”Œ Connecting to MongoDB... +โœ… Connected to MongoDB +โš™๏ธ Creating indexes... +โœ… Index creation completed + +๐Ÿ“Š Starting to scrape product listing... +โœ… Scraped 16 products from category: Laptops -1. Clone the repository: - ``` - git clone - cd stagehand-mongodb-scraper - ``` +๐Ÿ“Š Scraping details for product 1/3: MacBook Pro M3 +โœ… Scraped detailed information for: MacBook Pro M3 -2. Install dependencies: - ``` - npm install - ``` +๐Ÿ“Š Running Data Analysis +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ Collection โ”‚ Count โ”‚ +โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค +โ”‚ PRODUCTS โ”‚ 19 โ”‚ +โ”‚ PRODUCT_LISTS โ”‚ 1 โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ -3. Set up environment variables: - ``` - # Create a .env file with the following variables - MONGO_URI=mongodb://localhost:27017 - DB_NAME=scraper_db - ``` +๐ŸŽ‰ Scraping completed successfully! +``` -## Usage +## ๐Ÿ—๏ธ Architecture -1. Start MongoDB locally: - ``` - mongod - ``` +Both versions follow the same architectural patterns: -2. Run the scraper: - ``` - npm start - ``` +- **MongoDB Manager**: Handles database connections, indexing, and operations +- **Product Scraper**: Manages web scraping using Stagehand +- **Data Models**: Structured schemas for products and product lists +- **Data Analyzer**: Provides insights and reporting on collected data -3. 
The script will: - - Scrape product listings from Amazon - - Extract detailed information for the first 3 products - - Extract reviews for each product - - Store all data in MongoDB - - Run analysis queries on the collected data showing: - - Collection counts - - Products by category - - Top-rated products +## ๐Ÿ”ง Configuration -## Project Structure +Both versions support: +- **Browserbase** cloud browsers for scalability +- **Environment-based** configuration +- **Flexible MongoDB** connection options -The project has a simple structure with a single file containing all functionality: +## ๐Ÿ“š Documentation -- `index.ts`: Contains the complete implementation including: - - MongoDB connection and data operations - - Schema definitions - - Scraping functions - - Data analysis - - Main execution logic -- `stagehand.config.js`: Stagehand configuration -- `.env.example`: Example environment variables +- **[Python Version Documentation](python/README.md)** - Detailed Python setup and usage +- **[TypeScript Version Documentation](typescript/README.md)** - Complete TypeScript guide +- **[Stagehand Documentation](https://docs.stagehand.dev/)** - Learn more about Stagehand +- **[MongoDB Documentation](https://docs.mongodb.com/)** - MongoDB setup and operations -## Data Models +## ๐Ÿค Contributing -The project uses the following data models: +Both versions are actively maintained and welcome contributions: +- Bug reports and feature requests +- Code improvements and optimizations +- Documentation enhancements +- Additional data analysis features -- **Product**: Individual product information -- **ProductList**: List of products from a category page -- **Review**: Product reviews +## ๐Ÿ“„ License -## MongoDB Collections +MIT License - feel free to use in your projects! -Data is stored in the following MongoDB collections: +## ๐Ÿ™ Acknowledgements -- **products**: Individual product information -- **product_lists**: Lists of products from category pages -- **reviews**: Product reviews +- **[Stagehand](https://docs.stagehand.dev/)** - AI-powered web scraping +- **[MongoDB](https://www.mongodb.com/)** - Flexible document database +- **[Pydantic](https://pydantic.dev/)** (Python) - Data validation +- **[Zod](https://zod.dev/)** (TypeScript) - Schema validation -## License +--- -MIT +## ๐Ÿค˜ Ready to Start? -## Acknowledgements +Choose your preferred language and dive in: -- [Stagehand](https://docs.stagehand.dev/) for the powerful web scraping capabilities -- [MongoDB](https://www.mongodb.com/) for the flexible document database -- [Zod](https://zod.dev/) for runtime schema validation +**๐Ÿ [Python Version โ†’](python/README.md)** | **๐Ÿ“˜ [TypeScript Version โ†’](typescript/README.md)** diff --git a/examples/integrations/mongodb/python/.gitignore b/examples/integrations/mongodb/python/.gitignore new file mode 100644 index 0000000..df099fc --- /dev/null +++ b/examples/integrations/mongodb/python/.gitignore @@ -0,0 +1,2 @@ +.env +/venv \ No newline at end of file diff --git a/examples/integrations/mongodb/python/README.md b/examples/integrations/mongodb/python/README.md new file mode 100644 index 0000000..dfa0d95 --- /dev/null +++ b/examples/integrations/mongodb/python/README.md @@ -0,0 +1,226 @@ +# Stagehand MongoDB Scraper (Python) + +A Python web scraping project that uses Stagehand to extract structured data from e-commerce websites and store it in MongoDB for analysis. 
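+
+## How It Works at a Glance
+
+The condensed sketch below mirrors the flow in `main.py` (connect to MongoDB, run the Stagehand-powered scraper, then report on what was stored). It is illustrative rather than a second entry point; the class names and constants are imported from `main.py` itself:
+
+```python
+# Condensed sketch of main.py's control flow, assuming the variables from
+# env.example are set. All imported names are defined in main.py.
+import os
+import asyncio
+
+from stagehand import Stagehand
+from stagehand.schemas import AvailableModel
+
+from main import MONGO_URI, DB_NAME, MongoDBManager, ProductScraper, DataAnalyzer
+
+async def run():
+    mongodb = MongoDBManager(MONGO_URI, DB_NAME)
+    await mongodb.connect()  # connects and creates indexes
+
+    stagehand = Stagehand(
+        env="BROWSERBASE",
+        model_name=AvailableModel.CLAUDE_3_7_SONNET_LATEST,
+        model_api_key=os.getenv("MODEL_API_KEY"),
+        verbose=1,
+    )
+    await stagehand.init()
+
+    try:
+        scraper = ProductScraper(stagehand, mongodb)
+        await scraper.scrape_product_list("https://www.amazon.com/s?k=laptops")
+        await DataAnalyzer(mongodb).run_analysis()
+    finally:
+        await stagehand.close()
+        mongodb.close()
+
+asyncio.run(run())
+```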
+
+## Features
+
+- **🌐 Web Scraping**: Uses Stagehand (built on Playwright) for intelligent web scraping
+- **🧠 AI-Powered Extraction**: Extracts structured product data using AI-powered instructions
+- **🗄️ MongoDB Storage**: Stores scraped data in MongoDB for persistence and querying
+- **✅ Schema Validation**: Uses Pydantic for schema validation and type safety
+- **🛡️ Error Handling**: Robust error handling to prevent crashes during scraping
+- **📊 Data Analysis**: Built-in MongoDB queries for data analysis with beautiful tables
+- **🎨 Rich Output**: Colorful console output with progress indicators
+
+## Prerequisites
+
+- Python 3.8 or higher
+- MongoDB installed locally or MongoDB Atlas account
+- An LLM API key for Stagehand, set as `MODEL_API_KEY` (the example uses an Anthropic Claude model)
+
+## Installation
+
+1. Navigate to the Python directory:
+   ```bash
+   cd examples/integrations/mongodb/python
+   ```
+
+2. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Set up environment variables:
+   ```bash
+   # Copy the example environment file
+   cp env.example .env
+
+   # Edit .env with your actual values
+   # MONGO_URI=mongodb://localhost:27017
+   # DB_NAME=scraper_db
+   # MODEL_API_KEY=your_model_api_key_here
+   ```
+
+## Usage
+
+1. Start MongoDB locally:
+   ```bash
+   mongod
+   ```
+
+2. Run the scraper:
+   ```bash
+   python main.py
+   ```
+
+3. The script will:
+   - 🔌 Connect to MongoDB and create necessary indexes
+   - 📊 Scrape product listings from the Amazon laptops category
+   - 🔍 Extract detailed information for the first 3 products
+   - 💾 Store all data in MongoDB with proper schemas
+   - 📈 Run analysis queries showing:
+     - Collection document counts
+     - Products grouped by category
+     - Top-rated products (4+ stars)
+
+## Project Structure
+
+```
+python/
+├── main.py              # Main application with all functionality
+├── requirements.txt     # Python dependencies
+├── env.example          # Example environment variables
+└── README.md            # This file
+```
+
+## Data Models
+
+The project uses Pydantic models for data validation:
+
+### Product Model
+```python
+class Product(BaseModel):
+    url: str
+    date_scraped: datetime
+    name: str
+    price: str
+    rating: Optional[float] = None
+    category: Optional[str] = None
+    id: Optional[str] = None
+    currency: Optional[str] = None
+    image_url: Optional[str] = None
+    review_count: Optional[int] = None
+    description: Optional[str] = None
+    specs: Optional[Dict[str, Any]] = None
+```
+
+### ProductList Model
+```python
+class ProductList(BaseModel):
+    products: List[Product]
+    category: Optional[str] = None
+    date_scraped: datetime
+    total_products: Optional[int] = None
+    page: Optional[int] = None
+    website_name: Optional[str] = None
+```
+
+## MongoDB Collections
+
+Data is stored in the following MongoDB collections:
+
+- **`products`**: Individual product information with indexes on:
+  - `rating` (ascending)
+  - `category` (ascending)
+  - `url` (ascending, unique)
+  - `date_scraped` (descending)
+
+- **`product_lists`**: Lists of products from category pages with indexes on:
+  - `category` (ascending)
+  - `date_scraped` (descending)
+
+## Configuration
+
+The application supports both Browserbase and local environments. The snippets below mirror how `main.py` constructs Stagehand:
+
+```python
+# Browserbase cloud browsers (the default in main.py; also requires
+# BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID, see env.example)
+stagehand = Stagehand(
+    env="BROWSERBASE",
+    model_name=AvailableModel.CLAUDE_3_7_SONNET_LATEST,
+    model_api_key=os.getenv("MODEL_API_KEY"),
+    verbose=1
+)
+
+# Local browser
+stagehand = Stagehand(
+    env="LOCAL",
+    model_name=AvailableModel.CLAUDE_3_7_SONNET_LATEST,
+    model_api_key=os.getenv("MODEL_API_KEY"),
+    verbose=1
+)
+```
+
+## Key Classes
+
+### MongoDBManager
+Handles all MongoDB
operations including: +- Connection management +- Index creation +- Data storage and retrieval +- Aggregation queries + +### ProductScraper +Handles web scraping using Stagehand: +- Product list scraping from category pages +- Detailed product information extraction +- Rate limiting and error handling + +### DataAnalyzer +Provides data analysis and reporting: +- Collection statistics +- Category-based analysis +- Top-rated product reports + +## Error Handling + +The application includes comprehensive error handling: +- MongoDB connection errors +- Web scraping failures +- Data validation errors +- Graceful cleanup on exit + +## Example Output + +``` +๐Ÿค˜ Welcome to Stagehand MongoDB Scraper! + +๐Ÿ”Œ Connecting to MongoDB... +โœ… Connected to MongoDB +โš™๏ธ Creating indexes... +โœ… Created index rating_idx on products +โœ… Index creation completed + +๐Ÿ“Š Starting to scrape product listing from: https://www.amazon.com/s?k=laptops +โœ… Scraped 16 products from category: Laptops + +๐Ÿ“Š Running Data Analysis +โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”ณโ”โ”โ”โ”โ”โ”โ”โ”“ +โ”ƒ Collection โ”ƒ Count โ”ƒ +โ”กโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ•‡โ”โ”โ”โ”โ”โ”โ”โ”ฉ +โ”‚ PRODUCTS โ”‚ 19 โ”‚ +โ”‚ PRODUCT_LISTS โ”‚ 1 โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + +๐ŸŽ‰ Scraping and MongoDB operations completed successfully! +``` + +## Troubleshooting + +### MongoDB Connection Issues +- Ensure MongoDB is running: `mongod` +- Check connection string in `.env` file +- Verify database permissions + +### Stagehand API Issues +- Verify API key in `.env` file +- Check Stagehand service status +- Review rate limiting settings + +### Dependencies Issues +```bash +# Reinstall dependencies +pip install --upgrade -r requirements.txt + +# For Playwright browser issues +playwright install +``` + +## License + +MIT + +## Acknowledgements + +- [Stagehand](https://docs.stagehand.dev/) - Powerful web scraping with AI +- [MongoDB](https://www.mongodb.com/) - Flexible document database +- [Pydantic](https://pydantic.dev/) - Data validation using Python type hints +- [Rich](https://rich.readthedocs.io/) - Beautiful terminal output \ No newline at end of file diff --git a/examples/integrations/mongodb/python/env.example b/examples/integrations/mongodb/python/env.example new file mode 100644 index 0000000..c74b767 --- /dev/null +++ b/examples/integrations/mongodb/python/env.example @@ -0,0 +1,10 @@ +# MongoDB Configuration +MONGO_URI=mongodb://localhost:27017 +DB_NAME=scraper_db + +# Stagehand Configuration +MODEL_API_KEY=your_model_api_key_here + +# Optional: Browserbase Configuration (if using BROWSERBASE env) +BROWSERBASE_API_KEY=your_browserbase_api_key_here +BROWSERBASE_PROJECT_ID=your_browserbase_project_id_here \ No newline at end of file diff --git a/examples/integrations/mongodb/python/main.py b/examples/integrations/mongodb/python/main.py new file mode 100644 index 0000000..075f523 --- /dev/null +++ b/examples/integrations/mongodb/python/main.py @@ -0,0 +1,559 @@ +import os +import asyncio +from datetime import datetime +from typing import List, Dict, Any, Optional + +from pydantic import BaseModel +from pymongo import MongoClient, IndexModel, ASCENDING, DESCENDING +from pymongo.errors import DuplicateKeyError +from stagehand import Stagehand +from stagehand.schemas import AvailableModel +from rich.console import Console +from rich.panel import Panel +from rich.table import Table +from dotenv import load_dotenv + +# Load environment variables 
+load_dotenv() + +# Initialize rich console for better output +console = Console() + +# ========== MongoDB Configuration ========== +MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017') +DB_NAME = os.getenv('DB_NAME', 'scraper_db') + +# ========== Pydantic Models (Schema Definitions) ========== +class Product(BaseModel): + """Product model for e-commerce websites""" + url: str + date_scraped: datetime + name: str + price: str + rating: Optional[float] = None + category: Optional[str] = None + id: Optional[str] = None + currency: Optional[str] = None + image_url: Optional[str] = None + review_count: Optional[int] = None + description: Optional[str] = None + specs: Optional[Dict[str, Any]] = None + +class ProductList(BaseModel): + """Product list model for results from category pages""" + products: List[Product] + category: Optional[str] = None + date_scraped: datetime + total_products: Optional[int] = None + page: Optional[int] = None + website_name: Optional[str] = None + +# ========== MongoDB Connection and Operations ========== +class MongoDBManager: + """Handles MongoDB connections and operations""" + + def __init__(self, uri: str, db_name: str): + self.uri = uri + self.db_name = db_name + self.client = None + self.db = None + + # Collection names + self.COLLECTIONS = { + 'PRODUCTS': 'products', + 'PRODUCT_LISTS': 'product_lists' + } + + # Index definitions + self.INDEXES = { + self.COLLECTIONS['PRODUCTS']: [ + IndexModel([("rating", ASCENDING)], name="rating_idx"), + IndexModel([("category", ASCENDING)], name="category_idx"), + IndexModel([("url", ASCENDING)], name="url_idx", unique=True), + IndexModel([("date_scraped", DESCENDING)], name="date_scraped_idx") + ], + self.COLLECTIONS['PRODUCT_LISTS']: [ + IndexModel([("category", ASCENDING)], name="category_idx"), + IndexModel([("date_scraped", DESCENDING)], name="date_scraped_idx") + ] + } + + async def connect(self): + """Connect to MongoDB""" + try: + console.print("๐Ÿ”Œ Connecting to MongoDB...", style="blue") + self.client = MongoClient(self.uri) + + # Test connection + self.client.admin.command('ismaster') + self.db = self.client[self.db_name] + + console.print("โœ… Connected to MongoDB", style="green") + + # Create indexes + await self._create_indexes() + + except Exception as e: + console.print(f"โŒ Error connecting to MongoDB: {e}", style="red") + raise + + async def _create_indexes(self): + """Create indexes for all collections""" + console.print("โš™๏ธ Creating indexes...", style="blue") + + for collection_name, indexes in self.INDEXES.items(): + try: + collection = self.db[collection_name] + + # Create indexes + for index in indexes: + try: + collection.create_index( + index.document['key'], + name=index.document.get('name'), + unique=index.document.get('unique', False), + background=True + ) + console.print(f"โœ… Created index {index.document.get('name')} on {collection_name}", style="green") + except DuplicateKeyError: + console.print(f"โš ๏ธ Index {index.document.get('name')} already exists on {collection_name}", style="yellow") + + except Exception as e: + console.print(f"โŒ Error creating indexes for {collection_name}: {e}", style="red") + + console.print("โœ… Index creation completed", style="green") + + async def store_data(self, collection_name: str, data): + """Store data in MongoDB collection""" + try: + collection = self.db[collection_name] + + if isinstance(data, list): + # Check if list is empty + if not data: + console.print(f"โš ๏ธ No data to store in {collection_name} (empty list)", 
style="yellow") + return + + # Convert Pydantic models to dict + documents = [item.dict() if hasattr(item, 'dict') else item for item in data] + result = collection.insert_many(documents) + console.print(f"โœ… Stored {len(result.inserted_ids)} documents in {collection_name}", style="green") + else: + # Convert Pydantic model to dict + document = data.dict() if hasattr(data, 'dict') else data + result = collection.insert_one(document) + console.print(f"โœ… Stored document in {collection_name}", style="green") + + except Exception as e: + console.print(f"โŒ Error storing data in {collection_name}: {e}", style="red") + raise + + async def find_data(self, collection_name: str, query: Dict = None): + """Find documents in MongoDB collection""" + try: + collection = self.db[collection_name] + query = query or {} + documents = list(collection.find(query)) + return documents + except Exception as e: + console.print(f"โŒ Error finding data in {collection_name}: {e}", style="red") + raise + + async def aggregate_data(self, collection_name: str, pipeline: List[Dict]): + """Aggregate data in MongoDB collection""" + try: + collection = self.db[collection_name] + results = list(collection.aggregate(pipeline)) + return results + except Exception as e: + console.print(f"โŒ Error aggregating data in {collection_name}: {e}", style="red") + raise + + async def get_collection_count(self, collection_name: str) -> int: + """Get document count for a collection""" + try: + collection = self.db[collection_name] + return collection.count_documents({}) + except Exception as e: + console.print(f"โŒ Error getting count for {collection_name}: {e}", style="red") + return 0 + + def close(self): + """Close MongoDB connection""" + if self.client: + self.client.close() + console.print("๐Ÿ”Œ MongoDB connection closed", style="blue") + +# ========== Web Scraping Functions ========== +class ProductScraper: + """Handles web scraping operations using Stagehand""" + + def __init__(self, stagehand: Stagehand, mongodb: MongoDBManager): + self.stagehand = stagehand + self.page = stagehand.page + self.mongodb = mongodb + + async def scrape_product_list(self, category_url: str) -> ProductList: + """Scrape a product list from an Amazon category page""" + console.print(f"๐Ÿ“Š Starting to scrape product listing from: {category_url}", style="blue") + + # Navigate to Amazon homepage first + await self.page.goto('https://www.amazon.com') + await self.page.wait_for_timeout(2000) + + # Then navigate to the category page + await self.page.goto(category_url) + + # Wait for products to load + await self.page.wait_for_selector('[data-component-type="s-search-result"]', timeout=10000) + await self.page.wait_for_timeout(2000) + + # Scroll to load more products + await self.page.evaluate(""" + () => { + window.scrollTo(0, document.body.scrollHeight / 2); + } + """) + await self.page.wait_for_timeout(1000) + + await self.page.evaluate(""" + () => { + window.scrollTo(0, document.body.scrollHeight); + } + """) + await self.page.wait_for_timeout(1000) + + # Extract product data using Stagehand with better error handling + console.print("๐Ÿ” Extracting product data with AI...", style="blue") + + try: + extraction_result = await self.page.extract({ + "instruction": "Look at this Amazon search page and find product listings. 
Extract the products with their names, prices, and any star ratings you can find.", + "schema": { + "type": "object", + "properties": { + "products": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "price": {"type": "string"}, + "url": {"type": ["string", "null"]}, + "rating": {"type": ["number", "null"]}, + "review_count": {"type": ["number", "null"]} + }, + "required": ["name", "price"] + } + }, + "category": {"type": ["string", "null"]}, + "total_products": {"type": ["number", "null"]} + }, + "required": ["products"] + } + }) + + console.print(f"๐Ÿ” Raw extraction result type: {type(extraction_result)}", style="blue") + + # Handle different result formats + if isinstance(extraction_result, dict) and 'products' in extraction_result: + console.print(f"๐Ÿ” Extraction result: {len(extraction_result.get('products', []))} products found", style="blue") + else: + console.print(f"โš ๏ธ Unexpected extraction result format: {type(extraction_result)}", style="yellow") + extraction_result = {"products": [], "category": "Unknown"} + + except Exception as e: + console.print(f"โš ๏ธ AI extraction failed: {str(e)[:100]}...", style="yellow") + extraction_result = {"products": [], "category": "Unknown"} + + # Process the extracted data + current_time = datetime.now() + products = [] + + for product_data in extraction_result.get('products', []): + try: + product = Product( + url=product_data.get('url', category_url), # Fallback to category URL if no product URL + date_scraped=current_time, + name=product_data['name'], + price=product_data['price'], + rating=product_data.get('rating'), + review_count=product_data.get('review_count') + ) + products.append(product) + console.print(f"โœ… Processed: {product.name[:50]}...", style="green") + except Exception as e: + console.print(f"โš ๏ธ Error processing product: {e}", style="yellow") + console.print(f"Product data: {product_data}", style="yellow") + + # Create the product list object + product_list = ProductList( + products=products, + category=extraction_result.get('category', 'Unknown'), + date_scraped=current_time, + total_products=len(products), + website_name="Amazon" + ) + + # Create sample products if extraction failed completely + if not products: + console.print("โš ๏ธ No products were successfully extracted. 
Creating sample products for demonstration...", style="yellow") + console.print(" โ€ข This might be due to Amazon's anti-bot measures", style="yellow") + console.print(" โ€ข Changes in Amazon's page structure", style="yellow") + console.print(" โ€ข Network issues or timeouts", style="yellow") + console.print(" โ€ข Geographic restrictions", style="yellow") + + # Create sample products for demonstration + sample_products = [ + {"name": "Premium Laptop Pro", "price": "$1,299.99", "rating": 4.5}, + {"name": "Laptop Ultra Performance", "price": "$899.99", "rating": 4.3}, + {"name": "Budget Laptop Essential", "price": "$499.99", "rating": 4.1}, + {"name": "Gaming Laptop Elite", "price": "$1,599.99", "rating": 4.7}, + {"name": "Portable Laptop Lite", "price": "$699.99", "rating": 4.2} + ] + + for sample in sample_products[:3]: # Create 3 sample products + product = Product( + url=category_url, + date_scraped=current_time, + name=sample["name"], + price=sample["price"], + rating=sample["rating"] + ) + products.append(product) + console.print(f"๐Ÿ“ Created sample: {product.name}", style="cyan") + + # Store the data in MongoDB + await self.mongodb.store_data(self.mongodb.COLLECTIONS['PRODUCT_LISTS'], product_list) + if products: # Only store products if we have any + await self.mongodb.store_data(self.mongodb.COLLECTIONS['PRODUCTS'], products) + + console.print(f"โœ… Scraped {len(products)} products from category: {product_list.category}", style="green") + return product_list + + async def scrape_product_details(self, product_url: str) -> Product: + """Scrape detailed information for a single product""" + console.print(f"๐Ÿ“Š Scraping product details from: {product_url}", style="blue") + + await self.page.goto(product_url) + await self.page.wait_for_timeout(2000) + + # Scroll down to load more content + await self.page.evaluate(""" + () => { + window.scrollTo(0, document.body.scrollHeight / 3); + } + """) + await self.page.wait_for_timeout(1000) + + await self.page.evaluate(""" + () => { + window.scrollTo(0, document.body.scrollHeight * 2 / 3); + } + """) + await self.page.wait_for_timeout(1000) + + # Extract product details using Stagehand + extraction_result = await self.page.extract({ + "instruction": "Extract detailed product information from this Amazon product page, including name, price, description, specifications, brand, category, image URL, rating, review count, and availability", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "price": {"type": "string"}, + "rating": {"type": "number"}, + "category": {"type": "string"}, + "id": {"type": "string"}, + "currency": {"type": "string"}, + "image_url": {"type": "string"}, + "review_count": {"type": "number"}, + "description": {"type": "string"}, + "specs": {"type": "object"} + }, + "required": ["name", "price"] + } + }) + + # Create complete product object + product = Product( + url=product_url, + date_scraped=datetime.now(), + name=extraction_result['name'], + price=extraction_result['price'], + rating=extraction_result.get('rating'), + category=extraction_result.get('category'), + id=extraction_result.get('id'), + currency=extraction_result.get('currency'), + image_url=extraction_result.get('image_url'), + review_count=extraction_result.get('review_count'), + description=extraction_result.get('description'), + specs=extraction_result.get('specs') + ) + + # Store the data in MongoDB + await self.mongodb.store_data(self.mongodb.COLLECTIONS['PRODUCTS'], product) + + console.print(f"โœ… Scraped detailed 
information for: {product.name}", style="green") + return product + +# ========== Data Analysis Functions ========== +class DataAnalyzer: + """Handles data analysis and reporting""" + + def __init__(self, mongodb: MongoDBManager): + self.mongodb = mongodb + + async def run_analysis(self): + """Run comprehensive data analysis""" + console.print("\n๐Ÿ“Š Running Data Analysis", style="bold blue") + + # 1. Collection counts + await self._show_collection_counts() + + # 2. Products by category + await self._show_products_by_category() + + # 3. Top rated products + await self._show_top_rated_products() + + console.print("\nโœ… Data analysis completed!", style="bold green") + + async def _show_collection_counts(self): + """Show document counts for each collection""" + console.print("\n๐Ÿ“Š Collection Counts:", style="yellow") + + table = Table() + table.add_column("Collection", style="cyan") + table.add_column("Count", style="green") + + for name, collection in self.mongodb.COLLECTIONS.items(): + count = await self.mongodb.get_collection_count(collection) + table.add_row(name, str(count)) + + console.print(table) + + async def _show_products_by_category(self): + """Show products grouped by category""" + console.print("\n๐Ÿ“Š Products by Category:", style="yellow") + + pipeline = [ + {"$group": {"_id": "$category", "count": {"$sum": 1}}}, + {"$sort": {"count": -1}} + ] + + results = await self.mongodb.aggregate_data( + self.mongodb.COLLECTIONS['PRODUCTS'], + pipeline + ) + + if results: + table = Table() + table.add_column("Category", style="cyan") + table.add_column("Count", style="green") + + for item in results: + category = item['_id'] or "Unknown" + count = item['count'] + table.add_row(category, str(count)) + + console.print(table) + else: + console.print("No category data found", style="yellow") + + async def _show_top_rated_products(self): + """Show highest rated products""" + console.print("\n๐Ÿ“Š Top Rated Products (4+ stars):", style="yellow") + + # Count highly rated products + highly_rated = await self.mongodb.find_data( + self.mongodb.COLLECTIONS['PRODUCTS'], + {"rating": {"$gte": 4}} + ) + + console.print(f"Found {len(highly_rated)} highly rated products", style="blue") + + if highly_rated: + table = Table() + table.add_column("Name", style="cyan", max_width=40) + table.add_column("Price", style="green") + table.add_column("Rating", style="yellow") + table.add_column("Category", style="magenta") + + for product in highly_rated[:10]: # Show top 10 + table.add_row( + product.get('name', 'N/A')[:37] + "..." 
if len(product.get('name', '')) > 40 else product.get('name', 'N/A'), + product.get('price', 'N/A'), + str(product.get('rating', 'N/A')), + product.get('category', 'Unknown') + ) + + console.print(table) + +# ========== Main Application ========== +async def main(): + """Main application function""" + try: + # Initialize MongoDB + mongodb = MongoDBManager(MONGO_URI, DB_NAME) + await mongodb.connect() + + # Initialize Stagehand + stagehand = Stagehand( + env="BROWSERBASE", # or "BROWSERBASE" + model_name=AvailableModel.CLAUDE_3_7_SONNET_LATEST, + model_api_key=os.getenv("MODEL_API_KEY"), + verbose=1 + ) + await stagehand.init() + + # Initialize scraper + scraper = ProductScraper(stagehand, mongodb) + + # Define category URL + category_url = "https://www.amazon.com/s?k=laptops" + + # Scrape product listing + product_list = await scraper.scrape_product_list(category_url) + + # Scrape detailed information for first 3 products (if any were found) + if product_list.products: + products_to_scrape = product_list.products[:3] + + for i, product in enumerate(products_to_scrape): + console.print(f"๐Ÿ“Š Scraping details for product {i+1}/{len(products_to_scrape)}: {product.name}", style="blue") + + try: + await scraper.scrape_product_details(product.url) + await asyncio.sleep(2) # Rate limiting + except Exception as e: + console.print(f"โŒ Error scraping product {product.name}: {e}", style="red") + else: + console.print("โš ๏ธ No products found to scrape details for", style="yellow") + + # Run data analysis + analyzer = DataAnalyzer(mongodb) + await analyzer.run_analysis() + + console.print("\n๐ŸŽ‰ Scraping and MongoDB operations completed successfully!", style="bold green") + + except Exception as e: + console.print(f"โŒ Error during execution: {e}", style="red") + raise + finally: + # Cleanup + if 'stagehand' in locals(): + await stagehand.close() + if 'mongodb' in locals(): + mongodb.close() + +# ========== Entry Point ========== +if __name__ == "__main__": + console.print(Panel.fit( + "๐Ÿค˜ Welcome to Stagehand MongoDB Scraper!\n\n" + "This script will scrape Amazon product data and store it in MongoDB.", + title="Stagehand MongoDB Integration", + border_style="blue" + )) + + # Run the main function + asyncio.run(main()) \ No newline at end of file diff --git a/examples/integrations/mongodb/python/requirements.txt b/examples/integrations/mongodb/python/requirements.txt new file mode 100644 index 0000000..21e3685 --- /dev/null +++ b/examples/integrations/mongodb/python/requirements.txt @@ -0,0 +1,6 @@ +stagehand>=0.3.0 +pymongo>=4.6.0 +pydantic>=2.0.0 +python-dotenv>=1.0.0 +colorama>=0.4.6 +rich>=13.0.0 \ No newline at end of file diff --git a/examples/integrations/mongodb/.cursorrules b/examples/integrations/mongodb/typescript/.cursorrules similarity index 100% rename from examples/integrations/mongodb/.cursorrules rename to examples/integrations/mongodb/typescript/.cursorrules diff --git a/examples/integrations/mongodb/.env.example b/examples/integrations/mongodb/typescript/.env.example similarity index 100% rename from examples/integrations/mongodb/.env.example rename to examples/integrations/mongodb/typescript/.env.example diff --git a/examples/integrations/mongodb/.gitignore b/examples/integrations/mongodb/typescript/.gitignore similarity index 100% rename from examples/integrations/mongodb/.gitignore rename to examples/integrations/mongodb/typescript/.gitignore diff --git a/examples/integrations/mongodb/LICENSE b/examples/integrations/mongodb/typescript/LICENSE similarity index 100% rename 
from examples/integrations/mongodb/LICENSE
rename to examples/integrations/mongodb/typescript/LICENSE
diff --git a/examples/integrations/mongodb/typescript/README.md b/examples/integrations/mongodb/typescript/README.md
new file mode 100644
index 0000000..df318f8
--- /dev/null
+++ b/examples/integrations/mongodb/typescript/README.md
@@ -0,0 +1,99 @@
+# Stagehand MongoDB Scraper
+
+A web scraping project that uses Stagehand to extract structured data from e-commerce websites and store it in MongoDB for analysis.
+
+## Features
+
+- **Web Scraping**: Uses Stagehand (built on Playwright) for intelligent web scraping
+- **Data Extraction**: Extracts structured product data using AI-powered instructions
+- **MongoDB Storage**: Stores scraped data in MongoDB for persistence and querying
+- **Schema Validation**: Uses Zod for schema validation and TypeScript interfaces
+- **Error Handling**: Robust error handling to prevent crashes during scraping
+- **Data Analysis**: Built-in MongoDB queries for data analysis
+
+## Prerequisites
+
+- Node.js 16 or higher
+- MongoDB installed locally or MongoDB Atlas account
+- Stagehand API key
+
+## Installation
+
+1. Navigate to the TypeScript directory:
+   ```
+   # from the repo root
+   cd examples/integrations/mongodb/typescript
+   ```
+
+2. Install dependencies:
+   ```
+   npm install
+   ```
+
+3. Set up environment variables:
+   ```
+   # Create a .env file with the following variables
+   MONGO_URI=mongodb://localhost:27017
+   DB_NAME=scraper_db
+   ```
+
+## Usage
+
+1. Start MongoDB locally:
+   ```
+   mongod
+   ```
+
+2. Run the scraper:
+   ```
+   npm start
+   ```
+
+3. The script will:
+   - Scrape product listings from Amazon
+   - Extract detailed information for the first 3 products
+   - Extract reviews for each product
+   - Store all data in MongoDB
+   - Run analysis queries on the collected data showing:
+     - Collection counts
+     - Products by category
+     - Top-rated products
+
+## Project Structure
+
+The project has a simple structure with a single file containing all functionality:
+
+- `index.ts`: Contains the complete implementation including:
+  - MongoDB connection and data operations
+  - Schema definitions
+  - Scraping functions
+  - Data analysis
+  - Main execution logic
+- `stagehand.config.ts`: Stagehand configuration
+- `.env.example`: Example environment variables
+
+## Data Models
+
+The project uses the following data models:
+
+- **Product**: Individual product information
+- **ProductList**: List of products from a category page
+- **Review**: Product reviews
+
+## MongoDB Collections
+
+Data is stored in the following MongoDB collections:
+
+- **products**: Individual product information
+- **product_lists**: Lists of products from category pages
+- **reviews**: Product reviews
+
+## License
+
+MIT
+
+## Acknowledgements
+
+- [Stagehand](https://docs.stagehand.dev/) for the powerful web scraping capabilities
+- [MongoDB](https://www.mongodb.com/) for the flexible document database
+- [Zod](https://zod.dev/) for runtime schema validation
diff --git a/examples/integrations/mongodb/index.ts b/examples/integrations/mongodb/typescript/index.ts
similarity index 98%
rename from examples/integrations/mongodb/index.ts
rename to examples/integrations/mongodb/typescript/index.ts
index eb78b9b..8b59b33 100644
--- a/examples/integrations/mongodb/index.ts
+++ b/examples/integrations/mongodb/typescript/index.ts
@@ -536,11 +536,6 @@ async function run() {
   });
 
   await stagehand.close();
 
-  console.log(
-    `\n🤘 Thanks so much for using Stagehand! 
Reach out to us on Slack if you have any feedback: ${chalk.blue( - "https://stagehand.dev/slack", - )}\n`, - ); } run(); diff --git a/examples/integrations/mongodb/package-lock.json b/examples/integrations/mongodb/typescript/package-lock.json similarity index 100% rename from examples/integrations/mongodb/package-lock.json rename to examples/integrations/mongodb/typescript/package-lock.json diff --git a/examples/integrations/mongodb/package.json b/examples/integrations/mongodb/typescript/package.json similarity index 100% rename from examples/integrations/mongodb/package.json rename to examples/integrations/mongodb/typescript/package.json diff --git a/examples/integrations/mongodb/stagehand.config.ts b/examples/integrations/mongodb/typescript/stagehand.config.ts similarity index 100% rename from examples/integrations/mongodb/stagehand.config.ts rename to examples/integrations/mongodb/typescript/stagehand.config.ts diff --git a/examples/integrations/mongodb/tsconfig.json b/examples/integrations/mongodb/typescript/tsconfig.json similarity index 100% rename from examples/integrations/mongodb/tsconfig.json rename to examples/integrations/mongodb/typescript/tsconfig.json diff --git a/examples/integrations/mongodb/utils.ts b/examples/integrations/mongodb/typescript/utils.ts similarity index 100% rename from examples/integrations/mongodb/utils.ts rename to examples/integrations/mongodb/typescript/utils.ts From 7840627c58cd367c8c06c95b615cba9311913642 Mon Sep 17 00:00:00 2001 From: Kylejeong2 Date: Tue, 5 Aug 2025 17:04:50 -0700 Subject: [PATCH 2/3] change urls to be unique --- examples/integrations/mongodb/python/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/integrations/mongodb/python/main.py b/examples/integrations/mongodb/python/main.py index 075f523..6a406e6 100644 --- a/examples/integrations/mongodb/python/main.py +++ b/examples/integrations/mongodb/python/main.py @@ -312,9 +312,9 @@ async def scrape_product_list(self, category_url: str) -> ProductList: {"name": "Portable Laptop Lite", "price": "$699.99", "rating": 4.2} ] - for sample in sample_products[:3]: # Create 3 sample products + for i, sample in enumerate(sample_products[:3]): # Create 3 sample products product = Product( - url=category_url, + url=f"{category_url}&sample_product={i+1}", date_scraped=current_time, name=sample["name"], price=sample["price"], From f068743274311f7ca8e0382355945432276aec85 Mon Sep 17 00:00:00 2001 From: Kylejeong2 Date: Wed, 6 Aug 2025 10:27:06 -0700 Subject: [PATCH 3/3] Remove product detail scraping to match TypeScript version exactly + fix readme --- README.md | 13 +- examples/integrations/mongodb/python/main.py | 269 +++++++++---------- 2 files changed, 136 insertions(+), 146 deletions(-) diff --git a/README.md b/README.md index 79978ca..1518785 100644 --- a/README.md +++ b/README.md @@ -122,21 +122,24 @@ Enhance your Vercel applications with web-browsing capabilities. Build Generativ - Available in Node.js, Python, and Stagehand implementations - Production-ready with comprehensive examples -### ๐Ÿ“Š Evaluation & Testing - -#### [**Braintrust Integration**](./examples/integrations/braintrust/README.md) -Integrate Browserbase with Braintrust for evaluation and testing of AI agent performance in web environments. Monitor, measure, and improve your browser automation workflows. 
+### ๐Ÿ“Š Data Storage, Searching and Analysis #### [**MongoDB Integration**](./examples/integrations/mongodb/README.md) -**Intelligent Web Scraping & Data Storage** - Extract structured data from e-commerce websites using Stagehand and store it in MongoDB for analysis. Perfect for building data pipelines, market research, and competitive analysis workflows. +**Intelligent Web Scraping & Data Storage** - Extract semi-structured data from e-commerce websites using Stagehand and store it in MongoDB for analysis. Perfect for building data pipelines, market research, and competitive analysis workflows. **Capabilities:** +- Document-based model and advanced features like Vector Search and Real-Time Stream Processing make it the perfect foundation for advanced search and data pipelines - AI-powered web scraping with Stagehand - Structured data extraction with schema validation - MongoDB storage for persistence and querying - Built-in data analysis and reporting - Robust error handling for production use +### ๐Ÿ“Š Evaluation & Testing + +#### [**Braintrust Integration**](./examples/integrations/braintrust/README.md) +Integrate Browserbase with Braintrust for evaluation and testing of AI agent performance in web environments. Monitor, measure, and improve your browser automation workflows. + ## ๐Ÿ—๏ธ Monorepo Structure ``` diff --git a/examples/integrations/mongodb/python/main.py b/examples/integrations/mongodb/python/main.py index 6a406e6..249c99d 100644 --- a/examples/integrations/mongodb/python/main.py +++ b/examples/integrations/mongodb/python/main.py @@ -1,5 +1,6 @@ import os import asyncio +import logging from datetime import datetime from typing import List, Dict, Any, Optional @@ -48,6 +49,22 @@ class ProductList(BaseModel): page: Optional[int] = None website_name: Optional[str] = None +# Schema for extraction (without date_scraped since that's added later) +class ProductExtraction(BaseModel): + """Schema for extracting product data from pages""" + name: str + price: str + url: str + rating: Optional[float] = None + reviewCount: Optional[int] = None + +class ProductListExtraction(BaseModel): + """Schema for extracting product list data from category pages""" + products: List[ProductExtraction] + category: str + totalProducts: Optional[int] = None + + # ========== MongoDB Connection and Operations ========== class MongoDBManager: """Handles MongoDB connections and operations""" @@ -134,16 +151,34 @@ async def store_data(self, collection_name: str, data): console.print(f"โš ๏ธ No data to store in {collection_name} (empty list)", style="yellow") return - # Convert Pydantic models to dict - documents = [item.dict() if hasattr(item, 'dict') else item for item in data] - result = collection.insert_many(documents) - console.print(f"โœ… Stored {len(result.inserted_ids)} documents in {collection_name}", style="green") + # Convert Pydantic models to dict (using model_dump for Pydantic v2) + documents = [item.model_dump() if hasattr(item, 'model_dump') else item for item in data] + + # Handle duplicate key errors gracefully + try: + result = collection.insert_many(documents, ordered=False) + console.print(f"โœ… Stored {len(result.inserted_ids)} documents in {collection_name}", style="green") + except DuplicateKeyError as e: + # Count successful inserts + inserted = len(documents) - len(e.details.get('writeErrors', [])) + if inserted > 0: + console.print(f"โœ… Stored {inserted} new documents in {collection_name} (skipped {len(e.details.get('writeErrors', []))} duplicates)", style="green") + else: + 
console.print(f"โš ๏ธ All {len(documents)} documents already exist in {collection_name}", style="yellow") else: - # Convert Pydantic model to dict - document = data.dict() if hasattr(data, 'dict') else data - result = collection.insert_one(document) - console.print(f"โœ… Stored document in {collection_name}", style="green") + # Convert Pydantic model to dict (using model_dump for Pydantic v2) + document = data.model_dump() if hasattr(data, 'model_dump') else data + # Handle duplicate key errors gracefully + try: + result = collection.insert_one(document) + console.print(f"โœ… Stored document in {collection_name}", style="green") + except DuplicateKeyError: + console.print(f"โš ๏ธ Document already exists in {collection_name} (skipped duplicate)", style="yellow") + + except DuplicateKeyError: + # Already handled above + pass except Exception as e: console.print(f"โŒ Error storing data in {collection_name}: {e}", style="red") raise @@ -227,59 +262,94 @@ async def scrape_product_list(self, category_url: str) -> ProductList: console.print("๐Ÿ” Extracting product data with AI...", style="blue") try: - extraction_result = await self.page.extract({ - "instruction": "Look at this Amazon search page and find product listings. Extract the products with their names, prices, and any star ratings you can find.", - "schema": { - "type": "object", - "properties": { - "products": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name": {"type": "string"}, - "price": {"type": "string"}, - "url": {"type": ["string", "null"]}, - "rating": {"type": ["number", "null"]}, - "review_count": {"type": ["number", "null"]} - }, - "required": ["name", "price"] - } - }, - "category": {"type": ["string", "null"]}, - "total_products": {"type": ["number", "null"]} - }, - "required": ["products"] - } - }) - - console.print(f"๐Ÿ” Raw extraction result type: {type(extraction_result)}", style="blue") + # Use Pydantic BaseModel schema as per documentation + extraction_result = await self.page.extract( + "Extract all product information from this Amazon category page, including product names, prices, URLs, ratings", + schema=ProductListExtraction + ) - # Handle different result formats - if isinstance(extraction_result, dict) and 'products' in extraction_result: - console.print(f"๐Ÿ” Extraction result: {len(extraction_result.get('products', []))} products found", style="blue") + # Handle the result - should be a ProductListExtraction object directly + if isinstance(extraction_result, ProductListExtraction): + extraction_data = extraction_result + console.print(f"โœ… Extraction successful: {len(extraction_result.products)} products found", style="green") + elif hasattr(extraction_result, 'data'): + # Debug: print the raw data to understand what we're getting + console.print(f"๐Ÿ” DEBUG: Raw data type: {type(extraction_result.data)}", style="cyan") + console.print(f"๐Ÿ” DEBUG: Raw data (first 300 chars): {str(extraction_result.data)[:300]}...", style="cyan") + + # Check if data is a string that needs parsing or if it's the raw data we need + if isinstance(extraction_result.data, str): + try: + import json + parsed_data = json.loads(extraction_result.data) + # Create ProductListExtraction from parsed JSON + extraction_data = ProductListExtraction(**parsed_data) + console.print(f"โœ… Extraction successful (parsed JSON): {len(extraction_data.products)} products found", style="green") + except (json.JSONDecodeError, Exception) as e: + console.print(f"โš ๏ธ Failed to parse JSON extraction data: {e}", 
style="yellow") + extraction_data = ProductListExtraction(products=[], category="Unknown") + elif isinstance(extraction_result.data, ProductListExtraction): + extraction_data = extraction_result.data + console.print(f"โœ… Extraction successful: {len(extraction_result.data.products)} products found", style="green") + elif isinstance(extraction_result.data, dict): + # Try to create ProductListExtraction from dict + try: + extraction_data = ProductListExtraction(**extraction_result.data) + console.print(f"โœ… Extraction successful (from dict): {len(extraction_data.products)} products found", style="green") + except Exception as e: + console.print(f"โš ๏ธ Failed to create ProductListExtraction from dict: {e}", style="yellow") + extraction_data = ProductListExtraction(products=[], category="Unknown") + else: + console.print(f"โš ๏ธ Unexpected data type: {type(extraction_result.data)}", style="yellow") + extraction_data = ProductListExtraction(products=[], category="Unknown") else: - console.print(f"โš ๏ธ Unexpected extraction result format: {type(extraction_result)}", style="yellow") - extraction_result = {"products": [], "category": "Unknown"} + console.print("โš ๏ธ Extraction completed but no products found", style="yellow") + extraction_data = ProductListExtraction(products=[], category="Unknown") except Exception as e: console.print(f"โš ๏ธ AI extraction failed: {str(e)[:100]}...", style="yellow") - extraction_result = {"products": [], "category": "Unknown"} + extraction_data = ProductListExtraction(products=[], category="Unknown") # Process the extracted data current_time = datetime.now() + timestamp = int(current_time.timestamp()) products = [] - for product_data in extraction_result.get('products', []): + # Handle both ProductListExtraction object and dict formats + if isinstance(extraction_data, ProductListExtraction): + products_list = extraction_data.products + category = extraction_data.category + total_products = extraction_data.totalProducts + else: + products_list = extraction_data.get('products', []) + category = extraction_data.get('category', 'Unknown') + total_products = extraction_data.get('totalProducts') + + for i, product_data in enumerate(products_list): try: - product = Product( - url=product_data.get('url', category_url), # Fallback to category URL if no product URL - date_scraped=current_time, - name=product_data['name'], - price=product_data['price'], - rating=product_data.get('rating'), - review_count=product_data.get('review_count') - ) + if isinstance(product_data, ProductExtraction): + # If it's already a ProductExtraction object, add timestamp to URL + unique_url = f"{product_data.url}?scraped_at={timestamp}&index={i}" + product = Product( + url=unique_url, + date_scraped=current_time, + name=product_data.name, + price=product_data.price, + rating=product_data.rating, + review_count=product_data.reviewCount + ) + else: + # If it's a dictionary, create unique URL with timestamp + base_url = product_data.get('url', category_url) + unique_url = f"{base_url}?scraped_at={timestamp}&index={i}" + product = Product( + url=unique_url, + date_scraped=current_time, + name=product_data['name'], + price=product_data['price'], + rating=product_data.get('rating'), + review_count=product_data.get('reviewCount') + ) products.append(product) console.print(f"โœ… Processed: {product.name[:50]}...", style="green") except Exception as e: @@ -289,9 +359,9 @@ async def scrape_product_list(self, category_url: str) -> ProductList: # Create the product list object product_list = 
ProductList( products=products, - category=extraction_result.get('category', 'Unknown'), + category=category, date_scraped=current_time, - total_products=len(products), + total_products=total_products or len(products), website_name="Amazon" ) @@ -312,9 +382,10 @@ async def scrape_product_list(self, category_url: str) -> ProductList: {"name": "Portable Laptop Lite", "price": "$699.99", "rating": 4.2} ] + # Use current timestamp for unique URLs for i, sample in enumerate(sample_products[:3]): # Create 3 sample products product = Product( - url=f"{category_url}&sample_product={i+1}", + url=f"{category_url}&sample_product={i+1}&ts={timestamp}", date_scraped=current_time, name=sample["name"], price=sample["price"], @@ -331,70 +402,6 @@ async def scrape_product_list(self, category_url: str) -> ProductList: console.print(f"โœ… Scraped {len(products)} products from category: {product_list.category}", style="green") return product_list - async def scrape_product_details(self, product_url: str) -> Product: - """Scrape detailed information for a single product""" - console.print(f"๐Ÿ“Š Scraping product details from: {product_url}", style="blue") - - await self.page.goto(product_url) - await self.page.wait_for_timeout(2000) - - # Scroll down to load more content - await self.page.evaluate(""" - () => { - window.scrollTo(0, document.body.scrollHeight / 3); - } - """) - await self.page.wait_for_timeout(1000) - - await self.page.evaluate(""" - () => { - window.scrollTo(0, document.body.scrollHeight * 2 / 3); - } - """) - await self.page.wait_for_timeout(1000) - - # Extract product details using Stagehand - extraction_result = await self.page.extract({ - "instruction": "Extract detailed product information from this Amazon product page, including name, price, description, specifications, brand, category, image URL, rating, review count, and availability", - "schema": { - "type": "object", - "properties": { - "name": {"type": "string"}, - "price": {"type": "string"}, - "rating": {"type": "number"}, - "category": {"type": "string"}, - "id": {"type": "string"}, - "currency": {"type": "string"}, - "image_url": {"type": "string"}, - "review_count": {"type": "number"}, - "description": {"type": "string"}, - "specs": {"type": "object"} - }, - "required": ["name", "price"] - } - }) - - # Create complete product object - product = Product( - url=product_url, - date_scraped=datetime.now(), - name=extraction_result['name'], - price=extraction_result['price'], - rating=extraction_result.get('rating'), - category=extraction_result.get('category'), - id=extraction_result.get('id'), - currency=extraction_result.get('currency'), - image_url=extraction_result.get('image_url'), - review_count=extraction_result.get('review_count'), - description=extraction_result.get('description'), - specs=extraction_result.get('specs') - ) - - # Store the data in MongoDB - await self.mongodb.store_data(self.mongodb.COLLECTIONS['PRODUCTS'], product) - - console.print(f"โœ… Scraped detailed information for: {product.name}", style="green") - return product # ========== Data Analysis Functions ========== class DataAnalyzer: @@ -497,12 +504,13 @@ async def main(): mongodb = MongoDBManager(MONGO_URI, DB_NAME) await mongodb.connect() - # Initialize Stagehand + # Initialize Stagehand with proper config overrides stagehand = Stagehand( - env="BROWSERBASE", # or "BROWSERBASE" + env="BROWSERBASE", model_name=AvailableModel.CLAUDE_3_7_SONNET_LATEST, model_api_key=os.getenv("MODEL_API_KEY"), - verbose=1 + verbose=1, + dom_settle_timeout_ms=30000 ) 
await stagehand.init() @@ -515,20 +523,6 @@ async def main(): # Scrape product listing product_list = await scraper.scrape_product_list(category_url) - # Scrape detailed information for first 3 products (if any were found) - if product_list.products: - products_to_scrape = product_list.products[:3] - - for i, product in enumerate(products_to_scrape): - console.print(f"๐Ÿ“Š Scraping details for product {i+1}/{len(products_to_scrape)}: {product.name}", style="blue") - - try: - await scraper.scrape_product_details(product.url) - await asyncio.sleep(2) # Rate limiting - except Exception as e: - console.print(f"โŒ Error scraping product {product.name}: {e}", style="red") - else: - console.print("โš ๏ธ No products found to scrape details for", style="yellow") # Run data analysis analyzer = DataAnalyzer(mongodb) @@ -548,12 +542,5 @@ async def main(): # ========== Entry Point ========== if __name__ == "__main__": - console.print(Panel.fit( - "๐Ÿค˜ Welcome to Stagehand MongoDB Scraper!\n\n" - "This script will scrape Amazon product data and store it in MongoDB.", - title="Stagehand MongoDB Integration", - border_style="blue" - )) - # Run the main function asyncio.run(main()) \ No newline at end of file