# Web Scraper

This repository contains two web scraping scripts:

## 1. Traditional Web Scraper (`Web_Scraper.py`)

This script uses the `requests` library to send a GET request to the Python.org blogs page. It then uses the `BeautifulSoup` library to parse the HTML content of the page.

It finds all the blog titles on the page by searching for `h2` elements with the class `blog-title`. It then prints each title found and saves them to a file named `blog_titles.txt`.
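
A minimal sketch of this approach, assuming the blogs page lives at `https://www.python.org/blogs/` (illustrative only; the actual `Web_Scraper.py` may differ in details):

```python
# Illustrative sketch, not necessarily the exact contents of Web_Scraper.py.
import requests
from bs4 import BeautifulSoup

# The URL is an assumption based on "the Python.org blogs page" described above.
response = requests.get("https://www.python.org/blogs/", timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")
titles = [h2.get_text(strip=True) for h2 in soup.find_all("h2", class_="blog-title")]

with open("blog_titles.txt", "w", encoding="utf-8") as f:
    for title in titles:
        print(title)           # print each title found
        f.write(title + "\n")  # and save it to blog_titles.txt
```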

### Usage
To run this script, first install the required libraries:

```bash
pip install requests beautifulsoup4
```

Then run:

```bash
python Web_Scraper.py
```

## 2. Google Custom Search Scraper (`google_web_scraper.py`)

This enhanced CLI web scraper uses the Google Custom Search API to extract URLs, titles, and snippets from search results. This approach is more robust than traditional web scraping because it:

- Bypasses CAPTCHA challenges that may occur during direct web scraping
- Retrieves structured data (title, URL, and snippet/description)
- Handles dynamic websites more reliably
- Is less prone to breaking when website structures change
- Allows searching by keyword to retrieve multiple metadata fields
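
Under the hood, searches go through the Custom Search JSON API. A rough sketch of such a request using the `requests` library (illustrative only; the actual `google_web_scraper.py` may structure this differently):

```python
# Illustrative sketch of querying the Google Custom Search JSON API.
import requests

def google_search(query, api_key, engine_id, num_results=10):
    """Fetch up to num_results items, paginating 10 results per request."""
    results = []
    start = 1  # the API uses a 1-based index for the first result of each page
    while len(results) < num_results:
        resp = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={
                "key": api_key,
                "cx": engine_id,
                "q": query,
                "num": min(10, num_results - len(results)),
                "start": start,
            },
            timeout=30,
        )
        resp.raise_for_status()
        items = resp.json().get("items", [])
        if not items:
            break  # no more results available
        for item in items:
            results.append({
                "title": item.get("title"),
                "url": item.get("link"),
                "snippet": item.get("snippet"),
            })
        start += len(items)
    return results
```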

### Prerequisites
Before using this script, you need:
1. A Google API Key from [Google Cloud Console](https://console.cloud.google.com/apis/credentials)
2. A Custom Search Engine ID from [Google Programmable Search Engine](https://programmablesearchengine.google.com/)

### Installation
```bash
pip install -r requirements.txt
```

### Setup
Set your API credentials as environment variables:
```bash
export GOOGLE_API_KEY='your_google_api_key'
export SEARCH_ENGINE_ID='your_search_engine_id'
```

Alternatively, you can pass them directly as command-line arguments.
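
One plausible way the script combines the two sources, with command-line values taking precedence over the environment (a sketch under that assumption, not necessarily the exact logic in `google_web_scraper.py`):

```python
# Illustrative credential resolution: CLI arguments win, environment variables
# are the fallback, and a missing value aborts with a clear message.
import os
import sys

def resolve_credentials(cli_api_key=None, cli_engine_id=None):
    api_key = cli_api_key or os.environ.get("GOOGLE_API_KEY")
    engine_id = cli_engine_id or os.environ.get("SEARCH_ENGINE_ID")
    if not api_key or not engine_id:
        sys.exit("Error: provide --api-key/--engine-id or set GOOGLE_API_KEY and SEARCH_ENGINE_ID.")
    return api_key, engine_id
```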

### Usage
Basic usage:
```bash
python google_web_scraper.py --query "Python tutorials" --results 10
```

Save results in JSON format:
```bash
python google_web_scraper.py --query "machine learning blogs" --results 20 --format json
```

Specify output file:
```bash
python google_web_scraper.py --query "web development news" --output my_search.json --format json
```

With API credentials as arguments:
```bash
python google_web_scraper.py --query "Python tutorials" --api-key YOUR_API_KEY --engine-id YOUR_ENGINE_ID
```

### Options
- `--query, -q`: Search query to use for web scraping (required)
- `--results, -r`: Number of search results to retrieve (default: 10)
- `--output, -o`: Output file name (default: `search_results.txt`)
- `--format, -f`: Output format, `txt` or `json` (default: `txt`)
- `--api-key, -k`: Google API Key (optional if `GOOGLE_API_KEY` is set)
- `--engine-id, -e`: Google Custom Search Engine ID (optional if `SEARCH_ENGINE_ID` is set)
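
For reference, these flags map naturally onto `argparse`; here is a sketch of how such an interface might be declared (illustrative, not a copy of the script's actual parser):

```python
# Illustrative argparse declaration mirroring the options listed above.
import argparse

parser = argparse.ArgumentParser(description="Scrape Google Custom Search results.")
parser.add_argument("--query", "-q", required=True, help="Search query to use")
parser.add_argument("--results", "-r", type=int, default=10, help="Number of results to retrieve")
parser.add_argument("--output", "-o", default="search_results.txt", help="Output file name")
parser.add_argument("--format", "-f", choices=["txt", "json"], default="txt", help="Output format")
parser.add_argument("--api-key", "-k", help="Google API Key (falls back to GOOGLE_API_KEY)")
parser.add_argument("--engine-id", "-e", help="Custom Search Engine ID (falls back to SEARCH_ENGINE_ID)")
args = parser.parse_args()
```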

### Features
- Command-line interface with configurable options
- Support for both TXT and JSON output formats
- Environment variable support for credentials
- Error handling and user-friendly messages
- Ability to retrieve multiple pages of results
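
The TXT/JSON output support could look roughly like the helper below (a sketch with an assumed `save_results` function; the field names follow the title/URL/snippet structure described earlier):

```python
# Illustrative writer for the two supported output formats.
import json

def save_results(results, path, fmt="txt"):
    with open(path, "w", encoding="utf-8") as f:
        if fmt == "json":
            json.dump(results, f, indent=2, ensure_ascii=False)
        else:  # plain text: one blank-line-separated block per result
            for r in results:
                f.write(f"{r['title']}\n{r['url']}\n{r['snippet']}\n\n")
```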