diff --git a/advanced_tools_frameworks/web_scrapping_ai_agent/.actor/Dockerfile b/advanced_tools_frameworks/web_scrapping_ai_agent/.actor/Dockerfile
new file mode 100644
index 000000000..e9e3e0b96
--- /dev/null
+++ b/advanced_tools_frameworks/web_scrapping_ai_agent/.actor/Dockerfile
@@ -0,0 +1,66 @@
+# First, specify the base Docker image.
+# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
+# You can also use any other image from Docker Hub.
+FROM apify/actor-python-playwright:3.13 AS builder
+
+# Install the build tools (build-essential, clang, curl) and the Rust toolchain
+# needed to compile native dependencies
+RUN echo "Installing build tools and dependencies..." \
+    && apt-get update && apt-get install -y \
+    build-essential \
+    clang \
+    curl \
+    # Install Rust
+    && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
+    && export PATH="$HOME/.cargo/bin:$PATH"
+
+# Set up the environment for Rust
+ENV PATH="/root/.cargo/bin:$PATH"
+
+# Second, copy just requirements.txt into the Actor image,
+# since it should be the only file that affects the dependency install in the next step,
+# in order to speed up the build
+COPY requirements.txt ./
+
+# Create and activate a virtual environment for the Python dependencies in /opt
+RUN python -m venv /opt/.venv
+ENV PATH="/opt/.venv/bin:$PATH"
+
+# Install the packages specified in requirements.txt,
+# then print the Python version, pip version,
+# and all installed packages with their versions for debugging
+RUN echo "Python version:" \
+    && python --version \
+    && echo "Pip version:" \
+    && pip --version \
+    && echo "Installing dependencies:" \
+    && pip install -r requirements.txt \
+    && echo "All installed Python packages:" \
+    && pip freeze
+
+# Install the Playwright browsers and their system dependencies
+RUN playwright install-deps && \
+    playwright install
+
+# Use compileall to verify that the Actor's Python code is at least syntactically runnable.
+RUN python3 -m compileall -q .
+
+# Stage 2: final lightweight runtime image
+FROM python:3.13-slim-bookworm
+
+# Set the PATH to include the virtual environment's bin directory
+ENV PATH="/opt/.venv/bin:$PATH"
+
+# Copy the virtual environment with the installed dependencies
+COPY --from=builder /opt/.venv /opt/.venv
+
+# Copy the Playwright browser from the builder
+COPY --from=builder /root/.cache/ms-playwright/chromium_headless_shell-1155 /root/.cache/ms-playwright/chromium_headless_shell-1155
+
+# Copy the Playwright system libraries from the builder (the directory would be /usr/lib/aarch64-linux-gnu on ARM)
+COPY --from=builder /usr/lib/x86_64-linux-gnu /usr/lib/x86_64-linux-gnu
+
+# Copy the application source code
+COPY . ./
+
+# Run the application
+CMD ["python3", "-m", "src"]
diff --git a/advanced_tools_frameworks/web_scrapping_ai_agent/.actor/actor.json b/advanced_tools_frameworks/web_scrapping_ai_agent/.actor/actor.json
new file mode 100644
index 000000000..b5786da52
--- /dev/null
+++ b/advanced_tools_frameworks/web_scrapping_ai_agent/.actor/actor.json
@@ -0,0 +1,7 @@
+{
+    "actorSpecification": 1,
+    "name": "web-scrapping-ai-agent",
+    "version": "0.0",
+    "buildTag": "latest",
+    "environmentVariables": {}
+}
diff --git a/advanced_tools_frameworks/web_scrapping_ai_agent/.actor/input_schema.json b/advanced_tools_frameworks/web_scrapping_ai_agent/.actor/input_schema.json
new file mode 100644
index 000000000..762a8a995
--- /dev/null
+++ b/advanced_tools_frameworks/web_scrapping_ai_agent/.actor/input_schema.json
@@ -0,0 +1,37 @@
+{
+    "title": "Web Scraping AI Agent",
+    "type": "object",
+    "schemaVersion": 1,
+    "properties": {
+        "openai_access_token": {
+            "title": "OpenAI Access Token",
+            "type": "string",
+            "description": "Access token for the OpenAI API",
+            "editor": "textfield",
+            "isSecret": true
+        },
+        "model": {
+            "title": "OpenAI model",
+            "type": "string",
+            "description": "Select the model",
+            "editor": "select",
+            "default": "gpt-4",
+            "enum": ["gpt-4", "gpt-3.5-turbo"]
+        },
+        "url": {
+            "title": "URL",
+            "type": "string",
+            "description": "Enter the URL of the website you want to scrape",
+            "editor": "textfield",
+            "default": "https://docs.apify.com/academy/deploying-your-code"
+        },
+        "user_prompt": {
+            "title": "User Prompt",
+            "type": "string",
+            "description": "What do you want the AI agent to scrape from the website?",
+            "editor": "textfield",
+            "default": "What is actorification?"
+        }
+    },
+    "required": ["openai_access_token"]
+}
diff --git a/advanced_tools_frameworks/web_scrapping_ai_agent/.gitignore b/advanced_tools_frameworks/web_scrapping_ai_agent/.gitignore
new file mode 100644
index 000000000..3e892b1be
--- /dev/null
+++ b/advanced_tools_frameworks/web_scrapping_ai_agent/.gitignore
@@ -0,0 +1,3 @@
+storage
+.venv
+**/.DS_Store
\ No newline at end of file
diff --git a/advanced_tools_frameworks/web_scrapping_ai_agent/README.md b/advanced_tools_frameworks/web_scrapping_ai_agent/README.md
index c73dacc43..2bcb20cb5 100644
--- a/advanced_tools_frameworks/web_scrapping_ai_agent/README.md
+++ b/advanced_tools_frameworks/web_scrapping_ai_agent/README.md
@@ -1,39 +1,114 @@
-## 💻 Web Scrapping AI Agent
-This Streamlit app allows you to scrape a website using OpenAI API and the scrapegraphai library. Simply provide your OpenAI API key, enter the URL of the website you want to scrape, and specify what you want the AI agent to extract from the website.
+# 💻 Web Scraping AI Agent
 
-### Features
-- Scrape any website by providing the URL
-- Utilize OpenAI's LLMs (GPT-3.5-turbo or GPT-4) for intelligent scraping
-- Customize the scraping task by specifying what you want the AI agent to extract
+This **Apify Actor** enables intelligent web scraping using the **OpenAI API** and the `scrapegraphai` library. You can scrape any website by simply providing its URL and specifying the data you want extracted, and you can run the Actor directly on the **Apify platform** for hassle-free scaling and management. 🚀
 
-### How to get Started?
+## 💡 Why Apify Actors Are Powerful
 
-1. Clone the GitHub repository
+Apify Actors provide an easy and efficient way to run your web scraping tasks at scale. They are fully managed, cloud-based containers designed for tasks like web scraping, automation, and data extraction. [Learn more about Apify Actors in the whitepaper here](https://whitepaper.actor/). 📖
-```bash
-git clone https://github.com/Shubhamsaboo/awesome-llm-apps.git
-cd awesome-llm-apps/advanced_tools_frameworks/web_scrapping_ai_agent
-```
-2. Install the required dependencies:
+---
 
-```bash
-pip install -r requirements.txt
-```
-3. Get your OpenAI API Key
+## 🌟 Features
 
-- Sign up for an [OpenAI account](https://platform.openai.com/) (or the LLM provider of your choice) and obtain your API key.
+- **Scrape any website** by providing the URL. 🌍
+- **Leverage OpenAI's LLMs** (GPT-3.5-turbo or GPT-4) for intelligent data extraction. 🤖💬
+- **Run as an Apify Actor** on the Apify platform for seamless deployment and scaling. ⚡
+- **Customize your scraping task** by providing specific user prompts. ✍️
 
-4. Run the Streamlit App
-```bash
-streamlit run ai_scrapper.py
-```
+---
 
-### How it Works?
+## 🔧 How to Get Started
 
-- The app prompts you to enter your OpenAI API key, which is used to authenticate and access the OpenAI language models.
+### 🅰️ Run as an Apify Actor
 
-- You can select the desired language model (GPT-3.5-turbo or GPT-4) for the scraping task.
+See the full guide in the [Apify Academy](https://docs.apify.com/academy/getting-started/actors). 📚
 
-- Enter the URL of the website you want to scrape in the provided text input field.
+This project is already set up as an **Apify Actor**, so you can easily deploy it on the Apify platform.
 
-- Specify what you want the AI agent to extract from the website by entering a user prompt.
+1. **Initialize the Apify Actor** (already done in the repository):
 
-- The app creates a SmartScraperGraph object using the provided URL, user prompt, and OpenAI configuration.
+   ```bash
+   apify init
+   ```
 
-- The SmartScraperGraph object scrapes the website and extracts the requested information using the specified language model.
+   This creates `.actor/actor.json` with the configuration and the necessary Dockerfile. 🛠️
 
-- The scraped results are displayed in the app for you to view
\ No newline at end of file
+2. **Refactor the Code** (already done):
+
+   The scraping logic has been moved into a standalone function that takes the inputs (URL, prompt, model choice) and returns the extracted output, which improves maintainability and makes input and output easier to handle in the Apify environment. 🔄
+
+3. **Build the Actor**:
+
+   [Learn more about building an Actor in the Apify Docs](https://docs.apify.com/academy/getting-started/creating-actors#build-an-actor). 🏗️
+
+4. **Run the Actor**:
+
+   [Learn how to run Actors in the Apify console](https://docs.apify.com/academy/getting-started/creating-actors#run-the-actor). 📚
+   This triggers the run on the Apify platform, and you'll receive logs detailing the results.
+
+---
+
+### 🅱️ Run Locally
+
+1. **Clone the GitHub Repository:**
+
+   ```bash
+   git clone https://github.com/Shubhamsaboo/awesome-llm-apps.git
+   cd awesome-llm-apps/advanced_tools_frameworks/web_scrapping_ai_agent
+   ```
+
+2. **Install Dependencies:**
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. **Get Your OpenAI API Key:**
+   - Sign up for an [OpenAI account](https://platform.openai.com/) (or another LLM provider) and obtain your API key. 🔑
+
+4. **Run the App:**
+
+   The app has been refactored to remove Streamlit; it now focuses on the core scraping logic. You can run the same entry point the Dockerfile uses as a Python module (or call the function yourself — see the sketch below):
+
+   ```bash
+   python -m src
+   ```
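+
+   If you would rather call the refactored function directly, here is a minimal sketch (the API key is a placeholder; the other values are the input-schema defaults):
+
+   ```python
+   from src.ai_scrapper_func import ai_scrapper_func
+
+   # Returns a dict with the extracted data.
+   result = ai_scrapper_func(
+       openai_access_token="sk-...",  # placeholder: your OpenAI API key
+       model="gpt-4",
+       url="https://docs.apify.com/academy/deploying-your-code",
+       user_prompt="What is actorification?",
+   )
+   print(result)
+   ```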
+
+---
+
+## 📦 Dockerized for Apify 🐳
+
+This project includes a **Dockerfile** optimized for Apify deployment, inspired by the official Apify templates:
+
+- **Multi-stage build** to keep the final Docker image as small as possible.
+- **Rust toolchain** installed in the build stage to compile the `minify-html` dependency of `scrapegraphai`.
+- **Playwright browsers** installed (`playwright install-deps && playwright install`) so headless Chromium is available for scraping.
+- **No Streamlit at runtime** — the Actor runs the scraping logic directly, without the Streamlit UI, reducing unnecessary overhead.
+- **Concurrency handling** — the Actor's entrypoint runs the blocking scraping call in a worker thread so the async event loop stays responsive.
+
+---
+
+## 🔍 How It Works
+
+1. When the Actor starts, you pass the necessary inputs to the **entrypoint function**. 🚪
+2. The **entrypoint function** maps the Apify input parameters to the scraping logic function and runs the blocking call off the event loop. 🔄
+3. The **scraping logic function** takes the inputs (URL, OpenAI model, user prompt) and returns the extracted data. 🧠
+4. The Actor pushes the result to an Apify dataset, where it is available for further processing. 📈
+
+---
+
+## 📖 Learn More
+
+- Want to understand why **Apify Actors** are a good fit for scalable web scraping? Check out the [Apify Whitepaper](https://whitepaper.actor/) for more insights. 📜
+
+---
+
+This **Web Scraping AI Agent** is well suited to AI-powered data extraction, whether you're conducting research, automating workflows, or gathering business intelligence. With Apify's platform, you can deploy and scale it with ease. 🚀
+
+---
+
+### Additional Notes for Apify Actor Development
+
+- **Code refactor**: The scraping logic is a standalone function that accepts inputs and returns outputs, making it reusable both locally and inside the Actor. 🛠️
+- **Concurrency**: The entrypoint function can serve multiple requests concurrently because the blocking scraper runs in a thread pool. 🔄
+- **No Streamlit**: The Streamlit UI has been removed from the Actor code; the focus is on the backend scraping logic, which suits Actor deployment. ❌
diff --git a/advanced_tools_frameworks/web_scrapping_ai_agent/requirements.txt b/advanced_tools_frameworks/web_scrapping_ai_agent/requirements.txt
index bc34f2b73..eb0c0505b 100644
--- a/advanced_tools_frameworks/web_scrapping_ai_agent/requirements.txt
+++ b/advanced_tools_frameworks/web_scrapping_ai_agent/requirements.txt
@@ -1,3 +1,4 @@
 streamlit
 scrapegraphai
-playwright
\ No newline at end of file
+playwright
+apify < 3.0
\ No newline at end of file
diff --git a/advanced_tools_frameworks/web_scrapping_ai_agent/src/__init__.py b/advanced_tools_frameworks/web_scrapping_ai_agent/src/__init__.py
new file mode 100644
index 000000000..26301deb7
--- /dev/null
+++ b/advanced_tools_frameworks/web_scrapping_ai_agent/src/__init__.py
@@ -0,0 +1 @@
+from .ai_scrapper_func import ai_scrapper_func
\ No newline at end of file
diff --git a/advanced_tools_frameworks/web_scrapping_ai_agent/src/__main__.py b/advanced_tools_frameworks/web_scrapping_ai_agent/src/__main__.py
new file mode 100644
index 000000000..8a1188300
--- /dev/null
+++ b/advanced_tools_frameworks/web_scrapping_ai_agent/src/__main__.py
@@ -0,0 +1,6 @@
+import asyncio
+
+from .main import main
+
+# Execute the Actor entry point.
+asyncio.run(main())
diff --git a/advanced_tools_frameworks/web_scrapping_ai_agent/src/ai_scrapper_func.py b/advanced_tools_frameworks/web_scrapping_ai_agent/src/ai_scrapper_func.py
new file mode 100644
index 000000000..ffaf72f6a
--- /dev/null
+++ b/advanced_tools_frameworks/web_scrapping_ai_agent/src/ai_scrapper_func.py
@@ -0,0 +1,43 @@
+# Import the required libraries
+from typing import Literal
+
+from scrapegraphai.graphs import SmartScraperGraph
+
+
+def ai_scrapper_func(
+    openai_access_token: str,
+    model: Literal["gpt-3.5-turbo", "gpt-4"],
+    url: str,
+    user_prompt: str,
+) -> dict:
+    """Scrape `url` with scrapegraphai and return the data requested in `user_prompt`."""
+    # Configure the OpenAI LLM that SmartScraperGraph will use for extraction.
+    graph_config = {
+        "llm": {
+            "api_key": openai_access_token,
+            "model": f"openai/{model}",
+        },
+    }
+
+    # Create a SmartScraperGraph object for the given page and prompt.
+    smart_scraper_graph = SmartScraperGraph(
+        prompt=user_prompt,
+        source=url,
+        config=graph_config,
+    )
+
+    # Scrape the website and return the extracted result as a dict.
+    return smart_scraper_graph.run()
diff --git a/advanced_tools_frameworks/web_scrapping_ai_agent/src/main.py b/advanced_tools_frameworks/web_scrapping_ai_agent/src/main.py
new file mode 100644
index 000000000..9a58711cc
--- /dev/null
+++ b/advanced_tools_frameworks/web_scrapping_ai_agent/src/main.py
@@ -0,0 +1,45 @@
+"""This module defines the main entry point for the Apify Actor.
+
+Feel free to modify this file to suit your specific needs.
+
+To build Apify Actors, use the Apify SDK toolkit; read more in the official documentation:
+https://docs.apify.com/sdk/python
+Also, see more about why Actors are cool and easy to use here:
+https://whitepaper.actor/
+"""
+
+import asyncio
+
+from apify import Actor
+
+from .ai_scrapper_func import ai_scrapper_func
+
+
+async def run_scraper_in_thread(kwargs: dict) -> dict:
+    """Run the blocking scraper in a worker thread so the event loop stays responsive."""
+    return await asyncio.to_thread(ai_scrapper_func, **kwargs)
+
+
+async def main() -> None:
+    """Main entry point for the Apify Actor.
+
+    This coroutine is executed using `asyncio.run()`, so it must remain an asynchronous function.
+    Asynchronous execution is required for communication with the Apify platform, and it also
+    significantly improves performance for web scraping workloads.
+    """
+    async with Actor:
+        # Read the Actor input (API key, model, URL, prompt) defined by the input schema.
+        actor_input = await Actor.get_input() or {}
+
+        Actor.log.info('Running the AI scraper...')
+        Actor.log.info(f'URL: {actor_input.get("url")}')
+        Actor.log.info(f'Prompt: {actor_input.get("user_prompt")}')
+
+        # SmartScraperGraph is synchronous, so run it off the event loop.
+        result = await run_scraper_in_thread(actor_input)
+
+        Actor.log.info(f'Result: {result.get("content")}')
+        # Store the extracted data in the Actor's default dataset.
+        await Actor.push_data(result)
+
+        await Actor.exit(status_message='Actor finished successfully!')
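+
+# Usage note (a sketch, not executed by the Actor): when running locally with the Apify SDK,
+# the Actor reads its input from storage/key_value_stores/default/INPUT.json, so you can
+# exercise this entry point with `apify run` or `python -m src` from the project root.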