Merged
70 changes: 70 additions & 0 deletions .github/workflows/azure-function_webcatgpt.yml
@@ -0,0 +1,70 @@
name: Build and deploy Python project to Azure Function App - webcatgpt

on:
push:
branches:
- azure-function
workflow_dispatch:

env:
AZURE_FUNCTIONAPP_PACKAGE_PATH: 'src' # set this to the path to your web app project, defaults to the repository root
PYTHON_VERSION: '3.11' # set this to the python version to use

jobs:
build:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Setup Python version
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}

- name: Create and start virtual environment
run: |
python -m venv venv
source venv/bin/activate

- name: Install dependencies
run: pip install -r ${{ env.AZURE_FUNCTIONAPP_PACKAGE_PATH }}/requirements.txt

# Optional: Add step to run tests here

- name: Zip artifact for deployment
run: |
cd ${{ env.AZURE_FUNCTIONAPP_PACKAGE_PATH }}
zip -r ../release.zip ./*

- name: Upload artifact for deployment job
uses: actions/upload-artifact@v3
with:
name: python-app
path: release.zip

deploy:
runs-on: ubuntu-latest
needs: build
steps:
- name: Azure Login
uses: azure/login@v1
with:
creds: ${{ secrets.AZURE_CREDENTIALS }}

- name: Download artifact from build job
uses: actions/download-artifact@v3
with:
name: python-app

# Removed unzip step to deploy zip file directly
- name: 'Deploy to Azure Functions'
uses: Azure/functions-action@v1
id: deploy-to-function
with:
app-name: 'webcatgpt'
slot-name: 'Production'
package: './release.zip' # Adjusted to deploy the content of src directly
publish-profile: ${{ secrets.AZUREAPPSERVICE_PUBLISHPROFILE_A0CFD2E02C29451F9ABF27C189B5721B }}
scm-do-build-during-deployment: true
enable-oryx-build: true
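The build job's packaging step can be reproduced locally to inspect exactly what gets uploaded. A minimal sketch with placeholder contents — the workflow uses the `zip` CLI; `python3 -m zipfile` (stdlib) is used here as a portable stand-in:

```bash
# Reproduce the "Zip artifact for deployment" step with a throwaway
# src/ layout. The requirements.txt contents are placeholders.
set -e
mkdir -p src
printf 'azure-functions\nrequests\n' > src/requirements.txt
(cd src && python3 -m zipfile -c ../release.zip ./*)
# List the archive to verify what the deploy job would receive.
python3 -m zipfile -l release.zip
```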
5 changes: 5 additions & 0 deletions .vscode/extensions.json
@@ -0,0 +1,5 @@
{
"recommendations": [
"ms-azuretools.vscode-azurefunctions"
]
}
38 changes: 26 additions & 12 deletions README.md
@@ -2,29 +2,43 @@

## Introduction

Web Cat is a Python-based API designed to facilitate the integration of website content with ChatGPT via a custom GPT. The parses a website's content and then seamlessly integrates these insights into your chat, enhancing the user experience with dynamic, context-aware interactions.
Web Cat is a serverless Python API hosted on Azure Functions, designed to scrape and process website content responsibly. Leveraging the readability library and BeautifulSoup, Web Cat extracts the main body of text from web pages, making it easy to integrate website content with ChatGPT through Custom GPTs. The API respects robots.txt rules to ensure ethical web scraping practices.

I find it very useful when I am ideating on a concept and I want to pull in additional info without just a copy and paste of the contents into the chat.
Using the `@Web Cat` GPT enhances ideation by seamlessly integrating web content into conversations, eliminating the need for manual copy-pasting.

## Features
- **Ethical Web Scraping**: Checks robots.txt to ensure scraping is allowed.
- **Content Extraction**: Utilizes the readability library for clean text extraction.
- **Text Processing**: Further processes extracted content for improved usability.

## Getting Started

### Prerequisites

- Azure Functions Core Tools
- Python 3.11 or later
- An Azure account and subscription

### Running the API
## Local Development

1. To start the Flask server locally:

a. `cd app`
Prepare your local environment by running:

b. `python3 app.py`
```bash
cd src
pip install -r requirements.txt
func start
```

## Examples
## Limitations and Considerations
- **Adherence to `robots.txt`**: This API will not scrape content from URLs disallowed by their robots.txt.
- **Text-Based Content**: The API is optimized for text content and may not accurately represent multimedia or dynamic web content.

Here's a quick example of how to use the API:
## Usage

Calling ping to check that the service is up:
Here's a quick example of how to test the API locally:

`curl -X POST -H "Content-Type: application/json" -d '{"url": "https://www.iana.org/help/example-domains", "output_format": "TEXT"}' http://localhost:4000/scrape`
```bash
cd src
func start
curl -X POST http://localhost:7071/api/scrape -H "Content-Type: application/json" -d "{\"url\":\"https://example.com\"}"
```
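The robots.txt gate listed under Features can be sketched with the stdlib parser the function uses. The rules below are illustrative, not taken from any real site:

```python
# Sketch of the robots.txt check applied before fetching a page.
# urllib.robotparser is stdlib; the rules here are made up for illustration.
from urllib.robotparser import RobotFileParser

rules = [
    "User-agent: *",
    "Disallow: /private/",
]
rp = RobotFileParser()
rp.parse(rules)  # the deployed function uses rp.read() on the site's real robots.txt

print(rp.can_fetch("*", "https://example.com/private/page"))  # → False
print(rp.can_fetch("*", "https://example.com/public/page"))   # → True
```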
73 changes: 0 additions & 73 deletions app/app.py

This file was deleted.

5 changes: 0 additions & 5 deletions requirements.txt

This file was deleted.

13 changes: 0 additions & 13 deletions service/webcat.service

This file was deleted.

50 changes: 50 additions & 0 deletions src/function_app.py
@@ -0,0 +1,50 @@
import azure.functions as func
import logging
from readability.readability import Document
import requests
from bs4 import BeautifulSoup
import random
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse

app = func.FunctionApp()

USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
]

def can_fetch(url):
parsed_url = urlparse(url)
url_root = f'{parsed_url.scheme}://{parsed_url.netloc}/'
robots_url = f'{url_root}robots.txt'
rp = RobotFileParser()
rp.set_url(robots_url)
rp.read()
return rp.can_fetch(USER_AGENTS[0], url)

@app.route(route="scrape", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS)
def scrape(req: func.HttpRequest) -> func.HttpResponse:
logging.info('Python HTTP trigger function processed a request.')

try:
data = req.get_json()
url = data.get('url')

if not url:
return func.HttpResponse("Error: Missing URL", status_code=400)

if not can_fetch(url):
return func.HttpResponse("Error: Access denied by robots.txt", status_code=403)

headers = {'User-Agent': random.choice(USER_AGENTS)}
response = requests.get(url, headers=headers, timeout=10)  # timeout guards against hanging fetches
doc = Document(response.content)
summary_html = doc.summary(html_partial=True)
soup = BeautifulSoup(summary_html, 'html.parser')
content = soup.get_text(separator='\n').strip()  # strip leading/trailing whitespace

return func.HttpResponse(content, mimetype="text/plain")
except Exception as e:
logging.error(f"Error: {str(e)}")
return func.HttpResponse(f"Error: Failed to scrape the URL - {str(e)}", status_code=500)
15 changes: 15 additions & 0 deletions src/host.json
@@ -0,0 +1,15 @@
{
"version": "2.0",
"logging": {
"applicationInsights": {
"samplingSettings": {
"isEnabled": true,
"excludedTypes": "Request"
}
}
},
"extensionBundle": {
"id": "Microsoft.Azure.Functions.ExtensionBundle",
"version": "[4.*, 5.0.0)"
}
}
8 changes: 8 additions & 0 deletions src/local.settings.json
@@ -0,0 +1,8 @@
{
"IsEncrypted": false,
"Values": {
"FUNCTIONS_WORKER_RUNTIME": "python",
"AzureWebJobsFeatureFlags": "EnableWorkerIndexing",
"AzureWebJobsStorage": ""
}
}
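`AzureWebJobsStorage` is left empty above, which is fine for this HTTP-only function. If a storage-backed trigger is ever added, local runs typically point it at the Azurite emulator; a hedged variant, assuming Azurite is running locally:

```json
{
  "IsEncrypted": false,
  "Values": {
    "FUNCTIONS_WORKER_RUNTIME": "python",
    "AzureWebJobsFeatureFlags": "EnableWorkerIndexing",
    "AzureWebJobsStorage": "UseDevelopmentStorage=true"
  }
}
```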
8 changes: 8 additions & 0 deletions src/requirements.txt
@@ -0,0 +1,8 @@
# Do not include azure-functions-worker in this file
# The Python Worker is managed by the Azure Functions platform
# Manually managing azure-functions-worker may cause unexpected issues

azure-functions
requests
beautifulsoup4
readability-lxml