diff --git a/.github/workflows/azure-function_webcatgpt.yml b/.github/workflows/azure-function_webcatgpt.yml
new file mode 100644
index 0000000..4f1e1c3
--- /dev/null
+++ b/.github/workflows/azure-function_webcatgpt.yml
@@ -0,0 +1,70 @@
+name: Build and deploy Python project to Azure Function App - webcatgpt
+
+on:
+  push:
+    branches:
+      - azure-function
+  workflow_dispatch:
+
+env:
+  AZURE_FUNCTIONAPP_PACKAGE_PATH: 'src' # path to the Function App project within the repository
+  PYTHON_VERSION: '3.11'                # Python version used for the build
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Python version
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      # venv activation only persists within a single run step, so the
+      # dependency install happens in the same step as the venv setup.
+      - name: Create virtual environment and install dependencies
+        run: |
+          python -m venv venv
+          source venv/bin/activate
+          pip install -r ${{ env.AZURE_FUNCTIONAPP_PACKAGE_PATH }}/requirements.txt
+
+      # Optional: Add step to run tests here
+
+      - name: Zip artifact for deployment
+        run: |
+          cd ${{ env.AZURE_FUNCTIONAPP_PACKAGE_PATH }}
+          zip -r ../release.zip ./*
+
+      - name: Upload artifact for deployment job
+        uses: actions/upload-artifact@v3
+        with:
+          name: python-app
+          path: release.zip
+
+  deploy:
+    runs-on: ubuntu-latest
+    needs: build
+    steps:
+      - name: Azure Login
+        uses: azure/login@v1
+        with:
+          creds: ${{ secrets.AZURE_CREDENTIALS }}
+
+      - name: Download artifact from build job
+        uses: actions/download-artifact@v3
+        with:
+          name: python-app
+
+      # The zip package is deployed as-is; Oryx builds it remotely, so no unzip step is needed
+      - name: 'Deploy to Azure Functions'
+        uses: Azure/functions-action@v1
+        id: deploy-to-function
+        with:
+          app-name: 'webcatgpt'
+          slot-name: 'Production'
+          package: './release.zip'
+          publish-profile: ${{ secrets.AZUREAPPSERVICE_PUBLISHPROFILE_A0CFD2E02C29451F9ABF27C189B5721B }}
+          scm-do-build-during-deployment: true
+          enable-oryx-build: true
diff --git a/.vscode/extensions.json b/.vscode/extensions.json
new file mode 100644
index 0000000..dde673d
--- /dev/null
+++ b/.vscode/extensions.json
@@ -0,0 +1,5 @@
+{
+    "recommendations": [
+        "ms-azuretools.vscode-azurefunctions"
+    ]
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 131860e..e8d1360 100644
--- a/README.md
+++ b/README.md
@@ -2,29 +2,43 @@
 
 ## Introduction
 
-Web Cat is a Python-based API designed to facilitate the integration of website content with ChatGPT via a custom GPT. The parses a website's content and then seamlessly integrates these insights into your chat, enhancing the user experience with dynamic, context-aware interactions.
+Web Cat is a serverless Python-based API hosted on Azure Functions, designed to scrape and process website content responsibly. Leveraging the readability library and BeautifulSoup, Web Cat extracts the main body of text from web pages, making it easy to integrate website content with ChatGPT through Custom GPTs. This API respects robots.txt rules to ensure ethical web scraping practices.
 
-I find it very useful when I am ideating on a concept and I want to pull in additional info without just a copy and paste of the contents into the chat.
+Using the `@Web Cat` GPT enhances ideation by seamlessly integrating web content into conversations, eliminating the need for manual copy-pasting.
+
+## Features
+
+- **Ethical Web Scraping**: Checks robots.txt to ensure scraping is allowed.
+- **Content Extraction**: Utilizes the readability library for clean text extraction.
+- **Text Processing**: Further processes extracted content for improved usability.
 
 ## Getting Started
 
 ### Prerequisites
 
+- Azure Functions Core Tools
 - Python 3.8 or later
-- Flask
+- An Azure account and subscription
 
-### Running the API
+## Local Development
 
-1. To start the Flask server locally:
-
-   a. `cd app`
-
-   b. `python3 app.py`
+Prepare your local environment by running:
+
+```bash
+cd src
+pip install -r requirements.txt
+func start
+```
 
-## Examples
+## Limitations and Considerations
+
+- **Adherence to `robots.txt`**: This API will not scrape content from URLs disallowed by their robots.txt.
+- **Text-Based Content**: The API is optimized for text content and may not accurately represent multimedia or dynamic web content.
 
-Here's a quick example of how to use the API:
-
-Calling ping to check that the service is up:
-
-`curl -X POST -H "Content-Type: application/json" -d '{"url": "https://www.iana.org/help/example-domains", "output_format": "TEXT"}' http://localhost:4000/scrape`
+## Usage
+
+Here's a quick example of how to test the API locally. With the function host running (see Local Development above), call the endpoint from a second terminal:
+
+```bash
+curl -X POST http://localhost:7071/api/scrape \
+  -H "Content-Type: application/json" \
+  -d '{"url": "https://example.com"}'
+```
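The curl call in the README's Usage section has a direct Python equivalent. A minimal sketch for reference, assuming the Functions host is running locally on its default port 7071 (`https://example.com` is a placeholder target):

```python
import requests

# POST a target URL to the locally running function.
# The endpoint replies with the extracted text as text/plain.
resp = requests.post(
    "http://localhost:7071/api/scrape",
    json={"url": "https://example.com"},  # placeholder URL
    timeout=30,
)

print(resp.status_code)  # 200 on success; 400, 403, or 500 on the error paths
print(resp.text)         # main-body text extracted by readability
```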
diff --git a/app/app.py b/app/app.py
deleted file mode 100644
index 76dccac..0000000
--- a/app/app.py
+++ /dev/null
@@ -1,73 +0,0 @@
-from flask import Flask, request, jsonify, Response
-
-import requests
-from readability.readability import Document
-
-import random
-from urllib.robotparser import RobotFileParser
-
-from datetime import datetime
-from bs4 import BeautifulSoup
-
-app = Flask(__name__)
-
-@app.route('/ping', methods=['GET'])
-def ping():
-    now = datetime.now()
-    date_time_str = now.strftime("%Y-%m-%d %H:%M:%S")
-    return date_time_str
-
-USER_AGENTS = [
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
-]
-
-def can_fetch(url):
-    rp = RobotFileParser()
-    rp.set_url(request.url_root + 'robots.txt')
-    rp.read()
-    return rp.can_fetch("*", url)
-
-@app.route('/scrape', methods=['POST'])
-def scrape():
-    data = request.get_json()
-    url = data.get('url')
-    output_format = data.get('output_format', 'JSON').upper() # Defaults to JSON
-
-    if not url:
-        error_message = "Error: Missing URL"
-        if output_format == 'JSON':
-            return jsonify({'error': error_message}), 400
-        else:
-            return Response(error_message, status=400, mimetype='text/plain')
-
-    if not can_fetch(url):
-        error_message = "Error: Access denied by robots.txt"
-        if output_format == 'JSON':
-            return jsonify({'error': error_message}), 403
-        else:
-            return Response(error_message, status=403, mimetype='text/plain')
-
-    headers = {'User-Agent': random.choice(USER_AGENTS)}
-    try:
-        response = requests.get(url, headers=headers)
-        doc = Document(response.content)
-        summary_html = doc.summary(html_partial=True)
-        soup = BeautifulSoup(summary_html, 'html.parser') # Parse the HTML
-        content = soup.get_text(separator='\n') # Extract text
-    except Exception as e:
-        error_message = f"Error: Failed to scrape the URL - {str(e)}"
-        if output_format == 'JSON':
-            return jsonify({'error': error_message}), 500
-        else:
-            return Response(error_message, status=500, mimetype='text/plain')
-
-    # Return content based on the requested output format
-    if output_format == 'JSON':
-        return jsonify({'content': content})
-    else:
-        return Response(content, mimetype='text/plain')
-
-if __name__ == '__main__':
-    app.run(debug=True, host='0.0.0.0', port=4000)
-
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index fd5482a..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-Flask
-gunicorn
-requests
-beautifulsoup4
-readability-lxml
diff --git a/service/webcat.service b/service/webcat.service
deleted file mode 100644
index fc2836b..0000000
--- a/service/webcat.service
+++ /dev/null
@@ -1,13 +0,0 @@
-[Unit]
-Description=WebCat
-After=network.target
-
-[Service]
-ExecStart=/usr/local/bin/gunicorn -w 4 -b 0.0.0.0:4000 app:app
-WorkingDirectory=/opt/aibuddy-webcat
-Restart=always
-User=webcat
-Group=webcat
-
-[Install]
-WantedBy=multi-user.target
diff --git a/src/function_app.py b/src/function_app.py
new file mode 100644
index 0000000..b176cc3
--- /dev/null
+++ b/src/function_app.py
@@ -0,0 +1,50 @@
+import azure.functions as func
+import logging
+from readability.readability import Document
+import requests
+from bs4 import BeautifulSoup
+import random
+from urllib.robotparser import RobotFileParser
+from urllib.parse import urlparse
+
+app = func.FunctionApp()
+
+USER_AGENTS = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
+]
+
+def can_fetch(url, user_agent):
+    # Build the site's robots.txt URL from the target URL, then ask the
+    # parser whether the given user agent may fetch the page.
+    parsed_url = urlparse(url)
+    robots_url = f'{parsed_url.scheme}://{parsed_url.netloc}/robots.txt'
+    rp = RobotFileParser()
+    rp.set_url(robots_url)
+    rp.read()
+    return rp.can_fetch(user_agent, url)
+
+@app.route(route="scrape", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS)
+def scrape(req: func.HttpRequest) -> func.HttpResponse:
+    logging.info('Python HTTP trigger function processed a request.')
+
+    try:
+        data = req.get_json()
+        url = data.get('url')
+
+        if not url:
+            return func.HttpResponse("Error: Missing URL", status_code=400)
+
+        # Use the same user agent for the robots.txt check and the fetch,
+        # so the permission checked is the permission used.
+        user_agent = random.choice(USER_AGENTS)
+
+        if not can_fetch(url, user_agent):
+            return func.HttpResponse("Error: Access denied by robots.txt", status_code=403)
+
+        response = requests.get(url, headers={'User-Agent': user_agent}, timeout=10)
+        response.raise_for_status()
+
+        # Extract the main article body, then flatten it to plain text.
+        doc = Document(response.content)
+        summary_html = doc.summary(html_partial=True)
+        soup = BeautifulSoup(summary_html, 'html.parser')
+        content = soup.get_text(separator='\n').strip()
+
+        return func.HttpResponse(content, mimetype="text/plain")
+    except Exception as e:
+        logging.error(f"Error: {str(e)}")
+        return func.HttpResponse(f"Error: Failed to scrape the URL - {str(e)}", status_code=500)
diff --git a/src/host.json b/src/host.json
new file mode 100644
index 0000000..9df9136
--- /dev/null
+++ b/src/host.json
@@ -0,0 +1,15 @@
+{
+    "version": "2.0",
+    "logging": {
+        "applicationInsights": {
+            "samplingSettings": {
+                "isEnabled": true,
+                "excludedTypes": "Request"
+            }
+        }
+    },
+    "extensionBundle": {
+        "id": "Microsoft.Azure.Functions.ExtensionBundle",
+        "version": "[4.*, 5.0.0)"
+    }
+}
\ No newline at end of file
diff --git a/src/local.settings.json b/src/local.settings.json
new file mode 100644
index 0000000..67043d7
--- /dev/null
+++ b/src/local.settings.json
@@ -0,0 +1,8 @@
+{
+  "IsEncrypted": false,
+  "Values": {
+    "FUNCTIONS_WORKER_RUNTIME": "python",
+    "AzureWebJobsFeatureFlags": "EnableWorkerIndexing",
+    "AzureWebJobsStorage": ""
+  }
+}
\ No newline at end of file
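Because the v2 programming model's `@app.route` decorator wraps `scrape`, the handler can also be exercised without starting the Functions host by unwrapping it with `.build().get_user_function()`. A minimal test sketch, assuming it runs with `src/` on the import path and the packages from `src/requirements.txt` installed; note it performs real network requests, and `https://example.com` is a placeholder target:

```python
import json

import azure.functions as func

from function_app import scrape  # requires src/ on the import path

# Hand-build the HttpRequest the Functions host would normally deliver.
req = func.HttpRequest(
    method="POST",
    url="/api/scrape",
    body=json.dumps({"url": "https://example.com"}).encode("utf-8"),
)

# Unwrap the decorated function and invoke it directly.
handler = scrape.build().get_user_function()
resp = handler(req)

# 200 if scraping succeeded, 403 if robots.txt disallowed the fetch.
print(resp.status_code)
print(resp.get_body().decode("utf-8"))
```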
diff --git a/src/requirements.txt b/src/requirements.txt
new file mode 100644
index 0000000..bb960b0
--- /dev/null
+++ b/src/requirements.txt
@@ -0,0 +1,8 @@
+# Do not include azure-functions-worker in this file
+# The Python Worker is managed by the Azure Functions platform
+# Manually managing azure-functions-worker may cause unexpected issues
+
+azure-functions
+requests
+beautifulsoup4
+readability-lxml
\ No newline at end of file
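Once the workflow has deployed the package, the same request can target the hosted endpoint. A sketch, assuming the app keeps the default `<app-name>.azurewebsites.net` hostname; since the route is declared with `auth_level=func.AuthLevel.ANONYMOUS`, no function key is required:

```python
import requests

# Smoke-test the deployed function (hostname assumed from the app name).
resp = requests.post(
    "https://webcatgpt.azurewebsites.net/api/scrape",
    json={"url": "https://example.com"},  # placeholder URL
    timeout=30,
)

print(resp.status_code, resp.text[:200])
```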