Skip to content

Commit 7a952b4

Browse files
authored
Merge pull request #612 from holygrolli/feat-link-db
use external scraped links db
2 parents ff5e99d + fd36fda commit 7a952b4

File tree

5 files changed

+258
-3
lines changed

5 files changed

+258
-3
lines changed

.github/workflows/lecasino.yaml

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ jobs:
1919
download:
2020
runs-on: ubuntu-24.04
2121
container:
22-
image: ghcr.io/holygrolli/whatsupforlunch:sha-979e9b6-2024-08-08
22+
image: whatsupforlunch:latest
2323
outputs:
2424
matrix: ${{ steps.setmatrix.outputs.matrix }}
2525
steps:
@@ -30,6 +30,12 @@ jobs:
3030
${{
3131
github.event_name == 'schedule' && 'main' || ''
3232
}}
33+
- name: Configure AWS credentials
34+
uses: aws-actions/configure-aws-credentials@v4
35+
with:
36+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
37+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
38+
aws-region: eu-central-1
3339
- name: get new links
3440
shell: bash
3541
run: |
@@ -101,6 +107,23 @@ jobs:
101107
mkdir -p ../success-htmls
102108
cp ./chatgpt_user.txt ../success-htmls/${{ matrix.files }}
103109
cp final.json ../success-htmls/$(date -d"$(jq -r 'keys[] | select(test("\\d{4}-\\d{2}-\\d{2}"))' final.json | sort | head -n 1)" +%Y-%V).json
110+
- name: Mark link as processed in DynamoDB
111+
shell: bash
112+
env:
113+
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
114+
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
115+
AWS_DEFAULT_REGION: eu-central-1
116+
run: |
117+
# Extract the original URL from the filename
118+
# The filename format is: cleaned_lecasino_https_www.l.de_..._.html
119+
filename="${{ matrix.files }}"
120+
# Remove the "cleaned_lecasino" prefix and ".html" suffix
121+
url_encoded="${filename#cleaned_lecasino}"
122+
url_encoded="${url_encoded%.html}"
123+
# Convert back to URL format
124+
url=$(echo "$url_encoded" | sed 's/_https_/https:\/\//' | sed 's/_http_/http:\/\//' | sed 's/_/\//g')
125+
echo "Marking link as processed: $url"
126+
python locations/.shared/mark_link_processed.py "$url" || echo "Warning: Failed to mark link as processed"
104127
- name: Archive final files
105128
uses: actions/upload-artifact@v4
106129
with:

docker/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ FROM python:3.10-slim
22

33
RUN apt-get update && \
44
apt-get install -y poppler-utils curl jq git
5-
RUN pip install amazon-textract-textractor[pdf,pandas] openai scrapy htmlmin lxml[html_clean]
5+
RUN pip install amazon-textract-textractor[pdf,pandas] openai scrapy htmlmin lxml lxml_html_clean boto3
66
ENV NVM_DIR /usr/local/nvm
77
ENV NODE_VERSION lts/iron
88
RUN mkdir -p /usr/local/nvm && apt-get update && echo "y" | apt-get install curl
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
"""
2+
DynamoDB Link Validator Helper
3+
4+
This module provides utilities to check and store scraped links in AWS DynamoDB
5+
with TTL-based expiration to avoid reprocessing the same content.
6+
"""
7+
8+
from datetime import datetime, timedelta
from typing import Optional

import boto3
from boto3.dynamodb.conditions import Key
from botocore.exceptions import ClientError
12+
13+
14+
class LinkValidator:
    """
    Helper class to validate and store links in DynamoDB with TTL support.

    Table schema:
    - Partition key: link (String)
    - Sort key: timestamp (Number) - used as TTL attribute
    """

    def __init__(
        self,
        table_name: str = "lunchdeal",
        region_name: str = "eu-central-1",
        aws_account_id: str = "840940990295"
    ):
        """
        Initialize the LinkValidator with DynamoDB connection.

        Args:
            table_name: Name of the DynamoDB table
            region_name: AWS region
            aws_account_id: AWS account ID (kept for reference only; it is
                not used to build the connection)
        """
        self.table_name = table_name
        self.region_name = region_name
        self.aws_account_id = aws_account_id

        # Create the resource eagerly so credential/region problems surface
        # at construction time rather than on the first query.
        self.dynamodb = boto3.resource('dynamodb', region_name=region_name)
        self.table = self.dynamodb.Table(table_name)

    def link_exists(self, link: str) -> bool:
        """
        Check if a link already exists in DynamoDB and has not expired.

        An item counts as "not expired" when its `timestamp` sort key (the
        TTL attribute) is still in the future.

        Args:
            link: The URL to check

        Returns:
            True if the link exists and has not expired, False otherwise
        """
        try:
            current_timestamp = int(datetime.now().timestamp())

            # Query for the link with timestamp >= current time.
            # `Key` must be imported from boto3.dynamodb.conditions at module
            # level: accessing it as `boto3.dynamodb.conditions.Key` after a
            # bare `import boto3` only works once the dynamodb submodule has
            # been loaded as a side effect, which is fragile.
            response = self.table.query(
                KeyConditionExpression=(
                    Key('link').eq(link)
                    & Key('timestamp').gte(current_timestamp)
                ),
                Limit=1
            )

            # Any returned item means the link exists and is not expired.
            return bool(response.get('Items'))

        except ClientError as e:
            print(f"Error checking link existence: {e}")
            # On error, assume link doesn't exist to avoid skipping content
            return False

    def add_link(self, link: str, ttl_weeks: int = 8) -> bool:
        """
        Add a link to DynamoDB with a TTL timestamp.

        Args:
            link: The URL to store
            ttl_weeks: Number of weeks until the link expires (default: 8)

        Returns:
            True if successfully added, False otherwise
        """
        try:
            current_time = datetime.now()
            expiry_time = current_time + timedelta(weeks=ttl_weeks)
            timestamp = int(expiry_time.timestamp())

            # Store human-readable created/expiry times alongside the numeric
            # TTL so the table is easy to inspect manually.
            self.table.put_item(
                Item={
                    'link': link,
                    'timestamp': timestamp,
                    'created_at': current_time.isoformat(),
                    'expires_at': expiry_time.isoformat()
                }
            )

            print(f"Successfully added link: {link} (expires: {expiry_time.isoformat()})")
            return True

        except ClientError as e:
            print(f"Error adding link: {e}")
            return False

    def mark_link_processed(self, link: str, ttl_weeks: int = 8) -> bool:
        """
        Convenience method to mark a link as processed (alias for add_link).

        Args:
            link: The URL to mark as processed
            ttl_weeks: Number of weeks until the link expires (default: 8)

        Returns:
            True if successfully marked, False otherwise
        """
        return self.add_link(link, ttl_weeks)
118+
119+
120+
def get_validator() -> LinkValidator:
    """Build a LinkValidator wired to the default ``lunchdeal`` table.

    Returns:
        A LinkValidator instance using the module's default settings.
    """
    validator = LinkValidator()
    return validator
128+
129+
130+
# Standalone functions for easier imports
131+
def check_link(link: str) -> bool:
    """Module-level convenience wrapper around ``LinkValidator.link_exists``.

    Args:
        link: The URL to check

    Returns:
        True if link exists and has not expired, False otherwise
    """
    return get_validator().link_exists(link)
143+
144+
145+
def mark_processed(link: str, ttl_weeks: int = 8) -> bool:
    """Module-level convenience wrapper around ``LinkValidator.add_link``.

    Args:
        link: The URL to mark as processed
        ttl_weeks: Number of weeks until expiration (default: 8)

    Returns:
        True if successfully marked, False otherwise
    """
    return get_validator().add_link(link, ttl_weeks)
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Standalone script to mark a link as processed in DynamoDB.
4+
5+
This script is designed to be called from CI/CD workflows after
6+
successfully processing a scraped link.
7+
8+
Usage:
9+
python mark_link_processed.py <url>
10+
python mark_link_processed.py <url> --ttl-weeks 8
11+
12+
Examples:
13+
python mark_link_processed.py "https://www.l.de/example/page"
14+
python mark_link_processed.py "https://www.l.de/example/page" --ttl-weeks 4
15+
"""
16+
17+
import sys
18+
import argparse
19+
from dynamodb_link_validator import mark_processed
20+
21+
22+
def main():
    """Parse the command line and record the given URL in DynamoDB.

    Exits 0 on success, 1 on failure or on any unexpected error; all
    failure messages go to stderr.
    """
    parser = argparse.ArgumentParser(
        description='Mark a scraped link as processed in DynamoDB'
    )
    parser.add_argument(
        'url',
        help='The URL to mark as processed'
    )
    parser.add_argument(
        '--ttl-weeks',
        type=int,
        default=8,
        help='Number of weeks until the link expires (default: 8)'
    )
    args = parser.parse_args()

    try:
        ok = mark_processed(args.url, ttl_weeks=args.ttl_weeks)
    except Exception as e:
        print(f"✗ Error: {e}", file=sys.stderr)
        sys.exit(1)

    if ok:
        print(f"✓ Successfully marked link as processed: {args.url}")
        sys.exit(0)

    print(f"✗ Failed to mark link as processed: {args.url}", file=sys.stderr)
    sys.exit(1)


if __name__ == '__main__':
    main()

locations/lecasino/scra.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
import scrapy
22
import os
3+
import sys
34
from pathlib import Path
45
import lxml.html as lxml_html
56

7+
# Add parent directory to path to import shared modules
8+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / '.shared'))
9+
from dynamodb_link_validator import LinkValidator
10+
611
class Lecasino(scrapy.Spider):
712
name = "lecasino"
813
allowed_domains = ["www.l.de"]
@@ -12,6 +17,17 @@ class Lecasino(scrapy.Spider):
1217
menu_pages_html = None
1318
# base directory to save fetched HTML pages
1419
artifacts_dir = Path(__file__).resolve().parents[2] / 'tmp'
20+
21+
def __init__(self, *args, **kwargs):
    """Create the spider and (best-effort) enable DynamoDB link deduplication.

    If the validator cannot be constructed (e.g. missing boto3 or AWS
    credentials), the spider still runs but processes every discovered link.
    """
    # Modern zero-argument super() — the file already uses f-strings, so
    # Python 3 is guaranteed.
    super().__init__(*args, **kwargs)
    try:
        self.link_validator = LinkValidator()
        self.use_link_validation = True
        # Use the spider's logger rather than print so output goes through
        # Scrapy's logging configuration.
        self.logger.info("DynamoDB link validation enabled")
    except Exception as e:
        # Best-effort fallback: keep the attribute defined so callers can
        # safely check it, and continue without deduplication.
        self.link_validator = None
        self.use_link_validation = False
        self.logger.warning(f"Warning: Could not initialize link validator: {e}")
1531

1632
def parse(self, response):
1733
print(response.body)
@@ -22,7 +38,13 @@ def parse(self, response):
2238
for sel in links:
2339
link = sel.get()
2440
full = link if link.startswith('http') else "https://www.l.de" + link
25-
print(full)
41+
42+
# Check if link already exists in DynamoDB
43+
if self.use_link_validation and self.link_validator.link_exists(full):
44+
print(f"Skipping already processed link: {full}")
45+
continue
46+
47+
print(f"Processing new link: {full}")
2648
# schedule a request to fetch the menu page and process it
2749
yield scrapy.Request(url=full, callback=self.parse_menu_page, cb_kwargs={"source_url": full})
2850
else:

0 commit comments

Comments
 (0)