Skip to content

Commit 7a952b4

Browse files
authored
Merge pull request #612 from holygrolli/feat-link-db
use external scraped links db
2 parents ff5e99d + fd36fda commit 7a952b4

File tree

5 files changed

+258
-3
lines changed

5 files changed

+258
-3
lines changed

.github/workflows/lecasino.yaml

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ jobs:
1919
download:
2020
runs-on: ubuntu-24.04
2121
container:
22-
image: ghcr.io/holygrolli/whatsupforlunch:sha-979e9b6-2024-08-08
22+
image: whatsupforlunch:latest
2323
outputs:
2424
matrix: ${{ steps.setmatrix.outputs.matrix }}
2525
steps:
@@ -30,6 +30,12 @@ jobs:
3030
${{
3131
github.event_name == 'schedule' && 'main' || ''
3232
}}
33+
- name: Configure AWS credentials
34+
uses: aws-actions/configure-aws-credentials@v4
35+
with:
36+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
37+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
38+
aws-region: eu-central-1
3339
- name: get new links
3440
shell: bash
3541
run: |
@@ -101,6 +107,23 @@ jobs:
101107
mkdir -p ../success-htmls
102108
cp ./chatgpt_user.txt ../success-htmls/${{ matrix.files }}
103109
cp final.json ../success-htmls/$(date -d"$(jq -r 'keys[] | select(test("\\d{4}-\\d{2}-\\d{2}"))' final.json | sort | head -n 1)" +%Y-%V).json
110+
- name: Mark link as processed in DynamoDB
111+
shell: bash
112+
env:
113+
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
114+
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
115+
AWS_DEFAULT_REGION: eu-central-1
116+
run: |
117+
# Extract the original URL from the filename
118+
# The filename format is: cleaned_lecasino_https_www.l.de_..._.html
119+
filename="${{ matrix.files }}"
120+
# Remove the "cleaned_lecasino" prefix and ".html" suffix
121+
url_encoded="${filename#cleaned_lecasino}"
122+
url_encoded="${url_encoded%.html}"
123+
# Convert back to URL format
124+
url=$(echo "$url_encoded" | sed 's/_https_/https:\/\//' | sed 's/_http_/http:\/\//' | sed 's/_/\//g')
125+
echo "Marking link as processed: $url"
126+
python locations/.shared/mark_link_processed.py "$url" || echo "Warning: Failed to mark link as processed"
104127
- name: Archive final files
105128
uses: actions/upload-artifact@v4
106129
with:

docker/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ FROM python:3.10-slim
22

33
RUN apt-get update && \
44
apt-get install -y poppler-utils curl jq git
5-
RUN pip install amazon-textract-textractor[pdf,pandas] openai scrapy htmlmin lxml[html_clean]
5+
RUN pip install amazon-textract-textractor[pdf,pandas] openai scrapy htmlmin lxml lxml_html_clean boto3
66
ENV NVM_DIR /usr/local/nvm
77
ENV NODE_VERSION lts/iron
88
RUN mkdir -p /usr/local/nvm && apt-get update && echo "y" | apt-get install curl
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
"""
2+
DynamoDB Link Validator Helper
3+
4+
This module provides utilities to check and store scraped links in AWS DynamoDB
5+
with TTL-based expiration to avoid reprocessing the same content.
6+
"""
7+
8+
from datetime import datetime, timedelta
from typing import Optional

import boto3
from boto3.dynamodb.conditions import Key
from botocore.exceptions import ClientError
12+
13+
14+
class LinkValidator:
    """
    Helper class to validate and store links in DynamoDB with TTL support.

    Table schema:
    - Partition key: link (String)
    - Sort key: timestamp (Number) - used as TTL attribute
    """

    def __init__(
        self,
        table_name: str = "lunchdeal",
        region_name: str = "eu-central-1",
        aws_account_id: str = "840940990295"
    ):
        """
        Initialize the LinkValidator with DynamoDB connection.

        Args:
            table_name: Name of the DynamoDB table
            region_name: AWS region
            aws_account_id: AWS account ID (kept for reference only; it is
                not used to build the connection)
        """
        self.table_name = table_name
        self.region_name = region_name
        self.aws_account_id = aws_account_id

        # Create the resource eagerly so credential/region problems surface
        # at construction time rather than on the first query.
        self.dynamodb = boto3.resource('dynamodb', region_name=region_name)
        self.table = self.dynamodb.Table(table_name)

    def link_exists(self, link: str) -> bool:
        """
        Check if a link already exists in DynamoDB and has not expired.

        An item counts as "not expired" when its `timestamp` sort key (the
        TTL attribute) is still in the future.

        Args:
            link: The URL to check

        Returns:
            True if the link exists and has not expired, False otherwise
        """
        try:
            current_timestamp = int(datetime.now().timestamp())

            # Query for the link with timestamp >= current time.
            # `Key` must be imported from boto3.dynamodb.conditions at module
            # level: accessing it as `boto3.dynamodb.conditions.Key` after a
            # bare `import boto3` only works once the dynamodb submodule has
            # been loaded as a side effect, which is fragile.
            response = self.table.query(
                KeyConditionExpression=(
                    Key('link').eq(link)
                    & Key('timestamp').gte(current_timestamp)
                ),
                Limit=1
            )

            # Any returned item means the link exists and is not expired.
            return bool(response.get('Items'))

        except ClientError as e:
            print(f"Error checking link existence: {e}")
            # On error, assume link doesn't exist to avoid skipping content
            return False

    def add_link(self, link: str, ttl_weeks: int = 8) -> bool:
        """
        Add a link to DynamoDB with a TTL timestamp.

        Args:
            link: The URL to store
            ttl_weeks: Number of weeks until the link expires (default: 8)

        Returns:
            True if successfully added, False otherwise
        """
        try:
            current_time = datetime.now()
            expiry_time = current_time + timedelta(weeks=ttl_weeks)
            timestamp = int(expiry_time.timestamp())

            # Store human-readable created/expiry times alongside the numeric
            # TTL so the table is easy to inspect manually.
            self.table.put_item(
                Item={
                    'link': link,
                    'timestamp': timestamp,
                    'created_at': current_time.isoformat(),
                    'expires_at': expiry_time.isoformat()
                }
            )

            print(f"Successfully added link: {link} (expires: {expiry_time.isoformat()})")
            return True

        except ClientError as e:
            print(f"Error adding link: {e}")
            return False

    def mark_link_processed(self, link: str, ttl_weeks: int = 8) -> bool:
        """
        Convenience method to mark a link as processed (alias for add_link).

        Args:
            link: The URL to mark as processed
            ttl_weeks: Number of weeks until the link expires (default: 8)

        Returns:
            True if successfully marked, False otherwise
        """
        return self.add_link(link, ttl_weeks)
118+
119+
120+
def get_validator() -> LinkValidator:
    """Build a LinkValidator wired to the default ``lunchdeal`` table.

    Returns:
        A LinkValidator instance using the module's default settings.
    """
    validator = LinkValidator()
    return validator
128+
129+
130+
# Standalone functions for easier imports
131+
def check_link(link: str) -> bool:
    """Module-level convenience wrapper around ``LinkValidator.link_exists``.

    Args:
        link: The URL to check

    Returns:
        True if link exists and has not expired, False otherwise
    """
    return get_validator().link_exists(link)
143+
144+
145+
def mark_processed(link: str, ttl_weeks: int = 8) -> bool:
    """Module-level convenience wrapper around ``LinkValidator.add_link``.

    Args:
        link: The URL to mark as processed
        ttl_weeks: Number of weeks until expiration (default: 8)

    Returns:
        True if successfully marked, False otherwise
    """
    return get_validator().add_link(link, ttl_weeks)
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Standalone script to mark a link as processed in DynamoDB.
4+
5+
This script is designed to be called from CI/CD workflows after
6+
successfully processing a scraped link.
7+
8+
Usage:
9+
python mark_link_processed.py <url>
10+
python mark_link_processed.py <url> --ttl-weeks 8
11+
12+
Examples:
13+
python mark_link_processed.py "https://www.l.de/example/page"
14+
python mark_link_processed.py "https://www.l.de/example/page" --ttl-weeks 4
15+
"""
16+
17+
import sys
18+
import argparse
19+
from dynamodb_link_validator import mark_processed
20+
21+
22+
def main():
    """Parse the command line and record the given URL in DynamoDB.

    Exits 0 on success, 1 on failure or on any unexpected error; all
    failure messages go to stderr.
    """
    parser = argparse.ArgumentParser(
        description='Mark a scraped link as processed in DynamoDB'
    )
    parser.add_argument(
        'url',
        help='The URL to mark as processed'
    )
    parser.add_argument(
        '--ttl-weeks',
        type=int,
        default=8,
        help='Number of weeks until the link expires (default: 8)'
    )
    args = parser.parse_args()

    try:
        ok = mark_processed(args.url, ttl_weeks=args.ttl_weeks)
    except Exception as e:
        print(f"✗ Error: {e}", file=sys.stderr)
        sys.exit(1)

    if ok:
        print(f"✓ Successfully marked link as processed: {args.url}")
        sys.exit(0)

    print(f"✗ Failed to mark link as processed: {args.url}", file=sys.stderr)
    sys.exit(1)


if __name__ == '__main__':
    main()

locations/lecasino/scra.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
import scrapy
22
import os
3+
import sys
34
from pathlib import Path
45
import lxml.html as lxml_html
56

7+
# Add parent directory to path to import shared modules
8+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / '.shared'))
9+
from dynamodb_link_validator import LinkValidator
10+
611
class Lecasino(scrapy.Spider):
712
name = "lecasino"
813
allowed_domains = ["www.l.de"]
@@ -12,6 +17,17 @@ class Lecasino(scrapy.Spider):
1217
menu_pages_html = None
1318
# base directory to save fetched HTML pages
1419
artifacts_dir = Path(__file__).resolve().parents[2] / 'tmp'
20+
21+
def __init__(self, *args, **kwargs):
    """Create the spider and (best-effort) enable DynamoDB link deduplication.

    If the validator cannot be constructed (e.g. missing boto3 or AWS
    credentials), the spider still runs but processes every discovered link.
    """
    # Modern zero-argument super() — the file already uses f-strings, so
    # Python 3 is guaranteed.
    super().__init__(*args, **kwargs)
    try:
        self.link_validator = LinkValidator()
        self.use_link_validation = True
        # Use the spider's logger rather than print so output goes through
        # Scrapy's logging configuration.
        self.logger.info("DynamoDB link validation enabled")
    except Exception as e:
        # Best-effort fallback: keep the attribute defined so callers can
        # safely check it, and continue without deduplication.
        self.link_validator = None
        self.use_link_validation = False
        self.logger.warning(f"Warning: Could not initialize link validator: {e}")
1531

1632
def parse(self, response):
1733
print(response.body)
@@ -22,7 +38,13 @@ def parse(self, response):
2238
for sel in links:
2339
link = sel.get()
2440
full = link if link.startswith('http') else "https://www.l.de" + link
25-
print(full)
41+
42+
# Check if link already exists in DynamoDB
43+
if self.use_link_validation and self.link_validator.link_exists(full):
44+
print(f"Skipping already processed link: {full}")
45+
continue
46+
47+
print(f"Processing new link: {full}")
2648
# schedule a request to fetch the menu page and process it
2749
yield scrapy.Request(url=full, callback=self.parse_menu_page, cb_kwargs={"source_url": full})
2850
else:

0 commit comments

Comments
 (0)