forked from seszele64/blix-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path01_scrape_single_shop.py
More file actions
92 lines (71 loc) · 2.87 KB
/
01_scrape_single_shop.py
File metadata and controls
92 lines (71 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
Example: Scrape all data for a single shop.
This script demonstrates:
- Scraping shop leaflets
- Extracting offers and keywords
- Saving data to JSON files
Usage:
python examples/01_scrape_single_shop.py
"""
import sys
from pathlib import Path
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.orchestrator import ScraperOrchestrator
from src.config import settings
from src.logging_config import setup_logging
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn
# Setup logging and console
setup_logging()  # configure application-wide logging via the project helper
console = Console()  # shared rich Console used below for styled terminal output
def main():
    """
    Scrape all data for a single shop (Biedronka).

    This example shows how to:
    1. Create a ScraperOrchestrator instance
    2. Scrape leaflets for a shop
    3. Scrape offers and keywords for each active leaflet
    4. Handle errors gracefully

    Uses the module-level ``console`` for output; scraped data is written
    by the orchestrator under ``settings.data_dir``.
    """
    shop_slug = "biedronka"
    console.print(f"\n[bold cyan]Scraping {shop_slug}...[/bold cyan]\n")

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console,
    ) as progress:
        # Headless mode: no visible browser window, faster execution.
        with ScraperOrchestrator(headless=True) as orchestrator:
            # Task 1: fetch every leaflet for the shop (indeterminate spinner).
            task1 = progress.add_task("Fetching leaflets...", total=None)
            leaflets = orchestrator.scrape_shop_leaflets(shop_slug)
            progress.update(task1, completed=True)
            console.print(f"Found {len(leaflets)} leaflets\n")

            # Filter to active leaflets only.
            # (named `leaflet` instead of ambiguous `l` — PEP 8 / E741)
            active_leaflets = [
                leaflet for leaflet in leaflets if leaflet.is_active_now()
            ]
            console.print(f"Active leaflets: {len(active_leaflets)}\n")

            # Task 2: scrape offers and keywords for each active leaflet.
            for i, leaflet in enumerate(active_leaflets, 1):
                task2 = progress.add_task(
                    f"[{i}/{len(active_leaflets)}] Scraping leaflet {leaflet.leaflet_id}...",
                    total=None,
                )
                try:
                    offers, keywords = orchestrator.scrape_full_leaflet(
                        shop_slug,
                        leaflet.leaflet_id,
                    )
                    console.print(
                        f" Leaflet {leaflet.leaflet_id}: "
                        f"{len(offers)} offers, {len(keywords)} keywords"
                    )
                except Exception as e:
                    # Best-effort example: report the failure and continue
                    # with the remaining leaflets instead of aborting.
                    console.print(
                        f" [red]Failed to scrape leaflet {leaflet.leaflet_id}: {e}[/red]"
                    )
                finally:
                    # Always mark the per-leaflet task done, even on failure.
                    progress.update(task2, completed=True)

    console.print("\n[bold green]Scraping completed![/bold green]")
    console.print(f"\nData saved to: [cyan]{settings.data_dir}[/cyan]")
# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    main()