# == File: regex_extraction_quickstart.py ==
"""
Mini quick-start for RegexExtractionStrategy
────────────────────────────────────────────
Three bite-sized demos in the style of *quickstart_examples_set_1.py*:

1. **Default catalog** – scrape a page and pull URLs / currency values out of
   the built-in pattern catalog.
2. **Custom pattern** – add your own regex at instantiation time.
3. **LLM-assisted schema** – ask the model to write a pattern, cache it, then
   run extraction _without_ further LLM calls.

Run the whole thing with::

    python regex_extraction_quickstart.py
"""
import asyncio
import json
from pathlib import Path
from typing import List

from crawl4ai import (
    AsyncWebCrawler,
    CrawlerRunConfig,
    CrawlResult,
    LLMConfig,
    RegexExtractionStrategy,
)
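
# Each run below deserializes ``result.extracted_content`` into a flat JSON
# list; every match is a dict carrying at least "label" and "value" (the two
# keys the demos rely on).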

# ────────────────────────────────────────────────────────────────────────────
# 1. Default-catalog extraction
# ────────────────────────────────────────────────────────────────────────────
async def demo_regex_default() -> None:
    print("\n=== 1. Regex extraction – default patterns ===")
    url = "https://www.iana.org/domains/example"  # simple page with URLs (and an e-mail)

    # Select built-in catalog entries; flags combine with the | operator.
    strategy = RegexExtractionStrategy(
        pattern=RegexExtractionStrategy.Url | RegexExtractionStrategy.Currency
    )
    config = CrawlerRunConfig(extraction_strategy=strategy)

    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(url, config=config)
        print(f"Fetched {url} - success={result.success}")
        if result.success:
            data = json.loads(result.extracted_content)
            for d in data[:10]:
                print(f"  {d['label']:<12} {d['value']}")
            print(f"... total matches: {len(data)}")
        else:
            print("  !!! crawl failed")

# ────────────────────────────────────────────────────────────────────────────
# 2. Custom pattern override / extension
# ────────────────────────────────────────────────────────────────────────────
async def demo_regex_custom() -> None:
    print("\n=== 2. Regex extraction – custom price pattern ===")
    url = "https://www.apple.com/shop/buy-mac/macbook-pro"

    # One label -> one regex; matches e.g. $1,599.00 or $1,599
    price_pattern = {"usd_price": r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"}
    strategy = RegexExtractionStrategy(custom=price_pattern)
    config = CrawlerRunConfig(extraction_strategy=strategy)

    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(url, config=config)
        if result.success:
            data = json.loads(result.extracted_content)
            for d in data:
                print(f"  {d['value']}")
            if not data:
                print("  (No prices found - page layout may have changed)")
        else:
            print("  !!! crawl failed")

# ────────────────────────────────────────────────────────────────────────────
# 3. One-shot LLM pattern generation, then fast extraction
# ────────────────────────────────────────────────────────────────────────────
async def demo_regex_generate_pattern() -> None:
    print("\n=== 3. generate_pattern → regex extraction ===")
    cache_dir = Path(__file__).parent / "tmp"
    cache_dir.mkdir(exist_ok=True)
    pattern_file = cache_dir / "price_pattern.json"
    url = "https://www.lazada.sg/tag/smartphone/"

    # ── 3-A. Build or load the cached pattern
    if pattern_file.exists():
        pattern = json.loads(pattern_file.read_text(encoding="utf-8"))
        print("Loaded cached pattern:", pattern)
    else:
        print("Generating pattern via LLM…")
        llm_cfg = LLMConfig(
            provider="openai/gpt-4o-mini",
            api_token="env:OPENAI_API_KEY",
        )
        # Pull one sample page as HTML context for the LLM
        async with AsyncWebCrawler() as crawler:
            html = (await crawler.arun(url)).fit_html
        pattern = RegexExtractionStrategy.generate_pattern(
            label="price",
            html=html,
            query="Prices in Singapore dollars (e.g. $1,299.00 or $200)",
            llm_config=llm_cfg,
        )
        pattern_file.write_text(json.dumps(pattern, indent=2), encoding="utf-8")
        print("Saved pattern:", pattern_file)

    # ── 3-B. Extraction pass – zero LLM calls
    strategy = RegexExtractionStrategy(custom=pattern)
    config = CrawlerRunConfig(extraction_strategy=strategy, delay_before_return_html=3)
    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(url, config=config)
        if result.success:
            data = json.loads(result.extracted_content)
            for d in data[:15]:
                print(f"  {d['value']}")
            print(f"... total matches: {len(data)}")
        else:
            print("  !!! crawl failed")

# ────────────────────────────────────────────────────────────────────────────
# Entrypoint
# ────────────────────────────────────────────────────────────────────────────
async def main() -> None:
    await demo_regex_default()
    await demo_regex_custom()
    await demo_regex_generate_pattern()


if __name__ == "__main__":
    asyncio.run(main())