
Commit ba2ed53

test(releases): Add test cases for release 0.7.0
1 parent a93efcb commit ba2ed53

1 file changed (317 additions, 0 deletions)

@@ -0,0 +1,317 @@
#!/usr/bin/env python3

import asyncio
import pytest
import os
import json
import tempfile
from pathlib import Path
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy, LLMConfig
from crawl4ai.content_filter_strategy import BM25ContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.async_url_seeder import AsyncUrlSeeder
from crawl4ai.utils import RobotsParser

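# These tests hit live httpbin.org endpoints over the network; the LLM extraction
# test additionally expects an OPENAI_API_KEY environment variable and is skipped
# without it.
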
class TestCrawl4AIv070:
    """Test suite for Crawl4AI v0.7.0 changes"""

    @pytest.mark.asyncio
    async def test_raw_url_parsing(self):
        """Test raw:// URL parsing logic fix"""
        html_content = "<html><body><h1>Test Content</h1><p>This is a test paragraph.</p></body></html>"

        async with AsyncWebCrawler() as crawler:
            # Test raw:// prefix
            result1 = await crawler.arun(f"raw://{html_content}")
            assert result1.success
            assert "Test Content" in result1.markdown

            # Test raw: prefix
            result2 = await crawler.arun(f"raw:{html_content}")
            assert result2.success
            assert "Test Content" in result2.markdown

    @pytest.mark.asyncio
    async def test_max_pages_limit_batch_processing(self):
        """Test max_pages limit is respected during batch processing"""
        urls = [
            "https://httpbin.org/html",
            "https://httpbin.org/json",
            "https://httpbin.org/xml"
        ]

        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            max_pages=2
        )

        async with AsyncWebCrawler() as crawler:
            results = await crawler.arun_many(urls, config=config)
            # Should only process 2 pages due to max_pages limit
            successful_results = [r for r in results if r.success]
            assert len(successful_results) <= 2

    @pytest.mark.asyncio
    async def test_navigation_abort_handling(self):
        """Test handling of navigation aborts during file downloads"""
        async with AsyncWebCrawler() as crawler:
            # Test with a URL that might cause navigation issues
            result = await crawler.arun(
                "https://httpbin.org/status/404",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            )
            # Should not crash even with navigation issues
            assert result is not None

    @pytest.mark.asyncio
    async def test_screenshot_capture_fix(self):
        """Test screenshot capture improvements"""
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            screenshot=True
        )

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://httpbin.org/html", config=config)
            assert result.success
            assert result.screenshot is not None
            assert len(result.screenshot) > 0

    @pytest.mark.asyncio
    async def test_redirect_status_codes(self):
        """Test that real redirect status codes are surfaced"""
        async with AsyncWebCrawler() as crawler:
            # Test with a redirect URL
            result = await crawler.arun(
                "https://httpbin.org/redirect/1",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            )
            assert result.success
            # Should have redirect information
            assert result.status_code in [200, 301, 302, 303, 307, 308]

    @pytest.mark.asyncio
    async def test_local_file_processing(self):
        """Test local file processing with captured_console initialization"""
        with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
            f.write("<html><body><h1>Local File Test</h1></body></html>")
            temp_file = f.name

        try:
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(f"file://{temp_file}")
                assert result.success
                assert "Local File Test" in result.markdown
        finally:
            os.unlink(temp_file)

    @pytest.mark.asyncio
    async def test_robots_txt_wildcard_support(self):
        """Test robots.txt wildcard rules support"""
        parser = RobotsParser()

        # Test wildcard patterns
        robots_content = "User-agent: *\nDisallow: /admin/*\nDisallow: *.pdf"

        # This should work without throwing exceptions
        assert parser is not None

    @pytest.mark.asyncio
    async def test_exclude_external_images(self):
        """Test exclude_external_images flag"""
        html_with_images = '''
        <html><body>
        <img src="/local-image.jpg" alt="Local">
        <img src="https://external.com/image.jpg" alt="External">
        </body></html>
        '''

        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            exclude_external_images=True
        )

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(f"raw://{html_with_images}", config=config)
            assert result.success
            # External images should be excluded
            assert "external.com" not in result.cleaned_html

    @pytest.mark.asyncio
    async def test_llm_extraction_strategy_fix(self):
        """Test LLM extraction strategy choices error fix"""
        if not os.getenv("OPENAI_API_KEY"):
            pytest.skip("OpenAI API key not available")

        llm_config = LLMConfig(
            provider="openai/gpt-4o-mini",
            api_token=os.getenv("OPENAI_API_KEY")
        )

        strategy = LLMExtractionStrategy(
            llm_config=llm_config,
            instruction="Extract the main heading",
            extraction_type="block"
        )

        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=strategy
        )

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://httpbin.org/html", config=config)
            assert result.success
            # Should not throw 'str' object has no attribute 'choices' error
            assert result.extracted_content is not None

    @pytest.mark.asyncio
    async def test_wait_for_timeout(self):
        """Test separate timeout for wait_for condition"""
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_for="css:non-existent-element",
            wait_for_timeout=1000  # 1 second timeout
        )

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://httpbin.org/html", config=config)
            # Should timeout gracefully and still return a result
            assert result is not None

    @pytest.mark.asyncio
    async def test_bm25_content_filter_language_parameter(self):
        """Test BM25 filter with language parameter for stemming"""
        content_filter = BM25ContentFilter(
            user_query="test content",
            language="english",
            use_stemming=True
        )

        markdown_generator = DefaultMarkdownGenerator(
            content_filter=content_filter
        )

        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            markdown_generator=markdown_generator
        )

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://httpbin.org/html", config=config)
            assert result.success
            assert result.markdown is not None

    @pytest.mark.asyncio
    async def test_url_normalization(self):
        """Test URL normalization for invalid schemes and trailing slashes"""
        async with AsyncWebCrawler() as crawler:
            # Test with trailing slash
            result = await crawler.arun(
                "https://httpbin.org/html/",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            )
            assert result.success

    @pytest.mark.asyncio
    async def test_max_scroll_steps(self):
        """Test max_scroll_steps parameter for full page scanning"""
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            scan_full_page=True,
            max_scroll_steps=3
        )

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://httpbin.org/html", config=config)
            assert result.success

    @pytest.mark.asyncio
    async def test_async_url_seeder(self):
        """Test AsyncUrlSeeder functionality"""
        seeder = AsyncUrlSeeder(
            base_url="https://httpbin.org",
            max_depth=1,
            max_urls=5
        )

        async with AsyncWebCrawler() as crawler:
            urls = await seeder.seed(crawler)
            assert isinstance(urls, list)
            assert len(urls) <= 5

    @pytest.mark.asyncio
    async def test_pdf_processing_timeout(self):
        """Test PDF processing with timeout"""
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            pdf=True,
            pdf_timeout=10000  # 10 seconds
        )

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://httpbin.org/html", config=config)
            assert result.success
            # The PDF may be None for HTML pages; the point is that the crawl
            # returns within the timeout instead of hanging.
            assert hasattr(result, "pdf")

    @pytest.mark.asyncio
    async def test_browser_session_management(self):
        """Test improved browser session management"""
        browser_config = BrowserConfig(
            headless=True,
            use_persistent_context=True
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                "https://httpbin.org/html",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            )
            assert result.success

    @pytest.mark.asyncio
    async def test_memory_management(self):
        """Test memory management features"""
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            memory_threshold_percent=80.0,
            check_interval=1.0,
            memory_wait_timeout=600  # 10 minutes default
        )

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://httpbin.org/html", config=config)
            assert result.success

    @pytest.mark.asyncio
    async def test_virtual_scroll_support(self):
        """Test virtual scroll support for modern web scraping"""
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            scan_full_page=True,
            virtual_scroll=True
        )

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://httpbin.org/html", config=config)
            assert result.success

    @pytest.mark.asyncio
    async def test_adaptive_crawling(self):
        """Test adaptive crawling feature"""
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            adaptive_crawling=True
        )

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://httpbin.org/html", config=config)
            assert result.success

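# The @pytest.mark.asyncio markers above rely on the pytest-asyncio plugin
# (assumed to be installed alongside pytest); e.g. run with: pytest -v <this file>.
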
if __name__ == "__main__":
    # Run the tests
    pytest.main([__file__, "-v"])
