Skip to content

Commit e04cdc3

Browse files
committed
test: add crawllab.dev live integration tests for graph and browser crates
Add #[ignore]-gated integration tests targeting https://crawllab.dev — a free
OSS scraper-testing harness — to validate real-world HTTP behaviour.

stygian-graph/tests/crawllab.rs (12 tests):
- HTTP status code classification: 200 ok; 404/500 → Unavailable; 429 → RateLimited
- Redirect following: 301 permanent and 302 temporary both succeed
- Redirect cycle (cycle-a → cycle-b loop) exhausts the reqwest limit → Unavailable
- Content-type handling: /json and /text endpoints return non-empty data
- Edge cases: HTTP 204 No Content is 2xx (ok); /random always succeeds
- Multi-page HTML: /forum?page=1 body is non-empty, page_count=1

stygian-browser/tests/crawllab.rs (4 tests):
- js_inline_renders_content: DOM > 200 chars after WaitUntil::NetworkIdle
- js_external_renders_content: external script fetched and executed
- browser_navigates_status_200: smoke test for browser-pool round-trip
- eval_works_on_crawllab_page: JS runtime intact; navigator.webdriver hidden

Run with:
  cargo test -p stygian-graph --test crawllab -- --ignored
  cargo test -p stygian-browser --test crawllab -- --ignored --test-threads=1
1 parent 2ff18b5 commit e04cdc3

File tree

2 files changed

+430
-0
lines changed

2 files changed

+430
-0
lines changed
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
//! Live browser integration tests against <https://crawllab.dev>.
2+
//!
3+
//! crawllab.dev provides JS-rendered endpoints that deliver a minimal HTML
4+
//! skeleton on the initial request; the real content is only visible after the
5+
//! browser executes the bundled scripts. These tests confirm that
6+
//! `stygian-browser` waits for script execution before reading the DOM.
7+
//!
8+
//! Requirements: a real Chrome/Chromium binary **and** outbound HTTPS access.
9+
//! Tests are gated with `#[ignore]`; run them explicitly:
10+
//!
11+
//! ```sh
12+
//! cargo test -p stygian-browser --test crawllab -- --ignored --test-threads=1
13+
//! ```
14+
//!
15+
//! Set `STYGIAN_CHROME_PATH` to override the browser binary path.
16+
17+
#![allow(
18+
clippy::expect_used,
19+
clippy::unwrap_used,
20+
clippy::panic,
21+
clippy::missing_panics_doc
22+
)]
23+
24+
use std::path::PathBuf;
25+
use std::sync::atomic::{AtomicU64, Ordering};
26+
use std::time::Duration;
27+
28+
use stygian_browser::{BrowserConfig, BrowserInstance, WaitUntil};
29+
30+
// ─── Helpers ──────────────────────────────────────────────────────────────────
31+
32+
/// Produces a fresh temporary user-data directory path on every call.
///
/// The name combines the current process id with a monotonically increasing
/// counter, so repeated calls within one process — and concurrent test
/// processes — never reuse the same Chrome profile directory.
fn unique_user_data_dir() -> PathBuf {
    static COUNTER: AtomicU64 = AtomicU64::new(0);
    let seq = COUNTER.fetch_add(1, Ordering::Relaxed);
    let dir_name = format!("stygian-crawllab-{}-{seq}", std::process::id());
    std::env::temp_dir().join(dir_name)
}
38+
39+
fn test_config() -> BrowserConfig {
40+
let mut cfg = BrowserConfig::builder().headless(true).build();
41+
cfg.launch_timeout = Duration::from_secs(30);
42+
cfg.cdp_timeout = Duration::from_secs(15);
43+
cfg.user_data_dir = Some(unique_user_data_dir());
44+
if let Ok(p) = std::env::var("STYGIAN_CHROME_PATH") {
45+
cfg.chrome_path = Some(PathBuf::from(p));
46+
}
47+
cfg
48+
}
49+
50+
// ─── JS rendering ─────────────────────────────────────────────────────────────
51+
52+
/// `/js/inline` delivers a bare HTML skeleton on the initial request. The
53+
/// real page content is rendered by an inline `<script>` tag. After
54+
/// `WaitUntil::NetworkIdle` the DOM should reflect the completed render.
55+
#[tokio::test]
56+
#[ignore = "requires real Chrome binary and network access to crawllab.dev"]
57+
async fn js_inline_renders_content() -> Result<(), Box<dyn std::error::Error>> {
58+
let instance = BrowserInstance::launch(test_config()).await?;
59+
let mut page = instance.new_page().await?;
60+
61+
page.navigate(
62+
"https://crawllab.dev/js/inline",
63+
WaitUntil::NetworkIdle,
64+
Duration::from_secs(20),
65+
)
66+
.await?;
67+
68+
let html = page.content().await?;
69+
70+
// crawllab guarantees ≥ 200 characters of scraper-visible output.
71+
assert!(
72+
html.len() > 200,
73+
"JS-rendered page should have ≥ 200 chars of content, got {} bytes",
74+
html.len()
75+
);
76+
assert!(
77+
html.contains("<body"),
78+
"rendered page must include a <body> element"
79+
);
80+
81+
page.close().await?;
82+
instance.shutdown().await?;
83+
Ok(())
84+
}
85+
86+
/// `/js/external` loads its render script from a separate file at
87+
/// `/js/render.js`. This exercises the browser's ability to fetch and execute
88+
/// an external script before we read the final DOM state.
89+
#[tokio::test]
90+
#[ignore = "requires real Chrome binary and network access to crawllab.dev"]
91+
async fn js_external_renders_content() -> Result<(), Box<dyn std::error::Error>> {
92+
let instance = BrowserInstance::launch(test_config()).await?;
93+
let mut page = instance.new_page().await?;
94+
95+
page.navigate(
96+
"https://crawllab.dev/js/external",
97+
WaitUntil::NetworkIdle,
98+
Duration::from_secs(20),
99+
)
100+
.await?;
101+
102+
let html = page.content().await?;
103+
104+
assert!(
105+
html.len() > 200,
106+
"externally JS-rendered page should have ≥ 200 chars, got {} bytes",
107+
html.len()
108+
);
109+
assert!(
110+
html.contains("<body"),
111+
"rendered page must include a <body> element"
112+
);
113+
114+
page.close().await?;
115+
instance.shutdown().await?;
116+
Ok(())
117+
}
118+
119+
/// Confirms that the browser's stealth injection does not break normal page
120+
/// navigation or JS execution on an external site. Uses a simple status-200
121+
/// endpoint as a smoke test that the browser pool round-trips correctly.
122+
#[tokio::test]
123+
#[ignore = "requires real Chrome binary and network access to crawllab.dev"]
124+
async fn browser_navigates_status_200() -> Result<(), Box<dyn std::error::Error>> {
125+
let instance = BrowserInstance::launch(test_config()).await?;
126+
let mut page = instance.new_page().await?;
127+
128+
page.navigate(
129+
"https://crawllab.dev/status/200",
130+
WaitUntil::DomContentLoaded,
131+
Duration::from_secs(15),
132+
)
133+
.await?;
134+
135+
let html = page.content().await?;
136+
assert!(
137+
html.contains("<html") || html.contains("<HTML"),
138+
"response should be an HTML document, got: {}",
139+
html.get(..200.min(html.len())).unwrap_or_default()
140+
);
141+
142+
page.close().await?;
143+
instance.shutdown().await?;
144+
Ok(())
145+
}
146+
147+
/// Evaluates JavaScript on a live crawllab.dev page to confirm that our CDP
148+
/// stealth injection does not break the JS runtime.
149+
///
150+
/// We navigate to the JS inline page and use `page.eval()` to directly query
151+
/// the document title — if our injection corrupted the runtime this panics.
152+
#[tokio::test]
153+
#[ignore = "requires real Chrome binary and network access to crawllab.dev"]
154+
async fn eval_works_on_crawllab_page() -> Result<(), Box<dyn std::error::Error>> {
155+
let instance = BrowserInstance::launch(test_config()).await?;
156+
let mut page = instance.new_page().await?;
157+
158+
page.navigate(
159+
"https://crawllab.dev/js/inline",
160+
WaitUntil::DomContentLoaded,
161+
Duration::from_secs(15),
162+
)
163+
.await?;
164+
165+
// Evaluate a simple expression to verify the JS runtime is intact.
166+
let result: f64 = page.eval("1 + 1").await?;
167+
assert!(
168+
(result - 2.0).abs() < f64::EPSILON,
169+
"JS eval sanity check failed: expected 2, got {result}"
170+
);
171+
172+
// Verify navigator.webdriver is hidden (stealth injection active).
173+
let webdriver_hidden: bool = page
174+
.eval("typeof navigator.webdriver === 'undefined' || navigator.webdriver === false")
175+
.await?;
176+
assert!(
177+
webdriver_hidden,
178+
"navigator.webdriver should be hidden by stealth injection"
179+
);
180+
181+
page.close().await?;
182+
instance.shutdown().await?;
183+
Ok(())
184+
}

0 commit comments

Comments
 (0)