Skip to content

Commit 4e0a9e8

Browse files
j-mendez and claude committed
feat(automation): add ExtractionSchema for structured extraction output
- Add ExtractionSchema struct with name, description, schema, and strict fields
- Include schema instructions in system prompt when extraction is enabled
- Add with_extraction_schema() builder methods to RemoteMultimodalConfigs and RemoteMultimodalEngine
- Support strict mode for exact schema adherence

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent b78d3d2 commit 4e0a9e8

File tree

7 files changed

+148
-14
lines changed

7 files changed

+148
-14
lines changed

Cargo.lock

Lines changed: 8 additions & 8 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

default.nix

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
let
44
spider = pkgs.rustPlatform.buildRustPackage {
55
pname = "spider";
6-
version = "2.43.1";
6+
version = "2.43.2";
77

88
src = ./.;
99

spider/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider"
3-
version = "2.43.1"
3+
version = "2.43.2"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "A web crawler and scraper, building blocks for data curation workloads."
66
repository = "https://github.com/spider-rs/spider"

spider/src/features/automation.rs

Lines changed: 135 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,68 @@ impl From<serde_json::Error> for EngineError {
8484
}
8585
}
8686

87+
/// JSON schema configuration for structured extraction output.
///
/// This allows you to define a schema that the model should follow when
/// extracting data from pages. Similar to OpenAI's structured outputs.
///
/// The schema text is stored as-is: this type does not parse or validate it.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ExtractionSchema {
    /// A name for this extraction schema (e.g., "product_listing", "contact_info").
    pub name: String,
    /// Optional description of what data should be extracted.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub description: Option<String>,
    /// The JSON Schema definition as a string.
    ///
    /// Example:
    /// ```json
    /// {
    ///   "type": "object",
    ///   "properties": {
    ///     "title": { "type": "string" },
    ///     "price": { "type": "number" }
    ///   },
    ///   "required": ["title"]
    /// }
    /// ```
    pub schema: String,
    /// Whether to enforce strict schema adherence.
    ///
    /// When true, instructs the model to strictly follow the schema.
    /// Note: Not all models support strict mode.
    #[cfg_attr(feature = "serde", serde(default))]
    pub strict: bool,
}

impl ExtractionSchema {
    /// Create a new extraction schema.
    ///
    /// `name` and `schema` are copied into owned strings. The result has no
    /// description and strict mode disabled.
    #[must_use]
    pub fn new(name: &str, schema: &str) -> Self {
        Self {
            name: name.to_owned(),
            description: None,
            schema: schema.to_owned(),
            strict: false,
        }
    }

    /// Create a new extraction schema with description.
    ///
    /// Identical to [`ExtractionSchema::new`] except that `description` is
    /// stored as `Some(description)`.
    #[must_use]
    pub fn new_with_description(name: &str, description: &str, schema: &str) -> Self {
        // Delegate to `new` so the defaults for the remaining fields live in one place.
        Self {
            description: Some(description.to_owned()),
            ..Self::new(name, schema)
        }
    }

    /// Set strict mode.
    ///
    /// Consumes `self` and returns the updated value, so the result must be
    /// used (typically by chaining or rebinding).
    #[must_use]
    pub fn with_strict(mut self, strict: bool) -> Self {
        self.strict = strict;
        self
    }
}
148+
87149
/// Coarse cost budget the engine may spend for a single automation run.
88150
///
89151
/// This is used by [`ModelPolicy`] to decide whether the engine may select
@@ -347,6 +409,31 @@ pub struct RemoteMultimodalConfig {
347409
///
348410
/// Example: "Extract all product names and prices as a JSON array."
349411
pub extraction_prompt: Option<String>,
412+
/// Optional JSON schema for structured extraction output.
413+
///
414+
/// When provided, the model is instructed to return the `extracted` field
415+
/// conforming to this schema, giving callers a predictable output shape.
/// The schema is only added to the system prompt when `extra_ai_data` is enabled.
416+
///
417+
/// Example schema:
418+
/// ```json
419+
/// {
420+
/// "type": "object",
421+
/// "properties": {
422+
/// "products": {
423+
/// "type": "array",
424+
/// "items": {
425+
/// "type": "object",
426+
/// "properties": {
427+
/// "name": { "type": "string" },
428+
/// "price": { "type": "number" }
429+
/// },
430+
/// "required": ["name", "price"]
431+
/// }
432+
/// }
433+
/// }
434+
/// }
435+
/// ```
436+
pub extraction_schema: Option<ExtractionSchema>,
350437
/// Take a screenshot after automation completes and include it in results.
351438
pub screenshot: bool,
352439
}
@@ -370,6 +457,7 @@ impl Default for RemoteMultimodalConfig {
370457
max_inflight_requests: None,
371458
extra_ai_data: false,
372459
extraction_prompt: None,
460+
extraction_schema: None,
373461
screenshot: false,
374462
}
375463
}
@@ -686,6 +774,24 @@ impl RemoteMultimodalConfigs {
686774
self.cfg.screenshot = enabled;
687775
self
688776
}
777+
778+
/// Set a JSON schema for structured extraction output.
779+
///
780+
/// When provided, the model is instructed to return the `extracted` field
781+
/// conforming to this schema, giving callers a predictable output shape.
///
/// The schema is stored on the wrapped config and replaces any previous
/// value; pass `None` to clear it. The system prompt only includes the
/// schema when extraction mode (`extra_ai_data`) is enabled, which is why
/// the example below turns it on first.
///
/// NOTE(review): the code visible here only adds the schema to the prompt;
/// confirm whether the model's response is validated against it elsewhere.
782+
///
783+
/// # Example
784+
/// ```rust
785+
/// use spider::features::automation::{RemoteMultimodalConfigs, ExtractionSchema};
786+
/// let schema = ExtractionSchema::new("products", r#"{"type": "array", "items": {"type": "object", "properties": {"name": {"type": "string"}, "price": {"type": "number"}}}}"#);
787+
/// let mm = RemoteMultimodalConfigs::new("http://localhost:11434/v1/chat/completions", "model")
788+
/// .with_extra_ai_data(true)
789+
/// .with_extraction_schema(Some(schema));
790+
/// ```
791+
pub fn with_extraction_schema(mut self, schema: Option<ExtractionSchema>) -> Self {
792+
self.cfg.extraction_schema = schema;
793+
self
794+
}
689795
}
690796

691797
impl PromptUrlGate {
@@ -865,6 +971,12 @@ impl RemoteMultimodalEngine {
865971
self
866972
}
867973

974+
/// Set a JSON schema for structured extraction output.
///
/// Overwrites the engine config's `extraction_schema` with `schema`; pass
/// `None` to clear a previously configured schema. Returns `&mut Self` so
/// calls can be chained.
///
/// The system prompt only includes the schema when extraction mode
/// (`extra_ai_data`) is enabled on the effective config.
975+
pub fn with_extraction_schema(&mut self, schema: Option<ExtractionSchema>) -> &mut Self {
976+
self.cfg.extraction_schema = schema;
977+
self
978+
}
979+
868980
/// Acquire the permit.
869981
pub async fn acquire_llm_permit(&self) -> Option<tokio::sync::OwnedSemaphorePermit> {
870982
match &self.semaphore {
@@ -892,12 +1004,33 @@ impl RemoteMultimodalEngine {
8921004
if effective_cfg.extra_ai_data {
8931005
s.push_str("\n\n---\nEXTRACTION MODE ENABLED:\n");
8941006
s.push_str("Include an \"extracted\" field in your JSON response containing structured data extracted from the page.\n");
895-
s.push_str("The \"extracted\" field should be a JSON object or array with the relevant data.\n");
1007+
1008+
// Add schema instructions if provided
1009+
if let Some(schema) = &effective_cfg.extraction_schema {
1010+
s.push_str("\nExtraction Schema: ");
1011+
s.push_str(&schema.name);
1012+
s.push('\n');
1013+
if let Some(desc) = &schema.description {
1014+
s.push_str("Description: ");
1015+
s.push_str(desc.trim());
1016+
s.push('\n');
1017+
}
1018+
s.push_str("\nThe \"extracted\" field MUST conform to this JSON Schema:\n");
1019+
s.push_str(&schema.schema);
1020+
s.push('\n');
1021+
if schema.strict {
1022+
s.push_str("\nSTRICT MODE: You MUST follow the schema exactly. Do not add extra fields or omit required fields.\n");
1023+
}
1024+
} else {
1025+
s.push_str("The \"extracted\" field should be a JSON object or array with the relevant data.\n");
1026+
}
1027+
8961028
if let Some(extraction_prompt) = &effective_cfg.extraction_prompt {
8971029
s.push_str("\nExtraction instructions: ");
8981030
s.push_str(extraction_prompt.trim());
8991031
s.push('\n');
9001032
}
1033+
9011034
s.push_str("\nExample response with extraction:\n");
9021035
s.push_str("{\n \"label\": \"extract_products\",\n \"done\": true,\n \"steps\": [],\n \"extracted\": {\"products\": [{\"name\": \"Product A\", \"price\": 19.99}]}\n}\n");
9031036
}
@@ -2013,6 +2146,7 @@ fn merged_config(
20132146
// Extraction settings
20142147
out.extra_ai_data = override_cfg.extra_ai_data;
20152148
out.extraction_prompt = override_cfg.extraction_prompt.clone();
2149+
out.extraction_schema = override_cfg.extraction_schema.clone();
20162150
out.screenshot = override_cfg.screenshot;
20172151

20182152
out

spider_cli/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider_cli"
3-
version = "2.43.1"
3+
version = "2.43.2"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "The fastest web crawler CLI written in Rust."
66
repository = "https://github.com/spider-rs/spider"

spider_utils/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider_utils"
3-
version = "2.43.1"
3+
version = "2.43.2"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "Utilities to use for Spider Web Crawler."
66
repository = "https://github.com/spider-rs/spider"

spider_worker/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "spider_worker"
3-
version = "2.43.1"
3+
version = "2.43.2"
44
authors = ["j-mendez <jeff@spider.cloud>"]
55
description = "The fastest web crawler as a worker or proxy."
66
repository = "https://github.com/spider-rs/spider"

0 commit comments

Comments
 (0)