Skip to content

Commit e2f12b5

Browse files
committed
Add documentation and examples
1 parent 5c399c6 commit e2f12b5

File tree

13 files changed

+131
-28
lines changed

13 files changed

+131
-28
lines changed

python/outlines_core/fsm/outlines_core_rs.pyi

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
from typing import Dict, List, Optional, Set, Tuple, Union
22

3-
def build_regex_from_schema(
4-
json: str, whitespace_pattern: Optional[str] = None
5-
) -> str: ...
3+
def build_regex_from_schema(json: str, whitespace_pattern: Optional[str] = None) -> str:
4+
"""Creates regex string from JSON schema with optional whitespace pattern."""
5+
...
66

77
BOOLEAN: str
88
DATE: str

src/bin/convert-json-schema.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
use outlines_core::json_schema::build_regex_from_schema;
1+
use outlines_core::prelude::*;
22

33
fn main() {
44
let schema = std::io::read_to_string(std::io::stdin()).unwrap();
5-
let regex = build_regex_from_schema(&schema, None).unwrap();
5+
let regex = json_schema::regex_from_str(&schema, None).unwrap();
66
println!("Regex: {}", regex);
77
println!("Regex len: {}", regex.len());
88
}

src/error.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
//! The errors that may occur within the crate.
2+
13
use thiserror::Error;
24

35
pub type Result<T, E = crate::Error> = std::result::Result<T, E>;

src/index.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
/// Construct an Index.
1+
//! Building an `Index` to efficiently map vocabulary tokens to state transitions.
2+
23
use crate::prelude::*;
34
use crate::vocabulary::Vocabulary;
45
use crate::{Error, Result};
@@ -8,6 +9,7 @@ use regex_automata::util::primitives::StateID as AutomataStateId;
89
use regex_automata::Anchored;
910
use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet};
1011

12+
/// `Index` efficiently maps vocabulary tokens to state transitions.
1113
#[derive(Clone, Debug, PartialEq, Encode, Decode)]
1214
pub struct Index {
1315
/// The ID of the initial state in the automaton, processing begins from this state.

src/json_schema/mod.rs

Lines changed: 82 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
//! Provides interfaces to generate a regular expression based on a given JSON schema.
2+
//!
3+
//! An optional custom pattern can be passed as well to handle whitespace within the regex.
4+
//! If `None`, the default [`WHITESPACE`] pattern is used.
5+
//!
6+
//! Returns errors if the JSON schema content is invalid or some feature is not yet supported
7+
//! for regex generation.
8+
19
use serde_json::Value;
210

311
mod parsing;
@@ -7,12 +15,78 @@ pub use types::*;
715

816
use crate::Result;
917

10-
pub fn build_regex_from_schema(json: &str, whitespace_pattern: Option<&str>) -> Result<String> {
18+
/// Generates a regular expression string from a given JSON schema string.
19+
///
20+
/// # Example
21+
///
22+
/// ```rust
23+
/// # use outlines_core::Error;
24+
/// use outlines_core::prelude::*;
25+
///
26+
/// # fn main() -> Result<(), Error> {
27+
/// // Define a JSON schema
28+
/// let schema = r#"{
29+
/// "type": "object",
30+
/// "properties": {
31+
/// "name": { "type": "string" },
32+
/// "age": { "type": "integer" }
33+
/// },
34+
/// "required": ["name", "age"]
35+
/// }"#;
36+
///
37+
/// // Generate regex from schema
38+
/// let regex = json_schema::regex_from_str(&schema, None)?;
39+
/// println!("Generated regex: {}", regex);
40+
///
41+
/// // Custom whitespace pattern could be passed as well
42+
/// let whitespace_pattern = Some(r#"[\n ]*"#);
43+
/// let regex = json_schema::regex_from_str(&schema, whitespace_pattern)?;
44+
/// println!("Generated regex with custom whitespace pattern: {}", regex);
45+
///
46+
/// # Ok(())
47+
/// # }
48+
/// ```
49+
pub fn regex_from_str(json: &str, whitespace_pattern: Option<&str>) -> Result<String> {
1150
let json_value: Value = serde_json::from_str(json)?;
12-
to_regex(&json_value, whitespace_pattern)
51+
regex_from_value(&json_value, whitespace_pattern)
1352
}
1453

15-
pub fn to_regex(json: &Value, whitespace_pattern: Option<&str>) -> Result<String> {
54+
/// Generates a regular expression string from a JSON schema given as a `serde_json::Value`.
55+
///
56+
/// # Example
57+
///
58+
/// ```rust
59+
/// # use outlines_core::Error;
60+
/// use serde_json::Value;
61+
/// use outlines_core::prelude::*;
62+
///
63+
/// # fn main() -> Result<(), Error> {
64+
/// // Define a JSON schema
65+
/// let schema = r#"{
66+
/// "type": "object",
67+
/// "properties": {
68+
/// "name": { "type": "string" },
69+
/// "age": { "type": "integer" }
70+
/// },
71+
/// "required": ["name", "age"]
72+
/// }"#;
73+
///
74+
/// // If schema's `Value` was already parsed
75+
/// let schema_value: Value = serde_json::from_str(schema)?;
76+
///
77+
/// // It's possible to generate a regex from schema value
78+
/// let regex = json_schema::regex_from_value(&schema_value, None)?;
79+
/// println!("Generated regex: {}", regex);
80+
///
81+
/// // Custom whitespace pattern could be passed as well
82+
/// let whitespace_pattern = Some(r#"[\n ]*"#);
83+
/// let regex = json_schema::regex_from_value(&schema_value, whitespace_pattern)?;
84+
/// println!("Generated regex with custom whitespace pattern: {}", regex);
85+
///
86+
/// # Ok(())
87+
/// # }
88+
/// ```
89+
pub fn regex_from_value(json: &Value, whitespace_pattern: Option<&str>) -> Result<String> {
1690
let mut parser = parsing::Parser::new(json);
1791
if let Some(pattern) = whitespace_pattern {
1892
parser = parser.with_whitespace_pattern(pattern)
@@ -1001,7 +1075,7 @@ mod tests {
10011075
],
10021076
),
10031077
] {
1004-
let result = build_regex_from_schema(schema, None).expect("To regex failed");
1078+
let result = regex_from_str(schema, None).expect("To regex failed");
10051079
assert_eq!(result, regex, "JSON Schema {} didn't match", schema);
10061080

10071081
let re = Regex::new(&result).expect("Regex failed");
@@ -1057,7 +1131,7 @@ mod tests {
10571131
],
10581132
),
10591133
] {
1060-
let regex = build_regex_from_schema(schema, None).expect("To regex failed");
1134+
let regex = regex_from_str(schema, None).expect("To regex failed");
10611135
let re = Regex::new(&regex).expect("Regex failed");
10621136
for m in a_match {
10631137
should_match(&re, m);
@@ -1110,8 +1184,7 @@ mod tests {
11101184
vec![r#"{SPACE"date"SPACE:SPACE"2018-11-13"SPACE}"#],
11111185
),
11121186
] {
1113-
let regex =
1114-
build_regex_from_schema(schema, whitespace_pattern).expect("To regex failed");
1187+
let regex = regex_from_str(schema, whitespace_pattern).expect("To regex failed");
11151188
assert_eq!(regex, expected_regex);
11161189

11171190
let re = Regex::new(&regex).expect("Regex failed");
@@ -1135,7 +1208,7 @@ mod tests {
11351208
}
11361209
}"##;
11371210

1138-
let regex = build_regex_from_schema(schema, None);
1211+
let regex = regex_from_str(schema, None);
11391212
assert!(regex.is_ok(), "{:?}", regex);
11401213

11411214
// Confirm the depth of 3 recursion levels by default, recursion level starts
@@ -1268,7 +1341,7 @@ mod tests {
12681341
"$ref": "#/definitions/typeA"
12691342
}"##;
12701343

1271-
let regex = build_regex_from_schema(schema, None);
1344+
let regex = regex_from_str(schema, None);
12721345
assert!(regex.is_ok(), "{:?}", regex);
12731346
}
12741347
}

src/json_schema/parsing.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
//! Parser generates a regular expression described by a JSON schema.
2+
13
use std::num::NonZeroU64;
24

35
use regex::escape;

src/json_schema/types.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
1+
//! Static collection of regular expressions for JSON and format types used
2+
//! in generating a regular expression string based on a given JSON schema.
3+
14
// allow `\"`, `\\`, or any character which isn't a control sequence
25
pub static STRING_INNER: &str = r#"([^"\\\x00-\x1F\x7F-\x9F]|\\["\\])"#;
36
pub static STRING: &str = r#""([^"\\\x00-\x1F\x7F-\x9F]|\\["\\])*""#;
4-
57
pub static INTEGER: &str = r#"(-)?(0|[1-9][0-9]*)"#;
68
pub static NUMBER: &str = r#"((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?"#;
79
pub static BOOLEAN: &str = r#"(true|false)"#;
810
pub static NULL: &str = r#"null"#;
911

12+
/// Default whitespace pattern used for generating a regular expression from JSON schema.
1013
pub static WHITESPACE: &str = r#"[ ]?"#;
1114

15+
/// Supported JSON types.
1216
#[derive(Debug, PartialEq)]
1317
pub enum JsonType {
1418
String,
@@ -37,6 +41,7 @@ pub static UUID: &str = r#""[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9
3741
pub static URI: &str = r#"^(https?|ftp):\/\/([^\s:@]+(:[^\s:@]*)?@)?([a-zA-Z\d.-]+\.[a-zA-Z]{2,}|localhost)(:\d+)?(\/[^\s?#]*)?(\?[^\s#]*)?(#[^\s]*)?$|^urn:[a-zA-Z\d][a-zA-Z\d\-]{0,31}:[^\s]+$"#;
3842
pub static EMAIL: &str = r#"^(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])$"#;
3943

44+
/// Supported format types of `JsonType::String`.
4045
#[derive(Debug, PartialEq)]
4146
pub enum FormatType {
4247
DateTime,

src/prelude.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1+
//! Library's interface essentials.
2+
13
pub use tokenizers::FromPretrainedParameters;
24

35
pub use super::{
46
index::Index,
7+
json_schema,
58
primitives::{StateId, Token, TokenId},
69
vocabulary::Vocabulary,
710
};

src/primitives.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
//! Defines fundamental types used throughout the crate.
2+
13
/// Token content.
24
pub type Token = Vec<u8>;
35

src/python_bindings/mod.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
//! Provides tools and interfaces to integrate the crate's functionality with Python.
2+
13
use std::sync::Arc;
24

35
use crate::index::Index;
@@ -332,7 +334,7 @@ pub fn build_regex_from_schema_py(
332334
let json = serde_json::from_str(&json).map_err(|_| {
333335
PyErr::new::<pyo3::exceptions::PyTypeError, _>("Expected a valid JSON string.")
334336
})?;
335-
json_schema::to_regex(&json, whitespace_pattern)
337+
json_schema::regex_from_value(&json, whitespace_pattern)
336338
.map_err(|e| PyValueError::new_err(e.to_string()))
337339
}
338340

0 commit comments

Comments
 (0)