
Commit ec3fedb

Apply suggestions from CR

1 parent e4d9e1a · commit ec3fedb

6 files changed: +36 −27 lines


python/outlines_core/fsm/outlines_core_rs.pyi

Lines changed: 3 additions & 1 deletion
@@ -1,6 +1,8 @@
 from typing import Dict, List, Optional, Set, Tuple, Union

-def build_regex_from_schema(json: str, whitespace_pattern: Optional[str] = None) -> str:
+def build_regex_from_schema(
+    json_schema: str, whitespace_pattern: Optional[str] = None
+) -> str:
     """Creates regex string from JSON schema with optional whitespace pattern."""
     ...
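For reference, a minimal usage sketch against the updated stub; hedged, since it assumes the function is exposed as outlines_core.fsm.json_schema.build_regex_from_schema (the import path used by the tests below) and uses a made-up schema:

    import json

    from outlines_core.fsm.json_schema import build_regex_from_schema

    # Hypothetical schema, purely for illustration.
    schema = json.dumps({"type": "object", "properties": {"name": {"type": "string"}}})

    # The first parameter is now named `json_schema` (previously `json`),
    # so it can be passed by keyword without shadowing the json module.
    regex = build_regex_from_schema(json_schema=schema)
    print(regex)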

src/json_schema/types.rs

Lines changed: 5 additions & 1 deletion
@@ -9,7 +9,11 @@ pub static NUMBER: &str = r#"((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?"#
 pub static BOOLEAN: &str = r#"(true|false)"#;
 pub static NULL: &str = r#"null"#;

-/// Default whitespace pattern used for generation a regular expression from JSON schema.
+/// Default whitespace pattern used for generating a regular expression from JSON schema.
+///
+/// It's being imposed since letting the model choose the number of white spaces and
+/// new lines led to pathological behaviors, especially for small models,
+/// see [example](https://github.com/dottxt-ai/outlines/issues/484)
 pub static WHITESPACE: &str = r#"[ ]?"#;

 /// Supported JSON types.
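A small sketch of what that default means from the Python side; hedged, since it reuses the assumed import path above and an illustrative schema, and only shows that whitespace_pattern overrides the `[ ]?` default per call:

    from outlines_core.fsm.json_schema import build_regex_from_schema

    schema = '{"type": "object", "properties": {"count": {"type": "integer"}}}'

    # Default: structural whitespace in the generated regex is limited to
    # at most one space, i.e. the "[ ]?" pattern defined above.
    default_regex = build_regex_from_schema(schema)

    # Per-call override: allow an optional space or newline instead
    # (illustrative pattern, not a recommendation).
    relaxed_regex = build_regex_from_schema(schema, whitespace_pattern=r"[\n ]?")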

src/lib.rs

Lines changed: 4 additions & 2 deletions
@@ -34,6 +34,8 @@
 //! ## Support
 //!
 //! `Outlines_core` is primarily used in structured text generation project [`outlines`](https://github.com/dottxt-ai/outlines),
+//! if you need support, consider reaching out to its maintainers, you can also open an issue or start a discussion
+//! on [github](https://github.com/dottxt-ai/outlines-core)
 //!
 //! ## Example
 //!
@@ -59,10 +61,10 @@
 //! println!("Generated regex: {}", regex);
 //!
 //! // Create `Vocabulary` from pretrained large language model (but manually is also possible)
-//! let vocabulary = Vocabulary::from_pretrained("openai-community/gpt2", None);
+//! let vocabulary = Vocabulary::from_pretrained("openai-community/gpt2", None)?;
 //!
 //! // Create new `Index` from regex and a given `Vocabulary`
-//! let index = Index::new(regex, &vocabulary)?;
+//! let index = Index::new(&regex, &vocabulary)?;
 //!
 //! let initial_state = index.initial_state();
 //! println!("Is initial state {} a final state? {}", initial_state, index.is_final_state(&initial_state));

src/python_bindings/mod.rs

Lines changed: 4 additions & 4 deletions
@@ -326,15 +326,15 @@ impl PyVocabulary {
 }

 #[pyfunction(name = "build_regex_from_schema")]
-#[pyo3(signature = (json, whitespace_pattern=None))]
+#[pyo3(signature = (json_schema, whitespace_pattern=None))]
 pub fn build_regex_from_schema_py(
-    json: String,
+    json_schema: String,
     whitespace_pattern: Option<&str>,
 ) -> PyResult<String> {
-    let json = serde_json::from_str(&json).map_err(|_| {
+    let value = serde_json::from_str(&json_schema).map_err(|_| {
         PyErr::new::<pyo3::exceptions::PyTypeError, _>("Expected a valid JSON string.")
     })?;
-    json_schema::regex_from_value(&json, whitespace_pattern)
+    json_schema::regex_from_value(&value, whitespace_pattern)
         .map_err(|e| PyValueError::new_err(e.to_string()))
 }
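The binding above maps JSON parse failures to TypeError and regex-construction failures from regex_from_value to ValueError. A hedged sketch of the first path, assuming the Python-level build_regex_from_schema re-exports this binding unchanged:

    import pytest

    from outlines_core.fsm.json_schema import build_regex_from_schema

    # Malformed JSON never reaches regex construction: serde_json fails to
    # parse it, and the binding raises a TypeError with the message above.
    with pytest.raises(TypeError, match="Expected a valid JSON string."):
        build_regex_from_schema("{'name':")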

src/vocabulary/mod.rs

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-//! Creates `Vocabulary` from pretrained large language model.
+//! Creates `Vocabulary` manually or from pretrained large language model.

 use bincode::{Decode, Encode};
 use rustc_hash::FxHashMap as HashMap;
@@ -15,7 +15,7 @@ use processor::TokenProcessor;
 mod locator;
 mod processor;

-/// Creates `Vocabulary` manually or from defined large language model.
+/// `Vocabulary` of large language model.
 ///
 /// ## Examples
 ///

tests/fsm/test_json_schema.py

Lines changed: 18 additions & 17 deletions
@@ -2,22 +2,7 @@
 import re

 import pytest
-from outlines_core.fsm.json_schema import (  # noqa: F401
-    BOOLEAN,
-    DATE,
-    DATE_TIME,
-    EMAIL,
-    INTEGER,
-    NULL,
-    NUMBER,
-    STRING,
-    STRING_INNER,
-    TIME,
-    URI,
-    UUID,
-    WHITESPACE,
-    build_regex_from_schema,
-)
+from outlines_core.fsm.json_schema import build_regex_from_schema
 from pydantic import BaseModel


@@ -46,7 +31,23 @@ def test_invalid_json():
         build_regex_from_schema("{'name':")


-def test_types_presence():
+def test_types_presence_and_not_emptyness():
+    from outlines_core.fsm.json_schema import (
+        BOOLEAN,
+        DATE,
+        DATE_TIME,
+        EMAIL,
+        INTEGER,
+        NULL,
+        NUMBER,
+        STRING,
+        STRING_INNER,
+        TIME,
+        URI,
+        UUID,
+        WHITESPACE,
+    )
+
     assert BOOLEAN
     assert DATE
     assert DATE_TIME
