
Commit ec3fedb

Apply suggestions from CR

1 parent e4d9e1a · commit ec3fedb

6 files changed: +36 −27 lines


python/outlines_core/fsm/outlines_core_rs.pyi

Lines changed: 3 additions & 1 deletion
@@ -1,6 +1,8 @@
 from typing import Dict, List, Optional, Set, Tuple, Union

-def build_regex_from_schema(json: str, whitespace_pattern: Optional[str] = None) -> str:
+def build_regex_from_schema(
+    json_schema: str, whitespace_pattern: Optional[str] = None
+) -> str:
     """Creates regex string from JSON schema with optional whitespace pattern."""
     ...
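For reference, a minimal usage sketch against the updated stub; hedged, since it assumes the function is exposed as outlines_core.fsm.json_schema.build_regex_from_schema (the import path used by the tests below) and uses a made-up schema:

    import json

    from outlines_core.fsm.json_schema import build_regex_from_schema

    # Hypothetical schema, purely for illustration.
    schema = json.dumps({"type": "object", "properties": {"name": {"type": "string"}}})

    # The first parameter is now named `json_schema` (previously `json`),
    # so it can be passed by keyword without shadowing the json module.
    regex = build_regex_from_schema(json_schema=schema)
    print(regex)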

src/json_schema/types.rs

Lines changed: 5 additions & 1 deletion
@@ -9,7 +9,11 @@ pub static NUMBER: &str = r#"((-)?(0|[1-9][0-9]*))(\.[0-9]+)?([eE][+-][0-9]+)?"#
 pub static BOOLEAN: &str = r#"(true|false)"#;
 pub static NULL: &str = r#"null"#;

-/// Default whitespace pattern used for generation a regular expression from JSON schema.
+/// Default whitespace pattern used for generating a regular expression from JSON schema.
+///
+/// It's being imposed since letting the model choose the number of white spaces and
+/// new lines led to pathological behaviors, especially for small models,
+/// see [example](https://github.com/dottxt-ai/outlines/issues/484)
 pub static WHITESPACE: &str = r#"[ ]?"#;

 /// Supported JSON types.
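A small sketch of what that default means from the Python side; hedged, since it reuses the assumed import path above and an illustrative schema, and only shows that whitespace_pattern overrides the `[ ]?` default per call:

    from outlines_core.fsm.json_schema import build_regex_from_schema

    schema = '{"type": "object", "properties": {"count": {"type": "integer"}}}'

    # Default: structural whitespace in the generated regex is limited to
    # at most one space, i.e. the "[ ]?" pattern defined above.
    default_regex = build_regex_from_schema(schema)

    # Per-call override: allow an optional space or newline instead
    # (illustrative pattern, not a recommendation).
    relaxed_regex = build_regex_from_schema(schema, whitespace_pattern=r"[\n ]?")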

src/lib.rs

Lines changed: 4 additions & 2 deletions
@@ -34,6 +34,8 @@
 //! ## Support
 //!
 //! `Outlines_core` is primarily used in structured text generation project [`outlines`](https://github.com/dottxt-ai/outlines),
+//! if you need support, consider reaching out to its maintainers, you can also open an issue or start a discussion
+//! on [github](https://github.com/dottxt-ai/outlines-core)
 //!
 //! ## Example
 //!
@@ -59,10 +61,10 @@
 //! println!("Generated regex: {}", regex);
 //!
 //! // Create `Vocabulary` from pretrained large language model (but manually is also possible)
-//! let vocabulary = Vocabulary::from_pretrained("openai-community/gpt2", None);
+//! let vocabulary = Vocabulary::from_pretrained("openai-community/gpt2", None)?;
 //!
 //! // Create new `Index` from regex and a given `Vocabulary`
-//! let index = Index::new(regex, &vocabulary)?;
+//! let index = Index::new(&regex, &vocabulary)?;
 //!
 //! let initial_state = index.initial_state();
 //! println!("Is initial state {} a final state? {}", initial_state, index.is_final_state(&initial_state));

src/python_bindings/mod.rs

Lines changed: 4 additions & 4 deletions
@@ -326,15 +326,15 @@ impl PyVocabulary {
 }

 #[pyfunction(name = "build_regex_from_schema")]
-#[pyo3(signature = (json, whitespace_pattern=None))]
+#[pyo3(signature = (json_schema, whitespace_pattern=None))]
 pub fn build_regex_from_schema_py(
-    json: String,
+    json_schema: String,
     whitespace_pattern: Option<&str>,
 ) -> PyResult<String> {
-    let json = serde_json::from_str(&json).map_err(|_| {
+    let value = serde_json::from_str(&json_schema).map_err(|_| {
         PyErr::new::<pyo3::exceptions::PyTypeError, _>("Expected a valid JSON string.")
     })?;
-    json_schema::regex_from_value(&json, whitespace_pattern)
+    json_schema::regex_from_value(&value, whitespace_pattern)
         .map_err(|e| PyValueError::new_err(e.to_string()))
 }
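The binding above maps JSON parse failures to TypeError and regex-construction failures from regex_from_value to ValueError. A hedged sketch of the first path, assuming the Python-level build_regex_from_schema re-exports this binding unchanged:

    import pytest

    from outlines_core.fsm.json_schema import build_regex_from_schema

    # Malformed JSON never reaches regex construction: serde_json fails to
    # parse it, and the binding raises a TypeError with the message above.
    with pytest.raises(TypeError, match="Expected a valid JSON string."):
        build_regex_from_schema("{'name':")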

src/vocabulary/mod.rs

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-//! Creates `Vocabulary` from pretrained large language model.
+//! Creates `Vocabulary` manually or from pretrained large language model.

 use bincode::{Decode, Encode};
 use rustc_hash::FxHashMap as HashMap;
@@ -15,7 +15,7 @@ use processor::TokenProcessor;
 mod locator;
 mod processor;

-/// Creates `Vocabulary` manually or from defined large language model.
+/// `Vocabulary` of large language model.
 ///
 /// ## Examples
 ///

tests/fsm/test_json_schema.py

Lines changed: 18 additions & 17 deletions
@@ -2,22 +2,7 @@
 import re

 import pytest
-from outlines_core.fsm.json_schema import (  # noqa: F401
-    BOOLEAN,
-    DATE,
-    DATE_TIME,
-    EMAIL,
-    INTEGER,
-    NULL,
-    NUMBER,
-    STRING,
-    STRING_INNER,
-    TIME,
-    URI,
-    UUID,
-    WHITESPACE,
-    build_regex_from_schema,
-)
+from outlines_core.fsm.json_schema import build_regex_from_schema
 from pydantic import BaseModel


@@ -46,7 +31,23 @@ def test_invalid_json():
         build_regex_from_schema("{'name':")


-def test_types_presence():
+def test_types_presence_and_not_emptyness():
+    from outlines_core.fsm.json_schema import (
+        BOOLEAN,
+        DATE,
+        DATE_TIME,
+        EMAIL,
+        INTEGER,
+        NULL,
+        NUMBER,
+        STRING,
+        STRING_INNER,
+        TIME,
+        URI,
+        UUID,
+        WHITESPACE,
+    )
+
     assert BOOLEAN
     assert DATE
     assert DATE_TIME
