|
| 1 | +//! # Outlines_core |
| 2 | +//! |
| 3 | +//! `outlines_core` crate provides a convenient way to: |
| 4 | +//! |
| 5 | +//! - build regular expressions from JSON schemas |
| 6 | +//! |
| 7 | +//! - construct an [`index::Index`] object by combining a [`vocabulary::Vocabulary`] and regular |
| 8 | +//! expression to efficiently map tokens from a given `Vocabulary` to state transitions in a |
| 9 | +//! finite-state automaton |
| 10 | +//! |
| 11 | +//! ## `json_schema` |
| 12 | +//! |
| 13 | +//! [`json_schema`] module provides interfaces to generate a regular expression based on a given JSON schema, depending on its type: |
| 14 | +//! - [`json_schema::regex_from_str`] |
| 15 | +//! - [`json_schema::regex_from_value`] |
| 16 | +//! |
| 17 | +//! The whitespace pattern can be customized; otherwise the default [`json_schema::WHITESPACE`] pattern is used. |
| 18 | +//! |
| 19 | +//! Note that not all features of JSON schema are supported for regex generation: [Supported Features](json_schema#supported-features) |
| 20 | +//! |
| 21 | +//! ## `Index` |
| 22 | +//! |
| 23 | +//! Once [`index::Index`] is built, it can be used to evaluate or validate token sequences. |
| 24 | +//! |
| 25 | +//! ### Complexity and construction cost |
| 26 | +//! |
| 27 | +//! `Index` can accommodate large vocabularies and complex regular expressions. However, its size **may** grow |
| 28 | +//! significantly with the complexity of the input, as may the time and computational resources needed to build it. |
| 29 | +//! |
| 30 | +//! ## Python bindings |
| 31 | +//! |
| 32 | +//! Additionally, the crate provides interfaces to integrate its functionality with Python. |
| 33 | +//! |
| 34 | +//! ## Support |
| 35 | +//! |
| 36 | +//! `Outlines_core` is primarily used in the structured text generation project [`outlines`](https://github.com/dottxt-ai/outlines). |
| 37 | +//! |
| 38 | +//! ## Example |
| 39 | +//! |
| 40 | +//! Basic example of how it all fits together. |
| 41 | +//! |
| 42 | +//! ```rust |
| 43 | +//! # use outlines_core::Error; |
| 44 | +//! use outlines_core::prelude::*; |
| 45 | +//! |
| 46 | +//! # fn main() -> Result<(), Error> { |
| 47 | +//! // Define a JSON schema |
| 48 | +//! let schema = r#"{ |
| 49 | +//! "type": "object", |
| 50 | +//! "properties": { |
| 51 | +//! "name": { "type": "string" }, |
| 52 | +//! "age": { "type": "integer" } |
| 53 | +//! }, |
| 54 | +//! "required": ["name", "age"] |
| 55 | +//! }"#; |
| 56 | +//! |
| 57 | +//! // Generate a regular expression from it |
| 58 | +//! let regex = json_schema::regex_from_str(&schema, None)?; |
| 59 | +//! println!("Generated regex: {}", regex); |
| 60 | +//! |
| 61 | +//! // Create `Vocabulary` from pretrained large language model (but manually is also possible) |
| 62 | +//! let vocabulary = Vocabulary::from_pretrained("openai-community/gpt2", None); |
| 63 | +//! |
| 64 | +//! // Create new `Index` from regex and a given `Vocabulary` |
| 65 | +//! let index = Index::new(regex, &vocabulary)?; |
| 66 | +//! |
| 67 | +//! let initial_state = index.initial_state(); |
| 68 | +//! println!("Is initial state {} a final state? {}", initial_state, index.is_final_state(&initial_state)); |
| 69 | +//! |
| 70 | +//! let allowed_tokens = index.allowed_tokens(&initial_state).expect("Some allowed tokens"); |
| 71 | +//! println!("Allowed tokens at initial state are {:?}", allowed_tokens); |
| 72 | +//! |
| 73 | +//! let token_id = allowed_tokens.first().expect("First token"); |
| 74 | +//! println!("Next state for the token_id {} is {:?}", token_id, index.next_state(&initial_state, token_id)); |
| 75 | +//! println!("Final states are {:?}", index.final_states()); |
| 76 | +//! println!("Index has exactly {} transitions", index.transitions().len()); |
| 77 | +//! # Ok(()) |
| 78 | +//! } |
| 79 | +//! ``` |
| 80 | +
|
1 | 81 | pub mod error; |
2 | 82 | pub mod index; |
3 | 83 | pub mod json_schema; |
|
0 commit comments