dottxt-ai
diff --git a/‎Cargo.toml‎
Lines changed: 9 additions & 0 deletions b/‎Cargo.toml‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎benchmarks/bench_indexes.py‎
Lines changed: 217 additions & 0 deletions b/‎benchmarks/bench_indexes.py‎
Lines changed: 217 additions & 0 deletions
diff --git a/‎python/outlines_core/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎python/outlines_core/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎python/outlines_core/outlines_core_rs.pyi‎
Lines changed: 4 additions & 3 deletions b/‎python/outlines_core/outlines_core_rs.pyi‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎python/outlines_core/utils.py‎
Lines changed: 57 additions & 0 deletions b/‎python/outlines_core/utils.py‎
Lines changed: 57 additions & 0 deletions
diff --git a/‎rustfmt.toml‎
Lines changed: 1 addition & 0 deletions b/‎rustfmt.toml‎
Lines changed: 1 addition & 0 deletions
@@ -21,9 +21,18 @@ hf-hub = "=0.3.2"
 tokenizers = { version = "=0.20.3", features = ["http"] }
 rustc-hash = "2.1.0"
 regex-automata = "0.4.9"
+smallvec = "1.14.0"
+regex-syntax = "0.8.5"
+rayon = "1.10.0"
+
+[dev-dependencies]
+rand = { version = "0.9.0" }
+
 
 [features]
 python-bindings = ["pyo3", "serde-pyobject"]
+run_benchmarks = []
+
 
 [lib]
 name = "outlines_core"
 
@@ -0,0 +1,217 @@
+# flake8: noqa
+# mypy: ignore-errors
+import os
+import random
+import time
+
+import psutil
+from outlines_core import Guide, Index, Vocabulary, create_mask, mask_to_list
+from outlines_core.json_schema import build_regex_from_schema
+
+os.environ["RUST_LOG"] = "debug"
+
+
+# Regular expressions benchmarked directly (no JSON Schema conversion step).
+# All patterns are raw strings, so every escape must use a SINGLE backslash:
+# a doubled backslash would make the regex require a literal "\" in the input.
+regexes = [
+    {
+        "name": "email",
+        "regex": r"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]{1,63}(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]{1,63}){0,10})@(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.){1,3}[a-z0-9](?:[a-z0-9-]{0,30}[a-z0-9])?",
+    },
+    {"name": "simple_phone", "regex": r"\+?[1-9][0-9]{7,14}"},
+    {
+        "name": "complex_phone",
+        "regex": r"\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}",
+    },
+    {"name": "permissive_any", "regex": r".{255}$"},
+    {"name": "permissive_words", "regex": r"[a-zA-Z]{100}"},
+    {
+        "name": "https",
+        # Same pattern as the JSON-escaped "website" pattern of the curriculum
+        # schema, written here with the JSON escaping removed.
+        "regex": r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?",
+    },
+]
+# JSON Schema documents used as benchmark inputs.
+# NOTE: the "regex" key of each entry holds the schema text itself (a JSON
+# string), not a regular expression; test_benchmark_v2index passes it to
+# build_regex_from_schema to obtain the regex that is actually benchmarked.
+schemas = [
+    {
+        "name": "schema_simple",
+        "regex": r'{"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}}, "required": ["name", "age"]}',
+    },
+    {
+        "name": "schema_simple_phone",
+        "regex": r'{"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}, "complexe_phone": {"type": "string", "pattern": "\\+?\\d{1,4}?[-. ]?\\(\\d{1,3}\\)?[-. ]?\\d{1,4}[-. ]?\\d{1,4}[-. ]?\\d{1,9}"}}, "required": ["name", "age", "complexe_phone"]}',
+    },
+    {
+        "name": "schema_complexe",
+        "regex": """{
+  "$schema": "http://json-schema.org/draft-04/schema#",
+  "title": "Schema for a recording",
+  "type": "object",
+  "definitions": {
+    "artist": {
+      "type": "object",
+      "properties": {
+        "id": {"type": "number"},
+        "name": {"type": "string"},
+        "functions": {
+          "type": "array",
+          "items": {"type": "string"}
+        }
+      },
+      "required": ["id", "name", "functions"]
+    }
+  },
+  "properties": {
+    "id": {"type": "number"},
+    "work": {
+      "type": "object",
+      "properties": {
+        "id": {"type": "number"},
+        "name": {"type": "string"},
+        "composer": {"$ref": "#/definitions/artist"}
+      }
+    },
+    "recording_artists": {
+      "type": "array",
+      "items": {"$ref": "#/definitions/artist"}
+    }
+  },
+  "required": ["id", "work", "recording_artists"]
+}"""
+    },
+    {
+        "name" : "schema_curriculum",
+        "regex" : r'''{
+                "$schema": "http://json-schema.org/draft-04/schema#",
+                "title": "Schema for a Curriculum Vitae",
+                "type": "object",
+                "definitions": {
+                    "experienceEntry": {
+                    "type": "object",
+                    "properties": {
+                        "date": {
+                        "type": "string",
+                        "format": "date"
+                        },
+                        "position": {
+                        "type": "string"
+                        }
+                    },
+                    "required": ["date", "position"]
+                    }
+                },
+                "properties": {
+                    "name": {
+                    "type": "string"
+                    },
+                    "surname": {
+                    "type": "string"
+                    },
+                    "email": {
+                    "type": "string",
+                    "pattern": "[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?"
+                    },
+                    "phone": {
+                    "type": "string",
+                    "pattern": "\\+?\\d{1,4}?[-. ]?\\(\\d{1,3}\\)?[-. ]?\\d{1,4}[-. ]?\\d{1,4}[-. ]?\\d{1,9}"
+                    },
+                    "website": {
+                    "type": "string",
+                    "pattern": "(https?:\\/\\/)?([\\da-z\\.-]+)\\.([a-z\\.]{2,6})([\\/\\w \\.-]*)*\\/?"
+                    },
+                    "resume": {
+                    "type": "array",
+                    "items": {
+                        "$ref": "#/definitions/experienceEntry"
+                    }
+                    }
+                },
+                "required": ["name", "surname", "email", "phone", "resume"]
+                }'''
+    }
+]
+
+
+class V2IndexBenchmark:
+    """Time `Guide` calls along a random walk through an `Index`.
+
+    Call `setup` once with the regex to benchmark, then `run_benchmark`, which
+    repeatedly samples a random allowed token and advances the guide, timing
+    only the `get_tokens` / `advance` calls.
+    """
+
+    def setup(self, regex):
+        """Build the vocabulary, index, guide and mask buffer for `regex`.
+
+        Args:
+            regex: Regular expression the index is built from.
+
+        Raises:
+            AssertionError: If the freshly created guide is already finished.
+        """
+        self.vocab = Vocabulary.from_pretrained("unsloth/Llama-3.1-8B-Instruct")
+        self.v2_index = Index(regex, self.vocab)
+
+        self.v2_guide = Guide(self.v2_index)
+
+        # NOTE(review): the +1 presumably leaves room for the EOS token id,
+        # which len(vocab) may not count -- confirm against the Rust side.
+        self.mask = create_mask(len(self.vocab) + 1)
+
+        self.process = psutil.Process()
+
+        assert (
+            not self.v2_guide.is_finished()
+        ), f"Compressed Guide should not be finished for {regex}"
+
+    def _pick_random_token(self):
+        """Return a random token id among the bits currently set in `self.mask`."""
+        allowed = mask_to_list(self.mask)
+        if not allowed:
+            # Fail with an explicit message instead of randrange's "empty range".
+            raise ValueError("Guide is not finished but the mask allows no token")
+        return random.choice(allowed)
+
+    def run_benchmark(self, max_iterations=2000):
+        """Walk the guide with random allowed tokens and print the timings.
+
+        Args:
+            max_iterations: Safety cap for patterns that accept very long (or
+                unbounded) text; the walk stops once the iteration count
+                exceeds this value.
+
+        Raises:
+            ValueError: If the guide is not finished but no token is allowed.
+        """
+        iterations = 0
+        v2_total_time = 0.0
+
+        self.current_token_id = -1
+
+        if not self.v2_guide.is_finished():
+            iterations += 1
+
+            # Only the guide call is timed. Its return value is ignored: the
+            # allowed tokens are read back from `self.mask` afterwards.
+            start = time.perf_counter()
+            self.v2_guide.get_tokens(self.mask)
+            v2_total_time += time.perf_counter() - start
+
+            self.current_token_id = self._pick_random_token()
+
+        while not self.v2_guide.is_finished():
+            iterations += 1
+
+            start = time.perf_counter()
+            self.v2_guide.advance(self.current_token_id, self.mask)
+            v2_total_time += time.perf_counter() - start
+
+            if self.v2_guide.is_finished() or iterations > max_iterations:
+                break
+            self.current_token_id = self._pick_random_token()
+
+        v2_total_time_us = v2_total_time * 1e6
+        # No iteration runs when the guide is already finished (e.g. a second
+        # call on the same instance); avoid dividing by zero in that case.
+        per_iteration_us = v2_total_time_us / iterations if iterations else 0.0
+
+        print(f"  Total iterations (Number of tokens): {iterations}")
+        print(
+            f"  Guide with Compressed Index: {v2_total_time_us:.2f} µs ({per_iteration_us:.2f} µs per iteration)"
+        )
+
+
+def test_benchmark_v2index():
+    """Run the guide benchmark for every entry of `regexes`, then of `schemas`."""
+
+    def _bench(pattern):
+        # A fresh benchmark object (and vocabulary) is built for every pattern.
+        runner = V2IndexBenchmark()
+        runner.setup(pattern)
+        runner.run_benchmark()
+
+    for entry in regexes:
+        name, pattern = entry["name"], entry["regex"]
+        print(f"> Regex : '{name}'")
+        _bench(pattern)
+
+    for entry in schemas:
+        # For schema entries the "regex" key holds JSON Schema text, which is
+        # converted to a regex before benchmarking.
+        name, schema_text = entry["name"], entry["regex"]
+        pattern = build_regex_from_schema(schema_text, None)
+        print(f"> Schema : '{name}'")
+        _bench(pattern)
+
+
+# Script entry point: lets the benchmark be run directly with the interpreter
+# instead of through a test runner.
+if __name__ == "__main__":
+    print("Running main...")
+    test_benchmark_v2index()
@@ -2,6 +2,7 @@
 from importlib.metadata import PackageNotFoundError, version
 
 from .outlines_core_rs import Guide, Index, Vocabulary
+from .utils import create_mask, first_token_id_from_mask, mask_to_list
 
 try:
     __version__ = version("outlines_core")
 
@@ -1,4 +1,5 @@
 from typing import Dict, List, Optional, Set, Tuple, Union
+import array
 
 def build_regex_from_schema(
     json_schema: str, whitespace_pattern: Optional[str] = None
@@ -26,10 +27,10 @@ class Guide:
     def get_state(self) -> int:
         """Retrieves current state id of the Guide."""
         ...
-    def get_tokens(self) -> List[int]:
+    def get_tokens(self, mask:Optional[array.array]) -> List[int]:
         """Gets the list of allowed tokens for the current state."""
         ...
-    def advance(self, token_id: int) -> List[int]:
+    def advance(self, token_id: int, mask: Optional[array.array]) -> List[int]:
         """Guide moves to the next state provided by the token id and returns a list of allowed tokens."""
         ...
     def is_finished(self) -> bool:
@@ -86,7 +87,7 @@ class Index:
     def __init__(self, regex: str, vocabulary: "Vocabulary"):
         """Creates an index from a regex and vocabulary."""
         ...
-    def get_allowed_tokens(self, state: int) -> Optional[List[int]]:
+    def get_allowed_tokens(self, state: int, mask: Optional[array.array]) -> Optional[List[int]]:
         """Returns allowed tokens in this state."""
         ...
     def get_next_state(self, state: int, token_id: int) -> Optional[int]:
 
@@ -0,0 +1,57 @@
+import array
+from typing import List
+
+
+def mask_to_list(mask_buffer: array.array) -> List[int]:
+    """
+    Lists the token IDs whose bit is set to 1 in a mask buffer.
+
+    Every element of the buffer is read as a 64-bit word; token ID `i` is bit
+    `i % 64` (counted from the least significant bit) of word `i // 64`.
+
+    Args:
+        mask_buffer:  array.array containing the mask bits.
+
+    Returns:
+        List[int]: Token IDs of all set bits, in increasing order.
+    """
+    bits_per_word = 64
+    token_ids: List[int] = []
+    for position, chunk in enumerate(mask_buffer):
+        offset = position * bits_per_word
+        token_ids.extend(
+            offset + shift for shift in range(bits_per_word) if (chunk >> shift) & 1
+        )
+    return token_ids
+
+
+def create_mask(size: int) -> array.array:
+    """
+    Creates a zero-initialized mask buffer able to hold a given number of bits.
+
+    Args:
+        size (int): The number of bits the mask should represent (e.g., vocab_size).
+
+    Returns:
+        array.array: An array of typecode "Q" (unsigned 64-bit words), all zero.
+                     Each word holds 64 bits, so the length is ceil(size / 64).
+
+    Raises:
+        ValueError: If size is not positive.
+    """
+    if size <= 0:
+        raise ValueError("Mask size must be positive")
+    u64_size = (size + 63) // 64
+    # Repeat a one-word array rather than building a temporary Python list.
+    return array.array("Q", [0]) * u64_size
+
+
+def first_token_id_from_mask(mask_buffer: array.array) -> int:
+    """
+    Returns the smallest token ID whose bit is set to 1 in the mask.
+
+    Uses the same layout as `mask_to_list`: token ID `i` is bit `i % 64`
+    (counted from the least significant bit) of word `i // 64`. Working on the
+    words themselves, rather than on their raw bytes, keeps the result
+    independent of the platform's byte order.
+
+    Args:
+        mask_buffer:  array.array containing the mask bits.
+
+    Returns:
+        int: The lowest token ID set in the mask, or -1 if no bit is set.
+    """
+    for word_idx, word in enumerate(mask_buffer):
+        if word:
+            # `word & -word` isolates the lowest set bit; its bit_length minus
+            # one is that bit's index inside the word.
+            return word_idx * 64 + (word & -word).bit_length() - 1
+
+    return -1
@@ -2,3 +2,4 @@ group_imports = "StdExternalCrate"
 imports_granularity = "Module"
 reorder_impl_items = true
 reorder_imports = true
+