2626import os
2727import platform
2828import sys
29+ import tempfile
2930import threading
3031from dataclasses import dataclass , field
3132from enum import Enum
@@ -364,16 +365,23 @@ class CrayonVocab:
364365 "_hardware_info" ,
365366 )
366367
367- def __init__ (self , device : DeviceType = "auto" ) -> None :
368+ def __init__ (
369+ self ,
370+ vocab_list : Optional [List [str ]] = None ,
371+ device : DeviceType = "auto" ,
372+ unk_token : str = "<UNK>"
373+ ) -> None :
368374 """
369375 Initialize the tokenizer engine.
370376
371377 Args:
378+ vocab_list: Optional list of strings to build an ad-hoc vocabulary.
372379 device: Device selection mode.
373380 - "auto": Detects GPU. If available, uses it. Else CPU.
374381 - "cpu": Forces AVX2/AVX-512 CPU backend (best for latency).
375382 - "cuda": Forces NVIDIA GPU backend (best for batch throughput).
376383 - "rocm": Forces AMD GPU backend (best for batch throughput).
384+ unk_token: String to use as the unknown token placeholder.
377385
378386 Raises:
379387 ImportError: If the CPU backend extension is not available.
@@ -395,6 +403,11 @@ def __init__(self, device: DeviceType = "auto") -> None:
395403 self ._idx_to_str : List [str ] = []
396404 self .current_profile_path : Optional [str ] = None
397405 self ._profile_loaded : bool = False
406+ self ._temp_dat_path : Optional [str ] = None
407+
408+ # Public properties for test compatibility
409+ self .unk_token = unk_token
410+ self .unk_token_id = 1 # Hardware convention in Crayon v2
398411
399412 # Device state
400413 self ._requested_device : DeviceType = device
@@ -413,6 +426,10 @@ def __init__(self, device: DeviceType = "auto") -> None:
413426 # --- Resolve and Initialize Device ---
414427 self .device = self ._resolve_device (device )
415428 self ._init_selected_backend ()
429+
430+ # --- Load ad-hoc vocab if provided ---
431+ if vocab_list :
432+ self .load_from_list (vocab_list )
416433
417434 def _load_cpu_backend (self ) -> None :
418435 """Load the CPU extension (required as fallback for all modes)."""
@@ -610,6 +627,49 @@ def _resolve_profile_path(self, name_or_path: str) -> str:
610627 f"You can specify the full path or set CRAYON_PROFILE_DIR environment variable."
611628 )
612629
@property
def id_to_token(self) -> List[str]:
    """Expose the ID-to-token table (kept for compatibility).

    Returns:
        The list held in ``_idx_to_str``. The same list object is
        returned, not a copy.
    """
    mapping = self._idx_to_str
    return mapping
634+
def __len__(self) -> int:
    """Count the tokens in the active vocabulary.

    Returns:
        The number of entries in the ID-to-token list.
    """
    token_table = self._idx_to_str
    return len(token_table)
638+
def __contains__(self, token: str) -> bool:
    """Report whether ``token`` is present in the active vocabulary.

    Membership is decided by a linear search of the ID-to-token list,
    so the cost grows with the vocabulary size.
    """
    try:
        self._idx_to_str.index(token)
    except ValueError:
        # list.index signals "not found" with ValueError.
        return False
    return True
642+
def load_from_list(self, vocab: List[str]) -> None:
    """Build a temporary DAT profile from ``vocab`` and load it.

    The tokens are compiled into a freshly created ``.dat`` temp file with
    ``crayon_compiler.compile_dat`` and then loaded via ``load_profile``.
    On success the ID-to-token list is replaced with a copy of ``vocab``
    and the temp file path is remembered in ``_temp_dat_path``.

    Args:
        vocab: Token strings that make up the ad-hoc vocabulary.

    Raises:
        ImportError: If the ``crayon_compiler`` extension cannot be imported.
        RuntimeError: If compiling or loading the temporary profile fails.
            The underlying exception is attached as ``__cause__``.
    """
    try:
        from ..c_ext import crayon_compiler
    except ImportError as err:
        raise ImportError(
            "crayon_compiler extension required for load_from_list()"
        ) from err

    # Snapshot once so the compiler and the ID table see the same tokens,
    # even if the caller handed in a one-shot iterable.
    tokens = list(vocab)

    with self._lock:
        # NOTE(review): load_profile() runs while self._lock is held; this
        # presumes the lock is reentrant or that load_profile() does not
        # acquire it -- confirm.
        fd, path = tempfile.mkstemp(suffix=".dat")
        # Only the path is needed; the compiler writes the file itself.
        os.close(fd)

        try:
            crayon_compiler.compile_dat(tokens, path)
            self.load_profile(path)

            # The temp profile has no companion .json, so the ID-to-token
            # table is filled in here rather than by load_profile().
            # NOTE(review): assumes compile_dat assigns IDs in list order
            # -- confirm against the compiler.
            self._idx_to_str = tokens
            self._temp_dat_path = path
        except Exception as e:
            # Best-effort cleanup: a failing unlink (file already gone, or
            # still in use) must not hide the error that got us here.
            try:
                os.unlink(path)
            except OSError:
                pass
            raise RuntimeError(f"Failed to build ad-hoc vocabulary: {e}") from e
672+
613673 def _close_profile_handles (self ) -> None :
614674 """Safely close any open file handles."""
615675 if self ._dat_mem_ref is not None :
@@ -625,6 +685,14 @@ def _close_profile_handles(self) -> None:
625685 except Exception :
626686 pass
627687 self ._dat_file_ref = None
688+
689+ # Clean up temporary DAT if exists
690+ if hasattr (self , '_temp_dat_path' ) and self ._temp_dat_path and os .path .exists (self ._temp_dat_path ):
691+ try :
692+ os .unlink (self ._temp_dat_path )
693+ except Exception :
694+ pass
695+ self ._temp_dat_path = None
628696
629697 def close (self ) -> None :
630698 """Release all resources and close file handles."""
0 commit comments