 //! Creates `Vocabulary` manually or from a pretrained large language model.
 
 use bincode::{Decode, Encode};
+#[cfg(feature = "hugginface-hub")]
 use locator::{HFLocator, Locator};
+#[cfg(feature = "hugginface-hub")]
 use processor::TokenProcessor;
 use rustc_hash::FxHashMap as HashMap;
+#[cfg(feature = "hugginface-hub")]
 use tokenizers::normalizers::Sequence;
+#[cfg(feature = "hugginface-hub")]
 use tokenizers::{NormalizerWrapper, Tokenizer};
 
 use crate::prelude::*;
 use crate::{Error, Result};
 
+#[cfg(feature = "hugginface-hub")]
 mod locator;
+#[cfg(feature = "hugginface-hub")]
 mod processor;
 
 /// `Vocabulary` of a large language model.
 ///
 /// ## Examples
 ///
+#[cfg_attr(
+    feature = "hugginface-hub",
+    doc = r##"
 /// ### Create a vocabulary from a pretrained model.
 /// ```rust
 /// use outlines_core::prelude::*;
@@ -51,6 +60,8 @@ mod processor;
 /// vocabulary.remove("token");
 /// assert_eq!(vocabulary.token_ids("token"), None);
 /// ```
+"##
+)]
 #[derive(Clone, Debug, Default, PartialEq, Encode, Decode)]
 pub struct Vocabulary {
     eos_token_id: TokenId,
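A note on the `cfg_attr` trick in the hunk above: the doctest calls `from_pretrained`, which only exists when the `hugginface-hub` feature is enabled, so the example is attached through `doc = r##"..."##` only under that feature and is skipped otherwise. A minimal sketch of the same pattern, with a hypothetical `demo` feature, `my_crate` name, and `answer` function:

/// Returns the canonical answer.
#[cfg_attr(
    feature = "demo",
    doc = r##"
```rust
// Compiled and run as a doctest only when the `demo` feature is on.
assert_eq!(my_crate::answer(), 42);
```
"##
)]
pub fn answer() -> u32 {
    42
}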
@@ -67,6 +78,7 @@ impl Vocabulary {
     }
 
     /// Creates the vocabulary of a pre-trained model from the Hugging Face Hub.
+    #[cfg(feature = "hugginface-hub")]
     pub fn from_pretrained(
         model: &str,
         parameters: Option<FromPretrainedParameters>,
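With this gate in place, `from_pretrained` disappears from the public API unless the feature is enabled, so downstream crates have to opt in. A hedged sketch of consumer code, assuming `Vocabulary` is re-exported through the prelude as in the doctest above and that the crate's `Error` implements `std::error::Error`; the version number is a placeholder:

// Cargo.toml of the consumer (version placeholder):
// [dependencies]
// outlines-core = { version = "*", features = ["hugginface-hub"] }

use outlines_core::prelude::*;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Fetches tokenizer data from the Hugging Face Hub on first call.
    let vocabulary = Vocabulary::from_pretrained("openai-community/gpt2", None)?;
    println!("{vocabulary:?}");
    Ok(())
}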
@@ -76,6 +88,7 @@ impl Vocabulary {
 
     #[doc(hidden)]
     #[inline(always)]
+    #[cfg(feature = "hugginface-hub")]
     fn from_pretrained_with_locator<L: Locator>(
         model: &str,
         parameters: Option<FromPretrainedParameters>,
@@ -158,6 +171,7 @@ impl Vocabulary {
     }
 
     /// Filters out `Prepend`-type normalizers from the tokenizer.
+    #[cfg(feature = "hugginface-hub")]
     fn filter_prepend_normalizers(tokenizer: &mut Tokenizer) {
         // The main concern is prepend normalizers, for example https://github.com/google/sentencepiece
         // In the `sentencepiece` tokenizer, `▁` is used to denote spaces in the source text,
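The comment captures the motivation: sentencepiece-style tokenizers carry a `Prepend` normalizer that maps spaces to `▁`, so normalizing a bare token adds a leading `▁` that plain vocabulary keys do not have. A small illustration of that behavior using the `tokenizers` crate directly; this is my reading of its API, not code from this commit:

use tokenizers::normalizers::Prepend;
use tokenizers::{NormalizedString, Normalizer};

fn main() -> tokenizers::Result<()> {
    let prepend = Prepend::new("▁".to_string());
    let mut normalized = NormalizedString::from("token");
    prepend.normalize(&mut normalized)?;
    // The lone token gained a `▁` prefix, so a lookup against the raw
    // token text would miss; hence such normalizers get filtered out.
    assert_eq!(normalized.get(), "▁token");
    Ok(())
}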
@@ -248,8 +262,6 @@ impl TryFrom<(TokenId, HashMap<String, Vec<TokenId>>)> for Vocabulary {
 
 #[cfg(test)]
 mod tests {
-    use rustc_hash::FxHashSet as HashSet;
-
     use super::*;
 
     #[test]
@@ -305,6 +317,7 @@ mod tests {
         assert!(vocabulary.tokens.is_empty());
     }
 
+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn supported_pretrained_models() {
         // Support is expected for these:
@@ -332,6 +345,7 @@ mod tests {
         }
     }
 
+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn pretrained_from_gpt2() {
         let model = "openai-community/gpt2";
@@ -363,8 +377,11 @@ mod tests {
         }
     }
 
+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn pretrained_from_llama() {
+        use rustc_hash::FxHashSet as HashSet;
+
         let model = "hf-internal-testing/llama-tokenizer";
         let tokenizer = Tokenizer::from_pretrained(model, None).expect("Tokenizer failed");
         let vocabulary = Vocabulary::from_pretrained(model, None).expect("Vocabulary failed");
@@ -405,6 +422,7 @@ mod tests {
         }
     }
 
+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn token_processor_error() {
         let model = "hf-internal-testing/tiny-random-XLMRobertaXLForCausalLM";
@@ -419,6 +437,7 @@ mod tests {
         }
     }
 
+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn tokenizer_error() {
         let model = "hf-internal-testing/some-non-existent-model";
@@ -430,7 +449,9 @@ mod tests {
         }
     }
 
+    #[cfg(feature = "hugginface-hub")]
     struct NoneLocator;
+    #[cfg(feature = "hugginface-hub")]
     impl Locator for NoneLocator {
         fn locate_eos_token_id(
             _model: &str,
@@ -441,6 +462,7 @@ mod tests {
         }
     }
 
+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn unable_to_locate_eos_token_id_error() {
         let model = "hf-internal-testing/tiny-random-XLMRobertaXLForCausalLM";
@@ -456,6 +478,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(feature = "hugginface-hub")]
     fn prepend_normalizers_filtered_out() {
         use tokenizers::normalizers::{Prepend, Sequence};
 
@@ -488,6 +511,7 @@ mod tests {
     }
 
     #[test]
+    #[cfg(feature = "hugginface-hub")]
     fn other_normalizers_being_kept() {
         use tokenizers::normalizers::BertNormalizer;
 