@@ -31,7 +31,7 @@ defmodule Tokenizers.Tokenizer do
31
31
@ doc """
32
32
Instantiate a new tokenizer from an existing file on the Hugging Face Hub.
33
33
34
- This is going to download a tokenizer file, save to a file and load that file.
34
+ This is going to download a tokenizer file, save it to disk and load that file.
35
35
36
36
## Options
37
37
@@ -48,10 +48,23 @@ defmodule Tokenizers.Tokenizer do
48
48
* `:revision` - The revision name that should be used for fetching the tokenizers
49
49
from Hugging Face.
50
50
51
+ * `:use_cache` - Whether to read from the cache when the file already exists.
52
+ Defaults to `true`.
53
+
54
+ * `:cache_dir` - The directory where the cache is saved. Files are written to the cache
55
+ even if `:use_cache` is `false`. By default it uses `:filename.basedir/3` to get
56
+ a cache dir based on the "tokenizers_elixir" application name.
57
+
51
58
"""
52
59
@ spec from_pretrained ( String . t ( ) , Keyword . t ( ) ) :: { :ok , Tokenizer . t ( ) } | { :error , term ( ) }
53
60
def from_pretrained ( identifier , opts \\ [ ] ) do
54
- opts = Keyword . validate! ( opts , revision: "main" , http_client: { Tokenizers.HTTPClient , [ ] } )
61
+ opts =
62
+ Keyword . validate! ( opts ,
63
+ revision: "main" ,
64
+ use_cache: true ,
65
+ cache_dir: :filename . basedir ( :user_cache , "tokenizers_elixir" ) ,
66
+ http_client: { Tokenizers.HTTPClient , [ ] }
67
+ )
55
68
56
69
{ http_client , http_opts } = opts [ :http_client ]
57
70
@@ -68,17 +81,53 @@ defmodule Tokenizers.Tokenizer do
68
81
|> Keyword . put ( :method , :get )
69
82
|> Keyword . update ( :headers , headers , fn existing -> existing ++ headers end )
70
83
84
+ cache_dir = opts [ :cache_dir ]
85
+
86
+ file_path_fun = fn etag ->
87
+ Path . join ( cache_dir , entry_filename ( url , etag ) )
88
+ end
89
+
90
+ if opts [ :use_cache ] do
91
+ with { :ok , response } <- request ( http_client , Keyword . put ( http_opts , :method , :head ) ) do
92
+ etag = fetch_etag ( response . headers )
93
+ file_path = file_path_fun . ( etag )
94
+
95
+ if File . exists? ( file_path ) do
96
+ from_file ( file_path )
97
+ else
98
+ with { :ok , response } <- request ( http_client , http_opts ) do
99
+ File . mkdir_p! ( cache_dir )
100
+ File . write! ( file_path , response . body )
101
+
102
+ from_file ( file_path )
103
+ end
104
+ end
105
+ end
106
+ else
107
+ with { :ok , response } <- request ( http_client , http_opts ) do
108
+ etag = fetch_etag ( response . headers )
109
+ file_path = file_path_fun . ( etag )
110
+
111
+ File . mkdir_p! ( cache_dir )
112
+ File . write! ( file_path , response . body )
113
+
114
+ from_file ( file_path )
115
+ end
116
+ end
117
+ end
118
+
119
# Extracts the value of the `"etag"` response header.
#
# Raises (via `List.keyfind!/3`) when the header is absent.
# NOTE(review): the lookup is case-sensitive; this assumes the HTTP
# client returns lowercase header names — confirm against
# `Tokenizers.HTTPClient`.
defp fetch_etag(headers) do
  headers
  |> List.keyfind!("etag", 0)
  |> elem(1)
end
124
+
125
+ defp request ( http_client , http_opts ) do
71
126
case http_client . request ( http_opts ) do
72
127
{ :ok , response } ->
73
128
case response . status do
74
129
status when status in 200 .. 299 ->
75
- cache_dir = :filename . basedir ( :user_cache , "tokenizers_elixir" )
76
- :ok = File . mkdir_p ( cache_dir )
77
- file_path = Path . join ( cache_dir , "#{ identifier } .json" )
78
-
79
- :ok = File . write ( file_path , response . body )
80
-
81
- from_file ( file_path )
130
+ { :ok , response }
82
131
83
132
404 ->
84
133
{ :error , :not_found }
@@ -93,6 +142,18 @@ defmodule Tokenizers.Tokenizer do
93
142
end
94
143
end
95
144
145
# Builds the cache entry filename for a given URL/ETag pair:
# `"<base32(md5(url))>.<base32(etag)>"`.
defp entry_filename(url, etag) do
  Enum.join([encode_url(url), encode_etag(etag)], ".")
end
148
+
149
# Hashes the URL with MD5 and encodes the digest as lowercase,
# unpadded Base32, yielding a short filesystem-safe name component.
defp encode_url(url) do
  Base.encode32(:erlang.md5(url), case: :lower, padding: false)
end
152
+
153
# Encodes the raw ETag as lowercase, unpadded Base32 so it can be
# safely embedded in a filename (ETags may contain quotes, slashes, etc.).
defp encode_etag(etag), do: Base.encode32(etag, case: :lower, padding: false)
156
+
96
157
@ doc """
97
158
Instantiate a new tokenizer from the file at the given path.
98
159
"""
0 commit comments