@@ -31,6 +31,11 @@ defer {
     llama_model_free(model)
 }
 
+guard let vocab = llama_model_get_vocab(model) else {
+    print("Failed to get vocab")
+    exit(1)
+}
+
 var tokens = tokenize(text: prompt, add_bos: true)
 
 let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
@@ -41,7 +46,7 @@ context_params.n_batch = UInt32(max(n_len, n_parallel))
 context_params.n_threads = 8
 context_params.n_threads_batch = 8
 
-let context = llama_new_context_with_model(model, context_params)
+let context = llama_init_from_model(model, context_params)
 guard context != nil else {
     print("Failed to initialize context")
     exit(1)
@@ -141,7 +146,7 @@ while n_cur <= n_len {
         let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
 
         // is it an end of stream? -> mark the stream as finished
-        if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len {
+        if llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len {
             i_batch[i] = -1
             // print("")
             if n_parallel > 1 {
@@ -207,7 +212,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
     let utf8Count = text.utf8.count
     let n_tokens = utf8Count + (add_bos ? 1 : 0)
     let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
+    let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
     var swiftTokens: [llama_token] = []
     for i in 0 ..< tokenCount {
         swiftTokens.append(tokens[Int(i)])
@@ -218,12 +223,12 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
 
 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
     var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false)
+    let nTokens = llama_token_to_piece(vocab, token, &result, Int32(result.count), 0, false)
     if nTokens < 0 {
         let actualTokensCount = -Int(nTokens)
         result = .init(repeating: 0, count: actualTokensCount)
         let check = llama_token_to_piece(
-            model,
+            vocab,
             token,
             &result,
             Int32(result.count),
0 commit comments