     CreateEmbeddingRequest,
     CreateChatCompletionRequest,
     ModelList,
+    TokenizeInputRequest,
+    TokenizeInputResponse,
+    TokenizeInputCountResponse,
+    DetokenizeInputRequest,
+    DetokenizeInputResponse,
 )
 from llama_cpp.server.errors import RouteErrorHandler

@@ -196,6 +201,9 @@ async def authenticate(
     )


+openai_v1_tag = "OpenAI V1"
+
+
 @router.post(
     "/v1/completions",
     summary="Completion",
@@ -227,11 +235,13 @@ async def authenticate(
             },
         }
     },
+    tags=[openai_v1_tag],
 )
 @router.post(
     "/v1/engines/copilot-codex/completions",
     include_in_schema=False,
     dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
 )
 async def create_completion(
     request: Request,
@@ -297,7 +307,10 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:


 @router.post(
-    "/v1/embeddings", summary="Embedding", dependencies=[Depends(authenticate)]
+    "/v1/embeddings",
+    summary="Embedding",
+    dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
 )
 async def create_embedding(
     request: CreateEmbeddingRequest,
@@ -339,6 +352,7 @@ async def create_embedding(
             },
         }
     },
+    tags=[openai_v1_tag],
 )
 async def create_chat_completion(
     request: Request,
@@ -391,7 +405,12 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]:
         return iterator_or_completion


-@router.get("/v1/models", summary="Models", dependencies=[Depends(authenticate)])
+@router.get(
+    "/v1/models",
+    summary="Models",
+    dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
+)
 async def get_models(
     llama_proxy: LlamaProxy = Depends(get_llama_proxy),
 ) -> ModelList:
@@ -407,3 +426,51 @@ async def get_models(
             for model_alias in llama_proxy
         ],
     }
+
+
+extras_tag = "Extras"
+
+
+@router.post(
+    "/extras/tokenize",
+    summary="Tokenize",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def tokenize(
+    body: TokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> TokenizeInputResponse:
+    tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
+
+    return {"tokens": tokens}
+
+
+@router.post(
+    "/extras/tokenize/count",
+    summary="Tokenize Count",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def count_query_tokens(
+    body: TokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> TokenizeInputCountResponse:
+    tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
+
+    return {"count": len(tokens)}
+
+
+@router.post(
+    "/extras/detokenize",
+    summary="Detokenize",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def detokenize(
+    body: DetokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> DetokenizeInputResponse:
+    text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8")
+
+    return {"text": text}
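
To try the new routes, here is a minimal client sketch; it is not part of the diff. It assumes the server is running at http://localhost:8000 (a placeholder base URL), that a bearer token is only needed when an API key is configured, and that the request models accept the model, input, and tokens fields used by the handlers above; the model alias is likewise a placeholder.

# Client-side sketch for the new /extras endpoints (placeholder URL, key, and alias).
import requests

BASE_URL = "http://localhost:8000"  # assumed local server address
HEADERS = {"Authorization": "Bearer sk-example"}  # only required if an api_key is set

# Tokenize a string into model-specific token ids.
resp = requests.post(
    f"{BASE_URL}/extras/tokenize",
    json={"model": "my-model-alias", "input": "Hello, world!"},
    headers=HEADERS,
)
tokens = resp.json()["tokens"]

# Count tokens without returning them.
resp = requests.post(
    f"{BASE_URL}/extras/tokenize/count",
    json={"model": "my-model-alias", "input": "Hello, world!"},
    headers=HEADERS,
)
count = resp.json()["count"]

# Turn the token ids back into text.
resp = requests.post(
    f"{BASE_URL}/extras/detokenize",
    json={"model": "my-model-alias", "tokens": tokens},
    headers=HEADERS,
)
text = resp.json()["text"]
print(count, text)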