
Commit 527951b

mathijshenquet authored and ngxson committed
server : Add option to return token pieces in /tokenize endpoint (ggml-org#9108)
* server : added with_pieces functionality to /tokenize endpoint
* server : Add tokenize with pieces tests to server.feature
* Handle case if tokenizer splits along utf8 continuation bytes
* Add example of token splitting
* Remove trailing ws
* Fix trailing ws
* Maybe fix ci
* maybe this fix windows ci?

Co-authored-by: Xuan Son Nguyen <[email protected]>
1 parent 724645d commit 527951b

File tree

5 files changed: +138 -6 lines


examples/server/README.md

Lines changed: 37 additions & 2 deletions
@@ -545,9 +545,44 @@ Notice that each `probs` is an array of length `n_probs`.
 
 *Options:*
 
-`content`: Set the text to tokenize.
+`content`: (Required) The text to tokenize.
 
-`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
+`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
+
+`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`
+
+**Response:**
+
+Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise.
+
+
+If `with_pieces` is `false`:
+```json
+{
+  "tokens": [123, 456, 789]
+}
+```
+
+If `with_pieces` is `true`:
+```json
+{
+  "tokens": [
+    {"id": 123, "piece": "Hello"},
+    {"id": 456, "piece": " world"},
+    {"id": 789, "piece": "!"}
+  ]
+}
+```
+
+With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
+```json
+{
+  "tokens": [
+    {"id": 198, "piece": [195]},  // hex C3
+    {"id": 164, "piece": [161]}   // hex A1
+  ]
+}
+```
 
 ### POST `/detokenize`: Convert tokens to text
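For quick manual testing of the documented request and response shapes, here is a minimal client sketch. It is not part of the commit and assumes a server already running locally at `http://localhost:8080` (adjust host and port to your setup); only the Python standard library is used.

```python
#!/usr/bin/env python3
# Sketch: POST /tokenize with "with_pieces" enabled (assumed local server).
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8080/tokenize",
    data=json.dumps({"content": "Hello world!", "with_pieces": True}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)

# Each entry carries the token id plus its piece; the piece is a string when it
# is valid UTF-8 and a list of raw byte values otherwise.
for tok in body["tokens"]:
    print(tok["id"], repr(tok["piece"]))
```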

examples/server/server.cpp

Lines changed: 30 additions & 3 deletions
@@ -3085,12 +3085,39 @@ int main(int argc, char ** argv) {
     const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
         const json body = json::parse(req.body);
 
-        std::vector<llama_token> tokens;
+        json tokens_response = json::array();
         if (body.count("content") != 0) {
             const bool add_special = json_value(body, "add_special", false);
-            tokens = ctx_server.tokenize(body.at("content"), add_special);
+            const bool with_pieces = json_value(body, "with_pieces", false);
+            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
+
+            if (with_pieces) {
+                for (const auto& token : tokens) {
+                    std::string piece = llama_token_to_piece(ctx_server.ctx, token);
+                    json piece_json;
+
+                    // Check if the piece is valid UTF-8
+                    if (is_valid_utf8(piece)) {
+                        piece_json = piece;
+                    } else {
+                        // If not valid UTF-8, store as array of byte values
+                        piece_json = json::array();
+                        for (unsigned char c : piece) {
+                            piece_json.push_back(static_cast<int>(c));
+                        }
+                    }
+
+                    tokens_response.push_back({
+                        {"id", token},
+                        {"piece", piece_json}
+                    });
+                }
+            } else {
+                tokens_response = tokens;
+            }
         }
-        const json data = format_tokenizer_response(tokens);
+
+        const json data = format_tokenizer_response(tokens_response);
         res_ok(res, data);
     };
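The byte-array fallback in the handler above has a practical consequence for clients: when the tokenizer splits inside a multi-byte character, pieces cannot simply be joined as strings. A minimal client-side sketch (not part of the commit) that rebuilds the original text by concatenating at the byte level:

```python
# Sketch: rebuild text from a /tokenize response produced with "with_pieces": true.
def pieces_to_text(tokens: list) -> str:
    buf = bytearray()
    for tok in tokens:
        piece = tok["piece"]
        if isinstance(piece, str):
            buf.extend(piece.encode("utf-8"))
        else:
            # piece given as raw byte values, e.g. [195] and [161] for 'á'
            buf.extend(bytes(piece))
    return buf.decode("utf-8")

# Using the README example: 'á' (hex C3 A1) split across two tokens.
print(pieces_to_text([{"id": 198, "piece": [195]}, {"id": 164, "piece": [161]}]))  # -> á
```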

examples/server/tests/features/server.feature

Lines changed: 8 additions & 0 deletions
@@ -105,6 +105,14 @@ Feature: llama.cpp server
       Given first token is removed
       Then tokens can be detokenized
 
+    Scenario: Tokenize with pieces
+      When tokenizing with pieces:
+      """
+        What is the capital of Germany?
+
+      """
+      Then tokens are given with pieces
+
     Scenario: Models available
       Given available models
       Then 1 models are supported

examples/server/tests/features/steps/steps.py

Lines changed: 29 additions & 0 deletions
@@ -1,3 +1,6 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
 import asyncio
 import json
 import os
@@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context):
     context.tokenize_add_special = True
 
 
+@step("tokenizing with pieces")
+@async_run_until_complete
+async def step_tokenize_with_pieces(context):
+    context.tokenized_text = context_text(context)
+    async with aiohttp.ClientSession() as session:
+        tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
+        if getattr(context, "tokenize_add_special", None) is not None:
+            tokenize_args["add_special"] = context.tokenize_add_special
+
+        async with session.post(
+            f"{context.base_url}/tokenize", json=tokenize_args
+        ) as response:
+            assert response.status == 200
+            tokenize_json = await response.json()
+            context.tokens_with_pieces = tokenize_json["tokens"]
+
+
+@step("tokens are given with pieces")
+@async_run_until_complete
+async def step_tokenize_with_pieces(context):
+    # Verify that the response contains both token IDs and pieces
+    assert all(
+        "id" in token and "piece" in token for token in context.tokens_with_pieces
+    )
+
+
 @step('tokenizing')
 @async_run_until_complete
 async def step_tokenize(context):

examples/server/utils.hpp

Lines changed: 34 additions & 1 deletion
@@ -616,7 +616,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     return res;
 }
 
-static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
+static bool is_valid_utf8(const std::string & str) {
+    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
+    const unsigned char* end = bytes + str.length();
+
+    while (bytes < end) {
+        if (*bytes <= 0x7F) {
+            // 1-byte sequence (0xxxxxxx)
+            bytes++;
+        } else if ((*bytes & 0xE0) == 0xC0) {
+            // 2-byte sequence (110xxxxx 10xxxxxx)
+            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
+                return false;
+            bytes += 2;
+        } else if ((*bytes & 0xF0) == 0xE0) {
+            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
+                return false;
+            bytes += 3;
+        } else if ((*bytes & 0xF8) == 0xF0) {
+            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
+                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
+                return false;
+            bytes += 4;
+        } else {
+            // Invalid UTF-8 lead byte
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static json format_tokenizer_response(const json & tokens) {
     return json {
         {"tokens", tokens}
     };
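For illustration only, the structural check performed by `is_valid_utf8` can be approximated with Python's built-in decoder. This is a sketch, not part of the commit, and Python's decoder is slightly stricter (it also rejects overlong encodings and surrogates), but it shows when the server falls back to returning a piece as a list of byte values.

```python
# Sketch: approximate is_valid_utf8 with Python's UTF-8 decoder.
def is_valid_utf8_py(data: bytes) -> bool:
    try:
        data.decode("utf-8")
        return True
    except UnicodeDecodeError:
        return False

print(is_valid_utf8_py("Hello".encode("utf-8")))  # True  -> piece returned as a string
print(is_valid_utf8_py(bytes([0xC3])))            # False -> lone lead byte, returned as [195]
print(is_valid_utf8_py(bytes([0xA1])))            # False -> lone continuation byte, returned as [161]
```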
