-
Notifications
You must be signed in to change notification settings - Fork 13.7k
tests : add test-tokenizers-remote #13846
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
2d2e059
42ff186
ecbc92a
0fe7183
d97b9ad
4b4843a
8e1125a
de8ec13
f9a2717
05f94a0
7210ebe
d3a2eb5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,145 @@ | ||
| #include "arg.h" | ||
| #include "common.h" | ||
|
|
||
| #include <string> | ||
| #include <fstream> | ||
| #include <vector> | ||
| #include <json.hpp> | ||
|
|
||
| using json = nlohmann::json; | ||
|
|
||
| #undef NDEBUG | ||
| #include <cassert> | ||
|
|
||
| std::string endpoint = "https://huggingface.co/"; | ||
| std::string repo = "ggml-org/vocabs"; | ||
|
|
||
| static void write_file(const std::string & fname, const std::string & content) { | ||
| std::ofstream file(fname); | ||
| if (file) { | ||
| file << content; | ||
| file.close(); | ||
| } | ||
| } | ||
|
|
||
| static json get_hf_repo_dir(const std::string & hf_repo_with_branch, bool recursive, const std::string & repo_path, const std::string & bearer_token) { | ||
| auto parts = string_split<std::string>(hf_repo_with_branch, ':'); | ||
| std::string branch = parts.size() > 1 ? parts.back() : "main"; | ||
| std::string hf_repo = parts[0]; | ||
| std::string url = endpoint + "api/models/" + hf_repo + "/tree/" + branch; | ||
| std::string path = repo_path; | ||
|
|
||
| if (!path.empty()) { | ||
| // FIXME: path should be properly url-encoded! | ||
| string_replace_all(path, "/", "%2F"); | ||
| url += "/" + path; | ||
| } | ||
|
|
||
| if (recursive) { | ||
| url += "?recursive=true"; | ||
| } | ||
|
|
||
| // headers | ||
| std::vector<std::string> headers; | ||
| headers.push_back("Accept: application/json"); | ||
| if (!bearer_token.empty()) { | ||
| headers.push_back("Authorization: Bearer " + bearer_token); | ||
| } | ||
|
|
||
| // we use "=" to avoid clashing with other component, while still being allowed on windows | ||
| std::string cached_response_fname = "tree=" + hf_repo + "/" + repo_path + "=" + branch + ".json"; | ||
CISC marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| string_replace_all(cached_response_fname, "/", "_"); | ||
| std::string cached_response_path = fs_get_cache_file(cached_response_fname); | ||
|
|
||
| // make the request | ||
| common_remote_params params; | ||
| params.headers = headers; | ||
| json res_data; | ||
| try { | ||
| // TODO: For pagination links we need response headers, which is not provided by common_remote_get_content() | ||
| auto res = common_remote_get_content(url, params); | ||
| long res_code = res.first; | ||
| std::string res_str = std::string(res.second.data(), res.second.size()); | ||
|
|
||
| if (res_code == 200) { | ||
| write_file(cached_response_path, res_str); | ||
| } else if (res_code == 401) { | ||
| throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token"); | ||
| } else { | ||
| throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str())); | ||
| } | ||
| } catch (const std::exception & e) { | ||
| fprintf(stderr, "error: failed to get repo tree: %s\n", e.what()); | ||
| fprintf(stderr, "try reading from cache\n"); | ||
| } | ||
|
|
||
| // try to read from cache | ||
| try { | ||
| std::ifstream f(cached_response_path); | ||
| res_data = json::parse(f); | ||
| } catch (const std::exception & e) { | ||
| fprintf(stderr, "error: failed to get repo tree (check your internet connection)\n"); | ||
| } | ||
|
|
||
| return res_data; | ||
| } | ||
|
|
||
| int main(void) { | ||
| if (common_has_curl()) { | ||
| json tree = get_hf_repo_dir(repo, true, {}, {}); | ||
|
|
||
| if (!tree.empty()) { | ||
| std::vector<std::pair<std::string, std::string>> files; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Btw, avoid these nested STL containers - I find this very difficult to understand. This can be: struct common_file_info {
std::string whatever_the_first_string_is;
std::string whatever_the_second_string_is;
// etc ...
};
std::vector<common_file_info> files;It's much easier to read and extend in the future with additional information if needed.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe refactor
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the problem is that you're trying to reuse Tbh, I think this test is being quite over-engineered. I adhere to the KISS principle and here is my thoughts:
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Well, maybe adding some new functions, as @ggerganov suggested, would be a better idea?
Undeniably. :)
True.
Should not be any reason to run this locally though, this is meant for CI.
I don't want to rely on any specific tree structure, so in this case I would have to traverse the checkout to find all the right files instead, which adds just as much logic as before.
Yeah, I guess.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Just add a script to copy all find . -type f -name "*.gguf.*" -exec cp {} ./my_tmp_dir
test-tokenizers-all ./my_tmp_dir
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Adding batching to
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No. That function should be kept simple. Batching is just a wrapper around it
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The thinking was that it would be less duplicated effort, and that model downloading would benefit from it as it will now most likely get throttled on many splits.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't want to spend too much of my time arguing which way is better, but if you want to do it - do it. Still, my concerns about whether the whole thing can be just a bash script seem to be ignored at this point.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, didn't mean to come off as ignoring it, just mulling it over. :) |
||
|
|
||
| for (const auto & item : tree) { | ||
| if (item.at("type") == "file") { | ||
| std::string path = item.at("path"); | ||
|
|
||
| if (string_ends_with(path, ".gguf") || string_ends_with(path, ".gguf.inp") || string_ends_with(path, ".gguf.out")) { | ||
| // this is to avoid different repo having same file name, or same file name in different subdirs | ||
| std::string filepath = repo + "_" + path; | ||
| // to make sure we don't have any slashes in the filename | ||
| string_replace_all(filepath, "/", "_"); | ||
| // to make sure we don't have any quotes in the filename | ||
| string_replace_all(filepath, "'", "_"); | ||
| filepath = fs_get_cache_file(filepath); | ||
|
|
||
| files.push_back({endpoint + repo + "/resolve/main/" + path, filepath}); | ||
| } | ||
| } | ||
| } | ||
|
Comment on lines
+90
to
+111
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should factor this in a
|
||
|
|
||
| if (common_download_file_multiple(files, {}, false)) { | ||
CISC marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| std::string dir_sep(1, DIRECTORY_SEPARATOR); | ||
|
|
||
| for (auto const & item : files) { | ||
| std::string filepath = item.second; | ||
|
|
||
| if (string_ends_with(filepath, ".gguf")) { | ||
| std::string vocab_inp = filepath + ".inp"; | ||
| std::string vocab_out = filepath + ".out"; | ||
| auto matching_inp = std::find_if(files.begin(), files.end(), [&vocab_inp](const auto & p) { | ||
| return p.second == vocab_inp; | ||
| }); | ||
| auto matching_out = std::find_if(files.begin(), files.end(), [&vocab_out](const auto & p) { | ||
| return p.second == vocab_out; | ||
| }); | ||
|
|
||
| if (matching_inp != files.end() && matching_out != files.end()) { | ||
| std::string test_command = "." + dir_sep + "test-tokenizer-0 '" + filepath + "'"; | ||
| assert(std::system(test_command.c_str()) == 0); | ||
| } else { | ||
| printf("test-tokenizers-remote: %s found without .inp/out vocab files, skipping...\n", filepath.c_str()); | ||
| } | ||
| } | ||
| } | ||
| } else { | ||
| printf("test-tokenizers-remote: failed to download files, unable to perform tests...\n"); | ||
| } | ||
| } else { | ||
| printf("test-tokenizers-remote: failed to retrieve repository info, unable to perform tests...\n"); | ||
| } | ||
| } else { | ||
| printf("test-tokenizers-remote: no curl, unable to perform tests...\n"); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe we don't need to expose these functions. instead, use
common_remote_get_content, then write the response content to file usingfstreamThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe, but then I'll lose all the fancy functionality (caching, multi-threaded download, etc).