Skip to content

Commit 3c62aed

Browse files
authored
common : simplify etag tracking by removing json (ggml-org#16342)
The JSON parser is temporarily kept only for backward compatibility. It reads the etag from old .json files to prevent unnecessary re-downloads for existing users. This legacy code can be removed in a future version. Signed-off-by: Adrien Gallouët <[email protected]>
1 parent f1eb1cb commit 3c62aed

File tree

1 file changed

+67
-111
lines changed

1 file changed

+67
-111
lines changed

common/arg.cpp

Lines changed: 67 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,53 @@ struct common_hf_file_res {
217217
std::string mmprojFile;
218218
};
219219

220+
// Record `etag` for the file at `path`.
//
// The value is handed to write_file() under the companion name `<path>.etag`,
// and the companion path is reported at debug log level afterwards.
//
//   path: path of the downloaded file the etag belongs to
//   etag: etag value to store
static void write_etag(const std::string & path, const std::string & etag) {
    const auto sidecar = path + ".etag";

    write_file(sidecar, etag);

    LOG_DBG("%s: file etag saved: %s\n", __func__, sidecar.c_str());
}
225+
226+
static std::string read_etag(const std::string & path) {
227+
std::string none;
228+
const std::string etag_path = path + ".etag";
229+
230+
if (std::filesystem::exists(etag_path)) {
231+
std::ifstream etag_in(etag_path);
232+
if (!etag_in) {
233+
LOG_ERR("%s: could not open .etag file for reading: %s\n", __func__, etag_path.c_str());
234+
return none;
235+
}
236+
std::string etag;
237+
std::getline(etag_in, etag);
238+
return etag;
239+
}
240+
241+
// no etag file, but maybe there is an old .json
242+
// remove this code later
243+
const std::string metadata_path = path + ".json";
244+
245+
if (std::filesystem::exists(metadata_path)) {
246+
std::ifstream metadata_in(metadata_path);
247+
try {
248+
nlohmann::json metadata_json;
249+
metadata_in >> metadata_json;
250+
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
251+
metadata_json.dump().c_str());
252+
if (metadata_json.contains("etag") && metadata_json.at("etag").is_string()) {
253+
std::string etag = metadata_json.at("etag");
254+
write_etag(path, etag);
255+
if (!std::filesystem::remove(metadata_path)) {
256+
LOG_WRN("%s: failed to delete old .json metadata file: %s\n", __func__, metadata_path.c_str());
257+
}
258+
return etag;
259+
}
260+
} catch (const nlohmann::json::exception & e) {
261+
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
262+
}
263+
}
264+
return none;
265+
}
266+
220267
#ifdef LLAMA_USE_CURL
221268

222269
bool common_has_curl() {
@@ -373,36 +420,15 @@ static bool common_download_head(CURL * curl,
373420
static bool common_download_file_single_online(const std::string & url,
374421
const std::string & path,
375422
const std::string & bearer_token) {
376-
// If the file exists, check its JSON metadata companion file.
377-
std::string metadata_path = path + ".json";
378423
static const int max_attempts = 3;
379424
static const int retry_delay_seconds = 2;
380425
for (int i = 0; i < max_attempts; ++i) {
381-
nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
382-
std::string etag;
383-
std::string last_modified;
426+
std::string etag;
384427

385428
// Check if the file already exists locally
386429
const auto file_exists = std::filesystem::exists(path);
387430
if (file_exists) {
388-
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
389-
std::ifstream metadata_in(metadata_path);
390-
if (metadata_in.good()) {
391-
try {
392-
metadata_in >> metadata;
393-
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
394-
metadata.dump().c_str());
395-
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
396-
etag = metadata.at("etag");
397-
}
398-
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
399-
last_modified = metadata.at("lastModified");
400-
}
401-
} catch (const nlohmann::json::exception & e) {
402-
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
403-
}
404-
}
405-
// if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
431+
etag = read_etag(path);
406432
} else {
407433
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
408434
}
@@ -440,11 +466,6 @@ static bool common_download_file_single_online(const std::string & url,
440466
headers.etag.c_str());
441467
should_download = true;
442468
should_download_from_scratch = true;
443-
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
444-
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__,
445-
last_modified.c_str(), headers.last_modified.c_str());
446-
should_download = true;
447-
should_download_from_scratch = true;
448469
}
449470
}
450471

@@ -475,15 +496,9 @@ static bool common_download_file_single_online(const std::string & url,
475496
}
476497
}
477498
}
478-
479-
// Write the updated JSON metadata file.
480-
metadata.update({
481-
{ "url", url },
482-
{ "etag", headers.etag },
483-
{ "lastModified", headers.last_modified }
484-
});
485-
write_file(metadata_path, metadata.dump(4));
486-
LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
499+
if (head_request_ok) {
500+
write_etag(path, headers.etag);
501+
}
487502

488503
// start the download
489504
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n",
@@ -664,51 +679,6 @@ static void print_progress(size_t current, size_t total) { // TODO isatty
664679
std::cout.flush();
665680
}
666681

667-
struct common_file_metadata {
668-
std::string etag;
669-
std::string last_modified;
670-
};
671-
672-
static std::optional<common_file_metadata> read_metadata(const std::string & path) {
673-
if (!std::filesystem::exists(path)) {
674-
return std::nullopt;
675-
}
676-
677-
nlohmann::json metadata_json;
678-
common_file_metadata metadata;
679-
680-
std::ifstream metadata_in(path);
681-
try {
682-
metadata_in >> metadata_json;
683-
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, path.c_str(),
684-
metadata_json.dump().c_str());
685-
if (metadata_json.contains("etag") && metadata_json.at("etag").is_string()) {
686-
metadata.etag = metadata_json.at("etag");
687-
}
688-
if (metadata_json.contains("lastModified") && metadata_json.at("lastModified").is_string()) {
689-
metadata.last_modified = metadata_json.at("lastModified");
690-
}
691-
} catch (const nlohmann::json::exception & e) {
692-
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, path.c_str(), e.what());
693-
return std::nullopt;
694-
}
695-
696-
return metadata;
697-
}
698-
699-
static void write_metadata(const std::string & path,
700-
const std::string & url,
701-
const common_file_metadata & metadata) {
702-
nlohmann::json metadata_json = {
703-
{ "url", url },
704-
{ "etag", metadata.etag },
705-
{ "lastModified", metadata.last_modified }
706-
};
707-
708-
write_file(path, metadata_json.dump(4));
709-
LOG_DBG("%s: file metadata saved: %s\n", __func__, path.c_str());
710-
}
711-
712682
static bool common_pull_file(httplib::Client & cli,
713683
const std::string & resolve_path,
714684
const std::string & path_tmp,
@@ -775,8 +745,6 @@ static bool common_pull_file(httplib::Client & cli,
775745
static bool common_download_file_single_online(const std::string & url,
776746
const std::string & path,
777747
const std::string & bearer_token) {
778-
// If the file exists, check its JSON metadata companion file.
779-
std::string metadata_path = path + ".json";
780748
static const int max_attempts = 3;
781749
static const int retry_delay_seconds = 2;
782750

@@ -788,12 +756,11 @@ static bool common_download_file_single_online(const std::string & url,
788756
}
789757
cli.set_default_headers(default_headers);
790758

791-
common_file_metadata last;
792759
const bool file_exists = std::filesystem::exists(path);
760+
761+
std::string last_etag;
793762
if (file_exists) {
794-
if (auto opt = read_metadata(metadata_path)) {
795-
last = *opt;
796-
}
763+
last_etag = read_etag(path);
797764
} else {
798765
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
799766
}
@@ -809,14 +776,9 @@ static bool common_download_file_single_online(const std::string & url,
809776
}
810777
}
811778

812-
common_file_metadata current;
813-
if (head_ok) {
814-
if (head->has_header("ETag")) {
815-
current.etag = head->get_header_value("ETag");
816-
}
817-
if (head->has_header("Last-Modified")) {
818-
current.last_modified = head->get_header_value("Last-Modified");
819-
}
779+
std::string etag;
780+
if (head_ok && head->has_header("ETag")) {
781+
etag = head->get_header_value("ETag");
820782
}
821783

822784
size_t total_size = 0;
@@ -834,16 +796,10 @@ static bool common_download_file_single_online(const std::string & url,
834796
}
835797

836798
bool should_download_from_scratch = false;
837-
if (head_ok) {
838-
if (!last.etag.empty() && last.etag != current.etag) {
839-
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__,
840-
last.etag.c_str(), current.etag.c_str());
841-
should_download_from_scratch = true;
842-
} else if (!last.last_modified.empty() && last.last_modified != current.last_modified) {
843-
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__,
844-
last.last_modified.c_str(), current.last_modified.c_str());
845-
should_download_from_scratch = true;
846-
}
799+
if (!last_etag.empty() && !etag.empty() && last_etag != etag) {
800+
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__,
801+
last_etag.c_str(), etag.c_str());
802+
should_download_from_scratch = true;
847803
}
848804

849805
if (file_exists) {
@@ -871,9 +827,8 @@ static bool common_download_file_single_online(const std::string & url,
871827
}
872828

873829
// start the download
874-
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n",
875-
__func__, show_masked_url(parts).c_str(), path_temporary.c_str(),
876-
current.etag.c_str(), current.last_modified.c_str());
830+
LOG_INF("%s: trying to download model from %s to %s (etag:%s)...\n",
831+
__func__, show_masked_url(parts).c_str(), path_temporary.c_str(), etag.c_str());
877832
const bool was_pull_successful = common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size);
878833
if (!was_pull_successful) {
879834
if (i + 1 < max_attempts) {
@@ -883,15 +838,16 @@ static bool common_download_file_single_online(const std::string & url,
883838
} else {
884839
LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
885840
}
886-
887841
continue;
888842
}
889843

890844
if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
891845
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
892846
return false;
893847
}
894-
write_metadata(metadata_path, url, current);
848+
if (!etag.empty()) {
849+
write_etag(path, etag);
850+
}
895851
break;
896852
}
897853

0 commit comments

Comments
 (0)