
Commit befea8c

🚨 Remove cache migration script (#35810)
* Remove cache migration script
* remove dummy move_cache
1 parent d52a9d0 commit befea8c
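The helpers removed below targeted the flat, pre-v4.22 cache format; with this commit, transformers relies solely on the standard huggingface_hub cache under ~/.cache/huggingface/hub. As an illustration (not part of this commit), that cache can be inspected directly with huggingface_hub:

from huggingface_hub import scan_cache_dir

cache_info = scan_cache_dir()  # scans ~/.cache/huggingface/hub by default
print(f"{len(cache_info.repos)} cached repos, {cache_info.size_on_disk} bytes on disk")
for repo in cache_info.repos:
    print(repo.repo_type, repo.repo_id, repo.size_on_disk)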

2 files changed: +0, -197 lines changed


src/transformers/utils/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -96,7 +96,6 @@
     http_user_agent,
     is_offline_mode,
     is_remote_url,
-    move_cache,
     send_example_telemetry,
     try_to_load_from_cache,
 )

src/transformers/utils/hub.py

Lines changed: 0 additions & 196 deletions
@@ -18,10 +18,8 @@
 import json
 import os
 import re
-import shutil
 import sys
 import tempfile
-import traceback
 import warnings
 from concurrent import futures
 from pathlib import Path
@@ -40,7 +38,6 @@
     create_branch,
     create_commit,
     create_repo,
-    get_hf_file_metadata,
     hf_hub_download,
     hf_hub_url,
     try_to_load_from_cache,
@@ -86,7 +83,6 @@ def is_offline_mode():
 
 torch_cache_home = os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch"))
 default_cache_path = constants.default_cache_path
-old_default_cache_path = os.path.join(torch_cache_home, "transformers")
 
 # Determine default cache directory. Lots of legacy environment variables to ensure backward compatibility.
 # The best way to set the cache path is with the environment variable HF_HOME. For more details, checkout this
@@ -100,23 +96,6 @@ def is_offline_mode():
 PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE)
 TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", PYTORCH_TRANSFORMERS_CACHE)
 
-# Onetime move from the old location to the new one if no ENV variable has been set.
-if (
-    os.path.isdir(old_default_cache_path)
-    and not os.path.isdir(constants.HF_HUB_CACHE)
-    and "PYTORCH_PRETRAINED_BERT_CACHE" not in os.environ
-    and "PYTORCH_TRANSFORMERS_CACHE" not in os.environ
-    and "TRANSFORMERS_CACHE" not in os.environ
-):
-    logger.warning(
-        "In Transformers v4.22.0, the default path to cache downloaded models changed from"
-        " '~/.cache/torch/transformers' to '~/.cache/huggingface/hub'. Since you don't seem to have"
-        " overridden and '~/.cache/torch/transformers' is a directory that exists, we're moving it to"
-        " '~/.cache/huggingface/hub' to avoid redownloading models you have already in the cache. You should"
-        " only see this message once."
-    )
-    shutil.move(old_default_cache_path, constants.HF_HUB_CACHE)
-
 HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(constants.HF_HOME, "modules"))
 TRANSFORMERS_DYNAMIC_MODULE_NAME = "transformers_modules"
 SESSION_ID = uuid4().hex
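With the one-time move gone, only the environment-variable fallback chain in the surrounding context lines determines where files are cached. A minimal sketch of that resolution order (the first assignment is assumed from earlier in the same module and is not shown in this diff):

import os
from huggingface_hub import constants

# Legacy variables still win if set; otherwise everything resolves to the hub cache.
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", constants.HF_HUB_CACHE)
PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE)
TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", PYTORCH_TRANSFORMERS_CACHE)

print(TRANSFORMERS_CACHE)  # ~/.cache/huggingface/hub unless one of the variables above is set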
@@ -1087,47 +1066,6 @@ def get_checkpoint_shard_files(
     return cached_filenames, sharded_metadata
 
 
-# All what is below is for conversion between old cache format and new cache format.
-
-
-def get_all_cached_files(cache_dir=None):
-    """
-    Returns a list for all files cached with appropriate metadata.
-    """
-    if cache_dir is None:
-        cache_dir = TRANSFORMERS_CACHE
-    else:
-        cache_dir = str(cache_dir)
-    if not os.path.isdir(cache_dir):
-        return []
-
-    cached_files = []
-    for file in os.listdir(cache_dir):
-        meta_path = os.path.join(cache_dir, f"{file}.json")
-        if not os.path.isfile(meta_path):
-            continue
-
-        with open(meta_path, encoding="utf-8") as meta_file:
-            metadata = json.load(meta_file)
-            url = metadata["url"]
-            etag = metadata["etag"].replace('"', "")
-            cached_files.append({"file": file, "url": url, "etag": etag})
-
-    return cached_files
-
-
-def extract_info_from_url(url):
-    """
-    Extract repo_name, revision and filename from an url.
-    """
-    search = re.search(r"^https://huggingface\.co/(.*)/resolve/([^/]*)/(.*)$", url)
-    if search is None:
-        return None
-    repo, revision, filename = search.groups()
-    cache_repo = "--".join(["models"] + repo.split("/"))
-    return {"repo": cache_repo, "revision": revision, "filename": filename}
-
-
 def create_and_tag_model_card(
     repo_id: str,
     tags: Optional[List[str]] = None,
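For reference, a worked example of the pattern the removed extract_info_from_url helper applied (the URL is illustrative): it mapped a resolve URL onto the "models--<org>--<name>" folder naming used by the hub cache.

import re

url = "https://huggingface.co/bert-base-uncased/resolve/main/config.json"  # illustrative URL
search = re.search(r"^https://huggingface\.co/(.*)/resolve/([^/]*)/(.*)$", url)
repo, revision, filename = search.groups()
cache_repo = "--".join(["models"] + repo.split("/"))
print(cache_repo, revision, filename)  # models--bert-base-uncased main config.json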
@@ -1168,88 +1106,6 @@ def create_and_tag_model_card(
     return model_card
 
 
-def clean_files_for(file):
-    """
-    Remove, if they exist, file, file.json and file.lock
-    """
-    for f in [file, f"{file}.json", f"{file}.lock"]:
-        if os.path.isfile(f):
-            os.remove(f)
-
-
-def move_to_new_cache(file, repo, filename, revision, etag, commit_hash):
-    """
-    Move file to repo following the new huggingface hub cache organization.
-    """
-    os.makedirs(repo, exist_ok=True)
-
-    # refs
-    os.makedirs(os.path.join(repo, "refs"), exist_ok=True)
-    if revision != commit_hash:
-        ref_path = os.path.join(repo, "refs", revision)
-        with open(ref_path, "w") as f:
-            f.write(commit_hash)
-
-    # blobs
-    os.makedirs(os.path.join(repo, "blobs"), exist_ok=True)
-    blob_path = os.path.join(repo, "blobs", etag)
-    shutil.move(file, blob_path)
-
-    # snapshots
-    os.makedirs(os.path.join(repo, "snapshots"), exist_ok=True)
-    os.makedirs(os.path.join(repo, "snapshots", commit_hash), exist_ok=True)
-    pointer_path = os.path.join(repo, "snapshots", commit_hash, filename)
-    huggingface_hub.file_download._create_relative_symlink(blob_path, pointer_path)
-    clean_files_for(file)
-
-
-def move_cache(cache_dir=None, new_cache_dir=None, token=None):
-    if new_cache_dir is None:
-        new_cache_dir = TRANSFORMERS_CACHE
-    if cache_dir is None:
-        # Migrate from old cache in .cache/huggingface/transformers
-        old_cache = Path(TRANSFORMERS_CACHE).parent / "transformers"
-        if os.path.isdir(str(old_cache)):
-            cache_dir = str(old_cache)
-        else:
-            cache_dir = new_cache_dir
-    cached_files = get_all_cached_files(cache_dir=cache_dir)
-    logger.info(f"Moving {len(cached_files)} files to the new cache system")
-
-    hub_metadata = {}
-    for file_info in tqdm(cached_files):
-        url = file_info.pop("url")
-        if url not in hub_metadata:
-            try:
-                hub_metadata[url] = get_hf_file_metadata(url, token=token)
-            except requests.HTTPError:
-                continue
-
-        etag, commit_hash = hub_metadata[url].etag, hub_metadata[url].commit_hash
-        if etag is None or commit_hash is None:
-            continue
-
-        if file_info["etag"] != etag:
-            # Cached file is not up to date, we just throw it as a new version will be downloaded anyway.
-            clean_files_for(os.path.join(cache_dir, file_info["file"]))
-            continue
-
-        url_info = extract_info_from_url(url)
-        if url_info is None:
-            # Not a file from huggingface.co
-            continue
-
-        repo = os.path.join(new_cache_dir, url_info["repo"])
-        move_to_new_cache(
-            file=os.path.join(cache_dir, file_info["file"]),
-            repo=repo,
-            filename=url_info["filename"],
-            revision=url_info["revision"],
-            etag=etag,
-            commit_hash=commit_hash,
-        )
-
-
 class PushInProgress:
     """
     Internal class to keep track of a push in progress (which might contain multiple `Future` jobs).
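A sketch of the on-disk layout the removed move_to_new_cache wrote for each migrated file; the repo name, etag, and commit hash below are illustrative placeholders, not values from this commit.

import os

repo = os.path.expanduser("~/.cache/huggingface/hub/models--bert-base-uncased")  # illustrative repo folder
ref_path = os.path.join(repo, "refs", "main")                                     # text file holding the commit hash
blob_path = os.path.join(repo, "blobs", "<etag>")                                 # file content, addressed by etag
pointer_path = os.path.join(repo, "snapshots", "<commit_hash>", "config.json")    # symlink pointing at the blob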
@@ -1271,55 +1127,3 @@ def cancel(self) -> None:
             # Cancel the job if it wasn't started yet and remove cancelled/done jobs from the list
             if not (job.cancel() or job.done())
         ]
-
-
-cache_version_file = os.path.join(TRANSFORMERS_CACHE, "version.txt")
-if not os.path.isfile(cache_version_file):
-    cache_version = 0
-else:
-    with open(cache_version_file) as f:
-        try:
-            cache_version = int(f.read())
-        except ValueError:
-            cache_version = 0
-
-cache_is_not_empty = os.path.isdir(TRANSFORMERS_CACHE) and len(os.listdir(TRANSFORMERS_CACHE)) > 0
-
-if cache_version < 1 and cache_is_not_empty:
-    if is_offline_mode():
-        logger.warning(
-            "You are offline and the cache for model files in Transformers v4.22.0 has been updated while your local "
-            "cache seems to be the one of a previous version. It is very likely that all your calls to any "
-            "`from_pretrained()` method will fail. Remove the offline mode and enable internet connection to have "
-            "your cache be updated automatically, then you can go back to offline mode."
-        )
-    else:
-        logger.warning(
-            "The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a "
-            "one-time only operation. You can interrupt this and resume the migration later on by calling "
-            "`transformers.utils.move_cache()`."
-        )
-        try:
-            if TRANSFORMERS_CACHE != constants.HF_HUB_CACHE:
-                # Users set some env variable to customize cache storage
-                move_cache(TRANSFORMERS_CACHE, TRANSFORMERS_CACHE)
-            else:
-                move_cache()
-        except Exception as e:
-            trace = "\n".join(traceback.format_tb(e.__traceback__))
-            logger.error(
-                f"There was a problem when trying to move your cache:\n\n{trace}\n{e.__class__.__name__}: {e}\n\nPlease "
-                "file an issue at https://github.com/huggingface/transformers/issues/new/choose and copy paste this whole "
-                "message and we will do our best to help."
-            )
-
-if cache_version < 1:
-    try:
-        os.makedirs(TRANSFORMERS_CACHE, exist_ok=True)
-        with open(cache_version_file, "w") as f:
-            f.write("1")
-    except Exception:
-        logger.warning(
-            f"There was a problem when trying to write in your cache folder ({TRANSFORMERS_CACHE}). You should set "
-            "the environment variable TRANSFORMERS_CACHE to a writable directory."
-        )
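With this block removed, transformers no longer detects or migrates a pre-v4.22 cache at import time. Anyone who still needs the migration would have to run it once from an older release that still ships move_cache, roughly as the removed warning message suggested (a hedged sketch, only valid on such older versions):

# Only works on transformers releases that predate this commit; the import fails afterwards.
from transformers.utils import move_cache

move_cache()  # scans the legacy flat cache and reorganizes files into the hub layout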
