
Commit 7bfe488

Import media from ZIP archive also when checksum is missing (#481)
* Refactor media importer
* Fix empty checksums on media archive import
* Remove unused function
1 parent 6a4015c commit 7bfe488

4 files changed: +423 -95 lines changed

Lines changed: 245 additions & 0 deletions
@@ -0,0 +1,245 @@
"""Class for handling the import of a media ZIP archive."""

import json
import os
import shutil
import tempfile
import zipfile
from typing import Dict, List, Tuple

from gramps.gen.db import DbTxn
from gramps.gen.db.base import DbReadBase
from gramps.gen.lib import Media
from gramps.gen.lib.serialize import to_json

from ..auth import set_tree_usage
from ..types import FilenameOrPath
from .file import get_checksum
from .media import check_quota_media, get_media_handler
from .resources.util import update_object

MissingFiles = Dict[str, List[Dict[str, str]]]


class MediaImporter:
    """A class to handle a media archive ZIP file and import media files.

    The class takes a tree ID, database handle, and ZIP file path as input.
    If delete is true (the default), the ZIP file is deleted when the import
    is done.

    The importer uses the following criteria:

    - For any media objects that have a checksum but where no file is found
      (for a local file handler, this means no file is found at the respective path;
      for object storage, this means no object with that checksum as key is found),
      it looks for a file with the right checksum (regardless of filename) in the ZIP.
      If one is found, it is uploaded to the media storage (in the case of a local file
      handler, it is renamed to the path in the media object; in the case of object
      storage, it is uploaded by checksum).
    - For any media objects that have an empty checksum (and, in the case of local file
      storage, do not have a file at the right path), the ZIP archive is searched for
      a file with the right (relative) path. If one is found, the media object is
      updated with that file's checksum. Then, in a second step, the file is uploaded.
    """

    def __init__(
        self,
        tree: str,
        db_handle: DbReadBase,
        file_name: FilenameOrPath,
        delete: bool = True,
    ) -> None:
        """Initialize media importer."""
        self.tree = tree
        self.db_handle = db_handle
        self.file_name = file_name
        self.delete = delete
        self.media_handler = get_media_handler(self.db_handle, tree=self.tree)
        self.objects: List[Media] = self._get_objects()

    def _get_objects(self) -> List[Media]:
        """Get a list of all media objects in the database."""
        return list(self.db_handle.iter_media())

    def _update_objects(self) -> None:
        """Update the list of media objects."""
        self.objects = self._get_objects()

    def _identify_missing_files(self) -> MissingFiles:
        """Identify missing files by comparing existing handles with all media objects."""
        objects_existing = self.media_handler.filter_existing_files(
            self.objects, db_handle=self.db_handle
        )
        handles_existing = set(obj.handle for obj in objects_existing)
        objects_missing = [
            obj for obj in self.objects if obj.handle not in handles_existing
        ]

        missing_files = {}
        for obj in objects_missing:
            if obj.checksum not in missing_files:
                missing_files[obj.checksum] = []
            obj_details = {
                "handle": obj.handle,
                "media_path": obj.get_path(),
                "mime": obj.get_mime_type(),
            }
            missing_files[obj.checksum].append(obj_details)

        return missing_files

    def _check_disk_space_and_extract(self) -> str:
        """Check disk space and extract files into a temporary directory."""
        total_size = 0
        with zipfile.ZipFile(self.file_name, "r") as zip_file:
            for file_info in zip_file.infolist():
                total_size += file_info.file_size

            disk_usage = shutil.disk_usage(self.file_name)
            if total_size > disk_usage.free:
                raise ValueError("Not enough free space on disk")

            temp_dir = tempfile.mkdtemp()
            zip_file.extractall(temp_dir)

        return temp_dir

    def _fix_missing_checksums(self, temp_dir: str, missing_files: MissingFiles) -> int:
        """Fix objects with missing checksums if we have a file with matching path."""
        handles_by_path: Dict[str, List[str]] = {}
        for obj_details in missing_files[""]:
            path = obj_details["media_path"]
            if path not in handles_by_path:
                handles_by_path[path] = []
            handles_by_path[path].append(obj_details["handle"])
        checksums_by_handle: Dict[str, str] = {}
        for root, _, files in os.walk(temp_dir):
            for name in files:
                abs_file_path = os.path.join(root, name)
                rel_file_path = os.path.relpath(abs_file_path, temp_dir)
                if rel_file_path in handles_by_path:
                    with open(abs_file_path, "rb") as f:
                        checksum = get_checksum(f)
                    for handle in handles_by_path[rel_file_path]:
                        checksums_by_handle[handle] = checksum
        if not checksums_by_handle:
            return 0
        with DbTxn("Update media checksums", self.db_handle) as trans:
            objects_by_handle = {
                obj.handle: obj
                for obj in self.objects
                if obj.handle in checksums_by_handle
            }
            for handle, checksum in checksums_by_handle.items():
                new_object = objects_by_handle[handle]
                new_object.set_checksum(checksum)
                update_object(self.db_handle, new_object, trans)

        return len(checksums_by_handle)

    def _identify_files_to_upload(
        self, temp_dir: str, missing_files: MissingFiles
    ) -> Dict[str, Tuple[str, int]]:
        """Identify files to upload from the extracted temporary directory."""
        to_upload = {}
        for root, _, files in os.walk(temp_dir):
            for name in files:
                file_path = os.path.join(root, name)
                with open(file_path, "rb") as f:
                    checksum = get_checksum(f)
                if checksum in missing_files and checksum not in to_upload:
                    to_upload[checksum] = (file_path, os.path.getsize(file_path))

        return to_upload

    def _upload_files(
        self,
        to_upload: Dict[str, Tuple[str, int]],
        missing_files: MissingFiles,
    ) -> int:
        """Upload identified files and return the number of failures."""
        num_failures = 0
        for checksum, (file_path, file_size) in to_upload.items():
            for obj_details in missing_files[checksum]:
                with open(file_path, "rb") as f:
                    try:
                        self.media_handler.upload_file(
                            f,
                            checksum,
                            obj_details["mime"],
                            path=obj_details["media_path"],
                        )
                    except Exception:
                        num_failures += 1

        return num_failures

    def _delete_zip_file(self):
        """Delete the ZIP file."""
        return os.remove(self.file_name)

    def _delete_temporary_directory(self, temp_dir):
        """Delete the temporary directory."""
        return shutil.rmtree(temp_dir)

    def _update_media_usage(self) -> None:
        """Update the media usage."""
        usage_media = self.media_handler.get_media_size(db_handle=self.db_handle)
        set_tree_usage(self.tree, usage_media=usage_media)

    def __call__(self, fix_missing_checksums: bool = True) -> Dict[str, int]:
        """Import a media archive file."""
        missing_files = self._identify_missing_files()

        if not missing_files:
            # no missing files
            # delete ZIP file
            if self.delete:
                self._delete_zip_file()
            return {"missing": 0, "uploaded": 0, "failures": 0}

        temp_dir = self._check_disk_space_and_extract()

        if "" in missing_files:
            if fix_missing_checksums:
                # files without checksum! Need to fix that first
                fixed = self._fix_missing_checksums(temp_dir, missing_files)
                # after fixing checksums, we need to fetch media objects again and re-run
                if fixed:
                    self._update_objects()
                    # set fix_missing_checksums to False to avoid an infinite loop
                    return self(fix_missing_checksums=False)
            else:
                # we already tried fixing checksums - ignore the 2nd time
                missing_files.pop("")

        # delete ZIP file
        if self.delete:
            self._delete_zip_file()

        to_upload = self._identify_files_to_upload(temp_dir, missing_files)

        if not to_upload:
            # no files to upload
            self._delete_temporary_directory(temp_dir)
            return {"missing": len(missing_files), "uploaded": 0, "failures": 0}

        upload_size = sum(file_size for (_, file_size) in to_upload.values())
        check_quota_media(to_add=upload_size, tree=self.tree)

        num_failures = self._upload_files(to_upload, missing_files)

        self._delete_temporary_directory(temp_dir)
        self._update_media_usage()

        return {
            "missing": len(missing_files),
            "uploaded": len(to_upload) - num_failures,
            "failures": num_failures,
        }


# _identify_missing_files -> missing_files = {checksum: [{"handle": ..., "media_path": ..., "mime": ...}, ...]}
# _identify_files_to_upload -> to_upload = {checksum: (file_path, file_size)}
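
For orientation, a minimal usage sketch of the new class. The tree ID and archive path are made up, and the absolute import paths are assumptions inferred from the relative imports in tasks.py below; the call pattern itself mirrors the updated import_media_archive task.

# Sketch only: "example_tree" and the archive path are hypothetical.
from gramps_webapi.api.media_importer import MediaImporter  # assumed absolute module path
from gramps_webapi.api.util import get_db_outside_request   # assumed absolute module path

db_handle = get_db_outside_request(tree="example_tree", view_private=True, readonly=True)
importer = MediaImporter(
    tree="example_tree",
    db_handle=db_handle,
    file_name="/tmp/media_archive.zip",
    delete=True,
)
stats = importer()
# stats has the shape {"missing": <int>, "uploaded": <int>, "failures": <int>}:
# distinct missing checksums, files uploaded from the archive, and failed uploads.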

gramps_webapi/api/resources/util.py

Lines changed: 0 additions & 93 deletions
@@ -1228,99 +1228,6 @@ def dry_run_import(
     }


-def run_import_media_archive(
-    tree: str,
-    db_handle: DbReadBase,
-    file_name: FilenameOrPath,
-    delete: bool = True,
-) -> Dict[str, int]:
-    """Import a media archive file."""
-    media_handler = get_media_handler(db_handle, tree=tree)
-
-    # create a dict {checksum: [(handle1, path), (handle2, path2), ...], ...}
-    # of missing files
-    handles = db_handle.get_media_handles()
-    objects = [db_handle.get_media_from_handle(handle) for handle in handles]
-    objects_existing = media_handler.filter_existing_files(objects, db_handle=db_handle)
-    handles_existing = set(obj.handle for obj in objects_existing)
-    objects_missing = [obj for obj in objects if obj.handle not in handles_existing]
-
-    checksums_handles: Dict[str, List[Tuple[str, str, str]]] = {}
-    for obj in objects_missing:
-        if obj.checksum not in checksums_handles:
-            checksums_handles[obj.checksum] = []
-        obj_details = (obj.handle, obj.get_path(), obj.get_mime_type())
-        checksums_handles[obj.checksum].append(obj_details)
-    if len(checksums_handles) == 0:
-        # no missing files
-        # delete ZIP file
-        if delete:
-            os.remove(file_name)
-        return {"missing": 0, "uploaded": 0, "failures": 0}
-
-    total_size = 0
-    with zipfile.ZipFile(file_name, "r") as zip_file:
-        # compute file size
-        for file_info in zip_file.infolist():
-            total_size += file_info.file_size
-
-        # check disk usage
-        disk_usage = shutil.disk_usage(file_name)
-        if total_size > disk_usage.free:
-            raise ValueError("Not enough free space on disk")
-
-        # extract
-        temp_dir = tempfile.mkdtemp()
-        zip_file.extractall(temp_dir)
-
-    # delete ZIP file
-    if delete:
-        os.remove(file_name)
-
-    to_upload = {}
-    # walk extracted files
-    for root, _, files in os.walk(temp_dir):
-        for name in files:
-            file_path = os.path.join(root, name)
-            with open(file_path, "rb") as f:
-                checksum = get_checksum(f)
-            if checksum in checksums_handles and checksum not in to_upload:
-                to_upload[checksum] = (file_path, os.path.getsize(file_path))
-
-    if len(to_upload) == 0:
-        # no files to upload
-
-        # delete extracted temp files
-        shutil.rmtree(temp_dir)
-
-        return {"missing": len(checksums_handles), "uploaded": 0, "failures": 0}
-
-    upload_size = sum([file_size for (file_path, file_size) in to_upload.values()])
-    check_quota_media(to_add=upload_size, tree=tree)
-
-    num_failures = 0
-    for checksum, (file_path, file_size) in to_upload.items():
-        for handle, media_path, mime in checksums_handles[checksum]:
-            with open(file_path, "rb") as f:
-                try:
-                    media_handler.upload_file(f, checksum, mime, path=media_path)
-                except Exception:
-                    num_failures += 1
-
-    # delete extracted temp files
-    shutil.rmtree(temp_dir)
-
-    # update media usage
-    usage_media = media_handler.get_media_size(db_handle=db_handle)
-    set_tree_usage(tree, usage_media=usage_media)
-
-    return {
-        "missing": len(checksums_handles),
-        "uploaded": len(to_upload) - num_failures,
-        "failures": num_failures,
-    }
-
-
 def check_fix_default_person(db_handle: Union[DbReadBase, DbWriteBase]) -> None:
     """If the db is writable, check if the default person still exists.
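
The main bookkeeping change in the refactor: the removed helper keyed missing files by checksum onto tuples, while MediaImporter._identify_missing_files returns lists of dicts and reserves the empty-string key for media objects whose checksum is still unset. A sketch of the two shapes, with made-up handles, paths, and checksums:

# old shape (run_import_media_archive): checksum -> [(handle, media_path, mime), ...]
checksums_handles = {
    "9b2f4d61c3a0": [("a1b2c3d4", "photos/grandpa.jpg", "image/jpeg")],
}

# new shape (MediaImporter): checksum -> list of detail dicts; the "" key collects
# media objects with an empty checksum, to be fixed by path matching
missing_files = {
    "9b2f4d61c3a0": [
        {"handle": "a1b2c3d4", "media_path": "photos/grandpa.jpg", "mime": "image/jpeg"}
    ],
    "": [
        {"handle": "e5f6a7b8", "media_path": "photos/grandma.jpg", "mime": "image/jpeg"}
    ],
}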

gramps_webapi/api/tasks.py

Lines changed: 4 additions & 2 deletions
@@ -32,8 +32,9 @@
 from .emails import email_confirm_email, email_new_user, email_reset_pw
 from .export import prepare_options, run_export
 from .media import get_media_handler
+from .media_importer import MediaImporter
 from .report import run_report
-from .resources.util import dry_run_import, run_import, run_import_media_archive
+from .resources.util import dry_run_import, run_import
 from .util import (
     check_quota_people,
     get_config,
@@ -209,12 +210,13 @@ def export_media(tree: str, view_private: bool) -> Dict[str, Union[str, int]]:
 def import_media_archive(tree: str, file_name: str, delete: bool = True):
     """Import a media archive."""
     db_handle = get_db_outside_request(tree=tree, view_private=True, readonly=True)
-    result = run_import_media_archive(
+    importer = MediaImporter(
         tree=tree,
         db_handle=db_handle,
         file_name=file_name,
         delete=delete,
     )
+    result = importer()
     return result
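
The path-based fallback in _fix_missing_checksums only matches an archive member whose relative path equals the (relative) path stored on the media object, so an archive assembled from the media base directory with relative member names will line up with it. A minimal sketch of building such an archive by hand, assuming a local base directory (all names made up for illustration):

import os
import zipfile

media_base = "/home/user/gramps_media"  # assumed local media base directory

with zipfile.ZipFile("/tmp/media_archive.zip", "w", zipfile.ZIP_DEFLATED) as zf:
    for root, _, files in os.walk(media_base):
        for name in files:
            abs_path = os.path.join(root, name)
            # store members relative to the base so they match Media.get_path()
            zf.write(abs_path, arcname=os.path.relpath(abs_path, media_base))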
