Skip to content

Commit a6fcfc3

Browse files
committed
use a tempfile for download
1 parent b18b86c commit a6fcfc3

File tree

1 file changed

+21
-19
lines changed

machine/archive/scripts/bucket_diff.py

Lines changed: 21 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
# /// script
22
# requires-python = ">=3.14"
33
# dependencies = [
4+
# "aiofiles",
45
# "boto3",
56
# "polars==1.36.1",
67
# "rich==14.2",
@@ -21,6 +22,7 @@
2122
from multiprocessing import Value
2223
from pathlib import Path
2324

25+
import aiofiles
2426
import s3fs
2527
import polars as pl
2628
from rich.progress import (
@@ -48,20 +50,19 @@
4850
DATABASES = ["full", "img", "genomes", "sra"]
4951

5052

51-
async def download_original(client, location):
52-
data = io.BytesIO()
53+
async def download_original(client, location, fp):
5354
try:
5455
f = await client.open_async(location)
5556
h = hashlib.new("sha256")
5657
while (chnk := await f.read(1024 * 1024)) != b"":
5758
h.update(chnk)
58-
data.write(chnk)
59+
await fp.write(chnk)
5960
sha256 = h.hexdigest()
60-
data.flush()
61+
await fp.flush()
6162
finally:
6263
await f.close()
6364

64-
return (data, sha256)
65+
return sha256
6566

6667

6768
async def upload_mirror(client, data, location):
@@ -304,20 +305,21 @@ async def process_sig(
304305
# this was already uploaded, so just return the records
305306
pass
306307
else:
307-
# TODO: save to temp file, instead of memory
308-
(data, sha256) = await download_original(src_fs, s3_path)
309-
310-
raw_sig = data.getvalue()
311-
del data
312-
sig = load_signatures(raw_sig)
313-
314-
loop = asyncio.get_running_loop()
315-
extract = partial(extract_record, sig, key, sha256, last_modified, size)
316-
records = await loop.run_in_executor(None, extract)
317-
del sig
318-
319-
if not uploaded:
320-
_result = await upload_mirror(upload_fs, raw_sig, s3_path)
308+
async with aiofiles.tempfile.NamedTemporaryFile() as data:
309+
# save to temp file, instead of memory
310+
sha256 = await download_original(src_fs, s3_path, data)
311+
await data.flush()
312+
313+
await data.seek(0)
314+
sig = load_signatures(data.name)
315+
loop = asyncio.get_running_loop()
316+
extract = partial(extract_record, sig, key, sha256, last_modified, size)
317+
records = await loop.run_in_executor(None, extract)
318+
del sig
319+
320+
await data.seek(0)
321+
if not uploaded:
322+
_result = await upload_mirror(upload_fs, data, s3_path)
321323

322324
with current_tasks.get_lock():
323325
current_tasks.value += 1

0 commit comments

Comments
 (0)