
Commit e382ae3

feat(scripts): add some iso downloading utility scripts
1 parent a7d4e99 commit e382ae3

7 files changed: +644 −1 lines changed

.gitignore

Lines changed: 12 additions & 1 deletion
@@ -243,4 +243,15 @@ public/sw.js
 public/workbox-*.js
 public/worker-*.js
 public/fallback-*.js
-public/precache.*.js
+public/precache.*.js
+
+# ================================================================
+# Exclusions for ISO Downloader Scripts
+# ================================================================
+Scans/
+results.csv
+service_account.json
+download_state.json
+OpenDirectoryDownloader.exe
+OpenDirectoryDownloader-*
+*.txt

scripts/README.md

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
# ISO Downloader Utilities

This folder contains utilities for downloading ISO files and uploading them to a shared drive.
Everything is fairly experimental; don't expect decent code quality.

## What each script does

- `downloader.py`: Designed to run on Windows with a mounted Google Drive share and a temporary download directory. Includes a time-of-day speed-limit schedule to avoid overloading the internet connection or filling up the SSD.
- `dedi.py`: Runs on either Windows or Linux with a temporary download directory. Authenticates with a Google service account stored in `service_account.json` and uploads via the Google API Python client.
- `filter.py`: Reads `unfiltered.txt` and uses the `SKIP_EXTENSIONS` and `BANNED_KEYWORDS` constants to filter out files that are not suitable for archiving.
- `rclonededi.py`: Same as `dedi.py`, but uses rclone to upload files to the remote drive, which allows a speed limit to be set on uploads (see the sketch below).
- `winworldpc.py`: Very experimental winworldpc downloader; takes a `results.csv` and tries to account for the winworldpc download limit. Designed for Windows, but can be modified to work on Linux.

All downloading scripts support a bandwidth limiter.
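
`rclonededi.py` itself is not shown in this commit view, so as a rough idea only, its upload step presumably boils down to something like the following sketch; the remote path and the bandwidth cap here are made-up placeholders, not values taken from the repository.

import subprocess

RCLONE_REMOTE = "gdrive:ISO Archives/tempuploaddir"  # hypothetical rclone remote path
UPLOAD_LIMIT = "4M"                                  # assumed upload cap, not from the repo

def upload_with_rclone(file_path: str) -> None:
    # "rclone copy <file> <remote dir>" uploads a single local file into the remote
    # directory; --bwlimit throttles the transfer so uploads don't saturate the line.
    subprocess.run(
        ["rclone", "copy", file_path, RCLONE_REMOTE, "--bwlimit", UPLOAD_LIMIT],
        check=True,
    )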

scripts/dedi.py

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
import subprocess
from pathlib import Path

from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload

# Configuration
QUEUE_FILE = "queue.txt"
DOWNLOAD_DIR = r"T:\dl2"  # DOWNLOAD_DIR = "/tmp/isodl"
SPEED_LIMIT = "1m"

# Google Drive config
SERVICE_ACCOUNT_FILE = "service_account.json"
DRIVE_FOLDER_ID = "11epaa3D4nKP1fI2RHRVP42jx_JwrRsZ6"
SHARED_DRIVE_ID = "0AJ0TLbTX04lFUk9PVA"

def get_drive_service():
    creds = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE,
        scopes=['https://www.googleapis.com/auth/drive']
    )
    return build('drive', 'v3', credentials=creds)

def list_drive_files(service, folder_id: str, shared_drive_id: str) -> set[str]:
    """Returns a set of file names already in the shared drive folder."""
    files = set()
    page_token = None

    while True:
        response = service.files().list(
            q=f"'{folder_id}' in parents and trashed = false",
            corpora='drive',
            driveId=shared_drive_id,
            includeItemsFromAllDrives=True,
            supportsAllDrives=True,
            spaces='drive',
            fields='nextPageToken, files(name)',
            pageToken=page_token
        ).execute()

        for file in response.get('files', []):
            files.add(file['name'])

        page_token = response.get('nextPageToken')
        if not page_token:
            break

    return files

def parse_queue(file_path: str) -> list[str]:
    urls = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            url = line.split("#")[0].strip()
            if url:
                urls.append(url)
    return urls

def download_url(url: str, download_dir: str, speed_limit: str, drive_service, drive_filenames: set[str]) -> None:
    filename = url.split("/")[-1]
    temp_path = Path(download_dir) / filename

    if filename in drive_filenames:
        print(f"Skipping {filename}; already exists on Google Drive.")
        return

    if temp_path.exists():
        print(f"Skipping download; file already exists locally: {temp_path}")
    else:
        print(f"Downloading: {url}")
        try:
            subprocess.run([
                "wget", "--limit-rate", speed_limit, "-P", download_dir, url
            ], check=True)
        except subprocess.CalledProcessError as e:
            print(f"Failed to download: {e.cmd} - Exit code {e.returncode}")
            return

    if temp_path.exists():
        try:
            upload_to_drive(drive_service, str(temp_path), DRIVE_FOLDER_ID)
            temp_path.unlink()
            print(f"Deleted local file after upload: {temp_path}")
        except Exception as e:
            print(f"Upload failed for {temp_path}: {e}")
    else:
        print(f"Downloaded file not found: {temp_path}")

def upload_to_drive(service, file_path: str, folder_id: str):
    file_metadata = {
        'name': Path(file_path).name,
        'parents': [folder_id]
    }
    media = MediaFileUpload(file_path, resumable=True)
    file = service.files().create(
        body=file_metadata,
        media_body=media,
        fields='id',
        supportsAllDrives=True
    ).execute()
    print(f"Uploaded to Drive: {file_path} (ID: {file.get('id')})")


def main():
    Path(DOWNLOAD_DIR).mkdir(parents=True, exist_ok=True)

    urls = parse_queue(QUEUE_FILE)
    total_files = len(urls)
    print(f"Total files to process: {total_files}")

    drive_service = get_drive_service()

    drive_filenames = list_drive_files(drive_service, DRIVE_FOLDER_ID, SHARED_DRIVE_ID)
    print(f"Found {len(drive_filenames)} existing files in the shared folder.")

    completed = 0
    for url in urls:
        download_url(url, DOWNLOAD_DIR, SPEED_LIMIT, drive_service, drive_filenames)
        completed += 1
        percent = (completed / total_files) * 100
        print(f"Progress: {completed}/{total_files} files processed ({percent:.1f}%)")

if __name__ == "__main__":
    main()

scripts/downloader.py

Lines changed: 140 additions & 0 deletions
@@ -0,0 +1,140 @@
import subprocess
from pathlib import Path
import shutil
from datetime import datetime
from time import sleep

# Configuration
QUEUE_FILE = "queue.txt"
FILTERED_OUTPUT_FILE = "filtered.txt"
DOWNLOAD_DIR = r"T:\dl"
FINAL_DIR = r"G:\Shared drives\ISO Archives\tempuploaddir"
BANDWIDTH_SCHEDULE = {
    "00:00": "3M",
    "00:30": "3M",
    "01:00": "3M",
    "01:30": "4M",
    "02:00": "4M",
    "02:30": "4M",
    "03:00": "5M",
    "03:30": "5M",
    "04:00": "6M",
    "04:30": "6M",
    "05:00": "6M",
    "05:30": "7M",
    "06:00": "7M",
    "06:30": "7M",
    "07:00": "6M",
    "07:30": "5M",
    "08:00": "3M",
    "08:30": "3M",
    "09:00": "3M",
    "09:30": "3M",
    "10:00": "3M",
    "10:30": "3M",
    "11:00": "3M",
    "11:30": "3M",
    "12:00": "3M",
    "12:30": "3M",
    "13:00": "3M",
    "13:30": "3M",
    "14:00": "3M",
    "14:30": "3M",
    "15:00": "3M",
    "15:30": "3M",
    "16:00": "3M",
    "16:30": "3M",
    "17:00": "3M",
    "17:30": "1M",
    "18:00": "1M",
    "18:30": "1M",
    "19:00": "1M",
    "19:30": "1M",
    "20:00": "1M",
    "20:30": "1M",
    "21:00": "1M",
    "21:30": "1M",
    "22:00": "1M",
    "22:30": "1M",
    "23:00": "1M",
    "23:30": "3M",
}

def parse_queue(file_path: str) -> list[str]:
    urls = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            url = line.split("#")[0].strip()
            if url:
                urls.append(url)
    return urls

def get_current_speed_limit() -> str:
    """Return the appropriate bandwidth limit based on current time."""
    now = datetime.now()
    current_minutes = now.hour * 60 + now.minute

    # Convert schedule times to minutes for comparison
    times_in_minutes = sorted(
        ((int(t.split(":")[0]) * 60 + int(t.split(":")[1]), rate) for t, rate in BANDWIDTH_SCHEDULE.items())
    )

    selected_rate = times_in_minutes[0][1]  # default to the first entry
    for t_minutes, rate in times_in_minutes:
        if current_minutes >= t_minutes:
            selected_rate = rate
        else:
            break
    return selected_rate

def download_url(url: str, download_dir: str, final_dir: str) -> None:
    filename = url.split("/")[-1]
    final_path = Path(final_dir) / filename

    if final_path.exists():
        print(f"Skipping download; file already exists: {final_path}")
        return

    speed_limit = get_current_speed_limit()
    print(f"Downloading: {url} (Speed limit: {speed_limit})")

    try:
        subprocess.run([
            "wget", "--limit-rate", speed_limit, "-P", download_dir, url
        ], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Failed to download: {e.cmd} - Exit code {e.returncode}")
        return

    temp_path = Path(download_dir) / filename

    sleep(1)

    if temp_path.exists():
        print(f"Moving {temp_path} to {final_path}")
        shutil.move(str(temp_path), str(final_path))
    else:
        print(f"Downloaded file not found: {temp_path}")

def main():
    Path(DOWNLOAD_DIR).mkdir(parents=True, exist_ok=True)
    Path(FINAL_DIR).mkdir(parents=True, exist_ok=True)

    urls = parse_queue(QUEUE_FILE)
    total_files = len(urls)

    print(f"Total files to download: {total_files}")

    completed = 0

    for url in urls:
        download_url(url, DOWNLOAD_DIR, FINAL_DIR)
        completed += 1
        percent = (completed / total_files) * 100
        print(f"Progress: {completed}/{total_files} files downloaded ({percent:.1f}%)")

if __name__ == "__main__":
    main()
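
For a concrete feel of how the schedule lookup in `get_current_speed_limit()` behaves, here is a small standalone check; `pick_rate` is a helper written only for this illustration and repeats the same selection rule for an arbitrary clock time, using the `BANDWIDTH_SCHEDULE` constant above.

def pick_rate(hour: int, minute: int, schedule: dict[str, str]) -> str:
    # Same rule as get_current_speed_limit(): take the latest schedule entry
    # whose start time is not after the given time.
    current = hour * 60 + minute
    entries = sorted(
        (int(t[:2]) * 60 + int(t[3:]), rate) for t, rate in schedule.items()
    )
    selected = entries[0][1]
    for minutes, rate in entries:
        if current >= minutes:
            selected = rate
        else:
            break
    return selected

print(pick_rate(7, 45, BANDWIDTH_SCHEDULE))   # "5M" - 07:45 falls in the 07:30 slot
print(pick_rate(17, 40, BANDWIDTH_SCHEDULE))  # "1M" - evening slots are throttled hardest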

scripts/filter.py

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
SKIP_EXTENSIONS = (
    ".rpm", ".repo", ".deb", ".db", ".pkg.tar", ".pkg.tar.zst", ".pkg", ".xml.gz", ".xml",
    ".xml.zck", "xml.xz", ".sqlite.gz", ".sqlite.xz", ".cfg", ".conf", "gpl", ".pf2",
    "vmlinuz", ".txt", ".efi", ".manifest", ".sqlite.bz2", ".gpg", ".html", "gpg-key",
    ".css", ".js", ".php", "gpg-key-beta", "gpg-key-fedora", "gpg-key-fedora-rawhide",
    "gpg-key-fedora-test", "gpg-key-rawhide", ".png", ".dtb", "vmlinuz-lpae",
    "memtest", "license", "vmlinuz-pae", "gpg-key-fedora-x86_64", "compose_id", ".o", "tbl", ".torrent",
    ".json", ".mod", "readme", "lst", "c32", ".yaml.gz", "community-charter", "eula", ".qcow2", ".vhd", ".box",
    ".hdr", ".sh", ".msg", ".lss", "boot.cat", ".list", ".patch", "time", "filelist.gz", "dir_sizes", "empty_repo"
)
BANNED_KEYWORDS = [
    "/os/", "rpm", "source", "jigdo", "template", "_toolchain", "netinst", "xml", "fullfile", "metadata",
    "initrd.img", "kickstart", "azure", "ec2", "cloud", "/repo/", "/updates/", "/kmods/", "/headers/", "manifest",
    "pkglist", "srclist", "docs", "/base/", "scripts", "release-notes", "isolinux", "copying", "autorun", "autoboot",
    "/de/", "/es/", "/it/", "/fr/", "/i386/images/", "/build/livecd/"
]

FILTERED_OUTPUT_FILE = "filtered.txt"
INPUT_FILE = "unfiltered.txt"

def should_skip(url: str) -> bool:
    url_lc = url.lower()
    if url_lc.endswith(SKIP_EXTENSIONS):
        return True
    for keyword in BANNED_KEYWORDS:
        if keyword in url_lc:
            return True
    return False

def parse_queue(file_path: str) -> list[str]:
    urls = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            url = line.split("#")[0].strip()
            if url and not should_skip(url):
                urls.append(url)
    return urls

urls = parse_queue(INPUT_FILE)

with open(FILTERED_OUTPUT_FILE, "w") as f:
    for url in urls:
        f.write(f"{url}\n")
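
As a quick sanity check of the filtering rules above, the calls below use made-up URLs rather than entries from a real scan: the first matches both the "/os/" keyword and the ".xml" extension and is dropped, while a plain installer ISO passes through.

print(should_skip("https://mirror.example.org/fedora/40/Everything/x86_64/os/repodata/repomd.xml"))               # True
print(should_skip("https://mirror.example.org/fedora/40/Workstation/x86_64/iso/Fedora-Workstation-Live-40.iso"))  # False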

0 commit comments
