
Commit e382ae3

feat(scripts): add some iso downloading utility scripts
1 parent a7d4e99 commit e382ae3

7 files changed: +644 −1 lines changed

.gitignore

Lines changed: 12 additions & 1 deletion
@@ -243,4 +243,15 @@ public/sw.js
 public/workbox-*.js
 public/worker-*.js
 public/fallback-*.js
-public/precache.*.js
+public/precache.*.js
+
+# ================================================================
+# Exclusions for ISO Downloader Scripts
+# ================================================================
+Scans/
+results.csv
+service_account.json
+download_state.json
+OpenDirectoryDownloader.exe
+OpenDirectoryDownloader-*
+*.txt

scripts/README.md

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
# ISO Downloader Utilities

This folder contains utilities for downloading ISO files and uploading them to a shared drive.
Everything is fairly experimental; don't expect decent code quality.

## What each script does

- `downloader.py`: Designed to run on Windows with a mounted Google Drive share and a temporary download directory. Includes a time-of-day speed-limit schedule to avoid overloading the internet connection or filling up the SSD.
- `dedi.py`: Runs on either Windows or Linux with a temporary download directory. Authenticates with a Google service account stored in `service_account.json` and uploads via the Google API Python client.
- `filter.py`: Reads `unfiltered.txt` and uses the `SKIP_EXTENSIONS` and `BANNED_KEYWORDS` constants to filter out files that are not suitable for archiving.
- `rclonededi.py`: Same as `dedi.py`, but uses rclone to upload files to the remote drive, which allows a speed limit to be set on uploads (see the sketch below).
- `winworldpc.py`: Very experimental winworldpc downloader; takes a `results.csv` and tries to account for the winworldpc download limit. Designed for Windows, but can be modified to work on Linux.

All downloading scripts support a bandwidth limiter.
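
`rclonededi.py` itself is not shown in this commit view, so as a rough idea only, its upload step presumably boils down to something like the following sketch; the remote path and the bandwidth cap here are made-up placeholders, not values taken from the repository.

import subprocess

RCLONE_REMOTE = "gdrive:ISO Archives/tempuploaddir"  # hypothetical rclone remote path
UPLOAD_LIMIT = "4M"                                  # assumed upload cap, not from the repo

def upload_with_rclone(file_path: str) -> None:
    # "rclone copy <file> <remote dir>" uploads a single local file into the remote
    # directory; --bwlimit throttles the transfer so uploads don't saturate the line.
    subprocess.run(
        ["rclone", "copy", file_path, RCLONE_REMOTE, "--bwlimit", UPLOAD_LIMIT],
        check=True,
    )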

scripts/dedi.py

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
import subprocess
from pathlib import Path

from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload

# Configuration
QUEUE_FILE = "queue.txt"
DOWNLOAD_DIR = r"T:\dl2"  # DOWNLOAD_DIR = "/tmp/isodl"
SPEED_LIMIT = "1m"

# Google Drive config
SERVICE_ACCOUNT_FILE = "service_account.json"
DRIVE_FOLDER_ID = "11epaa3D4nKP1fI2RHRVP42jx_JwrRsZ6"
SHARED_DRIVE_ID = "0AJ0TLbTX04lFUk9PVA"

def get_drive_service():
    creds = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE,
        scopes=['https://www.googleapis.com/auth/drive']
    )
    return build('drive', 'v3', credentials=creds)

def list_drive_files(service, folder_id: str, shared_drive_id: str) -> set[str]:
    """Returns a set of file names already in the shared drive folder."""
    files = set()
    page_token = None

    while True:
        response = service.files().list(
            q=f"'{folder_id}' in parents and trashed = false",
            corpora='drive',
            driveId=shared_drive_id,
            includeItemsFromAllDrives=True,
            supportsAllDrives=True,
            spaces='drive',
            fields='nextPageToken, files(name)',
            pageToken=page_token
        ).execute()

        for file in response.get('files', []):
            files.add(file['name'])

        page_token = response.get('nextPageToken')
        if not page_token:
            break

    return files

def parse_queue(file_path: str) -> list[str]:
    urls = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            url = line.split("#")[0].strip()
            if url:
                urls.append(url)
    return urls

def download_url(url: str, download_dir: str, speed_limit: str, drive_service, drive_filenames: set[str]) -> None:
    filename = url.split("/")[-1]
    temp_path = Path(download_dir) / filename

    if filename in drive_filenames:
        print(f"Skipping {filename}; already exists on Google Drive.")
        return

    if temp_path.exists():
        print(f"Skipping download; file already exists locally: {temp_path}")
    else:
        print(f"Downloading: {url}")
        try:
            subprocess.run([
                "wget", "--limit-rate", speed_limit, "-P", download_dir, url
            ], check=True)
        except subprocess.CalledProcessError as e:
            print(f"Failed to download: {e.cmd} - Exit code {e.returncode}")
            return

    if temp_path.exists():
        try:
            upload_to_drive(drive_service, str(temp_path), DRIVE_FOLDER_ID)
            temp_path.unlink()
            print(f"Deleted local file after upload: {temp_path}")
        except Exception as e:
            print(f"Upload failed for {temp_path}: {e}")
    else:
        print(f"Downloaded file not found: {temp_path}")

def upload_to_drive(service, file_path: str, folder_id: str):
    file_metadata = {
        'name': Path(file_path).name,
        'parents': [folder_id]
    }
    media = MediaFileUpload(file_path, resumable=True)
    file = service.files().create(
        body=file_metadata,
        media_body=media,
        fields='id',
        supportsAllDrives=True
    ).execute()
    print(f"Uploaded to Drive: {file_path} (ID: {file.get('id')})")


def main():
    Path(DOWNLOAD_DIR).mkdir(parents=True, exist_ok=True)

    urls = parse_queue(QUEUE_FILE)
    total_files = len(urls)
    print(f"Total files to process: {total_files}")

    drive_service = get_drive_service()

    drive_filenames = list_drive_files(drive_service, DRIVE_FOLDER_ID, SHARED_DRIVE_ID)
    print(f"Found {len(drive_filenames)} existing files in the shared folder.")

    completed = 0
    for url in urls:
        download_url(url, DOWNLOAD_DIR, SPEED_LIMIT, drive_service, drive_filenames)
        completed += 1
        percent = (completed / total_files) * 100
        print(f"Progress: {completed}/{total_files} files processed ({percent:.1f}%)")

if __name__ == "__main__":
    main()

scripts/downloader.py

Lines changed: 140 additions & 0 deletions
@@ -0,0 +1,140 @@
import subprocess
from pathlib import Path
import shutil
from datetime import datetime
from time import sleep

# Configuration
QUEUE_FILE = "queue.txt"
FILTERED_OUTPUT_FILE = "filtered.txt"
DOWNLOAD_DIR = r"T:\dl"
FINAL_DIR = r"G:\Shared drives\ISO Archives\tempuploaddir"
BANDWIDTH_SCHEDULE = {
    "00:00": "3M",
    "00:30": "3M",
    "01:00": "3M",
    "01:30": "4M",
    "02:00": "4M",
    "02:30": "4M",
    "03:00": "5M",
    "03:30": "5M",
    "04:00": "6M",
    "04:30": "6M",
    "05:00": "6M",
    "05:30": "7M",
    "06:00": "7M",
    "06:30": "7M",
    "07:00": "6M",
    "07:30": "5M",
    "08:00": "3M",
    "08:30": "3M",
    "09:00": "3M",
    "09:30": "3M",
    "10:00": "3M",
    "10:30": "3M",
    "11:00": "3M",
    "11:30": "3M",
    "12:00": "3M",
    "12:30": "3M",
    "13:00": "3M",
    "13:30": "3M",
    "14:00": "3M",
    "14:30": "3M",
    "15:00": "3M",
    "15:30": "3M",
    "16:00": "3M",
    "16:30": "3M",
    "17:00": "3M",
    "17:30": "1M",
    "18:00": "1M",
    "18:30": "1M",
    "19:00": "1M",
    "19:30": "1M",
    "20:00": "1M",
    "20:30": "1M",
    "21:00": "1M",
    "21:30": "1M",
    "22:00": "1M",
    "22:30": "1M",
    "23:00": "1M",
    "23:30": "3M",
}

def parse_queue(file_path: str) -> list[str]:
    urls = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            url = line.split("#")[0].strip()
            if url:
                urls.append(url)
    return urls

def get_current_speed_limit() -> str:
    """Return the appropriate bandwidth limit based on current time."""
    now = datetime.now()
    current_minutes = now.hour * 60 + now.minute

    # Convert schedule times to minutes for comparison
    times_in_minutes = sorted(
        ((int(t.split(":")[0]) * 60 + int(t.split(":")[1]), rate) for t, rate in BANDWIDTH_SCHEDULE.items())
    )

    selected_rate = times_in_minutes[0][1]  # default to the first entry
    for t_minutes, rate in times_in_minutes:
        if current_minutes >= t_minutes:
            selected_rate = rate
        else:
            break
    return selected_rate

def download_url(url: str, download_dir: str, final_dir: str) -> None:
    filename = url.split("/")[-1]
    final_path = Path(final_dir) / filename

    if final_path.exists():
        print(f"Skipping download; file already exists: {final_path}")
        return

    speed_limit = get_current_speed_limit()
    print(f"Downloading: {url} (Speed limit: {speed_limit})")

    try:
        subprocess.run([
            "wget", "--limit-rate", speed_limit, "-P", download_dir, url
        ], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Failed to download: {e.cmd} - Exit code {e.returncode}")
        return

    temp_path = Path(download_dir) / filename

    sleep(1)

    if temp_path.exists():
        print(f"Moving {temp_path} to {final_path}")
        shutil.move(str(temp_path), str(final_path))
    else:
        print(f"Downloaded file not found: {temp_path}")

def main():
    Path(DOWNLOAD_DIR).mkdir(parents=True, exist_ok=True)
    Path(FINAL_DIR).mkdir(parents=True, exist_ok=True)

    urls = parse_queue(QUEUE_FILE)
    total_files = len(urls)

    print(f"Total files to download: {total_files}")

    completed = 0

    for url in urls:
        download_url(url, DOWNLOAD_DIR, FINAL_DIR)
        completed += 1
        percent = (completed / total_files) * 100
        print(f"Progress: {completed}/{total_files} files downloaded ({percent:.1f}%)")

if __name__ == "__main__":
    main()
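
For a concrete feel of how the schedule lookup in `get_current_speed_limit()` behaves, here is a small standalone check; `pick_rate` is a helper written only for this illustration and repeats the same selection rule for an arbitrary clock time, using the `BANDWIDTH_SCHEDULE` constant above.

def pick_rate(hour: int, minute: int, schedule: dict[str, str]) -> str:
    # Same rule as get_current_speed_limit(): take the latest schedule entry
    # whose start time is not after the given time.
    current = hour * 60 + minute
    entries = sorted(
        (int(t[:2]) * 60 + int(t[3:]), rate) for t, rate in schedule.items()
    )
    selected = entries[0][1]
    for minutes, rate in entries:
        if current >= minutes:
            selected = rate
        else:
            break
    return selected

print(pick_rate(7, 45, BANDWIDTH_SCHEDULE))   # "5M" - 07:45 falls in the 07:30 slot
print(pick_rate(17, 40, BANDWIDTH_SCHEDULE))  # "1M" - evening slots are throttled hardest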

scripts/filter.py

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
SKIP_EXTENSIONS = (
    ".rpm", ".repo", ".deb", ".db", ".pkg.tar", ".pkg.tar.zst", ".pkg", ".xml.gz", ".xml",
    ".xml.zck", "xml.xz", ".sqlite.gz", ".sqlite.xz", ".cfg", ".conf", "gpl", ".pf2",
    "vmlinuz", ".txt", ".efi", ".manifest", ".sqlite.bz2", ".gpg", ".html", "gpg-key",
    ".css", ".js", ".php", "gpg-key-beta", "gpg-key-fedora", "gpg-key-fedora-rawhide",
    "gpg-key-fedora-test", "gpg-key-rawhide", ".png", ".dtb", "vmlinuz-lpae",
    "memtest", "license", "vmlinuz-pae", "gpg-key-fedora-x86_64", "compose_id", ".o", "tbl", ".torrent",
    ".json", ".mod", "readme", "lst", "c32", ".yaml.gz", "community-charter", "eula", ".qcow2", ".vhd", ".box",
    ".hdr", ".sh", ".msg", ".lss", "boot.cat", ".list", ".patch", "time", "filelist.gz", "dir_sizes", "empty_repo"
)
BANNED_KEYWORDS = [
    "/os/", "rpm", "source", "jigdo", "template", "_toolchain", "netinst", "xml", "fullfile", "metadata",
    "initrd.img", "kickstart", "azure", "ec2", "cloud", "/repo/", "/updates/", "/kmods/", "/headers/", "manifest",
    "pkglist", "srclist", "docs", "/base/", "scripts", "release-notes", "isolinux", "copying", "autorun", "autoboot",
    "/de/", "/es/", "/it/", "/fr/", "/i386/images/", "/build/livecd/"
]

FILTERED_OUTPUT_FILE = "filtered.txt"
INPUT_FILE = "unfiltered.txt"

def should_skip(url: str) -> bool:
    url_lc = url.lower()
    if url_lc.endswith(SKIP_EXTENSIONS):
        return True
    for keyword in BANNED_KEYWORDS:
        if keyword in url_lc:
            return True
    return False

def parse_queue(file_path: str) -> list[str]:
    urls = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            url = line.split("#")[0].strip()
            if url and not should_skip(url):
                urls.append(url)
    return urls

urls = parse_queue(INPUT_FILE)

with open(FILTERED_OUTPUT_FILE, "w") as f:
    for url in urls:
        f.write(f"{url}\n")
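
As a quick sanity check of the filtering rules above, the calls below use made-up URLs rather than entries from a real scan: the first matches both the "/os/" keyword and the ".xml" extension and is dropped, while a plain installer ISO passes through.

print(should_skip("https://mirror.example.org/fedora/40/Everything/x86_64/os/repodata/repomd.xml"))               # True
print(should_skip("https://mirror.example.org/fedora/40/Workstation/x86_64/iso/Fedora-Workstation-Live-40.iso"))  # False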

0 commit comments
