Skip to content

Commit 1805a77

Browse files
committed
Add parallelised preprocessing option, other minor fixes
1 parent 5a6725f commit 1805a77

File tree

4 files changed

+70
-14
lines changed

4 files changed

+70
-14
lines changed

README.md

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,27 @@ Be careful using this option, as it creates a much larger number of files, and t
201201

202202
> [!WARNING]
203203
> This script in particular will use a large amount of RAM, since it loads the entire dataset into memory at once.
204-
> Processing may be done in batches by using the `--max-files` and `--skip-files` command-line arguments.
204+
> Processing may be done in batches by using the `--max-files` and `--skip-files` command-line arguments, or the script below.
205+
206+
##### np-to-tfrecord-parallel.sh
207+
208+
This script runs multiple instances of `np-to-tfrecord.py` in parallel, speeding up preprocessing and/or reducing the amount of RAM required at once.
209+
210+
Usage:
211+
```bash
212+
np-to-tfrecord-parallel.sh <NUM PROCESSES> <FILES PER PROCESS> <INPUT PATH> <OUTPUT PATH>
213+
```
214+
Where:
215+
- `INPUT PATH` contains your `.npy` files, as above.
216+
- `OUTPUT PATH` is the desired output directory.
217+
- `NUM PROCESSES` is the number of CPU cores to use.
218+
- `FILES PER PROCESS` is the number of files each process should load at once.
219+
220+
Ensure that `NUM_PROCESSES * FILES_PER_PROCESS` input files can fit comfortably in RAM.
221+
222+
> [!NOTE]
223+
> Shuffling is disabled by default in this script - if shuffled data is desired, the `--no-shuffle` flag should be removed from the script.
224+
> If this flag is removed, shuffling will only be done on a per-process level - that is, each process will shuffle the files it has loaded, but not the dataset as a whole.
205225
206226

207227
#### sqlite3-compress.py

analysis/plots-data.ipynb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -311,8 +311,8 @@
311311
"metadata": {},
312312
"outputs": [],
313313
"source": [
314-
"data_dir = data_base + \"/filtered\"\n",
315-
"suffixes = [\"i\",\"j\",\"k\",\"l\",\"m\",\"n\",\"o\",\"p\",\"q\",\"r\",\"s\",\"t\",\"u\"]"
314+
"data_dir = data_base\n",
315+
"suffixes = [f\"{i:03d}\" for i in range(171)]"
316316
]
317317
},
318318
{
@@ -325,8 +325,8 @@
325325
" return sat_id * num_cells + sat_cell\n",
326326
"\n",
327327
"def load_data(path, suffix):\n",
328-
" file_ids = os.path.join(path, f\"ids-{suffix}.npy\")\n",
329-
" file_cells = os.path.join(path, f\"cells-{suffix}.npy\")\n",
328+
" file_ids = os.path.join(path, f\"ra_sat_{suffix}.npy\")\n",
329+
" file_cells = os.path.join(path, f\"ra_cell_{suffix}.npy\")\n",
330330
"\n",
331331
" return np.load(file_ids), np.load(file_cells)\n",
332332
"\n",
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/bin/bash
# Convert the numpy arrays to tfrecords in parallel, using a specified
# number of processes. Each round launches num_processes workers, each
# handling files_per_process input files, then waits for all of them
# before starting the next round (bounds peak RAM usage).
#
# Usage: np-to-tfrecord-parallel.sh <num_processes> <files_per_process> <path_in> <path_out>

# If not enough arguments are specified, print help and exit.
if [ $# -ne 4 ]; then
    echo "Usage: $0 <num_processes> <files_per_process> <path_in> <path_out>"
    exit 1
fi

# Take arguments from the command line.
num_processes=$1
files_per_process=$2
path_in=$3
path_out=$4

# Count input files with a glob instead of parsing `ls` output, which
# breaks on filenames containing whitespace or glob characters.
shopt -s nullglob
input_files=("$path_in"/samples_*.npy)
num_files=${#input_files[@]}

if [ "$num_files" -eq 0 ]; then
    echo "Error: no samples_*.npy files found in '$path_in'" >&2
    exit 1
fi

# Number of input files consumed by one round of parallel workers.
step_size=$((num_processes * files_per_process))

# Run the conversion in parallel, one round per outer iteration.
for start in $(seq 0 "$step_size" $((num_files - 1))); do
    end=$((start + step_size))
    if [ "$end" -gt "$num_files" ]; then
        end=$num_files
    fi
    echo "Starting processes with files $start to $end"
    for j in $(seq 0 $((num_processes - 1))); do
        skip_files=$((start + j * files_per_process))
        # Don't launch workers whose slice starts past the last file
        # (happens in the final, partial round).
        if [ "$skip_files" -ge "$num_files" ]; then
            break
        fi
        echo "  Starting process $j with files $skip_files to $((skip_files + files_per_process))"
        python3 np-to-tfrecord.py --path-in="$path_in" --path-out="$path_out" \
            --skip-files="$skip_files" --max-files="$files_per_process" --no-shuffle &
    done
    # Wait for every worker in this round before starting the next.
    wait
done

preprocessing/np-to-tfrecord.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def save_dataset(path, suffix, samples_array, ids_array, cells_array):
4242
}))
4343
writer.write(example.SerializeToString())
4444

45-
def save_dataset_batches(path, chunk_size, samples_array, ids_array, cells_array, verbose):
45+
def save_dataset_batches(path, chunk_size, samples_array, ids_array, cells_array, verbose, skip_count=None):
4646
chunk_count = 0
4747

4848
# Create directory if it doesn't exist
@@ -63,19 +63,21 @@ def save_dataset_batches(path, chunk_size, samples_array, ids_array, cells_array
6363
ids_array = ids_array[chunk_size:]
6464
cells_array = cells_array[chunk_size:]
6565

66-
save_dataset(path, str(chunk_count), s, i, c)
66+
suffix = f"{skip_count}-{chunk_count}" if skip_count is not None else f"{chunk_count}"
67+
save_dataset(path, suffix, s, i, c)
6768
chunk_count += 1
6869

6970
if samples_array.shape[0] > 0:
7071
if verbose:
7172
print(f"Saving chunk {chunk_count}...")
7273
print(f"Samples remaining: {samples_array.shape[0]}")
73-
save_dataset(path, str(chunk_count), samples_array, ids_array, cells_array)
74+
suffix = f"{skip_count}-{chunk_count}" if skip_count is not None else f"{chunk_count}"
75+
save_dataset(path, suffix, samples_array, ids_array, cells_array)
7476
chunk_count += 1
7577

7678
return chunk_count
7779

78-
def process_all(chunk_size, path_in, path_out, max_files=None, skip_files=0, verbose=False, shuffle=True, by_id=False):
80+
def process_all(chunk_size, path_in, path_out, max_files=None, skip_files=None, verbose=False, shuffle=True, by_id=False):
7981
samples_array = None
8082
ids_array = None
8183
cells_array = None
@@ -86,7 +88,8 @@ def process_all(chunk_size, path_in, path_out, max_files=None, skip_files=0, ver
8688
suffixes = [ f for f in os.listdir(path_in) if f.startswith("samples_") and f.endswith(".npy") ]
8789
suffixes.sort()
8890
suffixes = [ f[8:-4] for f in suffixes ]
89-
suffixes = suffixes[skip_files:]
91+
if skip_files is not None:
92+
suffixes = suffixes[skip_files:]
9093
if max_files is not None:
9194
suffixes = suffixes[:max_files]
9295

@@ -145,7 +148,7 @@ def process_all(chunk_size, path_in, path_out, max_files=None, skip_files=0, ver
145148
if verbose:
146149
print("Done")
147150

148-
save_dataset_batches(path_out_id, chunk_size, samples_array_unique_subset, ids_array_unique_subset, cells_array_unique_subset, verbose)
151+
save_dataset_batches(path_out_id, chunk_size, samples_array_unique_subset, ids_array_unique_subset, cells_array_unique_subset, verbose, skip_count=skip_files)
149152

150153
del samples_array_unique_subset
151154
del ids_array_unique_subset
@@ -180,7 +183,7 @@ def process_all(chunk_size, path_in, path_out, max_files=None, skip_files=0, ver
180183
if verbose:
181184
print("Done")
182185

183-
save_dataset_batches(path_out_test, chunk_size, samples_array_unique_subset, ids_array_unique_subset, cells_array_unique_subset, verbose)
186+
save_dataset_batches(path_out_test, chunk_size, samples_array_unique_subset, ids_array_unique_subset, cells_array_unique_subset, verbose, skip_count=skip_files)
184187

185188
del samples_array_unique_subset
186189
del ids_array_unique_subset
@@ -200,7 +203,7 @@ def process_all(chunk_size, path_in, path_out, max_files=None, skip_files=0, ver
200203
if verbose:
201204
print("Done")
202205

203-
chunk_count = save_dataset_batches(path_out, chunk_size, samples_array, ids_array, cells_array, verbose)
206+
chunk_count = save_dataset_batches(path_out, chunk_size, samples_array, ids_array, cells_array, verbose, skip_count=skip_files)
204207

205208
if verbose:
206209
print("Total messages: {}".format(message_count))
@@ -216,7 +219,7 @@ def process_all(chunk_size, path_in, path_out, max_files=None, skip_files=0, ver
216219
parser.add_argument("--path-in", type=str, default=path_in, help="Input directory.")
217220
parser.add_argument("--path-out", type=str, default=path_out, help="Output directory.")
218221
parser.add_argument("--max-files", type=int, default=None, help="Maximum number of input files to process.")
219-
parser.add_argument("--skip-files", type=int, default=0, help="Number of input files to skip.")
222+
parser.add_argument("--skip-files", type=int, default=None, help="Number of input files to skip.")
220223
parser.add_argument("--no-shuffle", action='store_true', help="Do not shuffle data.")
221224
parser.add_argument("--by-id", action='store_true', help="Create datasets with different percentages of the most common IDs. WARNING: This will create a lot of datasets, and take a long time!")
222225
parser.add_argument("-v", "--verbose", action='store_true', help="Display progress.")

0 commit comments

Comments
 (0)