Lucaslab-Berkeley
diff --git a/‎docs/programs/refine_template.md‎
Lines changed: 11 additions & 3 deletions b/‎docs/programs/refine_template.md‎
Lines changed: 11 additions & 3 deletions
diff --git a/‎src/leopard_em/backend/core_match_template.py‎
Lines changed: 12 additions & 193 deletions b/‎src/leopard_em/backend/core_match_template.py‎
Lines changed: 12 additions & 193 deletions
@@ -125,9 +125,9 @@ All these parameters are discussed in more detail on the [match template program
 
 ### Configuring GPUs for a match template run
 
-Template refinement can run across multiple GPUs and is controlled in the same way as match template.
-Note that the `num_cpus` field is currently unused for the refine template program and can just be set to one.
-Like [configuring GPUs for a match template run](match_template.md#configuring-gpus-for-a-match-template-run), GPUs are targeted by their device index.
+The refine template program parallelizes across multiple GPUs by splitting which particles are refined across the configured list of GPU devices.
+The `num_cpus` field controls how many concurrent streams of work are being submitted to each GPUs; in most cases, a value of `1` or `2` will saturate the GPU and give the best performance, although your mileage may vary.
+Like [configuring GPUs for a match template run](match_template.md#configuring-gpus-for-a-match-template-run), GPUs are targeted by their device index or the special string `"all"`
 The following configuration will run `refine_template` on GPU zero.
 
 ```yaml
@@ -136,6 +136,14 @@ computational_config:
   num_cpus: 1
 ```
 
+The following configuration will run `refine_template` on all available GPUs with two streams per GPU.
+
+```yaml
+computational_config:
+  gpu_ids: "all"
+  num_cpus: 2
+```
+
 ## Running the refine template program
 
 Once you've configured a YAML file, running the refine template program is fairly simple.
 
@@ -10,19 +10,19 @@
 import roma
 import torch
 import tqdm
-from torch_fourier_slice import extract_central_slices_rfft_3d
 
+from leopard_em.backend.cross_correlation import (
+    do_streamed_orientation_cross_correlate,
+)
 from leopard_em.backend.process_results import (
     aggregate_distributed_results,
     scale_mip,
 )
 from leopard_em.backend.utils import (
-    do_iteration_statistics_updates,
-    normalize_template_projection,
+    do_iteration_statistics_updates_compiled,
     run_multiprocess_jobs,
 )
 
-COMPILE_BACKEND = "inductor"
 DEFAULT_STATISTIC_DTYPE = torch.float32
 
 # Turn off gradient calculations by default
@@ -31,13 +31,6 @@
 # Set multiprocessing start method to spawn
 set_start_method("spawn", force=True)
 
-normalize_template_projection_compiled = torch.compile(
-    normalize_template_projection, backend=COMPILE_BACKEND
-)
-do_iteration_statistics_updates_compiled = torch.compile(
-    do_iteration_statistics_updates, backend=COMPILE_BACKEND
-)
-
 
 ###########################################################
 ###      Main function for whole orientation search     ###
@@ -396,6 +389,10 @@ def _core_match_template_single_gpu(
     ### Setup iterator object with tqdm for progress bar ###
     ########################################################
 
+    total_projections = (
+        euler_angles.shape[0] * defocus_values.shape[0] * pixel_values.shape[0]
+    )
+
     num_batches = math.ceil(euler_angles.shape[0] / orientation_batch_size)
     orientation_batch_iterator = tqdm.tqdm(
         range(num_batches),
@@ -405,10 +402,9 @@ def _core_match_template_single_gpu(
         dynamic_ncols=True,
         position=device.index,
         mininterval=1,  # Slow down to reduce number of lines written
-    )
-
-    total_projections = (
-        euler_angles.shape[0] * defocus_values.shape[0] * pixel_values.shape[0]
+        smoothing=0.05,
+        unit="corr",
+        unit_scale=total_projections / num_batches,
     )
 
     ##################################
@@ -423,7 +419,7 @@ def _core_match_template_single_gpu(
             "ZYZ", euler_angles_batch, degrees=True, device=device
         )
 
-        cross_correlation = _do_bached_orientation_cross_correlate(
+        cross_correlation = do_streamed_orientation_cross_correlate(
             image_dft=image_dft,
             template_dft=template_dft,
             rotation_matrices=rot_matrix,
@@ -466,180 +462,3 @@ def _core_match_template_single_gpu(
     # Place the results in the shared multi-process manager dictionary so accessible
     # by the main process.
     result_dict[device_id] = result
-
-
-def _do_bached_orientation_cross_correlate(
-    image_dft: torch.Tensor,
-    template_dft: torch.Tensor,
-    rotation_matrices: torch.Tensor,
-    projective_filters: torch.Tensor,
-    streams: list[torch.cuda.Stream],
-) -> torch.Tensor:
-    """Batched projection and cross-correlation with fixed (batched) filters.
-
-    Note that this function returns a cross-correlogram with "same" mode (i.e. the
-    same size as the input image). See numpy correlate docs for more information.
-
-    Parameters
-    ----------
-    image_dft : torch.Tensor
-        Real-fourier transform (RFFT) of the image with large image filters
-        already applied. Has shape (H, W // 2 + 1).
-    template_dft : torch.Tensor
-        Real-fourier transform (RFFT) of the template volume to take Fourier
-        slices from. Has shape (l, h, w // 2 + 1) where (l, h, w) is the original
-        real-space shape of the template volume.
-    rotation_matrices : torch.Tensor
-        Rotation matrices to apply to the template volume. Has shape
-        (num_orientations, 3, 3).
-    projective_filters : torch.Tensor
-        Multiplied 'ctf_filters' with 'whitening_filter_template'. Has shape
-        (num_Cs, num_defocus, h, w // 2 + 1). Is RFFT and not fftshifted.
-    streams : list[torch.cuda.Stream]
-        List of CUDA streams to use for parallel computation. Each stream will
-        handle a separate cross-correlation.
-
-    Returns
-    -------
-    torch.Tensor
-        Cross-correlation of the image with the template volume for each
-        orientation and defocus value. Will have shape
-        (num_Cs, num_defocus, num_orientations, H, W).
-    """
-    # Accounting for RFFT shape
-    projection_shape_real = (template_dft.shape[1], template_dft.shape[2] * 2 - 2)
-    image_shape_real = (image_dft.shape[0], image_dft.shape[1] * 2 - 2)
-
-    num_orientations = rotation_matrices.shape[0]
-    num_Cs = projective_filters.shape[0]  # pylint: disable=invalid-name
-    num_defocus = projective_filters.shape[1]
-
-    cross_correlation = torch.empty(
-        size=(num_Cs, num_defocus, num_orientations, *image_shape_real),
-        dtype=DEFAULT_STATISTIC_DTYPE,
-        device=image_dft.device,
-    )
-
-    # Do a batched Fourier slice extraction for all the orientations at once.
-    fourier_slices = extract_central_slices_rfft_3d(
-        volume_rfft=template_dft,
-        image_shape=(projection_shape_real[0],) * 3,
-        rotation_matrices=rotation_matrices,
-    )
-    fourier_slices = torch.fft.ifftshift(fourier_slices, dim=(-2,))
-    fourier_slices[..., 0, 0] = 0 + 0j  # zero out the DC component (mean zero)
-    fourier_slices *= -1  # flip contrast
-
-    # Iterate over the orientations
-    for i in range(num_orientations):
-        fourier_slice = fourier_slices[i]
-
-        # Iterate over the different pixel sizes (Cs) and defocus values for this
-        # particular orientation
-        for j in range(num_defocus):
-            for k in range(num_Cs):
-                # Use a round-robin scheduling for the streams
-                job_idx = (i * num_defocus * num_Cs) + (j * num_Cs) + k
-                stream_idx = job_idx % len(streams)
-                stream = streams[stream_idx]
-
-                with torch.cuda.stream(stream):
-                    # Apply the projective filter and do template normalization
-                    fourier_slice_filtered = fourier_slice * projective_filters[k, j]
-                    projection = torch.fft.irfft2(fourier_slice_filtered)
-                    projection = torch.fft.ifftshift(projection, dim=(-2, -1))
-                    projection = normalize_template_projection_compiled(
-                        projection,
-                        projection_shape_real,
-                        image_shape_real,
-                    )
-
-                    # Padded forward Fourier transform for cross-correlation
-                    projection_dft = torch.fft.rfft2(projection, s=image_shape_real)
-                    projection_dft[0, 0] = 0 + 0j
-
-                    # Cross correlation step by element-wise multiplication
-                    projection_dft = image_dft * projection_dft.conj()
-                    torch.fft.irfft2(
-                        projection_dft,
-                        s=image_shape_real,
-                        out=cross_correlation[k, j, i],
-                    )
-
-    # Wait for all streams to finish
-    for stream in streams:
-        stream.synchronize()
-
-    # shape is (num_Cs, num_defocus, num_orientations, H, W)
-    return cross_correlation
-
-
-def _do_bached_orientation_cross_correlate_cpu(
-    image_dft: torch.Tensor,
-    template_dft: torch.Tensor,
-    rotation_matrices: torch.Tensor,
-    projective_filters: torch.Tensor,
-) -> torch.Tensor:
-    """Same as `_do_bached_orientation_cross_correlate` but on the CPU.
-
-    The only difference is that this function does not call into a compiled torch
-    function for normalization.
-
-    TODO: Figure out a better way to split up CPU/GPU functions while remaining
-    performant and not duplicating code.
-
-    Parameters
-    ----------
-    image_dft : torch.Tensor
-        Real-fourier transform (RFFT) of the image with large image filters
-        already applied. Has shape (H, W // 2 + 1).
-    template_dft : torch.Tensor
-        Real-fourier transform (RFFT) of the template volume to take Fourier
-        slices from. Has shape (l, h, w // 2 + 1).
-    rotation_matrices : torch.Tensor
-        Rotation matrices to apply to the template volume. Has shape
-        (orientations, 3, 3).
-    projective_filters : torch.Tensor
-        Multiplied 'ctf_filters' with 'whitening_filter_template'. Has shape
-        (defocus_batch, h, w // 2 + 1). Is RFFT and not fftshifted.
-
-    Returns
-    -------
-    torch.Tensor
-        Cross-correlation for the batch of orientations and defocus values.s
-    """
-    # Accounting for RFFT shape
-    projection_shape_real = (template_dft.shape[1], template_dft.shape[2] * 2 - 2)
-    image_shape_real = (image_dft.shape[0], image_dft.shape[1] * 2 - 2)
-
-    # Extract central slice(s) from the template volume
-    fourier_slice = extract_central_slices_rfft_3d(
-        volume_rfft=template_dft,
-        image_shape=(projection_shape_real[0],) * 3,  # NOTE: requires cubic template
-        rotation_matrices=rotation_matrices,
-    )
-    fourier_slice = torch.fft.ifftshift(fourier_slice, dim=(-2,))
-    fourier_slice[..., 0, 0] = 0 + 0j  # zero out the DC component (mean zero)
-    fourier_slice *= -1  # flip contrast
-
-    # Apply the projective filters on a new batch dimension
-    fourier_slice = fourier_slice[None, None, ...] * projective_filters[:, :, None, ...]
-
-    # Inverse Fourier transform into real space and normalize
-    projections = torch.fft.irfftn(fourier_slice, dim=(-2, -1))
-    projections = torch.fft.ifftshift(projections, dim=(-2, -1))
-    projections = normalize_template_projection(
-        projections,
-        projection_shape_real,
-        image_shape_real,
-    )
-
-    # Padded forward Fourier transform for cross-correlation
-    projections_dft = torch.fft.rfftn(projections, dim=(-2, -1), s=image_shape_real)
-    projections_dft[..., 0, 0] = 0 + 0j  # zero out the DC component (mean zero)
-
-    # Cross correlation step by element-wise multiplication
-    projections_dft = image_dft[None, None, None, ...] * projections_dft.conj()
-    cross_correlation = torch.fft.irfftn(projections_dft, dim=(-2, -1))
-
-    return cross_correlation