updates to incorporate the matrix chunking method #79

steven-murray · 2024-01-23T11:29:32Z

I think you could simply do this as antpairs_set = set(self.antpairs)

steven-murray · 2024-01-18T17:32:32Z

Would it not be faster to just save Nsets arrays in a list, and then index from them? This way, you have to put the results into a big array so they're very separated in memory space

I mean, setup something like this:

mat_product = [ np.zeros(len(ai), len(aj), self.nfeed, self.nfeed) for ai, aj in self.matsets ]

The reason I didn't do it this way is because I wanted the final array that holds all the matrix products to be indexable with the elements of antpairs. That way, I can easily grab the non-redundant pairs at the end of return an array with the same format that you'd get using the vector-vector method. Otherwise, I have to re-calculate the mapping the mapping to the antpairs indices for each sub-matrix. I can change it if you think it would help with memory management to do it the other way though.

-Original file line number
+Diff line change
@@ Expand Up / @@ -21,6 +21,7 @@ @@
     from pathlib import Path
     from pyuvdata import UVBeam
     from pyuvsim import AnalyticBeam, simsetup
+    from submatrices import *
     from typing import Literal
     from matvis import DATA_PATH, HAVE_GPU, coordinates, cpu, simulate_vis
@@ Expand Down Expand Up / @@ -78,6 +79,7 @@ def run_profile( @@
         naz=360,
         nza=180,
         pairs=None,
+        matsets=None,
         nchunks=1,
         source_buffer=1.0,
     ):
@@ Expand Down Expand Up / @@ -136,6 +138,7 @@ def run_profile( @@
             beam_idx=beam_idx,
             matprod_method=f"{'GPU' if gpu else 'CPU'}{method}",
             antpairs=pairs,
+            matsets=matsets,
             min_chunks=nchunks,
             source_buffer=source_buffer,
         )
@@ Expand Down Expand Up / @@ -278,8 +281,9 @@ def get_redundancies(bls, ndecimals: int = 2): @@
     )
     @click.option("-k", "--keep-ants", type=str, default="")
     @click.option("--outriggers/--no-outriggers", default=False)
+    @click.option("-msm", "--matset-method", type=str, default="")
     @add_common_options
-    def hera_profile(hex_num, nside, keep_ants, outriggers, **kwargs):
+    def hera_profile(hex_num, nside, keep_ants, outriggers, matset_method, **kwargs):
         """Run profiling of matvis with a HERA-like array."""
         from py21cmsense.antpos import hera
@@ Expand All @@
         bls = antpos[np.newaxis, :, :2] - antpos[:, np.newaxis, :2]
         pairs = np.array(get_redundancies(bls.value))
-        run_profile(nsource=12 * nside**2, nants=antpos.shape[0], pairs=pairs, **kwargs)
+        if matset_method == "":
+            matsets = None
+        run_profile(
+            nsource=12 * nside**2,
+            nants=antpos.shape[0],
+            pairs=pairs,
+            matsets=matsets,
+            **kwargs,
+        )
     def get_line_based_stats(lstats) -> tuple[dict, float]:
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -3,8 +3,13 @@ @@
     from abc import ABC, abstractmethod
     from typing import Any
+    from matvis import HAVE_GPU
     from .._utils import get_dtypes
+    if HAVE_GPU:
+        import cupy as cp
     class MatProd(ABC):
         """
@@ Expand All / @@ -20,6 +25,9 @@ class MatProd(ABC): @@
             Number of antennas.
         antpairs
             The antenna pairs to sum over. If None, all pairs are used.
+        matsets
+            The list of sub-matrices to calculate.  If None, all pairs are used in a single
+            matrix multiplication.
         precision
             The precision of the data (1 or 2).
         """
@@ Expand All / @@ -29,16 +37,24 @@ def __init__( @@
             nchunks: int,
             nfeed: int,
             nant: int,
-            antpairs: np.ndarray | None,
+            antpairs: np.ndarray | list | None,
+            matsets: list | None,
             precision=1,
         ):
             if antpairs is None:
                 self.all_pairs = True
                 self.antpairs = np.array([(i, j) for i in range(nant) for j in range(nant)])
+                self.matsets = None
             else:
                 self.all_pairs = False
                 self.antpairs = antpairs
+                if HAVE_GPU:
+                    for i in range(len(matsets)):
+                        matsets[i][0] = cp.array(matsets[i][0])
+                        matsets[i][1] = cp.array(matsets[i][1])
+                self.matsets = matsets
             self.nchunks = nchunks
             self.nfeed = nfeed
             self.nant = nant
@@ Expand All / @@ -60,10 +76,33 @@ def allocate_vis(self): @@
                 (self.nchunks, self.npairs, self.nfeed, self.nfeed), 0.0, dtype=self.ctype
             )
+        def check_antpairs_in_matsets(self):
+            """Check that all non-redundant ant pairs are included in set of sub-matrices.
+            If using the CPUMatChunk method, make sure that all non-redudant antpairs are included somewhere
+            in the set of sub-matrices to be calculated.  Otherwise, throw an exception.
+            """
+            antpairs_set = set()
+            matsets_set = set()
+            for _, (ai, aj) in enumerate(self.antpairs):
+                antpairs_set.add((ai, aj))
+            for _, (ai, aj) in enumerate(self.matsets):
+                for i in range(len(ai)):
+                    for j in range(len(aj)):
+                        matsets_set.add((ai[i], aj[j]))
+            if not antpairs_set.issubset(matsets_set):
+                raise Exception("Some non-redundant pairs are missing from sub-matrices.  ")
         def setup(self):
             """Setup the memory for the object."""
             self.allocate_vis()
+            if self.matsets is not None:
+                self.check_antpairs_in_matsets()
         @abstractmethod
         def compute(self, z: np.ndarray, out: np.ndarray):
             """
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -32,6 +32,7 @@ def simulate( @@
         I_sky: np.ndarray,
         beam_list: Sequence[UVBeam | Callable] | None,
         antpairs: np.ndarray | list[tuple[int, int]] | None = None,
+        matsets: list[tuple[np.ndarray[int], np.ndarray[int]]] | None = None,
         precision: int = 1,
         polarized: bool = False,
         beam_idx: np.ndarray | None = None,
@@ Expand Down Expand Up / @@ -81,6 +82,13 @@ def simulate( @@
             Either a 2D array, shape ``(Npairs, 2)``, or list of 2-tuples of ints, with
             the list of antenna-pairs to return as visibilities (all feed-pairs are always
             calculated). If None, all feed-pairs are returned.
+        matsets  : array_like, optional
+            A list of containing a set of 2-tuples of numpy arrays.  Each entry in the list
+            corresponds to a different sub-matrix.  The first element of the ith tuple lists the rows of Z corresponding
+            to the rows of the ith sub-matrix, and the second tuple element contains the list of columns.  In the case
+            of vector-vector loop, the number of tuples is equal to Npairs and each element of the tuple contains only
+            one int, such that the sub-matrices contain only one element each.  Outer dimension has to be a list because
+            numpy doesn't like inhomogeneous arrays.
         precision : int, optional
             Which precision level to use for floats and complex numbers.
             Allowed values:
@@ Expand All / @@ -101,14 +109,15 @@ def simulate( @@
             allows). Default is 100.
         matprod_method : str, optional
             The method to use for the final matrix multiplication. Default is 'CPUMatMul',
-            which simply uses `np.dot` over the two full matrices. Currently, the other
-            option is `CPUVectorLoop`, which uses a loop over the antenna pairs,
-            computing the sum over sources as a vector dot product.
-            Whether to calculate visibilities for each antpair in antpairs as a vector
-            dot-product instead of using a full matrix-matrix multiplication for all
-            possible pairs. Default is False. Setting to True can be faster for large
-            arrays where `antpairs` is small (possibly from high redundancy). You should
-            run a performance test before using this.
+            which simply uses `np.dot` over the two full matrices. The second option is
+            `CPUVectorLoop`, which uses a loop over the antenna pairs, computing the sum
+            over sources as a vector dot product.  The third option is `CPUMatChunk`, which
+            divides the product into several matrix multiplications that are optimized to have the
+            highest possible density of non-redundant pairs without invoking the overhead of a large
+            for loop over vector products.  For very large arrays, with very high redundancy, CPUVectorLoop
+            is usually fastest, while for cases with low redundancy, `CPUMatMul` is fastest.  The `CPUMatChunk`
+            can sometimes be fastest for intermediate levels of redundancy and intermediately sized arrays.  You should
+            run a performance test before choosing one of these options.
         max_memory : int, optional
             The maximum memory (in bytes) to use for the visibility calculation. This is
             not a hard-set limit, but rather a guideline for how much memory to use. If the
@@ Expand Down Expand Up / @@ -175,7 +184,7 @@ def simulate( @@
             source_buffer=source_buffer,
         )
         mpcls = getattr(mp, matprod_method)
-        matprod = mpcls(nchunks, nfeed, nant, antpairs, precision=precision)
+        matprod = mpcls(nchunks, nfeed, nant, antpairs, matsets, precision=precision)
         zcalc = ZMatrixCalc(
             nsrc=nsrc_alloc,
             nfeed=nfeed,
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up @@
                 out[i] = z[ai].conj().dot(z[aj].T)
             return out
+    class CPUMatChunk(MatProd):
+        """Loop over a small set of sub-matrix products which collectively contain all nont-redundant pairs."""
+        def compute(self, z: np.ndarray, out: np.ndarray) -> np.ndarray:
+            """Perform the source-summing operation for a single time and chunk.
+            Parameters
+            ----------
+            z
+                Complex integrand. Shape=(Nfeed, Nant, Nax, Nsrc).
+            out
+                Output array, shaped as (Nfeed, Nfeed, Npairs).
+            """
+            z = z.reshape((self.nant, self.nfeed, -1))
+            mat_product = np.zeros(
+                (self.nant, self.nant, self.nfeed, self.nfeed), dtype=z.dtype
+            )
+            # Chris 12/20/23: instead we will use matsets
+            for j in range(self.nfeed):
+                for k in range(self.nfeed):
+                    for i, (ai, aj) in enumerate(self.matsets):
+                        AI, AJ = np.meshgrid(ai, aj)
+                        mat_product[AI, AJ, j, k] = z[ai[:], j].conj().dot(z[aj[:], k].T).T
+            # Now, we need to identify the non-redundant pairs and put them into the final output array
+            for j in range(self.nfeed):
+                for k in range(self.nfeed):
+                    for i, (ai, aj) in enumerate(self.antpairs):
+                        out[i, j, k] = mat_product[ai, aj, j, k]
+            return out

-Original file line number
+Diff line change
@@ Expand Up / @@ -49,6 +49,7 @@ def simulate( @@
         beam_list: Sequence[UVBeam | Callable] | None,
         polarized: bool = False,
         antpairs: np.ndarray | list[tuple[int, int]] | None = None,
+        matsets: list[tuple[np.ndarray[int], np.ndarray[int]]] | None = None,
         beam_idx: np.ndarray | None = None,
         max_memory: int = np.inf,
         min_chunks: int = 1,
@@ Expand Down Expand Up / @@ -120,7 +121,7 @@ def simulate( @@
             ctype=ctype,
         )
         mpcls = getattr(mp, matprod_method)
-        matprod = mpcls(nchunks, nfeed, nant, antpairs, precision=precision)
+        matprod = mpcls(nchunks, nfeed, nant, antpairs, matsets, precision=precision)
         npixc = nsrc // nchunks
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

updates to incorporate the matrix chunking method #79

Uh oh!

Diff view

Diff view

There are no files selected for viewing

steven-murray Jan 23, 2024

Uh oh!

Uh oh!

steven-murray Jan 18, 2024

Uh oh!

steven-murray Jan 18, 2024

Uh oh!

ccain002 Jan 19, 2024 •

edited

Loading

Uh oh!

Uh oh!

-Original file line number
+Diff line change
@@ Expand Up / @@ -92,3 +92,47 @@ def sum_chunks(self, out: np.ndarray): @@
             out[:] = self.vis[0].transpose((2, 0, 1)).get()
             cp.cuda.Device().synchronize()
+    class GPUMatChunk(MatProd):
+        """Use a loop over specific pairs, performing a vdot over the source axis."""
+        def allocate_vis(self):
+            """Allocate memory for the visibilities."""
+            self.vis = [
+                cp.full(
+                    (self.nfeed, self.nfeed, self.npairs), 0.0, dtype=self.ctype, order="F"
+                )
+                for _ in range(self.nchunks)
+            ]
+        def compute(self, z: cp.ndarray, out: cp.ndarray) -> cp.ndarray:
+            """Perform the source-summing operation for a single time and chunk."""
+            z = z.reshape((self.nant, self.nfeed, -1))
+            mat_product = cp.zeros(
+                (self.nant, self.nant, self.nfeed, self.nfeed), dtype=z.dtype
+            )
+            for j in range(self.nfeed):
+                for k in range(self.nfeed):
+                    for i, (ai, aj) in enumerate(self.matsets):
+                        AI, AJ = cp.meshgrid(ai, aj)
+                        mat_product[AI, AJ, j, k] = z[ai[:], j].conj().dot(z[aj[:], k].T).T
+            for j in range(self.nfeed):
+                for k in range(self.nfeed):
+                    for i, (ai, aj) in enumerate(self.antpairs):
+                        out[j, k, i] = mat_product[ai, aj, j, k]
+            cp.cuda.Device().synchronize()
+            return out
+        def sum_chunks(self, out: np.ndarray):
+            """Sum the chunks into the output array."""
+            if self.nchunks > 1:
+                for i in range(1, len(self.vis)):
+                    self.vis[0] += self.vis[i]
+            out[:] = self.vis[0].transpose((2, 0, 1)).get()
+            cp.cuda.Device().synchronize()

-Original file line number
+Diff line change
@@ -0,0 +1,22 @@
+    """Module containing several options for computing sub-matrices for the MatChunk method."""
+    import numpy as np
+    def get_matrix_sets(bls, ndecimals: int = 2):
+        """Find redundant baselines."""
+        uvbins = set()
+        msets = []
+        # Everything here is in wavelengths
+        bls = np.round(bls, decimals=ndecimals)
+        nant = bls.shape[0]
+        # group redundant baselines
+        for i in range(nant):
+            for j in range(i + 1, nant):
+                u, v = bls[i, j]
+                if (u, v) not in uvbins and (-u, -v) not in uvbins:
+                    uvbins.add((u, v))
+                    msets.append([np.array([i]), np.array([j])])
+        return msets

-Original file line number
+Diff line change
@@ -1,7 +1,9 @@
     """Tests that the various matprod methods produce the same result."""
     import pytest
+    import cupy as cp
     import numpy as np
     from matvis._utils import get_dtypes
@@ Expand All / @@ -16,13 +18,20 @@ def simple_matprod(z): @@
     @pytest.mark.parametrize("antpairs", [True, False])
     @pytest.mark.parametrize("precision", [1, 2])
     @pytest.mark.parametrize(
-        "method", ["CPUMatMul", "CPUVectorDot", "GPUMatMul", "GPUVectorDot"]
+        "method",
+        [
+            "CPUMatMul",
+            "CPUVectorDot",
+            "CPUMatChunk",
+            "GPUMatMul",
+            "GPUVectorDot",
+            "GPUMatChunk",
+        ],
     )
-    def test_matprod(nfeed, antpairs, precision, method):
+    def test_matprod(nfeed, antpairs, matsets, precision, method):
         """Test that the various matprod methods produce the same result."""
         if method.startswith("GPU"):
             pytest.importorskip("cupy")
-            import cupy as cp
             from matvis.gpu import matprod as module
         else:
@@ Expand All / @@ -36,7 +45,24 @@ def test_matprod(nfeed, antpairs, precision, method): @@
         else:
             antpairs = None
-        obj = cls(nchunks=1, nfeed=nfeed, nant=nant, antpairs=antpairs, precision=precision)
+        if matsets:
+            matsets = [
+                (np.array([0, 1, 2, 3]), np.array([0, 1, 2, 3])),
+                (np.array([0, 1, 2, 3]), np.array([3, 4])),
+                (np.array([3, 4]), np.array([0, 1, 2, 3])),
+                (np.array([3, 4]), np.array([3, 4])),
+            ]
+        else:
+            matsets = None
+        obj = cls(
+            nchunks=1,
+            nfeed=nfeed,
+            nant=nant,
+            antpairs=antpairs,
+            matsets=matsets,
+            precision=precision,
+        )
         obj.setup()
         ctype = get_dtypes(precision)[1]
@@ Expand Down @@

updates to incorporate the matrix chunking method #79

Are you sure you want to change the base?

Uh oh!

updates to incorporate the matrix chunking method #79

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

steven-murray Jan 23, 2024

Choose a reason for hiding this comment

Uh oh!

Uh oh!

steven-murray Jan 18, 2024

Choose a reason for hiding this comment

Uh oh!

steven-murray Jan 18, 2024

Choose a reason for hiding this comment

Uh oh!

ccain002 Jan 19, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

ccain002 Jan 19, 2024 •

edited

Loading