From 2b398d1b562a910074a92bc1d7896fff6b609ff9 Mon Sep 17 00:00:00 2001
From: lab-admin <ccain002@ucr.edu>
Date: Wed, 10 Jan 2024 20:06:36 -0700
Subject: [PATCH 1/5] updates to incorporate the matrix chunking method

---
 src/matvis/cli.py                             | 29 ++++++++++++-
 src/matvis/core/matprod.py                    |  8 +++-
 src/matvis/cpu/cpu.py                         | 41 +++++++++++++-----
 src/matvis/cpu/matprod.py                     | 42 ++++++++++++++++++-
 src/matvis/gpu/gpu.py                         |  3 +-
 src/matvis/gpu/matprod.py                     | 42 +++++++++++++++++++
 src/matvis/wrapper.py                         |  3 ++
 tests/matmul_checks/check_matsets_ordering.py | 38 +++++++++++++++++
 tests/test_matprod.py                         | 30 +++++++++++--
 9 files changed, 217 insertions(+), 19 deletions(-)
 create mode 100644 tests/matmul_checks/check_matsets_ordering.py

diff --git a/src/matvis/cli.py b/src/matvis/cli.py
index c6dbcd1..eb90168 100644
--- a/src/matvis/cli.py
+++ b/src/matvis/cli.py
@@ -78,6 +78,7 @@ def run_profile(
     naz=360,
     nza=180,
     pairs=None,
+	matsets=None,
     nchunks=1,
     source_buffer=1.0,
 ):
@@ -113,6 +114,7 @@ def run_profile(
     print(f"  ANALYTIC-BEAM:    {analytic_beam:>7}")
     print(f"  METHOD:           {method:>7}")
     print(f"  NPAIRS:           {len(pairs) if pairs is not None else nants**2:>7}")
+	#print(f"  NPAIRS:           {sum(np.array([len(pairs[i][0])*len(pairs[i][1]) for i in range(len(pairs))])) if pairs is not None else nants**2:>7}")
     print("---------------------------------")
 
     if gpu:
@@ -136,6 +138,7 @@ def run_profile(
         beam_idx=beam_idx,
         matprod_method=f"{'GPU' if gpu else 'CPU'}{method}",
         antpairs=pairs,
+		matsets=matsets,
         min_chunks=nchunks,
         source_buffer=source_buffer,
     )
@@ -264,6 +267,28 @@ def get_redundancies(bls, ndecimals: int = 2):
 
     return pairs
 
+# Chris: for now I have duplicated the get_redundancies function as a placeholder for generating the matrix sets and just assumed 
+# 1x1 sub-matrices (the vector method).  In the future this will be replaced by a function that takes in a set of sub-matrices provided 
+# by the user.  
+def get_matrix_sets(bls, ndecimals: int = 2):
+    """Find redundant baselines."""
+    uvbins = set()
+    msets = []
+
+    # Everything here is in wavelengths
+    bls = np.round(bls, decimals=ndecimals)
+    nant = bls.shape[0]
+
+    # group redundant baselines
+    for i in range(nant):
+        for j in range(i + 1, nant):
+            u, v = bls[i, j]
+            if (u, v) not in uvbins and (-u, -v) not in uvbins:
+                uvbins.add((u, v))
+                msets.append([np.array([i]), np.array([j]))
+
+    return msets
+
 
 @main.command()
 @click.option(
@@ -290,8 +315,10 @@ def hera_profile(hex_num, nside, keep_ants, outriggers, **kwargs):
 
     bls = antpos[np.newaxis, :, :2] - antpos[:, np.newaxis, :2]
     pairs = np.array(get_redundancies(bls.value))
+	matsets = list(get_matrix_sets(bls.value))
+	#pairs = list(get_redundancies(bls.value))
 
-    run_profile(nsource=12 * nside**2, nants=antpos.shape[0], pairs=pairs, **kwargs)
+    run_profile(nsource=12 * nside**2, nants=antpos.shape[0], pairs=pairs, matsets=matsets, **kwargs)
 
 
 def get_line_based_stats(lstats) -> tuple[dict, float]:
diff --git a/src/matvis/core/matprod.py b/src/matvis/core/matprod.py
index dbe49e9..9f961ea 100644
--- a/src/matvis/core/matprod.py
+++ b/src/matvis/core/matprod.py
@@ -20,6 +20,9 @@ class MatProd(ABC):
         Number of antennas.
     antpairs
         The antenna pairs to sum over. If None, all pairs are used.
+	matsets
+		The list of sub-matrices to calculate.  If None, all pairs are used in a single
+		matrix multiplication.  
     precision
         The precision of the data (1 or 2).
     """
@@ -29,15 +32,18 @@ def __init__(
         nchunks: int,
         nfeed: int,
         nant: int,
-        antpairs: np.ndarray | None,
+        antpairs: np.ndarray | list | None,
+		matsets: list | None,
         precision=1,
     ):
         if antpairs is None:
             self.all_pairs = True
             self.antpairs = np.array([(i, j) for i in range(nant) for j in range(nant)])
+            self.matsets = list([(np.array([i]), np.array([j])) for i in range(nant) for j in range(nant)])
         else:
             self.all_pairs = False
             self.antpairs = antpairs
+            self.matsets  = matsets
 
         self.nchunks = nchunks
         self.nfeed = nfeed
diff --git a/src/matvis/cpu/cpu.py b/src/matvis/cpu/cpu.py
index 34f23dd..4890376 100644
--- a/src/matvis/cpu/cpu.py
+++ b/src/matvis/cpu/cpu.py
@@ -32,6 +32,7 @@ def simulate(
     I_sky: np.ndarray,
     beam_list: Sequence[UVBeam | Callable] | None,
     antpairs: np.ndarray | list[tuple[int, int]] | None = None,
+	matsets: list[tuple[np.ndarray[int], np.ndarray[int]]] | None = None,
     precision: int = 1,
     polarized: bool = False,
     beam_idx: np.ndarray | None = None,
@@ -81,6 +82,13 @@ def simulate(
         Either a 2D array, shape ``(Npairs, 2)``, or list of 2-tuples of ints, with
         the list of antenna-pairs to return as visibilities (all feed-pairs are always
         calculated). If None, all feed-pairs are returned.
+	matsets  :: array_like, optional
+		 A list of containing a set of 2-tuples of numpy arrays.  Each entry in the list 
+		 corresponds to a different sub-matrix.  The first element of the ith tuple lists the rows of Z corresponding
+		 to the rows of the ith sub-matrix, and the second tuple element contains the list of columns.  In the case
+		 of vector-vector loop, the number of tuples is equal to Npairs and each element of the tuple contains only
+		 one int, such that the sub-matrices contain only one element each.  Outer dimension has to be a list because
+		 numpy doesn't like inhomogeneous arrays.  
     precision : int, optional
         Which precision level to use for floats and complex numbers.
         Allowed values:
@@ -99,16 +107,27 @@ def simulate(
     max_progress_reports : int, optional
         Maximum number of progress reports to print to the screen (if logging level
         allows). Default is 100.
-    matprod_method : str, optional
-        The method to use for the final matrix multiplication. Default is 'CPUMatMul',
-        which simply uses `np.dot` over the two full matrices. Currently, the other
-        option is `CPUVectorLoop`, which uses a loop over the antenna pairs,
-        computing the sum over sources as a vector dot product.
-        Whether to calculate visibilities for each antpair in antpairs as a vector
-        dot-product instead of using a full matrix-matrix multiplication for all
-        possible pairs. Default is False. Setting to True can be faster for large
-        arrays where `antpairs` is small (possibly from high redundancy). You should
-        run a performance test before using this.
+    #matprod_method : str, optional
+    #    The method to use for the final matrix multiplication. Default is 'CPUMatMul',
+    #    which simply uses `np.dot` over the two full matrices. Currently, the other
+    #    option is `CPUVectorLoop`, which uses a loop over the antenna pairs,
+    #    computing the sum over sources as a vector dot product.
+    #    Whether to calculate visibilities for each antpair in antpairs as a vector
+    #    dot-product instead of using a full matrix-matrix multiplication for all
+    #    possible pairs. Default is False. Setting to True can be faster for large
+    #    arrays where `antpairs` is small (possibly from high redundancy). You should
+    #    run a performance test before using this.
+	matprod_method : str, optional
+		The method to use for the final matrix multiplication. Default is 'CPUMatMul',
+        which simply uses `np.dot` over the two full matrices. The second option is 
+		`CPUVectorLoop`, which uses a loop over the antenna pairs, computing the sum 
+		over sources as a vector dot product.  The third option is ``CPUMatChunk'', which
+		divides the product into several matrix multiplications that are optimized to have the 
+		highest possible density of non-redundant pairs without invoking the overhead of a large
+		for loop over vector products.  For very large arrays, with very high redundancy, CPUVectorLoop
+		is usually fastest, while for cases with low redundancy, 'CPUMatMul' is fastest.  The ``CPUMatChunk''
+		can sometimes be fastest for intermediate levels of redundancy and intermediately sized arrays.  You should
+		run a performance test before choosing one of these options.  
     max_memory : int, optional
         The maximum memory (in bytes) to use for the visibility calculation. This is
         not a hard-set limit, but rather a guideline for how much memory to use. If the
@@ -175,7 +194,7 @@ def simulate(
         source_buffer=source_buffer,
     )
     mpcls = getattr(mp, matprod_method)
-    matprod = mpcls(nchunks, nfeed, nant, antpairs, precision=precision)
+    matprod = mpcls(nchunks, nfeed, nant, antpairs, matsets, precision=precision)
     zcalc = ZMatrixCalc(
         nsrc=nsrc_alloc,
         nfeed=nfeed,
diff --git a/src/matvis/cpu/matprod.py b/src/matvis/cpu/matprod.py
index 2e1191c..03df957 100644
--- a/src/matvis/cpu/matprod.py
+++ b/src/matvis/cpu/matprod.py
@@ -45,8 +45,48 @@ def compute(self, z: np.ndarray, out: np.ndarray) -> np.ndarray:
             Output array, shaped as (Nfeed, Nfeed, Npairs).
         """
         z = z.reshape((self.nant, self.nfeed, -1))
-
+  
         for i, (ai, aj) in enumerate(self.antpairs):
             out[i] = z[ai].conj().dot(z[aj].T)
 
         return out
+		
+class CPUMatChunk(MatProd): 
+	
+	def compute(self, z: np.ndarray, out: np.ndarray) -> np.ndarray:
+		"""Perform the source-summing operation for a single time and chunk.
+
+        Parameters
+        ----------
+        z
+            Complex integrand. Shape=(Nfeed, Nant, Nax, Nsrc).
+        out
+            Output array, shaped as (Nfeed, Nfeed, Npairs).
+        """
+		
+		z = z.reshape((self.nant, self.nfeed, -1))
+		
+		mat_product = np.zeros((self.nant,self.nant,self.nfeed,self.nfeed),dtype=z.dtype)
+		
+		# Chris 12/20/23: instead we will use matsets
+		for j in range(self.nfeed): 
+			for k in range(self.nfeed): 
+				for i, (ai, aj) in enumerate(self.matsets):
+					AI,AJ = np.meshgrid(ai,aj)
+					#print(ai)
+					#print(aj)
+					#print(AI)
+					#print(AJ)
+					mat_product[AI,AJ,j,k] = z[ai[:],j].conj().dot(z[aj[:],k].T).T
+				
+		# Now, we need to identify the non-redundant pairs and put them into the final output array
+		for j in range(self.nfeed): 
+			for k in range(self.nfeed): 
+				for i, (ai, aj) in enumerate(self.antpairs): 
+					out[i,j,k] = mat_product[ai,aj,j,k]
+
+		return out
+	
+		
+        
+ 
diff --git a/src/matvis/gpu/gpu.py b/src/matvis/gpu/gpu.py
index 9bf82ec..8ff0791 100644
--- a/src/matvis/gpu/gpu.py
+++ b/src/matvis/gpu/gpu.py
@@ -49,6 +49,7 @@ def simulate(
     beam_list: Sequence[UVBeam | Callable] | None,
     polarized: bool = False,
     antpairs: np.ndarray | list[tuple[int, int]] | None = None,
+	matsets: list[tuple[np.ndarray[int], np.ndarray[int]]] | None = None,
     beam_idx: np.ndarray | None = None,
     max_memory: int = np.inf,
     min_chunks: int = 1,
@@ -120,7 +121,7 @@ def simulate(
         ctype=ctype,
     )
     mpcls = getattr(mp, matprod_method)
-    matprod = mpcls(nchunks, nfeed, nant, antpairs, precision=precision)
+    matprod = mpcls(nchunks, nfeed, nant, antpairs, matsets, precision=precision)
 
     npixc = nsrc // nchunks
 
diff --git a/src/matvis/gpu/matprod.py b/src/matvis/gpu/matprod.py
index 6bff898..bf27dad 100644
--- a/src/matvis/gpu/matprod.py
+++ b/src/matvis/gpu/matprod.py
@@ -92,3 +92,45 @@ def sum_chunks(self, out: np.ndarray):
 
         out[:] = self.vis[0].transpose((2, 0, 1)).get()
         cp.cuda.Device().synchronize()
+
+
+class GPUMatChunk(MatProd):
+    """Use a loop over specific pairs, performing a vdot over the source axis."""
+
+    def allocate_vis(self):
+        """Allocate memory for the visibilities."""
+        self.vis = [
+            cp.full(
+                (self.nfeed, self.nfeed, self.npairs), 0.0, dtype=self.ctype, order="F"
+            )
+            for _ in range(self.nchunks)
+        ]
+		
+    def compute(self, z: cp.ndarray, out: cp.ndarray) -> cp.ndarray:
+        """Perform the source-summing operation for a single time and chunk."""
+        z = z.reshape((self.nant, self.nfeed, -1))
+		
+        mat_product = cp.zeros((self.nant,self.nant,self.nfeed,self.nfeed),dtype=z.dtype)
+
+        for j in range(self.nfeed): 
+            for k in range(self.nfeed): 
+                for i, (ai, aj) in enumerate(self.matsets):
+                    AI,AJ = cp.meshgrid(ai,aj)
+                    mat_product[AI,AJ,j,k] = z[ai[:],j].conj().dot(z[aj[:],k].T).T
+					
+        for j in range(self.nfeed): 
+            for k in range(self.nfeed): 
+                for i, (ai, aj) in enumerate(self.antpairs): 
+                    out[j,k,i] = mat_product[ai,aj,j,k]
+
+        cp.cuda.Device().synchronize()
+        return out
+		
+    def sum_chunks(self, out: np.ndarray):
+        """Sum the chunks into the output array."""
+        if self.nchunks > 1:
+            for i in range(1, len(self.vis)):
+                self.vis[0] += self.vis[i]
+
+        out[:] = self.vis[0].transpose((2, 0, 1)).get()
+        cp.cuda.Device().synchronize()
diff --git a/src/matvis/wrapper.py b/src/matvis/wrapper.py
index 9c7f7a1..aa7e786 100644
--- a/src/matvis/wrapper.py
+++ b/src/matvis/wrapper.py
@@ -28,6 +28,7 @@ def simulate_vis(
     beam_spline_opts: dict | None = None,
     beam_idx: np.ndarray | None = None,
     antpairs: np.ndarray | list[tuple[int, int]] | None = None,
+	matsets: list[tuple[np.ndarray[int], np.ndarray[int]]] | None = None, # set of sub-matrices
     source_buffer: float = 1.0,
     **backend_kwargs,
 ):
@@ -129,6 +130,7 @@ def simulate_vis(
     ]
 
     npairs = len(antpairs) if antpairs is not None else nants * nants
+    #npairs = sum(np.array([len(antpairs[i][0])*len(antpairs[i][1]) for i in range(len(antpairs))])) if antpairs is not None else nants * nants
     if polarized:
         vis = np.zeros(
             (freqs.size, lsts.size, npairs, nfeeds, nfeeds), dtype=complex_dtype
@@ -150,6 +152,7 @@ def simulate_vis(
             beam_spline_opts=beam_spline_opts,
             beam_idx=beam_idx,
             antpairs=antpairs,
+			matsets=matsets,
             source_buffer=source_buffer,
             **backend_kwargs,
         )
diff --git a/tests/matmul_checks/check_matsets_ordering.py b/tests/matmul_checks/check_matsets_ordering.py
new file mode 100644
index 0000000..dc9b100
--- /dev/null
+++ b/tests/matmul_checks/check_matsets_ordering.py
@@ -0,0 +1,38 @@
+import numpy as np
+
+# this test ensures that we are indexing the matrix product correctly to get the unique ant pairs when using the sub-matrix method
+
+Nant = 10
+
+V = np.zeros((Nant,Nant))
+
+for i in range(Nant): 
+	for j in range(Nant): 
+		V[i,j] = Nant*i + j
+		
+print(V)
+
+pairs = np.array([[0,3],[1,2],[2,3],[1,5],[6,7],[5,8],[6,9],[7,9],[8,9]])
+antpairs = np.copy(pairs)
+
+for i in range(len(pairs)): 
+	pairs[i] = np.array(pairs[i])
+	
+print(V[pairs[:,0],pairs[:,1]]) # print out the unique elements of V
+
+matsets = [(np.array([0,1,2]),np.array([2,3,5])),(np.array([5,6,7,8]),np.array([7,8,9]))]
+
+mat_product = np.zeros((Nant,Nant))
+out = np.zeros(len(antpairs))
+
+for i, (ai,aj) in enumerate(matsets): 
+	AI,AJ = np.meshgrid(ai,aj)
+	mat_product[AI,AJ] = V[AI,AJ]
+	
+for i, (ai,aj) in enumerate(antpairs):
+	 out[i] = mat_product[ai,aj]
+
+print(mat_product) 
+print(out) # should be the same as the second print statement
+
+
diff --git a/tests/test_matprod.py b/tests/test_matprod.py
index 154813b..257af0a 100644
--- a/tests/test_matprod.py
+++ b/tests/test_matprod.py
@@ -1,8 +1,12 @@
 """Tests that the various matprod methods produce the same result."""
 
+import sys
+sys.path.insert(0,'/home/lab-admin/Desktop/Desktop/Graduate_School/Research/ASU/21_group/matvis/src/')
+
 import pytest
 
 import numpy as np
+import cupy as cp
 
 from matvis._utils import get_dtypes
 
@@ -16,9 +20,9 @@ def simple_matprod(z):
 @pytest.mark.parametrize("antpairs", [True, False])
 @pytest.mark.parametrize("precision", [1, 2])
 @pytest.mark.parametrize(
-    "method", ["CPUMatMul", "CPUVectorDot", "GPUMatMul", "GPUVectorDot"]
+    "method", ["CPUMatMul", "CPUVectorDot", "CPUMatChunk"]#, "GPUMatMul", "GPUVectorDot"]
 )
-def test_matprod(nfeed, antpairs, precision, method):
+def test_matprod(nfeed, antpairs, matsets, precision, method):
     """Test that the various matprod methods produce the same result."""
     if method.startswith("GPU"):
         pytest.importorskip("cupy")
@@ -35,8 +39,15 @@ def test_matprod(nfeed, antpairs, precision, method):
         antpairs = np.array([(i, j) for i in range(nant) for j in range(nant)])
     else:
         antpairs = None
-
-    obj = cls(nchunks=1, nfeed=nfeed, nant=nant, antpairs=antpairs, precision=precision)
+        
+    if matsets and method.startswith("CPU"): 
+        matsets = [(np.array([0,1,2,3]),np.array([0,1,2,3])),(np.array([0,1,2,3]),np.array([3,4])),(np.array([3,4]),np.array([0,1,2,3])),(np.array([3,4]),np.array([3,4]))] 
+    elif matsets and method.startswith("GPU"): 
+        matsets = [(cp.array([0,1,2,3]),cp.array([0,1,2,3])),(cp.array([0,1,2,3]),cp.array([3,4])),(cp.array([3,4]),cp.array([0,1,2,3])),(cp.array([3,4]),cp.array([3,4]))] 
+    else: 
+        matsets = None
+	
+    obj = cls(nchunks=1, nfeed=nfeed, nant=nant, antpairs=antpairs, matsets=matsets, precision=precision)
     obj.setup()
 
     ctype = get_dtypes(precision)[1]
@@ -60,7 +71,18 @@ def test_matprod(nfeed, antpairs, precision, method):
     )
     if method.startswith("GPU"):
         true = true.get()
+        
+    print(np.shape(out))
+    print(np.shape(true))
+    print(out)
+    print(true)
+    print(out-true)
 
     assert out.dtype.type == ctype
     assert out.shape == (obj.npairs, nfeed, nfeed)
     np.testing.assert_allclose(out, true)
+
+#antpairs = np.array([(1,2),(1,3),(3,2),(3,4),(4,4)])
+
+#test_matprod(2,True,True,1,"CPUMatChunk")
+test_matprod(2,True,True,1,"GPUMatChunk")

From 64b52208e267f1bc262a43ba07a4e259b50f25c5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 11 Jan 2024 17:51:32 +0000
Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/matvis/cli.py                             |  6 +-
 src/matvis/core/matprod.py                    | 18 ++++--
 src/matvis/cpu/cpu.py                         | 38 +++++------
 src/matvis/cpu/matprod.py                     | 64 +++++++++----------
 src/matvis/gpu/gpu.py                         |  2 +-
 src/matvis/gpu/matprod.py                     | 28 ++++----
 src/matvis/wrapper.py                         |  7 +-
 tests/matmul_checks/check_matsets_ordering.py | 45 +++++++------
 tests/test_matprod.py                         | 55 +++++++++++-----
 9 files changed, 148 insertions(+), 115 deletions(-)

diff --git a/src/matvis/cli.py b/src/matvis/cli.py
index eb90168..721a3c3 100644
--- a/src/matvis/cli.py
+++ b/src/matvis/cli.py
@@ -267,9 +267,9 @@ def get_redundancies(bls, ndecimals: int = 2):
 
     return pairs
 
-# Chris: for now I have duplicated the get_redundancies function as a placeholder for generating the matrix sets and just assumed 
-# 1x1 sub-matrices (the vector method).  In the future this will be replaced by a function that takes in a set of sub-matrices provided 
-# by the user.  
+# Chris: for now I have duplicated the get_redundancies function as a placeholder for generating the matrix sets and just assumed
+# 1x1 sub-matrices (the vector method).  In the future this will be replaced by a function that takes in a set of sub-matrices provided
+# by the user.
 def get_matrix_sets(bls, ndecimals: int = 2):
     """Find redundant baselines."""
     uvbins = set()
diff --git a/src/matvis/core/matprod.py b/src/matvis/core/matprod.py
index 9f961ea..16f612a 100644
--- a/src/matvis/core/matprod.py
+++ b/src/matvis/core/matprod.py
@@ -20,9 +20,9 @@ class MatProd(ABC):
         Number of antennas.
     antpairs
         The antenna pairs to sum over. If None, all pairs are used.
-	matsets
-		The list of sub-matrices to calculate.  If None, all pairs are used in a single
-		matrix multiplication.  
+        matsets
+                The list of sub-matrices to calculate.  If None, all pairs are used in a single
+                matrix multiplication.
     precision
         The precision of the data (1 or 2).
     """
@@ -33,17 +33,23 @@ def __init__(
         nfeed: int,
         nant: int,
         antpairs: np.ndarray | list | None,
-		matsets: list | None,
+        matsets: list | None,
         precision=1,
     ):
         if antpairs is None:
             self.all_pairs = True
             self.antpairs = np.array([(i, j) for i in range(nant) for j in range(nant)])
-            self.matsets = list([(np.array([i]), np.array([j])) for i in range(nant) for j in range(nant)])
+            self.matsets = list(
+                [
+                    (np.array([i]), np.array([j]))
+                    for i in range(nant)
+                    for j in range(nant)
+                ]
+            )
         else:
             self.all_pairs = False
             self.antpairs = antpairs
-            self.matsets  = matsets
+            self.matsets = matsets
 
         self.nchunks = nchunks
         self.nfeed = nfeed
diff --git a/src/matvis/cpu/cpu.py b/src/matvis/cpu/cpu.py
index 4890376..1ec7c45 100644
--- a/src/matvis/cpu/cpu.py
+++ b/src/matvis/cpu/cpu.py
@@ -32,7 +32,7 @@ def simulate(
     I_sky: np.ndarray,
     beam_list: Sequence[UVBeam | Callable] | None,
     antpairs: np.ndarray | list[tuple[int, int]] | None = None,
-	matsets: list[tuple[np.ndarray[int], np.ndarray[int]]] | None = None,
+    matsets: list[tuple[np.ndarray[int], np.ndarray[int]]] | None = None,
     precision: int = 1,
     polarized: bool = False,
     beam_idx: np.ndarray | None = None,
@@ -82,13 +82,13 @@ def simulate(
         Either a 2D array, shape ``(Npairs, 2)``, or list of 2-tuples of ints, with
         the list of antenna-pairs to return as visibilities (all feed-pairs are always
         calculated). If None, all feed-pairs are returned.
-	matsets  :: array_like, optional
-		 A list of containing a set of 2-tuples of numpy arrays.  Each entry in the list 
-		 corresponds to a different sub-matrix.  The first element of the ith tuple lists the rows of Z corresponding
-		 to the rows of the ith sub-matrix, and the second tuple element contains the list of columns.  In the case
-		 of vector-vector loop, the number of tuples is equal to Npairs and each element of the tuple contains only
-		 one int, such that the sub-matrices contain only one element each.  Outer dimension has to be a list because
-		 numpy doesn't like inhomogeneous arrays.  
+        matsets  :: array_like, optional
+                 A list of containing a set of 2-tuples of numpy arrays.  Each entry in the list
+                 corresponds to a different sub-matrix.  The first element of the ith tuple lists the rows of Z corresponding
+                 to the rows of the ith sub-matrix, and the second tuple element contains the list of columns.  In the case
+                 of vector-vector loop, the number of tuples is equal to Npairs and each element of the tuple contains only
+                 one int, such that the sub-matrices contain only one element each.  Outer dimension has to be a list because
+                 numpy doesn't like inhomogeneous arrays.
     precision : int, optional
         Which precision level to use for floats and complex numbers.
         Allowed values:
@@ -117,17 +117,17 @@ def simulate(
     #    possible pairs. Default is False. Setting to True can be faster for large
     #    arrays where `antpairs` is small (possibly from high redundancy). You should
     #    run a performance test before using this.
-	matprod_method : str, optional
-		The method to use for the final matrix multiplication. Default is 'CPUMatMul',
-        which simply uses `np.dot` over the two full matrices. The second option is 
-		`CPUVectorLoop`, which uses a loop over the antenna pairs, computing the sum 
-		over sources as a vector dot product.  The third option is ``CPUMatChunk'', which
-		divides the product into several matrix multiplications that are optimized to have the 
-		highest possible density of non-redundant pairs without invoking the overhead of a large
-		for loop over vector products.  For very large arrays, with very high redundancy, CPUVectorLoop
-		is usually fastest, while for cases with low redundancy, 'CPUMatMul' is fastest.  The ``CPUMatChunk''
-		can sometimes be fastest for intermediate levels of redundancy and intermediately sized arrays.  You should
-		run a performance test before choosing one of these options.  
+        matprod_method : str, optional
+                The method to use for the final matrix multiplication. Default is 'CPUMatMul',
+        which simply uses `np.dot` over the two full matrices. The second option is
+                `CPUVectorLoop`, which uses a loop over the antenna pairs, computing the sum
+                over sources as a vector dot product.  The third option is ``CPUMatChunk'', which
+                divides the product into several matrix multiplications that are optimized to have the
+                highest possible density of non-redundant pairs without invoking the overhead of a large
+                for loop over vector products.  For very large arrays, with very high redundancy, CPUVectorLoop
+                is usually fastest, while for cases with low redundancy, 'CPUMatMul' is fastest.  The ``CPUMatChunk''
+                can sometimes be fastest for intermediate levels of redundancy and intermediately sized arrays.  You should
+                run a performance test before choosing one of these options.
     max_memory : int, optional
         The maximum memory (in bytes) to use for the visibility calculation. This is
         not a hard-set limit, but rather a guideline for how much memory to use. If the
diff --git a/src/matvis/cpu/matprod.py b/src/matvis/cpu/matprod.py
index 03df957..ee4caaf 100644
--- a/src/matvis/cpu/matprod.py
+++ b/src/matvis/cpu/matprod.py
@@ -45,16 +45,16 @@ def compute(self, z: np.ndarray, out: np.ndarray) -> np.ndarray:
             Output array, shaped as (Nfeed, Nfeed, Npairs).
         """
         z = z.reshape((self.nant, self.nfeed, -1))
-  
+
         for i, (ai, aj) in enumerate(self.antpairs):
             out[i] = z[ai].conj().dot(z[aj].T)
 
         return out
-		
-class CPUMatChunk(MatProd): 
-	
-	def compute(self, z: np.ndarray, out: np.ndarray) -> np.ndarray:
-		"""Perform the source-summing operation for a single time and chunk.
+
+
+class CPUMatChunk(MatProd):
+    def compute(self, z: np.ndarray, out: np.ndarray) -> np.ndarray:
+        """Perform the source-summing operation for a single time and chunk.
 
         Parameters
         ----------
@@ -63,30 +63,28 @@ def compute(self, z: np.ndarray, out: np.ndarray) -> np.ndarray:
         out
             Output array, shaped as (Nfeed, Nfeed, Npairs).
         """
-		
-		z = z.reshape((self.nant, self.nfeed, -1))
-		
-		mat_product = np.zeros((self.nant,self.nant,self.nfeed,self.nfeed),dtype=z.dtype)
-		
-		# Chris 12/20/23: instead we will use matsets
-		for j in range(self.nfeed): 
-			for k in range(self.nfeed): 
-				for i, (ai, aj) in enumerate(self.matsets):
-					AI,AJ = np.meshgrid(ai,aj)
-					#print(ai)
-					#print(aj)
-					#print(AI)
-					#print(AJ)
-					mat_product[AI,AJ,j,k] = z[ai[:],j].conj().dot(z[aj[:],k].T).T
-				
-		# Now, we need to identify the non-redundant pairs and put them into the final output array
-		for j in range(self.nfeed): 
-			for k in range(self.nfeed): 
-				for i, (ai, aj) in enumerate(self.antpairs): 
-					out[i,j,k] = mat_product[ai,aj,j,k]
-
-		return out
-	
-		
-        
- 
+
+        z = z.reshape((self.nant, self.nfeed, -1))
+
+        mat_product = np.zeros(
+            (self.nant, self.nant, self.nfeed, self.nfeed), dtype=z.dtype
+        )
+
+        # Chris 12/20/23: instead we will use matsets
+        for j in range(self.nfeed):
+            for k in range(self.nfeed):
+                for i, (ai, aj) in enumerate(self.matsets):
+                    AI, AJ = np.meshgrid(ai, aj)
+                    # print(ai)
+                    # print(aj)
+                    # print(AI)
+                    # print(AJ)
+                    mat_product[AI, AJ, j, k] = z[ai[:], j].conj().dot(z[aj[:], k].T).T
+
+        # Now, we need to identify the non-redundant pairs and put them into the final output array
+        for j in range(self.nfeed):
+            for k in range(self.nfeed):
+                for i, (ai, aj) in enumerate(self.antpairs):
+                    out[i, j, k] = mat_product[ai, aj, j, k]
+
+        return out
diff --git a/src/matvis/gpu/gpu.py b/src/matvis/gpu/gpu.py
index 8ff0791..b93ee02 100644
--- a/src/matvis/gpu/gpu.py
+++ b/src/matvis/gpu/gpu.py
@@ -49,7 +49,7 @@ def simulate(
     beam_list: Sequence[UVBeam | Callable] | None,
     polarized: bool = False,
     antpairs: np.ndarray | list[tuple[int, int]] | None = None,
-	matsets: list[tuple[np.ndarray[int], np.ndarray[int]]] | None = None,
+    matsets: list[tuple[np.ndarray[int], np.ndarray[int]]] | None = None,
     beam_idx: np.ndarray | None = None,
     max_memory: int = np.inf,
     min_chunks: int = 1,
diff --git a/src/matvis/gpu/matprod.py b/src/matvis/gpu/matprod.py
index bf27dad..5bfd1f2 100644
--- a/src/matvis/gpu/matprod.py
+++ b/src/matvis/gpu/matprod.py
@@ -105,27 +105,29 @@ def allocate_vis(self):
             )
             for _ in range(self.nchunks)
         ]
-		
+
     def compute(self, z: cp.ndarray, out: cp.ndarray) -> cp.ndarray:
         """Perform the source-summing operation for a single time and chunk."""
         z = z.reshape((self.nant, self.nfeed, -1))
-		
-        mat_product = cp.zeros((self.nant,self.nant,self.nfeed,self.nfeed),dtype=z.dtype)
 
-        for j in range(self.nfeed): 
-            for k in range(self.nfeed): 
+        mat_product = cp.zeros(
+            (self.nant, self.nant, self.nfeed, self.nfeed), dtype=z.dtype
+        )
+
+        for j in range(self.nfeed):
+            for k in range(self.nfeed):
                 for i, (ai, aj) in enumerate(self.matsets):
-                    AI,AJ = cp.meshgrid(ai,aj)
-                    mat_product[AI,AJ,j,k] = z[ai[:],j].conj().dot(z[aj[:],k].T).T
-					
-        for j in range(self.nfeed): 
-            for k in range(self.nfeed): 
-                for i, (ai, aj) in enumerate(self.antpairs): 
-                    out[j,k,i] = mat_product[ai,aj,j,k]
+                    AI, AJ = cp.meshgrid(ai, aj)
+                    mat_product[AI, AJ, j, k] = z[ai[:], j].conj().dot(z[aj[:], k].T).T
+
+        for j in range(self.nfeed):
+            for k in range(self.nfeed):
+                for i, (ai, aj) in enumerate(self.antpairs):
+                    out[j, k, i] = mat_product[ai, aj, j, k]
 
         cp.cuda.Device().synchronize()
         return out
-		
+
     def sum_chunks(self, out: np.ndarray):
         """Sum the chunks into the output array."""
         if self.nchunks > 1:
diff --git a/src/matvis/wrapper.py b/src/matvis/wrapper.py
index aa7e786..78e5f1d 100644
--- a/src/matvis/wrapper.py
+++ b/src/matvis/wrapper.py
@@ -28,7 +28,8 @@ def simulate_vis(
     beam_spline_opts: dict | None = None,
     beam_idx: np.ndarray | None = None,
     antpairs: np.ndarray | list[tuple[int, int]] | None = None,
-	matsets: list[tuple[np.ndarray[int], np.ndarray[int]]] | None = None, # set of sub-matrices
+    matsets: list[tuple[np.ndarray[int], np.ndarray[int]]]
+    | None = None,  # set of sub-matrices
     source_buffer: float = 1.0,
     **backend_kwargs,
 ):
@@ -130,7 +131,7 @@ def simulate_vis(
     ]
 
     npairs = len(antpairs) if antpairs is not None else nants * nants
-    #npairs = sum(np.array([len(antpairs[i][0])*len(antpairs[i][1]) for i in range(len(antpairs))])) if antpairs is not None else nants * nants
+    # npairs = sum(np.array([len(antpairs[i][0])*len(antpairs[i][1]) for i in range(len(antpairs))])) if antpairs is not None else nants * nants
     if polarized:
         vis = np.zeros(
             (freqs.size, lsts.size, npairs, nfeeds, nfeeds), dtype=complex_dtype
@@ -152,7 +153,7 @@ def simulate_vis(
             beam_spline_opts=beam_spline_opts,
             beam_idx=beam_idx,
             antpairs=antpairs,
-			matsets=matsets,
+            matsets=matsets,
             source_buffer=source_buffer,
             **backend_kwargs,
         )
diff --git a/tests/matmul_checks/check_matsets_ordering.py b/tests/matmul_checks/check_matsets_ordering.py
index dc9b100..00e26d0 100644
--- a/tests/matmul_checks/check_matsets_ordering.py
+++ b/tests/matmul_checks/check_matsets_ordering.py
@@ -4,35 +4,38 @@
 
 Nant = 10
 
-V = np.zeros((Nant,Nant))
+V = np.zeros((Nant, Nant))
+
+for i in range(Nant):
+    for j in range(Nant):
+        V[i, j] = Nant * i + j
 
-for i in range(Nant): 
-	for j in range(Nant): 
-		V[i,j] = Nant*i + j
-		
 print(V)
 
-pairs = np.array([[0,3],[1,2],[2,3],[1,5],[6,7],[5,8],[6,9],[7,9],[8,9]])
+pairs = np.array(
+    [[0, 3], [1, 2], [2, 3], [1, 5], [6, 7], [5, 8], [6, 9], [7, 9], [8, 9]]
+)
 antpairs = np.copy(pairs)
 
-for i in range(len(pairs)): 
-	pairs[i] = np.array(pairs[i])
-	
-print(V[pairs[:,0],pairs[:,1]]) # print out the unique elements of V
+for i in range(len(pairs)):
+    pairs[i] = np.array(pairs[i])
 
-matsets = [(np.array([0,1,2]),np.array([2,3,5])),(np.array([5,6,7,8]),np.array([7,8,9]))]
+print(V[pairs[:, 0], pairs[:, 1]])  # print out the unique elements of V
 
-mat_product = np.zeros((Nant,Nant))
-out = np.zeros(len(antpairs))
+matsets = [
+    (np.array([0, 1, 2]), np.array([2, 3, 5])),
+    (np.array([5, 6, 7, 8]), np.array([7, 8, 9])),
+]
 
-for i, (ai,aj) in enumerate(matsets): 
-	AI,AJ = np.meshgrid(ai,aj)
-	mat_product[AI,AJ] = V[AI,AJ]
-	
-for i, (ai,aj) in enumerate(antpairs):
-	 out[i] = mat_product[ai,aj]
+mat_product = np.zeros((Nant, Nant))
+out = np.zeros(len(antpairs))
 
-print(mat_product) 
-print(out) # should be the same as the second print statement
+for i, (ai, aj) in enumerate(matsets):
+    AI, AJ = np.meshgrid(ai, aj)
+    mat_product[AI, AJ] = V[AI, AJ]
 
+for i, (ai, aj) in enumerate(antpairs):
+    out[i] = mat_product[ai, aj]
 
+print(mat_product)
+print(out)  # should be the same as the second print statement
diff --git a/tests/test_matprod.py b/tests/test_matprod.py
index 257af0a..082c5d5 100644
--- a/tests/test_matprod.py
+++ b/tests/test_matprod.py
@@ -1,12 +1,16 @@
 """Tests that the various matprod methods produce the same result."""
 
 import sys
-sys.path.insert(0,'/home/lab-admin/Desktop/Desktop/Graduate_School/Research/ASU/21_group/matvis/src/')
+
+sys.path.insert(
+    0,
+    "/home/lab-admin/Desktop/Desktop/Graduate_School/Research/ASU/21_group/matvis/src/",
+)
 
 import pytest
 
-import numpy as np
 import cupy as cp
+import numpy as np
 
 from matvis._utils import get_dtypes
 
@@ -20,7 +24,8 @@ def simple_matprod(z):
 @pytest.mark.parametrize("antpairs", [True, False])
 @pytest.mark.parametrize("precision", [1, 2])
 @pytest.mark.parametrize(
-    "method", ["CPUMatMul", "CPUVectorDot", "CPUMatChunk"]#, "GPUMatMul", "GPUVectorDot"]
+    "method",
+    ["CPUMatMul", "CPUVectorDot", "CPUMatChunk"],  # , "GPUMatMul", "GPUVectorDot"]
 )
 def test_matprod(nfeed, antpairs, matsets, precision, method):
     """Test that the various matprod methods produce the same result."""
@@ -39,15 +44,32 @@ def test_matprod(nfeed, antpairs, matsets, precision, method):
         antpairs = np.array([(i, j) for i in range(nant) for j in range(nant)])
     else:
         antpairs = None
-        
-    if matsets and method.startswith("CPU"): 
-        matsets = [(np.array([0,1,2,3]),np.array([0,1,2,3])),(np.array([0,1,2,3]),np.array([3,4])),(np.array([3,4]),np.array([0,1,2,3])),(np.array([3,4]),np.array([3,4]))] 
-    elif matsets and method.startswith("GPU"): 
-        matsets = [(cp.array([0,1,2,3]),cp.array([0,1,2,3])),(cp.array([0,1,2,3]),cp.array([3,4])),(cp.array([3,4]),cp.array([0,1,2,3])),(cp.array([3,4]),cp.array([3,4]))] 
-    else: 
+
+    if matsets and method.startswith("CPU"):
+        matsets = [
+            (np.array([0, 1, 2, 3]), np.array([0, 1, 2, 3])),
+            (np.array([0, 1, 2, 3]), np.array([3, 4])),
+            (np.array([3, 4]), np.array([0, 1, 2, 3])),
+            (np.array([3, 4]), np.array([3, 4])),
+        ]
+    elif matsets and method.startswith("GPU"):
+        matsets = [
+            (cp.array([0, 1, 2, 3]), cp.array([0, 1, 2, 3])),
+            (cp.array([0, 1, 2, 3]), cp.array([3, 4])),
+            (cp.array([3, 4]), cp.array([0, 1, 2, 3])),
+            (cp.array([3, 4]), cp.array([3, 4])),
+        ]
+    else:
         matsets = None
-	
-    obj = cls(nchunks=1, nfeed=nfeed, nant=nant, antpairs=antpairs, matsets=matsets, precision=precision)
+
+    obj = cls(
+        nchunks=1,
+        nfeed=nfeed,
+        nant=nant,
+        antpairs=antpairs,
+        matsets=matsets,
+        precision=precision,
+    )
     obj.setup()
 
     ctype = get_dtypes(precision)[1]
@@ -71,18 +93,19 @@ def test_matprod(nfeed, antpairs, matsets, precision, method):
     )
     if method.startswith("GPU"):
         true = true.get()
-        
+
     print(np.shape(out))
     print(np.shape(true))
     print(out)
     print(true)
-    print(out-true)
+    print(out - true)
 
     assert out.dtype.type == ctype
     assert out.shape == (obj.npairs, nfeed, nfeed)
     np.testing.assert_allclose(out, true)
 
-#antpairs = np.array([(1,2),(1,3),(3,2),(3,4),(4,4)])
 
-#test_matprod(2,True,True,1,"CPUMatChunk")
-test_matprod(2,True,True,1,"GPUMatChunk")
+# antpairs = np.array([(1,2),(1,3),(3,2),(3,4),(4,4)])
+
+# test_matprod(2,True,True,1,"CPUMatChunk")
+test_matprod(2, True, True, 1, "GPUMatChunk")

From 2a4a5749427a29e4df9534e7496fc475c053ba55 Mon Sep 17 00:00:00 2001
From: christopher <ccain002@ucr.edu>
Date: Mon, 22 Jan 2024 10:56:18 -0800
Subject: [PATCH 3/5] formatting changes & added check to setup

---
 src/matvis/cli.py          | 41 +++++++++++------------------------
 src/matvis/core/matprod.py | 37 +++++++++++++++++++++++---------
 src/matvis/cpu/cpu.py      | 44 +++++++++++++++-----------------------
 src/matvis/cpu/matprod.py  |  7 ++----
 src/matvis/submatrices.py  | 22 +++++++++++++++++++
 src/matvis/wrapper.py      |  1 -
 6 files changed, 80 insertions(+), 72 deletions(-)
 create mode 100644 src/matvis/submatrices.py

diff --git a/src/matvis/cli.py b/src/matvis/cli.py
index 721a3c3..f6905e2 100644
--- a/src/matvis/cli.py
+++ b/src/matvis/cli.py
@@ -21,6 +21,7 @@
 from pathlib import Path
 from pyuvdata import UVBeam
 from pyuvsim import AnalyticBeam, simsetup
+from submatrices import get_matrix_sets
 from typing import Literal
 
 from matvis import DATA_PATH, HAVE_GPU, coordinates, cpu, simulate_vis
@@ -78,7 +79,7 @@ def run_profile(
     naz=360,
     nza=180,
     pairs=None,
-	matsets=None,
+    matsets=None,
     nchunks=1,
     source_buffer=1.0,
 ):
@@ -114,7 +115,6 @@ def run_profile(
     print(f"  ANALYTIC-BEAM:    {analytic_beam:>7}")
     print(f"  METHOD:           {method:>7}")
     print(f"  NPAIRS:           {len(pairs) if pairs is not None else nants**2:>7}")
-	#print(f"  NPAIRS:           {sum(np.array([len(pairs[i][0])*len(pairs[i][1]) for i in range(len(pairs))])) if pairs is not None else nants**2:>7}")
     print("---------------------------------")
 
     if gpu:
@@ -138,7 +138,7 @@ def run_profile(
         beam_idx=beam_idx,
         matprod_method=f"{'GPU' if gpu else 'CPU'}{method}",
         antpairs=pairs,
-		matsets=matsets,
+        matsets=matsets,
         min_chunks=nchunks,
         source_buffer=source_buffer,
     )
@@ -267,28 +267,6 @@ def get_redundancies(bls, ndecimals: int = 2):
 
     return pairs
 
-# Chris: for now I have duplicated the get_redundancies function as a placeholder for generating the matrix sets and just assumed
-# 1x1 sub-matrices (the vector method).  In the future this will be replaced by a function that takes in a set of sub-matrices provided
-# by the user.
-def get_matrix_sets(bls, ndecimals: int = 2):
-    """Find redundant baselines."""
-    uvbins = set()
-    msets = []
-
-    # Everything here is in wavelengths
-    bls = np.round(bls, decimals=ndecimals)
-    nant = bls.shape[0]
-
-    # group redundant baselines
-    for i in range(nant):
-        for j in range(i + 1, nant):
-            u, v = bls[i, j]
-            if (u, v) not in uvbins and (-u, -v) not in uvbins:
-                uvbins.add((u, v))
-                msets.append([np.array([i]), np.array([j]))
-
-    return msets
-
 
 @main.command()
 @click.option(
@@ -315,10 +293,15 @@ def hera_profile(hex_num, nside, keep_ants, outriggers, **kwargs):
 
     bls = antpos[np.newaxis, :, :2] - antpos[:, np.newaxis, :2]
     pairs = np.array(get_redundancies(bls.value))
-	matsets = list(get_matrix_sets(bls.value))
-	#pairs = list(get_redundancies(bls.value))
-
-    run_profile(nsource=12 * nside**2, nants=antpos.shape[0], pairs=pairs, matsets=matsets, **kwargs)
+    matsets = list(get_matrix_sets(bls.value))
+
+    run_profile(
+        nsource=12 * nside**2,
+        nants=antpos.shape[0],
+        pairs=pairs,
+        matsets=matsets,
+        **kwargs,
+    )
 
 
 def get_line_based_stats(lstats) -> tuple[dict, float]:
diff --git a/src/matvis/core/matprod.py b/src/matvis/core/matprod.py
index 16f612a..9116e80 100644
--- a/src/matvis/core/matprod.py
+++ b/src/matvis/core/matprod.py
@@ -20,9 +20,9 @@ class MatProd(ABC):
         Number of antennas.
     antpairs
         The antenna pairs to sum over. If None, all pairs are used.
-        matsets
-                The list of sub-matrices to calculate.  If None, all pairs are used in a single
-                matrix multiplication.
+    matsets
+        The list of sub-matrices to calculate.  If None, all pairs are used in a single
+        matrix multiplication.
     precision
         The precision of the data (1 or 2).
     """
@@ -39,13 +39,7 @@ def __init__(
         if antpairs is None:
             self.all_pairs = True
             self.antpairs = np.array([(i, j) for i in range(nant) for j in range(nant)])
-            self.matsets = list(
-                [
-                    (np.array([i]), np.array([j]))
-                    for i in range(nant)
-                    for j in range(nant)
-                ]
-            )
+            self.matsets = None
         else:
             self.all_pairs = False
             self.antpairs = antpairs
@@ -72,10 +66,33 @@ def allocate_vis(self):
             (self.nchunks, self.npairs, self.nfeed, self.nfeed), 0.0, dtype=self.ctype
         )
 
+    def check_antpairs_in_matsets(self):
+        """Check that all non-redundant ant pairs are included in set of sub-matrices.
+
+        If using the CPUMatChunk method, make sure that all non-redudant antpairs are included somewhere
+        in the set of sub-matrices to be calculated.  Otherwise, throw an exception.
+        """
+        antpairs_set = set()
+        matsets_set = set()
+
+        for _, (ai, aj) in enumerate(self.antpairs):
+            antpairs_set.add((ai, aj))
+
+        for _, (ai, aj) in enumerate(self.matsets):
+            for i in range(len(ai)):
+                for j in range(len(aj)):
+                    matsets_set.add((ai[i], aj[j]))
+
+        if not antpairs_set.issubset(matsets_set):
+            raise Exception("Some non-redundant pairs are missing from sub-matrices.  ")
+
     def setup(self):
         """Setup the memory for the object."""
         self.allocate_vis()
 
+        if self.matsets is not None:
+            self.check_antpairs_in_matsets()
+
     @abstractmethod
     def compute(self, z: np.ndarray, out: np.ndarray):
         """
diff --git a/src/matvis/cpu/cpu.py b/src/matvis/cpu/cpu.py
index 1ec7c45..99c12dc 100644
--- a/src/matvis/cpu/cpu.py
+++ b/src/matvis/cpu/cpu.py
@@ -82,13 +82,13 @@ def simulate(
         Either a 2D array, shape ``(Npairs, 2)``, or list of 2-tuples of ints, with
         the list of antenna-pairs to return as visibilities (all feed-pairs are always
         calculated). If None, all feed-pairs are returned.
-        matsets  :: array_like, optional
-                 A list of containing a set of 2-tuples of numpy arrays.  Each entry in the list
-                 corresponds to a different sub-matrix.  The first element of the ith tuple lists the rows of Z corresponding
-                 to the rows of the ith sub-matrix, and the second tuple element contains the list of columns.  In the case
-                 of vector-vector loop, the number of tuples is equal to Npairs and each element of the tuple contains only
-                 one int, such that the sub-matrices contain only one element each.  Outer dimension has to be a list because
-                 numpy doesn't like inhomogeneous arrays.
+    matsets  : array_like, optional
+        A list of containing a set of 2-tuples of numpy arrays.  Each entry in the list
+        corresponds to a different sub-matrix.  The first element of the ith tuple lists the rows of Z corresponding
+        to the rows of the ith sub-matrix, and the second tuple element contains the list of columns.  In the case
+        of vector-vector loop, the number of tuples is equal to Npairs and each element of the tuple contains only
+        one int, such that the sub-matrices contain only one element each.  Outer dimension has to be a list because
+        numpy doesn't like inhomogeneous arrays.
     precision : int, optional
         Which precision level to use for floats and complex numbers.
         Allowed values:
@@ -107,27 +107,17 @@ def simulate(
     max_progress_reports : int, optional
         Maximum number of progress reports to print to the screen (if logging level
         allows). Default is 100.
-    #matprod_method : str, optional
-    #    The method to use for the final matrix multiplication. Default is 'CPUMatMul',
-    #    which simply uses `np.dot` over the two full matrices. Currently, the other
-    #    option is `CPUVectorLoop`, which uses a loop over the antenna pairs,
-    #    computing the sum over sources as a vector dot product.
-    #    Whether to calculate visibilities for each antpair in antpairs as a vector
-    #    dot-product instead of using a full matrix-matrix multiplication for all
-    #    possible pairs. Default is False. Setting to True can be faster for large
-    #    arrays where `antpairs` is small (possibly from high redundancy). You should
-    #    run a performance test before using this.
-        matprod_method : str, optional
-                The method to use for the final matrix multiplication. Default is 'CPUMatMul',
+    matprod_method : str, optional
+        The method to use for the final matrix multiplication. Default is 'CPUMatMul',
         which simply uses `np.dot` over the two full matrices. The second option is
-                `CPUVectorLoop`, which uses a loop over the antenna pairs, computing the sum
-                over sources as a vector dot product.  The third option is ``CPUMatChunk'', which
-                divides the product into several matrix multiplications that are optimized to have the
-                highest possible density of non-redundant pairs without invoking the overhead of a large
-                for loop over vector products.  For very large arrays, with very high redundancy, CPUVectorLoop
-                is usually fastest, while for cases with low redundancy, 'CPUMatMul' is fastest.  The ``CPUMatChunk''
-                can sometimes be fastest for intermediate levels of redundancy and intermediately sized arrays.  You should
-                run a performance test before choosing one of these options.
+        `CPUVectorLoop`, which uses a loop over the antenna pairs, computing the sum
+        over sources as a vector dot product.  The third option is `CPUMatChunk`, which
+        divides the product into several matrix multiplications that are optimized to have the
+        highest possible density of non-redundant pairs without invoking the overhead of a large
+        for loop over vector products.  For very large arrays, with very high redundancy, CPUVectorLoop
+        is usually fastest, while for cases with low redundancy, `CPUMatMul` is fastest.  The `CPUMatChunk`
+        can sometimes be fastest for intermediate levels of redundancy and intermediately sized arrays.  You should
+        run a performance test before choosing one of these options.
     max_memory : int, optional
         The maximum memory (in bytes) to use for the visibility calculation. This is
         not a hard-set limit, but rather a guideline for how much memory to use. If the
diff --git a/src/matvis/cpu/matprod.py b/src/matvis/cpu/matprod.py
index ee4caaf..2b96d34 100644
--- a/src/matvis/cpu/matprod.py
+++ b/src/matvis/cpu/matprod.py
@@ -53,6 +53,8 @@ def compute(self, z: np.ndarray, out: np.ndarray) -> np.ndarray:
 
 
 class CPUMatChunk(MatProd):
+    """Loop over a small set of sub-matrix products which collectively contain all nont-redundant pairs."""
+
     def compute(self, z: np.ndarray, out: np.ndarray) -> np.ndarray:
         """Perform the source-summing operation for a single time and chunk.
 
@@ -63,7 +65,6 @@ def compute(self, z: np.ndarray, out: np.ndarray) -> np.ndarray:
         out
             Output array, shaped as (Nfeed, Nfeed, Npairs).
         """
-
         z = z.reshape((self.nant, self.nfeed, -1))
 
         mat_product = np.zeros(
@@ -75,10 +76,6 @@ def compute(self, z: np.ndarray, out: np.ndarray) -> np.ndarray:
             for k in range(self.nfeed):
                 for i, (ai, aj) in enumerate(self.matsets):
                     AI, AJ = np.meshgrid(ai, aj)
-                    # print(ai)
-                    # print(aj)
-                    # print(AI)
-                    # print(AJ)
                     mat_product[AI, AJ, j, k] = z[ai[:], j].conj().dot(z[aj[:], k].T).T
 
         # Now, we need to identify the non-redundant pairs and put them into the final output array
diff --git a/src/matvis/submatrices.py b/src/matvis/submatrices.py
new file mode 100644
index 0000000..f755f2d
--- /dev/null
+++ b/src/matvis/submatrices.py
@@ -0,0 +1,22 @@
+"""Module containing several options for computing sub-matrices for the MatChunk method."""
+import numpy as np
+
+
+def get_matrix_sets(bls, ndecimals: int = 2):
+    """Find redundant baselines."""
+    uvbins = set()
+    msets = []
+
+    # Everything here is in wavelengths
+    bls = np.round(bls, decimals=ndecimals)
+    nant = bls.shape[0]
+
+    # group redundant baselines
+    for i in range(nant):
+        for j in range(i + 1, nant):
+            u, v = bls[i, j]
+            if (u, v) not in uvbins and (-u, -v) not in uvbins:
+                uvbins.add((u, v))
+                msets.append([np.array([i]), np.array([j])])
+
+    return msets
diff --git a/src/matvis/wrapper.py b/src/matvis/wrapper.py
index 78e5f1d..831ca5a 100644
--- a/src/matvis/wrapper.py
+++ b/src/matvis/wrapper.py
@@ -131,7 +131,6 @@ def simulate_vis(
     ]
 
     npairs = len(antpairs) if antpairs is not None else nants * nants
-    # npairs = sum(np.array([len(antpairs[i][0])*len(antpairs[i][1]) for i in range(len(antpairs))])) if antpairs is not None else nants * nants
     if polarized:
         vis = np.zeros(
             (freqs.size, lsts.size, npairs, nfeeds, nfeeds), dtype=complex_dtype

From e3690c35f04815a5bb1e0e9e1443eac1235616fa Mon Sep 17 00:00:00 2001
From: christopher <ccain002@ucr.edu>
Date: Mon, 22 Jan 2024 11:15:32 -0800
Subject: [PATCH 4/5] more updates

---
 src/matvis/cli.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/matvis/cli.py b/src/matvis/cli.py
index f6905e2..96c591f 100644
--- a/src/matvis/cli.py
+++ b/src/matvis/cli.py
@@ -21,7 +21,7 @@
 from pathlib import Path
 from pyuvdata import UVBeam
 from pyuvsim import AnalyticBeam, simsetup
-from submatrices import get_matrix_sets
+from submatrices import *
 from typing import Literal
 
 from matvis import DATA_PATH, HAVE_GPU, coordinates, cpu, simulate_vis
@@ -281,8 +281,9 @@ def get_redundancies(bls, ndecimals: int = 2):
 )
 @click.option("-k", "--keep-ants", type=str, default="")
 @click.option("--outriggers/--no-outriggers", default=False)
+@click.option("-msm", "--matset-method", type=str, default="")
 @add_common_options
-def hera_profile(hex_num, nside, keep_ants, outriggers, **kwargs):
+def hera_profile(hex_num, nside, keep_ants, outriggers, matset_method, **kwargs):
     """Run profiling of matvis with a HERA-like array."""
     from py21cmsense.antpos import hera
 
@@ -293,7 +294,9 @@ def hera_profile(hex_num, nside, keep_ants, outriggers, **kwargs):
 
     bls = antpos[np.newaxis, :, :2] - antpos[:, np.newaxis, :2]
     pairs = np.array(get_redundancies(bls.value))
-    matsets = list(get_matrix_sets(bls.value))
+
+    if matset_method == "":
+        matsets = None
 
     run_profile(
         nsource=12 * nside**2,

From 89e52f986f7f5452fb29de68c4b9cb8a9bc35d68 Mon Sep 17 00:00:00 2001
From: christopher <ccain002@ucr.edu>
Date: Mon, 22 Jan 2024 11:44:56 -0800
Subject: [PATCH 5/5] fixing test_matprod

---
 src/matvis/core/matprod.py                    | 10 +++++
 tests/matmul_checks/check_matsets_ordering.py | 41 -------------------
 tests/test_matprod.py                         | 37 ++++-------------
 3 files changed, 19 insertions(+), 69 deletions(-)
 delete mode 100644 tests/matmul_checks/check_matsets_ordering.py

diff --git a/src/matvis/core/matprod.py b/src/matvis/core/matprod.py
index 9116e80..788a9eb 100644
--- a/src/matvis/core/matprod.py
+++ b/src/matvis/core/matprod.py
@@ -3,8 +3,13 @@
 from abc import ABC, abstractmethod
 from typing import Any
 
+from matvis import HAVE_GPU
+
 from .._utils import get_dtypes
 
+if HAVE_GPU:
+    import cupy as cp
+
 
 class MatProd(ABC):
     """
@@ -43,6 +48,11 @@ def __init__(
         else:
             self.all_pairs = False
             self.antpairs = antpairs
+
+            if HAVE_GPU:
+                for i in range(len(matsets)):
+                    matsets[i][0] = cp.array(matsets[i][0])
+                    matsets[i][1] = cp.array(matsets[i][1])
             self.matsets = matsets
 
         self.nchunks = nchunks
diff --git a/tests/matmul_checks/check_matsets_ordering.py b/tests/matmul_checks/check_matsets_ordering.py
deleted file mode 100644
index 00e26d0..0000000
--- a/tests/matmul_checks/check_matsets_ordering.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import numpy as np
-
-# this test ensures that we are indexing the matrix product correctly to get the unique ant pairs when using the sub-matrix method
-
-Nant = 10
-
-V = np.zeros((Nant, Nant))
-
-for i in range(Nant):
-    for j in range(Nant):
-        V[i, j] = Nant * i + j
-
-print(V)
-
-pairs = np.array(
-    [[0, 3], [1, 2], [2, 3], [1, 5], [6, 7], [5, 8], [6, 9], [7, 9], [8, 9]]
-)
-antpairs = np.copy(pairs)
-
-for i in range(len(pairs)):
-    pairs[i] = np.array(pairs[i])
-
-print(V[pairs[:, 0], pairs[:, 1]])  # print out the unique elements of V
-
-matsets = [
-    (np.array([0, 1, 2]), np.array([2, 3, 5])),
-    (np.array([5, 6, 7, 8]), np.array([7, 8, 9])),
-]
-
-mat_product = np.zeros((Nant, Nant))
-out = np.zeros(len(antpairs))
-
-for i, (ai, aj) in enumerate(matsets):
-    AI, AJ = np.meshgrid(ai, aj)
-    mat_product[AI, AJ] = V[AI, AJ]
-
-for i, (ai, aj) in enumerate(antpairs):
-    out[i] = mat_product[ai, aj]
-
-print(mat_product)
-print(out)  # should be the same as the second print statement
diff --git a/tests/test_matprod.py b/tests/test_matprod.py
index 082c5d5..6bf39bb 100644
--- a/tests/test_matprod.py
+++ b/tests/test_matprod.py
@@ -1,11 +1,5 @@
 """Tests that the various matprod methods produce the same result."""
 
-import sys
-
-sys.path.insert(
-    0,
-    "/home/lab-admin/Desktop/Desktop/Graduate_School/Research/ASU/21_group/matvis/src/",
-)
 
 import pytest
 
@@ -25,13 +19,19 @@ def simple_matprod(z):
 @pytest.mark.parametrize("precision", [1, 2])
 @pytest.mark.parametrize(
     "method",
-    ["CPUMatMul", "CPUVectorDot", "CPUMatChunk"],  # , "GPUMatMul", "GPUVectorDot"]
+    [
+        "CPUMatMul",
+        "CPUVectorDot",
+        "CPUMatChunk",
+        "GPUMatMul",
+        "GPUVectorDot",
+        "GPUMatChunk",
+    ],
 )
 def test_matprod(nfeed, antpairs, matsets, precision, method):
     """Test that the various matprod methods produce the same result."""
     if method.startswith("GPU"):
         pytest.importorskip("cupy")
-        import cupy as cp
 
         from matvis.gpu import matprod as module
     else:
@@ -45,20 +45,13 @@ def test_matprod(nfeed, antpairs, matsets, precision, method):
     else:
         antpairs = None
 
-    if matsets and method.startswith("CPU"):
+    if matsets:
         matsets = [
             (np.array([0, 1, 2, 3]), np.array([0, 1, 2, 3])),
             (np.array([0, 1, 2, 3]), np.array([3, 4])),
             (np.array([3, 4]), np.array([0, 1, 2, 3])),
             (np.array([3, 4]), np.array([3, 4])),
         ]
-    elif matsets and method.startswith("GPU"):
-        matsets = [
-            (cp.array([0, 1, 2, 3]), cp.array([0, 1, 2, 3])),
-            (cp.array([0, 1, 2, 3]), cp.array([3, 4])),
-            (cp.array([3, 4]), cp.array([0, 1, 2, 3])),
-            (cp.array([3, 4]), cp.array([3, 4])),
-        ]
     else:
         matsets = None
 
@@ -94,18 +87,6 @@ def test_matprod(nfeed, antpairs, matsets, precision, method):
     if method.startswith("GPU"):
         true = true.get()
 
-    print(np.shape(out))
-    print(np.shape(true))
-    print(out)
-    print(true)
-    print(out - true)
-
     assert out.dtype.type == ctype
     assert out.shape == (obj.npairs, nfeed, nfeed)
     np.testing.assert_allclose(out, true)
-
-
-# antpairs = np.array([(1,2),(1,3),(3,2),(3,4),(4,4)])
-
-# test_matprod(2,True,True,1,"CPUMatChunk")
-test_matprod(2, True, True, 1, "GPUMatChunk")