@@ -32,10 +32,15 @@ def cross_attention(params: tuple, x1: jax.Array, x2: jax.Array, mask: jax.Array

     Args:
         params (tuple): tuple of parameters
+
         x1 (jax.Array): query sequence. Shape: (B, T, Dq)
+
         x2 (jax.Array): key-value sequence. Shape: (B, S, Dkv)
+
         mask (jax.Array): mask tensor. Shape: (B, T, S)
+
         n_heads (int, optional): number of attention heads. Defaults to 8.
+
         dropout_rate (float, optional): dropout rate. Defaults to 0.0.

     Returns:
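For reference, a minimal sketch of inputs matching the documented shapes; the sizes and the all-ones placeholder mask below are illustrative assumptions, not values taken from this module:

import jax.numpy as jnp

B, T, S, Dq, Dkv = 2, 4, 16, 64, 64   # batch, query length, key/value length, dims (assumed sizes)
x1 = jnp.zeros((B, T, Dq))            # query sequence
x2 = jnp.zeros((B, S, Dkv))           # key-value sequence
mask = jnp.ones((B, T, S))            # placeholder mask of the documented shape (the attend/block convention is not shown in this hunk)
# cross_attention(params, x1, x2, mask, n_heads=8, dropout_rate=0.0) then yields
# attention outputs aligned with the query sequence x1.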
@@ -70,6 +75,27 @@ def cross_attention(params: tuple, x1: jax.Array, x2: jax.Array, mask: jax.Array

 @bind(jax.jit, static_argnums=[3, 4, 5, 6])
 def run_attention_probe(params, encodings, mask, n_heads: int, dropout: float = 0.0, use_LN=False, use_softmax=True):
+    """
+    Runs the full nonlinear attentive probe on input encodings (typically embedding vectors produced by some other model).
+
+    Args:
+        params: parameter tuple/list of the probe
+
+        encodings: input encoding vectors/data
+
+        mask: optional mask to be applied to the internal cross-attention
+
+        n_heads: number of attention heads
+
+        dropout: if > 0, dropout is applied internally to the cross-attention
+
+        use_LN: whether to apply layer normalization
+
+        use_softmax: whether to apply a softmax to the probe's output (useful for classification)
+
+    Returns:
+        output scores/probabilities, cross-attention (hidden) features
+    """
     # encoded_image_feature: (B, hw, dim)
     # (learnable_query, *_params) = params
     learnable_query, Wq, bq, Wk, bk, Wv, bv, Wout, bout, Whid, bhid, Wln_mu, Wln_scale, Wy, by = params
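A minimal call sketch for this function, assuming `params` already holds the fifteen entries unpacked above and `encodings` has shape (B, hw, dim); the concrete sizes are not taken from this diff:

outs, feats = run_attention_probe(
    params, encodings, mask, 8, 0.0, use_LN=False, use_softmax=True
)
# outs:  the probe's output scores/probabilities
# feats: the cross-attention (hidden) features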
@@ -87,6 +113,30 @@ def run_attention_probe(params, encodings, mask, n_heads: int, dropout: float =

 @bind(jax.jit, static_argnums=[4, 5, 6, 7])
 def eval_attention_probe(params, encodings, labels, mask, n_heads: int, dropout: float = 0.0, use_LN=False, use_softmax=True):
+    """
+    Runs and evaluates the nonlinear attentive probe given a paired set of encoding vectors and externally assigned
+    labels/regression targets.
+
+    Args:
+        params: parameter tuple/list of the probe
+
+        encodings: input encoding vectors/data
+
+        labels: target output values (e.g., one-hot labels, regression target vectors)
+
+        mask: optional mask to be applied to the internal cross-attention
+
+        n_heads: number of attention heads
+
+        dropout: if > 0, dropout is applied internally to the cross-attention
+
+        use_LN: whether to apply layer normalization
+
+        use_softmax: whether to apply a softmax to the probe's output (useful for classification)
+
+    Returns:
+        current loss value, output scores/probabilities
+    """
     # encodings: (B, hw, dim)
     outs, _ = run_attention_probe(params, encodings, mask, n_heads, dropout, use_LN, use_softmax)
     if use_softmax: ## Multinoulli log likelihood for 1-of-K predictions
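The Multinoulli log likelihood mentioned in the comment above is the usual categorical cross-entropy; a rough sketch of that branch, assuming jax.numpy is imported as jnp, `outs` holds softmax probabilities, and `labels` are one-hot vectors (the epsilon and mean reduction are illustrative, not necessarily what this function uses):

loss = -jnp.mean(jnp.sum(labels * jnp.log(outs + 1e-7), axis=-1))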
@@ -97,6 +147,10 @@ def eval_attention_probe(params, encodings, labels, mask, n_heads: int, dropout:

 class AttentiveProbe(Probe):
     """
+    This implements a nonlinear attentive probe, which is useful for evaluating the quality of
+    encodings/embeddings in light of some supervisory downstream data (e.g., label one-hot
+    encodings or real-valued vector regression targets).
+
     Args:
         dkey: init seed key

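As an aside on the supervisory targets mentioned in the class docstring, one-hot label encodings can be produced with jax.nn.one_hot; the class count used here is only an assumption:

import jax.numpy as jnp
from jax import nn

int_labels = jnp.array([0, 3, 1])                         # integer class indices for a batch of 3
one_hot_targets = nn.one_hot(int_labels, num_classes=10)  # shape (3, 10)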
@@ -167,13 +221,34 @@ def __init__(
         self.eta = 0.001

     def process(self, embedding_sequence):
+        """
+        Runs the probe's inference scheme given an input batch of sequences of encodings/embeddings.
+
+        Args:
+            embedding_sequence: a 3D tensor containing a batch of encoding sequences; shape (B, T, embed_dim)
+
+        Returns:
+            probe output scores/probability values
+        """
         outs, feats = run_attention_probe(
             self.probe_params, embedding_sequence, self.mask, self.num_heads, 0.0, use_LN=self.use_LN,
             use_softmax=self.use_softmax
         )
         return outs

     def update(self, embedding_sequence, labels):
+        """
+        Runs and updates this probe given an input batch of sequences of encodings/embeddings and their externally
+        assigned labels/target vector values.
+
+        Args:
+            embedding_sequence: a 3D tensor containing a batch of encoding sequences; shape (B, T, embed_dim)
+
+            labels: target values that map to the embedding sequence; shape (B, target_value_dim)
+
+        Returns:
+            probe output scores/probability values
+        """
         ## compute partial derivatives / adjustments to probe parameters
         outputs, grads = self.grad_fx(
             self.probe_params, embedding_sequence, labels, self.mask, self.num_heads, dropout=0., use_LN=self.use_LN,
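Putting the two methods together, a hedged end-to-end sketch; the constructor arguments besides dkey are elided because they are not shown in this diff:

import jax

dkey = jax.random.PRNGKey(1234)
probe = AttentiveProbe(dkey, ...)                  # remaining constructor arguments elided

scores = probe.process(embedding_sequence)         # inference only over a (B, T, embed_dim) batch
scores = probe.update(embedding_sequence, labels)  # computes gradients, adjusts probe_params, returns scores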