#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

source("nn/layers/softmax.dml") as softmax
source("nn/layers/dropout.dml") as dropout
source("scripts/nn/util.dml") as util


forward = function(matrix[double] Q, matrix[double] K,
    matrix[double] V, int H, int T, int D, double dropout_p)
  return (matrix[double] context, matrix[double] attention, matrix[double] dropout_mask) {
  /*
   * Computes the forward pass for a multi-head attention layer.
   *
   * Inputs (B: Batch size, T: Sequence length, D: Embedding length, H: Heads):
   *  - Q: Input queries, of shape (B, T*H*D).
   *  - K: Input keys, of shape (B, T*H*D).
   *  - V: Input values, of shape (B, T*H*D).
   *  - H: Head count.
   *  - T: Sequence length.
   *  - D: Embedding length of a single query, key, or value per head.
   *  - dropout_p: Dropout probability.
   *
   * Outputs:
   *  - context: Token context embeddings, of shape (B, T*H*D).
   *  - attention: Attention weights of each query over all keys, of shape (B, H*T*T).
   *  - dropout_mask: Dropout mask applied to the attention weights, of shape (B, H*T*T).
   */
  B = nrow(Q)

  # Transpose head and token dimensions for per-head computation
  Q = util::transpose_ABCD_to_ACBD(Q, T, H)  # Shape (B, H*T*D)
  K = util::transpose_ABCD_to_ACBD(K, T, H)  # Shape (B, H*T*D)
  V = util::transpose_ABCD_to_ACBD(V, T, H)  # Shape (B, H*T*D)

  attention = matrix(0, rows=B, cols=H*T*T)
  dropout_mask = matrix(0, rows=B, cols=H*T*T)
  context = matrix(0, rows=B, cols=H*T*D)
  K_norm = K / sqrt(D)
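  # Note: pre-scaling K by 1/sqrt(D) makes the per-head computation below equal to
  # scaled dot-product attention, softmax(Q_h %*% t(K_h) / sqrt(D)) %*% V_h,
  # without rescaling the (T, T) score matrix inside the loops.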

  # Loop over batch and heads, since the 4D tensors are emulated with 2D matrices
  for (batch in 1:B) {
    attention_probs_b = matrix(0, rows=H, cols=T*T)
    if (dropout_p > 0.0) {
      dropout_mask_b = matrix(0, rows=H, cols=T*T)
    }
    context_b = matrix(0, rows=H, cols=T*D)
    Q_b = matrix(Q[batch], rows=H, cols=T*D)
    K_norm_b = matrix(K_norm[batch], rows=H, cols=T*D)
    V_b = matrix(V[batch], rows=H, cols=T*D)

    for (head in 1:H) {
      Q_h = matrix(Q_b[head], rows=T, cols=D)
      K_norm_h = matrix(K_norm_b[head], rows=T, cols=D)
      V_h = matrix(V_b[head], rows=T, cols=D)

      attention_scores = Q_h %*% t(K_norm_h)  # Shape (T, T)

      # TODO: Add support for attention mask here

      # Softmax over the key dimension (each row of the score matrix is normalized)
      attention_probs_h = softmax::forward(attention_scores)

      if (dropout_p > 0.0) {
        [attention_probs_h, dropout_mask_h] = dropout::forward(attention_probs_h, dropout_p, -1)
      }

      context_h = attention_probs_h %*% V_h  # Shape (T, D)

      attention_probs_b[head] = matrix(attention_probs_h, rows=1, cols=T*T)
      if (dropout_p > 0.0) {
        dropout_mask_b[head] = matrix(dropout_mask_h, rows=1, cols=T*T)
      }
      context_b[head] = matrix(context_h, rows=1, cols=T*D)
    }

    attention[batch] = matrix(attention_probs_b, rows=1, cols=H*T*T)
    if (dropout_p > 0.0) {
      dropout_mask[batch] = matrix(dropout_mask_b, rows=1, cols=H*T*T)
    }
    context[batch] = matrix(context_b, rows=1, cols=H*T*D)
  }

  # Swap head and token dimensions back to restore the original shape
  context = util::transpose_ABCD_to_ACBD(context, H, T)
}
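
# A minimal usage sketch (illustrative only; the source path, sizes, and variable
# names below are assumptions, not part of this layer):
#
#   source("nn/layers/attention.dml") as attention
#   B = 2; T = 4; H = 2; D = 8
#   Q = rand(rows=B, cols=T*H*D)
#   K = rand(rows=B, cols=T*H*D)
#   V = rand(rows=B, cols=T*H*D)
#   [context, probs, mask] = attention::forward(Q, K, V, H, T, D, 0.0)
#   # context: (B, T*H*D); probs and mask: (B, H*T*T)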


backward = function(matrix[double] dcontext,
    matrix[double] dropout_mask, matrix[double] attention, matrix[double] Q,
    matrix[double] K, matrix[double] V, int H, int T,
    int D, double dropout_p)
  return (matrix[double] dQ, matrix[double] dK, matrix[double] dV) {
  /*
   * Computes the backward pass for a multi-head attention layer.
   *
   * Inputs (B: Batch size, T: Sequence length, D: Embedding length, H: Heads):
   *  - dcontext: Gradient w.r.t. the context matrix, of shape (B, T*H*D).
   *  - dropout_mask: Dropout mask from the forward pass, of shape (B, H*T*T).
   *  - attention: Attention output from the forward pass, of shape (B, H*T*T).
   *  - Q: Input queries, of shape (B, T*H*D).
   *  - K: Input keys, of shape (B, T*H*D).
   *  - V: Input values, of shape (B, T*H*D).
   *  - H: Head count.
   *  - T: Sequence length.
   *  - D: Embedding length of a single query, key, or value per head.
   *  - dropout_p: Dropout probability.
   *
   * Outputs:
   *  - dQ: Gradient w.r.t. input queries, of shape (B, T*H*D).
   *  - dK: Gradient w.r.t. input keys, of shape (B, T*H*D).
   *  - dV: Gradient w.r.t. input values, of shape (B, T*H*D).
   */
  B = nrow(Q)

  # Transpose head and token dimensions for per-head computation
  dcontext = util::transpose_ABCD_to_ACBD(dcontext, T, H)  # Shape (B, H*T*D)
  Q = util::transpose_ABCD_to_ACBD(Q, T, H)  # Shape (B, H*T*D)
  K = util::transpose_ABCD_to_ACBD(K, T, H)  # Shape (B, H*T*D)
  V = util::transpose_ABCD_to_ACBD(V, T, H)  # Shape (B, H*T*D)

  dQ = matrix(0, rows=B, cols=H*T*D)  # Shape (B, H*T*D)
  dK = matrix(0, rows=B, cols=H*T*D)  # Shape (B, H*T*D)
  dV = matrix(0, rows=B, cols=H*T*D)  # Shape (B, H*T*D)

  K_norm = K / sqrt(D)

  # Loop over batch and heads, since the 4D tensors are emulated with 2D matrices
  for (batch in 1:B) {
    dcontext_b = matrix(dcontext[batch], rows=H, cols=T*D)
    if (dropout_p > 0.0) {
      dropout_mask_b = matrix(dropout_mask[batch], rows=H, cols=T*T)
    }
    attention_b = matrix(attention[batch], rows=H, cols=T*T)

    Q_b = matrix(Q[batch], rows=H, cols=T*D)
    K_norm_b = matrix(K_norm[batch], rows=H, cols=T*D)
    V_b = matrix(V[batch], rows=H, cols=T*D)

    dQ_b = matrix(0, rows=H, cols=T*D)
    dK_b = matrix(0, rows=H, cols=T*D)
    dV_b = matrix(0, rows=H, cols=T*D)

    for (head in 1:H) {
      dcontext_h = matrix(dcontext_b[head], rows=T, cols=D)
      if (dropout_p > 0.0) {
        dropout_mask_h = matrix(dropout_mask_b[head], rows=T, cols=T)
      }
      attention_h = matrix(attention_b[head], rows=T, cols=T)

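      # Per head, context_h = A_h %*% V_h with A_h = softmax(Q_h %*% t(K_norm_h)), so
      # dV_h = t(A_h) %*% dcontext_h and dA_h = dcontext_h %*% t(V_h); chaining dA_h
      # through dropout and softmax backward yields dQ_h and dK_h below.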
      # Compute dV early to release attention_h
      dV_h = t(attention_h) %*% dcontext_h

      Q_h = matrix(Q_b[head], rows=T, cols=D)
      K_norm_h = matrix(K_norm_b[head], rows=T, cols=D)
      V_h = matrix(V_b[head], rows=T, cols=D)

      dattention_probs = dcontext_h %*% t(V_h)

      if (dropout_p > 0.0) {
        # dropout::backward requires an X argument that is unused in the computation,
        # so an empty placeholder matrix is passed
        dattention_probs = dropout::backward(dattention_probs, matrix(0, rows=1, cols=1), dropout_p, dropout_mask_h)
      }
      attention_scores = Q_h %*% t(K_norm_h)  # Shape (T, T)
      dattention_scores = softmax::backward(dattention_probs, attention_scores)

      dQ_h = dattention_scores %*% K_norm_h
      dK_h = t(dattention_scores) %*% (Q_h / sqrt(D))

      # Store per-head results in the batch matrices
      dK_b[head] = matrix(dK_h, rows=1, cols=T*D)
      dQ_b[head] = matrix(dQ_h, rows=1, cols=T*D)
      dV_b[head] = matrix(dV_h, rows=1, cols=T*D)
    }

    # Store per-batch results in the output matrices
    dK[batch] = matrix(dK_b, rows=1, cols=H*T*D)
    dQ[batch] = matrix(dQ_b, rows=1, cols=H*T*D)
    dV[batch] = matrix(dV_b, rows=1, cols=H*T*D)
  }

  # Swap head and token dimensions back to restore the original shape
  dK = util::transpose_ABCD_to_ACBD(dK, H, T)
  dQ = util::transpose_ABCD_to_ACBD(dQ, H, T)
  dV = util::transpose_ABCD_to_ACBD(dV, H, T)
}
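
# A minimal backward sketch continuing the usage example after forward() above
# (illustrative only; the all-ones upstream gradient is an arbitrary assumption):
#
#   dcontext = matrix(1, rows=B, cols=T*H*D)
#   [dQ, dK, dV] = attention::backward(dcontext, mask, probs, Q, K, V, H, T, D, 0.0)
#   # dQ, dK, dV: (B, T*H*D), matching the shapes of Q, K, V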