Commit 9d354fd

More convenient pasts layout
1 parent e33baf2 commit 9d354fd

File tree

2 files changed: +13 -26 lines changed


+transformer/+layer/attention.m

Lines changed: 9 additions & 14 deletions
@@ -8,12 +8,12 @@
 % Inputs:
 %   X       - A (numFeatures*numHeads)-by-numInputSubwords-by-numObs
 %             input array.
-%   past    - A numFeatures-by-numPastSubwords-by-numHeads-by-2-by-numObs
+%   past    - A numFeatures-by-numPastSubwords-by-numHeads-by-numObs-by-2
 %             array. This contains the 'keys' and 'values' for
 %             past subwords. These are needed to predict future
 %             outputs in an autoregressive manner. 'keys' are
-%             stored in past(:,:,:,1,:) and 'values' are stored
-%             in past(:,:,:,2,:).
+%             stored in past(:,:,:,:,1) and 'values' are stored
+%             in past(:,:,:,:,2).
 %   weights - The weights for the full multi-head attention
 %             block stored in a struct. This includes:
 %             - attn_c_attn_w_0: A weight matrix for the
@@ -30,13 +30,13 @@
 % Outputs:
 %   Z       - A (numFeatures*numHeads)-by-numInputSubwords-by-numObs
 %             output array.
-%   present - A numFeatures-by-numAllSubwords-by-numHeads-by-2-by-numObs
+%   present - A numFeatures-by-numAllSubwords-by-numHeads-by-numObs-by-2
 %             array. This contains the 'keys' and 'values' that
 %             are created from inputs. These need to passed
 %             back in as the 'past' input if we want to predict
 %             future outputs in an autoregressive manner. 'keys'
-%             are stored in present(:,:,:,1,:) and 'values' are
-%             stored in present(:,:,:,2,:).
+%             are stored in present(:,:,:,:,1) and 'values' are
+%             stored in present(:,:,:,:,2).
 %
 % References:
 %
@@ -63,21 +63,16 @@
 
 % Use the past
 if ~isempty(past)
-    % Here we must squeeze out the singleton fourth dimensions after
-    % extracting the keys and values from past, since K, V have dimensions
-    % numFeatures-by-numPastSubwords-by-numHeads-by-numObs
-    PK = permute(past(:,:,:,1,:), [1 2 3 5 4]);
-    PV = permute(past(:,:,:,2,:), [1 2 3 5 4]);
+    PK = past(:,:,:,:,1);
+    PV = past(:,:,:,:,2);
     K = cat(2,PK,K);
     V = cat(2,PV,V);
 end
 
 % Set present. Note that this is done differently from the original
 % implementation which sets the value of present before the previous if
-% statement. Here we cat K, V along the fifth dimension, then permute to
-% recover the layout numFeatures-by-numPastSubwords-by-numHeads-by-2-by-numObs
+% statement
 present = cat(5,K,V);
-present = permute(present, [1 2 3 5 4]);
 
 A = transformer.layer.multiheadAttention(Q,K,V);
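The convenience of the new layout shows up in the deleted permute and reshape calls: with the 'keys'/'values' pair on the trailing dimension, indexing past(:,:,:,:,1) already yields the numFeatures-by-numPastSubwords-by-numHeads-by-numObs array that the attention code works with, because MATLAB drops trailing singleton dimensions. A minimal sketch contrasting the two layouts, using toy sizes that are illustrative rather than taken from the repo:

% Toy sizes (not from the repo) to contrast the old and new cache layouts.
numFeatures = 4; numPastSubwords = 3; numHeads = 2; numObs = 5;
K = rand(numFeatures, numPastSubwords, numHeads, numObs);
V = rand(numFeatures, numPastSubwords, numHeads, numObs);

% Old layout: keys/values on dim 4 leaves a singleton dimension that has to
% be permuted away before K and V can be reused.
pastOld = cat(4, reshape(K, [size(K,1:3) 1 size(K,4)]), ...
                 reshape(V, [size(V,1:3) 1 size(V,4)]));
KOld = permute(pastOld(:,:,:,1,:), [1 2 3 5 4]);

% New layout: keys/values on the trailing dimension, so plain indexing
% already gives a numFeatures-by-numPastSubwords-by-numHeads-by-numObs
% array (MATLAB drops the trailing singleton dimension).
pastNew = cat(5, K, V);
KNew = pastNew(:,:,:,:,1);

isequal(KOld, KNew)   % true: same keys, one permute fewer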

test/transformer/layer/tattention.m

Lines changed: 4 additions & 12 deletions
@@ -86,26 +86,18 @@ function checkPastPresentCaching(test,NumQueries,NumObs)
 % Verify the expected value of past - it is the key and values
 % concatenated on the 4th dimension.
 [~,K,V] = iSplitQKV(x,hyperParams.NumHeads,latentDim);
-K = reshape(K, [size(K, 1:3) 1 size(K, 4)]);
-V = reshape(V, [size(V, 1:3) 1 size(V, 4)]);
-test.verifyEqual(past,cat(4,K,V));
+test.verifyEqual(past,cat(5,K,V));
 % Now verify second call to attention is possible with the first
 % past as input - and verify the value of the attention output.
 [yAct,present] = test.attention(x,past,weights,hyperParams);
 [Q,K,V] = iSplitQKV(x,hyperParams.NumHeads,latentDim);
-Q = reshape(Q, [size(Q, 1:3) 1 size(Q, 4)]);
-K = reshape(K, [size(K, 1:3) 1 size(K, 4)]);
-V = reshape(V, [size(V, 1:3) 1 size(V, 4)]);
 % Verify the correct value for present.
-pastK = past(:,:,:,1,:);
-pastV = past(:,:,:,2,:);
-test.verifyEqual(extractdata(present),extractdata(cat(4,cat(2,pastK,K),cat(2,pastV,V))),'AbsTol',1e-5);
+pastK = past(:,:,:,:,1);
+pastV = past(:,:,:,:,2);
+test.verifyEqual(extractdata(present),extractdata(cat(5,cat(2,pastK,K),cat(2,pastV,V))),'AbsTol',1e-5);
 % To compute the expected value, concatenate the pasts
 K = cat(2,K,pastK);
 V = cat(2,V,pastV);
-Q = permute(Q, [1 2 3 5 4]);
-K = permute(K, [1 2 3 5 4]);
-V = permute(V, [1 2 3 5 4]);
 yExp = test.multiheadAttention(Q,K,V);
 yExp = iMergeHeads(yExp);
 test.verifyEqual(extractdata(yAct),extractdata(yExp),'AbsTol',1e-5);
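The caching pattern this test exercises looks roughly like the sketch below. The argument order mirrors the test's test.attention(x,past,weights,hyperParams) call and the documented [Z, present] outputs; x1, x2, weights and hyperParams are placeholders, not values from the repo.

% First call with no cache: pass [] as 'past'.
[z1, present] = transformer.layer.attention(x1, [], weights, hyperParams);
% present is numFeatures-by-numAllSubwords-by-numHeads-by-numObs-by-2, so it
% can be fed straight back in as 'past'; no permute is needed between calls.
[z2, present] = transformer.layer.attention(x2, present, weights, hyperParams);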
