Commit 781c3e3: address review comments

1 parent 24168c6 commit 781c3e3
4 files changed (+91, -57 lines)

src/Dialect/ONNX/ONNXOps/NN/Attention.cpp

Lines changed: 44 additions & 52 deletions

@@ -20,7 +20,10 @@ template <>
 LogicalResult ONNXAttentionOpShapeHelper::computeShape() {
   auto attentionOp = cast<ONNXAttentionOp>(op);

-  int64_t rank = createIE->getShapedTypeRank(attentionOp.getQ());
+  const int64_t rank = createIE->getShapedTypeRank(attentionOp.getQ());
+  if (rank != 3 && rank != 4)
+    return failure();
+
   DimsExpr qShape;
   createIE->getShapeAsDims(attentionOp.getQ(), qShape);
   DimsExpr kShape;
@@ -31,29 +34,30 @@ LogicalResult ONNXAttentionOpShapeHelper::computeShape() {
   auto qNumHeads = attentionOp.getQNumHeads();
   auto kvNumHeads = attentionOp.getKvNumHeads();

-  if (rank == 4) {
-    DimsExpr outputDims = qShape;
-    outputDims[3] = vShape[3];
-    setOutputDims(outputDims, 0);
-  } else if (rank == 3) {
-    assert(qNumHeads && kvNumHeads &&
-        "*_num_heads attributes must be present with 3D inputs");
-    DimsExpr outputDims = qShape;
-    outputDims[2] = LitIE(*qNumHeads * (vShape[2].getLiteral() / *kvNumHeads));
-    setOutputDims(outputDims, 0);
-  } else {
-    return failure();
-  }
+  auto normalizeInputTo4D = [](DimsExpr inputShape,
+                                std::optional<int64_t> numHeads) -> DimsExpr {
+    DimsExpr shape4D = inputShape;
+    if (inputShape.size() == 4)
+      return shape4D;

-  // Need past_key/value inputs to infer shapes for present_key/value outputs
-  if (attentionOp->getNumOperands() < 6)
-    return success();
+    assert(numHeads && "*_num_heads attributes must be present with 3D inputs");
+    shape4D.insert(shape4D.begin() + 1, LitIE(*numHeads));
+    shape4D[3] = shape4D[3].floorDiv(shape4D[1]);

-  if (isNoneValue(attentionOp.getPastKey()) ||
-      isNoneValue(attentionOp.getPastValue()) ||
-      isNoneValue(attentionOp.getPresentKey()) ||
-      isNoneValue(attentionOp.getPresentValue()))
-    return success();
+    return shape4D;
+  };
+
+  DimsExpr qShape4D = normalizeInputTo4D(qShape, qNumHeads);
+  DimsExpr kShape4D = normalizeInputTo4D(kShape, kvNumHeads);
+  DimsExpr vShape4D = normalizeInputTo4D(vShape, kvNumHeads);
+
+  DimsExpr outputDims = qShape;
+  if (rank == 4) {
+    outputDims[3] = vShape4D[3];
+  } else /*if (rank == 3)*/ {
+    outputDims[2] = qShape4D[1] * vShape4D[3];
+  }
+  setOutputDims(outputDims, 0);

   if (!hasShapeAndRank(attentionOp.getPastKey()) ||
       !hasShapeAndRank(attentionOp.getPastValue()))
@@ -67,21 +71,19 @@ LogicalResult ONNXAttentionOpShapeHelper::computeShape() {
   if (pastKShape.size() != 4 || pastVShape.size() != 4)
     return failure();

-  auto totalSeqLen = pastKShape[2] + kShape[2];
+  auto totalSeqLen = pastKShape[2] + kShape4D[2];

-  DimsExpr presentKeyDims = kShape;
+  DimsExpr presentKeyDims = kShape4D;
   presentKeyDims[2] = totalSeqLen;
   setOutputDims(presentKeyDims, 1);

-  DimsExpr presentValueDims = vShape;
+  DimsExpr presentValueDims = vShape4D;
   presentValueDims[2] = totalSeqLen;
   setOutputDims(presentValueDims, 2);

-  if (attentionOp.getQkMatmulOutputMode()) {
-    DimsExpr qkOutputDims = qShape;
-    qkOutputDims[3] = totalSeqLen;
-    setOutputDims(presentValueDims, 3);
-  }
+  DimsExpr qkOutputDims = qShape4D;
+  qkOutputDims[3] = totalSeqLen;
+  setOutputDims(presentValueDims, 3);

   return success();
 }
@@ -93,25 +95,16 @@ LogicalResult ONNXAttentionOpShapeHelper::computeShape() {
 //===----------------------------------------------------------------------===//

 LogicalResult ONNXAttentionOp::verify() {
-  const int64_t numIn = this->getNumOperands();
-  const int64_t numOut = this->getNumResults();
-
   // If presentK and presentV are outputs, then we must pass pastK and pastV as
   // inputs
-  if (numOut >= 3) {
-    Value presentK = this->getResult(1);
-    Value presentV = this->getResult(2);
-    if (!isNoneValue(presentK) || !isNoneValue(presentV)) {
-      if (numIn < 6)
-        return emitOpError("inputs 'pastK' and 'pastV' are needed for outputs "
-                           "'presentK' and 'presentV'");
-
-      Value pastK = this->getOperand(4);
-      Value pastV = this->getOperand(5);
-      if (isNoneValue(pastK) || isNoneValue(pastV))
-        return emitOpError("inputs 'pastK' and 'pastV' are needed for outputs "
-                           "'presentK' and 'presentV'");
-    }
+  Value presentK = this->getResult(1);
+  Value presentV = this->getResult(2);
+  if (!isNoneValue(presentK) || !isNoneValue(presentV)) {
+    Value pastK = this->getOperand(4);
+    Value pastV = this->getOperand(5);
+    if (isNoneValue(pastK) || isNoneValue(pastV))
+      return emitOpError("inputs 'pastK' and 'pastV' are needed for outputs "
+                         "'presentK' and 'presentV'");
   }

   ONNXAttentionOpAdaptor adaptor(*this);
@@ -120,7 +113,7 @@ LogicalResult ONNXAttentionOp::verify() {
   if (!hasShapeAndRank(q))
     return success(); // Won't be able to do any more checking at this stage.

-  auto qType = mlir::cast<ShapedType>(q.getType());
+  auto qType = cast<ShapedType>(q.getType());
   int64_t qRank = qType.getShape().size();
   if (qRank != 3 && qRank != 4)
     return onnx_mlir::Diagnostic::emitOperandHasUnexpectedRankError(
@@ -137,13 +130,13 @@ LogicalResult ONNXAttentionOp::verify() {
   if (!hasShapeAndRank(k) || !hasShapeAndRank(v))
     return success(); // Won't be able to do any more checking at this stage.

-  auto kType = mlir::cast<ShapedType>(k.getType());
+  auto kType = cast<ShapedType>(k.getType());
   int64_t kRank = kType.getShape().size();
   if (kRank != 3 && kRank != 4)
     return onnx_mlir::Diagnostic::emitOperandHasUnexpectedRankError(
         *this->getOperation(), k, kRank, "3 or 4");

-  auto vType = mlir::cast<ShapedType>(v.getType());
+  auto vType = cast<ShapedType>(v.getType());
   int64_t vRank = vType.getShape().size();
   if (vRank != 3 && vRank != 4)
     return onnx_mlir::Diagnostic::emitOperandHasUnexpectedRankError(
@@ -195,10 +188,9 @@ LogicalResult ONNXAttentionOp::inferShapes(
     if (!hasShapeAndRank(this->getOperand(i)))
       return success();

-  Type elementType = mlir::cast<ShapedType>(getQ().getType()).getElementType();
+  Type elementType = getElementTypeOrSelf(getQ().getType());
   ONNXAttentionOpShapeHelper shapeHelper(getOperation(), {});
   return shapeHelper.computeShapeAndUpdateType(elementType);
-  return success();
 }

 //===----------------------------------------------------------------------===//

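The 3D path now reuses the 4D logic by conceptually reshaping [batch, seq, hidden] into [batch, num_heads, seq, hidden / num_heads], which also lets the present_key/value and QK-output shapes be computed uniformly in the later hunks. Below is a minimal standalone sketch of that arithmetic with plain integers (my own illustration, not part of the commit; the real normalizeInputTo4D operates on symbolic DimsExpr values), using the shapes from the new @test_attention_3d_inputs_4d_present_kv test further down:

// Illustrative sketch only: mirrors normalizeInputTo4D() with plain integers;
// the onnx-mlir helper works on symbolic DimsExpr/IndexExpr values instead.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

using Shape = std::vector<int64_t>;

static Shape normalizeInputTo4D(Shape shape, std::optional<int64_t> numHeads) {
  if (shape.size() == 4)
    return shape;
  assert(numHeads && "*_num_heads attributes must be present with 3D inputs");
  // [batch, seq, hidden] -> [batch, numHeads, seq, hidden / numHeads]
  shape.insert(shape.begin() + 1, *numHeads);
  shape[3] = shape[3] / shape[1];
  return shape;
}

int main() {
  Shape q4 = normalizeInputTo4D({1, 128, 3072}, 32); // {1, 32, 128, 96}
  Shape k4 = normalizeInputTo4D({1, 128, 1536}, 16); // {1, 16, 128, 96}
  Shape v4 = normalizeInputTo4D({1, 128, 768}, 16);  // {1, 16, 128, 48}

  // Rank-3 output stays 3D: hidden = q_num_heads * v_head_size.
  std::cout << q4[1] * v4[3] << "\n"; // 32 * 48 = 1536 -> tensor<1x128x1536xf32>

  // present_key/value sequence length = past_key seq (256) + current seq.
  std::cout << 256 + k4[2] << "\n"; // 384 -> tensor<1x16x384x96xf32>
  return 0;
}
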
src/Dialect/ONNX/ONNXOps/NN/RotaryEmbedding.cpp

Lines changed: 11 additions & 5 deletions

@@ -35,7 +35,7 @@ LogicalResult ONNXRotaryEmbeddingOp::verify() {
     return success(); // Won't be able to do any checking at this stage.

   auto inputType = mlir::cast<ShapedType>(input.getType());
-  int64_t inputRank = inputType.getRank();
+  const int64_t inputRank = inputType.getRank();

   if (inputRank != 3 && inputRank != 4)
     return onnx_mlir::Diagnostic::emitOperandHasUnexpectedRankError(
@@ -46,13 +46,19 @@ LogicalResult ONNXRotaryEmbeddingOp::verify() {
     return emitOpError(
         "attribute 'num_heads' must be provided when input is a 3D tensor.");

-  // Check hidden_size divisible by num_heads
   if (inputType.hasStaticShape()) {
     auto inputShape = inputType.getShape();
-    if (inputRank == 3 && numHeads && inputShape[2] % *numHeads != 0)
+    // Check head_size is even
+    if (inputRank == 4 && inputShape[3] % 2 != 0)
+      return onnx_mlir::Diagnostic::emitDimensionHasUnexpectedValueError(
+          *this->getOperation(), input, 3, inputShape[3], "even");
+
+    // Check hidden_size divisible by num_heads and resulting head_size is
+    // even (i.e. hidden_size % (num_heads * 2) == 0)
+    if (inputRank == 3 && numHeads && inputShape[2] % (*numHeads * 2) != 0)
       return onnx_mlir::Diagnostic::emitDimensionHasUnexpectedValueError(
           *this->getOperation(), input, 2, inputShape[2],
-          "divisible by " + std::to_string(*numHeads));
+          "divisible by " + std::to_string(*numHeads) + " * 2");
   }

   Value cosCache = adaptor.getCosCache();
@@ -103,7 +109,7 @@ LogicalResult ONNXRotaryEmbeddingOp::inferShapes(
   if (!hasShapeAndRank(getOperation()->getOperand(0)))
     return success();

-  Type elementType = mlir::cast<ShapedType>(getX().getType()).getElementType();
+  Type elementType = getElementTypeOrSelf(getX().getType());
   ONNXRotaryEmbeddingOpShapeHelper shapeHelper(getOperation(), {});
   return shapeHelper.computeShapeAndUpdateType(elementType);
 }

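As a quick worked check of the strengthened RotaryEmbedding verifier: with num_heads = 32, a 3D hidden size of 3040 implies a head size of 3040 / 32 = 95, and 3040 % (32 * 2) = 32 != 0, so the op is rejected, while 3072 (head size 96) still passes; for 4D inputs the last dimension is checked directly, so the odd value 95 fails the evenness test. The two new invalid.mlir cases below exercise exactly these values.
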
test/mlir/onnx/invalid.mlir

Lines changed: 18 additions & 0 deletions

@@ -1007,6 +1007,24 @@ func.func @test_rotary_embedding_bad_dtype(%data: tensor<1x128x3072xi64>, %cos_c

 // -----

+func.func @test_rotary_embedding_4d_odd_head_size(%data: tensor<1x32x128x95xf32>, %cos_cache: tensor<4096x48xf32>, %sin_cache: tensor<4096x48xf32>) -> tensor<*xf32> {
+  %pos_ids = "onnx.NoValue"() {value} : () -> none
+  // expected-error @+1 {{onnx.RotaryEmbedding: operand '<block argument> of type 'tensor<1x32x128x95xf32>' at index: 0' has dimension at index 3 with value 95, value should be even}}
+  %0 = "onnx.RotaryEmbedding"(%data, %cos_cache, %sin_cache, %pos_ids) {num_heads = 32: si64} : (tensor<1x32x128x95xf32>, tensor<4096x48xf32>, tensor<4096x48xf32>, none) -> tensor<*xf32>
+  return %0 : tensor<*xf32>
+}
+
+// -----
+
+func.func @test_rotary_embedding_3d_odd_head_size(%data: tensor<1x128x3040xf32>, %cos_cache: tensor<4096x48xf32>, %sin_cache: tensor<4096x48xf32>) -> tensor<*xf32> {
+  %pos_ids = "onnx.NoValue"() {value} : () -> none
+  // expected-error @+1 {{onnx.RotaryEmbedding: operand '<block argument> of type 'tensor<1x128x3040xf32>' at index: 0' has dimension at index 2 with value 3040, value should be divisible by 32 * 2}}
+  %0 = "onnx.RotaryEmbedding"(%data, %cos_cache, %sin_cache, %pos_ids) {num_heads = 32: si64} : (tensor<1x128x3040xf32>, tensor<4096x48xf32>, tensor<4096x48xf32>, none) -> tensor<*xf32>
+  return %0 : tensor<*xf32>
+}
+
+// -----
+
 func.func @test_rotary_embedding_bad_embedding_dim(%data: tensor<1x32x128x96xf32>, %cos_cache: tensor<4096x48xf32>, %sin_cache: tensor<4096x48xf32>) -> tensor<*xf32> {
   %pos_ids = "onnx.NoValue"() {value} : () -> none
   // expected-error @+1 {{onnx.RotaryEmbedding: operand '<block argument> of type 'tensor<4096x48xf32>' at index: 1' has dimension at index 1 with value 48, value should be 50}}

test/mlir/onnx/onnx_shape_inference.mlir

Lines changed: 18 additions & 0 deletions

@@ -4536,3 +4536,21 @@ func.func @test_attention_4d_qk_output(%q: tensor<1x32x128x96xf32>, %k: tensor<1
 // CHECK-LABEL: func.func @test_attention_4d_qk_output
 // CHECK: "onnx.Attention"
 // CHECK-SAME: (tensor<1x32x128x96xf32>, tensor<1x16x128x96xf32>, tensor<1x16x128x48xf32>, none, tensor<1x16x256x96xf32>, tensor<1x16x256x48xf32>) -> (tensor<1x32x128x48xf32>, tensor<1x16x384x96xf32>, tensor<1x16x384x48xf32>, tensor<1x16x384x48xf32>)
+
+func.func @test_attention_3d_inputs_4d_present_kv(%q: tensor<1x128x3072xf32>, %k: tensor<1x128x1536xf32>, %v: tensor<1x128x768xf32>, %past_k: tensor<1x16x256x96xf32>, %past_v: tensor<1x16x256x48xf32>) -> tensor<*xf32> {
+  %none = "onnx.NoValue"() {value} : () -> none
+  %out, %present_k, %present_v, %qk_out = "onnx.Attention"(%q, %k, %v, %none, %past_k, %past_v) {q_num_heads = 32: si64, kv_num_heads = 16: si64} : (tensor<1x128x3072xf32>, tensor<1x128x1536xf32>, tensor<1x128x768xf32>, none, tensor<1x16x256x96xf32>, tensor<1x16x256x48xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>, none)
+  return %out : tensor<*xf32>
+}
+// CHECK-LABEL: func.func @test_attention_3d_inputs_4d_present_kv
+// CHECK: "onnx.Attention"
+// CHECK-SAME: (tensor<1x128x3072xf32>, tensor<1x128x1536xf32>, tensor<1x128x768xf32>, none, tensor<1x16x256x96xf32>, tensor<1x16x256x48xf32>) -> (tensor<1x128x1536xf32>, tensor<1x16x384x96xf32>, tensor<1x16x384x48xf32>, none)
+
+func.func @test_attention_3d_q_4d_kv(%q: tensor<1x128x3072xf32>, %k: tensor<1x16x128x96xf32>, %v: tensor<1x16x128x48xf32>) -> tensor<*xf32> {
+  %none = "onnx.NoValue"() {value} : () -> none
+  %out, %present_k, %present_v, %qk_out = "onnx.Attention"(%q, %k, %v, %none, %none, %none) {q_num_heads = 32: si64} : (tensor<1x128x3072xf32>, tensor<1x16x128x96xf32>, tensor<1x16x128x48xf32>, none, none, none) -> (tensor<*xf32>, none, none, none)
+  return %out : tensor<*xf32>
+}
+// CHECK-LABEL: func.func @test_attention_3d_q_4d_kv
+// CHECK: "onnx.Attention"
+// CHECK-SAME: (tensor<1x128x3072xf32>, tensor<1x16x128x96xf32>, tensor<1x16x128x48xf32>, none, none, none) -> (tensor<1x128x1536xf32>
