From 2a8d78189a6c20a482dd44dfe43f8c919d0e53d6 Mon Sep 17 00:00:00 2001
From: Jean Perier <jperier@nvidia.com>
Date: Fri, 16 May 2025 01:44:31 -0700
Subject: [PATCH 1/4] [flang] use DataLayout instead of GEP to compute element
 size

---
 .../flang/Optimizer/CodeGen/FIROpPatterns.h   |  4 ++
 flang/lib/Optimizer/CodeGen/CodeGen.cpp       | 50 ++++++++---------
 flang/test/Fir/convert-to-llvm.fir            | 54 +++++--------------
 flang/test/Fir/copy-codegen.fir               | 12 ++---
 flang/test/Fir/embox-char.fir                 |  8 +--
 flang/test/Fir/embox-substring.fir            |  7 ++-
 6 files changed, 48 insertions(+), 87 deletions(-)

diff --git a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h
index 53d16323beddf..7b1c14e4dfdc9 100644
--- a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h
+++ b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h
@@ -173,6 +173,10 @@ class ConvertFIRToLLVMPattern : public mlir::ConvertToLLVMPattern {
         this->getTypeConverter());
   }
 
+  const mlir::DataLayout &getDataLayout() const {
+    return lowerTy().getDataLayout();
+  }
+
   void attachTBAATag(mlir::LLVM::AliasAnalysisOpInterface op,
                      mlir::Type baseFIRType, mlir::Type accessFIRType,
                      mlir::LLVM::GEPOp gep) const {
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index e534cfa5591c6..ad9119ba4a031 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -1043,22 +1043,12 @@ static mlir::SymbolRefAttr getMalloc(fir::AllocMemOp op,
 static mlir::Value
 computeElementDistance(mlir::Location loc, mlir::Type llvmObjectType,
                        mlir::Type idxTy,
-                       mlir::ConversionPatternRewriter &rewriter) {
-  // Note that we cannot use something like
-  // mlir::LLVM::getPrimitiveTypeSizeInBits() for the element type here. For
-  // example, it returns 10 bytes for mlir::Float80Type for targets where it
-  // occupies 16 bytes. Proper solution is probably to use
-  // mlir::DataLayout::getTypeABIAlignment(), but DataLayout is not being set
-  // yet (see llvm-project#57230). For the time being use the '(intptr_t)((type
-  // *)0 + 1)' trick for all types. The generated instructions are optimized
-  // into constant by the first pass of InstCombine, so it should not be a
-  // performance issue.
-  auto llvmPtrTy = ::getLlvmPtrType(llvmObjectType.getContext());
-  auto nullPtr = rewriter.create<mlir::LLVM::ZeroOp>(loc, llvmPtrTy);
-  auto gep = rewriter.create<mlir::LLVM::GEPOp>(
-      loc, llvmPtrTy, llvmObjectType, nullPtr,
-      llvm::ArrayRef<mlir::LLVM::GEPArg>{1});
-  return rewriter.create<mlir::LLVM::PtrToIntOp>(loc, idxTy, gep);
+                       mlir::ConversionPatternRewriter &rewriter,
+                       const mlir::DataLayout &dataLayout) {
+  llvm::TypeSize size = dataLayout.getTypeSize(llvmObjectType);
+  unsigned short alignment = dataLayout.getTypeABIAlignment(llvmObjectType);
+  std::int64_t distance = llvm::alignTo(size, alignment);
+  return genConstantIndex(loc, idxTy, rewriter, distance);
 }
 
 /// Return value of the stride in bytes between adjacent elements
@@ -1066,10 +1056,10 @@ computeElementDistance(mlir::Location loc, mlir::Type llvmObjectType,
 /// \p idxTy integer type.
 static mlir::Value
 genTypeStrideInBytes(mlir::Location loc, mlir::Type idxTy,
-                     mlir::ConversionPatternRewriter &rewriter,
-                     mlir::Type llTy) {
+                     mlir::ConversionPatternRewriter &rewriter, mlir::Type llTy,
+                     const mlir::DataLayout &dataLayout) {
   // Create a pointer type and use computeElementDistance().
-  return computeElementDistance(loc, llTy, idxTy, rewriter);
+  return computeElementDistance(loc, llTy, idxTy, rewriter, dataLayout);
 }
 
 namespace {
@@ -1111,7 +1101,7 @@ struct AllocMemOpConversion : public fir::FIROpConversion<fir::AllocMemOp> {
   mlir::Value genTypeSizeInBytes(mlir::Location loc, mlir::Type idxTy,
                                  mlir::ConversionPatternRewriter &rewriter,
                                  mlir::Type llTy) const {
-    return computeElementDistance(loc, llTy, idxTy, rewriter);
+    return computeElementDistance(loc, llTy, idxTy, rewriter, getDataLayout());
   }
 };
 } // namespace
@@ -1323,8 +1313,8 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
                                    fir::CharacterType charTy,
                                    mlir::ValueRange lenParams) const {
     auto i64Ty = mlir::IntegerType::get(rewriter.getContext(), 64);
-    mlir::Value size =
-        genTypeStrideInBytes(loc, i64Ty, rewriter, this->convertType(charTy));
+    mlir::Value size = genTypeStrideInBytes(
+        loc, i64Ty, rewriter, this->convertType(charTy), this->getDataLayout());
     if (charTy.hasConstantLen())
       return size; // Length accounted for in the genTypeStrideInBytes GEP.
     // Otherwise,  multiply the single character size by the length.
@@ -1338,6 +1328,7 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
   std::tuple<mlir::Value, mlir::Value> getSizeAndTypeCode(
       mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
       mlir::Type boxEleTy, mlir::ValueRange lenParams = {}) const {
+    const mlir::DataLayout &dataLayout = this->getDataLayout();
     auto i64Ty = mlir::IntegerType::get(rewriter.getContext(), 64);
     if (auto eleTy = fir::dyn_cast_ptrEleTy(boxEleTy))
       boxEleTy = eleTy;
@@ -1354,18 +1345,19 @@ struct EmboxCommonConversion : public fir::FIROpConversion<OP> {
         mlir::dyn_cast<fir::LogicalType>(boxEleTy) || fir::isa_real(boxEleTy) ||
         fir::isa_complex(boxEleTy))
       return {genTypeStrideInBytes(loc, i64Ty, rewriter,
-                                   this->convertType(boxEleTy)),
+                                   this->convertType(boxEleTy), dataLayout),
               typeCodeVal};
     if (auto charTy = mlir::dyn_cast<fir::CharacterType>(boxEleTy))
       return {getCharacterByteSize(loc, rewriter, charTy, lenParams),
               typeCodeVal};
     if (fir::isa_ref_type(boxEleTy)) {
       auto ptrTy = ::getLlvmPtrType(rewriter.getContext());
-      return {genTypeStrideInBytes(loc, i64Ty, rewriter, ptrTy), typeCodeVal};
+      return {genTypeStrideInBytes(loc, i64Ty, rewriter, ptrTy, dataLayout),
+              typeCodeVal};
     }
     if (mlir::isa<fir::RecordType>(boxEleTy))
       return {genTypeStrideInBytes(loc, i64Ty, rewriter,
-                                   this->convertType(boxEleTy)),
+                                   this->convertType(boxEleTy), dataLayout),
               typeCodeVal};
     fir::emitFatalError(loc, "unhandled type in fir.box code generation");
   }
@@ -1909,8 +1901,8 @@ struct XEmboxOpConversion : public EmboxCommonConversion<fir::cg::XEmboxOp> {
     if (hasSubcomp) {
       // We have a subcomponent. The step value needs to be the number of
       // bytes per element (which is a derived type).
-      prevDimByteStride =
-          genTypeStrideInBytes(loc, i64Ty, rewriter, convertType(seqEleTy));
+      prevDimByteStride = genTypeStrideInBytes(
+          loc, i64Ty, rewriter, convertType(seqEleTy), getDataLayout());
     } else if (hasSubstr) {
       // We have a substring. The step value needs to be the number of bytes
       // per CHARACTER element.
@@ -3604,8 +3596,8 @@ struct CopyOpConversion : public fir::FIROpConversion<fir::CopyOp> {
     mlir::Value llvmDestination = adaptor.getDestination();
     mlir::Type i64Ty = mlir::IntegerType::get(rewriter.getContext(), 64);
     mlir::Type copyTy = fir::unwrapRefType(copy.getSource().getType());
-    mlir::Value copySize =
-        genTypeStrideInBytes(loc, i64Ty, rewriter, convertType(copyTy));
+    mlir::Value copySize = genTypeStrideInBytes(
+        loc, i64Ty, rewriter, convertType(copyTy), getDataLayout());
 
     mlir::LLVM::AliasAnalysisOpInterface newOp;
     if (copy.getNoOverlap())
diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir
index 2960528fb6c24..6d8a8bb606b90 100644
--- a/flang/test/Fir/convert-to-llvm.fir
+++ b/flang/test/Fir/convert-to-llvm.fir
@@ -216,9 +216,7 @@ func.func @test_alloc_and_freemem_one() {
 }
 
 // CHECK-LABEL:  llvm.func @test_alloc_and_freemem_one() {
-// CHECK-NEXT:    %[[NULL:.*]] = llvm.mlir.zero : !llvm.ptr
-// CHECK-NEXT:    %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1]
-// CHECK-NEXT:    %[[N:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
+// CHECK:    %[[N:.*]] = llvm.mlir.constant(4 : i64) : i64
 // CHECK-NEXT:    llvm.call @malloc(%[[N]])
 // CHECK:         llvm.call @free(%{{.*}})
 // CHECK-NEXT:    llvm.return
@@ -235,10 +233,8 @@ func.func @test_alloc_and_freemem_several() {
 }
 
 // CHECK-LABEL:  llvm.func @test_alloc_and_freemem_several() {
-// CHECK: [[NULL:%.*]]  = llvm.mlir.zero : !llvm.ptr
-// CHECK: [[PTR:%.*]]  = llvm.getelementptr [[NULL]][{{.*}}] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<100 x f32>
-// CHECK: [[N:%.*]]  = llvm.ptrtoint [[PTR]] : !llvm.ptr to i64
-// CHECK: [[MALLOC:%.*]] = llvm.call @malloc([[N]])
+// CHECK:      %[[N:.*]] = llvm.mlir.constant(400 : i64) : i64
+// CHECK: [[MALLOC:%.*]] = llvm.call @malloc(%[[N]])
 // CHECK:              llvm.call @free([[MALLOC]])
 // CHECK:              llvm.return
 
@@ -251,9 +247,7 @@ func.func @test_with_shape(%ncols: index, %nrows: index) {
 
 // CHECK-LABEL: llvm.func @test_with_shape
 // CHECK-SAME: %[[NCOLS:.*]]: i64, %[[NROWS:.*]]: i64
-// CHECK:   %[[NULL:.*]] = llvm.mlir.zero : !llvm.ptr
-// CHECK:   %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1]
-// CHECK:   %[[FOUR:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
+// CHECK:   %[[FOUR:.*]] = llvm.mlir.constant(4 : i64) : i64
 // CHECK:   %[[DIM1_SIZE:.*]] = llvm.mul %[[FOUR]], %[[NCOLS]]  : i64
 // CHECK:   %[[TOTAL_SIZE:.*]] = llvm.mul %[[DIM1_SIZE]], %[[NROWS]]  : i64
 // CHECK:   %[[MEM:.*]] = llvm.call @malloc(%[[TOTAL_SIZE]])
@@ -269,9 +263,7 @@ func.func @test_string_with_shape(%len: index, %nelems: index) {
 
 // CHECK-LABEL: llvm.func @test_string_with_shape
 // CHECK-SAME: %[[LEN:.*]]: i64, %[[NELEMS:.*]]: i64)
-// CHECK:   %[[NULL:.*]] = llvm.mlir.zero : !llvm.ptr
-// CHECK:   %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1]
-// CHECK:   %[[ONE:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
+// CHECK:   %[[ONE:.*]] = llvm.mlir.constant(1 : i64) : i64
 // CHECK:   %[[LEN_SIZE:.*]] = llvm.mul %[[ONE]], %[[LEN]]  : i64
 // CHECK:   %[[TOTAL_SIZE:.*]] = llvm.mul %[[LEN_SIZE]], %[[NELEMS]]  : i64
 // CHECK:   %[[MEM:.*]] = llvm.call @malloc(%[[TOTAL_SIZE]])
@@ -1654,9 +1646,7 @@ func.func @embox0(%arg0: !fir.ref<!fir.array<100xi32>>) {
 // AMDGPU:        %[[AA:.*]] = llvm.alloca %[[C1]] x !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}})> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>
 // AMDGPU:        %[[ALLOCA:.*]] = llvm.addrspacecast %[[AA]] : !llvm.ptr<5> to !llvm.ptr
 // CHECK:         %[[TYPE_CODE:.*]] = llvm.mlir.constant(9 : i32) : i32
-// CHECK:         %[[NULL:.*]] = llvm.mlir.zero : !llvm.ptr
-// CHECK:         %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1]
-// CHECK:         %[[I64_ELEM_SIZE:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
+// CHECK:         %[[I64_ELEM_SIZE:.*]] = llvm.mlir.constant(4 : i64) : i64
 // CHECK:         %[[DESC:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}})>
 // CHECK:         %[[DESC0:.*]] = llvm.insertvalue %[[I64_ELEM_SIZE]], %[[DESC]][1] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}})>
 // CHECK:         %[[CFI_VERSION:.*]] = llvm.mlir.constant(20240719 : i32) : i32
@@ -1879,9 +1869,7 @@ func.func @xembox0(%arg0: !fir.ref<!fir.array<?xi32>>) {
 // AMDGPU:        %[[ALLOCA:.*]] = llvm.addrspacecast %[[AA]] : !llvm.ptr<5> to !llvm.ptr
 // CHECK:         %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64
 // CHECK:         %[[TYPE:.*]] = llvm.mlir.constant(9 : i32) : i32
-// CHECK:         %[[NULL:.*]] = llvm.mlir.zero : !llvm.ptr
-// CHECK:         %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1]
-// CHECK:         %[[ELEM_LEN_I64:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
+// CHECK:         %[[ELEM_LEN_I64:.*]] = llvm.mlir.constant(4 : i64) : i64
 // CHECK:         %[[BOX0:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)>
 // CHECK:         %[[BOX1:.*]] = llvm.insertvalue %[[ELEM_LEN_I64]], %[[BOX0]][1] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)>
 // CHECK:         %[[VERSION:.*]] = llvm.mlir.constant(20240719 : i32) : i32
@@ -1933,9 +1921,7 @@ func.func @xembox0_i32(%arg0: !fir.ref<!fir.array<?xi32>>) {
 // CHECK:         %[[C0_I32:.*]] = llvm.mlir.constant(0 : i32) : i32
 // CHECK:         %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64
 // CHECK:         %[[TYPE:.*]] = llvm.mlir.constant(9 : i32) : i32
-// CHECK:         %[[NULL:.*]] = llvm.mlir.zero : !llvm.ptr
-// CHECK:         %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1]
-// CHECK:         %[[ELEM_LEN_I64:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
+// CHECK:         %[[ELEM_LEN_I64:.*]] = llvm.mlir.constant(4 : i64) : i64
 // CHECK:         %[[BOX0:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)>
 // CHECK:         %[[BOX1:.*]] = llvm.insertvalue %[[ELEM_LEN_I64]], %[[BOX0]][1] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)>
 // CHECK:         %[[VERSION:.*]] = llvm.mlir.constant(20240719 : i32) : i32
@@ -1988,9 +1974,7 @@ func.func @xembox1(%arg0: !fir.ref<!fir.array<?x!fir.char<1, 10>>>) {
 
 // CHECK-LABEL: llvm.func @xembox1(%{{.*}}: !llvm.ptr) {
 // CHECK:         %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64
-// CHECK:         %[[NULL:.*]] = llvm.mlir.zero : !llvm.ptr
-// CHECK:         %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1]
-// CHECK:         %[[ELEM_LEN_I64:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
+// CHECK:         %[[ELEM_LEN_I64:.*]] = llvm.mlir.constant(10 : i64) : i64
 // CHECK:         %{{.*}} = llvm.insertvalue %[[ELEM_LEN_I64]], %{{.*}}[1] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)>
 // CHECK:         %[[PREV_PTROFF:.*]] = llvm.mul %[[ELEM_LEN_I64]], %[[C0]]  : i64
 
@@ -2042,9 +2026,7 @@ func.func private @_QPxb(!fir.box<!fir.array<?x?xf64>>)
 // AMDGPU:        %[[AR:.*]] = llvm.alloca %[[ARR_SIZE]] x f64 {bindc_name = "arr"} : (i64) -> !llvm.ptr<5>
 // AMDGPU:        %[[ARR:.*]] = llvm.addrspacecast %[[AR]] : !llvm.ptr<5> to !llvm.ptr
 // CHECK:         %[[TYPE_CODE:.*]] = llvm.mlir.constant(28 : i32) : i32
-// CHECK:         %[[NULL:.*]] = llvm.mlir.zero : !llvm.ptr
-// CHECK:         %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1]
-// CHECK:         %[[ELEM_LEN_I64:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
+// CHECK:         %[[ELEM_LEN_I64:.*]] = llvm.mlir.constant(8 : i64) : i64
 // CHECK:         %[[BOX0:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<2 x array<3 x i64>>)>
 // CHECK:         %[[BOX1:.*]] = llvm.insertvalue %[[ELEM_LEN_I64]], %[[BOX0]][1] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<2 x array<3 x i64>>)>
 // CHECK:         %[[VERSION:.*]] = llvm.mlir.constant(20240719 : i32) : i32
@@ -2126,9 +2108,7 @@ func.func private @_QPtest_dt_callee(%arg0: !fir.box<!fir.array<?xi32>>)
 // CHECK:         %[[C10:.*]] = llvm.mlir.constant(10 : i64) : i64
 // CHECK:         %[[C2:.*]] = llvm.mlir.constant(2 : i64) : i64
 // CHECK:         %[[TYPE_CODE:.*]] = llvm.mlir.constant(9 : i32) : i32
-// CHECK:         %[[NULL:.*]] = llvm.mlir.zero : !llvm.ptr
-// CHECK:         %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1]
-// CHECK:         %[[ELEM_LEN_I64:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
+// CHECK:         %[[ELEM_LEN_I64:.*]] = llvm.mlir.constant(4 : i64) : i64
 // CHECK:         %[[BOX0:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)>
 // CHECK:         %[[BOX1:.*]] = llvm.insertvalue %[[ELEM_LEN_I64]], %[[BOX0]][1] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)>
 // CHECK:         %[[VERSION:.*]] = llvm.mlir.constant(20240719 : i32) : i32
@@ -2146,9 +2126,7 @@ func.func private @_QPtest_dt_callee(%arg0: !fir.box<!fir.array<?xi32>>)
 // CHECK:         %[[BOX6:.*]] = llvm.insertvalue %[[F18ADDENDUM_I8]], %[[BOX5]][6] : !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)>
 // CHECK:         %[[ZERO:.*]] = llvm.mlir.constant(0 : i64) : i64
 // CHECK:         %[[ONE:.*]] = llvm.mlir.constant(1 : i64) : i64
-// CHECK:         %[[ELE_TYPE:.*]] = llvm.mlir.zero : !llvm.ptr
-// CHECK:         %[[GEP_DTYPE_SIZE:.*]] = llvm.getelementptr %[[ELE_TYPE]][1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFtest_dt_sliceTt", (i32, i32)>
-// CHECK:         %[[PTRTOINT_DTYPE_SIZE:.*]] = llvm.ptrtoint %[[GEP_DTYPE_SIZE]] : !llvm.ptr to i64
+// CHECK:         %[[PTRTOINT_DTYPE_SIZE:.*]] = llvm.mlir.constant(8 : i64) : i64
 // CHECK:         %[[ADJUSTED_OFFSET:.*]] = llvm.sub %[[C1]], %[[ONE]]  : i64
 // CHECK:         %[[EXT_SUB:.*]] = llvm.sub %[[C10]], %[[C1]]  : i64
 // CHECK:         %[[EXT_ADD:.*]] = llvm.add %[[EXT_SUB]], %[[C2]]  : i64
@@ -2429,9 +2407,7 @@ func.func @test_rebox_1(%arg0: !fir.box<!fir.array<?x?xf32>>) {
 //CHECK:    %[[SIX:.*]] = llvm.mlir.constant(6 : index) : i64
 //CHECK:    %[[EIGHTY:.*]] = llvm.mlir.constant(80 : index) : i64
 //CHECK:    %[[FLOAT_TYPE:.*]] = llvm.mlir.constant(27 : i32) : i32
-//CHECK:    %[[NULL:.*]] = llvm.mlir.zero : !llvm.ptr
-//CHECK:    %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1]
-//CHECK:    %[[ELEM_SIZE_I64:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
+//CHECK:    %[[ELEM_SIZE_I64:.*]] = llvm.mlir.constant(4 : i64) : i64
 //CHECK:    %[[EXTRA_GEP:.*]] = llvm.getelementptr %[[ARG0]][0, 6] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
 //CHECK:    %[[EXTRA:.*]] = llvm.load %[[EXTRA_GEP]] : !llvm.ptr -> i8
 //CHECK:    %[[RBOX:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
@@ -2504,9 +2480,7 @@ func.func @foo(%arg0: !fir.box<!fir.array<?x!fir.type<t{i:i32,c:!fir.char<1,10>}
 //CHECK:   %[[COMPONENT_OFFSET_1:.*]] = llvm.mlir.constant(1 : i64) : i64
 //CHECK:   %[[ELEM_COUNT:.*]] = llvm.mlir.constant(7 : i64) : i64
 //CHECK:   %[[TYPE_CHAR:.*]] = llvm.mlir.constant(40 : i32) : i32
-//CHECK:   %[[NULL:.*]] = llvm.mlir.zero : !llvm.ptr
-//CHECK:   %[[GEP:.*]] = llvm.getelementptr %[[NULL]][1]
-//CHECK:   %[[CHAR_SIZE:.*]] = llvm.ptrtoint %[[GEP]] : !llvm.ptr to i64
+//CHECK:   %[[CHAR_SIZE:.*]] = llvm.mlir.constant(1 : i64) : i64
 //CHECK:   %[[ELEM_SIZE:.*]] = llvm.mul %[[CHAR_SIZE]], %[[ELEM_COUNT]]
 //CHECK:   %[[EXTRA_GEP:.*]] = llvm.getelementptr %[[ARG0]][0, 6] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)>
 //CHECK:   %[[EXTRA:.*]] = llvm.load %[[EXTRA_GEP]] : !llvm.ptr -> i8
diff --git a/flang/test/Fir/copy-codegen.fir b/flang/test/Fir/copy-codegen.fir
index eef1885c6a49c..7b0620ca2d312 100644
--- a/flang/test/Fir/copy-codegen.fir
+++ b/flang/test/Fir/copy-codegen.fir
@@ -12,10 +12,8 @@ func.func @test_copy_1(%arg0: !fir.ref<!t>, %arg1: !fir.ref<!t>) {
 // CHECK-LABEL:   llvm.func @test_copy_1(
 // CHECK-SAME:                           %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !llvm.ptr,
 // CHECK-SAME:                           %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !llvm.ptr) {
-// CHECK:           %[[VAL_2:.*]] = llvm.mlir.zero : !llvm.ptr
-// CHECK:           %[[VAL_3:.*]] = llvm.getelementptr %[[VAL_2]][1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"sometype", (array<9 x i32>)>
-// CHECK:           %[[VAL_4:.*]] = llvm.ptrtoint %[[VAL_3]] : !llvm.ptr to i64
-// CHECK:           "llvm.intr.memcpy"(%[[VAL_1]], %[[VAL_0]], %[[VAL_4]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
+// CHECK:           %[[VAL_2:.*]] = llvm.mlir.constant(36 : i64) : i64
+// CHECK:           "llvm.intr.memcpy"(%[[VAL_1]], %[[VAL_0]], %[[VAL_2]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
 // CHECK:           llvm.return
 // CHECK:         }
 
@@ -26,10 +24,8 @@ func.func @test_copy_2(%arg0: !fir.ref<!t>, %arg1: !fir.ref<!t>) {
 // CHECK-LABEL:   llvm.func @test_copy_2(
 // CHECK-SAME:                           %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !llvm.ptr,
 // CHECK-SAME:                           %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !llvm.ptr) {
-// CHECK:           %[[VAL_2:.*]] = llvm.mlir.zero : !llvm.ptr
-// CHECK:           %[[VAL_3:.*]] = llvm.getelementptr %[[VAL_2]][1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"sometype", (array<9 x i32>)>
-// CHECK:           %[[VAL_4:.*]] = llvm.ptrtoint %[[VAL_3]] : !llvm.ptr to i64
-// CHECK:           "llvm.intr.memmove"(%[[VAL_1]], %[[VAL_0]], %[[VAL_4]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
+// CHECK:           %[[VAL_2:.*]] = llvm.mlir.constant(36 : i64) : i64
+// CHECK:           "llvm.intr.memmove"(%[[VAL_1]], %[[VAL_0]], %[[VAL_2]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
 // CHECK:           llvm.return
 // CHECK:         }
 }
diff --git a/flang/test/Fir/embox-char.fir b/flang/test/Fir/embox-char.fir
index efb069f96520d..8e40acfdf289f 100644
--- a/flang/test/Fir/embox-char.fir
+++ b/flang/test/Fir/embox-char.fir
@@ -45,9 +45,7 @@
 // CHECK:           %[[VAL_30:.*]] = llvm.load %[[VAL_29]] : !llvm.ptr -> i64
 // CHECK:           %[[VAL_31:.*]] = llvm.sdiv %[[VAL_16]], %[[VAL_13]]  : i64
 // CHECK:           %[[VAL_32:.*]] = llvm.mlir.constant(44 : i32) : i32
-// CHECK:           %[[VAL_33:.*]] = llvm.mlir.zero : !llvm.ptr
-// CHECK:           %[[VAL_34:.*]] = llvm.getelementptr %[[VAL_33]][1] : (!llvm.ptr) -> !llvm.ptr, i32
-// CHECK:           %[[VAL_35:.*]] = llvm.ptrtoint %[[VAL_34]] : !llvm.ptr to i64
+// CHECK:           %[[VAL_35:.*]] = llvm.mlir.constant(4 : i64) : i64
 // CHECK:           %[[VAL_36:.*]] = llvm.mul %[[VAL_35]], %[[VAL_31]] : i64
 // CHECK:           %[[VAL_37:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
 // CHECK:           %[[VAL_38:.*]] = llvm.insertvalue %[[VAL_36]], %[[VAL_37]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
@@ -139,9 +137,7 @@ func.func @test_char4(%arg0: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x!fir.cha
 // CHECK:           %[[VAL_28:.*]] = llvm.getelementptr %[[VAL_10]][0, 7, %[[VAL_11]], 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
 // CHECK:           %[[VAL_29:.*]] = llvm.load %[[VAL_28]] : !llvm.ptr -> i64
 // CHECK:           %[[VAL_30:.*]] = llvm.mlir.constant(40 : i32) : i32
-// CHECK:           %[[VAL_31:.*]] = llvm.mlir.zero : !llvm.ptr
-// CHECK:           %[[VAL_32:.*]] = llvm.getelementptr %[[VAL_31]][1] : (!llvm.ptr) -> !llvm.ptr, i8
-// CHECK:           %[[VAL_33:.*]] = llvm.ptrtoint %[[VAL_32]] : !llvm.ptr to i64
+// CHECK:           %[[VAL_33:.*]] = llvm.mlir.constant(1 : i64) : i64
 // CHECK:           %[[VAL_34:.*]] = llvm.mul %[[VAL_33]], %[[VAL_15]] : i64
 // CHECK:           %[[VAL_35:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
 // CHECK:           %[[VAL_36:.*]] = llvm.insertvalue %[[VAL_34]], %[[VAL_35]][1] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<2 x array<3 x i64>>)>
diff --git a/flang/test/Fir/embox-substring.fir b/flang/test/Fir/embox-substring.fir
index f2042f9bda7fc..6ce6346f89b1d 100644
--- a/flang/test/Fir/embox-substring.fir
+++ b/flang/test/Fir/embox-substring.fir
@@ -29,10 +29,9 @@ func.func private @dump(!fir.box<!fir.array<2x!fir.char<1>>>)
 // CHECK-SAME:                                  %[[VAL_0:.*]]: !llvm.ptr,
 // CHECK-SAME:                                  %[[VAL_1:.*]]: i64) {
 // CHECK:           %[[VAL_5:.*]] = llvm.mlir.constant(1 : index) : i64
-// CHECK:           llvm.getelementptr
-// CHECK:           %[[VAL_28:.*]] = llvm.mlir.zero : !llvm.ptr
-// CHECK:           %[[VAL_29:.*]] = llvm.getelementptr %[[VAL_28]][1] : (!llvm.ptr) -> !llvm.ptr, i8
-// CHECK:           %[[VAL_30:.*]] = llvm.ptrtoint %[[VAL_29]] : !llvm.ptr to i64
+// CHECK:           llvm.mlir.constant(1 : i64) : i64
+// CHECK:           llvm.mlir.constant(1 : i64) : i64
+// CHECK:           %[[VAL_30:.*]] = llvm.mlir.constant(1 : i64) : i64
 // CHECK:           %[[VAL_31:.*]] = llvm.mul %[[VAL_30]], %[[VAL_1]]  : i64
 // CHECK:           %[[VAL_42:.*]] = llvm.mul %[[VAL_31]], %[[VAL_5]]  : i64
 // CHECK:           %[[VAL_43:.*]] = llvm.insertvalue %[[VAL_42]], %{{.*}}[7, 0, 2] : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>

From d71c0b7f45582ece43016eb98367251e54e75280 Mon Sep 17 00:00:00 2001
From: Jean Perier <jperier@nvidia.com>
Date: Fri, 16 May 2025 08:09:37 -0700
Subject: [PATCH 2/4] [flang] translate derived type array init to attribute if
 possible

---
 .../Optimizer/CodeGen/LLVMInsertChainFolder.h |  31 +++
 .../include/flang/Optimizer/Dialect/FIROps.td |   5 +
 flang/lib/Optimizer/CodeGen/CMakeLists.txt    |   1 +
 flang/lib/Optimizer/CodeGen/CodeGen.cpp       |  51 +++--
 .../CodeGen/LLVMInsertChainFolder.cpp         | 204 ++++++++++++++++++
 flang/lib/Optimizer/Dialect/FIROps.cpp        |  15 ++
 .../Fir/convert-and-fold-insert-on-range.fir  |  33 +++
 7 files changed, 319 insertions(+), 21 deletions(-)
 create mode 100644 flang/include/flang/Optimizer/CodeGen/LLVMInsertChainFolder.h
 create mode 100644 flang/lib/Optimizer/CodeGen/LLVMInsertChainFolder.cpp
 create mode 100644 flang/test/Fir/convert-and-fold-insert-on-range.fir

diff --git a/flang/include/flang/Optimizer/CodeGen/LLVMInsertChainFolder.h b/flang/include/flang/Optimizer/CodeGen/LLVMInsertChainFolder.h
new file mode 100644
index 0000000000000..d577c4c0fa70b
--- /dev/null
+++ b/flang/include/flang/Optimizer/CodeGen/LLVMInsertChainFolder.h
@@ -0,0 +1,31 @@
+//===-- LLVMInsertChainFolder.h --  insertvalue chain folder ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Helper to fold LLVM dialect llvm.insertvalue chain representing constants
+// into an Attribute representation.
+// This sits in Flang because it is incomplete and tailored for flang needs.
+//
+//===----------------------------------------------------------------------===//
+
+namespace mlir {
+class Attribute;
+class OpBuilder;
+class Value;
+} // namespace mlir
+
+namespace fir {
+
+/// Attempt to fold an llvm.insertvalue chain into an attribute representation
+/// suitable as llvm.constant operand. The returned value will be a null pointer
+/// if this is not an llvm.insertvalue result pr if the chain is not a constant,
+/// or cannot be represented as an Attribute. The operations are not deleted,
+/// but some llvm.insertvalue value operands may be folded with the builder on
+/// the way.
+mlir::Attribute tryFoldingLLVMInsertChain(mlir::Value insertChainResult,
+                                          mlir::OpBuilder &builder);
+} // namespace fir
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 458b780806144..dc66885f776f0 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -2129,6 +2129,11 @@ def fir_InsertOnRangeOp : fir_OneResultOp<"insert_on_range", [NoMemoryEffect]> {
     $seq `,` $val custom<CustomRangeSubscript>($coor) attr-dict `:` functional-type(operands, results)
   }];
 
+  let extraClassDeclaration = [{
+    /// Is this insert_on_range inserting on all the values of the result type?
+    bool isFullRange();
+  }];
+
   let hasVerifier = 1;
 }
 
diff --git a/flang/lib/Optimizer/CodeGen/CMakeLists.txt b/flang/lib/Optimizer/CodeGen/CMakeLists.txt
index 04480bac552b7..980307db315d9 100644
--- a/flang/lib/Optimizer/CodeGen/CMakeLists.txt
+++ b/flang/lib/Optimizer/CodeGen/CMakeLists.txt
@@ -3,6 +3,7 @@ add_flang_library(FIRCodeGen
   CodeGen.cpp
   CodeGenOpenMP.cpp
   FIROpPatterns.cpp
+  LLVMInsertChainFolder.cpp
   LowerRepackArrays.cpp
   PreCGRewrite.cpp
   TBAABuilder.cpp
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index ad9119ba4a031..ed76a77ced047 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -14,6 +14,7 @@
 
 #include "flang/Optimizer/CodeGen/CodeGenOpenMP.h"
 #include "flang/Optimizer/CodeGen/FIROpPatterns.h"
+#include "flang/Optimizer/CodeGen/LLVMInsertChainFolder.h"
 #include "flang/Optimizer/CodeGen/TypeConverter.h"
 #include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIRCG/CGOps.h"
@@ -2412,15 +2413,38 @@ struct InsertOnRangeOpConversion
   doRewrite(fir::InsertOnRangeOp range, mlir::Type ty, OpAdaptor adaptor,
             mlir::ConversionPatternRewriter &rewriter) const override {
 
-    llvm::SmallVector<std::int64_t> dims;
-    auto type = adaptor.getOperands()[0].getType();
+    auto arrayType = adaptor.getSeq().getType();
 
     // Iteratively extract the array dimensions from the type.
+    llvm::SmallVector<std::int64_t> dims;
+    mlir::Type type = arrayType;
     while (auto t = mlir::dyn_cast<mlir::LLVM::LLVMArrayType>(type)) {
       dims.push_back(t.getNumElements());
       type = t.getElementType();
     }
 
+    // Avoid generating long insert chain that are very slow to fold back
+    // (which is required in globals when later generating LLVM IR). Attempt to
+    // fold the inserted element value to an attribute and build an ArrayAttr
+    // for the resulting array.
+    if (range.isFullRange()) {
+      if (mlir::Attribute cst =
+              fir::tryFoldingLLVMInsertChain(adaptor.getVal(), rewriter)) {
+        mlir::Attribute dimVal = cst;
+        for (auto dim : llvm::reverse(dims)) {
+          // Use std::vector in case the number of elements is big.
+          std::vector<mlir::Attribute> elements(dim, dimVal);
+          dimVal = mlir::ArrayAttr::get(range.getContext(), elements);
+        }
+        // Replace insert chain with constant.
+        rewriter.replaceOpWithNewOp<mlir::LLVM::ConstantOp>(range, arrayType,
+                                                            dimVal);
+        return mlir::success();
+      }
+    }
+
+    // The inserted value cannot be folded to an attribute, turn the
+    // insert_range into an llvm.insertvalue chain.
     llvm::SmallVector<std::int64_t> lBounds;
     llvm::SmallVector<std::int64_t> uBounds;
 
@@ -2434,8 +2458,8 @@ struct InsertOnRangeOpConversion
 
     auto &subscripts = lBounds;
     auto loc = range.getLoc();
-    mlir::Value lastOp = adaptor.getOperands()[0];
-    mlir::Value insertVal = adaptor.getOperands()[1];
+    mlir::Value lastOp = adaptor.getSeq();
+    mlir::Value insertVal = adaptor.getVal();
 
     while (subscripts != uBounds) {
       lastOp = rewriter.create<mlir::LLVM::InsertValueOp>(
@@ -3131,7 +3155,7 @@ struct GlobalOpConversion : public fir::FIROpConversion<fir::GlobalOp> {
       // initialization is on the full range.
       auto insertOnRangeOps = gr.front().getOps<fir::InsertOnRangeOp>();
       for (auto insertOp : insertOnRangeOps) {
-        if (isFullRange(insertOp.getCoor(), insertOp.getType())) {
+        if (insertOp.isFullRange()) {
           auto seqTyAttr = convertType(insertOp.getType());
           auto *op = insertOp.getVal().getDefiningOp();
           auto constant = mlir::dyn_cast<mlir::arith::ConstantOp>(op);
@@ -3161,22 +3185,7 @@ struct GlobalOpConversion : public fir::FIROpConversion<fir::GlobalOp> {
     return mlir::success();
   }
 
-  bool isFullRange(mlir::DenseIntElementsAttr indexes,
-                   fir::SequenceType seqTy) const {
-    auto extents = seqTy.getShape();
-    if (indexes.size() / 2 != static_cast<int64_t>(extents.size()))
-      return false;
-    auto cur_index = indexes.value_begin<int64_t>();
-    for (unsigned i = 0; i < indexes.size(); i += 2) {
-      if (*(cur_index++) != 0)
-        return false;
-      if (*(cur_index++) != extents[i / 2] - 1)
-        return false;
-    }
-    return true;
-  }
-
-  // TODO: String comparaison should be avoided. Replace linkName with an
+  // TODO: String comparisons should be avoided. Replace linkName with an
   // enumeration.
   mlir::LLVM::Linkage
   convertLinkage(std::optional<llvm::StringRef> optLinkage) const {
diff --git a/flang/lib/Optimizer/CodeGen/LLVMInsertChainFolder.cpp b/flang/lib/Optimizer/CodeGen/LLVMInsertChainFolder.cpp
new file mode 100644
index 0000000000000..0fc8697b735cf
--- /dev/null
+++ b/flang/lib/Optimizer/CodeGen/LLVMInsertChainFolder.cpp
@@ -0,0 +1,204 @@
+//===-- LLVMInsertChainFolder.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/CodeGen/LLVMInsertChainFolder.h"
+#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/Builders.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "flang-insert-folder"
+
+#include <deque>
+
+namespace {
+// Helper class to construct the attribute elements of an aggregate value being
+// folded without creating a full mlir::Attribute representation for each step
+// of the insert value chain, which would both be expensive in terms of
+// compilation time and memory (since the intermediate Attribute would survive,
+// unused, inside the mlir context).
+class InsertChainBackwardFolder {
+  // Type for the current value of an element of the aggregate value being
+  // constructed by the insert chain.
+  // At any point of the insert chain, the value of an element is either:
+  //  - nullptr: not yet known, the insert has not yet been seen.
+  //  - an mlir::Attribute: the element is fully defined.
+  //  - a nested InsertChainBackwardFolder: the element is itself an aggregate
+  //    and its sub-elements have been partially defined (insert with mutliple
+  //    indices have been seen).
+
+  // The insertion folder assumes backward walk of the insert chain. Once an
+  // element or sub-element has been defined, it is not overriden by new
+  // insertions (last insert wins).
+  using InFlightValue =
+      llvm::PointerUnion<mlir::Attribute, InsertChainBackwardFolder *>;
+
+public:
+  InsertChainBackwardFolder(
+      mlir::Type type, std::deque<InsertChainBackwardFolder> *folderStorage)
+      : values(getNumElements(type), mlir::Attribute{}),
+        folderStorage{folderStorage}, type{type} {}
+
+  /// Push
+  bool pushValue(mlir::Attribute val, llvm::ArrayRef<int64_t> at);
+
+  mlir::Attribute finalize(mlir::Attribute defaultFieldValue);
+
+private:
+  static int64_t getNumElements(mlir::Type type) {
+    if (auto structTy =
+            llvm::dyn_cast_if_present<mlir::LLVM::LLVMStructType>(type))
+      return structTy.getBody().size();
+    if (auto arrayTy =
+            llvm::dyn_cast_if_present<mlir::LLVM::LLVMArrayType>(type))
+      return arrayTy.getNumElements();
+    return 0;
+  }
+
+  static mlir::Type getSubElementType(mlir::Type type, int64_t field) {
+    if (auto arrayTy =
+            llvm::dyn_cast_if_present<mlir::LLVM::LLVMArrayType>(type))
+      return arrayTy.getElementType();
+    if (auto structTy =
+            llvm::dyn_cast_if_present<mlir::LLVM::LLVMStructType>(type))
+      return structTy.getBody()[field];
+    return {};
+  }
+
+  // Current element value of the aggregate value being built.
+  llvm::SmallVector<InFlightValue> values;
+  // std::deque is used to allocate storage for nested list and guarantee the
+  // stability of the InsertChainBackwardFolder* used as element value.
+  std::deque<InsertChainBackwardFolder> *folderStorage;
+  // Type of the aggregate value being built.
+  mlir::Type type;
+};
+} // namespace
+
+// Helper to fold the value being inserted by an llvm.insert_value.
+// This may call tryFoldingLLVMInsertChain if the value is an aggregate and
+// was itself constructed by a different insert chain.
+static mlir::Attribute getAttrIfConstant(mlir::Value val,
+                                         mlir::OpBuilder &rewriter) {
+  if (auto cst = val.getDefiningOp<mlir::LLVM::ConstantOp>())
+    return cst.getValue();
+  if (auto insert = val.getDefiningOp<mlir::LLVM::InsertValueOp>())
+    return fir::tryFoldingLLVMInsertChain(val, rewriter);
+  if (val.getDefiningOp<mlir::LLVM::ZeroOp>())
+    return mlir::LLVM::ZeroAttr::get(val.getContext());
+  if (val.getDefiningOp<mlir::LLVM::UndefOp>())
+    return mlir::LLVM::UndefAttr::get(val.getContext());
+  if (mlir::Operation *op = val.getDefiningOp()) {
+    unsigned resNum = llvm::cast<mlir::OpResult>(val).getResultNumber();
+    llvm::SmallVector<mlir::Value> results;
+    if (mlir::succeeded(rewriter.tryFold(op, results)) &&
+        results.size() > resNum) {
+      if (auto cst = results[resNum].getDefiningOp<mlir::LLVM::ConstantOp>())
+        return cst.getValue();
+    }
+  }
+  if (auto trunc = val.getDefiningOp<mlir::LLVM::TruncOp>())
+    if (auto attr = getAttrIfConstant(trunc.getArg(), rewriter))
+      if (auto intAttr = llvm::dyn_cast<mlir::IntegerAttr>(attr))
+        return mlir::IntegerAttr::get(trunc.getType(), intAttr.getInt());
+  LLVM_DEBUG(llvm::dbgs() << "cannot fold insert value operand: " << val
+                          << "\n");
+  return {};
+}
+
+mlir::Attribute
+InsertChainBackwardFolder::finalize(mlir::Attribute defaultFieldValue) {
+  std::vector<mlir::Attribute> attrs;
+  attrs.reserve(values.size());
+  for (InFlightValue &inFlight : values) {
+    if (!inFlight) {
+      attrs.push_back(defaultFieldValue);
+    } else if (auto attr = llvm::dyn_cast<mlir::Attribute>(inFlight)) {
+      attrs.push_back(attr);
+    } else {
+      auto *inFlightList = llvm::cast<InsertChainBackwardFolder *>(inFlight);
+      attrs.push_back(inFlightList->finalize(defaultFieldValue));
+    }
+  }
+  return mlir::ArrayAttr::get(type.getContext(), attrs);
+}
+
+bool InsertChainBackwardFolder::pushValue(mlir::Attribute val,
+                                          llvm::ArrayRef<int64_t> at) {
+  if (at.size() == 0 || at[0] >= static_cast<int64_t>(values.size()))
+    return false;
+  InFlightValue &inFlight = values[at[0]];
+  if (!inFlight) {
+    if (at.size() == 1) {
+      inFlight = val;
+      return true;
+    }
+    // This is the first insert to a nested field. Create a
+    // InsertChainBackwardFolder for the current element value.
+    InsertChainBackwardFolder &inFlightList = folderStorage->emplace_back(
+        getSubElementType(type, at[0]), folderStorage);
+    inFlight = &inFlightList;
+    return inFlightList.pushValue(val, at.drop_front());
+  }
+  // Keep last inserted value if already set.
+  if (llvm::isa<mlir::Attribute>(inFlight))
+    return true;
+  auto *inFlightList = llvm::cast<InsertChainBackwardFolder *>(inFlight);
+  if (at.size() == 1) {
+    if (!llvm::isa<mlir::LLVM::ZeroAttr, mlir::LLVM::UndefAttr>(val)) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "insert chain sub-element partially overwritten initial "
+                    "value is not zero or undef\n");
+      return false;
+    }
+    inFlight = inFlightList->finalize(val);
+    return true;
+  }
+  return inFlightList->pushValue(val, at.drop_front());
+}
+
+mlir::Attribute fir::tryFoldingLLVMInsertChain(mlir::Value val,
+                                               mlir::OpBuilder &rewriter) {
+  if (auto cst = val.getDefiningOp<mlir::LLVM::ConstantOp>())
+    return cst.getValue();
+  if (auto insert = val.getDefiningOp<mlir::LLVM::InsertValueOp>()) {
+    LLVM_DEBUG(llvm::dbgs() << "trying to fold insert chain:" << val << "\n");
+    if (auto structTy =
+            llvm::dyn_cast<mlir::LLVM::LLVMStructType>(insert.getType())) {
+      mlir::LLVM::InsertValueOp currentInsert = insert;
+      mlir::LLVM::InsertValueOp lastInsert;
+      std::deque<InsertChainBackwardFolder> folderStorage;
+      InsertChainBackwardFolder inFlightList(structTy, &folderStorage);
+      while (currentInsert) {
+        mlir::Attribute attr =
+            getAttrIfConstant(currentInsert.getValue(), rewriter);
+        if (!attr)
+          return {};
+        if (!inFlightList.pushValue(attr, currentInsert.getPosition()))
+          return {};
+        lastInsert = currentInsert;
+        currentInsert = currentInsert.getContainer()
+                            .getDefiningOp<mlir::LLVM::InsertValueOp>();
+      }
+      mlir::Attribute defaultVal;
+      if (lastInsert) {
+        if (lastInsert.getContainer().getDefiningOp<mlir::LLVM::ZeroOp>())
+          defaultVal = mlir::LLVM::ZeroAttr::get(val.getContext());
+        else if (lastInsert.getContainer().getDefiningOp<mlir::LLVM::UndefOp>())
+          defaultVal = mlir::LLVM::UndefAttr::get(val.getContext());
+      }
+      if (!defaultVal) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "insert chain initial value is not Zero or Undef\n");
+        return {};
+      }
+      return inFlightList.finalize(defaultVal);
+    }
+  }
+  return {};
+}
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index d85b38c467857..e12af7782a578 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -2365,6 +2365,21 @@ llvm::LogicalResult fir::InsertOnRangeOp::verify() {
   return mlir::success();
 }
 
+bool fir::InsertOnRangeOp::isFullRange() {
+  auto extents = getType().getShape();
+  mlir::DenseIntElementsAttr indexes = getCoor();
+  if (indexes.size() / 2 != static_cast<int64_t>(extents.size()))
+    return false;
+  auto cur_index = indexes.value_begin<int64_t>();
+  for (unsigned i = 0; i < indexes.size(); i += 2) {
+    if (*(cur_index++) != 0)
+      return false;
+    if (*(cur_index++) != extents[i / 2] - 1)
+      return false;
+  }
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // InsertValueOp
 //===----------------------------------------------------------------------===//
diff --git a/flang/test/Fir/convert-and-fold-insert-on-range.fir b/flang/test/Fir/convert-and-fold-insert-on-range.fir
new file mode 100644
index 0000000000000..df18614d80b63
--- /dev/null
+++ b/flang/test/Fir/convert-and-fold-insert-on-range.fir
@@ -0,0 +1,33 @@
+// Test codegen of constant insert_on_range without symbol reference into mlir.constant.
+// RUN: fir-opt --cg-rewrite --split-input-file --fir-to-llvm-ir %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<!llvm.ptr<270> = dense<32> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, f80 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.mangling_mode" = "e", "dlti.stack_alignment" = 128 : i64>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  fir.global @derived_array : !fir.array<2x!fir.type<sometype{comp:!fir.box<!fir.heap<!fir.array<?xf64>>>}>> {
+    %c0 = arith.constant 0 : index
+    %0 = fir.undefined !fir.type<sometype{comp:!fir.box<!fir.heap<!fir.array<?xf64>>>}>
+    %1 = fir.zero_bits !fir.heap<!fir.array<?xf64>>
+    %2 = fir.shape %c0 : (index) -> !fir.shape<1>
+    %3 = fir.embox %1(%2) : (!fir.heap<!fir.array<?xf64>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf64>>>
+    %4 = fir.insert_value %0, %3, ["comp", !fir.type<sometype{comp:!fir.box<!fir.heap<!fir.array<?xf64>>>}>] : (!fir.type<sometype{comp:!fir.box<!fir.heap<!fir.array<?xf64>>>}>, !fir.box<!fir.heap<!fir.array<?xf64>>>) -> !fir.type<sometype{comp:!fir.box<!fir.heap<!fir.array<?xf64>>>}>
+    %5 = fir.undefined !fir.array<2x!fir.type<sometype{comp:!fir.box<!fir.heap<!fir.array<?xf64>>>}>>
+    %6 = fir.insert_on_range %5, %4 from (0) to (1) : (!fir.array<2x!fir.type<sometype{comp:!fir.box<!fir.heap<!fir.array<?xf64>>>}>>, !fir.type<sometype{comp:!fir.box<!fir.heap<!fir.array<?xf64>>>}>) -> !fir.array<2x!fir.type<sometype{comp:!fir.box<!fir.heap<!fir.array<?xf64>>>}>>
+    fir.has_value %6 : !fir.array<2x!fir.type<sometype{comp:!fir.box<!fir.heap<!fir.array<?xf64>>>}>>
+  }
+}
+
+//CHECK-LABEL: llvm.mlir.global external @derived_array()
+//CHECK:    %[[CST:.*]] = llvm.mlir.constant([
+//CHECK-SAME:  [
+//CHECK-SAME:    [#llvm.zero, 8, 20240719 : i32, 1 : i8, 28 : i8, 2 : i8, 0 : i8,
+//CHECK-SAME:      [
+//CHECK-SAME:        [1, 0 : index, 8]
+//CHECK-SAME:      ]
+//CHECK-SAME:    ],
+//CHECK-SAME:  [
+//CHECK-SAME:    [#llvm.zero, 8, 20240719 : i32, 1 : i8, 28 : i8, 2 : i8, 0 : i8,
+//CHECK-SAME:      [
+//CHECK-SAME:        [1, 0 : index, 8]
+//CHECK-SAME:      ]
+//CHECK-SAME:    ]) :
+//CHECK-SAME: !llvm.array<2 x struct<"sometype", (struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>)>>
+//CHECK:    llvm.return %[[CST]] : !llvm.array<2 x struct<"sometype", (struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>)>>

From 796a1e0269baf1c77ffabf47a8fa155356bc9096 Mon Sep 17 00:00:00 2001
From: Jean Perier <jperier@nvidia.com>
Date: Mon, 19 May 2025 01:37:14 -0700
Subject: [PATCH 3/4] use map_to_vector and FailureOr

---
 .../Optimizer/CodeGen/LLVMInsertChainFolder.h |  7 ++-
 flang/lib/Optimizer/CodeGen/CodeGen.cpp       |  7 +--
 .../CodeGen/LLVMInsertChainFolder.cpp         | 54 ++++++++++---------
 3 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/flang/include/flang/Optimizer/CodeGen/LLVMInsertChainFolder.h b/flang/include/flang/Optimizer/CodeGen/LLVMInsertChainFolder.h
index d577c4c0fa70b..321bda91aa6fe 100644
--- a/flang/include/flang/Optimizer/CodeGen/LLVMInsertChainFolder.h
+++ b/flang/include/flang/Optimizer/CodeGen/LLVMInsertChainFolder.h
@@ -12,6 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Support/LogicalResult.h"
+
 namespace mlir {
 class Attribute;
 class OpBuilder;
@@ -26,6 +28,7 @@ namespace fir {
 /// or cannot be represented as an Attribute. The operations are not deleted,
 /// but some llvm.insertvalue value operands may be folded with the builder on
 /// the way.
-mlir::Attribute tryFoldingLLVMInsertChain(mlir::Value insertChainResult,
-                                          mlir::OpBuilder &builder);
+llvm::FailureOr<mlir::Attribute>
+tryFoldingLLVMInsertChain(mlir::Value insertChainResult,
+                          mlir::OpBuilder &builder);
 } // namespace fir
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index ed76a77ced047..70c90fae34086 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -2428,9 +2428,10 @@ struct InsertOnRangeOpConversion
     // fold the inserted element value to an attribute and build an ArrayAttr
     // for the resulting array.
     if (range.isFullRange()) {
-      if (mlir::Attribute cst =
-              fir::tryFoldingLLVMInsertChain(adaptor.getVal(), rewriter)) {
-        mlir::Attribute dimVal = cst;
+      llvm::FailureOr<mlir::Attribute> cst =
+          fir::tryFoldingLLVMInsertChain(adaptor.getVal(), rewriter);
+      if (llvm::succeeded(cst)) {
+        mlir::Attribute dimVal = *cst;
         for (auto dim : llvm::reverse(dims)) {
           // Use std::vector in case the number of elements is big.
           std::vector<mlir::Attribute> elements(dim, dimVal);
diff --git a/flang/lib/Optimizer/CodeGen/LLVMInsertChainFolder.cpp b/flang/lib/Optimizer/CodeGen/LLVMInsertChainFolder.cpp
index 0fc8697b735cf..5b522f2647916 100644
--- a/flang/lib/Optimizer/CodeGen/LLVMInsertChainFolder.cpp
+++ b/flang/lib/Optimizer/CodeGen/LLVMInsertChainFolder.cpp
@@ -67,7 +67,7 @@ class InsertChainBackwardFolder {
     if (auto structTy =
             llvm::dyn_cast_if_present<mlir::LLVM::LLVMStructType>(type))
       return structTy.getBody()[field];
-    return {};
+    return nullptr;
   }
 
   // Current element value of the aggregate value being built.
@@ -83,12 +83,18 @@ class InsertChainBackwardFolder {
 // Helper to fold the value being inserted by an llvm.insert_value.
 // This may call tryFoldingLLVMInsertChain if the value is an aggregate and
 // was itself constructed by a different insert chain.
+// Returns a nullptr Attribute if the value could not be folded.
 static mlir::Attribute getAttrIfConstant(mlir::Value val,
                                          mlir::OpBuilder &rewriter) {
   if (auto cst = val.getDefiningOp<mlir::LLVM::ConstantOp>())
     return cst.getValue();
-  if (auto insert = val.getDefiningOp<mlir::LLVM::InsertValueOp>())
-    return fir::tryFoldingLLVMInsertChain(val, rewriter);
+  if (auto insert = val.getDefiningOp<mlir::LLVM::InsertValueOp>()) {
+    llvm::FailureOr<mlir::Attribute> attr =
+        fir::tryFoldingLLVMInsertChain(val, rewriter);
+    if (succeeded(attr))
+      return *attr;
+    return nullptr;
+  }
   if (val.getDefiningOp<mlir::LLVM::ZeroOp>())
     return mlir::LLVM::ZeroAttr::get(val.getContext());
   if (val.getDefiningOp<mlir::LLVM::UndefOp>())
@@ -108,23 +114,20 @@ static mlir::Attribute getAttrIfConstant(mlir::Value val,
         return mlir::IntegerAttr::get(trunc.getType(), intAttr.getInt());
   LLVM_DEBUG(llvm::dbgs() << "cannot fold insert value operand: " << val
                           << "\n");
-  return {};
+  return nullptr;
 }
 
 mlir::Attribute
 InsertChainBackwardFolder::finalize(mlir::Attribute defaultFieldValue) {
-  std::vector<mlir::Attribute> attrs;
-  attrs.reserve(values.size());
-  for (InFlightValue &inFlight : values) {
-    if (!inFlight) {
-      attrs.push_back(defaultFieldValue);
-    } else if (auto attr = llvm::dyn_cast<mlir::Attribute>(inFlight)) {
-      attrs.push_back(attr);
-    } else {
-      auto *inFlightList = llvm::cast<InsertChainBackwardFolder *>(inFlight);
-      attrs.push_back(inFlightList->finalize(defaultFieldValue));
-    }
-  }
+  llvm::SmallVector<mlir::Attribute> attrs = llvm::map_to_vector(
+      values, [&](InFlightValue inFlight) -> mlir::Attribute {
+        if (!inFlight)
+          return defaultFieldValue;
+        if (auto attr = llvm::dyn_cast<mlir::Attribute>(inFlight))
+          return attr;
+        return llvm::cast<InsertChainBackwardFolder *>(inFlight)->finalize(
+            defaultFieldValue);
+      });
   return mlir::ArrayAttr::get(type.getContext(), attrs);
 }
 
@@ -140,8 +143,11 @@ bool InsertChainBackwardFolder::pushValue(mlir::Attribute val,
     }
     // This is the first insert to a nested field. Create a
     // InsertChainBackwardFolder for the current element value.
-    InsertChainBackwardFolder &inFlightList = folderStorage->emplace_back(
-        getSubElementType(type, at[0]), folderStorage);
+    mlir::Type subType = getSubElementType(type, at[0]);
+    if (!subType)
+      return false;
+    InsertChainBackwardFolder &inFlightList =
+        folderStorage->emplace_back(subType, folderStorage);
     inFlight = &inFlightList;
     return inFlightList.pushValue(val, at.drop_front());
   }
@@ -162,8 +168,8 @@ bool InsertChainBackwardFolder::pushValue(mlir::Attribute val,
   return inFlightList->pushValue(val, at.drop_front());
 }
 
-mlir::Attribute fir::tryFoldingLLVMInsertChain(mlir::Value val,
-                                               mlir::OpBuilder &rewriter) {
+llvm::FailureOr<mlir::Attribute>
+fir::tryFoldingLLVMInsertChain(mlir::Value val, mlir::OpBuilder &rewriter) {
   if (auto cst = val.getDefiningOp<mlir::LLVM::ConstantOp>())
     return cst.getValue();
   if (auto insert = val.getDefiningOp<mlir::LLVM::InsertValueOp>()) {
@@ -178,9 +184,9 @@ mlir::Attribute fir::tryFoldingLLVMInsertChain(mlir::Value val,
         mlir::Attribute attr =
             getAttrIfConstant(currentInsert.getValue(), rewriter);
         if (!attr)
-          return {};
+          return llvm::failure();
         if (!inFlightList.pushValue(attr, currentInsert.getPosition()))
-          return {};
+          return llvm::failure();
         lastInsert = currentInsert;
         currentInsert = currentInsert.getContainer()
                             .getDefiningOp<mlir::LLVM::InsertValueOp>();
@@ -195,10 +201,10 @@ mlir::Attribute fir::tryFoldingLLVMInsertChain(mlir::Value val,
       if (!defaultVal) {
         LLVM_DEBUG(llvm::dbgs()
                    << "insert chain initial value is not Zero or Undef\n");
-        return {};
+        return llvm::failure();
       }
       return inFlightList.finalize(defaultVal);
     }
   }
-  return {};
+  return llvm::failure();
 }

From 181985e213289b1f0caa4d81599e07fbec067fb5 Mon Sep 17 00:00:00 2001
From: jeanPerier <jperier@nvidia.com>
Date: Tue, 20 May 2025 10:47:35 +0200
Subject: [PATCH 4/4] Apply suggestions from code review

Thanks Slava!

Co-authored-by: Slava Zakharin <szakharin@nvidia.com>
---
 flang/include/flang/Optimizer/CodeGen/LLVMInsertChainFolder.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flang/include/flang/Optimizer/CodeGen/LLVMInsertChainFolder.h b/flang/include/flang/Optimizer/CodeGen/LLVMInsertChainFolder.h
index 321bda91aa6fe..36360dc77d588 100644
--- a/flang/include/flang/Optimizer/CodeGen/LLVMInsertChainFolder.h
+++ b/flang/include/flang/Optimizer/CodeGen/LLVMInsertChainFolder.h
@@ -23,8 +23,8 @@ class Value;
 namespace fir {
 
 /// Attempt to fold an llvm.insertvalue chain into an attribute representation
-/// suitable as llvm.constant operand. The returned value will be a null pointer
-/// if this is not an llvm.insertvalue result pr if the chain is not a constant,
+/// suitable as llvm.constant operand. The returned value will be llvm::Failure
+/// if this is not an llvm.insertvalue result or if the chain is not a constant,
 /// or cannot be represented as an Attribute. The operations are not deleted,
 /// but some llvm.insertvalue value operands may be folded with the builder on
 /// the way.