[OpenMP][MLIR] Fix threadprivate lowering when compiling for target when target operations are in use (#119310)

agozillon · web-flow · commit fa56e8bb6451 · 2025-01-03T18:01:01.000+01:00
Currently the compiler will ICE in programs like the following on the
device lowering pass:

```
program main
    implicit none

    type i1_t
       integer :: val(1000)
    end type i1_t
    integer :: i
    type(i1_t), pointer :: newi1
    type(i1_t), pointer :: tab=&gt;null()

    integer, dimension(:), pointer :: tabval

!$omp THREADPRIVATE(tab)

allocate(newi1)

tab=&gt;newi1
tab%val(:)=1
tabval=&gt;tab%val

!$omp target teams distribute parallel do
  do i = 1, 1000
   tabval(i) = i
 end do
!$omp end target teams distribute parallel do

end program main
```

This is due to the fact that THREADPRIVATE returns a result operation,
and this operation can actually be used by other LLVM dialect (or other
dialect) operations. However, we currently skip the lowering of
threadprivate, so we effectively never generate and bind an LLVM-IR
result to the threadprivate operation result. So when we later go on to
lower dependent LLVM dialect operations, we are missing the required
LLVM-IR result, try to access and use it and then ICE. The fix in this
particular PR is to allow compilation of threadprivate for device as
well as host, and simply treat the device compilation as a no-op,
binding the LLVM-IR result of threadprivate with no alterations and
binding it, which will allow the rest of the compilation to proceed,
where we'll eventually discard the host segment in any case.

The other possible solution to this I can think of, is doing something
similar to Flang's passes that occur prior to CodeGen to the LLVM
dialect, where they erase/no-op certain unrequired operations or
transform them to lower level series of operations. And we would
erase/no-op threadprivate on device as we'd never have these in target
regions.

The main issues I can see with this are that we currently do not
specialise this stage based on wether we're compiling for device or
host, so it's setting a precedent and adding another point of having to
understand the separation between target and host compilation. I am also
not sure we'd necessarily want to enforce this at a dialect level incase
someone else wishes to add a different lowering flow or translation
flow. Another possible issue is that a target operation we have/utilise
would depend on the result of threadprivate, meaning we'd not be allowed
to entirely erase/no-op it, I am not sure of any situations where this
may be an issue currently though.
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2588,31 +2588,39 @@ static LogicalResult
 convertOmpThreadprivate(Operation &opInst, llvm::IRBuilderBase &builder,
                         LLVM::ModuleTranslation &moduleTranslation) {
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
   auto threadprivateOp = cast<omp::ThreadprivateOp>(opInst);
 
   if (failed(checkImplementationStatus(opInst)))
     return failure();
 
   Value symAddr = threadprivateOp.getSymAddr();
   auto *symOp = symAddr.getDefiningOp();
+
+  if (auto asCast = dyn_cast<LLVM::AddrSpaceCastOp>(symOp))
+    symOp = asCast.getOperand().getDefiningOp();
+
   if (!isa<LLVM::AddressOfOp>(symOp))
     return opInst.emitError("Addressing symbol not found");
   LLVM::AddressOfOp addressOfOp = dyn_cast<LLVM::AddressOfOp>(symOp);
 
   LLVM::GlobalOp global =
       addressOfOp.getGlobal(moduleTranslation.symbolTable());
   llvm::GlobalValue *globalValue = moduleTranslation.lookupGlobal(global);
-  llvm::Type *type = globalValue->getValueType();
-  llvm::TypeSize typeSize =
-      builder.GetInsertBlock()->getModule()->getDataLayout().getTypeStoreSize(
-          type);
-  llvm::ConstantInt *size = builder.getInt64(typeSize.getFixedValue());
-  llvm::StringRef suffix = llvm::StringRef(".cache", 6);
-  std::string cacheName = (Twine(global.getSymName()).concat(suffix)).str();
-  llvm::Value *callInst =
-      moduleTranslation.getOpenMPBuilder()->createCachedThreadPrivate(
-          ompLoc, globalValue, size, cacheName);
-  moduleTranslation.mapValue(opInst.getResult(0), callInst);
+
+  if (!ompBuilder->Config.isTargetDevice()) {
+    llvm::Type *type = globalValue->getValueType();
+    llvm::TypeSize typeSize =
+        builder.GetInsertBlock()->getModule()->getDataLayout().getTypeStoreSize(
+            type);
+    llvm::ConstantInt *size = builder.getInt64(typeSize.getFixedValue());
+    llvm::Value *callInst = ompBuilder->createCachedThreadPrivate(
+        ompLoc, globalValue, size, global.getSymName() + ".cache");
+    moduleTranslation.mapValue(opInst.getResult(0), callInst);
+  } else {
+    moduleTranslation.mapValue(opInst.getResult(0), globalValue);
+  }
+
   return success();
 }
 
@@ -4212,6 +4220,14 @@ static bool isTargetDeviceOp(Operation *op) {
   if (op->getParentOfType<omp::TargetOp>())
     return true;
 
+  // Certain operations return results, and whether utilised in host or
+  // target there is a chance an LLVM Dialect operation depends on it
+  // by taking it in as an operand, so we must always lower these in
+  // some manner or result in an ICE (whether they end up in a no-op
+  // or otherwise).
+  if (mlir::isa<omp::ThreadprivateOp>(op))
+    return true;
+
   if (auto parentFn = op->getParentOfType<LLVM::LLVMFuncOp>())
     if (auto declareTargetIface =
             llvm::dyn_cast<mlir::omp::DeclareTargetInterface>(
diff --git a/mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir b/mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir
@@ -0,0 +1,30 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Not intended to be a functional example, the aim of this test is to verify
+// omp.threadprivate does not crash on lowering during the OpenMP target device
+// pass when used in conjunction with target code in the same module.
+
+module attributes {omp.is_target_device = true } {
+  llvm.func @func() attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>} {
+    %0 = llvm.mlir.addressof @_QFEpointer2 : !llvm.ptr
+    %1 = omp.threadprivate %0 : !llvm.ptr -> !llvm.ptr
+    %2 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(implicit, to) capture(ByRef) -> !llvm.ptr
+    omp.target map_entries(%2 -> %arg0 : !llvm.ptr) {
+      %3 = llvm.mlir.constant(1 : i32) : i32
+      %4 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+      llvm.store %3, %4 : i32, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
+  }
+   llvm.mlir.global internal @_QFEpointer2() {addr_space = 0 : i32} : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {
+    %0 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    llvm.return %0 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+  }
+}
+
+// CHECK: define weak_odr protected void @{{.*}}(ptr %{{.*}}, ptr %[[ARG1:.*]]) {
+// CHECK:  %[[ALLOCA:.*]] = alloca ptr, align 8
+// CHECK:  store ptr %[[ARG1]], ptr %[[ALLOCA]], align 8
+// CHECK:  %[[LOAD_ALLOCA:.*]] = load ptr, ptr %[[ALLOCA]], align 8
+// CHECK:  store i32 1, ptr %[[LOAD_ALLOCA]], align 4
diff --git a/offload/test/offloading/fortran/target-with-threadprivate.f90 b/offload/test/offloading/fortran/target-with-threadprivate.f90
@@ -0,0 +1,37 @@
+! Basic offloading test that makes sure we can use the predominantly host
+! pragma threadprivate in the same program as target code
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    implicit none
+
+    type dtype
+       integer :: val(10)
+    end type dtype
+
+    integer :: i
+    type(dtype), pointer :: pointer1
+    type(dtype), pointer :: pointer2=>null()
+    integer, dimension(:), pointer :: data_pointer
+
+!$omp threadprivate(pointer2)
+
+nullify(pointer1)
+allocate(pointer1)
+
+pointer2=>pointer1
+pointer2%val(:)=1
+data_pointer=>pointer2%val
+
+!$omp target
+   do i = 1, 10
+     data_pointer(i) = i
+   end do
+!$omp end target
+
+print *, data_pointer
+
+end program main
+
+! CHECK: 1 2 3 4 5 6 7 8 9 10