Skip to content

Commit 71909ba

Browse files
committed
Merge branch 'main' into review/benchmarks-refactor-sequence-containers
2 parents 67764e1 + d35d7f4 commit 71909ba

File tree

9 files changed

+133
-40
lines changed

9 files changed

+133
-40
lines changed

.github/workflows/build-ci-container.yml

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,19 +36,22 @@ jobs:
3636
tag=`date +%s`
3737
container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/ci-ubuntu-22.04"
3838
echo "container-name=$container_name" >> $GITHUB_OUTPUT
39+
echo "container-name-agent=$container_name-agent" >> $GITHUB_OUTPUT
3940
echo "container-name-tag=$container_name:$tag" >> $GITHUB_OUTPUT
41+
echo "container-name-agent-tag=$container_name-agent:$tag" >> $GITHUB_OUTPUT
4042
echo "container-filename=$(echo $container_name:$tag | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT
4143
- name: Build container
4244
working-directory: ./.github/workflows/containers/github-action-ci/
4345
run: |
44-
podman build -t ${{ steps.vars.outputs.container-name-tag }} .
46+
podman build --target ci-container -t ${{ steps.vars.outputs.container-name-tag }} .
47+
podman build --target ci-container-agent -t ${{ steps.vars.outputs.container-name-agent-tag }} .
4548
4649
# Save the container so we have it in case the push fails. This also
4750
# allows us to separate the push step into a different job so we can
4851
# maintain minimal permissions while building the container.
4952
- name: Save container image
5053
run: |
51-
podman save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }}
54+
podman save ${{ steps.vars.outputs.container-name-tag }} ${{ steps.vars.outputs.container-name-agent-tag }} > ${{ steps.vars.outputs.container-filename }}
5255
5356
- name: Upload container image
5457
uses: actions/upload-artifact@v4
@@ -86,3 +89,7 @@ jobs:
8689
podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io
8790
podman push ${{ needs.build-ci-container.outputs.container-name-tag }}
8891
podman push ${{ needs.build-ci-container.outputs.container-name }}:latest
92+
93+
podman tag ${{ needs.build-ci-container.outputs.container-name-agent-tag }} ${{ needs.build-ci-container.outputs.container-name-agent }}:latest
94+
podman push ${{ needs.build-ci-container.outputs.container-name-agent-tag }}
95+
podman push ${{ needs.build-ci-container.outputs.container-name-agent }}:latest

.github/workflows/containers/github-action-ci/Dockerfile

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ RUN apt-get update && \
1313
ninja-build \
1414
python3 \
1515
git \
16-
curl
16+
curl \
17+
zlib1g-dev
1718

1819
RUN curl -O -L https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-$LLVM_VERSION.tar.gz && tar -xf llvmorg-$LLVM_VERSION.tar.gz
1920

@@ -38,7 +39,7 @@ RUN cmake -B ./build -G Ninja ./llvm \
3839

3940
RUN ninja -C ./build stage2-clang-bolt stage2-install-distribution && ninja -C ./build install-distribution
4041

41-
FROM base
42+
FROM base as ci-container
4243

4344
COPY --from=stage1-toolchain $LLVM_SYSROOT $LLVM_SYSROOT
4445

@@ -91,4 +92,15 @@ RUN adduser gha sudo
9192
RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
9293

9394
USER gha
95+
WORKDIR /home/gha
96+
97+
FROM ci-container as ci-container-agent
98+
99+
ENV GITHUB_RUNNER_VERSION=2.321.0
100+
101+
RUN mkdir actions-runner && \
102+
cd actions-runner && \
103+
curl -O -L https://github.com/actions/runner/releases/download/v$GITHUB_RUNNER_VERSION/actions-runner-linux-x64-$GITHUB_RUNNER_VERSION.tar.gz && \
104+
tar xzf ./actions-runner-linux-x64-$GITHUB_RUNNER_VERSION.tar.gz && \
105+
rm ./actions-runner-linux-x64-$GITHUB_RUNNER_VERSION.tar.gz
94106

libcxx/include/future

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1840,7 +1840,7 @@ class _LIBCPP_HIDDEN __async_func {
18401840
tuple<_Fp, _Args...> __f_;
18411841

18421842
public:
1843-
using _Rp = __invoke_result_t<_Fp, _Args...>;
1843+
using _Rp _LIBCPP_NODEBUG = __invoke_result_t<_Fp, _Args...>;
18441844

18451845
_LIBCPP_HIDE_FROM_ABI explicit __async_func(_Fp&& __f, _Args&&... __args)
18461846
: __f_(std::move(__f), std::move(__args)...) {}

llvm/Maintainers.md

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -333,10 +333,10 @@ [email protected] (email), [kuhar](https://github.com/kuhar) (GitHub)
333333
Peter Collingbourne \
334334
[email protected] (email), [pcc](https://github.com/pcc) (GitHub)
335335

336-
#### CMake and library layering
336+
#### CMake
337337

338-
Chandler Carruth \
339-
[email protected], chandlerc@google.com (email), [chandlerc](https://github.com/chandlerc) (GitHub)
338+
Petr Hosek \
339+
phosek@google.com (email), [petrhosek](https://github.com/petrhosek) (GitHub)
340340

341341
#### Debug info and DWARF
342342

@@ -356,6 +356,11 @@ [email protected] (email), [echristo](https://github.com/echristo) (GitHub)
356356
Teresa Johnson \
357357
[email protected] (email), [teresajohnson](https://github.com/teresajohnson) (GitHub)
358358

359+
#### Library layering
360+
361+
Takumi Nakamura \
362+
[email protected] (email), [chapuni](https://github.com/chapuni) (GitHub)
363+
359364
#### MCJIT, Orc, RuntimeDyld, PerfJITEvents
360365

361366
Lang Hames \
@@ -477,7 +482,7 @@ [email protected] (email), [lattner](https://github.com/lattner) (GitHub), clattn
477482

478483
Paul C. Anagnostopoulos ([email protected], [Paul-C-Anagnostopoulos](https://github.com/Paul-C-Anagnostopoulos)) -- TableGen \
479484
Justin Bogner ([email protected], [bogner](https://github.com/bogner)) -- SelectionDAG \
480-
Chandler Carruth ([email protected], [email protected], [chandlerc](https://github.com/chandlerc)) -- ADT, Support, Inlining \
485+
Chandler Carruth ([email protected], [email protected], [chandlerc](https://github.com/chandlerc)) -- ADT, Support, Inlining, CMake and library layering \
481486
Peter Collingbourne ([email protected], [pcc](https://github.com/pcc)) -- LTO \
482487
Evan Cheng ([email protected]) -- Parts of code generator not covered by someone else \
483488
Jake Ehrlich ([email protected], [jakehehrlich](https://github.com/jakehehrlich)) -- llvm-objcopy and ObjCopy library \

llvm/include/llvm/CodeGen/LiveRegMatrix.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ class LiveRegMatrix {
161161
/// Use MCRegUnitIterator to enumerate all regunits in the desired PhysReg.
162162
/// This returns a reference to an internal Query data structure that is only
163163
/// valid until the next query() call.
164-
LiveIntervalUnion::Query &query(const LiveRange &LR, MCRegister RegUnit);
164+
LiveIntervalUnion::Query &query(const LiveRange &LR, MCRegUnit RegUnit);
165165

166166
/// Directly access the live interval unions per regunit.
167167
/// This returns an array indexed by the regunit number.

llvm/lib/CodeGen/LiveRegMatrix.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ bool LiveRegMatrix::checkRegUnitInterference(const LiveInterval &VirtReg,
184184
}
185185

186186
LiveIntervalUnion::Query &LiveRegMatrix::query(const LiveRange &LR,
187-
MCRegister RegUnit) {
187+
MCRegUnit RegUnit) {
188188
LiveIntervalUnion::Query &Q = Queries[RegUnit];
189189
Q.init(UserTag, LR, Matrix[RegUnit]);
190190
return Q;
@@ -206,7 +206,7 @@ LiveRegMatrix::checkInterference(const LiveInterval &VirtReg,
206206

207207
// Check the matrix for virtual register interference.
208208
bool Interference = foreachUnit(TRI, VirtReg, PhysReg,
209-
[&](MCRegister Unit, const LiveRange &LR) {
209+
[&](MCRegUnit Unit, const LiveRange &LR) {
210210
return query(LR, Unit).checkInterference();
211211
});
212212
if (Interference)

mlir/lib/Conversion/GPUCommon/OpToFuncCallLowering.h

Lines changed: 54 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,16 @@
1616

1717
namespace mlir {
1818

19-
/// Rewriting that replace SourceOp with a CallOp to `f32Func` or `f64Func` or
20-
/// `f32ApproxFunc` or `f16Func` depending on the element type and the
21-
/// fastMathFlag of that Op. The function declaration is added in case it was
22-
/// not added before.
19+
namespace {
20+
/// Detection trait tor the `getFastmath` instance method.
21+
template <typename T>
22+
using has_get_fastmath_t = decltype(std::declval<T>().getFastmath());
23+
} // namespace
24+
25+
/// Rewriting that replaces SourceOp with a CallOp to `f32Func` or `f64Func` or
26+
/// `f32ApproxFunc` or `f16Func` or `i32Type` depending on the element type and
27+
/// the fastMathFlag of that Op, if present. The function declaration is added
28+
/// in case it was not added before.
2329
///
2430
/// If the input values are of bf16 type (or f16 type if f16Func is empty), the
2531
/// value is first casted to f32, the function called and then the result casted
@@ -39,14 +45,22 @@ namespace mlir {
3945
///
4046
/// will be transformed into
4147
/// llvm.call @__nv_fast_expf(%arg_f32) : (f32) -> f32
48+
///
49+
/// Final example with NVVM:
50+
/// %pow_f32 = math.fpowi %arg_f32, %arg_i32
51+
///
52+
/// will be transformed into
53+
/// llvm.call @__nv_powif(%arg_f32, %arg_i32) : (f32, i32) -> f32
4254
template <typename SourceOp>
4355
struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
4456
public:
4557
explicit OpToFuncCallLowering(const LLVMTypeConverter &lowering,
4658
StringRef f32Func, StringRef f64Func,
47-
StringRef f32ApproxFunc, StringRef f16Func)
59+
StringRef f32ApproxFunc, StringRef f16Func,
60+
StringRef i32Func = "")
4861
: ConvertOpToLLVMPattern<SourceOp>(lowering), f32Func(f32Func),
49-
f64Func(f64Func), f32ApproxFunc(f32ApproxFunc), f16Func(f16Func) {}
62+
f64Func(f64Func), f32ApproxFunc(f32ApproxFunc), f16Func(f16Func),
63+
i32Func(i32Func) {}
5064

5165
LogicalResult
5266
matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor,
@@ -76,9 +90,8 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
7690

7791
Type resultType = castedOperands.front().getType();
7892
Type funcType = getFunctionType(resultType, castedOperands);
79-
StringRef funcName =
80-
getFunctionName(cast<LLVM::LLVMFunctionType>(funcType).getReturnType(),
81-
op.getFastmath());
93+
StringRef funcName = getFunctionName(
94+
cast<LLVM::LLVMFunctionType>(funcType).getReturnType(), op);
8295
if (funcName.empty())
8396
return failure();
8497

@@ -91,14 +104,15 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
91104
return success();
92105
}
93106

107+
assert(callOp.getResult().getType().isF32() &&
108+
"only f32 types are supposed to be truncated back");
94109
Value truncated = rewriter.create<LLVM::FPTruncOp>(
95110
op->getLoc(), adaptor.getOperands().front().getType(),
96111
callOp.getResult());
97112
rewriter.replaceOp(op, {truncated});
98113
return success();
99114
}
100115

101-
private:
102116
Value maybeCast(Value operand, PatternRewriter &rewriter) const {
103117
Type type = operand.getType();
104118
if (!isa<Float16Type, BFloat16Type>(type))
@@ -117,38 +131,50 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
117131
return LLVM::LLVMFunctionType::get(resultType, operandTypes);
118132
}
119133

120-
StringRef getFunctionName(Type type, arith::FastMathFlags flag) const {
121-
if (isa<Float16Type>(type))
122-
return f16Func;
123-
if (isa<Float32Type>(type)) {
124-
if (((uint32_t)arith::FastMathFlags::afn & (uint32_t)flag) &&
125-
!f32ApproxFunc.empty())
126-
return f32ApproxFunc;
127-
else
128-
return f32Func;
129-
}
130-
if (isa<Float64Type>(type))
131-
return f64Func;
132-
return "";
133-
}
134-
135134
LLVM::LLVMFuncOp appendOrGetFuncOp(StringRef funcName, Type funcType,
136135
Operation *op) const {
137136
using LLVM::LLVMFuncOp;
138137

139138
auto funcAttr = StringAttr::get(op->getContext(), funcName);
140-
Operation *funcOp = SymbolTable::lookupNearestSymbolFrom(op, funcAttr);
139+
auto funcOp =
140+
SymbolTable::lookupNearestSymbolFrom<LLVMFuncOp>(op, funcAttr);
141141
if (funcOp)
142-
return cast<LLVMFuncOp>(*funcOp);
142+
return funcOp;
143143

144-
mlir::OpBuilder b(op->getParentOfType<FunctionOpInterface>());
144+
auto parentFunc = op->getParentOfType<FunctionOpInterface>();
145+
assert(parentFunc && "expected there to be a parent function");
146+
OpBuilder b(parentFunc);
145147
return b.create<LLVMFuncOp>(op->getLoc(), funcName, funcType);
146148
}
147149

150+
StringRef getFunctionName(Type type, SourceOp op) const {
151+
bool useApprox = false;
152+
if constexpr (llvm::is_detected<has_get_fastmath_t, SourceOp>::value) {
153+
arith::FastMathFlags flag = op.getFastmath();
154+
useApprox = ((uint32_t)arith::FastMathFlags::afn & (uint32_t)flag) &&
155+
!f32ApproxFunc.empty();
156+
}
157+
158+
if (isa<Float16Type>(type))
159+
return f16Func;
160+
if (isa<Float32Type>(type)) {
161+
if (useApprox)
162+
return f32ApproxFunc;
163+
return f32Func;
164+
}
165+
if (isa<Float64Type>(type))
166+
return f64Func;
167+
168+
if (type.isInteger(32))
169+
return i32Func;
170+
return "";
171+
}
172+
148173
const std::string f32Func;
149174
const std::string f64Func;
150175
const std::string f32ApproxFunc;
151176
const std::string f16Func;
177+
const std::string i32Func;
152178
};
153179

154180
} // namespace mlir

mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,22 @@ static void populateOpPatterns(const LLVMTypeConverter &converter,
446446
f32ApproxFunc, f16Func);
447447
}
448448

449+
template <typename OpTy>
450+
static void populateIntOpPatterns(const LLVMTypeConverter &converter,
451+
RewritePatternSet &patterns,
452+
StringRef i32Func) {
453+
patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
454+
patterns.add<OpToFuncCallLowering<OpTy>>(converter, "", "", "", "", i32Func);
455+
}
456+
457+
template <typename OpTy>
458+
static void populateFloatIntOpPatterns(const LLVMTypeConverter &converter,
459+
RewritePatternSet &patterns,
460+
StringRef f32Func, StringRef f64Func) {
461+
patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
462+
patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func, "", "");
463+
}
464+
449465
void mlir::populateGpuSubgroupReduceOpLoweringPattern(
450466
const LLVMTypeConverter &converter, RewritePatternSet &patterns) {
451467
patterns.add<GPUSubgroupReduceOpLowering>(converter);
@@ -509,6 +525,7 @@ void mlir::populateGpuToNVVMConversionPatterns(
509525

510526
populateOpPatterns<arith::RemFOp>(converter, patterns, "__nv_fmodf",
511527
"__nv_fmod");
528+
populateIntOpPatterns<math::AbsIOp>(converter, patterns, "__nv_abs");
512529
populateOpPatterns<math::AbsFOp>(converter, patterns, "__nv_fabsf",
513530
"__nv_fabs");
514531
populateOpPatterns<math::AcosOp>(converter, patterns, "__nv_acosf",
@@ -555,6 +572,8 @@ void mlir::populateGpuToNVVMConversionPatterns(
555572
"__nv_log2", "__nv_fast_log2f");
556573
populateOpPatterns<math::PowFOp>(converter, patterns, "__nv_powf", "__nv_pow",
557574
"__nv_fast_powf");
575+
populateFloatIntOpPatterns<math::FPowIOp>(converter, patterns, "__nv_powif",
576+
"__nv_powi");
558577
populateOpPatterns<math::RoundOp>(converter, patterns, "__nv_roundf",
559578
"__nv_round");
560579
populateOpPatterns<math::RoundEvenOp>(converter, patterns, "__nv_rintf",

mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1033,3 +1033,27 @@ module attributes {transform.with_named_sequence} {
10331033
transform.yield
10341034
}
10351035
}
1036+
1037+
1038+
gpu.module @test_module_52 {
1039+
// CHECK: llvm.func @__nv_abs(i32) -> i32
1040+
// CHECK-LABEL: func @gpu_abs
1041+
func.func @gpu_abs(%arg_i32 : i32) -> (i32) {
1042+
%result32 = math.absi %arg_i32 : i32
1043+
// CHECK: llvm.call @__nv_abs(%{{.*}}) : (i32) -> i32
1044+
func.return %result32 : i32
1045+
}
1046+
}
1047+
1048+
gpu.module @test_module_53 {
1049+
// CHECK: llvm.func @__nv_powif(f32, i32) -> f32
1050+
// CHECK: llvm.func @__nv_powi(f64, i32) -> f64
1051+
// CHECK-LABEL: func @gpu_powi
1052+
func.func @gpu_powi(%arg_f32 : f32, %arg_f64 : f64, %arg_i32 : i32) -> (f32, f64) {
1053+
%result32 = math.fpowi %arg_f32, %arg_i32 : f32, i32
1054+
// CHECK: llvm.call @__nv_powif(%{{.*}}, %{{.*}}) : (f32, i32) -> f32
1055+
%result64 = math.fpowi %arg_f64, %arg_i32 : f64, i32
1056+
// CHECK: llvm.call @__nv_powi(%{{.*}}, %{{.*}}) : (f64, i32) -> f64
1057+
func.return %result32, %result64 : f32, f64
1058+
}
1059+
}

0 commit comments

Comments
 (0)