
Commit b0be30d

cyyever authored and pytorchmergebot committed
[19/N] Fix extra warnings brought by clang-tidy-17 (pytorch#144448)
Apply more clang-tidy fixes. There was a bug introduced by pytorch#144014 due to incorrect namespace concatenation, which is reverted here.

Pull Request resolved: pytorch#144448
Approved by: https://github.com/albanD
1 parent 1353f3b commit b0be30d

27 files changed (+89, −64 lines)


aten/src/ATen/ParallelNative.cpp

Lines changed: 17 additions & 13 deletions
@@ -86,14 +86,14 @@ TaskThreadPoolBase& _get_intraop_pool() {
 #endif // C10_MOBILE
 
 // Run lambda function `fn` over `task_id` in [0, `range`) with threadpool.
-// `fn` will be called with params: (thread_pool_task_id, task_id).
-void _run_with_pool(const std::function<void(int, size_t)>& fn, size_t range) {
+// `fn` will be called with params: task_id.
+static void _run_with_pool(const std::function<void(size_t)>& fn, size_t range) {
 #ifndef C10_MOBILE
   for (const auto i : c10::irange(1, range)) {
-    _get_intraop_pool().run([fn, i]() { fn((int)i, i); });
+    _get_intraop_pool().run([fn, i]() { fn(i); });
   }
   // Run the first task on the current thread directly.
-  fn(0, 0);
+  fn(0);
 #else
   caffe2::PThreadPool* const pool = caffe2::pthreadpool();
   TORCH_INTERNAL_ASSERT(pool, "Invalid thread pool!");
@@ -102,7 +102,7 @@ void _run_with_pool(const std::function<void(int, size_t)>& fn, size_t range) {
       // PThreadPool::run() is blocking. A std::function [const] reference to
       // this lambda cannot go out of scope before PThreadPool::run() returns.
       [&fn](const size_t task_id) {
-        fn(0 /* unused */, task_id);
+        fn(task_id);
       }, range);
 #endif // C10_MOBILE
 }
@@ -113,6 +113,10 @@ struct ParallelRegionGuard {
     internal::set_thread_num(task_id);
     _set_in_parallel_region(true);
   }
+  ParallelRegionGuard(const ParallelRegionGuard&) = delete;
+  ParallelRegionGuard(ParallelRegionGuard&&) = delete;
+  ParallelRegionGuard& operator=(const ParallelRegionGuard&) = delete;
+  ParallelRegionGuard& operator=(ParallelRegionGuard&&) = delete;
 
   ~ParallelRegionGuard() {
     _set_in_parallel_region(false);
@@ -124,16 +128,16 @@ struct ParallelRegionGuard {
 
 namespace internal {
 
-inline std::tuple<size_t, size_t> calc_num_tasks_and_chunk_size(
+static std::tuple<size_t, size_t> calc_num_tasks_and_chunk_size(
     int64_t begin, int64_t end, int64_t grain_size) {
   if ((end - begin) < grain_size) {
     return std::make_tuple(1, std::max((int64_t)0, end - begin));
   }
   // Choose number of tasks based on grain size and number of threads.
-  size_t chunk_size = divup((end - begin), get_num_threads());
+  int64_t chunk_size = divup((end - begin), get_num_threads());
   // Make sure each task is at least grain_size size.
-  chunk_size = std::max((size_t)grain_size, chunk_size);
-  size_t num_tasks = divup((end - begin), chunk_size);
+  chunk_size = std::max(grain_size, chunk_size);
+  size_t num_tasks = static_cast<size_t>(divup((end - begin), chunk_size));
   return std::make_tuple(num_tasks, chunk_size);
 }
 
@@ -157,12 +161,12 @@ void invoke_parallel(
   } state;
 
   auto task = [f, &state, begin, end, chunk_size]
-      (int /* unused */, size_t task_id) {
-    int64_t local_start = begin + task_id * chunk_size;
+      (size_t task_id) {
+    int64_t local_start = static_cast<int64_t>(begin + task_id * chunk_size);
     if (local_start < end) {
-      int64_t local_end = std::min(end, (int64_t)(chunk_size + local_start));
+      int64_t local_end = std::min(end, static_cast<int64_t>(chunk_size + local_start));
       try {
-        ParallelRegionGuard guard(task_id);
+        ParallelRegionGuard guard(static_cast<int>(task_id));
         f(local_start, local_end);
       } catch (...) {
        if (!state.err_flag.test_and_set()) {
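
One recurring fix above: clang-tidy's cppcoreguidelines-special-member-functions check flags a class that declares a destructor but says nothing about copying and moving, which is why ParallelRegionGuard now deletes all four operations explicitly. A minimal sketch of the same idiom, using a hypothetical ScopedFlag guard rather than the ATen type:

#include <atomic>

// RAII guard that raises a flag for the lifetime of a scope. Since it has a
// user-written destructor, the copy and move operations are deleted so the
// guard cannot be duplicated or have its cleanup run twice.
class ScopedFlag {
 public:
  explicit ScopedFlag(std::atomic<bool>& flag) : flag_(flag) { flag_.store(true); }
  ScopedFlag(const ScopedFlag&) = delete;
  ScopedFlag(ScopedFlag&&) = delete;
  ScopedFlag& operator=(const ScopedFlag&) = delete;
  ScopedFlag& operator=(ScopedFlag&&) = delete;
  ~ScopedFlag() { flag_.store(false); }

 private:
  std::atomic<bool>& flag_;
};

int main() {
  std::atomic<bool> in_region{false};
  {
    ScopedFlag guard(in_region);  // in_region == true inside this block
  }
  return in_region.load() ? 1 : 0;  // false again once the guard is destroyed
}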

aten/src/ATen/cuda/CUDABlas.cpp

Lines changed: 4 additions & 5 deletions
@@ -284,6 +284,7 @@ class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor<
   }
   template <typename T>
   inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) {
+    // NOLINTNEXTLINE(bugprone-sizeof-expression)
     TORCH_CUDABLAS_CHECK(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(value)));
   }
 };
@@ -392,7 +393,7 @@ inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
   preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES, c_alignment);
 #endif
 
-  auto workspace = at::empty(workspaceSize, at::TensorOptions().dtype(at::kByte).device(at::kCUDA));
+  auto workspace = at::empty(static_cast<int64_t>(workspaceSize), at::TensorOptions().dtype(at::kByte).device(at::kCUDA));
 
   cublasLtMatmulHeuristicResult_t heuristicResult = {};
   int returnedResult = 0;
@@ -901,12 +902,10 @@ void gemm_internal_cublas<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
 #else
   cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
   if (prop->major >= 5) {
-#ifndef USE_ROCM
     cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH;
     if (!at::globalContext().allowFP16ReductionCuBLAS()) {
       cublas_flags = static_cast<cublasMath_t>(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
     }
-#endif
     // Disallow fp16 reductions that could lead to unexpected overflow issues.
     TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags));
     TORCH_CUDABLAS_CHECK(cublasGemmEx(
@@ -1284,7 +1283,7 @@ void gemm_and_bias(
   preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES, d_alignment);
 #endif
 
-  auto workspace = at::empty(workspaceSize, at::TensorOptions().dtype(at::kByte).device(at::kCUDA));
+  auto workspace = at::empty(static_cast<int64_t>(workspaceSize), at::TensorOptions().dtype(at::kByte).device(at::kCUDA));
 
   cublasLtMatmulHeuristicResult_t heuristicResult = {};
   int returnedResult = 0;
@@ -1466,7 +1465,7 @@ void scaled_gemm(
     computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype));
   }
   size_t workspaceSize = _getWorkspaceSize();
-  auto workspace = at::empty(workspaceSize, at::TensorOptions().dtype(at::kByte).device(at::kCUDA));
+  auto workspace = at::empty(static_cast<int64_t>(workspaceSize), at::TensorOptions().dtype(at::kByte).device(at::kCUDA));
 
   CuBlasLtMatmulPreference preference;
 preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, workspaceSize);
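
The repeated workspace change targets implicit sign and width conversions: at::empty takes its size as a signed 64-bit value while the workspace size is a size_t, so the conversion is now spelled out with static_cast. A small stand-alone sketch of the idiom, with a hypothetical allocate_buffer in place of at::empty:

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical allocator that, like at::empty, expects a signed 64-bit size.
std::vector<unsigned char> allocate_buffer(int64_t num_bytes) {
  return std::vector<unsigned char>(static_cast<size_t>(num_bytes));
}

int main() {
  size_t workspace_size = 4096;  // unsigned, as a sizeof-style query would return
  // The explicit cast documents the signedness change at the call site and
  // keeps clang-tidy/compiler conversion warnings quiet.
  auto workspace = allocate_buffer(static_cast<int64_t>(workspace_size));
  return workspace.size() == workspace_size ? 0 : 1;
}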

aten/src/ATen/cuda/CUDASparseDescriptors.cpp

Lines changed: 0 additions & 2 deletions
@@ -56,7 +56,6 @@ cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type) {
   }
 }
 
-#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
 cusparseDnMatDescr_t createRawDnMatDescriptor(const Tensor& input, int64_t batch_offset, bool is_const=false) {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.layout() == kStrided);
   IntArrayRef input_strides = input.strides();
@@ -121,7 +120,6 @@ CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input, int64_t ba
 CuSparseConstDnMatDescriptor::CuSparseConstDnMatDescriptor(const Tensor& input, int64_t batch_offset) {
   descriptor_.reset(createRawDnMatDescriptor(input, batch_offset, /*is_const*/true));
 }
-#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
 
 CuSparseDnVecDescriptor::CuSparseDnVecDescriptor(const Tensor& input) {
 // cuSPARSE doesn't support batched vectors

aten/src/ATen/detail/MTIAHooksInterface.h

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
 
 
   virtual void recordMemoryHistory(
-      std::optional<std::string> enabled,
+      const std::optional<std::string>& enabled,
       const std::string& stacks,
       size_t max_entries) const {
 FAIL_MTIAHOOKS_FUNC(__func__);
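
This is the performance-unnecessary-value-param pattern: taking std::optional<std::string> by value copies the contained string on every call, while a const reference does not and still accepts std::nullopt at the call site. A small illustration with a hypothetical record_history function rather than the MTIA hook:

#include <cstddef>
#include <iostream>
#include <optional>
#include <string>

// The const reference avoids copying the contained std::string when the
// caller already owns one; std::nullopt still binds to it.
void record_history(const std::optional<std::string>& enabled, size_t max_entries) {
  if (enabled) {
    std::cout << "mode=" << *enabled << " max_entries=" << max_entries << '\n';
  }
}

int main() {
  std::optional<std::string> mode{"all"};
  record_history(mode, 128);        // no string copy
  record_history(std::nullopt, 0);  // disabled
  return 0;
}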

aten/src/ATen/functorch/BatchRulesModules.cpp

Lines changed: 2 additions & 1 deletion
@@ -162,6 +162,7 @@ grid_sample_backward_helper_in(
 
 static std::tuple<Tensor, std::optional<int64_t>, Tensor, std::optional<int64_t>>
 grid_sample_backward_helper_out(
+    // NOLINTNEXTLINE(performance-unnecessary-value-param)
     std::tuple<Tensor, Tensor> bw_out,
     int64_t grad_input_out_bdim,
     int64_t grad_grid_out_bdim,
@@ -261,7 +262,7 @@ struct UpsampleBackwardBatchRuleHelper<F, Func, typelist<A, B, C, T...>> {
 
     auto out = Func(
         std::move(grad_output_),
-        std::move(output_size),
+        output_size,
         std::move(physical_input_size),
         std::forward<T>(extra_args)...);
 // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
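
The added comment is clang-tidy's standard one-line suppression: NOLINTNEXTLINE(check-name) silences exactly one diagnostic on the line that follows, the usual escape hatch when a flagged signature is intentional. A generic, hypothetical example of the mechanism:

#include <string>

// The by-value parameter is kept to match an existing callback signature,
// so the pass-by-const-reference suggestion is muted for this line only.
// NOLINTNEXTLINE(performance-unnecessary-value-param)
int count_chars(std::string text) {
  return static_cast<int>(text.size());
}

int main() {
  return count_chars("clang-tidy") == 10 ? 0 : 1;
}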

aten/src/ATen/native/Activation.cpp

Lines changed: 1 addition & 1 deletion
@@ -579,7 +579,7 @@ static void _rrelu_with_noise_train(
     Tensor& noise,
     const Scalar& lower_,
     const Scalar& upper_,
-    std::optional<Generator> generator) {
+    const std::optional<Generator>& generator) {
   using opmath_t = at::opmath_type<scalar_t>;
   opmath_t lower = lower_.to<opmath_t>();
 opmath_t upper = upper_.to<opmath_t>();

aten/src/ATen/native/mkldnn/xpu/Blas.cpp

Lines changed: 15 additions & 3 deletions
@@ -17,7 +17,8 @@
 #include <ATen/ops/mm_native.h>
 #endif
 
-namespace at::native::xpu {
+namespace at::native {
+namespace xpu {
 
 // result = beta * self + alpha * (mat1 * mat2)
 Tensor& addmm_out(
@@ -454,7 +455,7 @@ Tensor& tensordot_out(
 TORCH_LIBRARY_IMPL(aten, XPU, m) {
   m.impl("tensordot.out", TORCH_FN(tensordot_out));
 }
-} // namespace at::native::xpu
+} // namespace xpu
 
 TORCH_IMPL_FUNC(addmm_out_xpu)
 (const Tensor& self,
@@ -469,11 +470,13 @@ TORCH_IMPL_FUNC(addmm_out_xpu)
 
 TORCH_IMPL_FUNC(mm_out_xpu)
 (const Tensor& self, const Tensor& mat2, const Tensor& result) {
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
   xpu::mm_out(self, mat2, const_cast<Tensor&>(result));
 }
 
 TORCH_IMPL_FUNC(bmm_out_xpu)
 (const Tensor& self, const Tensor& batch2, const Tensor& result) {
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
   xpu::bmm_out(self, batch2, const_cast<Tensor&>(result));
 }
 
@@ -498,7 +501,13 @@ TORCH_IMPL_FUNC(baddbmm_out_xpu)
     const Scalar& alpha,
     const Tensor& result) {
   xpu::baddbmm_out(
-      self, batch1, batch2, beta, alpha, const_cast<Tensor&>(result));
+      self,
+      batch1,
+      batch2,
+      beta,
+      alpha,
+      // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+      const_cast<Tensor&>(result));
 }
 
 TORCH_IMPL_FUNC(addmv_out_xpu)
@@ -508,5 +517,8 @@ TORCH_IMPL_FUNC(addmv_out_xpu)
     const Scalar& beta,
     const Scalar& alpha,
     const Tensor& result) {
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
   xpu::addmv_out(self, mat, vec, beta, alpha, const_cast<Tensor&>(result));
 }
+
+} // namespace at::native
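
This hunk is the namespace fix called out in the commit message: with a single `namespace at::native::xpu { ... }` block, its closing brace ends all three levels at once, so the TORCH_IMPL_FUNC definitions placed after it no longer sat inside at::native. Splitting the declaration lets the inner xpu namespace close early while at::native stays open until the end of the file. A stripped-down illustration with hypothetical function names, not the ATen code:

#include <iostream>

namespace at::native {
namespace xpu {

// Backend helper that belongs to at::native::xpu.
void mm_out_impl() { std::cout << "xpu::mm_out_impl\n"; }

} // namespace xpu
// Still inside at::native here: definitions placed below end up in the
// namespace the rest of the translation unit expects.

void mm_out_kernel() { xpu::mm_out_impl(); }

} // namespace at::native

int main() {
  at::native::mm_out_kernel();
  return 0;
}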

torch/csrc/Generator.cpp

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ PyObject* THPGenerator_initDefaultGenerator(const at::Generator& cdata) {
   if (!self)
     throw python_error();
   auto self_ = reinterpret_cast<THPGenerator*>(self.get());
-  self_->cdata = std::move(cdata);
+  self_->cdata = cdata;
   return self.release();
 }
 
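
The dropped std::move is the performance-move-const-arg fix: cdata is a const reference, and std::move on a const object cannot actually move, so it silently degrades to a copy and only obscures intent. A minimal illustration with a hypothetical Holder type:

#include <string>
#include <utility>

struct Holder {
  std::string data;
};

// `src` is const, so std::move(src) yields const std::string&&, which can
// only bind to the copy assignment operator: the "move" is really a copy.
void assign(Holder& dst, const std::string& src) {
  dst.data = std::move(src);  // flagged: performance-move-const-arg
  dst.data = src;             // same behavior, clearer intent
}

int main() {
  Holder h;
  assign(h, "generator state");
  return h.data.empty() ? 1 : 0;
}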

torch/csrc/api/include/torch/detail/TensorDataContainer.h

Lines changed: 1 addition & 2 deletions
@@ -118,8 +118,7 @@ struct TensorDataContainer {
         type_(TensorDataContainerType::InitList) {}
 #define TENSOR(T, S)                            \
   TensorDataContainer(T value)                  \
-      : sizes_(),                               \
-        scalar_type_(at::k##S),                 \
+      : scalar_type_(at::k##S),                 \
         type_(TensorDataContainerType::Scalar), \
         scalar_(value) {}
 AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TENSOR)
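
Removing `sizes_()` is the readability-redundant-member-init cleanup: a member whose default constructor would run anyway does not need an empty entry in the initializer list. A tiny sketch with a hypothetical struct:

#include <cstdint>
#include <vector>

struct Container {
  // Listing `sizes_()` in the initializer list would only repeat what its
  // default constructor already does, so it is omitted.
  explicit Container(int kind) : kind_(kind) {}

  std::vector<int64_t> sizes_;
  int kind_ = 0;
};

int main() {
  Container c(1);
  return (c.sizes_.empty() && c.kind_ == 1) ? 0 : 1;
}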

torch/csrc/autograd/python_function.cpp

Lines changed: 8 additions & 9 deletions
@@ -136,6 +136,7 @@ namespace torch::autograd {
 // NOTE: this function is written in a way that assumes it's only called for
 // backward; it's used by engine.cpp. This is responsible for forwarding a call
 // from C++'s Node::apply to a Python method "apply".
+// NOLINTNEXTLINE(*-rvalue-reference*)
 auto PyNode::apply(variable_list&& inputs) -> variable_list {
   pybind11::gil_scoped_acquire gil;
   at::OptionalDeviceGuard _device_guard;
@@ -184,7 +185,7 @@ auto PyNode::apply(variable_list&& inputs) -> variable_list {
 }
 
 auto PyNode::defer_to_dynamo(
-    variable_list&& inputs,
+    const variable_list& inputs,
     const std::optional<PyObject*>& compiler) -> variable_list {
   pybind11::gil_scoped_acquire gil;
   at::OptionalDeviceGuard _device_guard;
@@ -526,7 +527,7 @@ static void THPFunction_dealloc(THPFunction* self) {
   Py_TYPE(self)->tp_free((PyObject*)self);
 }
 
-PyObject* THPFunction_new(
+static PyObject* THPFunction_new(
     PyTypeObject* type,
     PyObject* args,
     PyObject* kwargs) {
@@ -875,6 +876,7 @@ struct InputFlags {
   std::vector<bool> is_variable_input;
 };
 
+namespace {
 template <bool enforce_variables>
 std::pair<UnpackedInput, InputFlags> unpack_input(PyObject* args) {
   UnpackedInput unpacked;
@@ -938,7 +940,7 @@ std::pair<UnpackedInput, InputFlags> unpack_input(PyObject* args) {
 // value is assigned by the prim::PythonOp node and helps to eventually route
 // the outputs of the subgraph correctly This newly created subgraph is then
 // added to the prim::PythonOp node as a subgraph attribute
-static void _append_subgraph(
+void _append_subgraph(
     torch::jit::Node* node,
     torch::jit::Graph* graph,
     std::vector<torch::jit::Value*> trace_outputs,
@@ -980,7 +982,7 @@ static void _append_subgraph(
   }
 }
 
-static torch::jit::Node* _trace_pre_record(
+torch::jit::Node* _trace_pre_record(
     PyObject* op_obj,
     PyObject* input_objects,
     const variable_list& input_vars) {
@@ -1011,7 +1013,7 @@ static torch::jit::Node* _trace_pre_record(
       std::move(pyobj), arg_types, input_vars, std::move(scalar_args));
 }
 
-static void _trace_post_record(
+void _trace_post_record(
     torch::jit::Node* node,
     PyObject* op_obj,
     const variable_list& input_vars,
@@ -1218,8 +1220,6 @@ PyObject* THPFunction_maybe_clear_saved_tensors(
   END_HANDLE_TH_ERRORS
 }
 
-namespace {
-
 THPObjectPtr make_ctx_input_tuple(
     THPFunction* ctx,
     const UnpackedInput& unpacked_input,
@@ -1253,8 +1253,6 @@ THPObjectPtr make_ctx_input_output_tuple(
   return result;
 }
 
-} // namespace
-
 static PyObject* THPFunction_setup_context = nullptr;
 
 static PyObject* get_base_setup_context() {
@@ -1652,6 +1650,7 @@ PyObject* THPFunction_metadata(THPFunction* self, void* _unused) {
   return metadata;
   END_HANDLE_TH_ERRORS
 }
+} // namespace
 
 using getter = PyObject* (*)(PyObject*, void*);
 using setter = int (*)(PyObject*, PyObject*, void*);
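
The common thread in this file is internal linkage: the anonymous namespace is widened to enclose the helper functions, so their individual `static` qualifiers are dropped, while THPFunction_new, which stays outside it, gains one. A small sketch of the equivalence, with hypothetical helper names:

#include <cstdio>

// Two equivalent ways of giving a function internal linkage:

// 1) Per-function `static`:
static void helper_a() { std::puts("helper_a"); }

// 2) One anonymous namespace covering every enclosed definition, so the
//    individual `static` qualifiers become redundant:
namespace {

void helper_b() { std::puts("helper_b"); }
void helper_c() { std::puts("helper_c"); }

} // namespace

int main() {
  helper_a();
  helper_b();
  helper_c();
  return 0;
}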
