
Commit b290136: Fix merge conflicts

1 parent b483030 commit b290136

File tree

17 files changed: +7 -1671 lines changed


.ci/docker/build.sh

Lines changed: 1 addition & 13 deletions

@@ -288,7 +288,7 @@ case "$tag" in
     ;;
   *)
     # Catch-all for builds that are not hardcoded.
-    PROTOBUF=yes
+    PROTOBUF=yes
     VISION=yes
     echo "image '$image' did not match an existing build configuration"
     if [[ "$image" == *py* ]]; then
@@ -460,15 +460,3 @@ elif [ "$HAS_TRITON" = "yes" ]; then
   echo "expecting triton to not be installed, but it is"
   exit 0
 fi
-<<<<<<< HEAD
-
-# Sanity check cmake version. Executorch reinstalls cmake and I'm not sure if
-# they support 4.0.0 yet, so exclude them from this check.
-CMAKE_VERSION=$(drun cmake --version)
-if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then
-  echo "CMake version is not 4.0.0:"
-  drun cmake --version
-  exit 0
-fi
-=======
->>>>>>> upstream/main
Lines changed: 1 addition & 5 deletions

@@ -1,5 +1 @@
-<<<<<<< HEAD
-d704bc6e69c1a588c8edd3cbb67505d554ed65f6
-=======
-7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd
->>>>>>> upstream/main
+ac80c4190aa0321f761a08af97e1e1eee41f01d9

.ci/docker/libtorch/build.sh

Lines changed: 0 additions & 4 deletions

@@ -50,15 +50,11 @@ case ${DOCKER_TAG_PREFIX} in
     BASE_TARGET=rocm
     GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
     PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-<<<<<<< HEAD
-    DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
-=======
     # add gfx950, gfx115x conditionally starting in ROCm 7.0
     if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
       PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
     fi
     DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
->>>>>>> upstream/main
     ;;
   *)
     echo "ERROR: Unrecognized DOCKER_TAG_PREFIX: ${DOCKER_TAG_PREFIX}"

CMakeLists.txt

Lines changed: 0 additions & 5 deletions

@@ -903,13 +903,8 @@ cmake_dependent_option(
   USE_FBGEMM_GENAI
   "Whether to build FBGEMM GenAI quantized GEMM kernels.\
 Will be disabled if not supported by the platform"
-<<<<<<< HEAD
-  OFF
-  "USE_CUDA OR USE_ROCM"
-=======
   ${USE_FBGEMM_GENAI_DEFAULT}
   "(USE_CUDA AND NOT MSVC) OR USE_ROCM"
->>>>>>> upstream/main
   OFF)

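Both sides of this conflict are arguments to CMake's cmake_dependent_option() command; the upstream side kept here changes the default value and the gating condition. A rough Python model of that command's documented semantics (a hypothetical sketch with made-up platform values, not PyTorch code):

```python
def cmake_dependent_option(default: bool, depends: bool, force: bool) -> bool:
    # Models cmake_dependent_option(<opt> <doc> <default> <depends> <force>):
    # the option is user-settable with value `default` while `depends` holds,
    # and is forced to `force` (here OFF) otherwise.
    return default if depends else force

# The resolved upstream arguments, with hypothetical platform values:
USE_CUDA, MSVC, USE_ROCM = True, False, False
USE_FBGEMM_GENAI_DEFAULT = True
print(cmake_dependent_option(
    USE_FBGEMM_GENAI_DEFAULT,
    (USE_CUDA and not MSVC) or USE_ROCM,  # the new <depends> expression
    False))                               # forced OFF when the condition fails
# -> True on a non-MSVC CUDA build; always False under MSVC without ROCm
```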

aten/src/ATen/native/cuda/Blas.cpp

Lines changed: 0 additions & 165 deletions

@@ -58,173 +58,8 @@

 namespace at::native {

-<<<<<<< HEAD
-namespace {
-
-// TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492
-c10::MaybeOwned<Tensor> inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) {
-  if (resolve_conj && tensor.is_conj()) {
-    return c10::MaybeOwned<Tensor>::owned(tensor.resolve_conj());
-  } else {
-    return c10::MaybeOwned<Tensor>::borrowed(tensor);
-  }
-}
-
-c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) {
-  if (tensor.is_non_overlapping_and_dense()) { // common case
-    transpose_tensor = tensor.is_contiguous();
-    return resolve_conj_if_indicated(tensor, transpose_result ? transpose_tensor : !transpose_tensor);
-  }
-  IntArrayRef tensor_strides = tensor.strides();
-  IntArrayRef tensor_sizes = tensor.sizes();
-  if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
-    transpose_tensor = false;
-    return resolve_conj_if_indicated(tensor, !transpose_result);
-  } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
-    transpose_tensor = true;
-    return resolve_conj_if_indicated(tensor, transpose_result);
-  } else {
-    transpose_tensor = true;
-    return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
-  }
-}
-
-c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) {
-  if (tensor.is_non_overlapping_and_dense()) { // common case
-    transpose_tensor = tensor.is_contiguous();
-    return resolve_conj_if_indicated(tensor, true);
-  }
-
-  IntArrayRef tensor_strides = tensor.strides();
-  IntArrayRef tensor_sizes = tensor.sizes();
-  if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
-    transpose_tensor = false;
-    return resolve_conj_if_indicated(tensor, true);
-  } else if ((tensor_strides[1] == 1) &&
-             (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
-    transpose_tensor = true;
-    return resolve_conj_if_indicated(tensor, true);
-  } else {
-    transpose_tensor = true;
-    return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
-  }
-}
-
-using at::cuda::blas::ScalingType;
-
-/**
- * @brief Prepares matrices for CUBLAS operation
- *
- * This constructor prepares tensors for CUBLAS
- * The main difference is that PyTorch uses row-major as the default and
- * CUBLAS expects column-major.
- *
- * @details
- * To enable row-major output while using CUBLAS,
- * we use the mathematical identity that (A × B)^T = B^T × A^T.
- *
- * Transpose in this context refers to Cublas's (Fortran) definition of transpose:
- * T = row-major, N = col-major
- *
- * Example:
- * For matrices A (M×K)(row-major) and B (K×N)(row-major):
- * - Standard multiplication: A × B = (M×K) × (K×N) = M×N result (row-major)
- * - Using our transpose trick: (B^T × A^T) = (N×K)(T) × (K×M)(T) = N×M(N)
- * - However, since the output from cublas is column-major this is
- *   equivalent to an output of size MxN row-major as expected
- *
- * The transpose flags are derived from the layouts of the passed in tensors
- *
- * If the operands are in packed float4 format, `k`, `lda` and `ldb` are adjusted
- * to their unpacked values to match what cuBLAS expects.
- *
- * @param mat1 First input matrix
- * @param mat2 Second input matrix
- * @param c Output matrix (result)
- * @param scale_a Optional scaling factor for first matrix
- * @param scale_b Optional scaling factor for second matrix
- * @param scale_result Optional scaling factor for result
- */
-struct cublasCommonArgs {
-  cublasCommonArgs(
-      const Tensor& mat1,
-      const Tensor& mat2,
-      Tensor& c,
-      const std::optional<Tensor>& scale_a = std::nullopt,
-      const std::optional<Tensor>& scale_b = std::nullopt,
-      const std::optional<Tensor>& scale_result = std::nullopt,
-      const std::optional<ScalingType>& scaling_choice_a = std::nullopt,
-      const std::optional<ScalingType>& scaling_choice_b = std::nullopt) {
-    bool transpose_result = false, transpose_a = false, transpose_b = false;
-    result = prepare_matrix_for_cublas(c, transpose_result);
-    mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result);
-    matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result);
-
-    // Handle scale tensors if provided
-    if (scale_a && scale_b) {
-      // By default since we return in row-major we run the gemm
-      // as B.T @ A.T, check transpose_result to determine if we flip the scales
-      scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
-      scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
-      scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a;
-      scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
-      scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
-      scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b;
-    }
-
-    if (scale_result) {
-      scale_result_ptr = scale_result->data_ptr();
-      scale_result_dtype = scale_result->scalar_type();
-    }
-
-    // Update transpose flags
-    if (transpose_result) {
-      transpose_a = !transpose_a;
-      transpose_b = !transpose_b;
-    }
-
-    auto sizes_a = mata->sizes();
-    auto sizes_b = matb->sizes();
-
-    m = sizes_a[transpose_result ? 1 : 0];
-    k = sizes_a[transpose_result ? 0 : 1];
-    n = sizes_b[transpose_result ? 0 : 1];
-    lda = mata->stride((transpose_a == transpose_result) ? 1 : 0);
-    ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0);
-    result_ld = result->stride(transpose_result ? 0 : 1);
-    transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
-    transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
-
-    // cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
-    // if the gemm operands are in packed float4
-    if (mat1.dtype() == at::kFloat4_e2m1fn_x2 && mat2.dtype() == at::kFloat4_e2m1fn_x2) {
-      k = k * 2;
-      lda = lda * 2;
-      ldb = ldb * 2;
-    }
-  }
-
-  // Matrix members
-  char transa, transb;
-  int64_t m, n, k;
-  int64_t lda, ldb, result_ld;
-  c10::MaybeOwned<Tensor> mata, matb, result;
-
-  // Scale members
-  void* scale_mata_ptr = nullptr;
-  void* scale_matb_ptr = nullptr;
-  void* scale_result_ptr = nullptr;
-  std::optional<c10::ScalarType> scale_mata_dtype;
-  std::optional<ScalingType> scaling_mata_type;
-  std::optional<c10::ScalarType> scale_matb_dtype;
-  std::optional<ScalingType> scaling_matb_type;
-  std::optional<c10::ScalarType> scale_result_dtype;
-};
-} // namespace
-=======
 using at::blas::ScalingType;
 using at::blas::SwizzleType;
->>>>>>> upstream/main

 c10::MaybeOwned<Tensor> prepare_batch_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, int64_t& ld_tensor, bool transpose_result, int64_t m, int64_t n) {
   IntArrayRef tensor_strides = tensor.strides();
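The long doc comment deleted above describes the standard row-major/column-major trick: since cuBLAS is column-major and PyTorch tensors are row-major, computing B^T × A^T and letting cuBLAS write the result column-major yields exactly the row-major A × B. A minimal NumPy sketch of that identity (illustration only, not the PyTorch implementation; A and B are arbitrary example matrices):

```python
import numpy as np

M, K, N = 3, 4, 5
A = np.arange(M * K, dtype=np.float32).reshape(M, K)  # row-major M x K
B = np.arange(K * N, dtype=np.float32).reshape(K, N)  # row-major K x N

# A column-major GEMM reinterprets each row-major buffer as its transpose,
# so asking it for B^T @ A^T produces (A @ B)^T ...
Ct = B.T @ A.T   # N x M, equal to (A @ B)^T by the identity (A B)^T = B^T A^T

# ... and an N x M result laid out column-major occupies memory exactly like
# the row-major M x N matrix A @ B that the caller wanted.
assert np.array_equal(Ct.ravel(order="F"), (A @ B).ravel(order="C"))
```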

aten/src/ATen/native/cuda/Normalization.cuh

Lines changed: 0 additions & 8 deletions

@@ -121,11 +121,7 @@ __device__ scalar_t reduce(Op op, PTA tensor, int plane) {
   for (int x = threadIdx.x; x < tensor.size(2); x += blockDim.x*UNRL) {
     #pragma unroll
     for (int u = 0; u < UNRL; u++)
-<<<<<<< HEAD
-      tmp[u] = op(batch, plane, min((int)tensor.size(2)-1, (int)(x+u*blockDim.x)));
-=======
       tmp[u] = op(batch, plane, std::min((int)tensor.size(2)-1, (int)(x+u*blockDim.x)));
->>>>>>> upstream/main
     #pragma unroll
     for (int u = 0; u < UNRL; u++)
       if (x+u*blockDim.x < tensor.size(2))
@@ -315,11 +311,7 @@ __global__ void batch_norm_collect_statistics_kernel(
   stat_accscalar_t v_[UNRL];
   for (int x = threadIdx.x; x < input.size(2); x += blockDim.x*UNRL) {
     for (int u = 0; u < UNRL; u++)
-<<<<<<< HEAD
-      v_[u] = input[batch][plane][min(x+u*blockDim.x, input.size(2)-1)];
-=======
       v_[u] = input[batch][plane][std::min(x+u*blockDim.x, input.size(2)-1)];
->>>>>>> upstream/main
     for (int u = 0; u < UNRL; u++) {
       if (x+u*blockDim.x < input.size(2)) {
         stat_accscalar_t d1 = v_[u] - avg;
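Both hunks resolve the same conflict in favor of std::min and keep the kernels' clamp-then-mask pattern: each unrolled lane reads a clamped, always-in-bounds index, and the bounds check is applied only when the value is consumed. A small Python sketch of that pattern (a hypothetical single "thread" with UNRL = 4, not the CUDA code):

```python
import numpy as np

UNRL = 4
row = np.arange(10, dtype=np.float32)   # plays the role of tensor.size(2) == 10
total = 0.0
for x in range(0, len(row), UNRL):      # one thread striding by UNRL
    # Clamped loads: min(size-1, x+u) never goes out of bounds, so the
    # load loop can be unrolled without a per-lane branch.
    tmp = [row[min(len(row) - 1, x + u)] for u in range(UNRL)]
    for u in range(UNRL):
        if x + u < len(row):            # mask out the duplicated tail reads
            total += tmp[u]
assert total == row.sum()
```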

aten/src/ATen/native/sparse/cuda/SparseMatMul.cu

Lines changed: 0 additions & 3 deletions

@@ -40,7 +40,6 @@
 #include <thrust/iterator/discard_iterator.h>


-<<<<<<< HEAD
 #if defined(__CUDACC__) && ((CUSPARSE_VERSION >= 11000) || (defined(USE_ROCM) && ROCM_VERSION >= 60300))
 #define IS_CUSPARSE11_AVAILABLE() 1
 #else
@@ -60,8 +59,6 @@
 #endif

 #if IS_CUSPARSE11_AVAILABLE()
-=======
->>>>>>> upstream/main
 #include <library_types.h>

 namespace at::native {

requirements-build.txt

Lines changed: 0 additions & 12 deletions

@@ -1,5 +1,4 @@
 # Build System requirements
-<<<<<<< HEAD
 setuptools>=70.1.0,<80.0 # setuptools develop deprecated on 80.0
 cmake>=3.31.4
 ninja==1.11.1.3
@@ -10,15 +9,4 @@ pyyaml==6.0.2
 requests==2.32.4
 six==1.17.0 # dependency chain: NNPACK -> PeachPy -> six
 typing-extensions==4.14.1
-=======
-setuptools>=70.1.0
-cmake>=3.27
-ninja
-numpy
-packaging
-pyyaml
-requests
-six # dependency chain: NNPACK -> PeachPy -> six
-typing-extensions>=4.10.0
->>>>>>> upstream/main
 pip # not technically needed, but this makes setup.py invocation work

test/dynamo/test_structured_trace.py

Lines changed: 0 additions & 4 deletions

@@ -21,11 +21,7 @@
 from torch._inductor.test_case import TestCase
 from torch._logging._internal import TorchLogsFormatter
 from torch.nn.parallel import DistributedDataParallel as DDP
-<<<<<<< HEAD
-from torch.testing._internal.common_utils import find_free_port, skipIfRocm
-=======
 from torch.testing._internal.common_utils import find_free_port, xfailIfS390X
->>>>>>> upstream/main
 from torch.testing._internal.triton_utils import requires_cuda_and_triton



test/inductor/test_cuda_repro.py

Lines changed: 0 additions & 4 deletions

@@ -39,11 +39,7 @@
     DeterministicGuard,
     freeze_rng_state,
     IS_FBCODE,
-<<<<<<< HEAD
-    skipIfRocm,
-=======
     MI350_ARCH,
->>>>>>> upstream/main
     skipIfRocmArch,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,

0 commit comments