Skip to content

Commit 844a8a2

Browse files
generatedunixname89002005232357meta-codesync[bot]
authored and committed
Revert D86123088 (#649)
Summary: Pull Request resolved: #649 This diff reverts D86123088 (The context such as a Sandcastle job, Task, SEV, etc. was not provided.) Depends on D86123088 Reviewed By: agron911 Differential Revision: D86533448 fbshipit-source-id: 438f340d41eea4b41d38f55ebf3ac3847f50ad62
1 parent d250c37 commit 844a8a2

File tree

3 files changed

+85
-169
lines changed

3 files changed

+85
-169
lines changed

python/triton/runtime/jit.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -380,6 +380,8 @@ def specialize_impl(arg, is_const=False, specialize_value=True, align=True):
380380
return ("constexpr", arg.cache_key)
381381
elif isinstance(arg, constexpr):
382382
return ("constexpr", arg)
383+
elif hasattr(arg, "tma_desc_cpu_ptr"):
384+
return ("nvTmaDesc", None)
383385
elif isinstance(arg, tuple):
384386
spec = [specialize_impl(x) for x in arg]
385387
make_tuple = lambda vals: type(arg)(*vals) if hasattr(arg, "_fields") else tuple(vals)

third_party/nvidia/backend/driver.c

Lines changed: 13 additions & 79 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,9 @@
11
#include "cuda.h"
22
#include <dlfcn.h>
33
#include <stdbool.h>
4-
#include <stdlib.h>
54
#define PY_SSIZE_T_CLEAN
65
#include <Python.h>
76

8-
typedef struct {
9-
PyObject_HEAD;
10-
_Alignas(128) CUtensorMap tensorMap;
11-
} PyCUtensorMapObject;
12-
137
// Raises a Python exception and returns false if code is not CUDA_SUCCESS.
148
static bool gpuAssert(CUresult code, const char *file, int line) {
159
if (code == CUDA_SUCCESS)
@@ -32,7 +26,7 @@ static bool gpuAssert(CUresult code, const char *file, int line) {
3226
#define CUDA_CHECK_AND_RETURN_NULL(ans) \
3327
do { \
3428
if (!gpuAssert((ans), __FILE__, __LINE__)) \
35-
goto cleanup; \
29+
return NULL; \
3630
} while (0)
3731

3832
// To be used inside a Py_{BEGIN,END}_ALLOW_THREADS block.
@@ -50,7 +44,7 @@ static bool gpuAssert(CUresult code, const char *file, int line) {
5044
if ((funcPointer) == NULL) { \
5145
(funcPointer) = (initializerFunction)(); \
5246
if ((funcPointer) == NULL) { \
53-
goto cleanup; \
47+
return NULL; \
5448
} \
5549
} \
5650
} while (0)
@@ -93,9 +87,6 @@ static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
9387
warp_size, "sm_clock_rate", sm_clock_rate,
9488
"mem_clock_rate", mem_clock_rate, "mem_bus_width",
9589
mem_bus_width);
96-
97-
cleanup:
98-
return NULL;
9990
}
10091

10192
static PyObject *loadBinary(PyObject *self, PyObject *args) {
@@ -247,9 +238,6 @@ static PyObject *occupancyMaxActiveClusters(PyObject *self, PyObject *args) {
247238
cuOccupancyMaxActiveClusters(&maxActiveClusters, func, &config));
248239
Py_END_ALLOW_THREADS;
249240
return PyLong_FromLong(maxActiveClusters);
250-
251-
cleanup:
252-
return NULL;
253241
}
254242

255243
static PyObject *setPrintfFifoSize(PyObject *self, PyObject *args) {
@@ -291,43 +279,8 @@ static PyObject *setPrintfFifoSize(PyObject *self, PyObject *args) {
291279
Py_RETURN_NONE;
292280
}
293281

294-
static PyObject *PyCUtensorMap_alloc(PyTypeObject *type, Py_ssize_t n_items) {
295-
PyCUtensorMapObject *self = NULL;
296-
void *mem = NULL;
297-
size_t size = type->tp_basicsize;
298-
299-
if (posix_memalign(&mem, 128, size) != 0) {
300-
PyErr_NoMemory();
301-
return NULL;
302-
}
303-
304-
self = (PyCUtensorMapObject *)mem;
305-
PyObject_INIT(self, type);
306-
return (PyObject *)self;
307-
}
308-
309-
static void PyCUtensorMap_dealloc(PyObject *self) {
310-
Py_TYPE(self)->tp_free(self);
311-
}
312-
313-
static void PyCUtensorMap_free(void *ptr) { free(ptr); }
314-
315-
// clang-format off
316-
static PyTypeObject PyCUtensorMapType = {
317-
PyVarObject_HEAD_INIT(NULL, 0)
318-
.tp_name = "triton.backends.nvidia.PyCUtensorMap",
319-
.tp_basicsize = sizeof(PyCUtensorMapObject),
320-
.tp_itemsize = 0,
321-
.tp_flags = Py_TPFLAGS_DEFAULT,
322-
.tp_doc = "<PyCUtensorMap object>",
323-
.tp_new = PyType_GenericNew,
324-
.tp_alloc = PyCUtensorMap_alloc,
325-
.tp_dealloc = (destructor)PyCUtensorMap_dealloc,
326-
.tp_free = PyCUtensorMap_free,
327-
};
328-
// clang-format on
329-
330282
static PyObject *fillTMADescriptor(PyObject *self, PyObject *args) {
283+
unsigned long long desc_address;
331284
unsigned long long global_address;
332285
int swizzle;
333286
int elemSize;
@@ -337,20 +290,16 @@ static PyObject *fillTMADescriptor(PyObject *self, PyObject *args) {
337290
PyObject *strides;
338291
int padding;
339292

340-
if (!PyArg_ParseTuple(args, "KiiiOOOi", &global_address, &swizzle, &elemSize,
341-
&elemType, &blockSize, &shape, &strides, &padding)) {
342-
return NULL;
343-
}
344-
345-
PyCUtensorMapObject *desc = (PyCUtensorMapObject *)PyObject_CallObject(
346-
(PyObject *)&PyCUtensorMapType, NULL);
347-
if (!desc) {
293+
if (!PyArg_ParseTuple(args, "KKiiiOOOi", &desc_address, &global_address,
294+
&swizzle, &elemSize, &elemType, &blockSize, &shape,
295+
&strides, &padding)) {
348296
return NULL;
349297
}
350298

351299
PyObject *blockSizeFast = NULL;
352300
PyObject *shapeFast = NULL;
353301
PyObject *stridesFast = NULL;
302+
PyObject *result = NULL;
354303

355304
uint32_t blockSizeInt[5];
356305
uint64_t shapeInt[5];
@@ -421,18 +370,17 @@ static PyObject *fillTMADescriptor(PyObject *self, PyObject *args) {
421370
INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled,
422371
getCuTensorMapEncodeTiledHandle);
423372
CUDA_CHECK_AND_RETURN_NULL(cuTensorMapEncodeTiled(
424-
&desc->tensorMap, elemType, rank, (void *)global_address, shapeInt,
425-
stridesLL, blockSizeInt, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,
426-
swizzle, CU_TENSOR_MAP_L2_PROMOTION_L2_128B, fill));
427-
428-
return (PyObject *)desc;
373+
(CUtensorMap *)desc_address, elemType, rank, (void *)global_address,
374+
shapeInt, stridesLL, blockSizeInt, elementStrides,
375+
CU_TENSOR_MAP_INTERLEAVE_NONE, swizzle,
376+
CU_TENSOR_MAP_L2_PROMOTION_L2_128B, fill));
377+
Py_RETURN_NONE;
429378

430379
cleanup:
431380
Py_XDECREF(blockSizeFast);
432381
Py_XDECREF(shapeFast);
433382
Py_XDECREF(stridesFast);
434-
Py_XDECREF(desc);
435-
return NULL;
383+
return result;
436384
}
437385

438386
// Simple helper to experiment creating TMA descriptors on the host.
@@ -478,8 +426,6 @@ static PyObject *fill1DTMADescriptor(PyObject *self, PyObject *args) {
478426
CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE));
479427
Py_INCREF(Py_None);
480428
return Py_None;
481-
cleanup:
482-
return NULL;
483429
}
484430

485431
// Simple helper to experiment creating TMA descriptors on the host.
@@ -544,8 +490,6 @@ static PyObject *fill2DTMADescriptor(PyObject *self, PyObject *args) {
544490
CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE));
545491
Py_INCREF(Py_None);
546492
return Py_None;
547-
cleanup:
548-
return NULL;
549493
}
550494

551495
// Simple helper to experiment creating TMA descriptors on the host.
@@ -601,8 +545,6 @@ static PyObject *fill1DTMADescriptorType(PyObject *self, PyObject *args) {
601545
Py_INCREF(Py_None);
602546
#endif
603547
return Py_None;
604-
cleanup:
605-
return NULL;
606548
}
607549

608550
// Simple helper to experiment creating TMA descriptors on the host.
@@ -677,8 +619,6 @@ static PyObject *fill2DTMADescriptorType(PyObject *self, PyObject *args) {
677619
Py_INCREF(Py_None);
678620
#endif
679621
return Py_None;
680-
cleanup:
681-
return NULL;
682622
}
683623

684624
static PyMethodDef ModuleMethods[] = {
@@ -711,18 +651,12 @@ static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "cuda_utils",
711651
ModuleMethods};
712652

713653
PyMODINIT_FUNC PyInit_cuda_utils(void) {
714-
if (PyType_Ready(&PyCUtensorMapType) < 0) {
715-
return NULL;
716-
}
717-
718654
PyObject *m = PyModule_Create(&ModuleDef);
719655
if (m == NULL) {
720656
return NULL;
721657
}
722658

723659
PyModule_AddFunctions(m, ModuleMethods);
724-
Py_INCREF(&PyCUtensorMapType);
725-
PyModule_AddObject(m, "PyCUtensorMap", (PyObject *)&PyCUtensorMapType);
726660

727661
return m;
728662
}

0 commit comments

Comments (0)