11#include "cuda.h"
22#include <dlfcn.h>
33#include <stdbool.h>
4- #include <stdlib.h>
54#define PY_SSIZE_T_CLEAN
65#include <Python.h>
76
8- typedef struct {
9- PyObject_HEAD ;
10- _Alignas(128 ) CUtensorMap tensorMap ;
11- } PyCUtensorMapObject ;
12-
137// Raises a Python exception and returns false if code is not CUDA_SUCCESS.
148static bool gpuAssert (CUresult code , const char * file , int line ) {
159 if (code == CUDA_SUCCESS )
@@ -32,7 +26,7 @@ static bool gpuAssert(CUresult code, const char *file, int line) {
3226#define CUDA_CHECK_AND_RETURN_NULL (ans ) \
3327 do { \
3428 if (!gpuAssert((ans), __FILE__, __LINE__)) \
35- goto cleanup; \
29+ return NULL; \
3630 } while (0)
3731
3832// To be used inside a Py_{BEGIN,END}_ALLOW_THREADS block.
@@ -50,7 +44,7 @@ static bool gpuAssert(CUresult code, const char *file, int line) {
5044 if ((funcPointer) == NULL) { \
5145 (funcPointer) = (initializerFunction)(); \
5246 if ((funcPointer) == NULL) { \
53- goto cleanup; \
47+ return NULL; \
5448 } \
5549 } \
5650 } while (0)
@@ -93,9 +87,6 @@ static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
9387 warp_size , "sm_clock_rate" , sm_clock_rate ,
9488 "mem_clock_rate" , mem_clock_rate , "mem_bus_width" ,
9589 mem_bus_width );
96-
97- cleanup :
98- return NULL ;
9990}
10091
10192static PyObject * loadBinary (PyObject * self , PyObject * args ) {
@@ -247,9 +238,6 @@ static PyObject *occupancyMaxActiveClusters(PyObject *self, PyObject *args) {
247238 cuOccupancyMaxActiveClusters (& maxActiveClusters , func , & config ));
248239 Py_END_ALLOW_THREADS ;
249240 return PyLong_FromLong (maxActiveClusters );
250-
251- cleanup :
252- return NULL ;
253241}
254242
255243static PyObject * setPrintfFifoSize (PyObject * self , PyObject * args ) {
@@ -291,43 +279,8 @@ static PyObject *setPrintfFifoSize(PyObject *self, PyObject *args) {
291279 Py_RETURN_NONE ;
292280}
293281
294- static PyObject * PyCUtensorMap_alloc (PyTypeObject * type , Py_ssize_t n_items ) {
295- PyCUtensorMapObject * self = NULL ;
296- void * mem = NULL ;
297- size_t size = type -> tp_basicsize ;
298-
299- if (posix_memalign (& mem , 128 , size ) != 0 ) {
300- PyErr_NoMemory ();
301- return NULL ;
302- }
303-
304- self = (PyCUtensorMapObject * )mem ;
305- PyObject_INIT (self , type );
306- return (PyObject * )self ;
307- }
308-
309- static void PyCUtensorMap_dealloc (PyObject * self ) {
310- Py_TYPE (self )-> tp_free (self );
311- }
312-
313- static void PyCUtensorMap_free (void * ptr ) { free (ptr ); }
314-
315- // clang-format off
316- static PyTypeObject PyCUtensorMapType = {
317- PyVarObject_HEAD_INIT (NULL , 0 )
318- .tp_name = "triton.backends.nvidia.PyCUtensorMap" ,
319- .tp_basicsize = sizeof (PyCUtensorMapObject ),
320- .tp_itemsize = 0 ,
321- .tp_flags = Py_TPFLAGS_DEFAULT ,
322- .tp_doc = "<PyCUtensorMap object>" ,
323- .tp_new = PyType_GenericNew ,
324- .tp_alloc = PyCUtensorMap_alloc ,
325- .tp_dealloc = (destructor )PyCUtensorMap_dealloc ,
326- .tp_free = PyCUtensorMap_free ,
327- };
328- // clang-format on
329-
330282static PyObject * fillTMADescriptor (PyObject * self , PyObject * args ) {
283+ unsigned long long desc_address ;
331284 unsigned long long global_address ;
332285 int swizzle ;
333286 int elemSize ;
@@ -337,20 +290,16 @@ static PyObject *fillTMADescriptor(PyObject *self, PyObject *args) {
337290 PyObject * strides ;
338291 int padding ;
339292
340- if (!PyArg_ParseTuple (args , "KiiiOOOi" , & global_address , & swizzle , & elemSize ,
341- & elemType , & blockSize , & shape , & strides , & padding )) {
342- return NULL ;
343- }
344-
345- PyCUtensorMapObject * desc = (PyCUtensorMapObject * )PyObject_CallObject (
346- (PyObject * )& PyCUtensorMapType , NULL );
347- if (!desc ) {
293+ if (!PyArg_ParseTuple (args , "KKiiiOOOi" , & desc_address , & global_address ,
294+ & swizzle , & elemSize , & elemType , & blockSize , & shape ,
295+ & strides , & padding )) {
348296 return NULL ;
349297 }
350298
351299 PyObject * blockSizeFast = NULL ;
352300 PyObject * shapeFast = NULL ;
353301 PyObject * stridesFast = NULL ;
302+ PyObject * result = NULL ;
354303
355304 uint32_t blockSizeInt [5 ];
356305 uint64_t shapeInt [5 ];
@@ -421,18 +370,17 @@ static PyObject *fillTMADescriptor(PyObject *self, PyObject *args) {
421370 INITIALIZE_FUNCTION_POINTER_IF_NULL (cuTensorMapEncodeTiled ,
422371 getCuTensorMapEncodeTiledHandle );
423372 CUDA_CHECK_AND_RETURN_NULL (cuTensorMapEncodeTiled (
424- & desc -> tensorMap , elemType , rank , (void * )global_address , shapeInt ,
425- stridesLL , blockSizeInt , elementStrides , CU_TENSOR_MAP_INTERLEAVE_NONE ,
426- swizzle , CU_TENSOR_MAP_L2_PROMOTION_L2_128B , fill ));
427-
428- return ( PyObject * ) desc ;
373+ ( CUtensorMap * ) desc_address , elemType , rank , (void * )global_address ,
374+ shapeInt , stridesLL , blockSizeInt , elementStrides ,
375+ CU_TENSOR_MAP_INTERLEAVE_NONE , swizzle ,
376+ CU_TENSOR_MAP_L2_PROMOTION_L2_128B , fill ));
377+ Py_RETURN_NONE ;
429378
430379cleanup :
431380 Py_XDECREF (blockSizeFast );
432381 Py_XDECREF (shapeFast );
433382 Py_XDECREF (stridesFast );
434- Py_XDECREF (desc );
435- return NULL ;
383+ return result ;
436384}
437385
438386// Simple helper to experiment creating TMA descriptors on the host.
@@ -478,8 +426,6 @@ static PyObject *fill1DTMADescriptor(PyObject *self, PyObject *args) {
478426 CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE ));
479427 Py_INCREF (Py_None );
480428 return Py_None ;
481- cleanup :
482- return NULL ;
483429}
484430
485431// Simple helper to experiment creating TMA descriptors on the host.
@@ -544,8 +490,6 @@ static PyObject *fill2DTMADescriptor(PyObject *self, PyObject *args) {
544490 CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE ));
545491 Py_INCREF (Py_None );
546492 return Py_None ;
547- cleanup :
548- return NULL ;
549493}
550494
551495// Simple helper to experiment creating TMA descriptors on the host.
@@ -601,8 +545,6 @@ static PyObject *fill1DTMADescriptorType(PyObject *self, PyObject *args) {
601545 Py_INCREF (Py_None );
602546#endif
603547 return Py_None ;
604- cleanup :
605- return NULL ;
606548}
607549
608550// Simple helper to experiment creating TMA descriptors on the host.
@@ -677,8 +619,6 @@ static PyObject *fill2DTMADescriptorType(PyObject *self, PyObject *args) {
677619 Py_INCREF (Py_None );
678620#endif
679621 return Py_None ;
680- cleanup :
681- return NULL ;
682622}
683623
684624static PyMethodDef ModuleMethods [] = {
@@ -711,18 +651,12 @@ static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "cuda_utils",
711651 ModuleMethods };
712652
713653PyMODINIT_FUNC PyInit_cuda_utils (void ) {
714- if (PyType_Ready (& PyCUtensorMapType ) < 0 ) {
715- return NULL ;
716- }
717-
718654 PyObject * m = PyModule_Create (& ModuleDef );
719655 if (m == NULL ) {
720656 return NULL ;
721657 }
722658
723659 PyModule_AddFunctions (m , ModuleMethods );
724- Py_INCREF (& PyCUtensorMapType );
725- PyModule_AddObject (m , "PyCUtensorMap" , (PyObject * )& PyCUtensorMapType );
726660
727661 return m ;
728662}
0 commit comments