@@ -479,290 +479,6 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
479479 return ret ;
480480}
481481
482- #pragma push_macro("__INTRINSIC_LOAD")
483- #define __INTRINSIC_LOAD (__FnName , __AsmOp , __DeclType , __TmpType , __AsmType , \
484- __Clobber ) \
485- inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
486- __TmpType __ret; \
487- asm(__AsmOp " %0, [%1];" : __AsmType(__ret) : "l"(__ptr)__Clobber); \
488- return (__DeclType)__ret; \
489- }
490-
491- #pragma push_macro("__INTRINSIC_LOAD2")
492- #define __INTRINSIC_LOAD2 (__FnName , __AsmOp , __DeclType , __TmpType , __AsmType , \
493- __Clobber ) \
494- inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
495- __DeclType __ret; \
496- __TmpType __tmp; \
497- asm(__AsmOp " {%0,%1}, [%2];" \
498- : __AsmType(__tmp.x), __AsmType(__tmp.y) \
499- : "l"(__ptr)__Clobber); \
500- using __ElementType = decltype(__ret.x); \
501- __ret.x = (__ElementType)(__tmp.x); \
502- __ret.y = (__ElementType)__tmp.y; \
503- return __ret; \
504- }
505-
506- #pragma push_macro("__INTRINSIC_LOAD4")
507- #define __INTRINSIC_LOAD4 (__FnName , __AsmOp , __DeclType , __TmpType , __AsmType , \
508- __Clobber ) \
509- inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
510- __DeclType __ret; \
511- __TmpType __tmp; \
512- asm(__AsmOp " {%0,%1,%2,%3}, [%4];" \
513- : __AsmType(__tmp.x), __AsmType(__tmp.y), __AsmType(__tmp.z), \
514- __AsmType(__tmp.w) \
515- : "l"(__ptr)__Clobber); \
516- using __ElementType = decltype(__ret.x); \
517- __ret.x = (__ElementType)__tmp.x; \
518- __ret.y = (__ElementType)__tmp.y; \
519- __ret.z = (__ElementType)__tmp.z; \
520- __ret.w = (__ElementType)__tmp.w; \
521- return __ret; \
522- }
523-
524- __INTRINSIC_LOAD (__ldcg , "ld.global.cg.s8" , char , unsigned int , "=r" , );
525- __INTRINSIC_LOAD (__ldcg , "ld.global.cg.s8" , signed char , unsigned int , "=r" , );
526- __INTRINSIC_LOAD (__ldcg , "ld.global.cg.s16" , short , unsigned short , "=h" , );
527- __INTRINSIC_LOAD (__ldcg , "ld.global.cg.s32" , int , unsigned int , "=r" , );
528- __INTRINSIC_LOAD (__ldcg , "ld.global.cg.s64" , long long , unsigned long long,
529- "=l" , );
530-
531- __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.s8" , char2 , int2 , "=r" , );
532- __INTRINSIC_LOAD4 (__ldcg , "ld.global.cg.v4.s8" , char4 , int4 , "=r" , );
533- __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.s16" , short2 , short2 , "=h" , );
534- __INTRINSIC_LOAD4 (__ldcg , "ld.global.cg.v4.s16" , short4 , short4 , "=h" , );
535- __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.s32" , int2 , int2 , "=r" , );
536- __INTRINSIC_LOAD4 (__ldcg , "ld.global.cg.v4.s32" , int4 , int4 , "=r" , );
537- __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.s64 " , longlong2 , longlong2 , "=l" , );
538-
539- __INTRINSIC_LOAD (__ldcg , "ld.global.cg.u8" , unsigned char , unsigned int ,
540- "=r" , );
541- __INTRINSIC_LOAD (__ldcg , "ld.global.cg.u16" , unsigned short , unsigned short ,
542- "=h" , );
543- __INTRINSIC_LOAD (__ldcg , "ld.global.cg.u32" , unsigned int , unsigned int ,
544- "=r" , );
545- __INTRINSIC_LOAD (__ldcg , "ld.global.cg.u64" , unsigned long long,
546- unsigned long long, "=l" , );
547-
548- __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.u8" , uchar2 , int2 , "=r" , );
549- __INTRINSIC_LOAD4 (__ldcg , "ld.global.cg.v4.u8" , uchar4 , int4 , "=r" , );
550- __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.u16" , ushort2 , ushort2 , "=h" , );
551- __INTRINSIC_LOAD4 (__ldcg , "ld.global.cg.v4.u16" , ushort4 , ushort4 , "=h" , );
552- __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.u32" , uint2 , uint2 , "=r" , );
553- __INTRINSIC_LOAD4 (__ldcg , "ld.global.cg.v4.u32" , uint4 , uint4 , "=r" , );
554- __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.u64" , ulonglong2 , ulonglong2 ,
555- "=l" , );
556-
557- __INTRINSIC_LOAD (__ldcg , "ld.global.cg.f32" , float , float , "=f" , );
558- __INTRINSIC_LOAD (__ldcg , "ld.global.cg.f64" , double , double , "=d" , );
559- __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.f32" , float2 , float2 , "=f" , );
560- __INTRINSIC_LOAD4 (__ldcg , "ld.global.cg.v4.f32" , float4 , float4 , "=f" , );
561- __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.f64" , double2 , double2 , "=d" , );
562-
563- inline __device__ long __ldcg (const long * __ptr ) {
564- unsigned long __ret ;
565- if (sizeof (long ) == 8 ) {
566- asm("ld.global.cg.s64 %0, [%1];" : "=l" (__ret ) : "l" (__ptr ));
567- } else {
568- asm("ld.global.cg.s32 %0, [%1];" : "=r" (__ret ) : "l" (__ptr ));
569- }
570- return (long )__ret ;
571- }
572-
573- __INTRINSIC_LOAD (__ldcv , "ld.global.cv.u8" , unsigned char , unsigned int ,
574- "=r" , : "memory" );
575- __INTRINSIC_LOAD (__ldcv , "ld.global.cv.u16" , unsigned short , unsigned short ,
576- "=h" , : "memory" );
577- __INTRINSIC_LOAD (__ldcv , "ld.global.cv.u32" , unsigned int , unsigned int ,
578- "=r" , : "memory" );
579- __INTRINSIC_LOAD (__ldcv , "ld.global.cv.u64" , unsigned long long,
580- unsigned long long, "=l" , : "memory" );
581-
582- __INTRINSIC_LOAD (__ldcv , "ld.global.cv.s8" , char , unsigned int ,
583- "=r" , : "memory" );
584- __INTRINSIC_LOAD (__ldcv , "ld.global.cv.s8" , signed char , unsigned int ,
585- "=r" , : "memory" );
586- __INTRINSIC_LOAD (__ldcv , "ld.global.cv.s16" , short , unsigned short ,
587- "=h" , : "memory" );
588- __INTRINSIC_LOAD (__ldcv , "ld.global.cv.s32" , int , unsigned int ,
589- "=r" , : "memory" );
590- __INTRINSIC_LOAD (__ldcv , "ld.global.cv.s64" , long long , unsigned long long,
591- "=l" , : "memory" );
592-
593- __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.u8" , uchar2 , uint2 ,
594- "=r" , : "memory" );
595- __INTRINSIC_LOAD4 (__ldcv , "ld.global.cv.v4.u8" , uchar4 , uint4 ,
596- "=r" , : "memory" );
597- __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.u16" , ushort2 , ushort2 ,
598- "=h" , : "memory" );
599- __INTRINSIC_LOAD4 (__ldcv , "ld.global.cv.v4.u16" , ushort4 , ushort4 ,
600- "=h" , : "memory" );
601- __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.u32" , uint2 , uint2 ,
602- "=r" , : "memory" );
603- __INTRINSIC_LOAD4 (__ldcv , "ld.global.cv.v4.u32" , uint4 , uint4 ,
604- "=r" , : "memory" );
605- __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.u64" , ulonglong2 , ulonglong2 ,
606- "=l" , : "memory" );
607-
608- __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.s8" , char2 , int2 , "=r" , : "memory" );
609- __INTRINSIC_LOAD4 (__ldcv , "ld.global.cv.v4.s8" , char4 , int4 , "=r" , : "memory" );
610- __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.s16" , short2 , short2 ,
611- "=h" , : "memory" );
612- __INTRINSIC_LOAD4 (__ldcv , "ld.global.cv.v4.s16" , short4 , short4 ,
613- "=h" , : "memory" );
614- __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.s32" , int2 , int2 , "=r" , : "memory" );
615- __INTRINSIC_LOAD4 (__ldcv , "ld.global.cv.v4.s32" , int4 , int4 , "=r" , : "memory" );
616- __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.s64" , longlong2 , longlong2 ,
617- "=l" , : "memory" );
618-
619- __INTRINSIC_LOAD (__ldcv , "ld.global.cv.f32" , float , float , "=f" , : "memory" );
620- __INTRINSIC_LOAD (__ldcv , "ld.global.cv.f64" , double , double , "=d" , : "memory" );
621-
622- __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.f32" , float2 , float2 ,
623- "=f" , : "memory" );
624- __INTRINSIC_LOAD4 (__ldcv , "ld.global.cv.v4.f32" , float4 , float4 ,
625- "=f" , : "memory" );
626- __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.f64" , double2 , double2 ,
627- "=d" , : "memory" );
628-
629- inline __device__ long __ldcv (const long * __ptr ) {
630- unsigned long __ret ;
631- if (sizeof (long ) == 8 ) {
632- asm("ld.global.cv.s64 %0, [%1];" : "=l" (__ret ) : "l" (__ptr ));
633- } else {
634- asm("ld.global.cv.s32 %0, [%1];" : "=r" (__ret ) : "l" (__ptr ));
635- }
636- return (long )__ret ;
637- }
638-
639- __INTRINSIC_LOAD (__ldcs , "ld.global.cs.s8" , char , unsigned int , "=r" , );
640- __INTRINSIC_LOAD (__ldcs , "ld.global.cs.s8" , signed char , signed int , "=r" , );
641- __INTRINSIC_LOAD (__ldcs , "ld.global.cs.s16" , short , unsigned short , "=h" , );
642- __INTRINSIC_LOAD (__ldcs , "ld.global.cs.s32" , int , unsigned int , "=r" , );
643- __INTRINSIC_LOAD (__ldcs , "ld.global.cs.s64" , long long , unsigned long long,
644- "=l" , );
645-
646- __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.s8" , char2 , int2 , "=r" , );
647- __INTRINSIC_LOAD4 (__ldcs , "ld.global.cs.v4.s8" , char4 , int4 , "=r" , );
648- __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.s16" , short2 , short2 , "=h" , );
649- __INTRINSIC_LOAD4 (__ldcs , "ld.global.cs.v4.s16" , short4 , short4 , "=h" , );
650- __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.s32" , int2 , int2 , "=r" , );
651- __INTRINSIC_LOAD4 (__ldcs , "ld.global.cs.v4.s32" , int4 , int4 , "=r" , );
652- __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.s64" , longlong2 , longlong2 , "=l" , );
653-
654- __INTRINSIC_LOAD (__ldcs , "ld.global.cs.u8" , unsigned char , unsigned int ,
655- "=r" , );
656- __INTRINSIC_LOAD (__ldcs , "ld.global.cs.u16" , unsigned short , unsigned short ,
657- "=h" , );
658- __INTRINSIC_LOAD (__ldcs , "ld.global.cs.u32" , unsigned int , unsigned int ,
659- "=r" , );
660- __INTRINSIC_LOAD (__ldcs , "ld.global.cs.u64" , unsigned long long,
661- unsigned long long, "=l" , );
662-
663- __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.u8" , uchar2 , uint2 , "=r" , );
664- __INTRINSIC_LOAD4 (__ldcs , "ld.global.cs.v4.u8" , uchar4 , uint4 , "=r" , );
665- __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.u16" , ushort2 , ushort2 , "=h" , );
666- __INTRINSIC_LOAD4 (__ldcs , "ld.global.cs.v4.u16" , ushort4 , ushort4 , "=h" , );
667- __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.u32" , uint2 , uint2 , "=r" , );
668- __INTRINSIC_LOAD4 (__ldcs , "ld.global.cs.v4.u32" , uint4 , uint4 , "=r" , );
669- __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.u64" , ulonglong2 , ulonglong2 ,
670- "=l" , );
671-
672- __INTRINSIC_LOAD (__ldcs , "ld.global.cs.f32" , float , float , "=f" , );
673- __INTRINSIC_LOAD (__ldcs , "ld.global.cs.f64" , double , double , "=d" , );
674- __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.f32" , float2 , float2 , "=f" , );
675- __INTRINSIC_LOAD4 (__ldcs , "ld.global.cs.v4.f32" , float4 , float4 , "=f" , );
676- __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.f64" , double2 , double2 , "=d" , );
677-
678- #pragma pop_macro("__INTRINSIC_LOAD")
679- #pragma pop_macro("__INTRINSIC_LOAD2")
680- #pragma pop_macro("__INTRINSIC_LOAD4")
681-
682- inline __device__ long __ldcs (const long * __ptr ) {
683- unsigned long __ret ;
684- if (sizeof (long ) == 8 ) {
685- asm("ld.global.cs.s64 %0, [%1];" : "=l" (__ret ) : "l" (__ptr ));
686- } else {
687- asm("ld.global.cs.s32 %0, [%1];" : "=r" (__ret ) : "l" (__ptr ));
688- }
689- return (long )__ret ;
690- }
691-
692- #pragma push_macro("__INTRINSIC_STORE")
693- #define __INTRINSIC_STORE (__FnName , __AsmOp , __DeclType , __TmpType , __AsmType ) \
694- inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
695- __TmpType __tmp = (__TmpType)__value; \
696- asm(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory"); \
697- }
698-
699- #pragma push_macro("__INTRINSIC_STORE2")
700- #define __INTRINSIC_STORE2 (__FnName , __AsmOp , __DeclType , __TmpType , \
701- __AsmType ) \
702- inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
703- __TmpType __tmp; \
704- using __ElementType = decltype(__tmp.x); \
705- __tmp.x = (__ElementType)(__value.x); \
706- __tmp.y = (__ElementType)(__value.y); \
707- asm(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x), \
708- __AsmType(__tmp.y) \
709- : "memory"); \
710- }
711-
712- #pragma push_macro("__INTRINSIC_STORE4")
713- #define __INTRINSIC_STORE4 (__FnName , __AsmOp , __DeclType , __TmpType , \
714- __AsmType ) \
715- inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
716- __TmpType __tmp; \
717- using __ElementType = decltype(__tmp.x); \
718- __tmp.x = (__ElementType)(__value.x); \
719- __tmp.y = (__ElementType)(__value.y); \
720- __tmp.z = (__ElementType)(__value.z); \
721- __tmp.w = (__ElementType)(__value.w); \
722- asm(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x), \
723- __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w) \
724- : "memory"); \
725- }
726-
727- __INTRINSIC_STORE (__stwt , "st.global.wt.s8" , char , int , "r" );
728- __INTRINSIC_STORE (__stwt , "st.global.wt.s8" , signed char , int , "r" );
729- __INTRINSIC_STORE (__stwt , "st.global.wt.s16" , short , short , "h" );
730- __INTRINSIC_STORE (__stwt , "st.global.wt.s32" , int , int , "r" );
731- __INTRINSIC_STORE (__stwt , "st.global.wt.s64" , long long , long long , "l" );
732-
733- __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.s8" , char2 , int2 , "r" );
734- __INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.s8" , char4 , int4 , "r" );
735- __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.s16" , short2 , short2 , "h" );
736- __INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.s16" , short4 , short4 , "h" );
737- __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.s32" , int2 , int2 , "r" );
738- __INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.s32" , int4 , int4 , "r" );
739- __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.s64" , longlong2 , longlong2 , "l" );
740-
741- __INTRINSIC_STORE (__stwt , "st.global.wt.u8" , unsigned char , int , "r" );
742- __INTRINSIC_STORE (__stwt , "st.global.wt.u16" , unsigned short , unsigned short ,
743- "h" );
744- __INTRINSIC_STORE (__stwt , "st.global.wt.u32" , unsigned int , unsigned int , "r" );
745- __INTRINSIC_STORE (__stwt , "st.global.wt.u64" , unsigned long long,
746- unsigned long long, "l" );
747-
748- __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.u8" , uchar2 , uchar2 , "r" );
749- __INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.u8" , uchar4 , uint4 , "r" );
750- __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.u16" , ushort2 , ushort2 , "h" );
751- __INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.u16" , ushort4 , ushort4 , "h" );
752- __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.u32" , uint2 , uint2 , "r" );
753- __INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.u32" , uint4 , uint4 , "r" );
754- __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.u64" , ulonglong2 , ulonglong2 , "l" );
755-
756- __INTRINSIC_STORE (__stwt , "st.global.wt.f32" , float , float , "f" );
757- __INTRINSIC_STORE (__stwt , "st.global.wt.f64" , double , double , "d" );
758- __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.f32" , float2 , float2 , "f" );
759- __INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.f32" , float4 , float4 , "f" );
760- __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.f64" , double2 , double2 , "d" );
761-
762- #pragma pop_macro("__INTRINSIC_STORE")
763- #pragma pop_macro("__INTRINSIC_STORE2")
764- #pragma pop_macro("__INTRINSIC_STORE4")
765-
766482#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
767483
768484#if CUDA_VERSION >= 11000
0 commit comments