@@ -479,6 +479,290 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
479479 return ret ;
480480}
481481
482+ #pragma push_macro("__INTRINSIC_LOAD")
483+ #define __INTRINSIC_LOAD (__FnName , __AsmOp , __DeclType , __TmpType , __AsmType , \
484+ __Clobber ) \
485+ inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
486+ __TmpType __ret; \
487+ asm(__AsmOp " %0, [%1];" : __AsmType(__ret) : "l"(__ptr)__Clobber); \
488+ return (__DeclType)__ret; \
489+ }
490+
491+ #pragma push_macro("__INTRINSIC_LOAD2")
492+ #define __INTRINSIC_LOAD2 (__FnName , __AsmOp , __DeclType , __TmpType , __AsmType , \
493+ __Clobber ) \
494+ inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
495+ __DeclType __ret; \
496+ __TmpType __tmp; \
497+ asm(__AsmOp " {%0,%1}, [%2];" \
498+ : __AsmType(__tmp.x), __AsmType(__tmp.y) \
499+ : "l"(__ptr)__Clobber); \
500+ using __ElementType = decltype(__ret.x); \
501+ __ret.x = (__ElementType)(__tmp.x); \
502+ __ret.y = (__ElementType)__tmp.y; \
503+ return __ret; \
504+ }
505+
506+ #pragma push_macro("__INTRINSIC_LOAD4")
507+ #define __INTRINSIC_LOAD4 (__FnName , __AsmOp , __DeclType , __TmpType , __AsmType , \
508+ __Clobber ) \
509+ inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
510+ __DeclType __ret; \
511+ __TmpType __tmp; \
512+ asm(__AsmOp " {%0,%1,%2,%3}, [%4];" \
513+ : __AsmType(__tmp.x), __AsmType(__tmp.y), __AsmType(__tmp.z), \
514+ __AsmType(__tmp.w) \
515+ : "l"(__ptr)__Clobber); \
516+ using __ElementType = decltype(__ret.x); \
517+ __ret.x = (__ElementType)__tmp.x; \
518+ __ret.y = (__ElementType)__tmp.y; \
519+ __ret.z = (__ElementType)__tmp.z; \
520+ __ret.w = (__ElementType)__tmp.w; \
521+ return __ret; \
522+ }
523+
524+ __INTRINSIC_LOAD (__ldcg , "ld.global.cg.s8" , char , unsigned int , "=r" , );
525+ __INTRINSIC_LOAD (__ldcg , "ld.global.cg.s8" , signed char , unsigned int , "=r" , );
526+ __INTRINSIC_LOAD (__ldcg , "ld.global.cg.s16" , short , unsigned short , "=h" , );
527+ __INTRINSIC_LOAD (__ldcg , "ld.global.cg.s32" , int , unsigned int , "=r" , );
528+ __INTRINSIC_LOAD (__ldcg , "ld.global.cg.s64" , long long , unsigned long long,
529+ "=l" , );
530+
531+ __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.s8" , char2 , int2 , "=r" , );
532+ __INTRINSIC_LOAD4 (__ldcg , "ld.global.cg.v4.s8" , char4 , int4 , "=r" , );
533+ __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.s16" , short2 , short2 , "=h" , );
534+ __INTRINSIC_LOAD4 (__ldcg , "ld.global.cg.v4.s16" , short4 , short4 , "=h" , );
535+ __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.s32" , int2 , int2 , "=r" , );
536+ __INTRINSIC_LOAD4 (__ldcg , "ld.global.cg.v4.s32" , int4 , int4 , "=r" , );
537+ __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.s64 " , longlong2 , longlong2 , "=l" , );
538+
539+ __INTRINSIC_LOAD (__ldcg , "ld.global.cg.u8" , unsigned char , unsigned int ,
540+ "=r" , );
541+ __INTRINSIC_LOAD (__ldcg , "ld.global.cg.u16" , unsigned short , unsigned short ,
542+ "=h" , );
543+ __INTRINSIC_LOAD (__ldcg , "ld.global.cg.u32" , unsigned int , unsigned int ,
544+ "=r" , );
545+ __INTRINSIC_LOAD (__ldcg , "ld.global.cg.u64" , unsigned long long,
546+ unsigned long long, "=l" , );
547+
548+ __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.u8" , uchar2 , int2 , "=r" , );
549+ __INTRINSIC_LOAD4 (__ldcg , "ld.global.cg.v4.u8" , uchar4 , int4 , "=r" , );
550+ __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.u16" , ushort2 , ushort2 , "=h" , );
551+ __INTRINSIC_LOAD4 (__ldcg , "ld.global.cg.v4.u16" , ushort4 , ushort4 , "=h" , );
552+ __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.u32" , uint2 , uint2 , "=r" , );
553+ __INTRINSIC_LOAD4 (__ldcg , "ld.global.cg.v4.u32" , uint4 , uint4 , "=r" , );
554+ __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.u64" , ulonglong2 , ulonglong2 ,
555+ "=l" , );
556+
557+ __INTRINSIC_LOAD (__ldcg , "ld.global.cg.f32" , float , float , "=f" , );
558+ __INTRINSIC_LOAD (__ldcg , "ld.global.cg.f64" , double , double , "=d" , );
559+ __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.f32" , float2 , float2 , "=f" , );
560+ __INTRINSIC_LOAD4 (__ldcg , "ld.global.cg.v4.f32" , float4 , float4 , "=f" , );
561+ __INTRINSIC_LOAD2 (__ldcg , "ld.global.cg.v2.f64" , double2 , double2 , "=d" , );
562+
563+ inline __device__ long __ldcg (const long * __ptr ) {
564+ unsigned long __ret ;
565+ if (sizeof (long ) == 8 ) {
566+ asm("ld.global.cg.s64 %0, [%1];" : "=l" (__ret ) : "l" (__ptr ));
567+ } else {
568+ asm("ld.global.cg.s32 %0, [%1];" : "=r" (__ret ) : "l" (__ptr ));
569+ }
570+ return (long )__ret ;
571+ }
572+
573+ __INTRINSIC_LOAD (__ldcv , "ld.global.cv.u8" , unsigned char , unsigned int ,
574+ "=r" , : "memory" );
575+ __INTRINSIC_LOAD (__ldcv , "ld.global.cv.u16" , unsigned short , unsigned short ,
576+ "=h" , : "memory" );
577+ __INTRINSIC_LOAD (__ldcv , "ld.global.cv.u32" , unsigned int , unsigned int ,
578+ "=r" , : "memory" );
579+ __INTRINSIC_LOAD (__ldcv , "ld.global.cv.u64" , unsigned long long,
580+ unsigned long long, "=l" , : "memory" );
581+
582+ __INTRINSIC_LOAD (__ldcv , "ld.global.cv.s8" , char , unsigned int ,
583+ "=r" , : "memory" );
584+ __INTRINSIC_LOAD (__ldcv , "ld.global.cv.s8" , signed char , unsigned int ,
585+ "=r" , : "memory" );
586+ __INTRINSIC_LOAD (__ldcv , "ld.global.cv.s16" , short , unsigned short ,
587+ "=h" , : "memory" );
588+ __INTRINSIC_LOAD (__ldcv , "ld.global.cv.s32" , int , unsigned int ,
589+ "=r" , : "memory" );
590+ __INTRINSIC_LOAD (__ldcv , "ld.global.cv.s64" , long long , unsigned long long,
591+ "=l" , : "memory" );
592+
593+ __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.u8" , uchar2 , uint2 ,
594+ "=r" , : "memory" );
595+ __INTRINSIC_LOAD4 (__ldcv , "ld.global.cv.v4.u8" , uchar4 , uint4 ,
596+ "=r" , : "memory" );
597+ __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.u16" , ushort2 , ushort2 ,
598+ "=h" , : "memory" );
599+ __INTRINSIC_LOAD4 (__ldcv , "ld.global.cv.v4.u16" , ushort4 , ushort4 ,
600+ "=h" , : "memory" );
601+ __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.u32" , uint2 , uint2 ,
602+ "=r" , : "memory" );
603+ __INTRINSIC_LOAD4 (__ldcv , "ld.global.cv.v4.u32" , uint4 , uint4 ,
604+ "=r" , : "memory" );
605+ __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.u64" , ulonglong2 , ulonglong2 ,
606+ "=l" , : "memory" );
607+
608+ __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.s8" , char2 , int2 , "=r" , : "memory" );
609+ __INTRINSIC_LOAD4 (__ldcv , "ld.global.cv.v4.s8" , char4 , int4 , "=r" , : "memory" );
610+ __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.s16" , short2 , short2 ,
611+ "=h" , : "memory" );
612+ __INTRINSIC_LOAD4 (__ldcv , "ld.global.cv.v4.s16" , short4 , short4 ,
613+ "=h" , : "memory" );
614+ __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.s32" , int2 , int2 , "=r" , : "memory" );
615+ __INTRINSIC_LOAD4 (__ldcv , "ld.global.cv.v4.s32" , int4 , int4 , "=r" , : "memory" );
616+ __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.s64" , longlong2 , longlong2 ,
617+ "=l" , : "memory" );
618+
619+ __INTRINSIC_LOAD (__ldcv , "ld.global.cv.f32" , float , float , "=f" , : "memory" );
620+ __INTRINSIC_LOAD (__ldcv , "ld.global.cv.f64" , double , double , "=d" , : "memory" );
621+
622+ __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.f32" , float2 , float2 ,
623+ "=f" , : "memory" );
624+ __INTRINSIC_LOAD4 (__ldcv , "ld.global.cv.v4.f32" , float4 , float4 ,
625+ "=f" , : "memory" );
626+ __INTRINSIC_LOAD2 (__ldcv , "ld.global.cv.v2.f64" , double2 , double2 ,
627+ "=d" , : "memory" );
628+
629+ inline __device__ long __ldcv (const long * __ptr ) {
630+ unsigned long __ret ;
631+ if (sizeof (long ) == 8 ) {
632+ asm("ld.global.cv.s64 %0, [%1];" : "=l" (__ret ) : "l" (__ptr ));
633+ } else {
634+ asm("ld.global.cv.s32 %0, [%1];" : "=r" (__ret ) : "l" (__ptr ));
635+ }
636+ return (long )__ret ;
637+ }
638+
639+ __INTRINSIC_LOAD (__ldcs , "ld.global.cs.s8" , char , unsigned int , "=r" , );
640+ __INTRINSIC_LOAD (__ldcs , "ld.global.cs.s8" , signed char , signed int , "=r" , );
641+ __INTRINSIC_LOAD (__ldcs , "ld.global.cs.s16" , short , unsigned short , "=h" , );
642+ __INTRINSIC_LOAD (__ldcs , "ld.global.cs.s32" , int , unsigned int , "=r" , );
643+ __INTRINSIC_LOAD (__ldcs , "ld.global.cs.s64" , long long , unsigned long long,
644+ "=l" , );
645+
646+ __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.s8" , char2 , int2 , "=r" , );
647+ __INTRINSIC_LOAD4 (__ldcs , "ld.global.cs.v4.s8" , char4 , int4 , "=r" , );
648+ __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.s16" , short2 , short2 , "=h" , );
649+ __INTRINSIC_LOAD4 (__ldcs , "ld.global.cs.v4.s16" , short4 , short4 , "=h" , );
650+ __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.s32" , int2 , int2 , "=r" , );
651+ __INTRINSIC_LOAD4 (__ldcs , "ld.global.cs.v4.s32" , int4 , int4 , "=r" , );
652+ __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.s64" , longlong2 , longlong2 , "=l" , );
653+
654+ __INTRINSIC_LOAD (__ldcs , "ld.global.cs.u8" , unsigned char , unsigned int ,
655+ "=r" , );
656+ __INTRINSIC_LOAD (__ldcs , "ld.global.cs.u16" , unsigned short , unsigned short ,
657+ "=h" , );
658+ __INTRINSIC_LOAD (__ldcs , "ld.global.cs.u32" , unsigned int , unsigned int ,
659+ "=r" , );
660+ __INTRINSIC_LOAD (__ldcs , "ld.global.cs.u64" , unsigned long long,
661+ unsigned long long, "=l" , );
662+
663+ __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.u8" , uchar2 , uint2 , "=r" , );
664+ __INTRINSIC_LOAD4 (__ldcs , "ld.global.cs.v4.u8" , uchar4 , uint4 , "=r" , );
665+ __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.u16" , ushort2 , ushort2 , "=h" , );
666+ __INTRINSIC_LOAD4 (__ldcs , "ld.global.cs.v4.u16" , ushort4 , ushort4 , "=h" , );
667+ __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.u32" , uint2 , uint2 , "=r" , );
668+ __INTRINSIC_LOAD4 (__ldcs , "ld.global.cs.v4.u32" , uint4 , uint4 , "=r" , );
669+ __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.u64" , ulonglong2 , ulonglong2 ,
670+ "=l" , );
671+
672+ __INTRINSIC_LOAD (__ldcs , "ld.global.cs.f32" , float , float , "=f" , );
673+ __INTRINSIC_LOAD (__ldcs , "ld.global.cs.f64" , double , double , "=d" , );
674+ __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.f32" , float2 , float2 , "=f" , );
675+ __INTRINSIC_LOAD4 (__ldcs , "ld.global.cs.v4.f32" , float4 , float4 , "=f" , );
676+ __INTRINSIC_LOAD2 (__ldcs , "ld.global.cs.v2.f64" , double2 , double2 , "=d" , );
677+
678+ #pragma pop_macro("__INTRINSIC_LOAD")
679+ #pragma pop_macro("__INTRINSIC_LOAD2")
680+ #pragma pop_macro("__INTRINSIC_LOAD4")
681+
682+ inline __device__ long __ldcs (const long * __ptr ) {
683+ unsigned long __ret ;
684+ if (sizeof (long ) == 8 ) {
685+ asm("ld.global.cs.s64 %0, [%1];" : "=l" (__ret ) : "l" (__ptr ));
686+ } else {
687+ asm("ld.global.cs.s32 %0, [%1];" : "=r" (__ret ) : "l" (__ptr ));
688+ }
689+ return (long )__ret ;
690+ }
691+
692+ #pragma push_macro("__INTRINSIC_STORE")
693+ #define __INTRINSIC_STORE (__FnName , __AsmOp , __DeclType , __TmpType , __AsmType ) \
694+ inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
695+ __TmpType __tmp = (__TmpType)__value; \
696+ asm(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory"); \
697+ }
698+
699+ #pragma push_macro("__INTRINSIC_STORE2")
700+ #define __INTRINSIC_STORE2 (__FnName , __AsmOp , __DeclType , __TmpType , \
701+ __AsmType ) \
702+ inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
703+ __TmpType __tmp; \
704+ using __ElementType = decltype(__tmp.x); \
705+ __tmp.x = (__ElementType)(__value.x); \
706+ __tmp.y = (__ElementType)(__value.y); \
707+ asm(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x), \
708+ __AsmType(__tmp.y) \
709+ : "memory"); \
710+ }
711+
712+ #pragma push_macro("__INTRINSIC_STORE4")
713+ #define __INTRINSIC_STORE4 (__FnName , __AsmOp , __DeclType , __TmpType , \
714+ __AsmType ) \
715+ inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
716+ __TmpType __tmp; \
717+ using __ElementType = decltype(__tmp.x); \
718+ __tmp.x = (__ElementType)(__value.x); \
719+ __tmp.y = (__ElementType)(__value.y); \
720+ __tmp.z = (__ElementType)(__value.z); \
721+ __tmp.w = (__ElementType)(__value.w); \
722+ asm(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x), \
723+ __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w) \
724+ : "memory"); \
725+ }
726+
727+ __INTRINSIC_STORE (__stwt , "st.global.wt.s8" , char , int , "r" );
728+ __INTRINSIC_STORE (__stwt , "st.global.wt.s8" , signed char , int , "r" );
729+ __INTRINSIC_STORE (__stwt , "st.global.wt.s16" , short , short , "h" );
730+ __INTRINSIC_STORE (__stwt , "st.global.wt.s32" , int , int , "r" );
731+ __INTRINSIC_STORE (__stwt , "st.global.wt.s64" , long long , long long , "l" );
732+
733+ __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.s8" , char2 , int2 , "r" );
734+ __INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.s8" , char4 , int4 , "r" );
735+ __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.s16" , short2 , short2 , "h" );
736+ __INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.s16" , short4 , short4 , "h" );
737+ __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.s32" , int2 , int2 , "r" );
738+ __INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.s32" , int4 , int4 , "r" );
739+ __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.s64" , longlong2 , longlong2 , "l" );
740+
741+ __INTRINSIC_STORE (__stwt , "st.global.wt.u8" , unsigned char , int , "r" );
742+ __INTRINSIC_STORE (__stwt , "st.global.wt.u16" , unsigned short , unsigned short ,
743+ "h" );
744+ __INTRINSIC_STORE (__stwt , "st.global.wt.u32" , unsigned int , unsigned int , "r" );
745+ __INTRINSIC_STORE (__stwt , "st.global.wt.u64" , unsigned long long,
746+ unsigned long long, "l" );
747+
748+ __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.u8" , uchar2 , uchar2 , "r" );
749+ __INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.u8" , uchar4 , uint4 , "r" );
750+ __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.u16" , ushort2 , ushort2 , "h" );
751+ __INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.u16" , ushort4 , ushort4 , "h" );
752+ __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.u32" , uint2 , uint2 , "r" );
753+ __INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.u32" , uint4 , uint4 , "r" );
754+ __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.u64" , ulonglong2 , ulonglong2 , "l" );
755+
756+ __INTRINSIC_STORE (__stwt , "st.global.wt.f32" , float , float , "f" );
757+ __INTRINSIC_STORE (__stwt , "st.global.wt.f64" , double , double , "d" );
758+ __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.f32" , float2 , float2 , "f" );
759+ __INTRINSIC_STORE4 (__stwt , "st.global.wt.v4.f32" , float4 , float4 , "f" );
760+ __INTRINSIC_STORE2 (__stwt , "st.global.wt.v2.f64" , double2 , double2 , "d" );
761+
762+ #pragma pop_macro("__INTRINSIC_STORE")
763+ #pragma pop_macro("__INTRINSIC_STORE2")
764+ #pragma pop_macro("__INTRINSIC_STORE4")
765+
482766#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
483767
484768#if CUDA_VERSION >= 11000
0 commit comments