@@ -479,6 +479,293 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
479479 return ret;
480480}
481481
482+ #if defined(__cplusplus) && (__cplusplus >= 201103L)
483+
484+ #pragma push_macro("__INTRINSIC_LOAD")
485+ #define __INTRINSIC_LOAD (__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
486+ __Clobber) \
487+ inline __device__ __DeclType __FnName (const __DeclType *__ptr) { \
488+ __TmpType __ret; \
489+ asm (__AsmOp " %0, [%1];" : __AsmType (__ret) : " l" (__ptr)__Clobber); \
490+ return (__DeclType)__ret; \
491+ }
492+
493+ #pragma push_macro("__INTRINSIC_LOAD2")
494+ #define __INTRINSIC_LOAD2 (__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
495+ __Clobber) \
496+ inline __device__ __DeclType __FnName (const __DeclType *__ptr) { \
497+ __DeclType __ret; \
498+ __TmpType __tmp; \
499+ asm (__AsmOp " {%0,%1}, [%2];" \
500+ : __AsmType (__tmp.x ), __AsmType (__tmp.y ) \
501+ : " l" (__ptr)__Clobber); \
502+ using __ElementType = decltype (__ret.x ); \
503+ __ret.x = (__ElementType)(__tmp.x ); \
504+ __ret.y = (__ElementType)__tmp.y ; \
505+ return __ret; \
506+ }
507+
508+ #pragma push_macro("__INTRINSIC_LOAD4")
509+ #define __INTRINSIC_LOAD4 (__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
510+ __Clobber) \
511+ inline __device__ __DeclType __FnName (const __DeclType *__ptr) { \
512+ __DeclType __ret; \
513+ __TmpType __tmp; \
514+ asm (__AsmOp " {%0,%1,%2,%3}, [%4];" \
515+ : __AsmType (__tmp.x ), __AsmType (__tmp.y ), __AsmType (__tmp.z ), \
516+ __AsmType (__tmp.w ) \
517+ : " l" (__ptr)__Clobber); \
518+ using __ElementType = decltype (__ret.x ); \
519+ __ret.x = (__ElementType)__tmp.x ; \
520+ __ret.y = (__ElementType)__tmp.y ; \
521+ __ret.z = (__ElementType)__tmp.z ; \
522+ __ret.w = (__ElementType)__tmp.w ; \
523+ return __ret; \
524+ }
525+
526+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.s8" , char , unsigned int , " =r" , );
527+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.s8" , signed char , unsigned int , " =r" , );
528+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.s16" , short , unsigned short , " =h" , );
529+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.s32" , int , unsigned int , " =r" , );
530+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.s64" , long long , unsigned long long ,
531+ " =l" , );
532+
533+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.s8" , char2, int2, " =r" , );
534+ __INTRINSIC_LOAD4 (__ldcg, " ld.global.cg.v4.s8" , char4, int4, " =r" , );
535+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.s16" , short2, short2, " =h" , );
536+ __INTRINSIC_LOAD4 (__ldcg, " ld.global.cg.v4.s16" , short4, short4, " =h" , );
537+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.s32" , int2, int2, " =r" , );
538+ __INTRINSIC_LOAD4 (__ldcg, " ld.global.cg.v4.s32" , int4, int4, " =r" , );
539+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.s64 " , longlong2, longlong2, " =l" , );
540+
541+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.u8" , unsigned char , unsigned int ,
542+ " =r" , );
543+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.u16" , unsigned short , unsigned short ,
544+ " =h" , );
545+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.u32" , unsigned int , unsigned int ,
546+ " =r" , );
547+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.u64" , unsigned long long ,
548+ unsigned long long , " =l" , );
549+
550+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.u8" , uchar2, int2, " =r" , );
551+ __INTRINSIC_LOAD4 (__ldcg, " ld.global.cg.v4.u8" , uchar4, int4, " =r" , );
552+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.u16" , ushort2, ushort2, " =h" , );
553+ __INTRINSIC_LOAD4 (__ldcg, " ld.global.cg.v4.u16" , ushort4, ushort4, " =h" , );
554+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.u32" , uint2, uint2, " =r" , );
555+ __INTRINSIC_LOAD4 (__ldcg, " ld.global.cg.v4.u32" , uint4, uint4, " =r" , );
556+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.u64" , ulonglong2, ulonglong2,
557+ " =l" , );
558+
559+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.f32" , float , float , " =f" , );
560+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.f64" , double , double , " =d" , );
561+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.f32" , float2, float2, " =f" , );
562+ __INTRINSIC_LOAD4 (__ldcg, " ld.global.cg.v4.f32" , float4, float4, " =f" , );
563+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.f64" , double2, double2, " =d" , );
564+
565+ inline __device__ long __ldcg (const long *__ptr) {
566+ unsigned long __ret;
567+ if (sizeof (long ) == 8 ) {
568+ asm (" ld.global.cg.s64 %0, [%1];" : " =l" (__ret) : " l" (__ptr));
569+ } else {
570+ asm (" ld.global.cg.s32 %0, [%1];" : " =r" (__ret) : " l" (__ptr));
571+ }
572+ return (long )__ret;
573+ }
574+
575+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.u8" , unsigned char , unsigned int ,
576+ " =r" , : " memory" );
577+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.u16" , unsigned short , unsigned short ,
578+ " =h" , : " memory" );
579+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.u32" , unsigned int , unsigned int ,
580+ " =r" , : " memory" );
581+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.u64" , unsigned long long ,
582+ unsigned long long , " =l" , : " memory" );
583+
584+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.s8" , char , unsigned int ,
585+ " =r" , : " memory" );
586+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.s8" , signed char , unsigned int ,
587+ " =r" , : " memory" );
588+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.s16" , short , unsigned short ,
589+ " =h" , : " memory" );
590+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.s32" , int , unsigned int ,
591+ " =r" , : " memory" );
592+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.s64" , long long , unsigned long long ,
593+ " =l" , : " memory" );
594+
595+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.u8" , uchar2, uint2,
596+ " =r" , : " memory" );
597+ __INTRINSIC_LOAD4 (__ldcv, " ld.global.cv.v4.u8" , uchar4, uint4,
598+ " =r" , : " memory" );
599+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.u16" , ushort2, ushort2,
600+ " =h" , : " memory" );
601+ __INTRINSIC_LOAD4 (__ldcv, " ld.global.cv.v4.u16" , ushort4, ushort4,
602+ " =h" , : " memory" );
603+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.u32" , uint2, uint2,
604+ " =r" , : " memory" );
605+ __INTRINSIC_LOAD4 (__ldcv, " ld.global.cv.v4.u32" , uint4, uint4,
606+ " =r" , : " memory" );
607+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.u64" , ulonglong2, ulonglong2,
608+ " =l" , : " memory" );
609+
610+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.s8" , char2, int2, " =r" , : " memory" );
611+ __INTRINSIC_LOAD4 (__ldcv, " ld.global.cv.v4.s8" , char4, int4, " =r" , : " memory" );
612+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.s16" , short2, short2,
613+ " =h" , : " memory" );
614+ __INTRINSIC_LOAD4 (__ldcv, " ld.global.cv.v4.s16" , short4, short4,
615+ " =h" , : " memory" );
616+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.s32" , int2, int2, " =r" , : " memory" );
617+ __INTRINSIC_LOAD4 (__ldcv, " ld.global.cv.v4.s32" , int4, int4, " =r" , : " memory" );
618+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.s64" , longlong2, longlong2,
619+ " =l" , : " memory" );
620+
621+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.f32" , float , float , " =f" , : " memory" );
622+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.f64" , double , double , " =d" , : " memory" );
623+
624+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.f32" , float2, float2,
625+ " =f" , : " memory" );
626+ __INTRINSIC_LOAD4 (__ldcv, " ld.global.cv.v4.f32" , float4, float4,
627+ " =f" , : " memory" );
628+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.f64" , double2, double2,
629+ " =d" , : " memory" );
630+
631+ inline __device__ long __ldcv (const long *__ptr) {
632+ unsigned long __ret;
633+ if (sizeof (long ) == 8 ) {
634+ asm (" ld.global.cv.s64 %0, [%1];" : " =l" (__ret) : " l" (__ptr));
635+ } else {
636+ asm (" ld.global.cv.s32 %0, [%1];" : " =r" (__ret) : " l" (__ptr));
637+ }
638+ return (long )__ret;
639+ }
640+
641+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.s8" , char , unsigned int , " =r" , );
642+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.s8" , signed char , signed int , " =r" , );
643+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.s16" , short , unsigned short , " =h" , );
644+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.s32" , int , unsigned int , " =r" , );
645+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.s64" , long long , unsigned long long ,
646+ " =l" , );
647+
648+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.s8" , char2, int2, " =r" , );
649+ __INTRINSIC_LOAD4 (__ldcs, " ld.global.cs.v4.s8" , char4, int4, " =r" , );
650+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.s16" , short2, short2, " =h" , );
651+ __INTRINSIC_LOAD4 (__ldcs, " ld.global.cs.v4.s16" , short4, short4, " =h" , );
652+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.s32" , int2, int2, " =r" , );
653+ __INTRINSIC_LOAD4 (__ldcs, " ld.global.cs.v4.s32" , int4, int4, " =r" , );
654+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.s64" , longlong2, longlong2, " =l" , );
655+
656+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.u8" , unsigned char , unsigned int ,
657+ " =r" , );
658+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.u16" , unsigned short , unsigned short ,
659+ " =h" , );
660+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.u32" , unsigned int , unsigned int ,
661+ " =r" , );
662+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.u64" , unsigned long long ,
663+ unsigned long long , " =l" , );
664+
665+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.u8" , uchar2, uint2, " =r" , );
666+ __INTRINSIC_LOAD4 (__ldcs, " ld.global.cs.v4.u8" , uchar4, uint4, " =r" , );
667+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.u16" , ushort2, ushort2, " =h" , );
668+ __INTRINSIC_LOAD4 (__ldcs, " ld.global.cs.v4.u16" , ushort4, ushort4, " =h" , );
669+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.u32" , uint2, uint2, " =r" , );
670+ __INTRINSIC_LOAD4 (__ldcs, " ld.global.cs.v4.u32" , uint4, uint4, " =r" , );
671+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.u64" , ulonglong2, ulonglong2,
672+ " =l" , );
673+
674+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.f32" , float , float , " =f" , );
675+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.f64" , double , double , " =d" , );
676+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.f32" , float2, float2, " =f" , );
677+ __INTRINSIC_LOAD4 (__ldcs, " ld.global.cs.v4.f32" , float4, float4, " =f" , );
678+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.f64" , double2, double2, " =d" , );
679+
680+ #pragma pop_macro("__INTRINSIC_LOAD")
681+ #pragma pop_macro("__INTRINSIC_LOAD2")
682+ #pragma pop_macro("__INTRINSIC_LOAD4")
683+
684+ inline __device__ long __ldcs (const long *__ptr) {
685+ unsigned long __ret;
686+ if (sizeof (long ) == 8 ) {
687+ asm (" ld.global.cs.s64 %0, [%1];" : " =l" (__ret) : " l" (__ptr));
688+ } else {
689+ asm (" ld.global.cs.s32 %0, [%1];" : " =r" (__ret) : " l" (__ptr));
690+ }
691+ return (long )__ret;
692+ }
693+
694+ #pragma push_macro("__INTRINSIC_STORE")
695+ #define __INTRINSIC_STORE (__FnName, __AsmOp, __DeclType, __TmpType, __AsmType ) \
696+ inline __device__ void __FnName (__DeclType *__ptr, __DeclType __value) { \
697+ __TmpType __tmp = (__TmpType)__value; \
698+ asm (__AsmOp " [%0], %1;" ::" l" (__ptr), __AsmType (__tmp) : " memory" ); \
699+ }
700+
701+ #pragma push_macro("__INTRINSIC_STORE2")
702+ #define __INTRINSIC_STORE2 (__FnName, __AsmOp, __DeclType, __TmpType, \
703+ __AsmType) \
704+ inline __device__ void __FnName (__DeclType *__ptr, __DeclType __value) { \
705+ __TmpType __tmp; \
706+ using __ElementType = decltype (__tmp.x ); \
707+ __tmp.x = (__ElementType)(__value.x ); \
708+ __tmp.y = (__ElementType)(__value.y ); \
709+ asm (__AsmOp " [%0], {%1,%2};" ::" l" (__ptr), __AsmType (__tmp.x ), \
710+ __AsmType (__tmp.y ) \
711+ : " memory" ); \
712+ }
713+
714+ #pragma push_macro("__INTRINSIC_STORE4")
715+ #define __INTRINSIC_STORE4 (__FnName, __AsmOp, __DeclType, __TmpType, \
716+ __AsmType) \
717+ inline __device__ void __FnName (__DeclType *__ptr, __DeclType __value) { \
718+ __TmpType __tmp; \
719+ using __ElementType = decltype (__tmp.x ); \
720+ __tmp.x = (__ElementType)(__value.x ); \
721+ __tmp.y = (__ElementType)(__value.y ); \
722+ __tmp.z = (__ElementType)(__value.z ); \
723+ __tmp.w = (__ElementType)(__value.w ); \
724+ asm (__AsmOp " [%0], {%1,%2,%3,%4};" ::" l" (__ptr), __AsmType (__tmp.x ), \
725+ __AsmType (__tmp.y ), __AsmType (__tmp.z ), __AsmType (__tmp.w ) \
726+ : " memory" ); \
727+ }
728+
729+ __INTRINSIC_STORE (__stwt, " st.global.wt.s8" , char , int , " r" );
730+ __INTRINSIC_STORE (__stwt, " st.global.wt.s8" , signed char , int , " r" );
731+ __INTRINSIC_STORE (__stwt, " st.global.wt.s16" , short , short , " h" );
732+ __INTRINSIC_STORE (__stwt, " st.global.wt.s32" , int , int , " r" );
733+ __INTRINSIC_STORE (__stwt, " st.global.wt.s64" , long long , long long , " l" );
734+
735+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.s8" , char2, int2, " r" );
736+ __INTRINSIC_STORE4 (__stwt, " st.global.wt.v4.s8" , char4, int4, " r" );
737+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.s16" , short2, short2, " h" );
738+ __INTRINSIC_STORE4 (__stwt, " st.global.wt.v4.s16" , short4, short4, " h" );
739+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.s32" , int2, int2, " r" );
740+ __INTRINSIC_STORE4 (__stwt, " st.global.wt.v4.s32" , int4, int4, " r" );
741+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.s64" , longlong2, longlong2, " l" );
742+
743+ __INTRINSIC_STORE (__stwt, " st.global.wt.u8" , unsigned char , int , " r" );
744+ __INTRINSIC_STORE (__stwt, " st.global.wt.u16" , unsigned short , unsigned short ,
745+ " h" );
746+ __INTRINSIC_STORE (__stwt, " st.global.wt.u32" , unsigned int , unsigned int , " r" );
747+ __INTRINSIC_STORE (__stwt, " st.global.wt.u64" , unsigned long long ,
748+ unsigned long long , " l" );
749+
750+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.u8" , uchar2, uchar2, " r" );
751+ __INTRINSIC_STORE4 (__stwt, " st.global.wt.v4.u8" , uchar4, uint4, " r" );
752+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.u16" , ushort2, ushort2, " h" );
753+ __INTRINSIC_STORE4 (__stwt, " st.global.wt.v4.u16" , ushort4, ushort4, " h" );
754+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.u32" , uint2, uint2, " r" );
755+ __INTRINSIC_STORE4 (__stwt, " st.global.wt.v4.u32" , uint4, uint4, " r" );
756+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.u64" , ulonglong2, ulonglong2, " l" );
757+
758+ __INTRINSIC_STORE (__stwt, " st.global.wt.f32" , float , float , " f" );
759+ __INTRINSIC_STORE (__stwt, " st.global.wt.f64" , double , double , " d" );
760+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.f32" , float2, float2, " f" );
761+ __INTRINSIC_STORE4 (__stwt, " st.global.wt.v4.f32" , float4, float4, " f" );
762+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.f64" , double2, double2, " d" );
763+
764+ #pragma pop_macro("__INTRINSIC_STORE")
765+ #pragma pop_macro("__INTRINSIC_STORE2")
766+ #pragma pop_macro("__INTRINSIC_STORE4")
767+
768+ #endif // defined(__cplusplus) && (__cplusplus >= 201103L)
482769#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
483770
484771#if CUDA_VERSION >= 11000
0 commit comments