@@ -479,6 +479,293 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
479479  return  ret;
480480}
481481
482+ #if  defined(__cplusplus) && (__cplusplus >= 201103L)
483+ 
484+ #pragma  push_macro("__INTRINSIC_LOAD")
485+ #define  __INTRINSIC_LOAD (__FnName, __AsmOp, __DeclType, __TmpType, __AsmType,  \
486+                          __Clobber)                                            \
487+   inline  __device__ __DeclType __FnName (const  __DeclType *__ptr) {             \
488+     __TmpType __ret;                                                           \
489+     asm (__AsmOp "  %0, [%1];" __AsmType (__ret) : " l" 
490+     return  (__DeclType)__ret;                                                  \
491+   }
492+ 
493+ #pragma  push_macro("__INTRINSIC_LOAD2")
494+ #define  __INTRINSIC_LOAD2 (__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
495+                           __Clobber)                                           \
496+   inline  __device__ __DeclType __FnName (const  __DeclType *__ptr) {             \
497+     __DeclType __ret;                                                          \
498+     __TmpType __tmp;                                                           \
499+     asm (__AsmOp "  {%0,%1}, [%2];" 
500+         : __AsmType (__tmp.x ), __AsmType (__tmp.y )                               \
501+         : " l" 
502+     using  __ElementType = decltype (__ret.x );                                   \
503+     __ret.x  = (__ElementType)(__tmp.x );                                        \
504+     __ret.y  = (__ElementType)__tmp.y ;                                          \
505+     return  __ret;                                                              \
506+   }
507+ 
508+ #pragma  push_macro("__INTRINSIC_LOAD4")
509+ #define  __INTRINSIC_LOAD4 (__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
510+                           __Clobber)                                           \
511+   inline  __device__ __DeclType __FnName (const  __DeclType *__ptr) {             \
512+     __DeclType __ret;                                                          \
513+     __TmpType __tmp;                                                           \
514+     asm (__AsmOp "  {%0,%1,%2,%3}, [%4];" 
515+         : __AsmType (__tmp.x ), __AsmType (__tmp.y ), __AsmType (__tmp.z ),          \
516+           __AsmType (__tmp.w )                                                   \
517+         : " l" 
518+     using  __ElementType = decltype (__ret.x );                                   \
519+     __ret.x  = (__ElementType)__tmp.x ;                                          \
520+     __ret.y  = (__ElementType)__tmp.y ;                                          \
521+     __ret.z  = (__ElementType)__tmp.z ;                                          \
522+     __ret.w  = (__ElementType)__tmp.w ;                                          \
523+     return  __ret;                                                              \
524+   }
525+ 
526+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.s8" char , unsigned  int , " =r" 
527+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.s8" signed  char , unsigned  int , " =r" 
528+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.s16" short , unsigned  short , " =h" 
529+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.s32" int , unsigned  int , " =r" 
530+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.s64" long  long , unsigned  long  long ,
531+                  " =l" 
532+ 
533+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.s8" " =r" 
534+ __INTRINSIC_LOAD4 (__ldcg, " ld.global.cg.v4.s8" " =r" 
535+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.s16" " =h" 
536+ __INTRINSIC_LOAD4 (__ldcg, " ld.global.cg.v4.s16" " =h" 
537+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.s32" " =r" 
538+ __INTRINSIC_LOAD4 (__ldcg, " ld.global.cg.v4.s32" " =r" 
539+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.s64 " " =l" 
540+ 
541+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.u8" unsigned  char , unsigned  int ,
542+                  " =r" 
543+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.u16" unsigned  short , unsigned  short ,
544+                  " =h" 
545+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.u32" unsigned  int , unsigned  int ,
546+                  " =r" 
547+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.u64" unsigned  long  long ,
548+                  unsigned  long  long , " =l" 
549+ 
550+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.u8" " =r" 
551+ __INTRINSIC_LOAD4 (__ldcg, " ld.global.cg.v4.u8" " =r" 
552+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.u16" " =h" 
553+ __INTRINSIC_LOAD4 (__ldcg, " ld.global.cg.v4.u16" " =h" 
554+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.u32" " =r" 
555+ __INTRINSIC_LOAD4 (__ldcg, " ld.global.cg.v4.u32" " =r" 
556+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.u64" 
557+                   " =l" 
558+ 
559+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.f32" float , float , " =f" 
560+ __INTRINSIC_LOAD (__ldcg, " ld.global.cg.f64" double , double , " =d" 
561+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.f32" " =f" 
562+ __INTRINSIC_LOAD4 (__ldcg, " ld.global.cg.v4.f32" " =f" 
563+ __INTRINSIC_LOAD2 (__ldcg, " ld.global.cg.v2.f64" " =d" 
564+ 
565+ inline  __device__ long  __ldcg (const  long  *__ptr) {
566+   unsigned  long  __ret;
567+   if  (sizeof (long ) == 8 ) {
568+     asm (" ld.global.cg.s64 %0, [%1];" " =l" " l" 
569+   } else  {
570+     asm (" ld.global.cg.s32 %0, [%1];" " =r" " l" 
571+   }
572+   return  (long )__ret;
573+ }
574+ 
575+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.u8" unsigned  char , unsigned  int ,
576+                  " =r" " memory" 
577+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.u16" unsigned  short , unsigned  short ,
578+                  " =h" " memory" 
579+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.u32" unsigned  int , unsigned  int ,
580+                  " =r" " memory" 
581+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.u64" unsigned  long  long ,
582+                  unsigned  long  long , " =l" " memory" 
583+ 
584+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.s8" char , unsigned  int ,
585+                  " =r" " memory" 
586+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.s8" signed  char , unsigned  int ,
587+                  " =r" " memory" 
588+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.s16" short , unsigned  short ,
589+                  " =h" " memory" 
590+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.s32" int , unsigned  int ,
591+                  " =r" " memory" 
592+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.s64" long  long , unsigned  long  long ,
593+                  " =l" " memory" 
594+ 
595+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.u8" 
596+                   " =r" " memory" 
597+ __INTRINSIC_LOAD4 (__ldcv, " ld.global.cv.v4.u8" 
598+                   " =r" " memory" 
599+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.u16" 
600+                   " =h" " memory" 
601+ __INTRINSIC_LOAD4 (__ldcv, " ld.global.cv.v4.u16" 
602+                   " =h" " memory" 
603+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.u32" 
604+                   " =r" " memory" 
605+ __INTRINSIC_LOAD4 (__ldcv, " ld.global.cv.v4.u32" 
606+                   " =r" " memory" 
607+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.u64" 
608+                   " =l" " memory" 
609+ 
610+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.s8" " =r" " memory" 
611+ __INTRINSIC_LOAD4 (__ldcv, " ld.global.cv.v4.s8" " =r" " memory" 
612+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.s16" 
613+                   " =h" " memory" 
614+ __INTRINSIC_LOAD4 (__ldcv, " ld.global.cv.v4.s16" 
615+                   " =h" " memory" 
616+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.s32" " =r" " memory" 
617+ __INTRINSIC_LOAD4 (__ldcv, " ld.global.cv.v4.s32" " =r" " memory" 
618+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.s64" 
619+                   " =l" " memory" 
620+ 
621+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.f32" float , float , " =f" " memory" 
622+ __INTRINSIC_LOAD (__ldcv, " ld.global.cv.f64" double , double , " =d" " memory" 
623+ 
624+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.f32" 
625+                   " =f" " memory" 
626+ __INTRINSIC_LOAD4 (__ldcv, " ld.global.cv.v4.f32" 
627+                   " =f" " memory" 
628+ __INTRINSIC_LOAD2 (__ldcv, " ld.global.cv.v2.f64" 
629+                   " =d" " memory" 
630+ 
631+ inline  __device__ long  __ldcv (const  long  *__ptr) {
632+   unsigned  long  __ret;
633+   if  (sizeof (long ) == 8 ) {
634+     asm (" ld.global.cv.s64 %0, [%1];" " =l" " l" 
635+   } else  {
636+     asm (" ld.global.cv.s32 %0, [%1];" " =r" " l" 
637+   }
638+   return  (long )__ret;
639+ }
640+ 
641+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.s8" char , unsigned  int , " =r" 
642+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.s8" signed  char , signed  int , " =r" 
643+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.s16" short , unsigned  short , " =h" 
644+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.s32" int , unsigned  int , " =r" 
645+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.s64" long  long , unsigned  long  long ,
646+                  " =l" 
647+ 
648+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.s8" " =r" 
649+ __INTRINSIC_LOAD4 (__ldcs, " ld.global.cs.v4.s8" " =r" 
650+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.s16" " =h" 
651+ __INTRINSIC_LOAD4 (__ldcs, " ld.global.cs.v4.s16" " =h" 
652+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.s32" " =r" 
653+ __INTRINSIC_LOAD4 (__ldcs, " ld.global.cs.v4.s32" " =r" 
654+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.s64" " =l" 
655+ 
656+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.u8" unsigned  char , unsigned  int ,
657+                  " =r" 
658+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.u16" unsigned  short , unsigned  short ,
659+                  " =h" 
660+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.u32" unsigned  int , unsigned  int ,
661+                  " =r" 
662+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.u64" unsigned  long  long ,
663+                  unsigned  long  long , " =l" 
664+ 
665+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.u8" " =r" 
666+ __INTRINSIC_LOAD4 (__ldcs, " ld.global.cs.v4.u8" " =r" 
667+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.u16" " =h" 
668+ __INTRINSIC_LOAD4 (__ldcs, " ld.global.cs.v4.u16" " =h" 
669+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.u32" " =r" 
670+ __INTRINSIC_LOAD4 (__ldcs, " ld.global.cs.v4.u32" " =r" 
671+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.u64" 
672+                   " =l" 
673+ 
674+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.f32" float , float , " =f" 
675+ __INTRINSIC_LOAD (__ldcs, " ld.global.cs.f64" double , double , " =d" 
676+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.f32" " =f" 
677+ __INTRINSIC_LOAD4 (__ldcs, " ld.global.cs.v4.f32" " =f" 
678+ __INTRINSIC_LOAD2 (__ldcs, " ld.global.cs.v2.f64" " =d" 
679+ 
680+ #pragma  pop_macro("__INTRINSIC_LOAD")
681+ #pragma  pop_macro("__INTRINSIC_LOAD2")
682+ #pragma  pop_macro("__INTRINSIC_LOAD4")
683+ 
684+ inline  __device__ long  __ldcs (const  long  *__ptr) {
685+   unsigned  long  __ret;
686+   if  (sizeof (long ) == 8 ) {
687+     asm (" ld.global.cs.s64 %0, [%1];" " =l" " l" 
688+   } else  {
689+     asm (" ld.global.cs.s32 %0, [%1];" " =r" " l" 
690+   }
691+   return  (long )__ret;
692+ }
693+ 
694+ #pragma  push_macro("__INTRINSIC_STORE")
695+ #define  __INTRINSIC_STORE (__FnName, __AsmOp, __DeclType, __TmpType, __AsmType ) \
696+   inline  __device__ void  __FnName (__DeclType *__ptr, __DeclType __value) {     \
697+     __TmpType __tmp = (__TmpType)__value;                                      \
698+     asm (__AsmOp "  [%0], %1;" " l" __AsmType (__tmp) : " memory" 
699+   }
700+ 
701+ #pragma  push_macro("__INTRINSIC_STORE2")
702+ #define  __INTRINSIC_STORE2 (__FnName, __AsmOp, __DeclType, __TmpType,           \
703+                            __AsmType)                                          \
704+   inline  __device__ void  __FnName (__DeclType *__ptr, __DeclType __value) {     \
705+     __TmpType __tmp;                                                           \
706+     using  __ElementType = decltype (__tmp.x );                                   \
707+     __tmp.x  = (__ElementType)(__value.x );                                      \
708+     __tmp.y  = (__ElementType)(__value.y );                                      \
709+     asm (__AsmOp "  [%0], {%1,%2};" " l" __AsmType (__tmp.x ),            \
710+         __AsmType (__tmp.y )                                                     \
711+         : " memory" 
712+   }
713+ 
714+ #pragma  push_macro("__INTRINSIC_STORE4")
715+ #define  __INTRINSIC_STORE4 (__FnName, __AsmOp, __DeclType, __TmpType,           \
716+                            __AsmType)                                          \
717+   inline  __device__ void  __FnName (__DeclType *__ptr, __DeclType __value) {     \
718+     __TmpType __tmp;                                                           \
719+     using  __ElementType = decltype (__tmp.x );                                   \
720+     __tmp.x  = (__ElementType)(__value.x );                                      \
721+     __tmp.y  = (__ElementType)(__value.y );                                      \
722+     __tmp.z  = (__ElementType)(__value.z );                                      \
723+     __tmp.w  = (__ElementType)(__value.w );                                      \
724+     asm (__AsmOp "  [%0], {%1,%2,%3,%4};" " l" __AsmType (__tmp.x ),      \
725+         __AsmType (__tmp.y ), __AsmType (__tmp.z ), __AsmType (__tmp.w )             \
726+         : " memory" 
727+   }
728+ 
729+ __INTRINSIC_STORE (__stwt, " st.global.wt.s8" char , int , " r" 
730+ __INTRINSIC_STORE (__stwt, " st.global.wt.s8" signed  char , int , " r" 
731+ __INTRINSIC_STORE (__stwt, " st.global.wt.s16" short , short , " h" 
732+ __INTRINSIC_STORE (__stwt, " st.global.wt.s32" int , int , " r" 
733+ __INTRINSIC_STORE (__stwt, " st.global.wt.s64" long  long , long  long , " l" 
734+ 
735+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.s8" " r" 
736+ __INTRINSIC_STORE4 (__stwt, " st.global.wt.v4.s8" " r" 
737+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.s16" " h" 
738+ __INTRINSIC_STORE4 (__stwt, " st.global.wt.v4.s16" " h" 
739+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.s32" " r" 
740+ __INTRINSIC_STORE4 (__stwt, " st.global.wt.v4.s32" " r" 
741+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.s64" " l" 
742+ 
743+ __INTRINSIC_STORE (__stwt, " st.global.wt.u8" unsigned  char , int , " r" 
744+ __INTRINSIC_STORE (__stwt, " st.global.wt.u16" unsigned  short , unsigned  short ,
745+                   " h" 
746+ __INTRINSIC_STORE (__stwt, " st.global.wt.u32" unsigned  int , unsigned  int , " r" 
747+ __INTRINSIC_STORE (__stwt, " st.global.wt.u64" unsigned  long  long ,
748+                   unsigned  long  long , " l" 
749+ 
750+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.u8" " r" 
751+ __INTRINSIC_STORE4 (__stwt, " st.global.wt.v4.u8" " r" 
752+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.u16" " h" 
753+ __INTRINSIC_STORE4 (__stwt, " st.global.wt.v4.u16" " h" 
754+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.u32" " r" 
755+ __INTRINSIC_STORE4 (__stwt, " st.global.wt.v4.u32" " r" 
756+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.u64" " l" 
757+ 
758+ __INTRINSIC_STORE (__stwt, " st.global.wt.f32" float , float , " f" 
759+ __INTRINSIC_STORE (__stwt, " st.global.wt.f64" double , double , " d" 
760+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.f32" " f" 
761+ __INTRINSIC_STORE4 (__stwt, " st.global.wt.v4.f32" " f" 
762+ __INTRINSIC_STORE2 (__stwt, " st.global.wt.v2.f64" " d" 
763+ 
764+ #pragma  pop_macro("__INTRINSIC_STORE")
765+ #pragma  pop_macro("__INTRINSIC_STORE2")
766+ #pragma  pop_macro("__INTRINSIC_STORE4")
767+ 
768+ #endif  //  defined(__cplusplus) && (__cplusplus >= 201103L)
482769#endif  //  !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
483770
484771#if  CUDA_VERSION >= 11000
0 commit comments