Skip to content

Commit 75bfdee

Browse files
committed
Reapply "Add missing intrinsics to cuda headers" (llvm#144755)
This reverts commit 298f1c2.
1 parent 3e795c6 commit 75bfdee

File tree

1 file changed

+284
-0
lines changed

1 file changed

+284
-0
lines changed

clang/lib/Headers/__clang_cuda_intrinsics.h

Lines changed: 284 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,290 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
479479
return ret;
480480
}
481481

482+
#pragma push_macro("__INTRINSIC_LOAD")
483+
#define __INTRINSIC_LOAD(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
484+
__Clobber) \
485+
inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
486+
__TmpType __ret; \
487+
asm(__AsmOp " %0, [%1];" : __AsmType(__ret) : "l"(__ptr)__Clobber); \
488+
return (__DeclType)__ret; \
489+
}
490+
491+
#pragma push_macro("__INTRINSIC_LOAD2")
492+
#define __INTRINSIC_LOAD2(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
493+
__Clobber) \
494+
inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
495+
__DeclType __ret; \
496+
__TmpType __tmp; \
497+
asm(__AsmOp " {%0,%1}, [%2];" \
498+
: __AsmType(__tmp.x), __AsmType(__tmp.y) \
499+
: "l"(__ptr)__Clobber); \
500+
using __ElementType = decltype(__ret.x); \
501+
__ret.x = (__ElementType)(__tmp.x); \
502+
__ret.y = (__ElementType)__tmp.y; \
503+
return __ret; \
504+
}
505+
506+
#pragma push_macro("__INTRINSIC_LOAD4")
507+
#define __INTRINSIC_LOAD4(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
508+
__Clobber) \
509+
inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \
510+
__DeclType __ret; \
511+
__TmpType __tmp; \
512+
asm(__AsmOp " {%0,%1,%2,%3}, [%4];" \
513+
: __AsmType(__tmp.x), __AsmType(__tmp.y), __AsmType(__tmp.z), \
514+
__AsmType(__tmp.w) \
515+
: "l"(__ptr)__Clobber); \
516+
using __ElementType = decltype(__ret.x); \
517+
__ret.x = (__ElementType)__tmp.x; \
518+
__ret.y = (__ElementType)__tmp.y; \
519+
__ret.z = (__ElementType)__tmp.z; \
520+
__ret.w = (__ElementType)__tmp.w; \
521+
return __ret; \
522+
}
523+
524+
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r", );
525+
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r", );
526+
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h", );
527+
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r", );
528+
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long,
529+
"=l", );
530+
531+
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r", );
532+
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r", );
533+
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h", );
534+
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h", );
535+
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r", );
536+
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r", );
537+
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64 ", longlong2, longlong2, "=l", );
538+
539+
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int,
540+
"=r", );
541+
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short,
542+
"=h", );
543+
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int,
544+
"=r", );
545+
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long,
546+
unsigned long long, "=l", );
547+
548+
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r", );
549+
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r", );
550+
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h", );
551+
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h", );
552+
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r", );
553+
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r", );
554+
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2,
555+
"=l", );
556+
557+
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f", );
558+
__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d", );
559+
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f", );
560+
__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f", );
561+
__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d", );
562+
563+
inline __device__ long __ldcg(const long *__ptr) {
564+
unsigned long __ret;
565+
if (sizeof(long) == 8) {
566+
asm("ld.global.cg.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
567+
} else {
568+
asm("ld.global.cg.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
569+
}
570+
return (long)__ret;
571+
}
572+
573+
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int,
574+
"=r", : "memory");
575+
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short,
576+
"=h", : "memory");
577+
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int,
578+
"=r", : "memory");
579+
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long,
580+
unsigned long long, "=l", : "memory");
581+
582+
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", char, unsigned int,
583+
"=r", : "memory");
584+
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", signed char, unsigned int,
585+
"=r", : "memory");
586+
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s16", short, unsigned short,
587+
"=h", : "memory");
588+
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s32", int, unsigned int,
589+
"=r", : "memory");
590+
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s64", long long, unsigned long long,
591+
"=l", : "memory");
592+
593+
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2,
594+
"=r", : "memory");
595+
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4,
596+
"=r", : "memory");
597+
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2,
598+
"=h", : "memory");
599+
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4,
600+
"=h", : "memory");
601+
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2,
602+
"=r", : "memory");
603+
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4,
604+
"=r", : "memory");
605+
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2,
606+
"=l", : "memory");
607+
608+
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s8", char2, int2, "=r", : "memory");
609+
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s8", char4, int4, "=r", : "memory");
610+
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s16", short2, short2,
611+
"=h", : "memory");
612+
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s16", short4, short4,
613+
"=h", : "memory");
614+
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s32", int2, int2, "=r", : "memory");
615+
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s32", int4, int4, "=r", : "memory");
616+
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s64", longlong2, longlong2,
617+
"=l", : "memory");
618+
619+
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f", : "memory");
620+
__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d", : "memory");
621+
622+
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2,
623+
"=f", : "memory");
624+
__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4,
625+
"=f", : "memory");
626+
__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2,
627+
"=d", : "memory");
628+
629+
inline __device__ long __ldcv(const long *__ptr) {
630+
unsigned long __ret;
631+
if (sizeof(long) == 8) {
632+
asm("ld.global.cv.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
633+
} else {
634+
asm("ld.global.cv.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
635+
}
636+
return (long)__ret;
637+
}
638+
639+
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r", );
640+
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, signed int, "=r", );
641+
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h", );
642+
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r", );
643+
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long,
644+
"=l", );
645+
646+
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r", );
647+
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r", );
648+
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h", );
649+
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h", );
650+
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r", );
651+
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r", );
652+
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2, "=l", );
653+
654+
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int,
655+
"=r", );
656+
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short,
657+
"=h", );
658+
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int,
659+
"=r", );
660+
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long,
661+
unsigned long long, "=l", );
662+
663+
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r", );
664+
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r", );
665+
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h", );
666+
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h", );
667+
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r", );
668+
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r", );
669+
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2,
670+
"=l", );
671+
672+
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f", );
673+
__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d", );
674+
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f", );
675+
__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f", );
676+
__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d", );
677+
678+
#pragma pop_macro("__INTRINSIC_LOAD")
679+
#pragma pop_macro("__INTRINSIC_LOAD2")
680+
#pragma pop_macro("__INTRINSIC_LOAD4")
681+
682+
inline __device__ long __ldcs(const long *__ptr) {
683+
unsigned long __ret;
684+
if (sizeof(long) == 8) {
685+
asm("ld.global.cs.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
686+
} else {
687+
asm("ld.global.cs.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
688+
}
689+
return (long)__ret;
690+
}
691+
692+
#pragma push_macro("__INTRINSIC_STORE")
693+
#define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) \
694+
inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
695+
__TmpType __tmp = (__TmpType)__value; \
696+
asm(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory"); \
697+
}
698+
699+
#pragma push_macro("__INTRINSIC_STORE2")
700+
#define __INTRINSIC_STORE2(__FnName, __AsmOp, __DeclType, __TmpType, \
701+
__AsmType) \
702+
inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
703+
__TmpType __tmp; \
704+
using __ElementType = decltype(__tmp.x); \
705+
__tmp.x = (__ElementType)(__value.x); \
706+
__tmp.y = (__ElementType)(__value.y); \
707+
asm(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x), \
708+
__AsmType(__tmp.y) \
709+
: "memory"); \
710+
}
711+
712+
#pragma push_macro("__INTRINSIC_STORE4")
713+
#define __INTRINSIC_STORE4(__FnName, __AsmOp, __DeclType, __TmpType, \
714+
__AsmType) \
715+
inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \
716+
__TmpType __tmp; \
717+
using __ElementType = decltype(__tmp.x); \
718+
__tmp.x = (__ElementType)(__value.x); \
719+
__tmp.y = (__ElementType)(__value.y); \
720+
__tmp.z = (__ElementType)(__value.z); \
721+
__tmp.w = (__ElementType)(__value.w); \
722+
asm(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x), \
723+
__AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w) \
724+
: "memory"); \
725+
}
726+
727+
__INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r");
728+
__INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r");
729+
__INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h");
730+
__INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r");
731+
__INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l");
732+
733+
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r");
734+
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r");
735+
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h");
736+
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h");
737+
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r");
738+
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r");
739+
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l");
740+
741+
__INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r");
742+
__INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short,
743+
"h");
744+
__INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r");
745+
__INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long,
746+
unsigned long long, "l");
747+
748+
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uchar2, "r");
749+
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r");
750+
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h");
751+
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h");
752+
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r");
753+
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r");
754+
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2, "l");
755+
756+
__INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f");
757+
__INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d");
758+
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f");
759+
__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f");
760+
__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d");
761+
762+
#pragma pop_macro("__INTRINSIC_STORE")
763+
#pragma pop_macro("__INTRINSIC_STORE2")
764+
#pragma pop_macro("__INTRINSIC_STORE4")
765+
482766
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
483767

484768
#if CUDA_VERSION >= 11000

0 commit comments

Comments
 (0)