@@ -111,24 +111,6 @@ pub unsafe fn _tile_dpbf16ps<const DST: i32, const A: i32, const B: i32>() {
111111 tdpbf16ps ( DST as i8 , A as i8 , B as i8 ) ;
112112}
113113
114- /// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in tiles a and b,
115- /// accumulating the intermediate single-precision (32-bit) floating-point elements
116- /// with elements in dst, and store the 32-bit result back to tile dst.
117- #[ inline]
118- #[ rustc_legacy_const_generics( 0 , 1 , 2 ) ]
119- #[ target_feature( enable = "amx-bf16,amx-transpose" ) ]
120- #[ cfg_attr(
121- all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
122- assert_instr( ttdpbf16ps, DST = 0 , A = 1 , B = 2 )
123- ) ]
124- #[ unstable( feature = "x86_amx_intrinsics" , issue = "126622" ) ]
125- pub unsafe fn _tile_tdpbf16ps < const DST : i32 , const A : i32 , const B : i32 > ( ) {
126- static_assert_uimm_bits ! ( DST , 3 ) ;
127- static_assert_uimm_bits ! ( A , 3 ) ;
128- static_assert_uimm_bits ! ( B , 3 ) ;
129- ttdpbf16ps ( DST as i8 , A as i8 , B as i8 ) ;
130- }
131-
132114/// Compute dot-product of bytes in tiles with a source/destination accumulator.
133115/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding
134116/// signed 8-bit integers in b, producing 4 intermediate 32-bit results.
@@ -218,24 +200,6 @@ pub unsafe fn _tile_dpfp16ps<const DST: i32, const A: i32, const B: i32>() {
218200 tdpfp16ps ( DST as i8 , A as i8 , B as i8 ) ;
219201}
220202
221- /// Compute transpose and dot-product of FP16 (16-bit) floating-point pairs in tiles a and b,
222- /// accumulating the intermediate single-precision (32-bit) floating-point elements
223- /// with elements in dst, and store the 32-bit result back to tile dst.
224- #[ inline]
225- #[ rustc_legacy_const_generics( 0 , 1 , 2 ) ]
226- #[ target_feature( enable = "amx-fp16,amx-transpose" ) ]
227- #[ cfg_attr(
228- all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
229- assert_instr( ttdpfp16ps, DST = 0 , A = 1 , B = 2 )
230- ) ]
231- #[ unstable( feature = "x86_amx_intrinsics" , issue = "126622" ) ]
232- pub unsafe fn _tile_tdpfp16ps < const DST : i32 , const A : i32 , const B : i32 > ( ) {
233- static_assert_uimm_bits ! ( DST , 3 ) ;
234- static_assert_uimm_bits ! ( A , 3 ) ;
235- static_assert_uimm_bits ! ( B , 3 ) ;
236- ttdpfp16ps ( DST as i8 , A as i8 , B as i8 ) ;
237- }
238-
239203/// Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile.
240204/// Each dword element in input tiles a and b is interpreted as a complex number with FP16 real part and FP16 imaginary part.
241205/// Calculates the imaginary part of the result. For each possible combination of (row of a, column of b),
@@ -278,87 +242,6 @@ pub unsafe fn _tile_cmmrlfp16ps<const DST: i32, const A: i32, const B: i32>() {
278242 tcmmrlfp16ps ( DST as i8 , A as i8 , B as i8 ) ;
279243}
280244
281- /// Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile.
282- /// Each dword element in input tiles a and b is interpreted as a complex number with FP16 real part and FP16 imaginary part.
283- /// Calculates the imaginary part of the result. For each possible combination of (transposed column of a, column of b),
284- /// it performs a set of multiplication and accumulations on all corresponding complex numbers (one from a and one from b).
285- /// The imaginary part of the a element is multiplied with the real part of the corresponding b element, and the real part of
286- /// the a element is multiplied with the imaginary part of the corresponding b elements. The two accumulated results are added,
287- /// and then accumulated into the corresponding row and column of dst.
288- #[ inline]
289- #[ rustc_legacy_const_generics( 0 , 1 , 2 ) ]
290- #[ target_feature( enable = "amx-complex,amx-transpose" ) ]
291- #[ cfg_attr(
292- all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
293- assert_instr( ttcmmimfp16ps, DST = 0 , A = 1 , B = 2 )
294- ) ]
295- #[ unstable( feature = "x86_amx_intrinsics" , issue = "126622" ) ]
296- pub unsafe fn _tile_tcmmimfp16ps < const DST : i32 , const A : i32 , const B : i32 > ( ) {
297- static_assert_uimm_bits ! ( DST , 3 ) ;
298- static_assert_uimm_bits ! ( A , 3 ) ;
299- static_assert_uimm_bits ! ( B , 3 ) ;
300- ttcmmimfp16ps ( DST as i8 , A as i8 , B as i8 ) ;
301- }
302-
303- /// Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile.
304- /// Each dword element in input tiles a and b is interpreted as a complex number with FP16 real part and FP16 imaginary part.
305- /// Calculates the real part of the result. For each possible combination of (transposed column of a, column of b),
306- /// it performs a set of multiplication and accumulations on all corresponding complex numbers (one from a and one from b).
307- /// The real part of the a element is multiplied with the real part of the corresponding b element, and the negated imaginary part of
308- /// the a element is multiplied with the imaginary part of the corresponding b elements.
309- /// The two accumulated results are added, and then accumulated into the corresponding row and column of dst.
310- #[ inline]
311- #[ rustc_legacy_const_generics( 0 , 1 , 2 ) ]
312- #[ target_feature( enable = "amx-complex,amx-transpose" ) ]
313- #[ cfg_attr(
314- all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
315- assert_instr( ttcmmrlfp16ps, DST = 0 , A = 1 , B = 2 )
316- ) ]
317- #[ unstable( feature = "x86_amx_intrinsics" , issue = "126622" ) ]
318- pub unsafe fn _tile_ttcmmrlfp16ps < const DST : i32 , const A : i32 , const B : i32 > ( ) {
319- static_assert_uimm_bits ! ( DST , 3 ) ;
320- static_assert_uimm_bits ! ( A , 3 ) ;
321- static_assert_uimm_bits ! ( B , 3 ) ;
322- ttcmmrlfp16ps ( DST as i8 , A as i8 , B as i8 ) ;
323- }
324-
325- /// Perform matrix conjugate transpose and multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile.
326- /// Each dword element in input tiles a and b is interpreted as a complex number with FP16 real part and FP16 imaginary part.
327- /// Calculates the imaginary part of the result. For each possible combination of (transposed column of a, column of b),
328- /// it performs a set of multiplication and accumulations on all corresponding complex numbers (one from a and one from b).
329- /// The negated imaginary part of the a element is multiplied with the real part of the corresponding b element, and the real part of
330- /// the a element is multiplied with the imaginary part of the corresponding b elements. The two accumulated results are added,
331- /// and then accumulated into the corresponding row and column of dst.
332- #[ inline]
333- #[ rustc_legacy_const_generics( 0 , 1 , 2 ) ]
334- #[ target_feature( enable = "amx-complex,amx-transpose" ) ]
335- #[ cfg_attr(
336- all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
337- assert_instr( tconjtcmmimfp16ps, DST = 0 , A = 1 , B = 2 )
338- ) ]
339- #[ unstable( feature = "x86_amx_intrinsics" , issue = "126622" ) ]
340- pub unsafe fn _tile_conjtcmmimfp16ps < const DST : i32 , const A : i32 , const B : i32 > ( ) {
341- static_assert_uimm_bits ! ( DST , 3 ) ;
342- static_assert_uimm_bits ! ( A , 3 ) ;
343- static_assert_uimm_bits ! ( B , 3 ) ;
344- tconjtcmmimfp16ps ( DST as i8 , A as i8 , B as i8 ) ;
345- }
346-
347- /// Perform a conjugate transpose of an FP16-pair of complex numbers in tile a, and store the result in tile dst.
348- #[ inline]
349- #[ rustc_legacy_const_generics( 0 , 1 ) ]
350- #[ target_feature( enable = "amx-complex,amx-transpose" ) ]
351- #[ cfg_attr(
352- all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
353- assert_instr( tconjtfp16, DST = 0 , A = 1 )
354- ) ]
355- #[ unstable( feature = "x86_amx_intrinsics" , issue = "126622" ) ]
356- pub unsafe fn _tile_conjtfp16 < const DST : i32 , const A : i32 > ( ) {
357- static_assert_uimm_bits ! ( DST , 3 ) ;
358- static_assert_uimm_bits ! ( A , 3 ) ;
359- tconjtfp16 ( DST as i8 , A as i8 ) ;
360- }
361-
362245/// Compute dot-product of BF8 (8-bit) floating-point pairs in tiles a and b,
363246/// accumulating the intermediate single-precision (32-bit) floating-point elements
364247/// with elements in dst, and store the 32-bit result back to tile dst.
@@ -497,176 +380,6 @@ pub unsafe fn _tile_mmultf32ps<const DST: i32, const A: i32, const B: i32>() {
497380 tmmultf32ps ( DST as i8 , A as i8 , B as i8 ) ;
498381}
499382
500- /// Perform matrix transpose and multiplication of two tiles a and b, containing packed single precision (32-bit)
501- /// floating-point elements, which are converted to TF32 (tensor-float32) format, and accumulate the
502- /// results into a packed single precision tile.
503- /// For each possible combination of (transposed column of a, column of b), it performs
504- /// - convert to TF32
505- /// - multiply the corresponding elements of a and b
506- /// - accumulate the results into the corresponding row and column of dst using round-to-nearest-even
507- /// rounding mode.
508- /// Output FP32 denormals are always flushed to zero, input single precision denormals are always
509- /// handled and *not* treated as zero.
510- #[ inline]
511- #[ rustc_legacy_const_generics( 0 , 1 , 2 ) ]
512- #[ target_feature( enable = "amx-tf32,amx-transpose" ) ]
513- #[ cfg_attr(
514- all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
515- assert_instr( ttmmultf32ps, DST = 0 , A = 1 , B = 2 )
516- ) ]
517- #[ unstable( feature = "x86_amx_intrinsics" , issue = "126622" ) ]
518- pub unsafe fn _tile_tmmultf32ps < const DST : i32 , const A : i32 , const B : i32 > ( ) {
519- static_assert_uimm_bits ! ( DST , 3 ) ;
520- static_assert_uimm_bits ! ( A , 3 ) ;
521- static_assert_uimm_bits ! ( B , 3 ) ;
522- ttmmultf32ps ( DST as i8 , A as i8 , B as i8 ) ;
523- }
524-
525- /// Transposes 32-bit elements in tile a and stores the result in tile dst.
526- #[ inline]
527- #[ rustc_legacy_const_generics( 0 , 1 ) ]
528- #[ target_feature( enable = "amx-transpose" ) ]
529- #[ cfg_attr(
530- all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
531- assert_instr( ttransposed, DST = 0 , A = 1 )
532- ) ]
533- #[ unstable( feature = "x86_amx_intrinsics" , issue = "126622" ) ]
534- pub unsafe fn _tile_transposed < const DST : i32 , const A : i32 > ( ) {
535- static_assert_uimm_bits ! ( DST , 3 ) ;
536- static_assert_uimm_bits ! ( A , 3 ) ;
537- ttransposed ( DST as i8 , A as i8 ) ;
538- }
539-
540- /// TODO - Document
541- #[ inline]
542- #[ rustc_legacy_const_generics( 0 ) ]
543- #[ target_feature( enable = "amx-transpose" ) ]
544- #[ cfg_attr(
545- all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
546- assert_instr( t2rpntlvwz0, DST = 0 )
547- ) ]
548- #[ unstable( feature = "x86_amx_intrinsics" , issue = "126622" ) ]
549- pub unsafe fn _tile_2rpntlvwz0 < const DST : i32 > ( base : * const u8 , stride : usize ) {
550- static_assert_uimm_bits ! ( DST , 3 ) ;
551- t2rpntlvwz0 ( DST as i8 , base, stride) ;
552- }
553-
554- /// TODO - Document
555- /// Provides a hint to the implementation that the data would be reused but does not need
556- /// to be resident in the nearest cache levels.
557- #[ rustc_legacy_const_generics( 0 ) ]
558- #[ target_feature( enable = "amx-transpose" ) ]
559- #[ cfg_attr(
560- all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
561- assert_instr( t2rpntlvwz0t1, DST = 0 )
562- ) ]
563- #[ unstable( feature = "x86_amx_intrinsics" , issue = "126622" ) ]
564- pub unsafe fn _tile_2rpntlvwz0t1 < const DST : i32 > ( base : * const u8 , stride : usize ) {
565- static_assert_uimm_bits ! ( DST , 3 ) ;
566- t2rpntlvwz0t1 ( DST as i8 , base, stride) ;
567- }
568-
569- /// TODO - Document
570- #[ inline]
571- #[ rustc_legacy_const_generics( 0 ) ]
572- #[ target_feature( enable = "amx-transpose" ) ]
573- #[ cfg_attr(
574- all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
575- assert_instr( t2rpntlvwz1, DST = 0 )
576- ) ]
577- #[ unstable( feature = "x86_amx_intrinsics" , issue = "126622" ) ]
578- pub unsafe fn _tile_2rpntlvwz1 < const DST : i32 > ( base : * const u8 , stride : usize ) {
579- static_assert_uimm_bits ! ( DST , 3 ) ;
580- t2rpntlvwz1 ( DST as i8 , base, stride) ;
581- }
582-
583- /// TODO - Document
584- /// Provides a hint to the implementation that the data would be reused but does not need
585- /// to be resident in the nearest cache levels.
586- #[ inline]
587- #[ rustc_legacy_const_generics( 0 ) ]
588- #[ target_feature( enable = "amx-transpose" ) ]
589- #[ cfg_attr(
590- all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
591- assert_instr( t2rpntlvwz1t1, DST = 0 )
592- ) ]
593- #[ unstable( feature = "x86_amx_intrinsics" , issue = "126622" ) ]
594- pub unsafe fn _tile_2rpntlvwz1t1 < const DST : i32 > ( base : * const u8 , stride : usize ) {
595- static_assert_uimm_bits ! ( DST , 3 ) ;
596- t2rpntlvwz1t1 ( DST as i8 , base, stride) ;
597- }
598-
599- /// TODO - Document
600- /// Additionally, this intrinsic indicates the source memory location is likely to become
601- /// read-shared by multiple processors, i.e., read in the future by at least one other processor
602- /// before it is written, assuming it is ever written in the future.
603- #[ inline]
604- #[ rustc_legacy_const_generics( 0 ) ]
605- #[ target_feature( enable = "amx-transpose,amx-movrs" ) ]
606- #[ cfg_attr(
607- all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
608- assert_instr( t2rpntlvwz0rs, DST = 0 )
609- ) ]
610- #[ unstable( feature = "x86_amx_intrinsics" , issue = "126622" ) ]
611- pub unsafe fn _tile_2rpntlvwz0rs < const DST : i32 > ( base : * const u8 , stride : usize ) {
612- static_assert_uimm_bits ! ( DST , 3 ) ;
613- t2rpntlvwz0rs ( DST as i8 , base, stride) ;
614- }
615-
616- /// TODO - Document
617- /// Provides a hint to the implementation that the data would be reused but does not need
618- /// to be resident in the nearest cache levels.
619- /// Additionally, this intrinsic indicates the source memory location is likely to become
620- /// read-shared by multiple processors, i.e., read in the future by at least one other processor
621- /// before it is written, assuming it is ever written in the future.
622- #[ rustc_legacy_const_generics( 0 ) ]
623- #[ target_feature( enable = "amx-transpose,amx-movrs" ) ]
624- #[ cfg_attr(
625- all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
626- assert_instr( t2rpntlvwz0rst1, DST = 0 )
627- ) ]
628- #[ unstable( feature = "x86_amx_intrinsics" , issue = "126622" ) ]
629- pub unsafe fn _tile_2rpntlvwz0rst1 < const DST : i32 > ( base : * const u8 , stride : usize ) {
630- static_assert_uimm_bits ! ( DST , 3 ) ;
631- t2rpntlvwz0rst1 ( DST as i8 , base, stride) ;
632- }
633-
634- /// TODO - Document
635- /// Additionally, this intrinsic indicates the source memory location is likely to become
636- /// read-shared by multiple processors, i.e., read in the future by at least one other processor
637- /// before it is written, assuming it is ever written in the future.
638- #[ inline]
639- #[ rustc_legacy_const_generics( 0 ) ]
640- #[ target_feature( enable = "amx-transpose,amx-movrs" ) ]
641- #[ cfg_attr(
642- all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
643- assert_instr( t2rpntlvwz1rs, DST = 0 )
644- ) ]
645- #[ unstable( feature = "x86_amx_intrinsics" , issue = "126622" ) ]
646- pub unsafe fn _tile_2rpntlvwz1rs < const DST : i32 > ( base : * const u8 , stride : usize ) {
647- static_assert_uimm_bits ! ( DST , 3 ) ;
648- t2rpntlvwz1rs ( DST as i8 , base, stride) ;
649- }
650-
651- /// TODO - Document
652- /// Provides a hint to the implementation that the data would be reused but does not need
653- /// to be resident in the nearest cache levels.
654- /// Additionally, this intrinsic indicates the source memory location is likely to become
655- /// read-shared by multiple processors, i.e., read in the future by at least one other processor
656- /// before it is written, assuming it is ever written in the future.
657- #[ inline]
658- #[ rustc_legacy_const_generics( 0 ) ]
659- #[ target_feature( enable = "amx-transpose,amx-movrs" ) ]
660- #[ cfg_attr(
661- all( test, any( target_os = "linux" , target_env = "msvc" ) ) ,
662- assert_instr( t2rpntlvwz1rst1, DST = 0 )
663- ) ]
664- #[ unstable( feature = "x86_amx_intrinsics" , issue = "126622" ) ]
665- pub unsafe fn _tile_2rpntlvwz1rst1 < const DST : i32 > ( base : * const u8 , stride : usize ) {
666- static_assert_uimm_bits ! ( DST , 3 ) ;
667- t2rpntlvwz1rst1 ( DST as i8 , base, stride) ;
668- }
669-
670383#[ allow( improper_ctypes) ]
671384unsafe extern "C" {
672385 #[ link_name = "llvm.x86.ldtilecfg" ]
@@ -685,8 +398,6 @@ unsafe extern "C" {
685398 fn tilezero ( dst : i8 ) ;
686399 #[ link_name = "llvm.x86.tdpbf16ps" ]
687400 fn tdpbf16ps ( dst : i8 , a : i8 , b : i8 ) ;
688- #[ link_name = "llvm.x86.ttdpbf16ps" ]
689- fn ttdpbf16ps ( dst : i8 , a : i8 , b : i8 ) ;
690401 #[ link_name = "llvm.x86.tdpbuud" ]
691402 fn tdpbuud ( dst : i8 , a : i8 , b : i8 ) ;
692403 #[ link_name = "llvm.x86.tdpbusd" ]
@@ -697,20 +408,10 @@ unsafe extern "C" {
697408 fn tdpbssd ( dst : i8 , a : i8 , b : i8 ) ;
698409 #[ link_name = "llvm.x86.tdpfp16ps" ]
699410 fn tdpfp16ps ( dst : i8 , a : i8 , b : i8 ) ;
700- #[ link_name = "llvm.x86.ttdpfp16ps" ]
701- fn ttdpfp16ps ( dst : i8 , a : i8 , b : i8 ) ;
702411 #[ link_name = "llvm.x86.tcmmimfp16ps" ]
703412 fn tcmmimfp16ps ( dst : i8 , a : i8 , b : i8 ) ;
704413 #[ link_name = "llvm.x86.tcmmrlfp16ps" ]
705414 fn tcmmrlfp16ps ( dst : i8 , a : i8 , b : i8 ) ;
706- #[ link_name = "llvm.x86.ttcmmimfp16ps" ]
707- fn ttcmmimfp16ps ( dst : i8 , a : i8 , b : i8 ) ;
708- #[ link_name = "llvm.x86.ttcmmrlfp16ps" ]
709- fn ttcmmrlfp16ps ( dst : i8 , a : i8 , b : i8 ) ;
710- #[ link_name = "llvm.x86.tconjtcmmimfp16ps" ]
711- fn tconjtcmmimfp16ps ( dst : i8 , a : i8 , b : i8 ) ;
712- #[ link_name = "llvm.x86.tconjtfp16" ]
713- fn tconjtfp16 ( dst : i8 , a : i8 ) ;
714415 #[ link_name = "llvm.x86.tdpbf8ps" ]
715416 fn tdpbf8ps ( dst : i8 , a : i8 , b : i8 ) ;
716417 #[ link_name = "llvm.x86.tdpbhf8ps" ]
@@ -725,26 +426,6 @@ unsafe extern "C" {
725426 fn tileloaddrst164 ( dst : i8 , base : * const u8 , stride : usize ) ;
726427 #[ link_name = "llvm.x86.tmmultf32ps" ]
727428 fn tmmultf32ps ( dst : i8 , a : i8 , b : i8 ) ;
728- #[ link_name = "llvm.x86.ttmmultf32ps" ]
729- fn ttmmultf32ps ( dst : i8 , a : i8 , b : i8 ) ;
730- #[ link_name = "llvm.x86.ttransposed" ]
731- fn ttransposed ( dst : i8 , a : i8 ) ;
732- #[ link_name = "llvm.x86.t2rpntlvwz0" ]
733- fn t2rpntlvwz0 ( dst : i8 , base : * const u8 , stride : usize ) ;
734- #[ link_name = "llvm.x86.t2rpntlvwz0t1" ]
735- fn t2rpntlvwz0t1 ( dst : i8 , base : * const u8 , stride : usize ) ;
736- #[ link_name = "llvm.x86.t2rpntlvwz1" ]
737- fn t2rpntlvwz1 ( dst : i8 , base : * const u8 , stride : usize ) ;
738- #[ link_name = "llvm.x86.t2rpntlvwz1t1" ]
739- fn t2rpntlvwz1t1 ( dst : i8 , base : * const u8 , stride : usize ) ;
740- #[ link_name = "llvm.x86.t2rpntlvwz0rs" ]
741- fn t2rpntlvwz0rs ( dst : i8 , base : * const u8 , stride : usize ) ;
742- #[ link_name = "llvm.x86.t2rpntlvwz0rst1" ]
743- fn t2rpntlvwz0rst1 ( dst : i8 , base : * const u8 , stride : usize ) ;
744- #[ link_name = "llvm.x86.t2rpntlvwz1rs" ]
745- fn t2rpntlvwz1rs ( dst : i8 , base : * const u8 , stride : usize ) ;
746- #[ link_name = "llvm.x86.t2rpntlvwz1rst1" ]
747- fn t2rpntlvwz1rst1 ( dst : i8 , base : * const u8 , stride : usize ) ;
748429}
749430
750431#[ cfg( test) ]
0 commit comments