Skip to content

Commit 0207843

Browse files
committed
Add: WGMMA synchronization
1 parent 733cbac commit 0207843

File tree

2 files changed

+516
-349
lines changed

2 files changed

+516
-349
lines changed

less_slow_sm80.ptx

Lines changed: 5 additions & 349 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
/**
2-
* less_slow_sm90a.ptx
2+
* less_slow_sm80.ptx
33
*
44
* Micro-kernels for building a performance-first mindset for CUDA-capable
55
* GPUs using Parallel Thread eXecution (PTX) Intermediate Representation (IR)
6-
* for Hopper-generation Nvidia GPUs and newer.
6+
* for Ampere-generation Nvidia GPUs with Warp-level MMA (WMMA).
77
*
88
* ? You should start at `less_slow.cu` before reading this file.
99
* ? You should start at `less_slow_sm70.ptx` before reading this file.
@@ -13,7 +13,7 @@
1313
* You can validate this file by asking the Nvidia PTX Assembler to compile it
1414
* to `.cubin` for some target architecture:
1515
*
16-
* $ ptxas -o less_slow_from_ptx.cubin -arch=sm_90a less_slow_sm90a.ptx
16+
* $ ptxas -o less_slow_from_ptx.cubin -arch=sm_80 less_slow_sm80.ptx
1717
* $ cuobjdump -sass less_slow_from_ptx.cubin | grep -i mma
1818
*
1919
* Assuming how aggressively NVCC unrolls loops and the number of kernels in
@@ -24,8 +24,8 @@
2424
* $ sed -r 's/^[[:space:]]+//; s/[[:space:]]+$//' | \
2525
* $ sort -u
2626
*/
27-
.version 8.0 // PTX version 8.0 for Hopper GPUs
28-
.target sm_90a // Target architecture (SM_90a - Hopper GPUs)
27+
.version 7.0 // PTX version 7.0 for Ampere GPUs
28+
.target sm_80 // Target architecture (SM_80 - Ampere GPUs)
2929
.address_size 64 // 64-bit addressing
3030

3131
/**
@@ -283,347 +283,3 @@ loop_exit:
283283
st.global.volatile.f32 [dummy_sink_f32+12], accum3;
284284
ret;
285285
}
286-
287-
/**
288-
* The instruction syntax for Warp-Group asynchronous instructions is very
289-
* different, as at least one of the operand matrices has to be in shared
290-
* memory (not registers). It's documented as in 2 variants:
291-
*
292-
* wgmma.mma_async.sync.aligned.shape.dtype.tf32.tf32
293-
* d, a-desc, b-desc, scale-d, imm-scale-a, imm-scale-b;
294-
* wgmma.mma_async.sync.aligned.shape.dtype.tf32.tf32
295-
* d, a, b-desc, scale-d, imm-scale-a, imm-scale-b;
296-
*
297-
* There is no "C" matrix involved at all; we are computing `D = A * B + D`.
298-
* The A and B matrix descriptors are the properties of the matrix in shared
299-
* memory. It is a 64-bit value with the following layout:
300-
*
301-
* - 14 bits [0; 13]: start address
302-
* - 14 bits [16; 29]: leading dimension byte offset
303-
* - 14 bits [32; 45]: stride dimension byte offset
304-
* - 3 bits [49; 51]: matrix base offset, valid only for "swizzling"
305-
* - 2 bits [62; 63]: "swizzling" mode
306-
*
307-
* Swizzling defines the order of the elements and can have 4 possible values:
308-
*
309-
* 0: no "swizzling" at all
310-
* 1: a 128-byte "swizzle" with a 1024 byte offset of a repeating pattern
311-
* 2: a 64-byte "swizzle" with a 512 byte offset of a repeating pattern
312-
* 3: a 32-byte "swizzle" with a 256 byte offset of a repeating pattern
313-
*
314-
* The list of supported shapes is exhausting:
315-
*
316-
* .m64n8k8, .m64n16k8, .m64n24k8, .m64n32k8,
317-
* .m64n40k8, .m64n48k8, .m64n56k8, .m64n64k8,
318-
* .m64n72k8, .m64n80k8, .m64n88k8, .m64n96k8,
319-
* .m64n104k8, .m64n112k8, .m64n120k8, .m64n128k8,
320-
* .m64n136k8, .m64n144k8, .m64n152k8, .m64n160k8,
321-
* .m64n168k8, .m64n176k8, .m64n184k8, .m64n192k8,
322-
* .m64n200k8, .m64n208k8, .m64n216k8, .m64n224k8,
323-
* .m64n232k8, .m64n240k8, .m64n248k8, .m64n256k8
324-
*
325-
* The `scale` parameters can be used to either negate the inputs, or disable
326-
* additive bias accumulation in the output.
327-
*/
328-
.visible .entry tops_tf32f32_sm90tc_m64n16k8_loop128_ptx_kernel()
329-
{
330-
// Accumulator registers used for both input and output of this MMA
331-
.reg .f32 accum<8>;
332-
333-
// Descriptors for matrix A and matrix B operands
334-
.reg .b64 desc_a, desc_b;
335-
336-
// General-purpose registers for loop control
337-
.reg .b32 loop_counter, loop_limit;
338-
339-
// Predicate register for conditional branching (loop exit)
340-
.reg .pred exit_predicate;
341-
342-
// Set up loop counter and loop limit
343-
mov.u32 loop_counter, 0;
344-
mov.u32 loop_limit, 128;
345-
346-
// Zero-initialize the accumulator registers
347-
mov.f32 accum0, 0.0;
348-
mov.f32 accum1, 0.0;
349-
mov.f32 accum2, 0.0;
350-
mov.f32 accum3, 0.0;
351-
mov.f32 accum4, 0.0;
352-
mov.f32 accum5, 0.0;
353-
mov.f32 accum6, 0.0;
354-
mov.f32 accum7, 0.0;
355-
356-
// Initialize matrix descriptors with arbitrary placeholder values
357-
mov.u64 desc_a, 0x0000000000000000;
358-
mov.u64 desc_b, 0x0000000000000000;
359-
360-
// Enforce the ordering for Warp-Group instructions
361-
wgmma.fence.sync.aligned;
362-
363-
// The main loop will repeat for 128 iterations
364-
loop_start:
365-
setp.ge.u32 exit_predicate, loop_counter, loop_limit;
366-
@exit_predicate bra loop_exit;
367-
368-
wgmma.mma_async.sync.aligned.m64n16k8.f32.tf32.tf32
369-
{ accum0, accum1, accum2, accum3, accum4, accum5, accum6, accum7 },
370-
desc_a,
371-
desc_b,
372-
0, -1, -1;
373-
374-
// Increment the loop counter
375-
add.u32 loop_counter, loop_counter, 1;
376-
377-
// Branch back to the beginning of the loop
378-
bra loop_start;
379-
380-
loop_exit:
381-
// Commit all prior uncommitted operations to the group and wait!
382-
wgmma.commit_group.sync.aligned;
383-
wgmma.wait_group.sync.aligned 0;
384-
385-
// Use volatile stores to force the accumulator values to be written out.
386-
// This dummy write (to a global variable) makes the work observable and
387-
// prevents the multiplication pipeline from being optimized out.
388-
st.global.volatile.f32 [dummy_sink_f32], accum0;
389-
st.global.volatile.f32 [dummy_sink_f32+4], accum1;
390-
st.global.volatile.f32 [dummy_sink_f32+8], accum2;
391-
st.global.volatile.f32 [dummy_sink_f32+12], accum3;
392-
ret;
393-
}
394-
395-
/**
396-
* This results in massive performance gains on Hopper:
397-
* - 16x16x8 MMA computed by individual warps: 74 T
398-
* - 64x16x8 WGMMA computed by four warps together: 300 T
399-
*
400-
* Will it get even better with larger matrices if we scale the second
401-
* dimension from 16 to 256? It would require 128 accumulators.
402-
*/
403-
404-
.visible .entry tops_tf32f32_sm90tc_m64n256k8_loop128_ptx_kernel()
405-
{
406-
// Accumulator registers used for both input and output of this MMA
407-
.reg .f32 accum<128>;
408-
409-
// Descriptors for matrix A and matrix B operands
410-
.reg .b64 desc_a, desc_b;
411-
412-
// General-purpose registers for loop control
413-
.reg .b32 loop_counter, loop_limit;
414-
415-
// Predicate register for conditional branching (loop exit)
416-
.reg .pred exit_predicate;
417-
418-
// Set up loop counter and loop limit to fill accumulators
419-
mov.u32 loop_counter, 0;
420-
mov.u32 loop_limit, 128;
421-
422-
// Zero-initialize the accumulator registers:
423-
mov.f32 accum0, 0.0; mov.f32 accum1, 0.0; mov.f32 accum2, 0.0; mov.f32 accum3, 0.0;
424-
mov.f32 accum4, 0.0; mov.f32 accum5, 0.0; mov.f32 accum6, 0.0; mov.f32 accum7, 0.0;
425-
mov.f32 accum8, 0.0; mov.f32 accum9, 0.0; mov.f32 accum10, 0.0; mov.f32 accum11, 0.0;
426-
mov.f32 accum12, 0.0; mov.f32 accum13, 0.0; mov.f32 accum14, 0.0; mov.f32 accum15, 0.0;
427-
mov.f32 accum16, 0.0; mov.f32 accum17, 0.0; mov.f32 accum18, 0.0; mov.f32 accum19, 0.0;
428-
mov.f32 accum20, 0.0; mov.f32 accum21, 0.0; mov.f32 accum22, 0.0; mov.f32 accum23, 0.0;
429-
mov.f32 accum24, 0.0; mov.f32 accum25, 0.0; mov.f32 accum26, 0.0; mov.f32 accum27, 0.0;
430-
mov.f32 accum28, 0.0; mov.f32 accum29, 0.0; mov.f32 accum30, 0.0; mov.f32 accum31, 0.0;
431-
mov.f32 accum32, 0.0; mov.f32 accum33, 0.0; mov.f32 accum34, 0.0; mov.f32 accum35, 0.0;
432-
mov.f32 accum36, 0.0; mov.f32 accum37, 0.0; mov.f32 accum38, 0.0; mov.f32 accum39, 0.0;
433-
mov.f32 accum40, 0.0; mov.f32 accum41, 0.0; mov.f32 accum42, 0.0; mov.f32 accum43, 0.0;
434-
mov.f32 accum44, 0.0; mov.f32 accum45, 0.0; mov.f32 accum46, 0.0; mov.f32 accum47, 0.0;
435-
mov.f32 accum48, 0.0; mov.f32 accum49, 0.0; mov.f32 accum50, 0.0; mov.f32 accum51, 0.0;
436-
mov.f32 accum52, 0.0; mov.f32 accum53, 0.0; mov.f32 accum54, 0.0; mov.f32 accum55, 0.0;
437-
mov.f32 accum56, 0.0; mov.f32 accum57, 0.0; mov.f32 accum58, 0.0; mov.f32 accum59, 0.0;
438-
mov.f32 accum60, 0.0; mov.f32 accum61, 0.0; mov.f32 accum62, 0.0; mov.f32 accum63, 0.0;
439-
mov.f32 accum64, 0.0; mov.f32 accum65, 0.0; mov.f32 accum66, 0.0; mov.f32 accum67, 0.0;
440-
mov.f32 accum68, 0.0; mov.f32 accum69, 0.0; mov.f32 accum70, 0.0; mov.f32 accum71, 0.0;
441-
mov.f32 accum72, 0.0; mov.f32 accum73, 0.0; mov.f32 accum74, 0.0; mov.f32 accum75, 0.0;
442-
mov.f32 accum76, 0.0; mov.f32 accum77, 0.0; mov.f32 accum78, 0.0; mov.f32 accum79, 0.0;
443-
mov.f32 accum80, 0.0; mov.f32 accum81, 0.0; mov.f32 accum82, 0.0; mov.f32 accum83, 0.0;
444-
mov.f32 accum84, 0.0; mov.f32 accum85, 0.0; mov.f32 accum86, 0.0; mov.f32 accum87, 0.0;
445-
mov.f32 accum88, 0.0; mov.f32 accum89, 0.0; mov.f32 accum90, 0.0; mov.f32 accum91, 0.0;
446-
mov.f32 accum92, 0.0; mov.f32 accum93, 0.0; mov.f32 accum94, 0.0; mov.f32 accum95, 0.0;
447-
mov.f32 accum96, 0.0; mov.f32 accum97, 0.0; mov.f32 accum98, 0.0; mov.f32 accum99, 0.0;
448-
mov.f32 accum100, 0.0; mov.f32 accum101, 0.0; mov.f32 accum102, 0.0; mov.f32 accum103, 0.0;
449-
mov.f32 accum104, 0.0; mov.f32 accum105, 0.0; mov.f32 accum106, 0.0; mov.f32 accum107, 0.0;
450-
mov.f32 accum108, 0.0; mov.f32 accum109, 0.0; mov.f32 accum110, 0.0; mov.f32 accum111, 0.0;
451-
mov.f32 accum112, 0.0; mov.f32 accum113, 0.0; mov.f32 accum114, 0.0; mov.f32 accum115, 0.0;
452-
mov.f32 accum116, 0.0; mov.f32 accum117, 0.0; mov.f32 accum118, 0.0; mov.f32 accum119, 0.0;
453-
mov.f32 accum120, 0.0; mov.f32 accum121, 0.0; mov.f32 accum122, 0.0; mov.f32 accum123, 0.0;
454-
mov.f32 accum124, 0.0; mov.f32 accum125, 0.0; mov.f32 accum126, 0.0; mov.f32 accum127, 0.0;
455-
456-
// Initialize matrix descriptors with arbitrary placeholder values
457-
mov.u64 desc_a, 0x0000000000000000;
458-
mov.u64 desc_b, 0x0000000000000000;
459-
460-
// Enforce the ordering for Warp-Group instructions
461-
wgmma.fence.sync.aligned;
462-
463-
// The main loop will repeat for 128 iterations
464-
loop_start:
465-
setp.ge.u32 exit_predicate, loop_counter, loop_limit;
466-
@exit_predicate bra loop_exit;
467-
468-
wgmma.mma_async.sync.aligned.m64n256k8.f32.tf32.tf32
469-
{ accum0, accum1, accum2, accum3, accum4, accum5, accum6, accum7,
470-
accum8, accum9, accum10, accum11, accum12, accum13, accum14, accum15,
471-
accum16, accum17, accum18, accum19, accum20, accum21, accum22, accum23,
472-
accum24, accum25, accum26, accum27, accum28, accum29, accum30, accum31,
473-
accum32, accum33, accum34, accum35, accum36, accum37, accum38, accum39,
474-
accum40, accum41, accum42, accum43, accum44, accum45, accum46, accum47,
475-
accum48, accum49, accum50, accum51, accum52, accum53, accum54, accum55,
476-
accum56, accum57, accum58, accum59, accum60, accum61, accum62, accum63,
477-
accum64, accum65, accum66, accum67, accum68, accum69, accum70, accum71,
478-
accum72, accum73, accum74, accum75, accum76, accum77, accum78, accum79,
479-
accum80, accum81, accum82, accum83, accum84, accum85, accum86, accum87,
480-
accum88, accum89, accum90, accum91, accum92, accum93, accum94, accum95,
481-
accum96, accum97, accum98, accum99, accum100, accum101, accum102, accum103,
482-
accum104, accum105, accum106, accum107, accum108, accum109, accum110, accum111,
483-
accum112, accum113, accum114, accum115, accum116, accum117, accum118, accum119,
484-
accum120, accum121, accum122, accum123, accum124, accum125, accum126, accum127 },
485-
desc_a,
486-
desc_b,
487-
0, -1, -1;
488-
489-
// Increment the loop counter
490-
add.u32 loop_counter, loop_counter, 1;
491-
492-
// Branch back to the beginning of the loop
493-
bra loop_start;
494-
495-
loop_exit:
496-
// Commit all prior uncommitted operations to the group and wait!
497-
wgmma.commit_group.sync.aligned;
498-
wgmma.wait_group.sync.aligned 0;
499-
500-
// Use volatile stores to force the accumulator values to be written out.
501-
// This dummy write (to a global variable) makes the work observable and
502-
// prevents the multiplication pipeline from being optimized out.
503-
st.global.volatile.f32 [dummy_sink_f32], accum0;
504-
st.global.volatile.f32 [dummy_sink_f32+4], accum1;
505-
st.global.volatile.f32 [dummy_sink_f32+8], accum2;
506-
st.global.volatile.f32 [dummy_sink_f32+12], accum3;
507-
ret;
508-
}
509-
510-
/**
511-
* This results in massive performance gains on Hopper:
512-
* - 16x16x8 MMA computed by individual warps: 74 T
513-
* - 64x16x8 WGMMA computed by four warps together: 300 T
514-
* - 64x256x8 WGMMA computed by four warps together: 4.7 P ?!
515-
*
516-
* There are also "structured-sparse" variants of those instructions, in case
517-
* half of our entries are zeros! Those, however, simply expand the last
518-
* dimension by 2x, making the instructions no more usable for small matrices.
519-
*/
520-
521-
.visible .entry tops_b1i32and_sm90tc_m64n256k256_loop128_ptx_kernel()
522-
{
523-
// Accumulator registers used for both input and output of the MMA operation
524-
.reg .s32 accum<128>;
525-
526-
// Descriptors for matrix A and matrix B operands (in shared memory)
527-
.reg .b64 desc_a, desc_b;
528-
529-
// General-purpose registers for loop control
530-
.reg .b32 loop_counter, loop_limit;
531-
532-
// Predicate registers for conditional branching (loop exit) and scale flag
533-
.reg .pred exit_predicate, scale_d;
534-
535-
// Set up loop counter and loop limit
536-
mov.u32 loop_counter, 0;
537-
mov.u32 loop_limit, 128;
538-
539-
// Zero-initialize the accumulators, as registers may contain noise
540-
mov.s32 accum0, 0; mov.s32 accum1, 0; mov.s32 accum2, 0; mov.s32 accum3, 0;
541-
mov.s32 accum4, 0; mov.s32 accum5, 0; mov.s32 accum6, 0; mov.s32 accum7, 0;
542-
mov.s32 accum8, 0; mov.s32 accum9, 0; mov.s32 accum10, 0; mov.s32 accum11, 0;
543-
mov.s32 accum12, 0; mov.s32 accum13, 0; mov.s32 accum14, 0; mov.s32 accum15, 0;
544-
mov.s32 accum16, 0; mov.s32 accum17, 0; mov.s32 accum18, 0; mov.s32 accum19, 0;
545-
mov.s32 accum20, 0; mov.s32 accum21, 0; mov.s32 accum22, 0; mov.s32 accum23, 0;
546-
mov.s32 accum24, 0; mov.s32 accum25, 0; mov.s32 accum26, 0; mov.s32 accum27, 0;
547-
mov.s32 accum28, 0; mov.s32 accum29, 0; mov.s32 accum30, 0; mov.s32 accum31, 0;
548-
mov.s32 accum32, 0; mov.s32 accum33, 0; mov.s32 accum34, 0; mov.s32 accum35, 0;
549-
mov.s32 accum36, 0; mov.s32 accum37, 0; mov.s32 accum38, 0; mov.s32 accum39, 0;
550-
mov.s32 accum40, 0; mov.s32 accum41, 0; mov.s32 accum42, 0; mov.s32 accum43, 0;
551-
mov.s32 accum44, 0; mov.s32 accum45, 0; mov.s32 accum46, 0; mov.s32 accum47, 0;
552-
mov.s32 accum48, 0; mov.s32 accum49, 0; mov.s32 accum50, 0; mov.s32 accum51, 0;
553-
mov.s32 accum52, 0; mov.s32 accum53, 0; mov.s32 accum54, 0; mov.s32 accum55, 0;
554-
mov.s32 accum56, 0; mov.s32 accum57, 0; mov.s32 accum58, 0; mov.s32 accum59, 0;
555-
mov.s32 accum60, 0; mov.s32 accum61, 0; mov.s32 accum62, 0; mov.s32 accum63, 0;
556-
mov.s32 accum64, 0; mov.s32 accum65, 0; mov.s32 accum66, 0; mov.s32 accum67, 0;
557-
mov.s32 accum68, 0; mov.s32 accum69, 0; mov.s32 accum70, 0; mov.s32 accum71, 0;
558-
mov.s32 accum72, 0; mov.s32 accum73, 0; mov.s32 accum74, 0; mov.s32 accum75, 0;
559-
mov.s32 accum76, 0; mov.s32 accum77, 0; mov.s32 accum78, 0; mov.s32 accum79, 0;
560-
mov.s32 accum80, 0; mov.s32 accum81, 0; mov.s32 accum82, 0; mov.s32 accum83, 0;
561-
mov.s32 accum84, 0; mov.s32 accum85, 0; mov.s32 accum86, 0; mov.s32 accum87, 0;
562-
mov.s32 accum88, 0; mov.s32 accum89, 0; mov.s32 accum90, 0; mov.s32 accum91, 0;
563-
mov.s32 accum92, 0; mov.s32 accum93, 0; mov.s32 accum94, 0; mov.s32 accum95, 0;
564-
mov.s32 accum96, 0; mov.s32 accum97, 0; mov.s32 accum98, 0; mov.s32 accum99, 0;
565-
mov.s32 accum100, 0; mov.s32 accum101, 0; mov.s32 accum102, 0; mov.s32 accum103, 0;
566-
mov.s32 accum104, 0; mov.s32 accum105, 0; mov.s32 accum106, 0; mov.s32 accum107, 0;
567-
mov.s32 accum108, 0; mov.s32 accum109, 0; mov.s32 accum110, 0; mov.s32 accum111, 0;
568-
mov.s32 accum112, 0; mov.s32 accum113, 0; mov.s32 accum114, 0; mov.s32 accum115, 0;
569-
mov.s32 accum116, 0; mov.s32 accum117, 0; mov.s32 accum118, 0; mov.s32 accum119, 0;
570-
mov.s32 accum120, 0; mov.s32 accum121, 0; mov.s32 accum122, 0; mov.s32 accum123, 0;
571-
mov.s32 accum124, 0; mov.s32 accum125, 0; mov.s32 accum126, 0; mov.s32 accum127, 0;
572-
573-
// Initialize matrix descriptors with arbitrary placeholder values.
574-
// In practice, these would be set to point to shared-memory regions containing your matrices.
575-
mov.u64 desc_a, 0x0000000000000000;
576-
mov.u64 desc_b, 0x0000000000000000;
577-
578-
// Initialize scale flag (controls operand scaling or additive bias behavior)
579-
mov.pred scale_d, 1;
580-
581-
// Enforce the ordering for Warp-Group instructions
582-
wgmma.fence.sync.aligned;
583-
584-
// The main loop will repeat for 128 iterations
585-
loop_start:
586-
setp.ge.u32 exit_predicate, loop_counter, loop_limit;
587-
@exit_predicate bra loop_exit;
588-
589-
wgmma.mma_async.sync.aligned.m64n256k256.s32.b1.b1.and.popc
590-
{ accum0, accum1, accum2, accum3, accum4, accum5, accum6, accum7,
591-
accum8, accum9, accum10, accum11, accum12, accum13, accum14, accum15,
592-
accum16, accum17, accum18, accum19, accum20, accum21, accum22, accum23,
593-
accum24, accum25, accum26, accum27, accum28, accum29, accum30, accum31,
594-
accum32, accum33, accum34, accum35, accum36, accum37, accum38, accum39,
595-
accum40, accum41, accum42, accum43, accum44, accum45, accum46, accum47,
596-
accum48, accum49, accum50, accum51, accum52, accum53, accum54, accum55,
597-
accum56, accum57, accum58, accum59, accum60, accum61, accum62, accum63,
598-
accum64, accum65, accum66, accum67, accum68, accum69, accum70, accum71,
599-
accum72, accum73, accum74, accum75, accum76, accum77, accum78, accum79,
600-
accum80, accum81, accum82, accum83, accum84, accum85, accum86, accum87,
601-
accum88, accum89, accum90, accum91, accum92, accum93, accum94, accum95,
602-
accum96, accum97, accum98, accum99, accum100, accum101, accum102, accum103,
603-
accum104, accum105, accum106, accum107, accum108, accum109, accum110, accum111,
604-
accum112, accum113, accum114, accum115, accum116, accum117, accum118, accum119,
605-
accum120, accum121, accum122, accum123, accum124, accum125, accum126, accum127 },
606-
desc_a,
607-
desc_b,
608-
scale_d;
609-
610-
// Increment the loop counter
611-
add.u32 loop_counter, loop_counter, 1;
612-
613-
// Branch back to the beginning of the loop
614-
bra loop_start;
615-
616-
loop_exit:
617-
// Commit all prior uncommitted operations to the group and wait!
618-
wgmma.commit_group.sync.aligned;
619-
wgmma.wait_group.sync.aligned 0;
620-
621-
// Use volatile stores to force the accumulator values to be written out.
622-
// This dummy write (to a global variable) makes the work observable and
623-
// prevents the multiplication pipeline from being optimized out.
624-
st.global.volatile.s32 [dummy_sink_s32], accum0;
625-
st.global.volatile.s32 [dummy_sink_s32+4], accum1;
626-
st.global.volatile.s32 [dummy_sink_s32+8], accum2;
627-
st.global.volatile.s32 [dummy_sink_s32+12], accum3;
628-
ret;
629-
}

0 commit comments

Comments
 (0)