|
1 | 1 | /** |
2 | | - * less_slow_sm90a.ptx |
| 2 | + * less_slow_sm80.ptx |
3 | 3 | * |
4 | 4 | * Micro-kernels for building a performance-first mindset for CUDA-capable |
5 | 5 | * GPUs using Parallel Thread eXecution (PTX) Intermediate Representation (IR) |
6 | | - * for for Hopper-generation Nvidia GPUs and newer. |
| 6 | + * for for Ampere-generation Nvidia GPUs with Warp-level MMA (WMMA). |
7 | 7 | * |
8 | 8 | * ? You should start at `less_slow.cu` before reading this file. |
9 | 9 | * ? You should start at `less_slow_sm70.ptx` before reading this file. |
|
13 | 13 | * You can validate this file by asking the Nvidia PTX Assembler to compile it |
14 | 14 | * to `.cubin` for some target architecture: |
15 | 15 | * |
16 | | - * $ ptxas -o less_slow_from_ptx.cubin -arch=sm_90a less_slow_sm90a.ptx |
| 16 | + * $ ptxas -o less_slow_from_ptx.cubin -arch=sm_80 less_slow_sm80.ptx |
17 | 17 | * $ cuobjdump -sass less_slow_from_ptx.cubin | grep -i mma |
18 | 18 | * |
19 | 19 | * Assuming how aggressively NVCC unrolls loops and the number of kernels in |
|
24 | 24 | * $ sed -r 's/^[[:space:]]+//; s/[[:space:]]+$//' | \ |
25 | 25 | * $ sort -u |
26 | 26 | */ |
27 | | -.version 8.0 // PTX version 8.0 for Hopper GPUs |
28 | | -.target sm_90a // Target architecture (SM_90a - Hopper GPUs) |
| 27 | +.version 7.0 // PTX version 7.0 for Ampere GPUs |
| 28 | +.target sm_80 // Target architecture (SM_80 - Ampere GPUs) |
29 | 29 | .address_size 64 // 64-bit addressing |
30 | 30 |
|
31 | 31 | /** |
@@ -283,347 +283,3 @@ loop_exit: |
283 | 283 | st.global.volatile.f32 [dummy_sink_f32+12], accum3; |
284 | 284 | ret; |
285 | 285 | } |
286 | | - |
287 | | -/** |
288 | | - * The instruction syntax for Warp-Group asynchronous instructions is very |
289 | | - * different, as at least one of the operand matrices has to be in shared |
290 | | - * memory (not registers). It's documented as in 2 variants: |
291 | | - * |
292 | | - * wgmma.mma_async.sync.aligned.shape.dtype.tf32.tf32 |
293 | | - * d, a-desc, b-desc, scale-d, imm-scale-a, imm-scale-b; |
294 | | - * wgmma.mma_async.sync.aligned.shape.dtype.tf32.tf32 |
295 | | - * d, a, b-desc, scale-d, imm-scale-a, imm-scale-b; |
296 | | - * |
297 | | - * There is no "C" matrix involved at all, we are computing `D = A * B + D`. |
298 | | - * The A and B matrix descriptors are the properties of the matrix in shared |
299 | | - * memory. It is a 64-bit value contained with the following layout: |
300 | | - * |
301 | | - * - 14 bits [0; 13]: start address |
302 | | - * - 14 bits [16; 29]: leading dimension byte offset |
303 | | - * - 14 bits [32; 45]: stride dimension byte offset |
304 | | - * - 3 bits [49; 51]: matrix base offset, valid only for "swizzling" |
305 | | - * - 2 bits [62; 63]: "swizzling" mode |
306 | | - * |
307 | | - * Swizzling defines the order of the elements and can have 4 possible values: |
308 | | - * |
309 | | - * 0: no "swizzling" at all |
310 | | - * 1: a 128-byte "swizzle" with a 1024 byte offset of a repeating pattern |
311 | | - * 2: a 64-byte "swizzle" with a 512 byte offset of a repeating pattern |
312 | | - * 3: a 32-byte "swizzle" with a 256 byte offset of a repeating pattern |
313 | | - * |
314 | | - * The list of supported shapes is exhausting: |
315 | | - * |
316 | | - * .m64n8k8, .m64n16k8, .m64n24k8, .m64n32k8, |
317 | | - * .m64n40k8, .m64n48k8, .m64n56k8, .m64n64k8, |
318 | | - * .m64n72k8, .m64n80k8, .m64n88k8, .m64n96k8, |
319 | | - * .m64n104k8, .m64n112k8, .m64n120k8, .m64n128k8, |
320 | | - * .m64n136k8, .m64n144k8, .m64n152k8, .m64n160k8, |
321 | | - * .m64n168k8, .m64n176k8, .m64n184k8, .m64n192k8, |
322 | | - * .m64n200k8, .m64n208k8, .m64n216k8, .m64n224k8, |
323 | | - * .m64n232k8, .m64n240k8, .m64n248k8, .m64n256k8 |
324 | | - * |
325 | | - * The `scale` parameters can be used to either negate the inputs, or disable |
326 | | - * additive bias accumulation in the output. |
327 | | - */ |
328 | | -.visible .entry tops_tf32f32_sm90tc_m64n16k8_loop128_ptx_kernel() |
329 | | -{ |
330 | | - // Accumulator registers used for both input and output of this MMA |
331 | | - .reg .f32 accum<8>; |
332 | | - |
333 | | - // Descriptors for matrix A and matrix B operands |
334 | | - .reg .b64 desc_a, desc_b; |
335 | | - |
336 | | - // General-purpose registers for loop control |
337 | | - .reg .b32 loop_counter, loop_limit; |
338 | | - |
339 | | - // Predicate register for conditional branching (loop exit) |
340 | | - .reg .pred exit_predicate; |
341 | | - |
342 | | - // Set up loop counter and loop limit |
343 | | - mov.u32 loop_counter, 0; |
344 | | - mov.u32 loop_limit, 128; |
345 | | - |
346 | | - // Zero-initialize the accumulator registers |
347 | | - mov.f32 accum0, 0.0; |
348 | | - mov.f32 accum1, 0.0; |
349 | | - mov.f32 accum2, 0.0; |
350 | | - mov.f32 accum3, 0.0; |
351 | | - mov.f32 accum4, 0.0; |
352 | | - mov.f32 accum5, 0.0; |
353 | | - mov.f32 accum6, 0.0; |
354 | | - mov.f32 accum7, 0.0; |
355 | | - |
356 | | - // Initialize matrix descriptors with arbitrary placeholder values |
357 | | - mov.u64 desc_a, 0x0000000000000000; |
358 | | - mov.u64 desc_b, 0x0000000000000000; |
359 | | - |
360 | | - // Enforce the ordered for Warp-Group instructions |
361 | | - wgmma.fence.sync.aligned; |
362 | | - |
363 | | - // The main loop will repeat for 128 iterations |
364 | | -loop_start: |
365 | | - setp.ge.u32 exit_predicate, loop_counter, loop_limit; |
366 | | - @exit_predicate bra loop_exit; |
367 | | - |
368 | | - wgmma.mma_async.sync.aligned.m64n16k8.f32.tf32.tf32 |
369 | | - { accum0, accum1, accum2, accum3, accum4, accum5, accum6, accum7 }, |
370 | | - desc_a, |
371 | | - desc_b, |
372 | | - 0, -1, -1; |
373 | | - |
374 | | - // Increment the loop counter |
375 | | - add.u32 loop_counter, loop_counter, 1; |
376 | | - |
377 | | - // Branch back to the beginning of the loop |
378 | | - bra loop_start; |
379 | | - |
380 | | -loop_exit: |
381 | | - // Commit all prior uncommitted operations to the group and wait! |
382 | | - wgmma.commit_group.sync.aligned; |
383 | | - wgmma.wait_group.sync.aligned 0; |
384 | | - |
385 | | - // Use volatile stores to force the accumulator values to be written out. |
386 | | - // This dummy write (to a global variable) makes the work observable and |
387 | | - // prevents the multiplication pipeline from being optimized out. |
388 | | - st.global.volatile.f32 [dummy_sink_f32], accum0; |
389 | | - st.global.volatile.f32 [dummy_sink_f32+4], accum1; |
390 | | - st.global.volatile.f32 [dummy_sink_f32+8], accum2; |
391 | | - st.global.volatile.f32 [dummy_sink_f32+12], accum3; |
392 | | - ret; |
393 | | -} |
394 | | - |
395 | | -/** |
396 | | - * This results in massive performance gains on Hopper: |
397 | | - * - 16x16x8 MMA computed by individual warps: 74 T |
398 | | - * - 64x16x8 WGMMA computed by four warps together: 300 T |
399 | | - * |
400 | | - * Will it get even better with larger matrices if we scale the second |
401 | | - * dimension from 16 to 256? It would require 128 accumulators. |
402 | | - */ |
403 | | - |
404 | | -.visible .entry tops_tf32f32_sm90tc_m64n256k8_loop128_ptx_kernel() |
405 | | -{ |
406 | | - // Accumulator registers used for both input and output of this MMA |
407 | | - .reg .f32 accum<128>; |
408 | | - |
409 | | - // Descriptors for matrix A and matrix B operands |
410 | | - .reg .b64 desc_a, desc_b; |
411 | | - |
412 | | - // General-purpose registers for loop control |
413 | | - .reg .b32 loop_counter, loop_limit; |
414 | | - |
415 | | - // Predicate register for conditional branching (loop exit) |
416 | | - .reg .pred exit_predicate; |
417 | | - |
418 | | - // Set up loop counter and loop limit to fill accumulators |
419 | | - mov.u32 loop_counter, 0; |
420 | | - mov.u32 loop_limit, 128; |
421 | | - |
422 | | - // Zero-initialize the accumulator registers: |
423 | | - mov.f32 accum0, 0.0; mov.f32 accum1, 0.0; mov.f32 accum2, 0.0; mov.f32 accum3, 0.0; |
424 | | - mov.f32 accum4, 0.0; mov.f32 accum5, 0.0; mov.f32 accum6, 0.0; mov.f32 accum7, 0.0; |
425 | | - mov.f32 accum8, 0.0; mov.f32 accum9, 0.0; mov.f32 accum10, 0.0; mov.f32 accum11, 0.0; |
426 | | - mov.f32 accum12, 0.0; mov.f32 accum13, 0.0; mov.f32 accum14, 0.0; mov.f32 accum15, 0.0; |
427 | | - mov.f32 accum16, 0.0; mov.f32 accum17, 0.0; mov.f32 accum18, 0.0; mov.f32 accum19, 0.0; |
428 | | - mov.f32 accum20, 0.0; mov.f32 accum21, 0.0; mov.f32 accum22, 0.0; mov.f32 accum23, 0.0; |
429 | | - mov.f32 accum24, 0.0; mov.f32 accum25, 0.0; mov.f32 accum26, 0.0; mov.f32 accum27, 0.0; |
430 | | - mov.f32 accum28, 0.0; mov.f32 accum29, 0.0; mov.f32 accum30, 0.0; mov.f32 accum31, 0.0; |
431 | | - mov.f32 accum32, 0.0; mov.f32 accum33, 0.0; mov.f32 accum34, 0.0; mov.f32 accum35, 0.0; |
432 | | - mov.f32 accum36, 0.0; mov.f32 accum37, 0.0; mov.f32 accum38, 0.0; mov.f32 accum39, 0.0; |
433 | | - mov.f32 accum40, 0.0; mov.f32 accum41, 0.0; mov.f32 accum42, 0.0; mov.f32 accum43, 0.0; |
434 | | - mov.f32 accum44, 0.0; mov.f32 accum45, 0.0; mov.f32 accum46, 0.0; mov.f32 accum47, 0.0; |
435 | | - mov.f32 accum48, 0.0; mov.f32 accum49, 0.0; mov.f32 accum50, 0.0; mov.f32 accum51, 0.0; |
436 | | - mov.f32 accum52, 0.0; mov.f32 accum53, 0.0; mov.f32 accum54, 0.0; mov.f32 accum55, 0.0; |
437 | | - mov.f32 accum56, 0.0; mov.f32 accum57, 0.0; mov.f32 accum58, 0.0; mov.f32 accum59, 0.0; |
438 | | - mov.f32 accum60, 0.0; mov.f32 accum61, 0.0; mov.f32 accum62, 0.0; mov.f32 accum63, 0.0; |
439 | | - mov.f32 accum64, 0.0; mov.f32 accum65, 0.0; mov.f32 accum66, 0.0; mov.f32 accum67, 0.0; |
440 | | - mov.f32 accum68, 0.0; mov.f32 accum69, 0.0; mov.f32 accum70, 0.0; mov.f32 accum71, 0.0; |
441 | | - mov.f32 accum72, 0.0; mov.f32 accum73, 0.0; mov.f32 accum74, 0.0; mov.f32 accum75, 0.0; |
442 | | - mov.f32 accum76, 0.0; mov.f32 accum77, 0.0; mov.f32 accum78, 0.0; mov.f32 accum79, 0.0; |
443 | | - mov.f32 accum80, 0.0; mov.f32 accum81, 0.0; mov.f32 accum82, 0.0; mov.f32 accum83, 0.0; |
444 | | - mov.f32 accum84, 0.0; mov.f32 accum85, 0.0; mov.f32 accum86, 0.0; mov.f32 accum87, 0.0; |
445 | | - mov.f32 accum88, 0.0; mov.f32 accum89, 0.0; mov.f32 accum90, 0.0; mov.f32 accum91, 0.0; |
446 | | - mov.f32 accum92, 0.0; mov.f32 accum93, 0.0; mov.f32 accum94, 0.0; mov.f32 accum95, 0.0; |
447 | | - mov.f32 accum96, 0.0; mov.f32 accum97, 0.0; mov.f32 accum98, 0.0; mov.f32 accum99, 0.0; |
448 | | - mov.f32 accum100, 0.0; mov.f32 accum101, 0.0; mov.f32 accum102, 0.0; mov.f32 accum103, 0.0; |
449 | | - mov.f32 accum104, 0.0; mov.f32 accum105, 0.0; mov.f32 accum106, 0.0; mov.f32 accum107, 0.0; |
450 | | - mov.f32 accum108, 0.0; mov.f32 accum109, 0.0; mov.f32 accum110, 0.0; mov.f32 accum111, 0.0; |
451 | | - mov.f32 accum112, 0.0; mov.f32 accum113, 0.0; mov.f32 accum114, 0.0; mov.f32 accum115, 0.0; |
452 | | - mov.f32 accum116, 0.0; mov.f32 accum117, 0.0; mov.f32 accum118, 0.0; mov.f32 accum119, 0.0; |
453 | | - mov.f32 accum120, 0.0; mov.f32 accum121, 0.0; mov.f32 accum122, 0.0; mov.f32 accum123, 0.0; |
454 | | - mov.f32 accum124, 0.0; mov.f32 accum125, 0.0; mov.f32 accum126, 0.0; mov.f32 accum127, 0.0; |
455 | | - |
456 | | - // Initialize matrix descriptors with arbitrary placeholder values |
457 | | - mov.u64 desc_a, 0x0000000000000000; |
458 | | - mov.u64 desc_b, 0x0000000000000000; |
459 | | - |
460 | | - // Enforce the ordered for Warp-Group instructions |
461 | | - wgmma.fence.sync.aligned; |
462 | | - |
463 | | - // The main loop will repeat for 128 iterations |
464 | | -loop_start: |
465 | | - setp.ge.u32 exit_predicate, loop_counter, loop_limit; |
466 | | - @exit_predicate bra loop_exit; |
467 | | - |
468 | | - wgmma.mma_async.sync.aligned.m64n256k8.f32.tf32.tf32 |
469 | | - { accum0, accum1, accum2, accum3, accum4, accum5, accum6, accum7, |
470 | | - accum8, accum9, accum10, accum11, accum12, accum13, accum14, accum15, |
471 | | - accum16, accum17, accum18, accum19, accum20, accum21, accum22, accum23, |
472 | | - accum24, accum25, accum26, accum27, accum28, accum29, accum30, accum31, |
473 | | - accum32, accum33, accum34, accum35, accum36, accum37, accum38, accum39, |
474 | | - accum40, accum41, accum42, accum43, accum44, accum45, accum46, accum47, |
475 | | - accum48, accum49, accum50, accum51, accum52, accum53, accum54, accum55, |
476 | | - accum56, accum57, accum58, accum59, accum60, accum61, accum62, accum63, |
477 | | - accum64, accum65, accum66, accum67, accum68, accum69, accum70, accum71, |
478 | | - accum72, accum73, accum74, accum75, accum76, accum77, accum78, accum79, |
479 | | - accum80, accum81, accum82, accum83, accum84, accum85, accum86, accum87, |
480 | | - accum88, accum89, accum90, accum91, accum92, accum93, accum94, accum95, |
481 | | - accum96, accum97, accum98, accum99, accum100, accum101, accum102, accum103, |
482 | | - accum104, accum105, accum106, accum107, accum108, accum109, accum110, accum111, |
483 | | - accum112, accum113, accum114, accum115, accum116, accum117, accum118, accum119, |
484 | | - accum120, accum121, accum122, accum123, accum124, accum125, accum126, accum127 }, |
485 | | - desc_a, |
486 | | - desc_b, |
487 | | - 0, -1, -1; |
488 | | - |
489 | | - // Increment the loop counter |
490 | | - add.u32 loop_counter, loop_counter, 1; |
491 | | - |
492 | | - // Branch back to the beginning of the loop |
493 | | - bra loop_start; |
494 | | - |
495 | | -loop_exit: |
496 | | - // Commit all prior uncommitted operations to the group and wait! |
497 | | - wgmma.commit_group.sync.aligned; |
498 | | - wgmma.wait_group.sync.aligned 0; |
499 | | - |
500 | | - // Use volatile stores to force the accumulator values to be written out. |
501 | | - // This dummy write (to a global variable) makes the work observable and |
502 | | - // prevents the multiplication pipeline from being optimized out. |
503 | | - st.global.volatile.f32 [dummy_sink_f32], accum0; |
504 | | - st.global.volatile.f32 [dummy_sink_f32+4], accum1; |
505 | | - st.global.volatile.f32 [dummy_sink_f32+8], accum2; |
506 | | - st.global.volatile.f32 [dummy_sink_f32+12], accum3; |
507 | | - ret; |
508 | | -} |
509 | | - |
510 | | -/** |
511 | | - * This results in massive performance gains on Hopper: |
512 | | - * - 16x16x8 MMA computed by individual warps: 74 T |
513 | | - * - 64x16x8 WGMMA computed by four warps together: 300 T |
514 | | - * - 64x256x8 WGMMA computed by four warps together: 4.7 P ?! |
515 | | - * |
516 | | - * There are also "structured-sparse" variants of those instructions, in case |
517 | | - * half of our entries are zeros! Those, however, simply expand the last |
518 | | - * dimension by 2x, making the instructions no more usable for small matrices. |
519 | | - */ |
520 | | - |
521 | | -.visible .entry tops_b1i32and_sm90tc_m64n256k256_loop128_ptx_kernel() |
522 | | -{ |
523 | | - // Accumulator registers used for both input and output of the MMA operation |
524 | | - .reg .s32 accum<128>; |
525 | | - |
526 | | - // Descriptors for matrix A and matrix B operands (in shared memory) |
527 | | - .reg .b64 desc_a, desc_b; |
528 | | - |
529 | | - // General-purpose registers for loop control |
530 | | - .reg .b32 loop_counter, loop_limit; |
531 | | - |
532 | | - // Predicate registers for conditional branching (loop exit) and scale flag |
533 | | - .reg .pred exit_predicate, scale_d; |
534 | | - |
535 | | - // Set up loop counter and loop limit |
536 | | - mov.u32 loop_counter, 0; |
537 | | - mov.u32 loop_limit, 128; |
538 | | - |
539 | | - // Zero-initialize the accumulators, as registers may contain noise |
540 | | - mov.s32 accum0, 0; mov.s32 accum1, 0; mov.s32 accum2, 0; mov.s32 accum3, 0; |
541 | | - mov.s32 accum4, 0; mov.s32 accum5, 0; mov.s32 accum6, 0; mov.s32 accum7, 0; |
542 | | - mov.s32 accum8, 0; mov.s32 accum9, 0; mov.s32 accum10, 0; mov.s32 accum11, 0; |
543 | | - mov.s32 accum12, 0; mov.s32 accum13, 0; mov.s32 accum14, 0; mov.s32 accum15, 0; |
544 | | - mov.s32 accum16, 0; mov.s32 accum17, 0; mov.s32 accum18, 0; mov.s32 accum19, 0; |
545 | | - mov.s32 accum20, 0; mov.s32 accum21, 0; mov.s32 accum22, 0; mov.s32 accum23, 0; |
546 | | - mov.s32 accum24, 0; mov.s32 accum25, 0; mov.s32 accum26, 0; mov.s32 accum27, 0; |
547 | | - mov.s32 accum28, 0; mov.s32 accum29, 0; mov.s32 accum30, 0; mov.s32 accum31, 0; |
548 | | - mov.s32 accum32, 0; mov.s32 accum33, 0; mov.s32 accum34, 0; mov.s32 accum35, 0; |
549 | | - mov.s32 accum36, 0; mov.s32 accum37, 0; mov.s32 accum38, 0; mov.s32 accum39, 0; |
550 | | - mov.s32 accum40, 0; mov.s32 accum41, 0; mov.s32 accum42, 0; mov.s32 accum43, 0; |
551 | | - mov.s32 accum44, 0; mov.s32 accum45, 0; mov.s32 accum46, 0; mov.s32 accum47, 0; |
552 | | - mov.s32 accum48, 0; mov.s32 accum49, 0; mov.s32 accum50, 0; mov.s32 accum51, 0; |
553 | | - mov.s32 accum52, 0; mov.s32 accum53, 0; mov.s32 accum54, 0; mov.s32 accum55, 0; |
554 | | - mov.s32 accum56, 0; mov.s32 accum57, 0; mov.s32 accum58, 0; mov.s32 accum59, 0; |
555 | | - mov.s32 accum60, 0; mov.s32 accum61, 0; mov.s32 accum62, 0; mov.s32 accum63, 0; |
556 | | - mov.s32 accum64, 0; mov.s32 accum65, 0; mov.s32 accum66, 0; mov.s32 accum67, 0; |
557 | | - mov.s32 accum68, 0; mov.s32 accum69, 0; mov.s32 accum70, 0; mov.s32 accum71, 0; |
558 | | - mov.s32 accum72, 0; mov.s32 accum73, 0; mov.s32 accum74, 0; mov.s32 accum75, 0; |
559 | | - mov.s32 accum76, 0; mov.s32 accum77, 0; mov.s32 accum78, 0; mov.s32 accum79, 0; |
560 | | - mov.s32 accum80, 0; mov.s32 accum81, 0; mov.s32 accum82, 0; mov.s32 accum83, 0; |
561 | | - mov.s32 accum84, 0; mov.s32 accum85, 0; mov.s32 accum86, 0; mov.s32 accum87, 0; |
562 | | - mov.s32 accum88, 0; mov.s32 accum89, 0; mov.s32 accum90, 0; mov.s32 accum91, 0; |
563 | | - mov.s32 accum92, 0; mov.s32 accum93, 0; mov.s32 accum94, 0; mov.s32 accum95, 0; |
564 | | - mov.s32 accum96, 0; mov.s32 accum97, 0; mov.s32 accum98, 0; mov.s32 accum99, 0; |
565 | | - mov.s32 accum100, 0; mov.s32 accum101, 0; mov.s32 accum102, 0; mov.s32 accum103, 0; |
566 | | - mov.s32 accum104, 0; mov.s32 accum105, 0; mov.s32 accum106, 0; mov.s32 accum107, 0; |
567 | | - mov.s32 accum108, 0; mov.s32 accum109, 0; mov.s32 accum110, 0; mov.s32 accum111, 0; |
568 | | - mov.s32 accum112, 0; mov.s32 accum113, 0; mov.s32 accum114, 0; mov.s32 accum115, 0; |
569 | | - mov.s32 accum116, 0; mov.s32 accum117, 0; mov.s32 accum118, 0; mov.s32 accum119, 0; |
570 | | - mov.s32 accum120, 0; mov.s32 accum121, 0; mov.s32 accum122, 0; mov.s32 accum123, 0; |
571 | | - mov.s32 accum124, 0; mov.s32 accum125, 0; mov.s32 accum126, 0; mov.s32 accum127, 0; |
572 | | - |
573 | | - // Initialize matrix descriptors with arbitrary placeholder values. |
574 | | - // In practice, these would be set to point to shared-memory regions containing your matrices. |
575 | | - mov.u64 desc_a, 0x0000000000000000; |
576 | | - mov.u64 desc_b, 0x0000000000000000; |
577 | | - |
578 | | - // Initialize scale flag (controls operand scaling or additive bias behavior) |
579 | | - mov.pred scale_d, 1; |
580 | | - |
581 | | - // Enforce the ordered for Warp-Group instructions |
582 | | - wgmma.fence.sync.aligned; |
583 | | - |
584 | | - // The main loop will repeat for 128 iterations |
585 | | -loop_start: |
586 | | - setp.ge.u32 exit_predicate, loop_counter, loop_limit; |
587 | | - @exit_predicate bra loop_exit; |
588 | | - |
589 | | - wgmma.mma_async.sync.aligned.m64n256k256.s32.b1.b1.and.popc |
590 | | - { accum0, accum1, accum2, accum3, accum4, accum5, accum6, accum7, |
591 | | - accum8, accum9, accum10, accum11, accum12, accum13, accum14, accum15, |
592 | | - accum16, accum17, accum18, accum19, accum20, accum21, accum22, accum23, |
593 | | - accum24, accum25, accum26, accum27, accum28, accum29, accum30, accum31, |
594 | | - accum32, accum33, accum34, accum35, accum36, accum37, accum38, accum39, |
595 | | - accum40, accum41, accum42, accum43, accum44, accum45, accum46, accum47, |
596 | | - accum48, accum49, accum50, accum51, accum52, accum53, accum54, accum55, |
597 | | - accum56, accum57, accum58, accum59, accum60, accum61, accum62, accum63, |
598 | | - accum64, accum65, accum66, accum67, accum68, accum69, accum70, accum71, |
599 | | - accum72, accum73, accum74, accum75, accum76, accum77, accum78, accum79, |
600 | | - accum80, accum81, accum82, accum83, accum84, accum85, accum86, accum87, |
601 | | - accum88, accum89, accum90, accum91, accum92, accum93, accum94, accum95, |
602 | | - accum96, accum97, accum98, accum99, accum100, accum101, accum102, accum103, |
603 | | - accum104, accum105, accum106, accum107, accum108, accum109, accum110, accum111, |
604 | | - accum112, accum113, accum114, accum115, accum116, accum117, accum118, accum119, |
605 | | - accum120, accum121, accum122, accum123, accum124, accum125, accum126, accum127 }, |
606 | | - desc_a, |
607 | | - desc_b, |
608 | | - scale_d; |
609 | | - |
610 | | - // Increment the loop counter |
611 | | - add.u32 loop_counter, loop_counter, 1; |
612 | | - |
613 | | - // Branch back to the beginning of the loop |
614 | | - bra loop_start; |
615 | | - |
616 | | -loop_exit: |
617 | | - // Commit all prior uncommitted operations to the group and wait! |
618 | | - wgmma.commit_group.sync.aligned; |
619 | | - wgmma.wait_group.sync.aligned 0; |
620 | | - |
621 | | - // Use volatile stores to force the accumulator values to be written out. |
622 | | - // This dummy write (to a global variable) makes the work observable and |
623 | | - // prevents the multiplication pipeline from being optimized out. |
624 | | - st.global.volatile.s32 [dummy_sink_s32], accum0; |
625 | | - st.global.volatile.s32 [dummy_sink_s32+4], accum1; |
626 | | - st.global.volatile.s32 [dummy_sink_s32+8], accum2; |
627 | | - st.global.volatile.s32 [dummy_sink_s32+12], accum3; |
628 | | - ret; |
629 | | -} |
0 commit comments