@@ -54,7 +54,6 @@ __device__ half_2 packFp32s( float a, float b ) { return __builtin_amdgcn_cvt_pk
5454
5555extern "C" __global__ void wmma_matmul ( __fp16 * a , __fp16 * b , __fp16 * c )
5656{
57- const int gIdx = blockIdx .x * blockDim .x + threadIdx .x ;
5857 const int lIdx = threadIdx .x ;
5958
6059 // a and b fragments are stored in 8 VGPRs each, in packed format, so 16 elements each for a and b
@@ -65,14 +64,14 @@ extern "C" __global__ void wmma_matmul( __fp16* a, __fp16* b, __fp16* c )
6564 // initialize c fragment to 0
6665 frag_type_c c_frag = {};
6766
68- const int lane = lIdx % 16 ;
67+ const int laneWrapped = lIdx % 16 ;
6968 const int laneGroup = lIdx / 16 ;
7069#if defined( __gfx12__ )
7170#if 1
7271 for ( int ele = 0 ; ele < WMMA_DATA_WIDTH ; ++ ele )
7372 {
74- b_frag [ele ] = b [16 * ( ele + laneGroup * WMMA_DATA_WIDTH ) + lane ];
75- a_frag [ele ] = a [16 * lane + ( ele + laneGroup * WMMA_DATA_WIDTH )];
73+ b_frag [ele ] = b [16 * ( ele + laneGroup * WMMA_DATA_WIDTH ) + laneWrapped ];
74+ a_frag [ele ] = a [16 * laneWrapped + ( ele + laneGroup * WMMA_DATA_WIDTH )];
7675 }
7776#else
7877 {//with __builtin_amdgcn_cvt_pkrtz
@@ -82,17 +81,17 @@ extern "C" __global__ void wmma_matmul( __fp16* a, __fp16* b, __fp16* c )
8281 {
8382 const int e0 = ele * 2 + 0 ;
8483 const int e1 = ele * 2 + 1 ;
85- b_ptr [ele ] = packFp32s ( b [16 * ( e0 + laneGroup * WMMA_DATA_WIDTH ) + lane ], b [16 * ( e1 + laneGroup * WMMA_DATA_WIDTH ) + lane ] );
86- a_ptr [ele ] = packFp32s ( a [16 * lane + ( e0 + laneGroup * WMMA_DATA_WIDTH )], a [16 * lane + ( e1 + laneGroup * WMMA_DATA_WIDTH )] );
84+ b_ptr [ele ] = packFp32s ( b [16 * ( e0 + laneGroup * WMMA_DATA_WIDTH ) + laneWrapped ], b [16 * ( e1 + laneGroup * WMMA_DATA_WIDTH ) + laneWrapped ] );
85+ a_ptr [ele ] = packFp32s ( a [16 * laneWrapped + ( e0 + laneGroup * WMMA_DATA_WIDTH )], a [16 * laneWrapped + ( e1 + laneGroup * WMMA_DATA_WIDTH )] );
8786 }
8887 }
8988#endif
9089#else
9291 // laneWrapped is lIdx (0-31) mod 16 instead of 0-31 due to matrix replication in RDNA3
9291 for ( int ele = 0 ; ele < WMMA_DATA_WIDTH ; ++ ele )
9392 {
94- b_frag [ele ] = b [16 * ele + lane ];
95- a_frag [ele ] = a [16 * lane + ele ];
93+ b_frag [ele ] = b [16 * ele + laneWrapped ];
94+ a_frag [ele ] = a [16 * laneWrapped + ele ];
9695 }
9796#endif
9897 // call the WMMA compiler intrinsic
@@ -107,16 +106,16 @@ extern "C" __global__ void wmma_matmul( __fp16* a, __fp16* b, __fp16* c )
107106#if defined( __gfx12__ )
108107 for ( int ele = 0 ; ele < WMMA_DATA_WIDTH ; ++ ele )
109108 {
110- c [16 * ( ele + laneGroup * WMMA_DATA_WIDTH ) + lane ] = c_frag [ele ];
109+ c [16 * ( ele + laneGroup * WMMA_DATA_WIDTH ) + laneWrapped ] = c_frag [ele ];
111110 }
112111#else
113112 for ( int ele = 0 ; ele < 8 ; ++ ele )
114113 {
115114 const int r = ele * 2 + ( lIdx / 16 );
116115 // store results from unpacked c_frag output
117- c [16 * r + lane ] = c_frag [ele * 2 ];
116+ c [16 * r + laneWrapped ] = c_frag [ele * 2 ];
118117 // if OPSEL was set to "true", the line above would instead be
119- // c[16 * r + lane ] = c_frag[ele*2 + 1];
118+ // c[16 * r + laneWrapped ] = c_frag[ele*2 + 1];
120119 }
121120#endif
122121}
0 commit comments