@@ -1566,16 +1566,16 @@ class tinyBLAS_Q0_PPC {
15661566 }
15671567 }
15681568 }
1569-
// NOTE: this span is a rendered diff, not plain source. Judging by the visible
// pattern, a line starting with one number and '-' is the removed (old) text,
// a line starting with one number and '+' is the added (new) text, and a line
// starting with two fused numbers is unchanged (old and new line numbers).
//
// compute<size>: unpacks the accumulator ACC into four `vector signed int`
// rows, converts each row to float, and multiply-accumulates it with
// vs[s_idx+i] into fin_res[s_idx+i] for i = 0..3.
//   ACC       - accumulator handed to __builtin_mma_disassemble_acc.
//   c_idx     - offset into comparray; only the removed lines read it.
//   s_idx     - offset applied to both vs and fin_res.
//   comparray - per-row integers; only the removed lines read it.
//   vs        - multiplier vectors for vec_madd; the callers shown elsewhere in
//               this change fill them with products of unhalf(...->d) values.
//   fin_res   - running result vectors, updated in place.
15701569 template <int size>
15711570 inline void compute (acc_t * ACC, int c_idx, int s_idx, std::array<int , size>& comparray, vector float * vs, vector float * fin_res) {
15721571 vector signed int vec_C[4 ];
// The old code kept a CA[] array for a correction term; the new code comments it out.
1573- vector float CA[4 ] = {0 };
1572+ // vector float CA[4] = {0};
15741573 vector float res[4 ] = {0 };
// Copy the accumulator contents out into vec_C[0..3].
15751574 __builtin_mma_disassemble_acc (vec_C, ACC);
15761575 for (int i = 0 ; i < 4 ; i++) {
// Removed: broadcast comparray[c_idx+i] * -128.0 to every lane and add it to
// the converted row before the multiply-add.
1577- CA[i] = vec_splats ((float )(((double )comparray[c_idx+i]) * -128.0 ));
1578- res[i] = vec_add (vec_ctf (vec_C[i], 0 ), CA[i]);
1576+ // CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
1577+ // res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
// Added: use the int-to-float converted row as-is, with no correction term.
// NOTE(review): the same change also comments out the 0x80 xor in packNormal
// and the comparray fill in the callers; the -128 term presumably compensated
// for that xor -- confirm that dropping both is intended, and that c_idx and
// comparray are meant to be left unused here.
1578+ res[i] = vec_ctf (vec_C[i], 0 );
// Multiply-add: res[i] * vs[s_idx+i] + fin_res[s_idx+i], stored back to fin_res.
15791579 fin_res[s_idx+i] = vec_madd (res[i], vs[s_idx+i], fin_res[s_idx+i]);
15801580 }
15811581 }
@@ -1971,7 +1971,7 @@ class tinyBLAS_Q0_PPC {
19711971 }
19721972
19731973 template <typename VA, typename VB>
1974- void packNormal (const TB* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
1974+ void packNormal (const TB* a, int64_t lda, int rows, int cols, VA* vec/* , bool flip*/ ) {
19751975 int64_t i, j;
19761976 TB *aoffset = NULL ;
19771977 VA *vecOffset = NULL ;
@@ -1981,9 +1981,9 @@ class tinyBLAS_Q0_PPC {
19811981 VB c1[2 ] = {0 }, c2[2 ] = {0 }, c3[2 ] = {0 }, c4[2 ]={0 };
19821982 VB c5[2 ] = {0 }, c6[2 ] = {0 }, c7[2 ] = {0 }, c8[2 ]={0 };
19831983 VB t1, t2, t3, t4, t5, t6, t7, t8;
1984- vector unsigned char xor_vector;
1985- uint8_t flip_vec = 0x80 ;
1986- xor_vector = vec_splats (flip_vec);
1984+ // vector unsigned char xor_vector;
1985+ // uint8_t flip_vec = 0x80;
1986+ // xor_vector = vec_splats(flip_vec);
19871987 vector unsigned char swiz1 = {0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 };
19881988 vector unsigned char swiz2 = {8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 };
19891989 vector unsigned char swiz3 = {0 , 1 , 2 , 3 , 8 , 9 , 10 , 11 , 16 , 17 , 18 , 19 , 24 , 25 , 26 , 27 };
@@ -2033,12 +2033,12 @@ class tinyBLAS_Q0_PPC {
20332033 t6 = vec_perm (t1, t3, swiz4);
20342034 t7 = vec_perm (t2, t4, swiz3);
20352035 t8 = vec_perm (t2, t4, swiz4);
2036- if (flip == true ) {
2036+ /* if (flip == true) {
20372037 t5 = vec_xor(t5, xor_vector);
20382038 t6 = vec_xor(t6, xor_vector);
20392039 t7 = vec_xor(t7, xor_vector);
20402040 t8 = vec_xor(t8, xor_vector);
2041- }
2041+ }*/
20422042 vec_xst (t5, 0 , vecOffset);
20432043 vec_xst (t6, 0 , vecOffset+16 );
20442044 vec_xst (t7, 0 , vecOffset+32 );
@@ -2052,12 +2052,12 @@ class tinyBLAS_Q0_PPC {
20522052 t6 = vec_perm (t1, t3, swiz4);
20532053 t7 = vec_perm (t2, t4, swiz3);
20542054 t8 = vec_perm (t2, t4, swiz4);
2055- if (flip == true ) {
2055+ /* if (flip == true) {
20562056 t5 = vec_xor(t5, xor_vector);
20572057 t6 = vec_xor(t6, xor_vector);
20582058 t7 = vec_xor(t7, xor_vector);
20592059 t8 = vec_xor(t8, xor_vector);
2060- }
2060+ }*/
20612061 vec_xst (t5, 0 , vecOffset+64 );
20622062 vec_xst (t6, 0 , vecOffset+80 );
20632063 vec_xst (t7, 0 , vecOffset+96 );
@@ -2071,12 +2071,12 @@ class tinyBLAS_Q0_PPC {
20712071 t6 = vec_perm (t1, t3, swiz4);
20722072 t7 = vec_perm (t2, t4, swiz3);
20732073 t8 = vec_perm (t2, t4, swiz4);
2074- if (flip == true ) {
2074+ /* if (flip == true) {
20752075 t5 = vec_xor(t5, xor_vector);
20762076 t6 = vec_xor(t6, xor_vector);
20772077 t7 = vec_xor(t7, xor_vector);
20782078 t8 = vec_xor(t8, xor_vector);
2079- }
2079+ }*/
20802080 vec_xst (t5, 0 , vecOffset+128 );
20812081 vec_xst (t6, 0 , vecOffset+144 );
20822082 vec_xst (t7, 0 , vecOffset+160 );
@@ -2090,12 +2090,12 @@ class tinyBLAS_Q0_PPC {
20902090 t6 = vec_perm (t1, t3, swiz4);
20912091 t7 = vec_perm (t2, t4, swiz3);
20922092 t8 = vec_perm (t2, t4, swiz4);
2093- if (flip == true ) {
2093+ /* if (flip == true) {
20942094 t5 = vec_xor(t5, xor_vector);
20952095 t6 = vec_xor(t6, xor_vector);
20962096 t7 = vec_xor(t7, xor_vector);
20972097 t8 = vec_xor(t8, xor_vector);
2098- }
2098+ }*/
20992099 vec_xst (t5, 0 , vecOffset+192 );
21002100 vec_xst (t6, 0 , vecOffset+208 );
21012101 vec_xst (t7, 0 , vecOffset+224 );
@@ -2145,12 +2145,12 @@ class tinyBLAS_Q0_PPC {
21452145 t6 = vec_perm (t1, t3, swiz4);
21462146 t7 = vec_perm (t2, t4, swiz3);
21472147 t8 = vec_perm (t2, t4, swiz4);
2148- if (flip == true ) {
2148+ /* if (flip == true) {
21492149 t5 = vec_xor(t5, xor_vector);
21502150 t6 = vec_xor(t6, xor_vector);
21512151 t7 = vec_xor(t7, xor_vector);
21522152 t8 = vec_xor(t8, xor_vector);
2153- }
2153+ }*/
21542154 vec_xst (t5, 0 , vecOffset);
21552155 vec_xst (t6, 0 , vecOffset+16 );
21562156 vec_xst (t7, 0 , vecOffset+32 );
@@ -2164,12 +2164,12 @@ class tinyBLAS_Q0_PPC {
21642164 t6 = vec_perm (t1, t3, swiz4);
21652165 t7 = vec_perm (t2, t4, swiz3);
21662166 t8 = vec_perm (t2, t4, swiz4);
2167- if (flip == true ) {
2167+ /* if (flip == true) {
21682168 t5 = vec_xor(t5, xor_vector);
21692169 t6 = vec_xor(t6, xor_vector);
21702170 t7 = vec_xor(t7, xor_vector);
21712171 t8 = vec_xor(t8, xor_vector);
2172- }
2172+ }*/
21732173 vec_xst (t5, 0 , vecOffset+64 );
21742174 vec_xst (t6, 0 , vecOffset+80 );
21752175 vec_xst (t7, 0 , vecOffset+96 );
@@ -2208,12 +2208,12 @@ class tinyBLAS_Q0_PPC {
22082208 t6 = vec_perm (t1, t3, swiz4);
22092209 t7 = vec_perm (t2, t4, swiz3);
22102210 t8 = vec_perm (t2, t4, swiz4);
2211- if (flip == true ) {
2211+ /* if (flip == true) {
22122212 t5 = vec_xor(t5, xor_vector);
22132213 t6 = vec_xor(t6, xor_vector);
22142214 t7 = vec_xor(t7, xor_vector);
22152215 t8 = vec_xor(t8, xor_vector);
2216- }
2216+ }*/
22172217 vec_xst (t5, 0 , vecOffset);
22182218 vec_xst (t6, 0 , vecOffset+16 );
22192219 vec_xst (t7, 0 , vecOffset+32 );
@@ -2227,12 +2227,12 @@ class tinyBLAS_Q0_PPC {
22272227 t6 = vec_perm (t1, t3, swiz4);
22282228 t7 = vec_perm (t2, t4, swiz3);
22292229 t8 = vec_perm (t2, t4, swiz4);
2230- if (flip == true ) {
2230+ /* if (flip == true) {
22312231 t5 = vec_xor(t5, xor_vector);
22322232 t6 = vec_xor(t6, xor_vector);
22332233 t7 = vec_xor(t7, xor_vector);
22342234 t8 = vec_xor(t8, xor_vector);
2235- }
2235+ }*/
22362236 vec_xst (t5, 0 , vecOffset+64 );
22372237 vec_xst (t6, 0 , vecOffset+80 );
22382238 vec_xst (t7, 0 , vecOffset+96 );
@@ -2415,9 +2415,9 @@ class tinyBLAS_Q0_PPC {
24152415 if (std::is_same_v<TA, block_q4_0>) {
24162416 packNormalInt4<int8_t , vector signed char , 4 >((A+(ii*lda)+l), lda, 4 , 4 , (int8_t *)vec_A, comparray);
24172417 } else {
2418- packNormal<int8_t , vector signed char >((const TB*)(A+(ii*lda)+l), lda, 4 , 8 , (int8_t *)vec_A, false );
2418+ packNormal<int8_t , vector signed char >((const TB*)(A+(ii*lda)+l), lda, 4 , 8 , (int8_t *)vec_A/* , false*/ );
24192419 }
2420- packNormal<uint8_t , vector unsigned char >((B+(jj*ldb)+l), ldb, 8 , 8 , (uint8_t *)vec_B, true );
2420+ packNormal<uint8_t , vector unsigned char >((B+(jj*ldb)+l), ldb, 8 , 8 , (uint8_t *)vec_B/* , true*/ );
24212421 for (int x = 0 ; x < 8 ; x++) {
24222422 __builtin_mma_xvi8ger4pp (&acc_0, vec_A[x], vec_B[x]);
24232423 __builtin_mma_xvi8ger4pp (&acc_1, vec_A[x], vec_B[x+8 ]);
@@ -2428,7 +2428,7 @@ class tinyBLAS_Q0_PPC {
24282428 *((float *)&vs[I+4 ]+J) = (unhalf ((A+((ii+I)*lda)+l)->d ) * unhalf ((B+((jj+J+4 )*ldb)+l)->d ));
24292429 }
24302430 }
2431- if (!isAblock_q4) {
2431+ /* if (!isAblock_q4) {
24322432 auto aoffset = A+(ii*lda)+l;
24332433 for (int i = 0; i < 4; i++) {
24342434 comparray[i] = 0;
@@ -2439,7 +2439,7 @@ class tinyBLAS_Q0_PPC {
24392439 comparray[i] = ca;
24402440 aoffset += lda;
24412441 }
2442- }
2442+ }*/
24432443 compute<4 >(&acc_0, 0 , 0 , comparray, vs, fin_res);
24442444 compute<4 >(&acc_1, 0 , 4 , comparray, vs, fin_res);
24452445 }
@@ -2460,9 +2460,9 @@ class tinyBLAS_Q0_PPC {
24602460 if (std::is_same_v<TA, block_q4_0>) {
24612461 packNormalInt4<int8_t , vector signed char , 8 >((A+(ii*lda)+l), lda, 8 , 4 , (int8_t *)vec_A, comparray);
24622462 } else {
2463- packNormal<int8_t , vector signed char >((const TB*)(A+(ii*lda)+l), lda, 8 , 8 , (int8_t *)vec_A, false );
2463+ packNormal<int8_t , vector signed char >((const TB*)(A+(ii*lda)+l), lda, 8 , 8 , (int8_t *)vec_A/* , false*/ );
24642464 }
2465- packNormal<uint8_t , vector unsigned char >((B+(jj*ldb)+l), ldb, 4 , 8 , (uint8_t *)vec_B, true );
2465+ packNormal<uint8_t , vector unsigned char >((B+(jj*ldb)+l), ldb, 4 , 8 , (uint8_t *)vec_B/* , true*/ );
24662466 for (int x = 0 ; x < 8 ; x++) {
24672467 __builtin_mma_xvi8ger4pp (&acc_0, vec_A[x], vec_B[x]);
24682468 __builtin_mma_xvi8ger4pp (&acc_1, vec_A[x+8 ], vec_B[x]);
@@ -2472,7 +2472,7 @@ class tinyBLAS_Q0_PPC {
24722472 *((float *)&vs[I]+J) = (unhalf ((A+((ii+I)*lda)+l)->d ) * unhalf ((B+((jj+J)*ldb)+l)->d ));
24732473 }
24742474 }
2475- if (!isAblock_q4) {
2475+ /* if (!isAblock_q4) {
24762476 auto aoffset = A+(ii*lda)+l;
24772477 for (int i = 0; i < 8; i++) {
24782478 comparray[i] = 0;
@@ -2483,7 +2483,7 @@ class tinyBLAS_Q0_PPC {
24832483 comparray[i] = ca;
24842484 aoffset += lda;
24852485 }
2486- }
2486+ }*/
24872487 compute<8 >(&acc_0, 0 , 0 , comparray, vs, fin_res);
24882488 compute<8 >(&acc_1, 4 , 4 , comparray, vs, fin_res);
24892489 }
@@ -2506,9 +2506,9 @@ class tinyBLAS_Q0_PPC {
25062506 if (std::is_same_v<TA, block_q4_0>) {
25072507 packNormalInt4<int8_t , vector signed char , 8 >((A+(ii*lda)+l), lda, 8 , 4 , (int8_t *)vec_A, comparray);
25082508 } else {
2509- packNormal<int8_t , vector signed char >((const TB*)(A+(ii*lda)+l), lda, 8 , 8 , (int8_t *)vec_A, false );
2509+ packNormal<int8_t , vector signed char >((const TB*)(A+(ii*lda)+l), lda, 8 , 8 , (int8_t *)vec_A/* , false*/ );
25102510 }
2511- packNormal<uint8_t , vector unsigned char >((B+(jj*ldb)+l), ldb, 8 , 8 , (uint8_t *)vec_B, true );
2511+ packNormal<uint8_t , vector unsigned char >((B+(jj*ldb)+l), ldb, 8 , 8 , (uint8_t *)vec_B/* , true*/ );
25122512 for (int x = 0 ; x < 8 ; x++) {
25132513 __builtin_mma_xvi8ger4pp (&acc_0, vec_A[x], vec_B[x]);
25142514 __builtin_mma_xvi8ger4pp (&acc_1, vec_A[x+8 ], vec_B[x]);
@@ -2521,7 +2521,7 @@ class tinyBLAS_Q0_PPC {
25212521 *((float *)&vs[I+8 ]+J) = (unhalf ((A+((ii+I)*lda)+l)->d ) * unhalf ((B+((jj+J+4 )*ldb)+l)->d ));
25222522 }
25232523 }
2524- if (!isAblock_q4) {
2524+ /* if (!isAblock_q4) {
25252525 auto aoffset = A+(ii*lda)+l;
25262526 for (int i = 0; i < 8; i++) {
25272527 comparray[i] = 0;
@@ -2532,7 +2532,7 @@ class tinyBLAS_Q0_PPC {
25322532 comparray[i] = ca;
25332533 aoffset += lda;
25342534 }
2535- }
2535+ }*/
25362536 compute<8 >(&acc_0, 0 , 0 , comparray, vs, fin_res);
25372537 compute<8 >(&acc_1, 4 , 4 , comparray, vs, fin_res);
25382538 compute<8 >(&acc_2, 0 , 8 , comparray, vs, fin_res);
@@ -2576,9 +2576,9 @@ class tinyBLAS_Q0_PPC {
25762576 if (isAblock_q4) {
25772577 packNormalInt4<int8_t , vector signed char , 4 >((A+(ii*lda)+l), lda, RM, 4 , (int8_t *)vec_A, comparray);
25782578 } else {
2579- packNormal<int8_t , vector signed char >((const TB*)(A+(ii*lda)+l), lda, RM, 8 , (int8_t *)vec_A, false );
2579+ packNormal<int8_t , vector signed char >((const TB*)(A+(ii*lda)+l), lda, RM, 8 , (int8_t *)vec_A/* , false*/ );
25802580 }
2581- packNormal<uint8_t , vector unsigned char >((B+(jj*ldb)+l), ldb, RN, 8 , (uint8_t *)vec_B, true );
2581+ packNormal<uint8_t , vector unsigned char >((B+(jj*ldb)+l), ldb, RN, 8 , (uint8_t *)vec_B/* , true*/ );
25822582 for (int x = 0 ; x < 8 ; x+=4 ) {
25832583 __builtin_mma_xvi8ger4pp (&acc_0, vec_A[x], vec_B[x]);
25842584 __builtin_mma_xvi8ger4pp (&acc_0, vec_A[x+1 ], vec_B[x+1 ]);
@@ -2591,7 +2591,7 @@ class tinyBLAS_Q0_PPC {
25912591 }
25922592 }
25932593 __builtin_mma_disassemble_acc (vec_C, &acc_0);
2594- if (!isAblock_q4) {
2594+ /* if (!isAblock_q4) {
25952595 auto aoffset = A+(ii*lda)+l;
25962596 for (int i = 0; i < RM; i++) {
25972597 comparray[i] = 0;
@@ -2602,10 +2602,10 @@ class tinyBLAS_Q0_PPC {
26022602 comparray[i] = ca;
26032603 aoffset += lda;
26042604 }
2605- }
2605+ }*/
26062606 for (int i = 0 ; i < RM; i++) {
2607- CA[i] = vec_splats ((float )(((double )comparray[i]) * -128.0 ));
2608- res[i] = vec_add ( vec_ctf (vec_C[i], 0 ), CA[i] );
2607+ // CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
2608+ res[i] = vec_ctf (vec_C[i], 0 );
26092609 fin_res[i] = vec_madd (res[i], vs[i], fin_res[i]);
26102610 }
26112611 }
0 commit comments