@@ -627,10 +627,8 @@ real1 cut_worker_sparse_segmented(
627627
628628 for (int u = 0 ; u < n ; ++ u ) {
629629 const bool u_bit = get_bit (theta , u );
630- const uint row_start = G_rows [u ];
631630 const uint row_end = G_rows [u + 1 ];
632-
633- for (uint col = row_start ; col < row_end ; ++ col ) {
631+ for (uint col = G_rows [u ]; col < row_end ; ++ col ) {
634632 const int v = G_cols [col ];
635633 const real1 val = get_G_m (G_data , col , segment_size );
636634 const bool v_bit = get_bit (theta , v );
@@ -707,10 +705,8 @@ real1 single_bit_flip_worker_sparse_segmented(
707705 if (u == k ) {
708706 u_bit = !u_bit ;
709707 }
710- const uint row_start = G_rows [u ];
711708 const uint row_end = G_rows [u + 1 ];
712-
713- for (uint col = row_start ; col < row_end ; ++ col ) {
709+ for (uint col = G_rows [u ]; col < row_end ; ++ col ) {
714710 const int v = G_cols [col ];
715711 const real1 val = get_G_m (G_data , col , segment_size );
716712 bool v_bit = get_const_bit (theta , v );
@@ -783,10 +779,8 @@ real1 double_bit_flip_worker_sparse_segmented(
783779 if ((u == k ) || (u == l )) {
784780 u_bit = !u_bit ;
785781 }
786- const uint row_start = G_rows [u ];
787782 const uint row_end = G_rows [u + 1 ];
788-
789- for (uint col = row_start ; col < row_end ; ++ col ) {
783+ for (uint col = G_rows [u ]; col < row_end ; ++ col ) {
790784 const int v = G_cols [col ];
791785 const real1 val = get_G_m (G_data , col , segment_size );
792786 bool v_bit = get_const_bit (theta , v );
@@ -945,11 +939,11 @@ __kernel void gray(
945939 real1 energy = ZERO_R1 ;
946940 for (uint u = 0 ; u < n ; u ++ ) {
947941 const size_t u_offset = u * n ;
948- int bit_u = get_local_bit (theta_local , u );
942+ int u_bit = get_local_bit (theta_local , u );
949943 for (uint v = u + 1 ; v < n ; v ++ ) {
950- const int bit_v = get_local_bit (theta_local , v );
944+ const int v_bit = get_local_bit (theta_local , v );
951945 const real1 val = G_m [u_offset + v ];
952- if (bit_u != bit_v ) {
946+ if (u_bit != v_bit ) {
953947 energy += val ;
954948 } else if (is_spin_glass ) {
955949 energy -= val ;
@@ -1016,11 +1010,154 @@ __kernel void gray_segmented(
10161010 real1 energy = ZERO_R1 ;
10171011 for (uint u = 0 ; u < n ; u ++ ) {
10181012 const size_t u_offset = u * n ;
1019- int bit_u = get_local_bit (theta_local , u );
1013+ int u_bit = get_local_bit (theta_local , u );
10201014 for (uint v = u + 1 ; v < n ; v ++ ) {
1021- const int bit_v = get_local_bit (theta_local , v );
1015+ const int v_bit = get_local_bit (theta_local , v );
10221016 const real1 val = get_G_m (G_m , u_offset + v , segment_size );
1023- if (bit_u != bit_v ) {
1017+ if (u_bit != v_bit ) {
1018+ energy += val ;
1019+ } else if (is_spin_glass ) {
1020+ energy -= val ;
1021+ }
1022+ }
1023+ }
1024+
1025+ if (energy > best_energy ) {
1026+ best_energy = energy ;
1027+ best_i = i ;
1028+ } else {
1029+ theta_local [flip_bit >> 6U ] ^= 1UL << (flip_bit & 63U );
1030+ }
1031+ }
1032+ }
1033+
1034+ i = get_global_id (0 );
1035+ const size_t offset = i * blocks ;
1036+ for (int b = 0 ; b < blocks ; ++ b ) {
1037+ theta_out [offset + b ] = theta_local [b ];
1038+ }
1039+ energy_out [i ] = best_energy ;
1040+ }
1041+
/*
 * gray_sparse: each work-item starts from the shared bit string `theta`,
 * perturbs it with a per-work-item seed, then repeatedly asks
 * gray_code_next() for a candidate bit flip and keeps the flip only when it
 * raises the energy computed over the stored matrix entries.
 *
 * The matrix is read in compressed-row form:
 *   G_data[col]              value of stored entry `col`
 *   G_rows[u] .. G_rows[u+1] range of stored entries belonging to row u
 *   G_cols[col]              column index v of stored entry `col`
 *
 * Parameters:
 *   theta       starting bit string, packed 64 bits per ulong
 *   args        args[0] = n (number of bits / rows),
 *               args[1] = is_spin_glass flag,
 *               args[2] = gray_iterations
 *   theta_out   receives (n + 63) / 64 words per work-item, written at
 *               offset get_global_id(0) * blocks
 *   energy_out  receives the best energy seen by each work-item
 *               (-INFINITY if the work-item ran no iterations)
 */
__kernel void gray_sparse(
    __global const real1 *G_data,
    __global const uint *G_rows,
    __global const uint *G_cols,
    __constant ulong *theta,
    __constant int *args,
    __global ulong *theta_out,
    __global real1 *energy_out
) {
    const int n = args[0];
    const bool is_spin_glass = args[1];
    const int gray_iterations = args[2];
    const int blocks = (n + 63) / 64;
    const int last_block = blocks - 1;

    // Keep the work-item id at full width for output indexing; the search
    // loop below uses the int copy, as the original did.
    const size_t gid = get_global_id(0);
    const int max_i = get_global_size(0);
    int i = (int)gid;

    // NOTE(review): fixed capacity of 2048 words (131072 bits); nothing here
    // checks that `blocks` fits - confirm the host enforces this limit.
    ulong theta_local[2048];
    for (int b = 0; b < blocks; ++b) {
        theta_local[b] = theta[b];
    }

    // Initialize different seed per thread
    // NOTE(review): `seed` is a 32-bit int, so the shift counts here exceed
    // its width for part of the loop; OpenCL C reduces shift counts modulo
    // the operand width. Left exactly as written - confirm the mixing is
    // what was intended.
    const int seed = i ^ (i >> 1);
    for (int b = 0; b < 64; ++b) {
        theta_local[last_block] ^= (seed >> (63U - b)) << b;
    }

    real1 best_energy = -INFINITY;
    for (; i < gray_iterations; i += max_i) {
        for (int block = 0; block < blocks; ++block) {
            // Presumably flips one bit of theta_local and returns its index;
            // the undo in the else-branch below relies on that.
            const size_t flip_bit = gray_code_next(theta_local, i, block << 6U);

            // Energy of the candidate: every stored entry adds its value when
            // the two endpoint bits differ, and subtracts it when they match
            // and is_spin_glass is set.
            real1 energy = ZERO_R1;
            for (uint u = 0; u < n; u++) {
                const bool u_bit = get_local_bit(theta_local, u);
                const uint row_end = G_rows[u + 1];
                for (uint col = G_rows[u]; col < row_end; ++col) {
                    const int v = G_cols[col];
                    const real1 val = G_data[col];
                    const bool v_bit = get_local_bit(theta_local, v);
                    if (u_bit != v_bit) {
                        energy += val;
                    } else if (is_spin_glass) {
                        energy -= val;
                    }
                }
            }

            if (energy > best_energy) {
                best_energy = energy;
            } else {
                // Not an improvement: toggle the same bit again to undo it.
                theta_local[flip_bit >> 6U] ^= 1UL << (flip_bit & 63U);
            }
        }
    }

    // Widen before multiplying so the output offset cannot overflow int.
    const size_t offset = gid * (size_t)blocks;
    for (int b = 0; b < blocks; ++b) {
        theta_out[offset + b] = theta_local[b];
    }
    energy_out[gid] = best_energy;
}
1109+
1110+ __kernel void gray_sparse_segmented (
1111+ __global const real1 * G_data0 ,
1112+ __global const real1 * G_data1 ,
1113+ __global const real1 * G_data2 ,
1114+ __global const real1 * G_data3 ,
1115+ __global const uint * G_rows ,
1116+ __global const uint * G_cols ,
1117+ __constant ulong * theta ,
1118+ __constant int * args ,
1119+ __global ulong * theta_out ,
1120+ __global real1 * energy_out
1121+ ) {
1122+ __global const real1 * G_data [4 ] = { G_data0 , G_data1 , G_data2 , G_data3 };
1123+
1124+ const int n = args [0 ];
1125+ const bool is_spin_glass = args [1 ];
1126+ const int gray_iterations = args [2 ];
1127+ const int segment_size = args [3 ];
1128+ const int blocks = (n + 63 ) / 64 ;
1129+ const int last_block = blocks - 1 ;
1130+
1131+ int i = get_global_id (0 );
1132+ const int max_i = get_global_size (0 );
1133+
1134+ ulong theta_local [2048 ];
1135+ for (int b = 0 ; b < blocks ; ++ b ) {
1136+ theta_local [b ] = theta [b ];
1137+ }
1138+
1139+ // Initialize different seed per thread
1140+ const int seed = i ^ (i >> 1 );
1141+ for (int b = 0 ; b < 64 ; ++ b ) {
1142+ theta_local [last_block ] ^= (seed >> (63U - b )) << b ;
1143+ }
1144+
1145+ real1 best_energy = - INFINITY ;
1146+ int best_i = i ;
1147+ int best_block = 0U ;
1148+ for (; i < gray_iterations ; i += max_i ) {
1149+ for (int block = 0 ; block < blocks ; ++ block ) {
1150+ const size_t flip_bit = gray_code_next (theta_local , i , block << 6U );
1151+ real1 energy = ZERO_R1 ;
1152+ for (uint u = 0 ; u < n ; u ++ ) {
1153+ const size_t u_offset = u * n ;
1154+ int u_bit = get_local_bit (theta_local , u );
1155+ const uint row_end = G_rows [u + 1 ];
1156+ for (uint col = G_rows [u ]; col < row_end ; ++ col ) {
1157+ const int v = G_cols [col ];
1158+ const real1 val = get_G_m (G_data , col , segment_size );
1159+ const bool v_bit = get_local_bit (theta_local , v );
1160+ if (u_bit != v_bit ) {
10241161 energy += val ;
10251162 } else if (is_spin_glass ) {
10261163 energy -= val ;
0 commit comments