@@ -57,8 +57,10 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t *
5757 }
5858}
5959
60+ // --------------------------------------------------------------------------------------------------------------
61+
6062__global__
61- void cryptolight_core_gpu_phase2 (const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
63+ void cryptolight_old_gpu_phase2 (const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
6264{
6365 __shared__ uint32_t __align__ (16 ) sharedMemory[1024 ];
6466
@@ -209,6 +211,70 @@ void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int
209211#endif // __CUDA_ARCH__ >= 300
210212}
211213
214+ __device__ __forceinline__ void store_variant1 (uint32_t * long_state)
215+ {
216+ uint4 * Z = (uint4 *) long_state;
217+ const uint32_t tmp = (Z->z >> 24 ); // __byte_perm(src, 0, 0x7773);
218+ const uint32_t index = (((tmp >> 3 ) & 6u ) | (tmp & 1u )) << 1 ;
219+ Z->z = (Z->z & 0x00ffffffu ) | ((tmp ^ ((0x75310u >> index) & 0x30u )) << 24 );
220+ }
221+
222+ #define MUL_SUM_XOR_DST_1 (a,c,dst,tweak ) { \
223+ uint64_t hi, lo = cuda_mul128 (((uint64_t *)a)[0 ], ((uint64_t *)dst)[0 ], &hi) + ((uint64_t *)c)[1 ]; \
224+ hi += ((uint64_t *)c)[0 ]; \
225+ ((uint64_t *)c)[0 ] = ((uint64_t *)dst)[0 ] ^ hi; \
226+ ((uint64_t *)c)[1 ] = ((uint64_t *)dst)[1 ] ^ lo; \
227+ ((uint64_t *)dst)[0 ] = hi; \
228+ ((uint64_t *)dst)[1 ] = lo ^ tweak; }
229+
230+ __global__
231+ void cryptolight_gpu_phase2 (const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
232+ uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
233+ uint64_t * __restrict__ d_tweak)
234+ {
235+ __shared__ __align__ (16 ) uint32_t sharedMemory[1024 ];
236+ cn_aes_gpu_init (sharedMemory);
237+ __syncthreads ();
238+
239+ const uint32_t thread = blockDim .x * blockIdx .x + threadIdx .x ;
240+ if (thread < threads)
241+ {
242+ const uint32_t batchsize = ITER >> (2 + bfactor);
243+ const uint32_t start = partidx * batchsize;
244+ const uint32_t end = start + batchsize;
245+ const uint32_t longptr = thread << LONG_SHL_IDX;
246+ uint32_t * long_state = &d_long_state[longptr];
247+ uint64_t tweak = d_tweak[thread];
248+
249+ void * ctx_a = (void *)(&d_ctx_a[thread << 2 ]);
250+ void * ctx_b = (void *)(&d_ctx_b[thread << 2 ]);
251+ uint4 A = AS_UINT4 (ctx_a); // ld.global.u32.v4
252+ uint4 B = AS_UINT4 (ctx_b);
253+ uint32_t * a = (uint32_t *)&A;
254+ uint32_t * b = (uint32_t *)&B;
255+
256+ for (int i = start; i < end; i++)
257+ {
258+ uint32_t c[4 ];
259+ uint32_t j = (A.x >> 2 ) & E2I_MASK2;
260+ cn_aes_single_round (sharedMemory, &long_state[j], c, a);
261+ XOR_BLOCKS_DST (c, b, &long_state[j]);
262+ store_variant1 (&long_state[j]);
263+ MUL_SUM_XOR_DST_1 (c, a, &long_state[(c[0 ] >> 2 ) & E2I_MASK2], tweak);
264+
265+ j = (A.x >> 2 ) & E2I_MASK2;
266+ cn_aes_single_round (sharedMemory, &long_state[j], b, a);
267+ XOR_BLOCKS_DST (b, c, &long_state[j]);
268+ store_variant1 (&long_state[j]);
269+ MUL_SUM_XOR_DST_1 (b, a, &long_state[(b[0 ] >> 2 ) & E2I_MASK2], tweak);
270+ }
271+ if (bfactor) {
272+ AS_UINT4 (ctx_a) = A;
273+ AS_UINT4 (ctx_b) = B;
274+ }
275+ }
276+ }
277+
212278__global__
213279void cryptolight_core_gpu_phase3 (int threads, const uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key2)
214280{
@@ -252,7 +318,7 @@ extern int device_bfactor[MAX_GPUS];
252318
253319__host__
254320void cryptolight_core_hash (int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state,
255- uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
321+ uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak )
256322{
257323 dim3 grid (blocks);
258324 dim3 block (threads);
@@ -271,7 +337,11 @@ void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long
271337
272338 for (i = 0 ; i < partcount; i++)
273339 {
274- cryptolight_core_gpu_phase2 <<<grid, (device_sm[dev_id] >= 300 ? block4 : block)>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
340+ dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
341+ if (variant == 0 )
342+ cryptolight_old_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
343+ else
344+ cryptolight_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
275345 exit_if_cudaerror (thr_id, __FUNCTION__, __LINE__);
276346 if (partcount > 1 ) usleep (bsleep);
277347 }
0 commit comments