upgrade cryptolight algo for AEON support

tpruvot · tpruvot · commit 400516017059 · 2018-06-23T13:22:37.000+02:00
they also forked to variant 1 system...
diff --git a/ccminer.cpp b/ccminer.cpp
@@ -2380,7 +2380,7 @@ static void *miner_thread(void *userdata)
 			rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done);
 			break;
 		case ALGO_CRYPTOLIGHT:
-			rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done);
+			rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done, 1);
 			break;
 		case ALGO_CRYPTONIGHT:
 		{
diff --git a/crypto/cryptolight-core.cu b/crypto/cryptolight-core.cu
@@ -57,8 +57,10 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t *
 	}
 }
 
+// --------------------------------------------------------------------------------------------------------------
+
 __global__
-void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
+void cryptolight_old_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
 {
 	__shared__ uint32_t __align__(16) sharedMemory[1024];
 
@@ -209,6 +211,70 @@ void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int
 #endif // __CUDA_ARCH__ >= 300
 }
 
+__device__ __forceinline__ void store_variant1(uint32_t* long_state)
+{
+	uint4* Z = (uint4*) long_state;
+	const uint32_t tmp = (Z->z >> 24); // __byte_perm(src, 0, 0x7773);
+	const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
+	Z->z = (Z->z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
+}
+
+#define MUL_SUM_XOR_DST_1(a,c,dst,tweak) { \
+        uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \
+        hi += ((uint64_t *)c)[0]; \
+        ((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \
+        ((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \
+        ((uint64_t *)dst)[0] = hi; \
+        ((uint64_t *)dst)[1] = lo ^ tweak; }
+
+__global__
+void cryptolight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint64_t * __restrict__ d_tweak)
+{
+	__shared__ __align__(16) uint32_t sharedMemory[1024];
+	cn_aes_gpu_init(sharedMemory);
+	__syncthreads();
+
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+	if (thread < threads)
+	{
+		const uint32_t batchsize = ITER >> (2 + bfactor);
+		const uint32_t start = partidx * batchsize;
+		const uint32_t end = start + batchsize;
+		const uint32_t longptr = thread << LONG_SHL_IDX;
+		uint32_t * long_state = &d_long_state[longptr];
+		uint64_t tweak = d_tweak[thread];
+
+		void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
+		void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
+		uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
+		uint4 B = AS_UINT4(ctx_b);
+		uint32_t* a = (uint32_t*)&A;
+		uint32_t* b = (uint32_t*)&B;
+
+		for (int i = start; i < end; i++)
+		{
+			uint32_t c[4];
+			uint32_t j = (A.x >> 2) & E2I_MASK2;
+			cn_aes_single_round(sharedMemory, &long_state[j], c, a);
+			XOR_BLOCKS_DST(c, b, &long_state[j]);
+			store_variant1(&long_state[j]);
+			MUL_SUM_XOR_DST_1(c, a, &long_state[(c[0] >> 2) & E2I_MASK2], tweak);
+
+			j = (A.x >> 2) & E2I_MASK2;
+			cn_aes_single_round(sharedMemory, &long_state[j], b, a);
+			XOR_BLOCKS_DST(b, c, &long_state[j]);
+			store_variant1(&long_state[j]);
+			MUL_SUM_XOR_DST_1(b, a, &long_state[(b[0] >> 2) & E2I_MASK2], tweak);
+		}
+		if (bfactor) {
+			AS_UINT4(ctx_a) = A;
+			AS_UINT4(ctx_b) = B;
+		}
+	}
+}
+
 __global__
 void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key2)
 {
@@ -252,7 +318,7 @@ extern int device_bfactor[MAX_GPUS];
 
 __host__
 void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state,
-	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
+	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
 {
 	dim3 grid(blocks);
 	dim3 block(threads);
@@ -271,7 +337,11 @@ void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long
 
 	for(i = 0; i < partcount; i++)
 	{
-		cryptolight_core_gpu_phase2 <<<grid, (device_sm[dev_id] >= 300 ? block4 : block)>>>(blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
+		if (variant == 0)
+			cryptolight_old_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		else
+			cryptolight_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		if(partcount > 1) usleep(bsleep);
 	}
diff --git a/crypto/cryptolight-cpu.cpp b/crypto/cryptolight-cpu.cpp
@@ -22,6 +22,16 @@ struct cryptonight_ctx {
 	oaes_ctx* aes_ctx;
 };
 
+
+static void cryptolight_store_variant(void* state, int variant) {
+	if (variant == 1) {
+		// use variant 1 like monero since june 2018
+		const uint8_t tmp = ((const uint8_t*)(state))[11];
+		const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
+		((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30);
+	}
+}
+
 static void do_blake_hash(const void* input, int len, void* output)
 {
 	uchar hash[32];
@@ -132,14 +142,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui
 	((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
 }
 
-static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
+static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) {
 	uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
 	hi += ((uint64_t*) c)[0];
 
 	((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
 	((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
 	((uint64_t*) dst)[0] = hi;
-	((uint64_t*) dst)[1] = lo;
+	((uint64_t*) dst)[1] = variant ? lo ^ tweak : lo;
 }
 
 static void copy_block(uint8_t* dst, const uint8_t* src) {
@@ -157,13 +167,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
 	((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
 }
 
-static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx)
+static int cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx, const int variant)
 {
 	size_t i, j;
+	if (variant && len < 43)
+		return 0;
+
 	keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
 	ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
 	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
 
+	const uint64_t tweak = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0;
+
 	oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
 	for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
 #undef RND
@@ -186,14 +201,16 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len,
 		j = e2i(ctx->a);
 		aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
 		xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
+		cryptolight_store_variant(&ctx->long_state[j], variant);
 
-		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
+		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)], variant, tweak);
 
 		j = e2i(ctx->a);
 		aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
 		xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
+		cryptolight_store_variant(&ctx->long_state[j], variant);
 
-		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
+		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)], variant, tweak);
 	}
 
 	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
@@ -219,11 +236,19 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len,
 	if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo);
 
 	oaes_free((OAES_CTX **) &ctx->aes_ctx);
+	return 1;
 }
 
-void cryptolight_hash(void* output, const void* input, int len)
+int cryptolight_hash_variant(void* output, const void* input, int len, int variant)
 {
 	struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
-	cryptolight_hash_ctx(output, input, len, ctx);
+	int rc = cryptolight_hash_ctx(output, input, len, ctx, variant);
 	free(ctx);
+	return rc;
 }
+
+void cryptolight_hash(void* output, const void* input)
+{
+	cryptolight_hash_variant(output, input, 76, 1);
+}
+
diff --git a/crypto/cryptolight.cu b/crypto/cryptolight.cu
@@ -11,12 +11,13 @@ static uint32_t *d_ctx_state[MAX_GPUS];
 static uint32_t *d_ctx_key1[MAX_GPUS];
 static uint32_t *d_ctx_key2[MAX_GPUS];
 static uint32_t *d_ctx_text[MAX_GPUS];
+static uint64_t *d_ctx_tweak[MAX_GPUS];
 static uint32_t *d_ctx_a[MAX_GPUS];
 static uint32_t *d_ctx_b[MAX_GPUS];
 
 static bool init[MAX_GPUS] = { 0 };
 
-extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant)
 {
 	int res = 0;
 	uint32_t throughput = 0;
@@ -26,13 +27,18 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 	uint32_t *nonceptr = (uint32_t*) (&pdata[39]);
 	const uint32_t first_nonce = *nonceptr;
 	uint32_t nonce = first_nonce;
+	int dev_id = device_map[thr_id];
 
 	if(opt_benchmark) {
 		ptarget[7] = 0x00ff;
 	}
 
 	if(!init[thr_id])
 	{
+		if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) {
+			device_config[thr_id] = strdup("80x32");
+		}
+
 		if (device_config[thr_id]) {
 			sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
 			throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
@@ -79,6 +85,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
+		cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput);
 
 		init[thr_id] = true;
 	}
@@ -91,8 +98,8 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 		uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
 
 		cryptonight_extra_setData(thr_id, pdata, ptarget);
-		cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], 0, NULL);
-		cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
+		cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
+		cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
 		cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
 
 		*hashes_done = nonce - first_nonce + throughput;
@@ -104,7 +111,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 			uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
 			memcpy(tempdata, pdata, 76);
 			*tempnonceptr = resNonces[0];
-			cryptolight_hash(vhash, tempdata, 76);
+			cryptolight_hash_variant(vhash, tempdata, 76, variant);
 			if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
 			{
 				res = 1;
@@ -114,7 +121,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 				if(resNonces[1] != UINT32_MAX)
 				{
 					*tempnonceptr = resNonces[1];
-					cryptolight_hash(vhash, tempdata, 76);
+					cryptolight_hash_variant(vhash, tempdata, 76, variant);
 					if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
 						res++;
 						work->nonces[1] = resNonces[1];
@@ -157,6 +164,7 @@ void free_cryptolight(int thr_id)
 	cudaFree(d_ctx_key1[thr_id]);
 	cudaFree(d_ctx_key2[thr_id]);
 	cudaFree(d_ctx_text[thr_id]);
+	cudaFree(d_ctx_tweak[thr_id]);
 	cudaFree(d_ctx_a[thr_id]);
 	cudaFree(d_ctx_b[thr_id]);
 
diff --git a/crypto/cryptolight.h b/crypto/cryptolight.h
@@ -135,7 +135,7 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
 	}
 }
 
-void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
+void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
 
 void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget);
 void cryptonight_extra_init(int thr_id/*, uint32_t threads*/);
diff --git a/crypto/xmr-rpc.cpp b/crypto/xmr-rpc.cpp
@@ -550,16 +550,19 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work)
 	}
 
 	else if (opt_algo == ALGO_CRYPTOLIGHT) {
+		int variant = 1;
 		uint32_t nonce = work->nonces[idnonce];
 		noncestr = bin2hex((unsigned char*) &nonce, 4);
 		last_found_nonce = nonce;
-		cryptolight_hash(hash, data, 76);
+		//if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork)
+		//	variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1;
+		cryptolight_hash_variant(hash, data, 76, variant);
 		work_set_target_ratio(work, (uint32_t*) hash);
 	}
 
 	else if (opt_algo == ALGO_CRYPTONIGHT) {
-		uint32_t nonce = work->nonces[idnonce];
 		int variant = 0;
+		uint32_t nonce = work->nonces[idnonce];
 		noncestr = bin2hex((unsigned char*) &nonce, 4);
 		last_found_nonce = nonce;
 		if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork)
diff --git a/miner.h b/miner.h
@@ -279,7 +279,7 @@ extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce,
 extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
-extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant);
 extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant);
 extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -900,7 +900,8 @@ void blake2b_hash(void *output, const void *input);
 void blake2s_hash(void *output, const void *input);
 void bmw_hash(void *state, const void *input);
 void c11hash(void *output, const void *input);
-void cryptolight_hash(void* output, const void* input, int len);
+int cryptolight_hash_variant(void* output, const void* input, int len, int variant);
+void cryptolight_hash(void* output, const void* input);
 int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant);
 void cryptonight_hash(void* output, const void* input);
 void monero_hash(void* output, const void* input);
diff --git a/util.cpp b/util.cpp
@@ -2193,7 +2193,7 @@ void print_hash_tests(void)
 	c11hash(&hash[0], &buf[0]);
 	printpfx("c11", hash);
 
-	cryptolight_hash(&hash[0], &buf[0], 76);
+	cryptolight_hash(&hash[0], &buf[0]);
 	printpfx("cryptolight", hash);
 
 	cryptonight_hash(&hash[0], &buf[0]);

Original file line number	Diff line number	Diff line change
`@@ -2380,7 +2380,7 @@ static void miner_thread(void userdata)`
`2380`	`2380`	`rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done);`
`2381`	`2381`	`break;`
`2382`	`2382`	`case ALGO_CRYPTOLIGHT:`
`2383`		`- rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done);`
	`2383`	`+ rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done, 1);`
`2384`	`2384`	`break;`
`2385`	`2385`	`case ALGO_CRYPTONIGHT:`
`2386`	`2386`	`{`
Original file line number	Diff line number	Diff line change
`@@ -135,7 +135,7 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)`
`135`	`135`	`}`
`136`	`136`	`}`
`137`	`137`
`138`		`-void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t d_long_state, uint32_t d_ctx_state, uint32_t d_ctx_a, uint32_t d_ctx_b, uint32_t d_ctx_key1, uint32_t d_ctx_key2);`
	`138`	`+void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t d_long_state, uint32_t d_ctx_state, uint32_t d_ctx_a, uint32_t d_ctx_b, uint32_t d_ctx_key1, uint32_t d_ctx_key2, int variant, uint64_t *d_ctx_tweak);`
`139`	`139`
`140`	`140`	`void cryptonight_extra_setData(int thr_id, const void data, const void ptarget);`
`141`	`141`	`void cryptonight_extra_init(int thr_id/, uint32_t threads/);`