77#include < stdio.h>
88#include < memory.h>
99
10+ // IMPORTANT: leave this enabled!
1011#define USE_SHARED 1
1112
1213// aus cpu-miner.c
@@ -15,35 +16,41 @@ extern int device_map[8];
1516// aus heavy.cu
1617extern cudaError_t MyStreamSynchronize (cudaStream_t stream, int situation, int thr_id);
1718
19+ // aus driver.c
20+ extern " C" void set_device (int device);
21+
1822// Folgende Definitionen später durch header ersetzen
1923typedef unsigned char uint8_t ;
2024typedef unsigned int uint32_t ;
2125typedef unsigned long long uint64_t ;
2226
27+ // diese Struktur wird in der Init Funktion angefordert
28+ static cudaDeviceProp props;
29+
2330// globaler Speicher für alle HeftyHashes aller Threads
2431__constant__ uint32_t pTarget[8 ]; // Single GPU
2532extern uint32_t *d_resultNonce[8 ];
2633
2734__constant__ uint32_t groestlcoin_gpu_msg[32 ];
2835
36+ #define SPH_C32 (x ) ((uint32_t )(x ## U))
2937#define SPH_T32 (x ) ((x) & SPH_C32 (0xFFFFFFFF ))
3038
3139#define PC32up (j, r ) ((uint32_t )((j) + (r)))
3240#define PC32dn (j, r ) 0
3341#define QC32up (j, r ) 0xFFFFFFFF
3442#define QC32dn (j, r ) (((uint32_t )(r) << 24 ) ^ SPH_T32(~((uint32_t )(j) << 24 )))
3543
36- #define B32_0 (x ) ((x) & 0xFF )
37- #define B32_1 (x ) (((x) >> 8 ) & 0xFF )
38- #define B32_2 (x ) (((x) >> 16 ) & 0xFF )
39- #define B32_3 (x ) ((x) >> 24 )
40-
41- #define SPH_C32 (x ) ((uint32_t )(x ## U))
42- #define C32e (x ) ((SPH_C32(x) >> 24 ) \
43- | ((SPH_C32(x) >> 8 ) & SPH_C32 (0x0000FF00 )) \
44- | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000 )) \
45- | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000 )))
44+ #define B32_0 (x ) __byte_perm(x, 0 , 0x4440 )
45+ // ((x) & 0xFF)
46+ #define B32_1 (x ) __byte_perm(x, 0 , 0x4441 )
47+ // (((x) >> 8) & 0xFF)
48+ #define B32_2 (x ) __byte_perm(x, 0 , 0x4442 )
49+ // (((x) >> 16) & 0xFF)
50+ #define B32_3 (x ) __byte_perm(x, 0 , 0x4443 )
51+ // ((x) >> 24)
4652
53+ #if 0
4754#if USE_SHARED
4855#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
4956#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
@@ -63,6 +70,18 @@ __constant__ uint32_t groestlcoin_gpu_msg[32];
6370#define T3up(x) tex1Dfetch(t3up1, x)
6471#define T3dn(x) tex1Dfetch(t3dn1, x)
6572#endif
73+ #endif
74+
75+ // a healthy mix between shared and textured access provides the highest speed!
76+ #define T0up (x ) (*((uint32_t *)mixtabs + ( (x))))
77+ #define T0dn (x ) tex1Dfetch (t0dn1, x)
78+ #define T1up (x ) tex1Dfetch (t1up1, x)
79+ #define T1dn (x ) (*((uint32_t *)mixtabs + (768 +(x))))
80+ #define T2up (x ) tex1Dfetch (t2up1, x)
81+ #define T2dn (x ) (*((uint32_t *)mixtabs + (1280 +(x))))
82+ #define T3up (x ) (*((uint32_t *)mixtabs + (1536 +(x))))
83+ #define T3dn (x ) tex1Dfetch (t3dn1, x)
84+
6685texture<unsigned int , 1 , cudaReadModeElementType> t0up1;
6786texture<unsigned int , 1 , cudaReadModeElementType> t0dn1;
6887texture<unsigned int , 1 , cudaReadModeElementType> t1up1;
@@ -81,21 +100,6 @@ extern uint32_t T2dn_cpu[];
81100extern uint32_t T3up_cpu[];
82101extern uint32_t T3dn_cpu[];
83102
84- #if __CUDA_ARCH__ < 350
85- // Kepler (Compute 3.0)
86- #define S (x, n ) (((x) >> (n)) | ((x) << (32 - (n))))
87- #else
88- // Kepler (Compute 3.5)
89- #define S (x, n ) __funnelshift_r( x, x, n );
90- #endif
91- #define R (x, n ) ((x) >> (n))
92- #define Ch (x, y, z ) ((x & (y ^ z)) ^ z)
93- #define Maj (x, y, z ) ((x & (y | z)) | (y & z))
94- #define S0 (x ) (S(x, 2 ) ^ S(x, 13 ) ^ S(x, 22 ))
95- #define S1 (x ) (S(x, 6 ) ^ S(x, 11 ) ^ S(x, 25 ))
96- #define s0 (x ) (S(x, 7 ) ^ S(x, 18 ) ^ R(x, 3 ))
97- #define s1 (x ) (S(x, 17 ) ^ S(x, 19 ) ^ R(x, 10 ))
98-
99103#define SWAB32 (x ) ( ((x & 0x000000FF ) << 24 ) | ((x & 0x0000FF00 ) << 8 ) | ((x & 0x00FF0000 ) >> 8 ) | ((x & 0xFF000000 ) >> 24 ) )
100104
101105
@@ -152,32 +156,25 @@ __device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
152156 for (int k=0 ;k<16 ;k++) a[(k*2 )+0 ] ^= PC32up (k * 0x10 , 13 ); break ;
153157 }
154158
155- // RBTT
159+ // RBTT
156160#pragma unroll 16
157- for (int k=0 ;k<32 ;k+=2 )
158- {
159- t[k + 0 ] = T0up ( B32_0 (a[k & 0x1f ]) ) ^
160- T1up ( B32_1 (a[(k + 2 ) & 0x1f ]) ) ^
161- T2up ( B32_2 (a[(k + 4 ) & 0x1f ]) ) ^
162- T3up ( B32_3 (a[(k + 6 ) & 0x1f ]) ) ^
163- T0dn ( B32_0 (a[(k + 9 ) & 0x1f ]) ) ^
164- T1dn ( B32_1 (a[(k + 11 ) & 0x1f ]) ) ^
165- T2dn ( B32_2 (a[(k + 13 ) & 0x1f ]) ) ^
166- T3dn ( B32_3 (a[(k + 23 ) & 0x1f ]) );
167-
168- t[k + 1 ] = T0dn ( B32_0 (a[k & 0x1f ]) ) ^
169- T1dn ( B32_1 (a[(k + 2 ) & 0x1f ]) ) ^
170- T2dn ( B32_2 (a[(k + 4 ) & 0x1f ]) ) ^
171- T3dn ( B32_3 (a[(k + 6 ) & 0x1f ]) ) ^
172- T0up ( B32_0 (a[(k + 9 ) & 0x1f ]) ) ^
173- T1up ( B32_1 (a[(k + 11 ) & 0x1f ]) ) ^
174- T2up ( B32_2 (a[(k + 13 ) & 0x1f ]) ) ^
175- T3up ( B32_3 (a[(k + 23 ) & 0x1f ]) );
176- }
161+ for (int k=0 ;k<32 ;k+=2 )
162+ {
163+ uint32_t t0_0 = B32_0 (a[(k ) & 0x1f ]), t9_0 = B32_0 (a[(k + 9 ) & 0x1f ]);
164+ uint32_t t2_1 = B32_1 (a[(k + 2 ) & 0x1f ]), t11_1 = B32_1 (a[(k + 11 ) & 0x1f ]);
165+ uint32_t t4_2 = B32_2 (a[(k + 4 ) & 0x1f ]), t13_2 = B32_2 (a[(k + 13 ) & 0x1f ]);
166+ uint32_t t6_3 = B32_3 (a[(k + 6 ) & 0x1f ]), t23_3 = B32_3 (a[(k + 23 ) & 0x1f ]);
167+
168+ t[k + 0 ] = T0up ( t0_0 ) ^ T1up ( t2_1 ) ^ T2up ( t4_2 ) ^ T3up ( t6_3 ) ^
169+ T0dn ( t9_0 ) ^ T1dn ( t11_1 ) ^ T2dn ( t13_2 ) ^ T3dn ( t23_3 );
170+
171+ t[k + 1 ] = T0dn ( t0_0 ) ^ T1dn ( t2_1 ) ^ T2dn ( t4_2 ) ^ T3dn ( t6_3 ) ^
172+ T0up ( t9_0 ) ^ T1up ( t11_1 ) ^ T2up ( t13_2 ) ^ T3up ( t23_3 );
173+ }
177174#pragma unroll 32
178- for (int k=0 ;k<32 ;k++)
179- a[k] = t[k];
180- }
175+ for (int k=0 ;k<32 ;k++)
176+ a[k] = t[k];
177+ }
181178}
182179
183180__device__ __forceinline__ void groestlcoin_perm_Q (uint32_t *a, char *mixtabs)
@@ -233,32 +230,25 @@ __device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
233230 for (int k=0 ;k<16 ;k++) { a[(k*2 )+0 ] ^= QC32up (k * 0x10 , 13 ); a[(k*2 )+1 ] ^= QC32dn (k * 0x10 , 13 );} break ;
234231 }
235232
236- // RBTT
233+ // RBTT
237234#pragma unroll 16
238- for (int k=0 ;k<32 ;k+=2 )
239- {
240- t[k + 0 ] = T0up ( B32_0 (a[(k + 2 ) & 0x1f ]) ) ^
241- T1up ( B32_1 (a[(k + 6 ) & 0x1f ]) ) ^
242- T2up ( B32_2 (a[(k + 10 ) & 0x1f ]) ) ^
243- T3up ( B32_3 (a[(k + 22 ) & 0x1f ]) ) ^
244- T0dn ( B32_0 (a[(k + 1 ) & 0x1f ]) ) ^
245- T1dn ( B32_1 (a[(k + 5 ) & 0x1f ]) ) ^
246- T2dn ( B32_2 (a[(k + 9 ) & 0x1f ]) ) ^
247- T3dn ( B32_3 (a[(k + 13 ) & 0x1f ]) );
248-
249- t[k + 1 ] = T0dn ( B32_0 (a[(k + 2 ) & 0x1f ]) ) ^
250- T1dn ( B32_1 (a[(k + 6 ) & 0x1f ]) ) ^
251- T2dn ( B32_2 (a[(k + 10 ) & 0x1f ]) ) ^
252- T3dn ( B32_3 (a[(k + 22 ) & 0x1f ]) ) ^
253- T0up ( B32_0 (a[(k + 1 ) & 0x1f ]) ) ^
254- T1up ( B32_1 (a[(k + 5 ) & 0x1f ]) ) ^
255- T2up ( B32_2 (a[(k + 9 ) & 0x1f ]) ) ^
256- T3up ( B32_3 (a[(k + 13 ) & 0x1f ]) );
257- }
235+ for (int k=0 ;k<32 ;k+=2 )
236+ {
237+ uint32_t t2_0 = B32_0 (a[(k + 2 ) & 0x1f ]), t1_0 = B32_0 (a[(k + 1 ) & 0x1f ]);
238+ uint32_t t6_1 = B32_1 (a[(k + 6 ) & 0x1f ]), t5_1 = B32_1 (a[(k + 5 ) & 0x1f ]);
239+ uint32_t t10_2 = B32_2 (a[(k + 10 ) & 0x1f ]), t9_2 = B32_2 (a[(k + 9 ) & 0x1f ]);
240+ uint32_t t22_3 = B32_3 (a[(k + 22 ) & 0x1f ]), t13_3 = B32_3 (a[(k + 13 ) & 0x1f ]);
241+
242+ t[k + 0 ] = T0up ( t2_0 ) ^ T1up ( t6_1 ) ^ T2up ( t10_2 ) ^ T3up ( t22_3 ) ^
243+ T0dn ( t1_0 ) ^ T1dn ( t5_1 ) ^ T2dn ( t9_2 ) ^ T3dn ( t13_3 );
244+
245+ t[k + 1 ] = T0dn ( t2_0 ) ^ T1dn ( t6_1 ) ^ T2dn ( t10_2 ) ^ T3dn ( t22_3 ) ^
246+ T0up ( t1_0 ) ^ T1up ( t5_1 ) ^ T2up ( t9_2 ) ^ T3up ( t13_3 );
247+ }
258248#pragma unroll 32
259- for (int k=0 ;k<32 ;k++)
260- a[k] = t[k];
261- }
249+ for (int k=0 ;k<32 ;k++)
250+ a[k] = t[k];
251+ }
262252}
263253#if USE_SHARED
264254__global__ void /* __launch_bounds__(256) */
@@ -271,14 +261,17 @@ __global__ void
271261#if USE_SHARED
272262 extern __shared__ char mixtabs[];
273263
274- *((uint32_t *)mixtabs + ( threadIdx .x )) = tex1Dfetch (t0up1, threadIdx .x );
275- *((uint32_t *)mixtabs + (256 +threadIdx .x )) = tex1Dfetch (t0dn1, threadIdx .x );
276- *((uint32_t *)mixtabs + (512 +threadIdx .x )) = tex1Dfetch (t1up1, threadIdx .x );
277- *((uint32_t *)mixtabs + (768 +threadIdx .x )) = tex1Dfetch (t1dn1, threadIdx .x );
278- *((uint32_t *)mixtabs + (1024 +threadIdx .x )) = tex1Dfetch (t2up1, threadIdx .x );
279- *((uint32_t *)mixtabs + (1280 +threadIdx .x )) = tex1Dfetch (t2dn1, threadIdx .x );
280- *((uint32_t *)mixtabs + (1536 +threadIdx .x )) = tex1Dfetch (t3up1, threadIdx .x );
281- *((uint32_t *)mixtabs + (1792 +threadIdx .x )) = tex1Dfetch (t3dn1, threadIdx .x );
264+ if (threadIdx .x < 256 )
265+ {
266+ *((uint32_t *)mixtabs + ( threadIdx .x )) = tex1Dfetch (t0up1, threadIdx .x );
267+ *((uint32_t *)mixtabs + (256 +threadIdx .x )) = tex1Dfetch (t0dn1, threadIdx .x );
268+ *((uint32_t *)mixtabs + (512 +threadIdx .x )) = tex1Dfetch (t1up1, threadIdx .x );
269+ *((uint32_t *)mixtabs + (768 +threadIdx .x )) = tex1Dfetch (t1dn1, threadIdx .x );
270+ *((uint32_t *)mixtabs + (1024 +threadIdx .x )) = tex1Dfetch (t2up1, threadIdx .x );
271+ *((uint32_t *)mixtabs + (1280 +threadIdx .x )) = tex1Dfetch (t2dn1, threadIdx .x );
272+ *((uint32_t *)mixtabs + (1536 +threadIdx .x )) = tex1Dfetch (t3up1, threadIdx .x );
273+ *((uint32_t *)mixtabs + (1792 +threadIdx .x )) = tex1Dfetch (t3dn1, threadIdx .x );
274+ }
282275
283276 __syncthreads ();
284277#endif
@@ -407,8 +400,11 @@ __global__ void
407400// Setup-Funktionen
408401__host__ void groestlcoin_cpu_init (int thr_id, int threads)
409402{
410- cudaSetDevice (device_map[thr_id]);
411- cudaDeviceSetCacheConfig ( cudaFuncCachePreferShared );
403+ cudaSetDevice (device_map[thr_id]);
404+
405+ cudaGetDeviceProperties (&props, device_map[thr_id]);
406+
407+ cudaDeviceSetCacheConfig ( cudaFuncCachePreferL1 );
412408// Texturen mit obigem Makro initialisieren
413409 texDef (t0up1, d_T0up, T0up_cpu, sizeof (uint32_t )*256 );
414410 texDef (t0dn1, d_T0dn, T0dn_cpu, sizeof (uint32_t )*256 );
@@ -452,11 +448,9 @@ __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
452448
453449__host__ void groestlcoin_cpu_hash (int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
454450{
455- #if USE_SHARED
456- const int threadsperblock = 256 ; // Alignment mit mixtab Grösse. NICHT ÄNDERN
457- #else
458- const int threadsperblock = 512 ; // so einstellen wie gewünscht ;-)
459- #endif
451+ // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
452+ // alle anderen mit 512 Threads.
453+ int threadsperblock = (props.major >= 3 ) ? 768 : 512 ;
460454
461455 // berechne wie viele Thread Blocks wir brauchen
462456 dim3 grid ((threads + threadsperblock-1 )/threadsperblock);
0 commit comments