@@ -146,7 +146,8 @@ struct ScoreFunction
146146 static constexpr unsigned long long maxNumberOfSynapses = populationThreshold * numberOfNeighbors;
147147 static constexpr unsigned long long initNumberOfSynapses = numberOfNeurons * numberOfNeighbors;
148148 static constexpr long long radius = (long long )numberOfNeighbors / 2 ;
149- static constexpr long long paddingNeuronsCount = maxNumberOfNeurons + numberOfNeighbors;
149+ static constexpr long long paddingNeuronsCount = (maxNumberOfNeurons + numberOfNeighbors + BATCH_SIZE - 1 ) / BATCH_SIZE * BATCH_SIZE;
150+ static constexpr long long incommingSynapsesPitch = (numberOfNeighbors + 1 + BATCH_SIZE - 1 ) / BATCH_SIZE * BATCH_SIZE;
150151
151152 static_assert (numberOfInputNeurons % 64 == 0 , " numberOfInputNeurons must be divided by 64" );
152153 static_assert (numberOfOutputNeurons % 64 == 0 , " numberOfOutputNeurons must be divided by 64" );
@@ -216,7 +217,8 @@ struct ScoreFunction
216217
217218 Neuron* neurons;
218219 // Pad the start and end of the neurons so that we can reduce the condition checking
219- Neuron paddingNeurons[maxNumberOfNeurons + numberOfNeighbors];
220+ // Also pad the end so that the element count is a multiple of BATCH_SIZE
221+ Neuron paddingNeurons[paddingNeuronsCount];
220222 NeuronType neuronTypes[maxNumberOfNeurons];
221223 Synapse synapses[maxNumberOfSynapses];
222224
@@ -241,8 +243,7 @@ struct ScoreFunction
241243 unsigned long long removalNeuronsCount;
242244
243245 // Contains the incoming synapses of each neuron. The center one will be zero
244- Synapse incommingSynapses[maxNumberOfSynapses + maxNumberOfNeurons];
245-
246+ Synapse incommingSynapses[maxNumberOfNeurons * incommingSynapsesPitch];
246247
247248 void mutate (unsigned long long mutateStep)
248249 {
@@ -596,15 +597,14 @@ struct ScoreFunction
596597 setMem (neuronValueBuffer, sizeof (neuronValueBuffer), 0 );
597598 Neuron* pPaddingNeurons = currentANN.paddingNeurons ;
598599 Synapse* synapses = incommingSynapses;
599- Neuron* neurons = currentANN.neurons ;
600600
601- for (unsigned long long n = 0 ; n < population; ++n, pPaddingNeurons++, synapses += (numberOfNeighbors + 1 ) )
601+ for (unsigned long long n = 0 ; n < population; ++n, pPaddingNeurons++, synapses += incommingSynapsesPitch )
602602 {
603603 int neuronValue = 0 ;
604- long long m = 0 ;
605604#if defined (__AVX512F__)
606605 const __m512i zeros512 = _mm512_setzero_si512 ();
607- for (; m + BATCH_SIZE <= numberOfNeighbors; m += BATCH_SIZE)
606+ const __m512i allOnes512 = _mm512_set1_epi8 (char (-1 ));
607+ for (long long m = 0 ; m < incommingSynapsesPitch; m += BATCH_SIZE)
608608 {
609609 const __m512i neurons512 = _mm512_loadu_si512 ((const __m512i*)(pPaddingNeurons + m));
610610 const __m512i synapses512 = _mm512_loadu_si512 ((const __m512i*)(synapses + m));
@@ -622,7 +622,7 @@ struct ScoreFunction
622622 const __m256i allOnes256 = _mm256_set1_epi8 (-1 );
623623 unsigned int negMask = 0 ;
624624 unsigned int posMask = 0 ;
625- for (; m + BATCH_SIZE <= numberOfNeighbors ; m += BATCH_SIZE)
625+ for (long long m = 0 ; m < incommingSynapsesPitch ; m += BATCH_SIZE)
626626 {
627627 const __m256i neurons256 = _mm256_loadu_si256 ((const __m256i*)(pPaddingNeurons + m));
628628 const __m256i synapses256 = _mm256_loadu_si256 ((const __m256i*)(synapses + m));
@@ -642,20 +642,10 @@ struct ScoreFunction
642642 }
643643
644644#endif
645-
646- for (; m <= numberOfNeighbors; ++m)
647- {
648- const Synapse synapseWeight = synapses[m];
649- const Neuron nVal = pPaddingNeurons[m];
650-
651- // Weight-sum
652- neuronValue += synapseWeight * nVal;
653- }
654-
655645 neuronValueBuffer[n] = (Neuron)clampNeuron (neuronValue);
656646 }
657647
658- copyMem (neurons, neuronValueBuffer, population * sizeof (Neuron));
648+ copyMem (currentANN. neurons , neuronValueBuffer, population * sizeof (Neuron));
659649 }
660650
661651 void runTickSimulation ()
@@ -682,18 +672,19 @@ struct ScoreFunction
682672 for (long long m = 0 ; m < radius; m++)
683673 {
684674 Synapse synapseWeight = kSynapses [m];
685- unsigned long long nnIndex = clampNeuronIndex (n + m, -( long long )numberOfNeighbors / 2 );
686- incommingSynapses[nnIndex * (numberOfNeighbors + 1 ) + (numberOfNeighbors - m)] = synapseWeight; // need to pad 1
675+ unsigned long long nnIndex = clampNeuronIndex (n + m, -radius );
676+ incommingSynapses[nnIndex * incommingSynapsesPitch + (numberOfNeighbors - m)] = synapseWeight;
687677 }
688678
689- incommingSynapses[n * (numberOfNeighbors + 1 ) + radius] = 0 ;
690-
691679 for (long long m = radius; m < numberOfNeighbors; m++)
692680 {
693681 Synapse synapseWeight = kSynapses [m];
694- unsigned long long nnIndex = clampNeuronIndex (n + m + 1 , -( long long )numberOfNeighbors / 2 );
695- incommingSynapses[nnIndex * (numberOfNeighbors + 1 ) + (numberOfNeighbors - m - 1 )] = synapseWeight;
682+ unsigned long long nnIndex = clampNeuronIndex (n + m + 1 , -radius );
683+ incommingSynapses[nnIndex * incommingSynapsesPitch + (numberOfNeighbors - m - 1 )] = synapseWeight;
696684 }
685+
686+ // The self-incoming synapse is set to zero
687+ incommingSynapses[n * incommingSynapsesPitch + radius] = 0 ;
697688 }
698689
699690 for (unsigned long long tick = 0 ; tick < numberOfTicks; ++tick)
@@ -872,6 +863,7 @@ struct ScoreFunction
872863 Neuron* neurons = currentANN.neurons ;
873864 InitValue* initValue = (InitValue*)paddingInitValue;
874865
866+
875867 // Initialization
876868 population = numberOfNeurons;
877869 removalNeuronsCount = 0 ;
@@ -1000,6 +992,8 @@ struct ScoreFunction
1000992 bool initMemory ()
1001993 {
1002994 random2PoolLock = 0 ;
995+
996+ // Make sure all padding data is set as zeros
1003997 setMem (_computeBuffer, sizeof (_computeBuffer), 0 );
1004998
1005999 for (int i = 0 ; i < solutionBufferCount; i++)
0 commit comments