// Copyright (c) 2021 Jisang Yoon
// All rights reserved.
//
// This source code is licensed under the Apache 2.0 license found in the
// LICENSE file in the root directory of this source tree.
#pragma once
#include "utils/cuda_utils_kernels.cuh"
#include "cuw2v/cuda_w2v_base_kernels.cuh"


namespace cusim {

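// Skip-gram with negative sampling. The corpus is given as a CSR matrix:
// row i spans cols[indptr[i]..indptr[i + 1]) and holds the word ids of one
// sentence. Each thread block walks over rows in a grid-stride loop and, for
// every center word, trains against the context words inside a randomly
// reduced window plus `neg` words drawn from the precomputed unigram
// `random_table`. PositiveFeedback / NegativeFeedback (from
// cuda_w2v_base_kernels.cuh) are assumed to update the output embeddings and
// accumulate the input-side gradient into `grad` cooperatively block-wide.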
__global__ void W2VNegSgKernel(
    const int* cols, const int* indptr,
    const int* random_table, default_random_engine* rngs, const int random_size,
    const int num_indptr, const int num_dims, const int neg, const int window_size,
    float* emb_in, float* emb_out, float* loss_nume, float* loss_deno, const float lr) {

  default_random_engine& rng = rngs[blockIdx.x];
  float& _loss_nume = loss_nume[blockIdx.x];
  float& _loss_deno = loss_deno[blockIdx.x];

  uniform_int_distribution<int> dist_neg(0, random_size - 1);
  uniform_int_distribution<int> dist_window(0, window_size - 1);
  __shared__ int reduced_windows;
  __shared__ int neg_word;
  extern __shared__ float shared_memory[];
  float* grad = &shared_memory[0];
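  // dynamic shared memory layout: grad (num_dims floats); the launch must
  // pass at least num_dims * sizeof(float) as its shared-memory argument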

  // zero-initialize shared mem
  for (int i = threadIdx.x; i < num_dims; i += blockDim.x)
    grad[i] = 0.0f;
  __syncthreads();

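  // grid-stride loop: block blockIdx.x processes rows i, i + gridDim.x, ...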
  for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
    int beg = indptr[i], end = indptr[i + 1];
    for (int j = beg; j < end; ++j) {
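      // thread 0 samples a gensim-style "reduced window" and broadcasts it
      // through shared memory so the whole block sees the same context span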
      if (threadIdx.x == 0) reduced_windows = dist_window(rng);
      __syncthreads();
      int beg2 = max(beg, j - window_size + reduced_windows);
      int end2 = min(end, j + window_size - reduced_windows + 1);
      // extra barrier: keeps thread 0 from overwriting reduced_windows in the
      // next iteration before every thread has read it (the context may be
      // empty, in which case none of the barriers below are reached)
      __syncthreads();
      float* _emb_in = emb_in + num_dims * cols[j];
      for (int k = beg2; k < end2; ++k) {
        if (k == j) continue;
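        // positive pair: the center word's input vector against the context
        // word's output vector; the input-side gradient accumulates in grad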
        PositiveFeedback(_emb_in, emb_out + num_dims * cols[k],
                         grad, _loss_nume, _loss_deno, num_dims, lr);
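        // negative samples: thread 0 draws a word id from the unigram table;
        // the whole block then applies the corresponding update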
        for (int l = 0; l < neg; ++l) {
          if (threadIdx.x == 0) neg_word = random_table[dist_neg(rng)];
          __syncthreads();
          NegativeFeedback(_emb_in, emb_out + num_dims * neg_word,
                           grad, _loss_nume, _loss_deno, num_dims, lr);
        }
        __syncthreads();
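        // flush the accumulated gradient into the center word's input
        // embedding, then clear it for the next center/context pair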
        for (int l = threadIdx.x; l < num_dims; l += blockDim.x) {
          emb_in[num_dims * cols[j] + l] += grad[l];
          grad[l] = 0.0f;
        }
        __syncthreads();
      }
    }
  }
}

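// CBOW with negative sampling. Same CSR traversal as the skip-gram kernel
// above, but the context vectors inside the reduced window are summed (or
// averaged when use_mean is true) into `cbow`, which then plays the
// input-side role against the center word and the negative samples; the
// resulting gradient is scattered back to every context word's input vector.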
__global__ void W2VNegCbowKernel(
    const int* cols, const int* indptr,
    const int* random_table, default_random_engine* rngs, const int random_size,
    const int num_indptr, const int num_dims, const int neg, const int window_size,
    float* emb_in, float* emb_out,
    float* loss_nume, float* loss_deno, const bool use_mean, const float lr) {

  default_random_engine& rng = rngs[blockIdx.x];
  float& _loss_nume = loss_nume[blockIdx.x];
  float& _loss_deno = loss_deno[blockIdx.x];

  uniform_int_distribution<int> dist_neg(0, random_size - 1);
  uniform_int_distribution<int> dist_window(0, window_size - 1);
  __shared__ int reduced_windows;
  __shared__ int neg_word;
  extern __shared__ float shared_memory[];
  float* grad = &shared_memory[0];
  float* cbow = &shared_memory[num_dims];
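  // dynamic shared memory layout: grad (num_dims floats) followed by cbow
  // (num_dims floats); the launch must pass at least
  // 2 * num_dims * sizeof(float)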

  __syncthreads();

  for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
    int beg = indptr[i], end = indptr[i + 1];
    for (int j = beg; j < end; ++j) {
      if (threadIdx.x == 0) reduced_windows = dist_window(rng);
      __syncthreads();
      int beg2 = max(beg, j - window_size + reduced_windows);
      int end2 = min(end, j + window_size - reduced_windows + 1);
      // extra barrier: keeps thread 0 from overwriting reduced_windows in the
      // next iteration before every thread has read it, since the continue
      // below skips all remaining barriers for this center word
      __syncthreads();
      if (end2 - beg2 <= 1) continue;

      // zero-initialize shared mem
      for (int k = threadIdx.x; k < num_dims; k += blockDim.x) {
        grad[k] = 0.0f;
        cbow[k] = 0.0f;
      }

      // compute cbow
      for (int k = beg2; k < end2; ++k) {
        if (k == j) continue;
        for (int l = threadIdx.x; l < num_dims; l += blockDim.x) {
          cbow[l] += emb_in[num_dims * cols[k] + l];
        }
      }
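      // CBOW-mean: average over the (end2 - beg2 - 1) context words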
      if (use_mean) {
        for (int k = threadIdx.x; k < num_dims; k += blockDim.x) {
          cbow[k] /= (end2 - beg2 - 1);
        }
      }
      __syncthreads();

      PositiveFeedback(cbow, emb_out + num_dims * cols[j], grad,
                       _loss_nume, _loss_deno, num_dims, lr);
      __syncthreads();

      // update negative feedback
      for (int k = 0; k < neg; ++k) {
        if (threadIdx.x == 0) neg_word = random_table[dist_neg(rng)];
        __syncthreads();
        NegativeFeedback(cbow, emb_out + num_dims * neg_word,
                         grad, _loss_nume, _loss_deno, num_dims, lr);
      }
      __syncthreads();

      // normalize grad if use_mean is true
      if (use_mean) {
        for (int k = threadIdx.x; k < num_dims; k += blockDim.x) {
          grad[k] /= (end2 - beg2 - 1);
        }
      }
      __syncthreads();

      // update emb_in
      for (int k = beg2; k < end2; ++k) {
        if (k == j) continue;
        for (int l = threadIdx.x; l < num_dims; l += blockDim.x)
          emb_in[num_dims * cols[k] + l] += grad[l];
      }
      __syncthreads();
    }
  }
}

}  // namespace cusim
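
// Illustrative host-side launches (a sketch, not part of the original file;
// grid/block sizes and the device pointers are hypothetical placeholders).
// Note the shared-memory sizes: num_dims floats for skip-gram, 2 * num_dims
// for CBOW, matching the layouts documented above.
//
//   W2VNegSgKernel<<<num_blocks, num_threads, num_dims * sizeof(float)>>>(
//       cols, indptr, random_table, rngs, random_size,
//       num_indptr, num_dims, neg, window_size,
//       emb_in, emb_out, loss_nume, loss_deno, lr);
//
//   W2VNegCbowKernel<<<num_blocks, num_threads, 2 * num_dims * sizeof(float)>>>(
//       cols, indptr, random_table, rngs, random_size,
//       num_indptr, num_dims, neg, window_size,
//       emb_in, emb_out, loss_nume, loss_deno, use_mean, lr);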