Commit bab04b0

train only train samples
1 parent 36d92ca commit bab04b0

8 files changed: +98 −24 lines changed

cpp/include/culda/cuda_lda_kernels.cuh

Lines changed: 33 additions & 14 deletions

@@ -26,17 +26,19 @@ float Digamma(float x) {
 }
 
 __global__ void EstepKernel(
-    const int* cols, const int* indptr,
+    const int* cols, const int* indptr, const bool* vali,
     const int num_cols, const int num_indptr,
     const int num_words, const int num_topics, const int num_iters,
     float* gamma, float* new_gamma, float* phi,
     float* alpha, float* beta,
-    float* grad_alpha, float* new_beta) {
+    float* grad_alpha, float* new_beta, float* train_losses, float* vali_losses) {
 
   // storage for block
   float* _gamma = gamma + num_topics * blockIdx.x;
   float* _new_gamma = new_gamma + num_topics * blockIdx.x;
   float* _phi = phi + num_topics * blockIdx.x;
+  float* _grad_alpha = grad_alpha + num_topics * blockIdx.x;
+
 
   for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
     int beg = indptr[i], end = indptr[i + 1];
@@ -56,18 +58,34 @@ __global__ void EstepKernel(
       // compute phi from gamma
       for (int k = beg; k < end; ++k) {
         const int w = cols[k];
+        const bool _vali = vali[k];
+
         // compute phi
-        for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
-          _phi[l] = beta[w * num_topics + l] * expf(Digamma(_gamma[l]));
-        __syncthreads();
-
-        // normalize phi and add it to new gamma and new beta
-        float phi_sum = ReduceSum(_phi, num_topics);
-        for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
-          _phi[l] /= phi_sum;
-          _new_gamma[l] += _phi[l];
-          if (j + 1 == num_iters)
-            new_beta[w * num_topics + l] += phi[l];
+        if (not _vali or j + 1 == num_iters) {
+          for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
+            _phi[l] = beta[w * num_topics + l] * expf(Digamma(_gamma[l]));
+          __syncthreads();
+
+          // normalize phi and add it to new gamma and new beta
+          float phi_sum = ReduceSum(_phi, num_topics);
+          for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
+            _phi[l] /= phi_sum;
+            if (not _vali) _new_gamma[l] += _phi[l];
+            if (j + 1 == num_iters) {
+              if (not _vali) new_beta[w * num_topics + l] += _phi[l];
+              _phi[l] *= beta[w * num_topics + l];
+            }
+          }
+          __syncthreads();
+        }
+        if (j + 1 == num_iters) {
+          float p = ReduceSum(_phi, num_topics);
+          if (threadIdx.x == 0) {
+            if (_vali)
+              vali_losses[blockIdx.x] += logf(p + EPS);
+            else
+              train_losses[blockIdx.x] += logf(p + EPS);
+          }
         }
         __syncthreads();
       }
@@ -79,7 +97,8 @@ __global__ void EstepKernel(
     }
     float gamma_sum = ReduceSum(_gamma, num_topics);
     for (int j = threadIdx.x; j < num_topics; j += blockDim.x)
-      grad_alpha[j] += (Digamma(_gamma[j]) - Digamma(gamma_sum));
+      _grad_alpha[j] += (Digamma(_gamma[j]) - Digamma(gamma_sum));
+
     __syncthreads();
   }
 }
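
The change above gates the variational updates on the new vali mask: validation tokens never contribute to new_gamma, new_beta, or grad_alpha, and on the last iteration the kernel accumulates per-token log likelihoods into separate per-block train/validation loss buffers. Below is a minimal single-document NumPy/SciPy sketch of that logic, not the kernel itself; the function name, the explicit alpha argument, and the dense-array layout are illustrative assumptions.

import numpy as np
from scipy.special import digamma

EPS = 1e-10

def e_step_doc(words, vali_mask, alpha, gamma, beta, new_beta, num_iters):
    # words: token ids of one document; vali_mask[k] is True for held-out tokens
    train_loss, vali_loss = 0.0, 0.0
    for it in range(num_iters):
        last = (it + 1 == num_iters)
        new_gamma = np.zeros_like(gamma)
        for w, is_vali in zip(words, vali_mask):
            phi = None
            if not is_vali or last:
                phi = beta[w] * np.exp(digamma(gamma))
                phi /= phi.sum()
                if not is_vali:
                    new_gamma += phi        # only train tokens update gamma
                if last:
                    if not is_vali:
                        new_beta[w] += phi  # only train tokens update beta
                    phi = phi * beta[w]     # per-topic contribution to p(w)
            if last:                        # phi is always set on the last iteration
                p = float(phi.sum())
                if is_vali:
                    vali_loss += np.log(p + EPS)
                else:
                    train_loss += np.log(p + EPS)
        gamma[:] = alpha + new_gamma        # standard variational gamma update (outside the shown diff)
    return train_loss, vali_loss

Because the log likelihoods are non-positive, the returned sums are negative; the Python layer negates them before averaging (see pyculda.py below).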

cpp/include/culda/culda.hpp

Lines changed: 2 additions & 1 deletion

@@ -64,7 +64,8 @@ class CuLDA {
   bool Init(std::string opt_path);
   void LoadModel(float* alpha, float* beta,
       float* grad_alpha, float* new_beta, const int num_words);
-  void FeedData(const int* indices, const int* indptr,
+  std::pair<float, float> FeedData(
+      const int* indices, const int* indptr, const bool* vali,
       const int num_indices, const int num_indptr, const int num_iters);
   void Pull();
   void Push();

cpp/include/utils/types.hpp

Lines changed: 1 addition & 0 deletions

@@ -11,3 +11,4 @@ struct DeviceInfo {
 };
 
 #define WARP_SIZE 32
+#define EPS 1e-10f

cpp/src/culda/culda.cu

Lines changed: 25 additions & 2 deletions

@@ -61,27 +61,50 @@ void CuLDA::LoadModel(float* alpha, float* beta,
   CHECK_CUDA(cudaDeviceSynchronize());
 }
 
-void CuLDA::FeedData(const int* cols, const int* indptr,
+std::pair<float, float> CuLDA::FeedData(
+    const int* cols, const int* indptr, const bool* vali,
     const int num_cols, const int num_indptr, const int num_iters) {
+
+  // copy feed data to GPU memory
   thrust::device_vector<int> dev_cols(num_cols);
   thrust::device_vector<int> dev_indptr(num_indptr + 1);
+  thrust::device_vector<bool> dev_vali(num_cols);
+  thrust::device_vector<float> dev_train_losses(block_cnt_, 0.0f);
+  thrust::device_vector<float> dev_vali_losses(block_cnt_, 0.0f);
   thrust::copy(cols, cols + num_cols, dev_cols.begin());
   thrust::copy(indptr, indptr + num_indptr + 1, dev_indptr.begin());
+  thrust::copy(vali, vali + num_cols, dev_vali.begin());
+
   CHECK_CUDA(cudaDeviceSynchronize());
 
+  // run E step in GPU
   EstepKernel<<<block_cnt_, block_dim_>>>(
     thrust::raw_pointer_cast(dev_cols.data()),
     thrust::raw_pointer_cast(dev_indptr.data()),
+    thrust::raw_pointer_cast(dev_vali.data()),
     num_cols, num_indptr, num_words_, num_topics_, num_iters,
     thrust::raw_pointer_cast(dev_gamma_.data()),
     thrust::raw_pointer_cast(dev_new_gamma_.data()),
     thrust::raw_pointer_cast(dev_phi_.data()),
     thrust::raw_pointer_cast(dev_alpha_.data()),
     thrust::raw_pointer_cast(dev_beta_.data()),
     thrust::raw_pointer_cast(dev_grad_alpha_.data()),
-    thrust::raw_pointer_cast(dev_new_beta_.data()));
+    thrust::raw_pointer_cast(dev_new_beta_.data()),
+    thrust::raw_pointer_cast(dev_train_losses.data()),
+    thrust::raw_pointer_cast(dev_vali_losses.data()));
 
   CHECK_CUDA(cudaDeviceSynchronize());
+
+  // pull loss
+  std::vector<float> train_losses(block_cnt_), vali_losses(block_cnt_);
+  thrust::copy(dev_train_losses.begin(), dev_train_losses.end(), train_losses.begin());
+  thrust::copy(dev_vali_losses.begin(), dev_vali_losses.end(), vali_losses.begin());
+  CHECK_CUDA(cudaDeviceSynchronize());
+
+  // accumulate
+  float train_loss = std::accumulate(train_losses.begin(), train_losses.end(), 0.0f);
+  float vali_loss = std::accumulate(vali_losses.begin(), vali_losses.end(), 0.0f);
+  return {train_loss, vali_loss};
 }
 
 void CuLDA::Pull() {

cusim/culda/bindings.cc

Lines changed: 9 additions & 4 deletions

@@ -14,6 +14,7 @@ namespace py = pybind11;
 
 typedef py::array_t<float, py::array::c_style | py::array::forcecast> float_array;
 typedef py::array_t<int, py::array::c_style | py::array::forcecast> int_array;
+typedef py::array_t<bool, py::array::c_style | py::array::forcecast> bool_array;
 
 class CuLDABind {
  public:
@@ -54,17 +55,21 @@ class CuLDABind {
         _new_beta.mutable_data(0), num_words);
   }
 
-  void FeedData(py::object& cols, py::object indptr, const int num_iters) {
+  std::pair<float, float> FeedData(py::object& cols, py::object& indptr, py::object& vali, const int num_iters) {
     int_array _cols(cols);
     int_array _indptr(indptr);
+    bool_array _vali(vali);
     auto cols_buffer = _cols.request();
     auto indptr_buffer = _indptr.request();
-    if (cols_buffer.ndim != 1 or indptr_buffer.ndim != 1) {
+    auto vali_buffer = _vali.request();
+    if (cols_buffer.ndim != 1 or indptr_buffer.ndim != 1 or vali_buffer.ndim != 1
+        or cols_buffer.shape[0] != vali_buffer.shape[0]) {
       throw std::runtime_error("invalid cols or indptr");
     }
     int num_cols = cols_buffer.shape[0];
    int num_indptr = indptr_buffer.shape[0];
-    obj_.FeedData(_cols.data(0), _indptr.data(0), num_cols, num_indptr, num_iters);
+    return obj_.FeedData(_cols.data(0), _indptr.data(0), _vali.data(0),
+                         num_cols, num_indptr, num_iters);
   }
 
   void Pull() {
@@ -93,7 +98,7 @@ PYBIND11_PLUGIN(culda_bind) {
       py::arg("alpha"), py::arg("beta"),
      py::arg("grad_alpha"), py::arg("new_beta"))
  .def("feed_data", &CuLDABind::FeedData,
-      py::arg("cols"), py::arg("indptr"), py::arg("num_iters"))
+      py::arg("cols"), py::arg("indptr"), py::arg("vali"), py::arg("num_iters"))
  .def("pull", &CuLDABind::Pull)
  .def("push", &CuLDABind::Push)
  .def("get_block_cnt", &CuLDABind::GetBlockCnt)

cusim/culda/pyculda.py

Lines changed: 21 additions & 3 deletions

@@ -92,7 +92,9 @@ def train_model(self):
 
   def _train_e_step(self, h5f):
     offset, size = 0, h5f["cols"].shape[0]
-    pbar = aux.Progbar(size)
+    pbar = aux.Progbar(size, stateful_metrics=["train_loss", "vali_loss"])
+    train_loss_nume, train_loss_deno = 0, 0
+    vali_loss_nume, vali_loss_deno = 0, 0
     while True:
       target = h5f["indptr"][offset] + self.opt.batch_size
       if target < size:
@@ -103,10 +105,26 @@ def _train_e_step(self, h5f):
       beg, end = indptr[0], indptr[-1]
       indptr -= beg
       cols = h5f["cols"][beg:end]
+      vali = (h5f["vali"][beg:end] < self.opt.vali_p).astype(np.bool)
       offset = next_offset
 
-      self.obj.FeedData(cols, indptr, self.opt.num_iters_in_e_step)
-      pbar.update(end)
+      # call cuda kernel
+      train_loss, vali_loss = \
+        self.obj.FeedData(cols, indptr, vali, self.opt.num_iters_in_e_step)
+
+      # accumulate loss
+      train_loss_nume -= train_loss
+      vali_loss_nume -= vali_loss
+      vali_cnt = np.count_nonzero(vali)
+      train_cnt = len(vali) - vali_cnt
+      train_loss_deno += train_cnt
+      vali_loss_deno += vali_cnt
+      train_loss = train_loss_nume / train_loss_deno
+      vali_loss = vali_loss_nume / vali_loss_deno
+
+      # update progress bar
+      pbar.update(end, values=[("train_loss", train_loss),
+                               ("vali_loss", vali_loss)])
       if end == size:
         break
cusim/ioutils/pyioutils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ def convert_stream_to_h5(self, filepath, min_count, out_dir,
6666
cols = h5f.create_dataset("cols", shape=(chunk_indices,),
6767
maxshape=(None,), dtype=np.int32,
6868
chunks=(chunk_indices,))
69+
vali = h5f.create_dataset("vali", shape=(chunk_indices,),
70+
maxshape=(None,), dtype=np.float32,
71+
chunks=(chunk_indices,))
6972
indptr = h5f.create_dataset("indptr", shape=(full_num_lines + 1,),
7073
dtype=np.int32, chunks=True)
7174
processed, offset = 1, 0
@@ -81,6 +84,9 @@ def convert_stream_to_h5(self, filepath, min_count, out_dir,
8184
rows[offset:offset + data_size] = _rows + (processed - 1)
8285
cols.resize((offset + data_size,))
8386
cols[offset:offset + data_size] = _cols
87+
vali.resize((offset + data_size,))
88+
vali[offset:offset + data_size] = \
89+
np.uniform(size=(data_size,)).astype(np.float32)
8490
indptr[processed:processed + read_lines] = _indptr + offset
8591
offset += data_size
8692
processed += read_lines
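
The converter now stores one uniform draw in [0, 1) per token occurrence in the "vali" dataset; at training time pyculda.py thresholds those draws by vali_p, so roughly a vali_p fraction of tokens is held out for validation. A short illustrative snippet (variable names are not from the code base):

import numpy as np

vali_draws = np.random.uniform(size=8).astype(np.float32)  # as stored in the h5 "vali" dataset
vali_p = 0.2                                               # CuLDAConfigProto.vali_p default
vali_mask = vali_draws < vali_p                            # True -> held-out (validation) token
train_mask = ~vali_mask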

cusim/proto/config.proto

Lines changed: 1 addition & 0 deletions

@@ -28,4 +28,5 @@ message CuLDAConfigProto {
   optional int32 batch_size = 10 [default = 100000];
   optional int32 epochs = 11 [default = 10];
   optional int32 num_iters_in_e_step = 12 [default = 5];
+  optional double vali_p = 13 [default = 0.2];
 }
