Skip to content

Commit 4251dd5

Browse files
committed
add performance results
1 parent 94ca9b4 commit 4251dd5

File tree

2 files changed

+33
-8
lines changed

2 files changed

+33
-8
lines changed

README.md

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,41 @@ python setup.py install
3838

3939
| attr | 1 workers (gensim) | 2 workers (gensim) | 4 workers (gensim) | 8 workers (gensim) | NVIDIA T4 (cusim) |
4040
|:--------------------|---------------------:|---------------------:|---------------------:|---------------------:|--------------------:|
41-
| training time (sec) | 892.596 | 544.212 | 310.727 | 226.472 | **16.1615** |
42-
| pearson | 0.487832 | 0.487696 | 0.482821 | 0.487136 | **0.492101** |
43-
| spearman | 0.500846 | 0.506214 | 0.501048 | **0.506718** | 0.479468 |
41+
| training time (sec) | 892.596 | 544.212 | 310.727 | 226.472 | **16.162** |
42+
| pearson | 0.487832 | 0.487696 | 0.482821 | 0.487136 | **0.492101** |
43+
| spearman | 0.500846 | 0.506214 | 0.501048 | **0.506718** | 0.479468 |
44+
45+
- W2V (skip gram, negative sampling)
46+
47+
| attr | 1 workers (gensim) | 2 workers (gensim) | 4 workers (gensim) | 8 workers (gensim) | NVIDIA T4 (cusim) |
48+
|:--------------------|---------------------:|---------------------:|---------------------:|---------------------:|--------------------:|
49+
| training time (sec) | 586.545 | 340.489 | 220.804 | 146.23 | 33.9173 |
50+
| pearson | 0.354448 | 0.353952 | 0.352398 | 0.352925 | 0.342999 |
51+
| spearman | 0.369146 | 0.369365 | 0.370565 | 0.365822 | 0.339781 |
52+
53+
- W2V (CBOW, hierarchical softmax)
54+
55+
| attr | 1 workers (gensim) | 2 workers (gensim) | 4 workers (gensim) | 8 workers (gensim) | NVIDIA T4 (cusim) |
56+
|:--------------------|---------------------:|---------------------:|---------------------:|---------------------:|--------------------:|
57+
| training time (sec) | 250.135 | 155.121 | 103.57 | 73.8073 | **6.20787** |
58+
| pearson | 0.309651 | 0.321803 | 0.324854 | 0.314255 | **0.480298** |
59+
| spearman | 0.294047 | 0.308723 | 0.318293 | 0.300591 | **0.480971** |
60+
61+
- W2V (CBOW, negative sampling)
62+
63+
| attr | 1 workers (gensim) | 2 workers (gensim) | 4 workers (gensim) | 8 workers (gensim) | NVIDIA T4 (cusim) |
64+
|:--------------------|---------------------:|---------------------:|---------------------:|---------------------:|--------------------:|
65+
| training time (sec) | 176.923 | 100.369 | 69.7829 | 49.9274 | **9.90391** |
66+
| pearson | 0.18772 | 0.193152 | 0.204509 | 0.187924 | **0.368202** |
67+
| spearman | 0.243975 | 0.24587 | 0.260531 | 0.237441 | **0.358042** |
4468

4569
- LDA (`nytimes` dataset from https://archive.ics.uci.edu/ml/datasets/bag+of+words)
4670
- I found that setting the `workers` parameter in gensim's `LdaMulticore` does not work properly (it uses all cores of the instance anyway), so I only compared the speed of cusim on a single GPU against gensim on 8 vcpus.
4771
- One can compare the quality of modeling by looking at `examples/cusim.topics.txt` and `examples/gensim.topics.txt`.
4872

73+
| attr | gensim (8 vcpus) | cusim |
74+
|:--------------------|------------------:|--------:|
75+
| training time (sec) | 447.376 | 76.6972 |
4976

5077
### Future tasks
5178

cpp/include/cuw2v/cuda_w2v_base_kernels.cuh

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#pragma once
77
#include "utils/cuda_utils_kernels.cuh"
88

9-
// #define MAX_EXP 6
9+
#define MAX_EXP 20
1010

1111
namespace cusim {
1212

@@ -15,8 +15,7 @@ __inline__ __device__
1515
void PositiveFeedback(const float* vec1, float* vec2, float* grad,
1616
float& loss_nume, float& loss_deno, const int num_dims, const float lr) {
1717
static __shared__ float g;
18-
float dot = Dot(vec1, vec2, num_dims);
19-
// if (dot <= -MAX_EXP or dot >= MAX_EXP) return;
18+
float dot = fmaxf(-MAX_EXP, fminf(MAX_EXP, Dot(vec1, vec2, num_dims)));
2019
if (threadIdx.x == 0) {
2120
float exp_dot = expf(-dot);
2221
g = exp_dot / (1 + exp_dot) * lr;
@@ -35,8 +34,7 @@ __inline__ __device__
3534
void NegativeFeedback(const float* vec1, float* vec2, float* grad,
3635
float& loss_nume, float& loss_deno, const int num_dims, const float lr) {
3736
static __shared__ float g;
38-
float dot = Dot(vec1, vec2, num_dims);
39-
// if (dot <= -MAX_EXP or dot >= MAX_EXP) return;
37+
float dot = fmaxf(-MAX_EXP, fminf(MAX_EXP, Dot(vec1, vec2, num_dims)));
4038
if (threadIdx.x == 0) {
4139
float exp_dot = expf(dot);
4240
g = exp_dot / (1 + exp_dot) * lr;

0 commit comments

Comments
 (0)