@@ -28,14 +28,19 @@ bool IoUtils::Init(std::string opt_path) {
28
28
}
29
29
30
30
// Tokenize one raw text line into words, storing the result in `ret`.
// Thin public entry point that delegates to ParseLineImpl, which holds the
// actual splitting rules.
//
// @param line  raw input line (taken by value; moved into the impl below)
// @param ret   output vector of tokens; cleared and refilled by the impl
void IoUtils::ParseLine(std::string line, std::vector<std::string>& ret) {
  // ParseLineImpl also takes its string by value, so move instead of
  // copying a second time; `line` is not used afterwards.
  ParseLineImpl(std::move(line), ret);
}
35
+ void IoUtils::ParseLineImpl (std::string line, std::vector<std::string>& ret) {
31
36
ret.clear ();
32
37
int n = line.size ();
33
38
std::string element;
34
39
for (int i = 0 ; i < n; ++i) {
35
- if (line[i] == ' ' ) {
40
+ if (line[i] == ' ' or line[i] == ' , ' ) {
36
41
ret.push_back (element);
37
42
element.clear ();
38
- } else {
43
+ } else if (line[i] != ' " ' ) {
39
44
element += line[i];
40
45
}
41
46
}
@@ -61,6 +66,56 @@ int IoUtils::LoadStreamFile(std::string filepath) {
61
66
return count;
62
67
}
63
68
69
+ std::pair<int , int > IoUtils::TokenizeStream (int num_lines, int num_threads) {
70
+ int read_lines = std::min (num_lines, remain_lines_);
71
+ if (not read_lines) return {0 , 0 };
72
+ remain_lines_ -= read_lines;
73
+ indices_.clear ();
74
+ indices_.resize (read_lines);
75
+ indptr_.resize (read_lines);
76
+ std::fill (indptr_.begin (), indptr_.end (), 0 );
77
+ #pragma omp parallel num_threads(num_threads)
78
+ {
79
+ std::string line;
80
+ std::vector<std::string> line_vec;
81
+ #pragma omp for schedule(dynamic, 4)
82
+ for (int i = 0 ; i < read_lines; ++i) {
83
+ // get line thread-safely
84
+ {
85
+ std::unique_lock<std::mutex> lock (global_lock_);
86
+ getline (stream_fin_, line);
87
+ }
88
+
89
+ // seems to be bottle-neck
90
+ ParseLine (line, line_vec);
91
+
92
+ // tokenize
93
+ for (auto & word: line_vec) {
94
+ if (word_count_.count (word)) continue ;
95
+ indices_[i].push_back (word_count_[word]);
96
+ }
97
+ }
98
+ }
99
+ int cumsum = 0 ;
100
+ for (int i = 0 ; i < read_lines; ++i) {
101
+ cumsum += indices_[i].size ();
102
+ indptr_[i] = cumsum;
103
+ }
104
+ return {read_lines, indptr_[read_lines - 1 ]};
105
+ }
106
+
107
+ void IoUtils::GetToken (int * indices, int * indptr, int offset) {
108
+ int n = indices_.size ();
109
+ for (int i = 0 ; i < n; ++i) {
110
+ int beg = i == 0 ? 0 : indptr_[i - 1 ];
111
+ int end = indptr_[i];
112
+ for (int j = beg; j < end; ++j) {
113
+ indices[j] = indices_[i][j - beg];
114
+ }
115
+ indptr[i] = offset + indptr_[i];
116
+ }
117
+ }
118
+
64
119
std::pair<int , int > IoUtils::ReadStreamForVocab (int num_lines, int num_threads) {
65
120
int read_lines = std::min (num_lines, remain_lines_);
66
121
remain_lines_ -= read_lines;
@@ -77,7 +132,7 @@ std::pair<int, int> IoUtils::ReadStreamForVocab(int num_lines, int num_threads)
77
132
getline (stream_fin_, line);
78
133
}
79
134
80
- // seems to bottle-neck
135
+ // seems to be bottle-neck
81
136
ParseLine (line, line_vec);
82
137
83
138
// update private word count
@@ -94,10 +149,10 @@ std::pair<int, int> IoUtils::ReadStreamForVocab(int num_lines, int num_threads)
94
149
}
95
150
}
96
151
}
97
- return {read_lines, remain_lines_ };
152
+ return {read_lines, word_count_. size () };
98
153
}
99
154
100
- void IoUtils::GetWordVocab (int min_count) {
155
+ void IoUtils::GetWordVocab (int min_count, std::string keys_path ) {
101
156
INFO (" number of raw words: {}" , word_count_.size ());
102
157
for (auto & it: word_count_) {
103
158
if (it.second >= min_count) {
@@ -106,6 +161,18 @@ void IoUtils::GetWordVocab(int min_count) {
106
161
}
107
162
}
108
163
INFO (" number of words after filtering: {}" , word_list_.size ());
164
+
165
+ // write keys to csv file
166
+ std::ofstream fout (keys_path.c_str ());
167
+ INFO (" dump keys to {}" , keys_path);
168
+ std::string header = " index,key\n " ;
169
+ fout.write (header.c_str (), header.size ());
170
+ int n = word_list_.size ();
171
+ for (int i = 0 ; i < n; ++i) {
172
+ std::string line = std::to_string (i) + " ,\" " + word_list_[i] + " \"\n " ;
173
+ fout.write (line.c_str (), line.size ());
174
+ }
175
+ fout.close ();
109
176
}
110
177
111
178
} // namespace cusim
0 commit comments