@@ -13,47 +13,166 @@ IoUtils::IoUtils() {
13
13
14
14
// Nothing to release explicitly; members clean up via their own destructors.
IoUtils::~IoUtils() = default;
- std::vector<std::string> IoUtils::parse_line (std::string line) {
16
+ bool IoUtils::Init (std::string opt_path) {
17
+ std::ifstream in (opt_path.c_str ());
18
+ if (not in.is_open ()) return false ;
19
+
20
+ std::string str ((std::istreambuf_iterator<char >(in)),
21
+ std::istreambuf_iterator<char >());
22
+ std::string err_cmt;
23
+ auto _opt = json11::Json::parse (str, err_cmt);
24
+ if (not err_cmt.empty ()) return false ;
25
+ opt_ = _opt;
26
+ CuSimLogger ().set_log_level (opt_[" c_log_level" ].int_value ());
27
+ return true ;
28
+ }
29
+
30
+ void IoUtils::ParseLine (std::string line, std::vector<std::string>& ret) {
31
+ ParseLineImpl (line, ret);
32
+ }
33
+
34
+
35
+ void IoUtils::ParseLineImpl (std::string line, std::vector<std::string>& ret) {
36
+ ret.clear ();
17
37
int n = line.size ();
18
- std::vector<std::string> ret;
19
38
std::string element;
20
39
for (int i = 0 ; i < n; ++i) {
21
- if (line[i] == ' ' ) {
40
+ if (line[i] == ' ' or line[i] == ' , ' ) {
22
41
ret.push_back (element);
23
42
element.clear ();
24
- } else {
25
- element += line[i];
43
+ } else if (line[i] != ' " ' ) {
44
+ element += std::tolower ( line[i]) ;
26
45
}
27
46
}
28
47
if (element.size () > 0 ) {
29
48
ret.push_back (element);
30
49
}
31
- return ret;
32
50
}
33
51
34
- void IoUtils::LoadGensimVocab (std::string filepath, int min_count) {
35
- INFO (" read gensim file to generate vocabulary: {}, min_count: {}" , filepath, min_count);
36
- std::ifstream fin (filepath.c_str ());
37
- std::unordered_map<std::string, int > word_count;
38
- while (not fin.eof ()) {
52
+ int IoUtils::LoadStreamFile (std::string filepath) {
53
+ INFO (" read gensim file to generate vocabulary: {}" , filepath);
54
+ if (stream_fin_.is_open ()) stream_fin_.close ();
55
+ stream_fin_.open (filepath.c_str ());
56
+ int count = 0 ;
57
+ std::string line;
58
+ while (getline (stream_fin_, line))
59
+ count++;
60
+ stream_fin_.close ();
61
+ stream_fin_.open (filepath.c_str ());
62
+ num_lines_ = count;
63
+ remain_lines_ = num_lines_;
64
+ INFO (" number of lines: {}" , num_lines_);
65
+ return count;
66
+ }
67
+
68
+ std::pair<int , int > IoUtils::TokenizeStream (int num_lines, int num_threads) {
69
+ int read_lines = std::min (num_lines, remain_lines_);
70
+ if (not read_lines) return {0 , 0 };
71
+ remain_lines_ -= read_lines;
72
+ indices_.clear ();
73
+ indices_.resize (read_lines);
74
+ indptr_.resize (read_lines);
75
+ std::fill (indptr_.begin (), indptr_.end (), 0 );
76
+ #pragma omp parallel num_threads(num_threads)
77
+ {
78
+ std::string line;
79
+ std::vector<std::string> line_vec;
80
+ #pragma omp for schedule(dynamic, 4)
81
+ for (int i = 0 ; i < read_lines; ++i) {
82
+ // get line thread-safely
83
+ {
84
+ std::unique_lock<std::mutex> lock (global_lock_);
85
+ getline (stream_fin_, line);
86
+ }
87
+
88
+ // seems to be bottle-neck
89
+ ParseLine (line, line_vec);
90
+
91
+ // tokenize
92
+ for (auto & word: line_vec) {
93
+ if (not word_count_.count (word)) continue ;
94
+ indices_[i].push_back (word_count_[word]);
95
+ }
96
+ }
97
+ }
98
+ int cumsum = 0 ;
99
+ for (int i = 0 ; i < read_lines; ++i) {
100
+ cumsum += indices_[i].size ();
101
+ indptr_[i] = cumsum;
102
+ }
103
+ return {read_lines, indptr_[read_lines - 1 ]};
104
+ }
105
+
106
+ void IoUtils::GetToken (int * indices, int * indptr, int offset) {
107
+ int n = indices_.size ();
108
+ for (int i = 0 ; i < n; ++i) {
109
+ int beg = i == 0 ? 0 : indptr_[i - 1 ];
110
+ int end = indptr_[i];
111
+ for (int j = beg; j < end; ++j) {
112
+ indices[j] = indices_[i][j - beg];
113
+ }
114
+ indptr[i] = offset + indptr_[i];
115
+ }
116
+ }
117
+
118
+ std::pair<int , int > IoUtils::ReadStreamForVocab (int num_lines, int num_threads) {
119
+ int read_lines = std::min (num_lines, remain_lines_);
120
+ remain_lines_ -= read_lines;
121
+ #pragma omp parallel num_threads(num_threads)
122
+ {
39
123
std::string line;
40
- getline (fin, line);
41
- std::vector<std::string> line_vec = parse_line (line);
42
- for (auto & word: line_vec) {
43
- if (not word_count.count (word)) word_count[word] = 0 ;
44
- word_count[word]++;
124
+ std::vector<std::string> line_vec;
125
+ std::unordered_map<std::string, int > word_count;
126
+ #pragma omp for schedule(dynamic, 4)
127
+ for (int i = 0 ; i < read_lines; ++i) {
128
+ // get line thread-safely
129
+ {
130
+ std::unique_lock<std::mutex> lock (global_lock_);
131
+ getline (stream_fin_, line);
132
+ }
133
+
134
+ // seems to be bottle-neck
135
+ ParseLine (line, line_vec);
136
+
137
+ // update private word count
138
+ for (auto & word: line_vec) {
139
+ word_count[word]++;
140
+ }
141
+ }
142
+
143
+ // update word count to class variable
144
+ {
145
+ std::unique_lock<std::mutex> lock (global_lock_);
146
+ for (auto & it: word_count) {
147
+ word_count_[it.first ] += it.second ;
148
+ }
45
149
}
46
150
}
47
- INFO (" number of raw words: {}" , word_count.size ());
48
- word_idmap_.clear ();
49
- word_list_.clear ();
50
- for (auto & it: word_count) {
151
+ if (not remain_lines_) stream_fin_.close ();
152
+ return {read_lines, word_count_.size ()};
153
+ }
154
+
155
+ void IoUtils::GetWordVocab (int min_count, std::string keys_path) {
156
+ INFO (" number of raw words: {}" , word_count_.size ());
157
+ for (auto & it: word_count_) {
51
158
if (it.second >= min_count) {
52
- word_idmap_[it.first ] = vocab_ .size ();
159
+ word_idmap_[it.first ] = word_idmap_ .size ();
53
160
word_list_.push_back (it.first );
54
161
}
55
162
}
56
163
INFO (" number of words after filtering: {}" , word_list_.size ());
164
+
165
+ // write keys to csv file
166
+ std::ofstream fout (keys_path.c_str ());
167
+ INFO (" dump keys to {}" , keys_path);
168
+ std::string header = " index,key\n " ;
169
+ fout.write (header.c_str (), header.size ());
170
+ int n = word_list_.size ();
171
+ for (int i = 0 ; i < n; ++i) {
172
+ std::string line = std::to_string (i) + " ,\" " + word_list_[i] + " \"\n " ;
173
+ fout.write (line.c_str (), line.size ());
174
+ }
175
+ fout.close ();
57
176
}
58
177
59
178
} // namespace cusim
0 commit comments