@@ -56,28 +56,45 @@ int IoUtils::LoadStreamFile(std::string filepath) {
56
56
word_idmap_.clear ();
57
57
word_list_.clear ();
58
58
word_count_.clear ();
59
+ num_lines_ = count;
60
+ remain_lines_ = num_lines_;
59
61
return count;
60
62
}
61
63
62
- std::pair<int , bool > IoUtils::ReadStreamForVocab (int num_lines) {
63
- int read_cnt = 0 ;
64
- std::string line;
65
- std::vector<std::string> line_vec;
66
- while (not stream_fin_.eof () and read_cnt < num_lines) {
67
- getline (stream_fin_, line);
68
- ParseLine (line, line_vec);
69
- for (auto & word: line_vec) {
70
- if (not word_count_.count (word)) word_count_[word] = 0 ;
71
- word_count_[word]++;
64
+ std::pair<int , int > IoUtils::ReadStreamForVocab (int num_lines, int num_threads) {
65
+ int read_lines = std::min (num_lines, remain_lines_);
66
+ remain_lines_ -= read_lines;
67
+ #pragma omp parallel num_threads(num_threads)
68
+ {
69
+ std::string line;
70
+ std::vector<std::string> line_vec;
71
+ std::unordered_map<std::string, int > word_count;
72
+ #pragma omp for schedule(dynamic, 4)
73
+ for (int i = 0 ; i < read_lines; ++i) {
74
+ // read the next line while holding global_lock_, since all threads share stream_fin_
75
+ {
76
+ std::unique_lock<std::mutex> lock (global_lock_);
77
+ getline (stream_fin_, line);
78
+ }
79
+
80
+ // NOTE: the ParseLine call below seems to be the bottleneck (presumably; TODO confirm by profiling)
81
+ ParseLine (line, line_vec);
82
+
83
+ // accumulate into this thread's private word_count map (no lock needed)
84
+ for (auto & word: line_vec) {
85
+ word_count[word]++;
86
+ }
87
+ }
88
+
89
+ // merge this thread's private counts into the shared word_count_ member under global_lock_
90
+ {
91
+ std::unique_lock<std::mutex> lock (global_lock_);
92
+ for (auto & it: word_count) {
93
+ word_count_[it.first ] += it.second ;
94
+ }
72
95
}
73
- read_cnt++;
74
- }
75
- bool finished = false ;
76
- if (stream_fin_.eof ()) {
77
- stream_fin_.close ();
78
- finished = true ;
79
96
}
80
- return {read_cnt, finished };
97
+ return {read_lines, remain_lines_ };
81
98
}
82
99
83
100
void IoUtils::GetWordVocab (int min_count) {
0 commit comments