@@ -13,9 +13,23 @@ IoUtils::IoUtils() {
13
13
14
14
// Nothing to release by hand: the destructor body is empty, so member
// destructors do all the cleanup.
IoUtils::~IoUtils() = default;
15
15
16
- std::vector<std::string> IoUtils::parse_line (std::string line) {
16
+ bool IoUtils::Init (std::string opt_path) {
17
+ std::ifstream in (opt_path.c_str ());
18
+ if (not in.is_open ()) return false ;
19
+
20
+ std::string str ((std::istreambuf_iterator<char >(in)),
21
+ std::istreambuf_iterator<char >());
22
+ std::string err_cmt;
23
+ auto _opt = json11::Json::parse (str, err_cmt);
24
+ if (not err_cmt.empty ()) return false ;
25
+ opt_ = _opt;
26
+ CuSimLogger ().set_log_level (opt_[" c_log_level" ].int_value ());
27
+ return true ;
28
+ }
29
+
30
+ void IoUtils::ParseLine (std::string line, std::vector<std::string>& ret) {
31
+ ret.clear ();
17
32
int n = line.size ();
18
- std::vector<std::string> ret;
19
33
std::string element;
20
34
for (int i = 0 ; i < n; ++i) {
21
35
if (line[i] == ' ' ) {
@@ -28,26 +42,42 @@ std::vector<std::string> IoUtils::parse_line(std::string line) {
28
42
if (element.size () > 0 ) {
29
43
ret.push_back (element);
30
44
}
31
- return ret;
32
45
}
33
46
34
- void IoUtils::LoadGensimVocab (std::string filepath, int min_count) {
35
- INFO (" read gensim file to generate vocabulary: {}, min_count: {}" , filepath, min_count);
36
- std::ifstream fin (filepath.c_str ());
37
- std::unordered_map<std::string, int > word_count;
38
- while (not fin.eof ()) {
39
- std::string line;
40
- getline (fin, line);
41
- std::vector<std::string> line_vec = parse_line (line);
47
+ int IoUtils::LoadStreamFile (std::string filepath) {
48
+ INFO (" read gensim file to generate vocabulary: {}" , filepath);
49
+ stream_fin_.open (filepath.c_str ());
50
+ int count = 0 ;
51
+ std::string line;
52
+ while (getline (stream_fin_, line))
53
+ count++;
54
+ stream_fin_.close ();
55
+ stream_fin_.open (filepath.c_str ());
56
+ word_idmap_.clear ();
57
+ word_list_.clear ();
58
+ word_count_.clear ();
59
+ return count;
60
+ }
61
+
62
+ int IoUtils::ReadStreamForVocab (int num_lines) {
63
+ int read_cnt = 0 ;
64
+ std::string line;
65
+ std::vector<std::string> line_vec;
66
+ while (getline (stream_fin_, line) and read_cnt < num_lines) {
67
+ ParseLine (line, line_vec);
42
68
for (auto & word: line_vec) {
43
- if (not word_count .count (word)) word_count [word] = 0 ;
44
- word_count [word]++;
69
+ if (not word_count_ .count (word)) word_count_ [word] = 0 ;
70
+ word_count_ [word]++;
45
71
}
72
+ read_cnt++;
46
73
}
47
- INFO (" number of raw words: {}" , word_count.size ());
48
- word_idmap_.clear ();
49
- word_list_.clear ();
50
- for (auto & it: word_count) {
74
+ if (read_cnt < num_lines) stream_fin_.close ();
75
+ return read_cnt;
76
+ }
77
+
78
+ void IoUtils::GetWordVocab (int min_count) {
79
+ INFO (" number of raw words: {}" , word_count_.size ());
80
+ for (auto & it: word_count_) {
51
81
if (it.second >= min_count) {
52
82
word_idmap_[it.first ] = word_idmap_.size ();
53
83
word_list_.push_back (it.first );
0 commit comments