Skip to content

Commit d7a2094

Browse files
committed
use progressbar in reading stream
1 parent 3fea98a commit d7a2094

File tree

8 files changed

+130
-51
lines changed

8 files changed

+130
-51
lines changed

cpp/include/ioutils.hpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,17 @@ class IoUtils {
3030
public:
3131
IoUtils();
3232
~IoUtils();
33-
void LoadGensimVocab(std::string filepath, int min_count);
33+
bool Init(std::string opt_path);
34+
int LoadStreamFile(std::string filepath);
35+
int ReadStreamForVocab(int num_lines);
36+
void GetWordVocab(int min_count);
3437
private:
35-
std::vector<std::string> parse_line(std::string line);
38+
void ParseLine(std::string line, std::vector<std::string>& line_vec);
3639

40+
std::ifstream stream_fin_;
41+
json11::Json opt_;
3742
std::shared_ptr<spdlog::logger> logger_;
38-
std::unordered_map<std::string, int> word_idmap_;
43+
std::unordered_map<std::string, int> word_idmap_, word_count_;
3944
std::vector<std::string> word_list_;
4045
}; // class IoUtils
4146

cpp/src/ioutils.cc

Lines changed: 47 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,23 @@ IoUtils::IoUtils() {
1313

1414
IoUtils::~IoUtils() {}
1515

16-
std::vector<std::string> IoUtils::parse_line(std::string line) {
16+
bool IoUtils::Init(std::string opt_path) {
17+
std::ifstream in(opt_path.c_str());
18+
if (not in.is_open()) return false;
19+
20+
std::string str((std::istreambuf_iterator<char>(in)),
21+
std::istreambuf_iterator<char>());
22+
std::string err_cmt;
23+
auto _opt = json11::Json::parse(str, err_cmt);
24+
if (not err_cmt.empty()) return false;
25+
opt_ = _opt;
26+
CuSimLogger().set_log_level(opt_["c_log_level"].int_value());
27+
return true;
28+
}
29+
30+
void IoUtils::ParseLine(std::string line, std::vector<std::string>& ret) {
31+
ret.clear();
1732
int n = line.size();
18-
std::vector<std::string> ret;
1933
std::string element;
2034
for (int i = 0; i < n; ++i) {
2135
if (line[i] == ' ') {
@@ -28,26 +42,42 @@ std::vector<std::string> IoUtils::parse_line(std::string line) {
2842
if (element.size() > 0) {
2943
ret.push_back(element);
3044
}
31-
return ret;
3245
}
3346

34-
void IoUtils::LoadGensimVocab(std::string filepath, int min_count) {
35-
INFO("read gensim file to generate vocabulary: {}, min_count: {}", filepath, min_count);
36-
std::ifstream fin(filepath.c_str());
37-
std::unordered_map<std::string, int> word_count;
38-
while (not fin.eof()) {
39-
std::string line;
40-
getline(fin, line);
41-
std::vector<std::string> line_vec = parse_line(line);
47+
// Opens `filepath` as the active input stream and resets all vocabulary
// state (word_idmap_, word_list_, word_count_).
//
// The file is scanned once to count its lines and then reopened, so that
// subsequent ReadStreamForVocab() calls start from the first line.
//
// Returns the number of lines in the file; 0 when the file cannot be opened.
int IoUtils::LoadStreamFile(std::string filepath) {
  INFO("read gensim file to generate vocabulary: {}", filepath);

  // A previous load may have left the stream open (ReadStreamForVocab only
  // closes it when it runs out of lines). open() on an already-open stream
  // fails, which would make this call report 0 lines, so release it first.
  if (stream_fin_.is_open()) stream_fin_.close();
  stream_fin_.clear();

  stream_fin_.open(filepath.c_str());
  int count = 0;
  std::string line;
  while (getline(stream_fin_, line)) count++;

  // Reopen to rewind; a successful open() also clears the eof/fail bits
  // left behind by the counting pass.
  stream_fin_.close();
  stream_fin_.open(filepath.c_str());

  word_idmap_.clear();
  word_list_.clear();
  word_count_.clear();
  return count;
}
61+
62+
int IoUtils::ReadStreamForVocab(int num_lines) {
63+
int read_cnt = 0;
64+
std::string line;
65+
std::vector<std::string> line_vec;
66+
while (getline(stream_fin_, line) and read_cnt < num_lines) {
67+
ParseLine(line, line_vec);
4268
for (auto& word: line_vec) {
43-
if (not word_count.count(word)) word_count[word] = 0;
44-
word_count[word]++;
69+
if (not word_count_.count(word)) word_count_[word] = 0;
70+
word_count_[word]++;
4571
}
72+
read_cnt++;
4673
}
47-
INFO("number of raw words: {}", word_count.size());
48-
word_idmap_.clear();
49-
word_list_.clear();
50-
for (auto& it: word_count) {
74+
if (read_cnt < num_lines) stream_fin_.close();
75+
return read_cnt;
76+
}
77+
78+
void IoUtils::GetWordVocab(int min_count) {
79+
INFO("number of raw words: {}", word_count_.size());
80+
for (auto& it: word_count_) {
5181
if (it.second >= min_count) {
5282
word_idmap_[it.first] = word_idmap_.size();
5383
word_list_.push_back(it.first);

cusim/aux.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
import jsmin
1212
from google.protobuf.json_format import Parse, MessageToDict
13-
from cusim.config_pb2 import ConfigProto
1413

1514
# get_logger and Option refer to
1615
# https://github.com/kakao/buffalo/blob/
@@ -57,7 +56,8 @@ def load_json_file(fname):
5756
return ret
5857

5958
# use protobuf to restrict field and types
60-
def get_opt_as_proto(raw, proto_type=ConfigProto):
59+
def get_opt_as_proto(raw, proto_type=None):
60+
assert proto_type is not None
6161
proto = proto_type()
6262
# convert raw to proto
6363
Parse(json.dumps(Option(raw)), proto)

cusim/ioutils/bindings.cc

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,23 @@ typedef py::array_t<int, py::array::c_style | py::array::forcecast> int_array;
1818
class IoUtilsBind {
1919
public:
2020
IoUtilsBind() {}
21-
void LoadGensimVocab(std::string filepath, int min_count) {
22-
obj_.LoadGensimVocab(filepath, min_count);
21+
22+
bool Init(std::string opt_path) {
23+
return obj_.Init(opt_path);
24+
}
25+
26+
int LoadStreamFile(std::string filepath) {
27+
return obj_.LoadStreamFile(filepath);
28+
}
29+
30+
int ReadStreamForVocab(int num_lines) {
31+
return obj_.ReadStreamForVocab(num_lines);
32+
}
33+
34+
void GetWordVocab(int min_count) {
35+
return obj_.GetWordVocab(min_count);
2336
}
37+
2438
private:
2539
cusim::IoUtils obj_;
2640
};
@@ -30,8 +44,10 @@ PYBIND11_PLUGIN(ioutils_bind) {
3044

3145
py::class_<IoUtilsBind>(m, "IoUtilsBind")
3246
.def(py::init())
33-
.def("load_gensim_vocab", &IoUtilsBind::LoadGensimVocab,
34-
py::arg("filepath"), py::arg("min_count"))
47+
.def("init", &IoUtilsBind::Init, py::arg("opt_path"))
48+
.def("load_stream_file", &IoUtilsBind::LoadStreamFile, py::arg("filepath"))
49+
.def("read_stream_for_vocab", &IoUtilsBind::ReadStreamForVocab, py::arg("num_lines"))
50+
.def("get_word_vocab", &IoUtilsBind::GetWordVocab, py::arg("min_count"))
3551
.def("__repr__",
3652
[](const IoUtilsBind &a) {
3753
return "<IoUtilsBind>";

cusim/ioutils/pyioutils.py

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,37 @@
55
# LICENSE file in the root directory of this source tree.
66

77
# pylint: disable=no-name-in-module,too-few-public-methods,no-member
8+
import os
9+
import json
10+
import tempfile
11+
import tqdm
12+
813
from cusim import aux
914
from cusim.ioutils.ioutils_bind import IoUtilsBind
10-
15+
from cusim.config_pb2 import IoUtilsConfigProto
1116

1217
class IoUtils:
    """Python front end for the ``IoUtilsBind`` extension object.

    Options are validated through ``IoUtilsConfigProto`` and handed to the
    C++ side as a temporary JSON file.
    """

    def __init__(self, opt=None):
        """Build the option proto, set up logging and initialise the binding.

        Args:
            opt: Raw options forwarded to ``aux.get_opt_as_proto``; ``None``
                is treated as an empty dict.

        Raises:
            AssertionError: If the binding reports that it could not load the
                option file.
        """
        self.opt = aux.get_opt_as_proto(opt or {}, IoUtilsConfigProto)
        self.logger = aux.get_logger("ioutils", level=self.opt.py_log_level)

        opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2)
        # delete=False: the file must survive close() so the binding can
        # reopen it by name; it is removed explicitly below.
        with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmp:
            tmp.write(opt_content)

        self.logger.info("opt: %s", opt_content)
        self.obj = IoUtilsBind()
        try:
            # Call init() as a plain statement, not inside ``assert``:
            # ``python -O`` strips assert statements, which would skip the
            # initialisation entirely.
            loaded = self.obj.init(bytes(tmp.name, "utf8"))
        finally:
            # Remove the temp file whether or not init() succeeded.
            os.remove(tmp.name)
        if not loaded:
            raise AssertionError(f"failed to load {tmp.name}")

    def load_stream_vocab(self, filepath, min_count, chunk_lines=100000):
        """Read ``filepath`` in chunks and build the word vocabulary.

        Args:
            filepath: Path passed to the binding's ``load_stream_file``.
            min_count: Threshold passed to the binding's ``get_word_vocab``.
            chunk_lines: Number of lines requested per
                ``read_stream_for_vocab`` call; must be positive.

        Raises:
            ValueError: If ``chunk_lines`` is not positive.
        """
        if chunk_lines <= 0:
            # A non-positive chunk size can never satisfy the
            # ``num_lines < chunk_lines`` exit test reliably and would spin
            # forever.
            raise ValueError(f"chunk_lines must be positive, got {chunk_lines}")
        full_num_lines = self.obj.load_stream_file(filepath)
        # The context manager closes the bar even if the binding raises.
        with tqdm.trange(full_num_lines) as pbar:
            while True:
                num_lines = self.obj.read_stream_for_vocab(chunk_lines)
                pbar.update(num_lines)
                # A short chunk is the stop condition for the read loop.
                if num_lines < chunk_lines:
                    break
        self.obj.get_word_vocab(min_count)

cusim/proto/config.proto

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,22 +6,7 @@
66

77
syntax = "proto2";
88

9-
message ConfigProto {
10-
optional int32 seed = 1 [default = 777];
11-
optional int32 c_log_level = 3 [default = 2];
12-
optional int32 py_log_level = 4 [default = 2];
13-
optional int32 max_m = 5 [default = 12];
14-
optional int32 max_m0 = 6 [default = 24];
15-
optional int32 ef_construction = 7 [default = 150];
16-
// optional int32 ef_search = 8 [default = 50];
17-
optional double level_mult = 9;
18-
optional bool save_remains = 10;
19-
optional double hyper_threads = 11 [default = 10];
20-
optional int32 block_dim = 12 [default = 32];
21-
optional string dist_type = 13 [default = "dot"];
22-
optional int32 visited_table_size = 17;
23-
optional int32 visited_list_size = 14 [default = 8192];
24-
optional bool nrz = 15 [default = true];
25-
optional bool reverse_cand = 16;
26-
optional double heuristic_coef = 18 [default = 0.25];
9+
// Options read by the IoUtils wrapper.
message IoUtilsConfigProto {
  // Level handed to the Python-side logger.
  optional int32 py_log_level = 1 [default = 2];
  // Level handed to the C++-side logger.
  optional int32 c_log_level = 2 [default = 2];
}

examples/example1.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,35 @@
55
# LICENSE file in the root directory of this source tree.
66

77
# pylint: disable=no-name-in-module,logging-format-truncated
8+
import os
9+
import subprocess
810
import fire
11+
12+
from gensim import downloader as api
913
from cusim import aux, IoUtils
1014

1115
LOGGER = aux.get_logger()
12-
CORPORA_PATH = "res/corpora.txt"
16+
DOWNLOAD_PATH = "./res"
17+
DATASET = "wiki-english-20171001"
18+
DATA_PATH = f"./res/{DATASET}.stream.txt"
1319
MIN_COUNT = 5
1420

21+
def download():
    """Fetch DATASET through gensim's downloader and unpack it to DATA_PATH.

    Does nothing when DATA_PATH already exists.

    Raises:
        subprocess.CalledProcessError: If ``gunzip`` exits with a non-zero
            status.
    """
    if os.path.exists(DATA_PATH):
        LOGGER.info("%s already exists", DATA_PATH)
        return
    api.BASE_DIR = DOWNLOAD_PATH
    filepath = api.load(DATASET, return_path=True)
    LOGGER.info("filepath: %s", filepath)

    # Pass an argument list and redirect stdout ourselves instead of building
    # a shell string: paths containing spaces or shell metacharacters are
    # then handled correctly.
    cmd = ["gunzip", "-c", filepath]
    LOGGER.info("cmd: %s > %s", " ".join(cmd), DATA_PATH)

    # Unpack to a side file and rename on success, so a failed or interrupted
    # gunzip never leaves a partial DATA_PATH that the existence check above
    # would accept on the next run.
    tmp_path = DATA_PATH + ".tmp"
    try:
        with open(tmp_path, "wb") as fout:
            subprocess.run(cmd, stdout=fout, check=True)
    except BaseException:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    os.replace(tmp_path, DATA_PATH)
32+
1533
def run():
    """Download the corpus if needed, then build the vocabulary from it."""
    download()
    iou = IoUtils()
    # Use the module-level MIN_COUNT constant rather than repeating its value.
    iou.load_stream_vocab(DATA_PATH, MIN_COUNT)
1837

1938

2039
if __name__ == "__main__":

examples/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
fire
2+
gensim

0 commit comments

Comments
 (0)