Skip to content

Commit 68915a8

Browse files
committed
implement tokenization and use keras progressbar
1 parent a08ed01 commit 68915a8

File tree

4 files changed

+262
-14
lines changed

4 files changed

+262
-14
lines changed

cpp/src/ioutils.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ void IoUtils::ParseLineImpl(std::string line, std::vector<std::string>& ret) {
4141
ret.push_back(element);
4242
element.clear();
4343
} else if (line[i] != '"') {
44-
element += line[i];
44+
element += std::tolower(line[i]);
4545
}
4646
}
4747
if (element.size() > 0) {
@@ -51,18 +51,17 @@ void IoUtils::ParseLineImpl(std::string line, std::vector<std::string>& ret) {
5151

5252
int IoUtils::LoadStreamFile(std::string filepath) {
5353
INFO("read gensim file to generate vocabulary: {}", filepath);
54+
if (stream_fin_.is_open()) stream_fin_.close();
5455
stream_fin_.open(filepath.c_str());
5556
int count = 0;
5657
std::string line;
5758
while (getline(stream_fin_, line))
5859
count++;
5960
stream_fin_.close();
6061
stream_fin_.open(filepath.c_str());
61-
word_idmap_.clear();
62-
word_list_.clear();
63-
word_count_.clear();
6462
num_lines_ = count;
6563
remain_lines_ = num_lines_;
64+
INFO("number of lines: {}", num_lines_);
6665
return count;
6766
}
6867

@@ -91,7 +90,7 @@ std::pair<int, int> IoUtils::TokenizeStream(int num_lines, int num_threads) {
9190

9291
// tokenize
9392
for (auto& word: line_vec) {
94-
if (word_count_.count(word)) continue;
93+
if (not word_count_.count(word)) continue;
9594
indices_[i].push_back(word_count_[word]);
9695
}
9796
}
@@ -149,6 +148,7 @@ std::pair<int, int> IoUtils::ReadStreamForVocab(int num_lines, int num_threads)
149148
}
150149
}
151150
}
151+
if (not remain_lines_) stream_fin_.close();
152152
return {read_lines, word_count_.size()};
153153
}
154154

cusim/aux.py

Lines changed: 219 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,14 @@
33
#
44
# This source code is licensed under the Apache 2.0 license found in the
55
# LICENSE file in the root directory of this source tree.
6+
import os
67
import re
8+
import sys
79
import json
10+
import time
811
import logging
912
import logging.handlers
10-
13+
import numpy as np
1114
import jsmin
1215
from google.protobuf.json_format import Parse, MessageToDict
1316

@@ -117,3 +120,218 @@ def __getstate__(self):
117120

118121
def __setstate__(self, state):
119122
vars(self).update(state)
123+
124+
# reference: https://github.com/tensorflow/tensorflow/blob/
125+
# 85c8b2a817f95a3e979ecd1ed95bff1dc1335cff/tensorflow/python/
126+
# keras/utils/generic_utils.py#L483
127+
class Progbar:
128+
# pylint: disable=too-many-branches,too-many-statements,invalid-name
129+
# pylint: disable=blacklisted-name,no-else-return
130+
"""Displays a progress bar.
131+
Arguments:
132+
target: Total number of steps expected, None if unknown.
133+
width: Progress bar width on screen.
134+
verbose: Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose)
135+
stateful_metrics: Iterable of string names of metrics that should *not* be
136+
averaged over time. Metrics in this list will be displayed as-is. All
137+
others will be averaged by the progbar before display.
138+
interval: Minimum visual progress update interval (in seconds).
139+
unit_name: Display name for step counts (usually "step" or "sample").
140+
"""
141+
142+
def __init__(self,
143+
target,
144+
width=30,
145+
verbose=1,
146+
interval=0.05,
147+
stateful_metrics=None,
148+
unit_name='step'):
149+
self.target = target
150+
self.width = width
151+
self.verbose = verbose
152+
self.interval = interval
153+
self.unit_name = unit_name
154+
if stateful_metrics:
155+
self.stateful_metrics = set(stateful_metrics)
156+
else:
157+
self.stateful_metrics = set()
158+
159+
self._dynamic_display = ((hasattr(sys.stdout, 'isatty') and
160+
sys.stdout.isatty()) or
161+
'ipykernel' in sys.modules or
162+
'posix' in sys.modules or
163+
'PYCHARM_HOSTED' in os.environ)
164+
self._total_width = 0
165+
self._seen_so_far = 0
166+
# We use a dict + list to avoid garbage collection
167+
# issues found in OrderedDict
168+
self._values = {}
169+
self._values_order = []
170+
self._start = time.time()
171+
self._last_update = 0
172+
173+
self._time_after_first_step = None
174+
175+
def update(self, current, values=None, finalize=None):
176+
"""Updates the progress bar.
177+
Arguments:
178+
current: Index of current step.
179+
values: List of tuples: `(name, value_for_last_step)`. If `name` is in
180+
`stateful_metrics`, `value_for_last_step` will be displayed as-is.
181+
Else, an average of the metric over time will be displayed.
182+
finalize: Whether this is the last update for the progress bar. If
183+
`None`, defaults to `current >= self.target`.
184+
"""
185+
if finalize is None:
186+
if self.target is None:
187+
finalize = False
188+
else:
189+
finalize = current >= self.target
190+
191+
values = values or []
192+
for k, v in values:
193+
if k not in self._values_order:
194+
self._values_order.append(k)
195+
if k not in self.stateful_metrics:
196+
# In the case that progress bar doesn't have a target value in the first
197+
# epoch, both on_batch_end and on_epoch_end will be called, which will
198+
# cause 'current' and 'self._seen_so_far' to have the same value. Force
199+
# the minimal value to 1 here, otherwise stateful_metric will be 0s.
200+
value_base = max(current - self._seen_so_far, 1)
201+
if k not in self._values:
202+
self._values[k] = [v * value_base, value_base]
203+
else:
204+
self._values[k][0] += v * value_base
205+
self._values[k][1] += value_base
206+
else:
207+
# Stateful metrics output a numeric value. This representation
208+
# means "take an average from a single value" but keeps the
209+
# numeric formatting.
210+
self._values[k] = [v, 1]
211+
self._seen_so_far = current
212+
213+
now = time.time()
214+
info = ' - %.0fs' % (now - self._start)
215+
if self.verbose == 1:
216+
if now - self._last_update < self.interval and not finalize:
217+
return
218+
219+
prev_total_width = self._total_width
220+
if self._dynamic_display:
221+
sys.stdout.write('\b' * prev_total_width)
222+
sys.stdout.write('\r')
223+
else:
224+
sys.stdout.write('\n')
225+
226+
if self.target is not None:
227+
numdigits = int(np.log10(self.target)) + 1
228+
bar = ('%' + str(numdigits) + 'd/%d [') % (current, self.target)
229+
prog = float(current) / self.target
230+
prog_width = int(self.width * prog)
231+
if prog_width > 0:
232+
bar += ('=' * (prog_width - 1))
233+
if current < self.target:
234+
bar += '>'
235+
else:
236+
bar += '='
237+
bar += ('.' * (self.width - prog_width))
238+
bar += ']'
239+
else:
240+
bar = '%7d/Unknown' % current
241+
242+
self._total_width = len(bar)
243+
sys.stdout.write(bar)
244+
245+
time_per_unit = self._estimate_step_duration(current, now)
246+
247+
if self.target is None or finalize:
248+
if time_per_unit >= 1 or time_per_unit == 0:
249+
info += ' %.0fs/%s' % (time_per_unit, self.unit_name)
250+
elif time_per_unit >= 1e-3:
251+
info += ' %.0fms/%s' % (time_per_unit * 1e3, self.unit_name)
252+
else:
253+
info += ' %.0fus/%s' % (time_per_unit * 1e6, self.unit_name)
254+
else:
255+
eta = time_per_unit * (self.target - current)
256+
if eta > 3600:
257+
eta_format = '%d:%02d:%02d' % (eta // 3600,
258+
(eta % 3600) // 60, eta % 60)
259+
elif eta > 60:
260+
eta_format = '%d:%02d' % (eta // 60, eta % 60)
261+
else:
262+
eta_format = '%ds' % eta
263+
264+
info = ' - ETA: %s' % eta_format
265+
266+
for k in self._values_order:
267+
info += ' - %s:' % k
268+
if isinstance(self._values[k], list):
269+
avg = np.mean(self._values[k][0] / max(1, self._values[k][1]))
270+
if abs(avg) > 1e-3:
271+
info += ' %.4f' % avg
272+
else:
273+
info += ' %.4e' % avg
274+
else:
275+
info += ' %s' % self._values[k]
276+
277+
self._total_width += len(info)
278+
if prev_total_width > self._total_width:
279+
info += (' ' * (prev_total_width - self._total_width))
280+
281+
if finalize:
282+
info += '\n'
283+
284+
sys.stdout.write(info)
285+
sys.stdout.flush()
286+
287+
elif self.verbose == 2:
288+
if finalize:
289+
numdigits = int(np.log10(self.target)) + 1
290+
count = ('%' + str(numdigits) + 'd/%d') % (current, self.target)
291+
info = count + info
292+
for k in self._values_order:
293+
info += ' - %s:' % k
294+
avg = np.mean(self._values[k][0] / max(1, self._values[k][1]))
295+
if avg > 1e-3:
296+
info += ' %.4f' % avg
297+
else:
298+
info += ' %.4e' % avg
299+
info += '\n'
300+
301+
sys.stdout.write(info)
302+
sys.stdout.flush()
303+
304+
self._last_update = now
305+
306+
def add(self, n, values=None):
307+
self.update(self._seen_so_far + n, values)
308+
309+
def _estimate_step_duration(self, current, now):
310+
"""Estimate the duration of a single step.
311+
Given the step number `current` and the corresponding time `now`
312+
this function returns an estimate for how long a single step
313+
takes. If this is called before one step has been completed
314+
(i.e. `current == 0`) then zero is given as an estimate. The duration
315+
estimate ignores the duration of the (assumed to be non-representative)
316+
first step for estimates when more steps are available (i.e. `current>1`).
317+
Arguments:
318+
current: Index of current step.
319+
now: The current time.
320+
Returns: Estimate of the duration of a single step.
321+
"""
322+
if current:
323+
# there are a few special scenarios here:
324+
# 1) somebody is calling the progress bar without ever supplying step 1
325+
# 2) somebody is calling the progress bar and supplies step one multiple
326+
# times, e.g. as part of a finalizing call
327+
# in these cases, we just fall back to the simple calculation
328+
if self._time_after_first_step is not None and current > 1:
329+
time_per_unit = (now - self._time_after_first_step) / (current - 1)
330+
else:
331+
time_per_unit = (now - self._start) / current
332+
333+
if current == 1:
334+
self._time_after_first_step = now
335+
return time_per_unit
336+
else:
337+
return 0

cusim/ioutils/pyioutils.py

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@
1010

1111
import json
1212
import tempfile
13-
import tqdm
13+
14+
import h5py
15+
import numpy as np
16+
1417
from cusim import aux
1518
from cusim.ioutils.ioutils_bind import IoUtilsBind
1619
from cusim.config_pb2 import IoUtilsConfigProto
@@ -32,22 +35,50 @@ def __init__(self, opt=None):
3235

3336
def load_stream_vocab(self, filepath, min_count, keys_path):
3437
full_num_lines = self.obj.load_stream_file(filepath)
35-
pbar = tqdm.trange(full_num_lines, unit="line",
36-
postfix={"word_count": 0})
38+
pbar = aux.Progbar(full_num_lines, unit_name="line",
39+
stateful_metrics=["word_count"])
3740
processed = 0
3841
while True:
3942
read_lines, word_count = \
4043
self.obj.read_stream_for_vocab(
4144
self.opt.chunk_lines, self.opt.num_threads)
4245
processed += read_lines
43-
pbar.set_postfix({"word_count": word_count}, refresh=False)
44-
pbar.update(read_lines)
46+
pbar.update(processed, values=[("word_count", word_count)])
4547
if processed == full_num_lines:
4648
break
47-
pbar.close()
4849
self.obj.get_word_vocab(min_count, keys_path)
4950

50-
def convert_stream_to_h5(self, filepath, min_count, out_dir):
51+
def convert_stream_to_h5(self, filepath, min_count, out_dir,
52+
chunk_indices=10000):
5153
os.makedirs(out_dir, exist_ok=True)
5254
keys_path = pjoin(out_dir, "keys.csv")
55+
token_path = pjoin(out_dir, "token.h5")
56+
self.logger.info("save key and token to %s, %s",
57+
keys_path, token_path)
5358
self.load_stream_vocab(filepath, min_count, keys_path)
59+
full_num_lines = self.obj.load_stream_file(filepath)
60+
pbar = aux.Progbar(full_num_lines, unit_name="line")
61+
processed = 0
62+
h5f = h5py.File(token_path, "w")
63+
indices = h5f.create_dataset("indices", shape=(chunk_indices,),
64+
maxshape=(None,), dtype=np.int32,
65+
chunks=(chunk_indices,))
66+
indptr = h5f.create_dataset("indptr", shape=(full_num_lines + 1,),
67+
dtype=np.int32, chunks=True)
68+
processed, offset = 1, 0
69+
indptr[0] = 0
70+
while True:
71+
read_lines, data_size = self.obj.tokenize_stream(
72+
self.opt.chunk_lines, self.opt.num_threads)
73+
_indices = np.empty(shape=(data_size,), dtype=np.int32)
74+
_indptr = np.empty(shape=(read_lines,), dtype=np.int32)
75+
self.obj.get_token(_indices, _indptr, offset)
76+
indices.resize((offset + data_size,))
77+
indices[offset:offset + data_size] = _indices
78+
indptr[processed:processed + read_lines] = _indptr
79+
offset += data_size
80+
processed += read_lines
81+
pbar.update(processed - 1)
82+
if processed == full_num_lines + 1:
83+
break
84+
h5f.close()

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
tqdm
21
jsmin
32
numpy
43
pandas

0 commit comments

Comments
 (0)