Skip to content

Commit 68915a8

Browse files
committed
implement tokenization and use keras progressbar
1 parent a08ed01 commit 68915a8

File tree

4 files changed

+262
-14
lines changed

4 files changed

+262
-14
lines changed

cpp/src/ioutils.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ void IoUtils::ParseLineImpl(std::string line, std::vector<std::string>& ret) {
4141
ret.push_back(element);
4242
element.clear();
4343
} else if (line[i] != '"') {
44-
element += line[i];
44+
element += std::tolower(line[i]);
4545
}
4646
}
4747
if (element.size() > 0) {
@@ -51,18 +51,17 @@ void IoUtils::ParseLineImpl(std::string line, std::vector<std::string>& ret) {
5151

5252
int IoUtils::LoadStreamFile(std::string filepath) {
5353
INFO("read gensim file to generate vocabulary: {}", filepath);
54+
if (stream_fin_.is_open()) stream_fin_.close();
5455
stream_fin_.open(filepath.c_str());
5556
int count = 0;
5657
std::string line;
5758
while (getline(stream_fin_, line))
5859
count++;
5960
stream_fin_.close();
6061
stream_fin_.open(filepath.c_str());
61-
word_idmap_.clear();
62-
word_list_.clear();
63-
word_count_.clear();
6462
num_lines_ = count;
6563
remain_lines_ = num_lines_;
64+
INFO("number of lines: {}", num_lines_);
6665
return count;
6766
}
6867

@@ -91,7 +90,7 @@ std::pair<int, int> IoUtils::TokenizeStream(int num_lines, int num_threads) {
9190

9291
// tokenize
9392
for (auto& word: line_vec) {
94-
if (word_count_.count(word)) continue;
93+
if (not word_count_.count(word)) continue;
9594
indices_[i].push_back(word_count_[word]);
9695
}
9796
}
@@ -149,6 +148,7 @@ std::pair<int, int> IoUtils::ReadStreamForVocab(int num_lines, int num_threads)
149148
}
150149
}
151150
}
151+
if (not remain_lines_) stream_fin_.close();
152152
return {read_lines, word_count_.size()};
153153
}
154154

cusim/aux.py

Lines changed: 219 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,14 @@
33
#
44
# This source code is licensed under the Apache 2.0 license found in the
55
# LICENSE file in the root directory of this source tree.
6+
import os
67
import re
8+
import sys
79
import json
10+
import time
811
import logging
912
import logging.handlers
10-
13+
import numpy as np
1114
import jsmin
1215
from google.protobuf.json_format import Parse, MessageToDict
1316

@@ -117,3 +120,218 @@ def __getstate__(self):
117120

118121
def __setstate__(self, state):
119122
vars(self).update(state)
123+
124+
# reference: https://github.com/tensorflow/tensorflow/blob/
125+
# 85c8b2a817f95a3e979ecd1ed95bff1dc1335cff/tensorflow/python/
126+
# keras/utils/generic_utils.py#L483
127+
class Progbar:
128+
# pylint: disable=too-many-branches,too-many-statements,invalid-name
129+
# pylint: disable=blacklisted-name,no-else-return
130+
"""Displays a progress bar.
131+
Arguments:
132+
target: Total number of steps expected, None if unknown.
133+
width: Progress bar width on screen.
134+
verbose: Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose)
135+
stateful_metrics: Iterable of string names of metrics that should *not* be
136+
averaged over time. Metrics in this list will be displayed as-is. All
137+
others will be averaged by the progbar before display.
138+
interval: Minimum visual progress update interval (in seconds).
139+
unit_name: Display name for step counts (usually "step" or "sample").
140+
"""
141+
142+
def __init__(self,
143+
target,
144+
width=30,
145+
verbose=1,
146+
interval=0.05,
147+
stateful_metrics=None,
148+
unit_name='step'):
149+
self.target = target
150+
self.width = width
151+
self.verbose = verbose
152+
self.interval = interval
153+
self.unit_name = unit_name
154+
if stateful_metrics:
155+
self.stateful_metrics = set(stateful_metrics)
156+
else:
157+
self.stateful_metrics = set()
158+
159+
self._dynamic_display = ((hasattr(sys.stdout, 'isatty') and
160+
sys.stdout.isatty()) or
161+
'ipykernel' in sys.modules or
162+
'posix' in sys.modules or
163+
'PYCHARM_HOSTED' in os.environ)
164+
self._total_width = 0
165+
self._seen_so_far = 0
166+
# We use a dict + list to avoid garbage collection
167+
# issues found in OrderedDict
168+
self._values = {}
169+
self._values_order = []
170+
self._start = time.time()
171+
self._last_update = 0
172+
173+
self._time_after_first_step = None
174+
175+
def update(self, current, values=None, finalize=None):
176+
"""Updates the progress bar.
177+
Arguments:
178+
current: Index of current step.
179+
values: List of tuples: `(name, value_for_last_step)`. If `name` is in
180+
`stateful_metrics`, `value_for_last_step` will be displayed as-is.
181+
Else, an average of the metric over time will be displayed.
182+
finalize: Whether this is the last update for the progress bar. If
183+
`None`, defaults to `current >= self.target`.
184+
"""
185+
if finalize is None:
186+
if self.target is None:
187+
finalize = False
188+
else:
189+
finalize = current >= self.target
190+
191+
values = values or []
192+
for k, v in values:
193+
if k not in self._values_order:
194+
self._values_order.append(k)
195+
if k not in self.stateful_metrics:
196+
# In the case that progress bar doesn't have a target value in the first
197+
# epoch, both on_batch_end and on_epoch_end will be called, which will
198+
# cause 'current' and 'self._seen_so_far' to have the same value. Force
199+
# the minimal value to 1 here, otherwise stateful_metric will be 0s.
200+
value_base = max(current - self._seen_so_far, 1)
201+
if k not in self._values:
202+
self._values[k] = [v * value_base, value_base]
203+
else:
204+
self._values[k][0] += v * value_base
205+
self._values[k][1] += value_base
206+
else:
207+
# Stateful metrics output a numeric value. This representation
208+
# means "take an average from a single value" but keeps the
209+
# numeric formatting.
210+
self._values[k] = [v, 1]
211+
self._seen_so_far = current
212+
213+
now = time.time()
214+
info = ' - %.0fs' % (now - self._start)
215+
if self.verbose == 1:
216+
if now - self._last_update < self.interval and not finalize:
217+
return
218+
219+
prev_total_width = self._total_width
220+
if self._dynamic_display:
221+
sys.stdout.write('\b' * prev_total_width)
222+
sys.stdout.write('\r')
223+
else:
224+
sys.stdout.write('\n')
225+
226+
if self.target is not None:
227+
numdigits = int(np.log10(self.target)) + 1
228+
bar = ('%' + str(numdigits) + 'd/%d [') % (current, self.target)
229+
prog = float(current) / self.target
230+
prog_width = int(self.width * prog)
231+
if prog_width > 0:
232+
bar += ('=' * (prog_width - 1))
233+
if current < self.target:
234+
bar += '>'
235+
else:
236+
bar += '='
237+
bar += ('.' * (self.width - prog_width))
238+
bar += ']'
239+
else:
240+
bar = '%7d/Unknown' % current
241+
242+
self._total_width = len(bar)
243+
sys.stdout.write(bar)
244+
245+
time_per_unit = self._estimate_step_duration(current, now)
246+
247+
if self.target is None or finalize:
248+
if time_per_unit >= 1 or time_per_unit == 0:
249+
info += ' %.0fs/%s' % (time_per_unit, self.unit_name)
250+
elif time_per_unit >= 1e-3:
251+
info += ' %.0fms/%s' % (time_per_unit * 1e3, self.unit_name)
252+
else:
253+
info += ' %.0fus/%s' % (time_per_unit * 1e6, self.unit_name)
254+
else:
255+
eta = time_per_unit * (self.target - current)
256+
if eta > 3600:
257+
eta_format = '%d:%02d:%02d' % (eta // 3600,
258+
(eta % 3600) // 60, eta % 60)
259+
elif eta > 60:
260+
eta_format = '%d:%02d' % (eta // 60, eta % 60)
261+
else:
262+
eta_format = '%ds' % eta
263+
264+
info = ' - ETA: %s' % eta_format
265+
266+
for k in self._values_order:
267+
info += ' - %s:' % k
268+
if isinstance(self._values[k], list):
269+
avg = np.mean(self._values[k][0] / max(1, self._values[k][1]))
270+
if abs(avg) > 1e-3:
271+
info += ' %.4f' % avg
272+
else:
273+
info += ' %.4e' % avg
274+
else:
275+
info += ' %s' % self._values[k]
276+
277+
self._total_width += len(info)
278+
if prev_total_width > self._total_width:
279+
info += (' ' * (prev_total_width - self._total_width))
280+
281+
if finalize:
282+
info += '\n'
283+
284+
sys.stdout.write(info)
285+
sys.stdout.flush()
286+
287+
elif self.verbose == 2:
288+
if finalize:
289+
numdigits = int(np.log10(self.target)) + 1
290+
count = ('%' + str(numdigits) + 'd/%d') % (current, self.target)
291+
info = count + info
292+
for k in self._values_order:
293+
info += ' - %s:' % k
294+
avg = np.mean(self._values[k][0] / max(1, self._values[k][1]))
295+
if avg > 1e-3:
296+
info += ' %.4f' % avg
297+
else:
298+
info += ' %.4e' % avg
299+
info += '\n'
300+
301+
sys.stdout.write(info)
302+
sys.stdout.flush()
303+
304+
self._last_update = now
305+
306+
def add(self, n, values=None):
307+
self.update(self._seen_so_far + n, values)
308+
309+
def _estimate_step_duration(self, current, now):
310+
"""Estimate the duration of a single step.
311+
Given the step number `current` and the corresponding time `now`
312+
this function returns an estimate for how long a single step
313+
takes. If this is called before one step has been completed
314+
(i.e. `current == 0`) then zero is given as an estimate. The duration
315+
estimate ignores the duration of the (assumed to be non-representative)
316+
first step for estimates when more steps are available (i.e. `current>1`).
317+
Arguments:
318+
current: Index of current step.
319+
now: The current time.
320+
Returns: Estimate of the duration of a single step.
321+
"""
322+
if current:
323+
# there are a few special scenarios here:
324+
# 1) somebody is calling the progress bar without ever supplying step 1
325+
# 2) somebody is calling the progress bar and supplies step one multiple
326+
# times, e.g. as part of a finalizing call
327+
# in these cases, we just fall back to the simple calculation
328+
if self._time_after_first_step is not None and current > 1:
329+
time_per_unit = (now - self._time_after_first_step) / (current - 1)
330+
else:
331+
time_per_unit = (now - self._start) / current
332+
333+
if current == 1:
334+
self._time_after_first_step = now
335+
return time_per_unit
336+
else:
337+
return 0

cusim/ioutils/pyioutils.py

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@
1010

1111
import json
1212
import tempfile
13-
import tqdm
13+
14+
import h5py
15+
import numpy as np
16+
1417
from cusim import aux
1518
from cusim.ioutils.ioutils_bind import IoUtilsBind
1619
from cusim.config_pb2 import IoUtilsConfigProto
@@ -32,22 +35,50 @@ def __init__(self, opt=None):
3235

3336
def load_stream_vocab(self, filepath, min_count, keys_path):
3437
full_num_lines = self.obj.load_stream_file(filepath)
35-
pbar = tqdm.trange(full_num_lines, unit="line",
36-
postfix={"word_count": 0})
38+
pbar = aux.Progbar(full_num_lines, unit_name="line",
39+
stateful_metrics=["word_count"])
3740
processed = 0
3841
while True:
3942
read_lines, word_count = \
4043
self.obj.read_stream_for_vocab(
4144
self.opt.chunk_lines, self.opt.num_threads)
4245
processed += read_lines
43-
pbar.set_postfix({"word_count": word_count}, refresh=False)
44-
pbar.update(read_lines)
46+
pbar.update(processed, values=[("word_count", word_count)])
4547
if processed == full_num_lines:
4648
break
47-
pbar.close()
4849
self.obj.get_word_vocab(min_count, keys_path)
4950

50-
def convert_stream_to_h5(self, filepath, min_count, out_dir):
51+
def convert_stream_to_h5(self, filepath, min_count, out_dir,
52+
chunk_indices=10000):
5153
os.makedirs(out_dir, exist_ok=True)
5254
keys_path = pjoin(out_dir, "keys.csv")
55+
token_path = pjoin(out_dir, "token.h5")
56+
self.logger.info("save key and token to %s, %s",
57+
keys_path, token_path)
5358
self.load_stream_vocab(filepath, min_count, keys_path)
59+
full_num_lines = self.obj.load_stream_file(filepath)
60+
pbar = aux.Progbar(full_num_lines, unit_name="line")
61+
processed = 0
62+
h5f = h5py.File(token_path, "w")
63+
indices = h5f.create_dataset("indices", shape=(chunk_indices,),
64+
maxshape=(None,), dtype=np.int32,
65+
chunks=(chunk_indices,))
66+
indptr = h5f.create_dataset("indptr", shape=(full_num_lines + 1,),
67+
dtype=np.int32, chunks=True)
68+
processed, offset = 1, 0
69+
indptr[0] = 0
70+
while True:
71+
read_lines, data_size = self.obj.tokenize_stream(
72+
self.opt.chunk_lines, self.opt.num_threads)
73+
_indices = np.empty(shape=(data_size,), dtype=np.int32)
74+
_indptr = np.empty(shape=(read_lines,), dtype=np.int32)
75+
self.obj.get_token(_indices, _indptr, offset)
76+
indices.resize((offset + data_size,))
77+
indices[offset:offset + data_size] = _indices
78+
indptr[processed:processed + read_lines] = _indptr
79+
offset += data_size
80+
processed += read_lines
81+
pbar.update(processed - 1)
82+
if processed == full_num_lines + 1:
83+
break
84+
h5f.close()

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
tqdm
21
jsmin
32
numpy
43
pandas

0 commit comments

Comments
 (0)