From 130c115f4d208255fa77fcd2f25ea13fb44de285 Mon Sep 17 00:00:00 2001
From: Quanjia Yan
Date: Wed, 15 May 2019 17:18:42 +0800
Subject: [PATCH 1/4] use all data to build vocabulary

---
 problem.py | 41 ++++++++++++++++++++++-------------------
 train.py   |  7 +++++--
 2 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/problem.py b/problem.py
index b46efbf..bd0bb46 100644
--- a/problem.py
+++ b/problem.py
@@ -93,21 +93,24 @@ def output_target_num(self):
         else:
             return None
 
-    def get_data_generator_from_file(self, file_path, file_with_col_header, chunk_size=1000000):
-        with open(file_path, "r", encoding='utf-8') as f:
-            if file_with_col_header:
-                f.readline()
-            data_list = list()
-            for index, line in enumerate(f):
-                line = line.rstrip()
-                if not line:
-                    break
-                data_list.append(line)
-                if (index + 1) % chunk_size == 0:
-                    yield data_list
-                    data_list = list()
-            if len(data_list) > 0:
-                yield data_list
+    def get_data_generator_from_file(self, file_path, file_with_col_header, chunk_size=100):
+        # NOTE: file_path is a list type
+        data_list = list()
+        for single_path in file_path:
+            if single_path is not None:
+                with open(single_path, "r", encoding='utf-8') as f:
+                    if file_with_col_header:
+                        f.readline()
+                    for index, line in enumerate(f):
+                        line = line.rstrip()
+                        if not line:
+                            break
+                        data_list.append(line)
+                        if (index + 1) % chunk_size == 0:
+                            yield data_list
+                            data_list = list()
+        if len(data_list) > 0:
+            yield data_list
 
     def build_training_data_list(self, training_data_list, file_columns, input_types, answer_column_name, bpe_encoder=None):
         docs = dict()  # docs of each type of input
@@ -199,7 +202,7 @@ def build_training_multi_processor(self, training_data_generator, cpu_num_worker
 
         yield docs, target_docs, cnt_legal, cnt_illegal
 
-    def build(self, training_data_path, file_columns, input_types, file_with_col_header, answer_column_name, word2vec_path=None, word_emb_dim=None,
+    def build(self, data_path, file_columns, input_types, file_with_col_header, answer_column_name, word2vec_path=None, word_emb_dim=None,
               format=None, file_type=None, involve_all_words=None, file_format="tsv", show_progress=True,
               cpu_num_workers=-1, max_vocabulary=800000, word_frequency=3):
         """
@@ -251,8 +254,8 @@ def build(self, data_path, file_columns, input_types, file_with_col_hea
             bpe_encoder = None
 
         self.file_column_num = len(file_columns)
-        progress = self.get_data_generator_from_file(training_data_path, file_with_col_header)
-        preprocessed_data_generator= self.build_training_multi_processor(progress, cpu_num_workers, file_columns, input_types, answer_column_name, bpe_encoder=bpe_encoder)
+        progress = self.get_data_generator_from_file(data_path, file_with_col_header)
+        preprocessed_data_generator = self.build_training_multi_processor(progress, cpu_num_workers, file_columns, input_types, answer_column_name, bpe_encoder=bpe_encoder)
 
         # update symbol universe
         total_cnt_legal, total_cnt_illegal = 0, 0
@@ -674,7 +677,7 @@ def encode(self, data_path, file_columns, input_types, file_with_col_header, obj
         else:
             bpe_encoder = None
 
-        progress = self.get_data_generator_from_file(data_path, file_with_col_header)
+        progress = self.get_data_generator_from_file([data_path], file_with_col_header)
         data, lengths, target, cnt_legal, cnt_illegal = self.encode_data_multi_processor(progress, cpu_num_workers,
             file_columns, input_types, object_inputs, answer_column_name, min_sentence_len, extra_feature, max_lengths,
             fixed_lengths, file_format, bpe_encoder=bpe_encoder)
diff --git a/train.py b/train.py
index 4289c48..e0f425b 100644
--- a/train.py
+++ b/train.py
@@ -138,15 +138,18 @@ def main(params):
 
         if cache_load_flag is False:
             logging.info("Preprocessing... Depending on your corpus size, this step may take a while.")
+            # modify train_data_path to [train_data_path, valid_data_path, test_data_path]
+            # remember the test_data may be None
+            data_path = [conf.train_data_path, conf.valid_data_path, conf.test_data_path]
             if conf.pretrained_emb_path:
-                emb_matrix = problem.build(conf.train_data_path, conf.file_columns, conf.input_types, conf.file_with_col_header,
+                emb_matrix = problem.build(data_path, conf.file_columns, conf.input_types, conf.file_with_col_header,
                                            conf.answer_column_name, word2vec_path=conf.pretrained_emb_path,
                                            word_emb_dim=conf.pretrained_emb_dim, format=conf.pretrained_emb_type,
                                            file_type=conf.pretrained_emb_binary_or_text, involve_all_words=conf.involve_all_words_in_pretrained_emb,
                                            show_progress=True if params.mode == 'normal' else False, cpu_num_workers = conf.cpu_num_workers,
                                            max_vocabulary=conf.max_vocabulary, word_frequency=conf.min_word_frequency)
             else:
-                emb_matrix = problem.build(conf.train_data_path, conf.file_columns, conf.input_types, conf.file_with_col_header,
+                emb_matrix = problem.build(data_path, conf.file_columns, conf.input_types, conf.file_with_col_header,
                                            conf.answer_column_name, word2vec_path=None, word_emb_dim=None, format=None,
                                            file_type=None, involve_all_words=conf.involve_all_words_in_pretrained_emb,
                                            show_progress=True if params.mode == 'normal' else False, cpu_num_workers = conf.cpu_num_workers,

From 20fe21d9ce0ea1a3c4efc3565fed36df1a44aa8c Mon Sep 17 00:00:00 2001
From: Quanjia Yan
Date: Wed, 15 May 2019 17:44:55 +0800
Subject: [PATCH 2/4] rename some variables

---
 problem.py | 10 +++++-----
 train.py   |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/problem.py b/problem.py
index bd0bb46..21c633b 100644
--- a/problem.py
+++ b/problem.py
@@ -93,10 +93,10 @@ def output_target_num(self):
         else:
             return None
 
-    def get_data_generator_from_file(self, file_path, file_with_col_header, chunk_size=100):
+    def get_data_generator_from_file(self, data_path_list, file_with_col_header, chunk_size=1000000):
         # NOTE: file_path is a list type
-        data_list = list()
-        for single_path in file_path:
+        for single_path in data_path_list:
+            data_list = list()
             if single_path is not None:
                 with open(single_path, "r", encoding='utf-8') as f:
                     if file_with_col_header:
@@ -202,7 +202,7 @@ def build_training_multi_processor(self, training_data_generator, cpu_num_worker
 
         yield docs, target_docs, cnt_legal, cnt_illegal
 
-    def build(self, data_path, file_columns, input_types, file_with_col_header, answer_column_name, word2vec_path=None, word_emb_dim=None,
+    def build(self, data_path_list, file_columns, input_types, file_with_col_header, answer_column_name, word2vec_path=None, word_emb_dim=None,
               format=None, file_type=None, involve_all_words=None, file_format="tsv", show_progress=True,
               cpu_num_workers=-1, max_vocabulary=800000, word_frequency=3):
         """
@@ -254,7 +254,7 @@ def build(self, data_path_list, file_columns, input_types, file_with_col_header
             bpe_encoder = None
 
         self.file_column_num = len(file_columns)
-        progress = self.get_data_generator_from_file(data_path, file_with_col_header)
+        progress = self.get_data_generator_from_file(data_path_list, file_with_col_header)
         preprocessed_data_generator = self.build_training_multi_processor(progress, cpu_num_workers, file_columns, input_types, answer_column_name, bpe_encoder=bpe_encoder)
 
         # update symbol universe
diff --git a/train.py b/train.py
index e0f425b..4d169ab 100644
--- a/train.py
+++ b/train.py
@@ -140,16 +140,16 @@ def main(params):
             logging.info("Preprocessing... Depending on your corpus size, this step may take a while.")
             # modify train_data_path to [train_data_path, valid_data_path, test_data_path]
             # remember the test_data may be None
-            data_path = [conf.train_data_path, conf.valid_data_path, conf.test_data_path]
+            data_path_list = [conf.train_data_path, conf.valid_data_path, conf.test_data_path]
             if conf.pretrained_emb_path:
-                emb_matrix = problem.build(data_path, conf.file_columns, conf.input_types, conf.file_with_col_header,
+                emb_matrix = problem.build(data_path_list, conf.file_columns, conf.input_types, conf.file_with_col_header,
                                            conf.answer_column_name, word2vec_path=conf.pretrained_emb_path,
                                            word_emb_dim=conf.pretrained_emb_dim, format=conf.pretrained_emb_type,
                                            file_type=conf.pretrained_emb_binary_or_text, involve_all_words=conf.involve_all_words_in_pretrained_emb,
                                            show_progress=True if params.mode == 'normal' else False, cpu_num_workers = conf.cpu_num_workers,
                                            max_vocabulary=conf.max_vocabulary, word_frequency=conf.min_word_frequency)
             else:
-                emb_matrix = problem.build(data_path, conf.file_columns, conf.input_types, conf.file_with_col_header,
+                emb_matrix = problem.build(data_path_list, conf.file_columns, conf.input_types, conf.file_with_col_header,
                                            conf.answer_column_name, word2vec_path=None, word_emb_dim=None, format=None,
                                            file_type=None, involve_all_words=conf.involve_all_words_in_pretrained_emb,
                                            show_progress=True if params.mode == 'normal' else False, cpu_num_workers = conf.cpu_num_workers,

From a30d53946633a54d485a59d9cf306707e72d030e Mon Sep 17 00:00:00 2001
From: boshining <13269623658@163.com>
Date: Tue, 6 Aug 2019 15:31:25 +0800
Subject: [PATCH 3/4] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 183ef86..1955e67 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
+test
 # ***NeuronBlocks*** - Building Your NLP DNN Models Like Playing Lego
 
 [![language](https://img.shields.io/badge/language-en%20%7C%20中文-brightgreen.svg)](#language-supported)

From 09cca927deb4ea2db5398db8f4b328f193096e5f Mon Sep 17 00:00:00 2001
From: boshining <13269623658@163.com>
Date: Tue, 6 Aug 2019 16:19:15 +0800
Subject: [PATCH 4/4] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index bc53c5e..0208fc0 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
 test
+test
 # ***NeuronBlocks*** - Building Your NLP DNN Models Like Playing Lego
 
 [![language](https://img.shields.io/badge/language-en%20%7C%20中文-brightgreen.svg)](#language-supported)
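
For quick reference, a minimal self-contained sketch of the chunked multi-file
reader that patches 1/4 and 2/4 converge on. This is an illustration, not the
patched code itself: the real method lives on the Problem class in problem.py,
the file names in the usage lines are placeholders, and the remainder flush is
written per file here under the assumption that no split's tail chunk should
be dropped.

    def get_data_generator_from_file(data_path_list, file_with_col_header, chunk_size=1000000):
        # data_path_list holds the [train, valid, test] paths; entries may be
        # None (e.g. when no test set is configured) and are simply skipped.
        for single_path in data_path_list:
            data_list = list()
            if single_path is not None:
                with open(single_path, "r", encoding='utf-8') as f:
                    if file_with_col_header:
                        f.readline()  # skip the column-header row
                    for index, line in enumerate(f):
                        line = line.rstrip()
                        if not line:  # stop at the first blank line, as in the patch
                            break
                        data_list.append(line)
                        if (index + 1) % chunk_size == 0:
                            yield data_list  # emit one full chunk of lines
                            data_list = list()
            if data_list:
                yield data_list  # flush this file's remaining partial chunk

    # Hypothetical usage mirroring train.py; the paths are placeholders and the
    # third entry is None to stand in for a missing test set.
    for chunk in get_data_generator_from_file(
            ["train.tsv", "valid.tsv", None], file_with_col_header=True, chunk_size=2):
        print(len(chunk), "rows")

Yielding fixed-size chunks keeps memory bounded on large corpora while still
letting build() grow the vocabulary from the train, validation, and test
splits together, which is the point of this series.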