From 130c115f4d208255fa77fcd2f25ea13fb44de285 Mon Sep 17 00:00:00 2001
From: Quanjia Yan
Date: Wed, 15 May 2019 17:18:42 +0800
Subject: [PATCH 1/4] use all data to build vocabulary

---
 problem.py | 41 ++++++++++++++++++++++-------------------
 train.py   |  7 +++++--
 2 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/problem.py b/problem.py
index b46efbf..bd0bb46 100644
--- a/problem.py
+++ b/problem.py
@@ -93,21 +93,24 @@ def output_target_num(self):
         else:
             return None
 
-    def get_data_generator_from_file(self, file_path, file_with_col_header, chunk_size=1000000):
-        with open(file_path, "r", encoding='utf-8') as f:
-            if file_with_col_header:
-                f.readline()
-            data_list = list()
-            for index, line in enumerate(f):
-                line = line.rstrip()
-                if not line:
-                    break
-                data_list.append(line)
-                if (index + 1) % chunk_size == 0:
-                    yield data_list
-                    data_list = list()
-            if len(data_list) > 0:
-                yield data_list
+    def get_data_generator_from_file(self, file_path, file_with_col_header, chunk_size=100):
+        # NOTE: file_path is a list type
+        data_list = list()
+        for single_path in file_path:
+            if single_path is not None:
+                with open(single_path, "r", encoding='utf-8') as f:
+                    if file_with_col_header:
+                        f.readline()
+                    for index, line in enumerate(f):
+                        line = line.rstrip()
+                        if not line:
+                            break
+                        data_list.append(line)
+                        if (index + 1) % chunk_size == 0:
+                            yield data_list
+                            data_list = list()
+        if len(data_list) > 0:
+            yield data_list
 
     def build_training_data_list(self, training_data_list, file_columns, input_types, answer_column_name, bpe_encoder=None):
         docs = dict()  # docs of each type of input
@@ -199,7 +202,7 @@ def build_training_multi_processor(self, training_data_generator, cpu_num_worker
 
         yield docs, target_docs, cnt_legal, cnt_illegal
 
-    def build(self, training_data_path, file_columns, input_types, file_with_col_header, answer_column_name, word2vec_path=None, word_emb_dim=None,
+    def build(self, data_path, file_columns, input_types, file_with_col_header, answer_column_name, word2vec_path=None, word_emb_dim=None,
               format=None, file_type=None, involve_all_words=None, file_format="tsv", show_progress=True,
               cpu_num_workers=-1, max_vocabulary=800000, word_frequency=3):
         """
@@ -251,8 +254,8 @@ def build(self, data_path, file_columns, input_types, file_with_col_hea
             bpe_encoder = None
 
         self.file_column_num = len(file_columns)
-        progress = self.get_data_generator_from_file(training_data_path, file_with_col_header)
-        preprocessed_data_generator= self.build_training_multi_processor(progress, cpu_num_workers, file_columns, input_types, answer_column_name, bpe_encoder=bpe_encoder)
+        progress = self.get_data_generator_from_file(data_path, file_with_col_header)
+        preprocessed_data_generator = self.build_training_multi_processor(progress, cpu_num_workers, file_columns, input_types, answer_column_name, bpe_encoder=bpe_encoder)
 
         # update symbol universe
         total_cnt_legal, total_cnt_illegal = 0, 0
@@ -674,7 +677,7 @@ def encode(self, data_path, file_columns, input_types, file_with_col_header, obj
         else:
             bpe_encoder = None
 
-        progress = self.get_data_generator_from_file(data_path, file_with_col_header)
+        progress = self.get_data_generator_from_file([data_path], file_with_col_header)
         data, lengths, target, cnt_legal, cnt_illegal = self.encode_data_multi_processor(progress, cpu_num_workers,
             file_columns, input_types, object_inputs, answer_column_name, min_sentence_len, extra_feature, max_lengths,
             fixed_lengths, file_format, bpe_encoder=bpe_encoder)
diff --git a/train.py b/train.py
index 4289c48..e0f425b 100644
--- a/train.py
+++ b/train.py
@@ -138,15 +138,18 @@ def main(params):
 
         if cache_load_flag is False:
             logging.info("Preprocessing... Depending on your corpus size, this step may take a while.")
+            # modify train_data_path to [train_data_path, valid_data_path, test_data_path]
+            # remember the test_data may be None
+            data_path = [conf.train_data_path, conf.valid_data_path, conf.test_data_path]
             if conf.pretrained_emb_path:
-                emb_matrix = problem.build(conf.train_data_path, conf.file_columns, conf.input_types, conf.file_with_col_header,
+                emb_matrix = problem.build(data_path, conf.file_columns, conf.input_types, conf.file_with_col_header,
                                            conf.answer_column_name, word2vec_path=conf.pretrained_emb_path,
                                            word_emb_dim=conf.pretrained_emb_dim, format=conf.pretrained_emb_type,
                                            file_type=conf.pretrained_emb_binary_or_text, involve_all_words=conf.involve_all_words_in_pretrained_emb,
                                            show_progress=True if params.mode == 'normal' else False, cpu_num_workers = conf.cpu_num_workers,
                                            max_vocabulary=conf.max_vocabulary, word_frequency=conf.min_word_frequency)
             else:
-                emb_matrix = problem.build(conf.train_data_path, conf.file_columns, conf.input_types, conf.file_with_col_header,
+                emb_matrix = problem.build(data_path, conf.file_columns, conf.input_types, conf.file_with_col_header,
                                            conf.answer_column_name, word2vec_path=None, word_emb_dim=None, format=None,
                                            file_type=None, involve_all_words=conf.involve_all_words_in_pretrained_emb,
                                            show_progress=True if params.mode == 'normal' else False, cpu_num_workers = conf.cpu_num_workers,

From 20fe21d9ce0ea1a3c4efc3565fed36df1a44aa8c Mon Sep 17 00:00:00 2001
From: Quanjia Yan
Date: Wed, 15 May 2019 17:44:55 +0800
Subject: [PATCH 2/4] rename some variables

---
 problem.py | 10 +++++-----
 train.py   |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/problem.py b/problem.py
index bd0bb46..21c633b 100644
--- a/problem.py
+++ b/problem.py
@@ -93,10 +93,10 @@ def output_target_num(self):
         else:
             return None
 
-    def get_data_generator_from_file(self, file_path, file_with_col_header, chunk_size=100):
+    def get_data_generator_from_file(self, data_path_list, file_with_col_header, chunk_size=1000000):
         # NOTE: file_path is a list type
-        data_list = list()
-        for single_path in file_path:
+        for single_path in data_path_list:
+            data_list = list()
             if single_path is not None:
                 with open(single_path, "r", encoding='utf-8') as f:
                     if file_with_col_header:
@@ -202,7 +202,7 @@ def build_training_multi_processor(self, training_data_generator, cpu_num_worker
 
         yield docs, target_docs, cnt_legal, cnt_illegal
 
-    def build(self, data_path, file_columns, input_types, file_with_col_header, answer_column_name, word2vec_path=None, word_emb_dim=None,
+    def build(self, data_path_list, file_columns, input_types, file_with_col_header, answer_column_name, word2vec_path=None, word_emb_dim=None,
               format=None, file_type=None, involve_all_words=None, file_format="tsv", show_progress=True,
               cpu_num_workers=-1, max_vocabulary=800000, word_frequency=3):
         """
@@ -254,7 +254,7 @@ def build(self, data_path_list, file_columns, input_types, file_with_col_header
             bpe_encoder = None
 
         self.file_column_num = len(file_columns)
-        progress = self.get_data_generator_from_file(data_path, file_with_col_header)
+        progress = self.get_data_generator_from_file(data_path_list, file_with_col_header)
         preprocessed_data_generator = self.build_training_multi_processor(progress, cpu_num_workers, file_columns, input_types, answer_column_name, bpe_encoder=bpe_encoder)
 
         # update symbol universe
diff --git a/train.py b/train.py
index e0f425b..4d169ab 100644
--- a/train.py
+++ b/train.py
@@ -140,16 +140,16 @@ def main(params):
             logging.info("Preprocessing... Depending on your corpus size, this step may take a while.")
             # modify train_data_path to [train_data_path, valid_data_path, test_data_path]
             # remember the test_data may be None
-            data_path = [conf.train_data_path, conf.valid_data_path, conf.test_data_path]
+            data_path_list = [conf.train_data_path, conf.valid_data_path, conf.test_data_path]
             if conf.pretrained_emb_path:
-                emb_matrix = problem.build(data_path, conf.file_columns, conf.input_types, conf.file_with_col_header,
+                emb_matrix = problem.build(data_path_list, conf.file_columns, conf.input_types, conf.file_with_col_header,
                                            conf.answer_column_name, word2vec_path=conf.pretrained_emb_path,
                                            word_emb_dim=conf.pretrained_emb_dim, format=conf.pretrained_emb_type,
                                            file_type=conf.pretrained_emb_binary_or_text, involve_all_words=conf.involve_all_words_in_pretrained_emb,
                                            show_progress=True if params.mode == 'normal' else False, cpu_num_workers = conf.cpu_num_workers,
                                            max_vocabulary=conf.max_vocabulary, word_frequency=conf.min_word_frequency)
             else:
-                emb_matrix = problem.build(data_path, conf.file_columns, conf.input_types, conf.file_with_col_header,
+                emb_matrix = problem.build(data_path_list, conf.file_columns, conf.input_types, conf.file_with_col_header,
                                            conf.answer_column_name, word2vec_path=None, word_emb_dim=None, format=None,
                                            file_type=None, involve_all_words=conf.involve_all_words_in_pretrained_emb,
                                            show_progress=True if params.mode == 'normal' else False, cpu_num_workers = conf.cpu_num_workers,

From a30d53946633a54d485a59d9cf306707e72d030e Mon Sep 17 00:00:00 2001
From: boshining <13269623658@163.com>
Date: Tue, 6 Aug 2019 15:31:25 +0800
Subject: [PATCH 3/4] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 183ef86..1955e67 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
+test
 # ***NeuronBlocks*** - Building Your NLP DNN Models Like Playing Lego
 
 [![language](https://img.shields.io/badge/language-en%20%7C%20中文-brightgreen.svg)](#language-supported)

From 09cca927deb4ea2db5398db8f4b328f193096e5f Mon Sep 17 00:00:00 2001
From: boshining <13269623658@163.com>
Date: Tue, 6 Aug 2019 16:19:15 +0800
Subject: [PATCH 4/4] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index bc53c5e..0208fc0 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
 test
+test
 # ***NeuronBlocks*** - Building Your NLP DNN Models Like Playing Lego
 
 [![language](https://img.shields.io/badge/language-en%20%7C%20中文-brightgreen.svg)](#language-supported)
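
For quick reference, a minimal self-contained sketch of the chunked multi-file
reader that patches 1/4 and 2/4 converge on. This is an illustration, not the
patched code itself: the real method lives on the Problem class in problem.py,
the file names in the usage lines are placeholders, and the remainder flush is
written per file here under the assumption that no split's tail chunk should
be dropped.

    def get_data_generator_from_file(data_path_list, file_with_col_header, chunk_size=1000000):
        # data_path_list holds the [train, valid, test] paths; entries may be
        # None (e.g. when no test set is configured) and are simply skipped.
        for single_path in data_path_list:
            data_list = list()
            if single_path is not None:
                with open(single_path, "r", encoding='utf-8') as f:
                    if file_with_col_header:
                        f.readline()  # skip the column-header row
                    for index, line in enumerate(f):
                        line = line.rstrip()
                        if not line:  # stop at the first blank line, as in the patch
                            break
                        data_list.append(line)
                        if (index + 1) % chunk_size == 0:
                            yield data_list  # emit one full chunk of lines
                            data_list = list()
            if data_list:
                yield data_list  # flush this file's remaining partial chunk

    # Hypothetical usage mirroring train.py; the paths are placeholders and the
    # third entry is None to stand in for a missing test set.
    for chunk in get_data_generator_from_file(
            ["train.tsv", "valid.tsv", None], file_with_col_header=True, chunk_size=2):
        print(len(chunk), "rows")

Yielding fixed-size chunks keeps memory bounded on large corpora while still
letting build() grow the vocabulary from the train, validation, and test
splits together, which is the point of this series.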