Commit 58ad563

chengfxljshou authored and committed
Add encoding cache and lazy-train mechanism (#50)
* Add new config about knowledge distillation for query binary classifier
* Remove inference results in knowledge distillation for query binary classifier
* Add AUC.py in tools folder
* Add test_data_path into conf_kdqbc_bilstmattn_cnn.json
* Modify AUC.py
* Rename AUC.py into calculate_AUC.py
* Modify test & calculate AUC commands for Knowledge Distillation for Query Binary Classifier
* Add cpu_thread_num parameter in conf.training_params
* Rename cpu_thread_num into cpu_num_workers
* Update comments in ModelConf.py
* Add cpu_num_workers in model_zoo/advanced/conf.json
* Add the description of cpu_num_workers in Tutorial.md
* Update inference speed of compressed model
* Add ProcessorsScheduler class
* Add license in ProcessorScheduler.py
* Use lazy loading instead of one-off loading
* Remove debug info in problem.py
* Use open instead of codecs.open
* Update the inference of build dictionary for classification
* Add md5 function in common_utils.py
* Add merge_encode_* functions
* Fix typos
* Reorganize the logical flow in train.py
* Remove dummy comments in problem.py
* Add encoding cache mechanism
* Add lazy-load mechanism for training phase
* Enumerate problem types in problem.py
* Remove data_encoding.py
* Add lazy-load train logic
* Modify comments and remove debug code
* Check whether test_path exists
* Fix missing parameter when using char embedding
* Merge master
* Add file_column_num in problem.py
* Merge add_encoding_cache branch
* Add SST-2 to .gitignore
* Merge master
* Use steps_per_validation instead of valid_times_per_epoch
* Fix learning rate decay logic bug
* Add log of calculating md5 of training data
* Fix multi-GPU char_emb OOM problem & add char-level fixed_lengths
* Modify batch_num_to_show_results in multi-GPU
* Modify batch_num_to_show_results
* Delete deepcopy in get_batches
* Add new parameters chunk_size and max_building_lines in conf and update tutorials
1 parent db26940 commit 58ad563

File tree

15 files changed: +590 additions, -362 deletions


.gitignore

Lines changed: 2 additions & 1 deletion
@@ -5,4 +5,5 @@
 *.vs*
 dataset/GloVe/
 dataset/20_newsgroups/
-models/
+dataset/SST-2/
+models/

LearningMachine.py

Lines changed: 220 additions & 218 deletions
Large diffs are not rendered by default.
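
The LearningMachine.py changes are not rendered above, but the commit message describes their intent: read the training data lazily in chunks of chunk_size lines instead of loading the whole file at once. A hypothetical sketch of that idea (names and structure are illustrative, not the actual LearningMachine.py code):

```python
def iter_chunks(file_path, chunk_size=1000 * 1000):
    """Yield successive lists of at most chunk_size lines.

    Illustrative only: shows the lazy-loading idea from the commit message,
    so the full training file never has to sit in memory at once.
    """
    chunk = []
    with open(file_path, 'r', encoding='utf-8') as fin:
        for line in fin:
            chunk.append(line.rstrip('\n'))
            if len(chunk) >= chunk_size:
                yield chunk
                chunk = []
    if chunk:          # flush the final partial chunk
        yield chunk
```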

ModelConf.py

Lines changed: 32 additions & 3 deletions
@@ -14,8 +14,8 @@
 
 from losses.BaseLossConf import BaseLossConf
 #import traceback
-from settings import LanguageTypes, ProblemTypes, TaggingSchemes, SupportedMetrics, PredictionTypes, DefaultPredictionFields
-from utils.common_utils import log_set, prepare_dir
+from settings import LanguageTypes, ProblemTypes, TaggingSchemes, SupportedMetrics, PredictionTypes, DefaultPredictionFields, ConstantStatic
+from utils.common_utils import log_set, prepare_dir, md5
 from utils.exceptions import ConfigurationError
 import numpy as np
 
@@ -219,6 +219,10 @@ def load_from_file(self, conf_path):
         # vocabulary setting
         self.max_vocabulary = self.get_item(['training_params', 'vocabulary', 'max_vocabulary'], default=800000, use_default=True)
         self.min_word_frequency = self.get_item(['training_params', 'vocabulary', 'min_word_frequency'], default=3, use_default=True)
+        self.max_building_lines = self.get_item(['training_params', 'vocabulary', 'max_building_lines'], default=1000 * 1000, use_default=True)
+
+        # chunk_size
+        self.chunk_size = self.get_item(['training_params', 'chunk_size'], default=1000 * 1000, use_default=True)
 
         # file column header setting
         self.file_with_col_header = self.get_item(['inputs', 'file_with_col_header'], default=False, use_default=True)
@@ -280,6 +284,9 @@ def load_from_file(self, conf_path):
         tmp_problem_path = os.path.join(self.save_base_dir, '.necessary_cache', 'problem.pkl')
         self.problem_path = tmp_problem_path if os.path.isfile(tmp_problem_path) else os.path.join(self.save_base_dir, 'necessary_cache', 'problem.pkl')
 
+        # cache configuration
+        self._load_cache_config_from_conf()
+
         # training params
         self.training_params = self.get_item(['training_params'])
 
@@ -303,7 +310,9 @@ def load_from_file(self, conf_path):
             self.max_epoch = self.params.max_epoch
         else:
             self.max_epoch = self.get_item(['training_params', 'max_epoch'], default=float('inf'))
-        self.valid_times_per_epoch = self.get_item(['training_params', 'valid_times_per_epoch'], default=1)
+        if 'valid_times_per_epoch' in self.conf['training_params']:
+            logging.info("configuration[training_params][valid_times_per_epoch] is deprecated, please use configuration[training_params][steps_per_validation] instead")
+        self.steps_per_validation = self.get_item(['training_params', 'steps_per_validation'], default=10)
         self.batch_num_to_show_results = self.get_item(['training_params', 'batch_num_to_show_results'], default=10)
         self.max_lengths = self.get_item(['training_params', 'max_lengths'], default=None, use_default=True)
         self.fixed_lengths = self.get_item(['training_params', 'fixed_lengths'], default=None, use_default=True)
@@ -529,3 +538,23 @@ def back_up(self, params):
         shutil.copy(params.conf_path, self.save_base_dir)
         logging.info('Configuration file is backed up to %s' % (self.save_base_dir))
 
+    def _load_cache_config_from_conf(self):
+        # training data
+        self.train_data_md5 = None
+        if self.phase == 'train' and self.train_data_path:
+            logging.info("Calculating the md5 of traing data ...")
+            self.train_data_md5 = md5([self.train_data_path])
+            logging.info("the md5 of traing data is %s"%(self.train_data_md5))
+
+        # problem
+        self.problem_md5 = None
+
+        # encoding
+        self.encoding_cache_dir = None
+        self.encoding_cache_index_file_path = None
+        self.encoding_cache_index_file_md5_path = None
+        self.encoding_file_index = None
+        self.encoding_cache_legal_line_cnt = 0
+        self.encoding_cache_illegal_line_cnt = 0
+        self.load_encoding_cache_generator = None
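
The new cache configuration keys the encoding cache on a fingerprint of the training data via md5([self.train_data_path]). For reference, a minimal sketch of what such a helper could look like (an assumption about utils.common_utils.md5, not its verbatim implementation):

```python
import hashlib

def md5(file_paths, block_size=1024 * 1024):
    """Return one hex digest over the contents of the given files.

    Hypothetical sketch of the md5 helper imported from utils.common_utils:
    reading in fixed-size blocks keeps memory flat even for large training files.
    """
    digest = hashlib.md5()
    for path in file_paths:
        with open(path, 'rb') as fin:
            while True:
                block = fin.read(block_size)
                if not block:
                    break
                digest.update(block)
    return digest.hexdigest()
```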

Tutorial.md

Lines changed: 5 additions & 1 deletion
@@ -147,10 +147,12 @@ The architecture of the configuration file is:
 CUDA_VISIBLE_DEVICES= python train.py
 ```
 - ***cpu_num_workers***. [default: -1] Define the number of processes to preprocess the dataset. The number of processes is equal to that of logical cores CPU supports if value is negtive or 0, otherwise it is equal to *cpu_num_workers*.
+- ***chunk_size***. [default: 1000000] Define the chunk size of files that NB reads every time for avoiding out of memory and the mechanism of lazy-loading.
 - ***batch_size***. Define the batch size here. If there are multiple GPUs, *batch_size* is the batch size of each GPU.
 - ***batch_num_to_show_results***. [necessary for training] During the training process, show the results every batch_num_to_show_results batches.
 - ***max_epoch***. [necessary for training] The maximum number of epochs to train.
-- ***valid_times_per_epoch***. [optional for training, default: 1] Define how many times to conduct validation per epoch. Usually, we conduct validation after each epoch, but for a very large corpus, we'd better validate multiple times in case to miss the best state of our model. The default value is 1.
+- ~~***valid_times_per_epoch***~~. [**deprecated**] Please use steps_per_validation instead.
+- ***steps_per_validation***. [default: 10] Define how many steps does each validation take place.
 - ***tokenizer***. [optional] Define tokenizer here. Currently, we support 'nltk' and 'jieba'. By default, 'nltk' for English and 'jieba' for Chinese.
 - **architecture**. Define the model architecture. The node is a list of layers (blocks) in block_zoo to represent a model. The supported layers of this toolkit are given in [block_zoo overview](https://microsoft.github.io/NeuronBlocks).
 
@@ -729,5 +731,7 @@ To solve the above problems, NeuronBlocks supports *fixing embedding weight* (em
 
 ***training_params/vocabulary/max_vocabulary***. [int, optional for training, default: 800,000] The max size of corpus vocabulary. If corpus vocabulary size is larger than *max_vocabulary*, it will be cut according to word frequency.
 
+***training_params/vocabulary/max_building_lines***. [int, optional for training, default: 1,000,000] The max lines NB will read from every file to build vocabulary
+
 ## <span id="faq">Frequently Asked Questions</span>
 
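
To make the new steps_per_validation semantics concrete: validation is triggered by a global step counter rather than once (or N times) per epoch. A hypothetical training-loop fragment, not the toolkit's actual train.py code:

```python
def train_loop(model, train_batches, run_validation,
               steps_per_validation=10, batch_num_to_show_results=10):
    """Illustrative only: validate every steps_per_validation batches."""
    global_step = 0
    for batch in train_batches:
        loss = model.train_one_batch(batch)      # hypothetical method name
        global_step += 1
        if global_step % batch_num_to_show_results == 0:
            print("step %d, loss %.4f" % (global_step, loss))
        if global_step % steps_per_validation == 0:
            run_validation(model)                 # evaluate on the dev set
```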

Tutorial_zh_CN.md

Lines changed: 5 additions & 1 deletion
@@ -137,10 +137,12 @@ python predict.py --conf_path=model_zoo/demo/conf.json
 CUDA_VISIBLE_DEVICES= python train.py
 ```
 - ***cpu_num_workers***. [default: -1] Define the number of processes to preprocess the dataset. The number of processes is equal to that of logical cores CPU supports if value is negtive or 0, otherwise it is equal to *cpu_num_workers*.
+- ***chunk_size***. [default: 1000000] Define the chunk size of files that NB reads every time for avoiding out of memory and the mechanism of lazy-loading.
 - ***batch_size***. Define the batch size here. If there are multiple GPUs, *batch_size* is the batch size of each GPU.
 - ***batch_num_to_show_results***. [necessary for training] During the training process, show the results every batch_num_to_show_results batches.
 - ***max_epoch***. [necessary for training] The maximum number of epochs to train.
-- ***valid_times_per_epoch***. [optional for training, default: 1] Define how many times to conduct validation per epoch. Usually, we conduct validation after each epoch, but for a very large corpus, we'd better validate multiple times in case to miss the best state of our model. The default value is 1.
+- ~~***valid_times_per_epoch***~~. [**deprecated**] Please use steps_per_validation instead.
+- ***steps_per_validation***. [default: 10] Define how many steps does each validation take place.
 - ***tokenizer***. [optional] Define tokenizer here. Currently, we support 'nltk' and 'jieba'. By default, 'nltk' for English and 'jieba' for Chinese.
 - **architecture**. Define the model architecture. The node is a list of layers (blocks) in block_zoo to represent a model. The supported layers of this toolkit are given in [block_zoo overview](https://microsoft.github.io/NeuronBlocks).
 
@@ -719,4 +721,6 @@ To solve the above problems, NeuronBlocks supports *fixing embedding weight* (em
 
 ***training_params/vocabulary/max_vocabulary***. [int, optional for training, default: 800,000] The max size of corpus vocabulary. If corpus vocabulary size is larger than *max_vocabulary*, it will be cut according to word frequency.
 
+***training_params/vocabulary/max_building_lines***. [int, optional for training, default: 1,000,000] The max lines NB will read from every file to build vocabulary
+
 ## <span id="faq">常见问题与答案</span>

block_zoo/Embedding.py

Lines changed: 12 additions & 9 deletions
@@ -66,7 +66,10 @@ def inference(self):
         for emb_type in self.conf:
             if emb_type == 'position':
                 continue
-            self.output_dim[2] += self.conf[emb_type]['dim']
+            if isinstance(self.conf[emb_type]['dim'], list):
+                self.output_dim[2] += sum(self.conf[emb_type]['dim'])
+            else:
+                self.output_dim[2] += self.conf[emb_type]['dim']
 
         super(EmbeddingConf, self).inference()
 
@@ -113,6 +116,7 @@ def __init__(self, layer_conf):
         self.layer_conf = layer_conf
 
         self.embeddings = nn.ModuleDict() if layer_conf.weight_on_gpu else dict()
+        self.char_embeddings = nn.ModuleDict()
         for input_cluster in layer_conf.conf:
             if 'type' in layer_conf.conf[input_cluster]:
                 # char embedding
@@ -122,7 +126,7 @@ def __init__(self, layer_conf):
                 char_emb_conf = eval(layer_conf.conf[input_cluster]['type'] + "Conf")(** char_emb_conf_dict)
                 char_emb_conf.inference()
                 char_emb_conf.verify()
-                self.embeddings[input_cluster] = eval(layer_conf.conf[input_cluster]['type'])(char_emb_conf)
+                self.char_embeddings[input_cluster] = eval(layer_conf.conf[input_cluster]['type'])(char_emb_conf)
             else:
                 # word embedding, postag embedding, and so on
                 self.embeddings[input_cluster] = nn.Embedding(layer_conf.conf[input_cluster]['vocab_size'], layer_conf.conf[input_cluster]['dim'], padding_idx=0)
@@ -155,14 +159,13 @@ def forward(self, inputs, use_gpu=False):
             if 'extra' in input_cluster:
                 continue
             input = inputs[input_cluster]
-            # if 'type' in self.layer_conf.conf[input_cluster]:
-            #     emb = self.embeddings[input_cluster](input, lengths[input]).float()
-            # else:
-            #     emb = self.embeddings[input_cluster](input).float()
-            if list(self.embeddings[input_cluster].parameters())[0].device.type == 'cpu':
-                emb = self.embeddings[input_cluster](input.cpu()).float()
+            if input_cluster == 'char':
+                emb = self.char_embeddings[input_cluster](input).float()
             else:
-                emb = self.embeddings[input_cluster](input).float()
+                if list(self.embeddings[input_cluster].parameters())[0].device.type == 'cpu':
+                    emb = self.embeddings[input_cluster](input.cpu()).float()
+                else:
+                    emb = self.embeddings[input_cluster](input).float()
             if use_gpu is True:
                 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                 emb = emb.to(device)
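
With this change, an embedding sub-config's dim may be either an int or a list of ints (one per char-CNN kernel), and the layer's output feature size is the sum over all clusters. A small illustration of that rule (hypothetical helper, mirroring the EmbeddingConf.inference logic above):

```python
def embedding_output_dim(conf):
    """Sum the 'dim' of every embedding cluster; 'dim' may be an int or a list of ints."""
    total = 0
    for emb_type, sub_conf in conf.items():
        if emb_type == 'position':          # position embeddings do not add features
            continue
        dim = sub_conf['dim']
        total += sum(dim) if isinstance(dim, list) else dim
    return total

# e.g. {'word': {'dim': 300}, 'char': {'dim': [30, 20, 100]}} -> 450
```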

block_zoo/Pooling2D.py

Lines changed: 5 additions & 2 deletions
@@ -29,7 +29,7 @@ def default(self):
         self.pool_type = 'max' # Supported: ['max', mean']
         self.stride = 1
         self.padding = 0
-        self.window_size = 3
+        # self.window_size = [self.input_dims[0][1], self.input_dims[0][2]]
 
     @DocInherit
     def declare(self):
@@ -38,7 +38,7 @@ def declare(self):
 
     def check_size(self, value, attr):
         res = value
-        if isinstance(value,int):
+        if isinstance(value, int):
             res = [value, value]
         elif (isinstance(self.window_size, tuple) or isinstance(self.window_size, list)) and len(value)==2:
             res = list(value)
@@ -48,6 +48,9 @@
 
     @DocInherit
     def inference(self):
+
+        if not hasattr(self, "window_size"):
+            self.window_size = [self.input_dims[0][1], self.input_dims[0][2]]
 
         self.window_size = self.check_size(self.window_size, "window_size")
         self.stride = self.check_size(self.stride, "stride")

block_zoo/embedding/CNNCharEmbedding.py

Lines changed: 36 additions & 16 deletions
@@ -28,11 +28,11 @@ def __init__(self, **kwargs):
 
     @DocInherit
     def default(self):
-        self.dim = 30 # cnn's output channel dim
+        self.dim = [30] # cnn's output channel dim
         self.embedding_matrix_dim = 30 #
-        self.stride = 1
+        self.stride = [1]
         self.padding = 0
-        self.window_size = 3
+        self.window_size = [3]
         self.activation = 'ReLU'
 
     @DocInherit
@@ -41,8 +41,14 @@ def declare(self):
         self.num_of_inputs = 1
         self.input_ranks = [3]
 
+    def change_to_list(self, attribute):
+        for single in attribute:
+            if not isinstance(getattr(self, single), list):
+                setattr(self, single, [getattr(self, single)])
+
     @DocInherit
     def inference(self):
+        self.change_to_list(['dim', 'stride', 'window_size'])
         self.output_channel_num = self.dim
         self.output_rank = 3
 
@@ -65,20 +71,24 @@ def __init__(self, layer_conf):
         super(CNNCharEmbedding, self).__init__(layer_conf)
         self.layer_conf = layer_conf
 
+        assert len(layer_conf.dim) == len(layer_conf.window_size) == len(layer_conf.stride), "The attribute dim/window_size/stride must have the same length."
+
         self.char_embeddings = nn.Embedding(layer_conf.vocab_size, layer_conf.embedding_matrix_dim, padding_idx=self.layer_conf.padding)
         nn.init.uniform_(self.char_embeddings.weight, -0.001, 0.001)
 
-        self.char_cnn = nn.Conv2d(1, layer_conf.output_channel_num, (layer_conf.window_size, layer_conf.embedding_matrix_dim),
-                                  stride=self.layer_conf.stride, padding=self.layer_conf.padding)
+        self.char_cnn = nn.ModuleList()
+        for i in range(len(layer_conf.output_channel_num)):
+            self.char_cnn.append(nn.Conv2d(1, layer_conf.output_channel_num[i], (layer_conf.window_size[i], layer_conf.embedding_matrix_dim),
+                                 stride=self.layer_conf.stride[i], padding=self.layer_conf.padding))
         if layer_conf.activation:
             self.activation = eval("nn." + self.layer_conf.activation)()
         else:
             self.activation = None
-        if self.is_cuda():
-            self.char_embeddings = self.char_embeddings.cuda()
-            self.char_cnn = self.char_cnn.cuda()
-            if self.activation and hasattr(self.activation, 'weight'):
-                self.activation.weight = torch.nn.Parameter(self.activation.weight.cuda())
+        # if self.is_cuda():
+        #     self.char_embeddings = self.char_embeddings.cuda()
+        #     self.char_cnn = self.char_cnn.cuda()
+        #     if self.activation and hasattr(self.activation, 'weight'):
+        #         self.activation.weight = torch.nn.Parameter(self.activation.weight.cuda())
 
     def forward(self, string):
         """
@@ -102,14 +112,24 @@ def forward(self, string):
         char_embs_lookup = char_embs_lookup.view(-1, string.size()[2], self.layer_conf.embedding_matrix_dim) #[batch_size * seq_len, char num in words, embedding_dim]
 
         string_input = torch.unsqueeze(char_embs_lookup, 1) # [batch_size * seq_len, input_channel_num=1, char num in words, embedding_dim]
-        string_conv = self.char_cnn(string_input).squeeze()
-        if self.activation:
-            string_conv = self.activation(string_conv)
 
-        string_maxpooling = F.max_pool1d(string_conv, string_conv.size(2)).squeeze()
-        string_out = string_maxpooling.view(string.size()[0], -1, self.layer_conf.output_channel_num)
+        outputs = []
+        for index, single_cnn in enumerate(self.char_cnn):
+            string_conv = single_cnn(string_input).squeeze(3)
+            if self.activation:
+                string_conv = self.activation(string_conv)
+
+            string_maxpooling = F.max_pool1d(string_conv, string_conv.size(2)).squeeze()
+            string_out = string_maxpooling.view(string.size()[0], -1, self.layer_conf.output_channel_num[index])
+
+            outputs.append(string_out)
+
+        if len(outputs) > 1:
+            string_output = torch.cat(outputs, 2)
+        else:
+            string_output = outputs[0]
 
-        return string_out
+        return string_output
 
 
 if __name__ == '__main__':
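
The net effect of the CNNCharEmbedding change is one convolution per (dim, window_size, stride) triple, each max-pooled over the characters of a word and concatenated on the feature dimension. A self-contained, simplified sketch of that idea (illustrative only, not the block_zoo class itself):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiWindowCharCNN(nn.Module):
    """Hypothetical, simplified sketch: one Conv2d per (dim, window_size, stride) triple,
    max-pooled over characters and concatenated on the feature dimension."""
    def __init__(self, vocab_size, emb_dim, dims, window_sizes, strides):
        super().__init__()
        assert len(dims) == len(window_sizes) == len(strides)
        self.char_embeddings = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, d, (w, emb_dim), stride=s)
             for d, w, s in zip(dims, window_sizes, strides)])

    def forward(self, chars):                      # chars: [batch, seq_len, char_len]
        batch, seq_len, char_len = chars.size()
        emb = self.char_embeddings(chars.view(-1, char_len))   # [batch*seq_len, char_len, emb_dim]
        emb = emb.unsqueeze(1)                                  # add the input-channel dim
        outs = []
        for conv in self.convs:
            conv_out = conv(emb).squeeze(3)                     # [batch*seq_len, d, conv_len]
            pooled = F.max_pool1d(conv_out, conv_out.size(2)).squeeze(2)
            outs.append(pooled.view(batch, seq_len, -1))
        return torch.cat(outs, dim=2)              # feature dim = sum(dims)

# e.g. MultiWindowCharCNN(100, 50, [30, 20, 100], [3, 3, 5], [1, 2, 3]) applied to a
# LongTensor of shape (2, 5, 8) yields a tensor of shape (2, 5, 150).
```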

model_zoo/advanced/conf.json

Lines changed: 8 additions & 6 deletions
@@ -49,14 +49,16 @@
   "training_params": {
     "vocabulary": {
       "min_word_frequency": 1,
-      "max_vocabulary": 100000
+      "max_vocabulary": 100000,
+      "max_building_lines": 1000000
     },
     "optimizer": {
       "name": "Adam",
       "params": {
         "lr": 0.001
       }
     },
+    "chunk_size": 1000000,
     "lr_decay": 0.95,
     "minimum_lr": 0.0001,
     "epoch_start_lr_decay": 1,
@@ -65,7 +67,7 @@
     "batch_size": 30,
     "batch_num_to_show_results": 10,
     "max_epoch": 3,
-    "valid_times_per_epoch": 1,
+    "steps_per_validation": 10,
     "text_preprocessing": ["DBC2SBC"],
     "max_lengths":{
       "question": 30,
@@ -90,10 +92,10 @@
           "cols": ["question_char", "answer_char"],
           "type": "CNNCharEmbedding",
           "dropout": 0.2,
-          "dim": 30,
-          "embedding_matrix_dim": 8,
-          "stride":1,
-          "window_size": 5,
+          "dim": [30, 20, 100],
+          "embedding_matrix_dim": 50,
+          "stride":[1, 2, 3],
+          "window_size": [3,3,5],
           "activation": "ReLU"
         }
       }

model_zoo/nlp_tasks/knowledge_distillation/query_binary_classifier_compression/conf_kdqbc_bilstmattn_cnn.json

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@
     "batch_size": 256,
     "batch_num_to_show_results": 10,
     "max_epoch": 30,
-    "valid_times_per_epoch": 10,
+    "steps_per_validation": 10,
     "fixed_lengths":{
       "query": 30
     }
