Skip to content

Commit 028f9ac

Browse files
author
tianxin
authored
Support multi-card evaluation (#410)
* add SemanticIndexing examples * 1. delete shell script 2. refactor sub-directory * 1. use data.py instead of utils.py 2. calculate cosine_sim on GPU not CPU * 1. delete python api in Model forward function for dynamic to static conversion 2. add margin parameter * use data_set for predict.py * use Dataset to load evaluation data when run_ann.py * 1. set some command argument to required 2. rename some variables * add BatchNegative strategy * delete .gitkeep * add ance strategy * delete unused arguments * delete .gitkeep * set data.py as common module of strategies * 1. add SemanticIndexBase 2. move common functions to data.py * mv build_index to ann_util.py * 1. mv train.py to top directory * get emb_size from model parameters * support reducing embedding size via command-line argument * handle illegal data * add ClipGradByGlobalNorm for train_ance.py * add README.md * update README * Update README * Update README * Update README * upload model and data to bos * update README * update comment * delete unused blanks * mend * add parameter description to README * update README * delete numpy in model.py * mend * mend * 1. support multi-card evaluation 2. fix encoding for windows
1 parent 97b52bc commit 028f9ac

File tree

4 files changed

+19
-13
lines changed

4 files changed

+19
-13
lines changed

examples/semantic_indexing/README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@
1818
以下模型结构参数为:
1919
`TransformerLayer:12, Hidden:768, Heads:12, OutputEmbSize: 256`
2020

21-
|Model|训练参数配置|MD5|
22-
| ------------ | ------------ | ------------ |
23-
|[batch_neg_v1.0](https://paddlenlp.bj.bcebos.com/models/semantic_index/batch_neg_v1.0.tar)|<div style="width: 200pt">margin:0.2 scale:30 epoch:3 lr:5E-5</div>|da1bb1487bd3fd6a53b8ef95c278f3e6|
24-
|[hardest_neg_v1.0](https://paddlenlp.bj.bcebos.com/models/semantic_index/hardest_neg_v1.0.tar)|margin:0.2 epoch:3 lr:5E-5|b535d890110ea608c8562c525a0b84b5|
21+
|Model|训练参数配置|硬件|MD5|
22+
| ------------ | ------------ | ------------ |-----------|
23+
|[batch_neg_v1.0](https://paddlenlp.bj.bcebos.com/models/semantic_index/batch_neg_v1.0.tar)|<div style="width: 150pt">margin:0.2 scale:30 epoch:3 lr:5E-5 bs:128 max_len:64 </div>|<div style="width: 100pt">单卡v100-16g</div>|da1bb1487bd3fd6a53b8ef95c278f3e6|
24+
|[hardest_neg_v1.0](https://paddlenlp.bj.bcebos.com/models/semantic_index/hardest_neg_v1.0.tar)|margin:0.2 epoch:3 lr:5E-5 bs:128 max_len:64 |单卡v100-16g|b535d890110ea608c8562c525a0b84b5|
2525

2626

2727
## 数据准备

examples/semantic_indexing/data.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ def valid_checkpoint(step):
155155

156156
def gen_id2corpus(corpus_file):
157157
id2corpus = {}
158-
with open(corpus_file) as f:
158+
with open(corpus_file, 'r', encoding='utf-8') as f:
159159
for idx, line in enumerate(f):
160160
id2corpus[idx] = line.rstrip()
161161
return id2corpus
@@ -164,7 +164,7 @@ def gen_id2corpus(corpus_file):
164164
def gen_text_file(similar_text_pair_file):
165165
text2similar_text = {}
166166
texts = []
167-
with open(similar_text_pair_file) as f:
167+
with open(similar_text_pair_file, 'r', encoding='utf-8') as f:
168168
for line in f:
169169
splited_line = line.rstrip().split("\t")
170170
if len(splited_line) != 2:

examples/semantic_indexing/evaluate.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,14 +52,14 @@ def recall(rs, N=10):
5252

5353
if __name__ == "__main__":
5454
text2similar = {}
55-
with open(args.similar_text_pair) as f:
55+
with open(args.similar_text_pair, 'r', encoding='utf-8') as f:
5656
for line in f:
5757
text, similar_text = line.rstrip().split("\t")
5858
text2similar[text] = similar_text
5959

6060
rs = []
6161

62-
with open(args.recall_result_file) as f:
62+
with open(args.recall_result_file, 'r', encoding='utf-8') as f:
6363
relevance_labels = []
6464
for index, line in enumerate(f):
6565

@@ -77,7 +77,6 @@ def recall(rs, N=10):
7777

7878
recall_N = []
7979
for topN in (10, 50):
80-
#logger.info("Recall@{}: {}".format(topN, 100 * recall(rs, N=topN)))
8180
R = round(100 * recall(rs, N=topN), 3)
8281
recall_N.append(str(R))
8382
print("\t".join(recall_N))

examples/semantic_indexing/recall.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@
5858

5959
if __name__ == "__main__":
6060
paddle.set_device(args.device)
61+
rank = paddle.distributed.get_rank()
62+
if paddle.distributed.get_world_size() > 1:
63+
paddle.distributed.init_parallel_env()
6164

6265
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')
6366

@@ -76,8 +79,9 @@
7679

7780
model = SemanticIndexBase(
7881
pretrained_model, output_emb_size=args.output_emb_size)
82+
model = paddle.DataParallel(model)
7983

80-
# load pretrained semantic model
84+
# Load pretrained semantic model
8185
if args.params_path and os.path.isfile(args.params_path):
8286
state_dict = paddle.load(args.params_path)
8387
model.set_dict(state_dict)
@@ -99,7 +103,10 @@
99103
batchify_fn=batchify_fn,
100104
trans_fn=trans_func)
101105

102-
final_index = build_index(args, corpus_data_loader, model)
106+
# Need better way to get inner model of DataParallel
107+
inner_model = model._layers
108+
109+
final_index = build_index(args, corpus_data_loader, inner_model)
103110

104111
text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)
105112

@@ -112,14 +119,14 @@
112119
batchify_fn=batchify_fn,
113120
trans_fn=trans_func)
114121

115-
query_embedding = model.get_semantic_embedding(query_data_loader)
122+
query_embedding = inner_model.get_semantic_embedding(query_data_loader)
116123

117124
if not os.path.exists(args.recall_result_dir):
118125
os.mkdir(args.recall_result_dir)
119126

120127
recall_result_file = os.path.join(args.recall_result_dir,
121128
args.recall_result_file)
122-
with open(recall_result_file, 'w') as f:
129+
with open(recall_result_file, 'w', encoding='utf-8') as f:
123130
for batch_index, batch_query_embedding in enumerate(query_embedding):
124131
recalled_idx, cosine_sims = final_index.knn_query(
125132
batch_query_embedding.numpy(), args.recall_num)

0 commit comments

Comments
 (0)