change dataset

yinhaofeng · yinhaofeng · commit ad8f23690f93 · 2020-09-15T17:13:01.000Z
diff --git a/models/match/dssm/config.yaml b/models/match/dssm/config.yaml
@@ -29,11 +29,12 @@ dataset:
 
 hyper_parameters:
   optimizer:
-    class: sgd
+    class: adam
     learning_rate: 0.001
-    strategy: async
-  trigram_d: 1439
+    strategy: sync
+  trigram_d: 6327
   neg_num: 1
+  slice_end: 8
   fc_sizes: [300, 300, 128]
   fc_acts: ['tanh', 'tanh', 'tanh']
 
@@ -44,7 +45,7 @@ runner:
 - name: train_runner
   class: train
   # num of epochs
-  epochs: 3
+  epochs: 1
   # device to run training or infer
   device: cpu
   save_checkpoint_interval: 1 # save model interval of epochs
@@ -54,14 +55,14 @@ runner:
   save_inference_feed_varnames: ["query", "doc_pos"] # feed vars of save inference
   save_inference_fetch_varnames: ["cos_sim_0.tmp_0"] # fetch vars of save inference
   init_model_path: "" # load model path
-  print_interval: 2
+  print_interval: 10
   phases: phase1
 - name: infer_runner
   class: infer
   # device to run training or infer
   device: cpu
   print_interval: 1
-  init_model_path: "increment/2" # load model path
+  init_model_path: "increment/0" # load model path
   phases: phase2
 
 # runner will run all the phase in each epoch
diff --git a/models/match/dssm/data/data_process.sh b/models/match/dssm/data/data_process.sh
@@ -0,0 +1,24 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#!/bin/bash
+
+wget https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz
+tar xzf dssm%2Fbq.tar.gz
+rm -f dssm%2Fbq.tar.gz
+mv bq/train.txt ./raw_data.txt
+python3 preprocess.py
+mkdir big_train
+mv train.txt ./big_train
+mkdir big_test
+mv test.txt ./big_test
diff --git a/models/match/dssm/data/preprocess.py b/models/match/dssm/data/preprocess.py
@@ -15,25 +15,27 @@
 
 import os
 import sys
+import jieba
 import numpy as np
 import random
 
-f = open("./zhidao", "r")
+f = open("./raw_data.txt", "r")
 lines = f.readlines()
 f.close()
 
 #建立字典
 word_dict = {}
 for line in lines:
     line = line.strip().split("\t")
-    text = line[0].split(" ") + line[1].split(" ")
+    text = line[0].strip("") + line[1].strip("")
+    text = jieba.cut(text)
     for word in text:
         if word in word_dict:
             continue
         else:
             word_dict[word] = len(word_dict) + 1
 
-f = open("./zhidao", "r")
+f = open("./raw_data.txt", "r")
 lines = f.readlines()
 f.close()
 
@@ -57,12 +59,13 @@
         else:
             pos_dict[line[0]] = [line[1]]
 
+print("build dict done")
 #划分训练集和测试集
 query_list = list(pos_dict.keys())
-#print(len(query))
+#print(len(query_list))
 random.shuffle(query_list)
-train_query = query_list[:90]
-test_query = query_list[90:]
+train_query = query_list[:11600]
+test_query = query_list[11600:]
 
 #获得训练集
 train_set = []
@@ -73,6 +76,7 @@
         for neg in neg_dict[query]:
             train_set.append([query, pos, neg])
 random.shuffle(train_set)
+print("get train_set done")
 
 #获得测试集
 test_set = []
@@ -84,13 +88,14 @@
     for neg in neg_dict[query]:
         test_set.append([query, neg, 0])
 random.shuffle(test_set)
+print("get test_set done")
 
 #训练集中的query,pos,neg转化为词袋
 f = open("train.txt", "w")
 for line in train_set:
-    query = line[0].strip().split(" ")
-    pos = line[1].strip().split(" ")
-    neg = line[2].strip().split(" ")
+    query = jieba.cut(line[0].strip())
+    pos = jieba.cut(line[1].strip())
+    neg = jieba.cut(line[2].strip())
     query_token = [0] * (len(word_dict) + 1)
     for word in query:
         query_token[word_dict[word]] = 1
@@ -109,8 +114,8 @@
 f = open("test.txt", "w")
 fa = open("label.txt", "w")
 for line in test_set:
-    query = line[0].strip().split(" ")
-    pos = line[1].strip().split(" ")
+    query = jieba.cut(line[0].strip())
+    pos = jieba.cut(line[1].strip())
     label = line[2]
     query_token = [0] * (len(word_dict) + 1)
     for word in query:
diff --git a/models/match/dssm/model.py b/models/match/dssm/model.py
@@ -29,6 +29,7 @@ def _init_hyper_parameters(self):
         self.hidden_acts = envs.get_global_env("hyper_parameters.fc_acts")
         self.learning_rate = envs.get_global_env(
             "hyper_parameters.learning_rate")
+        self.slice_end = envs.get_global_env("hyper_parameters.slice_end")
 
     def input_data(self, is_infer=False, **kwargs):
         query = fluid.data(
@@ -94,7 +95,7 @@ def fc(data, hidden_layers, hidden_acts, names):
         prob = fluid.layers.softmax(concat_Rs, axis=1)
 
         hit_prob = fluid.layers.slice(
-            prob, axes=[0, 1], starts=[0, 0], ends=[8, 1])
+            prob, axes=[0, 1], starts=[0, 0], ends=[self.slice_end, 1])
         loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
         avg_cost = fluid.layers.mean(x=loss)
         self._cost = avg_cost
diff --git a/models/match/dssm/readme.md b/models/match/dssm/readme.md
@@ -4,11 +4,12 @@
 
 ```
 ├── data #样例数据
-	├── train
-		├── train.txt #训练数据样例
-	├── test
-    	├── test.txt #测试数据样例
-	├── preprocess.py #数据处理程序
+    ├── train
+        ├── train.txt #训练数据样例
+    ├── test
+        ├── test.txt #测试数据样例
+    ├── preprocess.py #数据处理程序
+    ├── data_process.sh #数据一键处理脚本
 ├── __init__.py
 ├── README.md #文档
 ├── model.py #模型文件
@@ -46,13 +47,19 @@ Query 和 Doc 的语义相似性可以用这两个向量的 cosine 距离表示
 <p>
 
 ## 数据准备
-我们公开了自建的测试集，包括百度知道、ECOM、QQSIM、UNICOM 四个数据集。这里我们选取百度知道数据集来进行训练。执行以下命令可以获取上述数据集。
+BQ是一个智能客服中文问句匹配数据集，该数据集是自动问答系统语料，共有120,000个句子对，并标注了每个句子对的相似度值。数据中存在错别字、语法不规范等问题，但更加贴近工业场景。执行以下命令可以获取上述数据集。
 ```
-wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
-tar xzf simnet_dataset-1.0.0.tar.gz
-rm simnet_dataset-1.0.0.tar.gz
+wget https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz
+tar xzf dssm%2Fbq.tar.gz
+rm -f dssm%2Fbq.tar.gz
+```
+数据集样例：
+```
+请问一天是否都是限定只能转入或转出都是五万。    微众多少可以赎回短期理财        0
+微粒咨询电话号码多少    你们的人工客服电话是多少        1
+已经在银行换了新预留号码。      我现在换了电话号码，这个需要更换吗      1
+每个字段以tab键分隔，第1，2列表示两个文本。第3列表示类别（0或1，0表示两个文本不相似，1表示两个文本相似）。
 ```
-
 ## 运行环境
 PaddlePaddle>=1.7.2
 
@@ -120,21 +127,24 @@ PaddleRec Finish
 2. 在data目录下载并解压数据集，命令如下：  
 ``` 
 cd data
-wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
-tar xzf simnet_dataset-1.0.0.tar.gz
-rm simnet_dataset-1.0.0.tar.gz
+wget https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz
+tar xzf dssm%2Fbq.tar.gz
+rm -f dssm%2Fbq.tar.gz
 ```
-3. 本文提供了快速将数据集中的汉字数据处理为可训练格式数据的脚本，您在解压数据集后，可以看见目录中存在一个名为zhidao的文件。然后能可以在python3环境下运行我们提供的preprocess.py文件。即可生成可以直接用于训练的数据目录test.txt,train.txt和label.txt。将其放入train和test目录下以备训练时调用。命令如下：
+3. 本文提供了快速将数据集中的汉字数据处理为可训练格式数据的脚本。您在解压数据集后，可以看见目录中存在一个名为bq的目录。将其中的train.txt文件移动到data目录下并重命名为raw_data.txt，然后可以在python3环境下运行我们提供的preprocess.py文件（该脚本依赖jieba分词库，请先安装），即可生成可以直接用于训练的数据文件test.txt、train.txt和label.txt。将train.txt和test.txt分别放入big_train和big_test目录下以备训练时调用。生成时间较长，请耐心等待。命令如下：
 ```
-mv data/zhidao ./
-rm -rf data
+mv bq/train.txt ./raw_data.txt
 python3 preprocess.py
-rm -f ./train/train.txt
-mv train.txt ./train
-rm -f ./test/test.txt
-mv test.txt test
+mkdir big_train
+mv train.txt ./big_train
+mkdir big_test
+mv test.txt ./big_test
 cd ..
 ```
+也可以在data目录下使用我们提供的一键数据处理脚本data_process.sh，一次性完成上述下载与处理步骤：
+```
+sh data_process.sh
+```
 经过预处理的格式：  
 训练集为三个稀疏的BOW方式的向量：query,pos,neg  
 测试集为两个稀疏的BOW方式的向量：query,pos  
@@ -144,8 +154,9 @@ label.txt中对应的测试集中的标签
 
 将workspace改为您当前的绝对路径。（可用pwd命令获取绝对路径）  
 将dataset_train中的batch_size从8改为128
-将文件model.py中的 hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[8, 1])  
-    改为hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[128, 1]).当您需要改变batchsize的时候，end中第一个参数也需要随之变化
+将hyper_parameters中的slice_end从8改为128。slice_end需要与batch_size保持一致，当您改变batch_size的时候，这个参数也需要随之变化
+将dataset_train中的data_path改为{workspace}/data/big_train
+将dataset_infer中的data_path改为{workspace}/data/big_test
 
 5.  执行脚本，开始训练.脚本会运行python -m paddlerec.run -m ./config.yaml启动训练，并将结果输出到result文件中。然后启动transform.py整合数据，最后计算出正逆序指标：
 ```
@@ -155,26 +166,14 @@ sh run.sh
 输出结果示例：
 ```
 ................run.................
-!!! The CPU_NUM is not specified, you should set CPU_NUM in the environment variable list.
-CPU_NUM indicates that how many CPUPlace are used in the current task.
-And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.
-
-export CPU_NUM=32 # for example, set CPU_NUM as number of physical CPU core which is 32.
-
-!!! The default number of CPU_NUM=1.
-I0821 07:16:04.512531 32200 parallel_executor.cc:440] The Program will be executed on CPU using ParallelExecutor, 1 cards are used, so 1 programs are executed in parallel.
-I0821 07:16:04.515708 32200 build_strategy.cc:365] SeqOnlyAllReduceOps:0, num_trainers:1
-I0821 07:16:04.518872 32200 parallel_executor.cc:307] Inplace strategy is enabled, when build_strategy.enable_inplace = True
-I0821 07:16:04.520995 32200 parallel_executor.cc:375] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
-75
-pnr: 2.25581395349
-query_num: 11
-pair_num: 184 184
-equal_num: 44
-正序率： 0.692857142857
-97 43
-```
-6. 提醒：因为采取较小的数据集进行训练和测试，得到指标的浮动程度会比较大。如果得到的指标不合预期，可以多次执行步骤5，即可获得合理的指标。
+8989
+pnr:2.75621659307
+query_num:1369
+pair_num:16240 , 16240
+equal_num:77
+正序率: 0.733774670544
+pos_num: 11860 , neg_num: 4303
+```
 
 ## 进阶使用
   
diff --git a/models/match/dssm/run.sh b/models/match/dssm/run.sh
@@ -13,7 +13,7 @@
 # limitations under the License.
 #!/bin/bash
 echo "................run................."
-python -m paddlerec.run -m ./config.yaml >result1.txt
+python -m paddlerec.run -m ./config.yaml &> result1.txt
 grep -i "query_doc_sim" ./result1.txt >./result2.txt
 sed '$d' result2.txt >result.txt
 rm -f result1.txt
diff --git a/models/match/dssm/transform.py b/models/match/dssm/transform.py
@@ -32,13 +32,13 @@
 sim = []
 for line in open(filename):
     line = line.strip().split(",")
-    line[1] = line[1].split(":")
-    line = line[1][1].strip(" ")
+    line[3] = line[3].split(":")
+    line = line[3][1].strip(" ")
     line = line.strip("[")
     line = line.strip("]")
     sim.append(float(line))
 
-filename = './data/test/test.txt'
+filename = './data/big_test/test.txt'
 f = open(filename, "r")
 f.readline()
 query = []
diff --git a/models/match/match-pyramid/run.sh b/models/match/match-pyramid/run.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 echo "................run................."
-python -m paddlerec.run -m ./config.yaml >result1.txt
+python -m paddlerec.run -m ./config.yaml &>result1.txt
 grep -i "prediction" ./result1.txt >./result.txt
 rm -f result1.txt
 python eval.py
diff --git a/models/match/multiview-simnet/config.yaml b/models/match/multiview-simnet/config.yaml
@@ -26,19 +26,19 @@ dataset:
   batch_size: 1
   type: DataLoader # or QueueDataset
   data_path: "{workspace}/data/test"
-  sparse_slots: "1 2"
+  sparse_slots: "0 1"
 
 # hyper parameters of user-defined network
 hyper_parameters:
   optimizer:
     class: Adam
-    learning_rate: 0.0001
-    strategy: async
+    learning_rate: 0.001
+    strategy: sync
   query_encoder: "gru"
   title_encoder: "gru"
   query_encode_dim: 128
   title_encode_dim: 128
-  sparse_feature_dim: 1439
+  sparse_feature_dim: 6327
   embedding_dim: 128
   hidden_size: 128
   margin: 0.1
diff --git a/models/match/multiview-simnet/data/data_process.sh b/models/match/multiview-simnet/data/data_process.sh
@@ -0,0 +1,24 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#!/bin/bash
+
+wget https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz
+tar xzf dssm%2Fbq.tar.gz
+rm -f dssm%2Fbq.tar.gz
+mv bq/train.txt ./raw_data.txt
+python3 preprocess.py
+mkdir big_train
+mv train.txt ./big_train
+mkdir big_test
+mv test.txt ./big_test
diff --git a/models/match/multiview-simnet/data/preprocess.py b/models/match/multiview-simnet/data/preprocess.py
diff --git a/models/match/multiview-simnet/readme.md b/models/match/multiview-simnet/readme.md
diff --git a/models/match/multiview-simnet/run.sh b/models/match/multiview-simnet/run.sh
diff --git a/models/match/multiview-simnet/transform.py b/models/match/multiview-simnet/transform.py
diff --git a/tools/cal_pos_neg.py b/tools/cal_pos_neg.py