
Commit 98c9498

Merge pull request #223 from yinhaofeng/change-dataset
change simnet and dssm data
2 parents 777be5a + 406f26a commit 98c9498

22 files changed: +535 −398 lines


models/contentunderstanding/tagspace/data/text2paddle.py

Lines changed: 17 additions & 16 deletions
@@ -18,6 +18,7 @@
 import os
 import csv
 import re
+import io
 import sys
 if six.PY2:
     reload(sys)
@@ -45,11 +46,11 @@ def build_dict(column_num=2, min_word_freq=0, train_dir="", test_dir=""):
     word_freq = collections.defaultdict(int)
     files = os.listdir(train_dir)
     for fi in files:
-        with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
+        with io.open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
             word_freq = word_count(column_num, f, word_freq)
     files = os.listdir(test_dir)
     for fi in files:
-        with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
+        with io.open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
             word_freq = word_count(column_num, f, word_freq)

     word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq]
@@ -65,51 +66,51 @@ def write_paddle(text_idx, tag_idx, train_dir, test_dir, output_train_dir,
     if not os.path.exists(output_train_dir):
         os.mkdir(output_train_dir)
     for fi in files:
-        with open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
-            with open(
+        with io.open(os.path.join(train_dir, fi), "r", encoding='utf-8') as f:
+            with io.open(
                     os.path.join(output_train_dir, fi), "w",
                     encoding='utf-8') as wf:
                 data_file = csv.reader(f)
                 for row in data_file:
                     tag_raw = re.split(r'\W+', row[0].strip())
                     pos_index = tag_idx.get(tag_raw[0])
-                    wf.write(str(pos_index) + ",")
+                    wf.write(u"{},".format(str(pos_index)))
                     text_raw = re.split(r'\W+', row[2].strip())
                     l = [text_idx.get(w) for w in text_raw]
                     for w in l:
-                        wf.write(str(w) + " ")
-                    wf.write("\n")
+                        wf.write(u"{} ".format(str(w)))
+                    wf.write(u"\n")

     files = os.listdir(test_dir)
     if not os.path.exists(output_test_dir):
         os.mkdir(output_test_dir)
     for fi in files:
-        with open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
-            with open(
+        with io.open(os.path.join(test_dir, fi), "r", encoding='utf-8') as f:
+            with io.open(
                     os.path.join(output_test_dir, fi), "w",
                     encoding='utf-8') as wf:
                 data_file = csv.reader(f)
                 for row in data_file:
                     tag_raw = re.split(r'\W+', row[0].strip())
                     pos_index = tag_idx.get(tag_raw[0])
-                    wf.write(str(pos_index) + ",")
+                    wf.write(u"{},".format(str(pos_index)))
                     text_raw = re.split(r'\W+', row[2].strip())
                     l = [text_idx.get(w) for w in text_raw]
                     for w in l:
-                        wf.write(str(w) + " ")
-                    wf.write("\n")
+                        wf.write(u"{} ".format(str(w)))
+                    wf.write(u"\n")


 def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir,
                 output_vocab_text, output_vocab_tag):
     print("start constuct word dict")
     vocab_text = build_dict(2, 0, train_dir, test_dir)
-    with open(output_vocab_text, "w", encoding='utf-8') as wf:
-        wf.write(str(len(vocab_text)) + "\n")
+    with io.open(output_vocab_text, "w", encoding='utf-8') as wf:
+        wf.write(u"{}\n".format(str(len(vocab_text))))

     vocab_tag = build_dict(0, 0, train_dir, test_dir)
-    with open(output_vocab_tag, "w", encoding='utf-8') as wf:
-        wf.write(str(len(vocab_tag)) + "\n")
+    with io.open(output_vocab_tag, "w", encoding='utf-8') as wf:
+        wf.write(u"{}\n".format(str(len(vocab_tag))))

     print("construct word dict done\n")
     write_paddle(vocab_text, vocab_tag, train_dir, test_dir, output_train_dir,
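Context for the open → io.open change: in Python 2 the built-in open() has no encoding argument, so the previous code only ran on Python 3. A minimal sketch of the pattern the patch adopts (the file path and helper names below are illustrative, not from the repo):

```
# io.open accepts encoding= on both Python 2 and 3; the built-in open() only
# does so on Python 3, which is why the diff switches every call to io.open.
import io

def read_utf8(path):
    with io.open(path, "r", encoding="utf-8") as f:
        return f.read()          # unicode text on both Python versions

def append_utf8(path, text):
    # Writing unicode literals (u"...") avoids Python 2's implicit ASCII
    # encoding, matching the new wf.write(u"{},".format(...)) calls above.
    with io.open(path, "a", encoding="utf-8") as f:
        f.write(u"{}\n".format(text))
```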

models/match/dssm/config.yaml

Lines changed: 7 additions & 6 deletions
@@ -29,11 +29,12 @@ dataset:

 hyper_parameters:
   optimizer:
-    class: sgd
+    class: adam
     learning_rate: 0.001
-    strategy: async
-  trigram_d: 1439
+    strategy: sync
+  trigram_d: 2900
   neg_num: 1
+  slice_end: 8
   fc_sizes: [300, 300, 128]
   fc_acts: ['tanh', 'tanh', 'tanh']

@@ -44,7 +45,7 @@ runner:
 - name: train_runner
   class: train
   # num of epochs
-  epochs: 3
+  epochs: 1
   # device to run training or infer
   device: cpu
   save_checkpoint_interval: 1 # save model interval of epochs
@@ -54,14 +55,14 @@ runner:
   save_inference_feed_varnames: ["query", "doc_pos"] # feed vars of save inference
   save_inference_fetch_varnames: ["cos_sim_0.tmp_0"] # fetch vars of save inference
   init_model_path: "" # load model path
-  print_interval: 2
+  print_interval: 10
   phases: phase1
 - name: infer_runner
   class: infer
   # device to run training or infer
   device: cpu
   print_interval: 1
-  init_model_path: "increment/2" # load model path
+  init_model_path: "increment/0" # load model path
   phases: phase2

 # runner will run all the phase in each epoch
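A note on trigram_d: it must equal the length of the bag-of-words vectors produced by preprocess.py, i.e. len(word_dict) + 1 (2900 for the bundled sample data; the readme change below says to use 5913 for the full BQ training file). A hedged sketch that mirrors the preprocessing logic to recompute the value from raw_data.txt; this is not a script from the repo:

```
# Recompute trigram_d the same way preprocess.py builds its vocabulary.
import jieba

word_dict = {}
with open("./raw_data.txt", "r") as f:
    for line in f:
        cols = line.strip().split("\t")
        for word in jieba.cut(cols[0] + " " + cols[1]):
            if word not in word_dict:
                word_dict[word] = len(word_dict) + 1

# BOW vectors are allocated as [0] * (len(word_dict) + 1), so:
print("hyper_parameters.trigram_d should be", len(word_dict) + 1)
```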
models/match/dssm/data/data_process.sh

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#!/bin/bash
+
+wget https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz
+tar xzf dssm%2Fbq.tar.gz
+rm -f dssm%2Fbq.tar.gz
+mv bq/train.txt ./raw_data.txt
+python3 preprocess.py
+mkdir big_train
+mv train.txt ./big_train
+mkdir big_test
+mv test.txt ./big_test
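If wget is not available, the download-and-extract portion of the script can be reproduced from Python. The sketch below is an assumed equivalent (URL and file names come from the script above); the remaining steps, running preprocess.py and moving the outputs into big_train/ and big_test/, follow the script unchanged:

```
# Hedged Python 3 stand-in for the wget/tar/mv lines of data_process.sh.
import os
import tarfile
import urllib.request

url = "https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz"
archive = "bq.tar.gz"

urllib.request.urlretrieve(url, archive)      # download the BQ corpus
with tarfile.open(archive, "r:gz") as tar:    # unpack the bq/ directory
    tar.extractall()
os.remove(archive)
os.rename(os.path.join("bq", "train.txt"), "raw_data.txt")
# Next: python3 preprocess.py, then move train.txt / test.txt into
# big_train/ and big_test/, exactly as the shell script does.
```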

models/match/dssm/data/preprocess.py

Lines changed: 18 additions & 13 deletions
@@ -1,3 +1,4 @@
+#encoding=utf-8
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -11,29 +12,30 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#encoding=utf-8

 import os
 import sys
+import jieba
 import numpy as np
 import random

-f = open("./zhidao", "r")
+f = open("./raw_data.txt", "r")
 lines = f.readlines()
 f.close()

 #建立字典 (build the word dict)
 word_dict = {}
 for line in lines:
     line = line.strip().split("\t")
-    text = line[0].split(" ") + line[1].split(" ")
+    text = line[0].strip("") + " " + line[1].strip("")
+    text = jieba.cut(text)
     for word in text:
         if word in word_dict:
             continue
         else:
             word_dict[word] = len(word_dict) + 1

-f = open("./zhidao", "r")
+f = open("./raw_data.txt", "r")
 lines = f.readlines()
 f.close()

@@ -57,12 +59,13 @@
         else:
             pos_dict[line[0]] = [line[1]]

+print("build dict done")
 #划分训练集和测试集 (split into train and test sets)
 query_list = list(pos_dict.keys())
-#print(len(query))
-random.shuffle(query_list)
-train_query = query_list[:90]
-test_query = query_list[90:]
+#print(len(query_list))
+#random.shuffle(query_list)
+train_query = query_list[:11600]
+test_query = query_list[11600:]

 #获得训练集 (build the training set)
 train_set = []
@@ -73,6 +76,7 @@
         for neg in neg_dict[query]:
             train_set.append([query, pos, neg])
 random.shuffle(train_set)
+print("get train_set done")

 #获得测试集 (build the test set)
 test_set = []
@@ -84,13 +88,14 @@
         for neg in neg_dict[query]:
             test_set.append([query, neg, 0])
 random.shuffle(test_set)
+print("get test_set done")

 #训练集中的query,pos,neg转化为词袋 (convert the training-set query, pos and neg into bag-of-words vectors)
 f = open("train.txt", "w")
 for line in train_set:
-    query = line[0].strip().split(" ")
-    pos = line[1].strip().split(" ")
-    neg = line[2].strip().split(" ")
+    query = jieba.cut(line[0].strip())
+    pos = jieba.cut(line[1].strip())
+    neg = jieba.cut(line[2].strip())
     query_token = [0] * (len(word_dict) + 1)
     for word in query:
         query_token[word_dict[word]] = 1
@@ -109,8 +114,8 @@
 f = open("test.txt", "w")
 fa = open("label.txt", "w")
 for line in test_set:
-    query = line[0].strip().split(" ")
-    pos = line[1].strip().split(" ")
+    query = jieba.cut(line[0].strip())
+    pos = jieba.cut(line[1].strip())
     label = line[2]
     query_token = [0] * (len(word_dict) + 1)
     for word in query:
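The switch from split(" ") to jieba.cut reflects the new raw data: the old zhidao file was already word-segmented, while the BQ sentences are unsegmented Chinese text. A small sketch of the resulting bag-of-words encoding (the sentence is taken from the dataset sample quoted in the readme change below; jieba.cut returns a generator, so it is called once per pass):

```
# Sketch of the BOW encoding used in preprocess.py after the switch to jieba.
import jieba

sentence = u"你们的人工客服电话是多少"

# Build a word -> index map, as preprocess.py does over the whole corpus.
word_dict = {}
for word in jieba.cut(sentence):
    if word not in word_dict:
        word_dict[word] = len(word_dict) + 1

# One-hot bag-of-words vector of length len(word_dict) + 1 (index 0 unused),
# matching query_token / pos_token / neg_token in the script.
token = [0] * (len(word_dict) + 1)
for word in jieba.cut(sentence):
    token[word_dict[word]] = 1
```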

models/match/dssm/data/test/test.txt

Lines changed: 100 additions & 100 deletions
Large diffs are not rendered by default.

models/match/dssm/data/train/train.txt

Lines changed: 200 additions & 128 deletions
Large diffs are not rendered by default.

models/match/dssm/model.py

Lines changed: 2 additions & 1 deletion
@@ -29,6 +29,7 @@ def _init_hyper_parameters(self):
         self.hidden_acts = envs.get_global_env("hyper_parameters.fc_acts")
         self.learning_rate = envs.get_global_env(
             "hyper_parameters.learning_rate")
+        self.slice_end = envs.get_global_env("hyper_parameters.slice_end")

     def input_data(self, is_infer=False, **kwargs):
         query = fluid.data(
@@ -94,7 +95,7 @@ def fc(data, hidden_layers, hidden_acts, names):
         prob = fluid.layers.softmax(concat_Rs, axis=1)

         hit_prob = fluid.layers.slice(
-            prob, axes=[0, 1], starts=[0, 0], ends=[8, 1])
+            prob, axes=[0, 1], starts=[0, 0], ends=[self.slice_end, 1])
         loss = -fluid.layers.reduce_sum(fluid.layers.log(hit_prob))
         avg_cost = fluid.layers.mean(x=loss)
         self._cost = avg_cost
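What the slice computes: prob has shape [batch_size, 1 + neg_num] after the softmax over the positive and negative cosine similarities, and column 0 holds the probability assigned to the positive doc, so the slice keeps that column for the first slice_end rows. This is why slice_end must track the batch size (8 in the sample config, 128 for the full data, as the readme change below notes). A numpy illustration under those assumed shapes, not the Paddle graph itself:

```
# Hedged numpy illustration of hit_prob and the loss above.
import numpy as np

batch_size, neg_num = 8, 1
concat_Rs = np.random.rand(batch_size, 1 + neg_num)                 # cosine similarities
prob = np.exp(concat_Rs) / np.exp(concat_Rs).sum(1, keepdims=True)  # softmax over axis 1

slice_end = batch_size                      # must equal the batch size
hit_prob = prob[0:slice_end, 0:1]           # same region as the fluid.layers.slice call
loss = -np.sum(np.log(hit_prob))            # negative log-likelihood of the positives
avg_cost = np.mean(loss)                    # scalar, as fluid.layers.mean(x=loss)
```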

models/match/dssm/readme.md

Lines changed: 42 additions & 42 deletions
@@ -4,11 +4,12 @@

 ```
 ├── data # sample data
-    ├── train
-        ├── train.txt # sample training data
-    ├── test
-        ├── test.txt # sample test data
-    ├── preprocess.py # data preprocessing script
+    ├── train
+        ├── train.txt # sample training data
+    ├── test
+        ├── test.txt # sample test data
+    ├── preprocess.py # data preprocessing script
+    ├── data_process # one-click data processing script
 ├── __init__.py
 ├── README.md # documentation
 ├── model.py # model file
@@ -46,13 +47,19 @@ The semantic similarity between Query and Doc can be measured by the cosine distance of these two vectors.
 <p>

 ## Data preparation
-We released our own test collection, covering four datasets: Baidu Zhidao, ECOM, QQSIM and UNICOM. Here we pick the Baidu Zhidao dataset for training. Run the following commands to download the dataset.
+BQ is a Chinese question-matching dataset for intelligent customer service, drawn from an automatic question answering corpus. It contains 120,000 sentence pairs annotated with similarity labels. The data contains typos and non-standard grammar, which makes it closer to real industrial scenarios. Run the following commands to download the dataset.
 ```
-wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
-tar xzf simnet_dataset-1.0.0.tar.gz
-rm simnet_dataset-1.0.0.tar.gz
+wget https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz
+tar xzf dssm%2Fbq.tar.gz
+rm -f dssm%2Fbq.tar.gz
+```
+Dataset sample:
+```
+请问一天是否都是限定只能转入或转出都是五万。 微众多少可以赎回短期理财 0
+微粒咨询电话号码多少 你们的人工客服电话是多少 1
+已经在银行换了新预留号码。 我现在换了电话号码,这个需要更换吗 1
+Fields are tab-separated: columns 1 and 2 are the two texts, and column 3 is the label (0 means the two texts are dissimilar, 1 means they are similar).
 ```
-
 ## Runtime environment
 PaddlePaddle>=1.7.2

@@ -120,21 +127,24 @@ PaddleRec Finish
 2. Download and extract the dataset in the data directory:
 ```
 cd data
-wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/simnet_dataset-1.0.0.tar.gz
-tar xzf simnet_dataset-1.0.0.tar.gz
-rm simnet_dataset-1.0.0.tar.gz
+wget https://paddlerec.bj.bcebos.com/dssm%2Fbq.tar.gz
+tar xzf dssm%2Fbq.tar.gz
+rm -f dssm%2Fbq.tar.gz
 ```
-3. A script is provided to quickly convert the Chinese text in the dataset into a trainable format. After extracting the dataset you will see a file named zhidao in the directory. Run the provided preprocess.py under Python 3 to generate test.txt, train.txt and label.txt, which can be used for training directly. Move them into the train and test directories so they can be read at training time. Commands:
+3. A script is provided to quickly convert the Chinese text in the dataset into a trainable format. After extracting the dataset you will see a directory named bq. Move its train.txt into the data directory, then run the provided preprocess.py under Python 3 to generate test.txt, train.txt and label.txt, which can be used for training directly. Move them into the train and test directories so they can be read at training time. Generation takes a while; please be patient. Commands:
 ```
-mv data/zhidao ./
-rm -rf data
+mv bq/train.txt ./raw_data.txt
 python3 preprocess.py
-rm -f ./train/train.txt
-mv train.txt ./train
-rm -f ./test/test.txt
-mv test.txt test
+mkdir big_train
+mv train.txt ./big_train
+mkdir big_test
+mv test.txt ./big_test
 cd ..
 ```
+You can also use the provided one-click data processing script data_process.sh:
+```
+sh data_process.sh
+```
 Format after preprocessing:
 The training set is three sparse BOW vectors: query, pos, neg
 The test set is two sparse BOW vectors: query, pos
@@ -144,8 +154,10 @@ label.txt holds the labels for the test set

 Change workspace to your current absolute path (you can get it with pwd).
 Change batch_size in dataset_train from 8 to 128.
-Change hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[8, 1]) in model.py
-to hit_prob = fluid.layers.slice(prob, axes=[0, 1], starts=[0, 0], ends=[128, 1]). Whenever you change the batch size, the first value in ends must change with it.
+Change slice_end in hyper_parameters from 8 to 128. Whenever you change the batch size, this parameter must change with it.
+Change data_path in dataset_train to {workspace}/data/big_train.
+Change data_path in dataset_infer to {workspace}/data/big_test.
+Change trigram_d in hyper_parameters to 5913.

 5. Run the script to start training. It runs python -m paddlerec.run -m ./config.yaml, writes the results to the result file, then runs transform.py to reorganize the data, and finally computes the positive/negative order metrics:
 ```
@@ -155,26 +167,14 @@ sh run.sh
 Sample output:
 ```
 ................run.................
-!!! The CPU_NUM is not specified, you should set CPU_NUM in the environment variable list.
-CPU_NUM indicates that how many CPUPlace are used in the current task.
-And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.
-
-export CPU_NUM=32 # for example, set CPU_NUM as number of physical CPU core which is 32.
-
-!!! The default number of CPU_NUM=1.
-I0821 07:16:04.512531 32200 parallel_executor.cc:440] The Program will be executed on CPU using ParallelExecutor, 1 cards are used, so 1 programs are executed in parallel.
-I0821 07:16:04.515708 32200 build_strategy.cc:365] SeqOnlyAllReduceOps:0, num_trainers:1
-I0821 07:16:04.518872 32200 parallel_executor.cc:307] Inplace strategy is enabled, when build_strategy.enable_inplace = True
-I0821 07:16:04.520995 32200 parallel_executor.cc:375] Garbage collection strategy is enabled, when FLAGS_eager_delete_tensor_gb = 0
-75
-pnr: 2.25581395349
-query_num: 11
-pair_num: 184 184
-equal_num: 44
-正序率: 0.692857142857
-97 43
-```
-6. Note: because a small dataset is used for training and testing, the metrics can fluctuate a lot. If the results are not as expected, repeat step 5 several times to obtain reasonable numbers.
+8989
+pnr:2.75621659307
+query_num:1369
+pair_num:16240 , 16240
+equal_num:77
+正序率: 0.733774670544
+pos_num: 11860 , neg_num: 4303
+```

 ## Advanced usage

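For reference, the reported metrics come from pairwise comparisons: for every (positive, negative) document pair under the same query, count whether the model scores the positive higher (pos_num), lower (neg_num), or equal (equal_num); then pnr = pos_num / neg_num and the positive-order rate (正序率) = pos_num / (pos_num + neg_num), which matches the numbers above (11860 / 4303 ≈ 2.756, 11860 / 16163 ≈ 0.7338). A hedged sketch of that aggregation; the repo's transform.py may differ in input format:

```
# Sketch of the pnr / positive-order-rate computation; the (query_id, score,
# label) row format is an assumption, not the exact output of transform.py.
from collections import defaultdict
from itertools import product

def pair_metrics(rows):
    """rows: iterable of (query_id, score, label), where label 1 = relevant doc."""
    by_query = defaultdict(lambda: ([], []))          # query -> (pos scores, neg scores)
    for qid, score, label in rows:
        by_query[qid][0 if label == 1 else 1].append(score)

    pos_num = neg_num = equal_num = 0
    for pos_scores, neg_scores in by_query.values():
        for p, n in product(pos_scores, neg_scores):  # every pos/neg pair per query
            if p > n:
                pos_num += 1
            elif p < n:
                neg_num += 1
            else:
                equal_num += 1

    pnr = pos_num / neg_num if neg_num else float("inf")
    order_rate = pos_num / (pos_num + neg_num) if pos_num + neg_num else 0.0
    return pnr, order_rate, pos_num, neg_num, equal_num
```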