Skip to content

Commit fec3bde

Browse files
authored
Merge pull request #386 from seemingwang/master
naml static model for cross_entropy test
2 parents f24bf7e + 8a59638 commit fec3bde

File tree

5 files changed

+173
-47
lines changed

5 files changed

+173
-47
lines changed

models/rank/naml/README.md

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,11 @@ n篇用户浏览过的文章的article embedding向量组将再次通过attentio
5151
每篇文章用一行表示,存放在一个或多个以article{number}.txt为后缀的文件里,如article.txt, article3.txt
5252
每行的格式为:
5353
文章id 主类id 子类id 分词后的文章标题id 分词后的文章单词id
54-
以上5项用分号分割,id均为自然数,分词后的文章标题id 和 分词后的文章单词id 都用空格做分隔符
54+
以上5项用tab符号分割,id均为自然数,分词后的文章标题id 和 分词后的文章单词id 都用空格做分隔符
5555
另外还需要收集用户的浏览记录,存放在一个或多个以browse{number}.txt为后缀的文件里,如browse.txt, browse3.txt
5656
每个用户的单次浏览序列用一行表示,格式为:
5757
浏览过的文章id序列 接下来浏览过的文章id 接下来没浏览的文章id序列
58-
以上3项用分号分割,id序列之间用空格分割,接下来没浏览的文章id序列如果没有实际数据,可以采用负采样生成,
58+
以上3项用tab符号分割,id序列之间用空格分割,接下来没浏览的文章id序列如果没有实际数据,可以采用负采样生成,
5959
但是没浏览的序列id个数建议大于等于yaml配置文件中的neg_candidate_size
6060

6161
在模型目录的data/sample_data目录下为您准备了快速运行的示例数据
@@ -97,7 +97,7 @@ python3 -u ../../../tools/infer.py -m config.yaml
9797
为了方便使用者能够快速的跑通每一个模型,我们在每个模型下都提供了样例数据。
9898
同时,我们提供了全量数据生成的脚本,将会自动下载microsoft news dataset全量数据集并转换为模型能接受的
9999
输入格式,执行方法如下:
100-
PaddleRec/datasets/MIND/data
100+
进入路径PaddleRec/datasets/MIND/data
101101
执行 sh run.sh
102102
脚本运行完成后,打开dict/yaml_info.txt,将其中的词向量大小,类目大小,子类目大小信息copy到config_bigdata.yaml
103103
里,替换最后3行的超参数
@@ -109,13 +109,20 @@ PaddleRec/datasets/MIND/data
109109
python3 -u ../../../tools/trainer.py -m config_bigdata.yaml
110110
```
111111
以下为训练2个epoch的结果
112-
| 模型 | top1 acc | batch_size | epoch_num| Time of each epoch|
112+
| 模型 | auc | batch_size | epoch_num| Time of each epoch|
113113
| :------| :------ | :------ | :------| :------ |
114-
| naml | 0.72 | 50 | 3 | 约4小时 |
114+
| naml | 0.66 | 50 | 3 | 约4小时 |
115115

116116
预测
117117
```
118118
python3 -u ../../../tools/infer.py -m config_bigdata.yaml
119119
```
120120

121121
期待预测auc为0.66
122+
123+
124+
单机多卡执行方式(以训练为例)
125+
python3 -m paddle.distributed.launch ../../../tools/trainer.py -m config_bigdata.yaml
126+
在此情况下将使用单机上所有gpu卡,若需要指定部分gpu卡执行,可以通过设置环境变量CUDA_VISIBLE_DEVICES
127+
来实现。例如单机上有8张卡,只打算用前4张卡训练,可以设置export CUDA_VISIBLE_DEVICES=0,1,2,3
128+
再执行训练脚本即可。

models/rank/naml/config_bigdata.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
runner:
1616
train_data_dir: "../../../datasets/MIND/data/train"
1717
train_reader_path: "NAMLDataReader" # importlib format
18-
use_gpu: False
18+
use_gpu: True
1919
train_batch_size: 50
2020
epochs: 2
2121
print_interval: 10

models/rank/naml/dygraph_model.py

Lines changed: 27 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -19,19 +19,20 @@
1919
import numpy as np
2020

2121
import net
22+
import sys
2223

2324

2425
class DygraphModel():
25-
def __init__(self):
26-
self.bucket = 100000
27-
self.absolute_limt = 200.0
28-
29-
def rescale(self, number):
30-
if number > self.absolute_limt:
31-
number = self.absolute_limt
32-
elif number < -self.absolute_limt:
33-
number = -self.absolute_limt
34-
return (number + self.absolute_limt) / (self.absolute_limt * 2 + 1e-8)
26+
# def __init__(self):
27+
# self.bucket = 100000
28+
# self.absolute_limt = 200.0
29+
#
30+
# def rescale(self, number):
31+
# if number > self.absolute_limt:
32+
# number = self.absolute_limt
33+
# elif number < -self.absolute_limt:
34+
# number = -self.absolute_limt
35+
# return (number + self.absolute_limt) / (self.absolute_limt * 2 + 1e-8)
3536

3637
# define model
3738
def create_model(self, config):
@@ -54,7 +55,7 @@ def create_model(self, config):
5455
# define feeds which convert numpy of batch data to paddle.tensor
5556
def create_feeds(self, batch, config):
5657
label = batch[0]
57-
return label, batch[1:], None
58+
return label, batch[1:]
5859

5960
# define loss function by predicts and label
6061
def create_loss(self, raw_pred, label):
@@ -78,50 +79,37 @@ def create_metrics(self):
7879
# metrics_list_name = ["acc"]
7980
# auc_metric = paddle.metric.Accuracy()
8081
metrics_list_name = ["auc"]
81-
auc_metric = paddle.metric.Auc(num_thresholds=self.bucket)
82+
auc_metric = paddle.metric.Auc()
8283
metrics_list = [auc_metric]
8384
return metrics_list, metrics_list_name
8485

8586
# construct train forward phase
8687
def train_forward(self, dy_model, metrics_list, batch_data, config):
87-
labels, sparse_tensor, dense_tensor = self.create_feeds(batch_data,
88-
config)
88+
labels, sparse_tensor = self.create_feeds(batch_data, config)
8989

90-
raw = dy_model(sparse_tensor, None)
90+
raw = dy_model(sparse_tensor)
9191

9292
loss = paddle.nn.functional.cross_entropy(
9393
input=raw, label=paddle.cast(labels, "float32"), soft_label=True)
9494

95-
scaled = raw.numpy()
96-
scaled_pre = []
97-
[rows, cols] = scaled.shape
98-
for i in range(rows):
99-
for j in range(cols):
100-
scaled_pre.append(1.0 - self.rescale(scaled[i, j]))
101-
scaled_pre.append(self.rescale(scaled[i, j]))
102-
scaled_np_predict = np.array(scaled_pre).reshape([-1, 2])
103-
metrics_list[0].update(scaled_np_predict,
104-
paddle.reshape(labels, [-1, 1]))
95+
soft_predict = paddle.nn.functional.sigmoid(
96+
paddle.reshape(raw, [-1, 1]))
97+
predict_2d = paddle.concat(x=[1 - soft_predict, soft_predict], axis=-1)
98+
labels = paddle.reshape(labels, [-1, 1])
99+
metrics_list[0].update(preds=predict_2d.numpy(), labels=labels.numpy())
105100

106101
loss = paddle.mean(loss)
107102
print_dict = None
108103
return loss, metrics_list, print_dict
109104

110105
def infer_forward(self, dy_model, metrics_list, batch_data, config):
111-
labels, sparse_tensor, dense_tensor = self.create_feeds(batch_data,
112-
config)
113-
raw = dy_model(sparse_tensor, None)
106+
labels, sparse_tensor = self.create_feeds(batch_data, config)
107+
raw = dy_model(sparse_tensor)
114108
#predict_raw = paddle.nn.functional.softmax(raw)
115109

116-
scaled = raw.numpy()
117-
scaled_pre = []
118-
[rows, cols] = scaled.shape
119-
for i in range(rows):
120-
for j in range(cols):
121-
scaled_pre.append(1.0 - self.rescale(scaled[i, j]))
122-
scaled_pre.append(self.rescale(scaled[i, j]))
123-
scaled_np_predict = np.array(scaled_pre).reshape([-1, 2])
124-
metrics_list[0].update(scaled_np_predict,
125-
paddle.reshape(labels, [-1, 1]))
126-
110+
soft_predict = paddle.nn.functional.sigmoid(
111+
paddle.reshape(raw, [-1, 1]))
112+
predict_2d = paddle.concat(x=[1 - soft_predict, soft_predict], axis=-1)
113+
labels = paddle.reshape(labels, [-1, 1])
114+
metrics_list[0].update(preds=predict_2d.numpy(), labels=labels.numpy())
127115
return metrics_list, None

models/rank/naml/net.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def __init__(self, article_content_size, article_title_size, browse_size,
144144
self.content_attention = self.make_attention_layer(
145145
"content_attention",
146146
[self.conv_out_channel_size, self.attention_projection_size])
147-
print(self.word2vec_embedding)
147+
#print(self.word2vec_embedding)
148148

149149
def make_attention_layer(self, name_base, size):
150150
row = size[0]
@@ -182,7 +182,7 @@ def func(input):
182182

183183
return func
184184

185-
def forward(self, sparse_inputs, dense_inputs):
185+
def forward(self, sparse_inputs):
186186
cate_sample, cate_visit, sub_cate_sample, sub_cate_visit, title_sample, title_visit, content_sample, content_visit = sparse_inputs[:]
187187
cate = paddle.concat([cate_sample, cate_visit], axis=-1)
188188
sub_cate = paddle.concat([sub_cate_sample, sub_cate_visit], axis=-1)

models/rank/naml/static_model.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import math
16+
import paddle
17+
18+
from net import NAMLLayer
19+
20+
21+
class StaticModel():
    """Static-graph (paddle.static) wrapper around NAMLLayer for PaddleRec.

    Builds the feed list, the forward network with cross-entropy loss and a
    streaming AUC metric, and the Adam optimizer, following the PaddleRec
    static-model contract (create_feeds / net / create_optimizer / infer_net).
    """

    def __init__(self, config):
        # `cost` / `infer_target_var` are part of the PaddleRec static-model
        # contract; `net()` fills `self._cost` and `self.inference_target_var`.
        # NOTE(review): `infer_target_var` vs `inference_target_var` looks like
        # an inconsistency — confirm which attribute the trainer tools read.
        self.cost = None
        self.infer_target_var = None
        self.config = config
        self._init_hyper_parameters()

    def _init_hyper_parameters(self):
        """Load all hyper-parameters from the yaml config into attributes."""
        # NOTE(review): "neg_condidate_sample_size" is a typo of "candidate",
        # but it must match the key used in config*.yaml — do not rename alone.
        self.article_content_size = self.config.get(
            "hyper_parameters.article_content_size")
        self.article_title_size = self.config.get(
            "hyper_parameters.article_title_size")
        self.browse_size = self.config.get("hyper_parameters.browse_size")
        self.neg_condidate_sample_size = self.config.get(
            "hyper_parameters.neg_condidate_sample_size")
        self.word_dimension = self.config.get(
            "hyper_parameters.word_dimension")
        self.category_size = self.config.get("hyper_parameters.category_size")
        self.sub_category_size = self.config.get(
            "hyper_parameters.sub_category_size")
        self.cate_dimension = self.config.get(
            "hyper_parameters.category_dimension")
        self.word_dict_size = self.config.get(
            "hyper_parameters.word_dict_size")
        self.learning_rate = self.config.get(
            "hyper_parameters.optimizer.learning_rate")
        # One positive candidate plus the negative candidates.
        self.sample_size = self.neg_condidate_sample_size + 1

    def create_feeds(self, is_infer=False):
        """Declare the static-graph input variables.

        Returns [label, cate_sample, cate_visit, sub_cate_sample,
        sub_cate_visit, title_sample, title_visit, content_sample,
        content_visit], matching the order `net()` unpacks.

        NOTE(review): the "sampe_*" feed names look like typos of "sample_*",
        but they must match the names produced by the data reader — confirm
        against the reader before renaming.
        """
        inputs = [
            paddle.static.data(
                name="sampe_cate",
                shape=[None, self.sample_size],
                dtype='int64'), paddle.static.data(
                    name="browse_cate",
                    shape=[None, self.browse_size],
                    dtype='int64'), paddle.static.data(
                        name="sampe_sub_cate",
                        shape=[None, self.sample_size],
                        dtype='int64'), paddle.static.data(
                            name="browse_sub_cate",
                            shape=[None, self.browse_size],
                            dtype='int64'),
            paddle.static.data(
                name="sampe_title",
                shape=[None, self.sample_size, self.article_title_size],
                dtype='int64'), paddle.static.data(
                    name="browse_title",
                    shape=[None, self.browse_size, self.article_title_size],
                    dtype='int64'),
            paddle.static.data(
                name="sample_article",
                shape=[None, self.sample_size, self.article_content_size],
                dtype='int64'), paddle.static.data(
                    name="browse_article",
                    shape=[None, self.browse_size, self.article_content_size],
                    dtype='int64')
        ]
        label = paddle.static.data(
            name="label", shape=[None, self.sample_size], dtype="int64")
        return [label] + inputs

    def net(self, input, is_infer=False):
        """Build the forward graph; return the fetch dict.

        Args:
            input: list from `create_feeds()` — label first, then the eight
                sparse input tensors consumed by NAMLLayer.
            is_infer: when True, only the AUC metric is built and fetched.
        """
        self.labels = input[0]
        self.sparse_inputs = input[1:]
        model = NAMLLayer(self.article_content_size, self.article_title_size,
                          self.browse_size, self.neg_condidate_sample_size,
                          self.word_dimension, self.category_size,
                          self.sub_category_size, self.cate_dimension,
                          self.word_dict_size)

        raw = model(self.sparse_inputs)

        # Per-candidate score -> [p(neg), p(pos)] pairs so paddle.static.auc
        # can consume them; labels are flattened to one column accordingly.
        soft_predict = paddle.nn.functional.sigmoid(
            paddle.reshape(raw, [-1, 1]))
        predict_2d = paddle.concat(x=[1 - soft_predict, soft_predict], axis=-1)
        labels = paddle.reshape(self.labels, [-1, 1])

        auc, batch_auc, _ = paddle.static.auc(input=predict_2d,
                                              label=labels,
                                              num_thresholds=2**12,
                                              slide_steps=20)
        self.inference_target_var = auc
        if is_infer:
            fetch_dict = {'auc': auc}
            return fetch_dict

        # Soft-label cross entropy over the candidate distribution, matching
        # the dygraph model's loss.
        cost = paddle.nn.functional.cross_entropy(
            input=raw,
            label=paddle.cast(self.labels, "float32"),
            soft_label=True)
        avg_cost = paddle.mean(x=cost)
        self._cost = avg_cost

        fetch_dict = {'cost': avg_cost, 'auc': auc}
        return fetch_dict

    def create_optimizer(self, strategy=None):
        """Create Adam and minimize `self._cost`.

        Args:
            strategy: optional fleet DistributedStrategy; when given, the
                optimizer is wrapped for distributed training.
        """
        optimizer = paddle.optimizer.Adam(
            learning_rate=self.learning_rate, lazy_mode=True)
        if strategy is not None:
            import paddle.distributed.fleet as fleet
            optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(self._cost)

    def infer_net(self, input):
        """Inference entry point: build the graph in infer mode."""
        return self.net(input, is_infer=True)

0 commit comments

Comments
 (0)