
Commit 5fabe3c

revert

1 parent 4063be5 commit 5fabe3c

6 files changed: +200 -92 lines changed

datasets/MIND/run.sh

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+wget https://paddlerec.bj.bcebos.com/datasets/MIND/bigdata.zip
+mkdir data
+unzip bigdata.zip -d ./data

models/rank/naml/NAMLDataReader.py

Lines changed: 82 additions & 24 deletions
@@ -1,6 +1,20 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import print_function
 import numpy as np
-import re,random
+import re, random
 from paddle.io import IterableDataset
 
 
@@ -16,38 +30,50 @@ def __init__(self, file_list, config):
             elif re.match('[\\S]*article[0-9]*.txt$', x) != None:
                 self.article_file_list.append(x)
         self.config = config
-        self.article_content_size = config.get("hyper_parameters.article_content_size")
-        self.article_title_size = config.get("hyper_parameters.article_title_size")
+        self.article_content_size = config.get(
+            "hyper_parameters.article_content_size")
+        self.article_title_size = config.get(
+            "hyper_parameters.article_title_size")
         self.browse_size = config.get("hyper_parameters.browse_size")
-        self.neg_condidate_sample_size = config.get("hyper_parameters.neg_condidate_sample_size")
-        self.word_dict_size = int(config.get("hyper_parameters.word_dict_size"))
+        self.neg_condidate_sample_size = config.get(
+            "hyper_parameters.neg_condidate_sample_size")
+        self.word_dict_size = int(
+            config.get("hyper_parameters.word_dict_size"))
         self.category_size = int(config.get("hyper_parameters.category_size"))
-        self.sub_category_size = int(config.get("hyper_parameters.sub_category_size"))
+        self.sub_category_size = int(
+            config.get("hyper_parameters.sub_category_size"))
         self.article_map_cate = {}
         self.article_map_title = {}
         self.article_map_content = {}
         self.article_map_sub_cate = {}
         self.init()
 
-    def convert_unk(self,id):
+    def convert_unk(self, id):
         if id in self.article_map_cate:
             return id
         return "padding"
+
     def init(self):
         self.article_map_cate["padding"] = self.category_size
         self.article_map_sub_cate["padding"] = self.sub_category_size
-        self.article_map_title["padding"] = [self.word_dict_size] * self.article_title_size
-        self.article_map_content["padding"] = [self.word_dict_size]* self.article_content_size
+        self.article_map_title["padding"] = [self.word_dict_size
+                                             ] * self.article_title_size
+        self.article_map_content["padding"] = [self.word_dict_size
+                                               ] * self.article_content_size
         #line [0]id cate_id sub_cate_id [3]title content
         for file in self.article_file_list:
-            with open(file,"r") as rf:
+            with open(file, "r") as rf:
                 for l in rf:
                     line = l.strip().split('\t')
                     id = line[0]
                     #line 0 cate 1:subcate, 2:title, 3 content;
-                    line = [[int(line[1])],[int(line[2])],[int(t) for t in line[3].split(" ")],[int(t) for t in line[4].split(" ")]]
-                    line[2] += [self.word_dict_size] * (self.article_title_size - len(line[2]))
-                    line[3] += [self.word_dict_size] * (self.article_content_size - len(line[3]))
+                    line = [[int(line[1])], [int(line[2])],
+                            [int(t) for t in line[3].split(" ")],
+                            [int(t) for t in line[4].split(" ")]]
+                    line[2] += [self.word_dict_size] * (
+                        self.article_title_size - len(line[2]))
+                    line[3] += [self.word_dict_size] * (
+                        self.article_content_size - len(line[3]))
                     self.article_map_cate[id] = line[0][0]
                     self.article_map_sub_cate[id] = line[1][0]
                     if len(line[2]) > self.article_title_size:
@@ -77,29 +103,61 @@ def __iter__(self):
             line[0] += ["unk"] * (self.browse_size - len(line[0]))
             neg_candidate = line[2]
             if len(neg_candidate) < self.neg_condidate_sample_size:
-                continue;
+                continue
             candidate = neg_candidate[:self.neg_condidate_sample_size]
             candidate.append(line[1][0])
             line[1] = []
             ids = list(range(self.neg_condidate_sample_size + 1))
             random.shuffle(ids)
             label = []
             for i in ids:
-                line[1].append(candidate[i]) #1 condidate 0:visit
+                line[1].append(candidate[i])  #1 condidate 0:visit
                 if i == self.neg_condidate_sample_size:
                     label.append(1)
                 else:
                     label.append(0)
 
             article_list = [np.array(label)]
-            # l = [self.article_map[i] for i in line[1]]
-            article_list.append(np.array([self.article_map_cate[self.convert_unk(i)] for i in line[1]]))
-            article_list.append(np.array([self.article_map_cate[self.convert_unk(i)] for i in line[0]]))
-            article_list.append(np.array([self.article_map_sub_cate[self.convert_unk(i)] for i in line[1]]))
-            article_list.append(np.array([self.article_map_sub_cate[self.convert_unk(i)] for i in line[0]]))
-            article_list.append(np.array([self.article_map_title[self.convert_unk(i)] for i in line[1]]))
-            article_list.append(np.array([self.article_map_title[self.convert_unk(i)] for i in line[0]]))
-            article_list.append(np.array([self.article_map_content[self.convert_unk(i)] for i in line[1]]))
-            article_list.append(np.array([self.article_map_content[self.convert_unk(i)] for i in line[0]]))
+            # l = [self.article_map[i] for i in line[1]]
+            article_list.append(
+                np.array([
+                    self.article_map_cate[self.convert_unk(i)]
+                    for i in line[1]
+                ]))
+            article_list.append(
+                np.array([
+                    self.article_map_cate[self.convert_unk(i)]
+                    for i in line[0]
+                ]))
+            article_list.append(
+                np.array([
+                    self.article_map_sub_cate[self.convert_unk(i)]
+                    for i in line[1]
+                ]))
+            article_list.append(
+                np.array([
+                    self.article_map_sub_cate[self.convert_unk(i)]
+                    for i in line[0]
+                ]))
+            article_list.append(
+                np.array([
+                    self.article_map_title[self.convert_unk(i)]
+                    for i in line[1]
+                ]))
+            article_list.append(
+                np.array([
+                    self.article_map_title[self.convert_unk(i)]
+                    for i in line[0]
+                ]))
+            article_list.append(
+                np.array([
+                    self.article_map_content[self.convert_unk(i)]
+                    for i in line[1]
+                ]))
+            article_list.append(
+                np.array([
+                    self.article_map_content[self.convert_unk(i)]
+                    for i in line[0]
+                ]))
             #output_list = [article_list,None]
             yield article_list
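The reader can be smoke-tested outside of tools/trainer.py. Below is a minimal sketch, assuming the class in NAMLDataReader.py is named RecDataset (PaddleRec's usual reader convention), that config_bigdata.yaml sits in the current directory, and that the MIND data has been unpacked as in run.sh above:

```python
import glob

import yaml
from paddle.io import DataLoader

from NAMLDataReader import RecDataset  # class name is an assumption


def flatten(d, prefix=""):
    # Flatten nested YAML so dict.get("hyper_parameters.browse_size") works,
    # mirroring the config.get(...) calls inside the reader.
    out = {}
    for k, v in d.items():
        key = "{}.{}".format(prefix, k) if prefix else k
        if isinstance(v, dict):
            out.update(flatten(v, key))
        else:
            out[key] = v
    return out


with open("config_bigdata.yaml") as f:
    config = flatten(yaml.safe_load(f))

file_list = glob.glob("../../../datasets/MIND/data/train/*")
dataset = RecDataset(file_list, config)
loader = DataLoader(dataset, batch_size=config["runner.train_batch_size"])

# Each batch follows the order built in __iter__: [label, cand_cate,
# browse_cate, cand_sub_cate, browse_sub_cate, cand_title, browse_title,
# cand_content, browse_content].
batch = next(iter(loader))
print([t.shape for t in batch])
```

If the shapes print without error, the reader, config, and data paths are wired up consistently.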

models/rank/naml/README.md

Lines changed: 12 additions & 9 deletions
@@ -41,6 +41,11 @@
 }
 ```
 
+NAML implements a news encoder: text convolutions extract article features, and an attention mechanism compresses them into an n-dimensional vector (the article embedding).
+The article embeddings of the n articles a user has browsed are then compressed once more, again via attention, into the final user-behavior embedding, which captures the user's behavioral features.
+The inner product of this user-behavior embedding with a new article's embedding expresses how much the user will like that article.
+
 ## Data preparation
 Training and inference for this model involve users' article-browsing histories as well as per-article details, so all articles appearing in the training and inference data must be collected first.
 Each article occupies one line, stored in one or more files whose names end in article{number}.txt, e.g. article.txt, article3.txt
@@ -91,10 +96,11 @@ python3 -u ../../../tools/infer.py -m config.yaml
 ## Reproducing the results
 To help users quickly run every model end to end, we provide sample data under each model.
 We have also preprocessed a medium-sized Microsoft News Dataset (MIND), which can be downloaded from https://paddlerec.bj.bcebos.com/datasets/MIND/bigdata.zip,
-then unzipped into the data directory,
-and run:
+then unzipped into the ../../../datasets/MIND directory; alternatively, run run.sh under ../../../datasets/MIND to generate the training and test data.
+
+How to run:
 ```
-python3 -u ../../../tools/trainer.py -m config.yaml
+python3 -u ../../../tools/trainer.py -m config_bigdata.yaml
 ```
 Results after training for 2 epochs:
 | Model | top1 acc | batch_size | epoch_num | Time of each epoch |
@@ -103,11 +109,8 @@ python3 -u ../../../tools/trainer.py -m config.yaml
 
 Inference:
 ```
-python3 -u ../../../tools/infer.py -m config.yaml
+python3 -u ../../../tools/infer.py -m config_bigdata.yaml
 ```
 
-Inference top-1 acc: 0.427
-2021-01-27 10:58:27,084 - INFO - epoch: 1 done, acc: 0.427140, epoch time: 126.27 s
-
-
-
+The expected output is as follows:
+INFO - epoch: 1 done, acc: 0.427140, epoch time: 126.27 s
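The scoring scheme the new README paragraph describes, an inner product of the user-behavior embedding with each candidate's article embedding followed by the softmax used in infer_forward, can be sketched in isolation. All shapes and tensors below are illustrative assumptions, not the model's actual code:

```python
import paddle
import paddle.nn.functional as F

batch, candidates, dim = 50, 5, 128  # illustrative sizes only

user_emb = paddle.randn([batch, dim])              # user-behavior embedding
cand_emb = paddle.randn([batch, candidates, dim])  # candidate article embeddings

# Preference score = inner product per candidate -> [batch, candidates]
scores = paddle.matmul(cand_emb, user_emb.unsqueeze(-1)).squeeze(-1)
probs = F.softmax(scores, axis=-1)  # as in infer_forward
print(probs.shape)  # [50, 5]
```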

models/rank/naml/config_bigdata.yaml

Lines changed: 5 additions & 5 deletions
@@ -13,20 +13,20 @@
 # limitations under the License.
 
 runner:
-  train_data_dir: "data/bigdata/train"
+  train_data_dir: "../../../datasets/MIND/data/train"
   train_reader_path: "NAMLDataReader" # importlib format
-  use_gpu: False
+  use_gpu: True
   train_batch_size: 50
-  epochs: 30
+  epochs: 2
   print_interval: 2
   #model_init_path: "output_model/0" # init model
   model_save_path: "output_model"
   infer_batch_size: 10
   infer_reader_path: "NAMLDataReader" # importlib format
-  test_data_dir: "data/bigdata/test"
+  test_data_dir: "../../../datasets/MIND/data/test"
   infer_load_path: "output_model"
   infer_start_epoch: 0
-  infer_end_epoch: 3
+  infer_end_epoch: 1
 
 # hyper parameters of user-defined network
 hyper_parameters:
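Both data directories are now given relative to models/rank/naml, so training and inference must be launched from that directory. A quick sanity check, a sketch assuming the MIND data was unpacked by datasets/MIND/run.sh:

```python
import os

# Run from models/rank/naml; both lines should print True after
# datasets/MIND/run.sh has been executed.
for path in ("../../../datasets/MIND/data/train",
             "../../../datasets/MIND/data/test"):
    print(path, os.path.isdir(path))
```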

models/rank/naml/dygraph_model.py

Lines changed: 17 additions & 7 deletions
@@ -23,17 +23,21 @@
 class DygraphModel():
     # define model
     def create_model(self, config):
-        article_content_size = config.get("hyper_parameters.article_content_size")
+        article_content_size = config.get(
+            "hyper_parameters.article_content_size")
         article_title_size = config.get("hyper_parameters.article_title_size")
         browse_size = config.get("hyper_parameters.browse_size")
-        neg_condidate_sample_size = config.get("hyper_parameters.neg_condidate_sample_size")
+        neg_condidate_sample_size = config.get(
+            "hyper_parameters.neg_condidate_sample_size")
         word_dimension = config.get("hyper_parameters.word_dimension")
         category_size = config.get("hyper_parameters.category_size")
         sub_category_size = config.get("hyper_parameters.sub_category_size")
         cate_dimension = config.get("hyper_parameters.category_dimension")
         word_dict_size = config.get("hyper_parameters.word_dict_size")
-        return net.NAMLLayer(article_content_size, article_title_size, browse_size, neg_condidate_sample_size,
-                             word_dimension, category_size, sub_category_size, cate_dimension, word_dict_size)
+        return net.NAMLLayer(article_content_size, article_title_size,
+                             browse_size, neg_condidate_sample_size,
+                             word_dimension, category_size, sub_category_size,
+                             cate_dimension, word_dict_size)
 
     # define feeds which convert numpy of batch data to paddle.tensor
     def create_feeds(self, batch, config):
@@ -42,7 +46,10 @@ def create_feeds(self, batch, config):
 
     # define loss function by predicts and label
     def create_loss(self, raw_pred, label):
-        cost = paddle.nn.functional.cross_entropy(input=raw_pred, label=paddle.cast(label, "float32"), soft_label=True)
+        cost = paddle.nn.functional.cross_entropy(
+            input=raw_pred,
+            label=paddle.cast(label, "float32"),
+            soft_label=True)
         avg_cost = paddle.mean(x=cost)
         return avg_cost
 
@@ -63,11 +70,13 @@ def create_metrics(self):
 
     # construct train forward phase
    def train_forward(self, dy_model, metrics_list, batch_data, config):
-        labels, sparse_tensor, dense_tensor = self.create_feeds(batch_data,config)
+        labels, sparse_tensor, dense_tensor = self.create_feeds(batch_data,
+                                                                config)
 
         raw = dy_model(sparse_tensor, None)
 
-        loss = paddle.nn.functional.cross_entropy(input=raw, label=paddle.cast(labels, "float32"), soft_label=True)
+        loss = paddle.nn.functional.cross_entropy(
+            input=raw, label=paddle.cast(labels, "float32"), soft_label=True)
         correct = metrics_list[0].compute(raw, labels)
         metrics_list[0].update(correct)
         loss = paddle.mean(loss)
@@ -81,4 +90,5 @@ def infer_forward(self, dy_model, metrics_list, batch_data, config):
         raw = paddle.nn.functional.softmax(raw)
         correct = metrics_list[0].compute(raw, label)
         metrics_list[0].update(correct)
+
         return metrics_list, None
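Because the label vector built by the reader is a shuffled one-hot over the candidate set, the soft_label=True cross entropy in create_loss reduces to the negative log-probability assigned to the clicked candidate. A standalone sketch with made-up logits:

```python
import paddle
import paddle.nn.functional as F

# One sample, 5 candidates; the one-hot label marks the clicked article.
logits = paddle.to_tensor([[0.2, 1.5, -0.3, 0.1, 0.4]])
label = paddle.to_tensor([[0, 1, 0, 0, 0]])

# soft_label=True expects a float distribution over classes, hence the cast,
# exactly as in create_loss above.
loss = F.cross_entropy(
    input=logits, label=paddle.cast(label, "float32"), soft_label=True)
print(float(paddle.mean(loss)))  # equals -log(softmax(logits)[1])
```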
