Commit 21b8d43

Merge pull request #722 from wangzhen38/bug_fix_39
fix bugs from test3.9
2 parents 23639e0 + 36011ac

12 files changed: +264 additions, -50 deletions
datasets/Netflix_deeprec/netflix_data_convert.py

Lines changed: 204 additions & 0 deletions
@@ -0,0 +1,204 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from os import listdir, path, makedirs
import random
import sys
import time
import datetime


def print_stats(data):
    total_ratings = 0
    print("STATS")
    for user in data:
        total_ratings += len(data[user])
    print("Total Ratings: {}".format(total_ratings))
    print("Total User count: {}".format(len(data.keys())))


def save_data_to_file(data, filename):
    with open(filename, 'w') as out:
        for userId in data:
            for record in data[userId]:
                out.write("{}\t{}\t{}\n".format(userId, record[0], record[1]))


def create_NETFLIX_data_timesplit(all_data, train_min, train_max, test_min,
                                  test_max):
    """
    Creates a time-based split of the NETFLIX data into train and
    (validation, test) sets.
    :param all_data: dict mapping userId to a list of (itemId, rating, ts)
    :param train_min: first day (YYYY-MM-DD) of the training window
    :param train_max: last day of the training window
    :param test_min: first day of the held-out window
    :param test_max: last day of the held-out window
    :return: (training_data, validation_data, test_data)
    """
    train_min_ts = time.mktime(
        datetime.datetime.strptime(train_min, "%Y-%m-%d").timetuple())
    train_max_ts = time.mktime(
        datetime.datetime.strptime(train_max, "%Y-%m-%d").timetuple())
    test_min_ts = time.mktime(
        datetime.datetime.strptime(test_min, "%Y-%m-%d").timetuple())
    test_max_ts = time.mktime(
        datetime.datetime.strptime(test_max, "%Y-%m-%d").timetuple())

    training_data = dict()
    validation_data = dict()
    test_data = dict()

    train_set_items = set()

    for userId, userRatings in all_data.items():
        time_sorted_ratings = sorted(
            userRatings, key=lambda x: x[2])  # sort by timestamp
        for rating_item in time_sorted_ratings:
            if train_min_ts <= rating_item[2] <= train_max_ts:
                if userId not in training_data:
                    training_data[userId] = []
                training_data[userId].append(rating_item)
                train_set_items.add(
                    rating_item[0])  # keep track of items from training set
            elif test_min_ts <= rating_item[2] <= test_max_ts:
                if userId not in training_data:
                    # only include users seen in the training set
                    continue
                p = random.random()
                if p <= 0.5:
                    if userId not in validation_data:
                        validation_data[userId] = []
                    validation_data[userId].append(rating_item)
                else:
                    if userId not in test_data:
                        test_data[userId] = []
                    test_data[userId].append(rating_item)

    # remove items not seen in the training set
    for userId, userRatings in test_data.items():
        test_data[userId] = [
            rating for rating in userRatings if rating[0] in train_set_items
        ]
    for userId, userRatings in validation_data.items():
        validation_data[userId] = [
            rating for rating in userRatings if rating[0] in train_set_items
        ]

    return training_data, validation_data, test_data


def main(args):
    user2id_map = dict()
    item2id_map = dict()
    userId = 0
    itemId = 0
    all_data = dict()

    folder = args[1]
    out_folder = args[2]
    # create necessary folders:
    for output_dir in [(out_folder + f)
                       for f in ["/NF_TRAIN", "/NF_VALID", "/NF_TEST"]]:
        makedirs(output_dir, exist_ok=True)

    text_files = [
        path.join(folder, f) for f in listdir(folder)
        if path.isfile(path.join(folder, f)) and ('.txt' in f)
    ]

    for text_file in text_files:
        with open(text_file, 'r') as f:
            print("Processing: {}".format(text_file))
            lines = f.readlines()
            item = int(lines[0][:-2])  # remove newline and :
            if item not in item2id_map:
                item2id_map[item] = itemId
                itemId += 1

            for rating in lines[1:]:
                parts = rating.strip().split(",")
                user = int(parts[0])
                if user not in user2id_map:
                    user2id_map[user] = userId
                    userId += 1
                rating = float(parts[1])
                ts = int(
                    time.mktime(
                        datetime.datetime.strptime(parts[2], "%Y-%m-%d")
                        .timetuple()))
                if user2id_map[user] not in all_data:
                    all_data[user2id_map[user]] = []
                all_data[user2id_map[user]].append(
                    (item2id_map[item], rating, ts))

    print("STATS FOR ALL INPUT DATA")
    print_stats(all_data)

    # Netflix full
    (nf_train, nf_valid, nf_test) = create_NETFLIX_data_timesplit(
        all_data, "1999-12-01", "2005-11-30", "2005-12-01", "2005-12-31")
    print("Netflix full train")
    print_stats(nf_train)
    save_data_to_file(nf_train, out_folder + "/NF_TRAIN/nf.train.txt")
    print("Netflix full valid")
    print_stats(nf_valid)
    save_data_to_file(nf_valid, out_folder + "/NF_VALID/nf.valid.txt")
    print("Netflix full test")
    print_stats(nf_test)
    save_data_to_file(nf_test, out_folder + "/NF_TEST/nf.test.txt")
    '''
    (n3m_train, n3m_valid, n3m_test) = create_NETFLIX_data_timesplit(
        all_data, "2005-09-01", "2005-11-30", "2005-12-01", "2005-12-31")

    print("Netflix 3m train")
    print_stats(n3m_train)
    save_data_to_file(n3m_train, out_folder + "/N3M_TRAIN/n3m.train.txt")
    print("Netflix 3m valid")
    print_stats(n3m_valid)
    save_data_to_file(n3m_valid, out_folder + "/N3M_VALID/n3m.valid.txt")
    print("Netflix 3m test")
    print_stats(n3m_test)
    save_data_to_file(n3m_test, out_folder + "/N3M_TEST/n3m.test.txt")

    (n6m_train, n6m_valid, n6m_test) = create_NETFLIX_data_timesplit(
        all_data, "2005-06-01", "2005-11-30", "2005-12-01", "2005-12-31")
    print("Netflix 6m train")
    print_stats(n6m_train)
    save_data_to_file(n6m_train, out_folder + "/N6M_TRAIN/n6m.train.txt")
    print("Netflix 6m valid")
    print_stats(n6m_valid)
    save_data_to_file(n6m_valid, out_folder + "/N6M_VALID/n6m.valid.txt")
    print("Netflix 6m test")
    print_stats(n6m_test)
    save_data_to_file(n6m_test, out_folder + "/N6M_TEST/n6m.test.txt")

    # Netflix 1 year
    (n1y_train, n1y_valid, n1y_test) = create_NETFLIX_data_timesplit(
        all_data, "2004-06-01", "2005-05-31", "2005-06-01", "2005-06-30")
    print("Netflix 1y train")
    print_stats(n1y_train)
    save_data_to_file(n1y_train, out_folder + "/N1Y_TRAIN/n1y.train.txt")
    print("Netflix 1y valid")
    print_stats(n1y_valid)
    save_data_to_file(n1y_valid, out_folder + "/N1Y_VALID/n1y.valid.txt")
    print("Netflix 1y test")
    print_stats(n1y_test)
    save_data_to_file(n1y_test, out_folder + "/N1Y_TEST/n1y.test.txt")
    '''


if __name__ == "__main__":
    main(sys.argv)
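
For intuition about the split this converter produces (an illustration, not part of the commit): ratings inside the train window go to the training set; a training user's ratings in the test window are split roughly 50/50 between validation and test; users with no training ratings are skipped, and held-out ratings for items never seen in training are filtered out. A minimal sketch, assuming the new file is importable as netflix_data_convert:

```python
import datetime
import random
import time

from netflix_data_convert import create_NETFLIX_data_timesplit


def ts(day):
    # date string -> unix timestamp, mirroring the converter's own parsing
    return time.mktime(datetime.datetime.strptime(day, "%Y-%m-%d").timetuple())


random.seed(0)
all_data = {
    0: [(10, 5.0, ts("2005-01-15")),   # inside the train window
        (11, 3.0, ts("2005-12-10"))],  # in the test window, item unseen in train
    1: [(12, 4.0, ts("2005-12-05"))],  # user has no train ratings -> dropped
}
train, valid, test = create_NETFLIX_data_timesplit(
    all_data, "1999-12-01", "2005-11-30", "2005-12-01", "2005-12-31")

assert 0 in train and 1 not in valid and 1 not in test
# item 11 never appeared in training, so user 0's held-out rating is filtered
assert valid.get(0, []) == [] and test.get(0, []) == []
```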

datasets/Netflix_deeprec/run.sh

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
wget https://paddlerec.bj.bcebos.com/datasets/Netflix-DeepRec/nf_prize_dataset.tar.gz
tar -xvf nf_prize_dataset.tar.gz
tar -xf download/training_set.tar
python netflix_data_convert.py training_set Netflix
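
Each split the converter writes is a plain TSV of userId, itemId, rating triples (see save_data_to_file above), which the netflix_reader then consumes. A quick way to sanity-check the generated files; the path below is illustrative and depends on where run.sh was executed:

```python
# Read back the converter's TSV output: userId \t itemId \t rating per line.
ratings = []
with open("Netflix/NF_TRAIN/nf.train.txt") as f:  # hypothetical location
    for line in f:
        u, i, r = line.rstrip("\n").split("\t")
        ratings.append((int(u), int(i), float(r)))
print("loaded {} ratings".format(len(ratings)))
```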

datasets/ml-1m_ensfm/run.sh

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 # limitations under the License.
 #!/bin/bash

-
+mkdir data
 wget https://paddlerec.bj.bcebos.com/recall/ensfm/ml-1m-ensfm.tar.gz
 tar zxvf ml-1m-ensfm.tar.gz -C data
 rm ml-1m-ensfm.tar.gz

models/rank/autofis/readme.md

Lines changed: 11 additions & 9 deletions
@@ -12,7 +12,7 @@
 ├── README.md # documentation
 ├── config.yaml # sample data config
 ├── config_bigdata.yaml # full data config
-├── net.py # core model network (dynamic/static unified)
+├── net.py # core model network
 ├── criteo_reader.py # data reader
 ├── dygraph_model.py # dynamic-graph model
 ├── trainer.py # training script
@@ -33,11 +33,11 @@
 - [FAQ](#FAQ)

 ## Model Introduction
-Automatic Feature Interaction Selection in Factorization Models (automatic feature-interaction selection mode for factorization models in CTR prediction) is a new CTR prediction method proposed by Huawei at KDD 2020. The paper points out that many CTR algorithms rely on feature interactions, but traditional approaches either enumerate interactions by brute force or select them by hand. Manual selection depends on prior knowledge, and brute-force enumeration is not in every case beneficial to model performance: some interactions help little or even hurt the model, and the mass of useless interactions inflates the parameter count and lowers memory efficiency. Building on AutoML techniques, the paper proposes AutoFIS which, as the name suggests, automatically searches for the best feature interactions.
+Automatic Feature Interaction Selection in Factorization Models (automatic feature-interaction selection model for factorization models in CTR prediction) is a new CTR prediction method proposed by Huawei at KDD 2020. The paper points out that many CTR algorithms rely on feature interactions, but traditional approaches either enumerate interactions by brute force or select them by hand. Manual selection depends on prior knowledge, and brute-force enumeration is not always beneficial to model performance: some interactions help little or even hurt the model, and the mass of useless interactions inflates the parameter count and lowers memory efficiency. Building on AutoML techniques, the paper proposes AutoFIS which, as the name suggests, automatically searches for the best feature interactions.

 ## Data Preparation

-The data is [Criteo](http://labs.criteo.com/downloads/download-terabyte-click-log); days 6-12 were selected as the training set and the data of the low-13th day as the test set. After sampling, the positive/negative ratio is about 1:1.
+The data is [Criteo](http://labs.criteo.com/downloads/download-terabyte-click-log); days 6-12 were selected as the training set and the data of day 13 as the test set. After sampling, the positive/negative ratio is about 1:1.
 Sample data for a quick run is provided under the model's data directory; to use the full dataset, see the [Reproducing Results](#效果复现) section below.

 ## Runtime Environment
@@ -48,14 +48,16 @@ python 2.7/3.5/3.6/3.7
 os : windows/linux/macos

 ## Quick Start
-Sample data is provided for a quick trial, and the commands can be run from any directory. The quick-start commands in the deepfm model directory are as follows:
+Sample data is provided for a quick trial, and the commands can be run from any directory. The quick-start commands in the autofis model directory are as follows:
 ```bash
 # enter the model directory
-# cd models/rank/deepfm # can be run from any directory
+# cd models/rank/autofis # can be run from any directory
 # dynamic-graph training
-python -u ../../../tools/trainer.py -m config.yaml # for the full dataset run config_bigdata.yaml
+python trainer.py -m config.yaml # stage 0: automatically search for the best feature interactions; for the full dataset run config_bigdata.yaml
+python trainer.py -m config.yaml -o stage=1 # stage 1: train the final model; for the full dataset run config_bigdata.yaml
+
 # dynamic-graph inference
-python -u ../../../tools/infer.py -m config.yaml
+python -u ../../../tools/infer.py -m config.yaml -o stage=1 # for the full dataset run config_bigdata.yaml
 ```
 ## Reproducing Results
 To make it easy to run every model quickly, sample data is provided under each model. To reproduce the results in this readme, follow the steps below.
@@ -65,7 +67,7 @@ python -u ../../../tools/infer.py -m config.yaml
 | AutodeepFM | 0.8009 | 0.5403 | 2000 | 1 | about 3 hours |

 1. Make sure your current directory is PaddleRec/models/rank/autofis
-2. Enter paddlerec/datasets/criteo_autofis
+2. Enter Paddlerec/datasets/criteo_autofis
 3. Run the commands to process the full dataset

 ``` bash
@@ -77,7 +79,7 @@ cd - # switch back to the model directory
 # dynamic-graph training
 python trainer.py -m config_bigdata.yaml # stage 0: automatically search for the best feature interactions
 python trainer.py -m config_bigdata.yaml -o stage=1 # stage 1: train the final model
-python -u ../../../tools/infer.py -m config_bigdata.yaml -o stage=1 # for the full dataset run config_bigdata.yaml
+python -u ../../../tools/infer.py -m config_bigdata.yaml -o stage=1 # inference
 ```
 ## Advanced Usage

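Background on the two stages referenced in the readme above: as described in the AutoFIS paper, the search stage attaches a learnable gate α to every pairwise feature interaction and learns which interactions matter, and the re-train stage keeps only the surviving interactions. A framework-free sketch of that gating idea; names, shapes, and the top-k pruning rule are illustrative, not PaddleRec's implementation:

```python
import numpy as np

rng = np.random.default_rng(0)
num_fields, emb_dim = 4, 8
emb = rng.normal(size=(num_fields, emb_dim))  # one embedding per feature field
pairs = [(i, j) for i in range(num_fields) for j in range(i + 1, num_fields)]
alpha = rng.normal(size=len(pairs))           # one gate per interaction (learned in stage 0)

# Stage 0: every FM-style pairwise interaction contributes, scaled by its gate.
logit = sum(a * (emb[i] @ emb[j]) for a, (i, j) in zip(alpha, pairs))

# Stage 1: keep only interactions whose gates survived (top-k by |alpha| stands
# in for the paper's sparsity-driven selection), then retrain the final model.
keep = np.argsort(-np.abs(alpha))[:3]
pruned_logit = sum(alpha[k] * (emb[pairs[k][0]] @ emb[pairs[k][1]]) for k in keep)
```
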
models/rank/bert4rec/config_bigdata.yaml

Lines changed: 3 additions & 3 deletions
@@ -20,16 +20,16 @@ runner:
   use_gpu: True
   train_batch_size: 1
   data_batch_size: 256
-  epochs: 10
+  epochs: 50
   print_interval: 100

   model_save_path: "output_model_bert4rec_all"
   test_data_dir: "../../../datasets/Beauty/data/test"
   infer_reader_path: "data_reader" # importlib format
   infer_batch_size: 1
   infer_load_path: "output_model_bert4rec_all"
-  infer_start_epoch: 9
-  infer_end_epoch: 10
+  infer_start_epoch: 49
+  infer_end_epoch: 50


 # hyper parameters of user-defined network
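
Note the paired change: the inference window appears to track the final training epoch, so only the last checkpoint is evaluated. A one-line way to keep the values consistent if epochs changes again (an inferred convention, not a PaddleRec API):

```python
epochs = 50
infer_start_epoch, infer_end_epoch = epochs - 1, epochs  # evaluate last checkpoint only
assert (infer_start_epoch, infer_end_epoch) == (49, 50)
```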

models/rank/deeprec/README.md

Lines changed: 8 additions & 6 deletions
@@ -8,13 +8,15 @@
 ├── small_train.txt
 ├── test # test data
 ├── small_test.txt
-├── __init__.py
 ├── README.md # documentation
 ├── config.yaml # sample data config
 ├── config_bigdata.yaml # full data config
-├── deeprec_reader.py # data reader
-├── net.py # core model network (dynamic/static unified)
 ├── dygraph_model.py # dynamic-graph model
+├── deeprec_reader.py # data reader
+├── infer.py # inference entry point
+├── net.py # core model network
+├── netflix_reader.py # data reader
+├── trainer.py # training entry point
 ```

 Note: before reading this example, you are advised to first review the following:
@@ -70,7 +72,7 @@ DeepRec is an autoencoder consisting of an encoder and a decoder; in the encoder and decoder
 [DeepRec](../../../doc/imgs/AutoEncoder.png)


-### Reproducing Results
+## Reproducing Results
 To make it easy to run every model quickly, sample data is provided under each model. To reproduce the results in this readme, follow the steps below.
 On the full dataset the model's metrics are as follows:

@@ -79,10 +81,10 @@ DeepRec is an autoencoder consisting of an encoder and a decoder; in the encoder and decoder
 | DeepRec | 0.9172 | [n(17768), 512, 512, 1024] | 128 | 45 | about 55 s |

 1. Make sure your current directory is PaddleRec/models/rank/deeprec
-2. Enter the paddlerec/datasets/Netflix directory and run the script; it downloads the Netflix Prize dataset from a domestic mirror and extracts it to the target folder.
+2. Enter the Paddlerec/datasets/Netflix_deeprec directory and run the script

 ``` bash
-cd data
+cd ../../../datasets/Netflix_deeprec
 sh run.sh
 ```
 3. Switch back to the model directory and run the commands on the full dataset

models/rank/deeprec/config.yaml

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ runner:

 hyper_parameters:
   dp_drop_prob: 0.0
-  layer_sizes: [1000, 16, 8]
+  layer_sizes: [50, 16, 8]
   optimizer:
     class: SGD
     learning_rate: 0.05
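
The sample config's layer_sizes shrinks to [50, 16, 8] to match the small sample data, while the full-data run keeps the README's [n(17768), 512, 512, 1024] shape. A numpy sketch of how an autoencoder of this kind consumes layer_sizes, assuming the first entry is the input (item-vector) width and the decoder mirrors the encoder with tied weights; the activation and weight tying are assumptions, and PaddleRec's net.py may differ:

```python
import numpy as np

layer_sizes = [50, 16, 8]  # input width, then encoder widths
rng = np.random.default_rng(0)
weights = [rng.normal(0.0, 0.1, size=(m, n))
           for m, n in zip(layer_sizes, layer_sizes[1:])]


def forward(x):
    h = x
    for w in weights:            # encoder: 50 -> 16 -> 8
        h = np.tanh(h @ w)
    for w in reversed(weights):  # decoder mirrors it: 8 -> 16 -> 50
        h = np.tanh(h @ w.T)
    return h                     # reconstructed rating vector


x = rng.normal(size=(1, layer_sizes[0]))
print(forward(x).shape)  # (1, 50)
```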

models/rank/deeprec/config_bigdata.yaml

Lines changed: 5 additions & 5 deletions
@@ -14,21 +14,21 @@


 runner:
-  train_data_dir: "data/Netflix/NF_TRAIN"
+  train_data_dir: "../../../datasets/Netflix_deeprec/Netflix/NF_TRAIN"
   train_reader_path: "netflix_reader" # importlib format
   reader_batch_size: 128
   use_gpu: True
   mode: "train" # control the return of the create_feeds function
   epochs: 60
   print_interval: 1000
-  model_save_path: "output_model_deeprec"
+  model_save_path: "output_model_deeprec_all"

-  test_data_dir: "data/Netflix/NF_TEST"
+  test_data_dir: "../../../datasets/Netflix_deeprec/Netflix/NF_TEST"
   infer_reader_path: "netflix_reader" # importlib format
-  infer_load_path: "output_model_deeprec"
+  infer_load_path: "output_model_deeprec_all"
   infer_start_epoch: 30
-  infer_batch_size: 128
   infer_end_epoch: 39
+  infer_batch_size: 128

 hyper_parameters:
   dp_drop_prob: 0.8

models/recall/tisas/config.yaml

Lines changed: 3 additions & 3 deletions
@@ -16,16 +16,16 @@ runner:
   train_data_dir: "data/sample_data"
   train_reader_path: "movielens_reader" # importlib format
   train_batch_size: 2
-  model_save_path: "output_model_tisas_demo"
+  model_save_path: "output_model_tisas"
   mode: "train"
-  use_gpu: True
+  use_gpu: False
   epochs: 3
   print_interval: 10

   test_data_dir: "data/sample_data"
   infer_reader_path: "movielens_reader" # importlib format
   infer_batch_size: 1
-  infer_load_path: "output_model_tisas_demo"
+  infer_load_path: "output_model_tisas"
   infer_start_epoch: 2
   infer_end_epoch: 3
