Skip to content

Commit bed6d9e

Browse files
committed
Merge remote-tracking branch 'origin/master'
2 parents ec56cb7 + dfa375b commit bed6d9e

File tree

67 files changed

+6393
-203
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+6393
-203
lines changed

.gitattributes

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ data/test/inference/fg_export_multi/variables/variables.index filter=lfs diff=lf
55
data/test/inference/tb_multitower_export/assets/pipeline.config filter=lfs diff=lfs merge=lfs -text
66
data/test/latest_ckpt_test/model.ckpt-500.meta filter=lfs diff=lfs merge=lfs -text
77
data/test/tb_data/taobao_test_data filter=lfs diff=lfs merge=lfs -text
8+
data/test/tb_data/taobao_multi_seq_test_data filter=lfs diff=lfs merge=lfs -text
89
data/test/test.csv filter=lfs diff=lfs merge=lfs -text
910
data/test/inference/tb_multitower_placeholder_rename_export/assets/pipeline.config filter=lfs diff=lfs merge=lfs -text
1011
data/test/inference/tb_multitower_export/variables/variables.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
@@ -15,6 +16,7 @@ data/test/criteo_sample.tfrecord filter=lfs diff=lfs merge=lfs -text
1516
data/test/rtp/taobao_valid.csv filter=lfs diff=lfs merge=lfs -text
1617
data/test/rtp/taobao_train_feature.txt filter=lfs diff=lfs merge=lfs -text
1718
data/test/tb_data/taobao_train_data filter=lfs diff=lfs merge=lfs -text
19+
data/test/tb_data/taobao_multi_seq_train_data filter=lfs diff=lfs merge=lfs -text
1820
data/test/inference/fg_export_single/variables/variables.index filter=lfs diff=lfs merge=lfs -text
1921
data/test/inference/lookup_data_test80.csv filter=lfs diff=lfs merge=lfs -text
2022
data/test/inference/tb_multitower_export/variables/variables.index filter=lfs diff=lfs merge=lfs -text

.github/workflows/ci.yml

Lines changed: 36 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@ jobs:
2121
TEST_DEVICES: ""
2222
run: |
2323
source activate /home/admin/tf12_py2/
24-
if [ ! -e "/tmp/easyrec_data_20210818.tar.gz" ]
24+
if [ ! -e "/tmp/easyrec_data_20220113.tar.gz" ]
2525
then
26-
wget https://easyrec.oss-cn-beijing.aliyuncs.com/data/easyrec_data_20210818.tar.gz -O /tmp/easyrec_data_20210818.tar.gz
26+
wget https://easyrec.oss-cn-beijing.aliyuncs.com/data/easyrec_data_20220113.tar.gz -O /tmp/easyrec_data_20220113.tar.gz
2727
fi
28-
tar -zvxf /tmp/easyrec_data_20210818.tar.gz
28+
tar -zvxf /tmp/easyrec_data_20220113.tar.gz
2929
source scripts/ci_test.sh
3030
- name: LabelAndComment
3131
env:
@@ -53,51 +53,52 @@ jobs:
5353
if (labels != null) {
5454
pass_label = labels.find(label=>label.name=='ci_test_passed');
5555
}
56+
5657
var fail_label = null;
5758
if (labels != null) {
5859
fail_label = labels.find(label=>label.name=='ci_test_failed');
5960
}
6061
62+
if (pass_label) {
63+
github.rest.issues.removeLabel({
64+
issue_number: context.issue.number,
65+
owner: context.repo.owner,
66+
repo: context.repo.repo,
67+
name: 'ci_test_passed'
68+
})
69+
}
70+
71+
if (fail_label) {
72+
github.rest.issues.removeLabel({
73+
issue_number: context.issue.number,
74+
owner: context.repo.owner,
75+
repo: context.repo.repo,
76+
name: 'ci_test_failed'
77+
})
78+
}
79+
6180
if (CI_TEST_PASSED == 1) {
62-
if (! pass_label) {
63-
github.rest.issues.addLabels({
64-
issue_number: context.issue.number,
65-
owner: context.repo.owner,
66-
repo: context.repo.repo,
67-
labels: ['ci_test_passed']
68-
})
69-
}
70-
if (fail_label) {
71-
github.rest.issues.removeLabel({
72-
issue_number: context.issue.number,
73-
owner: context.repo.owner,
74-
repo: context.repo.repo,
75-
name: 'ci_test_failed'
76-
})
77-
}
81+
github.rest.issues.addLabels({
82+
issue_number: context.issue.number,
83+
owner: context.repo.owner,
84+
repo: context.repo.repo,
85+
labels: ['ci_test_passed']
86+
})
87+
7888
github.rest.issues.createComment({
7989
owner: context.repo.owner,
8090
repo: context.repo.repo,
8191
issue_number: context.issue.number,
8292
body: "CI Test Passed"
8393
})
8494
} else {
85-
if (!fail_label) {
86-
github.rest.issues.addLabels({
87-
issue_number: context.issue.number,
88-
owner: context.repo.owner,
89-
repo: context.repo.repo,
90-
labels: ['ci_test_failed']
91-
})
92-
}
93-
if (pass_label) {
94-
github.rest.issues.removeLabel({
95-
issue_number: context.issue.number,
96-
owner: context.repo.owner,
97-
repo: context.repo.repo,
98-
name: 'ci_test_passed'
99-
})
100-
}
95+
github.rest.issues.addLabels({
96+
issue_number: context.issue.number,
97+
owner: context.repo.owner,
98+
repo: context.repo.repo,
99+
labels: ['ci_test_failed']
100+
})
101+
101102
github.rest.issues.createComment({
102103
owner: context.repo.owner,
103104
repo: context.repo.repo,

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,12 @@ log
2222
# pai job
2323
pai_jobs/easy_rec
2424
pai_jobs/easy_rec.tar.gz
25+
pai_jobs/easy_rec*.tar.gz
2526

2627

2728
# idea files
2829
.idea
30+
31+
# unit test
32+
/data
33+
/UNIT_TEST_CASE_LIST

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
### EasyRec is an easy to use framework for Recommendation
1010

11-
EasyRec implements state of the art deep learning models used in common recommedation tasks: candidate generation(matching), scoring(ranking), and multi-task learning. It improves the efficiency of generating high performance models by simple configuration and hyper parameter tuning(HPO).
11+
EasyRec implements state of the art deep learning models used in common recommendation tasks: candidate generation(matching), scoring(ranking), and multi-task learning. It improves the efficiency of generating high performance models by simple configuration and hyper parameter tuning(HPO).
1212

1313
 
1414

docs/source/benchmark.md

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# benchmark介绍
2+
3+
为了验证算法的准确性、帮助用户更好的使用EasyRec,我们做了大量的benchmark测试。我们还提供公开数据集、EasyRec配置文件,供用户更好的理解和使用EasyRec。
4+
5+
# 单目标数据集
6+
7+
## Taobao 数据集介绍
8+
9+
- 该数据集是淘宝展示广告点击率预估数据集,包含用户、广告特征和行为日志。[天池比赛链接](https://tianchi.aliyun.com/dataset/dataDetail?dataId=56)
10+
- 训练数据表:pai_online_project.easyrec_demo_taobao_train_data
11+
- 测试数据表:pai_online_project.easyrec_demo_taobao_test_data
12+
13+
## Avazu CTR 数据集
14+
15+
- 该数据集是DSP广告公司Avazu在Kaggle平台举办的移动广告点击率预测模型挑战赛中使用的。[Click-Through Rate Prediction比赛链接](https://www.kaggle.com/c/avazu-ctr-prediction)
16+
- 训练数据表:pai_online_project.dwd_avazu_ctr_deepmodel_train
17+
- 测试数据表:pai_online_project.dwd_avazu_ctr_deepmodel_test
18+
19+
# 多目标数据集
20+
21+
## AliCCP 数据集
22+
23+
- 数据集采集自手机淘宝移动客户端的推荐系统日志,其中包含点击和与之关联的转化数据。[天池比赛链接](https://tianchi.aliyun.com/dataset/dataDetail?dataId=408)
24+
- 训练数据表:pai_rec_dev.AliCCP_sample_train_data_processed
25+
- 测试数据表:pai_rec_dev.AliCCP_sample_test_data_processed
26+
27+
## CENSUS
28+
29+
- CENSUS有48842个样本数据,每个样本14个属性,包括age, occupation, education, income等。样本的标注值为收入水平,例如>50K、\<=50K。[Census Income数据集链接](https://archive.ics.uci.edu/ml/datasets/census+income)
30+
- 训练数据表:pai_rec_dev.census_income_train
31+
- 测试数据表:pai_rec_dev.census_income_test
32+
33+
# 单目标模型在taobao数据集上的测试结果
34+
35+
- 在PAI上面测试使用的资源包括2个parameter server,9个worker,其中一个worker做评估:
36+
```json
37+
{"ps":{"count":2,
38+
"cpu":1000,
39+
"memory":40000},
40+
"worker":{"count":9,
41+
"cpu":1000,
42+
"memory":40000}
43+
}
44+
```
45+
46+
## 单目标测试结果
47+
48+
| model | global_step | best_auc | config |
49+
| ---------- | ----------- | -------- | ------------------------------------------------------------------------------------------------------------- |
50+
| MultiTower | 1800 | 0.614680 | [taobao_mutiltower.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_mutiltower.config) |
51+
| DIN | 1600 | 0.617049 | [din.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_din.config) |
52+
| DeepFM | 1600 | 0.580521 | [deepfm.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_deepfm.config) |
53+
| DCN | 1500 | 0.596816 | [dcn.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_dcn.config) |
54+
| BST | 3500 | 0.566251 | [bst.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_bst.config) |
55+
| AutoInt | 700 | 0.605982 | [autoint.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/ctr/taobao_autoint.config) |
56+
57+
# 多目标模型在Ali-CCP数据集上的测试结果
58+
59+
- 在PAI上面测试使用的资源包括2个parameter server,9个worker,其中一个worker做评估:
60+
```json
61+
{"ps":{"count":2,
62+
"cpu":1000,
63+
"memory":40000},
64+
"worker":{"count":9,
65+
"cpu":1000,
66+
"memory":40000}
67+
}
68+
```
69+
70+
## 多目标测试结果
71+
72+
| model | global_step | ctr auc | masked cvr auc | ctcvr auc | 训练时间 | config |
73+
| --------------- | ----------- | --------- | -------------- | --------- | ---- | -------------------------------------------------------------------------------------------------------------------- |
74+
| SimpleMultiTask | 4100 | 0.592606 | | 0.6306802 | 1小时 | [simple_multi_task.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/multi_task/simple_multi_task.config) |
75+
| MMoE | 3100 | 0.5869702 | | 0.6330008 | 1小时 | [mmoe.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/multi_task/mmoe.config) |
76+
| ESMM | 800 | 0.5974812 | 0.6841141 | 0.6362526 | 3小时 | [esmm.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/multi_task/esmm.config) |
77+
| PLE | 3200 | 0.5874 | | 0.6159 | 2小时 | [ple.config](http://easyrec.oss-cn-beijing.aliyuncs.com/benchmark/multi_task/ple.config) |

docs/source/develop.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@ TEMPDIR=/tmp python -m easy_rec.python.test.odps_run --oss_config ~/.ossutilconf
5555
下载测试数据
5656

5757
```bash
58-
wget https://easyrec.oss-cn-beijing.aliyuncs.com/data/easyrec_data_20210818.tar.gz
59-
tar -xvzf easyrec_data_20210818.tar.gz
58+
wget https://easyrec.oss-cn-beijing.aliyuncs.com/data/easyrec_data_20220113.tar.gz
59+
tar -xvzf easyrec_data_20220113.tar.gz
6060
```
6161

6262
如果您要添加新数据,请在“git commit”之前执行以下操作,以将其提交到 git-lfs:

docs/source/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ Welcome to easy_rec's documentation!
8888
faq
8989
tf_on_yarn
9090
get_role_arn
91-
91+
benchmark
9292

9393

9494
Indices and tables

docs/source/models/dlrm.md

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# DLRM
2+
3+
### 简介
4+
5+
DLRM(Deep Learning Recommendation Model for Personalization and Recommendation Systems\[Facebook\])是一种DNN模型, 支持使用连续值特征(price/age/...)和ID类特征(user_id/item_id/...), 并对特征之间的交互(interaction)进行了建模(基于内积的方式).
6+
7+
```
8+
output:
9+
probability of a click
10+
model: |
11+
_________________>DNN(top)<___________
12+
/ | \
13+
/_________________>INTERACTION <_________\
14+
// \\
15+
DNN(bot) ____________\\_________
16+
| | |
17+
| _____|_______ _____|______
18+
| |_Emb_|____|__| ... |_Emb_|__|___|
19+
input:
20+
[ dense features ] [sparse indices] , ..., [sparse indices]
21+
```
22+
23+
### 配置说明
24+
25+
```protobuf
26+
model_config {
27+
model_class: 'DLRM'
28+
29+
feature_groups {
30+
group_name: 'dense'
31+
feature_names: 'age_level'
32+
feature_names: 'pvalue_level'
33+
feature_names: 'shopping_level'
34+
feature_names: 'new_user_class_level'
35+
feature_names: 'price'
36+
37+
wide_deep: DEEP
38+
}
39+
40+
feature_groups {
41+
group_name: 'sparse'
42+
feature_names: 'user_id'
43+
feature_names: 'cms_segid'
44+
feature_names: 'cms_group_id'
45+
feature_names: 'occupation'
46+
feature_names: 'adgroup_id'
47+
feature_names: 'cate_id'
48+
feature_names: 'campaign_id'
49+
feature_names: 'customer'
50+
feature_names: 'brand'
51+
feature_names: 'pid'
52+
feature_names: 'tag_category_list'
53+
feature_names: 'tag_brand_list'
54+
55+
wide_deep: DEEP
56+
}
57+
58+
dlrm {
59+
bot_dnn {
60+
hidden_units: [64, 32, 16]
61+
}
62+
63+
top_dnn {
64+
hidden_units: [128, 64]
65+
}
66+
67+
l2_regularization: 1e-5
68+
}
69+
70+
embedding_regularization: 1e-5
71+
}
72+
```
73+
74+
- model_class: 'DLRM', 不需要修改
75+
76+
- feature_groups: 特征组
77+
78+
- 包含两个feature_group: dense 和sparse group, **group name不能变**
79+
80+
- wide_deep: dlrm模型使用的都是Deep features, 所以都设置成DEEP
81+
82+
- dlrm: dlrm模型相关的参数
83+
84+
- bot_dnn: dense mlp的参数配置
85+
86+
- hidden_units: dnn每一层的channel数目,即神经元的数目
87+
88+
- top_dnn: 输出(logits)之前的mlp, 输入为dense features, sparse features and interact features.
89+
90+
- hidden_units: dnn每一层的channel数目,即神经元的数目
91+
92+
- arch_interaction_op: cat or dot
93+
94+
- cat: 将dense_features和sparse features concat起来, 然后输入top_dnn
95+
- dot: 将dense_features和sparse features做内积interaction, 并将interaction的结果和sparse features concat起来, 然后输入top_dnn
96+
97+
- arch_interaction_itself:
98+
99+
- 仅当arch_interaction_op = 'dot'时有效, features是否和自身做内积
100+
101+
- arch_with_dense_feature:
102+
103+
- 仅当arch_interaction_op = 'dot'时有效,
104+
- if true, dense features也会和sparse features以及interact features concat起来, 然后进入top_dnn.
105+
- 默认是false, 即仅将sparse features和interact features concat起来,输入top_dnn.
106+
107+
- l2_regularization: 对DNN参数的regularization, 减少overfit
108+
109+
- embedding_regularization: 对embedding部分加regularization, 减少overfit
110+
111+
### 示例Config
112+
113+
[DLRM_demo.config](https://easyrec.oss-cn-beijing.aliyuncs.com/config/dlrm_on_taobao.config)
114+
115+
### 参考论文
116+
117+
[DLRM](https://arxiv.org/abs/1906.00091)

docs/source/models/rank.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
deepfm
99
fm
1010
wide_and_deep
11+
dlrm
1112
dcn
1213
autoint
1314
din

docs/source/quick_start/local_tutorial.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
```bash
66
git clone https://github.com/alibaba/EasyRec.git
77
cd EasyRec
8-
wget https://easyrec.oss-cn-beijing.aliyuncs.com/data/easyrec_data_20210818.tar.gz
8+
wget https://easyrec.oss-cn-beijing.aliyuncs.com/data/easyrec_data_20220113.tar.gz
99
bash scripts/gen_proto.sh # 根据proto文件生成 配置解析.py文件
1010
python setup.py install
1111
```

0 commit comments

Comments
 (0)