Skip to content

Commit 24295b3

Browse files
authored
Merge pull request #542 from esythan/master
Feature importance
2 parents dcdc792 + 4b2dd67 commit 24295b3

File tree

6 files changed

+645
-6
lines changed

6 files changed

+645
-6
lines changed
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# workspace
16+
#workspace: "models/rank/dnn"
17+
18+
19+
runner:
20+
data_dir: "infer_data/offline"
21+
# use_gpu: False
22+
# use_auc: True
23+
batch_size: 10
24+
print_interval: 10
25+
26+
sync_mode: "async"
27+
thread_num: 1
28+
reader_type: "InmemoryDataset" # DataLoader / QueueDataset / RecDataset / InmemoryDataset
29+
pipe_command: "python3 inmemorydataset_reader.py"
30+
31+
init_model_path: "output_model_benchdnn_queue/20190720/6"
32+
33+
dataset_debug: False
34+
parse_ins_id: True
35+
dump_fields_path: "dump_offline_infer"
36+
37+
shots_shuffle_list: [["2"], ["3"]]
38+
39+
40+
# hyper parameters of user-defined network
41+
hyper_parameters:
42+
# optimizer config
43+
optimizer:
44+
class: Adam
45+
learning_rate: 0.001
46+
strategy: async
47+
# user-defined <key, value> pairs
48+
dict_dim : 1000000
49+
emb_dim : 9
50+
sparse_feature_dim: 9
51+
slot_num: 408
52+
layer_sizes: [512, 256, 128, 128, 128]
53+
distributed_embedding: 0
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import sys
15+
import yaml
16+
import six
17+
import os
18+
import copy
19+
import paddle.distributed.fleet as fleet
20+
import logging
21+
import numpy as np
22+
23+
# Configure root logging once for this reader process.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Module-level table mapping each raw integer feasign to the string form of
# the order in which it was first seen ("0", "1", ...). It is filled lazily by
# Reader.line_process and is never cleared, so it grows with the number of
# distinct feasigns read by this process.
fea_dict = {}
28+
29+
30+
class Reader(fleet.MultiSlotStringDataGenerator):
    """Data generator that converts raw text lines into multi-slot samples.

    Each input line is parsed as ``<ins_id><TAB><feasign>:<slot> <feasign>:<slot> ...``
    and turned into a list of ``(slot_name, [values])`` pairs, one pair per
    known slot, preceded by an ``("ins_id", [ins_id])`` pair.
    """

    def init(self, config):
        """Store the config and build the slot lookup tables.

        Args:
            config: Parsed configuration object; it is only stored on the
                instance here.
        """
        self.config = config
        # Value emitted for a slot that received no feasign in a line.
        self.padding = "0"
        # NOTE(review): the slot count is hard-coded; presumably it has to
        # match the slot count of the model being fed -- confirm.
        self.slots = self.slot_reader(slot_num=408)
        # slot name -> position of that slot in the output list.
        self.slot2index = {slot: idx for idx, slot in enumerate(self.slots)}
        # slot name -> whether the line currently being parsed touched it.
        self.visit = dict.fromkeys(self.slots, False)
        logger.info("pipe init success")

    def slot_reader(self, slot_num=0, slot_file='./slot'):
        """Return the list of slot names.

        Args:
            slot_num: When positive, slot names are generated as the strings
                "1" .. str(slot_num + 1) and ``slot_file`` is ignored.
            slot_file: Path of a text file with one slot name per line; only
                read when ``slot_num`` is not positive.

        Returns:
            A list of slot-name strings.
        """
        if slot_num > 0:
            # "0" is never used as a slot name; the original note suggests
            # that "1" is the label slot -- TODO confirm.
            return [str(i) for i in range(1, slot_num + 2)]
        with open(slot_file, "r") as rf:
            return [line.strip() for line in rf]

    def line_process(self, line):
        """Parse one raw line into ``[(slot_name, [values]), ...]``.

        Args:
            line: Raw text of the form
                ``<ins_id><TAB><feasign>:<slot> <feasign>:<slot> ...``.

        Returns:
            ``[("ins_id", [ins_id])]`` followed by one ``(slot, values)`` pair
            for every slot in ``self.slots`` (same order). ``values`` holds the
            remapped feasign indices (as strings) seen for that slot, or a
            single padding value when the slot did not appear in the line.

        Raises:
            ValueError: If the line does not contain exactly one tab, or a
                feasign is not an integer.
        """
        ins_id, body = line.strip().split("\t")
        output = [(slot, []) for slot in self.slots]

        for token in body.strip().split(" "):
            slot_feasign = token.split(":")
            if len(slot_feasign) < 2:
                # A token without ':' has no slot part. Report it through the
                # logger (stdout carries the generated samples) and skip it
                # instead of failing on the missing element.
                logger.warning("skip malformed token %r (ins_id=%s)", token,
                               ins_id)
                continue
            slot = slot_feasign[1]
            # Dict lookup instead of scanning the slot list for every token.
            if slot not in self.slot2index:
                continue
            feasign = int(slot_feasign[0])
            if feasign not in fea_dict:
                # First occurrence: assign the next free index.
                fea_dict[feasign] = str(len(fea_dict))
            output[self.slot2index[slot]][1].append(fea_dict[feasign])
            self.visit[slot] = True

        # Pad every slot this line did not touch, and reset the visit flags so
        # the next line starts from a clean state.
        for slot, visited in self.visit.items():
            if visited:
                self.visit[slot] = False
            else:
                output[self.slot2index[slot]][1].append(self.padding)

        return [("ins_id", [ins_id])] + output

    def generate_sample(self, line):
        """Dataset Generator.

        Args:
            line: One raw input line.

        Returns:
            A zero-argument generator function that yields the parsed sample
            for ``line`` exactly once.
        """

        def reader():
            # NOTE(review): per the original comment, the yielded slot names
            # must match the feeds created by the static model -- confirm.
            yield self.line_process(line)

        return reader
94+
95+
96+
if __name__ == "__main__":
    # Positional arguments: <yaml_path> <utils_path>
    yaml_path, utils_path = sys.argv[1], sys.argv[2]

    # `common` is resolved through utils_path, so it can only be imported
    # after that directory has been appended to sys.path.
    sys.path.append(utils_path)
    import common

    config = common.YamlHelper().load_yaml(yaml_path)

    # Build the reader and stream samples for the lines arriving on stdin.
    data_reader = Reader()
    data_reader.init(config)
    data_reader.run_from_stdin()

models/rank/slot_dnn/net.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def __init__(self,
6363
self._mlp_layers.append(act)
6464

6565
def forward(self, slot_inputs):
66-
66+
self.all_vars = []
6767
embs = []
6868
self.inference_model_feed_vars = []
6969
for s_input in slot_inputs:
@@ -82,14 +82,19 @@ def forward(self, slot_inputs):
8282
param_attr=paddle.ParamAttr(name="embedding"))
8383
#emb = self.embedding(s_input)
8484
self.inference_model_feed_vars.append(emb)
85+
8586
bow = paddle.fluid.layers.sequence_pool(input=emb, pool_type='sum')
87+
self.all_vars.append(bow)
8688
#paddle.fluid.layers.Print(bow)
8789
embs.append(bow)
8890

8991
y_dnn = paddle.concat(x=embs, axis=1)
92+
self.all_vars.append(y_dnn)
9093

9194
for n_layer in self._mlp_layers:
9295
y_dnn = n_layer(y_dnn)
96+
self.all_vars.append(y_dnn)
9397

9498
self.predict = F.sigmoid(paddle.clip(y_dnn, min=-15.0, max=15.0))
99+
self.all_vars.append(self.predict)
95100
return self.predict

models/rank/slot_dnn/static_model.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import math
1616
import paddle
17+
import paddle.fluid as fluid
1718

1819
from net import BenchmarkDNNLayer
1920

@@ -49,7 +50,7 @@ def create_feeds(self, is_infer=False):
4950
]
5051

5152
label = paddle.static.data(
52-
name="1", shape=[None, 1], dtype="int64", lod_level=1)
53+
name="click", shape=[None, 1], dtype="int64", lod_level=1)
5354

5455
feeds_list = [label] + slot_ids
5556
return feeds_list
@@ -67,13 +68,21 @@ def net(self, input, is_infer=False):
6768

6869
self.predict = dnn_model(self.slot_inputs)
6970

71+
# self.all_vars = input + dnn_model.all_vars
72+
self.all_vars = dnn_model.all_vars
73+
7074
predict_2d = paddle.concat(x=[1 - self.predict, self.predict], axis=1)
7175
#label_int = paddle.cast(self.label, 'int64')
72-
auc, batch_auc_var, _ = paddle.static.auc(input=predict_2d,
73-
label=self.label_input,
74-
slide_steps=0)
75-
self.inference_target_var = auc
76+
77+
auc, batch_auc_var, self.auc_stat_list = paddle.static.auc(
78+
input=predict_2d, label=self.label_input, slide_steps=0)
79+
self.metric_list = fluid.contrib.layers.ctr_metric_bundle(
80+
self.predict,
81+
fluid.layers.cast(
82+
x=self.label_input, dtype='float32'))
7683
self.inference_model_feed_vars = dnn_model.inference_model_feed_vars
84+
self.inference_target_var = self.predict
85+
7786
if is_infer:
7887
fetch_dict = {'auc': auc}
7988
return fetch_dict

0 commit comments

Comments
 (0)