Merge pull request #542 from esythan/master

seemingwang · web-flow · commit 24295b3d252c · 2021-09-10T11:49:56.000+08:00
Feature importance
diff --git a/models/rank/slot_dnn/config_offline_infer.yaml b/models/rank/slot_dnn/config_offline_infer.yaml
@@ -0,0 +1,53 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# workspace
+#workspace: "models/rank/dnn"
+
+
+runner:
+  data_dir: "infer_data/offline"
+  # use_gpu: False
+  # use_auc: True
+  batch_size: 10
+  print_interval: 10
+
+  sync_mode: "async"
+  thread_num: 1
+  reader_type: "InmemoryDataset"  # DataLoader / QueueDataset / RecDataset / InmemoryDataset
+  pipe_command: "python3 inmemorydataset_reader.py"
+
+  init_model_path: "output_model_benchdnn_queue/20190720/6"
+
+  dataset_debug: False
+  parse_ins_id: True
+  dump_fields_path: "dump_offline_infer"
+
+  shots_shuffle_list: [["2"], ["3"]]
+
+
+# hyper parameters of user-defined network
+hyper_parameters:
+  # optimizer config
+  optimizer:
+    class: Adam
+    learning_rate: 0.001
+    strategy: async
+  # user-defined <key, value> pairs
+  dict_dim : 1000000
+  emb_dim : 9
+  sparse_feature_dim: 9
+  slot_num: 408
+  layer_sizes: [512, 256, 128, 128, 128]
+  distributed_embedding: 0
diff --git a/models/rank/slot_dnn/inmemorydataset_reader.py b/models/rank/slot_dnn/inmemorydataset_reader.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import yaml
+import six
+import os
+import copy
+import paddle.distributed.fleet as fleet
+import logging
+import numpy as np
+
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+fea_dict = {}
+
+
+class Reader(fleet.MultiSlotStringDataGenerator):
+    def init(self, config):
+        self.config = config
+        padding = "0"
+        #sparse_slots = "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
+        self.slots = self.slot_reader(slot_num=408)
+        self.slot2index = {}
+        self.visit = {}
+        for i in range(len(self.slots)):
+            self.slot2index[self.slots[i]] = i
+            self.visit[self.slots[i]] = False
+        self.padding = padding
+        logger.info("pipe init success")
+
+    def slot_reader(self, slot_num=0, slot_file='./slot'):
+        slots = []
+        # slot is not 0, label=1, 
+        if slot_num > 0:
+            for i in range(slot_num + 2):
+                if i == 0:
+                    continue
+                slots.append(str(i))
+        else:
+            with open(slot_file, "r") as rf:
+                for line in rf.readlines():
+                    slots.append(line.strip())
+        return slots
+
+    def line_process(self, line):
+        ins_id, line = line.strip().split("\t")
+        line = line.strip().split(" ")
+        output = [(i, []) for i in self.slots]
+        for i in line:
+            slot_feasign = i.split(":")
+            if len(slot_feasign) < 2:
+                print(i)
+            slot = slot_feasign[1]
+            if slot not in self.slots:
+                continue
+            feasign = int(slot_feasign[0])
+            if feasign not in fea_dict:
+                fea_dict[feasign] = str(len(fea_dict))
+            output[self.slot2index[slot]][1].append(fea_dict[feasign])
+            self.visit[slot] = True
+        for i in self.visit:
+            slot = i
+            if not self.visit[slot]:
+                output[self.slot2index[i]][1].extend([self.padding])
+            else:
+                self.visit[slot] = False
+        output = [("ins_id", [ins_id])] + output
+        return output
+        #return [label] + sparse_feature + [dense_feature]
+
+    def generate_sample(self, line):
+        "Dataset Generator"
+
+        def reader():
+            output_dict = self.line_process(line)
+            # {key, value} dict format: {'labels': [1], 'sparse_slot1': [2, 3], 'sparse_slot2': [4, 5, 6, 8], 'dense_slot': [1,2,3,4]} 
+            # dict must match static_model.create_feed()
+            yield output_dict
+
+        return reader
+
+
+if __name__ == "__main__":
+    yaml_path = sys.argv[1]
+    utils_path = sys.argv[2]
+    sys.path.append(utils_path)
+    import common
+    yaml_helper = common.YamlHelper()
+    config = yaml_helper.load_yaml(yaml_path)
+
+    r = Reader()
+    r.init(config)
+    r.run_from_stdin()
diff --git a/models/rank/slot_dnn/net.py b/models/rank/slot_dnn/net.py
@@ -63,7 +63,7 @@ def __init__(self,
                 self._mlp_layers.append(act)
 
     def forward(self, slot_inputs):
-
+        self.all_vars = []
         embs = []
         self.inference_model_feed_vars = []
         for s_input in slot_inputs:
@@ -82,14 +82,19 @@ def forward(self, slot_inputs):
                     param_attr=paddle.ParamAttr(name="embedding"))
                 #emb = self.embedding(s_input)
             self.inference_model_feed_vars.append(emb)
+
             bow = paddle.fluid.layers.sequence_pool(input=emb, pool_type='sum')
+            self.all_vars.append(bow)
             #paddle.fluid.layers.Print(bow)
             embs.append(bow)
 
         y_dnn = paddle.concat(x=embs, axis=1)
+        self.all_vars.append(y_dnn)
 
         for n_layer in self._mlp_layers:
             y_dnn = n_layer(y_dnn)
+            self.all_vars.append(y_dnn)
 
         self.predict = F.sigmoid(paddle.clip(y_dnn, min=-15.0, max=15.0))
+        self.all_vars.append(self.predict)
         return self.predict
diff --git a/models/rank/slot_dnn/static_model.py b/models/rank/slot_dnn/static_model.py
@@ -14,6 +14,7 @@
 
 import math
 import paddle
+import paddle.fluid as fluid
 
 from net import BenchmarkDNNLayer
 
@@ -49,7 +50,7 @@ def create_feeds(self, is_infer=False):
         ]
 
         label = paddle.static.data(
-            name="1", shape=[None, 1], dtype="int64", lod_level=1)
+            name="click", shape=[None, 1], dtype="int64", lod_level=1)
 
         feeds_list = [label] + slot_ids
         return feeds_list
@@ -67,13 +68,21 @@ def net(self, input, is_infer=False):
 
         self.predict = dnn_model(self.slot_inputs)
 
+        # self.all_vars = input + dnn_model.all_vars
+        self.all_vars = dnn_model.all_vars
+
         predict_2d = paddle.concat(x=[1 - self.predict, self.predict], axis=1)
         #label_int = paddle.cast(self.label, 'int64')
-        auc, batch_auc_var, _ = paddle.static.auc(input=predict_2d,
-                                                  label=self.label_input,
-                                                  slide_steps=0)
-        self.inference_target_var = auc
+
+        auc, batch_auc_var, self.auc_stat_list = paddle.static.auc(
+            input=predict_2d, label=self.label_input, slide_steps=0)
+        self.metric_list = fluid.contrib.layers.ctr_metric_bundle(
+            self.predict,
+            fluid.layers.cast(
+                x=self.label_input, dtype='float32'))
         self.inference_model_feed_vars = dnn_model.inference_model_feed_vars
+        self.inference_target_var = self.predict
+
         if is_infer:
             fetch_dict = {'auc': auc}
             return fetch_dict
diff --git a/tools/feature_importance.py b/tools/feature_importance.py
diff --git a/tools/utils/static_ps/metric_helper.py b/tools/utils/static_ps/metric_helper.py