Skip to content

Commit 4aaa458

Browse files
committed
fix codestyle problems
1 parent 7314eab commit 4aaa458

File tree

18 files changed

+1184
-2594
lines changed

18 files changed

+1184
-2594
lines changed

datasets/AmazonBook/preprocess.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import sys
15+
import os
16+
import json
17+
import numpy as np
18+
import argparse
19+
import random
20+
21+
# Command-line options shared by the train/valid/test preprocessing runs.
parser = argparse.ArgumentParser()
# Which split of book_data/book_<type>.txt to reshard.
parser.add_argument("-type", type=str, default="train", help="train|valid|test")
# Maximum length of the per-user history window.
parser.add_argument("-maxlen", type=int, default=20)
25+
26+
27+
def load_graph(source):
    """Parse a CSV of ``user_id,item_id,timestamp`` rows.

    Returns a dict mapping each user_id to the list of its item_ids,
    ordered by ascending timestamp.
    """
    interactions = {}
    with open(source) as handle:
        for row in handle:
            fields = row.strip().split(',')
            uid, iid, ts = int(fields[0]), int(fields[1]), int(fields[2])
            interactions.setdefault(uid, []).append((iid, ts))
    # Sort each user's events chronologically and drop the timestamps.
    return {
        uid: [iid for iid, _ in sorted(events, key=lambda e: e[1])]
        for uid, events in interactions.items()
    }
43+
44+
45+
if __name__ == "__main__":
46+
args = parser.parse_args()
47+
filelist = []
48+
for i in range(10):
49+
filelist.append(open(args.type + "/part-%d" % (i), "w"))
50+
action_graph = load_graph("book_data/book_" + args.type + ".txt")
51+
if args.type == "train":
52+
for uid, item_list in action_graph.items():
53+
for i in range(4, len(item_list)):
54+
if i >= args.maxlen:
55+
hist_item = item_list[i - args.maxlen:i]
56+
else:
57+
hist_item = item_list[:i]
58+
target_item = item_list[i]
59+
print(
60+
" ".join(["user_id:" + str(uid)] + [
61+
"hist_item:" + str(n) for n in hist_item
62+
] + ["target_item:" + str(target_item)]),
63+
file=random.choice(filelist))
64+
else:
65+
for uid, item_list in action_graph.items():
66+
k = int(len(item_list) * 0.8)
67+
if k >= args.maxlen:
68+
hist_item = item_list[k - args.maxlen:k]
69+
else:
70+
hist_item = item_list[:k]
71+
target_item = item_list[k:]
72+
print(
73+
" ".join(["user_id:" + str(uid), "target_item:0"] + [
74+
"hist_item:" + str(n) for n in hist_item
75+
] + ["eval_item:" + str(n) for n in target_item]),
76+
file=random.choice(filelist))

datasets/AmazonBook/run.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
2+
# Download the preprocessed AmazonBook dataset and build train/valid splits.
wget https://paddlerec.bj.bcebos.com/datasets/AmazonBook/AmazonBook.tar.gz

tar -xvf AmazonBook.tar.gz

# Recreate the output split directories from scratch.
rm -rf train valid
mkdir train
mkdir valid

# The raw train file is moved as-is; the valid split is resharded by
# preprocess.py (which reads book_data/book_valid.txt and writes valid/part-*).
mv book_data/book_train.txt train
python preprocess.py -type valid -maxlen 20

datasets/readme.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,4 @@ sh data_process.sh
2626
|[one_billion](http://www.statmt.org/lm-benchmark/)|拥有十亿个单词基准,为语言建模实验提供标准的训练和测试|[One Billion Word Benchmark for Measuring Progress in Statistical Language Modeling](https://arxiv.org/abs/1312.3005)|
2727
|[MIND](https://paddlerec.bj.bcebos.com/datasets/MIND/bigdata.zip)|MIND即MIcrosoft News Dataset的简写,MIND里的数据来自Microsoft News用户的行为日志。MIND的数据集里包含了1,000,000的用户以及这些用户与160,000的文章的交互行为。|[Microsoft(2020)](https://msnews.github.io)|
2828
|[movielens_pinterest_NCF](https://paddlerec.bj.bcebos.com/ncf/Data.zip)|论文原作者处理过的movielens数据集和pinterest数据集|[《Neural Collaborative Filtering 》](https://arxiv.org/pdf/1708.05031.pdf)|
29+
|[AmazonBook](https://paddlerec.bj.bcebos.com/datasets/AmazonBook/AmazonBook.tar.gz)|论文原作者处理过的AmazonBook数据集 |[《Controllable Multi-Interest Framework for Recommendation》](https://arxiv.org/abs/2005.09347)|

models/recall/mind/README.md

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,16 @@
55
├── data #样例数据
66
│ ├── demo #demo训练数据
77
│ │ └── demo.txt
8-
│ ├── processs.py #处理全量数据的脚本
9-
│ ├── run.sh #全量数据下载的脚本
108
│ └── valid #demo测试数据
119
│ └── part-0
12-
├── config.yaml #数据配置
10+
├── config.yaml #demo数据配置
11+
├── config_bigdata.yaml #全量数据配置
12+
├── infer.py #评测动态图
1313
├── dygraph_model.py #构建动态图
14-
├── evaluate_dygraph.py #评测动态图
15-
├── evaluate_reader.py #评测数据reader
16-
├── evaluate_static.py #评测静态图
1714
├── mind_reader.py #训练数据reader
15+
├── mind_infer_reader.py #评测数据reader
1816
├── net.py #模型核心组网(动静合一)
17+
├── static_infer.py #评测静态图
1918
└── static_model.py #构建静态图
2019
```
2120

@@ -57,11 +56,14 @@ Multi-Interest Network with Dynamic Routing (MIND) 是通过构建用户和商
5756

5857
测试数据的格式如下:
5958
```
60-
user_id:543354 hist_item:143963 hist_item:157508 hist_item:105486 hist_item:40502 hist_item:167813 hist_item:233564 hist_item:221866 hist_item:280310 hist_item:61638 hist_item:158494 hist_item:74449 hist_item:283630 hist_item:135155 hist_item:96176 hist_item:20139 hist_item:89420 hist_item:247990 hist_item:126605 target_item:172183 target_item:114193 target_item:79966 target_item:134420 target_item:50557
61-
user_id:543362 hist_item:119546 hist_item:78597 hist_item:86809 hist_item:63551 target_item:326165
62-
user_id:543366 hist_item:45463 hist_item:9903 hist_item:3956 hist_item:49726 target_item:199426
59+
user_id:487766 target_item:0 hist_item:17784 hist_item:126 hist_item:36 hist_item:124 hist_item:34 hist_item:1 hist_item:134 hist_item:6331 hist_item:141 hist_item:4336 hist_item:1373 eval_item:1062 eval_item:867 eval_item:62
60+
user_id:487793 target_item:0 hist_item:153428 hist_item:132997 hist_item:155723 hist_item:66546 hist_item:335397 hist_item:1926 eval_item:1122 eval_item:10105
61+
user_id:487805 target_item:0 hist_item:291025 hist_item:25190 hist_item:2820 hist_item:26047 hist_item:47259 hist_item:36376 eval_item:260145 eval_item:83865
62+
user_id:487811 target_item:0 hist_item:180837 hist_item:202701 hist_item:184587 hist_item:211642 eval_item:101621 eval_item:55716
63+
user_id:487820 target_item:0 hist_item:268524 hist_item:44318 hist_item:35153 hist_item:70847 eval_item:238318
64+
user_id:487825 target_item:0 hist_item:35602 hist_item:4353 hist_item:1540 hist_item:72921 eval_item:501
6365
```
64-
其中`hist_item``target_item`均是变长序列,读取方式可以看`evaluate_reader.py`
66+
其中`hist_item`和`eval_item`均是变长序列,读取方式可以看`mind_infer_reader.py`
6567

6668
## 运行环境
6769
PaddlePaddle>=2.0
@@ -75,16 +77,16 @@ os : windows/linux/macos
7577
在mind模型目录的快速执行命令如下:
7678
```
7779
# 进入模型目录
78-
# cd models/recall/word2vec # 在任意目录均可运行
80+
# cd models/recall/mind # 在任意目录均可运行
7981
# 动态图训练
8082
python -u ../../../tools/trainer.py -m config.yaml
8183
# 动态图预测
82-
python -u evaluate_dygraph.py -m config.yaml -top_n 50 #对测试数据进行预测,并通过faiss召回候选结果评测Reacll、NDCG、HitRate指标
84+
python -u infer.py -m config.yaml -top_n 50 #对测试数据进行预测,并通过faiss召回候选结果评测Recall、NDCG、HitRate指标
8385
8486
# 静态图训练
8587
python -u ../../../tools/static_trainer.py -m config.yaml # 全量数据运行config_bigdata.yaml
8688
# 静态图预测
87-
python -u evaluate_static.py -m config.yaml -top_n 50 #对测试数据进行预测,并通过faiss召回候选结果评测Reacll、NDCG、HitRate指标
89+
python -u static_infer.py -m config.yaml -top_n 50 #对测试数据进行预测,并通过faiss召回候选结果评测Reacll、NDCG、HitRate指标
8890
```
8991

9092
## 模型组网
@@ -97,21 +99,21 @@ python -u evaluate_static.py -m config.yaml -top_n 50 #对测试数据进行
9799
在全量数据下模型的指标如下:
98100
| 模型 | batch_size | epoch_num| Recall@50 | NDCG@50 | HitRate@50 |Time of each epoch |
99101
| :------| :------ | :------ | :------| :------ | :------| :------ |
100-
| mind(静态图) | 128 | 6 | 4.61% | 11.28%| 18.92%| -- |
101-
| mind(动态图) | 128 | 6 | 4.57% | 11.25%| 18.99%| -- |
102+
| mind(静态图) | 128 | 6 | 5.61% | 8.96% | 11.81% | -- |
103+
| mind(动态图) | 128 | 6 | 5.54% | 8.85% | 11.75% | -- |
102104

103105
1. 确认您当前所在目录为PaddleRec/models/recall/mind
104-
2. 进入data目录下执行run.sh脚本,会下载处理完成的AmazonBook数据集,并解压到指定目录
106+
2. 进入paddlerec/datasets/AmazonBook目录下执行run.sh脚本,会下载处理完成的AmazonBook数据集,并解压到指定目录
105107
```bash
106-
cd ./data
108+
cd ../../../datasets/AmazonBook
107109
sh run.sh
108110
```
109111
3. 切回模型目录,执行命令运行全量数据
110112
```bash
111-
d - # 切回模型目录
113+
cd - # 切回模型目录
112114
# 动态图训练
113-
python -u ../../../tools/trainer.py -m config.yaml # 全量数据运行config.yaml
114-
python -u evaluate_dygraph.py -m config.yaml # 全量数据运行config.yaml
115+
python -u ../../../tools/trainer.py -m config_bigdata.yaml # 全量数据运行config_bigdata
116+
python -u infer.py -m config_bigdata.yaml # 全量数据运行config_bigdata
115117
```
116118

117119
## 进阶使用

models/recall/mind/config.yaml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,15 @@ runner:
1818
use_gpu: True
1919
use_auc: False
2020
train_batch_size: 128
21-
epochs: 6
21+
epochs: 2
2222
print_interval: 500
2323
model_save_path: "output_model_mind"
2424
infer_batch_size: 128
25-
infer_reader_path: "evaluate_reader" # importlib format
25+
infer_reader_path: "mind_infer_reader" # importlib format
2626
test_data_dir: "data/valid"
2727
infer_load_path: "output_model_mind"
2828
infer_start_epoch: 0
29-
infer_end_epoch: 4
29+
infer_end_epoch: 1
3030

3131
# distribute_config
3232
# sync_mode: "async"
@@ -48,4 +48,3 @@ hyper_parameters:
4848
neg_samples: 1280
4949
maxlen: 20
5050
pow_p: 1.0
51-
distributed_embedding: 0
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
runner:
16+
train_data_dir: "../../../datasets/AmazonBook/train"
17+
train_reader_path: "mind_reader" # importlib format
18+
use_gpu: True
19+
use_auc: False
20+
train_batch_size: 128
21+
epochs: 6
22+
print_interval: 500
23+
model_save_path: "output_model_mind"
24+
infer_batch_size: 128
25+
infer_reader_path: "mind_infer_reader" # importlib format
26+
test_data_dir: "../../../datasets/AmazonBook/valid"
27+
infer_load_path: "output_model_mind"
28+
infer_start_epoch: 0
29+
infer_end_epoch: 1
30+
31+
# distribute_config
32+
# sync_mode: "async"
33+
# split_file_list: False
34+
# thread_num: 1
35+
36+
37+
# hyper parameters of user-defined network
38+
hyper_parameters:
39+
# optimizer config
40+
optimizer:
41+
class: Adam
42+
learning_rate: 0.005
43+
# strategy: async
44+
# user-defined <key, value> pairs
45+
item_count: 367983
46+
embedding_dim: 64
47+
hidden_size: 64
48+
neg_samples: 1280
49+
maxlen: 20
50+
pow_p: 1.0
Lines changed: 1 addition & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
1+
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -11,66 +11,3 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
import sys
15-
import os
16-
import json
17-
import numpy as np
18-
import argparse
19-
import random
20-
21-
parser = argparse.ArgumentParser()
22-
parser.add_argument(
23-
"-type", type=str, default="train", help="train|valid|test")
24-
parser.add_argument("-maxlen", type=int, default=20)
25-
26-
27-
def load_graph(source):
28-
graph = {}
29-
with open(source) as fr:
30-
for line in fr:
31-
conts = line.strip().split(',')
32-
user_id = int(conts[0])
33-
item_id = int(conts[1])
34-
time_stamp = int(conts[2])
35-
if user_id not in graph:
36-
graph[user_id] = []
37-
graph[user_id].append((item_id, time_stamp))
38-
39-
for user_id, value in graph.items():
40-
value.sort(key=lambda x: x[1])
41-
graph[user_id] = [x[0] for x in value]
42-
return graph
43-
44-
45-
if __name__ == "__main__":
46-
args = parser.parse_args()
47-
filelist = []
48-
for i in range(10):
49-
filelist.append(open(args.type + "/part-%d" % (i), "w"))
50-
action_graph = load_graph("data/book_data/book_" + args.type + ".txt")
51-
if args.type == "train":
52-
for uid, item_list in action_graph.items():
53-
for i in range(4, len(item_list)):
54-
if i >= args.maxlen:
55-
hist_item = item_list[i - args.maxlen:i]
56-
else:
57-
hist_item = item_list[:i]
58-
target_item = item_list[i]
59-
print(
60-
" ".join(["user_id:" + str(uid)] + [
61-
"hist_item:" + str(n) for n in hist_item
62-
] + ["target_item:" + str(target_item)]),
63-
file=random.choice(filelist))
64-
else:
65-
for uid, item_list in action_graph.items():
66-
k = int(len(item_list) * 0.8)
67-
if k >= args.maxlen:
68-
hist_item = item_list[k - args.maxlen:k]
69-
else:
70-
hist_item = item_list[:k]
71-
target_item = item_list[k:]
72-
print(
73-
" ".join(["user_id:" + str(uid), "target_item:0"] + [
74-
"hist_item:" + str(n) for n in hist_item
75-
] + ["eval_item:" + str(n) for n in target_item]),
76-
file=random.choice(filelist))

models/recall/mind/data/run.sh

Lines changed: 0 additions & 13 deletions
This file was deleted.

0 commit comments

Comments
 (0)