PaddlePaddle
diff --git a/‎.pre-commit-config.yaml‎
100644100755 b/‎.pre-commit-config.yaml‎
100644100755
diff --git a/‎datasets/movielens_pinterest_NCF/data_process.sh‎
Lines changed: 6 additions & 0 deletions b/‎datasets/movielens_pinterest_NCF/data_process.sh‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎datasets/movielens_pinterest_NCF/get_test_data.py‎
Lines changed: 35 additions & 0 deletions b/‎datasets/movielens_pinterest_NCF/get_test_data.py‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎datasets/movielens_pinterest_NCF/get_train_data.py‎
Lines changed: 92 additions & 0 deletions b/‎datasets/movielens_pinterest_NCF/get_train_data.py‎
Lines changed: 92 additions & 0 deletions
diff --git a/‎datasets/movielens_pinterest_NCF/readme.md‎
Lines changed: 9 additions & 0 deletions b/‎datasets/movielens_pinterest_NCF/readme.md‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎datasets/movielens_pinterest_NCF/run.sh‎
Lines changed: 9 additions & 0 deletions b/‎datasets/movielens_pinterest_NCF/run.sh‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎datasets/readme.md‎
Lines changed: 1 addition & 0 deletions b/‎datasets/readme.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎models/recall/ncf/config.yaml‎
Lines changed: 40 additions & 0 deletions b/‎models/recall/ncf/config.yaml‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎models/recall/ncf/config_bigdata.yaml‎
Lines changed: 40 additions & 0 deletions b/‎models/recall/ncf/config_bigdata.yaml‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎models/recall/ncf/data/test/small_data.txt‎
Lines changed: 100 additions & 0 deletions b/‎models/recall/ncf/data/test/small_data.txt‎
Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,6 @@
+# Download the preprocessed NCF datasets and build the train/test CSV files.
+# -p keeps the script re-runnable when Data/ already exists.
+mkdir -p Data
+pip3 install scipy
+# Save the archive inside Data/ so that the unzip path below actually exists.
+wget -P Data/ https://paddlerec.bj.bcebos.com/ncf/Data.zip
+unzip Data/Data.zip -d Data/
+python3 get_train_data.py --num_neg 4 --train_data_path "Data/train_data.csv"
+python3 get_test_data.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+# Expand the test-negative file into "user,item,label" CSV rows.
+# Each input line looks like "(user,pos_item)<TAB>neg_1<TAB>neg_2 ...";
+# the positive item is written with label 1, every negative with label 0.
+src_path = './Data/ml-1m.test.negative'
+dst_path = './test_data.csv'
+
+# Context managers close both files even if a malformed line raises.
+with open(src_path, "r") as src, open(dst_path, "w") as dst:
+    for raw_line in src:
+        fields = raw_line.strip().split("\t")
+        if fields == [""]:
+            # Blank line: nothing to emit.
+            continue
+        pair = fields[0].strip("()").split(",")
+        user_id, positive_item = pair[0], pair[1]
+        # int() validates every negative id before anything is written
+        # for this line.
+        negative_items = [int(item) for item in fields[1:]]
+
+        dst.write(user_id + "," + positive_item + "," + "1" + "\n")
+        for item in negative_items:
+            dst.write(user_id + "," + str(item) + "," + "0" + "\n")
@@ -0,0 +1,92 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import scipy.sparse as sp
+import numpy as np
+from time import time
+import argparse
+
+
+def parse_args():
+    """Parse the command-line options of the training-data generator.
+
+    Returns:
+        argparse.Namespace with ``path``, ``dataset``, ``num_neg`` and
+        ``train_data_path`` attributes.
+    """
+    cli = argparse.ArgumentParser(description="Run GMF.")
+    # (flag, keyword arguments) pairs, registered in declaration order.
+    option_table = (
+        ('--path', dict(
+            nargs='?', default='Data/', help='Input data path.')),
+        ('--dataset', dict(
+            nargs='?', default='ml-1m', help='Choose a dataset.')),
+        ('--num_neg', dict(
+            type=int,
+            default=4,
+            help='Number of negative instances to pair with a positive instance.'
+        )),
+        ('--train_data_path', dict(
+            type=str, default="Data/train_data.csv", help='train_data_path')),
+    )
+    for flag, settings in option_table:
+        cli.add_argument(flag, **settings)
+    return cli.parse_args()
+
+
+def get_train_data(filename, write_file, num_negatives):
+    """Write a pointwise training set with randomly sampled negatives.
+
+    Reads a tab-separated rating file whose lines start with
+    ``user<TAB>item<TAB>rating`` and writes ``user,item,label`` lines to
+    ``write_file``. Every distinct (user, item) pair with rating > 0 is
+    written once with label 1, in first-seen order, followed by
+    ``num_negatives`` label-0 lines whose items are drawn with
+    ``np.random.randint`` from the items that user has no positive pair for.
+
+    Args:
+        filename: path of the ``.train.rating`` input file.
+        write_file: path of the CSV file to create (overwritten).
+        num_negatives: number of negative rows emitted per positive row.
+
+    Raises:
+        ValueError / IndexError: if a non-blank line cannot be parsed.
+    """
+    # Largest user / item id seen in the file (ids are zero-based indices).
+    num_users, num_items = 0, 0
+    # Positive pairs; a dict is used as an insertion-ordered set.
+    positives = {}
+    # Number of distinct positive items per user.
+    positives_per_user = {}
+    with open(filename, "r") as f:
+        for line in f:
+            if not line.strip():
+                continue
+            arr = line.split("\t")
+            user, item, rating = int(arr[0]), int(arr[1]), float(arr[2])
+            num_users = max(num_users, user)
+            num_items = max(num_items, item)
+            if rating > 0 and (user, item) not in positives:
+                positives[(user, item)] = None
+                positives_per_user[user] = positives_per_user.get(user, 0) + 1
+    print("users_num:", num_users, "items_num:", num_items)
+
+    # Item ids span 0..num_items inclusive, so the sampling range must be
+    # num_items + 1; otherwise the highest item id could never be drawn.
+    item_count = num_items + 1
+
+    # The context manager guarantees the output is flushed and closed.
+    with open(write_file, 'w') as out:
+        print("writing " + write_file)
+        for (u, i) in positives:
+            # positive instance
+            out.write("{0},{1},{2}".format(u, i, 1) + "\n")
+            if positives_per_user[u] >= item_count:
+                # Every item is a positive for this user: there is nothing
+                # to sample, and rejection sampling would never terminate.
+                continue
+            # negative instances (rejection sampling against the positives)
+            for _ in range(num_negatives):
+                j = np.random.randint(item_count)
+                while (u, j) in positives:
+                    j = np.random.randint(item_count)
+                out.write("{0},{1},{2}".format(u, j, 0) + "\n")
+
+
+if __name__ == "__main__":
+    # The rating file name is <path><dataset>.train.rating, built by plain
+    # string concatenation (so --path is expected to end with a separator).
+    cli_args = parse_args()
+    rating_file = cli_args.path + cli_args.dataset + ".train.rating"
+    get_train_data(rating_file, cli_args.train_data_path, cli_args.num_neg)
@@ -0,0 +1,9 @@
+# NCF使用的数据集
+
+本数据集供NCF模型复现论文使用，使用的是初步处理过后的数据，分为两个数据集：ml-1m（即MovieLens数据集）和pinterest-20（即Pinterest数据集）
+每个数据集分为三个文件，后缀分别为：（.test.negative），（.test.rating），（.train.rating）
+
+在train.rating和test.rating中的数据格式为：
+user_id + \t + item_id + \t + rating(用户评分) + \t + timestamp(时间戳)
+在test.negative中的数据格式为：
+(userID,itemID) + \t + negativeItemID1 + \t + negativeItemID2 …(包含99个negative样本)
@@ -0,0 +1,9 @@
+# Fetch the pre-built full train/test CSV files and the raw dataset archive.
+# wget -P writes straight into the target directory, so no cd is needed and
+# a failed cd can no longer send later downloads to the wrong place.
+mkdir -p big_train
+wget -P big_train https://paddlerec.bj.bcebos.com/ncf/train_data.csv
+mkdir -p big_test
+wget -P big_test https://paddlerec.bj.bcebos.com/ncf/test_data.csv
+wget https://paddlerec.bj.bcebos.com/ncf/Data.zip
@@ -25,3 +25,4 @@ sh data_process.sh
  |[senti_clas](https://baidu-nlp.bj.bcebos.com/sentiment_classification-dataset-1.0.0.tar.gz)|情感倾向分析（Sentiment Classification，简称Senta）针对带有主观描述的中文文本，可自动判断该文本的情感极性类别并给出相应的置信度。情感类型分为积极、消极。情感倾向分析能够帮助企业理解用户消费习惯、分析热点话题和危机舆情监控，为企业提供有利的决策支持|--|
  |[one_billion](http://www.statmt.org/lm-benchmark/)|拥有十亿个单词基准，为语言建模实验提供标准的训练和测试|[One Billion Word Benchmark for Measuring Progress in Statistical Language Modeling](https://arxiv.org/abs/1312.3005)|
  |[MIND](https://paddlerec.bj.bcebos.com/datasets/MIND/bigdata.zip)|MIND即MIcrosoft News Dataset的简写，MIND里的数据来自Microsoft News用户的行为日志。MIND的数据集里包含了1,000,000的用户以及这些用户与160,000的文章的交互行为。|[Microsoft(2020)](https://msnews.github.io)|
+ |[movielens_pinterest_NCF](https://paddlerec.bj.bcebos.com/ncf/Data.zip)|论文原作者处理过的movielens数据集和pinterest数据集|[《Neural Collaborative Filtering》](https://arxiv.org/pdf/1708.05031.pdf)|
@@ -0,0 +1,40 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+runner:
+  train_data_dir: "data/train"
+  train_reader_path: "movielens_reader"  # importlib format
+  train_batch_size: 5
+  model_save_path: "output_model_ncf"
+
+  use_gpu: False
+  epochs: 3
+  print_interval: 10
+  
+  test_data_dir: "data/test"
+  infer_reader_path: "movielens_reader"  # importlib format
+  infer_batch_size: 5
+  infer_load_path: "output_model_ncf"
+  infer_start_epoch: 2
+  infer_end_epoch: 3
+
+hyper_parameters:
+  optimizer: 
+    class: adam
+    learning_rate: 0.001
+  num_users: 6040
+  num_items: 3706
+  mf_dim: 8
+  mode: "NCF_NeuMF"  # optional: NCF_NeuMF, NCF_GMF, NCF_MLP
+  fc_layers: [64, 32, 16, 8]
@@ -0,0 +1,40 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+runner:
+  train_data_dir: "../../../datasets/movielens_pinterest_NCF/big_train"
+  train_reader_path: "movielens_reader"  # importlib format
+  train_batch_size: 256
+  model_save_path: "output_model_ncf"
+
+  use_gpu: False
+  epochs: 20
+  print_interval: 1
+  
+  test_data_dir: "../../../datasets/movielens_pinterest_NCF/big_test"
+  infer_reader_path: "movielens_reader"  # importlib format
+  infer_batch_size: 1
+  infer_load_path: "output_model_ncf"
+  infer_start_epoch: 19
+  infer_end_epoch: 20
+
+hyper_parameters:
+  optimizer: 
+    class: adam
+    learning_rate: 0.001
+  num_users: 6040
+  num_items: 3706
+  mf_dim: 8
+  mode: "NCF_NeuMF"  # optional: NCF_NeuMF, NCF_GMF, NCF_MLP
+  fc_layers: [64, 32, 16, 8]
@@ -0,0 +1,100 @@
+4764,174,1
+4764,2958,0
+4764,452,0
+4764,1946,0
+4764,3208,0
+2044,2237,1
+2044,1998,0
+2044,328,0
+2044,1542,0
+2044,1932,0
+4276,65,1
+4276,3247,0
+4276,942,0
+4276,3666,0
+4276,2222,0
+3933,682,1
+3933,2451,0
+3933,3695,0
+3933,1643,0
+3933,3568,0
+1151,1265,1
+1151,118,0
+1151,2532,0
+1151,2083,0
+1151,2350,0
+1757,876,1
+1757,201,0
+1757,3633,0
+1757,1068,0
+1757,2549,0
+3370,276,1
+3370,2435,0
+3370,606,0
+3370,910,0
+3370,2146,0
+5137,1018,1
+5137,2163,0
+5137,3167,0
+5137,2315,0
+5137,3595,0
+3933,2831,1
+3933,2881,0
+3933,2949,0
+3933,3660,0
+3933,417,0
+3102,999,1
+3102,1902,0
+3102,2161,0
+3102,3042,0
+3102,1113,0
+2022,336,1
+2022,1672,0
+2022,2656,0
+2022,3649,0
+2022,883,0
+2664,655,1
+2664,3660,0
+2664,1711,0
+2664,3386,0
+2664,1668,0
+25,701,1
+25,32,0
+25,2482,0
+25,3177,0
+25,2767,0
+1738,1643,1
+1738,2187,0
+1738,228,0
+1738,650,0
+1738,3101,0
+5411,1241,1
+5411,2546,0
+5411,3019,0
+5411,3618,0
+5411,1674,0
+638,579,1
+638,3512,0
+638,783,0
+638,2111,0
+638,1880,0
+3554,200,1
+3554,2893,0
+3554,2428,0
+3554,969,0
+3554,2741,0
+4283,1074,1
+4283,3056,0
+4283,2032,0
+4283,405,0
+4283,1505,0
+5111,200,1
+5111,3488,0
+5111,477,0
+5111,2790,0
+5111,40,0
+3964,515,1
+3964,1528,0
+3964,2173,0
+3964,1701,0
+3964,2832,0