 Load the training dataset and evaluation dataset from csv file into memory.
 Prepare input for model training and evaluation.
 """
-import time
-
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
 
 from official.recommendation import constants  # pylint: disable=g-bad-import-order
 
-# The column names and types of csv file
-_CSV_COLUMN_NAMES = [constants.USER, constants.ITEM, constants.RATING]
-_CSV_TYPES = [[0], [0], [0]]
-
 # The buffer size for shuffling train dataset.
 _SHUFFLE_BUFFER_SIZE = 1024
 
@@ -37,7 +31,7 @@ class NCFDataSet(object):
   """A class containing data information for model training and evaluation."""
 
   def __init__(self, train_data, num_users, num_items, num_negatives,
-               true_items, all_items):
+               true_items, all_items, all_eval_data):
     """Initialize NCFDataset class.
 
     Args:
@@ -50,17 +44,19 @@ def __init__(self, train_data, num_users, num_items, num_negatives,
         evaluation. Each entry is a latest positive instance for one user.
       all_items: A nested list, all items for evaluation, and each entry is the
         evaluation items for one user.
+      all_eval_data: A numpy array of the evaluation/test dataset.
     """
     self.train_data = train_data
     self.num_users = num_users
     self.num_items = num_items
     self.num_negatives = num_negatives
     self.eval_true_items = true_items
     self.eval_all_items = all_items
+    self.all_eval_data = all_eval_data
 
 
 def load_data(file_name):
-  """Load data from a csv file which splits on \t."""
+  """Load data from a tab-separated csv file."""
   lines = tf.gfile.Open(file_name, "r").readlines()
 
   # Process the file line by line
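For reference, a minimal sketch of the line format load_data splits apart, assuming the ml-1m style "user \t item \t rating \t timestamp" layout used by the NCF reference data (the exact column set is an assumption, not stated in this diff):

    # Hypothetical sample line; the trailing timestamp field is assumed.
    line = "0\t32\t4\t978824330\n"
    tokens = line.strip().split("\t")   # -> ["0", "32", "4", "978824330"]
    user, item, rating = int(tokens[0]), int(tokens[1]), int(tokens[2])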
@@ -122,13 +118,11 @@ def data_preprocessing(train_fname, test_fname, test_neg_fname, num_negatives):
     all_items.append(items)  # All items (including positive and negative items)
     all_test_data.extend(users_items)  # Generate test dataset
 
-  # Save test dataset into csv file
-  np.savetxt(constants.TEST_DATA, np.asarray(all_test_data).astype(int),
-             fmt="%i", delimiter=",")
-
   # Create NCFDataset object
   ncf_dataset = NCFDataSet(
-      train_data, num_users, num_items, num_negatives, true_items, all_items)
+      train_data, num_users, num_items, num_negatives, true_items, all_items,
+      np.asarray(all_test_data)
+  )
 
   return ncf_dataset
 
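With the np.savetxt round-trip gone, the evaluation candidates stay in memory as a two-column numpy array. A small illustration with invented values (under the standard NCF evaluation protocol each user is paired with one held-out positive plus sampled negatives):

    import numpy as np

    # Invented values: one (user, item) row per evaluation candidate.
    all_eval_data = np.asarray([[0, 25], [0, 64], [1, 133], [1, 7]])
    all_eval_data[:, 0]   # user column -> array([0, 0, 1, 1])
    all_eval_data[:, 1]   # item column -> array([ 25,  64, 133,   7])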
@@ -144,6 +138,9 @@ def generate_train_dataset(train_data, num_items, num_negatives):
     num_items: An integer, the number of items in positive training instances.
     num_negatives: An integer, the number of negative training instances
       following positive training instances. It is 4 by default.
+
+  Returns:
+    A numpy array of the training dataset.
   """
   all_train_data = []
   # A set with user-item tuples
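The array this function now returns feeds input_fn directly. A hedged sketch of its row layout, with invented values, where each positive [user, item, 1] row is followed by num_negatives sampled [user, item, 0] rows:

    import numpy as np

    # Invented values with num_negatives=2; rows are [user, item, label].
    train_data = np.asarray([
        [0, 32, 1],   # observed interaction (positive)
        [0, 901, 0],  # randomly sampled negative
        [0, 77, 0],   # randomly sampled negative
    ])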
@@ -158,13 +155,10 @@ def generate_train_dataset(train_data, num_items, num_negatives):
       j = np.random.randint(num_items)
       all_train_data.append([u, j, 0])
 
-  # Save the train dataset into a csv file
-  np.savetxt(constants.TRAIN_DATA, np.asarray(all_train_data).astype(int),
-             fmt="%i", delimiter=",")
+  return np.asarray(all_train_data)
 
 
-def input_fn(training, batch_size, repeat=1, ncf_dataset=None,
-             num_parallel_calls=1):
+def input_fn(training, batch_size, ncf_dataset, repeat=1):
   """Input function for model training and evaluation.
 
   The train input consists of 1 positive instance (user and item have
@@ -176,55 +170,39 @@ def input_fn(training, batch_size, repeat=1, ncf_dataset=None,
   Args:
     training: A boolean flag for training mode.
     batch_size: An integer, batch size for training and evaluation.
+    ncf_dataset: An NCFDataSet object, which contains the information about
+      training and test data.
     repeat: An integer, how many times to repeat the dataset.
-    ncf_dataset: An NCFDataSet object, which contains the information to
-      generate negative training instances.
-    num_parallel_calls: An integer, number of cpu cores for parallel input
-      processing.
 
   Returns:
     dataset: A tf.data.Dataset object containing examples loaded from the files.
   """
-  # Default test file name
-  file_name = constants.TEST_DATA
-
   # Generate random negative instances for training in each epoch
   if training:
-    t1 = time.time()
-    generate_train_dataset(
+    train_data = generate_train_dataset(
         ncf_dataset.train_data, ncf_dataset.num_items,
         ncf_dataset.num_negatives)
-    file_name = constants.TRAIN_DATA
-    tf.logging.info(
-        "Generating training instances: {:.1f}s".format(time.time() - t1))
-
-  # Create a dataset containing the text lines.
-  dataset = tf.data.TextLineDataset(file_name)
-
-  # Test dataset only has two fields while train dataset has three
-  num_cols = len(_CSV_COLUMN_NAMES) - 1
-  # Shuffle the dataset for training
-  if training:
+    # Get train features and labels
+    train_features = [
+        (constants.USER, np.expand_dims(train_data[:, 0], axis=1)),
+        (constants.ITEM, np.expand_dims(train_data[:, 1], axis=1))
+    ]
+    train_labels = [
+        (constants.RATING, np.expand_dims(train_data[:, 2], axis=1))]
+
+    dataset = tf.data.Dataset.from_tensor_slices(
+        (dict(train_features), dict(train_labels))
+    )
     dataset = dataset.shuffle(buffer_size=_SHUFFLE_BUFFER_SIZE)
-    num_cols += 1
-
-  def _parse_csv(line):
-    """Parse each line of the csv file."""
-    # Decode the line into its fields
-    fields = tf.decode_csv(line, record_defaults=_CSV_TYPES[0:num_cols])
-    fields = [tf.expand_dims(field, axis=0) for field in fields]
-
-    # Pack the result into a dictionary
-    features = dict(zip(_CSV_COLUMN_NAMES[0:num_cols], fields))
-    # Separate the labels from the features for training
-    if training:
-      labels = features.pop(constants.RATING)
-      return features, labels
-    # Return features only for test/prediction
-    return features
-
-  # Parse each line into a dictionary
-  dataset = dataset.map(_parse_csv, num_parallel_calls=num_parallel_calls)
+  else:
+    # Create eval/test dataset
+    test_user = ncf_dataset.all_eval_data[:, 0]
+    test_item = ncf_dataset.all_eval_data[:, 1]
+    test_features = [
+        (constants.USER, np.expand_dims(test_user, axis=1)),
+        (constants.ITEM, np.expand_dims(test_item, axis=1))]
+
+    dataset = tf.data.Dataset.from_tensor_slices(dict(test_features))
 
   # Repeat and batch the dataset
   dataset = dataset.repeat(repeat)
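For reference, a minimal usage sketch of the new input_fn signature; the Estimator wiring, model_fn, and batch sizes below are assumptions for illustration, not part of this diff:

    # Hypothetical caller (model_fn and batch sizes are invented):
    estimator = tf.estimator.Estimator(model_fn=model_fn)
    estimator.train(
        input_fn=lambda: input_fn(True, 256, ncf_dataset, repeat=1))
    predictions = estimator.predict(
        input_fn=lambda: input_fn(False, 100, ncf_dataset))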