Skip to content

Commit c79ec9f

Browse files
authored
Add create LoDTensor from list option and simplify recommender book example (#10946)
* add create lodtensor from list * modify book example
1 parent 72149c1 commit c79ec9f

File tree

4 files changed

+69
-108
lines changed

4 files changed

+69
-108
lines changed

python/paddle/fluid/lod_tensor.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -93,12 +93,12 @@ def _convert_lod(lod):
9393

9494

9595
def create_lod_tensor(data, lod, place):
96-
"""Create a lod tensor from a numpy array or an existing lod tensor.
96+
"""Create a lod tensor from a numpy array, a list, or an existing lod tensor.
9797
9898
Create a lod tensor by doing the following:
9999
1. Check that the length-based input lod is valid.
100100
2. Convert the length-based lod to an offset-based LoD.
101-
3. Copy the data from a numpy array or a existing lod tensor to
101+
3. Copy the data from a numpy array, a list or an existing lod tensor to
102102
CPU or GPU device (based on input place).
103103
4. Set the level of detail (LoD) using the offset-based LoD.
104104
@@ -117,7 +117,7 @@ def create_lod_tensor(data, lod, place):
117117
for more details regarding LoD.
118118
119119
Args:
120-
data: a numpy array or a LoDTensor holding the data to be copied.
120+
data: a numpy array or a LoDTensor or a list holding the data to be copied.
121121
lod: a list of lists indicating the length-based LoD info specified by the user.
122122
place: CPU or GPU place indicating where the data in the new LoDTensor will be stored.
123123
@@ -126,6 +126,18 @@ def create_lod_tensor(data, lod, place):
126126
"""
127127
if isinstance(data, core.LoDTensor):
128128
return create_lod_tensor(np.array(data), lod, place)
129+
elif isinstance(data, list):
130+
# When input data is a list, it only deals with the case where the base element
131+
# is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated
132+
# LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number
133+
# of words or other indexes in the sequence.
134+
new_lod = []
135+
for seq in data:
136+
new_lod.append(len(seq))
137+
assert [new_lod] == lod, "data and lod do not match"
138+
flattened_data = np.concatenate(data, axis=0).astype("int64")
139+
flattened_data = flattened_data.reshape([len(flattened_data), 1])
140+
return create_lod_tensor(flattened_data, lod, place)
129141
elif isinstance(data, np.ndarray):
130142
assert _validate_lod(lod,
131143
data.shape[0]), "the provided lod info is invalid"
@@ -134,9 +146,8 @@ def create_lod_tensor(data, lod, place):
134146
tensor.set_lod(_convert_lod(lod))
135147
return tensor
136148
else:
137-
raise Exception(
138-
"data should be either a LoDTensor or a Numpy array, but you pass type %s instead"
139-
% (type(data)))
149+
raise TypeError(
150+
"data should be either a LoDTensor, a Numpy array or a list")
140151

141152

142153
def create_random_int_lodtensor(lod, base_shape, place, low, high):

python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py

Lines changed: 17 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -197,43 +197,30 @@ def event_handler(event):
197197
num_epochs=1,
198198
event_handler=event_handler,
199199
reader=train_reader,
200-
feed_order=[
201-
'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id',
202-
'category_id', 'movie_title', 'score'
203-
])
200+
feed_order=feed_order)
204201

205202

206203
def infer(use_cuda, inference_program, save_path):
207204
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
208205
inferencer = fluid.Inferencer(
209206
inference_program, param_path=save_path, place=place)
210207

211-
def create_lod_tensor(data, lod=None):
212-
tensor = fluid.LoDTensor()
213-
if lod is None:
214-
# Tensor, the shape is [batch_size, 1]
215-
index = 0
216-
lod_0 = [index]
217-
for l in range(len(data)):
218-
index += 1
219-
lod_0.append(index)
220-
lod = [lod_0]
221-
tensor.set_lod(lod)
222-
223-
flattened_data = np.concatenate(data, axis=0).astype("int64")
224-
flattened_data = flattened_data.reshape([len(flattened_data), 1])
225-
tensor.set(flattened_data, place)
226-
return tensor
227-
228-
# Generate a random input for inference
229-
user_id = create_lod_tensor([[1]])
230-
gender_id = create_lod_tensor([[1]])
231-
age_id = create_lod_tensor([[0]])
232-
job_id = create_lod_tensor([[10]])
233-
movie_id = create_lod_tensor([[783]])
234-
category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
235-
movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
236-
[[0, 5]])
208+
# Use the first data from paddle.dataset.movielens.test() as input.
209+
# Use create_lod_tensor(data, lod, place) API to generate LoD Tensor,
210+
# where `data` is a list of sequences of index numbers, `lod` is
211+
# the level of detail (lod) info associated with `data`.
212+
# For example, data = [[10, 2, 3], [2, 3]] means that it contains
213+
# two sequences of indexes, of length 3 and 2, respectively.
214+
# Correspondingly, lod = [[3, 2]] contains one level of detail info,
215+
# indicating that `data` consists of two sequences of length 3 and 2.
216+
user_id = fluid.create_lod_tensor([[1]], [[1]], place)
217+
gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
218+
age_id = fluid.create_lod_tensor([[0]], [[1]], place)
219+
job_id = fluid.create_lod_tensor([[10]], [[1]], place)
220+
movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
221+
category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
222+
movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]],
223+
place)
237224

238225
results = inferencer.infer(
239226
{

python/paddle/fluid/tests/book/test_recommender_system.py

Lines changed: 28 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -173,63 +173,33 @@ def train(use_cuda, save_dirname, is_local=True):
173173
test_reader = paddle.batch(
174174
paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
175175

176-
feeding = {
177-
'user_id': 0,
178-
'gender_id': 1,
179-
'age_id': 2,
180-
'job_id': 3,
181-
'movie_id': 4,
182-
'category_id': 5,
183-
'movie_title': 6,
184-
'score': 7
185-
}
186-
187-
def func_feed(feeding, data):
188-
feed_tensors = {}
189-
for (key, idx) in feeding.iteritems():
190-
tensor = fluid.LoDTensor()
191-
if key != "category_id" and key != "movie_title":
192-
if key == "score":
193-
numpy_data = np.array(map(lambda x: x[idx], data)).astype(
194-
"float32")
195-
else:
196-
numpy_data = np.array(map(lambda x: x[idx], data)).astype(
197-
"int64")
198-
else:
199-
numpy_data = map(lambda x: np.array(x[idx]).astype("int64"),
200-
data)
201-
lod_info = [len(item) for item in numpy_data]
202-
offset = 0
203-
lod = [offset]
204-
for item in lod_info:
205-
offset += item
206-
lod.append(offset)
207-
numpy_data = np.concatenate(numpy_data, axis=0)
208-
tensor.set_lod([lod])
209-
210-
numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
211-
tensor.set(numpy_data, place)
212-
feed_tensors[key] = tensor
213-
return feed_tensors
176+
feed_order = [
177+
'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
178+
'movie_title', 'score'
179+
]
214180

215181
def train_loop(main_program):
216182
exe.run(framework.default_startup_program())
217183

184+
feed_list = [
185+
main_program.global_block().var(var_name) for var_name in feed_order
186+
]
187+
feeder = fluid.DataFeeder(feed_list, place)
188+
218189
PASS_NUM = 100
219190
for pass_id in range(PASS_NUM):
220191
for batch_id, data in enumerate(train_reader()):
221192
# train a mini-batch
222193
outs = exe.run(program=main_program,
223-
feed=func_feed(feeding, data),
194+
feed=feeder.feed(data),
224195
fetch_list=[avg_cost])
225196
out = np.array(outs[0])
226197
if (batch_id + 1) % 10 == 0:
227198
avg_cost_set = []
228199
for test_data in test_reader():
229-
avg_cost_np = exe.run(
230-
program=test_program,
231-
feed=func_feed(feeding, test_data),
232-
fetch_list=[avg_cost])
200+
avg_cost_np = exe.run(program=test_program,
201+
feed=feeder.feed(test_data),
202+
fetch_list=[avg_cost])
233203
avg_cost_set.append(avg_cost_np[0])
234204
break # test only 1 segment for speeding up CI
235205

@@ -279,23 +249,6 @@ def infer(use_cuda, save_dirname=None):
279249
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
280250
exe = fluid.Executor(place)
281251

282-
def create_lod_tensor(data, lod=None):
283-
tensor = fluid.LoDTensor()
284-
if lod is None:
285-
# Tensor, the shape is [batch_size, 1]
286-
index = 0
287-
lod_0 = [index]
288-
for l in range(len(data)):
289-
index += 1
290-
lod_0.append(index)
291-
lod = [lod_0]
292-
tensor.set_lod(lod)
293-
294-
flattened_data = np.concatenate(data, axis=0).astype("int64")
295-
flattened_data = flattened_data.reshape([len(flattened_data), 1])
296-
tensor.set(flattened_data, place)
297-
return tensor
298-
299252
inference_scope = fluid.core.Scope()
300253
with fluid.scope_guard(inference_scope):
301254
# Use fluid.io.load_inference_model to obtain the inference program desc,
@@ -307,26 +260,33 @@ def create_lod_tensor(data, lod=None):
307260

308261
# Use the first data from paddle.dataset.movielens.test() as input
309262
assert feed_target_names[0] == "user_id"
310-
user_id = create_lod_tensor([[1]])
263+
# Use create_lod_tensor(data, lod, place) API to generate LoD Tensor
264+
# where `data` is a list of sequences of index numbers, `lod` is
265+
# the level of detail (lod) info associated with `data`.
266+
# For example, data = [[10, 2, 3], [2, 3]] means that it contains
267+
# two sequences of indexes, of length 3 and 2, respectively.
268+
# Correspondingly, lod = [[3, 2]] contains one level of detail info,
269+
# indicating that `data` consists of two sequences of length 3 and 2.
270+
user_id = fluid.create_lod_tensor([[1]], [[1]], place)
311271

312272
assert feed_target_names[1] == "gender_id"
313-
gender_id = create_lod_tensor([[1]])
273+
gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
314274

315275
assert feed_target_names[2] == "age_id"
316-
age_id = create_lod_tensor([[0]])
276+
age_id = fluid.create_lod_tensor([[0]], [[1]], place)
317277

318278
assert feed_target_names[3] == "job_id"
319-
job_id = create_lod_tensor([[10]])
279+
job_id = fluid.create_lod_tensor([[10]], [[1]], place)
320280

321281
assert feed_target_names[4] == "movie_id"
322-
movie_id = create_lod_tensor([[783]])
282+
movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
323283

324284
assert feed_target_names[5] == "category_id"
325-
category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
285+
category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
326286

327287
assert feed_target_names[6] == "movie_title"
328-
movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
329-
[[0, 5]])
288+
movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]],
289+
[[5]], place)
330290

331291
# Construct feed as a dictionary of {feed_target_name: feed_target_data}
332292
# and results will contain a list of data corresponding to fetch_targets.

python/paddle/fluid/tests/test_lod_tensor.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,14 @@ def test_convert_lod(self):
5353
self.assertEqual(_convert_lod(lod), converted_lod)
5454

5555
def test_create_lod_tensor(self):
56-
# Only numpy array or a fluid LoDTensor is valid input to
57-
# create_lod_tensor function, currently a list of lists is not.
58-
data = [[1, 2], [3, 4]]
59-
self.assertRaises(Exception, create_lod_tensor, data, [],
56+
# Create LoDTensor from a list
57+
data = [[1, 2, 3], [3, 4]]
58+
wrong_lod = [[2, 2]]
59+
correct_lod = [[3, 2]]
60+
self.assertRaises(AssertionError, create_lod_tensor, data, wrong_lod,
6061
fluid.CPUPlace())
62+
tensor = create_lod_tensor(data, correct_lod, fluid.CPUPlace())
63+
self.assertEqual(tensor.lod(), [[0, 3, 5]])
6164

6265
# Create LoDTensor from numpy array
6366
data = numpy.random.random([10, 1])

0 commit comments

Comments
 (0)