Skip to content

Commit 224bd8f

Browse files
kexinzhaowangkuiyi
authored and committed
Add lod_tensor.py for ease of creating lod tensor in book examples (#10817)
* add lod_tensor utility python module * add lod_tensor test code * add more lod tensor tests * modify word2vec example code using new api * add comment
1 parent 0d598cf commit 224bd8f

File tree

4 files changed

+284
-20
lines changed

4 files changed

+284
-20
lines changed

python/paddle/fluid/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
InferenceTranspiler, memory_optimize, release_memory
4949
from concurrency import (Go, make_channel, channel_send, channel_recv,
5050
channel_close, Select)
51+
from lod_tensor import create_lod_tensor, create_random_int_lodtensor
5152
import clip
5253
import profiler
5354
import unique_name
@@ -59,7 +60,7 @@
5960

6061
__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
6162
trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
62-
parallel_executor.__all__ + [
63+
parallel_executor.__all__ + lod_tensor.__all__ + [
6364
'io',
6465
'initializer',
6566
'layers',

python/paddle/fluid/lod_tensor.py

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import core
16+
import numpy as np
17+
18+
__all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
19+
20+
21+
def _validate_lod(lod, tensor_height=-1):
22+
"""Check whether the input length-based lod info is valid.
23+
24+
There are several things to check:
25+
1. lod should be a list of lists. Empty list is fine.
26+
2. The length of each sublist (a lod level) should be at least one.
27+
3. Each element in each lod level should be an integer greater than 0.
28+
4. The sum of one lod level should be equal to the length of the next lod level.
29+
5. The sum of the last lod level should be equal to the tensor height.
30+
Bypass this check if user does not provide tensor_height as input.
31+
32+
Args:
33+
lod: the length-based lod info, e.g., [[2, 3], [2, 1, 2, 3, 4]].
34+
tensor_height: the outermost dimension of the tensor with which the input
35+
lod is associated with.
36+
37+
Returns:
38+
A boolean indicating whether the input lod is valid or not.
39+
"""
40+
assert isinstance(lod, list), "lod should be a list"
41+
# Empty lod is fine
42+
if len(lod) == 0:
43+
return True
44+
45+
lod_sum = []
46+
for level in lod:
47+
assert isinstance(level, list), "each item in lod should be a list"
48+
# Each level of lod should have at least one length info
49+
if len(level) < 1:
50+
return False
51+
level_sum = 0
52+
for lod_len in level:
53+
# Each length in a level should be > 0
54+
if lod_len <= 0:
55+
return False
56+
level_sum += lod_len
57+
lod_sum.append(level_sum)
58+
59+
for idx, val in enumerate(lod_sum[:-1]):
60+
# Each level's sum should be equal to
61+
# the number of items in the next level
62+
if val != len(lod[idx + 1]):
63+
return False
64+
65+
if tensor_height == -1:
66+
return True
67+
else:
68+
# Last level's sum should be equal to the tensor height
69+
return lod_sum[-1] == tensor_height
70+
71+
72+
def _convert_lod(lod):
73+
"""Convert a length-based lod to a offset-based lod.
74+
75+
If the length-based lod is [[2, 3], [2, 1, 2, 3, 4]],
76+
then the offset-based lod is [[0, 2, 5], [0, 2, 3, 5, 8, 12]].
77+
78+
Args:
79+
lod: a length-based lod info.
80+
81+
Returns:
82+
A list of lists as the offset-based lod converted to from the input lod.
83+
"""
84+
new_lod = []
85+
for level in lod:
86+
cur_len = 0
87+
new_level = [cur_len]
88+
for lod_len in level:
89+
cur_len += lod_len
90+
new_level.append(cur_len)
91+
new_lod.append(new_level)
92+
return new_lod
93+
94+
95+
def create_lod_tensor(data, lod, place):
    """Create a lod tensor from a numpy array or an existing lod tensor.

    Create a lod tensor by doing the following:
        1. Check that the length-based input lod is valid.
        2. Convert the length-based lod to an offset-based LoD.
        3. Copy the data from a numpy array or an existing lod tensor to
           CPU or GPU device (based on input place).
        4. Set the level of detail (LoD) using the offset-based LoD.

    Use example:
        Suppose we want a LoDTensor to hold data for sequences of words, where
        each word is represented by an integer, and we want to represent two
        sentences, one of 2 words and one of 3 words.

        Then 'data' can be a numpy array of integers with shape (5, 1).
        'lod' will be [[2, 3]], indicating the length (# of words) in each
        sentence. This length-based input lod [[2, 3]] will be converted to
        the offset-based lod [[0, 2, 5]] inside the function call.

    Please refer to
    github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md
    for more details regarding LoD.

    Args:
        data: a numpy array or a LoDTensor holding the data to be copied.
        lod: a list of lists indicating the length-based LoD info specified
            by the user.
        place: CPU or GPU place indicating where the data in the new
            LoDTensor will be stored.

    Returns:
        A fluid LoDTensor object with tensor data and lod info.

    Raises:
        TypeError: if data is neither a LoDTensor nor a numpy array.
        AssertionError: if lod is not a valid length-based lod for data.
    """
    if isinstance(data, core.LoDTensor):
        # Go through a numpy array so the numpy branch below builds a fresh
        # LoDTensor for the result.
        return create_lod_tensor(np.array(data), lod, place)

    if not isinstance(data, np.ndarray):
        # TypeError is still an Exception, so existing handlers keep working.
        raise TypeError(
            "data should be either a LoDTensor or a Numpy array, but you pass type %s instead"
            % (type(data)))

    # Raise explicitly instead of using `assert` so an invalid lod is still
    # rejected under `python -O`; the exception type is unchanged.
    if not _validate_lod(lod, data.shape[0]):
        raise AssertionError("the provided lod info is invalid")

    tensor = core.LoDTensor()
    tensor.set(data, place)
    tensor.set_lod(_convert_lod(lod))
    return tensor
140+
141+
142+
def create_random_int_lodtensor(lod, base_shape, place, low, high):
    """Create a LoDTensor containing random integers.

    This function is frequently used in the book examples. So we revised it
    based on the new create_lod_tensor API and put it here in the lod_tensor
    module to simplify the code.

    The function does the following:
        1. Calculate the overall shape of the LoDTensor based on the
           length-based 'lod' input and the shape of the basic element in
           'base_shape'.
        2. Create a numpy array of this shape filled with random integers.
        3. Create the LoDTensor using the create_lod_tensor API.

    Suppose we want a LoDTensor to hold data for sequences of words, where
    each word is represented by an integer, and we want to represent two
    sentences, one of 2 words and one of 3 words. Then 'base_shape' is [1]
    and the input length-based 'lod' is [[2, 3]]. The overall shape of the
    LoDTensor would be [5, 1], holding 5 words for two sentences.

    Args:
        lod: a list of lists indicating the length-based LoD info specified
            by the user. It must contain at least one level, because the
            last level determines the tensor height.
        base_shape: the shape of the basic element to be held by the
            LoDTensor; must be a list.
        place: CPU or GPU place indicating where the data in the new
            LoDTensor will be stored.
        low: the lower bound (inclusive) of the random integers.
        high: the upper bound (inclusive) of the random integers.

    Returns:
        A fluid LoDTensor object with tensor data and lod info.

    Raises:
        AssertionError: if base_shape is not a list.
    """
    # Raise explicitly instead of using `assert` so the check is still
    # enforced under `python -O`; the exception type is unchanged.
    if not isinstance(base_shape, list):
        raise AssertionError("base_shape should be a list")

    converted_lod = _convert_lod(lod)
    # The last offset of the last level is the total number of basic
    # elements; prepend it to the element shape.
    overall_shape = [converted_lod[-1][-1]] + base_shape
    # The range of integer data elements is [low, high]. random_integers is
    # deprecated in NumPy; randint samples from the half-open range
    # [low, high), hence high + 1 to keep the upper bound inclusive.
    data = np.random.randint(low, high + 1, overall_shape).astype("int64")
    return create_lod_tensor(data, lod, place)

python/paddle/fluid/tests/book/test_word2vec.py

Lines changed: 16 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,6 @@
2121
import sys
2222

2323

24-
def create_random_lodtensor(lod, place, low, high):
25-
# The range of data elements is [low, high]
26-
data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
27-
res = fluid.LoDTensor()
28-
res.set(data, place)
29-
res.set_lod([lod])
30-
return res
31-
32-
3324
def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
3425
PASS_NUM = 100
3526
EMBED_SIZE = 32
@@ -175,16 +166,22 @@ def infer(use_cuda, save_dirname=None):
175166
word_dict = paddle.dataset.imikolov.build_dict()
176167
dict_size = len(word_dict)
177168

178-
# Setup inputs, by creating 4 words, the lod of which should be [0, 1]
179-
lod = [0, 1]
180-
first_word = create_random_lodtensor(
181-
lod, place, low=0, high=dict_size - 1)
182-
second_word = create_random_lodtensor(
183-
lod, place, low=0, high=dict_size - 1)
184-
third_word = create_random_lodtensor(
185-
lod, place, low=0, high=dict_size - 1)
186-
fourth_word = create_random_lodtensor(
187-
lod, place, low=0, high=dict_size - 1)
169+
# Setup inputs by creating 4 LoDTensors representing 4 words. Here each word
170+
# is simply an index to look up for the corresponding word vector and hence
171+
# the shape of word (base_shape) should be [1]. The length-based level of
172+
# detail (lod) info of each LoDTensor should be [[1]] meaning there is only
173+
# one lod_level and there is only one sequence of one word on this level.
174+
# Note that lod info should be a list of lists.
175+
lod = [[1]]
176+
base_shape = [1]
177+
first_word = fluid.create_random_int_lodtensor(
178+
lod, base_shape, place, low=0, high=dict_size - 1)
179+
second_word = fluid.create_random_int_lodtensor(
180+
lod, base_shape, place, low=0, high=dict_size - 1)
181+
third_word = fluid.create_random_int_lodtensor(
182+
lod, base_shape, place, low=0, high=dict_size - 1)
183+
fourth_word = fluid.create_random_int_lodtensor(
184+
lod, base_shape, place, low=0, high=dict_size - 1)
188185

189186
assert feed_target_names[0] == 'firstw'
190187
assert feed_target_names[1] == 'secondw'
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import paddle.fluid as fluid
16+
from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor, _validate_lod, _convert_lod
17+
import numpy
18+
import unittest
19+
20+
21+
class TestLoDTensor(unittest.TestCase):
    """Unit tests for the length-based lod helpers and LoDTensor factories."""

    def test_validate_lod(self):
        # Anything that is not a list of lists must trip the type assertions.
        for malformed in ((1, 2, 1), [[1, 2], (2, 3)], [1, 2, 3]):
            self.assertRaises(AssertionError, _validate_lod, malformed, -1)

        # An empty lod is valid; empty levels and non-positive lengths are not.
        self.assertTrue(_validate_lod([], -1))
        self.assertFalse(_validate_lod([[], [1], [3]], -1))
        self.assertFalse(_validate_lod([[0], [-1], [3]], -1))

        # Each level's sum should be equal to the number of items in the next
        # level. Moreover, last level's sum should be equal to the tensor
        # height.
        self.assertTrue(
            _validate_lod(
                [[2, 3], [1, 3, 1, 2, 1]], tensor_height=8))
        self.assertFalse(_validate_lod([[1, 3], [2, 1, 3]], tensor_height=6))
        self.assertFalse(
            _validate_lod(
                [[1, 3], [2, 1, 3, 4]], tensor_height=5))

    def test_convert_lod(self):
        # (length-based lod, expected offset-based lod)
        cases = [
            ([[1, 2, 3]], [[0, 1, 3, 6]]),
            ([[2, 3], [1, 3, 1, 2, 1]], [[0, 2, 5], [0, 1, 4, 5, 7, 8]]),
        ]
        for length_lod, offset_lod in cases:
            self.assertEqual(_convert_lod(length_lod), offset_lod)

    def test_create_lod_tensor(self):
        # Only a numpy array or a fluid LoDTensor is valid input to the
        # create_lod_tensor function; currently a list of lists is not.
        with self.assertRaises(Exception):
            create_lod_tensor([[1, 2], [3, 4]], [], fluid.CPUPlace())

        # Create LoDTensor from numpy array
        source = create_lod_tensor(
            numpy.random.random([10, 1]), [[2, 1], [3, 3, 4]],
            fluid.CPUPlace())
        source_offsets = [[0, 2, 3], [0, 3, 6, 10]]
        self.assertEqual(source.lod(), source_offsets)

        # Create LoDTensor from another LoDTensor, they are different instances
        derived = create_lod_tensor(source, [[2, 2, 1], [1, 2, 2, 3, 2]],
                                    fluid.CPUPlace())
        self.assertEqual(source.lod(), source_offsets)
        self.assertEqual(derived.lod(), [[0, 2, 4, 5], [0, 1, 3, 5, 8, 10]])

    def test_create_random_int_lodtensor(self):
        # The shape of a word, commonly used in speech and NLP problem, is [1]
        word_shape = [1]
        dict_size = 10000
        tensor = create_random_int_lodtensor([[2, 3, 5]], word_shape,
                                             fluid.CPUPlace(), 0,
                                             dict_size - 1)
        self.assertEqual(tensor.lod(), [[0, 2, 5, 10]])
        self.assertEqual(tensor.shape(), [10, 1])
85+
86+
87+
if __name__ == '__main__':
88+
unittest.main()

0 commit comments

Comments
 (0)