Skip to content

Commit 2183d01

Browse files
committed
Add pool3d and conv3d_trans Python API
1 parent 3ab3253 commit 2183d01

File tree

1 file changed

+257
-16
lines changed
  • python/paddle/fluid/layers

1 file changed

+257
-16
lines changed

python/paddle/fluid/layers/nn.py

Lines changed: 257 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,16 @@
3939
'chunk_eval',
4040
'sequence_conv',
4141
'conv2d',
42+
'conv3d',
4243
'sequence_pool',
4344
'sequence_softmax',
4445
'softmax',
4546
'pool2d',
47+
'pool3d',
4648
'batch_norm',
4749
'beam_search_decode',
4850
'conv2d_transpose',
51+
'conv3d_transpose',
4952
'sequence_expand',
5053
'lstm_unit',
5154
'reduce_sum',
@@ -1385,13 +1388,12 @@ def conv3d(input,
13851388
13861389
The convolution3D layer calculates the output based on the input, filter
13871390
and strides, paddings, dilations, groups parameters. Input(Input) and
1388-
Output(Output) are in NCHW format. Where N is batch size, C is the number of
1389-
channels, H is the height of the feature, and W is the width of the feature.
1390-
The details of convolution layer, please refer UFLDL's `convolution,
1391-
<http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ .
1392-
If bias attribution and activation type are provided, bias is added to the
1393-
output of the convolution, and the corresponding activation function is
1394-
applied to the final result.
1391+
Output(Output) are in NCDHW format. Where N is batch size, C is the number of
1392+
channels, D is the depth of the feature, H is the height of the feature,
1393+
and W is the width of the feature. Convolution3D is similar to Convolution2D
1394+
but adds one dimension(depth). If bias attribution and activation type are
1395+
provided, bias is added to the output of the convolution, and the
1396+
corresponding activation function is applied to the final result.
13951397
13961398
For each input :math:`X`, the equation is:
13971399
@@ -1401,8 +1403,8 @@ def conv3d(input,
14011403
14021404
In the above equation:
14031405
1404-
* :math:`X`: Input value, a tensor with NCHW format.
1405-
* :math:`W`: Filter value, a tensor with MCHW format.
1406+
* :math:`X`: Input value, a tensor with NCDHW format.
1407+
* :math:`W`: Filter value, a tensor with MCDHW format.
14061408
* :math:`\\ast`: Convolution operation.
14071409
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
14081410
* :math:`\\sigma`: Activation function.
@@ -1433,16 +1435,16 @@ def conv3d(input,
14331435
num_filters(int): The number of filter. It is as same as the output
14341436
image channel.
14351437
filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
1436-
it must contain two integers, (filter_size_D, filter_size_H, filter_size_W).
1438+
it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
14371439
Otherwise, the filter will be a cube.
14381440
stride (int|tuple): The stride size. If stride is a tuple, it must
1439-
contain two integers, (stride_D, stride_H, stride_W). Otherwise, the
1441+
contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
14401442
stride_D = stride_H = stride_W = stride. Default: stride = 1.
14411443
padding (int|tuple): The padding size. If padding is a tuple, it must
1442-
contain two integers, (padding_D, padding_H, padding_W). Otherwise, the
1444+
contain three integers, (padding_D, padding_H, padding_W). Otherwise, the
14431445
padding_D = padding_H = padding_W = padding. Default: padding = 0.
14441446
dilation (int|tuple): The dilation size. If dilation is a tuple, it must
1445-
contain two integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
1447+
contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
14461448
dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1.
14471449
groups (int): The groups number of the Conv3d Layer. According to grouped
14481450
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
@@ -1528,7 +1530,7 @@ def _get_default_param_initializer():
15281530
'use_mkldnn': use_mkldnn
15291531
})
15301532

1531-
pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=3)
1533+
pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
15321534

15331535
return helper.append_activation(pre_act)
15341536

@@ -1720,12 +1722,84 @@ def pool2d(input,
17201722
if not isinstance(use_cudnn, bool):
17211723
raise ValueError("use_cudnn should be True or False")
17221724

1723-
helper = LayerHelper('pool2d', **locals())
1725+
l_type = 'conv2d'
1726+
1727+
helper = LayerHelper(l_type, **locals())
1728+
dtype = helper.input_dtype()
1729+
pool_out = helper.create_tmp_variable(dtype)
1730+
1731+
helper.append_op(
1732+
type=l_type,
1733+
inputs={"X": input},
1734+
outputs={"Out": pool_out},
1735+
attrs={
1736+
"pooling_type": pool_type,
1737+
"ksize": pool_size,
1738+
"global_pooling": global_pooling,
1739+
"strides": pool_stride,
1740+
"paddings": pool_padding,
1741+
"use_cudnn": use_cudnn,
1742+
"ceil_mode": ceil_mode,
1743+
"use_mkldnn": use_mkldnn
1744+
})
1745+
1746+
return pool_out
1747+
1748+
1749+
def pool3d(input,
1750+
pool_size=-1,
1751+
pool_type="max",
1752+
pool_stride=1,
1753+
pool_padding=0,
1754+
global_pooling=False,
1755+
use_cudnn=True,
1756+
ceil_mode=False,
1757+
use_mkldnn=False,
1758+
name=None):
1759+
"""
1760+
This function adds the operator for pooling in 3-dimensions, using the
1761+
pooling configurations mentioned in input parameters.
1762+
1763+
Args:
1764+
input (Variable): ${input_comment}
1765+
pool_size (int): ${ksize_comment}
1766+
pool_type (str): ${pooling_type_comment}
1767+
pool_stride (int): stride of the pooling layer.
1768+
pool_padding (int): padding size.
1769+
global_pooling (bool): ${global_pooling_comment}
1770+
use_cudnn (bool): ${use_cudnn_comment}
1771+
ceil_mode (bool): ${ceil_mode_comment}
1772+
use_mkldnn (bool): ${use_mkldnn_comment}
1773+
name (str): A name for this layer(optional). If set None, the layer
1774+
will be named automatically.
1775+
1776+
Returns:
1777+
Variable: output of pool3d layer.
1778+
"""
1779+
if pool_type not in ["max", "avg"]:
1780+
raise ValueError(
1781+
"Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
1782+
str(pool_type))
1783+
1784+
if global_pooling is False and pool_size == -1:
1785+
raise ValueError(
1786+
"When the global_pooling is False, pool_size must be passed "
1787+
"and be a valid value. Received pool_size: " + str(pool_size))
1788+
1789+
pool_size = utils.convert_to_list(pool_size, 3, 'pool_size')
1790+
pool_padding = utils.convert_to_list(pool_padding, 3, 'pool_padding')
1791+
pool_stride = utils.convert_to_list(pool_stride, 3, 'pool_stride')
1792+
1793+
if not isinstance(use_cudnn, bool):
1794+
raise ValueError("use_cudnn should be True or False")
1795+
1796+
l_type = "pool3d"
1797+
helper = LayerHelper(l_type, **locals())
17241798
dtype = helper.input_dtype()
17251799
pool_out = helper.create_tmp_variable(dtype)
17261800

17271801
helper.append_op(
1728-
type="pool2d",
1802+
type=l_type,
17291803
inputs={"X": input},
17301804
outputs={"Out": pool_out},
17311805
attrs={
@@ -2146,6 +2220,173 @@ def conv2d_transpose(input,
21462220
return out
21472221

21482222

2223+
def conv3d_transpose(input,
                     num_filters,
                     output_size=None,
                     filter_size=None,
                     padding=0,
                     stride=1,
                     dilation=1,
                     groups=None,
                     param_attr=None,
                     bias_attr=None,
                     use_cudnn=True,
                     act=None,
                     name=None):
    """
    **Convolution3D transpose layer**

    The convolution3D transpose layer calculates the output based on the input,
    filter, and dilations, strides, paddings. Input(Input) and output(Output)
    are in NCDHW format. Where N is batch size, C is the number of channels,
    D is the depth of the feature, H is the height of the feature, and W
    is the width of the feature. Parameters(dilations, strides, paddings) are
    three elements. These three elements represent depth, height and width,
    respectively. The details of convolution transpose layer, please refer to
    the following explanation and references
    `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.

    For each input :math:`X`, the equation is:

    .. math::

        Out = W \\ast X

    In the above equation:

    * :math:`X`: Input value, a tensor with NCDHW format.
    * :math:`W`: Filter value, a tensor with MCDHW format.
    * :math:`\\ast` : Convolution transpose operation.
    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be
                   different.

    Example:

        - Input:

          Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$

          Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$

        - Output:

          Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$

        Where

        .. math::

           D_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\
           H_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\
           W_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1

    Args:
        input(Variable): The input image with [N, C, D, H, W] format.
        num_filters(int): The number of the filter. It is as same as the output
            image channel.
        output_size(int|tuple|None): The output image size. If output size is a
            tuple, it must contain three integers, (image_D, image_H, image_W).
            If it is an int, the same value is used for all three dimensions.
            This parameter only works when filter_size is None, in which case
            it is used to infer the filter size.
        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
            Otherwise, the filter will be a cube. None if use output size to
            calculate filter_size.
        padding(int|tuple): The padding size. If padding is a tuple, it must
            contain three integers, (padding_D, padding_H, padding_W). Otherwise, the
            padding_D = padding_H = padding_W = padding. Default: padding = 0.
        stride(int|tuple): The stride size. If stride is a tuple, it must
            contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
            stride_D = stride_H = stride_W = stride. Default: stride = 1.
        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
            dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1.
        groups(int): The groups number of the Conv3d transpose layer. Inspired by
            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
            when group=2, the first half of the filters is only connected to the
            first half of the input channels, while the second half of the
            filters is only connected to the second half of the input channels.
            Default: groups=1
        param_attr(ParamAttr): The parameters to the Conv3d_transpose Layer.
            Default: None
        bias_attr(ParamAttr): Bias parameter for the Conv3d_transpose layer.
            Default: None
        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
            library is installed. Default: True
        act(str): Activation type. Default: None
        name(str|None): A name for this layer(optional). If set None, the layer
            will be named automatically.

    Returns:
        Variable: The tensor variable storing the convolution transpose result.

    Raises:
        TypeError: If input is not a Variable.
        ValueError: If use_cudnn is not a bool, if both filter_size and
            output_size are None, or if the shapes of input, filter_size,
            stride, padding and groups mismatch.

    Examples:
       .. code-block:: python

          data = fluid.layers.data(
              name='data', shape=[3, 12, 32, 32], dtype='float32')
          conv3d_transpose = fluid.layers.conv3d_transpose(
              input=data, num_filters=2, filter_size=3)
    """
    l_type = "conv3d_transpose"
    # The helper is built from locals(); keep this call ahead of any other
    # local assignment so that only the arguments (and l_type) are forwarded.
    helper = LayerHelper(l_type, **locals())
    if not isinstance(input, Variable):
        raise TypeError("Input of conv3d_transpose must be Variable")
    input_channel = input.shape[1]

    padding = utils.convert_to_list(padding, 3, 'padding')
    stride = utils.convert_to_list(stride, 3, 'stride')
    dilation = utils.convert_to_list(dilation, 3, 'dilation')

    if not isinstance(use_cudnn, bool):
        raise ValueError("use_cudnn should be True or False")

    if filter_size is None:
        if output_size is None:
            raise ValueError("output_size must be set when filter_size is None")
        # An int output size applies to depth, height and width alike; three
        # entries are required because all three are indexed below.
        output_size = utils.convert_to_list(output_size, 3, 'output_size')

        in_dims = [input.shape[2], input.shape[3], input.shape[4]]

        # Invert the output-shape formula from the docstring to recover the
        # filter extent per dimension. Floor division keeps the result an
        # integer so it is usable as a parameter shape.
        filter_size = [
            (out_dim - (in_dim - 1) * s + 2 * p - 1) // d + 1
            for out_dim, in_dim, s, p, d in zip(output_size, in_dims, stride,
                                                padding, dilation)
        ]
    else:
        filter_size = utils.convert_to_list(filter_size, 3,
                                            'conv3d_transpose.filter_size')

    groups = 1 if groups is None else groups
    # Filter layout is (C_in, C_out / groups, D_f, H_f, W_f); integer division
    # keeps the channel count an int.
    filter_shape = [input_channel, num_filters // groups] + filter_size
    img_filter = helper.create_parameter(
        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)

    pre_bias = helper.create_tmp_variable(dtype=input.dtype)
    helper.append_op(
        type=l_type,
        inputs={'Input': [input],
                'Filter': [img_filter]},
        outputs={'Output': pre_bias},
        attrs={
            'strides': stride,
            'paddings': padding,
            'dilations': dilation,
            'groups': groups,
            'use_cudnn': use_cudnn
        })

    # Bias is applied along the channel axis only (dim 1).
    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
    out = helper.append_activation(pre_act)
    return out
2388+
2389+
21492390
def sequence_expand(x, y, ref_level=-1, name=None):
21502391
"""Sequence Expand Layer. This layer will expand the input variable **x**
21512392
according to specified level lod of **y**. Please note that lod level of

0 commit comments

Comments
 (0)