Commit b49d8df

fix rnn lstm gru weights bug (#13)
1 parent b088cb3 commit b49d8df

File tree: 1 file changed (+114 -39 lines)

tensorlayerx/backend/ops/paddle_nn.py

Lines changed: 114 additions & 39 deletions
@@ -15,6 +15,8 @@
 import warnings
 import math
 from paddle import _C_ops
+from paddle.framework import core
+from paddle import in_dynamic_mode
 
 def padding_format(padding):
     """
@@ -1484,23 +1486,24 @@ def concat_states(states, bidirectional=False, state_components=1):
         componnets.append(states[i::state_components])
     return tuple([pd.stack(item) for item in componnets])
 
+
 class rnnbase(LayerList):
 
     def __init__(
-            self,
-            mode,
-            input_size,
-            hidden_size,
-            num_layers,
-            bias,
-            batch_first,
-            dropout,
-            bidirectional,
-            is_train,
-            w_ih,
-            w_hh,
-            b_ih,
-            b_hh,
+        self,
+        mode,
+        input_size,
+        hidden_size,
+        num_layers,
+        bias,
+        batch_first,
+        dropout,
+        bidirectional,
+        is_train,
+        w_ih,
+        w_hh,
+        b_ih,
+        b_hh,
     ):
         super(rnnbase, self).__init__()
         self.mode = mode
@@ -1596,39 +1599,104 @@ def __init__(
             self.flatten_parameters()
 
     def flatten_parameters(self):
+        """
+        Resets parameter data pointer to address in continuous memory block for
+        cudnn usage.
+        """
         if self.could_use_cudnn:
-            self._all_weights = self.parameters(include_sublayers=False)
-            shape = [np.prod(param.shape) for param in self._all_weights]
+            # layer.parameters() is depth first and ordered
+            # for i in layer: for j in direct: w_ih, w_hh, b_ih, b_hh
+            # need to reorganize to cudnn param layout:
+            # all bias following all weights
+            params = self.parameters(include_sublayers=False)
+            shape = [np.prod(param.shape) for param in params]
+            self._all_weights = [None] * len(params)
+            for i, param in enumerate(params):
+                base = self.num_layers * self.bidirect
+                num = i // base
+                odd = num % 2
+                offset = (2 * base) * (num // 2)
+                new_id = (i - num * base) * 2 + odd + offset
+                self._all_weights[new_id] = param
+            # Wrap using a list to avoid registed into params and saving, maybe
+            # need a better way to handle this later. Use `create_parameter` to
+            # add both to main_program and startup_program for static-graph.
+            # Use Constant initializer to avoid make effect on random generator.
             self._flat_weight = [
                 self.create_parameter(
-                    shape=[np.sum(shape)], dtype=self._all_weights[0].dtype, default_initializer=I.Constant(0.0)
-                )
+                    shape=[np.sum(shape)],
+                    dtype=params[0].dtype,
+                    default_initializer=I.Constant(0.0))
             ]
-            self._dropout_state = self.create_variable(dtype=fluid.core.VarDesc.VarType.UINT8)
-            with fluid.program_guard(fluid.default_startup_program(), fluid.default_startup_program()):
-                with framework.no_grad():
+            # dropout state may also can be hided and avoid saving
+            # should dropout state be persistable for static-graph
+            self._dropout_state = self.create_variable(
+                dtype=core.VarDesc.VarType.UINT8)
+            with fluid.program_guard(fluid.default_startup_program(),
                                     fluid.default_startup_program()):
+                with paddle.no_grad():
                     self._helper.append_op(
-                        type="coalesce_tensor", inputs={"Input": self._all_weights}, outputs={
+                        type="coalesce_tensor",
+                        inputs={"Input": self._all_weights},
+                        outputs={
                             "Output": self._all_weights,
                             "FusedOutput": self._flat_weight
-                        }, attrs={
+                        },
+                        attrs={
                             "copy_data": True,
                             "use_align": False,
-                            "dtype": self._all_weights[0].dtype
-                        }
-                    )
+                            "dtype": params[0].dtype
+                        })
 
     def _cudnn_impl(self, inputs, initial_states, sequence_length):
         if not self.time_major:
-            inputs = pd.tensor.transpose(inputs, [1, 0, 2])
-        _, _, out, state = _C_ops.rnn(
-            inputs, initial_states, self._all_weights, sequence_length,
-            self._dropout_state, self.state_components, 'dropout_prob',
-            self.dropout, 'is_bidirec', self.bidirect == 2,
-            'input_size', self.input_size, 'hidden_size', self.hidden_size,
-            'num_layers', self.num_layers, 'mode', self.mode, 'is_test',
-            not self.training)
-        out = pd.tensor.transpose(out, [1, 0, 2]) if not self.time_major else out
+            inputs = paddle.tensor.transpose(inputs, [1, 0, 2])
+
+        if in_dynamic_mode():
+            _, _, out, state = _C_ops.rnn(
+                inputs, initial_states, self._all_weights, sequence_length,
+                self._dropout_state, self.state_components, 'dropout_prob',
+                self.dropout, 'is_bidirec', self.bidirect == 2,
+                'input_size', self.input_size, 'hidden_size', self.hidden_size,
+                'num_layers', self.num_layers, 'mode', self.mode, 'is_test',
+                not self.training)
+        else:
+            out = self._helper.create_variable_for_type_inference(inputs.dtype)
+            state = [
+                self._helper.create_variable_for_type_inference(inputs.dtype)
+                for i in range(self.state_components)
+            ]
+            reserve = self._helper.create_variable_for_type_inference(
+                dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
+
+            inputs = {
+                'Input': inputs,
+                'WeightList': self._all_weights,
+                'PreState': initial_states,
+                'SequenceLength': sequence_length
+            }
+            attrs = {
+                'dropout_prob': self.dropout,
+                'is_bidirec': self.bidirect == 2,
+                'input_size': self.input_size,
+                'hidden_size': self.hidden_size,
+                'num_layers': self.num_layers,
+                'mode': self.mode,
+                'is_test': not self.training
+            }
+
+            outputs = {
+                'Out': out,
+                'State': state,
+                'Reserve': reserve,
+                'DropoutState': self._dropout_state,
+            }
+
+            self._helper.append_op(
+                type="rnn", inputs=inputs, outputs=outputs, attrs=attrs)
+
+        out = paddle.tensor.transpose(out,
+                                      [1, 0, 2]) if not self.time_major else out
         return out, tuple(state) if len(state) > 1 else state[0]
 
     def check_hidden(self, h, batch_size):
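
The index arithmetic in the new flatten_parameters() is easiest to see on a toy configuration. The sketch below is a standalone illustration in plain Python (no Paddle required). It rests on one assumption that the diff does not spell out: self.parameters(include_sublayers=False) is taken to return the tensors grouped by kind (all w_ih, then all w_hh, then all b_ih, then all b_hh, each ordered by layer and direction), as the w_ih/w_hh/b_ih/b_hh lists passed to __init__ suggest. Under that assumption the remapping yields the layout the code comment describes: the weights of every layer and direction first, followed by all the biases.

# Standalone sketch of the new_id remapping in flatten_parameters().
# Assumption (for illustration only): parameters arrive grouped by kind,
# i.e. all w_ih, then all w_hh, then all b_ih, then all b_hh.
num_layers = 2
bidirect = 2                          # 2 for a bidirectional RNN, 1 otherwise
base = num_layers * bidirect

kinds = ["w_ih", "w_hh", "b_ih", "b_hh"]
params = [
    "{}_l{}_d{}".format(kind, layer, direction)
    for kind in kinds
    for layer in range(num_layers)
    for direction in range(bidirect)
]

all_weights = [None] * len(params)
for i, name in enumerate(params):
    num = i // base                   # 0: w_ih, 1: w_hh, 2: b_ih, 3: b_hh
    odd = num % 2                     # 0 for *_ih, 1 for *_hh
    offset = (2 * base) * (num // 2)  # pushes every bias behind all weights
    new_id = (i - num * base) * 2 + odd + offset
    all_weights[new_id] = name

print(all_weights)
# ['w_ih_l0_d0', 'w_hh_l0_d0', 'w_ih_l0_d1', 'w_hh_l0_d1',
#  'w_ih_l1_d0', 'w_hh_l1_d0', 'w_ih_l1_d1', 'w_hh_l1_d1',
#  'b_ih_l0_d0', 'b_hh_l0_d0', 'b_ih_l0_d1', 'b_hh_l0_d1',
#  'b_ih_l1_d0', 'b_hh_l1_d0', 'b_ih_l1_d1', 'b_hh_l1_d1']

The reordered self._all_weights is the same list that both branches of _cudnn_impl() hand to the fused rnn op as WeightList, so the eager _C_ops.rnn call and the static-graph append_op path consume identical weights.
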
@@ -1661,9 +1729,13 @@ def forward(self, inputs, initial_states=None):
                 self.check_hidden(c, batch_size)
             else:
                 self.check_hidden(initial_states, batch_size)
+
         if not isinstance(initial_states, (tuple, list)):
-            initial_states = [initial_states,]
-        if self.could_use_cudnn:
+            initial_states = [initial_states, ]
+
+        if self.could_use_cudnn and (
+                not paddle.device.is_compiled_with_rocm() or
+                sequence_length is None):
             # Add CPU kernel and dispatch in backend later
             return self._cudnn_impl(inputs, initial_states, sequence_length)
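
The widened guard in forward() amounts to a small dispatch rule: the fused kernel in _cudnn_impl() is used only when it is available at all and, on ROCm builds, only when no per-batch sequence_length is given; every other case falls through to the layer-by-layer loop below. A minimal sketch of that rule, with hypothetical boolean arguments standing in for self.could_use_cudnn and paddle.device.is_compiled_with_rocm():

# Hypothetical helper illustrating the dispatch condition added to forward();
# the real code reads self.could_use_cudnn and queries
# paddle.device.is_compiled_with_rocm() directly.
def use_fused_kernel(could_use_cudnn, built_with_rocm, sequence_length):
    # On ROCm the fused kernel is only taken when no variable sequence
    # lengths are supplied; otherwise use the layer-by-layer fallback.
    return could_use_cudnn and (not built_with_rocm or sequence_length is None)

assert use_fused_kernel(True, False, sequence_length=[5, 3])      # CUDA build
assert not use_fused_kernel(True, True, sequence_length=[5, 3])   # ROCm + lengths
assert use_fused_kernel(True, True, sequence_length=None)         # ROCm, no lengths
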

@@ -1672,15 +1744,18 @@ def forward(self, inputs, initial_states=None):
 
         for i, rnn_layer in enumerate(self):
             if i > 0:
-                inputs = F.dropout(inputs, self.dropout, training=self.training, mode="upscale_in_train")
+                inputs = F.dropout(
+                    inputs,
+                    self.dropout,
+                    training=self.training,
+                    mode="upscale_in_train")
             outputs, final_state = rnn_layer(inputs, states[i], sequence_length)
             final_states.append(final_state)
             inputs = outputs
 
         final_states = concat_states(final_states, self.bidirect == 2, self.state_components)
         return outputs, final_states
 
-
 class layernorm(object):
 
     def __init__(self, normalized_shape, gamma, beta, eps, input_shape):
