
Commit 02d6805

add sparsed bias grad, test=develop
1 parent 42470f1 commit 02d6805

6 files changed (+78, -29 lines)


paddle/fluid/operators/hierarchical_sigmoid_op.cc

Lines changed: 22 additions & 10 deletions
@@ -107,8 +107,9 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
              "it should have shape like [N, L], L is the length of the Path")
         .AsDispensable();
     AddInput("Bias",
-             "(LoDTensor, optional), The bias is a tensor with shape"
-             "[1, num_classes - 1].");
+             "(LoDTensor, optional), The bias is a tensor with shape "
+             "[non_leaf_num, 1] or "
+             "[num_classes - 1, 1].");
     AddOutput(
         "Out",
         "(LoDTensor, required) The output of hierarchical sigmoid operator."
@@ -148,11 +149,11 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
                    "Output(W@Grad should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                    "Output(X@Grad should not be null.");
-    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
-      ctx->SetOutputDim(framework::GradVarName("Bias"),
-                        ctx->GetInputDim("Bias"));
-    }
     if (!ctx->Attrs().Get<bool>("is_sparse")) {
+      if (ctx->HasOutput(framework::GradVarName("Bias"))) {
+        ctx->SetOutputDim(framework::GradVarName("Bias"),
+                          ctx->GetInputDim("Bias"));
+      }
       ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
     }
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
@@ -172,20 +173,31 @@ class HierarchicalSigmoidGradOpGradVarTypeInference
  public:
   void operator()(const framework::OpDesc& op_desc,
                   framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
+    auto out_W_var_name = op_desc.Output(framework::GradVarName("W")).front();
+    auto out_Bias_var_name =
+        op_desc.Output(framework::GradVarName("Bias")).front();
     auto attr = op_desc.GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
       VLOG(3) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
               << " is set to SelectedRows";
-      block->Var(out_var_name)
+      block->Var(out_W_var_name)
+          ->SetType(framework::proto::VarType::SELECTED_ROWS);
+      VLOG(3) << "hierarchical_sigmoid_grad op "
+              << framework::GradVarName("Bias") << " is set to SelectedRows";
+      block->Var(out_Bias_var_name)
           ->SetType(framework::proto::VarType::SELECTED_ROWS);
     } else {
       VLOG(3) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
               << " is set to LoDTensor";
-      block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
+      block->Var(out_W_var_name)
+          ->SetType(framework::proto::VarType::LOD_TENSOR);
+      VLOG(3) << "hierarchical_sigmoid_grad op "
+              << framework::GradVarName("Bias") << " is set to LoDTensor";
+      block->Var(out_Bias_var_name)
+          ->SetType(framework::proto::VarType::LOD_TENSOR);
     }
-    block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
+    block->Var(out_W_var_name)->SetDataType(block->Var("W")->GetDataType());
   }
 };

paddle/fluid/operators/hierarchical_sigmoid_op.h

Lines changed: 24 additions & 7 deletions
@@ -124,13 +124,12 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
     auto* w = ctx.Input<framework::LoDTensor>("W");
     auto* path = ctx.Input<framework::LoDTensor>("PTable");
     auto* code = ctx.Input<framework::LoDTensor>("PCode");
+    auto* bias = ctx.Input<framework::LoDTensor>("Bias");
     auto* in_grad =
         ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
     bool is_sparse = ctx.Attr<bool>("is_sparse");
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     math::SetConstant<DeviceContext, T> zero;
-    auto* bias_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
     auto* label = ctx.Input<framework::LoDTensor>("Label");
     auto* pre_out = ctx.Input<framework::LoDTensor>("PreOut");
     auto* out_grad =
@@ -174,12 +173,15 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
         pre_out_grad_mat * out_grad_mat.broadcast(bcast);
     // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
     // be consistent with the clipping in forward.
-    if (bias_grad) {
-      bias_grad->mutable_data<T>(ctx.GetPlace());
-      zero(dev_ctx, bias_grad, static_cast<T>(0.0));
-      bit_code->AddGrad(pre_out_grad, bias_grad);
-    }
+
     if (!is_sparse) {
+      auto* bias_grad =
+          ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
+      if (bias_grad) {
+        bias_grad->mutable_data<T>(ctx.GetPlace());
+        zero(dev_ctx, bias_grad, static_cast<T>(0.0));
+        bit_code->AddGrad(pre_out_grad, bias_grad);
+      }
       auto* w_grad =
           ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
       w_grad->mutable_data<T>(ctx.GetPlace());
@@ -199,6 +201,21 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {

       w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace());
       zero(dev_ctx, w_grad_value, static_cast<T>(0.0));
+      auto* bias_grad =
+          ctx.Output<framework::SelectedRows>(framework::GradVarName("Bias"));
+      if (bias_grad) {
+        bias_grad->set_rows(real_rows);
+        // build ids -> rows index map
+        bias_grad->SyncIndex();
+        bias_grad->set_height(bias->dims()[0]);
+        auto* bias_grad_value = bias_grad->mutable_value();
+        std::vector<int64_t> dims = {static_cast<int64_t>(real_rows.size()),
+                                     bias->dims()[1]};
+        bias_grad_value->mutable_data<T>(framework::make_ddim(dims),
+                                         ctx.GetPlace());
+        zero(dev_ctx, bias_grad_value, static_cast<T>(0.0));
+        bit_code->AddGrad(pre_out_grad, bias_grad);
+      }
       bit_code->MulGradWeight(pre_out_grad, w_grad, *in);
     }
     bit_code->MulGradError(pre_out_grad, *w, in_grad);
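
For intuition, the sparse branch above stores Bias@GRAD as a SelectedRows: only the bias rows actually touched by the sampled paths of the current batch are materialized. A rough NumPy sketch of that layout (illustration only, not Paddle API; the concrete values are made up):

import numpy as np

# Hypothetical example: this batch only touches non-leaf nodes 0, 2 and 5.
real_rows = [0, 2, 5]
num_classes = 8                                   # bias has shape [num_classes - 1, 1]

rows = real_rows                                  # bias_grad->set_rows(real_rows)
height = num_classes - 1                          # bias_grad->set_height(bias->dims()[0])
value = np.zeros((len(real_rows), 1), "float32")  # value tensor: [len(real_rows), bias->dims()[1]]
# AddGrad then accumulates pre_out_grad entries into the matching rows of `value`.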

paddle/fluid/operators/math/matrix_bit_code.cc

Lines changed: 18 additions & 0 deletions
@@ -48,6 +48,24 @@ void MatrixBitCodeFunctor<T>::AddGrad(const framework::LoDTensor& tmat,
   }
 }

+template <typename T>
+void MatrixBitCodeFunctor<T>::AddGrad(const framework::LoDTensor& tmat,
+                                      framework::SelectedRows* vec) {
+  size_t batch_size = tmat.dims()[0];
+  size_t width = tmat.dims()[1];
+  for (size_t i = 0; i < batch_size; ++i) {
+    auto code = code_table->get_code(i);
+    int code_length = code->get_length();
+    for (int j = 0; j < code_length; ++j) {
+      size_t index = code->calc_index(j);
+      int64_t row_index =
+          vec->AutoGrownIndex(static_cast<int64_t>(index), false, true);
+      vec->mutable_value()->data<T>()[row_index] +=
+          tmat.data<T>()[i * width + j];
+    }
+  }
+}
+
 template <typename T>
 void MatrixBitCodeFunctor<T>::Sum(const framework::LoDTensor& tmat,
                                   framework::LoDTensor* sum, T scale_sum) {
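
A plain-Python sketch of the accumulation the new AddGrad overload performs (helper names are hypothetical; `row_of_node` stands in for the ids-to-rows map that SyncIndex/AutoGrownIndex provide in the C++ code):

def add_grad_selected_rows(tmat, codes, row_of_node, bias_grad_value):
    # tmat: [batch_size, width] gradient w.r.t. pre_out
    # codes[i]: list of non-leaf node ids on sample i's path
    # bias_grad_value: [num_rows, 1] value tensor of the SelectedRows bias grad
    batch_size = len(tmat)
    for i in range(batch_size):
        for j, node in enumerate(codes[i]):
            row = row_of_node[node]          # ids -> rows lookup
            bias_grad_value[row][0] += tmat[i][j]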

paddle/fluid/operators/math/matrix_bit_code.h

Lines changed: 5 additions & 0 deletions
@@ -241,6 +241,11 @@ class MatrixBitCodeFunctor {
    */
   void AddGrad(const framework::LoDTensor& tmat, framework::LoDTensor* vec);

+  /* For selected rows, for j < code_length
+       vec(0, index(i, j)) += tmat(i, j)
+  */
+  void AddGrad(const framework::LoDTensor& tmat, framework::SelectedRows* vec);
+
   /* For j < code_length
      sum(i, 0) = \sum_j bit(i, j) * tmat(i, j)
   */

python/paddle/fluid/layers/nn.py

Lines changed: 2 additions & 2 deletions
@@ -4639,14 +4639,14 @@ def hsigmoid(input,
     if not is_costum:
         bias = helper.create_parameter(
             attr=helper.bias_attr,
-            shape=[1, num_classes - 1],
+            shape=[num_classes - 1, 1],
             is_bias=True,
             dtype=input.dtype)
         inputs['Bias'] = bias
     else:
         bias = helper.create_parameter(
             attr=helper.bias_attr,
-            shape=[1, non_leaf_num],
+            shape=[non_leaf_num, 1],
             is_bias=True,
             dtype=input.dtype)
         inputs['Bias'] = bias
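
With this change the bias parameter is created as a column vector: [num_classes - 1, 1] for the default tree, [non_leaf_num, 1] for a custom tree. A hedged usage sketch mirroring the call made in the test below; keyword names not visible in this diff (input, label, non_leaf_num) are assumptions:

cost = fluid.layers.hsigmoid(
    input=emb,             # features, e.g. an embedding lookup
    label=label,
    non_leaf_num=3,        # custom-tree mode: number of non-leaf nodes
    ptable=ptable,
    pcode=pcode,
    is_costum=True,
    bias_attr=True,        # bias now created with shape [non_leaf_num, 1]
    is_sparse=True)        # W@GRAD and Bias@GRAD become SelectedRows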

python/paddle/fluid/tests/unittests/test_hsigmoid_op.py

Lines changed: 7 additions & 10 deletions
@@ -77,7 +77,7 @@ def hsigmoid(x, w, label, bias, num_classes):
         length = code_table.get_length()
         for j in range(length):
             idx = code_table.cal_index(j)
-            pre_output[i][j] += bias[0][idx]
+            pre_output[i][j] += bias[idx][0]
     for i in range(batch_size):
         code_table = CodeTable(num_classes, label[i])
         length = code_table.get_length()
@@ -115,7 +115,7 @@ def hsigmoidWithCustomTree(x, w, ptable, pcode, label, bias, num_classes):
         length = code_table.get_length()
         for j in range(length):
             idx = code_table.cal_index(j)
-            pre_output[i][j] += bias[0][idx]
+            pre_output[i][j] += bias[idx][0]
     for i in range(batch_size):
         code_table = CodeTableWithCustomTree(ptable, pcode, i)
         length = code_table.get_length()
@@ -150,7 +150,7 @@ def setUp(self):
         w = np.random.random(
             (num_classes - 1, feature_size)).astype("float32") * 2
         label = np.random.randint(0, num_classes, (batch_size, 1))
-        bias = np.random.random((1, num_classes - 1)).astype("float32")
+        bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': False}
         self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias}
         pre_output, out = hsigmoid(x, w, label, bias, num_classes)
@@ -178,7 +178,7 @@ def setUp(self):
             -1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
         pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
             1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store
-        bias = np.random.random((1, num_classes - 1)).astype("float32")
+        bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': True}
         self.inputs = {
             'X': x,
@@ -193,7 +193,6 @@ def setUp(self):
         self.outputs = {'PreOut': pre_output, 'Out': out}

     def test_check_output(self):
-        print("checking output in CostumTree")
         self.check_output()


@@ -208,7 +207,7 @@ def hs_net_conf(self, is_sparse):

         emb = fluid.layers.embedding(
             input=input_word,
-            is_sparse=False,
+            is_sparse=is_sparse,
             size=[3, 3],
             param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
                 scale=1 / math.sqrt(3))))
@@ -220,6 +219,7 @@ def hs_net_conf(self, is_sparse):
             ptable=ptable,
             pcode=pcode,
             is_costum=True,
+            bias_attr=True,
             is_sparse=is_sparse)

         avg_cost = fluid.layers.reduce_mean(cost)
@@ -240,7 +240,6 @@ def training_test(self, is_sparse):
         optimizer.minimize(loss)

         main_program = fluid.default_main_program()
-        # print("main program: {program}".format{program=str(main_program)})
         place = fluid.CPUPlace()
         feeder = fluid.DataFeeder(feed_list=data_list, place=place)
         exe = fluid.Executor(place)
@@ -279,7 +278,7 @@ def setUp(self):
             -1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
         pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
             1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store
-        bias = np.random.random((1, num_classes - 1)).astype("float32")
+        bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': False}
         self.inputs = {
             'X': x,
@@ -294,11 +293,9 @@ def setUp(self):
         self.outputs = {'PreOut': pre_output, 'Out': out}

     def test_check_output(self):
-        print("checking output in CostumTree")
         self.check_output()

     def test_check_grad(self):
-        print("checking outputGrad in CostumTree")
         self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label'))
