
Commit b571a41

Merge remote-tracking branch 'baidu/develop' into feature/add_v2_api_doc

2 parents: 1e29b12 + b25c512

24 files changed, +370 -84 lines

demo/seqToseq/api_train_v2.py

Lines changed: 46 additions & 40 deletions
@@ -126,51 +126,57 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word):

 def main():
     paddle.init(use_gpu=False, trainer_count=1)
+    is_generating = True

     # source and target dict dim.
     dict_size = 30000
     source_dict_dim = target_dict_dim = dict_size

-    # define network topology
-    cost = seqToseq_net(source_dict_dim, target_dict_dim)
-    parameters = paddle.parameters.create(cost)
-
-    # define optimize method and trainer
-    optimizer = paddle.optimizer.Adam(
-        learning_rate=5e-5,
-        regularization=paddle.optimizer.L2Regularization(rate=1e-3))
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=optimizer)
-
-    # define data reader
-    feeding = {
-        'source_language_word': 0,
-        'target_language_word': 1,
-        'target_language_next_word': 2
-    }
-
-    wmt14_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
-        batch_size=5)
-
-    # define event_handler callback
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 10 == 0:
-                print "\nPass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
-            else:
-                sys.stdout.write('.')
-                sys.stdout.flush()
-
-    # start to train
-    trainer.train(
-        reader=wmt14_reader,
-        event_handler=event_handler,
-        num_passes=10000,
-        feeding=feeding)
+    # train the network
+    if not is_generating:
+        cost = seqToseq_net(source_dict_dim, target_dict_dim)
+        parameters = paddle.parameters.create(cost)
+
+        # define optimize method and trainer
+        optimizer = paddle.optimizer.Adam(
+            learning_rate=5e-5,
+            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+        trainer = paddle.trainer.SGD(cost=cost,
+                                     parameters=parameters,
+                                     update_equation=optimizer)
+        # define data reader
+        wmt14_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.wmt14.train(dict_size), buf_size=8192),
+            batch_size=5)
+
+        # define event_handler callback
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndIteration):
+                if event.batch_id % 10 == 0:
+                    print "\nPass %d, Batch %d, Cost %f, %s" % (
+                        event.pass_id, event.batch_id, event.cost,
+                        event.metrics)
+                else:
+                    sys.stdout.write('.')
+                    sys.stdout.flush()
+
+        # start to train
+        trainer.train(
+            reader=wmt14_reader, event_handler=event_handler, num_passes=2)
+
+    # generate a english sequence to french
+    else:
+        gen_creator = paddle.dataset.wmt14.test(dict_size)
+        gen_data = []
+        for item in gen_creator():
+            gen_data.append((item[0], ))
+            if len(gen_data) == 3:
+                break
+
+        beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating)
+        parameters = paddle.dataset.wmt14.model()
+        trg_dict = paddle.dataset.wmt14.trg_dict(dict_size)


 if __name__ == '__main__':
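
The net effect of this hunk is that main() now branches on a single is_generating switch. A condensed sketch of the resulting control flow (names come from the demo itself; the decoding that follows trg_dict lies outside this hunk, so it is only summarized here):

    def main():
        paddle.init(use_gpu=False, trainer_count=1)
        is_generating = True  # flip to False to train instead

        if not is_generating:
            # build the cost, create parameters, set up the Adam-based
            # SGD trainer, and train on the WMT-14 reader
            ...
        else:
            # take a few source sentences from the WMT-14 test set, build
            # the beam-search generator, and load the pretrained model
            # plus the target dictionary
            ...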

paddle/gserver/layers/SequenceLastInstanceLayer.cpp

Lines changed: 14 additions & 10 deletions
@@ -25,6 +25,11 @@ namespace paddle {
  * Input: a sequence
  * If SequenceLevel = kNonseq:
  *   Output: a sequence containing only the last instance of the input sequence
+ *   If stride_ > 0:
+ *     Output: a shorten sequence. The operation of getting last instance of a
+ *             sequence is independently performed on every slice of the input
+ *             sequence, which is obtained by sliding a window with the window
+ *             size set to stride_.
  * If SequenceLevel = kSeq:
  *   Check input sequence must has sub-sequence
  *   Output: a sequence containing only the last instance of each sub-sequence
@@ -37,6 +42,7 @@ class SequenceLastInstanceLayer : public SequencePoolLayer {
 protected:
   MatrixPtr tmpSrc_;
   MatrixPtr tmpDest_;
+  std::vector<int> instanceIds_;

 public:
   explicit SequenceLastInstanceLayer(const LayerConfig& config)
@@ -54,6 +60,7 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
 bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
                                      const ParameterMap& parameterMap) {
   SequencePoolLayer::init(layerMap, parameterMap);
+  reversed_ = config_.select_first();

   tmpSrc_ =
       Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
@@ -66,17 +73,19 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
 void SequenceLastInstanceLayer::forward(PassType passType) {
   SequencePoolLayer::forward(passType);

-  const int* starts = startPositions_->getData(false);
+  auto starts = (stride_ > 0) ? stridePositions_->getData()
+                              : startPositions_->getData(false);
   MatrixPtr inputValue = getInputValue(0);
   MatrixPtr outputValue = getOutputValue();

   {
     AsyncGpuBlock asyncGpuBlock;
     REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());

+    instanceIds_.clear();
     for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
-      int insId =
-          config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1;
+      int insId = reversed_ ? starts[seqId] : starts[seqId + 1] - 1;
+      instanceIds_.push_back(insId);

       outputValue->subMatrix(seqId, 1, tmpDest_)
           ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
@@ -96,18 +105,13 @@ void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {

   MatrixPtr inputGrad = getInputGrad(0);
   MatrixPtr outputGrad = getOutputGrad();
-  const int* starts = startPositions_->getData(false);
-  size_t numSequences = startPositions_->getSize() - 1;

   if (inputGrad) {
     AsyncGpuBlock asyncGpuBlock;
     REGISTER_TIMER_INFO("SequenceLastInstanceLayerBackward", getName().c_str());

-    for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-      int insId =
-          config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1;
-
-      inputGrad->subMatrix(insId, 1, tmpDest_)
+    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
+      inputGrad->subMatrix(instanceIds_[seqId], 1, tmpDest_)
           ->add(*(outputGrad->subMatrix(seqId, 1, tmpSrc_)));
     }
   }
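
To make the stride semantics above concrete: the forward pass now picks one instance per stride window instead of one per sequence. Below is a minimal Python sketch of that selection for a single sequence (illustrative only, not a Paddle API; it assumes the window layout computed by Argument::poolSequenceWithStride further down, where reversed windows are aligned to the end of the sequence):

    import math

    def last_instance_with_stride(seq, stride, reversed=False):
        # number of windows for one sequence, as in poolSequenceWithStride
        size = int(math.ceil(float(len(seq)) / stride))
        if reversed:
            # windows are aligned to the END of the sequence; the layer
            # then keeps the FIRST instance of each window (select_first)
            bounds = [0] + [len(seq) - (size - 1 - i) * stride
                            for i in range(size - 1)] + [len(seq)]
            return [seq[bounds[i]] for i in range(size)]
        # windows are aligned to the start; keep the LAST instance of each
        bounds = [i * stride for i in range(size)] + [len(seq)]
        return [seq[bounds[i + 1] - 1] for i in range(size)]

    # a length-9 sequence with stride 5 shortens to 2 instances
    assert last_instance_with_stride(list(range(9)), 5) == [4, 8]        # windows [0,5), [5,9)
    assert last_instance_with_stride(list(range(9)), 5, True) == [0, 4]  # windows [0,4), [4,9)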

paddle/gserver/layers/SequencePoolLayer.cpp

Lines changed: 10 additions & 2 deletions
@@ -37,6 +37,7 @@ bool SequencePoolLayer::init(const LayerMap& layerMap,
   } else {
     LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
   }
+  stride_ = config_.seq_pool_stride();
   setNeedSequenceInfo(false);
   return true;
 }
@@ -55,8 +56,6 @@ void SequencePoolLayer::forward(PassType passType) {
   CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize());
   CHECK_EQ(newBatchSize_, starts->getSize() - 1);

-  resetOutput(newBatchSize_, dim);
-
   /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
    * thus, in this case, output_ has no sequenceStartPositions.
    * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
@@ -67,6 +66,15 @@ void SequencePoolLayer::forward(PassType passType) {
         << "when trans_type = seq, input must hasSubseq";
     output_.degradeSequence(input);
   }
+  if (stride_ > 0) {
+    CHECK_EQ(input.hasSubseq(), 0UL)
+        << "sequence stride pooling is invalid for hasSubseq now";
+    output_.poolSequenceWithStride(
+        input, stride_, &stridePositions_, reversed_);
+    newBatchSize_ = stridePositions_->getSize() - 1;
+  }
+
+  resetOutput(newBatchSize_, dim);
 }

 void SequencePoolLayer::backward(const UpdateCallback& callback) {

paddle/gserver/layers/SequencePoolLayer.h

Lines changed: 9 additions & 0 deletions
@@ -26,6 +26,10 @@ namespace paddle {
  * Output: output size is the number of input sequences (NOT input instances)
  * output[i] = seqlastin/average/max_{for each instance in this
  * sequence}{input[i]}
+ * If stride_ > 0:
+ *   Check input sequence must not have sub-sequence
+ *   Output: a shorten sequence, pooling is performed upon a small local
+ *           area
  * If SequenceLevel = kSeq:
  *   Check input sequence must has sub-sequence
  *   Output: output size is the number of input sub-sequences
@@ -42,6 +46,11 @@ class SequencePoolLayer : public Layer {
   enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
   size_t newBatchSize_;
   ICpuGpuVectorPtr startPositions_;
+  int stride_;
+  // Store the start position of each window.
+  IVectorPtr stridePositions_;
+  // Whether the input sequence is reversed or not.
+  bool reversed_ = false;

 public:
   explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
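
As a plain illustration of the header comment (not a Paddle API): with stride <= 0 a whole sequence pools down to one instance, while a positive stride pools each local window separately and yields a shorter sequence. The new members live in this shared base class, although only seqlastins is exercised with a positive stride by the tests in this commit:

    def pool_sequence(seq, pool, stride=-1):
        # stride <= 0: classic sequence pooling, one output instance
        if stride <= 0:
            return [pool(seq)]
        # stride > 0: one output instance per window of `stride` inputs,
        # i.e. a shortened sequence instead of a single instance
        return [pool(seq[i:i + stride]) for i in range(0, len(seq), stride)]

    vals = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
    assert pool_sequence(vals, max) == [7.0]
    assert pool_sequence(vals, max, stride=3) == [3.0, 6.0, 7.0]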

paddle/gserver/tests/test_LayerGrad.cpp

Lines changed: 26 additions & 12 deletions
@@ -804,10 +804,14 @@ TEST(Layer, ExpandLayer) {
   testExpandLayer("seq", true);  // seq expand to hasSubseq
 }

-void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) {
+void testDegradeLayer(bool hasSubseq,
+                      string layer_type,
+                      string trans_type,
+                      int stride) {
   TestConfig config;
   config.layerConfig.set_type(layer_type);
   config.layerConfig.set_size(10);
+  config.layerConfig.set_seq_pool_stride(stride);
   config.biasSize = 0;

   config.inputDefs.push_back(
@@ -827,36 +831,46 @@ void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) {
   if (layer_type == "average") {
     for (auto strategy : {"average", "sum", "squarerootn"}) {
       LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
-                << " average_strategy=" << strategy;
+                << " average_strategy=" << strategy
+                << " seq_pool_stride=" << stride;
       config.layerConfig.set_average_strategy(strategy);
       testDegradeLayerGrad(config, layer_type);
     }
   } else {
-    LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type;
+    LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
+              << " seq_pool_stride=" << stride;
     testDegradeLayerGrad(config, layer_type);
   }
 }

 TEST(Layer, MaxLayer) {
-  testDegradeLayer(false, "max", "non-seq");  // seq max to non-seq
-  testDegradeLayer(true, "max", "non-seq");   // hasSubseq max to non-seq
-  testDegradeLayer(true, "max", "seq");       // hasSubseq max to seq
+  testDegradeLayer(false, "max", "non-seq", -1);  // seq max to non-seq
+  testDegradeLayer(true, "max", "non-seq", -1);   // hasSubseq max to non-seq
+  testDegradeLayer(true, "max", "seq", -1);       // hasSubseq max to seq
 }

 TEST(Layer, SequenceLastInstanceLayer) {
   testDegradeLayer(false,
                    "seqlastins",
-                   "non-seq");  // seq seqlastins to non-seq
+                   "non-seq",
+                   -1);  // seq seqlastins to non-seq
+  testDegradeLayer(false,
+                   "seqlastins",
+                   "non-seq",
+                   5);  // seq seqlastins to a shorten seq, stride window = 5
   testDegradeLayer(true,
                    "seqlastins",
-                   "non-seq");  // hasSubseq seqlastins to non-seq
-  testDegradeLayer(true, "seqlastins", "seq");  // hasSubseq seqlastins to seq
+                   "non-seq",
+                   -1);  // hasSubseq seqlastins to non-seq
+  testDegradeLayer(
+      true, "seqlastins", "seq", -1);  // hasSubseq seqlastins to seq
 }

 TEST(Layer, AverageLayer) {
-  testDegradeLayer(false, "average", "non-seq");  // seq average to non-seq
-  testDegradeLayer(true, "average", "non-seq");  // hasSubseq average to non-seq
-  testDegradeLayer(true, "average", "seq");  // hasSubseq average to seq
+  testDegradeLayer(false, "average", "non-seq", -1);  // seq average to non-seq
+  testDegradeLayer(
+      true, "average", "non-seq", -1);  // hasSubseq average to non-seq
+  testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq
 }

 TEST(Layer, SequenceConcatLayer) {

paddle/parameter/Argument.cpp

Lines changed: 43 additions & 0 deletions
@@ -559,6 +559,49 @@ void Argument::degradeSequence(const Argument& input) {
   tgtBuf[numSequences] = numSubSequences;
 }

+void Argument::poolSequenceWithStride(const Argument& input,
+                                      size_t stride,
+                                      IVectorPtr* stridePostions,
+                                      bool reversed) {
+  // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5,
+  // then sequenceStartPositions = [0, 2, 3, 4, 7].
+  // If reversed = false, stridePostions = [0, 5, 9, 14, 17, 22, 27, 30];
+  // else reversed = true, stridePostions = [0, 4, 9, 14, 17, 20, 25, 30]
+
+  CHECK(input.sequenceStartPositions);
+  CHECK_EQ(input.hasSubseq(), 0UL);
+  CHECK_GT(stride, 0) << "stride must larger than 0";
+  size_t numSequences = input.getNumSequences();
+  ICpuGpuVector::resizeOrCreate(
+      sequenceStartPositions, numSequences + 1, false);
+  const int* starts = input.sequenceStartPositions->getData(false);
+  int* tgtBuf = sequenceStartPositions->getMutableData(false);
+  // first index of target sequence and stride positions are both 0
+  tgtBuf[0] = 0;
+  std::vector<int> stridePos;
+  for (size_t seqId = 0; seqId < numSequences; ++seqId) {
+    size_t seqLength = starts[seqId + 1] - starts[seqId];
+    stridePos.emplace_back(starts[seqId]);
+    if (seqLength == 0) {
+      // empty sequence
+      tgtBuf[seqId + 1] = tgtBuf[seqId];
+    } else {
+      int size = ceil((float)seqLength / stride);
+      tgtBuf[seqId + 1] = tgtBuf[seqId] + size;
+      for (int i = 0; i < size - 1; ++i) {
+        int cur = reversed ? starts[seqId + 1] - (size - 1 - i) * stride
+                           : stridePos.back() + stride;
+        stridePos.emplace_back(cur);
+      }
+    }
+  }
+  stridePos.emplace_back(starts[numSequences]);
+  int size = stridePos.size();
+  CHECK_EQ(size - 1, tgtBuf[numSequences]);
+  IVector::resizeOrCreate(*stridePostions, size, false);
+  (*stridePostions)->copyFrom(stridePos.data(), size);
+}
+
 void Argument::getValueString(
     std::unordered_map<std::string, std::string>* out) const {
   if (value) {
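
The worked example in the comment above can be checked directly. Here is a line-for-line Python transcription of the loop (illustrative only; `starts` plays the role of input.sequenceStartPositions, and the return values correspond to the new sequenceStartPositions and stridePostions):

    import math

    def pool_sequence_with_stride(starts, stride, reversed=False):
        num_seqs = len(starts) - 1
        tgt = [0]          # new sequenceStartPositions; first index is 0
        stride_pos = []    # start position of every stride window
        for seq_id in range(num_seqs):
            seq_len = starts[seq_id + 1] - starts[seq_id]
            stride_pos.append(starts[seq_id])
            if seq_len == 0:
                tgt.append(tgt[-1])  # empty sequence pools to nothing
                continue
            size = int(math.ceil(float(seq_len) / stride))
            tgt.append(tgt[-1] + size)
            for i in range(size - 1):
                # reversed aligns the windows to the end of the sequence
                cur = (starts[seq_id + 1] - (size - 1 - i) * stride
                       if reversed else stride_pos[-1] + stride)
                stride_pos.append(cur)
        stride_pos.append(starts[num_seqs])
        return tgt, stride_pos

    starts = [0, 9, 14, 17, 30]
    assert pool_sequence_with_stride(starts, 5) == (
        [0, 2, 3, 4, 7], [0, 5, 9, 14, 17, 22, 27, 30])
    assert pool_sequence_with_stride(starts, 5, reversed=True) == (
        [0, 2, 3, 4, 7], [0, 4, 9, 14, 17, 20, 25, 30])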

paddle/parameter/Argument.h

Lines changed: 9 additions & 0 deletions
@@ -291,6 +291,15 @@ struct Argument {
   */
  void degradeSequence(const Argument& input);

+  /*
+   After pooling with stride n (n is smaller than sequence length),
+   a long sequence will be shorten.
+   This function is invalid for sequence having sub-sequence.
+   */
+  void poolSequenceWithStride(const Argument& input,
+                              size_t stride,
+                              IVectorPtr* stridePositions,
+                              bool reversed = false);
  /**
   * @brief getValueString will return the argument's output in string. There
   * are several kinds of output. The keys of output dictionary are 'value',
paddle/parameter/tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -1 +1,2 @@
 add_simple_unittest(test_common)
+add_simple_unittest(test_argument)
