
Commit b571a41

Merge remote-tracking branch 'baidu/develop' into feature/add_v2_api_doc

2 parents: 1e29b12 + b25c512

24 files changed, +370 -84 lines

demo/seqToseq/api_train_v2.py

Lines changed: 46 additions & 40 deletions
@@ -126,51 +126,57 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word):

 def main():
     paddle.init(use_gpu=False, trainer_count=1)
+    is_generating = True

     # source and target dict dim.
     dict_size = 30000
     source_dict_dim = target_dict_dim = dict_size

-    # define network topology
-    cost = seqToseq_net(source_dict_dim, target_dict_dim)
-    parameters = paddle.parameters.create(cost)
-
-    # define optimize method and trainer
-    optimizer = paddle.optimizer.Adam(
-        learning_rate=5e-5,
-        regularization=paddle.optimizer.L2Regularization(rate=1e-3))
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=optimizer)
-
-    # define data reader
-    feeding = {
-        'source_language_word': 0,
-        'target_language_word': 1,
-        'target_language_next_word': 2
-    }
-
-    wmt14_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
-        batch_size=5)
-
-    # define event_handler callback
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 10 == 0:
-                print "\nPass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
-            else:
-                sys.stdout.write('.')
-                sys.stdout.flush()
-
-    # start to train
-    trainer.train(
-        reader=wmt14_reader,
-        event_handler=event_handler,
-        num_passes=10000,
-        feeding=feeding)
+    # train the network
+    if not is_generating:
+        cost = seqToseq_net(source_dict_dim, target_dict_dim)
+        parameters = paddle.parameters.create(cost)
+
+        # define optimize method and trainer
+        optimizer = paddle.optimizer.Adam(
+            learning_rate=5e-5,
+            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+        trainer = paddle.trainer.SGD(cost=cost,
+                                     parameters=parameters,
+                                     update_equation=optimizer)
+        # define data reader
+        wmt14_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.wmt14.train(dict_size), buf_size=8192),
+            batch_size=5)
+
+        # define event_handler callback
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndIteration):
+                if event.batch_id % 10 == 0:
+                    print "\nPass %d, Batch %d, Cost %f, %s" % (
+                        event.pass_id, event.batch_id, event.cost,
+                        event.metrics)
+                else:
+                    sys.stdout.write('.')
+                    sys.stdout.flush()
+
+        # start to train
+        trainer.train(
+            reader=wmt14_reader, event_handler=event_handler, num_passes=2)
+
+    # generate a english sequence to french
+    else:
+        gen_creator = paddle.dataset.wmt14.test(dict_size)
+        gen_data = []
+        for item in gen_creator():
+            gen_data.append((item[0], ))
+            if len(gen_data) == 3:
+                break
+
+        beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating)
+        parameters = paddle.dataset.wmt14.model()
+        trg_dict = paddle.dataset.wmt14.trg_dict(dict_size)


 if __name__ == '__main__':
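
The net effect of this hunk is that main() now branches on a single is_generating switch. A condensed sketch of the resulting control flow (names come from the demo itself; the decoding that follows trg_dict lies outside this hunk, so it is only summarized here):

    def main():
        paddle.init(use_gpu=False, trainer_count=1)
        is_generating = True  # flip to False to train instead

        if not is_generating:
            # build the cost, create parameters, set up the Adam-based
            # SGD trainer, and train on the WMT-14 reader
            ...
        else:
            # take a few source sentences from the WMT-14 test set, build
            # the beam-search generator, and load the pretrained model
            # plus the target dictionary
            ...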

paddle/gserver/layers/SequenceLastInstanceLayer.cpp

Lines changed: 14 additions & 10 deletions
@@ -25,6 +25,11 @@ namespace paddle {
  * Input: a sequence
  * If SequenceLevel = kNonseq:
  *   Output: a sequence containing only the last instance of the input sequence
+ *   If stride_ > 0:
+ *     Output: a shorten sequence. The operation of getting last instance of a
+ *             sequence is independently performed on every slice of the input
+ *             sequence, which is obtained by sliding a window with the window
+ *             size set to stride_.
  * If SequenceLevel = kSeq:
  *   Check input sequence must has sub-sequence
  *   Output: a sequence containing only the last instance of each sub-sequence
@@ -37,6 +42,7 @@ class SequenceLastInstanceLayer : public SequencePoolLayer {
 protected:
   MatrixPtr tmpSrc_;
   MatrixPtr tmpDest_;
+  std::vector<int> instanceIds_;

 public:
   explicit SequenceLastInstanceLayer(const LayerConfig& config)
@@ -54,6 +60,7 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
 bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
                                      const ParameterMap& parameterMap) {
   SequencePoolLayer::init(layerMap, parameterMap);
+  reversed_ = config_.select_first();

   tmpSrc_ =
       Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
@@ -66,17 +73,19 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
 void SequenceLastInstanceLayer::forward(PassType passType) {
   SequencePoolLayer::forward(passType);

-  const int* starts = startPositions_->getData(false);
+  auto starts = (stride_ > 0) ? stridePositions_->getData()
+                              : startPositions_->getData(false);
   MatrixPtr inputValue = getInputValue(0);
   MatrixPtr outputValue = getOutputValue();

   {
     AsyncGpuBlock asyncGpuBlock;
     REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());

+    instanceIds_.clear();
     for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
-      int insId =
-          config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1;
+      int insId = reversed_ ? starts[seqId] : starts[seqId + 1] - 1;
+      instanceIds_.push_back(insId);

       outputValue->subMatrix(seqId, 1, tmpDest_)
           ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
@@ -96,18 +105,13 @@ void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {

   MatrixPtr inputGrad = getInputGrad(0);
   MatrixPtr outputGrad = getOutputGrad();
-  const int* starts = startPositions_->getData(false);
-  size_t numSequences = startPositions_->getSize() - 1;

   if (inputGrad) {
     AsyncGpuBlock asyncGpuBlock;
     REGISTER_TIMER_INFO("SequenceLastInstanceLayerBackward", getName().c_str());

-    for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-      int insId =
-          config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1;
-
-      inputGrad->subMatrix(insId, 1, tmpDest_)
+    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
+      inputGrad->subMatrix(instanceIds_[seqId], 1, tmpDest_)
           ->add(*(outputGrad->subMatrix(seqId, 1, tmpSrc_)));
     }
   }
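
To make the stride semantics above concrete: the forward pass now picks one instance per stride window instead of one per sequence. Below is a minimal Python sketch of that selection for a single sequence (illustrative only, not a Paddle API; it assumes the window layout computed by Argument::poolSequenceWithStride further down, where reversed windows are aligned to the end of the sequence):

    import math

    def last_instance_with_stride(seq, stride, reversed=False):
        # number of windows for one sequence, as in poolSequenceWithStride
        size = int(math.ceil(float(len(seq)) / stride))
        if reversed:
            # windows are aligned to the END of the sequence; the layer
            # then keeps the FIRST instance of each window (select_first)
            bounds = [0] + [len(seq) - (size - 1 - i) * stride
                            for i in range(size - 1)] + [len(seq)]
            return [seq[bounds[i]] for i in range(size)]
        # windows are aligned to the start; keep the LAST instance of each
        bounds = [i * stride for i in range(size)] + [len(seq)]
        return [seq[bounds[i + 1] - 1] for i in range(size)]

    # a length-9 sequence with stride 5 shortens to 2 instances
    assert last_instance_with_stride(list(range(9)), 5) == [4, 8]        # windows [0,5), [5,9)
    assert last_instance_with_stride(list(range(9)), 5, True) == [0, 4]  # windows [0,4), [4,9)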

paddle/gserver/layers/SequencePoolLayer.cpp

Lines changed: 10 additions & 2 deletions
@@ -37,6 +37,7 @@ bool SequencePoolLayer::init(const LayerMap& layerMap,
   } else {
     LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
   }
+  stride_ = config_.seq_pool_stride();
   setNeedSequenceInfo(false);
   return true;
 }
@@ -55,8 +56,6 @@ void SequencePoolLayer::forward(PassType passType) {
   CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize());
   CHECK_EQ(newBatchSize_, starts->getSize() - 1);

-  resetOutput(newBatchSize_, dim);
-
   /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
    * thus, in this case, output_ has no sequenceStartPositions.
    * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
@@ -67,6 +66,15 @@ void SequencePoolLayer::forward(PassType passType) {
         << "when trans_type = seq, input must hasSubseq";
     output_.degradeSequence(input);
   }
+  if (stride_ > 0) {
+    CHECK_EQ(input.hasSubseq(), 0UL)
+        << "sequence stride pooling is invalid for hasSubseq now";
+    output_.poolSequenceWithStride(
+        input, stride_, &stridePositions_, reversed_);
+    newBatchSize_ = stridePositions_->getSize() - 1;
+  }
+
+  resetOutput(newBatchSize_, dim);
 }

 void SequencePoolLayer::backward(const UpdateCallback& callback) {

paddle/gserver/layers/SequencePoolLayer.h

Lines changed: 9 additions & 0 deletions
@@ -26,6 +26,10 @@ namespace paddle {
  * Output: output size is the number of input sequences (NOT input instances)
  * output[i] = seqlastin/average/max_{for each instance in this
  * sequence}{input[i]}
+ * If stride_ > 0:
+ *   Check input sequence must not have sub-sequence
+ *   Output: a shorten sequence, pooling is performed upon a small local
+ *           area
  * If SequenceLevel = kSeq:
  *   Check input sequence must has sub-sequence
  *   Output: output size is the number of input sub-sequences
@@ -42,6 +46,11 @@ class SequencePoolLayer : public Layer {
   enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
   size_t newBatchSize_;
   ICpuGpuVectorPtr startPositions_;
+  int stride_;
+  // Store the start position of each window.
+  IVectorPtr stridePositions_;
+  // Whether the input sequence is reversed or not.
+  bool reversed_ = false;

 public:
   explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
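
As a plain illustration of the header comment (not a Paddle API): with stride <= 0 a whole sequence pools down to one instance, while a positive stride pools each local window separately and yields a shorter sequence. The new members live in this shared base class, although only seqlastins is exercised with a positive stride by the tests in this commit:

    def pool_sequence(seq, pool, stride=-1):
        # stride <= 0: classic sequence pooling, one output instance
        if stride <= 0:
            return [pool(seq)]
        # stride > 0: one output instance per window of `stride` inputs,
        # i.e. a shortened sequence instead of a single instance
        return [pool(seq[i:i + stride]) for i in range(0, len(seq), stride)]

    vals = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
    assert pool_sequence(vals, max) == [7.0]
    assert pool_sequence(vals, max, stride=3) == [3.0, 6.0, 7.0]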

paddle/gserver/tests/test_LayerGrad.cpp

Lines changed: 26 additions & 12 deletions
@@ -804,10 +804,14 @@ TEST(Layer, ExpandLayer) {
   testExpandLayer("seq", true);  // seq expand to hasSubseq
 }

-void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) {
+void testDegradeLayer(bool hasSubseq,
+                      string layer_type,
+                      string trans_type,
+                      int stride) {
   TestConfig config;
   config.layerConfig.set_type(layer_type);
   config.layerConfig.set_size(10);
+  config.layerConfig.set_seq_pool_stride(stride);
   config.biasSize = 0;

   config.inputDefs.push_back(
@@ -827,36 +831,46 @@ void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) {
   if (layer_type == "average") {
     for (auto strategy : {"average", "sum", "squarerootn"}) {
       LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
-                << " average_strategy=" << strategy;
+                << " average_strategy=" << strategy
+                << " seq_pool_stride=" << stride;
       config.layerConfig.set_average_strategy(strategy);
       testDegradeLayerGrad(config, layer_type);
     }
   } else {
-    LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type;
+    LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
+              << " seq_pool_stride=" << stride;
     testDegradeLayerGrad(config, layer_type);
   }
 }

 TEST(Layer, MaxLayer) {
-  testDegradeLayer(false, "max", "non-seq");  // seq max to non-seq
-  testDegradeLayer(true, "max", "non-seq");   // hasSubseq max to non-seq
-  testDegradeLayer(true, "max", "seq");       // hasSubseq max to seq
+  testDegradeLayer(false, "max", "non-seq", -1);  // seq max to non-seq
+  testDegradeLayer(true, "max", "non-seq", -1);   // hasSubseq max to non-seq
+  testDegradeLayer(true, "max", "seq", -1);       // hasSubseq max to seq
 }

 TEST(Layer, SequenceLastInstanceLayer) {
   testDegradeLayer(false,
                    "seqlastins",
-                   "non-seq");  // seq seqlastins to non-seq
+                   "non-seq",
+                   -1);  // seq seqlastins to non-seq
+  testDegradeLayer(false,
+                   "seqlastins",
+                   "non-seq",
+                   5);  // seq seqlastins to a shorten seq, stride window = 5
   testDegradeLayer(true,
                    "seqlastins",
-                   "non-seq");  // hasSubseq seqlastins to non-seq
-  testDegradeLayer(true, "seqlastins", "seq");  // hasSubseq seqlastins to seq
+                   "non-seq",
+                   -1);  // hasSubseq seqlastins to non-seq
+  testDegradeLayer(
+      true, "seqlastins", "seq", -1);  // hasSubseq seqlastins to seq
 }

 TEST(Layer, AverageLayer) {
-  testDegradeLayer(false, "average", "non-seq");  // seq average to non-seq
-  testDegradeLayer(true, "average", "non-seq");  // hasSubseq average to non-seq
-  testDegradeLayer(true, "average", "seq");  // hasSubseq average to seq
+  testDegradeLayer(false, "average", "non-seq", -1);  // seq average to non-seq
+  testDegradeLayer(
+      true, "average", "non-seq", -1);  // hasSubseq average to non-seq
+  testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq
 }

 TEST(Layer, SequenceConcatLayer) {

paddle/parameter/Argument.cpp

Lines changed: 43 additions & 0 deletions
@@ -559,6 +559,49 @@ void Argument::degradeSequence(const Argument& input) {
   tgtBuf[numSequences] = numSubSequences;
 }

+void Argument::poolSequenceWithStride(const Argument& input,
+                                      size_t stride,
+                                      IVectorPtr* stridePostions,
+                                      bool reversed) {
+  // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5,
+  // then sequenceStartPositions = [0, 2, 3, 4, 7].
+  // If reversed = false, stridePostions = [0, 5, 9, 14, 17, 22, 27, 30];
+  // else reversed = true, stridePostions = [0, 4, 9, 14, 17, 20, 25, 30]
+
+  CHECK(input.sequenceStartPositions);
+  CHECK_EQ(input.hasSubseq(), 0UL);
+  CHECK_GT(stride, 0) << "stride must larger than 0";
+  size_t numSequences = input.getNumSequences();
+  ICpuGpuVector::resizeOrCreate(
+      sequenceStartPositions, numSequences + 1, false);
+  const int* starts = input.sequenceStartPositions->getData(false);
+  int* tgtBuf = sequenceStartPositions->getMutableData(false);
+  // first index of target sequence and stride positions are both 0
+  tgtBuf[0] = 0;
+  std::vector<int> stridePos;
+  for (size_t seqId = 0; seqId < numSequences; ++seqId) {
+    size_t seqLength = starts[seqId + 1] - starts[seqId];
+    stridePos.emplace_back(starts[seqId]);
+    if (seqLength == 0) {
+      // empty sequence
+      tgtBuf[seqId + 1] = tgtBuf[seqId];
+    } else {
+      int size = ceil((float)seqLength / stride);
+      tgtBuf[seqId + 1] = tgtBuf[seqId] + size;
+      for (int i = 0; i < size - 1; ++i) {
+        int cur = reversed ? starts[seqId + 1] - (size - 1 - i) * stride
+                           : stridePos.back() + stride;
+        stridePos.emplace_back(cur);
+      }
+    }
+  }
+  stridePos.emplace_back(starts[numSequences]);
+  int size = stridePos.size();
+  CHECK_EQ(size - 1, tgtBuf[numSequences]);
+  IVector::resizeOrCreate(*stridePostions, size, false);
+  (*stridePostions)->copyFrom(stridePos.data(), size);
+}
+
 void Argument::getValueString(
     std::unordered_map<std::string, std::string>* out) const {
   if (value) {
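
The worked example in the comment above can be checked directly. Here is a line-for-line Python transcription of the loop (illustrative only; `starts` plays the role of input.sequenceStartPositions, and the return values correspond to the new sequenceStartPositions and stridePostions):

    import math

    def pool_sequence_with_stride(starts, stride, reversed=False):
        num_seqs = len(starts) - 1
        tgt = [0]          # new sequenceStartPositions; first index is 0
        stride_pos = []    # start position of every stride window
        for seq_id in range(num_seqs):
            seq_len = starts[seq_id + 1] - starts[seq_id]
            stride_pos.append(starts[seq_id])
            if seq_len == 0:
                tgt.append(tgt[-1])  # empty sequence pools to nothing
                continue
            size = int(math.ceil(float(seq_len) / stride))
            tgt.append(tgt[-1] + size)
            for i in range(size - 1):
                # reversed aligns the windows to the end of the sequence
                cur = (starts[seq_id + 1] - (size - 1 - i) * stride
                       if reversed else stride_pos[-1] + stride)
                stride_pos.append(cur)
        stride_pos.append(starts[num_seqs])
        return tgt, stride_pos

    starts = [0, 9, 14, 17, 30]
    assert pool_sequence_with_stride(starts, 5) == (
        [0, 2, 3, 4, 7], [0, 5, 9, 14, 17, 22, 27, 30])
    assert pool_sequence_with_stride(starts, 5, reversed=True) == (
        [0, 2, 3, 4, 7], [0, 4, 9, 14, 17, 20, 25, 30])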

paddle/parameter/Argument.h

Lines changed: 9 additions & 0 deletions
@@ -291,6 +291,15 @@ struct Argument {
   */
  void degradeSequence(const Argument& input);

+  /*
+   After pooling with stride n (n is smaller than sequence length),
+   a long sequence will be shorten.
+   This function is invalid for sequence having sub-sequence.
+   */
+  void poolSequenceWithStride(const Argument& input,
+                              size_t stride,
+                              IVectorPtr* stridePositions,
+                              bool reversed = false);
  /**
   * @brief getValueString will return the argument's output in string. There
   * are several kinds of output. The keys of output dictionary are 'value',
paddle/parameter/tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -1 +1,2 @@
 add_simple_unittest(test_common)
+add_simple_unittest(test_argument)
