169 | 169 | 'log_loss',
170 | 170 | 'add_position_encoding',
171 | 171 | 'bilinear_tensor_product',
| 172 | + 'lstm', |
172 | 173 | ]
173 | 174 |
174 | 175 |
@@ -472,6 +473,168 @@ def dynamic_lstm(input,
472 | 473 | return hidden, cell
473 | 474 |
474 | 475 |
| 476 | +def lstm(input, |
| 477 | + init_h, |
| 478 | + init_c, |
| 479 | + max_len, |
| 480 | + hidden_size, |
| 481 | + num_layers, |
| 482 | + dropout_prob=0.0, |
| 483 | + is_bidirec=False, |
| 484 | + is_test=False, |
| 485 | + name=None, |
| 486 | + default_initializer=None, |
| 487 | + seed=-1): |
| 488 | + """ |
| 489 | + If the device is a GPU, this op uses the cuDNN LSTM implementation. |
| 490 | +
| 491 | + A four-gate Long Short-Term Memory network with no peephole connections. |
| 492 | + In the forward pass, the output h_t and the cell state c_t for a given time step are computed from the recurrent input h_{t-1}, |
| 493 | + the previous cell state c_{t-1} and the input x_t from the previous layer, given weight matrices W, R and biases bW, bR, by the following equations: |
| 494 | +
| 495 | + $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ |
| 496 | +
| 497 | + $$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$ |
| 498 | +
| 499 | + $$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$ |
| 500 | +
| 501 | + $$ \\tilde{c_t} = \\tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$ |
| 502 | +
| 503 | + $$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ |
| 504 | +
| 505 | + $$ h_t = o_t \\odot \\tanh(c_t) $$ |
| 506 | +
| 507 | + - W terms denote weight matrices (e.g. $W_{ix}$ is the matrix |
| 508 | + of weights from the input $x_t$ to the input gate). |
| 509 | + - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vectors). |
| 510 | + - sigmoid is the logistic sigmoid function. |
| 511 | + - $i, f, o$ and $c$ are the input gate, forget gate, output gate, |
| 512 | + and cell activation vectors, respectively, all of which have the same size as |
| 513 | + the cell output activation vector $h$. |
| 514 | + - $\\odot$ is the element-wise product of the vectors. |
| 515 | + - $\\tanh$ is the activation function. |
| 516 | + - $\\tilde{c_t}$ is also called the candidate cell state, |
| 517 | + which is computed based on the current input and the previous hidden state. |
| 518 | +
| 519 | + Here sigmoid is the logistic sigmoid function, $sigmoid(x) = 1 / (1 + e^{-x})$, $\\odot$ denotes |
| 520 | + element-wise (Hadamard) multiplication, and a product such as $W_{ix}x_t$ denotes matrix multiplication. |
| 521 | +
| 522 | +
| 523 | + Args: |
| 524 | + input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size ) |
| 525 | + init_h(Variable): The initial hidden state of the LSTM. |
| 526 | + This is a tensor with shape ( num_layers x batch_size x hidden_size ). |
| 527 | + If is_bidirec = True, the shape should be ( num_layers*2 x batch_size x hidden_size ). |
| 528 | + init_c(Variable): The initial cell state of the LSTM. |
| 529 | + This is a tensor with shape ( num_layers x batch_size x hidden_size ). |
| 530 | + If is_bidirec = True, the shape should be ( num_layers*2 x batch_size x hidden_size ). |
| 531 | + max_len (int): max length of LSTM. The first dim of the input tensor CAN NOT be greater than max_len. |
| 532 | + hidden_size (int): hidden size of the LSTM |
| 533 | + num_layers (int): total number of layers of the LSTM |
| 534 | + dropout_prob(float|0.0): dropout probability. Dropout ONLY works between rnn layers, NOT between time steps. |
| 535 | + There is NO dropout applied to the output of the last RNN layer. |
| 536 | + is_bidirec (bool): whether the LSTM is bidirectional |
| 537 | + is_test (bool): whether it is in the test phase |
| 538 | + name (str|None): A name for this layer (optional). If set None, the layer |
| 539 | + will be named automatically. |
| 540 | + default_initializer(Initializer|None): the initializer used to initialize the weight. |
| 541 | + If set None, the default initializer will be used. |
| 542 | + seed(int): seed for dropout in LSTM. If it is -1, dropout will use a random seed. |
| 543 | +
| 544 | +
| 545 | + Returns: |
| 546 | + rnn_out(Tensor): the hidden output of the LSTM for every time step; shape is ( seq_len x batch_size x hidden_size ). |
| 547 | + If is_bidirec is set to True, the shape will be ( seq_len x batch_size x hidden_size*2 ). |
| 548 | + last_h(Tensor): the hidden state of the last step of the LSTM; |
| 549 | + shape is ( num_layers x batch_size x hidden_size ). |
| 550 | + If is_bidirec is set to True, the shape will be ( num_layers*2 x batch_size x hidden_size ). |
| 551 | + last_c(Tensor): the cell state of the last step of the LSTM; |
| 552 | + shape is ( num_layers x batch_size x hidden_size ). |
| 553 | + If is_bidirec is set to True, the shape will be ( num_layers*2 x batch_size x hidden_size ). |
| 554 | +
| 555 | +
| 556 | + Examples: |
| 557 | + .. code-block:: python |
| 558 | +
| 559 | + input = embedding |
| 560 | + batch_size = 20 |
| 561 | + max_len = 100 |
| 562 | + dropout_prob = 0.2 |
| 563 | + input_size = 100 |
| 564 | + hidden_size = 150 |
| 565 | + num_layers = 1 |
| 566 | + init_h = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0 ) |
| 567 | + init_c = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0 ) |
| 568 | +
| 569 | + rnn_out, last_h, last_c = layers.lstm( input, init_h, init_c, \ |
| 570 | + max_len, hidden_size, num_layers, \ |
| 571 | + dropout_prob=dropout_prob) |
| 572 | + """ |
| 573 | + |
| 574 | + helper = LayerHelper('cudnn_lstm', **locals()) |
| 575 | + |
| 576 | + dtype = input.dtype |
| 577 | + input_shape = list(input.shape) |
| 578 | + input_size = input_shape[-1] |
| 579 | + weight_size = 0 |
| 580 | + for i in range(num_layers): |
| 581 | + if i == 0: |
| 582 | + input_weight_size = (input_size * hidden_size) * 4 |
| 583 | + else: |
| 584 | + if is_bidirec: |
| 585 | + input_weight_size = (hidden_size * 2 * hidden_size) * 4 |
| 586 | + else: |
| 587 | + input_weight_size = (hidden_size * hidden_size) * 4 |
| 588 | + |
| 589 | + hidden_weight_size = (hidden_size * hidden_size) * 4 |
| 590 | + |
| 591 | + if is_bidirec: |
| 592 | + weight_size += (input_weight_size + hidden_weight_size) * 2 |
| 593 | + weight_size += hidden_size * 8 * 2 |
| 594 | + else: |
| 595 | + weight_size += input_weight_size + hidden_weight_size |
| 596 | + weight_size += hidden_size * 8 |
| 597 | + |
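As a rough check of the flat weight buffer size computed by the loop above, here is a minimal sketch assuming a hypothetical unidirectional, single-layer configuration; the factor 4 corresponds to the four gates (i, f, c, o) and the factor 8 to their bx and bh bias vectors:

    # Hypothetical sizes, chosen only for illustration.
    input_size, hidden_size = 100, 150
    input_weight_size = input_size * hidden_size * 4    # W_{ix}, W_{fx}, W_{cx}, W_{ox}
    hidden_weight_size = hidden_size * hidden_size * 4  # W_{ih}, W_{fh}, W_{ch}, W_{oh}
    bias_size = hidden_size * 8                         # bx_* and bh_* for the four gates
    weight_size = input_weight_size + hidden_weight_size + bias_size
    assert weight_size == 60000 + 90000 + 1200 == 151200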
| 598 | + weight = helper.create_parameter( |
| 599 | + attr=helper.param_attr, |
| 600 | + shape=[weight_size], |
| 601 | + dtype=dtype, |
| 602 | + default_initializer=default_initializer) |
| 603 | + |
| 604 | + out = helper.create_variable_for_type_inference(dtype) |
| 605 | + last_h = helper.create_variable_for_type_inference(dtype) |
| 606 | + last_c = helper.create_variable_for_type_inference(dtype) |
| 607 | + |
| 608 | + cache = helper.create_variable( |
| 609 | + persistable=True, type=core.VarDesc.VarType.RAW, stop_gradient=True) |
| 610 | + |
| 611 | + helper.append_op( |
| 612 | + type='cudnn_lstm', |
| 613 | + inputs={ |
| 614 | + 'Input': input, |
| 615 | + 'InitH': init_h, |
| 616 | + 'InitC': init_c, |
| 617 | + 'W': weight, |
| 618 | + 'Cache': cache, |
| 619 | + }, |
| 620 | + outputs={ |
| 621 | + 'Out': out, |
| 622 | + 'last_h': last_h, |
| 623 | + 'last_c': last_c, |
| 624 | + }, |
| 625 | + attrs={ |
| 626 | + 'max_len': max_len, |
| 627 | + 'is_bidirec': is_bidirec, |
| 628 | + 'input_size': input_size, |
| 629 | + 'hidden_size': hidden_size, |
| 630 | + 'num_layers': num_layers, |
| 631 | + 'is_test': is_test, |
| 632 | + 'dropout_prob': dropout_prob, |
| 633 | + 'seed': seed, |
| 634 | + }) |
| 635 | + return out, last_h, last_c |
| 636 | + |
| 637 | + |
475 | 638 | def dynamic_lstmp(input,
476 | 639 | size,
477 | 640 | proj_size,
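For intuition only, here is a minimal NumPy sketch of a single forward step that follows the gate equations in the docstring above. The function name, gate ordering and packed parameter layout are illustrative assumptions, not the layout cuDNN or this op actually uses:

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def lstm_step(x_t, h_prev, c_prev, Wx, Wh, bx, bh):
        """One step of the four-gate LSTM from the docstring.

        Wx: (4*hidden, input), Wh: (4*hidden, hidden), bx, bh: (4*hidden,).
        The four row blocks are taken here in the order i, f, c~, o.
        """
        hidden = h_prev.shape[0]
        gates = Wx @ x_t + Wh @ h_prev + bx + bh
        i = sigmoid(gates[0 * hidden:1 * hidden])
        f = sigmoid(gates[1 * hidden:2 * hidden])
        c_tilde = np.tanh(gates[2 * hidden:3 * hidden])
        o = sigmoid(gates[3 * hidden:4 * hidden])
        c_t = f * c_prev + i * c_tilde   # c_t = f (.) c_{t-1} + i (.) c~_t
        h_t = o * np.tanh(c_t)           # h_t = o (.) tanh(c_t)
        return h_t, c_t

    # Toy usage with random parameters.
    rng = np.random.default_rng(0)
    input_size, hidden_size = 100, 150
    Wx = 0.1 * rng.standard_normal((4 * hidden_size, input_size))
    Wh = 0.1 * rng.standard_normal((4 * hidden_size, hidden_size))
    bx = np.zeros(4 * hidden_size)
    bh = np.zeros(4 * hidden_size)
    h, c = np.zeros(hidden_size), np.zeros(hidden_size)
    h, c = lstm_step(rng.standard_normal(input_size), h, c, Wx, Wh, bx, bh)

Repeating such a step over seq_len time steps for every layer (and for both directions when is_bidirec=True) is what the cudnn_lstm op computes on the GPU via cuDNN.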