Merge branch 'master' of https://github.com/zsdonghao/tensorlayer

zsdonghao · zsdonghao · commit 2e1e42cedce8 · 2017-10-11T11:07:19.000+01:00
diff --git a/example/tutorial_atari_pong.py b/example/tutorial_atari_pong.py
@@ -93,17 +93,17 @@ def prepro(I):
 
         prob = sess.run(
             sampling_prob,
-            feed_dict={t_states: x}
-        )
+            feed_dict={t_states: x})
+        
         # action. 1: STOP  2: UP  3: DOWN
         # action = np.random.choice([1,2,3], p=prob.flatten())
         action = tl.rein.choice_action_by_probs(prob.flatten(), [1,2,3])
 
         observation, reward, done, _ = env.step(action)
         reward_sum += reward
-        xs.append(x)            # all observations in a episode
-        ys.append(action - 1)   # all fake labels in a episode (action begins from 1, so minus 1)
-        rs.append(reward)       # all rewards in a episode
+        xs.append(x)            # all observations in an episode
+        ys.append(action - 1)   # all fake labels in an episode (action begins from 1, so minus 1)
+        rs.append(reward)       # all rewards in an episode
         
         if done:
             episode_number += 1
@@ -125,9 +125,7 @@ def prepro(I):
                     feed_dict={
                         t_states: epx,
                         t_actions: epy,
-                        t_discount_rewards: disR
-                    }
-                )
+                        t_discount_rewards: disR})
 
             if episode_number % (batch_size * 100) == 0:
                 tl.files.save_npz(network.all_params, name=model_file_name+'.npz')
diff --git a/example/tutorial_imdb_fasttext.py b/example/tutorial_imdb_fasttext.py
@@ -40,7 +40,7 @@
 # in addition to unigrams.
 N_GRAM = 2
 
-# Size of vocabulary; less frequent works will be treated as "unknown"
+# Size of vocabulary; less frequent words will be treated as "unknown"
 VOCAB_SIZE = 100000
 
 # Number of buckets used for hashing n-grams
@@ -71,7 +71,7 @@ def __init__(self, vocab_size, embedding_size, n_labels):
             tf.int32, shape=[None], name='labels')
 
         # Network structure
-        network = AverageEmbeddingInputlayer(
+        network = AverageEmbeddingInputLayer(
             self.inputs, self.vocab_size, self.embedding_size)
         self.network = DenseLayer(network, self.n_labels)
 
diff --git a/tensorlayer/layers.py b/tensorlayer/layers.py
@@ -651,30 +651,34 @@ def __init__(
         self.all_drop = {}
 
 
-class AverageEmbeddingInputlayer(Layer):
-    """The :class:`AverageEmbeddingInputlayer` class is for FastText Embedding for sentence classification, see `[1] <http://arxiv.org/abs/1607.01759>`_.
+class AverageEmbeddingInputLayer(Layer):
+    """:class:`AverageEmbeddingInputLayer` averages over embeddings of inputs.
+
+    :class:`AverageEmbeddingInputLayer` can be used as the input layer
+    for models like DAN [1] and FastText [2].
 
     Parameters
     ------------
-    inputs : input placeholder or tensor; zeros are paddings
+    inputs : input placeholder or tensor
     vocabulary_size : an integer, the size of vocabulary
     embedding_size : an integer, the dimension of embedding vectors
+    pad_value : an integer, the scalar pad value used in inputs
     name : a string, the name of the layer
     embeddings_initializer : the initializer of the embedding matrix
     embeddings_kwargs : kwargs to get embedding matrix variable
 
     References
     ------------
-    - [1] Joulin, A., Grave, E., Bojanowski, P., & Mikolov, T. (2016). `Bag of Tricks for Efficient Text Classification. <http://arxiv.org/abs/1607.01759>`_
-    - [2] Recht, B., Re, C., Wright, S., & Niu, F. (2011). `Hogwild: A Lock-Free Approach to Parallelizing Stochastic Gradient Descent. <https://arxiv.org/abs/1106.5730>`_ In NPIS 2011 (pp. 693–701).
-    - [3] `TensorFlow Candidate Sampling <https://www.tensorflow.org/api_guides/python/nn#Candidate_Sampling>`_
+    - [1] Iyyer, M., Manjunatha, V., Boyd-Graber, J., & Daumé III, H. (2015). Deep Unordered Composition Rivals Syntactic Methods for Text Classification. In Association for Computational Linguistics.
+    - [2] Joulin, A., Grave, E., Bojanowski, P., & Mikolov, T. (2016). `Bag of Tricks for Efficient Text Classification. <http://arxiv.org/abs/1607.01759>`_
     """
     def __init__(
             self, inputs, vocabulary_size, embedding_size,
-            name='fasttext_layer',
+            pad_value=0,
+            name='average_embedding_layer',
             embeddings_initializer=tf.random_uniform_initializer(-0.1, 0.1),
-            embeddings_kwargs={}
-    ):#None):
+            embeddings_kwargs=None,
+    ):
         super().__init__(name=name)
 
         if inputs.get_shape().ndims != 2:
@@ -690,29 +694,24 @@ def __init__(
                 name='embeddings',
                 shape=(vocabulary_size, embedding_size),
                 initializer=embeddings_initializer,
-                # **(embeddings_kwargs or {}),
-                **embeddings_kwargs)
+                **(embeddings_kwargs or {}),
+            )
 
             word_embeddings = tf.nn.embedding_lookup(
                 self.embeddings, self.inputs,
                 name='word_embeddings',
             )
-
-            # Masks used to ignore padding words
-            masks = tf.expand_dims(
-                tf.sign(self.inputs),
-                axis=-1,
-                name='masks',
-            )
-            sum_word_embeddings = tf.reduce_sum(
-                word_embeddings * tf.cast(masks, tf.float32),
-                axis=1,
+            # Zero out embeddings of pad value
+            masks = tf.not_equal(self.inputs, pad_value, name='masks')
+            word_embeddings *= tf.cast(
+                tf.expand_dims(masks, axis=-1),
+                tf.float32,
             )
+            sum_word_embeddings = tf.reduce_sum(word_embeddings, axis=1)
 
             # Count number of non-padding words in each sentence
-            # Used to commute average word embeddings in sentences
             sentence_lengths = tf.count_nonzero(
-                self.inputs,
+                masks,
                 axis=1,
                 keep_dims=True,
                 dtype=tf.float32,
@@ -721,7 +720,7 @@ def __init__(
 
             sentence_embeddings = tf.divide(
                 sum_word_embeddings,
-                sentence_lengths,
+                sentence_lengths + 1e-8,  # Add epsilon to avoid dividing by 0
                 name='sentence_embeddings'
             )