Commit 825bba1
New Feature: Modify the Hash layer to support the lookup table (#387)
Modify the `Hash` layer to support a lookup table. Two hashing techniques are now supported:

1. Lookup table: when `vocabulary_path` is set, the layer looks up each input key in a table and outputs the corresponding value. Missing keys always return the default value, e.g. `0`.
2. Bucket hash: when `vocabulary_path` is not set, `Hash` hashes the input keys into [0, num_buckets). If `mask_zero` is set to `True`, input keys equal to `0` or `0.0` are mapped to hash value `0`, and all other keys are hashed into [1, num_buckets).

The CSV file passed as `vocabulary_path` must follow this convention: the first column holds the values and the second column holds the keys, separated by a comma. For example:

* `1,emerson`
* `2,lake`
* `3,palmer`

>>> hash = Hash(
...     num_buckets=3+1,
...     vocabulary_path=filename,
...     default_value=0)
>>> hash(tf.constant('lake')).numpy()
2
>>> hash(tf.constant('lakeemerson')).numpy()
0
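For context, here is a self-contained version of the doctest above: a minimal sketch assuming TF 2.x eager execution; the `vocab.csv` filename is hypothetical, and `Hash` is imported from the module touched by this commit.

import tensorflow as tf
from deepctr.layers.utils import Hash

# Write a vocabulary file following the convention above: first column = value, second column = key.
filename = 'vocab.csv'  # hypothetical path
with open(filename, 'w') as f:
    f.write('1,emerson\n2,lake\n3,palmer\n')

# num_buckets = vocabulary size + 1, because value 0 is the default for missing keys.
lookup = Hash(num_buckets=3 + 1, vocabulary_path=filename, default_value=0)
print(lookup(tf.constant('lake')).numpy())         # 2
print(lookup(tf.constant('lakeemerson')).numpy())  # 0 (missing key falls back to default_value)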
1 parent 0df401c commit 825bba1

12 files changed: +250 −44

deepctr/feature_column.py

Lines changed: 7 additions & 3 deletions
@@ -15,12 +15,12 @@
 
 
 class SparseFeat(namedtuple('SparseFeat',
-                            ['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'dtype', 'embeddings_initializer',
+                            ['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'vocabulary_path', 'dtype', 'embeddings_initializer',
                             'embedding_name',
                             'group_name', 'trainable'])):
     __slots__ = ()
 
-    def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, dtype="int32", embeddings_initializer=None,
+    def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype="int32", embeddings_initializer=None,
                 embedding_name=None,
                 group_name=DEFAULT_GROUP_NAME, trainable=True):
 
@@ -32,7 +32,7 @@ def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, dtype="
         if embedding_name is None:
             embedding_name = name
 
-        return super(SparseFeat, cls).__new__(cls, name, vocabulary_size, embedding_dim, use_hash, dtype,
+        return super(SparseFeat, cls).__new__(cls, name, vocabulary_size, embedding_dim, use_hash, vocabulary_path, dtype,
                                               embeddings_initializer,
                                               embedding_name, group_name, trainable)
 
@@ -64,6 +64,10 @@ def embedding_dim(self):
     def use_hash(self):
         return self.sparsefeat.use_hash
 
+    @property
+    def vocabulary_path(self):
+        return self.sparsefeat.vocabulary_path
+
     @property
     def dtype(self):
         return self.sparsefeat.dtype

deepctr/inputs.py

Lines changed: 3 additions & 3 deletions
@@ -51,7 +51,7 @@ def get_embedding_vec_list(embedding_dict, input_dict, sparse_feature_columns, r
         feat_name = fg.name
         if len(return_feat_list) == 0 or feat_name in return_feat_list:
             if fg.use_hash:
-                lookup_idx = Hash(fg.vocabulary_size, mask_zero=(feat_name in mask_feat_list))(input_dict[feat_name])
+                lookup_idx = Hash(fg.vocabulary_size, mask_zero=(feat_name in mask_feat_list), vocabulary_path=fg.vocabulary_path)(input_dict[feat_name])
             else:
                 lookup_idx = input_dict[feat_name]
 
@@ -80,7 +80,7 @@ def embedding_lookup(sparse_embedding_dict, sparse_input_dict, sparse_feature_co
             embedding_name = fc.embedding_name
             if (len(return_feat_list) == 0 or feature_name in return_feat_list):
                 if fc.use_hash:
-                    lookup_idx = Hash(fc.vocabulary_size, mask_zero=(feature_name in mask_feat_list))(
+                    lookup_idx = Hash(fc.vocabulary_size, mask_zero=(feature_name in mask_feat_list), vocabulary_path=fc.vocabulary_path)(
                         sparse_input_dict[feature_name])
                 else:
                     lookup_idx = sparse_input_dict[feature_name]
@@ -97,7 +97,7 @@ def varlen_embedding_lookup(embedding_dict, sequence_input_dict, varlen_sparse_f
         feature_name = fc.name
         embedding_name = fc.embedding_name
         if fc.use_hash:
-            lookup_idx = Hash(fc.vocabulary_size, mask_zero=True)(sequence_input_dict[feature_name])
+            lookup_idx = Hash(fc.vocabulary_size, mask_zero=True, vocabulary_path=fc.vocabulary_path)(sequence_input_dict[feature_name])
         else:
             lookup_idx = sequence_input_dict[feature_name]
         varlen_embedding_vec_dict[feature_name] = embedding_dict[embedding_name](lookup_idx)

deepctr/layers/core.py

Lines changed: 3 additions & 6 deletions
@@ -68,8 +68,8 @@ def build(self, input_shape):
                 'inputs of a two inputs with shape (None,1,embedding_size) and (None,T,embedding_size)'
                 'Got different shapes: %s,%s' % (input_shape[0], input_shape[1]))
         size = 4 * \
-            int(input_shape[0][-1]
-                ) if len(self.hidden_units) == 0 else self.hidden_units[-1]
+               int(input_shape[0][-1]
+                   ) if len(self.hidden_units) == 0 else self.hidden_units[-1]
         self.kernel = self.add_weight(shape=(size, 1),
                                       initializer=glorot_normal(
                                           seed=self.seed),
@@ -78,9 +78,6 @@ def build(self, input_shape):
                                     shape=(1,), initializer=Zeros(), name="bias")
         self.dnn = DNN(self.hidden_units, self.activation, self.l2_reg, self.dropout_rate, self.use_bn, seed=self.seed)
 
-        self.dense = tf.keras.layers.Lambda(lambda x: tf.nn.bias_add(tf.tensordot(
-            x[0], x[1], axes=(-1, 0)), x[2]))
-
         super(LocalActivationUnit, self).build(
             input_shape)  # Be sure to call this somewhere!
 
@@ -96,7 +93,7 @@ def call(self, inputs, training=None, **kwargs):
 
         att_out = self.dnn(att_input, training=training)
 
-        attention_score = self.dense([att_out, self.kernel, self.bias])
+        attention_score = tf.nn.bias_add(tf.tensordot(att_out, self.kernel, axes=(-1, 0)), self.bias)
 
         return attention_score
 

deepctr/layers/sequence.py

Lines changed: 6 additions & 12 deletions
@@ -560,10 +560,10 @@ def call(self, inputs, mask=None, training=None, **kwargs):
         if self.blinding:
             try:
                 outputs = tf.matrix_set_diag(outputs, tf.ones_like(outputs)[
-                    :, :, 0] * (-2 ** 32 + 1))
-            except:
+                                             :, :, 0] * (-2 ** 32 + 1))
+            except AttributeError as e:
                 outputs = tf.compat.v1.matrix_set_diag(outputs, tf.ones_like(outputs)[
-                    :, :, 0] * (-2 ** 32 + 1))
+                                                       :, :, 0] * (-2 ** 32 + 1))
 
         outputs -= reduce_max(outputs, axis=-1, keep_dims=True)
         outputs = softmax(outputs)
@@ -633,14 +633,14 @@ def build(self, input_shape):
         _, T, num_units = input_shape.as_list()  # inputs.get_shape().as_list()
         # First part of the PE function: sin and cos argument
         position_enc = np.array([
-            [pos / np.power(10000, 2. * i / num_units)
-             for i in range(num_units)]
+            [pos / np.power(10000, 2. * (i//2) / num_units) for i in range(num_units)]
             for pos in range(T)])
 
         # Second part, apply the cosine to even columns and sin to odds.
         position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
         position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1
-
+        if self.zero_pad:
+            position_enc[0, :] = np.zeros(num_units)
         self.lookup_table = self.add_weight("lookup_table", (T, num_units),
                                             initializer=tf.initializers.identity(position_enc),
                                             trainable=self.pos_embedding_trainable)
@@ -651,13 +651,7 @@ def build(self, input_shape):
     def call(self, inputs, mask=None):
         _, T, num_units = inputs.get_shape().as_list()
         position_ind = tf.expand_dims(tf.range(T), 0)
-
-        if self.zero_pad:
-            self.lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
-                                           self.lookup_table[1:, :]), 0)
-
        outputs = tf.nn.embedding_lookup(self.lookup_table, position_ind)
-
        if self.scale:
            outputs = outputs * num_units ** 0.5
        return outputs + inputs
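For reference, the revised argument `2. * (i//2) / num_units` gives the sin and cos of each position/dimension pair the same exponent, which matches the standard Transformer positional encoding (with d = num_units), whereas the old `2. * i / num_units` used a different exponent for each column. The `zero_pad` handling now zeroes row 0 of the table once at build time instead of rebuilding the lookup table on every call.

PE_{(pos,\,2i)} = \sin\!\left(pos / 10000^{2i/d}\right), \qquad PE_{(pos,\,2i+1)} = \cos\!\left(pos / 10000^{2i/d}\right)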

deepctr/layers/utils.py

Lines changed: 51 additions & 6 deletions
@@ -7,6 +7,11 @@
 """
 import tensorflow as tf
 from tensorflow.python.keras.layers import Flatten
+from tensorflow.python.ops.lookup_ops import TextFileInitializer
+try:
+    from tensorflow.python.ops.lookup_ops import StaticHashTable
+except ImportError as e:
+    from tensorflow.python.ops.lookup_ops import HashTable as StaticHashTable
 
 
 class NoMask(tf.keras.layers.Layer):
@@ -25,14 +30,47 @@ def compute_mask(self, inputs, mask):
 
 
 class Hash(tf.keras.layers.Layer):
-    """
-    hash the input to [0,num_buckets)
-    if mask_zero = True,0 or 0.0 will be set to 0,other value will be set in range[1,num_buckets)
+    """Looks up keys in a table when `vocabulary_path` is set, and outputs the corresponding values.
+    If `vocabulary_path` is not set, `Hash` hashes the input to [0,num_buckets). When `mask_zero` = True,
+    input values `0` or `0.0` are set to `0`, and other values are hashed into the range [1,num_buckets).
+
+    The following snippet initializes a `Hash` with a `vocabulary_path` file whose first column holds the
+    values and whose second column holds the keys:
+
+    * `1,emerson`
+    * `2,lake`
+    * `3,palmer`
+
+    >>> hash = Hash(
+    ...     num_buckets=3+1,
+    ...     vocabulary_path=filename,
+    ...     default_value=0)
+    >>> hash(tf.constant('lake')).numpy()
+    2
+    >>> hash(tf.constant('lakeemerson')).numpy()
+    0
+
+    Args:
+        num_buckets: An `int` that is >= 1. The number of buckets, or the vocabulary size + 1
+            when `vocabulary_path` is set.
+        mask_zero: default `False`. When `mask_zero` is `True`, input `0` or `0.0` is hashed to
+            value `0`. `mask_zero` is not used when `vocabulary_path` is set.
+        vocabulary_path: default `None`. The `CSV` text file path of the vocabulary table, which contains
+            two columns separated by a comma: the first column is the value and the second is
+            the key. The key data type is `string`, the value data type is `int`. The path must
+            be accessible from wherever `Hash` is initialized.
+        default_value: default `0`. The default value returned if a key is missing in the table.
+        **kwargs: Additional keyword arguments.
     """
 
-    def __init__(self, num_buckets, mask_zero=False, **kwargs):
+    def __init__(self, num_buckets, mask_zero=False, vocabulary_path=None, default_value=0, **kwargs):
         self.num_buckets = num_buckets
         self.mask_zero = mask_zero
+        self.vocabulary_path = vocabulary_path
+        self.default_value = default_value
+        if self.vocabulary_path:
+            initializer = TextFileInitializer(vocabulary_path, 'string', 1, 'int64', 0, delimiter=',')
+            self.hash_table = StaticHashTable(initializer, default_value=self.default_value)
         super(Hash, self).__init__(**kwargs)
 
     def build(self, input_shape):
@@ -41,13 +79,16 @@ def build(self, input_shape):
 
     def call(self, x, mask=None, **kwargs):
 
-
         if x.dtype != tf.string:
             zero = tf.as_string(tf.zeros([1], dtype=x.dtype))
             x = tf.as_string(x, )
         else:
             zero = tf.as_string(tf.zeros([1], dtype='int32'))
 
+        if self.vocabulary_path:
+            hash_x = self.hash_table.lookup(x)
+            return hash_x
+
         num_buckets = self.num_buckets if not self.mask_zero else self.num_buckets - 1
         try:
             hash_x = tf.string_to_hash_bucket_fast(x, num_buckets,
@@ -60,8 +101,12 @@ def call(self, x, mask=None, **kwargs):
             hash_x = (hash_x + 1) * mask
 
         return hash_x
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
     def get_config(self, ):
-        config = {'num_buckets': self.num_buckets, 'mask_zero': self.mask_zero, }
+        config = {'num_buckets': self.num_buckets, 'mask_zero': self.mask_zero, 'vocabulary_path': self.vocabulary_path, 'default_value': self.default_value}
         base_config = super(Hash, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
 
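The docstring example above covers the lookup-table mode; here is a minimal sketch (not part of the commit) of the other mode, bucket hashing with `mask_zero=True`, assuming TF 2.x eager execution and hypothetical string keys:

import tensorflow as tf
from deepctr.layers.utils import Hash

# No vocabulary_path: keys are hashed into buckets instead of looked up.
hash_layer = Hash(num_buckets=10, mask_zero=True)
keys = tf.constant([['0', 'item_42', 'item_7']])  # hypothetical keys
print(hash_layer(keys).numpy())
# '0' is masked to bucket 0; the other keys are hashed into the range [1, 10)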

docs/source/Features.md

Lines changed: 3 additions & 2 deletions
@@ -23,12 +23,13 @@ DNN based CTR prediction models usually have following 4 modules:
 
 ## Feature Columns
 ### SparseFeat
-``SparseFeat`` is a namedtuple with signature ``SparseFeat(name, vocabulary_size, embedding_dim, use_hash, dtype, embeddings_initializer, embedding_name, group_name, trainable)``
+``SparseFeat`` is a namedtuple with signature ``SparseFeat(name, vocabulary_size, embedding_dim, use_hash, vocabulary_path, dtype, embeddings_initializer, embedding_name, group_name, trainable)``
 
 - name : feature name
 - vocabulary_size : number of unique feature values for sprase feature or hashing space when `use_hash=True`
 - embedding_dim : embedding dimension
-- use_hash : defualt `False`.If `True` the input will be hashed to space of size `vocabulary_size`.
+- use_hash : default `False`. If `True` the input will be hashed to space of size `vocabulary_size`.
+- vocabulary_path : default `None`. The `CSV` text file path of the vocabulary table used by `tf.lookup.TextFileInitializer`, which assigns one entry in the table for each line in the file. An entry contains two columns separated by a comma: the first is the value column, the second is the key column. The value `0` is reserved for keys missing from the table, so the values in the file need to start from `1`.
 - dtype : default `int32`.dtype of input tensor.
 - embeddings_initializer : initializer for the `embeddings` matrix.
 - embedding_name : default `None`. If None, the embedding_name will be same as `name`.
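To tie the two fields together, here is a minimal sketch (not part of the diff) of declaring a hashed `SparseFeat` backed by a vocabulary file; the path and vocabulary size are hypothetical, following the example script in the new file below.

from deepctr.feature_column import SparseFeat

n_users = 943  # hypothetical number of distinct user ids in the vocabulary file
user_id = SparseFeat('user_id',
                     vocabulary_size=n_users + 1,  # +1 because value 0 is reserved for missing keys
                     embedding_dim=4,
                     use_hash=True,
                     vocabulary_path='./meta/user_id.csv',  # hypothetical path
                     dtype='string')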
Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
import functools
import os
import numpy as np
import pandas as pd
import shutil
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
try:
    import tensorflow.compat.v1 as tf
except ImportError as e:
    import tensorflow as tf


def init_vocab(df, tmpdir):
    """Initialize the vocabulary files of the sparse features."""
    vocab_size = {}

    df_user_id = df.user_id.drop_duplicates().dropna().sort_values().reset_index().drop(columns='index')
    df_user_id.index += 1
    df_user_id.to_csv(f'{tmpdir}/user_id.csv', sep=',', index=True, header=False)
    # must be set to vocabulary size plus 1, because 0 is reserved for missing hash keys and for masking, same below
    vocab_size['user_id'] = len(df_user_id) + 1

    df_movie_id = df.movie_id.drop_duplicates().dropna().sort_values().reset_index().drop(
        columns='index')
    df_movie_id.index += 1
    df_movie_id.to_csv(f'{tmpdir}/movie_id.csv', sep=',', index=True, header=False)
    vocab_size['movie_id'] = len(df_movie_id) + 1

    df_genre = pd.DataFrame({
        'genre': list(set(functools.reduce(lambda x, y: x + y, df.genres.str.split('|'))))
    }).genre.sort_values()
    df_genre.index += 1
    df_genre.to_csv(f'{tmpdir}/genre.csv', sep=',', index=True, header=False)
    vocab_size['genre'] = len(df_genre) + 1

    df_gender = df.gender.drop_duplicates().replace(
        r'^\s*$', np.nan,
        regex=True).dropna().sort_values().reset_index().drop(
        columns='index')
    df_gender.index += 1
    df_gender.to_csv(f'{tmpdir}/gender.csv', sep=',', index=True, header=False)
    vocab_size['gender'] = len(df_gender) + 1

    df_age = df.age.drop_duplicates().dropna().sort_values().reset_index().drop(columns='index')
    df_age.index += 1
    df_age.to_csv(f'{tmpdir}/age.csv', sep=',', index=True, header=False)
    vocab_size['age'] = len(df_age) + 1

    df_occupation = df.occupation.drop_duplicates().replace(
        r'^\s*$', np.nan,
        regex=True).dropna().sort_values().reset_index().drop(
        columns='index')
    df_occupation.index += 1
    df_occupation.to_csv(f'{tmpdir}/occupation.csv', sep=',', index=True, header=False)
    vocab_size['occupation'] = len(df_occupation) + 1

    df_zip = df.zip.drop_duplicates().replace(
        r'^\s*$', np.nan,
        regex=True).dropna().sort_values().reset_index().drop(columns='index')
    df_zip.index += 1
    df_zip.to_csv(f'{tmpdir}/zip.csv', sep=',', index=True, header=False)
    vocab_size['zip'] = len(df_zip) + 1
    return vocab_size


if __name__ == "__main__":
    # change this to where the movielens dataset and work directory is
    workdir = os.path.dirname(__file__)
    data = pd.read_csv(f"{workdir}/movielens_sample.txt")

    metadir = f'{workdir}/meta'
    if not os.path.exists(metadir):
        os.mkdir(metadir)
    vocab_size = init_vocab(data, metadir)

    sparse_features = ["movie_id", "user_id",
                       "gender", "age", "occupation", "zip", ]

    data[sparse_features] = data[sparse_features].astype(str)
    target = ['rating']

    # 1. Use hashing encoding on the fly for sparse features, and process sequence features

    genres_list = list(map(lambda x: x.split('|'), data['genres'].values))
    genres_length = np.array(list(map(len, genres_list)))
    max_len = max(genres_length)

    # Notice : padding=`post`
    genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=str, value=0)

    # 2. Set hashing space for each sparse field and generate feature config for sequence feature

    fixlen_feature_columns = [SparseFeat(feat, vocab_size[feat], embedding_dim=4, use_hash=True,
                                         vocabulary_path=f'{metadir}/{feat}.csv', dtype='string')
                              for feat in sparse_features]
    varlen_feature_columns = [
        VarLenSparseFeat(SparseFeat('genres', vocabulary_size=vocab_size['genre'], embedding_dim=4, use_hash=True,
                                    vocabulary_path=f'{metadir}/genre.csv', dtype="string"),
                         maxlen=max_len, combiner='mean',
                         )]  # Notice : value 0 is for padding for sequence input feature
    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. Generate input data for model
    model_input = {name: data[name] for name in feature_names}
    model_input['genres'] = genres_list

    # 4. Define model, compile and train
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
    model.compile("adam", "mse", metrics=['mse'], )
    if not hasattr(tf, 'version') or tf.version.VERSION < '2.0.0':
        # TF 1.x: the lookup tables must be initialized explicitly inside a session
        with tf.Session() as sess:
            sess.run(tf.tables_initializer())
            history = model.fit(model_input, data[target].values,
                                batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
    else:
        history = model.fit(model_input, data[target].values,
                            batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
    if os.path.exists(metadir):
        shutil.rmtree(metadir)

# %%

tests/feature_test.py

Lines changed: 12 additions & 2 deletions
@@ -1,6 +1,8 @@
 from deepctr.models import DeepFM
-from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names
+from deepctr.feature_column import SparseFeat, DenseFeat, VarLenSparseFeat, get_feature_names
 import numpy as np
+
+
 def test_long_dense_vector():
 
     feature_columns = [SparseFeat('user_id', 4, ), SparseFeat('item_id', 5, ), DenseFeat("pic_vec", 5)]
@@ -16,4 +18,12 @@ def test_long_dense_vector():
 
     model = DeepFM(feature_columns, feature_columns[:-1])
     model.compile('adagrad', 'binary_crossentropy')
-    model.fit(model_input, label)
+    model.fit(model_input, label)
+
+
+def test_feature_column_sparsefeat_vocabulary_path():
+    vocab_path = "./dummy_test.csv"
+    sf = SparseFeat('user_id', 4, vocabulary_path=vocab_path)
+    assert sf.vocabulary_path == vocab_path
+    vlsf = VarLenSparseFeat(sf, 6)
+    assert vlsf.vocabulary_path == vocab_path
