Commit 825bba1
New Feature: Modify the Hash layer to support the lookup table (#387)
Modify the `Hash` layer to support a lookup table. Two hashing techniques are now supported:

1. Lookup table: when `vocabulary_path` is set, the layer looks up each input key in a table and outputs the corresponding value. Missing keys always return the default value, e.g. `0`.
2. Bucket hash: when `vocabulary_path` is not set, `Hash` hashes the input keys into [0, num_buckets). If `mask_zero` is set to `True`, input keys equal to `0` or `0.0` are mapped to hash value `0`, and all other keys are hashed into [1, num_buckets).

The CSV file passed as `vocabulary_path` must follow this convention: the first column holds the values and the second column holds the keys, separated by a comma. For example:

* `1,emerson`
* `2,lake`
* `3,palmer`

>>> hash = Hash(
...     num_buckets=3+1,
...     vocabulary_path=filename,
...     default_value=0)
>>> hash(tf.constant('lake')).numpy()
2
>>> hash(tf.constant('lakeemerson')).numpy()
0
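For context, here is a self-contained version of the doctest above: a minimal sketch assuming TF 2.x eager execution; the `vocab.csv` filename is hypothetical, and `Hash` is imported from the module touched by this commit.

import tensorflow as tf
from deepctr.layers.utils import Hash

# Write a vocabulary file following the convention above: first column = value, second column = key.
filename = 'vocab.csv'  # hypothetical path
with open(filename, 'w') as f:
    f.write('1,emerson\n2,lake\n3,palmer\n')

# num_buckets = vocabulary size + 1, because value 0 is the default for missing keys.
lookup = Hash(num_buckets=3 + 1, vocabulary_path=filename, default_value=0)
print(lookup(tf.constant('lake')).numpy())         # 2
print(lookup(tf.constant('lakeemerson')).numpy())  # 0 (missing key falls back to default_value)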
1 parent 0df401c commit 825bba1

12 files changed: +250 −44

deepctr/feature_column.py

Lines changed: 7 additions & 3 deletions
@@ -15,12 +15,12 @@
 
 
 class SparseFeat(namedtuple('SparseFeat',
-                            ['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'dtype', 'embeddings_initializer',
+                            ['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'vocabulary_path', 'dtype', 'embeddings_initializer',
                             'embedding_name',
                             'group_name', 'trainable'])):
     __slots__ = ()
 
-    def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, dtype="int32", embeddings_initializer=None,
+    def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype="int32", embeddings_initializer=None,
                 embedding_name=None,
                 group_name=DEFAULT_GROUP_NAME, trainable=True):
 
@@ -32,7 +32,7 @@ def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, dtype="
         if embedding_name is None:
             embedding_name = name
 
-        return super(SparseFeat, cls).__new__(cls, name, vocabulary_size, embedding_dim, use_hash, dtype,
+        return super(SparseFeat, cls).__new__(cls, name, vocabulary_size, embedding_dim, use_hash, vocabulary_path, dtype,
                                               embeddings_initializer,
                                               embedding_name, group_name, trainable)
 
@@ -64,6 +64,10 @@ def embedding_dim(self):
     def use_hash(self):
         return self.sparsefeat.use_hash
 
+    @property
+    def vocabulary_path(self):
+        return self.sparsefeat.vocabulary_path
+
     @property
     def dtype(self):
         return self.sparsefeat.dtype

deepctr/inputs.py

Lines changed: 3 additions & 3 deletions
@@ -51,7 +51,7 @@ def get_embedding_vec_list(embedding_dict, input_dict, sparse_feature_columns, r
         feat_name = fg.name
         if len(return_feat_list) == 0 or feat_name in return_feat_list:
             if fg.use_hash:
-                lookup_idx = Hash(fg.vocabulary_size, mask_zero=(feat_name in mask_feat_list))(input_dict[feat_name])
+                lookup_idx = Hash(fg.vocabulary_size, mask_zero=(feat_name in mask_feat_list), vocabulary_path=fg.vocabulary_path)(input_dict[feat_name])
             else:
                 lookup_idx = input_dict[feat_name]
 
@@ -80,7 +80,7 @@ def embedding_lookup(sparse_embedding_dict, sparse_input_dict, sparse_feature_co
             embedding_name = fc.embedding_name
             if (len(return_feat_list) == 0 or feature_name in return_feat_list):
                 if fc.use_hash:
-                    lookup_idx = Hash(fc.vocabulary_size, mask_zero=(feature_name in mask_feat_list))(
+                    lookup_idx = Hash(fc.vocabulary_size, mask_zero=(feature_name in mask_feat_list), vocabulary_path=fc.vocabulary_path)(
                         sparse_input_dict[feature_name])
                 else:
                     lookup_idx = sparse_input_dict[feature_name]
@@ -97,7 +97,7 @@ def varlen_embedding_lookup(embedding_dict, sequence_input_dict, varlen_sparse_f
         feature_name = fc.name
         embedding_name = fc.embedding_name
         if fc.use_hash:
-            lookup_idx = Hash(fc.vocabulary_size, mask_zero=True)(sequence_input_dict[feature_name])
+            lookup_idx = Hash(fc.vocabulary_size, mask_zero=True, vocabulary_path=fc.vocabulary_path)(sequence_input_dict[feature_name])
         else:
             lookup_idx = sequence_input_dict[feature_name]
         varlen_embedding_vec_dict[feature_name] = embedding_dict[embedding_name](lookup_idx)

deepctr/layers/core.py

Lines changed: 3 additions & 6 deletions
@@ -68,8 +68,8 @@ def build(self, input_shape):
                 'inputs of a two inputs with shape (None,1,embedding_size) and (None,T,embedding_size)'
                 'Got different shapes: %s,%s' % (input_shape[0], input_shape[1]))
         size = 4 * \
-            int(input_shape[0][-1]
-                ) if len(self.hidden_units) == 0 else self.hidden_units[-1]
+               int(input_shape[0][-1]
+                   ) if len(self.hidden_units) == 0 else self.hidden_units[-1]
         self.kernel = self.add_weight(shape=(size, 1),
                                       initializer=glorot_normal(
                                           seed=self.seed),
@@ -78,9 +78,6 @@ def build(self, input_shape):
                                     shape=(1,), initializer=Zeros(), name="bias")
         self.dnn = DNN(self.hidden_units, self.activation, self.l2_reg, self.dropout_rate, self.use_bn, seed=self.seed)
 
-        self.dense = tf.keras.layers.Lambda(lambda x: tf.nn.bias_add(tf.tensordot(
-            x[0], x[1], axes=(-1, 0)), x[2]))
-
         super(LocalActivationUnit, self).build(
             input_shape)  # Be sure to call this somewhere!
 
@@ -96,7 +93,7 @@ def call(self, inputs, training=None, **kwargs):
 
         att_out = self.dnn(att_input, training=training)
 
-        attention_score = self.dense([att_out, self.kernel, self.bias])
+        attention_score = tf.nn.bias_add(tf.tensordot(att_out, self.kernel, axes=(-1, 0)), self.bias)
 
         return attention_score
 

deepctr/layers/sequence.py

Lines changed: 6 additions & 12 deletions
@@ -560,10 +560,10 @@ def call(self, inputs, mask=None, training=None, **kwargs):
         if self.blinding:
             try:
                 outputs = tf.matrix_set_diag(outputs, tf.ones_like(outputs)[
-                    :, :, 0] * (-2 ** 32 + 1))
-            except:
+                                             :, :, 0] * (-2 ** 32 + 1))
+            except AttributeError as e:
                 outputs = tf.compat.v1.matrix_set_diag(outputs, tf.ones_like(outputs)[
-                    :, :, 0] * (-2 ** 32 + 1))
+                                                       :, :, 0] * (-2 ** 32 + 1))
 
         outputs -= reduce_max(outputs, axis=-1, keep_dims=True)
         outputs = softmax(outputs)
@@ -633,14 +633,14 @@ def build(self, input_shape):
         _, T, num_units = input_shape.as_list()  # inputs.get_shape().as_list()
         # First part of the PE function: sin and cos argument
         position_enc = np.array([
-            [pos / np.power(10000, 2. * i / num_units)
-             for i in range(num_units)]
+            [pos / np.power(10000, 2. * (i//2) / num_units) for i in range(num_units)]
             for pos in range(T)])
 
         # Second part, apply the cosine to even columns and sin to odds.
         position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
         position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1
-
+        if self.zero_pad:
+            position_enc[0, :] = np.zeros(num_units)
         self.lookup_table = self.add_weight("lookup_table", (T, num_units),
                                             initializer=tf.initializers.identity(position_enc),
                                             trainable=self.pos_embedding_trainable)
@@ -651,13 +651,7 @@ def build(self, input_shape):
     def call(self, inputs, mask=None):
         _, T, num_units = inputs.get_shape().as_list()
         position_ind = tf.expand_dims(tf.range(T), 0)
-
-        if self.zero_pad:
-            self.lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
-                                           self.lookup_table[1:, :]), 0)
-
        outputs = tf.nn.embedding_lookup(self.lookup_table, position_ind)
-
        if self.scale:
            outputs = outputs * num_units ** 0.5
        return outputs + inputs
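For reference, the revised argument `2. * (i//2) / num_units` gives the sin and cos of each position/dimension pair the same exponent, which matches the standard Transformer positional encoding (with d = num_units), whereas the old `2. * i / num_units` used a different exponent for each column. The `zero_pad` handling now zeroes row 0 of the table once at build time instead of rebuilding the lookup table on every call.

PE_{(pos,\,2i)} = \sin\!\left(pos / 10000^{2i/d}\right), \qquad PE_{(pos,\,2i+1)} = \cos\!\left(pos / 10000^{2i/d}\right)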

deepctr/layers/utils.py

Lines changed: 51 additions & 6 deletions
@@ -7,6 +7,11 @@
 """
 import tensorflow as tf
 from tensorflow.python.keras.layers import Flatten
+from tensorflow.python.ops.lookup_ops import TextFileInitializer
+try:
+    from tensorflow.python.ops.lookup_ops import StaticHashTable
+except ImportError as e:
+    from tensorflow.python.ops.lookup_ops import HashTable as StaticHashTable
 
 
 class NoMask(tf.keras.layers.Layer):
@@ -25,14 +30,47 @@ def compute_mask(self, inputs, mask):
 
 
 class Hash(tf.keras.layers.Layer):
-    """
-    hash the input to [0,num_buckets)
-    if mask_zero = True,0 or 0.0 will be set to 0,other value will be set in range[1,num_buckets)
+    """Looks up keys in a table when `vocabulary_path` is set, and outputs the corresponding values.
+    If `vocabulary_path` is not set, `Hash` hashes the input to [0,num_buckets). When `mask_zero` = True,
+    input values `0` or `0.0` are set to `0`, and other values are hashed into the range [1,num_buckets).
+
+    The following snippet initializes a `Hash` with a `vocabulary_path` file whose first column holds the
+    values and whose second column holds the keys:
+
+    * `1,emerson`
+    * `2,lake`
+    * `3,palmer`
+
+    >>> hash = Hash(
+    ...     num_buckets=3+1,
+    ...     vocabulary_path=filename,
+    ...     default_value=0)
+    >>> hash(tf.constant('lake')).numpy()
+    2
+    >>> hash(tf.constant('lakeemerson')).numpy()
+    0
+
+    Args:
+        num_buckets: An `int` that is >= 1. The number of buckets, or the vocabulary size + 1
+            when `vocabulary_path` is set.
+        mask_zero: default `False`. When `mask_zero` is `True`, input `0` or `0.0` is hashed to
+            value `0`. `mask_zero` is not used when `vocabulary_path` is set.
+        vocabulary_path: default `None`. The `CSV` text file path of the vocabulary table, which contains
+            two columns separated by a comma: the first column is the value and the second is
+            the key. The key data type is `string`, the value data type is `int`. The path must
+            be accessible from wherever `Hash` is initialized.
+        default_value: default `0`. The default value returned if a key is missing in the table.
+        **kwargs: Additional keyword arguments.
     """
 
-    def __init__(self, num_buckets, mask_zero=False, **kwargs):
+    def __init__(self, num_buckets, mask_zero=False, vocabulary_path=None, default_value=0, **kwargs):
         self.num_buckets = num_buckets
         self.mask_zero = mask_zero
+        self.vocabulary_path = vocabulary_path
+        self.default_value = default_value
+        if self.vocabulary_path:
+            initializer = TextFileInitializer(vocabulary_path, 'string', 1, 'int64', 0, delimiter=',')
+            self.hash_table = StaticHashTable(initializer, default_value=self.default_value)
         super(Hash, self).__init__(**kwargs)
 
     def build(self, input_shape):
@@ -41,13 +79,16 @@ def build(self, input_shape):
 
     def call(self, x, mask=None, **kwargs):
 
-
         if x.dtype != tf.string:
             zero = tf.as_string(tf.zeros([1], dtype=x.dtype))
             x = tf.as_string(x, )
         else:
             zero = tf.as_string(tf.zeros([1], dtype='int32'))
 
+        if self.vocabulary_path:
+            hash_x = self.hash_table.lookup(x)
+            return hash_x
+
         num_buckets = self.num_buckets if not self.mask_zero else self.num_buckets - 1
         try:
             hash_x = tf.string_to_hash_bucket_fast(x, num_buckets,
@@ -60,8 +101,12 @@ def call(self, x, mask=None, **kwargs):
             hash_x = (hash_x + 1) * mask
 
         return hash_x
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
     def get_config(self, ):
-        config = {'num_buckets': self.num_buckets, 'mask_zero': self.mask_zero, }
+        config = {'num_buckets': self.num_buckets, 'mask_zero': self.mask_zero, 'vocabulary_path': self.vocabulary_path, 'default_value': self.default_value}
         base_config = super(Hash, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
 
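The docstring example above covers the lookup-table mode; here is a minimal sketch (not part of the commit) of the other mode, bucket hashing with `mask_zero=True`, assuming TF 2.x eager execution and hypothetical string keys:

import tensorflow as tf
from deepctr.layers.utils import Hash

# No vocabulary_path: keys are hashed into buckets instead of looked up.
hash_layer = Hash(num_buckets=10, mask_zero=True)
keys = tf.constant([['0', 'item_42', 'item_7']])  # hypothetical keys
print(hash_layer(keys).numpy())
# '0' is masked to bucket 0; the other keys are hashed into the range [1, 10)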

docs/source/Features.md

Lines changed: 3 additions & 2 deletions
@@ -23,12 +23,13 @@ DNN based CTR prediction models usually have following 4 modules:
 
 ## Feature Columns
 ### SparseFeat
-``SparseFeat`` is a namedtuple with signature ``SparseFeat(name, vocabulary_size, embedding_dim, use_hash, dtype, embeddings_initializer, embedding_name, group_name, trainable)``
+``SparseFeat`` is a namedtuple with signature ``SparseFeat(name, vocabulary_size, embedding_dim, use_hash, vocabulary_path, dtype, embeddings_initializer, embedding_name, group_name, trainable)``
 
 - name : feature name
 - vocabulary_size : number of unique feature values for sprase feature or hashing space when `use_hash=True`
 - embedding_dim : embedding dimension
-- use_hash : defualt `False`.If `True` the input will be hashed to space of size `vocabulary_size`.
+- use_hash : default `False`. If `True` the input will be hashed to space of size `vocabulary_size`.
+- vocabulary_path : default `None`. The `CSV` text file path of the vocabulary table used by `tf.lookup.TextFileInitializer`, which assigns one entry in the table for each line in the file. An entry contains two columns separated by a comma: the first is the value column, the second is the key column. The value `0` is reserved for keys missing from the table, so the values in the file need to start from `1`.
 - dtype : default `int32`.dtype of input tensor.
 - embeddings_initializer : initializer for the `embeddings` matrix.
 - embedding_name : default `None`. If None, the embedding_name will be same as `name`.
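To tie the two fields together, here is a minimal sketch (not part of the diff) of declaring a hashed `SparseFeat` backed by a vocabulary file; the path and vocabulary size are hypothetical, following the example script in the new file below.

from deepctr.feature_column import SparseFeat

n_users = 943  # hypothetical number of distinct user ids in the vocabulary file
user_id = SparseFeat('user_id',
                     vocabulary_size=n_users + 1,  # +1 because value 0 is reserved for missing keys
                     embedding_dim=4,
                     use_hash=True,
                     vocabulary_path='./meta/user_id.csv',  # hypothetical path
                     dtype='string')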
Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
import functools
import os
import numpy as np
import pandas as pd
import shutil
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
try:
    import tensorflow.compat.v1 as tf
except ImportError as e:
    import tensorflow as tf


def init_vocab(df, tmpdir):
    """Initialize the vocabulary files of the sparse features."""
    vocab_size = {}

    df_user_id = df.user_id.drop_duplicates().dropna().sort_values().reset_index().drop(columns='index')
    df_user_id.index += 1
    df_user_id.to_csv(f'{tmpdir}/user_id.csv', sep=',', index=True, header=False)
    # must be set to vocabulary size plus 1, because 0 is reserved for missing hash keys and for masking, same below
    vocab_size['user_id'] = len(df_user_id) + 1

    df_movie_id = df.movie_id.drop_duplicates().dropna().sort_values().reset_index().drop(
        columns='index')
    df_movie_id.index += 1
    df_movie_id.to_csv(f'{tmpdir}/movie_id.csv', sep=',', index=True, header=False)
    vocab_size['movie_id'] = len(df_movie_id) + 1

    df_genre = pd.DataFrame({
        'genre': list(set(functools.reduce(lambda x, y: x + y, df.genres.str.split('|'))))
    }).genre.sort_values()
    df_genre.index += 1
    df_genre.to_csv(f'{tmpdir}/genre.csv', sep=',', index=True, header=False)
    vocab_size['genre'] = len(df_genre) + 1

    df_gender = df.gender.drop_duplicates().replace(
        r'^\s*$', np.nan,
        regex=True).dropna().sort_values().reset_index().drop(
        columns='index')
    df_gender.index += 1
    df_gender.to_csv(f'{tmpdir}/gender.csv', sep=',', index=True, header=False)
    vocab_size['gender'] = len(df_gender) + 1

    df_age = df.age.drop_duplicates().dropna().sort_values().reset_index().drop(columns='index')
    df_age.index += 1
    df_age.to_csv(f'{tmpdir}/age.csv', sep=',', index=True, header=False)
    vocab_size['age'] = len(df_age) + 1

    df_occupation = df.occupation.drop_duplicates().replace(
        r'^\s*$', np.nan,
        regex=True).dropna().sort_values().reset_index().drop(
        columns='index')
    df_occupation.index += 1
    df_occupation.to_csv(f'{tmpdir}/occupation.csv', sep=',', index=True, header=False)
    vocab_size['occupation'] = len(df_occupation) + 1

    df_zip = df.zip.drop_duplicates().replace(
        r'^\s*$', np.nan,
        regex=True).dropna().sort_values().reset_index().drop(columns='index')
    df_zip.index += 1
    df_zip.to_csv(f'{tmpdir}/zip.csv', sep=',', index=True, header=False)
    vocab_size['zip'] = len(df_zip) + 1
    return vocab_size


if __name__ == "__main__":
    # change this to where the movielens dataset and work directory is
    workdir = os.path.dirname(__file__)
    data = pd.read_csv(f"{workdir}/movielens_sample.txt")

    metadir = f'{workdir}/meta'
    if not os.path.exists(metadir):
        os.mkdir(metadir)
    vocab_size = init_vocab(data, metadir)

    sparse_features = ["movie_id", "user_id",
                       "gender", "age", "occupation", "zip", ]

    data[sparse_features] = data[sparse_features].astype(str)
    target = ['rating']

    # 1. Use hashing encoding on the fly for sparse features, and process sequence features

    genres_list = list(map(lambda x: x.split('|'), data['genres'].values))
    genres_length = np.array(list(map(len, genres_list)))
    max_len = max(genres_length)

    # Notice : padding=`post`
    genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=str, value=0)

    # 2. Set hashing space for each sparse field and generate feature config for sequence feature

    fixlen_feature_columns = [SparseFeat(feat, vocab_size[feat], embedding_dim=4, use_hash=True,
                                         vocabulary_path=f'{metadir}/{feat}.csv', dtype='string')
                              for feat in sparse_features]
    varlen_feature_columns = [
        VarLenSparseFeat(SparseFeat('genres', vocabulary_size=vocab_size['genre'], embedding_dim=4, use_hash=True,
                                    vocabulary_path=f'{metadir}/genre.csv', dtype="string"),
                         maxlen=max_len, combiner='mean',
                         )]  # Notice : value 0 is for padding for sequence input feature
    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. Generate input data for model
    model_input = {name: data[name] for name in feature_names}
    model_input['genres'] = genres_list

    # 4. Define model, compile and train
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
    model.compile("adam", "mse", metrics=['mse'], )
    if not hasattr(tf, 'version') or tf.version.VERSION < '2.0.0':
        # TF 1.x: the lookup tables must be initialized explicitly inside a session
        with tf.Session() as sess:
            sess.run(tf.tables_initializer())
            history = model.fit(model_input, data[target].values,
                                batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
    else:
        history = model.fit(model_input, data[target].values,
                            batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
    if os.path.exists(metadir):
        shutil.rmtree(metadir)

# %%

tests/feature_test.py

Lines changed: 12 additions & 2 deletions
@@ -1,6 +1,8 @@
 from deepctr.models import DeepFM
-from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names
+from deepctr.feature_column import SparseFeat, DenseFeat, VarLenSparseFeat, get_feature_names
 import numpy as np
+
+
 def test_long_dense_vector():
 
     feature_columns = [SparseFeat('user_id', 4, ), SparseFeat('item_id', 5, ), DenseFeat("pic_vec", 5)]
@@ -16,4 +18,12 @@ def test_long_dense_vector():
 
     model = DeepFM(feature_columns, feature_columns[:-1])
     model.compile('adagrad', 'binary_crossentropy')
-    model.fit(model_input, label)
+    model.fit(model_input, label)
+
+
+def test_feature_column_sparsefeat_vocabulary_path():
+    vocab_path = "./dummy_test.csv"
+    sf = SparseFeat('user_id', 4, vocabulary_path=vocab_path)
+    assert sf.vocabulary_path == vocab_path
+    vlsf = VarLenSparseFeat(sf, 6)
+    assert vlsf.vocabulary_path == vocab_path
