Commit 9f15559

New Feature: Support pre-defined key-value vocabulary in Hash Layer
2 parents 0df401c + 95ad62e commit 9f15559

21 files changed: +271 −53 lines

README.md

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ If you find this code useful in your research, please cite it using the followin
 <td>
 <a href="https://github.com/zanshuxun"><img width="70" height="70" src="https://github.com/zanshuxun.png?s=40" alt="pic"></a><br>
 <a href="https://github.com/zanshuxun">Zan Shuxun</a> ​
-<p>Beijing University <br> of Posts and <br> Telecommunications </p>​
+<p>Alibaba Group </p>​
 </td>
 <td>
 ​ <a href="https://github.com/pandeconscious"><img width="70" height="70" src="https://github.com/pandeconscious.png?s=40" alt="pic"></a><br>

deepctr/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .utils import check_version
 
-__version__ = '0.8.6'
+__version__ = '0.8.7'
 check_version(__version__)

deepctr/feature_column.py

Lines changed: 7 additions & 3 deletions
@@ -15,12 +15,12 @@
 
 
 class SparseFeat(namedtuple('SparseFeat',
-                            ['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'dtype', 'embeddings_initializer',
+                            ['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'vocabulary_path', 'dtype', 'embeddings_initializer',
                              'embedding_name',
                              'group_name', 'trainable'])):
     __slots__ = ()
 
-    def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, dtype="int32", embeddings_initializer=None,
+    def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, vocabulary_path=None, dtype="int32", embeddings_initializer=None,
                 embedding_name=None,
                 group_name=DEFAULT_GROUP_NAME, trainable=True):
 
@@ -32,7 +32,7 @@ def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, dtype="
         if embedding_name is None:
             embedding_name = name
 
-        return super(SparseFeat, cls).__new__(cls, name, vocabulary_size, embedding_dim, use_hash, dtype,
+        return super(SparseFeat, cls).__new__(cls, name, vocabulary_size, embedding_dim, use_hash, vocabulary_path, dtype,
                                               embeddings_initializer,
                                               embedding_name, group_name, trainable)
 
@@ -64,6 +64,10 @@ def embedding_dim(self):
     def use_hash(self):
         return self.sparsefeat.use_hash
 
+    @property
+    def vocabulary_path(self):
+        return self.sparsefeat.vocabulary_path
+
     @property
     def dtype(self):
         return self.sparsefeat.dtype
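
With this change, a hashed `SparseFeat` can optionally carry a vocabulary file. A minimal sketch of constructing such a column (the feature name, file path, and sizes are illustrative, not taken from the commit):

```python
from deepctr.feature_column import SparseFeat

# Hypothetical feature: raw string keys are resolved through a CSV vocabulary
# (one "value,key" pair per line) instead of being hashed on the fly.
age_feat = SparseFeat('age',
                      vocabulary_size=3 + 1,              # vocabulary size + 1; value 0 is reserved for misses
                      embedding_dim=4,
                      use_hash=True,                      # routes the lookup through the Hash layer
                      vocabulary_path='./age_vocab.csv',  # illustrative path
                      dtype='string')
```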

deepctr/inputs.py

Lines changed: 3 additions & 3 deletions
@@ -51,7 +51,7 @@ def get_embedding_vec_list(embedding_dict, input_dict, sparse_feature_columns, r
         feat_name = fg.name
         if len(return_feat_list) == 0 or feat_name in return_feat_list:
             if fg.use_hash:
-                lookup_idx = Hash(fg.vocabulary_size, mask_zero=(feat_name in mask_feat_list))(input_dict[feat_name])
+                lookup_idx = Hash(fg.vocabulary_size, mask_zero=(feat_name in mask_feat_list), vocabulary_path=fg.vocabulary_path)(input_dict[feat_name])
             else:
                 lookup_idx = input_dict[feat_name]
 
@@ -80,7 +80,7 @@ def embedding_lookup(sparse_embedding_dict, sparse_input_dict, sparse_feature_co
         embedding_name = fc.embedding_name
         if (len(return_feat_list) == 0 or feature_name in return_feat_list):
             if fc.use_hash:
-                lookup_idx = Hash(fc.vocabulary_size, mask_zero=(feature_name in mask_feat_list))(
+                lookup_idx = Hash(fc.vocabulary_size, mask_zero=(feature_name in mask_feat_list), vocabulary_path=fc.vocabulary_path)(
                     sparse_input_dict[feature_name])
             else:
                 lookup_idx = sparse_input_dict[feature_name]
@@ -97,7 +97,7 @@ def varlen_embedding_lookup(embedding_dict, sequence_input_dict, varlen_sparse_f
         feature_name = fc.name
         embedding_name = fc.embedding_name
         if fc.use_hash:
-            lookup_idx = Hash(fc.vocabulary_size, mask_zero=True)(sequence_input_dict[feature_name])
+            lookup_idx = Hash(fc.vocabulary_size, mask_zero=True, vocabulary_path=fc.vocabulary_path)(sequence_input_dict[feature_name])
         else:
             lookup_idx = sequence_input_dict[feature_name]
         varlen_embedding_vec_dict[feature_name] = embedding_dict[embedding_name](lookup_idx)

deepctr/layers/core.py

Lines changed: 3 additions & 6 deletions
@@ -68,8 +68,8 @@ def build(self, input_shape):
                 'inputs of a two inputs with shape (None,1,embedding_size) and (None,T,embedding_size)'
                 'Got different shapes: %s,%s' % (input_shape[0], input_shape[1]))
         size = 4 * \
-            int(input_shape[0][-1]
-                ) if len(self.hidden_units) == 0 else self.hidden_units[-1]
+               int(input_shape[0][-1]
+                   ) if len(self.hidden_units) == 0 else self.hidden_units[-1]
         self.kernel = self.add_weight(shape=(size, 1),
                                       initializer=glorot_normal(
                                           seed=self.seed),
@@ -78,9 +78,6 @@ def build(self, input_shape):
                                     shape=(1,), initializer=Zeros(), name="bias")
         self.dnn = DNN(self.hidden_units, self.activation, self.l2_reg, self.dropout_rate, self.use_bn, seed=self.seed)
 
-        self.dense = tf.keras.layers.Lambda(lambda x: tf.nn.bias_add(tf.tensordot(
-            x[0], x[1], axes=(-1, 0)), x[2]))
-
         super(LocalActivationUnit, self).build(
             input_shape)  # Be sure to call this somewhere!
 
@@ -96,7 +93,7 @@ def call(self, inputs, training=None, **kwargs):
 
         att_out = self.dnn(att_input, training=training)
 
-        attention_score = self.dense([att_out, self.kernel, self.bias])
+        attention_score = tf.nn.bias_add(tf.tensordot(att_out, self.kernel, axes=(-1, 0)), self.bias)
 
         return attention_score
 
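
The `LocalActivationUnit` change above removes the `Lambda`-wrapped projection and applies the same operation directly in `call`. A self-contained sketch of that operation, with illustrative tensors and shapes (not taken from the commit):

```python
import tensorflow as tf

# att_out: DNN output of shape (batch, T, size); kernel: (size, 1); bias: (1,)
att_out = tf.random.normal([2, 5, 8])
kernel = tf.random.normal([8, 1])
bias = tf.zeros([1])

# tensordot contracts the last axis of att_out with the first axis of kernel,
# yielding shape (batch, T, 1); bias_add then adds the per-unit bias.
attention_score = tf.nn.bias_add(tf.tensordot(att_out, kernel, axes=(-1, 0)), bias)
print(attention_score.shape)  # (2, 5, 1)
```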

deepctr/layers/sequence.py

Lines changed: 6 additions & 12 deletions
@@ -560,10 +560,10 @@ def call(self, inputs, mask=None, training=None, **kwargs):
             if self.blinding:
                 try:
                     outputs = tf.matrix_set_diag(outputs, tf.ones_like(outputs)[
-                        :, :, 0] * (-2 ** 32 + 1))
-                except:
+                                                 :, :, 0] * (-2 ** 32 + 1))
+                except AttributeError:
                     outputs = tf.compat.v1.matrix_set_diag(outputs, tf.ones_like(outputs)[
-                        :, :, 0] * (-2 ** 32 + 1))
+                                                           :, :, 0] * (-2 ** 32 + 1))
 
             outputs -= reduce_max(outputs, axis=-1, keep_dims=True)
             outputs = softmax(outputs)
@@ -633,14 +633,14 @@ def build(self, input_shape):
         _, T, num_units = input_shape.as_list()  # inputs.get_shape().as_list()
         # First part of the PE function: sin and cos argument
         position_enc = np.array([
-            [pos / np.power(10000, 2. * i / num_units)
-             for i in range(num_units)]
+            [pos / np.power(10000, 2. * (i // 2) / num_units) for i in range(num_units)]
             for pos in range(T)])
 
         # Second part, apply the cosine to even columns and sin to odds.
         position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
         position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1
-
+        if self.zero_pad:
+            position_enc[0, :] = np.zeros(num_units)
         self.lookup_table = self.add_weight("lookup_table", (T, num_units),
                                             initializer=tf.initializers.identity(position_enc),
                                             trainable=self.pos_embedding_trainable)
@@ -651,13 +651,7 @@ def build(self, input_shape):
     def call(self, inputs, mask=None):
         _, T, num_units = inputs.get_shape().as_list()
         position_ind = tf.expand_dims(tf.range(T), 0)
-
-        if self.zero_pad:
-            self.lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
-                                           self.lookup_table[1:, :]), 0)
-
         outputs = tf.nn.embedding_lookup(self.lookup_table, position_ind)
-
         if self.scale:
             outputs = outputs * num_units ** 0.5
         return outputs + inputs
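
The `build` change above corrects the sinusoidal position-encoding argument: the frequency exponent now uses `i // 2`, so each sin/cos column pair shares one wavelength, matching PE(pos, 2i) = sin(pos / 10000^(2i/d)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d)). A minimal NumPy sketch with illustrative sizes:

```python
import numpy as np

T, num_units = 4, 6  # illustrative sequence length and embedding size
position_enc = np.array([
    [pos / np.power(10000, 2. * (i // 2) / num_units) for i in range(num_units)]
    for pos in range(T)])
position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # even columns: sin
position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # odd columns: cos
print(position_enc.shape)  # (4, 6)
```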

deepctr/layers/utils.py

Lines changed: 51 additions & 6 deletions
@@ -7,6 +7,11 @@
 """
 import tensorflow as tf
 from tensorflow.python.keras.layers import Flatten
+from tensorflow.python.ops.lookup_ops import TextFileInitializer
+try:
+    from tensorflow.python.ops.lookup_ops import StaticHashTable
+except ImportError as e:
+    from tensorflow.python.ops.lookup_ops import HashTable as StaticHashTable
 
 
 class NoMask(tf.keras.layers.Layer):
@@ -25,14 +30,47 @@ def compute_mask(self, inputs, mask):
 
 
 class Hash(tf.keras.layers.Layer):
-    """
-    hash the input to [0,num_buckets)
-    if mask_zero = True,0 or 0.0 will be set to 0,other value will be set in range[1,num_buckets)
+    """Looks up keys in a table when `vocabulary_path` is set, and outputs the corresponding values.
+    If `vocabulary_path` is not set, `Hash` hashes the input to [0,num_buckets). When `mask_zero` = True,
+    input value `0` or `0.0` will be set to `0`, and other values will fall in the range [1,num_buckets).
+
+    The following snippet initializes a `Hash` with a `vocabulary_path` file whose first column holds the
+    values and whose second column holds the keys:
+
+    * `1,emerson`
+    * `2,lake`
+    * `3,palmer`
+
+    >>> hash = Hash(
+    ...     num_buckets=3+1,
+    ...     vocabulary_path=filename,
+    ...     default_value=0)
+    >>> hash(tf.constant('lake')).numpy()
+    2
+    >>> hash(tf.constant('lakeemerson')).numpy()
+    0
+
+    Args:
+        num_buckets: An `int` that is >= 1. The number of buckets, or the vocabulary size + 1
+            when `vocabulary_path` is set.
+        mask_zero: default is False. When `mask_zero` is `True`, `Hash` maps input `0` or `0.0`
+            to value `0`. `mask_zero` is not used when `vocabulary_path` is set.
+        vocabulary_path: default `None`. The `CSV` text file path of the vocabulary table, which contains
+            two columns separated by a comma: the first column is the value and the second is
+            the key. The key data type is `string`, the value data type is `int`. The path must
+            be accessible from wherever `Hash` is initialized.
+        default_value: default '0'. The default value if a key is missing in the table.
+        **kwargs: Additional keyword arguments.
     """
 
-    def __init__(self, num_buckets, mask_zero=False, **kwargs):
+    def __init__(self, num_buckets, mask_zero=False, vocabulary_path=None, default_value=0, **kwargs):
         self.num_buckets = num_buckets
         self.mask_zero = mask_zero
+        self.vocabulary_path = vocabulary_path
+        self.default_value = default_value
+        if self.vocabulary_path:
+            initializer = TextFileInitializer(vocabulary_path, 'string', 1, 'int64', 0, delimiter=',')
+            self.hash_table = StaticHashTable(initializer, default_value=self.default_value)
         super(Hash, self).__init__(**kwargs)
 
     def build(self, input_shape):
@@ -41,13 +79,16 @@ def build(self, input_shape):
 
     def call(self, x, mask=None, **kwargs):
 
-
         if x.dtype != tf.string:
             zero = tf.as_string(tf.zeros([1], dtype=x.dtype))
             x = tf.as_string(x, )
         else:
             zero = tf.as_string(tf.zeros([1], dtype='int32'))
 
+        if self.vocabulary_path:
+            hash_x = self.hash_table.lookup(x)
+            return hash_x
+
         num_buckets = self.num_buckets if not self.mask_zero else self.num_buckets - 1
         try:
             hash_x = tf.string_to_hash_bucket_fast(x, num_buckets,
@@ -60,8 +101,12 @@ def call(self, x, mask=None, **kwargs):
             hash_x = (hash_x + 1) * mask
 
         return hash_x
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
     def get_config(self, ):
-        config = {'num_buckets': self.num_buckets, 'mask_zero': self.mask_zero, }
+        config = {'num_buckets': self.num_buckets, 'mask_zero': self.mask_zero, 'vocabulary_path': self.vocabulary_path, 'default_value': self.default_value}
         base_config = super(Hash, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
 
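
A minimal usage sketch of the new lookup path (not part of the commit; assumes TF 2.x eager execution, and the file name is illustrative):

```python
import tensorflow as tf
from deepctr.layers.utils import Hash

# Vocabulary file: one "value,key" pair per line; value 0 is the miss default.
with open('vocab.csv', 'w') as f:
    f.write('1,emerson\n2,lake\n3,palmer\n')

hash_layer = Hash(num_buckets=3 + 1, vocabulary_path='vocab.csv', default_value=0)
print(hash_layer(tf.constant(['lake', 'palmer', 'unknown'])).numpy())  # [2 3 0]
```

Under TF 1.x graph mode, the table additionally needs `sess.run(tf.tables_initializer())` before training, as the Examples.md snippet below shows.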

docs/source/Examples.md

Lines changed: 66 additions & 0 deletions
@@ -322,6 +322,72 @@ if __name__ == "__main__":
     history = model.fit(model_input, data[target].values,
                         batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
 ```
+## Hash Layer with pre-defined key-value vocabulary
+
+This example shows how to use a pre-defined key-value vocabulary in the `Hash` layer. `movielens_age_vocabulary.csv` stores the key-value mapping for the `age` feature.
+
+```python
+from deepctr.models import DeepFM
+from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
+import numpy as np
+import pandas as pd
+from tensorflow.python.keras.preprocessing.sequence import pad_sequences
+
+try:
+    import tensorflow.compat.v1 as tf
+except ImportError as e:
+    import tensorflow as tf
+
+if __name__ == "__main__":
+    data = pd.read_csv("./movielens_sample.txt")
+    sparse_features = ["movie_id", "user_id",
+                       "gender", "age", "occupation", "zip", ]
+
+    data[sparse_features] = data[sparse_features].astype(str)
+    target = ['rating']
+
+    # 1. Use hashing encoding on the fly for sparse features, and process sequence features
+
+    genres_list = list(map(lambda x: x.split('|'), data['genres'].values))
+    genres_length = np.array(list(map(len, genres_list)))
+    max_len = max(genres_length)
+
+    # Notice : padding=`post`
+    genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=str, value=0)
+
+    # 2. Set hashing space for each sparse field and generate feature config for sequence feature
+
+    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5, embedding_dim=4, use_hash=True,
+                                         vocabulary_path='./movielens_age_vocabulary.csv' if feat == 'age' else None,
+                                         dtype='string')
+                              for feat in sparse_features]
+    varlen_feature_columns = [
+        VarLenSparseFeat(SparseFeat('genres', vocabulary_size=100, embedding_dim=4,
+                                    use_hash=True, dtype="string"),
+                         maxlen=max_len, combiner='mean',
+                         )]  # Notice : value 0 is for padding for sequence input feature
+    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
+    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
+    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
+
+    # 3. Generate input data for model
+    model_input = {name: data[name] for name in feature_names}
+    model_input['genres'] = genres_list
+
+    # 4. Define model, compile and train
+    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
+    model.compile("adam", "mse", metrics=['mse'], )
+    if not hasattr(tf, 'version') or tf.version.VERSION < '2.0.0':
+        with tf.Session() as sess:
+            sess.run(tf.tables_initializer())
+            history = model.fit(model_input, data[target].values,
+                                batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
+    else:
+        history = model.fit(model_input, data[target].values,
+                            batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
+
+```
 
 ## Estimator with TFRecord: Classification Criteo
 

docs/source/Features.md

Lines changed: 4 additions & 3 deletions
@@ -23,12 +23,13 @@ DNN based CTR prediction models usually have following 4 modules:
 
 ## Feature Columns
 ### SparseFeat
-``SparseFeat`` is a namedtuple with signature ``SparseFeat(name, vocabulary_size, embedding_dim, use_hash, dtype, embeddings_initializer, embedding_name, group_name, trainable)``
+``SparseFeat`` is a namedtuple with signature ``SparseFeat(name, vocabulary_size, embedding_dim, use_hash, vocabulary_path, dtype, embeddings_initializer, embedding_name, group_name, trainable)``
 
 - name : feature name
-- vocabulary_size : number of unique feature values for sprase feature or hashing space when `use_hash=True`
+- vocabulary_size : number of unique feature values for sparse feature, or hashing space when `use_hash=True`
 - embedding_dim : embedding dimension
-- use_hash : defualt `False`.If `True` the input will be hashed to space of size `vocabulary_size`.
+- use_hash : default `False`. If `True`, the input will be hashed to a space of size `vocabulary_size`.
+- vocabulary_path : default `None`. The `CSV` text file path of the vocabulary table used by `tf.lookup.TextFileInitializer`, which assigns one entry in the table for each line in the file. One entry contains two columns separated by a comma: the first is the value column, the second is the key column. The value `0` is reserved for keys missing from the table, so hash values need to start from `1`.
 - dtype : default `int32`.dtype of input tensor.
 - embeddings_initializer : initializer for the `embeddings` matrix.
 - embedding_name : default `None`. If None, the embedding_name will be same as `name`.
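
For reference, a hedged sketch of the lookup semantics the `vocabulary_path` doc above relies on (the file path is illustrative; `tf.lookup.TextFileInitializer` and `tf.lookup.StaticHashTable` are public TensorFlow APIs):

```python
import tensorflow as tf

# Each CSV line "value,key" becomes one table entry; absent keys map to 0.
initializer = tf.lookup.TextFileInitializer(
    'age_vocab.csv',                      # illustrative path
    key_dtype=tf.string, key_index=1,     # key   = second column
    value_dtype=tf.int64, value_index=0,  # value = first column
    delimiter=',')
table = tf.lookup.StaticHashTable(initializer, default_value=0)
print(table.lookup(tf.constant('18')).numpy())  # mapped value, or 0 if missing
```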

docs/source/History.md

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 # History
+- 07/18/2021 : [v0.8.7](https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.7) released. Support pre-defined key-value vocabulary in `Hash` Layer. [example](./Examples.html#hash-layer-with-pre-defined-key-value-vocabulary)
 - 06/14/2021 : [v0.8.6](https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.6) released.Add [IFM](./Features.html#ifm-input-aware-factorization-machine) [DIFM](./Features.html#difm-dual-input-aware-factorization-machine), [FEFM and DeepFEFM](./Features.html#deepfefm-deep-field-embedded-factorization-machine) model.
 - 03/13/2021 : [v0.8.5](https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.5) released.Add [BST](./Features.html#bst-behavior-sequence-transformer) model.
 - 02/12/2021 : [v0.8.4](https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.4) released.Fix bug in DCN-Mix.
