Skip to content

Commit db63fc6

Browse files
Author: 浅梦
Commit message: simplify input logic
1 parent 924f00f · commit db63fc6

33 files changed: +231 additions, −266 deletions

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ jobs:
1818
strategy:
1919
matrix:
2020
python-version: [3.5,3.6,3.7]
21-
tf-version: [1.4.0,1.14.0,2.0.0b1]
21+
tf-version: [1.4.0,1.14.0,2.0.0]
2222

2323
exclude:
2424
- python-version: 3.7

deepctr/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
from . import models
33
from .utils import check_version
44

5-
__version__ = '0.6.1'
5+
__version__ = '0.6.2'
66
check_version(__version__)

deepctr/contrib/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def __init__(self,
8787

8888
else:
8989

90-
total_arg_size += shape[1]#.value
90+
total_arg_size += int(shape[1])#.value
9191

9292
dtype = [a.dtype for a in args][0]
9393

deepctr/inputs.py

Lines changed: 18 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -42,39 +42,31 @@ def __new__(cls, name, dimension, maxlen, combiner="mean", use_hash=False, dtype
4242
embedding_name = name
4343
return super(VarLenSparseFeat, cls).__new__(cls, name, dimension, maxlen, combiner, use_hash, dtype, embedding_name,embedding)
4444

45-
46-
def get_fixlen_feature_names(feature_columns):
47-
features = build_input_features(feature_columns, include_varlen=False,include_fixlen=True)
48-
return list(features.keys())
49-
50-
def get_varlen_feature_names(feature_columns):
51-
features = build_input_features(feature_columns, include_varlen=True,include_fixlen=False)
45+
def get_feature_names(feature_columns):
46+
features = build_input_features(feature_columns)
5247
return list(features.keys())
5348

5449
def get_inputs_list(inputs):
5550
return list(chain(*list(map(lambda x: x.values(), filter(lambda x: x is not None, inputs)))))
5651

57-
def build_input_features(feature_columns, include_varlen=True, mask_zero=True, prefix='',include_fixlen=True):
52+
def build_input_features(feature_columns, mask_zero=True, prefix=''):
5853
input_features = OrderedDict()
59-
if include_fixlen:
60-
for fc in feature_columns:
61-
if isinstance(fc,SparseFeat):
62-
input_features[fc.name] = Input(
63-
shape=(1,), name=prefix+fc.name, dtype=fc.dtype)
64-
elif isinstance(fc,DenseFeat):
65-
input_features[fc.name] = Input(
66-
shape=(fc.dimension,), name=prefix + fc.name, dtype=fc.dtype)
67-
if include_varlen:
68-
for fc in feature_columns:
69-
if isinstance(fc,VarLenSparseFeat):
70-
input_features[fc.name] = Input(shape=(fc.maxlen,), name=prefix + 'seq_' + fc.name,
71-
dtype=fc.dtype)
72-
if not mask_zero:
73-
for fc in feature_columns:
74-
input_features[fc.name+"_seq_length"] = Input(shape=(
54+
for fc in feature_columns:
55+
if isinstance(fc,SparseFeat):
56+
input_features[fc.name] = Input(
57+
shape=(1,), name=prefix+fc.name, dtype=fc.dtype)
58+
elif isinstance(fc,DenseFeat):
59+
input_features[fc.name] = Input(
60+
shape=(fc.dimension,), name=prefix + fc.name, dtype=fc.dtype)
61+
elif isinstance(fc,VarLenSparseFeat):
62+
input_features[fc.name] = Input(shape=(fc.maxlen,), name=prefix + fc.name,
63+
dtype=fc.dtype)
64+
if not mask_zero:
65+
input_features[fc.name + "_seq_length"] = Input(shape=(
7566
1,), name=prefix + 'seq_length_' + fc.name)
76-
input_features[fc.name+"_seq_max_length"] = fc.maxlen
77-
67+
input_features[fc.name + "_seq_max_length"] = fc.maxlen
68+
else:
69+
raise TypeError("Invalid feature column type,got",type(fc))
7870

7971
return input_features
8072

@@ -119,8 +111,6 @@ def create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns,
119111
l2_reg),
120112
name=prefix + '_seq_emb_' + feat.name,
121113
mask_zero=seq_mask_zero)
122-
123-
124114
return sparse_embedding
125115

126116

deepctr/layers/sequence.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,7 @@ def build(self, input_shape):
708708

709709
if self.axis < 1 or self.axis > len(input_shape):
710710
raise ValueError("axis must be 1~%d,now is %d" %
711-
(len(input_shape), len(input_shape)))
711+
(len(input_shape), self.axis))
712712

713713
if self.k < 1 or self.k > input_shape[self.axis]:
714714
raise ValueError("k must be in 1 ~ %d,now k is %d" %

deepctr/models/afm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def AFM(linear_feature_columns, dnn_feature_columns, embedding_size=8, use_atten
3737
"""
3838

3939

40-
features = build_input_features(linear_feature_columns+dnn_feature_columns)
40+
features = build_input_features(linear_feature_columns + dnn_feature_columns)
4141

4242
inputs_list = list(features.values())
4343

deepctr/models/ccpm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def CCPM(linear_feature_columns, dnn_feature_columns, embedding_size=8, conv_ker
4242
raise ValueError(
4343
"conv_kernel_width must have same element with conv_filters")
4444

45-
features = build_input_features(linear_feature_columns+dnn_feature_columns)
45+
features = build_input_features(linear_feature_columns + dnn_feature_columns)
4646
inputs_list = list(features.values())
4747

4848
sparse_embedding_list, _ = input_from_feature_columns(features,dnn_feature_columns,embedding_size,

deepctr/models/fibinet.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,7 @@ def FiBiNET(linear_feature_columns, dnn_feature_columns, embedding_size=8, bilin
3838
:return: A Keras model instance.
3939
"""
4040

41-
features = build_input_features(
42-
linear_feature_columns + dnn_feature_columns)
41+
features = build_input_features(linear_feature_columns + dnn_feature_columns)
4342

4443
inputs_list = list(features.values())
4544

deepctr/models/mlr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def MLR(region_feature_columns, base_feature_columns=None, region_num=4,
4040
if bias_feature_columns is None:
4141
bias_feature_columns = []
4242

43-
features = build_input_features(region_feature_columns + base_feature_columns+bias_feature_columns)
43+
features = build_input_features(region_feature_columns + base_feature_columns + bias_feature_columns)
4444

4545
inputs_list = list(features.values())
4646

docs/source/Examples.md

Lines changed: 76 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ from sklearn.model_selection import train_test_split
3131
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
3232

3333
from deepctr.models import DeepFM
34-
from deepctr.inputs import SparseFeat, DenseFeat,get_fixlen_feature_names
34+
from deepctr.inputs import SparseFeat, DenseFeat, get_feature_names
3535

3636
if __name__ == "__main__":
3737
data = pd.read_csv('./criteo_sample.txt')
@@ -59,14 +59,13 @@ if __name__ == "__main__":
5959
dnn_feature_columns = fixlen_feature_columns
6060
linear_feature_columns = fixlen_feature_columns
6161

62-
fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
62+
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
6363

6464
# 3.generate input data for model
6565

6666
train, test = train_test_split(data, test_size=0.2)
67-
train_model_input = [train[name] for name in fixlen_feature_names]
68-
69-
test_model_input = [test[name] for name in fixlen_feature_names]
67+
train_model_input = {name:train[name] for name in feature_names}
68+
test_model_input = {name:test[name] for name in feature_names}
7069

7170
# 4.Define Model,train,predict and evaluate
7271
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
@@ -91,7 +90,7 @@ from sklearn.model_selection import train_test_split
9190
from sklearn.preprocessing import MinMaxScaler
9291

9392
from deepctr.models import DeepFM
94-
from deepctr.inputs import SparseFeat, DenseFeat,get_fixlen_feature_names
93+
from deepctr.inputs import SparseFeat, DenseFeat,get_feature_names
9594

9695
if __name__ == "__main__":
9796
data = pd.read_csv('./criteo_sample.txt')
@@ -115,14 +114,14 @@ if __name__ == "__main__":
115114

116115
linear_feature_columns = fixlen_feature_columns
117116
dnn_feature_columns = fixlen_feature_columns
118-
fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns, )
117+
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns, )
119118

120119
# 3.generate input data for model
121120

122121
train, test = train_test_split(data, test_size=0.2)
123-
train_model_input = [train[name] for name in fixlen_feature_names]
124122

125-
test_model_input = [test[name] for name in fixlen_feature_names]
123+
train_model_input = {name:train[name] for name in feature_names}
124+
test_model_input = {name:test[name] for name in feature_names}
126125

127126

128127
# 4.Define Model,train,predict and evaluate
@@ -156,7 +155,7 @@ from sklearn.model_selection import train_test_split
156155
from sklearn.preprocessing import LabelEncoder
157156

158157
from deepctr.models import DeepFM
159-
from deepctr.inputs import SparseFeat,get_fixlen_feature_names
158+
from deepctr.inputs import SparseFeat,get_feature_names
160159

161160
if __name__ == "__main__":
162161

@@ -174,12 +173,13 @@ if __name__ == "__main__":
174173
for feat in sparse_features]
175174
linear_feature_columns = fixlen_feature_columns
176175
dnn_feature_columns = fixlen_feature_columns
177-
fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
176+
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
178177

179178
# 3.generate input data for model
180179
train, test = train_test_split(data, test_size=0.2)
181-
train_model_input = [train[name].values for name in fixlen_feature_names]
182-
test_model_input = [test[name].values for name in fixlen_feature_names]
180+
train_model_input = {name:train[name].values for name in feature_names}
181+
test_model_input = {name:test[name].values for name in feature_names}
182+
183183
# 4.Define Model,train,predict and evaluate
184184
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
185185
model.compile("adam", "mse", metrics=['mse'], )
@@ -228,7 +228,7 @@ from sklearn.preprocessing import LabelEncoder
228228
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
229229

230230
from deepctr.models import DeepFM
231-
from deepctr.inputs import SparseFeat, VarLenSparseFeat,get_fixlen_feature_names,get_varlen_feature_names
231+
from deepctr.inputs import SparseFeat, VarLenSparseFeat,get_feature_names
232232

233233

234234
def split(x):
@@ -239,49 +239,49 @@ def split(x):
239239
key2index[key] = len(key2index) + 1
240240
return list(map(lambda x: key2index[x], key_ans))
241241

242+
if __name__ == "__main__":
243+
data = pd.read_csv("./movielens_sample.txt")
244+
sparse_features = ["movie_id", "user_id",
245+
"gender", "age", "occupation", "zip", ]
246+
target = ['rating']
242247

243-
data = pd.read_csv("./movielens_sample.txt")
244-
sparse_features = ["movie_id", "user_id",
245-
"gender", "age", "occupation", "zip", ]
246-
target = ['rating']
248+
# 1.Label Encoding for sparse features,and process sequence features
249+
for feat in sparse_features:
250+
lbe = LabelEncoder()
251+
data[feat] = lbe.fit_transform(data[feat])
252+
# preprocess the sequence feature
247253

248-
# 1.Label Encoding for sparse features,and process sequence features
249-
for feat in sparse_features:
250-
lbe = LabelEncoder()
251-
data[feat] = lbe.fit_transform(data[feat])
252-
# preprocess the sequence feature
254+
key2index = {}
255+
genres_list = list(map(split, data['genres'].values))
256+
genres_length = np.array(list(map(len, genres_list)))
257+
max_len = max(genres_length)
258+
# Notice : padding=`post`
259+
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )
253260

254-
key2index = {}
255-
genres_list = list(map(split, data['genres'].values))
256-
genres_length = np.array(list(map(len, genres_list)))
257-
max_len = max(genres_length)
258-
# Notice : padding=`post`
259-
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )
261+
# 2.count #unique features for each sparse field and generate feature config for sequence feature
260262

261-
# 2.count #unique features for each sparse field and generate feature config for sequence feature
263+
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique())
264+
for feat in sparse_features]
265+
varlen_feature_columns = [VarLenSparseFeat('genres', len(
266+
key2index) + 1, max_len, 'mean')] # Notice : value 0 is for padding for sequence input feature
262267

263-
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique())
264-
for feat in sparse_features]
265-
varlen_feature_columns = [VarLenSparseFeat('genres', len(
266-
key2index) + 1, max_len, 'mean')] # Notice : value 0 is for padding for sequence input feature
268+
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
269+
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
267270

268-
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
269-
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
270-
fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
271-
varlen_feature_names = get_varlen_feature_names(linear_feature_columns+dnn_feature_columns)
271+
feature_names = get_feature_names(linear_feature_columns+dnn_feature_columns)
272272

273273

274-
# 3.generate input data for model
275-
fixlen_input = [data[name].values for name in fixlen_feature_names]
276-
varlen_input = [genres_list]#varlen_feature_names[0]
277-
model_input = fixlen_input + varlen_input # make sure the order is right
274+
# 3.generate input data for model
275+
model_input = {name:data[name] for name in feature_names}#
276+
model_input["genres"] = genres_list
278277

279-
# 4.Define Model,compile and train
280-
model = DeepFM(linear_feature_columns,dnn_feature_columns,task='regression')
281278

282-
model.compile("adam", "mse", metrics=['mse'], )
283-
history = model.fit(model_input, data[target].values,
284-
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
279+
# 4.Define Model,compile and train
280+
model = DeepFM(linear_feature_columns,dnn_feature_columns,task='regression')
281+
282+
model.compile("adam", "mse", metrics=['mse'], )
283+
history = model.fit(model_input, data[target].values,
284+
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
285285

286286
```
287287

@@ -293,44 +293,43 @@ import pandas as pd
293293
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
294294

295295
from deepctr.models import DeepFM
296-
from deepctr.inputs import SparseFeat, VarLenSparseFeat,get_fixlen_feature_names
297-
298-
data = pd.read_csv("./movielens_sample.txt")
299-
sparse_features = ["movie_id", "user_id",
300-
"gender", "age", "occupation", "zip", ]
296+
from deepctr.inputs import SparseFeat, VarLenSparseFeat,get_feature_names
301297

302-
data[sparse_features] = data[sparse_features].astype(str)
303-
target = ['rating']
298+
if __name__ == "__main__":
299+
data = pd.read_csv("./movielens_sample.txt")
300+
sparse_features = ["movie_id", "user_id",
301+
"gender", "age", "occupation", "zip", ]
304302

305-
# 1.Use hashing encoding on the fly for sparse features,and process sequence features
303+
data[sparse_features] = data[sparse_features].astype(str)
304+
target = ['rating']
306305

307-
genres_list = list(map(lambda x: x.split('|'), data['genres'].values))
308-
genres_length = np.array(list(map(len, genres_list)))
309-
max_len = max(genres_length)
306+
# 1.Use hashing encoding on the fly for sparse features,and process sequence features
310307

311-
# Notice : padding=`post`
312-
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=str, value=0)
308+
genres_list = list(map(lambda x: x.split('|'), data['genres'].values))
309+
genres_length = np.array(list(map(len, genres_list)))
310+
max_len = max(genres_length)
313311

314-
# 2.set hashing space for each sparse field and generate feature config for sequence feature
312+
# Notice : padding=`post`
313+
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=str, value=0)
315314

316-
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5, use_hash=True, dtype='string')
317-
for feat in sparse_features]
318-
varlen_feature_columns = [VarLenSparseFeat('genres', 100, max_len, 'mean', use_hash=True,
319-
dtype="string")] # Notice : value 0 is for padding for sequence input feature
320-
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
321-
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
322-
feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
315+
# 2.set hashing space for each sparse field and generate feature config for sequence feature
323316

324-
# 3.generate input data for model
325-
fixlen_input = [data[name].values for name in feature_names]
326-
varlen_input = [genres_list]
317+
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5, use_hash=True, dtype='string')
318+
for feat in sparse_features]
319+
varlen_feature_columns = [VarLenSparseFeat('genres', 100, max_len, 'mean', use_hash=True,
320+
dtype="string")] # Notice : value 0 is for padding for sequence input feature
321+
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
322+
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
323+
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
327324

328-
model_input = fixlen_input + varlen_input # make sure the order is right
325+
# 3.generate input data for model
326+
model_input = {name:data[name] for name in feature_names}
327+
model_input['genres'] = genres_list
329328

330-
# 4.Define Model,compile and train
331-
model = DeepFM(linear_feature_columns,dnn_feature_columns, task='regression')
329+
# 4.Define Model,compile and train
330+
model = DeepFM(linear_feature_columns,dnn_feature_columns, task='regression')
332331

333-
model.compile("adam", "mse", metrics=['mse'], )
334-
history = model.fit(model_input, data[target].values,
335-
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
332+
model.compile("adam", "mse", metrics=['mse'], )
333+
history = model.fit(model_input, data[target].values,
334+
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
336335
```

Comments (0)