|
| 1 | +from deepctr.models import DeepFM |
| 2 | +from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names |
| 3 | +import functools |
| 4 | +import os |
| 5 | +import numpy as np |
| 6 | +import pandas as pd |
| 7 | +import shutil |
| 8 | +from tensorflow.python.keras.preprocessing.sequence import pad_sequences |
| 9 | +try: |
| 10 | + import tensorflow.compat.v1 as tf |
| 11 | +except ImportError as e: |
| 12 | + import tensorflow as tf |
| 13 | + |
| 14 | + |
def init_vocab(df, tmpdir):
    """Build one ``<feature>.csv`` vocabulary file per sparse feature.

    Each file maps a 1-based integer id to a feature value, one ``id,value``
    pair per line with no header.  Id 0 is deliberately left unused: it is
    reserved for hash misses and for the padding mask of sequence features,
    which is also why every returned size is ``len(vocab) + 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw MovieLens frame with columns ``user_id``, ``movie_id``,
        ``genres`` (pipe-separated multi-value), ``gender``, ``age``,
        ``occupation`` and ``zip``.
    tmpdir : str
        Directory the vocabulary CSV files are written into (must exist).

    Returns
    -------
    dict
        Feature name -> vocabulary size (unique values + 1 for reserved id 0).
    """

    def _write_vocab(values, name, blank_to_nan=False):
        # Dedupe/clean/sort one column and write its vocabulary file.
        vocab = values.drop_duplicates()
        if blank_to_nan:
            # Whitespace-only strings count as missing for these columns.
            vocab = vocab.replace(r'^\s*$', np.nan, regex=True)
        vocab = vocab.dropna().sort_values().reset_index(drop=True)
        vocab.index += 1  # ids start at 1; 0 is reserved (see docstring)
        vocab.to_csv(f'{tmpdir}/{name}.csv', sep=',', index=True, header=False)
        return len(vocab) + 1

    vocab_size = {}
    vocab_size['user_id'] = _write_vocab(df.user_id, 'user_id')
    vocab_size['movie_id'] = _write_vocab(df.movie_id, 'movie_id')
    # 'genres' is pipe-separated and multi-valued; flatten to the unique set
    # first.  Unlike the original, the ids are re-sequenced (reset_index) so
    # genre.csv gets sequential 1..n ids consistent with the other files.
    all_genres = set(functools.reduce(lambda a, b: a + b, df.genres.str.split('|')))
    vocab_size['genre'] = _write_vocab(pd.Series(sorted(all_genres)), 'genre')
    vocab_size['gender'] = _write_vocab(df.gender, 'gender', blank_to_nan=True)
    vocab_size['age'] = _write_vocab(df.age, 'age')
    vocab_size['occupation'] = _write_vocab(df.occupation, 'occupation', blank_to_nan=True)
    vocab_size['zip'] = _write_vocab(df.zip, 'zip', blank_to_nan=True)
    return vocab_size
| 67 | + |
| 68 | + |
if __name__ == "__main__":
    # change this to where the movielens dataset and work directory is
    workdir = os.path.dirname(__file__)
    data = pd.read_csv(f"{workdir}/movielens_sample.txt")

    # Vocabulary CSVs for the hash/lookup embeddings are generated into a
    # ./meta scratch directory and removed again after training.
    metadir = f'{workdir}/meta'
    if not os.path.exists(metadir):
        os.mkdir(metadir)
    vocab_size = init_vocab(data, metadir)

    sparse_features = ["movie_id", "user_id",
                       "gender", "age", "occupation", "zip", ]

    # The hashing/vocabulary-lookup path operates on strings, so every sparse
    # column (including the numeric ids) is cast to str up front.
    data[sparse_features] = data[sparse_features].astype(str)
    target = ['rating']

    # 1. Use hashing encoding on the fly for sparse features, and process
    #    the variable-length sequence feature ('genres').

    genres_list = list(map(lambda x: x.split('|'), data['genres'].values))
    genres_length = np.array(list(map(len, genres_list)))
    max_len = max(genres_length)

    # Notice : padding=`post`; pad value 0 matches the id reserved for
    # padding/miss in the vocabulary files written by init_vocab.
    genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=str, value=0)

    # 2. Set hashing space for each sparse field and generate the feature
    #    config for the sequence feature.  vocabulary_path points each
    #    feature at the CSV produced by init_vocab.

    fixlen_feature_columns = [SparseFeat(feat, vocab_size[feat], embedding_dim=4, use_hash=True, vocabulary_path=f'{metadir}/{feat}.csv', dtype='string')
                              for feat in sparse_features]
    varlen_feature_columns = [
        VarLenSparseFeat(SparseFeat('genres', vocabulary_size=vocab_size['genre'], embedding_dim=4, use_hash=True, vocabulary_path=f'{metadir}/genre.csv', dtype="string"),
                         maxlen=max_len, combiner='mean',
                         )]  # Notice : value 0 is for padding for sequence input feature
    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3. Generate input data for the model (one dict entry per feature name;
    #    the padded 2-D genres array overrides the raw column).
    model_input = {name: data[name] for name in feature_names}
    model_input['genres'] = genres_list

    # 4. Define model, compile and train.
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
    model.compile("adam", "mse", metrics=['mse'], )
    # TF1 requires the lookup tables backing vocabulary_path to be
    # initialized explicitly inside a session before fit; TF2 handles this
    # automatically.
    # NOTE(review): this is a lexicographic string compare — correct for
    # 1.x vs 2.x, but a hypothetical '10.x' would sort before '2.0.0';
    # consider a proper version parse if this ever matters.
    if not hasattr(tf, 'version') or tf.version.VERSION < '2.0.0':
        with tf.Session() as sess:
            sess.run(tf.tables_initializer())
            history = model.fit(model_input, data[target].values,
                                batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
    else:
        history = model.fit(model_input, data[target].values,
                            batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
    # Remove the generated vocabulary scratch directory.
    if os.path.exists(metadir):
        shutil.rmtree(metadir)

# %%
0 commit comments