
Commit e50188f

gagika, yeqingli, and tensorflower-gardener authored
Merging DLRM model code into 2.5 branch. (#10036)
* Adds trainer and checkpoint exporter as the arguments of the run_experiment functions. PiperOrigin-RevId: 368778443
* DLRM and DCN v2 ranking models. PiperOrigin-RevId: 375529985
* Updating READMEs for DLRM Model. PiperOrigin-RevId: 375729667
* Removing tensorflow recommenders library from requirement.txt. PiperOrigin-RevId: 375826079

Co-authored-by: Yeqing Li <[email protected]>
Co-authored-by: A. Unique TensorFlower <[email protected]>
1 parent 1cb16f2 commit e50188f

17 files changed: +1558 -22 lines changed


official/README-TPU.md

Lines changed: 3 additions & 0 deletions
```diff
@@ -26,4 +26,7 @@
 * [shapemask](vision/detection): An object detection and instance segmentation model using shape priors. See [Tensorboard.dev training metrics](https://tensorboard.dev/experiment/ZbXgVoc6Rf6mBRlPj0JpLA).

 ## Recommendation
+* [dlrm](recommendation/ranking): [Deep Learning Recommendation Model for
+  Personalization and Recommendation Systems](https://arxiv.org/abs/1906.00091).
+* [dcn v2](recommendation/ranking): [Improved Deep & Cross Network and Practical Lessons for Web-scale Learning to Rank Systems](https://arxiv.org/abs/2008.13535).
 * [ncf](recommendation): Neural Collaborative Filtering. See [Tensorboard.dev training metrics](https://tensorboard.dev/experiment/0k3gKjZlR1ewkVTRyLB6IQ).
```

official/README.md

Lines changed: 5 additions & 3 deletions
```diff
@@ -64,9 +64,11 @@ In the near future, we will add:

 ### Recommendation

-| Model | Reference (Paper) |
-|-------|-------------------|
-| [NCF](recommendation) | [Neural Collaborative Filtering](https://arxiv.org/abs/1708.05031) |
+Model                            | Reference (Paper)
+-------------------------------- | -----------------
+[DLRM](recommendation/ranking)   | [Deep Learning Recommendation Model for Personalization and Recommendation Systems](https://arxiv.org/abs/1906.00091)
+[DCN v2](recommendation/ranking) | [Improved Deep & Cross Network and Practical Lessons for Web-scale Learning to Rank Systems](https://arxiv.org/abs/2008.13535)
+[NCF](recommendation)            | [Neural Collaborative Filtering](https://arxiv.org/abs/1708.05031)

 ## How to get started with the official models
```
official/core/train_lib.py

Lines changed: 22 additions & 17 deletions
```diff
@@ -15,29 +15,31 @@
 """TFM common training driver library."""
 # pytype: disable=attribute-error
 import os
-from typing import Any, Mapping, Tuple
+from typing import Any, Mapping, Tuple, Optional

 # Import libraries
 from absl import logging
 import orbit
 import tensorflow as tf

 from official.core import base_task
+from official.core import base_trainer
 from official.core import config_definitions
 from official.core import train_utils

-BestCheckpointExporter = train_utils.BestCheckpointExporter
 maybe_create_best_ckpt_exporter = train_utils.maybe_create_best_ckpt_exporter


-def run_experiment(distribution_strategy: tf.distribute.Strategy,
-                   task: base_task.Task,
-                   mode: str,
-                   params: config_definitions.ExperimentConfig,
-                   model_dir: str,
-                   run_post_eval: bool = False,
-                   save_summary: bool = True) \
-    -> Tuple[tf.keras.Model, Mapping[str, Any]]:
+def run_experiment(
+    distribution_strategy: tf.distribute.Strategy,
+    task: base_task.Task,
+    mode: str,
+    params: config_definitions.ExperimentConfig,
+    model_dir: str,
+    run_post_eval: bool = False,
+    save_summary: bool = True,
+    trainer: Optional[base_trainer.Trainer] = None
+) -> Tuple[tf.keras.Model, Mapping[str, Any]]:
   """Runs train/eval configured by the experiment params.

   Args:
@@ -50,6 +52,8 @@ def run_experiment(distribution_strategy: tf.distribute.Strategy,
     run_post_eval: Whether to run post eval once after training, metrics logs
       are returned.
     save_summary: Whether to save train and validation summary.
+    trainer: the base_trainer.Trainer instance. It should be created within the
+      strategy.scope().

   Returns:
     A 2-tuple of (model, eval_logs).
@@ -59,13 +63,14 @@ def run_experiment(distribution_strategy: tf.distribute.Strategy,
   """

   with distribution_strategy.scope():
-    trainer = train_utils.create_trainer(
-        params,
-        task,
-        train='train' in mode,
-        evaluate=('eval' in mode) or run_post_eval,
-        checkpoint_exporter=maybe_create_best_ckpt_exporter(
-            params, model_dir))
+    if not trainer:
+      trainer = train_utils.create_trainer(
+          params,
+          task,
+          train='train' in mode,
+          evaluate=('eval' in mode) or run_post_eval,
+          checkpoint_exporter=maybe_create_best_ckpt_exporter(
+              params, model_dir))

     if trainer.checkpoint:
       checkpoint_manager = tf.train.CheckpointManager(
```
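For context, here is a minimal sketch of how the new `trainer` argument might be exercised. `exp_config` and `task` stand in for an already-built `ExperimentConfig` and `base_task.Task`; this snippet is illustrative and not part of the commit:

```python
import tensorflow as tf

from official.core import train_lib
from official.core import train_utils

# Assumed to exist: `exp_config` (an ExperimentConfig) and `task` (a Task),
# constructed elsewhere, e.g. via the experiment/task factories.
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
  # Per the new docstring, a custom trainer must be created inside the
  # strategy scope before being handed to run_experiment().
  trainer = train_utils.create_trainer(
      exp_config,
      task,
      train=True,
      evaluate=True,
      checkpoint_exporter=None)

model, eval_logs = train_lib.run_experiment(
    distribution_strategy=strategy,
    task=task,
    mode='train_and_eval',
    params=exp_config,
    model_dir='/tmp/my_model_dir',
    trainer=trainer)  # if omitted, run_experiment builds a default trainer
```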
official/recommendation/ranking/README.md

Lines changed: 164 additions & 0 deletions
# TF Model Garden Ranking Models

## Overview
This is an implementation of the [DLRM](https://arxiv.org/abs/1906.00091) and
[DCN v2](https://arxiv.org/abs/2008.13535) ranking models, which can be used for
tasks such as CTR prediction.

The model inputs are numerical and categorical features, and the output is a
scalar (for example, a click probability).
The models can be trained and evaluated on GPU, TPU, and CPU. Deep ranking
models are both memory intensive (for the embedding tables and lookups) and
compute intensive (for the deep MLP networks). CPUs are best suited for large
sparse embedding lookups, GPUs for fast compute, and TPUs are designed for both.
When training on TPUs we use the
[TPUEmbedding layer](https://github.com/tensorflow/recommenders/blob/main/tensorflow_recommenders/layers/embedding/tpu_embedding_layer.py)
for categorical features. TPU embedding supports large embedding tables with
fast lookup; the size of the embedding tables scales linearly with the size of
the TPU pod. We can have up to 90 GB of embedding tables for TPU v3-8, 5.6 TB
for v3-512, and 22.4 TB for TPU Pod v3-2048.

The model code is in the
[TensorFlow Recommenders](https://github.com/tensorflow/recommenders/tree/main/tensorflow_recommenders/experimental/models)
library, while the input pipeline, configuration, and training loop are here.
## Prerequisites
To get started, download the code from the TensorFlow models GitHub repository
or use the pre-installed Google Cloud VM.

```bash
git clone https://github.com/tensorflow/models.git
export PYTHONPATH=$PYTHONPATH:$(pwd)/models
```

We also need to install the
[TensorFlow Recommenders](https://www.tensorflow.org/recommenders) library.
If you are using [tf-nightly](https://pypi.org/project/tf-nightly/), make
sure to install
[tensorflow-recommenders](https://pypi.org/project/tensorflow-recommenders/)
without its dependencies by passing the `--no-deps` argument.

For tf-nightly:
```bash
pip install tensorflow-recommenders --no-deps
```

For stable TensorFlow 2.4+ [releases](https://pypi.org/project/tensorflow/):
```bash
pip install tensorflow-recommenders
```
## Dataset

The models can be trained on various datasets; two commonly used ones are the
[Criteo Terabyte](https://labs.criteo.com/2013/12/download-terabyte-click-logs/)
and [Criteo Kaggle](https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/)
datasets.
We can also train on synthetic data by setting the flag
`use_synthetic_data=True`.

### Download

The dataset is the Terabyte click logs dataset provided by Criteo. Follow the
[instructions](https://labs.criteo.com/2013/12/download-terabyte-click-logs/) at
the Criteo website to download the data.

Note that the dataset is large (~1TB).
### Preprocess the data

Data preprocessing steps are summarized below.

Integer feature processing steps, applied sequentially:

1.  Missing values are replaced with zeros.
2.  Negative values are replaced with zeros.
3.  Integer features are transformed by log(x+1) and are hence tf.float32.

Categorical features:

1.  Categorical data is bucketized to tf.int32.
2.  Optionally, the resulting integers are hashed to a lower dimensionality.
    This is necessary to reduce the sizes of the large tables. A simple hashing
    function such as modulus will suffice, e.g. `feature_value % MAX_INDEX`
    (see the sketch after this list).
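For concreteness, a minimal sketch of these transformations in Python. It assumes the raw values have already been read from the Criteo logs (whose categorical values are hex strings), and `MAX_INDEX` is an illustrative bucket count, not a value prescribed by this README:

```python
import math

MAX_INDEX = 5_000_000  # illustrative hash-bucket count, not an official value


def transform_integer_feature(raw: str) -> float:
  """Steps 1-3: fill missing with 0, clip negatives, then log(x + 1)."""
  x = int(raw) if raw else 0  # step 1: missing value -> 0
  x = max(x, 0)               # step 2: negative value -> 0
  return math.log(x + 1)      # step 3: log(x + 1), stored as a float


def transform_categorical_feature(raw: str) -> int:
  """Bucketizes a raw categorical value, then hashes it into [0, MAX_INDEX)."""
  bucket = int(raw, 16) if raw else 0  # raw Criteo categoricals are hex strings
  return bucket % MAX_INDEX            # optional modulus hashing (step 2)
```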
The vocabulary sizes resulting from pre-processing are passed to the model
trainer via the `model.vocab_sizes` config.

The full dataset is composed of 24 directories. Partition the data into training
and eval sets, for example days 1-23 for training and day 24 for evaluation.

Training and eval datasets are expected to be saved in many tab-separated values
(TSV) files in the following format: numerical features, then categorical
features, then the label.

On each row of a TSV file, the first `num_dense_features` values are numerical
features, the following values are the categorical features (one per entry of
`vocab_sizes`), and the last value is the label (either 0 or 1). The i-th
categorical feature is expected to be an integer in the range
`[0, vocab_sizes[i])`, as illustrated by the sketch below.
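A hedged sketch of an input parser that follows this row layout (the constants and filename are placeholders; the actual input pipeline ships with this directory's data loader, not shown in this commit):

```python
import tensorflow as tf

NUM_DENSE = 13   # placeholder; must match model.num_dense_features below
NUM_SPARSE = 26  # placeholder; must match len(model.vocab_sizes) below


def parse_tsv_line(line: tf.Tensor):
  """Splits one TSV row into (dense, sparse, label) per the layout above."""
  fields = tf.strings.split(line, '\t')
  dense = tf.strings.to_number(fields[:NUM_DENSE], tf.float32)
  sparse = tf.strings.to_number(
      fields[NUM_DENSE:NUM_DENSE + NUM_SPARSE], tf.int32)
  label = tf.strings.to_number(fields[-1], tf.float32)
  return (dense, sparse), label


dataset = tf.data.TextLineDataset('day_24.tsv').map(parse_tsv_line)
```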
## Train and Evaluate

To train the DLRM model we use the dot-product feature interaction, i.e.
`interaction: 'dot'`; to train the DCN v2 model we use `interaction: 'cross'`
(see the sketch below).
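As a rough sketch of what the two interaction types compute (the production implementations live in the TensorFlow Recommenders library; this simplified version is for illustration only):

```python
import tensorflow as tf


def dot_interaction(features):
  """DLRM-style 'dot': pairwise dot products between feature embeddings.

  `features` is a list of [batch, emb_dim] tensors (the bottom-MLP output
  plus one embedding per categorical feature).
  """
  x = tf.stack(features, axis=1)           # [batch, F, emb_dim]
  xx = tf.matmul(x, x, transpose_b=True)   # [batch, F, F] pairwise dots
  f = len(features)
  ones = tf.ones((f, f))
  # Keep the strictly lower triangle: drops self- and duplicate interactions.
  mask = tf.cast(
      tf.linalg.band_part(ones, -1, 0) - tf.linalg.band_part(ones, 0, 0),
      tf.bool)
  return tf.boolean_mask(xx, mask, axis=1)  # [batch, F * (F - 1) / 2]


class CrossLayer(tf.keras.layers.Layer):
  """DCN v2-style 'cross' layer: x_{l+1} = x_0 * (W x_l + b) + x_l."""

  def build(self, input_shape):
    self.dense = tf.keras.layers.Dense(input_shape[-1])

  def call(self, x0, x):
    return x0 * self.dense(x) + x
```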
### Training on TPU

```shell
export TPU_NAME=my-dlrm-tpu
export EXPERIMENT_NAME=my_experiment_name
export BUCKET_NAME="gs://my_dlrm_bucket"
export DATA_DIR="${BUCKET_NAME}/data"

python3 models/official/recommendation/ranking/train.py --mode=train_and_eval \
--model_dir=${BUCKET_NAME}/model_dirs/${EXPERIMENT_NAME} --params_override="
runtime:
  distribution_strategy: 'tpu'
task:
  use_synthetic_data: false
  train_data:
    input_path: '${DATA_DIR}/train/*'
    global_batch_size: 16384
  validation_data:
    input_path: '${DATA_DIR}/eval/*'
    global_batch_size: 16384
  model:
    num_dense_features: 13
    bottom_mlp: [512,256,128]
    embedding_dim: 128
    top_mlp: [1024,1024,512,256,1]
    interaction: 'dot'
    vocab_sizes: [39884406, 39043, 17289, 7420, 20263, 3, 7120, 1543, 63,
                  38532951, 2953546, 403346, 10, 2208, 11938, 155, 4, 976, 14,
                  39979771, 25641295, 39664984, 585935, 12972, 108, 36]
trainer:
  use_orbit: true
  validation_interval: 90000
  checkpoint_interval: 100000
  validation_steps: 5440
  train_steps: 256054
  steps_per_loop: 1000
"
```
The data directory should have two subdirectories:

* $DATA_DIR/train
* $DATA_DIR/eval
### Training on GPU

Training on GPUs is similar to TPU training. Only the distribution strategy
needs to be updated and the number of GPUs provided (here, 4 GPUs):

```shell
python3 official/recommendation/ranking/train.py --mode=train_and_eval \
--model_dir=${BUCKET_NAME}/model_dirs/${EXPERIMENT_NAME} --params_override="
runtime:
  distribution_strategy: 'mirrored'
  num_gpus: 4
...
"
```
Lines changed: 14 additions & 0 deletions

```python
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```
Lines changed: 113 additions & 0 deletions

```python
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Flags and common definitions for Ranking Models."""

from absl import flags
import tensorflow as tf

from official.common import flags as tfm_flags

FLAGS = flags.FLAGS


def define_flags() -> None:
  """Defines flags for training the Ranking model."""
  tfm_flags.define_flags()

  FLAGS.set_default(name='experiment', value='dlrm_criteo')
  FLAGS.set_default(name='mode', value='train_and_eval')

  flags.DEFINE_integer(
      name='seed',
      default=None,
      help='This value will be used to seed both NumPy and TensorFlow.')
  flags.DEFINE_string(
      name='profile_steps',
      default='20,40',
      help='Save profiling data to model dir at given range of global steps. '
      'The value must be a comma separated pair of positive integers, '
      'specifying the first and last step to profile. For example, '
      '"--profile_steps=2,4" triggers the profiler to process 3 steps, starting'
      ' from the 2nd step. Note that profiler has a non-trivial performance '
      'overhead, and the output file can be gigantic if profiling many steps.')


@tf.keras.utils.register_keras_serializable(package='RANKING')
class WarmUpAndPolyDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning rate callable for the embeddings.

  Linear warmup on [0, warmup_steps], then
  constant on [warmup_steps, decay_start_steps],
  and polynomial decay on [decay_start_steps, decay_start_steps + decay_steps].
  """

  def __init__(self,
               batch_size: int,
               decay_exp: float = 2.0,
               learning_rate: float = 40.0,
               warmup_steps: int = 8000,
               decay_steps: int = 12000,
               decay_start_steps: int = 10000):
    super(WarmUpAndPolyDecay, self).__init__()
    self.batch_size = batch_size
    self.decay_exp = decay_exp
    self.learning_rate = learning_rate
    self.warmup_steps = warmup_steps
    self.decay_steps = decay_steps
    self.decay_start_steps = decay_start_steps

  def __call__(self, step):
    decay_exp = self.decay_exp
    learning_rate = self.learning_rate
    warmup_steps = self.warmup_steps
    decay_steps = self.decay_steps
    decay_start_steps = self.decay_start_steps

    # The base learning rate is scaled linearly with the global batch size,
    # relative to a reference batch size of 2048.
    scal = self.batch_size / 2048

    adj_lr = learning_rate * scal
    if warmup_steps == 0:
      return adj_lr

    warmup_lr = step / warmup_steps * adj_lr
    global_step = tf.cast(step, tf.float32)
    decay_steps = tf.cast(decay_steps, tf.float32)
    decay_start_step = tf.cast(decay_start_steps, tf.float32)
    warmup_lr = tf.cast(warmup_lr, tf.float32)

    steps_since_decay_start = global_step - decay_start_step
    already_decayed_steps = tf.minimum(steps_since_decay_start, decay_steps)
    decay_lr = adj_lr * (
        (decay_steps - already_decayed_steps) / decay_steps)**decay_exp
    decay_lr = tf.maximum(0.0001, decay_lr)

    # Piecewise schedule: warmup, then constant, then polynomial decay,
    # clipped below at 0.01.
    lr = tf.where(
        global_step < warmup_steps, warmup_lr,
        tf.where(
            tf.logical_and(decay_steps > 0, global_step > decay_start_step),
            decay_lr, adj_lr))

    lr = tf.maximum(0.01, lr)
    return lr

  def get_config(self):
    return {
        'batch_size': self.batch_size,
        'decay_exp': self.decay_exp,
        'learning_rate': self.learning_rate,
        'warmup_steps': self.warmup_steps,
        'decay_steps': self.decay_steps,
        'decay_start_steps': self.decay_start_steps
    }
```
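A hedged usage sketch tying these pieces together (the `main` function and hyperparameter values are illustrative; the real entry point is this directory's `train.py` driver, which wires the schedule through the experiment config):

```python
import numpy as np
import tensorflow as tf
from absl import app


def main(_):
  # Hypothetical consumption of the flags defined above.
  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)
    tf.random.set_seed(FLAGS.seed)
  first_step, last_step = (int(s) for s in FLAGS.profile_steps.split(','))
  # first_step/last_step would be handed to the profiler (not shown here).

  # The schedule is a standard LearningRateSchedule, so it can be passed
  # directly to a Keras optimizer; values here mirror the defaults.
  schedule = WarmUpAndPolyDecay(
      batch_size=16384,
      learning_rate=40.0,
      warmup_steps=8000,
      decay_steps=12000,
      decay_start_steps=10000)
  optimizer = tf.keras.optimizers.SGD(learning_rate=schedule)
  print('LR at step 9000:', float(schedule(tf.constant(9000.0))))


if __name__ == '__main__':
  define_flags()
  app.run(main)
```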
