3535import tensorflow as tf
3636
3737FLAGS = flags .FLAGS
38- flags .DEFINE_string ("data_dir" , "/tmp" ,
39- "Path to download and store movielens data." )
40- flags .DEFINE_string ("output_dir" , None ,
41- "Path to the directory of output files." )
42- flags .DEFINE_bool ("build_movie_vocab" , True ,
43- "If yes, generate sorted movie vocab." )
44- flags .DEFINE_integer ("min_timeline_length" , 3 ,
45- "The minimum timeline length to construct examples." )
46- flags .DEFINE_integer ("max_context_length" , 10 ,
47- "The maximun length of user context history." )
48-
# Permalinks to download movielens data.
MOVIELENS_1M_URL = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
MOVIELENS_ZIP_FILENAME = "ml-1m.zip"
# SHA-256 checksum of the zip above, used to verify the downloaded archive.
MOVIELENS_ZIP_HASH = "a6898adb50b9ca05aa231689da44c217cb524e7ebd39d264c56e2832f2c54e20"
# Directory name produced when the zip is extracted, and the data files inside it.
MOVIELENS_EXTRACTED_DIR = "ml-1m"
RATINGS_FILE_NAME = "ratings.dat"
MOVIES_FILE_NAME = "movies.dat"
# Movie id reserved for out-of-vocabulary (unknown) movies.
OOV_MOVIE_ID = 0
6151
6252
def define_flags():
  """Registers the command-line flags consumed by this script.

  Called from the __main__ guard before app.run() so that absl parses
  these flags from argv.
  """
  flags.DEFINE_string("data_dir", "/tmp",
                      "Path to download and store movielens data.")
  flags.DEFINE_string("output_dir", None,
                      "Path to the directory of output files.")
  flags.DEFINE_bool("build_movie_vocab", True,
                    "If yes, generate sorted movie vocab.")
  flags.DEFINE_integer("min_timeline_length", 3,
                       "The minimum timeline length to construct examples.")
  # Help text fixed: "maximun" -> "maximum".
  flags.DEFINE_integer("max_context_length", 10,
                       "The maximum length of user context history.")
64+
65+
6366def download_and_extract_data (data_directory , url = MOVIELENS_1M_URL ):
6467 """Download and extract zip containing MovieLens data to a given directory.
6568
@@ -74,6 +77,8 @@ def download_and_extract_data(data_directory, url=MOVIELENS_1M_URL):
7477 path_to_zip = tf .keras .utils .get_file (
7578 fname = MOVIELENS_ZIP_FILENAME ,
7679 origin = url ,
80+ file_hash = MOVIELENS_ZIP_HASH ,
81+ hash_algorithm = "sha256" ,
7782 extract = True ,
7883 cache_dir = data_directory )
7984 extracted_file_dir = os .path .join (
@@ -154,10 +159,13 @@ def generate_examples_from_timelines(timelines,
154159
155160
def write_tfrecords(tf_examples, filename):
  """Writes the given serialized tf examples to `filename`.

  Args:
    tf_examples: iterable of serialized tf.Example bytes.
    filename: destination TFRecord file path.

  Returns:
    The number of examples written.
  """
  written = 0
  with tf.io.TFRecordWriter(filename) as record_writer:
    for serialized_example in tf_examples:
      record_writer.write(serialized_example)
      written += 1
  return written
161169
162170
163171def generate_sorted_movie_vocab (movies_df , movie_counts ):
@@ -176,8 +184,9 @@ def write_vocab_json(vocab_movies, filename):
176184 json .dump (vocab_movies , jsonfile , indent = 2 )
177185
178186
179- def main (_ ):
180- data_dir = FLAGS .data_dir
187+ def generate_datasets (data_dir , output_dir , min_timeline_length ,
188+ max_context_length , build_movie_vocab ):
189+ """Generates train and test datasets as TFRecord, and returns stats."""
181190 if not tf .io .gfile .exists (data_dir ):
182191 tf .io .gfile .makedirs (data_dir )
183192
@@ -186,24 +195,37 @@ def main(_):
186195 timelines , movie_counts = convert_to_timelines (ratings_df )
187196 train_examples , test_examples = generate_examples_from_timelines (
188197 timelines = timelines ,
189- min_timeline_len = FLAGS .min_timeline_length ,
190- max_context_len = FLAGS .max_context_length )
191-
192- if not tf .io .gfile .exists (FLAGS .output_dir ):
193- tf .io .gfile .makedirs (FLAGS .output_dir )
194- write_tfrecords (
195- tf_examples = train_examples ,
196- filename = os .path .join (FLAGS .output_dir , OUTPUT_TRAINING_DATA_FILENAME ))
197- write_tfrecords (
198- tf_examples = test_examples ,
199- filename = os .path .join (FLAGS .output_dir , OUTPUT_TESTING_DATA_FILENAME ))
200- if FLAGS .build_movie_vocab :
198+ min_timeline_len = min_timeline_length ,
199+ max_context_len = max_context_length )
200+
201+ if not tf .io .gfile .exists (output_dir ):
202+ tf .io .gfile .makedirs (output_dir )
203+ train_file = os .path .join (output_dir , OUTPUT_TRAINING_DATA_FILENAME )
204+ train_size = write_tfrecords (tf_examples = train_examples , filename = train_file )
205+ test_file = os .path .join (output_dir , OUTPUT_TESTING_DATA_FILENAME )
206+ test_size = write_tfrecords (tf_examples = test_examples , filename = test_file )
207+ stats = {
208+ "train_size" : train_size ,
209+ "test_size" : test_size ,
210+ "train_file" : train_file ,
211+ "test_file" : test_file ,
212+ }
213+ if build_movie_vocab :
201214 vocab_movies = generate_sorted_movie_vocab (
202215 movies_df = movies_df , movie_counts = movie_counts )
203- write_vocab_json (
204- vocab_movies = vocab_movies ,
205- filename = os .path .join (FLAGS .output_dir , OUTPUT_MOVIE_VOCAB_FILENAME ))
216+ vocab_file = os .path .join (output_dir , OUTPUT_MOVIE_VOCAB_FILENAME )
217+ write_vocab_json (vocab_movies = vocab_movies , filename = vocab_file )
218+ stats .update (vocab_size = len (vocab_movies ), vocab_file = vocab_file )
219+ return stats
220+
221+
def main(_):
  """Entry point: generates the datasets from FLAGS and logs the stats."""
  stats = generate_datasets(
      data_dir=FLAGS.data_dir,
      output_dir=FLAGS.output_dir,
      min_timeline_length=FLAGS.min_timeline_length,
      max_context_length=FLAGS.max_context_length,
      build_movie_vocab=FLAGS.build_movie_vocab,
  )
  tf.compat.v1.logging.info("Generated dataset: %s", stats)
206227
207228
if __name__ == "__main__":
  # Flags must be registered before app.run() parses argv.
  define_flags()
  app.run(main)