Skip to content
This repository was archived by the owner on Jul 31, 2023. It is now read-only.

Commit fd18181

Browse files
klmilam authored and mbernico committed
Use proper capitalization for Dataflow, add CLI for beam_pipeline_test
Change-Id: I10253fa57237eac6394bb13aa0920fbb804d423e
1 parent 2e68669 commit fd18181

File tree

4 files changed

+41
-37
lines changed

4 files changed

+41
-37
lines changed

tfrutil/beam_pipeline.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
"""TFRUtil Beam Pipeline.
1818
19-
This file implements the full beam pipeline for TFRUtil.
19+
This file implements the full Beam pipeline for TFRUtil.
2020
"""
2121

2222
from typing import Any, Dict, Generator, Union
@@ -94,7 +94,7 @@ def _get_pipeline_options(
9494
options_dict['project'] = project
9595
if region:
9696
options_dict['region'] = region
97-
if runner == 'DataFlowRunner':
97+
if runner == 'DataflowRunner':
9898
options_dict['setup_file'] = _get_setup_py_filepath()
9999
if dataflow_options:
100100
options_dict.update(dataflow_options)
@@ -204,15 +204,15 @@ def build_pipeline(
204204
"""Runs TFRUtil Beam Pipeline.
205205
206206
Args:
207-
df: Pandas Dataframe
207+
df: Pandas DataFrame
208208
job_label: User description for the beam job.
209-
runner: Beam Runner: (e.g. DataFlowRunner, DirectRunner).
210-
project: GCP project ID (if DataFlowRunner)
211-
region: GCP compute region (if DataFlowRunner)
209+
runner: Beam Runner: (e.g. DataflowRunner, DirectRunner).
210+
project: GCP project ID (if DataflowRunner)
211+
region: GCP compute region (if DataflowRunner)
212212
output_dir: GCS or Local Path for output.
213213
compression: gzip or None.
214214
num_shards: Number of shards.
215-
dataflow_options: DataFlow Runner Options (optional)
215+
dataflow_options: Dataflow Runner Options (optional)
216216
integer_label: Flags if label is already an integer.
217217
218218
Returns:

tfrutil/beam_pipeline_test.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,3 +83,7 @@ def test_get_setup_py_filepath(self):
8383
filepath = beam_pipeline._get_setup_py_filepath()
8484
self.assertTrue(os.path.isfile(filepath))
8585
self.assertTrue(os.path.isabs(filepath))
86+
87+
88+
if __name__ == '__main__':
89+
unittest.main()

tfrutil/client.py

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -33,20 +33,20 @@
3333

3434

3535
def _validate_data(df):
36-
""" Verify required image csv columsn exist in data."""
36+
""" Verifies required image csv columsn exist in data."""
3737
if constants.IMAGE_URI_KEY not in df.columns:
3838
# or label_col not in df.columns:
3939
raise AttributeError(
40-
'Dataframe must contain image_uri column {}.')
40+
'DataFrame must contain image_uri column {}.')
4141
if constants.LABEL_KEY not in df.columns:
4242
raise AttributeError(
43-
'Dataframe must contain label column.')
43+
'DataFrame must contain label column.')
4444
if constants.SPLIT_KEY not in df.columns:
4545
raise AttributeError(
46-
'Dataframe must contain split column.')
46+
'DataFrame must contain split column.')
4747
if list(df.columns) != constants.IMAGE_CSV_COLUMNS:
4848
raise AttributeError(
49-
'Dataframe column order must be {}'.format(
49+
'DataFrame column order must be {}'.format(
5050
constants.IMAGE_CSV_COLUMNS))
5151

5252

@@ -56,18 +56,18 @@ def _validate_runner(
5656
project: str,
5757
region: str):
5858
"""Validates an appropriate beam runner is chosen."""
59-
if runner not in ['DataFlowRunner', 'DirectRunner']:
59+
if runner not in ['DataflowRunner', 'DirectRunner']:
6060
raise AttributeError('Runner {} is not supported.'.format(runner))
6161

6262
# gcs_path is a bool, true if all image paths start with gs://
6363
gcs_path = df[constants.IMAGE_URI_KEY].str.startswith('gs://').all()
64-
if (runner == 'DataFlowRunner') & (not gcs_path):
65-
raise AttributeError('DataFlowRunner requires GCS image locations.')
64+
if (runner == 'DataflowRunner') & (not gcs_path):
65+
raise AttributeError('DataflowRunner requires GCS image locations.')
6666

67-
if (runner == 'DataFlowRunner') & (
67+
if (runner == 'DataflowRunner') & (
6868
any(not v for v in [project, region])):
6969
raise AttributeError(
70-
'DataFlowRunner requires valid `project` and `region` to be specified.'
70+
'DataflowRunner requires valid `project` and `region` to be specified.'
7171
'The `project` is {} and `region` is {}'.format(project, region))
7272

7373
# def read_image_directory(dirpath) -> pd.DataFrame:
@@ -188,19 +188,19 @@ def create_tfrecords(
188188
a Pandas DataFrame.
189189
If 'infer' (default), header is taken from the first line of a CSV
190190
runner: Beam runner. Can be 'DirectRunner' or 'DataFlowRunner'
191-
project: GCP project name (Required if DataFlowRunner)
192-
region: GCP region name (Required if DataFlowRunner)
193-
dataflow_options: Options dict for dataflow runner
194-
job_label: User supplied description for the beam job name.
191+
project: GCP project name (Required if DataflowRunner)
192+
region: GCP region name (Required if DataflowRunner)
193+
dataflow_options: Options dict for DataflowRunner
194+
job_label: User supplied description for the Beam job name.
195195
compression: Can be 'gzip' or None for no compression.
196196
num_shards: Number of shards to divide the TFRecords into. Default is
197197
0 = no sharding.
198198
199199
Returns:
200200
job_results: Dict
201-
job_id: DataFlow Job ID or 'DirectRunner'
201+
job_id: Dataflow Job ID or 'DirectRunner'
202202
metrics: (optional) Beam metrics. Only used for DirectRunner
203-
dataflow_url: (optional) Job URL for DataFlowRunner
203+
dataflow_url: (optional) Job URL for DataflowRunner
204204
"""
205205

206206
df = to_dataframe(input_data, header, names)
@@ -253,8 +253,8 @@ def create_tfrecords(
253253
logging.info("Job Complete.")
254254

255255
else:
256-
logging.info("Using DataFlow Runner.")
257-
# Construct DataFlow URL
256+
logging.info("Using Dataflow Runner.")
257+
# Construct Dataflow URL
258258

259259
job_id = result.job_id()
260260

@@ -272,8 +272,8 @@ def create_tfrecords(
272272

273273
logging.shutdown()
274274

275-
if runner == 'DataFlowRunner':
276-
# if this is a dataflow job, copy the logfile to gcs
275+
if runner == 'DataflowRunner':
276+
# if this is a Dataflow job, copy the logfile to GCS
277277
common.copy_logfile_to_gcs(logfile, output_dir)
278278

279279
return job_result

tfrutil/client_test.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def test_create_tfrecords_direct_runner(self, mock_beam):
5151

5252
@mock.patch('tfrutil.client.beam_pipeline')
5353
def test_create_tfrecords_dataflow_runner(self, mock_beam):
54-
"""Tests `create_tfrecords` DataFlow case."""
54+
"""Tests `create_tfrecords` Dataflow case."""
5555
mock_beam.build_pipeline().run().job_id.return_value = 'foo_id'
5656

5757
df2 = self.test_df.copy()
@@ -67,7 +67,7 @@ def test_create_tfrecords_dataflow_runner(self, mock_beam):
6767
os.makedirs(outdir, exist_ok=True)
6868
r = client.create_tfrecords(
6969
df2,
70-
runner='DataFlowRunner',
70+
runner='DataflowRunner',
7171
output_dir=outdir,
7272
region=self.test_region,
7373
project=self.test_project)
@@ -112,7 +112,7 @@ def test_missing_split(self):
112112
client._validate_data(df2)
113113

114114
def test_columns_out_of_order(self):
115-
"""Tests validating column order wrong."""
115+
"""Tests validating wrong column order."""
116116
with self.assertRaises(AttributeError):
117117
df2 = self.test_df.copy()
118118
cols = ['image_uri', 'split', 'label']
@@ -137,38 +137,38 @@ def test_invalid_runner(self):
137137
region=self.test_region)
138138

139139
def test_local_path_with_dataflow_runner(self):
140-
"""Tests DataFlowRunner conflict with local path."""
140+
"""Tests DataflowRunner conflict with local path."""
141141
with self.assertRaises(AttributeError):
142142
client._validate_runner(
143143
self.df_test,
144-
runner='DataFlowRunner',
144+
runner='DataflowRunner',
145145
project=self.test_project,
146146
region=self.test_region)
147147

148148
def test_gcs_path_with_dataflow_runner(self):
149-
"""Tests DataFlowRunner with gcs path."""
149+
"""Tests DataflowRunner with GCS path."""
150150
df2 = self.test_df.copy()
151151
df2[constants.IMAGE_URI_KEY] = 'gs://' + df2[constants.IMAGE_URI_KEY]
152152
self.assertIsNone(
153153
client._validate_runner(
154154
df2,
155-
runner='DataFlowRunner',
155+
runner='DataflowRunner',
156156
project=self.test_project,
157157
region=self.test_region))
158158

159159
def test_gcs_path_with_dataflow_runner_missing_param(self):
160-
"""Tests DataFlowRunner with missing required parameter."""
160+
"""Tests DataflowRunner with missing required parameter."""
161161
df2 = self.test_df.copy()
162162
df2[constants.IMAGE_URI_KEY] = 'gs://' + df2[constants.IMAGE_URI_KEY]
163163
for p, r in [
164164
(None, self.test_region), (self.test_project, None), (None, None)]:
165165
with self.assertRaises(AttributeError) as context:
166166
client._validate_runner(
167167
df2,
168-
runner='DataFlowRunner',
168+
runner='DataflowRunner',
169169
project=p,
170170
region=r)
171-
self.assertTrue('DataFlowRunner requires valid `project` and `region`'
171+
self.assertTrue('DataflowRunner requires valid `project` and `region`'
172172
in repr(context.exception))
173173

174174

0 commit comments

Comments (0)