
Commit eee9fe0

Make data.download not use Docker with DirectRunner and apache_beam>=2.68.0 (#182)
* rm environment_type DOCKER to make DirectRunner not use Docker with beam>2.67
* fix incorrect dont_sonify input
* Remove unused constant
* adapt readme
1 parent 1f95833 commit eee9fe0
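
The recurring change below is the removal of `"environment_type": "DOCKER"` from each dataset pipeline's options dict. As a hedged sketch of why that helps, using Beam's standard options API rather than the repository's exact wiring: when the key is left unset, each runner falls back to its own default environment, so `DirectRunner` runs in-process while `PortableRunner` can still be pointed at the image in `environment_config`.

```python
# Minimal sketch (standard apache_beam options API; not the repo's exact code).
from apache_beam.options.pipeline_options import PipelineOptions, PortableOptions

pipeline_options = {
    "runner": "DirectRunner",
    "save_main_session": True,
    # "environment_type": "DOCKER",  # removed: pinning DOCKER made
    # apache_beam>=2.68.0 start a container even under DirectRunner
}
options = PipelineOptions.from_dictionary(pipeline_options)
# With the key unset, the runner chooses its own default environment.
print(options.view_as(PortableOptions).environment_type)  # None
```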

7 files changed: +2 -10 lines changed

basic_pitch/data/README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -5,7 +5,7 @@ The code and scripts in this section deal with training basic pitch on your own.
 * **--runner**: The method used to run the Beam Pipeline for processing the dataset. Options include `DirectRunner`, running directly in the code process running the pipeline, `PortableRunner`, which can be used to run the pipeline in a docker container locally, and `DataflowRunner`, which can be used to run the pipeline in a docker container on Dataflow.
 * **--timestamped**: If passed, the dataset will be put into a timestamp directory instead of 'splits'.
 * **--batch-size**: Number of examples per tfrecord when partitioning the dataset.
-* **--sdk_container_image**: The Docker container image used to process the data if using `PortableRunner` or `DirectRunner`.
+* **--sdk_container_image**: The Docker container image used to process the data if using `PortableRunner`.
 * **--job_endpoint**: the endpoint where the job is running. It defaults to `embed` which works for `PortableRunner`.
 
 Additional arguments that work with Beam in general can be used as well, and will be passed along and used by the pipeline. If using `DataflowRunner`, you will be required to pass `--temp_location={Path to GCS Bucket}`, `--staging_location={Path to GCS Bucket}`, `--project={Name of GCS Project}` and `--region={GCS region}`.
```
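
The pass-through behavior described in the last line of the diff is the usual `parse_known_args` split. A hedged sketch of that pattern, with the flag set taken from the README above and the defaults assumed; it matches the `main(known_args, pipeline_args)` signature visible in the dataset diffs below.

```python
# Sketch of the known-args / pipeline-args split (defaults are assumptions).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--runner", default="DirectRunner")
parser.add_argument("--timestamped", action="store_true")
parser.add_argument("--batch-size", type=int)
parser.add_argument("--sdk_container_image")
parser.add_argument("--job_endpoint", default="embed")
known_args, pipeline_args = parser.parse_known_args()
# Unrecognized flags such as --temp_location=gs://... land in pipeline_args,
# which is how the Dataflow-only options reach the Beam pipeline.
```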

basic_pitch/data/datasets/guitarset.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -165,7 +165,6 @@ def main(known_args: argparse.Namespace, pipeline_args: List[str]) -> None:
         "save_main_session": True,
         "sdk_container_image": known_args.sdk_container_image,
         "job_endpoint": known_args.job_endpoint,
-        "environment_type": "DOCKER",
         "environment_config": known_args.sdk_container_image,
     }
     pipeline.run(
```
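
The keys that survive the deletion are standard Beam portability options. A small, hedged demonstration of how Beam exposes them, with a placeholder image name:

```python
# Hedged demo: the remaining keys map onto Beam's PortableOptions view.
from apache_beam.options.pipeline_options import PipelineOptions, PortableOptions

opts = PipelineOptions.from_dictionary({
    "job_endpoint": "embed",
    "environment_config": "example.registry/beam-sdk:latest",  # placeholder
})
portable = opts.view_as(PortableOptions)
print(portable.job_endpoint)        # embed
print(portable.environment_config)  # the image; only consulted when a
                                    # Docker environment is actually selected
```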

basic_pitch/data/datasets/ikala.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -165,7 +165,6 @@ def main(known_args: argparse.Namespace, pipeline_args: List[str]) -> None:
         "save_main_session": True,
         "sdk_container_image": known_args.sdk_container_image,
         "job_endpoint": known_args.job_endpoint,
-        "environment_type": "DOCKER",
         "environment_config": known_args.sdk_container_image,
     }
     input_data = create_input_data(known_args.train_percent, known_args.split_seed)
```

basic_pitch/data/datasets/maestro.py

Lines changed: 0 additions & 4 deletions
```diff
@@ -46,8 +46,6 @@ def __init__(self, source: str) -> None:
         self.source = source
 
     def setup(self) -> None:
-        # Oddly enough we dont want to include the gcs bucket uri.
-        # Just the path within the bucket
         self.maestro_remote = mirdata.initialize("maestro", data_home=self.source)
         self.filesystem = beam.io.filesystems.FileSystems()
 
@@ -89,8 +87,6 @@ def setup(self) -> None:
         import apache_beam as beam
         import mirdata
 
-        # Oddly enough we dont want to include the gcs bucket uri.
-        # Just the path within the bucket
         self.maestro_remote = mirdata.initialize("maestro", data_home=self.source)
         self.filesystem = beam.io.filesystems.FileSystems()
         if self.download:
```
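
The deleted comments only described the `data_home` value handed to mirdata; the setup code itself is unchanged. A self-contained sketch of that pattern, with a placeholder bucket path standing in for `self.source`:

```python
# Sketch of the setup pattern above; the gs:// path is a placeholder.
import apache_beam as beam
import mirdata

source = "gs://example-bucket/maestro"  # placeholder for self.source
maestro_remote = mirdata.initialize("maestro", data_home=source)
filesystem = beam.io.filesystems.FileSystems()
```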

basic_pitch/data/datasets/medleydb_pitch.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -164,7 +164,6 @@ def main(known_args: argparse.Namespace, pipeline_args: List[str]) -> None:
         "save_main_session": True,
         "sdk_container_image": known_args.sdk_container_image,
         "job_endpoint": known_args.job_endpoint,
-        "environment_type": "DOCKER",
         "environment_config": known_args.sdk_container_image,
     }
     pipeline.run(
```

basic_pitch/data/datasets/slakh.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -200,7 +200,6 @@ def main(known_args: argparse.Namespace, pipeline_args: List[str]) -> None:
         "save_main_session": True,
         "sdk_container_image": known_args.sdk_container_image,
         "job_endpoint": known_args.job_endpoint,
-        "environment_type": "DOCKER",
         "environment_config": known_args.sdk_container_image,
     }
     pipeline.run(
```

basic_pitch/train.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -277,7 +277,7 @@ def console_entry_point() -> None:
         args.size_evaluation_callback_datasets,
         datasets_to_use,
         dataset_sampling_frequency,
-        args.dont_sonify,
+        args.no_sonify,
         args.no_contours,
         args.weighted_onset_loss,
         args.positive_onset_weight,
```