Fix lint

Amar3tto · Amar3tto · commit 4ce240647a25 · 2025-12-23T19:28:06.000+04:00
diff --git a/sdks/python/apache_beam/examples/inference/pytorch_imagenet_rightfit.py b/sdks/python/apache_beam/examples/inference/pytorch_imagenet_rightfit.py
@@ -14,10 +14,12 @@
 # limitations under the License.
 
 """This streaming pipeline performs image classification using an open-source
-PyTorch EfficientNet-B0 model optimized for T4 GPUs. It reads image URIs from Pub/Sub,
-decodes and preprocesses them in parallel, and runs inference with adaptive batch sizing for optimal GPU utilization.
-The pipeline ensures exactly-once semantics via stateful deduplication and idempotent BigQuery writes,
-allowing stable and reproducible performance measurements under continuous load.
+PyTorch EfficientNet-B0 model optimized for T4 GPUs.
+It reads image URIs from Pub/Sub, decodes and preprocesses them in parallel,
+and runs inference with adaptive batch sizing for optimal GPU utilization.
+The pipeline ensures exactly-once semantics via stateful deduplication and
+idempotent BigQuery writes, allowing stable and reproducible performance
+measurements under continuous load.
 Resources like Pub/Sub topic/subscription cleanup is handled programmatically.
 """
 
@@ -36,18 +38,20 @@
 
 import apache_beam as beam
 from apache_beam.coders import BytesCoder
+from apache_beam.io.filesystems import FileSystems
 from apache_beam.ml.inference.base import KeyedModelHandler
 from apache_beam.ml.inference.base import PredictionResult
 from apache_beam.ml.inference.base import RunInference
 from apache_beam.ml.inference.pytorch_inference import PytorchModelHandlerTensor
-from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions, StandardOptions
+from apache_beam.options.pipeline_options import PipelineOptions
+from apache_beam.options.pipeline_options import SetupOptions
+from apache_beam.options.pipeline_options import StandardOptions
 from apache_beam.runners.runner import PipelineResult
 from apache_beam.transforms import userstate
 from apache_beam.transforms import window
 
-import PIL.Image as PILImage
 from google.cloud import pubsub_v1
-from apache_beam.io.filesystems import FileSystems
+import PIL.Image as PILImage
 
 # ============ Utility & Preprocessing ============
 
@@ -102,14 +106,15 @@ def process(self, element):
 
 
 class MakeKeyDoFn(beam.DoFn):
-  """Produce (image_id, payload) where image_id is stable for dedup & BQ insertId."""
+  """Produce (image_id, payload); the id is stable for dedup & BQ insertId."""
   def __init__(self, input_mode: str):
     self.input_mode = input_mode
 
   def process(self, element: str | bytes):
-    # Input can be raw bytes from Pub/Sub or a GCS URI string, depending on mode.
+    # Input is raw bytes from Pub/Sub or a GCS URI string, depending on mode.
     if self.input_mode == "bytes":
-      # element is bytes message, assume it includes {"image_id": "...", "bytes": base64?} or just raw bytes.
+      # element is bytes message, assume it includes
+      # {"image_id": "...", "bytes": base64?} or just raw bytes.
       import hashlib
       b = element if isinstance(element, (bytes, bytearray)) else bytes(element)
       image_id = hashlib.sha1(b).hexdigest()
@@ -127,7 +132,6 @@ class DedupDoFn(beam.DoFn):
   seen = userstate.ReadModifyWriteStateSpec('seen', BytesCoder())
 
   def process(self, element, seen=beam.DoFn.StateParam(seen)):
-    key, payload = element
     if seen.read() == b'1':
       return
     seen.write(b'1')
@@ -176,7 +180,9 @@ def process(self, kv: Tuple[str, PredictionResult]):
       logits = logits.unsqueeze(0)
 
     probs = F.softmax(logits, dim=-1)  # [B, C]
-    values, indices = torch.topk(probs, k=min(self.top_k, probs.shape[-1]), dim=-1)
+    values, indices = torch.topk(
+        probs, k=min(self.top_k, probs.shape[-1]), dim=-1
+    )
 
     topk = [{
         "class_id": int(idx.item()), "score": float(val.item())
@@ -334,7 +340,7 @@ def pick_batch_size(arg: str) -> Optional[int]:
 
 
 def run_load_pipeline(known_args, pipeline_args):
-  """Reads GCS file with URIs and publishes them to Pub/Sub (for streaming mode)."""
+  """Reads GCS file with URIs and publishes them to Pub/Sub (for streaming)."""
   # enforce smaller/CPU-only defaults for feeder
   override_or_add(pipeline_args, '--device', 'CPU')
   override_or_add(pipeline_args, '--num_workers', '5')
@@ -362,7 +368,10 @@ def run_load_pipeline(known_args, pipeline_args):
       lines
       | 'ToBytes' >> beam.Map(lambda line: line.encode('utf-8'))
       |
-      'PublishToPubSub' >> beam.io.WriteToPubSub(topic=known_args.pubsub_topic))
+      'PublishToPubSub' >> beam.io.WriteToPubSub(
+          topic=known_args.pubsub_topic
+      )
+  )
   return pipeline.run()
 
 
@@ -378,8 +387,8 @@ def run(
       topic_path=known_args.pubsub_topic,
       subscription_path=known_args.pubsub_subscription)
 
-  # If streaming -> start feeder thread that reads URIs from GCS and fills Pub/Sub.
   if known_args.mode == 'streaming':
+    # Start feeder thread that reads URIs from GCS and fills Pub/Sub.
     threading.Thread(
         target=lambda:
         (time.sleep(900), run_load_pipeline(known_args, pipeline_args)),
@@ -398,7 +407,6 @@ def run(
   # Device
   device = 'GPU' if known_args.device.upper() == 'GPU' else 'CPU'
 
-  model = None
   bs_ok = None
   last_err = None
   for bs in tried:
@@ -431,15 +439,15 @@ def run(
         "Falling back to batch_size=8 due to previous errors: %s", last_err)
     bs_ok = 8
     model_handler = PytorchModelHandlerTensor(
-        model_class=lambda: create_timm_model(known_args.pretrained_model_name),
+        model_class=lambda: create_timm_model(
+            known_args.pretrained_model_name
+        ),
         model_params={},
         state_dict_path=known_args.model_state_dict_path,
         device=device,
         inference_batch_size=bs_ok,
     )
 
-  tokenizer = None
-
   pipeline = test_pipeline or beam.Pipeline(options=pipeline_options)
 
   if known_args.mode == 'batch':
@@ -491,13 +499,13 @@ def run(
               model_name=known_args.pretrained_model_name)))
 
   if known_args.publish_to_big_query == 'true':
-    # Schema: image_id:STRING, model_name:STRING, topk:STRING(JSON), infer_ts_ms:INT64
     _ = (
         results
         | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
             known_args.output_table,
             schema=
-            'image_id:STRING, model_name:STRING, topk:STRING, infer_ts_ms:INT64',
+            'image_id:STRING, model_name:STRING, topk:STRING, '
+            'infer_ts_ms:INT64',
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             method=beam.io.WriteToBigQuery.Method.STREAMING_INSERTS))