Skip to content

Commit 5521907

Browse files
authored
update padding strategy for persistent cache (#2464)
1 parent 616480e commit 5521907

File tree

2 files changed

+19
-5
lines changed

2 files changed

+19
-5
lines changed

swift/torchacc_utils.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,26 @@ def get_bucket_sizes(max_length: int) -> List[int]:
2727
the bucket sizes. If not set, we use a normal distribution bucketing with
2828
8 buckets.
2929
"""
30+
padding_p_base = 2
3031
if os.getenv('TORCHACC_DATA_BUCKETS') is not None:
3132
bucket_sizes = [int(x) for x in os.getenv('TORCHACC_DATA_BUCKETS').split(',')]
3233
bucket_sizes.append(max_length)
33-
else: # default normal distribution bucketing.
34-
mean = max_length // 2
35-
var = max_length // 8
36-
bucket_sizes = [mean + i * var for i in range(-3, 4)]
34+
else:
35+
if os.getenv('TORCHACC_CACHE_PATH') is not None: # padding strategy when persistent cache is enabled
36+
padding_p_base = 1.4
37+
padding_p_base = os.getenv('TORCHACC_PADDING_P_BASE', padding_p_base)
38+
try:
39+
padding_p_base = float(padding_p_base)
40+
except ValueError as e:
41+
logger.error(f'Expected TORCHACC_PADDING_P_BASE to be a float number, but encountered {padding_p_base}')
42+
raise e
43+
bucket_sizes = [16, 32, 48, 64, 96, 128]
44+
base_size = 256
45+
while base_size < max_length:
46+
bucket_sizes.append((int(base_size) + 127) // 128 * 128)
47+
base_size *= padding_p_base
3748
bucket_sizes.append(max_length)
49+
3850
return bucket_sizes
3951

4052

swift/trainers/trainers.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,9 @@ def compute_loss(self, model, inputs, return_outputs=None, num_items_in_batch=No
213213
acc = torch.tensor(acc_list, device=preds.device).float().mean()
214214
else:
215215
if use_torchacc():
216-
ta_trim_graph()
216+
# Only enabled during evaluation/test
217+
if not model.training:
218+
ta_trim_graph()
217219
preds = preds.to('cpu')
218220
masks = masks.to('cpu')
219221
labels = labels.to('cpu')

0 commit comments

Comments
 (0)