@@ -27,11 +27,19 @@ def get_args():
         required=True,
         help="path to the parquet dataset folder",
     )
-    parser.add_argument("--save-path-stats-json", type=str, help="Where to save the stats json.")
-    parser.add_argument("--save-path-stats-full-json", type=str, help="Where to save the stats json.")
-    parser.add_argument("--save-batch-size", type=int, required=True, help="Batch size when writing.")
+    parser.add_argument(
+        "--save-path-stats-json", type=str, help="Where to save the stats json."
+    )
+    parser.add_argument(
+        "--save-path-stats-full-json", type=str, help="Where to save the stats json."
+    )
+    parser.add_argument(
+        "--save-batch-size", type=int, required=True, help="Batch size when writing."
+    )
     parser.add_argument("--use-datasets-caching", action="store_true")
-    parser.add_argument("--num-proc", type=int, default=1, help="Number of procs use for preprocessing.")
+    parser.add_argument(
+        "--num-proc", type=int, default=1, help="Number of procs use for preprocessing."
+    )
     parser.add_argument(
         "--seed-id",
         type=int,
@@ -57,12 +65,16 @@ def main():
         level=logging.INFO,
     )
     args = get_args()
-    logger.info(f"** The job is runned with the following arguments: **\n{args}\n **** ")
+    logger.info(
+        f"** The job is runned with the following arguments: **\n{args}\n **** "
+    )

     if not args.use_datasets_caching:
         datasets.set_caching_enabled(False)
     else:
-        logger.info(f"the datasets results will be cached at {config.HF_DATASETS_CACHE}.")
+        logger.info(
+            f"the datasets results will be cached at {config.HF_DATASETS_CACHE}."
+        )

     ds = load_from_disk(args.dataset_path)

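A note on the caching toggle in this hunk: datasets.set_caching_enabled(False) disables the on-disk cache that the datasets library normally uses to memoize map/filter results (newer releases expose the same switch as datasets.enable_caching() / datasets.disable_caching()). A minimal sketch of the load-and-toggle pattern, with a placeholder dataset path:

    import datasets
    from datasets import load_from_disk

    # Skip the on-disk cache for map/filter results, mirroring what the
    # script does when --use-datasets-caching is not passed.
    datasets.set_caching_enabled(False)

    # load_from_disk expects a folder written by Dataset.save_to_disk;
    # "/path/to/dataset" is a placeholder, not a path from the script.
    ds = load_from_disk("/path/to/dataset")
    print(ds)
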
@@ -73,15 +85,19 @@ def main():
     splits = {
         **{
             mime_type: ds.filter(
-                lambda mime_types_: [mime_type_ == mime_type for mime_type_ in mime_types_],
+                lambda mime_types_: [
+                    mime_type_ == mime_type for mime_type_ in mime_types_
+                ],
                 input_columns="content_mime_detected",
                 batched=True,
                 num_proc=args.num_proc,
             )
             for mime_type in selected_mime_types
         },
         "others": ds.filter(
-            lambda mime_types_: [mime_type_ not in selected_mime_types for mime_type_ in mime_types_],
+            lambda mime_types_: [
+                mime_type_ not in selected_mime_types for mime_type_ in mime_types_
+            ],
             input_columns="content_mime_detected",
             batched=True,
             num_proc=args.num_proc,
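The splitting idiom in this hunk leans on two Dataset.filter features: input_columns="content_mime_detected" passes the column values directly to the predicate instead of a whole example dict, and batched=True means the predicate receives a list of values and must return a list of booleans of the same length. A toy sketch of the same call (invented rows, same column name as above):

    from datasets import Dataset

    ds = Dataset.from_dict(
        {"content_mime_detected": ["text/html", "application/pdf", "text/html"]}
    )

    # With batched=True the lambda sees the mime types for a whole batch
    # and returns one boolean per row.
    html_only = ds.filter(
        lambda mime_types_: [mime_type_ == "text/html" for mime_type_ in mime_types_],
        input_columns="content_mime_detected",
        batched=True,
    )
    print(len(html_only))  # 2
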
@@ -96,7 +112,11 @@ def get_length_text(example):
         example["length_text"] = len(example["text"])
         return example

-    cols_to_remove = [col for col in ds.column_names if col not in ["content_languages", "url_host_tld"]]
+    cols_to_remove = [
+        col
+        for col in ds.column_names
+        if col not in ["content_languages", "url_host_tld"]
+    ]
     ds_html = ds_html.map(
         get_length_text,
         batched=False,
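This map call runs get_length_text once per example (batched=False) and, judging by the cols_to_remove list built just above, presumably feeds it to a remove_columns keyword that sits outside the hunk; the next hunk then counts empty rows and averages the rest. A self-contained sketch of that computation on toy data, assuming mean is statistics.mean as in the script's imports (not shown here):

    from statistics import mean

    from datasets import Dataset

    # Toy stand-in for the HTML split, with a "text" column (invented values).
    ds_html = Dataset.from_dict({"text": ["<p>hi</p>", "", "hello world"]})

    def get_length_text(example):
        example["length_text"] = len(example["text"])
        return example

    # batched=False: the function receives one example dict at a time.
    ds_html = ds_html.map(get_length_text, batched=False)

    lengths = ds_html["length_text"]
    print(len([e for e in lengths if e == 0]))   # 1 empty text row
    print(mean([e for e in lengths if e != 0]))  # mean length of non-empty rows
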
@@ -105,10 +125,14 @@ def get_length_text(example):
     )

     data_stats["html_empty_text"] = len([e for e in ds_html["length_text"] if e == 0])
-    data_stats["html_mean_length_non_empty_text"] = mean([e for e in ds_html["length_text"] if e != 0])
+    data_stats["html_mean_length_non_empty_text"] = mean(
+        [e for e in ds_html["length_text"] if e != 0]
+    )
     data_stats["seed_id"] = args.seed_id

-    logger.info(f"There is {data_stats['html_empty_text']} empty text rows out of {len(ds_html)} rows.")
+    logger.info(
+        f"There is {data_stats['html_empty_text']} empty text rows out of {len(ds_html)} rows."
+    )

     save_path = Path(args.save_path_stats_json)
     save_path_tmp = f"{str(save_path.absolute())}.tmp"
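The diff cuts off right after save_path_tmp is built. The usual reason for the ".tmp" suffix is to write the JSON to a temporary file and only then rename it over the final path, so readers never observe a half-written stats file. A sketch of that pattern, under the assumption that this is what the unshown lines do (the stats values here are invented):

    import json
    import os
    from pathlib import Path

    data_stats = {"html_empty_text": 0, "seed_id": 1}  # invented values

    save_path = Path("stats.json")  # placeholder for args.save_path_stats_json
    save_path_tmp = f"{str(save_path.absolute())}.tmp"

    # Write to the .tmp file first, then rename; os.rename is atomic when
    # source and destination live on the same filesystem.
    with open(save_path_tmp, "w", encoding="utf-8") as f:
        json.dump(data_stats, f)
    os.rename(save_path_tmp, str(save_path.absolute()))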