Manual download logic (#708)

thomasw21 · VictorSanh · web-flow · commit 29c3dd86c4dc · 2022-01-04T19:00:02.000+01:00
* Add README

* Allow to run manually downloaded datasets

* Update README

* make style

* remove datasets dependency in tasks.py

* Nit

* Nicer error message in web interface

* Fix error

* \n

* Fix character escape

* ` is not a correct escape character

* I don't know

* fix data_dir for cases where `subset_name` is `None`

* tiny grammarly fixes

Co-authored-by: Victor Sanh <victorsanh@gmail.com>
diff --git a/README.md b/README.md
@@ -70,6 +70,15 @@ collection = TemplateCollection()
 # and the value is an instance of DatasetTemplates
 print(collection.datasets_templates)
 ```
+
+## Running datasets that need manual download
+
+Some datasets are not handled automatically by `datasets` and require users to download the dataset manually.
+
+To handle those datasets as well, download the dataset manually and place it in `~/.cache/promptsource/<dataset_name>` (or `~/.cache/promptsource/<dataset_name>/<subset_name>` when a subset is used). `~/.cache/promptsource` is the root directory containing all manually downloaded datasets.
+
+You can override this default root directory by setting the `PROMPTSOURCE_MANUAL_DATASET_DIR` environment variable. It should point to the root directory, not to an individual dataset's directory.
+
 ## Contributing
 Contribution guidelines and step-by-step *HOW TO* are described [here](CONTRIBUTING.md).
 
diff --git a/promptsource/__init__.py b/promptsource/__init__.py
@@ -0,0 +1 @@
+DEFAULT_PROMPTSOURCE_CACHE_HOME = "~/.cache/promptsource"
diff --git a/promptsource/app.py b/promptsource/app.py
@@ -204,7 +204,7 @@ def get_infos(d_name):
     fig.update_xaxes(visible=False, showticklabels=False)
     st.plotly_chart(fig, use_container_width=True)
     st.write(
-        f"- Top 3 training subsets account for `{100*plot_df[:3]['Train size'].sum()/nb_training_instances:.2f}%` of the training instances."
+        f"- Top 3 training subsets account for `{100 * plot_df[:3]['Train size'].sum() / nb_training_instances:.2f}%` of the training instances."
     )
     biggest_training_subset = plot_df.iloc[0]
     st.write(
@@ -257,7 +257,20 @@ def get_infos(d_name):
         if len(configs) > 0:
             conf_option = st.sidebar.selectbox("Subset", configs, index=0, format_func=lambda a: a.name)
 
-        dataset = get_dataset(dataset_key, str(conf_option.name) if conf_option else None)
+        subset_name = str(conf_option.name) if conf_option else None
+        try:
+            dataset = get_dataset(dataset_key, subset_name)
+        except OSError as e:
+            st.error(
+                f"Some datasets are not handled automatically by `datasets` and require users to download the "
+                f"dataset manually. This applies to {dataset_key}{f'/{subset_name}' if subset_name is not None else ''}. "
+                f"\n\nPlease download the raw dataset to `~/.cache/promptsource/{dataset_key}{f'/{subset_name}' if subset_name is not None else ''}`. "
+                f"\n\nYou can choose another cache directory by overriding `PROMPTSOURCE_MANUAL_DATASET_DIR` environment "
+                f"variable and downloading raw dataset to `$PROMPTSOURCE_MANUAL_DATASET_DIR/{dataset_key}{f'/{subset_name}' if subset_name is not None else ''}`"
+                f"\n\nOriginal error:\n{str(e)}"
+            )
+            st.stop()
+
         splits = list(dataset.keys())
         index = 0
         if "train" in splits:
@@ -596,7 +609,6 @@ def get_infos(d_name):
                             st.write("Target")
                             show_text(prompt[1], width=40)
 
-
 #
 # Must sync state at end
 #
diff --git a/promptsource/seqio_tasks/tasks.py b/promptsource/seqio_tasks/tasks.py
@@ -2,7 +2,6 @@
 import functools
 from typing import Dict, List, Optional, Tuple
 
-import datasets
 import pkg_resources
 import seqio
 import t5
@@ -12,6 +11,7 @@
 
 import promptsource.templates
 from promptsource.seqio_tasks import utils
+from promptsource.utils import load_dataset
 
 
 GET_METRICS = {
@@ -59,7 +59,7 @@ def postprocess_fn(output_or_target, example=None, is_target=False):
 def get_tf_dataset(split, shuffle_files, seed, dataset_name, subset_name, template, split_mapping):
     # HF datasets does not support file-level shuffling
     del shuffle_files, seed
-    dataset = datasets.load_dataset(dataset_name, subset_name)
+    dataset = load_dataset(dataset_name, subset_name)
     dataset = dataset[split_mapping[split]]
     dataset = utils.apply_template(dataset, template)
     return utils.hf_dataset_to_tf_dataset(dataset)
diff --git a/promptsource/utils.py b/promptsource/utils.py
@@ -1,8 +1,10 @@
 # coding=utf-8
+import os
 
 import datasets
 import requests
 
+from promptsource import DEFAULT_PROMPTSOURCE_CACHE_HOME
 from promptsource.templates import INCLUDED_USERS
 
 
@@ -49,7 +51,28 @@ def get_dataset(path, conf=None):
         builder_instance.download_and_prepare()
         return builder_instance.as_dataset()
     else:
-        return datasets.load_dataset(path, conf)
+        return load_dataset(path, conf)
+
+
+def load_dataset(dataset_name, subset_name):
+    """Load a dataset, falling back to a manually downloaded copy when required.
+
+    The dataset is first requested from `datasets` as usual. If that raises
+    `ManualDownloadError`, it is requested again with `data_dir` pointing at
+    `<root>/<dataset_name>` (or `<root>/<dataset_name>/<subset_name>` when a
+    subset is given). `<root>` is read from the `PROMPTSOURCE_MANUAL_DATASET_DIR`
+    environment variable when it is set, and from
+    `DEFAULT_PROMPTSOURCE_CACHE_HOME` otherwise.
+
+    Args:
+        dataset_name: Name of the dataset, forwarded to `datasets.load_dataset`.
+        subset_name: Name of the subset, or `None` when there is none.
+
+    Returns:
+        Whatever `datasets.load_dataset` returns.
+    """
+    try:
+        return datasets.load_dataset(dataset_name, subset_name)
+    except datasets.builder.ManualDownloadError:
+        # The default root begins with "~", which is only a valid path once expanded.
+        # Expand it here so the directory resolves no matter how it is consumed later.
+        cache_root_dir = os.path.expanduser(
+            os.environ.get("PROMPTSOURCE_MANUAL_DATASET_DIR", DEFAULT_PROMPTSOURCE_CACHE_HOME)
+        )
+        data_dir = (
+            f"{cache_root_dir}/{dataset_name}"
+            if subset_name is None
+            else f"{cache_root_dir}/{dataset_name}/{subset_name}"
+        )
+        return datasets.load_dataset(
+            dataset_name,
+            subset_name,
+            data_dir=data_dir,
+        )
 
 
 def get_dataset_confs(path):

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+DEFAULT_PROMPTSOURCE_CACHE_HOME = "~/.cache/promptsource"`