Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 2 additions & 8 deletions scripts/simulate/run_download_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,8 @@


if __name__ == '__main__':
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

General comment:

I will probably lean towards keeping the s3 download logic in addition to the HF download logic, as it can be useful when dealing with private data that hasn't been approved for public release yet, as it is easier to put this on a private s3 bucket.

Copy link
Copy Markdown
Collaborator Author

@geoalgo geoalgo Feb 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense, but then wouldn't it make sense to move the S3 logic into a utils module, given that it is private?

Let me know if this solution works for you, I can also add a flag to configure whether to use s3 or HF.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main requirement is that there should be a flag to specify easily between the two. For example, if we have internal teams who want to share their internal runs with us / store them for their own use, they probably won't have permission to create a HF repo, so they would need to use S3.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can add a flag between the two, but would it be sufficient to merge the PR?

Copy link
Copy Markdown
Collaborator

@Innixma Innixma Feb 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think a flag, with the default still being S3 for now, works; we will probably switch to the HF default later on.

Once that is added + the s3 logic is added back in, I'm happy to merge

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok thanks, I added back an option to use S3.

I would recommend to switch to HF though as downloading TabRepo on external machines is currently painful (it takes more than a day). I benchmarked the two on the 30 datasets and HF was ~3x faster (it will be faster I believe when using larger datasets as it has been optimized for this use-case).

from tabrepo.utils import catchtime
from tabrepo import load_repository

context_name = "D244_F3_C1530_30"
# Time for with S3: 100.1188 secs
with catchtime("with S3"):
    repo = load_repository(context_name, cache=False, use_s3=True)

# delete the local files and then
# Time for without S3: 36.9548 secs

with catchtime("without S3"):
    repo = load_repository(context_name, cache=False, use_s3=False)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Definitely makes sense; I just want to do a few things first (like refactoring `configs_hyperparameters`) before switching the default.

# --- Download configuration ---
context_name = 'D244_F3_C1416_3'  # The context you want to download
dry_run = True  # Set False to download files
include_zs = True  # Set False to only download files necessary for SingleBest (skip predict proba files)

if dry_run:
    print(f'NOTE: Files will not be downloaded as `dry_run=True`.\n'
          f'This will log what files will be downloaded instead.\n'
          f'Set `dry_run=False` to download the files.')

context = get_context(context_name)
# Bug fix: `dry_run` was set and announced above but never forwarded to the
# download call, so files were downloaded even when dry_run=True.
context.download(include_zs=include_zs, dry_run=dry_run, use_s3=False)
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
'autogluon.core[all]',
'pytest',
'typing-extensions>=4.11,<5', # used for `Self` type hint
'huggingface-hub',
]

setup(
Expand Down
159 changes: 92 additions & 67 deletions tabrepo/contexts/context.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

from __future__ import annotations

from dataclasses import asdict, dataclass
Expand All @@ -22,9 +23,80 @@
from ..predictions.tabular_predictions import TabularModelPredictions
from ..repository.evaluation_repository import EvaluationRepository
from ..utils import catchtime
from ..utils.huggingfacehub_utils import download_from_huggingface
from ..utils.download import download_files


def download_from_s3(name: str, include_zs: bool, exists: str, dry_run: bool, s3_download_map, benchmark_paths, verbose: bool):
    """Download all files required by the named benchmark context from S3/HTTP to local disk.

    Remote paths starting with "s3" are routed through the S3 helper; every other
    remote path is downloaded via the urllib-based `download_files` helper.

    :param name: Name of the benchmark context (used only for logging).
    :param include_zs: If True, also downloads the zeroshot (predict-proba) files.
    :param exists: One of ["raise", "ignore", "overwrite"] controlling behavior
        when files already exist locally.
    :param dry_run: If True, logs what would be downloaded without downloading.
    :param s3_download_map: Mapping of local file path -> remote path.
        Must not be None (None means download functionality is disabled).
    :param benchmark_paths: BenchmarkPaths-like object exposing
        `get_file_paths(include_zs=...)` and `exists(path)`.
    :param verbose: Forwarded to the underlying download helpers.
    :raises ValueError: If `exists` is not a valid option.
    :raises AssertionError: If `s3_download_map` is None, or if `exists="raise"`
        and some files already exist locally.
    """
    print(f'Downloading files for {name} context... '
          f'(include_zs={include_zs}, exists="{exists}", dry_run={dry_run})')
    if dry_run:
        print(f'\tNOTE: `dry_run=True`! Files will not be downloaded.')
    # Validate eagerly with a real exception rather than `assert`
    # (asserts are stripped under `python -O`).
    valid_exists = ("raise", "ignore", "overwrite")
    if exists not in valid_exists:
        raise ValueError(f'Invalid value for exists (`exists="{exists}"`). '
                         f'Valid values: {list(valid_exists)}')
    if s3_download_map is None:
        raise AssertionError(f's3_download_map is None: download functionality is disabled')
    file_paths_expected = benchmark_paths.get_file_paths(include_zs=include_zs)

    file_paths_to_download = [f for f in file_paths_expected if f in s3_download_map]
    if len(file_paths_to_download) == 0:
        # Message updated: this is a free function now, not a method referencing `self`.
        print(f'WARNING: Matching file paths to download is 0! '
              f'`s3_download_map` probably has incorrect keys.')
    file_paths_already_exist = [f for f in file_paths_to_download if benchmark_paths.exists(f)]
    file_paths_missing = [f for f in file_paths_to_download if not benchmark_paths.exists(f)]

    if exists == 'raise':
        if file_paths_already_exist:
            raise AssertionError(f'`exists="{exists}"`, '
                                 f'and found {len(file_paths_already_exist)} files that already exist locally!\n'
                                 f'\tExisting Files: {file_paths_already_exist}\n'
                                 f'\tMissing Files: {file_paths_missing}\n'
                                 f'Either manually inspect and delete existing files, '
                                 f'set `exists="ignore"` to keep your local files and only download missing files, '
                                 f'or set `exists="overwrite"` to overwrite your existing local files.')
    elif exists == 'ignore':
        file_paths_to_download = file_paths_missing
    # exists == 'overwrite': keep the full download list (re-download everything).
    # (The original had a no-op self-assignment here and an unreachable `else: raise`;
    # the `exists` value was already validated above.)

    s3_to_local_tuple_list = [(remote, local) for local, remote in s3_download_map.items()
                              if local in file_paths_to_download]

    log_extra = ''
    num_exist = len(file_paths_already_exist)
    if exists == 'overwrite' and num_exist > 0:
        log_extra += f'\tWill overwrite {num_exist} files that exist locally:\n' \
                     f'\t\t{file_paths_already_exist}'
    if exists == 'ignore' and num_exist > 0:
        log_extra += f'\tWill skip {num_exist} files that exist locally:\n' \
                     f'\t\t{file_paths_already_exist}'
    if file_paths_missing:
        if log_extra:
            log_extra += '\n'
        log_extra += f'Will download {len(file_paths_missing)} files that are missing locally:\n' \
                     f'\t\t{file_paths_missing}'

    if log_extra:
        print(log_extra)
    print(f'\tDownloading {len(s3_to_local_tuple_list)} files from s3 to local...')
    for s3_path, local_path in s3_to_local_tuple_list:
        print(f'\t\t"{s3_path}" -> "{local_path}"')
    # Split by scheme: true s3:// URIs go through the s3 helper, everything else through urllib.
    s3_required_list = [(remote, local) for remote, local in s3_to_local_tuple_list
                        if remote.startswith("s3")]
    urllib_required_list = [(remote, local) for remote, local in s3_to_local_tuple_list
                            if not remote.startswith("s3")]
    if urllib_required_list:
        download_files(remote_to_local_tuple_list=urllib_required_list, dry_run=dry_run, verbose=verbose)
    if s3_required_list:
        # NOTE(review): `download_s3_files` is not among the imports visible in this diff —
        # confirm it is imported at the top of context.py.
        download_s3_files(s3_to_local_tuple_list=s3_required_list, dry_run=dry_run, verbose=verbose)


@dataclass
class BenchmarkPaths:
configs: str
Expand Down Expand Up @@ -260,7 +332,9 @@ def download(self,
include_zs: bool = True,
exists: str = 'raise',
verbose: bool = True,
dry_run: bool = False):
dry_run: bool = False,
use_s3: bool = True,
):
"""
Downloads all BenchmarkContext required files from s3 to local disk.

Expand All @@ -275,78 +349,27 @@ def download(self,
Guarantees alignment between local and remote files (at the time of download)
:param dry_run: If True, will not download files, but instead log what would have been downloaded.
"""
print(f'Downloading files for {self.name} context... '
f'(include_zs={include_zs}, exists="{exists}", dry_run={dry_run})')
if dry_run:
print(f'\tNOTE: `dry_run=True`! Files will not be downloaded.')
assert exists in ["raise", "ignore", "overwrite"]
assert self.s3_download_map is not None, \
f'self.s3_download_map is None: download functionality is disabled'
file_paths_expected = self.benchmark_paths.get_file_paths(include_zs=include_zs)

file_paths_to_download = [f for f in file_paths_expected if f in self.s3_download_map]
if len(file_paths_to_download) == 0:
print(f'WARNING: Matching file paths to download is 0! '
f'`self.s3_download_map` probably has incorrect keys.')
file_paths_already_exist = [f for f in file_paths_to_download if self.benchmark_paths.exists(f)]
file_paths_missing = [f for f in file_paths_to_download if not self.benchmark_paths.exists(f)]

if exists == 'raise':
if file_paths_already_exist:
raise AssertionError(f'`exists="{exists}"`, '
f'and found {len(file_paths_already_exist)} files that already exist locally!\n'
f'\tExisting Files: {file_paths_already_exist}\n'
f'\tMissing Files: {file_paths_missing}\n'
f'Either manually inspect and delete existing files, '
f'set `exists="ignore"` to keep your local files and only download missing files, '
f'or set `exists="overwrite"` to overwrite your existing local files.')
elif exists == 'ignore':
file_paths_to_download = file_paths_missing
elif exists == 'overwrite':
file_paths_to_download = file_paths_to_download
if use_s3:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we move this into a download_from_s3 method? That way it will look a good bit cleaner.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

download_from_s3(
name=self.name, include_zs=include_zs, exists=exists, dry_run=dry_run,
s3_download_map=self.s3_download_map, benchmark_paths=self.benchmark_paths, verbose=verbose
)
else:
raise ValueError(f'Invalid value for exists (`exists="{exists}"`). '
f'Valid values: {["raise", "ignore", "overwrite"]}')

s3_to_local_tuple_list = [(val, key) for key, val in self.s3_download_map.items()
if key in file_paths_to_download]

log_extra = ''

num_exist = len(file_paths_already_exist)
if exists == 'overwrite':
if num_exist > 0:
log_extra += f'\tWill overwrite {num_exist} files that exist locally:\n' \
f'\t\t{file_paths_already_exist}'
else:
log_extra = f''
if exists == 'ignore':
log_extra += f'\tWill skip {num_exist} files that exist locally:\n' \
f'\t\t{file_paths_already_exist}'
if file_paths_missing:
if log_extra:
log_extra += '\n'
log_extra += f'Will download {len(file_paths_missing)} files that are missing locally:\n' \
f'\t\t{file_paths_missing}'

if log_extra:
print(log_extra)
print(f'\tDownloading {len(s3_to_local_tuple_list)} files from s3 to local...')
for s3_path, local_path in s3_to_local_tuple_list:
print(f'\t\t"{s3_path}" -> "{local_path}"')
s3_required_list = [(s3_path, local_path) for s3_path, local_path in s3_to_local_tuple_list if s3_path[:2] == "s3"]
urllib_required_list = [(s3_path, local_path) for s3_path, local_path in s3_to_local_tuple_list if s3_path[:2] != "s3"]
if urllib_required_list:
download_files(remote_to_local_tuple_list=urllib_required_list, dry_run=dry_run, verbose=verbose)
if s3_required_list:
download_s3_files(s3_to_local_tuple_list=s3_required_list, dry_run=dry_run, verbose=verbose)
if verbose:
print(f'Downloading files for {self.name} context... '
f'(include_zs={include_zs}, exists="{exists}")')
download_from_huggingface(
datasets=self.benchmark_paths.datasets,
)

def load(self,
folds: List[int] = None,
load_predictions: bool = True,
download_files: bool = True,
prediction_format: str = "memmap",
exists: str = 'ignore') -> Tuple[ZeroshotSimulatorContext, TabularModelPredictions, GroundTruth]:
exists: str = 'ignore',
use_s3: bool = True,
) -> Tuple[ZeroshotSimulatorContext, TabularModelPredictions, GroundTruth]:
"""
:param folds: If None, uses self.folds as default.
If specified, must be a subset of `self.folds`. This will filter the results to only the specified folds.
Expand Down Expand Up @@ -397,7 +420,7 @@ def load(self,
missing_files_str = [f'\n\t"{m}"' for m in missing_files]
raise FileNotFoundError(f'Missing {len(missing_files)} required files: \n[{",".join(missing_files_str)}\n]')
print(f'Downloading input files from s3...')
self.download(include_zs=load_predictions, exists=exists)
self.download(include_zs=load_predictions, exists=exists, use_s3=use_s3)
self.benchmark_paths.assert_exists_all(check_zs=load_predictions)

configs_hyperparameters = self.load_configs_hyperparameters()
Expand All @@ -419,13 +442,15 @@ def load_repo(
download_files: bool = True,
prediction_format: str = "memmap",
exists: str = 'ignore',
use_s3: bool = True,
) -> EvaluationRepository:
zsc, zeroshot_pred_proba, zeroshot_gt = self.load(
folds=folds,
load_predictions=load_predictions,
download_files=download_files,
prediction_format=prediction_format,
exists=exists,
use_s3=use_s3,
)
repo = EvaluationRepository(
zeroshot_context=zsc,
Expand Down
9 changes: 5 additions & 4 deletions tabrepo/repository/evaluation_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ def _convert_sim_artifacts(cls, results_lst_simulation_artifacts: list[dict[str,
return simulation_artifacts_full


def load_repository(version: str, *, load_predictions: bool = True, cache: bool | str = False, prediction_format: str = "memmap") -> EvaluationRepository:
def load_repository(version: str, *, load_predictions: bool = True, cache: bool | str = False, prediction_format: str = "memmap", use_s3: bool = True) -> EvaluationRepository:
"""
Load the specified EvaluationRepository. Will automatically download all required inputs if they do not already exist on local disk.

Expand All @@ -347,7 +347,8 @@ def load_repository(version: str, *, load_predictions: bool = True, cache: bool
Options: ["memmap", "memopt", "mem"]
Determines the way the predictions are represented in the repo.
It is recommended to keep the value as "memmap" for optimal performance.

use_s3: bool, default = True
Whether to use S3 to download tabrepo files, if False uses HuggingFace instead.
Returns
-------
EvaluationRepository object for the given context.
Expand All @@ -358,7 +359,7 @@ def load_repository(version: str, *, load_predictions: bool = True, cache: bool
if isinstance(cache, str) and cache == "overwrite":
kwargs["ignore_cache"] = True
kwargs["exists"] = "overwrite"
repo = get_subcontext(version).load(load_predictions=load_predictions, prediction_format=prediction_format, **kwargs)
repo = get_subcontext(version).load(load_predictions=load_predictions, prediction_format=prediction_format, use_s3=use_s3, **kwargs)
else:
repo = get_subcontext(version).load_from_parent(load_predictions=load_predictions, prediction_format=prediction_format)
repo = get_subcontext(version).load_from_parent(load_predictions=load_predictions, prediction_format=prediction_format, use_s3=use_s3)
return repo
Loading