Explicit raise on invalid model_index + add ignore_metadata_errors option (#1377)

Wauplin · davanstrien · web-flow · commit 0cc38226fb3f · 2023-03-07T21:33:27.000+01:00
* Explicit raise on invalid model_index + add ignore_metadata_errors option

* fix tests + typo

* use cache dir in RepocardMetadataTest

* get rid of Repository in tests (speed-up)

* tpyo

* Apply suggestions from code review

Co-authored-by: Daniel van Strien <davanstrien@users.noreply.github.com>

---------

Co-authored-by: Daniel van Strien <davanstrien@users.noreply.github.com>
diff --git a/src/huggingface_hub/_commit_api.py b/src/huggingface_hub/_commit_api.py
@@ -409,7 +409,7 @@ def fetch_upload_modes(
     create_pr: bool = False,
 ) -> Dict[str, UploadMode]:
     """
-    Requests the Hub "preupload" endpoint to determine wether each input file
+    Requests the Hub "preupload" endpoint to determine whether each input file
     should be uploaded as a regular git blob or as git LFS blob.
 
     Args:
diff --git a/src/huggingface_hub/community.py b/src/huggingface_hub/community.py
@@ -44,7 +44,7 @@ class Discussion:
             The username of the Discussion / Pull Request author.
             Can be `"deleted"` if the user has been deleted since.
         is_pull_request (`bool`):
-            Wether or not this is a Pull Request.
+            Whether or not this is a Pull Request.
         created_at (`datetime`):
             The `datetime` of creation of the Discussion / Pull Request.
     """
@@ -96,7 +96,7 @@ class DiscussionWithDetails(Discussion):
             The username of the Discussion / Pull Request author.
             Can be `"deleted"` if the user has been deleted since.
         is_pull_request (`bool`):
-            Wether or not this is a Pull Request.
+            Whether or not this is a Pull Request.
         created_at (`datetime`):
             The `datetime` of creation of the Discussion / Pull Request.
         events (`list` of [`DiscussionEvent`])
@@ -175,7 +175,7 @@ class DiscussionComment(DiscussionEvent):
         content (`str`):
             The raw markdown content of the comment. Mentions, links and images are not rendered.
         edited (`bool`):
-            Wether or not this comment has been edited.
+            Whether or not this comment has been edited.
         hidden (`bool`):
             Whether or not this comment has been hidden.
     """
diff --git a/src/huggingface_hub/repocard.py b/src/huggingface_hub/repocard.py
@@ -40,7 +40,7 @@ class RepoCard:
     default_template_path = TEMPLATE_MODELCARD_PATH
     repo_type = "model"
 
-    def __init__(self, content: str):
+    def __init__(self, content: str, ignore_metadata_errors: bool = False):
         """Initialize a RepoCard from string content. The content should be a
         Markdown file with a YAML block at the beginning and a Markdown body.
 
@@ -76,6 +76,7 @@ def __init__(self, content: str):
 
         # Set the content of the RepoCard, as well as underlying .data and .text attributes.
         # See the `content` property setter for more details.
+        self.ignore_metadata_errors = ignore_metadata_errors
         self.content = content
 
     @property
@@ -105,7 +106,7 @@ def content(self, content: str):
             data_dict = {}
             self.text = content
 
-        self.data = self.card_data_class(**data_dict)
+        self.data = self.card_data_class(**data_dict, ignore_metadata_errors=self.ignore_metadata_errors)
 
     def __str__(self):
         return self.content
@@ -136,20 +137,22 @@ def load(
         repo_id_or_path: Union[str, Path],
         repo_type: Optional[str] = None,
         token: Optional[str] = None,
+        ignore_metadata_errors: bool = False,
     ):
         """Initialize a RepoCard from a Hugging Face Hub repo's README.md or a local filepath.
 
         Args:
             repo_id_or_path (`Union[str, Path]`):
                 The repo ID associated with a Hugging Face Hub repo or a local filepath.
             repo_type (`str`, *optional*):
-                The type of Hugging Face repo to push to. Defaults to None, which will use
-                use "model". Other options are "dataset" and "space". Not used when loading from
-                a local filepath. If this is called from a child class, the default value will be
-                the child class's `repo_type`.
+                The type of Hugging Face repo to push to. Defaults to None, which will use "model". Other options
+                are "dataset" and "space". Not used when loading from a local filepath. If this is called from a child
+                class, the default value will be the child class's `repo_type`.
             token (`str`, *optional*):
-                Authentication token, obtained with `huggingface_hub.HfApi.login` method. Will default to
-                the stored token.
+                Authentication token, obtained with `huggingface_hub.HfApi.login` method. Will default to the stored token.
+            ignore_metadata_errors (`bool`):
+                If True, errors while parsing the metadata section will be ignored. Some information might be lost during
+                the process. Use it at your own risk.
 
         Returns:
             [`huggingface_hub.repocard.RepoCard`]: The RepoCard (or subclass) initialized from the repo's
@@ -178,7 +181,7 @@ def load(
 
         # Preserve newlines in the existing file.
         with Path(card_path).open(mode="r", newline="", encoding="utf-8") as f:
-            return cls(f.read())
+            return cls(f.read(), ignore_metadata_errors=ignore_metadata_errors)
 
     def validate(self, repo_type: Optional[str] = None):
         """Validates card against Hugging Face Hub's card validation logic.
diff --git a/src/huggingface_hub/repocard_data.py b/src/huggingface_hub/repocard_data.py
@@ -158,7 +158,7 @@ class CardData:
     inherit from `dict` to allow this export step.
     """
 
-    def __init__(self, **kwargs):
+    def __init__(self, ignore_metadata_errors: bool = False, **kwargs):
         self.__dict__.update(kwargs)
 
     def to_dict(self) -> Dict[str, Any]:
@@ -248,6 +248,9 @@ class ModelCardData(CardData):
             `eval_results` to construct the `model-index` within the card's metadata. The name
             you supply here is what will be used on PapersWithCode's leaderboards. If None is provided
             then the repo name is used as a default. Defaults to None.
+        ignore_metadata_errors (`bool`):
+            If True, errors while parsing the metadata section will be ignored. Some information might be lost during
+            the process. Use it at your own risk.
         kwargs (`dict`, *optional*):
             Additional metadata that will be added to the model card. Defaults to None.
 
@@ -277,6 +280,7 @@ def __init__(
         metrics: Optional[List[str]] = None,
         eval_results: Optional[List[EvalResult]] = None,
         model_name: Optional[str] = None,
+        ignore_metadata_errors: bool = False,
         **kwargs,
     ):
         self.language = language
@@ -294,8 +298,15 @@ def __init__(
                 model_name, eval_results = model_index_to_eval_results(model_index)
                 self.model_name = model_name
                 self.eval_results = eval_results
-            except KeyError:
-                logger.warning("Invalid model-index. Not loading eval results into CardData.")
+            except KeyError as error:
+                if ignore_metadata_errors:
+                    logger.warning("Invalid model-index. Not loading eval results into CardData.")
+                else:
+                    raise ValueError(
+                        f"Invalid `model_index` in metadata cannot be parsed: KeyError {error}. Pass"
+                        " `ignore_metadata_errors=True` to ignore this error while loading a Model Card. Warning:"
+                        " some information will be lost. Use it at your own risk."
+                    )
 
         super().__init__(**kwargs)
 
@@ -350,6 +361,9 @@ class DatasetCardData(CardData):
             If not provided, it will be gathered from the 'train-eval-index' key of the kwargs.
         configs (`Union[str, List[str]]`, *optional*):
             A list of the available dataset configs for the dataset.
+        ignore_metadata_errors (`bool`):
+            If True, errors while parsing the metadata section will be ignored. Some information might be lost during
+            the process. Use it at your own risk.
     """
 
     def __init__(
@@ -368,6 +382,7 @@ def __init__(
         pretty_name: Optional[str] = None,
         train_eval_index: Optional[Dict] = None,
         configs: Optional[Union[str, List[str]]] = None,
+        ignore_metadata_errors: bool = False,
         **kwargs,
     ):
         self.annotations_creators = annotations_creators
@@ -421,6 +436,9 @@ class SpaceCardData(CardData):
             List of datasets related to this Space. Should be a dataset ID found on https://hf.co/datasets.
         tags (`List[str]`, *optional*)
             List of tags to add to your Space that can be used when filtering on the Hub.
+        ignore_metadata_errors (`bool`):
+            If True, errors while parsing the metadata section will be ignored. Some information might be lost during
+            the process. Use it at your own risk.
         kwargs (`dict`, *optional*):
             Additional metadata that will be added to the space card.
 
@@ -452,6 +470,7 @@ def __init__(
         models: Optional[List[str]] = None,
         datasets: Optional[List[str]] = None,
         tags: Optional[List[str]] = None,
+        ignore_metadata_errors: bool = False,
         **kwargs,
     ):
         self.title = title
diff --git a/tests/test_repocard.py b/tests/test_repocard.py