Skip to content

Commit 41953e9

Browse files
authored
Merge pull request #346 from semantic-systems/refactor_hf
Refactor Huggingface Sources
2 parents 861362b + 0f813af commit 41953e9

File tree

3 files changed

+242
-174
lines changed

3 files changed

+242
-174
lines changed

config.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -122,6 +122,7 @@ class Config:
122122
},
123123
"module": "huggingface_models",
124124
"search-endpoint": f"https://huggingface.co/api/models?limit={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&search=",
125+
"get-resource-endpoint": f"https://huggingface.co/api/models/",
125126
},
126127
"Huggingface - Datasets": {
127128
"logo": {
@@ -133,6 +134,7 @@ class Config:
133134
},
134135
"module": "huggingface_datasets",
135136
"search-endpoint": f"https://huggingface.co/api/datasets?limit={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&search=",
137+
"get-resource-endpoint": f"https://huggingface.co/api/datasets/",
136138
},
137139
"OPENAIRE - Products": {
138140
"logo": {

sources/huggingface_datasets.py

Lines changed: 109 additions & 77 deletions
Original file line number | Diff line number | Diff line change
@@ -1,72 +1,121 @@
1-
from objects import thing, Article, Author, Dataset, Person
2-
from sources import data_retriever
3-
import utils
4-
from main import app
5-
6-
@utils.handle_exceptions
7-
def search(source: str, search_term: str, results, failed_sources):
8-
search_result = data_retriever.retrieve_data(source=source,
9-
base_url=app.config['DATA_SOURCES'][source].get('search-endpoint', ''),
10-
search_term=search_term,
11-
failed_sources=failed_sources)
12-
13-
total_hits = len(search_result)
14-
15-
if int(total_hits) > 0:
16-
utils.log_event(type="info", message=f"{source} - {total_hits} records matched")
17-
18-
for hit in search_result:
19-
20-
dataset = map_entry_to_dataset(hit)
21-
results['resources'].append(dataset)
22-
23-
def map_entry_to_dataset(record) -> Dataset:
1+
from typing import Union, Dict, Any, List, Iterable
242

25-
dataset = Dataset() # thing -> CreateWork -> Dataset
26-
27-
dataset.identifier = record.get("id", "")
28-
dataset.name = record.get("id", "")
29-
dataset.additionalType = "DATASET"
30-
dataset.url = "https://huggingface.co/datasets/" + record.get("id", "")
31-
dataset.description = utils.remove_html_tags(record.get("description", ""))
32-
dataset.abstract = dataset.description
33-
dataset.license = record.get("license", {}).get("id", "")
34-
dataset.datePublished = record.get("createdAt", "")
35-
dataset.dateModified = record.get("lastModified", "")
3+
import utils
4+
from config import Config
5+
from sources.base import BaseSource
6+
from sources import data_retriever
7+
from objects import thing, Article, Author, Dataset, Person
368

37-
# much metadata is contained in the tags
38-
tags = record.get("tags", [])
399

40-
dataset.inLanguage = [t.split("language:")[1] for t in tags if t.startswith("language:")]
41-
dataset.genre = ", ".join(t.split("task_categories:")[1] for t in tags if t.startswith("task_categories:"))
42-
dataset.encodingFormat = ", ".join(t.split("format:")[1] for t in tags if t.startswith("format:"))
43-
dataset.countryOfOrigin = next((t.split("region:")[1] for t in tags if t.startswith("region:")), "")
44-
dataset.keywords = tags
class HuggingFaceDatasets(BaseSource):
    """Search and single-resource retrieval for the Huggingface datasets API."""

    SOURCE = "Huggingface - Datasets"
    # Endpoints are resolved once, at class-creation time, from the central config.
    SEARCH_ENDPOINT = Config.DATA_SOURCES[SOURCE].get('search-endpoint', '')
    RESOURCE_ENDPOINT = Config.DATA_SOURCES[SOURCE].get("get-resource-endpoint", "")

    def fetch(self, search_term: str, failed_sources: list | None = None) -> Dict[str, Any]:
        """
        Fetch raw json from the source using the given search term.

        :param search_term: free-text query forwarded to the search endpoint.
        :param failed_sources: list that data_retriever records failures into;
            a fresh list is created when omitted.
        :return: parsed JSON payload, or {} when retrieval yielded nothing.
        """
        # BUGFIX: previous signature used a mutable default (`failed_sources: list = []`),
        # which is shared across all calls; use the None-sentinel idiom instead.
        if failed_sources is None:
            failed_sources = []
        return data_retriever.retrieve_data(
            source=self.SOURCE,
            base_url=self.SEARCH_ENDPOINT,
            search_term=search_term,
            failed_sources=failed_sources,
        ) or {}

    def extract_hits(self, raw: Dict[str, Any]) -> Iterable[Dict[str, Any]]:
        """
        Extract the list of hits from the raw JSON response. Should return an
        iterable of hit dicts. The Huggingface search response is already a
        flat collection of records, so the raw payload is returned unchanged.
        """
        return raw

    def map_hit(self, source_name: str, hit: Dict[str, Any]) -> Dataset:
        """
        Map a single hit dict from the source to a Dataset object (objects.py).

        :param source_name: label stored on the provenance `thing` record.
        :param hit: one record as returned by the Huggingface API.
        :return: populated Dataset instance.
        """
        dataset = Dataset()  # thing -> CreativeWork -> Dataset

        dataset.identifier = hit.get("id", "")
        dataset.name = hit.get("id", "")
        dataset.additionalType = "DATASET"
        dataset.url = "https://huggingface.co/datasets/" + hit.get("id", "")
        dataset.description = utils.remove_html_tags(hit.get("description", ""))
        dataset.abstract = dataset.description
        dataset.license = hit.get("license", {}).get("id", "")
        dataset.datePublished = hit.get("createdAt", "")
        dataset.dateModified = hit.get("lastModified", "")

        # much metadata is contained in the tags
        tags = hit.get("tags", [])

        dataset.inLanguage = [t.split("language:")[1] for t in tags if t.startswith("language:")]
        dataset.genre = ", ".join(t.split("task_categories:")[1] for t in tags if t.startswith("task_categories:"))
        dataset.encodingFormat = ", ".join(t.split("format:")[1] for t in tags if t.startswith("format:"))
        dataset.countryOfOrigin = next((t.split("region:")[1] for t in tags if t.startswith("region:")), "")
        dataset.keywords = tags

        # NOTE(review): intentionally overwrites the license read above — a
        # "license:" tag wins; without one the license is cleared to "".
        dataset.license = next((t.split("license:")[1] for t in tags if t.startswith("license:")), "")

        # First truthy flag wins, in decreasing order of restriction.
        dataset.creativeWorkStatus = (
            "disabled" if hit.get("disabled")
            else "private" if hit.get("private")
            else "gated" if hit.get("gated")
            else "public"
        )

        if hit.get("author"):
            dataset.author = [Author(name=hit["author"])]
        dataset.publisher = dataset.author[0].name if dataset.author else ""

        # Provenance record pointing back at this source.
        _source = thing()
        _source.name = source_name
        _source.originalSource = dataset.publisher
        _source.identifier = dataset.identifier
        _source.url = dataset.url
        dataset.source.append(_source)

        return dataset

    def search(self, source_name: str, search_term: str, results: dict, failed_sources: list) -> None:
        """
        Fetch json from the source, extract hits, map them to objects, and insert them in-place into the results dict.

        :param source_name: display name of this source (unused; SOURCE is used for logging/mapping).
        :param search_term: free-text query.
        :param results: dict with a 'resources' list that mapped Datasets are appended to.
        :param failed_sources: list that retrieval failures are recorded into.
        """
        search_result = self.fetch(search_term, failed_sources)

        total_hits = len(search_result)
        # len() already returns an int — no int() cast needed.
        if total_hits > 0:
            utils.log_event(type="info", message=f"{self.SOURCE} - {total_hits} records matched")

        for hit in search_result:
            dataset = self.map_hit(self.SOURCE, hit)
            results['resources'].append(dataset)

    def get_resource(self, doi: str) -> Dataset | None:
        """
        Retrieve and map detail metadata for a single dataset.

        :param doi: dataset identifier on Huggingface (appended to RESOURCE_ENDPOINT).
        :return: mapped Dataset, or None when retrieval failed.
        """
        search_result = data_retriever.retrieve_object(
            source=self.SOURCE,
            base_url=self.RESOURCE_ENDPOINT,
            identifier=doi,
            quote=False,
        )
        if search_result:
            dataset = self.map_hit(self.SOURCE, search_result)
            utils.log_event(type="info", message=f"{self.SOURCE} - retrieved dataset details")
            return dataset
        else:
            utils.log_event(type="error", message=f"{self.SOURCE} - failed to retrieve dataset details")
            return None
54107

55-
if record.get("author"):
56-
dataset.author = [Author(name=record["author"])]
57-
dataset.publisher = dataset.author[0].name if dataset.author else ""
58108

59-
_source = thing()
60-
_source.name = 'Huggingface - Datasets'
61-
_source.originalSource = dataset.publisher
62-
_source.identifier = dataset.identifier
63-
_source.url = dataset.url
64-
dataset.source.append(_source)
@utils.handle_exceptions
def search(source: str, search_term: str, results, failed_sources) -> None:
    """
    Module-level entrypoint to search Huggingface Datasets.

    Delegates to HuggingFaceDatasets.search, mutating `results` and
    `failed_sources` in place.
    """
    handler = HuggingFaceDatasets()
    handler.search(source, search_term, results, failed_sources)
65115

66-
return dataset
67116

68117
@utils.handle_exceptions
def get_resource(source: str, source_id: str, doi: str) -> Dataset | None:
    """
    Retrieve detailed information for the dataset.

    :param source: source name (not used by the delegate; kept for interface parity).
    :param source_id: internal identifier (not used by the delegate).
    :param doi: dataset identifier on Huggingface.
    :return: dataset, or None when retrieval failed.
    """
    handler = HuggingFaceDatasets()
    return handler.get_resource(doi)

0 commit comments

Comments
 (0)