Skip to content

Commit 0f51876

Browse files
authored
Merge pull request #350 from semantic-systems/develop
Push new updates to main
2 parents 81de3a9 + 41953e9 commit 0f51876

18 files changed

+1727
-1061
lines changed

.github/workflows/main.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ jobs:
3030
echo "CLIENT_ID_ORCID=${{ secrets.CLIENT_ID_ORCID }}" >> ./.env
3131
echo "CLIENT_SECRET_ORCID=${{ secrets.CLIENT_SECRET_ORCID }}" >> ./.env
3232
echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> ./.env
33+
echo "DASHBOARD_USERNAME=${{ secrets.DASHBOARD_USERNAME }}" >> ./.env
34+
echo "DASHBOARD_PASSWORD=${{ secrets.DASHBOARD_PASSWORD }}" >> ./.env
3335
echo "LLAMA3_USERNAME=${{ secrets.LLAMA3_USERNAME }}" >> ./.env
3436
echo "LLAMA3_PASSWORD=${{ secrets.LLAMA3_PASSWORD }}" >> ./.env
3537
echo "ELASTIC_SERVER=${{ secrets.ELASTIC_SERVER }}" >> ./.env

config.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ class Config:
122122
},
123123
"module": "huggingface_models",
124124
"search-endpoint": f"https://huggingface.co/api/models?limit={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&search=",
125+
"get-resource-endpoint": f"https://huggingface.co/api/models/",
125126
},
126127
"Huggingface - Datasets": {
127128
"logo": {
@@ -133,6 +134,7 @@ class Config:
133134
},
134135
"module": "huggingface_datasets",
135136
"search-endpoint": f"https://huggingface.co/api/datasets?limit={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&search=",
137+
"get-resource-endpoint": f"https://huggingface.co/api/datasets/",
136138
},
137139
"OPENAIRE - Products": {
138140
"logo": {
@@ -181,6 +183,18 @@ class Config:
181183
"get-publication-endpoint": "https://api.crossref.org/works/",
182184
"get-publication-references-endpoint": "https://api.crossref.org/works/",
183185
},
186+
"DataCite": {
187+
"logo": {
188+
"name": "DataCite",
189+
"link": "https://datacite.org/",
190+
"src": "DataCite-Logo.png",
191+
"width": "w-100",
192+
"height": "h-100",
193+
},
194+
"module": "datacite",
195+
"search-endpoint": "https://api.datacite.org/dois?query=titles.title:",
196+
"get-publication-endpoint": "https://api.datacite.org/dois/"
197+
},
184198
"SEMANTIC SCHOLAR - Publications": {
185199
"logo": {
186200
"name": "SEMANTIC SCHOLAR",

sources/__init__.py

Whitespace-only changes.

sources/base.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
# sources/base.py
from abc import ABC, abstractmethod
from typing import Iterable, Dict, Any


class BaseSource(ABC):
    """Abstract interface for a searchable data source.

    Concrete sources implement a three-stage pipeline — ``fetch`` the raw
    JSON, ``extract_hits`` from it, ``map_hit`` each hit to a result
    object — and expose the whole flow through ``search``.

    NOTE(review): concrete implementations in this change set (CORDIS,
    CORE) declare ``fetch(self, search_term, failed_sources)`` and
    ``map_hit(self, hit)``, which drift from the abstract signatures
    below — confirm which shape is intended.
    """

    @abstractmethod
    def fetch(self, search_term: str) -> Dict[str, Any]:
        """Fetch raw JSON from the source using the given search term."""

    @abstractmethod
    def extract_hits(self, raw: Dict[str, Any]) -> Iterable[Dict[str, Any]]:
        """Return an iterable of hit dicts taken from the raw JSON response."""

    @abstractmethod
    def map_hit(self, source_name: str, hit: Dict[str, Any]):
        """Map a single hit dict to an object from objects.py (e.g. Article)."""

    @abstractmethod
    def search(self, source_name: str, search_term: str, results: dict, failed_sources: list) -> None:
        """Fetch, extract, and map hits, inserting them in-place into ``results``."""

sources/cordis.py

Lines changed: 62 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,46 @@
11
from objects import thing, Project, Author
22
from sources import data_retriever
3+
from typing import Iterable, Dict, Any, List
34
import utils
45
from main import app
56

6-
@utils.handle_exceptions
7-
def search(source: str, search_term: str, results, failed_sources):
8-
search_term = f"({search_term})"
9-
search_result = data_retriever.retrieve_data(source=source,
10-
base_url=app.config['DATA_SOURCES'][source].get('search-endpoint', ''),
11-
search_term=search_term,
12-
failed_sources=failed_sources)
13-
total_records_found = search_result.get('result', {}).get('header', {}).get('totalHits', 0)
14-
total_records_pulled = search_result.get('result', {}).get('header', {}).get('numHits', 0)
15-
utils.log_event(type="info", message=f"{source} - {total_records_found} records matched; pulled top {total_records_pulled}")
16-
17-
hits = search_result.get('hits', {}).get('hit', [])
18-
for hit in hits:
7+
from sources.base import BaseSource
8+
9+
class CORDIS(BaseSource):
10+
11+
SOURCE = 'CORDIS'
12+
13+
@utils.handle_exceptions
def fetch(self, search_term: str, failed_sources) -> Dict[str, Any]:
    """Query the CORDIS search endpoint and return the raw JSON payload.

    The term is wrapped in parentheses before being sent; failures inside
    the retriever record this source in ``failed_sources``.
    """
    wrapped_term = f"({search_term})"
    endpoint = app.config['DATA_SOURCES'][self.SOURCE].get('search-endpoint', '')

    search_result = data_retriever.retrieve_data(
        source=self.SOURCE,
        base_url=endpoint,
        search_term=wrapped_term,
        failed_sources=failed_sources,
    )

    # log how many records matched vs. how many the endpoint returned
    header = search_result.get('result', {}).get('header', {})
    matched = header.get('totalHits', 0)
    pulled = header.get('numHits', 0)
    utils.log_event(type="info", message=f"{self.SOURCE} - {matched} records matched; pulled top {pulled}")

    return search_result
28+
29+
30+
@utils.handle_exceptions
def extract_hits(self, raw: Dict[str, Any]) -> Iterable[Dict[str, Any]]:
    """Return the hit dicts from a raw CORDIS response.

    Missing intermediate keys yield an empty list instead of raising.
    """
    hit_container = raw.get('hits', {})
    return hit_container.get('hit', [])
36+
37+
38+
@utils.handle_exceptions
39+
def map_hit(self, hit: Dict[str, Any]):
40+
"""
41+
Map a single hit dict from the source to a object from objects.py (e.g., Article, CreativeWork).
42+
"""
43+
1944
if isinstance(hit, dict):
2045
projectNode = hit.get('project', {})
2146
type = projectNode.get('contenttype', '')
@@ -51,11 +76,33 @@ def search(source: str, search_term: str, results, failed_sources):
5176
project.inLanguage.append(languages)
5277

5378
_source = thing()
54-
_source.name = 'CORDIS'
79+
_source.name = self.SOURCE
5580
_source.identifier = projectNode.get('id', '')
5681
_source.url = project.url
5782
project.source.append(_source)
5883

84+
return project
85+
86+
return None
87+
88+
89+
@utils.handle_exceptions
def search(self, source_name: str, search_term: str, results: dict, failed_sources: list) -> None:
    """Run the fetch -> extract -> map pipeline for CORDIS.

    Every hit that maps to a truthy project object is appended in-place
    to ``results['projects']``.
    """
    raw = self.fetch(search_term, failed_sources)

    for hit in self.extract_hits(raw):
        project = self.map_hit(hit)
        if project:
            results['projects'].append(project)
61-
102+
103+
@utils.handle_exceptions
def search(source: str, search_term: str, results, failed_sources):
    """Module-level entry point: delegate a CORDIS search to the class.

    CORDIS hits are mapped to Project objects and collected under
    ``results['projects']``.
    """
    CORDIS().search(source, search_term, results, failed_sources)

sources/core.py

Lines changed: 115 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,90 +1,137 @@
11
from objects import thing, Article, Author, Organization
22
from sources import data_retriever
33
from config import Config
4+
from typing import Iterable, Dict, Any, List
45
import utils
56
import requests
67
from main import app
78

8-
@utils.handle_exceptions
9-
def search(source: str, search_term: str, results, failed_sources):
9+
from sources.base import BaseSource
1010

11-
# we cannot use data_retriever.retrieve_data here because we need to send the request with an API key in the header
12-
# learn more: https://api.core.ac.uk/docs/v3#tag/Search
13-
limit = Config.NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT
14-
api_url = f'https://api.core.ac.uk/v3/search/works/?limit={limit}&q={search_term}&_exists_:doi'
15-
headers = {"Authorization":"Bearer " + Config.CORE_API_KEY}
1611

17-
# send the request
18-
r = requests.get(api_url, headers=headers)
19-
r.raise_for_status()
20-
search_results = r.json()
12+
class CORE(BaseSource):
2113

22-
hits = search_results['results']
23-
total_hits = search_results['totalHits']
24-
total_results = len(hits)
14+
SOURCE = 'CORE'
2515

26-
utils.log_event(type="info", message=f"{source} - {total_hits} records matched; pulled top {total_results}")
27-
print(f"{source} - {total_hits} records matched; pulled top {total_results}")
16+
@utils.handle_exceptions
def fetch(self, search_term: str, failed_sources) -> Dict[str, Any]:
    """Fetch raw JSON from the CORE v3 works search endpoint.

    We cannot use data_retriever.retrieve_data here because the request
    must carry an API key in the Authorization header.
    Learn more: https://api.core.ac.uk/docs/v3#tag/Search

    Returns the parsed JSON dict on HTTP 200; otherwise records this
    source in ``failed_sources`` and returns None.
    """
    limit = Config.NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT
    # NOTE(review): search_term is interpolated without URL-encoding; terms
    # containing '&', '#' or spaces may corrupt the query string — confirm
    # callers pre-sanitize, or encode here.
    # NOTE(review): '&_exists_:doi' is sent as a bare query parameter; the
    # CORE docs express exists-filters inside 'q' (e.g. 'q=... AND
    # _exists_:doi') — verify the filter actually takes effect.
    api_url = f'https://api.core.ac.uk/v3/search/works/?limit={limit}&q={search_term}&_exists_:doi'
    headers = {"Authorization": "Bearer " + Config.CORE_API_KEY}

    # send the request; without a timeout a stalled connection would hang
    # the worker indefinitely, so cap it (failure is caught by the
    # handle_exceptions decorator like any other request error)
    response = requests.get(api_url, headers=headers, timeout=30)

    if response.status_code == 200:
        return response.json()

    failed_sources.append(self.SOURCE)
    return None
4936

50-
publication.encoding_contentUrl = hit.get("downloadUrl", "")
5137

52-
# publications may not always have a DOI!
53-
# if we don't find one, we do NOT create a result object for the hit
54-
if not hit.get("doi", None):
55-
print("No DOI found for publication:", publication.name)
56-
return None
38+
@utils.handle_exceptions
def extract_hits(self, raw: Dict[str, Any]) -> Iterable[Dict[str, Any]]:
    """Pull the hit list out of a raw CORE response and log the counts.

    Expects the CORE payload keys 'results' and 'totalHits' to be present.
    """
    hits = raw['results']
    total_hits = raw['totalHits']
    pulled = len(hits)

    utils.log_event(type="info", message=f"{self.SOURCE} - {total_hits} records matched; pulled top {pulled}")

    return hits
51+
52+
53+
@utils.handle_exceptions
def map_hit(self, hit: Dict[str, Any]):
    """Map one CORE hit dict to an Article, or return None when it has no DOI.

    Fills in title, display URL, download URL, DOI, date, language,
    abstract, citation count, publisher, authors, and the source record.
    """
    publication = Article()
    publication.additionalType = hit.get("documentType", "")
    publication.name = hit.get("title", "")

    # go through the links and find the one with type: display
    links = hit.get("links", {})
    for link in links:
        if link.get("type", "") == "display":
            publication.url = link.get("url", "")
            break

    publication.encoding_contentUrl = hit.get("downloadUrl", "")

    # publications may not always have a DOI!
    # if we don't find one, we do NOT create a result object for the hit
    if not hit.get("doi", None):
        # use the project logger instead of a bare debug print, consistent
        # with the logging used elsewhere in this module
        utils.log_event(type="info", message=f"{self.SOURCE} - no DOI found for publication: {publication.name}")
        return None

    publication.identifier = hit.get("doi", "")
    publication.datePublished = hit.get("publishedDate", "")
    publication.inLanguage.append(hit.get("language", {}).get("code", ""))

    # abstracts may be missing or explicitly null; normalize to ""
    abstract = hit.get("abstract") or ""

    publication.description = utils.remove_html_tags(abstract)
    publication.abstract = publication.description

    publication.citationCount = hit.get("citationCount", "")

    if hit.get("publisher", ""):
        _publisher = Organization()
        _publisher.name = hit.get("publisher", "")
        publication.publisher = _publisher

    authors = hit.get("authors", [])
    for author in authors:
        _author = Author()
        _author.additionalType = 'Person'
        _author.name = author.get("name", "")
        publication.author.append(_author)

    # attach provenance so the UI can show where this record came from
    _source = thing()
    _source.name = self.SOURCE
    _source.identifier = publication.identifier
    _source.url = publication.url
    publication.source.append(_source)

    return publication
111+
66112

67-
publication.description = utils.remove_html_tags(abstract)
68-
publication.abstract = publication.description
113+
@utils.handle_exceptions
def search(self, source_name: str, search_term: str, results: dict, failed_sources: list) -> None:
    """Fetch from CORE, map hits to Articles, and append them to results.

    Articles are inserted in-place into ``results['publications']``.
    """
    raw = self.fetch(search_term, failed_sources)

    # fetch returns None when the HTTP call failed (the source is already
    # recorded in failed_sources) — nothing to map in that case.
    # 'is None', not '== None': identity test for the None sentinel (PEP 8)
    if raw is None:
        return

    hits = self.extract_hits(raw)

    for hit in hits:
        digitalObj = self.map_hit(hit)

        # we only create a result object if we found a DOI, otherwise None
        if digitalObj:
            results['publications'].append(digitalObj)
131+
132+
@utils.handle_exceptions
def search(source: str, search_term: str, results, failed_sources):
    """Module-level entry point to search CORE publications."""
    CORE().search(source, search_term, results, failed_sources)

0 commit comments

Comments
 (0)