Skip to content

Commit 33b88fc

Browse files
authored
Merge pull request #334 from DavidBlavid/huggingface_readme
Request Huggingface Model README
2 parents 776813b + 9406002 commit 33b88fc

File tree

1 file changed

+22
-9
lines changed

1 file changed

+22
-9
lines changed

sources/huggingface_models.py

Lines changed: 22 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,15 @@
1-
"""Simple wrappers for Hugging Face model search/detail retrieval that mirror
2-
Dataset‑handlers style (single mapping helper + thin search/get functions).
3-
"""
41
from __future__ import annotations
52

63
from objects import thing, CreativeWork, Author
74
from sources import data_retriever
85
import utils
96
from main import app
107

8+
import requests
119

12-
def map_entry_to_model(record) -> CreativeWork:
13-
"""Convert a single Huggingface model record into a :class:`CreativeWork`."""
10+
11+
def map_entry_to_model(record, request_readme: bool = False) -> CreativeWork:
12+
"""Convert a single Huggingface model record into a `CreativeWork` object."""
1413

1514
model = CreativeWork() # thing -> CreativeWork
1615

@@ -19,7 +18,20 @@ def map_entry_to_model(record) -> CreativeWork:
1918
model.additionalType = "MODEL"
2019
model.url = f"https://huggingface.co/{model.name}"
2120

22-
model.description = utils.remove_html_tags(record.get("description", ""))
21+
# model descriptions are usually contained in a README file, which we will request separately
22+
if request_readme:
23+
readme_url = f"https://huggingface.co/{model.name}/raw/main/README.md"
24+
try:
25+
response = requests.get(readme_url, timeout=5)
26+
if response.status_code == 200:
27+
model.description = utils.remove_html_tags(response.text)
28+
else:
29+
model.description = utils.remove_html_tags(record.get("description", ""))
30+
except requests.RequestException:
31+
model.description = utils.remove_html_tags(record.get("description", ""))
32+
else:
33+
model.description = utils.remove_html_tags(record.get("description", ""))
34+
2335
model.abstract = model.description
2436
model.dateCreated = record.get("createdAt", "")
2537
model.datePublished = model.dateCreated
@@ -59,7 +71,7 @@ def map_entry_to_model(record) -> CreativeWork:
5971

6072
@utils.handle_exceptions
6173
def search(source: str, search_term: str, results, failed_sources):
62-
"""Populate *results['resources']* with models matching *search_term*."""
74+
"""Populate results['resources'] with models matching *search_term*."""
6375
search_result = data_retriever.retrieve_data(
6476
source=source,
6577
base_url=app.config["DATA_SOURCES"][source].get("search-endpoint", ""),
@@ -74,7 +86,8 @@ def search(source: str, search_term: str, results, failed_sources):
7486
utils.log_event(type="info", message=f"{source} - {total_hits} records matched")
7587

7688
for hit in search_result:
77-
model = map_entry_to_model(hit)
89+
# here we do not request the README to keep search fast and API volume low
90+
model = map_entry_to_model(hit, request_readme=False)
7891
results["resources"].append(model)
7992

8093

@@ -89,7 +102,7 @@ def get_resource(source: str, source_id: str, identifier: str):
89102
)
90103

91104
if search_result:
92-
model = map_entry_to_model(search_result)
105+
model = map_entry_to_model(search_result, request_readme=True)
93106
utils.log_event(type="info", message=f"{source} - retrieved model details")
94107
return model
95108
else:

0 commit comments

Comments
 (0)