diff --git a/docs/source/de/guides/inference.md b/docs/source/de/guides/inference.md
index e34103ac34..aee64cad14 100644
--- a/docs/source/de/guides/inference.md
+++ b/docs/source/de/guides/inference.md
@@ -107,7 +107,7 @@ Das Ziel von [`InferenceClient`] ist es, die einfachste Schnittstelle zum Ausfü
 | | [Feature Extraction](https://huggingface.co/tasks/feature-extraction) | ✅ | [`~InferenceClient.feature_extraction`] |
 | | [Fill Mask](https://huggingface.co/tasks/fill-mask) | ✅ | [`~InferenceClient.fill_mask`] |
 | | [Question Answering](https://huggingface.co/tasks/question-answering) | ✅ | [`~InferenceClient.question_answering`] |
-| | [Sentence Similarity](https://huggingface.co/tasks/sentence-similarity) | ✅ | [`~InferenceClient.sentence_similarity`] |
+| | [Sentence Similarity](https://huggingface.co/tasks/sentence-similarity) | ✅ | [`~InferenceClient.sentence_similarity`] |
 | | [Summarization](https://huggingface.co/tasks/summarization) | ✅ | [`~InferenceClient.summarization`] |
 | | [Table Question Answering](https://huggingface.co/tasks/table-question-answering) | ✅ | [`~InferenceClient.table_question_answering`] |
 | | [Text Classification](https://huggingface.co/tasks/text-classification) | ✅ | [`~InferenceClient.text_classification`] |
diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py
index 1098d3f606..6f0f6953d7 100644
--- a/src/huggingface_hub/inference/_client.py
+++ b/src/huggingface_hub/inference/_client.py
@@ -35,7 +35,6 @@
 import base64
 import logging
 import re
-import time
 import warnings
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Literal, Optional, Union, overload
 
@@ -301,8 +300,6 @@ def _inner_post(
         if request_parameters.task in TASKS_EXPECTING_IMAGES and "Accept" not in request_parameters.headers:
             request_parameters.headers["Accept"] = "image/png"
 
-        t0 = time.time()
-        timeout = self.timeout
         while True:
             with _open_as_binary(request_parameters.data) as data_as_binary:
                 try:
@@ -326,30 +323,9 @@
             except HTTPError as error:
                 if error.response.status_code == 422 and request_parameters.task != "unknown":
                     msg = str(error.args[0])
-                    print(error.response.text)
                     if len(error.response.text) > 0:
                         msg += f"\n{error.response.text}\n"
-                    msg += f"\nMake sure '{request_parameters.task}' task is supported by the model."
                     error.args = (msg,) + error.args[1:]
-                if error.response.status_code == 503:
-                    # If Model is unavailable, either raise a TimeoutError...
-                    if timeout is not None and time.time() - t0 > timeout:
-                        raise InferenceTimeoutError(
-                            f"Model not loaded on the server: {request_parameters.url}. Please retry with a higher timeout (current:"
-                            f" {self.timeout}).",
-                            request=error.request,
-                            response=error.response,
-                        ) from error
-                    # ...or wait 1s and retry
-                    logger.info(f"Waiting for model to be loaded on the server: {error}")
-                    time.sleep(1)
-                    if "X-wait-for-model" not in request_parameters.headers and request_parameters.url.startswith(
-                        INFERENCE_ENDPOINT
-                    ):
-                        request_parameters.headers["X-wait-for-model"] = "1"
-                    if timeout is not None:
-                        timeout = max(self.timeout - (time.time() - t0), 1)  # type: ignore
-                    continue
                 raise
 
     def audio_classification(
@@ -3261,6 +3237,13 @@ def zero_shot_image_classification(
         response = self._inner_post(request_parameters)
         return ZeroShotImageClassificationOutputElement.parse_obj_as_list(response)
 
+    @_deprecate_method(
+        version="0.33.0",
+        message=(
+            "HF Inference API is getting revamped and will only support warm models in the future (no cold start allowed)."
+            " Use `HfApi.list_models(..., inference_provider='...')` to list warm models per provider."
+        ),
+    )
     def list_deployed_models(
         self, frameworks: Union[None, str, Literal["all"], List[str]] = None
     ) -> Dict[str, List[str]]:
@@ -3444,6 +3427,13 @@ def health_check(self, model: Optional[str] = None) -> bool:
         response = get_session().get(url, headers=build_hf_headers(token=self.token))
         return response.status_code == 200
 
+    @_deprecate_method(
+        version="0.33.0",
+        message=(
+            "HF Inference API is getting revamped and will only support warm models in the future (no cold start allowed)."
+            " Use `HfApi.model_info` to get the model status both with HF Inference API and external providers."
+        ),
+    )
     def get_model_status(self, model: Optional[str] = None) -> ModelStatus:
         """
         Get the status of a model hosted on the HF Inference API.
diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py
index 106909bd73..26b301bbd3 100644
--- a/src/huggingface_hub/inference/_generated/_async_client.py
+++ b/src/huggingface_hub/inference/_generated/_async_client.py
@@ -22,7 +22,6 @@
 import base64
 import logging
 import re
-import time
 import warnings
 from typing import TYPE_CHECKING, Any, AsyncIterable, Dict, List, Literal, Optional, Set, Union, overload
 
@@ -299,8 +298,6 @@ async def _inner_post(
         if request_parameters.task in TASKS_EXPECTING_IMAGES and "Accept" not in request_parameters.headers:
             request_parameters.headers["Accept"] = "image/png"
 
-        t0 = time.time()
-        timeout = self.timeout
         while True:
             with _open_as_binary(request_parameters.data) as data_as_binary:
                 # Do not use context manager as we don't want to close the connection immediately when returning
@@ -331,27 +328,6 @@
                 except aiohttp.ClientResponseError as error:
                     error.response_error_payload = response_error_payload
                     await session.close()
-                    if response.status == 422 and request_parameters.task != "unknown":
-                        error.message += f". Make sure '{request_parameters.task}' task is supported by the model."
-                    if response.status == 503:
-                        # If Model is unavailable, either raise a TimeoutError...
-                        if timeout is not None and time.time() - t0 > timeout:
-                            raise InferenceTimeoutError(
-                                f"Model not loaded on the server: {request_parameters.url}. Please retry with a higher timeout"
-                                f" (current: {self.timeout}).",
-                                request=error.request,
-                                response=error.response,
-                            ) from error
-                        # ...or wait 1s and retry
-                        logger.info(f"Waiting for model to be loaded on the server: {error}")
-                        if "X-wait-for-model" not in request_parameters.headers and request_parameters.url.startswith(
-                            INFERENCE_ENDPOINT
-                        ):
-                            request_parameters.headers["X-wait-for-model"] = "1"
-                        await asyncio.sleep(1)
-                        if timeout is not None:
-                            timeout = max(self.timeout - (time.time() - t0), 1)  # type: ignore
-                        continue
                     raise error
                 except Exception:
                     await session.close()
@@ -3325,6 +3301,13 @@ async def zero_shot_image_classification(
         response = await self._inner_post(request_parameters)
         return ZeroShotImageClassificationOutputElement.parse_obj_as_list(response)
 
+    @_deprecate_method(
+        version="0.33.0",
+        message=(
+            "HF Inference API is getting revamped and will only support warm models in the future (no cold start allowed)."
+            " Use `HfApi.list_models(..., inference_provider='...')` to list warm models per provider."
+        ),
+    )
     async def list_deployed_models(
         self, frameworks: Union[None, str, Literal["all"], List[str]] = None
     ) -> Dict[str, List[str]]:
@@ -3554,6 +3537,13 @@ async def health_check(self, model: Optional[str] = None) -> bool:
         response = await client.get(url, proxy=self.proxies)
         return response.status == 200
 
+    @_deprecate_method(
+        version="0.33.0",
+        message=(
+            "HF Inference API is getting revamped and will only support warm models in the future (no cold start allowed)."
+            " Use `HfApi.model_info` to get the model status both with HF Inference API and external providers."
+        ),
+    )
     async def get_model_status(self, model: Optional[str] = None) -> ModelStatus:
         """
         Get the status of a model hosted on the HF Inference API.
diff --git a/src/huggingface_hub/inference/_providers/hf_inference.py b/src/huggingface_hub/inference/_providers/hf_inference.py
index 25d08aadd1..9ec64a0fed 100644
--- a/src/huggingface_hub/inference/_providers/hf_inference.py
+++ b/src/huggingface_hub/inference/_providers/hf_inference.py
@@ -38,14 +38,7 @@ def _prepare_url(self, api_key: str, mapped_model: str) -> str:
         # hf-inference provider can handle URLs (e.g. Inference Endpoints or TGI deployment)
         if mapped_model.startswith(("http://", "https://")):
             return mapped_model
-
-        return (
-            # Feature-extraction and sentence-similarity are the only cases where we handle models with several tasks.
-            f"{self.base_url}/pipeline/{self.task}/{mapped_model}"
-            if self.task in ("feature-extraction", "sentence-similarity")
-            # Otherwise, we use the default endpoint
-            else f"{self.base_url}/models/{mapped_model}"
-        )
+        return f"{self.base_url}/models/{mapped_model}"
 
     def _prepare_payload(self, inputs: Any, parameters: Dict, mapped_model: str) -> Optional[Dict]:
         if isinstance(inputs, bytes):
diff --git a/tests/cassettes/TestInferenceClient.test_sentence_similarity[hf-inference,sentence-similarity].yaml b/tests/cassettes/TestInferenceClient.test_sentence_similarity[hf-inference,sentence-similarity].yaml
index 246aeccb10..090885b1d2 100644
--- a/tests/cassettes/TestInferenceClient.test_sentence_similarity[hf-inference,sentence-similarity].yaml
+++ b/tests/cassettes/TestInferenceClient.test_sentence_similarity[hf-inference,sentence-similarity].yaml
@@ -17,7 +17,7 @@ interactions:
       X-Amzn-Trace-Id:
       - 0434ff33-56fe-49db-9380-17b81e41f756
     method: POST
-    uri: https://router.huggingface.co/hf-inference/pipeline/sentence-similarity/sentence-transformers/all-MiniLM-L6-v2
+    uri: https://router.huggingface.co/hf-inference/models/sentence-transformers/all-MiniLM-L6-v2
   response:
     body:
       string: '[0.7785724997520447,0.4587624967098236,0.29062220454216003]'
diff --git a/tests/cassettes/test_async_sentence_similarity.yaml b/tests/cassettes/test_async_sentence_similarity.yaml
index 3d975bedfc..6af535ae13 100644
--- a/tests/cassettes/test_async_sentence_similarity.yaml
+++ b/tests/cassettes/test_async_sentence_similarity.yaml
@@ -3,7 +3,7 @@ interactions:
     body: null
     headers: {}
     method: POST
-    uri: https://router.huggingface.co/hf-inference/pipeline/sentence-similarity/sentence-transformers/all-MiniLM-L6-v2
+    uri: https://router.huggingface.co/hf-inference/models/sentence-transformers/all-MiniLM-L6-v2
   response:
     body:
       string: '[0.7785724997520447,0.4587624967098236,0.29062220454216003]'
diff --git a/tests/test_inference_async_client.py b/tests/test_inference_async_client.py
index aa3b2a1ab5..b25a220ca8 100644
--- a/tests/test_inference_async_client.py
+++ b/tests/test_inference_async_client.py
@@ -300,6 +300,7 @@ def test_sync_vs_async_signatures() -> None:
 
 
 @pytest.mark.asyncio
+@pytest.mark.skip("Deprecated (get_model_status)")
 async def test_get_status_too_big_model() -> None:
     model_status = await AsyncInferenceClient(token=False).get_model_status("facebook/nllb-moe-54b")
     assert model_status.loaded is False
@@ -309,6 +310,7 @@ async def test_get_status_too_big_model() -> None:
 
 
 @pytest.mark.asyncio
+@pytest.mark.skip("Deprecated (get_model_status)")
 async def test_get_status_loaded_model() -> None:
     model_status = await AsyncInferenceClient(token=False).get_model_status("bigscience/bloom")
     assert model_status.loaded is True
@@ -318,18 +320,21 @@ async def test_get_status_loaded_model() -> None:
 
 
 @pytest.mark.asyncio
+@pytest.mark.skip("Deprecated (get_model_status)")
 async def test_get_status_unknown_model() -> None:
     with pytest.raises(ClientResponseError):
         await AsyncInferenceClient(token=False).get_model_status("unknown/model")
 
 
 @pytest.mark.asyncio
+@pytest.mark.skip("Deprecated (get_model_status)")
 async def test_get_status_model_as_url() -> None:
     with pytest.raises(NotImplementedError):
         await AsyncInferenceClient(token=False).get_model_status("https://unkown/model")
 
 
 @pytest.mark.asyncio
+@pytest.mark.skip("Deprecated (list_deployed_models)")
 async def test_list_deployed_models_single_frameworks() -> None:
     models_by_task = await AsyncInferenceClient().list_deployed_models("text-generation-inference")
     assert isinstance(models_by_task, dict)
diff --git a/tests/test_inference_client.py b/tests/test_inference_client.py
index 62b658ff54..d61021563b 100644
--- a/tests/test_inference_client.py
+++ b/tests/test_inference_client.py
@@ -869,6 +869,7 @@ def test_accept_header_image(self, get_session_mock: MagicMock, bytes_to_image_m
 
 
 class TestModelStatus(TestBase):
+    @expect_deprecation("get_model_status")
     def test_too_big_model(self) -> None:
         client = InferenceClient(token=False)
         model_status = client.get_model_status("facebook/nllb-moe-54b")
@@ -877,6 +878,7 @@ def test_too_big_model(self) -> None:
         assert model_status.compute_type == "cpu"
         assert model_status.framework == "transformers"
 
+    @expect_deprecation("get_model_status")
     def test_loaded_model(self) -> None:
         client = InferenceClient(token=False)
         model_status = client.get_model_status("bigscience/bloom")
@@ -885,11 +887,13 @@ def test_loaded_model(self) -> None:
         assert isinstance(model_status.compute_type, dict)  # e.g. {'gpu': {'gpu': 'a100', 'count': 8}}
         assert model_status.framework == "text-generation-inference"
 
+    @expect_deprecation("get_model_status")
     def test_unknown_model(self) -> None:
         client = InferenceClient()
         with pytest.raises(HfHubHTTPError):
             client.get_model_status("unknown/model")
 
+    @expect_deprecation("get_model_status")
     def test_model_as_url(self) -> None:
         client = InferenceClient()
         with pytest.raises(NotImplementedError):
@@ -897,16 +901,19 @@ def test_model_as_url(self) -> None:
 
 
 class TestListDeployedModels(TestBase):
+    @expect_deprecation("list_deployed_models")
     @patch("huggingface_hub.inference._client.get_session")
     def test_list_deployed_models_main_frameworks_mock(self, get_session_mock: MagicMock) -> None:
         InferenceClient().list_deployed_models()
         assert len(get_session_mock.return_value.get.call_args_list) == len(MAIN_INFERENCE_API_FRAMEWORKS)
 
+    @expect_deprecation("list_deployed_models")
     @patch("huggingface_hub.inference._client.get_session")
     def test_list_deployed_models_all_frameworks_mock(self, get_session_mock: MagicMock) -> None:
         InferenceClient().list_deployed_models("all")
         assert len(get_session_mock.return_value.get.call_args_list) == len(ALL_INFERENCE_API_FRAMEWORKS)
 
+    @expect_deprecation("list_deployed_models")
     def test_list_deployed_models_single_frameworks(self) -> None:
         models_by_task = InferenceClient().list_deployed_models("text-generation-inference")
         assert isinstance(models_by_task, dict)
diff --git a/utils/generate_async_inference_client.py b/utils/generate_async_inference_client.py
index da494af321..0e9f2cb1e6 100644
--- a/utils/generate_async_inference_client.py
+++ b/utils/generate_async_inference_client.py
@@ -175,8 +175,6 @@ def _rename_to_AsyncInferenceClient(code: str) -> str:
         if request_parameters.task in TASKS_EXPECTING_IMAGES and "Accept" not in request_parameters.headers:
             request_parameters.headers["Accept"] = "image/png"
 
-        t0 = time.time()
-        timeout = self.timeout
         while True:
             with _open_as_binary(request_parameters.data) as data_as_binary:
                 # Do not use context manager as we don't want to close the connection immediately when returning
@@ -205,25 +203,6 @@
                 except aiohttp.ClientResponseError as error:
                     error.response_error_payload = response_error_payload
                     await session.close()
-                    if response.status == 422 and request_parameters.task != "unknown":
-                        error.message += f". Make sure '{request_parameters.task}' task is supported by the model."
-                    if response.status == 503:
-                        # If Model is unavailable, either raise a TimeoutError...
-                        if timeout is not None and time.time() - t0 > timeout:
-                            raise InferenceTimeoutError(
-                                f"Model not loaded on the server: {request_parameters.url}. Please retry with a higher timeout"
-                                f" (current: {self.timeout}).",
-                                request=error.request,
-                                response=error.response,
-                            ) from error
-                        # ...or wait 1s and retry
-                        logger.info(f"Waiting for model to be loaded on the server: {error}")
-                        if "X-wait-for-model" not in request_parameters.headers and request_parameters.url.startswith(INFERENCE_ENDPOINT):
-                            request_parameters.headers["X-wait-for-model"] = "1"
-                        await asyncio.sleep(1)
-                        if timeout is not None:
-                            timeout = max(self.timeout - (time.time() - t0), 1)  # type: ignore
-                        continue
                     raise error
                 except Exception:
                     await session.close()
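Migration note (not part of the patch above): the deprecation messages added in this diff point callers at `HfApi` as the replacement for `InferenceClient.list_deployed_models()` and `InferenceClient.get_model_status()`. A minimal sketch of what that migration could look like, assuming the `inference_provider` filter named in the deprecation message and the `expand=["inference"]` / `ModelInfo.inference` fields behave as in recent `huggingface_hub` releases; the model id below is only an example:

```python
from huggingface_hub import HfApi

api = HfApi()

# Rough replacement for InferenceClient().list_deployed_models():
# list models currently warm on the HF Inference API provider
# (provider name taken from the diff; limit kept small for the example).
for model in api.list_models(inference_provider="hf-inference", limit=5):
    print(model.id)

# Rough replacement for InferenceClient().get_model_status(...):
# ask the Hub whether a given model is warm on the HF Inference API.
info = api.model_info("sentence-transformers/all-MiniLM-L6-v2", expand=["inference"])
print(info.inference)  # expected to be something like "warm" or "cold"
```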