Commit 67ac777

update
1 parent 28d0018 commit 67ac777

4 files changed: 28 additions & 38 deletions

README.md

Lines changed: 22 additions & 31 deletions
@@ -1,21 +1,18 @@
-
 <div style="display:flex; text-align:center; justify-content:center;">
 <img src="https://huggingface.co/front/assets/huggingface_logo.svg" width="100"/>
 <h1 style="margin-top:auto;"> Hugging Face Inference Toolkit </h1>
 </div>
 
-Hugging Face Inference Toolkit is for serving 🤗 Transformers models in containers. This library provides default pre-processing, predict, and post-processing steps for Transformers and Sentence Transformers models. It is also possible to define a custom `handler.py` for customization. The Toolkit is built to work with the [Hugging Face Hub](https://huggingface.co/models).
-
----
+Hugging Face Inference Toolkit is for serving 🤗 Transformers models in containers. This library provides default pre-processing, predict, and post-processing steps for Transformers and Sentence Transformers models. It is also possible to define a custom `handler.py` for customization. The Toolkit is built to work with the [Hugging Face Hub](https://huggingface.co/models) and is used as the "default" option in [Inference Endpoints](https://ui.endpoints.huggingface.co/).
 
-## 💻 Getting Started with Hugging Face Inference Toolkit
+## 💻 Getting Started with Hugging Face Inference Toolkit
 
-* Clone the repository `git clone <https://github.com/huggingface/huggingface-inference-toolkit``>
-* Install the dependencies in dev mode `pip install -e ".[torch,st,diffusers,test,quality]"`
-* If you develop on AWS Inferentia2, install with `pip install -e ".[test,quality]" optimum-neuron[neuronx] --upgrade`
-* If you develop on Google Cloud, install with `pip install -e ".[torch,st,diffusers,google,test,quality]"`
-* Unit Testing: `make unit-test`
-* Integration testing: `make integ-test`
+- Clone the repository `git clone https://github.com/huggingface/huggingface-inference-toolkit`
+- Install the dependencies in dev mode `pip install -e ".[torch,st,diffusers,test,quality]"`
+- If you develop on AWS Inferentia2, install with `pip install -e ".[test,quality]" optimum-neuron[neuronx] --upgrade`
+- If you develop on Google Cloud, install with `pip install -e ".[torch,st,diffusers,google,test,quality]"`
+- Unit Testing: `make unit-test`
+- Integration testing: `make integ-test`
 
 ### Local run
 
@@ -68,18 +65,18 @@ curl --request POST \
 
 The Hugging Face Inference Toolkit allows users to provide custom inference logic through a `handler.py` file located in the model repository.
 
-For an example check [philschmid/custom-pipeline-text-classification](https://huggingface.co/philschmid/custom-pipeline-text-classification):
+For an example check [philschmid/custom-pipeline-text-classification](https://huggingface.co/philschmid/custom-pipeline-text-classification):
 
 ```bash
 model.tar.gz/
 |- pytorch_model.bin
 |- ....
 |- handler.py
-|- requirements.txt
+|- requirements.txt
 ```
 
 In this example, `pytorch_model.bin` is the model file saved from training, `handler.py` is the custom inference handler, and `requirements.txt` is a requirements file to add additional dependencies.
-The custom module can override the following methods:
+The custom module can override the following methods:
 
 ### Vertex AI Support
 
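For illustration, a minimal `handler.py` could look like the sketch below, assuming the `EndpointHandler` convention used by Inference Endpoints custom handlers; the `text-classification` pipeline and the payload shape are assumptions for this example:

```python
# handler.py -- a minimal custom handler sketch, assuming the EndpointHandler
# convention used by Inference Endpoints custom handlers.
from typing import Any, Dict, List

from transformers import pipeline


class EndpointHandler:
    def __init__(self, path: str = ""):
        # `path` is the directory containing the model weights, i.e. the
        # unpacked model.tar.gz shown above.
        self.pipeline = pipeline("text-classification", model=path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        # `data` is the deserialized request body, e.g. {"inputs": "I like you."}.
        inputs = data.pop("inputs", data)
        return self.pipeline(inputs)
```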
@@ -136,9 +133,9 @@ curl --request POST \
 
 The Hugging Face Inference Toolkit supports deploying Hugging Face models on AWS Inferentia2. To deploy a model on Inferentia2 you have three options:
 
-* Provide `HF_MODEL_ID`, the model repo id on huggingface.co which contains the compiled model in `.neuron` format, e.g. `optimum/bge-base-en-v1.5-neuronx`
-* Provide the `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH` environment variables to compile the model on the fly, e.g. `HF_OPTIMUM_BATCH_SIZE=1 HF_OPTIMUM_SEQUENCE_LENGTH=128`
-* Include a `neuron` dictionary in the [config.json](https://huggingface.co/optimum/tiny_random_bert_neuron/blob/main/config.json) file in the model archive, e.g. `neuron: {"static_batch_size": 1, "static_sequence_length": 128}`
+- Provide `HF_MODEL_ID`, the model repo id on huggingface.co which contains the compiled model in `.neuron` format, e.g. `optimum/bge-base-en-v1.5-neuronx`
+- Provide the `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH` environment variables to compile the model on the fly, e.g. `HF_OPTIMUM_BATCH_SIZE=1 HF_OPTIMUM_SEQUENCE_LENGTH=128`
+- Include a `neuron` dictionary in the [config.json](https://huggingface.co/optimum/tiny_random_bert_neuron/blob/main/config.json) file in the model archive, e.g. `neuron: {"static_batch_size": 1, "static_sequence_length": 128}`
 
 The currently supported tasks can be found [here](https://huggingface.co/docs/optimum-neuron/en/package_reference/supported_models). If you plan to deploy an LLM, we recommend taking a look at [Neuronx TGI](https://huggingface.co/blog/text-generation-inference-on-inferentia2), which is purpose-built for LLMs.
 
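To make the third option concrete, here is a small sketch that writes the `neuron` entry into a model's `config.json`; the file path and shape values are illustrative, while the keys come from the example above:

```python
# Sketch: add the static-shape `neuron` entry to a model's config.json
# (option three above). The path and shape values are illustrative.
import json

config_path = "model/config.json"  # hypothetical local model archive

with open(config_path) as f:
    config = json.load(f)

config["neuron"] = {"static_batch_size": 1, "static_sequence_length": 128}

with open(config_path, "w") as f:
    json.dump(config, f, indent=2)
```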
@@ -148,14 +145,14 @@ Start Hugging Face Inference Toolkit with the following environment variables.
 
 _Note: You need to run this on an Inferentia2 instance._
 
-* transformers `text-classification` with `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH`
+- transformers `text-classification` with `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH`
 
 ```bash
 mkdir tmp2/
 HF_MODEL_ID="distilbert/distilbert-base-uncased-finetuned-sst-2-english" HF_TASK="text-classification" HF_OPTIMUM_BATCH_SIZE=1 HF_OPTIMUM_SEQUENCE_LENGTH=128 HF_MODEL_DIR=tmp2 uvicorn src.huggingface_inference_toolkit.webservice_starlette:app --port 5000
 ```
 
-* sentence transformers `feature-extraction` with `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH`
+- sentence transformers `feature-extraction` with `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH`
 
 ```bash
 HF_MODEL_ID="sentence-transformers/all-MiniLM-L6-v2" HF_TASK="feature-extraction" HF_OPTIMUM_BATCH_SIZE=1 HF_OPTIMUM_SEQUENCE_LENGTH=128 HF_MODEL_DIR=tmp2 uvicorn src.huggingface_inference_toolkit.webservice_starlette:app --port 5000
 ```
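Either server can then be queried from a client. A quick sketch, assuming the root route on port 5000 accepts the `{"inputs": ...}` payload used by the curl examples earlier in the README:

```python
# Sketch: query the locally started toolkit server. Assumes port 5000 as in
# the commands above and the {"inputs": ...} payload format used by the
# curl examples earlier in the README.
import requests

response = requests.post(
    "http://localhost:5000",
    json={"inputs": "This toolkit is easy to use."},
)
# e.g. an embedding (nested list of floats) for feature-extraction
print(response.json())
```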
@@ -284,19 +281,13 @@ HF_OPTIMUM_SEQUENCE_LENGTH="128"
 
 ## ⚙ Supported Front-Ends
 
-* [x] Starlette (HF Endpoints)
-* [x] Starlette (Vertex AI)
-* [ ] Starlette (Azure ML)
-* [ ] Starlette (SageMaker)
-
----
-
-## 🤝 Contributing
-
----
+- [x] Starlette (HF Endpoints)
+- [x] Starlette (Vertex AI)
+- [ ] Starlette (Azure ML)
+- [ ] Starlette (SageMaker)
 
-## 📜 License
+## 📜 License
 
-TBD.
+This project is licensed under the Apache-2.0 License.
 
 ---

pyproject.toml

Lines changed: 5 additions & 5 deletions
@@ -4,15 +4,15 @@ no_implicit_optional = true
 scripts_are_modules = true
 
 [tool.ruff]
-lint.select = [
+select = [
     "E", # pycodestyle errors
    "W", # pycodestyle warnings
    "F", # pyflakes
    "I", # isort
    "C", # flake8-comprehensions
    "B", # flake8-bugbear
 ]
-lint.ignore = [
+ignore = [
    "E501", # Line length (handled by ruff-format)
    "B008", # do not perform function calls in argument defaults
    "C901", # too complex

@@ -21,13 +21,13 @@ lint.ignore = [
 line-length = 119
 
 # Allow unused variables when underscore-prefixed.
-lint.dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
+dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
 
 # Assume Python 3.11.
 target-version = "py311"
 
-lint.per-file-ignores = {"__init__.py" = ["F401"]}
+per-file-ignores = { "__init__.py" = ["F401"] }
 
 [tool.isort]
 profile = "black"
-known_third_party = ["transformers", "starlette", "huggingface_hub"]
+known_third_party = ["transformers", "starlette", "huggingface_hub"]

setup.cfg

Lines changed: 0 additions & 1 deletion
@@ -10,7 +10,6 @@ known_third_party =
     datasets
     tensorflow
     torch
-    robyn
 
 line_length = 119
 lines_after_imports = 2

setup.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 # We don't declare our dependency on transformers here because we build with
 # different packages for different variants
 
-VERSION = "0.4.3"
+VERSION = "0.5.0"
 
 # Ubuntu packages
 # libsndfile1-dev: torchaudio requires the development version of the libsndfile package which can be installed via a system package manager. On Ubuntu it can be installed as follows: apt install libsndfile1-dev
