
Commit d0aae5b

Merge remote-tracking branch 'origin/main' into project/addsimplefinetune

2 parents: ab22dc7 + a0eee84

35 files changed: +5244 -26 lines

.gitignore

Lines changed: 14 additions & 2 deletions

```diff
@@ -150,6 +150,18 @@ zencoder/cloned_public_repos
 llm-lora-finetuning/ckpt/
 llm-lora-finetuning/data_generation/
 llm-lora-finetuning/datagen/
-nohup.out
+fiftyone-ls-demo/
+llm-lora-finetuning/mistral-zenml-finetune/
 .flashrank_cache
-
+bge-base-financial-matryoshka/
+embeddings
+llm-lora-finetuning/meta-llama/
+llm-lora-finetuning/microsoft/
+llm-lora-finetuning/unsloth/
+llm-lora-finetuning/configs/shopify.yaml
+finetuned-matryoshka/
+finetuned-all-MiniLM-L6-v2/
+finetuned-snowflake-arctic-embed-m/
+
+# ollama ignores
+nohup.out
```

README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -73,7 +73,7 @@ A list of updated and maintained projects by the ZenML team and the community:
 | [LLM RAG Pipeline with Langchain and OpenAI](llm-agents/) | NLP, LLMs | `slack` `langchain` `llama_index` |
 | [Orbit User Analysis](orbit-user-analysis) | Data Analysis, Tabular | - |
 | [Huggingface to Sagemaker](huggingface-sagemaker) | NLP | `pytorch` `mlflow` `huggingface` `aws` `s3` `kubeflow` `slack` `github` |
-| [Complete Guide to LLMs (from RAG to finetuning)](llm-complete-guide) | NLP, LLMs | `openai` `supabase` |
+| [Complete Guide to LLMs (from RAG to finetuning)](llm-complete-guide) | NLP, LLMs, embeddings, finetuning | `openai` `supabase` `huggingface` `argilla` |
 | [LLM LoRA Finetuning (Phi3 and Llama 3.1)](llm-lora-finetuning) | NLP, LLMs | `gcp` |
 | [ECP Price Prediction with GCP Cloud Composer](airflow-cloud-composer-etl-feature-train/README.md) | Regression, Airflow | `cloud-composer` `airflow` |
 | [Simple LLM finetuning with Lightning Studio](simple-llm-finetuning/README.md) | Lightning AI Studio, LLMs | `cloud-composer` `airflow` |
```

llm-complete-guide/README.md

Lines changed: 47 additions & 1 deletion

```diff
@@ -116,7 +116,7 @@ Note that Claude will require a different API key from Anthropic. See [the
 `litellm` docs](https://docs.litellm.ai/docs/providers/anthropic) on how to set
 this up.
 
-### Run the evaluation pipeline
+### Run the LLM RAG evaluation pipeline
 
 To run the evaluation pipeline, you can use the following command:
 
@@ -127,6 +127,52 @@ python run.py --evaluation
 You'll need to have first run the RAG pipeline to have the necessary assets in
 the database to evaluate.
 
+## Embeddings finetuning
+
+For embeddings finetuning we first generate synthetic data and then finetune the
+embeddings. Both of these pipelines are described in [the LLMOps guide](https://docs.zenml.io/v/docs/user-guide/llmops-guide/finetuning-embeddings) and
+instructions for how to run them are provided below.
+
+### Run the `distilabel` synthetic data generation pipeline
+
+To run the `distilabel` synthetic data generation pipeline, you can use the following commands:
+
+```shell
+pip install -r requirements-argilla.txt # special requirements
+python run.py --synthetic
+```
+
+You will also need to have set up and connected to an Argilla instance for this
+to work. Please follow the instructions in the [Argilla
+documentation](https://docs.argilla.io/latest/getting_started/quickstart/)
+to set up and connect to an Argilla instance on the Hugging Face Hub. [ZenML's
+Argilla integration
+documentation](https://docs.zenml.io/v/docs/stack-components/annotators/argilla)
+will guide you through the process of connecting to your instance as a stack
+component.
+
+### Finetune the embeddings
+
+To run the pipeline for finetuning the embeddings, you can use the following
+commands:
+
+```shell
+pip install -r requirements-argilla.txt # special requirements
+python run.py --embeddings
+```
+
+As with the previous pipeline, you will need to have set up and connected to an Argilla instance for this
+to work. Please follow the instructions in the [Argilla
+documentation](https://docs.argilla.io/latest/getting_started/quickstart/)
+to set up and connect to an Argilla instance on the Hugging Face Hub. [ZenML's
+Argilla integration
+documentation](https://docs.zenml.io/v/docs/stack-components/annotators/argilla)
+will guide you through the process of connecting to your instance as a stack
+component.
+
+*Credit to Phil Schmid for his [tutorial on embeddings finetuning with Matryoshka
+loss function](https://www.philschmid.de/fine-tune-embedding-model-for-rag) which we adapted for this project.*
+
 ## ☁️ Running in your own VPC
 
 The basic RAG pipeline will run using a local stack, but if you want to improve
```
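
For orientation, the `--synthetic` flag above runs a distilabel pipeline that turns documentation chunks into synthetic queries for embeddings training. Below is a minimal sketch of what such a pipeline can look like with the distilabel 1.x API; it is illustrative only, and the dataset id, column mapping, and step wiring are assumptions rather than the project's actual code.

```python
# Illustrative sketch of a distilabel synthetic-query pipeline (distilabel 1.x).
# NOTE: not the repository's actual --synthetic pipeline; the repo_id and the
# "page_content" column name are assumptions.
from distilabel.llms import OpenAILLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromHub
from distilabel.steps.tasks import GenerateSentencePair

with Pipeline(name="generate-embedding-queries") as pipeline:
    # Load documentation chunks; GenerateSentencePair reads an "anchor" column.
    load_chunks = LoadDataFromHub(
        repo_id="zenml/rag_qa_embedding_questions_0_60_0",  # assumed dataset
        output_mappings={"page_content": "anchor"},  # assumed source column
    )
    # For each chunk, generate a query that the chunk answers, plus a hard
    # negative (triplet=True) for contrastive finetuning.
    generate_queries = GenerateSentencePair(
        triplet=True,
        action="query",
        llm=OpenAILLM(model="gpt-4o"),
    )
    load_chunks >> generate_queries

if __name__ == "__main__":
    distiset = pipeline.run(use_cache=False)
    print(distiset)
```

With `triplet=True` the task emits (anchor, positive, negative) rows, which is the shape typically consumed by contrastive embeddings finetuning and by an optional Argilla annotation pass.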

llm-complete-guide/__init__.py

Whitespace-only changes.

llm-complete-guide/constants.py

Lines changed: 40 additions & 1 deletion

```diff
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 
-
 # Vector Store constants
 CHUNK_SIZE = 2000
 CHUNK_OVERLAP = 50
@@ -35,3 +34,43 @@
     "claude3": "claude-3-opus-20240229",
     "claudehaiku": "claude-3-haiku-20240307",
 }
+
+# CHUNKING_METHOD = "split-by-document"
+CHUNKING_METHOD = "split-by-header"
+DATASET_NAME = f"zenml/rag_qa_embedding_questions_{CHUNKING_METHOD}"
+MODEL_PATH = "all-MiniLM-L6-v2"
+# MODEL_PATH = "embedding-data/distilroberta-base-sentence-transformer"
+NUM_EPOCHS = 30
+WARMUP_STEPS = 0.1  # 10% of train data
+NUM_GENERATIONS = 2
+EVAL_BATCH_SIZE = 64
+
+DUMMY_DATASET_NAME = "embedding-data/sentence-compression"
+# DUMMY_MODEL_PATH = "embedding-data/distilroberta-base-sentence-transformer"
+DUMMY_MODEL_PATH = "all-MiniLM-L6-v2"
+DUMMY_EPOCHS = 10
+
+# Markdown Loader constants
+FILES_TO_IGNORE = [
+    "toc.md",
+]
+
+# embeddings finetuning constants
+EMBEDDINGS_MODEL_NAME_ZENML = "finetuned-zenml-docs-embeddings"
+DATASET_NAME_DEFAULT = "zenml/rag_qa_embedding_questions_0_60_0"
+DATASET_NAME_DISTILABEL = f"{DATASET_NAME_DEFAULT}_distilabel"
+DATASET_NAME_ARGILLA = DATASET_NAME_DEFAULT.replace("zenml/", "")
+OPENAI_MODEL_GEN = "gpt-4o"
+OPENAI_MODEL_GEN_KWARGS_EMBEDDINGS = {
+    "temperature": 0.7,
+    "max_new_tokens": 512,
+}
+EMBEDDINGS_MODEL_ID_BASELINE = "Snowflake/snowflake-arctic-embed-m"
+EMBEDDINGS_MODEL_ID_FINE_TUNED = "finetuned-snowflake-arctic-embed-m"
+EMBEDDINGS_MODEL_MATRYOSHKA_DIMS: list[int] = [
+    384,
+    256,
+    128,
+    64,
+]  # Important: large to small
+USE_ARGILLA_ANNOTATIONS = False
```
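
The `EMBEDDINGS_MODEL_MATRYOSHKA_DIMS` list above only makes sense alongside a Matryoshka-style training loss, which is also why the diff flags its large-to-small ordering as important. Below is a minimal sketch, not the project's actual training step, of how such constants typically feed `sentence-transformers`' `MatryoshkaLoss`; the training pairs shown are hypothetical.

```python
# Illustrative sketch: wiring the constants above into MatryoshkaLoss.
# NOTE: not the project's actual training step; the example pairs are made up.
from torch.utils.data import DataLoader
from sentence_transformers import InputExample, SentenceTransformer, losses

EMBEDDINGS_MODEL_ID_BASELINE = "Snowflake/snowflake-arctic-embed-m"
EMBEDDINGS_MODEL_MATRYOSHKA_DIMS = [384, 256, 128, 64]  # large to small

model = SentenceTransformer(EMBEDDINGS_MODEL_ID_BASELINE)

# Hypothetical (question, matching-chunk) pairs; a real run would pull these
# from the synthetic dataset produced by the distilabel pipeline.
train_examples = [
    InputExample(texts=["How do I run a ZenML pipeline?", "Pipelines are run with..."]),
    InputExample(texts=["What is a stack component?", "A stack component is..."]),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)

# MatryoshkaLoss applies the wrapped base loss to the embedding truncated to
# each listed dimension, so the model stays usable at 64 dims as well as 384.
base_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.MatryoshkaLoss(
    model, base_loss, matryoshka_dims=EMBEDDINGS_MODEL_MATRYOSHKA_DIMS
)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=10,
)
```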

llm-complete-guide/data/test_dataset.json

Lines changed: 166 additions & 0 deletions
Large diffs are not rendered by default.
