Commit e83b505
Moved secrets into ZenML secrets
1 parent 78e94f2 commit e83b505

11 files changed, +168 -65 lines

llm-complete-guide/README.md

Lines changed: 6 additions & 11 deletions
@@ -49,8 +49,10 @@ Depending on your setup you may run into some issues when running the `pip install`
 In order to use the default LLM for this query, you'll need an account and an
 API key from OpenAI specified as another environment variable:
 
+zenml secret create supabase_postgres_db --password="YOUR_PASSWORD" --user="YOUR_USER" --host="YOUR_HOST" --port="YOUR_PORT"
+
 ```shell
-export OPENAI_API_KEY=<your-openai-api-key>
+zenml secret create openai --api_key=<your-openai-api-key>
 ```
 
 ### Setting up Supabase
@@ -66,22 +68,15 @@ You'll want to save the Supabase database password as a ZenML secret so that it
 isn't stored in plaintext. You can do this by running the following command:
 
 ```shell
-zenml secret create supabase_postgres_db --password="YOUR_PASSWORD"
+zenml secret create supabase_postgres_db --password="YOUR_PASSWORD" --user="YOUR_USER" --host="YOUR_HOST" --port="YOUR_PORT"
 ```
 
-You'll then want to connect to this database instance by getting the connection
+You can get the user, host and port for this database instance by getting the connection
 string from the Supabase dashboard.
 
 ![](.assets/supabase-connection-string.png)
 
-You can use these details to populate some environment variables where the
-pipeline code expects them:
-
-```shell
-export ZENML_POSTGRES_USER=<your-supabase-user>
-export ZENML_POSTGRES_HOST=<your-supabase-host>
-export ZENML_POSTGRES_PORT=<your-supabase-port>
-```
+Alternatively you can use a different database as the backend.
 
 ### Running the RAG pipeline
 
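The secrets created above are read back inside the pipeline code through the ZenML client rather than from plaintext environment variables. A minimal sketch of that lookup, assuming the secret names used in the README (`supabase_postgres_db` and `openai`) exist on the active ZenML server:

```python
# Minimal sketch: reading back the ZenML secrets created in the README.
# The Client().get_secret(...).secret_values pattern is the same one used in
# utils/llm_utils.py below; the secret names are the README's defaults.
from zenml.client import Client

client = Client()

db_secret = client.get_secret("supabase_postgres_db")
openai_secret = client.get_secret("openai")

connection_details = {
    "user": db_secret.secret_values["user"],
    "password": db_secret.secret_values["password"],
    "host": db_secret.secret_values["host"],
    "port": db_secret.secret_values["port"],
}
openai_api_key = openai_secret.secret_values["api_key"]
```
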
llm-complete-guide/configs/rag.yaml

Lines changed: 0 additions & 16 deletions
This file was deleted.
Lines changed: 30 additions & 0 deletions

# environment configuration
settings:
  docker:
    requirements:
      - unstructured
      - sentence-transformers>=3
      - pgvector
      - datasets
      - litellm
      - numpy
      - psycopg2-binary
      - tiktoken
      - ratelimit
    environment:
      ZENML_SUPABASE_SECRET_NAME: alexej_supabase_postgres_db
      ZENML_OPENAI_SECRET_NAME: alexej_openai
      ZENML_ENABLE_RICH_TRACEBACK: FALSE
      ZENML_LOGGING_VERBOSITY: INFO

steps:
  url_scraper:
    parameters:
      docs_url: https://docs.zenml.io
  generate_embeddings:
    step_operator: "terraform-gcp-6c0fd52233ca"
    settings:
      step_operator.vertex:
        accelerator_type: "NVIDIA_TESLA_P100"
        accelerator_count: 1
        machine_type: "n1-standard-8"
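
This config builds the Docker image with the listed requirements and tells the steps which ZenML secrets to read via the `ZENML_SUPABASE_SECRET_NAME` and `ZENML_OPENAI_SECRET_NAME` environment variables. The same intent can be expressed in code; a minimal sketch, assuming ZenML's `DockerSettings` with its `requirements` and `environment` fields and a placeholder pipeline name:

```python
# Minimal sketch: the settings section of the config above expressed as
# ZenML settings objects. Equivalent in intent only; not a file from the repo.
from zenml import pipeline
from zenml.config import DockerSettings

docker_settings = DockerSettings(
    requirements=[
        "unstructured",
        "sentence-transformers>=3",
        "pgvector",
        "datasets",
        "litellm",
        "numpy",
        "psycopg2-binary",
        "tiktoken",
        "ratelimit",
    ],
    environment={
        # names of the ZenML secrets the steps will read at runtime
        "ZENML_SUPABASE_SECRET_NAME": "alexej_supabase_postgres_db",
        "ZENML_OPENAI_SECRET_NAME": "alexej_openai",
    },
)


@pipeline(settings={"docker": docker_settings})
def rag_pipeline_sketch() -> None:  # placeholder pipeline
    ...
```
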
Lines changed: 30 additions & 0 deletions

# environment configuration
settings:
  docker:
    requirements:
      - unstructured
      - sentence-transformers>=3
      - pgvector
      - datasets
      - litellm
      - numpy
      - psycopg2-binary
      - tiktoken
      - ratelimit
    environment:
      ZENML_SUPABASE_SECRET_NAME: alexej_supabase_postgres_db
      ZENML_OPENAI_SECRET_NAME: alexej_openai
      ZENML_ENABLE_RICH_TRACEBACK: FALSE
      ZENML_LOGGING_VERBOSITY: INFO

steps:
  url_scraper:
    parameters:
      docs_url: https://docs.zenml.io/stack-components/orchestrators
  # generate_embeddings:
  #   step_operator: "terraform-gcp-6c0fd52233ca"
  #   settings:
  #     step_operator.vertex:
  #       accelerator_type: "NVIDIA_TESLA_P100"
  #       accelerator_count: 1
  #       machine_type: "n1-standard-8"
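
This variant scrapes only the orchestrators section of the docs and leaves the Vertex step operator commented out, so the pipeline can run locally. It is applied the same way `run.py` below applies its config; a minimal sketch, assuming the file lives at `configs/rag_local_dev.yaml` (the path `run.py` points to):

```python
# Minimal sketch: run the RAG pipeline with the local-dev config, following
# the with_options() pattern from run.py. The config path is an assumption
# based on run.py; adjust it if the file is named differently.
import os

from pipelines import llm_basic_rag

config_path = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "configs", "rag_local_dev.yaml"
)
llm_basic_rag.with_options(config_path=config_path)()
```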

llm-complete-guide/most_basic_eval.py

Lines changed: 4 additions & 2 deletions
@@ -20,6 +20,8 @@
 
 from openai import OpenAI
 
+from utils.openai_utils import get_openai_api_key
+
 
 def preprocess_text(text):
     text = text.lower()
@@ -51,7 +53,7 @@ def answer_question(query, corpus, top_n=2):
         return "I don't have enough information to answer the question."
 
     context = "\n".join(relevant_chunks)
-    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+    client = OpenAI(api_key=get_openai_api_key())
     chat_completion = client.chat.completions.create(
         messages=[
             {
@@ -117,7 +119,7 @@ def evaluate_retrieval(question, expected_answer, corpus, top_n=2):
 
 
 def evaluate_generation(question, expected_answer, generated_answer):
-    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+    client = OpenAI(api_key=get_openai_api_key())
     chat_completion = client.chat.completions.create(
         messages=[
             {
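
Both this file and `most_basic_rag_pipeline.py` below now fetch the key through `utils.openai_utils.get_openai_api_key`, a helper that is not part of this diff. A plausible sketch of such a helper, mirroring the environment-variable-then-secret fallback used in `utils/llm_utils.py` further down (this body is an assumption, not the project's actual implementation):

```python
# utils/openai_utils.py -- hypothetical sketch of the imported helper.
# Prefers the OPENAI_API_KEY environment variable and otherwise reads the
# key from the ZenML secret named by ZENML_OPENAI_SECRET_NAME.
import os


def get_openai_api_key() -> str:
    """Returns the OpenAI API key from the environment or a ZenML secret."""
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        from zenml.client import Client

        secret_name = os.getenv("ZENML_OPENAI_SECRET_NAME", "openai")
        api_key = Client().get_secret(secret_name).secret_values["api_key"]
    return api_key
```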

llm-complete-guide/most_basic_rag_pipeline.py

Lines changed: 3 additions & 1 deletion
@@ -21,6 +21,8 @@
 
 from openai import OpenAI
 
+from utils.openai_utils import get_openai_api_key
+
 
 def preprocess_text(text):
     text = text.lower()
@@ -52,7 +54,7 @@ def answer_question(query, corpus, top_n=2):
         return "I don't have enough information to answer the question."
 
     context = "\n".join(relevant_chunks)
-    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+    client = OpenAI(api_key=get_openai_api_key())
     chat_completion = client.chat.completions.create(
         messages=[
             {

llm-complete-guide/pipelines/llm_basic_rag.py

Lines changed: 6 additions & 2 deletions
@@ -22,11 +22,15 @@
 )
 from steps.url_scraper import url_scraper
 from steps.web_url_loader import web_url_loader
-from zenml import pipeline
+from zenml import pipeline, Model
+
+model_definition = Model(
+    name=""
+)
 
 
 @pipeline
-def llm_basic_rag() -> None:
+def llm_basic_rag(model=model_definition) -> None:
     """Executes the pipeline to train a basic RAG model.
 
     This function performs the following steps:
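
ZenML can also attach the `Model` directly on the decorator rather than passing it as a function default. A minimal sketch, assuming a ZenML release whose `@pipeline` decorator accepts a `model` argument and using a placeholder model name:

```python
# Minimal sketch: associate a ZenML Model with the pipeline via the decorator.
# The model name is a placeholder; assumes @pipeline accepts a `model` argument.
from zenml import Model, pipeline


@pipeline(model=Model(name="llm_basic_rag"))  # placeholder name
def llm_basic_rag_sketch() -> None:
    ...
```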

llm-complete-guide/run.py

Lines changed: 1 addition & 2 deletions
@@ -48,7 +48,6 @@
     llm_basic_rag,
     llm_eval,
 )
-from pipelines.finetune_embeddings_legacy import chunking_experiment
 from structures import Document
 from zenml.materializers.materializer_registry import materializer_registry
 
@@ -190,7 +189,7 @@ def main(
     print(f"Running Pipeline with pipeline args: {pipeline_args}")
     if rag:
         config_path = os.path.join(
-            os.path.dirname(os.path.realpath(__file__)), "configs", "rag.yaml"
+            os.path.dirname(os.path.realpath(__file__)), "configs", "rag_local_dev.yaml"
         )
         llm_basic_rag.with_options(config_path=config_path, **pipeline_args)()
     if evaluation:

llm-complete-guide/steps/distilabel_generate_queries.py

Lines changed: 3 additions & 1 deletion
@@ -27,6 +27,8 @@
 from distilabel.pipeline import Pipeline
 from zenml import step
 
+from utils.openai_utils import get_openai_api_key
+
 synthetic_generation_context = """
 The text is a chunk from technical documentation of ZenML.
 ZenML is an MLOps + LLMOps framework that makes your infrastructure and workflow metadata accessible to data science teams.
@@ -42,7 +44,7 @@ def generate_synthetic_queries(
     Annotated[Dataset, "test_with_queries"],
 ]:
     llm = OpenAILLM(
-        model=OPENAI_MODEL_GEN, api_key=os.getenv("OPENAI_API_KEY")
+        model=OPENAI_MODEL_GEN, api_key=get_openai_api_key()
     )
 
     with Pipeline(

llm-complete-guide/utils/llm_utils.py

Lines changed: 64 additions & 30 deletions
@@ -21,6 +21,8 @@
 
 import logging
 
+from zenml.cli import secret
+
 # Configure logging levels for specific modules
 logging.getLogger("pytorch").setLevel(logging.CRITICAL)
 logging.getLogger("sentence-transformers").setLevel(logging.CRITICAL)
@@ -212,48 +214,76 @@ def split_documents(
     return chunked_documents
 
 
-def get_local_db_connection_details() -> Dict[str, str]:
-    """Returns the connection details for the local database.
+def get_db_password(secret_name: str) -> str:
+    """Returns the password for the PostgreSQL database.
 
     Returns:
-        dict: A dictionary containing the connection details for the local
-            database.
+        str: The password for the PostgreSQL database.
+    """
+    password = os.getenv("ZENML_POSTGRES_DB_PASSWORD")
+    if not password:
+        from zenml.client import Client
 
-    Raises:
-        RuntimeError: If the environment variables ZENML_POSTGRES_USER, ZENML_POSTGRES_HOST, or ZENML_POSTGRES_PORT are not set.
+        password = (
+            Client()
+            .get_secret(secret_name)
+            .secret_values["password"]
+        )
+    return password
+
+
+def get_db_user(secret_name: str) -> str:
+    """Returns the user for the PostgreSQL database.
+
+    Returns:
+        str: The user for the PostgreSQL database.
     """
     user = os.getenv("ZENML_POSTGRES_USER")
-    host = os.getenv("ZENML_POSTGRES_HOST")
-    port = os.getenv("ZENML_POSTGRES_PORT")
+    if not user:
+        from zenml.client import Client
 
-    if not user or not host or not port:
-        raise RuntimeError(
-            "Please make sure to set the environment variables: ZENML_POSTGRES_USER, ZENML_POSTGRES_HOST, and ZENML_POSTGRES_PORT"
+        user = (
+            Client()
+            .get_secret(secret_name)
+            .secret_values["user"]
         )
+    return user
 
-    return {
-        "user": user,
-        "host": host,
-        "port": port,
-    }
 
+def get_db_host(secret_name: str) -> str:
+    """Returns the host for the PostgreSQL database.
 
-def get_db_password() -> str:
-    """Returns the password for the PostgreSQL database.
+    Returns:
+        str: The host for the PostgreSQL database.
+    """
+    host = os.getenv("ZENML_POSTGRES_HOST")
+    if not host:
+        from zenml.client import Client
+
+        host = (
+            Client()
+            .get_secret(secret_name)
+            .secret_values["host"]
+        )
+    return host
+
+
+def get_db_port(secret_name: str) -> str:
+    """Returns the port for the PostgreSQL database.
 
     Returns:
-        str: The password for the PostgreSQL database.
+        str: The port for the PostgreSQL database.
     """
-    password = os.getenv("ZENML_POSTGRES_DB_PASSWORD")
-    if not password:
+    port = os.getenv("ZENML_POSTGRES_PORT")
+    if not port:
         from zenml.client import Client
 
-        password = (
+        port = (
             Client()
-            .get_secret("supabase_postgres_db")
-            .secret_values["password"]
+            .get_secret(secret_name)
+            .secret_values["port"]
        )
-    return password
+    return port
 
 
 def get_db_conn() -> connection:
@@ -265,15 +295,19 @@ def get_db_conn() -> connection:
     Returns:
         connection: A psycopg2 connection object to the PostgreSQL database.
     """
-    pg_password = get_db_password()
+    secret_name = os.getenv("ZENML_SUPABASE_SECRET_NAME")
 
-    local_database_connection = get_local_db_connection_details()
+    if not secret_name:
+        raise RuntimeError(
+            "Please make sure to set the environment variable: ZENML_SUPABASE_SECRET_NAME to point at the secret that "
+            "contains your supabase connection details."
+        )
 
     CONNECTION_DETAILS = {
-        "user": local_database_connection["user"],
-        "password": pg_password,
-        "host": local_database_connection["host"],
-        "port": local_database_connection["port"],
+        "user": get_db_user(secret_name),
+        "password": get_db_password(secret_name),
+        "host": get_db_host(secret_name),
+        "port": get_db_port(secret_name),
         "dbname": "postgres",
     }
 
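
With these helpers in place, a step only needs the `ZENML_SUPABASE_SECRET_NAME` environment variable (set in the configs above) to open a connection. A minimal usage sketch, assuming the `supabase_postgres_db` secret from the README exists and `utils/llm_utils.py` is importable from the project root:

```python
# Minimal usage sketch: point get_db_conn() at the ZenML secret that holds
# the Supabase connection details, then run a trivial query.
import os

from utils.llm_utils import get_db_conn

# name of the secret created in the README
os.environ["ZENML_SUPABASE_SECRET_NAME"] = "supabase_postgres_db"

conn = get_db_conn()
try:
    with conn.cursor() as cur:
        cur.execute("SELECT 1")
        print(cur.fetchone())
finally:
    conn.close()
```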
