Commit 0bef004

Merge pull request #70 from sudoleg/develop
project structure updates & embedding models
2 parents: a176f59 + a2efbb2

File tree

9 files changed: +57 additions, −32 deletions

.assets/home.md

Lines changed: 1 addition & 3 deletions

```diff
@@ -1,8 +1,6 @@
 # YTAI - Your personal YouTube AI

-Get insights from YouTube videos with YTAI, an LLM-based app that allows you to summarize or even ask questions and receive answers about them! tailors YouTube video summaries to your needs, offering a custom prompt feature for summaries exactly how you want them.
-
-Check out the project on [GitHub](https://github.com/sudoleg/ytai) for more information! Also, if you like the app, I would be very happy about a star :star:
+Get insights from YouTube videos with YTAI, an LLM-based app that allows you to summarize or even ask questions and receive answers about them! Check out the project on [GitHub](https://github.com/sudoleg/ytai) for more information! Also, if you like the app, I would be very happy about a star :star:

 ## Summary

```

.assets/rag_quidelines.md

Lines changed: 3 additions & 3 deletions

```diff
@@ -6,9 +6,9 @@ You can either provide a question or just a topic from the video. If you ask a q

 ## General tips

-- be specific
-- be concise
-- Don't include instructions, like 'explain in detail' or 'answer from a perspective of X' etc.
+- be concise and specific
+- don't combine questions unless they are closely related!
+- don't include instructions, like 'explain in detail' or 'answer from a perspective of X' etc.

 ## Important notice ❗

```

README.md

Lines changed: 16 additions & 10 deletions

````diff
@@ -49,16 +49,6 @@ docker build --tag=ytai:latest .
 docker run -d -p 8501:8501 -v $(pwd):/app/responses -e OPENAI_API_KEY=<your-openai-api-key> --name yt-summarizer sudoleg/ytai:latest
 ```

-### development in virtual environment
-
-```bash
-python -m venv .venv
-source .venv/bin/activate
-pip install -r requirements.txt
-export OPENAI_API_KEY=<your-openai-api-key>
-streamlit run main.py
-```
-
 ## Contributing

 Feedback and contributions are welcome! This is a small side-project and it's very easy to get started! Here’s the gist to get your changes rolling:
@@ -72,6 +62,22 @@ Feedback and contributions are welcome! This is a small side-project and it's ve
 5. **Pull Request**: Push your changes to your fork and submit a pull request (PR) to the main repository. Describe your changes and any relevant details.
 6. **Engage**: Respond to feedback on your PR to finalize your contribution.

+### development in virtual environment
+
+```bash
+# create and activate a virtual environment
+python -m venv .venv
+source .venv/bin/activate
+# install requirements
+pip install -r requirements.txt
+# you'll need an API key
+export OPENAI_API_KEY=<your-openai-api-key>
+# run chromadb (necessary for chat)
+docker-compose up -d chromadb
+# run app
+streamlit run main.py
+```
+
 ## Technologies used

 The project is built using some amazing libraries:
````

config.json

Lines changed: 14 additions & 7 deletions

```diff
@@ -2,12 +2,18 @@
     "app_title": "AI YouTube Video Summarizer",
     "github_repo_link": "https://github.com/sudoleg/ytai",
     "default_model": "gpt-3.5-turbo",
-    "available_models": [
-        "gpt-3.5-turbo",
-        "gpt-4",
-        "gpt-4-turbo",
-        "gpt-4o"
-    ],
+    "available_models": {
+        "embeddings": [
+            "text-embedding-3-small",
+            "text-embedding-3-large"
+        ],
+        "gpts": [
+            "gpt-3.5-turbo",
+            "gpt-4",
+            "gpt-4-turbo",
+            "gpt-4o"
+        ]
+    },
     "temperature": 1.0,
     "top_p": 1.0,
     "help_texts": {
@@ -19,6 +25,7 @@
         "saving_responses": "Whether to save responses in the directory, where you run the app. The responses will be saved under '<YT-channel-name>/<video-title>.md'.",
         "chunk_size": "A larger chunk size increases the amount of context provided to the model to answer your question. However, it may be less relevant than with a small chunk size, as smaller chunks can encapsulate more semantic meaning. I would reccommend to use a smaller chunk size for shorter and a larger one for longer videos (> 1h).",
         "preprocess_checkbox": "By enabling this, the original transcript gets preprocessed. This can greatly improve the results, especially for videos with automatically generated transcripts. However, it results in higher costs, as the whole transcript get's processed by gpt3.5-turbo. Also, the preprocessing will take a substantial amount of time.",
-        "selected_video": "Once you process a video, it gets saved in a database. You can chat with it at any time, without processing it again! Tip: you may also search for videos by typing (parts of) its title."
+        "selected_video": "Once you process a video, it gets saved in a database. You can chat with it at any time, without processing it again! Tip: you may also search for videos by typing (parts of) its title.",
+        "embeddings": "Embeddings are a numerical representation of text that can be used to measure the relatedness between two pieces of text. Embedding models create these numerical representations. Read more at https://platform.openai.com/docs/models/embeddings"
     }
 }
```
File renamed without changes.

docker-compose.yml

Lines changed: 1 addition & 3 deletions

```diff
@@ -41,7 +41,7 @@ services:
       # the summaries to be saved
       - /Users/Shared/yt-summaries:/app/responses
       # leave as it is
-      - sqlite:/app/data/app
+      - ./data:/app/data
     environment:
       # replace with your OpenAI API key or the name of the environment
       # variable that stores it on your machine
@@ -54,5 +54,3 @@ services:
 volumes:
   chroma:
     driver: local
-  sqlite:
-    driver: local
```

modules/persistance.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -11,7 +11,7 @@
     UUIDField,
 )

-SQL_DB = SqliteDatabase("data/app/videos.sqlite3")
+SQL_DB = SqliteDatabase("data/videos.sqlite3")


 class BaseModel(Model):
```
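The path change moves the SQLite file from `data/app/` to `data/`, matching the new `./data:/app/data` bind mount. One practical detail worth noting: like peewee's `SqliteDatabase`, SQLite cannot create the database file if its parent directory is missing (e.g. when the bind mount starts out empty). A minimal sketch using the stdlib `sqlite3` module in place of peewee; `open_db` is an illustrative helper, not a function from this repo:

```python
import os
import sqlite3

# illustrative helper: ensure the data/ directory exists before opening
# the database file, so a fresh ./data bind mount doesn't cause an error
def open_db(path: str = "data/videos.sqlite3") -> sqlite3.Connection:
    os.makedirs(os.path.dirname(path), exist_ok=True)
    return sqlite3.connect(path)
```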

modules/ui.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -62,7 +62,7 @@ def display_model_settings_sidebar():
     st.header("Model settings")
     model = st.selectbox(
         "Select a large language model",
-        tuple(get_default_config_value("available_models")),
+        tuple(get_default_config_value("available_models.gpts")),
         key="model",
         help=get_default_config_value("help_texts.model"),
     )
```
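The switch from `"available_models"` to `"available_models.gpts"` implies that `get_default_config_value` resolves dot-separated paths through the now-nested config. A hedged sketch of how such a lookup could work; the helper name matches the repo, but this body is an assumption, not the actual implementation:

```python
import json

# assumed implementation: walk config.json one dotted segment at a time,
# so "available_models.gpts" returns the nested list of chat models
def get_default_config_value(key_path: str, config_path: str = "config.json"):
    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)
    value = config
    for key in key_path.split("."):
        value = value[key]  # raises KeyError on a missing segment
    return value
```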

pages/chat.py

Lines changed: 20 additions & 4 deletions

```diff
@@ -45,13 +45,19 @@

 CHUNK_SIZE_FOR_UNPROCESSED_TRANSCRIPT = 512

-
 st.set_page_config("Chat", layout="wide", initial_sidebar_state="auto")
 display_api_key_warning()

 # --- sidebar with model settings ---
 display_nav_menu()
 display_model_settings_sidebar()
+st.sidebar.info("Choose `text-embedding-3-large` if your video is **not** in English!")
+selected_embeddings_model = st.sidebar.selectbox(
+    label="Select an embedding model",
+    options=tuple(get_default_config_value("available_models.embeddings")),
+    key="embeddings_model",
+    help=get_default_config_value("help_texts.embeddings"),
+)
 display_link_to_repo()
 # --- end ---

@@ -91,7 +97,7 @@
 )
 openai_embedding_model = OpenAIEmbeddings(
     api_key=st.session_state.openai_api_key,
-    model="text-embedding-3-small",
+    model=st.session_state.embeddings_model,
 )
 # --- end ---

@@ -237,9 +243,9 @@ def refresh_page(message: str):
 collection = chroma_client.get_or_create_collection(
     name=randomname.get_name(),
     metadata={
-        "yt_video_id": saved_video.yt_video_id,
-        "yt_channel": saved_video.channel,
         "yt_video_title": saved_video.title,
+        "chunk_size": chunk_size,
+        "embeddings_model": selected_embeddings_model,
     },
 )

@@ -305,6 +311,15 @@ def refresh_page(message: str):

 with col2:
     if collection and collection.count() > 0:
+
+        # the users input has to be embedded using the same embeddings model as was used for creating
+        # the embeddings for the transcript excerpts. Here we ensure that the embedding function passed
+        # as argument to the vector store is the same as was used for the embeddings
+        collection_embeddings_model = collection.metadata.get("embeddings_model")
+        if collection_embeddings_model != selected_embeddings_model:
+            openai_embedding_model.model = collection_embeddings_model
+
+        # init vector store
         chroma_db = Chroma(
             client=chroma_client,
             collection_name=collection.name,
@@ -322,6 +337,7 @@ def refresh_page(message: str):
         if prompt:
             with st.spinner("Generating answer..."):
                 try:
+
                     relevant_docs = find_relevant_documents(query=prompt, db=chroma_db)
                     response = generate_response(
                         question=prompt,
```
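The metadata check added in pages/chat.py enforces one invariant: a question must be embedded with the same model that produced the collection's stored vectors, since similarity scores between vectors from different embedding spaces are meaningless. The rule can be summarized in a small sketch; `resolve_embeddings_model` is a hypothetical helper for illustration, not a function in the app:

```python
# decide which embedding model to use for a query against a stored collection
def resolve_embeddings_model(collection_metadata: dict, selected_model: str) -> str:
    stored_model = collection_metadata.get("embeddings_model")
    # prefer the model recorded at indexing time; fall back to the user's
    # current selection for collections without this metadata field
    return stored_model or selected_model
```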
