YAML files: adding more comments and explanation (#8688)

olruas · gitFoxCode · Sebastian Wludzik · Manul from Pathway · commit f65904e56994 · 2025-05-07T15:59:38.000Z
Co-authored-by: foxCode (Sebastian Włudzik) <Sebeklis132@gmail.com>
Co-authored-by: Sebastian Wludzik <sebastian.wludzik@pathway.com>
Co-authored-by: Szymon Dudycz <szymond@pathway.com>
Co-authored-by: bjornengdahl <51057906+bjornengdahl@users.noreply.github.com>
GitOrigin-RevId: 8d36da94e48045de49370e09fd05802a0e74d426
diff --git a/examples/pipelines/adaptive-rag/app.yaml b/examples/pipelines/adaptive-rag/app.yaml
@@ -1,9 +1,22 @@
+# This YAML configuration file is used to set up and configure the Adaptive RAG template.
+# It defines various components such as data sources, language models, embedders, splitters, parsers, and retrievers.
+# Each section is configured to specify how the template should process and handle data for generating responses.
+# You can learn more about the YAML syntax here: https://pathway.com/developers/templates/configure-yaml
+
+
+
+# $sources defines the data sources used to read the data which will be indexed in the RAG.
+# You can learn more about how to configure data sources here:
+# https://pathway.com/developers/templates/yaml-examples/data-sources-examples
+
 $sources:
+  # File System connector, reading data locally.
   - !pw.io.fs.read
     path: data
     format: binary
     with_metadata: true
 
+  # Uncomment to use the SharePoint connector
   # - !pw.xpacks.connectors.sharepoint.read 
   #   url: $SHAREPOINT_URL
   #   tenant: $SHAREPOINT_TENANT
@@ -14,6 +27,7 @@ $sources:
   #   with_metadata: true
   #   refresh_interval: 30
 
+  # Uncomment to use the Google Drive connector
   # - !pw.io.gdrive.read
   #   object_id: $DRIVE_ID
   #   service_user_credentials_file: gdrive_indexer.json
@@ -24,6 +38,14 @@ $sources:
   #   with_metadata: true
   #   refresh_interval: 30
 
+
+
+# Configures the LLM settings for generating responses.
+# The list of Pathway LLM wrappers is available here:
+# https://pathway.com/developers/api-docs/pathway-xpacks-llm/llms
+# You can learn more about those in our documentation:
+# https://pathway.com/developers/templates/rag-customization/llm-chats
+
 $llm: !pw.xpacks.llm.llms.OpenAIChat
   model: "gpt-4o-mini"
   retry_strategy: !pw.udfs.ExponentialBackoffRetryStrategy
@@ -32,39 +54,49 @@ $llm: !pw.xpacks.llm.llms.OpenAIChat
   temperature: 0
   capacity: 8
 
+# Specifies the embedder model for converting text into embeddings.
 $embedder: !pw.xpacks.llm.embedders.OpenAIEmbedder
   model: "text-embedding-ada-002"
   cache_strategy: !pw.udfs.DefaultCache
 
+# Defines the splitter settings for dividing text into smaller chunks.
 $splitter: !pw.xpacks.llm.splitters.TokenCountSplitter
   max_tokens: 400
 
+# Configures the parser for processing and extracting information from documents.
 $parser: !pw.xpacks.llm.parsers.DoclingParser
   cache_strategy: !pw.udfs.DefaultCache
 
+# Sets up the retriever factory for indexing and retrieving documents.
 $retriever_factory: !pw.stdlib.indexing.BruteForceKnnFactory
   reserved_space: 1000
   embedder: $embedder
   metric: !pw.stdlib.indexing.BruteForceKnnMetricKind.COS
 
+# Manages the storage and retrieval of documents for the RAG template.
 $document_store: !pw.xpacks.llm.document_store.DocumentStore
   docs: $sources
   parser: $parser
   splitter: $splitter
   retriever_factory: $retriever_factory
 
+# Configures the question-answering component using the RAG approach.
+# The component builds a RAG over an index.
+# You can interact with the obtained RAG using a REST API.
+# You can learn more about the available operations here:
+# https://pathway.com/developers/templates/rag-customization/rest-api
 question_answerer: !pw.xpacks.llm.question_answering.AdaptiveRAGQuestionAnswerer
   llm: $llm
   indexer: $document_store
   n_starting_documents: 2
   factor: 2
   max_iterations: 4
 
-# Change host and port by uncommenting these lines
+# Change host and port of the webserver by uncommenting these lines
 # host: "0.0.0.0"
 # port: 8000
 
-# Cache configuration
+# Activate on-disk caching for UDFs for which `cache_strategy` is set
 # with_cache: true
 
 # If `terminate_on_error` is true then the program will terminate whenever any error is encountered.
diff --git a/examples/pipelines/demo-document-indexing/app.yaml b/examples/pipelines/demo-document-indexing/app.yaml
@@ -1,9 +1,21 @@
+# This YAML configuration file is used to set up and configure the Document indexing RAG template.
+# It defines various components such as data sources, embedders, splitters, parsers, and retrievers.
+# Each section is configured to specify how the template should process and handle data for answering the queries.
+# You can learn more about the YAML syntax here: https://pathway.com/developers/templates/configure-yaml
+
+
+# $sources defines the data sources used to read the data which will be indexed in the RAG.
+# You can learn more about how to configure data sources here:
+# https://pathway.com/developers/templates/yaml-examples/data-sources-examples
+
 $sources:
+  # File System connector, reading data locally.
   - !pw.io.fs.read
     path: files-for-indexing
     format: binary
     with_metadata: true
 
+  # Uncomment to use the SharePoint connector
   # - !pw.xpacks.connectors.sharepoint.read 
   #   url: $SHAREPOINT_URL
   #   tenant: $SHAREPOINT_TENANT
@@ -14,6 +26,7 @@ $sources:
   #   with_metadata: true
   #   refresh_interval: 30
 
+  # Uncomment to use the Google Drive connector
   # - !pw.io.gdrive.read
   #   object_id: $DRIVE_ID
   #   service_user_credentials_file: gdrive_indexer.json
@@ -24,35 +37,41 @@ $sources:
   #   with_metadata: true
   #   refresh_interval: 30
 
+# Model used for embedding
 $embedding_model: "mixedbread-ai/mxbai-embed-large-v1"
 
+# Specifies the embedder model for converting text into embeddings.
 $embedder: !pw.xpacks.llm.embedders.SentenceTransformerEmbedder
   model: $embedding_model
   call_kwargs: 
     show_progress_bar: False
 
+# Defines the splitter settings for dividing text into smaller chunks.
 $splitter: !pw.xpacks.llm.splitters.TokenCountSplitter
   max_tokens: 400
 
+# Configures the parser for processing and extracting information from documents.
 $parser: !pw.xpacks.llm.parsers.UnstructuredParser
   cache_strategy: !pw.udfs.DefaultCache
 
+# Sets up the retriever factory for indexing and retrieving documents.
 $retriever_factory: !pw.stdlib.indexing.BruteForceKnnFactory
   reserved_space: 1000
   embedder: $embedder
   metric: !pw.stdlib.indexing.BruteForceKnnMetricKind.COS
 
+# Manages the storage and retrieval of documents for the RAG template.
 document_store: !pw.xpacks.llm.document_store.DocumentStore
   docs: $sources
   parser: $parser
   splitter: $splitter
   retriever_factory: $retriever_factory
 
-# Change host and port by uncommenting these lines
+# Change host and port of the webserver by uncommenting these lines
 # host: "0.0.0.0"
 # port: 8000
 
-# Cache configuration
+# Activate on-disk caching for UDFs for which `cache_strategy` is set
 # with_cache: true
 
 # If `terminate_on_error` is true then the program will terminate whenever any error is encountered.
diff --git a/examples/pipelines/demo-question-answering/app.yaml b/examples/pipelines/demo-question-answering/app.yaml
@@ -1,9 +1,22 @@
+# This YAML configuration file is used to set up and configure the Question Answering RAG template.
+# It defines various components such as data sources, language models, embedders, splitters, parsers, and retrievers.
+# Each section is configured to specify how the template should process and handle data for generating responses.
+# You can learn more about the YAML syntax here: https://pathway.com/developers/templates/configure-yaml
+
+
+
+# $sources defines the data sources used to read the data which will be indexed in the RAG.
+# You can learn more about how to configure data sources here:
+# https://pathway.com/developers/templates/yaml-examples/data-sources-examples
+
 $sources:
+  # File System connector, reading data locally.
   - !pw.io.fs.read
     path: data
     format: binary
     with_metadata: true
 
+  # Uncomment to use the SharePoint connector
   # - !pw.xpacks.connectors.sharepoint.read 
   #   url: $SHAREPOINT_URL
   #   tenant: $SHAREPOINT_TENANT
@@ -14,6 +27,7 @@ $sources:
   #   with_metadata: true
   #   refresh_interval: 30
 
+  # Uncomment to use the Google Drive connector
   # - !pw.io.gdrive.read
   #   object_id: $DRIVE_ID
   #   service_user_credentials_file: gdrive_indexer.json
@@ -24,6 +38,13 @@ $sources:
   #   with_metadata: true
   #   refresh_interval: 30
 
+
+# Configures the LLM settings for generating responses.
+# The list of Pathway LLM wrappers is available here:
+# https://pathway.com/developers/api-docs/pathway-xpacks-llm/llms
+# You can learn more about those in our documentation:
+# https://pathway.com/developers/templates/rag-customization/llm-chats
+
 $llm: !pw.xpacks.llm.llms.OpenAIChat
   model: "gpt-4o"
   retry_strategy: !pw.udfs.ExponentialBackoffRetryStrategy
@@ -32,27 +53,37 @@ $llm: !pw.xpacks.llm.llms.OpenAIChat
   temperature: 0
   capacity: 8
 
+# Specifies the embedder model for converting text into embeddings.
 $embedder: !pw.xpacks.llm.embedders.OpenAIEmbedder
   model: "text-embedding-ada-002"
   cache_strategy: !pw.udfs.DefaultCache
 
+# Defines the splitter settings for dividing text into smaller chunks.
 $splitter: !pw.xpacks.llm.splitters.TokenCountSplitter
   max_tokens: 400
 
+# Configures the parser for processing and extracting information from documents.
 $parser: !pw.xpacks.llm.parsers.DoclingParser
   async_mode: "fully_async"
 
+# Sets up the retriever factory for indexing and retrieving documents.
 $retriever_factory: !pw.stdlib.indexing.UsearchKnnFactory
   reserved_space: 1000
   embedder: $embedder
   metric: !pw.stdlib.indexing.USearchMetricKind.COS
   
+# Manages the storage and retrieval of documents for the RAG template.
 $document_store: !pw.xpacks.llm.document_store.DocumentStore
   docs: $sources
   parser: $parser
   splitter: $splitter
   retriever_factory: $retriever_factory
 
+# Configures the question-answering component using the RAG approach.
+# The component builds a RAG over an index.
+# You can interact with the obtained RAG using a REST API.
+# You can learn more about the available operations here:
+# https://pathway.com/developers/templates/rag-customization/rest-api
 question_answerer: !pw.xpacks.llm.question_answering.BaseRAGQuestionAnswerer
   llm: $llm
   indexer: $document_store
@@ -63,11 +94,11 @@ question_answerer: !pw.xpacks.llm.question_answering.BaseRAGQuestionAnswerer
   # and `{context}` as a placeholder for context documents.
   # prompt_template: "Given these documents: {context}, please answer the question: {query}"
 
-# Change host and port by uncommenting these lines
+# Change host and port of the webserver by uncommenting these lines
 # host: "0.0.0.0"
 # port: $PATHWAY_PORT
 
-# Cache configuration
+# Activate on-disk caching for UDFs for which `cache_strategy` is set
 # with_cache: true
 
 # If `terminate_on_error` is true then the program will terminate whenever any error is encountered.
diff --git a/examples/pipelines/gpt_4o_multimodal_rag/app.yaml b/examples/pipelines/gpt_4o_multimodal_rag/app.yaml
@@ -1,9 +1,22 @@
+# This YAML configuration file is used to set up and configure the Multimodal RAG template.
+# It defines various components such as data sources, language models, embedders, splitters, parsers, and retrievers.
+# Each section is configured to specify how the template should process and handle data for generating responses.
+# You can learn more about the YAML syntax here: https://pathway.com/developers/templates/configure-yaml
+
+
+
+# $sources defines the data sources used to read the data which will be indexed in the RAG.
+# You can learn more about how to configure data sources here:
+# https://pathway.com/developers/templates/yaml-examples/data-sources-examples
+
 $sources:
+  # File System connector, reading data locally.
   - !pw.io.fs.read
     path: data
     format: binary
     with_metadata: true
 
+  # Uncomment to use the SharePoint connector
   # - !pw.xpacks.connectors.sharepoint.read 
   #   url: $SHAREPOINT_URL
   #   tenant: $SHAREPOINT_TENANT
@@ -14,6 +27,7 @@ $sources:
   #   with_metadata: true
   #   refresh_interval: 30
 
+  # Uncomment to use the Google Drive connector
   # - !pw.io.gdrive.read
   #   object_id: $DRIVE_ID
   #   service_user_credentials_file: gdrive_indexer.json
@@ -24,6 +38,14 @@ $sources:
   #   with_metadata: true
   #   refresh_interval: 30
 
+
+
+# Configures the LLM settings for generating responses.
+# The list of Pathway LLM wrappers is available here:
+# https://pathway.com/developers/api-docs/pathway-xpacks-llm/llms
+# You can learn more about those in our documentation:
+# https://pathway.com/developers/templates/rag-customization/llm-chats
+
 $llm: !pw.xpacks.llm.llms.OpenAIChat
   model: "gpt-3.5-turbo"
   retry_strategy: !pw.udfs.ExponentialBackoffRetryStrategy
@@ -32,33 +54,40 @@ $llm: !pw.xpacks.llm.llms.OpenAIChat
   temperature: 0
   capacity: 8
 
+# Specifies the embedder model for converting text into embeddings.
 $embedder: !pw.xpacks.llm.embedders.OpenAIEmbedder
   model: "text-embedding-ada-002"
   cache_strategy: !pw.udfs.DefaultCache
 
+# Defines the LLM intended for document parsing (see `multimodal_llm` in the parser).
 $parsing_llm: !pw.xpacks.llm.llms.OpenAIChat
   model: "gpt-4o"
   retry_strategy: !pw.udfs.ExponentialBackoffRetryStrategy
     max_retries: 6
   cache_strategy: !pw.udfs.DefaultCache
 
+# Configures the parser for processing and extracting information from documents.
 $parser: !pw.xpacks.llm.parsers.DoclingParser
   multimodal_llm: parsing_llm
 
+# Sets up the splitter for chunking the documents.
 $splitter: !pw.xpacks.llm.splitters.TokenCountSplitter
   max_tokens: 400
 
+# Sets up the retriever factory for indexing and retrieving documents.
 $retriever_factory: !pw.stdlib.indexing.BruteForceKnnFactory
   reserved_space: 1000
   embedder: $embedder
   metric: !pw.engine.BruteForceKnnMetricKind.COS
   
+# Manages the storage and retrieval of documents for the RAG template.
 $document_store: !pw.xpacks.llm.document_store.DocumentStore
   docs: $sources
   parser: $parser
   splitter: $splitter
   retriever_factory: $retriever_factory
 
+# Configures the question-answering component using the RAG approach.
 question_answerer: !pw.xpacks.llm.question_answering.BaseRAGQuestionAnswerer
   llm: $llm
   indexer: $document_store
@@ -69,11 +98,11 @@ question_answerer: !pw.xpacks.llm.question_answering.BaseRAGQuestionAnswerer
   # and `{context}` as a placeholder for context documents.
   # prompt_template: "Given these documents: {context}, please answer the question: {query}"
 
-# Change host and port by uncommenting these lines
+# Change host and port of the webserver by uncommenting these lines
 # host: "0.0.0.0"
 # port: 8000
 
-# Cache configuration
+# Activate on-disk caching for UDFs for which `cache_strategy` is set
 # with_cache: true
 
 # If `terminate_on_error` is true then the program will terminate whenever any error is encountered.
diff --git a/examples/pipelines/private-rag/app.yaml b/examples/pipelines/private-rag/app.yaml
diff --git a/examples/pipelines/slides_ai_search/app.yaml b/examples/pipelines/slides_ai_search/app.yaml