Skip to content

Commit f65904e

Browse files
olruasgit, FoxCode, Sebastian Wludzik, szymondudycz, bjornengdahl
authored and committed
YAML files: adding more comments and explanation (#8688)
Co-authored-by: foxCode (Sebastian Włudzik) <Sebeklis132@gmail.com> Co-authored-by: Sebastian Wludzik <sebastian.wludzik@pathway.com> Co-authored-by: Szymon Dudycz <szymond@pathway.com> Co-authored-by: bjornengdahl <51057906+bjornengdahl@users.noreply.github.com> GitOrigin-RevId: 8d36da94e48045de49370e09fd05802a0e74d426
1 parent c9ee7d9 commit f65904e

File tree

6 files changed

+175
-12
lines changed

6 files changed

+175
-12
lines changed

examples/pipelines/adaptive-rag/app.yaml

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,22 @@
1+
# This YAML configuration file is used to set up and configure the Adaptive RAG template.
2+
# It defines various components such as data sources, language models, embedders, splitters, parsers, and retrievers.
3+
# Each section is configured to specify how the template should process and handle data for generating responses.
4+
# You can learn more about the YAML syntax here: https://pathway.com/developers/templates/configure-yaml
5+
6+
7+
8+
# $sources defines the data sources used to read the data which will be indexed in the RAG.
9+
# You can learn more about how to configure data sources here:
10+
# https://pathway.com/developers/templates/yaml-examples/data-sources-examples
11+
112
$sources:
13+
# File System connector, reading data locally.
214
- !pw.io.fs.read
315
path: data
416
format: binary
517
with_metadata: true
618

19+
# Uncomment to use the SharePoint connector
720
# - !pw.xpacks.connectors.sharepoint.read
821
# url: $SHAREPOINT_URL
922
# tenant: $SHAREPOINT_TENANT
@@ -14,6 +27,7 @@ $sources:
1427
# with_metadata: true
1528
# refresh_interval: 30
1629

30+
# Uncomment to use the Google Drive connector
1731
# - !pw.io.gdrive.read
1832
# object_id: $DRIVE_ID
1933
# service_user_credentials_file: gdrive_indexer.json
@@ -24,6 +38,14 @@ $sources:
2438
# with_metadata: true
2539
# refresh_interval: 30
2640

41+
42+
43+
# Configures the LLM model settings for generating responses.
44+
# The list of Pathway LLM wrappers is available here:
45+
# https://pathway.com/developers/api-docs/pathway-xpacks-llm/llms
46+
# You can learn more about those in our documentation:
47+
# https://pathway.com/developers/templates/rag-customization/llm-chats
48+
2749
$llm: !pw.xpacks.llm.llms.OpenAIChat
2850
model: "gpt-4o-mini"
2951
retry_strategy: !pw.udfs.ExponentialBackoffRetryStrategy
@@ -32,39 +54,49 @@ $llm: !pw.xpacks.llm.llms.OpenAIChat
3254
temperature: 0
3355
capacity: 8
3456

57+
# Specifies the embedder model for converting text into embeddings.
3558
$embedder: !pw.xpacks.llm.embedders.OpenAIEmbedder
3659
model: "text-embedding-ada-002"
3760
cache_strategy: !pw.udfs.DefaultCache
3861

62+
# Defines the splitter settings for dividing text into smaller chunks.
3963
$splitter: !pw.xpacks.llm.splitters.TokenCountSplitter
4064
max_tokens: 400
4165

66+
# Configures the parser for processing and extracting information from documents.
4267
$parser: !pw.xpacks.llm.parsers.DoclingParser
4368
cache_strategy: !pw.udfs.DefaultCache
4469

70+
# Sets up the retriever factory for indexing and retrieving documents.
4571
$retriever_factory: !pw.stdlib.indexing.BruteForceKnnFactory
4672
reserved_space: 1000
4773
embedder: $embedder
4874
metric: !pw.stdlib.indexing.BruteForceKnnMetricKind.COS
4975

76+
# Manages the storage and retrieval of documents for the RAG template.
5077
$document_store: !pw.xpacks.llm.document_store.DocumentStore
5178
docs: $sources
5279
parser: $parser
5380
splitter: $splitter
5481
retriever_factory: $retriever_factory
5582

83+
# Configures the question-answering component using the RAG approach.
84+
# The component builds a RAG over an index.
85+
# You can interact with the resulting RAG using a REST API.
86+
# You can learn more about the available operations here:
87+
# https://pathway.com/developers/templates/rag-customization/rest-api
5688
question_answerer: !pw.xpacks.llm.question_answering.AdaptiveRAGQuestionAnswerer
5789
llm: $llm
5890
indexer: $document_store
5991
n_starting_documents: 2
6092
factor: 2
6193
max_iterations: 4
6294

63-
# Change host and port by uncommenting these lines
95+
# Change host and port of the webserver by uncommenting these lines
6496
# host: "0.0.0.0"
6597
# port: 8000
6698

67-
# Cache configuration
99+
# Activate on-disk caching for UDFs for which `cache_strategy` is set
68100
# with_cache: true
69101

70102
# If `terminate_on_error` is true then the program will terminate whenever any error is encountered.

examples/pipelines/demo-document-indexing/app.yaml

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,21 @@
1+
# This YAML configuration file is used to set up and configure the Document indexing RAG template.
2+
# It defines various components such as data sources, embedders, splitters, parsers, and retrievers.
3+
# Each section is configured to specify how the template should process and handle data for answering the queries.
4+
# You can learn more about the YAML syntax here: https://pathway.com/developers/templates/configure-yaml
5+
6+
7+
# $sources defines the data sources used to read the data which will be indexed in the RAG.
8+
# You can learn more about how to configure data sources here:
9+
# https://pathway.com/developers/templates/yaml-examples/data-sources-examples
10+
111
$sources:
12+
# File System connector, reading data locally.
213
- !pw.io.fs.read
314
path: files-for-indexing
415
format: binary
516
with_metadata: true
617

18+
# Uncomment to use the SharePoint connector
719
# - !pw.xpacks.connectors.sharepoint.read
820
# url: $SHAREPOINT_URL
921
# tenant: $SHAREPOINT_TENANT
@@ -14,6 +26,7 @@ $sources:
1426
# with_metadata: true
1527
# refresh_interval: 30
1628

29+
# Uncomment to use the Google Drive connector
1730
# - !pw.io.gdrive.read
1831
# object_id: $DRIVE_ID
1932
# service_user_credentials_file: gdrive_indexer.json
@@ -24,35 +37,41 @@ $sources:
2437
# with_metadata: true
2538
# refresh_interval: 30
2639

40+
# Model used for embedding
2741
$embedding_model: "mixedbread-ai/mxbai-embed-large-v1"
2842

43+
# Specifies the embedder model for converting text into embeddings.
2944
$embedder: !pw.xpacks.llm.embedders.SentenceTransformerEmbedder
3045
model: $embedding_model
3146
call_kwargs:
3247
show_progress_bar: False
3348

49+
# Defines the splitter settings for dividing text into smaller chunks.
3450
$splitter: !pw.xpacks.llm.splitters.TokenCountSplitter
3551
max_tokens: 400
3652

53+
# Configures the parser for processing and extracting information from documents.
3754
$parser: !pw.xpacks.llm.parsers.UnstructuredParser
3855
cache_strategy: !pw.udfs.DefaultCache
3956

57+
# Sets up the retriever factory for indexing and retrieving documents.
4058
$retriever_factory: !pw.stdlib.indexing.BruteForceKnnFactory
4159
reserved_space: 1000
4260
embedder: $embedder
4361
metric: !pw.stdlib.indexing.BruteForceKnnMetricKind.COS
4462

63+
# Manages the storage and retrieval of documents for the RAG template.
4564
document_store: !pw.xpacks.llm.document_store.DocumentStore
4665
docs: $sources
4766
parser: $parser
4867
splitter: $splitter
4968
retriever_factory: $retriever_factory
5069

51-
# Change host and port by uncommenting these lines
70+
# Change host and port of the webserver by uncommenting these lines
5271
# host: "0.0.0.0"
5372
# port: 8000
5473

55-
# Cache configuration
74+
# Activate on-disk caching for UDFs for which `cache_strategy` is set
5675
# with_cache: true
5776

5877
# If `terminate_on_error` is true then the program will terminate whenever any error is encountered.

examples/pipelines/demo-question-answering/app.yaml

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,22 @@
1+
# This YAML configuration file is used to set up and configure the Question Answering RAG template.
2+
# It defines various components such as data sources, language models, embedders, splitters, parsers, and retrievers.
3+
# Each section is configured to specify how the template should process and handle data for generating responses.
4+
# You can learn more about the YAML syntax here: https://pathway.com/developers/templates/configure-yaml
5+
6+
7+
8+
# $sources defines the data sources used to read the data which will be indexed in the RAG.
9+
# You can learn more about how to configure data sources here:
10+
# https://pathway.com/developers/templates/yaml-examples/data-sources-examples
11+
112
$sources:
13+
# File System connector, reading data locally.
214
- !pw.io.fs.read
315
path: data
416
format: binary
517
with_metadata: true
618

19+
# Uncomment to use the SharePoint connector
720
# - !pw.xpacks.connectors.sharepoint.read
821
# url: $SHAREPOINT_URL
922
# tenant: $SHAREPOINT_TENANT
@@ -14,6 +27,7 @@ $sources:
1427
# with_metadata: true
1528
# refresh_interval: 30
1629

30+
# Uncomment to use the Google Drive connector
1731
# - !pw.io.gdrive.read
1832
# object_id: $DRIVE_ID
1933
# service_user_credentials_file: gdrive_indexer.json
@@ -24,6 +38,13 @@ $sources:
2438
# with_metadata: true
2539
# refresh_interval: 30
2640

41+
42+
# Configures the LLM model settings for generating responses.
43+
# The list of Pathway LLM wrappers is available here:
44+
# https://pathway.com/developers/api-docs/pathway-xpacks-llm/llms
45+
# You can learn more about those in our documentation:
46+
# https://pathway.com/developers/templates/rag-customization/llm-chats
47+
2748
$llm: !pw.xpacks.llm.llms.OpenAIChat
2849
model: "gpt-4o"
2950
retry_strategy: !pw.udfs.ExponentialBackoffRetryStrategy
@@ -32,27 +53,37 @@ $llm: !pw.xpacks.llm.llms.OpenAIChat
3253
temperature: 0
3354
capacity: 8
3455

56+
# Specifies the embedder model for converting text into embeddings.
3557
$embedder: !pw.xpacks.llm.embedders.OpenAIEmbedder
3658
model: "text-embedding-ada-002"
3759
cache_strategy: !pw.udfs.DefaultCache
3860

61+
# Defines the splitter settings for dividing text into smaller chunks.
3962
$splitter: !pw.xpacks.llm.splitters.TokenCountSplitter
4063
max_tokens: 400
4164

65+
# Configures the parser for processing and extracting information from documents.
4266
$parser: !pw.xpacks.llm.parsers.DoclingParser
4367
async_mode: "fully_async"
4468

69+
# Sets up the retriever factory for indexing and retrieving documents.
4570
$retriever_factory: !pw.stdlib.indexing.UsearchKnnFactory
4671
reserved_space: 1000
4772
embedder: $embedder
4873
metric: !pw.stdlib.indexing.USearchMetricKind.COS
4974

75+
# Manages the storage and retrieval of documents for the RAG template.
5076
$document_store: !pw.xpacks.llm.document_store.DocumentStore
5177
docs: $sources
5278
parser: $parser
5379
splitter: $splitter
5480
retriever_factory: $retriever_factory
5581

82+
# Configures the question-answering component using the RAG approach.
83+
# The component builds a RAG over an index.
84+
# You can interact with the resulting RAG using a REST API.
85+
# You can learn more about the available operations here:
86+
# https://pathway.com/developers/templates/rag-customization/rest-api
5687
question_answerer: !pw.xpacks.llm.question_answering.BaseRAGQuestionAnswerer
5788
llm: $llm
5889
indexer: $document_store
@@ -63,11 +94,11 @@ question_answerer: !pw.xpacks.llm.question_answering.BaseRAGQuestionAnswerer
6394
# and `{context}` as a placeholder for context documents.
6495
# prompt_template: "Given these documents: {context}, please answer the question: {query}"
6596

66-
# Change host and port by uncommenting these lines
97+
# Change host and port of the webserver by uncommenting these lines
6798
# host: "0.0.0.0"
6899
# port: $PATHWAY_PORT
69100

70-
# Cache configuration
101+
# Activate on-disk caching for UDFs for which `cache_strategy` is set
71102
# with_cache: true
72103

73104
# If `terminate_on_error` is true then the program will terminate whenever any error is encountered.

examples/pipelines/gpt_4o_multimodal_rag/app.yaml

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,22 @@
1+
# This YAML configuration file is used to set up and configure the Multimodal RAG template.
2+
# It defines various components such as data sources, language models, embedders, splitters, parsers, and retrievers.
3+
# Each section is configured to specify how the template should process and handle data for generating responses.
4+
# You can learn more about the YAML syntax here: https://pathway.com/developers/templates/configure-yaml
5+
6+
7+
8+
# $sources defines the data sources used to read the data which will be indexed in the RAG.
9+
# You can learn more about how to configure data sources here:
10+
# https://pathway.com/developers/templates/yaml-examples/data-sources-examples
11+
112
$sources:
13+
# File System connector, reading data locally.
214
- !pw.io.fs.read
315
path: data
416
format: binary
517
with_metadata: true
618

19+
# Uncomment to use the SharePoint connector
720
# - !pw.xpacks.connectors.sharepoint.read
821
# url: $SHAREPOINT_URL
922
# tenant: $SHAREPOINT_TENANT
@@ -14,6 +27,7 @@ $sources:
1427
# with_metadata: true
1528
# refresh_interval: 30
1629

30+
# Uncomment to use the Google Drive connector
1731
# - !pw.io.gdrive.read
1832
# object_id: $DRIVE_ID
1933
# service_user_credentials_file: gdrive_indexer.json
@@ -24,6 +38,14 @@ $sources:
2438
# with_metadata: true
2539
# refresh_interval: 30
2640

41+
42+
43+
# Configures the LLM model settings for generating responses.
44+
# The list of Pathway LLM wrappers is available here:
45+
# https://pathway.com/developers/api-docs/pathway-xpacks-llm/llms
46+
# You can learn more about those in our documentation:
47+
# https://pathway.com/developers/templates/rag-customization/llm-chats
48+
2749
$llm: !pw.xpacks.llm.llms.OpenAIChat
2850
model: "gpt-3.5-turbo"
2951
retry_strategy: !pw.udfs.ExponentialBackoffRetryStrategy
@@ -32,33 +54,40 @@ $llm: !pw.xpacks.llm.llms.OpenAIChat
3254
temperature: 0
3355
capacity: 8
3456

57+
# Specifies the embedder model for converting text into embeddings.
3558
$embedder: !pw.xpacks.llm.embedders.OpenAIEmbedder
3659
model: "text-embedding-ada-002"
3760
cache_strategy: !pw.udfs.DefaultCache
3861

62+
# Configures the LLM used by the parser (referenced as `multimodal_llm` in `$parser` below).
3963
$parsing_llm: !pw.xpacks.llm.llms.OpenAIChat
4064
model: "gpt-4o"
4165
retry_strategy: !pw.udfs.ExponentialBackoffRetryStrategy
4266
max_retries: 6
4367
cache_strategy: !pw.udfs.DefaultCache
4468

69+
# Configures the parser for processing and extracting information from documents.
4570
$parser: !pw.xpacks.llm.parsers.DoclingParser
4671
multimodal_llm: parsing_llm
4772

73+
# Sets up the splitter for chunking the documents.
4874
$splitter: !pw.xpacks.llm.splitters.TokenCountSplitter
4975
max_tokens: 400
5076

77+
# Sets up the retriever factory for indexing and retrieving documents.
5178
$retriever_factory: !pw.stdlib.indexing.BruteForceKnnFactory
5279
reserved_space: 1000
5380
embedder: $embedder
5481
metric: !pw.engine.BruteForceKnnMetricKind.COS
5582

83+
# Manages the storage and retrieval of documents for the RAG template.
5684
$document_store: !pw.xpacks.llm.document_store.DocumentStore
5785
docs: $sources
5886
parser: $parser
5987
splitter: $splitter
6088
retriever_factory: $retriever_factory
6189

90+
# Configures the question-answering component using the RAG approach.
6291
question_answerer: !pw.xpacks.llm.question_answering.BaseRAGQuestionAnswerer
6392
llm: $llm
6493
indexer: $document_store
@@ -69,11 +98,11 @@ question_answerer: !pw.xpacks.llm.question_answering.BaseRAGQuestionAnswerer
6998
# and `{context}` as a placeholder for context documents.
7099
# prompt_template: "Given these documents: {context}, please answer the question: {query}"
71100

72-
# Change host and port by uncommenting these lines
101+
# Change host and port of the webserver by uncommenting these lines
73102
# host: "0.0.0.0"
74103
# port: 8000
75104

76-
# Cache configuration
105+
# Activate on-disk caching for UDFs for which `cache_strategy` is set
77106
# with_cache: true
78107

79108
# If `terminate_on_error` is true then the program will terminate whenever any error is encountered.

0 commit comments

Comments
 (0)