
Commit 843e9a3

Updated tutorials to use lancedb and pull out scripts to a separate file

1 parent: d59de5a

22 files changed: +318 −307 lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions

```diff
@@ -1,5 +1,10 @@
 # Changelog
 
+## 0.9.3 (in development)
+### Improvements
+- Updated tutorials to use lancedb.
+- Moved tutorial scripts out of shell scripts into their own files
+
 ## 0.9.2
 ### Improvements
 - Added ability for sources and segments to have multiple names in chatterlang.
```

docs/tutorials/Tutorial_1-Document_Indexing/README.md

Lines changed: 34 additions & 28 deletions

````diff
@@ -57,20 +57,22 @@ However, for testing and development purposes, we often need synthetic data that
 
 ### The Solution: AI-Generated Stories
 
-The first step uses TalkPipe's ChatterLang scripting language to generate 50 fictional stories about technology development. Here's what happens:
+The first step uses TalkPipe's ChatterLang scripting language to generate 50 fictional stories about technology development. The pipeline is defined in `Step_1_CreateSyntheticData.script`:
 
-```bash
-export TALKPIPE_CHATTERLANG_SCRIPT='
-LOOP 50 TIMES {
-    INPUT FROM "Write a fictitious five sentence story about technology development in an imaginary country."
-    | llmPrompt[source="ollama", model="llama3.2", multi_turn=False]
-    | toDict[field_list="_:content"]
-    | llmPrompt[source="ollama", model="llama3.2", system_prompt="Write exactly one title for this story in plain text with no markdown", field="content", set_as="title", multi_turn=False]
-    | dumpsJsonl | print;
-}
-'
+```
+LOOP 50 TIMES {
+    INPUT FROM "Write a fictitious five sentence story about technology development in an imaginary country."
+    | llmPrompt[source="ollama", model="llama3.2", multi_turn=False]
+    | toDict[field_list="_:content"]
+    | llmPrompt[source="ollama", model="llama3.2", system_prompt="Write exactly one title for this story in plain text with no markdown", field="content", set_as="title", multi_turn=False]
+    | dumpsJsonl | print;
+}
+```
+
+To run this script:
 
-python -m talkpipe.app.chatterlang_script --script CHATTERLANG_SCRIPT > stories.json
+```bash
+chatterlang_script --script Step_1_CreateSyntheticData.script > stories.json
 ```
 
 ### Breaking Down the Pipeline
@@ -166,17 +168,19 @@ Think of an index like the index at the back of a book, but much more sophistica
 
 ### The Solution: Whoosh Indexing
 
-Step 2 takes our generated stories and creates a searchable index using the Whoosh library:
+Step 2 takes our generated stories and creates a searchable index using the Whoosh library. The pipeline is defined in `Step_2_IndexStories.script`:
 
-```bash
-export TALKPIPE_CHATTERLANG_SCRIPT='
-INPUT FROM "stories.json"
-| readJsonl
-| progressTicks[tick_count=1, print_count=True]
-| indexWhoosh[index_path="./full_text_index", field_list="content,title", overwrite=True]
-'
+```
+INPUT FROM "stories.json"
+| readJsonl
+| progressTicks[tick_count=1, print_count=True]
+| indexWhoosh[index_path="./full_text_index", field_list="content,title", overwrite=True]
+```
+
+To run this script:
 
-python -m talkpipe.app.chatterlang_script --script CHATTERLANG_SCRIPT
+```bash
+chatterlang_script --script Step_2_IndexStories.script
 ```
 
 ### Understanding the Indexing Pipeline
@@ -283,15 +287,17 @@ Most search implementations require significant custom development, but TalkPipe
 
 ### The Solution: Dual Interface Search
 
-Step 3 creates both an API endpoint and a web interface using a single command:
+Step 3 creates both an API endpoint and a web interface using a single command. The pipeline is defined in `Step_3_SearchStories.script`:
 
-```bash
-export TALKPIPE_CHATTERLANG_SCRIPT='
-| searchWhoosh[index_path="full_text_index", field="query"]
-| formatItem[field_list="document.title:Title,document.content:Content,score:Score"]
-'
+```
+| searchWhoosh[index_path="full_text_index", field="query"]
+| formatItem[field_list="document.title:Title,document.content:Content,score:Score"]
+```
 
-python -m talkpipe.app.chatterlang_serve --form-config story_search_ui.yml --title "Story Search" --display-property query --script CHATTERLANG_SCRIPT
+To run this script:
+
+```bash
+chatterlang_serve --form-config story_search_ui.yml --title "Story Search" --display-property query --script Step_3_SearchStories.script
 ```
 
 ### Understanding the Search System
````
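The Step 1 pipeline described in this README emits one JSON object per line, with a `content` field (from `toDict[field_list="_:content"]`) and a `title` field (from `set_as="title"`). As a rough illustration only, here is a minimal Python sketch of consuming such JSONL output; the sample records are invented, not actual tutorial output:

```python
import json

# Hypothetical records in the shape the Step 1 pipeline emits: one JSON
# object per line, with "content" and "title" fields.
sample_jsonl = "\n".join([
    json.dumps({"content": "A five sentence story...", "title": "Story One"}),
    json.dumps({"content": "Another short story...", "title": "Story Two"}),
])

def read_jsonl(text):
    """Decode one JSON object per non-empty line."""
    for line in text.splitlines():
        if line.strip():
            yield json.loads(line)

records = list(read_jsonl(sample_jsonl))
print([r["title"] for r in records])  # ['Story One', 'Story Two']
```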
docs/tutorials/Tutorial_1-Document_Indexing/Step_1_CreateSyntheticData.script

Lines changed: 26 additions & 0 deletions

```diff
@@ -0,0 +1,26 @@
+###################################################################################
+# Step 1: Create Synthetic Data
+#
+# This script generates synthetic data for document indexing.
+# "chatterlang_script" is a command installed with talkpipe that allows you
+# to run Chatterlang scripts from the command line.
+#
+# This particular script generates a set of fictitious stories that we'll use
+# to test the document indexing and search.
+# The pipeline is:
+# 1. Loop 50 times
+# 2. "INPUT FROM..." issues a prompt to the LLM to generate a five-sentence story
+#    about technology development in an imaginary country.
+# 3. The output is processed to create a dictionary with the story content.
+# 4. A second LLM prompt generates a title for the story.
+# 5. The results are formatted as JSONL and printed to the console.
+# 6. The output is redirected to a file named "stories.json".
+###################################################################################
+
+LOOP 50 TIMES {
+    INPUT FROM "Write a fictitious five sentence story about technology development in an imaginary country."
+    | llmPrompt[source="ollama", model="llama3.2", multi_turn=False]
+    | toDict[field_list="_:content"]
+    | llmPrompt[source="ollama", model="llama3.2", system_prompt="Write exactly one title for this story in plain text with no markdown", field="content", set_as="title", multi_turn=False]
+    | dumpsJsonl | print;
+}
```
Lines changed: 2 additions & 30 deletions

```diff
@@ -1,30 +1,2 @@
-###################################################################################
-# Step 1: Create Synthetic Data
-#
-# This script generates synthetic data for document indexing.
-# "chatterlang_script" is a command installed with talkpipe that allows you
-# to run Chatterlang scripts from the command line.
-#
-# This particular script generates a set of fictitious stories that we'll use
-# to test the document indexing and search.
-# The pipeline is:
-# 1. Loop 50 times
-# 2. "INPUT FROM..." issues a prompt to the LLM to generate a five-sentence story
-#    about technology development in an imaginary country.
-# 3. The output is processed to create a dictionary with the story content.
-# 4. A second LLM prompt generates a title for the story.
-# 5. The results are formatted as JSONL and printed to the console.
-# 6. The output is redirected to a file named "stories.json".
-###################################################################################
-
-export TALKPIPE_CHATTERLANG_SCRIPT='
-LOOP 50 TIMES {
-    INPUT FROM "Write a fictitious five sentence story about technology development in an imaginary country."
-    | llmPrompt[source="ollama", model="llama3.2", multi_turn=False]
-    | toDict[field_list="_:content"]
-    | llmPrompt[source="ollama", model="llama3.2", system_prompt="Write exactly one title for this story in plain text with no markdown", field="content", set_as="title", multi_turn=False]
-    | dumpsJsonl | print;
-}
-'
-#chatterlang_script --script "
-python -m talkpipe.app.chatterlang_script --script CHATTERLANG_SCRIPT > stories.json
+#!/bin/bash
+chatterlang_script --script Step_1_CreateSyntheticData.script > stories.json
```
docs/tutorials/Tutorial_1-Document_Indexing/Step_2_IndexStories.script

Lines changed: 22 additions & 0 deletions

```diff
@@ -0,0 +1,22 @@
+###################################################################################
+# Step 2: Index Stories
+#
+# This script indexes the stories generated in Step 1 using the Whoosh library.
+# It reads the JSON file created in Step 1 and indexes the content and titles of the stories.
+# The indexed data can then be used for full-text search.
+#
+# As a side note, the first half of this script issues a single piece of data, the
+# filename. The next segment, `readJsonl`, reads the JSONL file line by line and
+# issues one decoded JSON object at a time. This is a good example of how the
+# constitution of the data being processed can change as it flows through the pipeline.
+#
+# The pipeline is:
+# 1. Read the JSONL file "stories.json" created in Step 1.
+# 2. Use the `indexWhoosh` segment to index the content and title fields.
+# 3. The index is stored in the specified path "./full_text_index".
+###################################################################################
+
+INPUT FROM "stories.json"
+| readJsonl
+| progressTicks[tick_count=1, print_count=True]
+| indexWhoosh[index_path="./full_text_index", field_list="content,title", overwrite=True]
```
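The script's comment notes how the shape of the data changes mid-pipeline: one filename goes in, and many decoded JSON objects come out. The following is a toy Python sketch of that generator-pipeline idea; the function names are illustrative, not TalkPipe's actual API, and a `StringIO` with invented records stands in for `stories.json`:

```python
import json
from io import StringIO

# Stand-in for stories.json (hypothetical records, not tutorial output).
STORIES = StringIO(
    '{"title": "A", "content": "first story"}\n'
    '{"title": "B", "content": "second story"}\n'
)

def input_from(item):
    # Source: issues exactly one piece of data (here, the "file").
    yield item

def read_jsonl(items):
    # Fan-out: one incoming buffer becomes many decoded JSON objects.
    for buf in items:
        for line in buf:
            if line.strip():
                yield json.loads(line)

def progress_ticks(items, tick_count=1):
    # Count items as they stream past, then pass them along unchanged.
    n = 0
    for item in items:
        n += 1
        if n % tick_count == 0:
            print(f"processed {n}")
        yield item

records = list(progress_ticks(read_jsonl(input_from(STORIES))))
print(len(records))  # 2
```

Because each stage is a generator, items stream through one at a time rather than being materialized as intermediate lists.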
Lines changed: 2 additions & 27 deletions

```diff
@@ -1,27 +1,2 @@
-###################################################################################
-# Step 2: Index Stories
-#
-# This script indexes the stories generated in Step 1 using the Whoosh library.
-# It reads the JSON file created in Step 1 and indexes the content and titles of the stories.
-# The indexed data can then be used for full-text search.
-#
-# As a side note, the first half of this script issues a single piece of data, the
-# filename. The next segment, `readJsonl`, reads the JSONL file line by line and
-# issues one decoded JSON object at a time. This is a good example of how the
-# constitution of the data being processed can change as it flows through the pipeline.
-#
-# The pipeline is:
-# 1. Read the JSONL file "stories.json" created in Step 1.
-# 2. Use the `indexWhoosh` segment to index the content and title fields.
-# 3. The index is stored in the specified path "./full_text_index".
-###################################################################################
-
-export TALKPIPE_CHATTERLANG_SCRIPT='
-INPUT FROM "stories.json"
-| readJsonl
-| progressTicks[tick_count=1, print_count=True]
-| indexWhoosh[index_path="./full_text_index", field_list="content,title", overwrite=True]
-'
-
-#chatterlang_script --script "
-python -m talkpipe.app.chatterlang_script --script CHATTERLANG_SCRIPT
+#!/bin/bash
+chatterlang_script --script Step_2_IndexStories.script
```
docs/tutorials/Tutorial_1-Document_Indexing/Step_3_SearchStories.script

Lines changed: 16 additions & 0 deletions

```diff
@@ -0,0 +1,16 @@
+###################################################################################
+# Step 3: Search Stories
+# This script allows users to search for indexed stories using the Whoosh library.
+# It opens two interfaces. The first is an API endpoint that accepts search queries
+# and returns matching stories. The second is a command-line interface that allows
+# users to enter search terms interactively.
+#
+# It accomplishes this by using the chatterlang_serve to serve a pipeline that
+# reads queries in the form of JSON objects, processes them, and returns results.
+# The same application provides a search-like interface, configured by a yaml file,
+# that makes it easy for a user to create the JSON sent to the endpoint without
+# needing to write any code.
+###################################################################################
+
+| searchWhoosh[index_path="full_text_index", field="query"]
+| formatItem[field_list="document.title:Title,document.content:Content,score:Score"]
```
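The `formatItem` field_list appears to map dotted paths on each search hit to display labels (e.g. `document.title` rendered under the label `Title`). This is an interpretation of the syntax as it appears in the scripts, not TalkPipe's documented behavior; a small Python sketch of that interpretation, with a hypothetical search hit:

```python
def get_path(obj, dotted):
    """Walk a dotted path like "document.title" through nested dicts."""
    for key in dotted.split("."):
        obj = obj[key]
    return obj

def format_item(item, field_list):
    """Render an item from "path:Label" pairs, e.g.
    "document.title:Title,document.content:Content,score:Score"."""
    lines = []
    for spec in field_list.split(","):
        path, label = spec.strip().split(":")
        lines.append(f"{label}: {get_path(item, path)}")
    return "\n".join(lines)

# A hypothetical search hit shaped like the field_list implies:
# a nested "document" plus a top-level "score".
hit = {"document": {"title": "Story One", "content": "..."}, "score": 0.87}
print(format_item(hit, "document.title:Title,score:Score"))
```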
Lines changed: 2 additions & 19 deletions

```diff
@@ -1,19 +1,2 @@
-###################################################################################
-# Step 3: Search Stories
-# This script allows users to search for indexed stories using the Whoosh library.
-# It opens two interface. The first is an API endpoint that accepts search queries
-# and returns matching stories. The second is a command-line interface that allows
-# users to enter search terms interactively.
-#
-# It accomplishes this by using the chatterlang_serve to serve a pipeline that
-# reads queries in the form of JSON objects, processes them, and returns results.
-# The same application provides a search-like interface, configured by a yaml file,
-# that makes it easy for a user to create the JSON sent to the endpoint without
-# needing to write any code.
-###################################################################################
-
-export TALKPIPE_CHATTERLANG_SCRIPT='
-| searchWhoosh[index_path="full_text_index", field="query"]
-| formatItem[field_list="document.title:Title,document.content:Content,score:Score"]
-'
-chatterlang_serve --form-config story_search_ui.yml --title \"Story\ Search\" --display-property query --script CHATTERLANG_SCRIPT
+#!/bin/bash
+chatterlang_serve --form-config story_search_ui.yml --title "Story Search" --display-property query --script Step_3_SearchStories.script
```

docs/tutorials/Tutorial_2-Search_by_Example_and_RAG/README.md

Lines changed: 44 additions & 32 deletions

````diff
@@ -34,16 +34,20 @@ Vector embeddings solve this by converting text into high-dimensional mathematic
 
 ### The Implementation
 
+The pipeline is defined in `Step_1_CreateVectorDatabase.script`:
+
+```
+INPUT FROM "../Tutorial_1-Document_Indexing/stories.json"
+| readJsonl
+| progressTicks[tick_count=1, print_count=True]
+| llmEmbed[field="content", source="ollama", model="mxbai-embed-large", set_as="vector"]
+| addToLanceDB[path="./vector_index", table_name="stories", vector_field="vector", metadata_field_list="title,content", overwrite=True]
+```
+
+To run this script:
+
 ```bash
-export TALKPIPE_CHATTERLANG_SCRIPT='
-INPUT FROM "../Tutorial_1-Document_Indexing/stories.json"
-| readJsonl
-| progressTicks[tick_count=1, print_count=True]
-| llmEmbed[field="content", source="ollama", model="mxbai-embed-large", set_as="vector"]
-| addVector[path="./vector_index", vector_field="vector", metadata_field_list="title,content", overwrite=True]
-'
-
-python -m talkpipe.app.chatterlang_script --script CHATTERLANG_SCRIPT
+chatterlang_script --script Step_1_CreateVectorDatabase.script
 ```
 
 ### Breaking Down the Pipeline
@@ -67,12 +71,12 @@ The `mxbai-embed-large` model is specifically designed for semantic search - it'
 
 **3. Building the Index**
 ```
-| addVector[path="./vector_index", vector_field="vector", metadata_field_list="title,content", overwrite=True]
+| addToLanceDB[path="./vector_index", table_name="stories", vector_field="vector", metadata_field_list="title,content", overwrite=True]
 ```
 This creates a specialized index that:
-- Stores vectors for similarity search
+- Stores vectors for similarity search in a LanceDB table named "stories"
 - Preserves original metadata (title and content) for retrieval
-- Enables fast nearest-neighbor queries
+- Enables fast nearest-neighbor queries using LanceDB's efficient vector search capabilities
 
 ### Real-World Applications
 
@@ -96,15 +100,19 @@ Your users don't always know the right keywords. Sometimes they have an example
 
 ### The Solution: Semantic Search Interface
 
+The pipeline is defined in `Step_2_SearchByExample.script`:
+
+```
+| copy
+| llmEmbed[field="example", source="ollama", model="mxbai-embed-large", set_as="vector"]
+| searchLanceDB[field="vector", path="./vector_index", table_name="stories", limit=10]
+| formatItem[field_list="document.title:Title, document.content:Content, score:Score"]
+```
+
+To run this script:
+
 ```bash
-export TALKPIPE_CHATTERLANG_SCRIPT='
-| copy
-| llmEmbed[field="example", source="ollama", model="mxbai-embed-large", set_as="vector"]
-| searchVector[vector_field="vector", path="./vector_index"]
-| formatItem[field_list="document.title:Title, document.content:Content, score:Score"]
-'
-
-python -m talkpipe.app.chatterlang_serve --form-config story_by_example_ui.yml --display-property example --script CHATTERLANG_SCRIPT
+chatterlang_serve --form-config story_by_example_ui.yml --display-property example --script Step_2_SearchByExample.script
 ```
 
 ### Understanding the Search Pipeline
@@ -123,9 +131,9 @@ The user's example text is converted to a vector using the same model that index
 
 **3. Vector Search**
 ```
-| searchVector[vector_field="vector", path="./vector_index"]
+| searchLanceDB[field="vector", path="./vector_index", table_name="stories", limit=10]
 ```
-This finds the documents whose vectors are closest to the query vector - literally the nearest neighbors in high-dimensional space.
+This finds the documents whose vectors are closest to the query vector - literally the nearest neighbors in high-dimensional space. LanceDB provides efficient approximate nearest neighbor search for fast retrieval.
 
 **4. Result Formatting**
 ```
@@ -167,23 +175,27 @@ Finding relevant documents is helpful, but what users often really want is an an
 
 ### The RAG Implementation
 
+The pipeline is defined in `Step_3_SpecializedRag.script`:
+
+```
+| copy
+| llmEmbed[field="example", source="ollama", model="mxbai-embed-large", set_as="vector"]
+| searchLanceDB[field="vector", path="./vector_index", table_name="stories", all_results_at_once=True, set_as="results"]
+| ragPrompt
+| llmPrompt[source="ollama", model="llama3.2"]
+```
+
+To run this script:
+
 ```bash
-export TALKPIPE_CHATTERLANG_SCRIPT='
-| copy
-| llmEmbed[field="example", source="ollama", model="mxbai-embed-large", set_as="vector"]
-| searchVector[vector_field="vector", path="./vector_index", all_results_at_once=True, set_as="results"]
-| ragPrompt
-| llmPrompt[source="ollama", model="llama3.2"]
-'
-
-python -m talkpipe.app.chatterlang_serve --form-config story_by_example_ui.yml --load-module step_3_extras.py --display-property example --script CHATTERLANG_SCRIPT
+chatterlang_serve --form-config story_by_example_ui.yml --load-module step_3_extras.py --display-property example --script Step_3_SpecializedRag.script
 ```
 
 ### What's Different in the RAG Pipeline
 
 **1. Batch Results Collection**
 ```
-| searchVector[..., all_results_at_once=True, set_as="results"]
+| searchLanceDB[..., all_results_at_once=True, set_as="results"]
 ```
 Instead of processing results one by one, we collect all search results together. This allows the next step to see the full context.
 
````
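Both the search-by-example and RAG steps in this README rest on nearest-neighbor lookup over embedding vectors. Here is a self-contained Python sketch of the idea using cosine similarity over toy 3-dimensional vectors; real embeddings from a model such as mxbai-embed-large are far higher-dimensional, and LanceDB uses approximate rather than exhaustive search:

```python
import math

def cosine(a, b):
    """Cosine similarity between two equal-length vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    return dot / (norm_a * norm_b)

# Toy 3-dimensional "embeddings" keyed by an invented topic label.
index = {
    "robots": [0.9, 0.1, 0.0],
    "farming": [0.1, 0.9, 0.2],
    "space": [0.0, 0.2, 0.9],
}
query = [0.8, 0.2, 0.1]

# Exhaustive nearest-neighbor search: pick the stored vector most
# similar to the query vector.
best = max(index, key=lambda k: cosine(query, index[k]))
print(best)  # robots
```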
docs/tutorials/Tutorial_2-Search_by_Example_and_RAG/Step_1_CreateVectorDatabase.script

Lines changed: 17 additions & 0 deletions

```diff
@@ -0,0 +1,17 @@
+###################################################################################
+# This script creates a vector database using the provided configuration file.
+# It uses the `chatterlang_script` command to run a Chatterlang script that
+# uses the synthetic data generated in the previous tutorial.
+#
+# The pipeline is:
+# 1. Read the JSONL file "stories.json" created in the previous tutorial.
+# 2. Use the `llmEmbed` segment to generate embeddings for the content field
+#    using the specified model.
+# 3. The embeddings are stored in a vector index at the specified path.
+###################################################################################
+
+INPUT FROM "../Tutorial_1-Document_Indexing/stories.json"
+| readJsonl
+| progressTicks[tick_count=1, print_count=True]
+| llmEmbed[field="content", source="ollama", model="mxbai-embed-large", set_as="vector"]
+| addToLanceDB[path="./vector_index", table_name="stories", vector_field="vector", metadata_field_list="title,content", overwrite=True]
```

0 commit comments
