Add deterministic setting to database functions closes #1048. Add ability to index database expressions #1049

davidmezzetti · davidmezzetti · commit 157d86acca1e · 2026-02-22T16:26:42.000-05:00
diff --git a/docs/embeddings/configuration/general.md b/docs/embeddings/configuration/general.md
@@ -66,10 +66,13 @@ Sets the auto id generation method. When this is not set, an autogenerated numer
 columns:
     text: name of the text column
     object: name of the object column
+    store: limit json data fields to this list of columns
 ```
 
 Sets the `text` and `object` column names. Defaults to `text` and `object` if not provided.
 
+`store` sets a list of columns to store in the JSON data field. When this isn't provided, all columns are stored (default). When `store` is set to `None`, no JSON columns are stored. This is useful if a field is only needed at indexing time but not at search time.
+
 ## format
 ```yaml
 format: json|pickle
diff --git a/docs/embeddings/indexing.md b/docs/embeddings/indexing.md
@@ -23,6 +23,8 @@ As mentioned above, computed vectors are stored in an ANN. There are various ind
 
 Embeddings indexes can optionally [store content](../configuration/database#content). When this is enabled, the input content is saved in a database alongside the computed vectors. This enables filtering on additional fields and content retrieval.
 
+The columns used for text, object and JSON data storage are set via [column configuration](../configuration/general#columns).
+
 ## Index vs Upsert
 
 Data is loaded into an index with either an [index](../methods#txtai.embeddings.base.Embeddings.index) or [upsert](../methods#txtai.embeddings.base.Embeddings.upsert) call.
diff --git a/src/python/txtai/database/duckdb.py b/src/python/txtai/database/duckdb.py
@@ -81,15 +81,7 @@ def rows(self):
             rows = self.cursor.fetchmany(batch)
 
     def addfunctions(self):
-        if self.connection and self.functions:
-            for name, _, fn, deterministic in self.functions:
-                # Get function type hints
-                hints = get_type_hints(fn)
-
-                # Create database functions
-                self.connection.create_function(
-                    name, fn, return_type=hints.get("return", str), side_effects=not deterministic if deterministic is not None else False
-                )
+        self.loadfunctions(self.connection)
 
     def copy(self, path):
         # Delete existing file, if necessary
@@ -116,8 +108,14 @@ def copy(self, path):
             for table in tables:
                 connection.execute(f"COPY {table} FROM '{directory}/{table}.parquet' (FORMAT parquet)")
 
-            # Create indexes and sync data to database file
-            connection.execute(Statement.CREATE_SECTIONS_INDEX)
+            # Copy functions
+            self.loadfunctions(connection)
+
+            # Copy indexes
+            for (sql,) in self.connection.execute("SELECT sql FROM duckdb_indexes()").fetchall():
+                connection.execute(sql)
+
+            # Sync data to database file
             connection.execute("CHECKPOINT")
 
         # Start transaction
@@ -156,3 +154,24 @@ def formatargs(self, args):
             args = (query, [value for _, value in sorted(params, key=lambda x: x[0])])
 
         return args
+
+    def loadfunctions(self, connection):
+        """
+        Load database functions.
+
+        Args:
+            connection: connection to create functions
+        """
+
+        if self.functions and connection:
+            for name, _, fn, deterministic in self.functions:
+                # Create function if it doesn't already exist
+                result = connection.execute("SELECT 1 FROM duckdb_functions() WHERE function_name = ?", [name]).fetchone()
+                if not result:
+                    # Get function type hints
+                    hints = get_type_hints(fn)
+
+                    # Create database functions
+                    connection.create_function(
+                        name, fn, return_type=hints.get("return", str), side_effects=not deterministic if deterministic is not None else False
+                    )
diff --git a/src/python/txtai/database/rdbms.py b/src/python/txtai/database/rdbms.py
@@ -145,7 +145,7 @@ def resolve(self, name, alias=None):
 
         # Resolve expression
         if self.expressions and name in self.expressions:
-            return self.expressions[name]
+            return self.expressions[name]["expression"]
 
         # Name is already resolved, skip
         if name.startswith(self.jsonprefix()) or any(f"s.{s}" == name for s in sections):
@@ -251,6 +251,9 @@ def initialize(self):
             # Create initial table schema
             self.createtables()
 
+            # Create indexes
+            self.createindexes()
+
     def session(self, path=None, connection=None):
         """
         Starts a new database session.
@@ -281,6 +284,23 @@ def createtables(self):
         self.cursor.execute(Statement.CREATE_SECTIONS % "sections")
         self.cursor.execute(Statement.CREATE_SECTIONS_INDEX)
 
+    def createindexes(self):
+        """
+        Creates expression indexes
+        """
+
+        if self.expressions:
+            for key, values in self.expressions.items():
+                # Create index for expression, if enabled
+                if values["index"]:
+                    # Get parameters
+                    name = f"expression_{key}".lower()
+                    expression = values["expression"]
+                    table = "documents" if expression.startswith(self.jsonprefix()) else "sections"
+
+                    # Execute statement
+                    self.cursor.execute(Statement.CREATE_EXPRESSION_INDEX % (name, table, expression))
+
     def finalize(self):
         """
         Post processing logic run after inserting a batch of documents. Default method is no-op.
@@ -306,9 +326,13 @@ def loaddocument(self, uid, document, tags, entry):
         # Get and remove object field from document
         obj = document.pop(self.object) if self.object in document else None
 
-        # Insert document as JSON
         if document:
-            self.insertdocument(uid, json.dumps(document, allow_nan=False), tags, entry)
+            # Apply data filters, if necessary
+            data = {key: value for key, value in document.items() if key in self.store} if self.store is not None else document
+
+            # Insert document as JSON
+            if data:
+                self.insertdocument(uid, json.dumps(data, allow_nan=False), tags, entry)
 
         # If text and object are both available, load object as it won't otherwise be used
         if self.text in document and obj:
diff --git a/src/python/txtai/database/schema/statement.py b/src/python/txtai/database/schema/statement.py
@@ -96,3 +96,6 @@ class Statement:
         + "LEFT JOIN scores sc ON s.indexid = sc.indexid"
     )
     IDS_CLAUSE = "s.indexid in (SELECT indexid from batch WHERE batch=%s)"
+
+    # Expression indexes
+    CREATE_EXPRESSION_INDEX = "CREATE INDEX IF NOT EXISTS %s ON %s(%s)"
diff --git a/test/python/testdatabase/testduckdb.py b/test/python/testdatabase/testduckdb.py
@@ -63,17 +63,17 @@ def testFunction(self):
             {
                 "path": "sentence-transformers/nli-mpnet-base-v2",
                 "content": self.backend,
-                "functions": [{"name": "length", "function": "testdatabase.testduckdb.length"}],
+                "functions": [{"name": "textlength", "function": "testdatabase.testduckdb.length"}],
             }
         )
 
         # Create an index for the list of text
         embeddings.index([(uid, text, None) for uid, text in enumerate(self.data)])
 
         # Search for best match
-        result = embeddings.search("select length(text) length from txtai where id = 0", 1)[0]
+        result = embeddings.search("select textlength(text) length from txtai where id = 0", 1)[0]
 
-        self.assertEqual(result["length"], 39)
+        self.assertEqual(int(result["length"]), 39)
 
 
 def length(text):
diff --git a/test/python/testdatabase/testrdbms.py b/test/python/testdatabase/testrdbms.py
@@ -247,6 +247,22 @@ def testExplainEmpty(self):
 
             self.assertEqual(self.embeddings.explain("select * from txtai limit 1")[0]["id"], "0")
 
+        def testExpressions(self):
+            """
+            Test expressions
+            """
+
+            # Test indexed expressions
+            embeddings = Embeddings(
+                path="sentence-transformers/nli-mpnet-base-v2",
+                content=self.backend,
+                expressions=[{"name": "textlength", "expression": "length(text)", "index": True}],
+            )
+            embeddings.index(self.data)
+
+            result = embeddings.search("SELECT textlength FROM txtai WHERE id = 0", 1)[0]
+            self.assertEqual(result["textlength"], len(self.data[0]))
+
         def testGenerator(self):
             """
             Test index with a generator
diff --git a/test/python/testdatabase/testsqlite.py b/test/python/testdatabase/testsqlite.py
@@ -52,17 +52,17 @@ def testFunction(self):
             {
                 "path": "sentence-transformers/nli-mpnet-base-v2",
                 "content": self.backend,
-                "functions": [{"name": "length", "function": "testdatabase.testsqlite.length"}],
+                "functions": [{"name": "textlength", "function": "testdatabase.testsqlite.length"}],
             }
         )
 
         # Create an index for the list of text
         embeddings.index([(uid, text, None) for uid, text in enumerate(self.data)])
 
         # Search for best match
-        result = embeddings.search("select length(text) length from txtai where id = 0", 1)[0]
+        result = embeddings.search("select textlength(text) length from txtai where id = 0", 1)[0]
 
-        self.assertEqual(result["length"], 39)
+        self.assertEqual(int(result["length"]), 39)
 
 
 def length(text):

Original file line number	Diff line number	Diff line change
`@@ -96,3 +96,6 @@ class Statement:`
`96`	`96`	`+ "LEFT JOIN scores sc ON s.indexid = sc.indexid"`
`97`	`97`	`)`
`98`	`98`	`IDS_CLAUSE = "s.indexid in (SELECT indexid from batch WHERE batch=%s)"`
	`99`	`+`
	`100`	`+ # Expression indexes`
	`101`	`+ CREATE_EXPRESSION_INDEX = "CREATE INDEX IF NOT EXISTS %s ON %s(%s)"`
Original file line number	Diff line number	Diff line change
`@@ -63,17 +63,17 @@ def testFunction(self):`
`63`	`63`	`{`
`64`	`64`	`"path": "sentence-transformers/nli-mpnet-base-v2",`
`65`	`65`	`"content": self.backend,`
`66`		`- "functions": [{"name": "length", "function": "testdatabase.testduckdb.length"}],`
	`66`	`+ "functions": [{"name": "textlength", "function": "testdatabase.testduckdb.length"}],`
`67`	`67`	`}`
`68`	`68`	`)`
`69`	`69`
`70`	`70`	`# Create an index for the list of text`
`71`	`71`	`embeddings.index([(uid, text, None) for uid, text in enumerate(self.data)])`
`72`	`72`
`73`	`73`	`# Search for best match`
`74`		`- result = embeddings.search("select length(text) length from txtai where id = 0", 1)[0]`
	`74`	`+ result = embeddings.search("select textlength(text) length from txtai where id = 0", 1)[0]`
`75`	`75`
`76`		`- self.assertEqual(result["length"], 39)`
	`76`	`+ self.assertEqual(int(result["length"]), 39)`
`77`	`77`
`78`	`78`
`79`	`79`	`def length(text):`
Original file line number	Diff line number	Diff line change
`@@ -52,17 +52,17 @@ def testFunction(self):`
`52`	`52`	`{`
`53`	`53`	`"path": "sentence-transformers/nli-mpnet-base-v2",`
`54`	`54`	`"content": self.backend,`
`55`		`- "functions": [{"name": "length", "function": "testdatabase.testsqlite.length"}],`
	`55`	`+ "functions": [{"name": "textlength", "function": "testdatabase.testsqlite.length"}],`
`56`	`56`	`}`
`57`	`57`	`)`
`58`	`58`
`59`	`59`	`# Create an index for the list of text`
`60`	`60`	`embeddings.index([(uid, text, None) for uid, text in enumerate(self.data)])`
`61`	`61`
`62`	`62`	`# Search for best match`
`63`		`- result = embeddings.search("select length(text) length from txtai where id = 0", 1)[0]`
	`63`	`+ result = embeddings.search("select textlength(text) length from txtai where id = 0", 1)[0]`
`64`	`64`
`65`		`- self.assertEqual(result["length"], 39)`
	`65`	`+ self.assertEqual(int(result["length"]), 39)`
`66`	`66`
`67`	`67`
`68`	`68`	`def length(text):`