Skip to content

Commit d66b138

Browse files
authored
Merge pull request #1651 from tursodatabase/vector-search-improve-random-row
vector search: speed up random row selection query
2 parents 2c40df2 + f80444a commit d66b138

File tree

4 files changed

+79
-7
lines changed

4 files changed

+79
-7
lines changed

libsql-ffi/bundled/SQLite3MultipleCiphers/src/sqlite3.c

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212002,6 +212002,7 @@ int diskAnnCreateIndex(
212002212002
int type, dims;
212003212003
u64 maxNeighborsParam, blockSizeBytes;
212004212004
char *zSql;
212005+
const char *zRowidColumnName;
212005212006
char columnSqlDefs[VECTOR_INDEX_SQL_RENDER_LIMIT]; // definition of columns (e.g. index_key INTEGER BINARY, index_key1 TEXT, ...)
212006212007
char columnSqlNames[VECTOR_INDEX_SQL_RENDER_LIMIT]; // just column names (e.g. index_key, index_key1, index_key2, ...)
212007212008
if( vectorIdxKeyDefsRender(pKey, "index_key", columnSqlDefs, sizeof(columnSqlDefs)) != 0 ){
@@ -212069,6 +212070,7 @@ int diskAnnCreateIndex(
212069212070
columnSqlDefs,
212070212071
columnSqlNames
212071212072
);
212073+
zRowidColumnName = "index_key";
212072212074
}else{
212073212075
zSql = sqlite3MPrintf(
212074212076
db,
@@ -212078,9 +212080,31 @@ int diskAnnCreateIndex(
212078212080
columnSqlDefs,
212079212081
columnSqlNames
212080212082
);
212083+
zRowidColumnName = "rowid";
212081212084
}
212082212085
rc = sqlite3_exec(db, zSql, 0, 0, 0);
212083212086
sqlite3DbFree(db, zSql);
212087+
if( rc != SQLITE_OK ){
212088+
return rc;
212089+
}
212090+
/*
212091+
* vector blobs are usually pretty huge (more than a page size, for example, node block for 1024d f32 embeddings with 1bit compression will occupy ~20KB)
212092+
* in this case, the main table B-Tree takes on a redundant shape where each leaf node has only 1 cell
212093+
*
212094+
* as we have a query which selects a random row using the OFFSET/LIMIT trick - we would need to read all these leaf node pages just to skip them
212095+
* so, in order to remove this overhead for random row selection - we create an index on just the single column used
212096+
* in this case the B-Tree leaves will be full of rowids and the overhead for page reads will be very small
212097+
*/
212098+
zSql = sqlite3MPrintf(
212099+
db,
212100+
"CREATE INDEX IF NOT EXISTS \"%w\".%s_shadow_idx ON %s_shadow (%s)",
212101+
zDbSName,
212102+
zIdxName,
212103+
zIdxName,
212104+
zRowidColumnName
212105+
);
212106+
rc = sqlite3_exec(db, zSql, 0, 0, 0);
212107+
sqlite3DbFree(db, zSql);
212084212108
return rc;
212085212109
}
212086212110

@@ -212110,8 +212134,8 @@ static int diskAnnSelectRandomShadowRow(const DiskAnnIndex *pIndex, u64 *pRowid)
212110212134

212111212135
zSql = sqlite3MPrintf(
212112212136
pIndex->db,
212113-
"SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM %s), 1)",
212114-
pIndex->zDbSName, pIndex->zShadow, pIndex->zShadow
212137+
"SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM \"%w\".%s), 1)",
212138+
pIndex->zDbSName, pIndex->zShadow, pIndex->zDbSName, pIndex->zShadow
212115212139
);
212116212140
if( zSql == NULL ){
212117212141
rc = SQLITE_NOMEM_BKPT;

libsql-ffi/bundled/src/sqlite3.c

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212002,6 +212002,7 @@ int diskAnnCreateIndex(
212002212002
int type, dims;
212003212003
u64 maxNeighborsParam, blockSizeBytes;
212004212004
char *zSql;
212005+
const char *zRowidColumnName;
212005212006
char columnSqlDefs[VECTOR_INDEX_SQL_RENDER_LIMIT]; // definition of columns (e.g. index_key INTEGER BINARY, index_key1 TEXT, ...)
212006212007
char columnSqlNames[VECTOR_INDEX_SQL_RENDER_LIMIT]; // just column names (e.g. index_key, index_key1, index_key2, ...)
212007212008
if( vectorIdxKeyDefsRender(pKey, "index_key", columnSqlDefs, sizeof(columnSqlDefs)) != 0 ){
@@ -212069,6 +212070,7 @@ int diskAnnCreateIndex(
212069212070
columnSqlDefs,
212070212071
columnSqlNames
212071212072
);
212073+
zRowidColumnName = "index_key";
212072212074
}else{
212073212075
zSql = sqlite3MPrintf(
212074212076
db,
@@ -212078,9 +212080,31 @@ int diskAnnCreateIndex(
212078212080
columnSqlDefs,
212079212081
columnSqlNames
212080212082
);
212083+
zRowidColumnName = "rowid";
212081212084
}
212082212085
rc = sqlite3_exec(db, zSql, 0, 0, 0);
212083212086
sqlite3DbFree(db, zSql);
212087+
if( rc != SQLITE_OK ){
212088+
return rc;
212089+
}
212090+
/*
212091+
* vector blobs are usually pretty huge (more than a page size, for example, node block for 1024d f32 embeddings with 1bit compression will occupy ~20KB)
212092+
* in this case, the main table B-Tree takes on a redundant shape where each leaf node has only 1 cell
212093+
*
212094+
* as we have a query which selects a random row using the OFFSET/LIMIT trick - we would need to read all these leaf node pages just to skip them
212095+
* so, in order to remove this overhead for random row selection - we create an index on just the single column used
212096+
* in this case the B-Tree leaves will be full of rowids and the overhead for page reads will be very small
212097+
*/
212098+
zSql = sqlite3MPrintf(
212099+
db,
212100+
"CREATE INDEX IF NOT EXISTS \"%w\".%s_shadow_idx ON %s_shadow (%s)",
212101+
zDbSName,
212102+
zIdxName,
212103+
zIdxName,
212104+
zRowidColumnName
212105+
);
212106+
rc = sqlite3_exec(db, zSql, 0, 0, 0);
212107+
sqlite3DbFree(db, zSql);
212084212108
return rc;
212085212109
}
212086212110

@@ -212110,8 +212134,8 @@ static int diskAnnSelectRandomShadowRow(const DiskAnnIndex *pIndex, u64 *pRowid)
212110212134

212111212135
zSql = sqlite3MPrintf(
212112212136
pIndex->db,
212113-
"SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM %s), 1)",
212114-
pIndex->zDbSName, pIndex->zShadow, pIndex->zShadow
212137+
"SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM \"%w\".%s), 1)",
212138+
pIndex->zDbSName, pIndex->zShadow, pIndex->zDbSName, pIndex->zShadow
212115212139
);
212116212140
if( zSql == NULL ){
212117212141
rc = SQLITE_NOMEM_BKPT;

libsql-sqlite3/src/vectordiskann.c

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,7 @@ int diskAnnCreateIndex(
442442
int type, dims;
443443
u64 maxNeighborsParam, blockSizeBytes;
444444
char *zSql;
445+
const char *zRowidColumnName;
445446
char columnSqlDefs[VECTOR_INDEX_SQL_RENDER_LIMIT]; // definition of columns (e.g. index_key INTEGER BINARY, index_key1 TEXT, ...)
446447
char columnSqlNames[VECTOR_INDEX_SQL_RENDER_LIMIT]; // just column names (e.g. index_key, index_key1, index_key2, ...)
447448
if( vectorIdxKeyDefsRender(pKey, "index_key", columnSqlDefs, sizeof(columnSqlDefs)) != 0 ){
@@ -509,6 +510,7 @@ int diskAnnCreateIndex(
509510
columnSqlDefs,
510511
columnSqlNames
511512
);
513+
zRowidColumnName = "index_key";
512514
}else{
513515
zSql = sqlite3MPrintf(
514516
db,
@@ -518,9 +520,31 @@ int diskAnnCreateIndex(
518520
columnSqlDefs,
519521
columnSqlNames
520522
);
523+
zRowidColumnName = "rowid";
521524
}
522525
rc = sqlite3_exec(db, zSql, 0, 0, 0);
523526
sqlite3DbFree(db, zSql);
527+
if( rc != SQLITE_OK ){
528+
return rc;
529+
}
530+
/*
531+
* vector blobs are usually pretty huge (more than a page size, for example, node block for 1024d f32 embeddings with 1bit compression will occupy ~20KB)
532+
* in this case, the main table B-Tree takes on a redundant shape where each leaf node has only 1 cell
533+
*
534+
* as we have a query which selects a random row using the OFFSET/LIMIT trick - we would need to read all these leaf node pages just to skip them
535+
* so, in order to remove this overhead for random row selection - we create an index on just the single column used
536+
* in this case the B-Tree leaves will be full of rowids and the overhead for page reads will be very small
537+
*/
538+
zSql = sqlite3MPrintf(
539+
db,
540+
"CREATE INDEX IF NOT EXISTS \"%w\".%s_shadow_idx ON %s_shadow (%s)",
541+
zDbSName,
542+
zIdxName,
543+
zIdxName,
544+
zRowidColumnName
545+
);
546+
rc = sqlite3_exec(db, zSql, 0, 0, 0);
547+
sqlite3DbFree(db, zSql);
524548
return rc;
525549
}
526550

@@ -550,8 +574,8 @@ static int diskAnnSelectRandomShadowRow(const DiskAnnIndex *pIndex, u64 *pRowid)
550574

551575
zSql = sqlite3MPrintf(
552576
pIndex->db,
553-
"SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM %s), 1)",
554-
pIndex->zDbSName, pIndex->zShadow, pIndex->zShadow
577+
"SELECT rowid FROM \"%w\".%s LIMIT 1 OFFSET ABS(RANDOM()) %% MAX((SELECT COUNT(*) FROM \"%w\".%s), 1)",
578+
pIndex->zDbSName, pIndex->zShadow, pIndex->zDbSName, pIndex->zShadow
555579
);
556580
if( zSql == NULL ){
557581
rc = SQLITE_NOMEM_BKPT;

libsql-sqlite3/test/libsql_vector_index.test

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ do_execsql_test vector-sql {
140140
INSERT INTO t_sql VALUES(vector('[1,2,3]')), (vector('[2,3,4]'));
141141
SELECT sql FROM sqlite_master WHERE name LIKE '%t_sql%';
142142
SELECT name FROM libsql_vector_meta_shadow WHERE name = 't_sql_idx';
143-
} {{CREATE TABLE t_sql( v FLOAT32(3))} {CREATE TABLE t_sql_idx_shadow (index_key INTEGER , data BLOB, PRIMARY KEY (index_key))} {CREATE INDEX t_sql_idx ON t_sql( libsql_vector_idx(v) )} {t_sql_idx}}
143+
} {{CREATE TABLE t_sql( v FLOAT32(3))} {CREATE TABLE t_sql_idx_shadow (index_key INTEGER , data BLOB, PRIMARY KEY (index_key))} {CREATE INDEX t_sql_idx_shadow_idx ON t_sql_idx_shadow (index_key)} {CREATE INDEX t_sql_idx ON t_sql( libsql_vector_idx(v) )} {t_sql_idx}}
144144

145145
do_execsql_test vector-drop-index {
146146
CREATE TABLE t_index_drop( v FLOAT32(3));

0 commit comments

Comments
 (0)