Skip to content

Commit f8dde82

Browse files
authored
Localdocs fixes (#3083)
Signed-off-by: Adam Treat <[email protected]>
1 parent 1789a3c commit f8dde82

File tree

5 files changed

+70
-13
lines changed

5 files changed

+70
-13
lines changed

gpt4all-chat/CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file.
44

55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
66

7+
## [Unreleased]
8+
9+
### Fixed
10+
- Limit bm25 retrieval to only specified collections ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
11+
- Fix bug where documents were removed because of an incorrectly case-sensitive file suffix check ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
12+
- Fix bug with hybrid localdocs search where database would get out of sync ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
13+
- Fix GUI bug where the localdocs embedding device appears blank ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
14+
715
## [3.4.1] - 2024-10-11
816

917
### Fixed
@@ -155,6 +163,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
155163
- Fix several Vulkan resource management issues ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
156164
- Fix crash/hang when some models stop generating, by showing special tokens ([#2701](https://github.com/nomic-ai/gpt4all/pull/2701))
157165

166+
[Unreleased]: https://github.com/nomic-ai/gpt4all/compare/v3.4.1...HEAD
158167
[3.4.1]: https://github.com/nomic-ai/gpt4all/compare/v3.4.0...v3.4.1
159168
[3.4.0]: https://github.com/nomic-ai/gpt4all/compare/v3.3.0...v3.4.0
160169
[3.3.1]: https://github.com/nomic-ai/gpt4all/compare/v3.3.0...v3.3.1

gpt4all-chat/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ include(../common/common.cmake)
44

55
set(APP_VERSION_MAJOR 3)
66
set(APP_VERSION_MINOR 4)
7-
set(APP_VERSION_PATCH 1)
7+
set(APP_VERSION_PATCH 2)
88
set(APP_VERSION_BASE "${APP_VERSION_MAJOR}.${APP_VERSION_MINOR}.${APP_VERSION_PATCH}")
9-
set(APP_VERSION "${APP_VERSION_BASE}")
9+
set(APP_VERSION "${APP_VERSION_BASE}-dev0")
1010

1111
project(gpt4all VERSION ${APP_VERSION_BASE} LANGUAGES CXX C)
1212

gpt4all-chat/qml/LocalDocsSettings.qml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ MySettingsTab {
176176
ListElement { text: qsTr("Application default") }
177177
Component.onCompleted: {
178178
MySettings.embeddingsDeviceList.forEach(d => append({"text": d}));
179+
deviceBox.updateModel();
179180
}
180181
}
181182
Accessible.name: deviceLabel.text

gpt4all-chat/src/database.cpp

Lines changed: 46 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -233,12 +233,17 @@ static const QString SELECT_COUNT_CHUNKS_SQL = uR"(
233233
)"_s;
234234

235235
static const QString SELECT_CHUNKS_FTS_SQL = uR"(
236-
select id, bm25(chunks_fts) as score
237-
from chunks_fts
236+
select fts.id, bm25(chunks_fts) as score
237+
from chunks_fts fts
238+
join documents d on fts.document_id = d.id
239+
join collection_items ci on d.folder_id = ci.folder_id
240+
join collections co on ci.collection_id = co.id
238241
where chunks_fts match ?
239-
order by score limit %1;
242+
and co.name in ('%1')
243+
order by score limit %2;
240244
)"_s;
241245

246+
242247
#define NAMED_PAIR(name, typea, a, typeb, b) \
243248
struct name { typea a; typeb b; }; \
244249
static bool operator==(const name &x, const name &y) { return x.a == y.a && x.b == y.b; } \
@@ -349,6 +354,14 @@ static const QString UPDATE_LAST_UPDATE_TIME_SQL = uR"(
349354
update collections set last_update_time = ? where id = ?;
350355
)"_s;
351356

357+
// FTS5 special command that runs an integrity check on the chunks_fts table. Executing it
// reports an error when the check fails.
// See: https://www.sqlite.org/fts5.html#the_integrity_check_command
static const QString FTS_INTEGRITY_SQL = uR"(
358+
insert into chunks_fts(chunks_fts, rank) values('integrity-check', 1);
359+
)"_s;
360+
361+
// FTS5 special command 'rebuild' issued against the chunks_fts table.
// NOTE(review): per the FTS5 documentation this regenerates the index from the table's
// content -- confirm against https://www.sqlite.org/fts5.html#the_rebuild_command
static const QString FTS_REBUILD_SQL = uR"(
362+
insert into chunks_fts(chunks_fts) values('rebuild');
363+
)"_s;
364+
352365
static bool addCollection(QSqlQuery &q, const QString &collection_name, const QDateTime &start_update,
353366
const QDateTime &last_update, const QString &embedding_model, CollectionItem &item)
354367
{
@@ -1815,6 +1828,7 @@ void Database::start()
18151828
m_databaseValid = false;
18161829
} else {
18171830
cleanDB();
1831+
ftsIntegrityCheck();
18181832
QSqlQuery q(m_db);
18191833
if (!refreshDocumentIdCache(q)) {
18201834
m_databaseValid = false;
@@ -2328,7 +2342,7 @@ QList<int> Database::searchBM25(const QString &query, const QList<QString> &coll
23282342
QList<BM25Query> bm25Queries = queriesForFTS5(query);
23292343

23302344
QSqlQuery sqlQuery(m_db);
2331-
sqlQuery.prepare(SELECT_CHUNKS_FTS_SQL.arg(k));
2345+
sqlQuery.prepare(SELECT_CHUNKS_FTS_SQL.arg(collections.join("', '"), QString::number(k)));
23322346

23332347
QList<SearchResult> results;
23342348
for (auto &bm25Query : std::as_const(bm25Queries)) {
@@ -2346,11 +2360,13 @@ QList<int> Database::searchBM25(const QString &query, const QList<QString> &coll
23462360
}
23472361
}
23482362

2349-
do {
2350-
const int chunkId = sqlQuery.value(0).toInt();
2351-
const float score = sqlQuery.value(1).toFloat();
2352-
results.append({chunkId, score});
2353-
} while (sqlQuery.next());
2363+
if (sqlQuery.at() != QSql::AfterLastRow) {
2364+
do {
2365+
const int chunkId = sqlQuery.value(0).toInt();
2366+
const float score = sqlQuery.value(1).toFloat();
2367+
results.append({chunkId, score});
2368+
} while (sqlQuery.next());
2369+
}
23542370

23552371
k = qMin(k, results.size());
23562372
std::partial_sort(
@@ -2524,6 +2540,26 @@ void Database::retrieveFromDB(const QList<QString> &collections, const QString &
25242540
results->append(tempResults.value(id));
25252541
}
25262542

2543+
// Checks the full-text-search index and rebuilds it when SQLite reports it as corrupt.
//
// Executes FTS_INTEGRITY_SQL. If that fails with native error code "267"
// (SQLITE_CORRUPT_VTAB), FTS_REBUILD_SQL is executed to repair the index. Any other
// failure is logged and no rebuild is attempted.
//
// Returns true when the integrity check passed or the rebuild succeeded; false when the
// check failed for another reason or the rebuild itself failed.
bool Database::ftsIntegrityCheck()
2544+
{
2545+
QSqlQuery q(m_db);
2546+
2547+
// Executing this statement returns an error if the integrity check fails
2548+
// See: https://www.sqlite.org/fts5.html#the_integrity_check_command
2549+
const bool success = q.exec(FTS_INTEGRITY_SQL);
2550+
// A failure other than SQLITE_CORRUPT_VTAB is not something a rebuild is attempted for.
// NOTE(review): the message below says "prepare", but the statement is run via exec() here.
if (!success && q.lastError().nativeErrorCode() != "267" /*SQLITE_CORRUPT_VTAB from sqlite header*/) {
2551+
qWarning() << "ERROR: Cannot prepare sql for fts integrity check" << q.lastError();
2552+
return false;
2553+
}
2554+
2555+
// Reached with !success only when the error code was "267": try to rebuild the index.
if (!success && !q.exec(FTS_REBUILD_SQL)) {
2556+
qWarning() << "ERROR: Cannot exec sql for fts rebuild" << q.lastError();
2557+
return false;
2558+
}
2559+
2560+
return true;
2561+
}
2562+
25272563
// FIXME This is very slow and non-interruptible and when we close the application and we're
25282564
// cleaning a large table this can cause the app to take forever to shut down. This would ideally be
25292565
// interruptible and we'd continue 'cleaning' when we restart
@@ -2574,7 +2610,7 @@ bool Database::cleanDB()
25742610
int document_id = q.value(0).toInt();
25752611
QString document_path = q.value(1).toString();
25762612
QFileInfo info(document_path);
2577-
if (info.exists() && info.isReadable() && m_scannedFileExtensions.contains(info.suffix()))
2613+
if (info.exists() && info.isReadable() && m_scannedFileExtensions.contains(info.suffix(), Qt::CaseInsensitive))
25782614
continue;
25792615

25802616
#if defined(DEBUG)

gpt4all-chat/src/database.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,20 @@ class QTimer;
4141

4242
/* Version 0: GPT4All v2.4.3, full-text search
4343
* Version 1: GPT4All v2.5.3, embeddings in hnswlib
44-
* Version 2: GPT4All v3.0.0, embeddings in sqlite */
44+
* Version 2: GPT4All v3.0.0, embeddings in sqlite
45+
* Version 3: GPT4All v3.4.0, hybrid search
46+
*/
4547

4648
// minimum supported version
4749
static const int LOCALDOCS_MIN_VER = 1;
50+
51+
// FIXME: (Adam) The next time we bump the version we should add triggers to manage the fts external
52+
// content table as recommended in the official documentation to keep the fts index in sync
53+
// See: https://www.sqlite.org/fts5.html#external_content_tables
54+
55+
// FIXME: (Adam) The fts virtual table should include the chunk_id explicitly instead of relying upon
56+
// the ids of the two tables staying in sync
57+
4858
// current version
4959
static const int LOCALDOCS_VERSION = 3;
5060

@@ -252,6 +262,7 @@ private Q_SLOTS:
252262
void enqueueDocumentInternal(DocumentInfo &&info, bool prepend = false);
253263
void enqueueDocuments(int folder_id, std::list<DocumentInfo> &&infos);
254264
void scanQueue();
265+
bool ftsIntegrityCheck();
255266
bool cleanDB();
256267
void addFolderToWatch(const QString &path);
257268
void removeFolderFromWatch(const QString &path);

0 commit comments

Comments
 (0)