Skip to content

Commit ebda914

Browse files
authored
localdocs: fix regressions caused by docx change (#3079)
Signed-off-by: Jared Van Bortel <[email protected]>
1 parent 9fd48ee commit ebda914

File tree

3 files changed

+155
-83
lines changed

3 files changed

+155
-83
lines changed

gpt4all-chat/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
88

99
### Fixed
1010
- Fix models.json cache location ([#3052](https://github.com/nomic-ai/gpt4all/pull/3052))
11+
- Fix LocalDocs regressions caused by docx change ([#3079](https://github.com/nomic-ai/gpt4all/pull/3079))
1112

1213
## [3.4.0] - 2024-10-08
1314

gpt4all-chat/src/database.cpp

Lines changed: 145 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include <stdexcept>
3333

3434
using namespace Qt::Literals::StringLiterals;
35+
namespace ranges = std::ranges;
3536
namespace us = unum::usearch;
3637

3738
//#define DEBUG
@@ -175,6 +176,14 @@ static const QString INSERT_CHUNK_FTS_SQL = uR"(
175176
values(?, ?, ?, ?, ?, ?, ?);
176177
)"_s;
177178

179+
static const QString SELECT_CHUNKED_DOCUMENTS_SQL[] = {
180+
uR"(
181+
select distinct document_id from chunks;
182+
)"_s, uR"(
183+
select distinct document_id from chunks_fts;
184+
)"_s,
185+
};
186+
178187
static const QString DELETE_CHUNKS_SQL[] = {
179188
uR"(
180189
delete from embeddings
@@ -230,53 +239,6 @@ static const QString SELECT_CHUNKS_FTS_SQL = uR"(
230239
order by score limit %1;
231240
)"_s;
232241

233-
static bool addChunk(QSqlQuery &q, int document_id, const QString &chunk_text, const QString &file,
234-
const QString &title, const QString &author, const QString &subject, const QString &keywords,
235-
int page, int from, int to, int words, int *chunk_id)
236-
{
237-
if (!q.prepare(INSERT_CHUNK_SQL))
238-
return false;
239-
q.addBindValue(document_id);
240-
q.addBindValue(chunk_text);
241-
q.addBindValue(file);
242-
q.addBindValue(title);
243-
q.addBindValue(author);
244-
q.addBindValue(subject);
245-
q.addBindValue(keywords);
246-
q.addBindValue(page);
247-
q.addBindValue(from);
248-
q.addBindValue(to);
249-
q.addBindValue(words);
250-
if (!q.exec() || !q.next())
251-
return false;
252-
*chunk_id = q.value(0).toInt();
253-
254-
if (!q.prepare(INSERT_CHUNK_FTS_SQL))
255-
return false;
256-
q.addBindValue(document_id);
257-
q.addBindValue(chunk_text);
258-
q.addBindValue(file);
259-
q.addBindValue(title);
260-
q.addBindValue(author);
261-
q.addBindValue(subject);
262-
q.addBindValue(keywords);
263-
if (!q.exec())
264-
return false;
265-
return true;
266-
}
267-
268-
static bool removeChunksByDocumentId(QSqlQuery &q, int document_id)
269-
{
270-
for (const auto &cmd: DELETE_CHUNKS_SQL) {
271-
if (!q.prepare(cmd))
272-
return false;
273-
q.addBindValue(document_id);
274-
if (!q.exec())
275-
return false;
276-
}
277-
return true;
278-
}
279-
280242
#define NAMED_PAIR(name, typea, a, typeb, b) \
281243
struct name { typea a; typeb b; }; \
282244
static bool operator==(const name &x, const name &y) { return x.a == y.a && x.b == y.b; } \
@@ -634,18 +596,6 @@ static bool selectAllFolderPaths(QSqlQuery &q, QList<QString> *folder_paths)
634596
return true;
635597
}
636598

637-
static bool sqlRemoveDocsByFolderPath(QSqlQuery &q, const QString &path)
638-
{
639-
for (const auto &cmd: FOLDER_REMOVE_ALL_DOCS_SQL) {
640-
if (!q.prepare(cmd))
641-
return false;
642-
q.addBindValue(path);
643-
if (!q.exec())
644-
return false;
645-
}
646-
return true;
647-
}
648-
649599
static const QString INSERT_COLLECTION_ITEM_SQL = uR"(
650600
insert into collection_items(collection_id, folder_id)
651601
values(?, ?)
@@ -889,6 +839,79 @@ void Database::rollback()
889839
Q_ASSERT(ok);
890840
}
891841

842+
bool Database::refreshDocumentIdCache(QSqlQuery &q)
843+
{
844+
m_documentIdCache.clear();
845+
for (const auto &cmd: SELECT_CHUNKED_DOCUMENTS_SQL) {
846+
if (!q.exec(cmd))
847+
return false;
848+
while (q.next())
849+
m_documentIdCache << q.value(0).toInt();
850+
}
851+
return true;
852+
}
853+
854+
bool Database::addChunk(QSqlQuery &q, int document_id, const QString &chunk_text, const QString &file,
855+
const QString &title, const QString &author, const QString &subject, const QString &keywords,
856+
int page, int from, int to, int words, int *chunk_id)
857+
{
858+
if (!q.prepare(INSERT_CHUNK_SQL))
859+
return false;
860+
q.addBindValue(document_id);
861+
q.addBindValue(chunk_text);
862+
q.addBindValue(file);
863+
q.addBindValue(title);
864+
q.addBindValue(author);
865+
q.addBindValue(subject);
866+
q.addBindValue(keywords);
867+
q.addBindValue(page);
868+
q.addBindValue(from);
869+
q.addBindValue(to);
870+
q.addBindValue(words);
871+
if (!q.exec() || !q.next())
872+
return false;
873+
*chunk_id = q.value(0).toInt();
874+
875+
if (!q.prepare(INSERT_CHUNK_FTS_SQL))
876+
return false;
877+
q.addBindValue(document_id);
878+
q.addBindValue(chunk_text);
879+
q.addBindValue(file);
880+
q.addBindValue(title);
881+
q.addBindValue(author);
882+
q.addBindValue(subject);
883+
q.addBindValue(keywords);
884+
if (!q.exec())
885+
return false;
886+
m_documentIdCache << document_id;
887+
return true;
888+
}
889+
890+
bool Database::removeChunksByDocumentId(QSqlQuery &q, int document_id)
891+
{
892+
for (const auto &cmd: DELETE_CHUNKS_SQL) {
893+
if (!q.prepare(cmd))
894+
return false;
895+
q.addBindValue(document_id);
896+
if (!q.exec())
897+
return false;
898+
}
899+
m_documentIdCache.remove(document_id);
900+
return true;
901+
}
902+
903+
bool Database::sqlRemoveDocsByFolderPath(QSqlQuery &q, const QString &path)
904+
{
905+
for (const auto &cmd: FOLDER_REMOVE_ALL_DOCS_SQL) {
906+
if (!q.prepare(cmd))
907+
return false;
908+
q.addBindValue(path);
909+
if (!q.exec())
910+
return false;
911+
}
912+
return refreshDocumentIdCache(q);
913+
}
914+
892915
bool Database::hasContent()
893916
{
894917
return m_db.tables().contains("chunks", Qt::CaseInsensitive);
@@ -1246,9 +1269,13 @@ class TxtDocumentReader final : public DocumentReader {
12461269
protected:
12471270
std::optional<QString> advance() override
12481271
{
1272+
if (getError())
1273+
return std::nullopt;
12491274
while (!m_stream.atEnd()) {
12501275
QString word;
12511276
m_stream >> word;
1277+
if (getError())
1278+
return std::nullopt;
12521279
if (!word.isEmpty())
12531280
return word;
12541281
}
@@ -1257,9 +1284,11 @@ class TxtDocumentReader final : public DocumentReader {
12571284

12581285
std::optional<ChunkStreamer::Status> getError() const override
12591286
{
1260-
if (!m_file.error())
1261-
return std::nullopt;
1262-
return m_file.binarySeen() ? ChunkStreamer::Status::BINARY_SEEN : ChunkStreamer::Status::ERROR;
1287+
if (m_file.binarySeen())
1288+
return ChunkStreamer::Status::BINARY_SEEN;
1289+
if (m_file.error())
1290+
return ChunkStreamer::Status::ERROR;
1291+
return std::nullopt;
12631292
}
12641293

12651294
BinaryDetectingFile m_file;
@@ -1300,12 +1329,24 @@ void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const Q
13001329
m_page = 0;
13011330

13021331
// make sure the document doesn't already have any chunks
1303-
QSqlQuery q(m_database->m_db);
1304-
if (!removeChunksByDocumentId(q, documentId))
1305-
handleDocumentError("ERROR: Cannot remove chunks of document", documentId, doc.file.canonicalPath(), q.lastError());
1332+
if (m_database->m_documentIdCache.contains(documentId)) {
1333+
QSqlQuery q(m_database->m_db);
1334+
if (!m_database->removeChunksByDocumentId(q, documentId))
1335+
handleDocumentError("ERROR: Cannot remove chunks of document", documentId, doc.file.canonicalPath(), q.lastError());
1336+
}
13061337
}
13071338
}
13081339

1340+
std::optional<DocumentInfo::key_type> ChunkStreamer::currentDocKey() const
1341+
{
1342+
return m_docKey;
1343+
}
1344+
1345+
void ChunkStreamer::reset()
1346+
{
1347+
m_docKey.reset();
1348+
}
1349+
13091350
ChunkStreamer::Status ChunkStreamer::step()
13101351
{
13111352
// TODO: implement line_from/line_to
@@ -1318,8 +1359,10 @@ ChunkStreamer::Status ChunkStreamer::step()
13181359
Status retval;
13191360

13201361
for (;;) {
1321-
if (auto error = m_reader->getError())
1362+
if (auto error = m_reader->getError()) {
1363+
m_docKey.reset(); // done processing
13221364
return *error;
1365+
}
13231366
if (m_database->scanQueueInterrupted()) {
13241367
retval = Status::INTERRUPTED;
13251368
break;
@@ -1340,43 +1383,52 @@ ChunkStreamer::Status ChunkStreamer::step()
13401383
}
13411384
}
13421385

1343-
if (!word || m_chunk.length() >= maxChunkSize + 1) { // +1 for leading space
1386+
if (!word || m_chunk.length() >= maxChunkSize + 1) { // +1 for trailing space
13441387
if (!m_chunk.isEmpty()) {
13451388
int nThisChunkWords = 0;
13461389
auto chunk = m_chunk; // copy
13471390

13481391
// handle overlength chunks
13491392
if (m_chunk.length() > maxChunkSize + 1) {
13501393
// find the final space
1351-
qsizetype lastSpace = chunk.lastIndexOf(u' ', -2);
1394+
qsizetype chunkEnd = chunk.lastIndexOf(u' ', -2);
13521395

1353-
if (lastSpace < 0) {
1396+
qsizetype spaceSize;
1397+
if (chunkEnd >= 0) {
13541398
// slice off the last word
1399+
spaceSize = 1;
13551400
Q_ASSERT(m_nChunkWords >= 1);
1356-
lastSpace = maxChunkSize;
1401+
// one word left
13571402
nThisChunkWords = m_nChunkWords - 1;
13581403
m_nChunkWords = 1;
13591404
} else {
13601405
// slice the overlong word
1406+
spaceSize = 0;
1407+
chunkEnd = maxChunkSize;
1408+
// partial word left, don't count it
13611409
nThisChunkWords = m_nChunkWords;
13621410
m_nChunkWords = 0;
13631411
}
1364-
// save the extra part
1365-
m_chunk = chunk.sliced(lastSpace + 1);
1366-
// slice
1367-
chunk.truncate(lastSpace + 1);
1368-
Q_ASSERT(chunk.length() <= maxChunkSize + 1);
1412+
// save the second part, excluding space if any
1413+
m_chunk = chunk.sliced(chunkEnd + spaceSize);
1414+
// consume the first part
1415+
chunk.truncate(chunkEnd);
13691416
} else {
13701417
nThisChunkWords = m_nChunkWords;
13711418
m_nChunkWords = 0;
1419+
// there is no second part
1420+
m_chunk.clear();
1421+
// consume the whole chunk, excluding space
1422+
chunk.chop(1);
13721423
}
1424+
Q_ASSERT(chunk.length() <= maxChunkSize);
13731425

13741426
QSqlQuery q(m_database->m_db);
13751427
int chunkId = 0;
1376-
if (!addChunk(q,
1428+
if (!m_database->addChunk(q,
13771429
m_documentId,
1378-
chunk.chopped(1), // strip trailing space
1379-
m_reader->doc().file.canonicalFilePath(),
1430+
chunk,
1431+
m_reader->doc().file.fileName(), // basename
13801432
m_title,
13811433
m_author,
13821434
m_subject,
@@ -1399,12 +1451,11 @@ ChunkStreamer::Status ChunkStreamer::step()
13991451
toEmbed.chunk = chunk;
14001452
m_database->appendChunk(toEmbed);
14011453
++nChunks;
1402-
1403-
m_chunk.clear();
14041454
}
14051455

14061456
if (!word) {
14071457
retval = Status::DOC_COMPLETE;
1458+
m_docKey.reset(); // done processing
14081459
break;
14091460
}
14101461
}
@@ -1532,8 +1583,14 @@ DocumentInfo Database::dequeueDocument()
15321583

15331584
void Database::removeFolderFromDocumentQueue(int folder_id)
15341585
{
1535-
if (auto it = m_docsToScan.find(folder_id); it != m_docsToScan.end())
1536-
m_docsToScan.erase(it);
1586+
if (auto queueIt = m_docsToScan.find(folder_id); queueIt != m_docsToScan.end()) {
1587+
if (auto key = m_chunkStreamer.currentDocKey()) {
1588+
if (ranges::any_of(queueIt->second, [&key](auto &d) { return d.key() == key; }))
1589+
m_chunkStreamer.reset(); // done with this document
1590+
}
1591+
// remove folder from queue
1592+
m_docsToScan.erase(queueIt);
1593+
}
15371594
}
15381595

15391596
void Database::enqueueDocumentInternal(DocumentInfo &&info, bool prepend)
@@ -1758,7 +1815,12 @@ void Database::start()
17581815
m_databaseValid = false;
17591816
} else {
17601817
cleanDB();
1761-
addCurrentFolders();
1818+
QSqlQuery q(m_db);
1819+
if (!refreshDocumentIdCache(q)) {
1820+
m_databaseValid = false;
1821+
} else {
1822+
addCurrentFolders();
1823+
}
17621824
}
17631825

17641826
if (!m_databaseValid)

gpt4all-chat/src/database.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,8 @@ class ChunkStreamer {
163163

164164
void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel, const QString &title,
165165
const QString &author, const QString &subject, const QString &keywords);
166+
std::optional<DocumentInfo::key_type> currentDocKey() const;
167+
void reset();
166168

167169
Status step();
168170

@@ -224,6 +226,12 @@ private Q_SLOTS:
224226
void commit();
225227
void rollback();
226228

229+
bool addChunk(QSqlQuery &q, int document_id, const QString &chunk_text, const QString &file,
230+
const QString &title, const QString &author, const QString &subject, const QString &keywords,
231+
int page, int from, int to, int words, int *chunk_id);
232+
bool refreshDocumentIdCache(QSqlQuery &q);
233+
bool removeChunksByDocumentId(QSqlQuery &q, int document_id);
234+
bool sqlRemoveDocsByFolderPath(QSqlQuery &q, const QString &path);
227235
bool hasContent();
228236
// not found -> 0, exists and has content -> 1, error -> -1
229237
int openDatabase(const QString &modelPath, bool create = true, int ver = LOCALDOCS_VERSION);
@@ -293,6 +301,7 @@ private Q_SLOTS:
293301
QHash<int, CollectionItem> m_collectionMap; // used only for tracking indexing/embedding progress
294302
std::atomic<bool> m_databaseValid;
295303
ChunkStreamer m_chunkStreamer;
304+
QSet<int> m_documentIdCache; // cached list of documents with chunks for fast lookup
296305

297306
friend class ChunkStreamer;
298307
};

0 commit comments

Comments (0)