Skip to content

Commit 36a3826

Browse files
committed
localdocs: avoid cases where batch can make no progress (#3094)
Signed-off-by: Jared Van Bortel <[email protected]>
1 parent f8dde82 commit 36a3826

File tree

3 files changed: 38 additions and 38 deletions

gpt4all-chat/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
1111
- Fix bug removing documents because of a wrong case sensitive file suffix check ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
1212
- Fix bug with hybrid localdocs search where database would get out of sync ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
1313
- Fix GUI bug where the localdocs embedding device appears blank ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
14+
- Prevent LocalDocs from not making progress in certain cases ([#3094](https://github.com/nomic-ai/gpt4all/pull/3094))
1415

1516
## [3.4.1] - 2024-10-11
1617

gpt4all-chat/src/database.cpp

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1129,9 +1129,12 @@ static void handleDocumentError(const QString &errorMessage, int document_id, co
11291129

11301130
class DocumentReader {
11311131
public:
1132+
struct Metadata { QString title, author, subject, keywords; };
1133+
11321134
static std::unique_ptr<DocumentReader> fromDocument(const DocumentInfo &info);
11331135

11341136
const DocumentInfo &doc () const { return *m_info; }
1137+
const Metadata &metadata() const { return m_metadata; }
11351138
const std::optional<QString> &word () const { return m_word; }
11361139
const std::optional<QString> &nextWord() { m_word = advance(); return m_word; }
11371140
virtual std::optional<ChunkStreamer::Status> getError() const { return std::nullopt; }
@@ -1143,11 +1146,16 @@ class DocumentReader {
11431146
explicit DocumentReader(const DocumentInfo &info)
11441147
: m_info(&info) {}
11451148

1146-
void postInit() { m_word = advance(); }
1149+
void postInit(Metadata &&metadata = {})
1150+
{
1151+
m_metadata = std::move(metadata);
1152+
m_word = advance();
1153+
}
11471154

11481155
virtual std::optional<QString> advance() = 0;
11491156

11501157
const DocumentInfo *m_info;
1158+
Metadata m_metadata;
11511159
std::optional<QString> m_word;
11521160
};
11531161

@@ -1161,7 +1169,13 @@ class PdfDocumentReader final : public DocumentReader {
11611169
QString path = info.file.canonicalFilePath();
11621170
if (m_doc.load(path) != QPdfDocument::Error::None)
11631171
throw std::runtime_error(fmt::format("Failed to load PDF: {}", path));
1164-
postInit();
1172+
Metadata metadata {
1173+
.title = m_doc.metaData(QPdfDocument::MetaDataField::Title ).toString(),
1174+
.author = m_doc.metaData(QPdfDocument::MetaDataField::Author ).toString(),
1175+
.subject = m_doc.metaData(QPdfDocument::MetaDataField::Subject ).toString(),
1176+
.keywords = m_doc.metaData(QPdfDocument::MetaDataField::Keywords).toString(),
1177+
};
1178+
postInit(std::move(metadata));
11651179
}
11661180

11671181
int page() const override { return m_currentPage; }
@@ -1200,6 +1214,7 @@ class WordDocumentReader final : public DocumentReader {
12001214

12011215
m_paragraph = &m_doc.paragraphs();
12021216
m_run = &m_paragraph->runs();
1217+
// TODO(jared): metadata for Word documents?
12031218
postInit();
12041219
}
12051220

@@ -1324,20 +1339,14 @@ ChunkStreamer::ChunkStreamer(Database *database)
13241339

13251340
ChunkStreamer::~ChunkStreamer() = default;
13261341

1327-
void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel,
1328-
const QString &title, const QString &author, const QString &subject,
1329-
const QString &keywords)
1342+
void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel)
13301343
{
13311344
auto docKey = doc.key();
13321345
if (!m_docKey || *m_docKey != docKey) {
13331346
m_docKey = docKey;
13341347
m_reader = DocumentReader::fromDocument(doc);
13351348
m_documentId = documentId;
13361349
m_embeddingModel = embeddingModel;
1337-
m_title = title;
1338-
m_author = author;
1339-
m_subject = subject;
1340-
m_keywords = keywords;
13411350
m_chunk.clear();
13421351
m_page = 0;
13431352

@@ -1376,10 +1385,6 @@ ChunkStreamer::Status ChunkStreamer::step()
13761385
m_docKey.reset(); // done processing
13771386
return *error;
13781387
}
1379-
if (m_database->scanQueueInterrupted()) {
1380-
retval = Status::INTERRUPTED;
1381-
break;
1382-
}
13831388

13841389
// get a word, if needed
13851390
std::optional<QString> word = QString(); // empty string to disable EOF logic
@@ -1438,14 +1443,15 @@ ChunkStreamer::Status ChunkStreamer::step()
14381443

14391444
QSqlQuery q(m_database->m_db);
14401445
int chunkId = 0;
1446+
auto &metadata = m_reader->metadata();
14411447
if (!m_database->addChunk(q,
14421448
m_documentId,
14431449
chunk,
14441450
m_reader->doc().file.fileName(), // basename
1445-
m_title,
1446-
m_author,
1447-
m_subject,
1448-
m_keywords,
1451+
metadata.title,
1452+
metadata.author,
1453+
metadata.subject,
1454+
metadata.keywords,
14491455
m_page,
14501456
line_from,
14511457
line_to,
@@ -1472,6 +1478,11 @@ ChunkStreamer::Status ChunkStreamer::step()
14721478
break;
14731479
}
14741480
}
1481+
1482+
if (m_database->scanQueueInterrupted()) {
1483+
retval = Status::INTERRUPTED;
1484+
break;
1485+
}
14751486
}
14761487

14771488
if (nChunks) {
@@ -1635,13 +1646,16 @@ bool Database::scanQueueInterrupted() const
16351646

16361647
void Database::scanQueueBatch()
16371648
{
1638-
m_scanDurationTimer.start();
1639-
16401649
transaction();
16411650

1642-
// scan for up to 100ms or until we run out of documents
1643-
while (!m_docsToScan.empty() && !scanQueueInterrupted())
1651+
m_scanDurationTimer.start();
1652+
1653+
// scan for up to the maximum scan duration or until we run out of documents
1654+
while (!m_docsToScan.empty()) {
16441655
scanQueue();
1656+
if (scanQueueInterrupted())
1657+
break;
1658+
}
16451659

16461660
commit();
16471661

@@ -1727,22 +1741,8 @@ void Database::scanQueue()
17271741
Q_ASSERT(document_id != -1);
17281742

17291743
{
1730-
QString title, author, subject, keywords;
1731-
if (info.isPdf()) {
1732-
QPdfDocument doc;
1733-
if (doc.load(document_path) != QPdfDocument::Error::None) {
1734-
qWarning() << "ERROR: Could not load pdf" << document_id << document_path;
1735-
return updateFolderToIndex(folder_id, countForFolder);
1736-
}
1737-
title = doc.metaData(QPdfDocument::MetaDataField::Title).toString();
1738-
author = doc.metaData(QPdfDocument::MetaDataField::Author).toString();
1739-
subject = doc.metaData(QPdfDocument::MetaDataField::Subject).toString();
1740-
keywords = doc.metaData(QPdfDocument::MetaDataField::Keywords).toString();
1741-
// TODO(jared): metadata for Word documents?
1742-
}
1743-
17441744
try {
1745-
m_chunkStreamer.setDocument(info, document_id, embedding_model, title, author, subject, keywords);
1745+
m_chunkStreamer.setDocument(info, document_id, embedding_model);
17461746
} catch (const std::runtime_error &e) {
17471747
qWarning() << "LocalDocs ERROR:" << e.what();
17481748
goto dequeue;

gpt4all-chat/src/database.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,8 +171,7 @@ class ChunkStreamer {
171171
explicit ChunkStreamer(Database *database);
172172
~ChunkStreamer();
173173

174-
void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel, const QString &title,
175-
const QString &author, const QString &subject, const QString &keywords);
174+
void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel);
176175
std::optional<DocumentInfo::key_type> currentDocKey() const;
177176
void reset();
178177

0 commit comments