@@ -1129,9 +1129,12 @@ static void handleDocumentError(const QString &errorMessage, int document_id, co
11291129
11301130class DocumentReader {
11311131public:
1132+ struct Metadata { QString title, author, subject, keywords; };
1133+
11321134 static std::unique_ptr<DocumentReader> fromDocument (const DocumentInfo &info);
11331135
11341136 const DocumentInfo &doc () const { return *m_info; }
1137+ const Metadata &metadata () const { return m_metadata; }
11351138 const std::optional<QString> &word () const { return m_word; }
11361139 const std::optional<QString> &nextWord () { m_word = advance (); return m_word; }
11371140 virtual std::optional<ChunkStreamer::Status> getError () const { return std::nullopt ; }
@@ -1143,11 +1146,16 @@ class DocumentReader {
11431146 explicit DocumentReader (const DocumentInfo &info)
11441147 : m_info(&info) {}
11451148
1146- void postInit () { m_word = advance (); }
1149+ void postInit (Metadata &&metadata = {})
1150+ {
1151+ m_metadata = std::move (metadata);
1152+ m_word = advance ();
1153+ }
11471154
11481155 virtual std::optional<QString> advance () = 0;
11491156
11501157 const DocumentInfo *m_info;
1158+ Metadata m_metadata;
11511159 std::optional<QString> m_word;
11521160};
11531161
@@ -1161,7 +1169,13 @@ class PdfDocumentReader final : public DocumentReader {
11611169 QString path = info.file .canonicalFilePath ();
11621170 if (m_doc.load (path) != QPdfDocument::Error::None)
11631171 throw std::runtime_error (fmt::format (" Failed to load PDF: {}" , path));
1164- postInit ();
1172+ Metadata metadata {
1173+ .title = m_doc.metaData (QPdfDocument::MetaDataField::Title ).toString (),
1174+ .author = m_doc.metaData (QPdfDocument::MetaDataField::Author ).toString (),
1175+ .subject = m_doc.metaData (QPdfDocument::MetaDataField::Subject ).toString (),
1176+ .keywords = m_doc.metaData (QPdfDocument::MetaDataField::Keywords).toString (),
1177+ };
1178+ postInit (std::move (metadata));
11651179 }
11661180
11671181 int page () const override { return m_currentPage; }
@@ -1200,6 +1214,7 @@ class WordDocumentReader final : public DocumentReader {
12001214
12011215 m_paragraph = &m_doc.paragraphs ();
12021216 m_run = &m_paragraph->runs ();
1217+ // TODO(jared): extract metadata for Word documents; until then postInit() leaves it default-constructed (empty)
12031218 postInit ();
12041219 }
12051220
@@ -1324,20 +1339,14 @@ ChunkStreamer::ChunkStreamer(Database *database)
13241339
13251340ChunkStreamer::~ChunkStreamer () = default ;
13261341
1327- void ChunkStreamer::setDocument (const DocumentInfo &doc, int documentId, const QString &embeddingModel,
1328- const QString &title, const QString &author, const QString &subject,
1329- const QString &keywords)
1342+ void ChunkStreamer::setDocument (const DocumentInfo &doc, int documentId, const QString &embeddingModel)
13301343{
13311344 auto docKey = doc.key ();
13321345 if (!m_docKey || *m_docKey != docKey) {
13331346 m_docKey = docKey;
13341347 m_reader = DocumentReader::fromDocument (doc);
13351348 m_documentId = documentId;
13361349 m_embeddingModel = embeddingModel;
1337- m_title = title;
1338- m_author = author;
1339- m_subject = subject;
1340- m_keywords = keywords;
13411350 m_chunk.clear ();
13421351 m_page = 0 ;
13431352
@@ -1376,10 +1385,6 @@ ChunkStreamer::Status ChunkStreamer::step()
13761385 m_docKey.reset (); // done processing
13771386 return *error;
13781387 }
1379- if (m_database->scanQueueInterrupted ()) {
1380- retval = Status::INTERRUPTED;
1381- break ;
1382- }
13831388
13841389 // get a word, if needed
13851390 std::optional<QString> word = QString (); // empty string to disable EOF logic
@@ -1438,14 +1443,15 @@ ChunkStreamer::Status ChunkStreamer::step()
14381443
14391444 QSqlQuery q (m_database->m_db );
14401445 int chunkId = 0 ;
1446+ auto &metadata = m_reader->metadata ();
14411447 if (!m_database->addChunk (q,
14421448 m_documentId,
14431449 chunk,
14441450 m_reader->doc ().file .fileName (), // basename
1445- m_title ,
1446- m_author ,
1447- m_subject ,
1448- m_keywords ,
1451+ metadata. title ,
1452+ metadata. author ,
1453+ metadata. subject ,
1454+ metadata. keywords ,
14491455 m_page,
14501456 line_from,
14511457 line_to,
@@ -1472,6 +1478,11 @@ ChunkStreamer::Status ChunkStreamer::step()
14721478 break ;
14731479 }
14741480 }
1481+
1482+ if (m_database->scanQueueInterrupted ()) {
1483+ retval = Status::INTERRUPTED;
1484+ break ;
1485+ }
14751486 }
14761487
14771488 if (nChunks) {
@@ -1635,13 +1646,16 @@ bool Database::scanQueueInterrupted() const
16351646
16361647void Database::scanQueueBatch ()
16371648{
1638- m_scanDurationTimer.start ();
1639-
16401649 transaction ();
16411650
1642- // scan for up to 100ms or until we run out of documents
1643- while (!m_docsToScan.empty () && !scanQueueInterrupted ())
1651+ m_scanDurationTimer.start ();
1652+
1653+ // scan for up to the maximum scan duration or until we run out of documents
1654+ while (!m_docsToScan.empty ()) {
16441655 scanQueue ();
1656+ if (scanQueueInterrupted ())
1657+ break ;
1658+ }
16451659
16461660 commit ();
16471661
@@ -1727,22 +1741,8 @@ void Database::scanQueue()
17271741 Q_ASSERT (document_id != -1 );
17281742
17291743 {
1730- QString title, author, subject, keywords;
1731- if (info.isPdf ()) {
1732- QPdfDocument doc;
1733- if (doc.load (document_path) != QPdfDocument::Error::None) {
1734- qWarning () << " ERROR: Could not load pdf" << document_id << document_path;
1735- return updateFolderToIndex (folder_id, countForFolder);
1736- }
1737- title = doc.metaData (QPdfDocument::MetaDataField::Title).toString ();
1738- author = doc.metaData (QPdfDocument::MetaDataField::Author).toString ();
1739- subject = doc.metaData (QPdfDocument::MetaDataField::Subject).toString ();
1740- keywords = doc.metaData (QPdfDocument::MetaDataField::Keywords).toString ();
1741- // TODO(jared): metadata for Word documents?
1742- }
1743-
17441744 try {
1745- m_chunkStreamer.setDocument (info, document_id, embedding_model, title, author, subject, keywords );
1745+ m_chunkStreamer.setDocument (info, document_id, embedding_model);
17461746 } catch (const std::runtime_error &e) {
17471747 qWarning () << " LocalDocs ERROR:" << e.what ();
17481748 goto dequeue;
0 commit comments