3232#include < stdexcept>
3333
3434using namespace Qt ::Literals::StringLiterals;
35+ namespace ranges = std::ranges;
3536namespace us = unum::usearch;
3637
3738// #define DEBUG
@@ -175,6 +176,14 @@ static const QString INSERT_CHUNK_FTS_SQL = uR"(
175176 values(?, ?, ?, ?, ?, ?, ?);
176177)" _s;
177178
179+ static const QString SELECT_CHUNKED_DOCUMENTS_SQL[] = {
180+ uR"(
181+ select distinct document_id from chunks;
182+ )" _s, uR"(
183+ select distinct document_id from chunks_fts;
184+ )" _s,
185+ };
186+
178187static const QString DELETE_CHUNKS_SQL[] = {
179188 uR"(
180189 delete from embeddings
@@ -230,53 +239,6 @@ static const QString SELECT_CHUNKS_FTS_SQL = uR"(
230239 order by score limit %1;
231240)" _s;
232241
233- static bool addChunk (QSqlQuery &q, int document_id, const QString &chunk_text, const QString &file,
234- const QString &title, const QString &author, const QString &subject, const QString &keywords,
235- int page, int from, int to, int words, int *chunk_id)
236- {
237- if (!q.prepare (INSERT_CHUNK_SQL))
238- return false ;
239- q.addBindValue (document_id);
240- q.addBindValue (chunk_text);
241- q.addBindValue (file);
242- q.addBindValue (title);
243- q.addBindValue (author);
244- q.addBindValue (subject);
245- q.addBindValue (keywords);
246- q.addBindValue (page);
247- q.addBindValue (from);
248- q.addBindValue (to);
249- q.addBindValue (words);
250- if (!q.exec () || !q.next ())
251- return false ;
252- *chunk_id = q.value (0 ).toInt ();
253-
254- if (!q.prepare (INSERT_CHUNK_FTS_SQL))
255- return false ;
256- q.addBindValue (document_id);
257- q.addBindValue (chunk_text);
258- q.addBindValue (file);
259- q.addBindValue (title);
260- q.addBindValue (author);
261- q.addBindValue (subject);
262- q.addBindValue (keywords);
263- if (!q.exec ())
264- return false ;
265- return true ;
266- }
267-
268- static bool removeChunksByDocumentId (QSqlQuery &q, int document_id)
269- {
270- for (const auto &cmd: DELETE_CHUNKS_SQL) {
271- if (!q.prepare (cmd))
272- return false ;
273- q.addBindValue (document_id);
274- if (!q.exec ())
275- return false ;
276- }
277- return true ;
278- }
279-
280242#define NAMED_PAIR (name, typea, a, typeb, b ) \
281243 struct name { typea a; typeb b; }; \
282244 static bool operator ==(const name &x, const name &y) { return x.a == y.a && x.b == y.b ; } \
@@ -634,18 +596,6 @@ static bool selectAllFolderPaths(QSqlQuery &q, QList<QString> *folder_paths)
634596 return true ;
635597}
636598
637- static bool sqlRemoveDocsByFolderPath (QSqlQuery &q, const QString &path)
638- {
639- for (const auto &cmd: FOLDER_REMOVE_ALL_DOCS_SQL) {
640- if (!q.prepare (cmd))
641- return false ;
642- q.addBindValue (path);
643- if (!q.exec ())
644- return false ;
645- }
646- return true ;
647- }
648-
649599static const QString INSERT_COLLECTION_ITEM_SQL = uR"(
650600 insert into collection_items(collection_id, folder_id)
651601 values(?, ?)
@@ -889,6 +839,79 @@ void Database::rollback()
889839 Q_ASSERT (ok);
890840}
891841
842+ bool Database::refreshDocumentIdCache (QSqlQuery &q)
843+ {
844+ m_documentIdCache.clear ();
845+ for (const auto &cmd: SELECT_CHUNKED_DOCUMENTS_SQL) {
846+ if (!q.exec (cmd))
847+ return false ;
848+ while (q.next ())
849+ m_documentIdCache << q.value (0 ).toInt ();
850+ }
851+ return true ;
852+ }
853+
854+ bool Database::addChunk (QSqlQuery &q, int document_id, const QString &chunk_text, const QString &file,
855+ const QString &title, const QString &author, const QString &subject, const QString &keywords,
856+ int page, int from, int to, int words, int *chunk_id)
857+ {
858+ if (!q.prepare (INSERT_CHUNK_SQL))
859+ return false ;
860+ q.addBindValue (document_id);
861+ q.addBindValue (chunk_text);
862+ q.addBindValue (file);
863+ q.addBindValue (title);
864+ q.addBindValue (author);
865+ q.addBindValue (subject);
866+ q.addBindValue (keywords);
867+ q.addBindValue (page);
868+ q.addBindValue (from);
869+ q.addBindValue (to);
870+ q.addBindValue (words);
871+ if (!q.exec () || !q.next ())
872+ return false ;
873+ *chunk_id = q.value (0 ).toInt ();
874+
875+ if (!q.prepare (INSERT_CHUNK_FTS_SQL))
876+ return false ;
877+ q.addBindValue (document_id);
878+ q.addBindValue (chunk_text);
879+ q.addBindValue (file);
880+ q.addBindValue (title);
881+ q.addBindValue (author);
882+ q.addBindValue (subject);
883+ q.addBindValue (keywords);
884+ if (!q.exec ())
885+ return false ;
886+ m_documentIdCache << document_id;
887+ return true ;
888+ }
889+
890+ bool Database::removeChunksByDocumentId (QSqlQuery &q, int document_id)
891+ {
892+ for (const auto &cmd: DELETE_CHUNKS_SQL) {
893+ if (!q.prepare (cmd))
894+ return false ;
895+ q.addBindValue (document_id);
896+ if (!q.exec ())
897+ return false ;
898+ }
899+ m_documentIdCache.remove (document_id);
900+ return true ;
901+ }
902+
903+ bool Database::sqlRemoveDocsByFolderPath (QSqlQuery &q, const QString &path)
904+ {
905+ for (const auto &cmd: FOLDER_REMOVE_ALL_DOCS_SQL) {
906+ if (!q.prepare (cmd))
907+ return false ;
908+ q.addBindValue (path);
909+ if (!q.exec ())
910+ return false ;
911+ }
912+ return refreshDocumentIdCache (q);
913+ }
914+
892915bool Database::hasContent ()
893916{
894917 return m_db.tables ().contains (" chunks" , Qt::CaseInsensitive);
@@ -1246,9 +1269,13 @@ class TxtDocumentReader final : public DocumentReader {
12461269protected:
12471270 std::optional<QString> advance () override
12481271 {
1272+ if (getError ())
1273+ return std::nullopt ;
12491274 while (!m_stream.atEnd ()) {
12501275 QString word;
12511276 m_stream >> word;
1277+ if (getError ())
1278+ return std::nullopt ;
12521279 if (!word.isEmpty ())
12531280 return word;
12541281 }
@@ -1257,9 +1284,11 @@ class TxtDocumentReader final : public DocumentReader {
12571284
12581285 std::optional<ChunkStreamer::Status> getError () const override
12591286 {
1260- if (!m_file.error ())
1261- return std::nullopt ;
1262- return m_file.binarySeen () ? ChunkStreamer::Status::BINARY_SEEN : ChunkStreamer::Status::ERROR;
1287+ if (m_file.binarySeen ())
1288+ return ChunkStreamer::Status::BINARY_SEEN;
1289+ if (m_file.error ())
1290+ return ChunkStreamer::Status::ERROR;
1291+ return std::nullopt ;
12631292 }
12641293
12651294 BinaryDetectingFile m_file;
@@ -1300,12 +1329,24 @@ void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const Q
13001329 m_page = 0 ;
13011330
13021331 // make sure the document doesn't already have any chunks
1303- QSqlQuery q (m_database->m_db );
1304- if (!removeChunksByDocumentId (q, documentId))
1305- handleDocumentError (" ERROR: Cannot remove chunks of document" , documentId, doc.file .canonicalPath (), q.lastError ());
1332+ if (m_database->m_documentIdCache .contains (documentId)) {
1333+ QSqlQuery q (m_database->m_db );
1334+ if (!m_database->removeChunksByDocumentId (q, documentId))
1335+ handleDocumentError (" ERROR: Cannot remove chunks of document" , documentId, doc.file .canonicalPath (), q.lastError ());
1336+ }
13061337 }
13071338}
13081339
1340+ std::optional<DocumentInfo::key_type> ChunkStreamer::currentDocKey () const
1341+ {
1342+ return m_docKey;
1343+ }
1344+
1345+ void ChunkStreamer::reset ()
1346+ {
1347+ m_docKey.reset ();
1348+ }
1349+
13091350ChunkStreamer::Status ChunkStreamer::step ()
13101351{
13111352 // TODO: implement line_from/line_to
@@ -1318,8 +1359,10 @@ ChunkStreamer::Status ChunkStreamer::step()
13181359 Status retval;
13191360
13201361 for (;;) {
1321- if (auto error = m_reader->getError ())
1362+ if (auto error = m_reader->getError ()) {
1363+ m_docKey.reset (); // done processing
13221364 return *error;
1365+ }
13231366 if (m_database->scanQueueInterrupted ()) {
13241367 retval = Status::INTERRUPTED;
13251368 break ;
@@ -1340,43 +1383,52 @@ ChunkStreamer::Status ChunkStreamer::step()
13401383 }
13411384 }
13421385
1343- if (!word || m_chunk.length () >= maxChunkSize + 1 ) { // +1 for leading space
1386+ if (!word || m_chunk.length () >= maxChunkSize + 1 ) { // +1 for trailing space
13441387 if (!m_chunk.isEmpty ()) {
13451388 int nThisChunkWords = 0 ;
13461389 auto chunk = m_chunk; // copy
13471390
13481391 // handle overlength chunks
13491392 if (m_chunk.length () > maxChunkSize + 1 ) {
13501393 // find the final space
1351- qsizetype lastSpace = chunk.lastIndexOf (u' ' , -2 );
1394+ qsizetype chunkEnd = chunk.lastIndexOf (u' ' , -2 );
13521395
1353- if (lastSpace < 0 ) {
1396+ qsizetype spaceSize;
1397+ if (chunkEnd >= 0 ) {
13541398 // slice off the last word
1399+ spaceSize = 1 ;
13551400 Q_ASSERT (m_nChunkWords >= 1 );
1356- lastSpace = maxChunkSize;
1401+ // one word left
13571402 nThisChunkWords = m_nChunkWords - 1 ;
13581403 m_nChunkWords = 1 ;
13591404 } else {
13601405 // slice the overlong word
1406+ spaceSize = 0 ;
1407+ chunkEnd = maxChunkSize;
1408+ // partial word left, don't count it
13611409 nThisChunkWords = m_nChunkWords;
13621410 m_nChunkWords = 0 ;
13631411 }
1364- // save the extra part
1365- m_chunk = chunk.sliced (lastSpace + 1 );
1366- // slice
1367- chunk.truncate (lastSpace + 1 );
1368- Q_ASSERT (chunk.length () <= maxChunkSize + 1 );
1412+ // save the second part, excluding space if any
1413+ m_chunk = chunk.sliced (chunkEnd + spaceSize);
1414+ // consume the first part
1415+ chunk.truncate (chunkEnd);
13691416 } else {
13701417 nThisChunkWords = m_nChunkWords;
13711418 m_nChunkWords = 0 ;
1419+ // there is no second part
1420+ m_chunk.clear ();
1421+ // consume the whole chunk, excluding space
1422+ chunk.chop (1 );
13721423 }
1424+ Q_ASSERT (chunk.length () <= maxChunkSize);
13731425
13741426 QSqlQuery q (m_database->m_db );
13751427 int chunkId = 0 ;
1376- if (!addChunk (q,
1428+ if (!m_database-> addChunk (q,
13771429 m_documentId,
1378- chunk. chopped ( 1 ), // strip trailing space
1379- m_reader->doc ().file .canonicalFilePath (),
1430+ chunk,
1431+ m_reader->doc ().file .fileName (), // basename
13801432 m_title,
13811433 m_author,
13821434 m_subject,
@@ -1399,12 +1451,11 @@ ChunkStreamer::Status ChunkStreamer::step()
13991451 toEmbed.chunk = chunk;
14001452 m_database->appendChunk (toEmbed);
14011453 ++nChunks;
1402-
1403- m_chunk.clear ();
14041454 }
14051455
14061456 if (!word) {
14071457 retval = Status::DOC_COMPLETE;
1458+ m_docKey.reset (); // done processing
14081459 break ;
14091460 }
14101461 }
@@ -1532,8 +1583,14 @@ DocumentInfo Database::dequeueDocument()
15321583
15331584void Database::removeFolderFromDocumentQueue (int folder_id)
15341585{
1535- if (auto it = m_docsToScan.find (folder_id); it != m_docsToScan.end ())
1536- m_docsToScan.erase (it);
1586+ if (auto queueIt = m_docsToScan.find (folder_id); queueIt != m_docsToScan.end ()) {
1587+ if (auto key = m_chunkStreamer.currentDocKey ()) {
1588+ if (ranges::any_of (queueIt->second , [&key](auto &d) { return d.key () == key; }))
1589+ m_chunkStreamer.reset (); // done with this document
1590+ }
1591+ // remove folder from queue
1592+ m_docsToScan.erase (queueIt);
1593+ }
15371594}
15381595
15391596void Database::enqueueDocumentInternal (DocumentInfo &&info, bool prepend)
@@ -1758,7 +1815,12 @@ void Database::start()
17581815 m_databaseValid = false ;
17591816 } else {
17601817 cleanDB ();
1761- addCurrentFolders ();
1818+ QSqlQuery q (m_db);
1819+ if (!refreshDocumentIdCache (q)) {
1820+ m_databaseValid = false ;
1821+ } else {
1822+ addCurrentFolders ();
1823+ }
17621824 }
17631825
17641826 if (!m_databaseValid)
0 commit comments