Support adding ndjson and csv docs in batches

ellnix · ellnix · commit ce3a82e8bdd8 · 2025-01-22T14:01:09.000+01:00
diff --git a/lib/meilisearch/index.rb b/lib/meilisearch/index.rb
@@ -161,6 +161,20 @@ def add_documents_in_batches(documents, batch_size = 1000, primary_key = nil)
       end
     end
 
+    def add_documents_ndjson_in_batches(documents, batch_size = 1000, primary_key = nil)
+      documents.lines.each_slice(batch_size).map do |batch|
+        add_documents_ndjson(batch.join, primary_key)
+      end
+    end
+
+    def add_documents_csv_in_batches(documents, batch_size = 1000, primary_key = nil, delimiter = nil)
+      lines = documents.lines
+      heading = lines.first
+      lines.drop(1).each_slice(batch_size).map do |batch|
+        add_documents_csv(heading + batch.join, primary_key, delimiter)
+      end
+    end
+
     def add_documents_in_batches!(documents, batch_size = 1000, primary_key = nil)
       Utils.soft_deprecate(
         'Index#add_documents_in_batches!',
diff --git a/spec/meilisearch/index/documents_spec.rb b/spec/meilisearch/index/documents_spec.rb
@@ -49,48 +49,6 @@
           expect(index.documents['results'].count).to eq(5)
         end
 
-        it 'adds NDJSON documents' do
-          documents = <<~NDJSON
-            { "objectRef": 123,  "title": "Pride and Prejudice",                    "comment": "A great book" }
-            { "objectRef": 456,  "title": "Le Petit Prince",                        "comment": "A french book" }
-            { "objectRef": 1,    "title": "Alice In Wonderland",                    "comment": "A weird book" }
-            { "objectRef": 4,    "title": "Harry Potter and the Half-Blood Prince", "comment": "The best book" }
-          NDJSON
-          index.add_documents_ndjson(documents, 'objectRef').await
-
-          expect(index.documents['results'].count).to eq(4)
-        end
-
-        it 'adds CSV documents' do
-          documents = <<~CSV
-            "objectRef:number","title:string","comment:string"
-            "1239","Pride and Prejudice","A great book"
-            "4569","Le Petit Prince","A french book"
-            "49","Harry Potter and the Half-Blood Prince","The best book"
-          CSV
-          index.add_documents_csv(documents, 'objectRef').await
-
-          expect(index.documents['results'].count).to eq(3)
-        end
-
-        it 'adds CSV documents with different separator' do
-          documents = <<~CSV
-            "objectRef:number"|"title:string"|"comment:string"
-            "1239"|"Pride and Prejudice"|"A great book"
-            "4569"|"Le Petit Prince"|"A french book"
-            "49"|"Harry Potter and the Half-Blood Prince"|"The best book"
-          CSV
-
-          index.add_documents_csv(documents, 'objectRef', '|').await
-
-          expect(index.documents['results'].count).to eq(3)
-          expect(index.documents['results'][1]).to match(
-            'objectRef' => 4569,
-            'title' => 'Le Petit Prince',
-            'comment' => 'A french book'
-          )
-        end
-
         it 'infers order of fields' do
           index.add_documents(documents).await
           task = index.document(1)
@@ -127,6 +85,7 @@
         expect(index.documents['results']).to contain_exactly(*documents_with_string_keys)
       end
 
+
       context 'given a single document' do
         it 'adds only one document to index (as an hash of one document)' do
           new_doc = { objectId: 30, title: 'Hamlet' }
@@ -162,6 +121,94 @@
       end
     end
 
+    describe 'ndjson and csv methods' do
+      let(:ndjson_docs) do
+        <<~NDJSON
+          { "objectRef": 123,  "title": "Pride and Prejudice",                    "comment": "A great book" }
+          { "objectRef": 456,  "title": "Le Petit Prince",                        "comment": "A french book" }
+          { "objectRef": 4,    "title": "Harry Potter and the Half-Blood Prince", "comment": "The best book" }
+          { "objectRef": 55,    "title": "The Three Body Problem", "comment": "An interesting book" }
+          { "objectRef": 200,    "title": "Project Hail Mary", "comment": "A lonely book" }
+        NDJSON
+      end
+
+      let(:csv_docs) do
+        <<~CSV
+          "objectRef:number","title:string","comment:string"
+          "1239","Pride and Prejudice","A great book"
+          "456","Le Petit Prince","A french book"
+          "49","Harry Potter and the Half-Blood Prince","The best book"
+          "55","The Three Body Problem","An interesting book"
+          "200","Project Hail Mary","A lonely book"
+        CSV
+      end
+
+      let(:csv_docs_custom_delim) do
+        <<~CSV
+          "objectRef:number"|"title:string"|"comment:string"
+          "1239"|"Pride and Prejudice"|"A great book"
+          "456"|"Le Petit Prince"|"A french book"
+          "49"|"Harry Potter and the Half-Blood Prince"|"The best book"
+          "55"|"The Three Body Problem"|"An interesting book"
+          "200"|"Project Hail Mary"|"A lonely book"
+        CSV
+      end
+
+      let(:batch1_doc) do
+        {
+          'objectRef' => 456,
+          'title' => 'Le Petit Prince',
+          'comment' => 'A french book'
+        }
+      end
+
+      let(:batch2_doc) do
+        {
+          'objectRef' => 200,
+          'title' => 'Project Hail Mary',
+          'comment' => 'A lonely book'
+        }
+      end
+
+      it '#add_documents_ndjson' do
+        index.add_documents_ndjson(ndjson_docs, 'objectRef').await
+
+        expect(index.documents['results'].count).to eq(5)
+        expect(index.documents['results']).to include(batch1_doc, batch2_doc)
+      end
+
+      it '#add_documents_csv' do
+        index.add_documents_csv(csv_docs, 'objectRef').await
+        expect(index.documents['results'].count).to eq(5)
+      end
+
+      it '#add_documents_csv with a custom delimiter' do
+        index.add_documents_csv(csv_docs_custom_delim, 'objectRef', '|').await
+
+        expect(index.documents['results'].count).to eq(5)
+        expect(index.documents['results']).to include(batch1_doc, batch2_doc)
+      end
+
+      it '#add_documents_ndjson_in_batches' do
+        tasks = index.add_documents_ndjson_in_batches(ndjson_docs, 4, 'objectRef')
+        expect(tasks).to contain_exactly(a_kind_of(Meilisearch::Models::Task),
+                                         a_kind_of(Meilisearch::Models::Task))
+        tasks.each(&:await)
+        expect(index.documents['results']).to include(batch1_doc, batch2_doc)
+      end
+
+      it '#add_documents_csv_in_batches' do
+        tasks = index.add_documents_csv_in_batches(
+          csv_docs_custom_delim, 4, 'objectRef', '|'
+        )
+        expect(tasks).to contain_exactly(a_kind_of(Meilisearch::Models::Task),
+                                         a_kind_of(Meilisearch::Models::Task))
+        tasks.each(&:await)
+
+        expect(index.documents['results']).to include(batch1_doc, batch2_doc)
+      end
+    end
+
     describe '#add_documents!' do
       before { allow(Meilisearch::Utils).to receive(:soft_deprecate).and_return(nil) }