Skip to content

Commit f629eb5

Browse files
authored
Merge pull request #195 from mindsdb/kb-insert-chunking
Chunking data inserted into knowledge base
2 parents b400d23 + 249124a commit f629eb5

File tree

1 file changed

+27
-5
lines changed

1 file changed

+27
-5
lines changed

mindsdb_sdk/knowledge_bases.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import copy
22
import json
3-
from typing import Union, List
3+
from typing import Union, List, Iterable
44

55
import pandas as pd
66

@@ -16,6 +16,19 @@
1616
from .query import Query
1717
from .databases import Database
1818

19+
# Largest number of rows passed to the knowledge base in one insert;
# bigger inputs are split into chunks of this size before inserting.
MAX_INSERT_SIZE = 1000
20+
21+
22+
def split_data(data: Union[pd.DataFrame, list], partition_size: int) -> Iterable:
    """
    Split data into consecutive chunks of at most partition_size items and yield them out

    :param data: DataFrame or list to split; chunks are taken with data[start:stop]
    :param partition_size: maximum length of every chunk, must be positive
    :return: iterable over slices of data; nothing is yielded for empty data
    :raises ValueError: if partition_size is not positive (raised when iteration starts)
    """
    if partition_size <= 0:
        # a non-positive size never advances past the end of data,
        # which previously produced an endless stream of chunks
        raise ValueError(f"partition_size must be positive, got {partition_size}")

    for start in range(0, len(data), partition_size):
        # the final slice is simply shorter when len(data) is not a multiple of partition_size
        yield data[start: start + partition_size]
31+
1932

2033
class KnowledgeBase(Query):
2134
"""
@@ -152,7 +165,7 @@ def insert_webpages(self, urls: List[str], crawl_depth: int = 1,
152165
data=data
153166
)
154167

155-
def insert(self, data: Union[pd.DataFrame, Query, dict], params: dict = None):
168+
def insert(self, data: Union[pd.DataFrame, Query, dict, list], params: dict = None):
156169
"""
157170
Insert data to knowledge base
158171
@@ -176,9 +189,18 @@ def insert(self, data: Union[pd.DataFrame, Query, dict], params: dict = None):
176189
if isinstance(data, dict):
177190
data = [data]
178191
elif isinstance(data, pd.DataFrame):
179-
data = data.to_dict('records')
180-
else:
181-
raise ValueError("Unknown data type, accepted types: DataFrame, Query, dict")
192+
for df in split_data(data, MAX_INSERT_SIZE):
193+
data = df.to_dict('records')
194+
self.insert(data, params=params)
195+
return
196+
elif not isinstance(data, list):
197+
raise ValueError("Unknown data type, accepted types: DataFrame, Query, dict, list")
198+
199+
# chunking a big input data
200+
if len(data) > MAX_INSERT_SIZE:
201+
for chunk in split_data(data, MAX_INSERT_SIZE):
202+
self.insert(chunk, params=params)
203+
return
182204

183205
data = {'rows': data}
184206
if params:

0 commit comments

Comments
 (0)