11import copy
22import json
3- from typing import Union , List
3+ from typing import Union , List , Iterable
44
55import pandas as pd
66
1616from .query import Query
1717from .databases import Database
1818
# Upper bound on the number of rows passed to a single insert call; larger
# inputs are split into chunks of at most this size before being inserted.
MAX_INSERT_SIZE = 1000


def split_data(data: Union[pd.DataFrame, list], partition_size: int) -> Iterable:
    """
    Split data into consecutive chunks of at most partition_size items and yield them out

    :param data: list or DataFrame to split; a DataFrame is sliced by row position
    :param partition_size: maximum number of items per chunk, must be positive
    :return: generator of slices of data, in original order (nothing is yielded for empty data)
    :raises ValueError: if partition_size is not positive (raised when iteration starts)
    """
    # A zero or negative size would never advance past the end of the data,
    # so reject it instead of looping forever on empty slices.
    if partition_size <= 0:
        raise ValueError(f"partition_size must be a positive integer, got {partition_size!r}")

    for start in range(0, len(data), partition_size):
        yield data[start:start + partition_size]
31+
1932
2033class KnowledgeBase (Query ):
2134 """
@@ -152,7 +165,7 @@ def insert_webpages(self, urls: List[str], crawl_depth: int = 1,
152165 data = data
153166 )
154167
155- def insert (self , data : Union [pd .DataFrame , Query , dict ], params : dict = None ):
168+ def insert (self , data : Union [pd .DataFrame , Query , dict , list ], params : dict = None ):
156169 """
157170 Insert data to knowledge base
158171
@@ -176,9 +189,18 @@ def insert(self, data: Union[pd.DataFrame, Query, dict], params: dict = None):
176189 if isinstance (data , dict ):
177190 data = [data ]
178191 elif isinstance (data , pd .DataFrame ):
179- data = data .to_dict ('records' )
180- else :
181- raise ValueError ("Unknown data type, accepted types: DataFrame, Query, dict" )
192+ for df in split_data (data , MAX_INSERT_SIZE ):
193+ data = df .to_dict ('records' )
194+ self .insert (data , params = params )
195+ return
196+ elif not isinstance (data , list ):
197+ raise ValueError ("Unknown data type, accepted types: DataFrame, Query, dict, list" )
198+
199+ # chunking a big input data
200+ if len (data ) > MAX_INSERT_SIZE :
201+ for chunk in split_data (data , MAX_INSERT_SIZE ):
202+ self .insert (chunk , params = params )
203+ return
182204
183205 data = {'rows' : data }
184206 if params :
0 commit comments