# See the License for the specific language governing permissions and
# limitations under the License.

+ import copy
+ import functools
import logging
+ import multiprocessing
+ import os
import re
from pathlib import Path
from typing import Callable, Dict, List, Optional
@@ -35,6 +38,85 @@
)

logger = logging.getLogger(__name__)
+
+
+ def document_rough_split(document_list, max_token=4500):
+     document_index_rough = []
+     for item in document_list:
+         # Keep documents that are already short or contain newlines; the
+         # downstream splitters can handle those directly.
+         if len(item["content"]) < max_token or "\n" in item["content"]:
+             document_index_rough.append(item)
+         else:
+             # Cut an oversized single-line document into max_token-sized
+             # chunks (max_token counts characters, not true tokens).
+             all_token = len(item["content"])
+             token_index = list(range(0, all_token + 1, max_token))
+             if all_token > token_index[-1]:
+                 token_index.append(all_token)
+             token_index_combine = [item["content"][start:end] for start, end in zip(token_index, token_index[1:])]
+             for txt in token_index_combine:
+                 txt_split = copy.deepcopy(item)
+                 txt_split["content"] = txt
+                 document_index_rough.append(txt_split)
+     return document_index_rough
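+
+ # Illustrative sketch (example values assumed): a 6000-character document with
+ # no newlines is cut into 4500- and 1500-character chunks, with the original
+ # metadata deep-copied into every chunk:
+ #     >>> docs = [{"content": "x" * 6000, "meta": {"name": "a.txt"}}]
+ #     >>> [len(d["content"]) for d in document_rough_split(docs)]
+ #     [4500, 1500]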
+
+
+ def split_document(document_index, all_document, split_text, split_paragraphs: bool, clean_func, path, split_answers):
+     # split_text is a splitter instance (e.g., SpacyTextSplitter); document_index
+     # is a (start, end) slice of the full document list.
+     start, end = document_index
+     documents = []
+     for item in all_document[start:end]:
+         text = item["content"]
+         if clean_func:
+             text = clean_func(text)
+         if split_paragraphs:
+             text_splits = split_text.split_text(text)
+             for txt in text_splits:
+                 if not txt.strip():  # skip empty paragraphs
+                     continue
+                 if split_answers:
+                     query, answer = txt.split("\t")
+                     meta_data = {"name": path.name, "answer": answer}
+                     # Add image list parsed from docx into meta
+                     if item["meta"] is not None and "images" in item["meta"]:
+                         meta_data["images"] = item["meta"]["images"]
+                     documents.append({"content": query, "meta": meta_data})
+                 else:
+                     meta_data = {"name": path.name}
+                     # Add image list parsed from docx into meta
+                     if item["meta"] is not None and "images" in item["meta"]:
+                         meta_data["images"] = item["meta"]["images"]
+                     documents.append({"content": txt, "meta": meta_data})
+         else:
+             documents.append({"content": text, "meta": item["meta"] if "meta" in item else {"name": path.name}})
+     return documents
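+
+ # Illustrative shape of one returned dict when split_paragraphs is True and
+ # split_answers is False (field values assumed for the example):
+ #     {"content": "one paragraph of text",
+ #      "meta": {"name": "report.docx", "images": ["img_0.png"]}}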
+
+
+ def run_process(
+     document_combination_index,
+     list_documents,
+     split_text,
+     process_num,
+     split_paragraphs,
+     clean_func,
+     path,
+     split_answers,
+ ):
+     # Never spawn more workers than available CPU cores.
+     process_num = min(os.cpu_count(), process_num)
+     pool = multiprocessing.Pool(process_num)
+     split_document_c = functools.partial(
+         split_document,
+         all_document=list_documents,
+         split_text=split_text,
+         split_paragraphs=split_paragraphs,
+         clean_func=clean_func,
+         path=path,
+         split_answers=split_answers,
+     )
+     result = pool.map_async(split_document_c, document_combination_index)
+     pool.close()
+     pool.join()
+     return result.get()
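+
+ # Illustrative call (splitter object and path assumed): two slices of the
+ # rough-split document list, handled by two worker processes:
+ #     >>> run_process([(0, 50), (50, 100)], docs, splitter, 2, True, None, Path("a.docx"), False)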


def convert_files_to_dicts(
@@ -43,6 +125,7 @@ def convert_files_to_dicts(
    split_paragraphs: bool = False,
    split_answers: bool = False,
    encoding: Optional[str] = None,
+     process_num: int = 20,
) -> List[dict]:
    """
    Convert all files (.txt, .pdf, .docx) in the sub-directories of the given path to Python dicts that can be written to a
@@ -136,6 +219,7 @@ def convert_files_to_dicts_splitter(
    chunk_size: int = 300,
    chunk_overlap: int = 0,
    language: str = "chinese",
+     process_num: int = 10,
) -> List[dict]:
    """
    Convert all files (.txt, .pdf, .docx) in the sub-directories of the given path to Python dicts that can be written to a
@@ -184,6 +268,9 @@ def convert_files_to_dicts_splitter(
        docx_splitter = SpacyTextSplitter(
            separator=separator, filters=filters, chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
+         pdf_splitter = SpacyTextSplitter(
+             separator=separator, chunk_size=chunk_size, chunk_overlap=chunk_overlap, filters=filters
+         )
    else:
        docx_splitter = SpacyTextSplitter(
            separator=separator,
@@ -192,12 +279,13 @@ def convert_files_to_dicts_splitter(
            chunk_overlap=chunk_overlap,
            pipeline="en_core_web_sm",
        )
+         pdf_splitter = SpacyTextSplitter(
+             separator=separator,
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             filters=filters,
+             pipeline="en_core_web_sm",
+         )
    text_splitter = CharacterTextSplitter(
        separator=separator, chunk_size=chunk_size, chunk_overlap=chunk_overlap, filters=filters
    )
-     pdf_splitter = CharacterTextSplitter(
-         separator=separator, chunk_size=chunk_size, chunk_overlap=chunk_overlap, filters=filters
-     )
+
    imgage_splitter = CharacterTextSplitter(
        separator=separator, chunk_size=chunk_size, chunk_overlap=chunk_overlap, filters=filters
    )
@@ -230,34 +318,27 @@ def convert_files_to_dicts_splitter(
            encoding=encoding,
            language=language,
        )
-         for document in list_documents:
-             text = document["content"]
-             if clean_func:
-                 text = clean_func(text)
-             if split_paragraphs is True:
-                 text_splits = suffix2splitter[suffix].split_text(text)
-                 for txt in text_splits:
-                     if not txt.strip():  # skip empty paragraphs
-                         continue
-                     if split_answers:
-                         query, answer = txt.split("\t")
-                         meta_data = {"name": path.name, "answer": answer}
-                         # Add image list parsed from docx into meta
-                         if document["meta"] is not None and "images" in document["meta"]:
-                             meta_data["images"] = document["meta"]["images"]
-                         documents.append({"content": query, "meta": meta_data})
-                     else:
-                         meta_data = {
-                             "name": path.name,
-                         }
-                         # Add image list parsed from docx into meta
-                         if document["meta"] is not None and "images" in document["meta"]:
-                             meta_data["images"] = document["meta"]["images"]
-                         documents.append({"content": txt, "meta": meta_data})
-             else:
-                 documents.append(
-                     {"content": text, "meta": document["meta"] if "meta" in document else {"name": path.name}}
-                 )
+         list_documents = document_rough_split(list_documents)
+         document_number = len(list_documents)
+         # Use at least one document per slice so the range step is never zero.
+         split_len = max(document_number // process_num, 1)
+         document_list = list(range(0, document_number, split_len))
+         if not document_list or document_number > document_list[-1]:
+             document_list.append(document_number)
+         document_combination_index = [(start, end) for start, end in zip(document_list, document_list[1:])]
+         document_mul = run_process(
+             document_combination_index=document_combination_index,
+             list_documents=list_documents,
+             split_text=suffix2splitter[suffix],
+             process_num=process_num,
+             split_paragraphs=split_paragraphs,
+             clean_func=clean_func,
+             path=path,
+             split_answers=split_answers,
+         )
+         for item in document_mul:
+             documents.extend(item)
    if filters is not None and len(filters) > 0:
        documents = clean(documents, filters)
    return documents
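+
+ # Illustrative end-to-end use of the new process_num knob (directory path and
+ # flag values assumed for the example):
+ #     >>> docs = convert_files_to_dicts_splitter("data/", split_paragraphs=True, process_num=10)
+ #     >>> len(docs)  # number of paragraph-level dicts across all files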