Commit 6e9d465

update processing (#7743)
* update processing
* modify pdf
* modify pdf
* modify pdf
* modify pdf
* update pdf
1 parent b055be6 commit 6e9d465

2 files changed (+125, -84 lines)

pipelines/pipelines/nodes/file_converter/pdf.py

Lines changed: 13 additions & 53 deletions
@@ -40,9 +40,13 @@ def extract_pages(page_list, file_path):
     end = page_list[1]
     page_text = []
     pdf = pypdf.PdfReader(file_path)
-    for page in pdf.pages[start:end]:
-        paragraphs = page.extract_text()
-        page_text.append(paragraphs)
+    for index, page in enumerate(pdf.pages[start:end]):
+        try:
+            paragraphs = page.extract_text()
+            paragraphs = paragraphs.encode("UTF-8", "ignore").decode("UTF-8")
+            page_text.append(paragraphs)
+        except Exception as e:
+            logger.warning("Page %d of the file cannot be parsed correctly %s" % (index + start + 1, str(e)))
     return page_text
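The rewritten loop trades strictness for robustness: a page that pypdf cannot parse is now logged and skipped instead of aborting the whole file, and the UTF-8 round trip drops characters that would break downstream encoders. A self-contained sketch of the same pattern (logging setup and usage are illustrative, not part of the commit):

    import logging

    import pypdf

    logger = logging.getLogger(__name__)

    def extract_pages(page_list, file_path):
        # page_list holds zero-based (start, end) page indices.
        start, end = page_list[0], page_list[1]
        page_text = []
        pdf = pypdf.PdfReader(file_path)
        for index, page in enumerate(pdf.pages[start:end]):
            try:
                paragraphs = page.extract_text()
                # Drop anything that cannot survive a UTF-8 round trip.
                page_text.append(paragraphs.encode("UTF-8", "ignore").decode("UTF-8"))
            except Exception as e:
                # One corrupt page no longer fails the whole document.
                logger.warning("Page %d of the file cannot be parsed correctly %s", index + start + 1, e)
        return page_text

    # Hypothetical usage: text of the first three pages of sample.pdf
    # print(extract_pages((0, 3), "sample.pdf"))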

@@ -109,58 +113,14 @@ def convert(
         not one of the valid languages, then it might likely be encoding error resulting
         in garbled text.
         """
-        pages = self._read_pdf(file_path, layout=False, process_num=process_num)
-        if remove_numeric_tables is None:
-            remove_numeric_tables = self.remove_numeric_tables
-        if valid_languages is None:
-            valid_languages = self.valid_languages
-        if language is None:
-            language = self.language
-        cleaned_pages = []
+        pages = self._read_pdf(file_path, process_num=process_num)
+        documents = []
         for page in pages:
-            # pdftotext tool provides an option to retain the original physical layout of a PDF page. This behaviour
-            # can be toggled by using the layout param.
-            #  layout=True
-            #      + table structures get retained better
-            #      - multi-column pages(eg, research papers) gets extracted with text from multiple columns on same line
-            #  layout=False
-            #      + keeps strings in content stream order, hence multi column layout works well
-            #      - cells of tables gets split across line
-            #
-            # Here, as a "safe" default, layout is turned off.
-            lines = page.splitlines()
-
-            cleaned_lines = []
-            for line in lines:
-                if self.language == "chinese":
-                    words = list(line)
-                else:
-                    words = line.split()
-                digits = [word for word in words if any(i.isdigit() for i in word)]
-
-                # remove lines having > 40% of words as digits AND not ending with a period(.)
-                if remove_numeric_tables:
-                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
-                        logger.debug(f"Removing line '{line}' from {file_path}")
-                        continue
-                cleaned_lines.append(line)
-
-            page = "\n".join(cleaned_lines)
-            cleaned_pages.append(page)
-
-        if valid_languages:
-            document_text = "".join(cleaned_pages)
-            if not self.validate_language(document_text, valid_languages):
-                logger.warning(
-                    f"The language for {file_path} is not one of {valid_languages}. The file may not have "
-                    f"been decoded in the correct text format."
-                )
-
-        text = "\f".join(cleaned_pages)
-        document = {"content": text, "content_type": "text", "meta": meta}
-        return [document]
+            document = {"content": page, "content_type": "text", "meta": meta}
+            documents.append(document)
+        return documents

-    def _read_pdf(self, file_path: Path, layout: bool, process_num: int) -> List[str]:
+    def _read_pdf(self, file_path: Path, process_num: int) -> List[str]:
         """
         Extract pages from the pdf file at file_path.
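After this hunk, convert() returns one document dict per PDF page rather than a single dict whose pages were joined with form feeds, and the numeric-table filtering and language validation that used to live here are gone. A hedged sketch of the new output shape (the converter class name is an assumption, it is not visible in this diff):

    from pathlib import Path

    converter = PDFToTextConverter()  # assumed class name for the converter this file defines
    docs = converter.convert(file_path=Path("sample.pdf"), meta=None)
    # docs == [
    #     {"content": "<text of page 1>", "content_type": "text", "meta": None},
    #     {"content": "<text of page 2>", "content_type": "text", "meta": None},
    #     ...
    # ]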

pipelines/pipelines/utils/preprocessing.py

Lines changed: 112 additions & 31 deletions
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import functools
 import logging
+import multiprocessing
+import os
 import re
 from pathlib import Path
 from typing import Callable, Dict, List, Optional
@@ -35,6 +38,85 @@
 )

 logger = logging.getLogger(__name__)
+import copy
+
+
+def document_rough_split(document_list, max_token=4500):
+    document_index_rough = []
+    for item in document_list:
+        if len(item["content"]) < max_token or "\n" in item:
+            document_index_rough.append(item)
+        else:
+            all_token = len(item["content"])
+            token_index = [i for i in range(0, all_token + 1, max_token)]
+            if all_token > token_index[-1]:
+                token_index.append(all_token)
+            token_index_combine = [item["content"][start:end] for start, end in zip(token_index, token_index[1:])]
+            for txt in token_index_combine:
+                txt_split = copy.deepcopy(item)
+                txt_split["content"] = txt
+                document_index_rough.append(txt_split)
+    return document_index_rough
+
+
+def split_document(document_index, all_document, split_text, split_paragraphs: bool, clean_func, path, split_answers):
+    start = document_index[0]
+    end = document_index[1]
+    documents = []
+    for item in all_document[start:end]:
+        text = item["content"]
+        if clean_func:
+            text = clean_func(text)
+        if split_paragraphs is True:
+            text_splits = split_text.split_text(text)
+            for txt in text_splits:
+                if not txt.strip():  # skip empty paragraphs
+                    continue
+                if split_answers:
+                    query, answer = txt.split("\t")
+                    meta_data = {"name": path.name, "answer": answer}
+                    # Add image list parsed from docx into meta
+                    if item["meta"] is not None and "images" in item["meta"]:
+                        meta_data["images"] = item["meta"]["images"]
+                    documents.append({"content": query, "meta": meta_data})
+                else:
+                    meta_data = {
+                        "name": path.name,
+                    }
+                    # Add image list parsed from docx into meta
+                    if item["meta"] is not None and "images" in item["meta"]:
+                        meta_data["images"] = item["meta"]["images"]
+                    documents.append({"content": txt, "meta": meta_data})
+        else:
+            documents.append({"content": text, "meta": item["meta"] if "meta" in item else {"name": path.name}})
+    return documents
+
+
+def run_process(
+    document_combination_index,
+    list_documents,
+    split_text,
+    process_num,
+    split_paragraphs,
+    clean_func,
+    path,
+    split_answers,
+):
+    process_num = min(os.cpu_count(), process_num)
+    pool = multiprocessing.Pool(process_num)
+    split_document_c = functools.partial(
+        split_document,
+        all_document=list_documents,
+        split_text=split_text,
+        split_paragraphs=split_paragraphs,
+        clean_func=clean_func,
+        path=path,
+        split_answers=split_answers,
+    )
+    result = pool.map_async(split_document_c, document_combination_index)
+    pool.close()
+    pool.join()
+    return result.get()


 def convert_files_to_dicts(
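document_rough_split caps every document at max_token characters before parallel splitting, so no single worker receives an arbitrarily long string, and run_process fans the resulting (start, end) shards out over a multiprocessing.Pool through functools.partial. (Note that the check "\n" in item looks at the dict's keys, not at item["content"].) The slicing arithmetic, worked through with hypothetical numbers:

    # A 10,000-character document with the default max_token=4500:
    all_token = 10_000
    token_index = list(range(0, all_token + 1, 4500))  # [0, 4500, 9000]
    if all_token > token_index[-1]:
        token_index.append(all_token)                  # [0, 4500, 9000, 10000]
    slices = list(zip(token_index, token_index[1:]))
    # -> [(0, 4500), (4500, 9000), (9000, 10000)]; the tail chunk is simply shorter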
@@ -43,6 +125,7 @@ def convert_files_to_dicts(
     split_paragraphs: bool = False,
     split_answers: bool = False,
     encoding: Optional[str] = None,
+    process_num: int = 20,
 ) -> List[dict]:
     """
     Convert all files(.txt, .pdf, .docx) in the sub-directories of the given path to Python dicts that can be written to a
@@ -136,6 +219,7 @@ def convert_files_to_dicts_splitter(
     chunk_size: int = 300,
     chunk_overlap: int = 0,
     language: str = "chinese",
+    process_num: int = 10,
 ) -> List[dict]:
     """
     Convert all files(.txt, .pdf, .docx) in the sub-directories of the given path to Python dicts that can be written to a
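Both entry points gain a process_num knob (default 20 for convert_files_to_dicts, 10 for the splitter variant); run_process later caps it at os.cpu_count(). A hypothetical call, where every argument name not shown in this diff is assumed rather than quoted:

    dicts = convert_files_to_dicts_splitter(
        dir_path="corpus/",   # assumed name of the input-directory parameter
        split_paragraphs=True,
        chunk_size=300,
        language="chinese",
        process_num=4,        # spawn at most 4 worker processes
    )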
@@ -184,6 +268,9 @@ def convert_files_to_dicts_splitter(
         docx_splitter = SpacyTextSplitter(
             separator=separator, filters=filters, chunk_size=chunk_size, chunk_overlap=chunk_overlap
         )
+        pdf_splitter = SpacyTextSplitter(
+            separator=separator, chunk_size=chunk_size, chunk_overlap=chunk_overlap, filters=filters
+        )
     else:
         docx_splitter = SpacyTextSplitter(
             separator=separator,
@@ -192,12 +279,13 @@ def convert_files_to_dicts_splitter(
             chunk_overlap=chunk_overlap,
             pipeline="en_core_web_sm",
         )
+        pdf_splitter = SpacyTextSplitter(
+            separator=separator, chunk_size=chunk_size, chunk_overlap=chunk_overlap, filters=filters
+        )
     text_splitter = CharacterTextSplitter(
         separator=separator, chunk_size=chunk_size, chunk_overlap=chunk_overlap, filters=filters
     )
-    pdf_splitter = CharacterTextSplitter(
-        separator=separator, chunk_size=chunk_size, chunk_overlap=chunk_overlap, filters=filters
-    )
+
     imgage_splitter = CharacterTextSplitter(
         separator=separator, chunk_size=chunk_size, chunk_overlap=chunk_overlap, filters=filters
     )
@@ -230,34 +318,27 @@ def convert_files_to_dicts_splitter(
                 encoding=encoding,
                 language=language,
             )
-            for document in list_documents:
-                text = document["content"]
-                if clean_func:
-                    text = clean_func(text)
-                if split_paragraphs is True:
-                    text_splits = suffix2splitter[suffix].split_text(text)
-                    for txt in text_splits:
-                        if not txt.strip():  # skip empty paragraphs
-                            continue
-                        if split_answers:
-                            query, answer = txt.split("\t")
-                            meta_data = {"name": path.name, "answer": answer}
-                            # Add image list parsed from docx into meta
-                            if document["meta"] is not None and "images" in document["meta"]:
-                                meta_data["images"] = document["meta"]["images"]
-                            documents.append({"content": query, "meta": meta_data})
-                        else:
-                            meta_data = {
-                                "name": path.name,
-                            }
-                            # Add image list parsed from docx into meta
-                            if document["meta"] is not None and "images" in document["meta"]:
-                                meta_data["images"] = document["meta"]["images"]
-                            documents.append({"content": txt, "meta": meta_data})
-                else:
-                    documents.append(
-                        {"content": text, "meta": document["meta"] if "meta" in document else {"name": path.name}}
-                    )
+            list_documents = document_rough_split(list_documents)
+            document_number = len(list_documents)
+            split_len = document_number // process_num
+            if split_len == 0:
+                split_len = document_number
+            document_list = [i for i in range(0, document_number, split_len)]
+            if document_number > document_list[-1]:
+                document_list.append(document_number)
+            document_combination_index = [(start, end) for start, end in zip(document_list, document_list[1:])]
+            document_mul = run_process(
+                document_combination_index=document_combination_index,
+                list_documents=list_documents,
+                split_text=suffix2splitter[suffix],
+                process_num=process_num,
+                split_paragraphs=split_paragraphs,
+                clean_func=clean_func,
+                path=path,
+                split_answers=split_answers,
+            )
+            for item in document_mul:
+                documents.extend(item)
     if filters is not None and len(filters) > 0:
         documents = clean(documents, filters)
     return documents
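The per-suffix loop now shards list_documents into contiguous (start, end) slices, one batch per worker, instead of splitting documents serially in place. The partitioning, traced with hypothetical numbers:

    # 7 rough-split documents across process_num=3 workers:
    document_number, process_num = 7, 3
    split_len = document_number // process_num                   # 2
    document_list = list(range(0, document_number, split_len))   # [0, 2, 4, 6]
    if document_number > document_list[-1]:
        document_list.append(document_number)                    # [0, 2, 4, 6, 7]
    pairs = list(zip(document_list, document_list[1:]))
    # -> [(0, 2), (2, 4), (4, 6), (6, 7)]; every document is covered exactly once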
