Documentation for implementing custom preprocessor

**Is your feature request related to a problem? Please describe.**
I need to change the splitting criteria for my document so I have made this changes. But it seems this is not working. Any issue with this code.

```
import re
import nltk
from nltk.tokenize import word_tokenize
from haystack.nodes import PreProcessor

from typing import List, Optional, Generator, Set, Union, Tuple, Dict, Literal

import logging
import re
from copy import deepcopy
from functools import partial, reduce
from itertools import chain
import warnings
from pathlib import Path
from pickle import UnpicklingError

from tqdm import tqdm
from more_itertools import windowed

from haystack.nodes.preprocessor.base import BasePreProcessor
from haystack.errors import HaystackError
from haystack.schema import Document


logger = logging.getLogger(__name__)

class CustomPreprocessor(PreProcessor):
    def __init__(self, min_word_count, max_word_count):
        super().__init__()
        self.min_word_count = min_word_count
        self.max_word_count = max_word_count
        # self.split_by = "passage"
        self.split_respect_sentence_boundary = False
        print("self.clean_whitespace", self.clean_whitespace)
        # print("self.split_by", self.split_by)

        def split_and_check_length(self, input_text):
          # Split text into paragraphs using .\n only if there is no number before the dot
          paragraphs = re.split(r'(?<!\d)(?<=\.\n)(?=\s+\d|\s+[A-Z])', input_text)
          # Initialize result list
          result = []
          
          # Initialize the sentence tokenizer
          sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
          
          if len(paragraphs) < 50:
              paragraphs = re.split(r'(?<!\d)(?<=\n)(?=\s+\d|\s+[A-Z])', input_text)
          # Iterate through paragraphs
          for paragraph in paragraphs:
              # Remove extra spaces for length check
              temp_paragraph = ' '.join(paragraph.split())
              
              # Tokenize the paragraph into words
              words = word_tokenize(temp_paragraph)
              word_count = len(temp_paragraph.split())        
              # Check if the word count in the paragraph is less than min_word_count
              if word_count < self.min_word_count:
                  # Check the size of both the current paragraph and the combined paragraph
                  if result:
                      combined_paragraph = result[-1] + ".\n" + paragraph
                      result[-1] = combined_paragraph
                  else:
                      # If result is empty, just append the paragraph
                      result.append(paragraph)
              else:
                  result.append(paragraph)
          # print(len(result), file_path)
          return result
        def _create_docs_from_splits(
          self,
          text_splits: List[str],
          splits_pages: List[int],
          splits_start_idxs: List[int],
          headlines: List[Dict],
          meta: Dict,
          split_overlap: int,
          id_hash_keys=Optional[List[str]],
      ) -> List[Document]:
          """
          Creates Document objects from text splits enriching them with page number and headline information if given.
          """
          documents: List[Document] = []

          earliest_rel_hl = 0
          for i, txt in enumerate(text_splits):
              meta = deepcopy(meta)
              doc = Document(content=txt, meta=meta, id_hash_keys=id_hash_keys)
              doc.meta["_split_id"] = i
              
              documents.append(doc)

          return documents

        def split(
          self,
          document: Union[dict, Document],
          split_by: Optional[Literal["word", "sentence", "passage"]],
          split_length: int,
          split_overlap: int,
          split_respect_sentence_boundary: bool,
          id_hash_keys: Optional[List[str]] = None,) -> List[Document]:
          """Perform document splitting on a single document. This method can split on different units, at different lengths,
          with different strides. It can also respect sentence boundaries. Its exact functionality is defined by
          the parameters passed into PreProcessor.__init__(). Takes a single document as input and returns a list of documents.
          """
          print("Splitting ------>>>>>>!!!!")
          if id_hash_keys is None:
              id_hash_keys = self.id_hash_keys

          if isinstance(document, dict):
              document["id_hash_keys"] = id_hash_keys
              document = Document.from_dict(document)

          # Mainly needed for type checking
          if not isinstance(document, Document):
              raise HaystackError("Document must not be of type 'dict' but of type 'Document'.")

          if not split_by:
              return [document]

          if not split_length:
              raise Exception("split_length needs be set when using split_by.")

          if split_respect_sentence_boundary and split_by != "word":
              raise NotImplementedError("'split_respect_sentence_boundary=True' is only compatible with split_by='word'.")

          if type(document.content) is not str:
              logger.error("Document content is not of type str. Nothing to split.")
              return [document]

          text = document.content
          headlines = document.meta["headlines"] if "headlines" in document.meta else []

          # logger.info("HEadlines <<<<<<<<<<<<<<<_______________________>>>>>>>>>>>>>", headlines)
          # if split_respect_sentence_boundary and split_by == "word":
          #     text_splits, splits_pages, splits_start_idxs = self._split_by_word_respecting_sent_boundary(
          #         text=text, split_length=split_length, split_overlap=split_overlap
          #     )
          # else:
          #     # create individual "elements" of passage, sentence, or word
          #     elements, split_at = self._split_into_units(text=text, split_by=split_by)

          #     # concatenate individual elements based on split_length & split_stride
          #     text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
          #         elements=elements, split_length=split_length, split_overlap=split_overlap, split_at=split_at
          #     )

          text_splits = self.split_and_check_length(text)

          # create new document dicts for each text split
          documents = self._create_docs_from_splits(
              text_splits=text_splits,
              splits_pages=None,
              splits_start_idxs=None,
              headlines=headlines,
              meta=document.meta or {},
              split_overlap=split_overlap,
              id_hash_keys=id_hash_keys,
          )

          return documents

```




Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Documentation for implementing custom preprocessor #6303

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Documentation for implementing custom preprocessor #6303

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions