Description
Is your feature request related to a problem? Please describe.
I need to change the splitting criteria for my documents, so I made the changes below. However, it does not seem to be working. Is there an issue with this code?
import logging
import re
from copy import deepcopy
from typing import Dict, List, Literal, Optional, Union

from haystack.errors import HaystackError
from haystack.nodes import PreProcessor
from haystack.schema import Document

logger = logging.getLogger(__name__)
class CustomPreprocessor(PreProcessor):
    def __init__(self, min_word_count, max_word_count):
        super().__init__()
        self.min_word_count = min_word_count
        self.max_word_count = max_word_count
        # Disable the default sentence-boundary handling; splitting is done by
        # the custom paragraph logic in split_and_check_length() below.
        self.split_respect_sentence_boundary = False
    def split_and_check_length(self, input_text):
        # Split into paragraphs at ".\n", but only when the character before the
        # dot is not a digit (so numbered headings like "3.1\n" stay intact).
        # The negative lookbehind has to cover the whole digit-dot-newline
        # sequence: a bare (?<!\d) placed next to (?<=\.\n) always inspects the
        # newline, never the digit, and so never blocks a split.
        paragraphs = re.split(r"(?<!\d\.\n)(?<=\.\n)(?=\s+\d|\s+[A-Z])", input_text)

        # Fall back to splitting on any newline if the text yields too few paragraphs.
        if len(paragraphs) < 50:
            paragraphs = re.split(r"(?<!\d\n)(?<=\n)(?=\s+\d|\s+[A-Z])", input_text)

        result = []
        for paragraph in paragraphs:
            # Collapse whitespace for the length check only; keep the original text.
            temp_paragraph = " ".join(paragraph.split())
            word_count = len(temp_paragraph.split())
            # Merge paragraphs shorter than min_word_count into the previous split.
            if word_count < self.min_word_count and result:
                result[-1] = result[-1] + ".\n" + paragraph
            else:
                result.append(paragraph)
        return result
    def _create_docs_from_splits(
        self,
        text_splits: List[str],
        splits_pages: Optional[List[int]],
        splits_start_idxs: Optional[List[int]],
        headlines: List[Dict],
        meta: Dict,
        split_overlap: int,
        id_hash_keys: Optional[List[str]] = None,
    ) -> List[Document]:
        """
        Creates Document objects from text splits, enriching them with page-number
        and headline information if given.
        """
        documents: List[Document] = []
        for i, txt in enumerate(text_splits):
            # Each split gets its own copy of the metadata plus a running split id.
            doc = Document(content=txt, meta=deepcopy(meta), id_hash_keys=id_hash_keys)
            doc.meta["_split_id"] = i
            documents.append(doc)
        return documents
    def split(
        self,
        document: Union[dict, Document],
        split_by: Optional[Literal["word", "sentence", "passage"]],
        split_length: int,
        split_overlap: int,
        split_respect_sentence_boundary: bool,
        id_hash_keys: Optional[List[str]] = None,
    ) -> List[Document]:
        """Perform document splitting on a single document.

        This method can split on different units, at different lengths, and with
        different strides. Its exact behaviour is defined by the parameters passed
        into PreProcessor.__init__(). Takes a single document as input and returns
        a list of documents.
        """
        logger.debug("CustomPreprocessor.split() called")
        if id_hash_keys is None:
            id_hash_keys = self.id_hash_keys

        if isinstance(document, dict):
            document["id_hash_keys"] = id_hash_keys
            document = Document.from_dict(document)

        # Mainly needed for type checking
        if not isinstance(document, Document):
            raise HaystackError("Document must be of type 'Document', not 'dict'.")

        if not split_by:
            return [document]
        if not split_length:
            raise Exception("split_length needs to be set when using split_by.")
        if split_respect_sentence_boundary and split_by != "word":
            raise NotImplementedError("'split_respect_sentence_boundary=True' is only compatible with split_by='word'.")
        if not isinstance(document.content, str):
            logger.error("Document content is not of type str. Nothing to split.")
            return [document]

        text = document.content
        headlines = document.meta["headlines"] if "headlines" in document.meta else []

        # The default word/sentence/passage splitting of PreProcessor is bypassed
        # here in favour of the custom paragraph splitting above.
        text_splits = self.split_and_check_length(text)

        # Create a new Document for each text split.
        documents = self._create_docs_from_splits(
            text_splits=text_splits,
            splits_pages=None,
            splits_start_idxs=None,
            headlines=headlines,
            meta=document.meta or {},
            split_overlap=split_overlap,
            id_hash_keys=id_hash_keys,
        )
        return documents
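
For reference, here is a minimal sketch of how the class could be exercised directly. The sample text, the parameter values, and the split_by="passage" choice are illustrative assumptions, not taken from my actual pipeline:

# Minimal usage sketch; sample text and parameter values are assumptions.
preprocessor = CustomPreprocessor(min_word_count=10, max_word_count=200)

sample = Document(
    content=(
        "1 Introduction.\n This is the first paragraph of the report.\n"
        "It contains several sentences.\n"
        " 2 Methods.\n A second paragraph follows here.\n"
    )
)

# split_by must be truthy, otherwise split() returns the document unchanged.
docs = preprocessor.split(
    document=sample,
    split_by="passage",
    split_length=1,
    split_overlap=0,
    split_respect_sentence_boundary=False,
)
for doc in docs:
    print(doc.meta["_split_id"], repr(doc.content[:60]))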