|
| 1 | +from typing import List |
| 2 | +try: |
| 3 | + from unstructured.partition.md import partition_md |
| 4 | +except ImportError: |
| 5 | + raise ImportError( |
| 6 | + "Could not import unstructured package. " |
| 7 | + "Please install it with `pip install 'unstructured[pdf] @ git+https://github.com/clarifai/unstructured.git@support_clarifai_model'`." |
| 8 | + ) |
| 9 | + |
| 10 | +from clarifai_datautils.constants.pipeline import MAX_CHARACTERS |
| 11 | + |
| 12 | +from .basetransform import BaseTransform |
| 13 | + |
| 14 | + |
class MarkdownPartition(BaseTransform):
    """Transform that runs ``partition_md`` over Markdown files and collects the results."""

    def __init__(self,
                 chunking_strategy: str = "basic",
                 max_characters=MAX_CHARACTERS,
                 overlap=None,
                 overlap_all=True,
                 **kwargs):
        """Validates the chunking strategy and stores the partitioning options.

        Args:
            chunking_strategy (str): Either 'basic' or 'by_title'.
            max_characters (int): Maximum number of characters in a chunk.
            overlap (int): Number of characters to overlap between chunks.
            overlap_all (bool): Whether to overlap all chunks.
            kwargs: Extra keyword arguments forwarded unchanged to
                ``partition_md`` on every call.

        Raises:
            ValueError: If chunking_strategy is not one of the supported values.

        """
        supported_strategies = ("basic", "by_title")
        if chunking_strategy not in supported_strategies:
            raise ValueError("chunking_strategy should be either 'basic' or 'by_title'.")

        # Plain attribute storage; nothing is computed until the transform is called.
        self.kwargs = kwargs
        self.overlap_all = overlap_all
        self.overlap = overlap
        self.max_characters = max_characters
        self.chunking_strategy = chunking_strategy

    def __call__(self, elements: List[str]) -> List[str]:
        """Partitions every given Markdown file and flattens the results.

        Args:
            elements (List[str]): Entries that are each handed to
                ``partition_md`` as its ``filename`` argument.

        Returns:
            A single list holding the items returned by ``partition_md`` for
            each entry, in input order.

        """
        # One partition_md call per entry; its output is flattened into one list.
        return [
            chunk
            for path in elements
            for chunk in partition_md(
                filename=path,
                chunking_strategy=self.chunking_strategy,
                max_characters=self.max_characters,
                overlap=self.overlap,
                overlap_all=self.overlap_all,
                **self.kwargs)
        ]
0 commit comments