Ai-Personnal-Assistant/load_file.py at main · neoluigi4123/Ai-Personnal-Assistant · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os
import chardet
from docx import Document
from odf.opendocument import load as load_odt
from odf.text import P
from striprtf.striprtf import rtf_to_text
from PyPDF2 import PdfReader

def load_file(file_path: str) -> str:
    """
    Load and return the contents of a text-based file in many formats.

    The parser is selected from the file extension (case-insensitive):
    ``.docx``, ``.odt``, ``.rtf`` and ``.pdf`` are handed to their dedicated
    readers; any other extension is read as plain text.

    Args:
        file_path: Absolute path of the file to read.

    Returns:
        The extracted text. Paragraphs (docx/odt) and pages (pdf) are joined
        with newlines. If a plain-text file cannot be read or decoded, the
        string ``"File extension not supported"`` is returned instead of
        raising.

    Raises:
        ValueError: If ``file_path`` is not an absolute path.
        FileNotFoundError: If ``file_path`` is not an existing regular file.
    """
    if not os.path.isabs(file_path):
        raise ValueError("Please provide an absolute file path.")

    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"No such file: {file_path}")

    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".docx":
        doc = Document(file_path)
        return "\n".join(para.text for para in doc.paragraphs)

    if ext == ".odt":
        # Imported here so the fix is self-contained; ``odf`` is already a
        # dependency of this module.
        from odf import teletype

        doc = load_odt(file_path)
        # Paragraph elements have no ``.data`` attribute (only text nodes
        # do); extractText walks each paragraph's children and returns its
        # text.
        return "\n".join(
            teletype.extractText(paragraph)
            for paragraph in doc.getElementsByType(P)
        )

    if ext == ".rtf":
        # No encoding is passed, so the platform default text encoding is
        # used; bytes that cannot be decoded are dropped (errors="ignore").
        with open(file_path, "r", errors="ignore") as f:
            return rtf_to_text(f.read())

    if ext == ".pdf":
        reader = PdfReader(file_path)
        page_texts = (page.extract_text() for page in reader.pages)
        # Pages that yield no text (None or "") are skipped.
        return "\n".join(text for text in page_texts if text)

    # Any other extension: treat the file as plain text.
    try:
        with open(file_path, "rb") as f:
            raw = f.read()

        # A strict UTF-8 decode is a reliable validity check, whereas
        # statistical detection can misjudge short inputs. NUL bytes hint at
        # UTF-16/32, so those files are left to the detector.
        if b"\x00" not in raw:
            try:
                # "utf-8-sig" also strips a leading BOM when present.
                return raw.decode("utf-8-sig")
            except UnicodeDecodeError:
                pass

        detected = chardet.detect(raw)
        encoding = detected["encoding"] or "utf-8"
        return raw.decode(encoding, errors="replace")
    except (OSError, LookupError, ValueError):
        # Best-effort fallback kept for callers that expect a string back:
        # read failures and unknown/invalid encodings end up here.
        return "File extension not supported"