-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload_file.py
More file actions
51 lines (43 loc) · 1.52 KB
/
load_file.py
File metadata and controls
51 lines (43 loc) · 1.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os
import chardet
from docx import Document
from odf.opendocument import load as load_odt
from odf.text import P
from striprtf.striprtf import rtf_to_text
from PyPDF2 import PdfReader
def load_file(file_path: str) -> str:
    """
    Load and return the contents of a text-based file in many formats.

    The format is selected from the lower-cased file extension: ``.docx``,
    ``.odt``, ``.rtf`` and ``.pdf`` are handed to their dedicated parsers;
    any other extension is read as plain text, decoded with the encoding
    guessed by ``chardet`` (UTF-8 when no guess is available).

    Args:
        file_path: Absolute path to an existing regular file.

    Returns:
        The extracted text. In the plain-text fallback, the literal string
        ``"File extension not supported"`` is returned instead when reading
        or encoding detection fails (kept for backward compatibility).

    Raises:
        ValueError: If ``file_path`` is not an absolute path.
        FileNotFoundError: If ``file_path`` does not point to a file.

    Exceptions raised by the format-specific parsers are not caught here.
    """
    if not os.path.isabs(file_path):
        raise ValueError("Please provide an absolute file path.")
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"No such file: {file_path}")

    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".docx":
        doc = Document(file_path)
        return "\n".join(para.text for para in doc.paragraphs)

    if ext == ".odt":
        # Local import: only this branch needs it, and the file already
        # depends on the odf package.
        from odf import teletype

        doc = load_odt(file_path)
        # Paragraph elements carry their text in child nodes, not in a
        # ``data`` attribute of their own; extractText walks those children.
        return "\n".join(
            teletype.extractText(paragraph)
            for paragraph in doc.getElementsByType(P)
        )

    if ext == ".rtf":
        # No explicit encoding is given, so the platform default applies;
        # bytes that cannot be decoded are dropped (errors="ignore").
        with open(file_path, "r", errors="ignore") as f:
            return rtf_to_text(f.read())

    if ext == ".pdf":
        reader = PdfReader(file_path)
        extracted_pages = (page.extract_text() for page in reader.pages)
        # Pages for which extract_text() gives a falsy result are skipped.
        return "\n".join(text for text in extracted_pages if text)

    # Fallback: treat everything else as plain text in an unknown encoding.
    try:
        with open(file_path, "rb") as f:
            raw = f.read()
        encoding = chardet.detect(raw)["encoding"] or "utf-8"
        try:
            return raw.decode(encoding, errors="replace")
        except LookupError:
            # The detector named an encoding Python has no codec for;
            # decode leniently as UTF-8 rather than reporting a failure.
            return raw.decode("utf-8", errors="replace")
    except Exception:
        # Best-effort contract preserved from the original implementation,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        return "File extension not supported"