-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload_file.py
More file actions
61 lines (51 loc) · 1.76 KB
/
load_file.py
File metadata and controls
61 lines (51 loc) · 1.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
"""
load_file.py
Convert a file into text format (rax text, file handling, audio transcription...)
"""
import os
import chardet
from docx import Document
from odf.opendocument import load as load_odt
from odf.text import P
from striprtf.striprtf import rtf_to_text
from PyPDF2 import PdfReader
def load_file(file_path: str) -> str:
    """
    Load and return the contents of a text-based file in many formats.

    The parser is chosen from the lower-cased file extension:

    * ``.docx`` - paragraphs read through ``docx.Document``.
    * ``.odt``  - ``text:p`` paragraphs read through odfpy.
    * ``.rtf``  - markup stripped with ``rtf_to_text``.
    * ``.pdf``  - text of every page that yields any, via ``PdfReader``.
    * anything else - raw bytes decoded with the encoding guessed by
      ``chardet`` (UTF-8 when chardet has no guess), replacing bytes that
      cannot be decoded.

    Paragraphs / pages are joined with a newline.

    Args:
        file_path (str): The absolute path to the file.

    Returns:
        str: The text content of the file or an error message. An error
        message is returned for a non-absolute path, for a missing file, and
        for any exception raised while reading/decoding in the generic
        fallback branch.

    Raises:
        Exception: errors raised by the docx / odt / rtf / pdf parsers are
        not caught here and propagate to the caller.
    """
    if not os.path.isabs(file_path):
        return "Please provide an absolute file path."
    if not os.path.isfile(file_path):
        return f"No such file: {file_path}"

    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".docx":
        doc = Document(file_path)
        return "\n".join(para.text for para in doc.paragraphs)

    if ext == ".odt":
        # Imported here because only this branch needs it.
        from odf import teletype

        doc = load_odt(file_path)
        # Paragraph elements have no ``.data`` attribute (only their text
        # child nodes do), so the text must be gathered from the element's
        # descendants; ``extractText`` does exactly that.
        return "\n".join(
            teletype.extractText(paragraph)
            for paragraph in doc.getElementsByType(P)
        )

    if ext == ".rtf":
        # Undecodable bytes are dropped rather than aborting the conversion.
        with open(file_path, "r", errors="ignore") as f:
            return rtf_to_text(f.read())

    if ext == ".pdf":
        reader = PdfReader(file_path)
        page_texts = (page.extract_text() for page in reader.pages)
        # Skip pages for which no text could be extracted (None or "").
        return "\n".join(text for text in page_texts if text)

    # Generic fallback: treat the file as plain text in an unknown encoding.
    try:
        with open(file_path, "rb") as f:
            raw = f.read()
        detected = chardet.detect(raw)
        # chardet reports None when it cannot guess (e.g. an empty file).
        encoding = detected["encoding"] or "utf-8"
        return raw.decode(encoding, errors="replace")
    except Exception as e:
        return f"Error while uploading script: {e}"