Skip to content

Commit 6c05e88

Browse files
committed
make sure failures log AND raise exceptions
1 parent 742de80 commit 6c05e88

File tree

1 file changed

+95
-68
lines changed

1 file changed

+95
-68
lines changed

embed_documents.py

Lines changed: 95 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python
22

33
import logging
4+
import sys
45
from pathlib import Path
56

67
import requests
@@ -10,83 +11,109 @@
1011
from loaders.pdf import PDFLoader
1112
from loaders.web import WebLoader
1213

13-
# Load environment config
14+
# Load configuration
1415
config = Config.load()
1516

16-
# Configure logging
17+
# Set up logging
1718
logging.basicConfig(level=config.log_level)
1819
logger = logging.getLogger(__name__)
1920

20-
# Git-based embedding
21-
if config.repo_sources:
22-
logger.info("Starting Git-based document embedding...")
23-
try:
24-
git_loader = GitLoader(config)
25-
git_chunks = git_loader.load()
26-
27-
if git_chunks:
28-
logger.info(
29-
"Adding %d document chunks from Git to vector DB", len(git_chunks)
30-
)
31-
config.db_provider.add_documents(git_chunks)
32-
else:
33-
logger.info("No documents found in Git sources.")
34-
except Exception:
35-
logger.exception("Failed during Git document processing")
36-
37-
# Separate Web URLs by type
38-
pdf_urls = [url for url in config.web_sources if url.lower().endswith(".pdf")]
39-
html_urls = [url for url in config.web_sources if not url.lower().endswith(".pdf")]
40-
41-
# HTML URL embedding
42-
if html_urls:
43-
logger.info("Starting HTML-based web document embedding...")
44-
try:
45-
web_loader = WebLoader(config)
46-
web_chunks = web_loader.load(html_urls)
47-
48-
if web_chunks:
49-
logger.info("Adding %d HTML chunks to vector DB", len(web_chunks))
50-
config.db_provider.add_documents(web_chunks)
51-
else:
52-
logger.info("No chunks produced from HTML URLs.")
53-
except Exception:
54-
logger.exception("Failed during HTML web document processing")
55-
56-
# PDF URL embedding
57-
if pdf_urls:
58-
logger.info("Processing PDF documents from web URLs...")
59-
60-
pdf_dir = Path(config.temp_dir) / "web_pdfs"
61-
pdf_dir.mkdir(parents=True, exist_ok=True)
62-
63-
downloaded_files = []
64-
for url in pdf_urls:
65-
try:
66-
response = requests.get(url)
67-
response.raise_for_status()
6821

69-
filename = Path(url.split("/")[-1])
70-
file_path = pdf_dir / filename
71-
with open(file_path, "wb") as f:
72-
f.write(response.content)
22+
def _fail_and_exit(message: str, exc: Exception) -> None:
    """
    Log an error with the exception's traceback, then raise the exception.

    Despite its name, this helper never calls ``sys.exit``; it always
    raises ``exc``, so it does not return normally.

    Args:
        message (str): Contextual message to log with the error.
        exc (Exception): The exception to log and raise.

    Raises:
        Exception: Always raises ``exc`` itself.
    """
    # Hand the exception object to the logger explicitly. ``exc_info=True``
    # would read ``sys.exc_info()`` instead, which only matches ``exc`` when
    # this helper happens to be called from inside an ``except`` block.
    logger.error("%s: %s", message, exc, exc_info=exc)
    raise exc
35+
7336

74-
logger.info("Downloaded: %s", file_path)
75-
downloaded_files.append(file_path)
37+
# (connect, read) timeouts in seconds for PDF downloads. Without a timeout
# ``requests.get`` can block forever on an unresponsive server.
# NOTE(review): values are a conservative guess - tune for your sources.
_PDF_DOWNLOAD_TIMEOUT = (10, 120)


def _store_chunks(chunks, adding_message: str, empty_message: str) -> None:
    """Add *chunks* to the vector DB, or log *empty_message* if there are none."""
    if chunks:
        # ``adding_message`` is a %-style format taking the chunk count.
        logger.info(adding_message, len(chunks))
        config.db_provider.add_documents(chunks)
    else:
        logger.info(empty_message)


def _unique_pdf_name(name: str, used_names: set) -> str:
    """Return *name*, or a numbered variant of it, not yet in *used_names*."""
    stem, suffix = Path(name).stem, Path(name).suffix
    candidate, counter = name, 1
    while candidate in used_names:
        candidate = f"{stem}_{counter}{suffix}"
        counter += 1
    used_names.add(candidate)
    return candidate


def _download_pdfs(pdf_urls, pdf_dir: Path) -> list:
    """Download every URL in *pdf_urls* into *pdf_dir* and return the file paths."""
    downloaded_files = []
    used_names = set()
    for url in pdf_urls:
        try:
            response = requests.get(url, timeout=_PDF_DOWNLOAD_TIMEOUT)
            response.raise_for_status()

            # Two URLs may share the same last path segment; give each its
            # own file so a later download cannot overwrite an earlier one.
            file_path = pdf_dir / _unique_pdf_name(url.split("/")[-1], used_names)
            file_path.write_bytes(response.content)

            logger.info("Downloaded: %s", file_path)
            downloaded_files.append(file_path)
        except Exception as e:
            _fail_and_exit(f"Failed to download {url}", e)
    return downloaded_files


def main() -> None:
    """
    Run the embedding job: Git sources, then HTML URLs, then PDF URLs.

    Each stage loads chunks with its loader and adds them to
    ``config.db_provider``. Any failure in a stage is logged and re-raised
    through ``_fail_and_exit``, so the first error aborts the job.
    """
    # Run Git-based document embedding
    if config.repo_sources:
        logger.info("Starting Git-based document embedding...")
        try:
            _store_chunks(
                GitLoader(config).load(),
                "Adding %d Git document chunks to vector DB",
                "No documents found in Git sources.",
            )
        except Exception as e:
            _fail_and_exit("Failed during Git document processing", e)

    # Split web sources into HTML and PDF URLs
    pdf_urls, html_urls = [], []
    for url in config.web_sources:
        (pdf_urls if url.lower().endswith(".pdf") else html_urls).append(url)

    # Run HTML-based web embedding
    if html_urls:
        logger.info("Starting HTML-based web document embedding...")
        try:
            _store_chunks(
                WebLoader(config).load(html_urls),
                "Adding %d HTML web chunks to vector DB",
                "No chunks produced from HTML URLs.",
            )
        except Exception as e:
            _fail_and_exit("Failed during HTML web document processing", e)

    # Run PDF-based web embedding
    if pdf_urls:
        logger.info("Downloading PDF documents from web URLs...")
        pdf_dir = Path(config.temp_dir) / "web_pdfs"
        pdf_dir.mkdir(parents=True, exist_ok=True)

        downloaded_files = _download_pdfs(pdf_urls, pdf_dir)

        if downloaded_files:
            try:
                _store_chunks(
                    PDFLoader(config).load(downloaded_files),
                    "Adding %d PDF web chunks to vector DB",
                    "No chunks produced from downloaded PDFs.",
                )
            except Exception as e:
                _fail_and_exit("Failed during PDF web document processing", e)

    logger.info("Embedding job complete.")
112+
113+
114+
if __name__ == "__main__":
    # Script entry point: run the job and turn any uncaught exception into
    # a logged critical error plus a non-zero exit status.
    try:
        main()
    except Exception as fatal:
        logger.critical("Fatal error: %s", fatal, exc_info=True)
        sys.exit(1)

0 commit comments

Comments
 (0)