|
1 | 1 | #!/usr/bin/env python |
2 | 2 |
|
3 | 3 | import logging |
| 4 | +import sys |
4 | 5 | from pathlib import Path |
5 | 6 |
|
6 | 7 | import requests |
|
10 | 11 | from loaders.pdf import PDFLoader |
11 | 12 | from loaders.web import WebLoader |
12 | 13 |
|
13 | | -# Load environment config |
# Load configuration once at import time; the rest of this module reads
# settings (log level, sources, temp dir, DB provider) from this object.
config = Config.load()
15 | 16 |
|
16 | | -# Configure logging |
# Set up logging. This must run after `config` is loaded because the root
# log level is taken from `config.log_level`.
logging.basicConfig(level=config.log_level)
logger = logging.getLogger(__name__)
19 | 20 |
|
20 | | -# Git-based embedding |
21 | | -if config.repo_sources: |
22 | | - logger.info("Starting Git-based document embedding...") |
23 | | - try: |
24 | | - git_loader = GitLoader(config) |
25 | | - git_chunks = git_loader.load() |
26 | | - |
27 | | - if git_chunks: |
28 | | - logger.info( |
29 | | - "Adding %d document chunks from Git to vector DB", len(git_chunks) |
30 | | - ) |
31 | | - config.db_provider.add_documents(git_chunks) |
32 | | - else: |
33 | | - logger.info("No documents found in Git sources.") |
34 | | - except Exception: |
35 | | - logger.exception("Failed during Git document processing") |
36 | | - |
37 | | -# Separate Web URLs by type |
38 | | -pdf_urls = [url for url in config.web_sources if url.lower().endswith(".pdf")] |
39 | | -html_urls = [url for url in config.web_sources if not url.lower().endswith(".pdf")] |
40 | | - |
41 | | -# HTML URL embedding |
42 | | -if html_urls: |
43 | | - logger.info("Starting HTML-based web document embedding...") |
44 | | - try: |
45 | | - web_loader = WebLoader(config) |
46 | | - web_chunks = web_loader.load(html_urls) |
47 | | - |
48 | | - if web_chunks: |
49 | | - logger.info("Adding %d HTML chunks to vector DB", len(web_chunks)) |
50 | | - config.db_provider.add_documents(web_chunks) |
51 | | - else: |
52 | | - logger.info("No chunks produced from HTML URLs.") |
53 | | - except Exception: |
54 | | - logger.exception("Failed during HTML web document processing") |
55 | | - |
56 | | -# PDF URL embedding |
57 | | -if pdf_urls: |
58 | | - logger.info("Processing PDF documents from web URLs...") |
59 | | - |
60 | | - pdf_dir = Path(config.temp_dir) / "web_pdfs" |
61 | | - pdf_dir.mkdir(parents=True, exist_ok=True) |
62 | | - |
63 | | - downloaded_files = [] |
64 | | - for url in pdf_urls: |
65 | | - try: |
66 | | - response = requests.get(url) |
67 | | - response.raise_for_status() |
68 | 21 |
|
69 | | - filename = Path(url.split("/")[-1]) |
70 | | - file_path = pdf_dir / filename |
71 | | - with open(file_path, "wb") as f: |
72 | | - f.write(response.content) |
def _fail_and_exit(message: str, exc: Exception) -> None:
    """
    Log an error with its traceback and re-raise the exception.

    Despite its name, this helper does not terminate the process itself:
    it always raises ``exc`` and therefore never returns normally. Turning
    the failure into a process exit is left to whoever catches it.

    Args:
        message (str): Contextual message to log with the error.
        exc (Exception): The exception to log and re-raise.

    Raises:
        Exception: Always; ``exc`` is re-raised after being logged.
    """
    # Pass the exception instance rather than ``True`` so the logged
    # traceback always belongs to ``exc``, even if this helper is called
    # outside an ``except`` block or while another exception is active.
    logger.error("%s: %s", message, exc, exc_info=exc)
    raise exc
| 35 | + |
73 | 36 |
|
74 | | - logger.info("Downloaded: %s", file_path) |
75 | | - downloaded_files.append(file_path) |
def _embed_git_sources() -> None:
    """Load chunks from the configured Git sources and add them to the vector DB."""
    logger.info("Starting Git-based document embedding...")
    try:
        git_loader = GitLoader(config)
        git_chunks = git_loader.load()

        if git_chunks:
            logger.info(
                "Adding %d Git document chunks to vector DB", len(git_chunks)
            )
            config.db_provider.add_documents(git_chunks)
        else:
            logger.info("No documents found in Git sources.")
    except Exception as e:
        _fail_and_exit("Failed during Git document processing", e)


def _embed_html_urls(html_urls: list) -> None:
    """Load chunks from the given HTML URLs and add them to the vector DB."""
    logger.info("Starting HTML-based web document embedding...")
    try:
        web_loader = WebLoader(config)
        web_chunks = web_loader.load(html_urls)

        if web_chunks:
            logger.info("Adding %d HTML web chunks to vector DB", len(web_chunks))
            config.db_provider.add_documents(web_chunks)
        else:
            logger.info("No chunks produced from HTML URLs.")
    except Exception as e:
        _fail_and_exit("Failed during HTML web document processing", e)


def _download_pdf(url: str, pdf_dir: Path, timeout: float = 30.0) -> Path:
    """Download ``url`` into ``pdf_dir`` and return the path of the saved file."""
    # Without a timeout, requests waits indefinitely on a stalled server and
    # the whole job hangs. The timeout bounds the connect and per-read waits,
    # not the total download time, so large PDFs are still fine.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # The file name is the last path segment of the URL.
    file_path = pdf_dir / Path(url.split("/")[-1])
    with open(file_path, "wb") as f:
        f.write(response.content)
    return file_path


def _embed_pdf_urls(pdf_urls: list) -> None:
    """Download the given PDF URLs, then load and add their chunks to the vector DB."""
    logger.info("Downloading PDF documents from web URLs...")
    pdf_dir = Path(config.temp_dir) / "web_pdfs"
    pdf_dir.mkdir(parents=True, exist_ok=True)

    downloaded_files = []
    for url in pdf_urls:
        try:
            file_path = _download_pdf(url, pdf_dir)
            logger.info("Downloaded: %s", file_path)
            downloaded_files.append(file_path)
        except Exception as e:
            # Fail fast: the first failed download aborts the job.
            _fail_and_exit(f"Failed to download {url}", e)

    if downloaded_files:
        try:
            pdf_loader = PDFLoader(config)
            pdf_chunks = pdf_loader.load(downloaded_files)

            if pdf_chunks:
                logger.info(
                    "Adding %d PDF web chunks to vector DB", len(pdf_chunks)
                )
                config.db_provider.add_documents(pdf_chunks)
            else:
                logger.info("No chunks produced from downloaded PDFs.")
        except Exception as e:
            _fail_and_exit("Failed during PDF web document processing", e)


def main() -> None:
    """
    Run the embedding job: Git sources first, then HTML URLs, then PDF URLs.

    Each stage is skipped when it has no sources configured. Any failure
    inside a stage is logged and re-raised via ``_fail_and_exit``, so the
    job stops at the first error instead of continuing with later stages.

    Raises:
        Exception: Whatever exception a stage raised.
    """
    # Run Git-based document embedding
    if config.repo_sources:
        _embed_git_sources()

    # Split web sources into HTML and PDF URLs
    pdf_urls = [url for url in config.web_sources if url.lower().endswith(".pdf")]
    html_urls = [url for url in config.web_sources if not url.lower().endswith(".pdf")]

    # Run HTML-based web embedding
    if html_urls:
        _embed_html_urls(html_urls)

    # Run PDF-based web embedding
    if pdf_urls:
        _embed_pdf_urls(pdf_urls)

    logger.info("Embedding job complete.")
| 112 | + |
| 113 | + |
if __name__ == "__main__":
    # Script entry point: run the job and map any failure to exit status 1.
    exit_code = 0
    try:
        main()
    except Exception as fatal:
        # Last-resort handler: record the failure with its traceback.
        logger.critical("Fatal error: %s", fatal, exc_info=True)
        exit_code = 1
    if exit_code:
        sys.exit(exit_code)
0 commit comments