Skip to content

Commit 2025a26

Browse files
fix: ensure parameters in RagTool.add, add typing, tests (#3979)
* fix: ensure parameters in RagTool.add, add typing, tests
* feat: substitute pymupdf for pypdf, better parsing performance

---------

Co-authored-by: Lorenze Jay <[email protected]>
1 parent bed9a38 commit 2025a26

File tree

8 files changed

+734
-81
lines changed

8 files changed

+734
-81
lines changed

lib/crewai-tools/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,9 +16,9 @@ dependencies = [
1616
"lancedb>=0.5.4",
1717
"tiktoken>=0.8.0",
1818
"beautifulsoup4>=4.13.4",
19-
"pypdf>=5.9.0",
2019
"python-docx>=1.2.0",
2120
"youtube-transcript-api>=1.2.2",
21+
"pymupdf>=1.26.6",
2222
]
2323

2424

lib/crewai-tools/src/crewai_tools/adapters/crewai_rag_adapter.py

Lines changed: 74 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@
33
from __future__ import annotations
44

55
import hashlib
6-
from pathlib import Path
7-
from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast
6+
from typing import TYPE_CHECKING, Any, cast
87
import uuid
98

109
from crewai.rag.config.types import RagConfigType
@@ -19,15 +18,13 @@
1918
from crewai_tools.rag.data_types import DataType
2019
from crewai_tools.rag.misc import sanitize_metadata_for_chromadb
2120
from crewai_tools.tools.rag.rag_tool import Adapter
21+
from crewai_tools.tools.rag.types import AddDocumentParams, ContentItem
2222

2323

2424
if TYPE_CHECKING:
2525
from crewai.rag.qdrant.config import QdrantConfig
2626

2727

28-
ContentItem: TypeAlias = str | Path | dict[str, Any]
29-
30-
3128
def _is_qdrant_config(config: Any) -> TypeIs[QdrantConfig]:
3229
"""Check if config is a QdrantConfig using safe duck typing.
3330
@@ -46,19 +43,6 @@ def _is_qdrant_config(config: Any) -> TypeIs[QdrantConfig]:
4643
return False
4744

4845

49-
class AddDocumentParams(TypedDict, total=False):
50-
"""Parameters for adding documents to the RAG system."""
51-
52-
data_type: DataType
53-
metadata: dict[str, Any]
54-
website: str
55-
url: str
56-
file_path: str | Path
57-
github_url: str
58-
youtube_url: str
59-
directory_path: str | Path
60-
61-
6246
class CrewAIRagAdapter(Adapter):
6347
"""Adapter that uses CrewAI's native RAG system.
6448
@@ -131,13 +115,26 @@ def query(
131115
def add(self, *args: ContentItem, **kwargs: Unpack[AddDocumentParams]) -> None:
132116
"""Add content to the knowledge base.
133117
134-
This method handles various input types and converts them to documents
135-
for the vector database. It supports the data_type parameter for
136-
compatibility with existing tools.
137-
138118
Args:
139119
*args: Content items to add (strings, paths, or document dicts)
140-
**kwargs: Additional parameters including data_type, metadata, etc.
120+
**kwargs: Additional parameters including:
121+
- data_type: DataType enum or string (e.g., "file", "pdf_file", "text")
122+
- path: Path to file or directory (alternative to positional arg)
123+
- file_path: Alias for path
124+
- metadata: Additional metadata to attach to documents
125+
- url: URL to fetch content from
126+
- website: Website URL to scrape
127+
- github_url: GitHub repository URL
128+
- youtube_url: YouTube video URL
129+
- directory_path: Path to directory
130+
131+
Examples:
132+
rag_tool.add("path/to/document.pdf", data_type=DataType.PDF_FILE)
133+
134+
rag_tool.add(path="path/to/document.pdf", data_type="file")
135+
rag_tool.add(file_path="path/to/document.pdf", data_type="pdf_file")
136+
137+
rag_tool.add("path/to/document.pdf") # auto-detects PDF
141138
"""
142139
import os
143140

@@ -146,17 +143,69 @@ def add(self, *args: ContentItem, **kwargs: Unpack[AddDocumentParams]) -> None:
146143
from crewai_tools.rag.source_content import SourceContent
147144

148145
documents: list[BaseRecord] = []
149-
data_type: DataType | None = kwargs.get("data_type")
146+
raw_data_type = kwargs.get("data_type")
150147
base_metadata: dict[str, Any] = kwargs.get("metadata", {})
151148

152-
for arg in args:
149+
data_type: DataType | None = None
150+
if raw_data_type is not None:
151+
if isinstance(raw_data_type, DataType):
152+
if raw_data_type != DataType.FILE:
153+
data_type = raw_data_type
154+
elif isinstance(raw_data_type, str):
155+
if raw_data_type != "file":
156+
try:
157+
data_type = DataType(raw_data_type)
158+
except ValueError:
159+
raise ValueError(
160+
f"Invalid data_type: '{raw_data_type}'. "
161+
f"Valid values are: 'file' (auto-detect), or one of: "
162+
f"{', '.join(dt.value for dt in DataType)}"
163+
) from None
164+
165+
content_items: list[ContentItem] = list(args)
166+
167+
path_value = kwargs.get("path") or kwargs.get("file_path")
168+
if path_value is not None:
169+
content_items.append(path_value)
170+
171+
if url := kwargs.get("url"):
172+
content_items.append(url)
173+
if website := kwargs.get("website"):
174+
content_items.append(website)
175+
if github_url := kwargs.get("github_url"):
176+
content_items.append(github_url)
177+
if youtube_url := kwargs.get("youtube_url"):
178+
content_items.append(youtube_url)
179+
if directory_path := kwargs.get("directory_path"):
180+
content_items.append(directory_path)
181+
182+
file_extensions = {
183+
".pdf",
184+
".txt",
185+
".csv",
186+
".json",
187+
".xml",
188+
".docx",
189+
".mdx",
190+
".md",
191+
}
192+
193+
for arg in content_items:
153194
source_ref: str
154195
if isinstance(arg, dict):
155196
source_ref = str(arg.get("source", arg.get("content", "")))
156197
else:
157198
source_ref = str(arg)
158199

159200
if not data_type:
201+
ext = os.path.splitext(source_ref)[1].lower()
202+
is_url = source_ref.startswith(("http://", "https://", "file://"))
203+
if (
204+
ext in file_extensions
205+
and not is_url
206+
and not os.path.isfile(source_ref)
207+
):
208+
raise FileNotFoundError(f"File does not exist: {source_ref}")
160209
data_type = DataTypes.from_content(source_ref)
161210

162211
if data_type == DataType.DIRECTORY:

lib/crewai-tools/src/crewai_tools/rag/data_types.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,31 @@
11
from enum import Enum
2+
from importlib import import_module
23
import os
34
from pathlib import Path
5+
from typing import cast
46
from urllib.parse import urlparse
57

68
from crewai_tools.rag.base_loader import BaseLoader
79
from crewai_tools.rag.chunkers.base_chunker import BaseChunker
810

911

1012
class DataType(str, Enum):
13+
FILE = "file"
1114
PDF_FILE = "pdf_file"
1215
TEXT_FILE = "text_file"
1316
CSV = "csv"
1417
JSON = "json"
1518
XML = "xml"
1619
DOCX = "docx"
1720
MDX = "mdx"
18-
19-
# Database types
2021
MYSQL = "mysql"
2122
POSTGRES = "postgres"
22-
23-
# Repository types
2423
GITHUB = "github"
2524
DIRECTORY = "directory"
26-
27-
# Web types
2825
WEBSITE = "website"
2926
DOCS_SITE = "docs_site"
3027
YOUTUBE_VIDEO = "youtube_video"
3128
YOUTUBE_CHANNEL = "youtube_channel"
32-
33-
# Raw types
3429
TEXT = "text"
3530

3631
def get_chunker(self) -> BaseChunker:
@@ -63,13 +58,11 @@ def get_chunker(self) -> BaseChunker:
6358

6459
try:
6560
module = import_module(module_path)
66-
return getattr(module, class_name)()
61+
return cast(BaseChunker, getattr(module, class_name)())
6762
except Exception as e:
6863
raise ValueError(f"Error loading chunker for {self}: {e}") from e
6964

7065
def get_loader(self) -> BaseLoader:
71-
from importlib import import_module
72-
7366
loaders = {
7467
DataType.PDF_FILE: ("pdf_loader", "PDFLoader"),
7568
DataType.TEXT_FILE: ("text_loader", "TextFileLoader"),
@@ -98,7 +91,7 @@ def get_loader(self) -> BaseLoader:
9891
module_path = f"crewai_tools.rag.loaders.{module_name}"
9992
try:
10093
module = import_module(module_path)
101-
return getattr(module, class_name)()
94+
return cast(BaseLoader, getattr(module, class_name)())
10295
except Exception as e:
10396
raise ValueError(f"Error loading loader for {self}: {e}") from e
10497

lib/crewai-tools/src/crewai_tools/rag/loaders/pdf_loader.py

Lines changed: 75 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2,70 +2,112 @@
22

33
import os
44
from pathlib import Path
5-
from typing import Any
5+
from typing import Any, cast
6+
from urllib.parse import urlparse
7+
import urllib.request
68

79
from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
810
from crewai_tools.rag.source_content import SourceContent
911

1012

1113
class PDFLoader(BaseLoader):
12-
"""Loader for PDF files."""
14+
"""Loader for PDF files and URLs."""
1315

14-
def load(self, source: SourceContent, **kwargs) -> LoaderResult: # type: ignore[override]
15-
"""Load and extract text from a PDF file.
16+
@staticmethod
17+
def _is_url(path: str) -> bool:
18+
"""Check if the path is a URL."""
19+
try:
20+
parsed = urlparse(path)
21+
return parsed.scheme in ("http", "https")
22+
except Exception:
23+
return False
24+
25+
@staticmethod
26+
def _download_pdf(url: str) -> bytes:
27+
"""Download PDF content from a URL.
1628
1729
Args:
18-
source: The source content containing the PDF file path
30+
url: The URL to download from.
1931
2032
Returns:
21-
LoaderResult with extracted text content
33+
The PDF content as bytes.
2234
2335
Raises:
24-
FileNotFoundError: If the PDF file doesn't exist
25-
ImportError: If required PDF libraries aren't installed
36+
ValueError: If the download fails.
2637
"""
38+
2739
try:
28-
import pypdf
29-
except ImportError:
30-
try:
31-
import PyPDF2 as pypdf # type: ignore[import-not-found,no-redef] # noqa: N813
32-
except ImportError as e:
33-
raise ImportError(
34-
"PDF support requires pypdf or PyPDF2. Install with: uv add pypdf"
35-
) from e
40+
with urllib.request.urlopen(url, timeout=30) as response: # noqa: S310
41+
return cast(bytes, response.read())
42+
except Exception as e:
43+
raise ValueError(f"Failed to download PDF from {url}: {e!s}") from e
44+
45+
def load(self, source: SourceContent, **kwargs: Any) -> LoaderResult: # type: ignore[override]
46+
"""Load and extract text from a PDF file or URL.
47+
48+
Args:
49+
source: The source content containing the PDF file path or URL.
50+
51+
Returns:
52+
LoaderResult with extracted text content.
53+
54+
Raises:
55+
FileNotFoundError: If the PDF file doesn't exist.
56+
ImportError: If required PDF libraries aren't installed.
57+
ValueError: If the PDF cannot be read or downloaded.
58+
"""
59+
try:
60+
import pymupdf # type: ignore[import-untyped]
61+
except ImportError as e:
62+
raise ImportError(
63+
"PDF support requires pymupdf. Install with: uv add pymupdf"
64+
) from e
3665

3766
file_path = source.source
67+
is_url = self._is_url(file_path)
3868

39-
if not os.path.isfile(file_path):
40-
raise FileNotFoundError(f"PDF file not found: {file_path}")
69+
if is_url:
70+
source_name = Path(urlparse(file_path).path).name or "downloaded.pdf"
71+
else:
72+
source_name = Path(file_path).name
4173

42-
text_content = []
74+
text_content: list[str] = []
4375
metadata: dict[str, Any] = {
44-
"source": str(file_path),
45-
"file_name": Path(file_path).name,
76+
"source": file_path,
77+
"file_name": source_name,
4678
"file_type": "pdf",
4779
}
4880

4981
try:
50-
with open(file_path, "rb") as file:
51-
pdf_reader = pypdf.PdfReader(file)
52-
metadata["num_pages"] = len(pdf_reader.pages)
53-
54-
for page_num, page in enumerate(pdf_reader.pages, 1):
55-
page_text = page.extract_text()
56-
if page_text.strip():
57-
text_content.append(f"Page {page_num}:\n{page_text}")
82+
if is_url:
83+
pdf_bytes = self._download_pdf(file_path)
84+
doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
85+
else:
86+
if not os.path.isfile(file_path):
87+
raise FileNotFoundError(f"PDF file not found: {file_path}")
88+
doc = pymupdf.open(file_path)
89+
90+
metadata["num_pages"] = len(doc)
91+
92+
for page_num, page in enumerate(doc, 1):
93+
page_text = page.get_text()
94+
if page_text.strip():
95+
text_content.append(f"Page {page_num}:\n{page_text}")
96+
97+
doc.close()
98+
except FileNotFoundError:
99+
raise
58100
except Exception as e:
59-
raise ValueError(f"Error reading PDF file {file_path}: {e!s}") from e
101+
raise ValueError(f"Error reading PDF from {file_path}: {e!s}") from e
60102

61103
if not text_content:
62-
content = f"[PDF file with no extractable text: {Path(file_path).name}]"
104+
content = f"[PDF file with no extractable text: {source_name}]"
63105
else:
64106
content = "\n\n".join(text_content)
65107

66108
return LoaderResult(
67109
content=content,
68-
source=str(file_path),
110+
source=file_path,
69111
metadata=metadata,
70-
doc_id=self.generate_doc_id(source_ref=str(file_path), content=content),
112+
doc_id=self.generate_doc_id(source_ref=file_path, content=content),
71113
)

0 commit comments

Comments
 (0)