crewAIInc
diff --git a/lib/crewai-tools/pyproject.toml b/lib/crewai-tools/pyproject.toml
Lines changed: 1 addition & 1 deletion
diff --git a/lib/crewai-tools/src/crewai_tools/adapters/crewai_rag_adapter.py b/lib/crewai-tools/src/crewai_tools/adapters/crewai_rag_adapter.py
Lines changed: 74 additions & 25 deletions
diff --git a/lib/crewai-tools/src/crewai_tools/rag/data_types.py b/lib/crewai-tools/src/crewai_tools/rag/data_types.py
Lines changed: 5 additions & 12 deletions
diff --git a/lib/crewai-tools/src/crewai_tools/rag/loaders/pdf_loader.py b/lib/crewai-tools/src/crewai_tools/rag/loaders/pdf_loader.py
Lines changed: 75 additions & 33 deletions
@@ -16,9 +16,9 @@ dependencies = [
     "lancedb>=0.5.4",
     "tiktoken>=0.8.0",
     "beautifulsoup4>=4.13.4",
-    "pypdf>=5.9.0",
     "python-docx>=1.2.0",
     "youtube-transcript-api>=1.2.2",
+    "pymupdf>=1.26.6",
 ]
 
 
 
@@ -3,8 +3,7 @@
 from __future__ import annotations
 
 import hashlib
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast
+from typing import TYPE_CHECKING, Any, cast
 import uuid
 
 from crewai.rag.config.types import RagConfigType
@@ -19,15 +18,13 @@
 from crewai_tools.rag.data_types import DataType
 from crewai_tools.rag.misc import sanitize_metadata_for_chromadb
 from crewai_tools.tools.rag.rag_tool import Adapter
+from crewai_tools.tools.rag.types import AddDocumentParams, ContentItem
 
 
 if TYPE_CHECKING:
     from crewai.rag.qdrant.config import QdrantConfig
 
 
-ContentItem: TypeAlias = str | Path | dict[str, Any]
-
-
 def _is_qdrant_config(config: Any) -> TypeIs[QdrantConfig]:
     """Check if config is a QdrantConfig using safe duck typing.
 
@@ -46,19 +43,6 @@ def _is_qdrant_config(config: Any) -> TypeIs[QdrantConfig]:
         return False
 
 
-class AddDocumentParams(TypedDict, total=False):
-    """Parameters for adding documents to the RAG system."""
-
-    data_type: DataType
-    metadata: dict[str, Any]
-    website: str
-    url: str
-    file_path: str | Path
-    github_url: str
-    youtube_url: str
-    directory_path: str | Path
-
-
 class CrewAIRagAdapter(Adapter):
     """Adapter that uses CrewAI's native RAG system.
 
@@ -131,13 +115,26 @@ def query(
     def add(self, *args: ContentItem, **kwargs: Unpack[AddDocumentParams]) -> None:
         """Add content to the knowledge base.
 
-        This method handles various input types and converts them to documents
-        for the vector database. It supports the data_type parameter for
-        compatibility with existing tools.
-
         Args:
             *args: Content items to add (strings, paths, or document dicts)
-            **kwargs: Additional parameters including data_type, metadata, etc.
+            **kwargs: Additional parameters including:
+                - data_type: DataType enum or string (e.g., "file", "pdf_file", "text")
+                - path: Path to file or directory (alternative to positional arg)
+                - file_path: Alias for path
+                - metadata: Additional metadata to attach to documents
+                - url: URL to fetch content from
+                - website: Website URL to scrape
+                - github_url: GitHub repository URL
+                - youtube_url: YouTube video URL
+                - directory_path: Path to directory
+
+        Examples:
+            rag_tool.add("path/to/document.pdf", data_type=DataType.PDF_FILE)
+
+            rag_tool.add(path="path/to/document.pdf", data_type="file")
+            rag_tool.add(file_path="path/to/document.pdf", data_type="pdf_file")
+
+            rag_tool.add("path/to/document.pdf")  # auto-detects PDF
         """
         import os
 
@@ -146,17 +143,69 @@ def add(self, *args: ContentItem, **kwargs: Unpack[AddDocumentParams]) -> None:
         from crewai_tools.rag.source_content import SourceContent
 
         documents: list[BaseRecord] = []
-        data_type: DataType | None = kwargs.get("data_type")
+        raw_data_type = kwargs.get("data_type")
         base_metadata: dict[str, Any] = kwargs.get("metadata", {})
 
-        for arg in args:
+        data_type: DataType | None = None
+        if raw_data_type is not None:
+            if isinstance(raw_data_type, DataType):
+                if raw_data_type != DataType.FILE:
+                    data_type = raw_data_type
+            elif isinstance(raw_data_type, str):
+                if raw_data_type != "file":
+                    try:
+                        data_type = DataType(raw_data_type)
+                    except ValueError:
+                        raise ValueError(
+                            f"Invalid data_type: '{raw_data_type}'. "
+                            f"Valid values are: 'file' (auto-detect), or one of: "
+                            f"{', '.join(dt.value for dt in DataType)}"
+                        ) from None
+
+        content_items: list[ContentItem] = list(args)
+
+        path_value = kwargs.get("path") or kwargs.get("file_path")
+        if path_value is not None:
+            content_items.append(path_value)
+
+        if url := kwargs.get("url"):
+            content_items.append(url)
+        if website := kwargs.get("website"):
+            content_items.append(website)
+        if github_url := kwargs.get("github_url"):
+            content_items.append(github_url)
+        if youtube_url := kwargs.get("youtube_url"):
+            content_items.append(youtube_url)
+        if directory_path := kwargs.get("directory_path"):
+            content_items.append(directory_path)
+
+        file_extensions = {
+            ".pdf",
+            ".txt",
+            ".csv",
+            ".json",
+            ".xml",
+            ".docx",
+            ".mdx",
+            ".md",
+        }
+
+        for arg in content_items:
             source_ref: str
             if isinstance(arg, dict):
                 source_ref = str(arg.get("source", arg.get("content", "")))
             else:
                 source_ref = str(arg)
 
             if not data_type:
+                ext = os.path.splitext(source_ref)[1].lower()
+                is_url = source_ref.startswith(("http://", "https://", "file://"))
+                if (
+                    ext in file_extensions
+                    and not is_url
+                    and not os.path.isfile(source_ref)
+                ):
+                    raise FileNotFoundError(f"File does not exist: {source_ref}")
                 data_type = DataTypes.from_content(source_ref)
 
             if data_type == DataType.DIRECTORY:
 
@@ -1,36 +1,31 @@
 from enum import Enum
+from importlib import import_module
 import os
 from pathlib import Path
+from typing import cast
 from urllib.parse import urlparse
 
 from crewai_tools.rag.base_loader import BaseLoader
 from crewai_tools.rag.chunkers.base_chunker import BaseChunker
 
 
 class DataType(str, Enum):
+    FILE = "file"
     PDF_FILE = "pdf_file"
     TEXT_FILE = "text_file"
     CSV = "csv"
     JSON = "json"
     XML = "xml"
     DOCX = "docx"
     MDX = "mdx"
-
-    # Database types
     MYSQL = "mysql"
     POSTGRES = "postgres"
-
-    # Repository types
     GITHUB = "github"
     DIRECTORY = "directory"
-
-    # Web types
     WEBSITE = "website"
     DOCS_SITE = "docs_site"
     YOUTUBE_VIDEO = "youtube_video"
     YOUTUBE_CHANNEL = "youtube_channel"
-
-    # Raw types
     TEXT = "text"
 
     def get_chunker(self) -> BaseChunker:
@@ -63,13 +58,11 @@ def get_chunker(self) -> BaseChunker:
 
         try:
             module = import_module(module_path)
-            return getattr(module, class_name)()
+            return cast(BaseChunker, getattr(module, class_name)())
         except Exception as e:
             raise ValueError(f"Error loading chunker for {self}: {e}") from e
 
     def get_loader(self) -> BaseLoader:
-        from importlib import import_module
-
         loaders = {
             DataType.PDF_FILE: ("pdf_loader", "PDFLoader"),
             DataType.TEXT_FILE: ("text_loader", "TextFileLoader"),
@@ -98,7 +91,7 @@ def get_loader(self) -> BaseLoader:
         module_path = f"crewai_tools.rag.loaders.{module_name}"
         try:
             module = import_module(module_path)
-            return getattr(module, class_name)()
+            return cast(BaseLoader, getattr(module, class_name)())
         except Exception as e:
             raise ValueError(f"Error loading loader for {self}: {e}") from e
 
 
@@ -2,70 +2,112 @@
 
 import os
 from pathlib import Path
-from typing import Any
+from typing import Any, cast
+from urllib.parse import urlparse
+import urllib.request
 
 from crewai_tools.rag.base_loader import BaseLoader, LoaderResult
 from crewai_tools.rag.source_content import SourceContent
 
 
 class PDFLoader(BaseLoader):
-    """Loader for PDF files."""
+    """Loader for PDF files and URLs."""
 
-    def load(self, source: SourceContent, **kwargs) -> LoaderResult:  # type: ignore[override]
-        """Load and extract text from a PDF file.
+    @staticmethod
+    def _is_url(path: str) -> bool:
+        """Check if the path is a URL."""
+        try:
+            parsed = urlparse(path)
+            return parsed.scheme in ("http", "https")
+        except Exception:
+            return False
+
+    @staticmethod
+    def _download_pdf(url: str) -> bytes:
+        """Download PDF content from a URL.
 
         Args:
-            source: The source content containing the PDF file path
+            url: The URL to download from.
 
         Returns:
-            LoaderResult with extracted text content
+            The PDF content as bytes.
 
         Raises:
-            FileNotFoundError: If the PDF file doesn't exist
-            ImportError: If required PDF libraries aren't installed
+            ValueError: If the download fails.
         """
+
         try:
-            import pypdf
-        except ImportError:
-            try:
-                import PyPDF2 as pypdf  # type: ignore[import-not-found,no-redef]  # noqa: N813
-            except ImportError as e:
-                raise ImportError(
-                    "PDF support requires pypdf or PyPDF2. Install with: uv add pypdf"
-                ) from e
+            with urllib.request.urlopen(url, timeout=30) as response:  # noqa: S310
+                return cast(bytes, response.read())
+        except Exception as e:
+            raise ValueError(f"Failed to download PDF from {url}: {e!s}") from e
+
+    def load(self, source: SourceContent, **kwargs: Any) -> LoaderResult:  # type: ignore[override]
+        """Load and extract text from a PDF file or URL.
+
+        Args:
+            source: The source content containing the PDF file path or URL.
+
+        Returns:
+            LoaderResult with extracted text content.
+
+        Raises:
+            FileNotFoundError: If the PDF file doesn't exist.
+            ImportError: If required PDF libraries aren't installed.
+            ValueError: If the PDF cannot be read or downloaded.
+        """
+        try:
+            import pymupdf  # type: ignore[import-untyped]
+        except ImportError as e:
+            raise ImportError(
+                "PDF support requires pymupdf. Install with: uv add pymupdf"
+            ) from e
 
         file_path = source.source
+        is_url = self._is_url(file_path)
 
-        if not os.path.isfile(file_path):
-            raise FileNotFoundError(f"PDF file not found: {file_path}")
+        if is_url:
+            source_name = Path(urlparse(file_path).path).name or "downloaded.pdf"
+        else:
+            source_name = Path(file_path).name
 
-        text_content = []
+        text_content: list[str] = []
         metadata: dict[str, Any] = {
-            "source": str(file_path),
-            "file_name": Path(file_path).name,
+            "source": file_path,
+            "file_name": source_name,
             "file_type": "pdf",
         }
 
         try:
-            with open(file_path, "rb") as file:
-                pdf_reader = pypdf.PdfReader(file)
-                metadata["num_pages"] = len(pdf_reader.pages)
-
-                for page_num, page in enumerate(pdf_reader.pages, 1):
-                    page_text = page.extract_text()
-                    if page_text.strip():
-                        text_content.append(f"Page {page_num}:\n{page_text}")
+            if is_url:
+                pdf_bytes = self._download_pdf(file_path)
+                doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
+            else:
+                if not os.path.isfile(file_path):
+                    raise FileNotFoundError(f"PDF file not found: {file_path}")
+                doc = pymupdf.open(file_path)
+
+            metadata["num_pages"] = len(doc)
+
+            for page_num, page in enumerate(doc, 1):
+                page_text = page.get_text()
+                if page_text.strip():
+                    text_content.append(f"Page {page_num}:\n{page_text}")
+
+            doc.close()
+        except FileNotFoundError:
+            raise
         except Exception as e:
-            raise ValueError(f"Error reading PDF file {file_path}: {e!s}") from e
+            raise ValueError(f"Error reading PDF from {file_path}: {e!s}") from e
 
         if not text_content:
-            content = f"[PDF file with no extractable text: {Path(file_path).name}]"
+            content = f"[PDF file with no extractable text: {source_name}]"
         else:
             content = "\n\n".join(text_content)
 
         return LoaderResult(
             content=content,
-            source=str(file_path),
+            source=file_path,
             metadata=metadata,
-            doc_id=self.generate_doc_id(source_ref=str(file_path), content=content),
+            doc_id=self.generate_doc_id(source_ref=file_path, content=content),
         )
Original file line number	Diff line number	Diff line change
`@@ -16,9 +16,9 @@ dependencies = [`
`16`	`16`	`"lancedb>=0.5.4",`
`17`	`17`	`"tiktoken>=0.8.0",`
`18`	`18`	`"beautifulsoup4>=4.13.4",`
`19`		`- "pypdf>=5.9.0",`
`20`	`19`	`"python-docx>=1.2.0",`
`21`	`20`	`"youtube-transcript-api>=1.2.2",`
	`21`	`+ "pymupdf>=1.26.6",`
`22`	`22`	`]`
`23`	`23`
`24`	`24`