Skip to content

Commit c780951

Browse files
Merge branch 'main' into gl/chore/crews-ruff-linting-fixe
2 parents 9525367 + 8e571ea commit c780951

9 files changed

+72
-75
lines changed

src/crewai/knowledge/source/base_file_knowledge_source.py

Lines changed: 10 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
from abc import ABC, abstractmethod
22
from pathlib import Path
3-
from typing import Dict, List, Optional, Union
43

54
from pydantic import Field, field_validator
65

@@ -14,19 +13,19 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
1413
"""Base class for knowledge sources that load content from files."""
1514

1615
_logger: Logger = Logger(verbose=True)
17-
file_path: Optional[Union[Path, List[Path], str, List[str]]] = Field(
16+
file_path: Path | list[Path] | str | list[str] | None = Field(
1817
default=None,
1918
description="[Deprecated] The path to the file. Use file_paths instead.",
2019
)
21-
file_paths: Optional[Union[Path, List[Path], str, List[str]]] = Field(
20+
file_paths: Path | list[Path] | str | list[str] | None = Field(
2221
default_factory=list, description="The path to the file"
2322
)
24-
content: Dict[Path, str] = Field(init=False, default_factory=dict)
25-
storage: Optional[KnowledgeStorage] = Field(default=None)
26-
safe_file_paths: List[Path] = Field(default_factory=list)
23+
content: dict[Path, str] = Field(init=False, default_factory=dict)
24+
storage: KnowledgeStorage | None = Field(default=None)
25+
safe_file_paths: list[Path] = Field(default_factory=list)
2726

2827
@field_validator("file_path", "file_paths", mode="before")
29-
def validate_file_path(cls, v, info):
28+
def validate_file_path(cls, v, info): # noqa: N805
3029
"""Validate that at least one of file_path or file_paths is provided."""
3130
# Single check if both are None, O(1) instead of nested conditions
3231
if (
@@ -46,9 +45,8 @@ def model_post_init(self, _):
4645
self.content = self.load_content()
4746

4847
@abstractmethod
49-
def load_content(self) -> Dict[Path, str]:
48+
def load_content(self) -> dict[Path, str]:
5049
"""Load and preprocess file content. Should be overridden by subclasses. Assume that the file path is relative to the project root in the knowledge directory."""
51-
pass
5250

5351
def validate_content(self):
5452
"""Validate the paths."""
@@ -74,11 +72,11 @@ def _save_documents(self):
7472
else:
7573
raise ValueError("No storage found to save documents.")
7674

77-
def convert_to_path(self, path: Union[Path, str]) -> Path:
75+
def convert_to_path(self, path: Path | str) -> Path:
7876
"""Convert a path to a Path object."""
7977
return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
8078

81-
def _process_file_paths(self) -> List[Path]:
79+
def _process_file_paths(self) -> list[Path]:
8280
"""Convert file_path to a list of Path objects."""
8381

8482
if hasattr(self, "file_path") and self.file_path is not None:
@@ -93,7 +91,7 @@ def _process_file_paths(self) -> List[Path]:
9391
raise ValueError("Your source must be provided with a file_paths: []")
9492

9593
# Convert single path to list
96-
path_list: List[Union[Path, str]] = (
94+
path_list: list[Path | str] = (
9795
[self.file_paths]
9896
if isinstance(self.file_paths, (str, Path))
9997
else list(self.file_paths)

src/crewai/knowledge/source/base_knowledge_source.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from abc import ABC, abstractmethod
2-
from typing import Any, Dict, List, Optional
2+
from typing import Any
33

44
import numpy as np
55
from pydantic import BaseModel, ConfigDict, Field
@@ -12,29 +12,27 @@ class BaseKnowledgeSource(BaseModel, ABC):
1212

1313
chunk_size: int = 4000
1414
chunk_overlap: int = 200
15-
chunks: List[str] = Field(default_factory=list)
16-
chunk_embeddings: List[np.ndarray] = Field(default_factory=list)
15+
chunks: list[str] = Field(default_factory=list)
16+
chunk_embeddings: list[np.ndarray] = Field(default_factory=list)
1717

1818
model_config = ConfigDict(arbitrary_types_allowed=True)
19-
storage: Optional[KnowledgeStorage] = Field(default=None)
20-
metadata: Dict[str, Any] = Field(default_factory=dict) # Currently unused
21-
collection_name: Optional[str] = Field(default=None)
19+
storage: KnowledgeStorage | None = Field(default=None)
20+
metadata: dict[str, Any] = Field(default_factory=dict) # Currently unused
21+
collection_name: str | None = Field(default=None)
2222

2323
@abstractmethod
2424
def validate_content(self) -> Any:
2525
"""Load and preprocess content from the source."""
26-
pass
2726

2827
@abstractmethod
2928
def add(self) -> None:
3029
"""Process content, chunk it, compute embeddings, and save them."""
31-
pass
3230

33-
def get_embeddings(self) -> List[np.ndarray]:
31+
def get_embeddings(self) -> list[np.ndarray]:
3432
"""Return the list of embeddings for the chunks."""
3533
return self.chunk_embeddings
3634

37-
def _chunk_text(self, text: str) -> List[str]:
35+
def _chunk_text(self, text: str) -> list[str]:
3836
"""Utility method to split text into chunks."""
3937
return [
4038
text[i : i + self.chunk_size]

src/crewai/knowledge/source/crew_docling_source.py

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,21 @@
1+
from collections.abc import Iterator
12
from pathlib import Path
2-
from typing import Iterator, List, Optional, Union
33
from urllib.parse import urlparse
44

55
try:
6-
from docling.datamodel.base_models import InputFormat
7-
from docling.document_converter import DocumentConverter
8-
from docling.exceptions import ConversionError
9-
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
10-
from docling_core.types.doc.document import DoclingDocument
6+
from docling.datamodel.base_models import ( # type: ignore[import-not-found]
7+
InputFormat,
8+
)
9+
from docling.document_converter import ( # type: ignore[import-not-found]
10+
DocumentConverter,
11+
)
12+
from docling.exceptions import ConversionError # type: ignore[import-not-found]
13+
from docling_core.transforms.chunker.hierarchical_chunker import ( # type: ignore[import-not-found]
14+
HierarchicalChunker,
15+
)
16+
from docling_core.types.doc.document import ( # type: ignore[import-not-found]
17+
DoclingDocument,
18+
)
1119

1220
DOCLING_AVAILABLE = True
1321
except ImportError:
@@ -35,11 +43,11 @@ def __init__(self, *args, **kwargs):
3543

3644
_logger: Logger = Logger(verbose=True)
3745

38-
file_path: Optional[List[Union[Path, str]]] = Field(default=None)
39-
file_paths: List[Union[Path, str]] = Field(default_factory=list)
40-
chunks: List[str] = Field(default_factory=list)
41-
safe_file_paths: List[Union[Path, str]] = Field(default_factory=list)
42-
content: List["DoclingDocument"] = Field(default_factory=list)
46+
file_path: list[Path | str] | None = Field(default=None)
47+
file_paths: list[Path | str] = Field(default_factory=list)
48+
chunks: list[str] = Field(default_factory=list)
49+
safe_file_paths: list[Path | str] = Field(default_factory=list)
50+
content: list["DoclingDocument"] = Field(default_factory=list)
4351
document_converter: "DocumentConverter" = Field(
4452
default_factory=lambda: DocumentConverter(
4553
allowed_formats=[
@@ -66,7 +74,7 @@ def model_post_init(self, _) -> None:
6674
self.safe_file_paths = self.validate_content()
6775
self.content = self._load_content()
6876

69-
def _load_content(self) -> List["DoclingDocument"]:
77+
def _load_content(self) -> list["DoclingDocument"]:
7078
try:
7179
return self._convert_source_to_docling_documents()
7280
except ConversionError as e:
@@ -88,7 +96,7 @@ def add(self) -> None:
8896
self.chunks.extend(list(new_chunks_iterable))
8997
self._save_documents()
9098

91-
def _convert_source_to_docling_documents(self) -> List["DoclingDocument"]:
99+
def _convert_source_to_docling_documents(self) -> list["DoclingDocument"]:
92100
conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
93101
return [result.document for result in conv_results_iter]
94102

@@ -97,8 +105,8 @@ def _chunk_doc(self, doc: "DoclingDocument") -> Iterator[str]:
97105
for chunk in chunker.chunk(doc):
98106
yield chunk.text
99107

100-
def validate_content(self) -> List[Union[Path, str]]:
101-
processed_paths: List[Union[Path, str]] = []
108+
def validate_content(self) -> list[Path | str]:
109+
processed_paths: list[Path | str] = []
102110
for path in self.file_paths:
103111
if isinstance(path, str):
104112
if path.startswith(("http://", "https://")):
@@ -108,7 +116,7 @@ def validate_content(self) -> List[Union[Path, str]]:
108116
else:
109117
raise ValueError(f"Invalid URL format: {path}")
110118
except Exception as e:
111-
raise ValueError(f"Invalid URL: {path}. Error: {str(e)}")
119+
raise ValueError(f"Invalid URL: {path}. Error: {e!s}") from e
112120
else:
113121
local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
114122
if local_path.exists():

src/crewai/knowledge/source/csv_knowledge_source.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
11
import csv
22
from pathlib import Path
3-
from typing import Dict, List
43

54
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
65

76

87
class CSVKnowledgeSource(BaseFileKnowledgeSource):
98
"""A knowledge source that stores and queries CSV file content using embeddings."""
109

11-
def load_content(self) -> Dict[Path, str]:
10+
def load_content(self) -> dict[Path, str]:
1211
"""Load and preprocess CSV file content."""
1312
content_dict = {}
1413
for file_path in self.safe_file_paths:
@@ -32,7 +31,7 @@ def add(self) -> None:
3231
self.chunks.extend(new_chunks)
3332
self._save_documents()
3433

35-
def _chunk_text(self, text: str) -> List[str]:
34+
def _chunk_text(self, text: str) -> list[str]:
3635
"""Utility method to split text into chunks."""
3736
return [
3837
text[i : i + self.chunk_size]

src/crewai/knowledge/source/excel_knowledge_source.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
from pathlib import Path
2-
from typing import Dict, Iterator, List, Optional, Union
3-
from urllib.parse import urlparse
42

53
from pydantic import Field, field_validator
64

@@ -16,19 +14,19 @@ class ExcelKnowledgeSource(BaseKnowledgeSource):
1614

1715
_logger: Logger = Logger(verbose=True)
1816

19-
file_path: Optional[Union[Path, List[Path], str, List[str]]] = Field(
17+
file_path: Path | list[Path] | str | list[str] | None = Field(
2018
default=None,
2119
description="[Deprecated] The path to the file. Use file_paths instead.",
2220
)
23-
file_paths: Optional[Union[Path, List[Path], str, List[str]]] = Field(
21+
file_paths: Path | list[Path] | str | list[str] | None = Field(
2422
default_factory=list, description="The path to the file"
2523
)
26-
chunks: List[str] = Field(default_factory=list)
27-
content: Dict[Path, Dict[str, str]] = Field(default_factory=dict)
28-
safe_file_paths: List[Path] = Field(default_factory=list)
24+
chunks: list[str] = Field(default_factory=list)
25+
content: dict[Path, dict[str, str]] = Field(default_factory=dict)
26+
safe_file_paths: list[Path] = Field(default_factory=list)
2927

3028
@field_validator("file_path", "file_paths", mode="before")
31-
def validate_file_path(cls, v, info):
29+
def validate_file_path(cls, v, info): # noqa: N805
3230
"""Validate that at least one of file_path or file_paths is provided."""
3331
# Single check if both are None, O(1) instead of nested conditions
3432
if (
@@ -41,7 +39,7 @@ def validate_file_path(cls, v, info):
4139
raise ValueError("Either file_path or file_paths must be provided")
4240
return v
4341

44-
def _process_file_paths(self) -> List[Path]:
42+
def _process_file_paths(self) -> list[Path]:
4543
"""Convert file_path to a list of Path objects."""
4644

4745
if hasattr(self, "file_path") and self.file_path is not None:
@@ -56,7 +54,7 @@ def _process_file_paths(self) -> List[Path]:
5654
raise ValueError("Your source must be provided with a file_paths: []")
5755

5856
# Convert single path to list
59-
path_list: List[Union[Path, str]] = (
57+
path_list: list[Path | str] = (
6058
[self.file_paths]
6159
if isinstance(self.file_paths, (str, Path))
6260
else list(self.file_paths)
@@ -100,7 +98,7 @@ def model_post_init(self, _) -> None:
10098
self.validate_content()
10199
self.content = self._load_content()
102100

103-
def _load_content(self) -> Dict[Path, Dict[str, str]]:
101+
def _load_content(self) -> dict[Path, dict[str, str]]:
104102
"""Load and preprocess Excel file content from multiple sheets.
105103
106104
Each sheet's content is converted to CSV format and stored.
@@ -126,21 +124,21 @@ def _load_content(self) -> Dict[Path, Dict[str, str]]:
126124
content_dict[file_path] = sheet_dict
127125
return content_dict
128126

129-
def convert_to_path(self, path: Union[Path, str]) -> Path:
127+
def convert_to_path(self, path: Path | str) -> Path:
130128
"""Convert a path to a Path object."""
131129
return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
132130

133131
def _import_dependencies(self):
134132
"""Dynamically import dependencies."""
135133
try:
136-
import pandas as pd
134+
import pandas as pd # type: ignore[import-untyped,import-not-found]
137135

138136
return pd
139137
except ImportError as e:
140138
missing_package = str(e).split()[-1]
141139
raise ImportError(
142140
f"{missing_package} is not installed. Please install it with: pip install {missing_package}"
143-
)
141+
) from e
144142

145143
def add(self) -> None:
146144
"""
@@ -161,7 +159,7 @@ def add(self) -> None:
161159
self.chunks.extend(new_chunks)
162160
self._save_documents()
163161

164-
def _chunk_text(self, text: str) -> List[str]:
162+
def _chunk_text(self, text: str) -> list[str]:
165163
"""Utility method to split text into chunks."""
166164
return [
167165
text[i : i + self.chunk_size]

src/crewai/knowledge/source/json_knowledge_source.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
import json
22
from pathlib import Path
3-
from typing import Any, Dict, List
3+
from typing import Any
44

55
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
66

77

88
class JSONKnowledgeSource(BaseFileKnowledgeSource):
99
"""A knowledge source that stores and queries JSON file content using embeddings."""
1010

11-
def load_content(self) -> Dict[Path, str]:
11+
def load_content(self) -> dict[Path, str]:
1212
"""Load and preprocess JSON file content."""
13-
content: Dict[Path, str] = {}
13+
content: dict[Path, str] = {}
1414
for path in self.safe_file_paths:
1515
path = self.convert_to_path(path)
1616
with open(path, "r", encoding="utf-8") as json_file:
@@ -29,7 +29,7 @@ def _json_to_text(self, data: Any, level: int = 0) -> str:
2929
for item in data:
3030
text += f"{indent}- {self._json_to_text(item, level + 1)}\n"
3131
else:
32-
text += f"{str(data)}"
32+
text += f"{data!s}"
3333
return text
3434

3535
def add(self) -> None:
@@ -44,7 +44,7 @@ def add(self) -> None:
4444
self.chunks.extend(new_chunks)
4545
self._save_documents()
4646

47-
def _chunk_text(self, text: str) -> List[str]:
47+
def _chunk_text(self, text: str) -> list[str]:
4848
"""Utility method to split text into chunks."""
4949
return [
5050
text[i : i + self.chunk_size]

src/crewai/knowledge/source/pdf_knowledge_source.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
from pathlib import Path
2-
from typing import Dict, List
32

43
from crewai.knowledge.source.base_file_knowledge_source import BaseFileKnowledgeSource
54

65

76
class PDFKnowledgeSource(BaseFileKnowledgeSource):
87
"""A knowledge source that stores and queries PDF file content using embeddings."""
98

10-
def load_content(self) -> Dict[Path, str]:
9+
def load_content(self) -> dict[Path, str]:
1110
"""Load and preprocess PDF file content."""
1211
pdfplumber = self._import_pdfplumber()
1312

@@ -30,22 +29,22 @@ def _import_pdfplumber(self):
3029
import pdfplumber
3130

3231
return pdfplumber
33-
except ImportError:
32+
except ImportError as e:
3433
raise ImportError(
3534
"pdfplumber is not installed. Please install it with: pip install pdfplumber"
36-
)
35+
) from e
3736

3837
def add(self) -> None:
3938
"""
4039
Add PDF file content to the knowledge source, chunk it, compute embeddings,
4140
and save the embeddings.
4241
"""
43-
for _, text in self.content.items():
42+
for text in self.content.values():
4443
new_chunks = self._chunk_text(text)
4544
self.chunks.extend(new_chunks)
4645
self._save_documents()
4746

48-
def _chunk_text(self, text: str) -> List[str]:
47+
def _chunk_text(self, text: str) -> list[str]:
4948
"""Utility method to split text into chunks."""
5049
return [
5150
text[i : i + self.chunk_size]

0 commit comments

Comments
 (0)