@@ -117,7 +117,8 @@ def download_repo(repo_url: str, local_path: str, type: str = "github", access_t
117
117
# Alias for backward compatibility
118
118
download_github_repo = download_repo
119
119
120
- def read_all_documents (path : str , local_ollama : bool = False , excluded_dirs : List [str ] = None , excluded_files : List [str ] = None ):
120
+ def read_all_documents (path : str , local_ollama : bool = False , excluded_dirs : List [str ] = None , excluded_files : List [str ] = None ,
121
+ included_dirs : List [str ] = None , included_files : List [str ] = None ):
121
122
"""
122
123
Recursively reads all documents in a directory and its subdirectories.
123
124
@@ -128,6 +129,10 @@ def read_all_documents(path: str, local_ollama: bool = False, excluded_dirs: Lis
128
129
Overrides the default configuration if provided.
129
130
excluded_files (List[str], optional): List of file patterns to exclude from processing.
130
131
Overrides the default configuration if provided.
132
+ included_dirs (List[str], optional): List of directories to include exclusively.
133
+ When provided, only files in these directories will be processed.
134
+ included_files (List[str], optional): List of file patterns to include exclusively.
135
+ When provided, only files matching these patterns will be processed.
131
136
132
137
Returns:
133
138
list: A list of Document objects with metadata.
@@ -138,52 +143,130 @@ def read_all_documents(path: str, local_ollama: bool = False, excluded_dirs: Lis
138
143
".jsx" , ".tsx" , ".html" , ".css" , ".php" , ".swift" , ".cs" ]
139
144
doc_extensions = [".md" , ".txt" , ".rst" , ".json" , ".yaml" , ".yml" ]
140
145
141
- # Always start with default excluded directories and files
142
- final_excluded_dirs = set (DEFAULT_EXCLUDED_DIRS )
143
- final_excluded_files = set (DEFAULT_EXCLUDED_FILES )
146
+ # Determine filtering mode: inclusion or exclusion
147
+ use_inclusion_mode = (included_dirs is not None and len (included_dirs ) > 0 ) or (included_files is not None and len (included_files ) > 0 )
144
148
145
- # Add any additional excluded directories from config
146
- if "file_filters" in configs and "excluded_dirs" in configs ["file_filters" ]:
147
- final_excluded_dirs .update (configs ["file_filters" ]["excluded_dirs" ])
149
+ if use_inclusion_mode :
150
+ # Inclusion mode: only process specified directories and files
151
+ final_included_dirs = set (included_dirs ) if included_dirs else set ()
152
+ final_included_files = set (included_files ) if included_files else set ()
148
153
149
- # Add any additional excluded files from config
150
- if "file_filters" in configs and "excluded_files" in configs [ "file_filters" ]:
151
- final_excluded_files . update ( configs [ "file_filters" ][ "excluded_files" ] )
154
+ logger . info ( f"Using inclusion mode" )
155
+ logger . info ( f"Included directories: { list ( final_included_dirs ) } " )
156
+ logger . info ( f"Included files: { list ( final_included_files ) } " )
152
157
153
- # Add any explicitly provided excluded directories and files
154
- if excluded_dirs is not None :
155
- final_excluded_dirs .update (excluded_dirs )
158
+ # Convert to lists for processing
159
+ included_dirs = list (final_included_dirs )
160
+ included_files = list (final_included_files )
161
+ excluded_dirs = []
162
+ excluded_files = []
163
+ else :
164
+ # Exclusion mode: use default exclusions plus any additional ones
165
+ final_excluded_dirs = set (DEFAULT_EXCLUDED_DIRS )
166
+ final_excluded_files = set (DEFAULT_EXCLUDED_FILES )
167
+
168
+ # Add any additional excluded directories from config
169
+ if "file_filters" in configs and "excluded_dirs" in configs ["file_filters" ]:
170
+ final_excluded_dirs .update (configs ["file_filters" ]["excluded_dirs" ])
171
+
172
+ # Add any additional excluded files from config
173
+ if "file_filters" in configs and "excluded_files" in configs ["file_filters" ]:
174
+ final_excluded_files .update (configs ["file_filters" ]["excluded_files" ])
175
+
176
+ # Add any explicitly provided excluded directories and files
177
+ if excluded_dirs is not None :
178
+ final_excluded_dirs .update (excluded_dirs )
156
179
157
- if excluded_files is not None :
158
- final_excluded_files .update (excluded_files )
180
+ if excluded_files is not None :
181
+ final_excluded_files .update (excluded_files )
159
182
160
- # Convert back to lists for compatibility
161
- excluded_dirs = list (final_excluded_dirs )
162
- excluded_files = list (final_excluded_files )
183
+ # Convert back to lists for compatibility
184
+ excluded_dirs = list (final_excluded_dirs )
185
+ excluded_files = list (final_excluded_files )
186
+ included_dirs = []
187
+ included_files = []
163
188
164
- logger .info (f"Using excluded directories: { excluded_dirs } " )
165
- logger .info (f"Using excluded files: { excluded_files } " )
189
+ logger .info (f"Using exclusion mode" )
190
+ logger .info (f"Excluded directories: { excluded_dirs } " )
191
+ logger .info (f"Excluded files: { excluded_files } " )
166
192
167
193
logger .info (f"Reading documents from { path } " )
168
194
169
- # Process code files first
170
- for ext in code_extensions :
171
- files = glob .glob (f"{ path } /**/*{ ext } " , recursive = True )
172
- for file_path in files :
173
- # Skip excluded directories and files
195
def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List[str], included_files: List[str],
                        excluded_dirs: List[str], excluded_files: List[str]) -> bool:
    """
    Determine if a file should be processed based on inclusion/exclusion rules.

    Directory entries are normalized with ``os.path.normpath`` (which drops a
    leading ``./`` and a trailing slash) and compared against the components
    of ``file_path``. An entry may name a single directory (``.venv``) or a
    nested one (``src/utils``); nested entries must appear as consecutive
    path components. Entries that normalize to nothing (``""``, ``"./"``,
    ``"/"``) are ignored.

    Args:
        file_path (str): The file path to check
        use_inclusion (bool): Whether to use inclusion mode
        included_dirs (List[str]): List of directories to include
        included_files (List[str]): List of file names or file-name suffixes
            (e.g. ".md") to include
        excluded_dirs (List[str]): List of directories to exclude
        excluded_files (List[str]): List of exact file names to exclude

    Returns:
        bool: True if the file should be processed, False otherwise
    """
    path_parts = os.path.normpath(file_path).split(os.sep)
    file_name = os.path.basename(file_path)

    def in_listed_dir(dir_entries) -> bool:
        """Return True if any listed directory occurs in the file's path components."""
        for entry in dir_entries or ():
            # normpath removes "./" and trailing separators without eating the
            # leading dot of hidden directories, which str.strip("./") would do.
            entry_parts = [part for part in os.path.normpath(entry).split(os.sep)
                           if part not in ("", ".")]
            if not entry_parts:
                # An empty entry would otherwise match the empty leading
                # component of every absolute path.
                continue
            width = len(entry_parts)
            for start in range(len(path_parts) - width + 1):
                if path_parts[start:start + width] == entry_parts:
                    return True
        return False

    if use_inclusion:
        # With no inclusion rules at all there is nothing to restrict.
        if not included_dirs and not included_files:
            return True
        if in_listed_dir(included_dirs):
            return True
        # A pattern matches on the exact file name or as a suffix/extension.
        return any(file_name.endswith(pattern) for pattern in included_files or ())

    # Exclusion mode: reject files under an excluded directory or whose
    # name is listed verbatim in excluded_files.
    if in_listed_dir(excluded_dirs):
        return False
    return file_name not in (excluded_files or ())
263
+
264
+ # Process code files first
265
+ for ext in code_extensions :
266
+ files = glob .glob (f"{ path } /**/*{ ext } " , recursive = True )
267
+ for file_path in files :
268
+ # Check if file should be processed based on inclusion/exclusion rules
269
+ if not should_process_file (file_path , use_inclusion_mode , included_dirs , included_files , excluded_dirs , excluded_files ):
187
270
continue
188
271
189
272
try :
@@ -223,20 +306,8 @@ def read_all_documents(path: str, local_ollama: bool = False, excluded_dirs: Lis
223
306
for ext in doc_extensions :
224
307
files = glob .glob (f"{ path } /**/*{ ext } " , recursive = True )
225
308
for file_path in files :
226
- # Skip excluded directories and files
227
- is_excluded = False
228
- # Check if file is in an excluded directory
229
- file_path_parts = os .path .normpath (file_path ).split (os .sep )
230
- for excluded in excluded_dirs :
231
- # Remove ./ prefix and trailing slash if present
232
- clean_excluded = excluded .strip ("./" ).rstrip ("/" )
233
- # Check if the excluded directory is in the path components
234
- if clean_excluded in file_path_parts :
235
- is_excluded = True
236
- break
237
- if not is_excluded and any (os .path .basename (file_path ) == excluded for excluded in excluded_files ):
238
- is_excluded = True
239
- if is_excluded :
309
+ # Check if file should be processed based on inclusion/exclusion rules
310
+ if not should_process_file (file_path , use_inclusion_mode , included_dirs , included_files , excluded_dirs , excluded_files ):
240
311
continue
241
312
242
313
try :
@@ -572,7 +643,8 @@ def __init__(self):
572
643
self .repo_paths = None
573
644
574
645
def prepare_database (self , repo_url_or_path : str , type : str = "github" , access_token : str = None , local_ollama : bool = False ,
575
- excluded_dirs : List [str ] = None , excluded_files : List [str ] = None ) -> List [Document ]:
646
+ excluded_dirs : List [str ] = None , excluded_files : List [str ] = None ,
647
+ included_dirs : List [str ] = None , included_files : List [str ] = None ) -> List [Document ]:
576
648
"""
577
649
Create a new database from the repository.
578
650
@@ -582,13 +654,16 @@ def prepare_database(self, repo_url_or_path: str, type: str = "github", access_t
582
654
local_ollama (bool): Whether to use local Ollama for embedding (default: False)
583
655
excluded_dirs (List[str], optional): List of directories to exclude from processing
584
656
excluded_files (List[str], optional): List of file patterns to exclude from processing
657
+ included_dirs (List[str], optional): List of directories to include exclusively
658
+ included_files (List[str], optional): List of file patterns to include exclusively
585
659
586
660
Returns:
587
661
List[Document]: List of Document objects
588
662
"""
589
663
self .reset_database ()
590
664
self ._create_repo (repo_url_or_path , type , access_token )
591
- return self .prepare_db_index (local_ollama = local_ollama , excluded_dirs = excluded_dirs , excluded_files = excluded_files )
665
+ return self .prepare_db_index (local_ollama = local_ollama , excluded_dirs = excluded_dirs , excluded_files = excluded_files ,
666
+ included_dirs = included_dirs , included_files = included_files )
592
667
593
668
def reset_database (self ):
594
669
"""
@@ -659,14 +734,17 @@ def _create_repo(self, repo_url_or_path: str, type: str = "github", access_token
659
734
logger .error (f"Failed to create repository structure: { e } " )
660
735
raise
661
736
662
- def prepare_db_index (self , local_ollama : bool = False , excluded_dirs : List [str ] = None , excluded_files : List [str ] = None ) -> List [Document ]:
737
+ def prepare_db_index (self , local_ollama : bool = False , excluded_dirs : List [str ] = None , excluded_files : List [str ] = None ,
738
+ included_dirs : List [str ] = None , included_files : List [str ] = None ) -> List [Document ]:
663
739
"""
664
740
Prepare the indexed database for the repository.
665
741
666
742
Args:
667
743
local_ollama (bool): Whether to use local Ollama for embedding (default: False)
668
744
excluded_dirs (List[str], optional): List of directories to exclude from processing
669
745
excluded_files (List[str], optional): List of file patterns to exclude from processing
746
+ included_dirs (List[str], optional): List of directories to include exclusively
747
+ included_files (List[str], optional): List of file patterns to include exclusively
670
748
671
749
Returns:
672
750
List[Document]: List of Document objects
@@ -690,7 +768,9 @@ def prepare_db_index(self, local_ollama: bool = False, excluded_dirs: List[str]
690
768
self .repo_paths ["save_repo_dir" ],
691
769
local_ollama = local_ollama ,
692
770
excluded_dirs = excluded_dirs ,
693
- excluded_files = excluded_files
771
+ excluded_files = excluded_files ,
772
+ included_dirs = included_dirs ,
773
+ included_files = included_files
694
774
)
695
775
self .db = transform_documents_and_save_to_db (
696
776
documents , self .repo_paths ["save_db_file" ], local_ollama = local_ollama
0 commit comments