# Hard ceiling on input size for the OpenAI embedding models used below;
# oversized documents are skipped rather than truncated.
MAX_EMBEDDING_TOKENS = 8192
24
- def count_tokens (text : str , local_ollama : bool = False ) -> int :
24
+ def count_tokens (text : str , is_ollama_embedder : bool = None ) -> int :
25
25
"""
26
26
Count the number of tokens in a text string using tiktoken.
27
27
28
28
Args:
29
29
text (str): The text to count tokens for.
30
- local_ollama (bool, optional): Whether using local Ollama embeddings. Default is False.
30
+ is_ollama_embedder (bool, optional): Whether using Ollama embeddings.
31
+ If None, will be determined from configuration.
31
32
32
33
Returns:
33
34
int: The number of tokens in the text.
34
35
"""
35
36
try :
36
- if local_ollama :
37
+ # Determine if using Ollama embedder if not specified
38
+ if is_ollama_embedder is None :
39
+ from api .config import is_ollama_embedder as check_ollama
40
+ is_ollama_embedder = check_ollama ()
41
+
42
+ if is_ollama_embedder :
37
43
encoding = tiktoken .get_encoding ("cl100k_base" )
38
44
else :
39
45
encoding = tiktoken .encoding_for_model ("text-embedding-3-small" )
@@ -117,14 +123,15 @@ def download_repo(repo_url: str, local_path: str, type: str = "github", access_t
117
123
# Backward-compatible alias: older callers imported `download_github_repo`
# before the function was generalized to `download_repo`.
download_github_repo = download_repo
120
- def read_all_documents (path : str , local_ollama : bool = False , excluded_dirs : List [str ] = None , excluded_files : List [str ] = None ,
126
+ def read_all_documents (path : str , is_ollama_embedder : bool = None , excluded_dirs : List [str ] = None , excluded_files : List [str ] = None ,
121
127
included_dirs : List [str ] = None , included_files : List [str ] = None ):
122
128
"""
123
129
Recursively reads all documents in a directory and its subdirectories.
124
130
125
131
Args:
126
132
path (str): The root directory path.
127
- local_ollama (bool): Whether to use local Ollama for token counting. Default is False.
133
+ is_ollama_embedder (bool, optional): Whether using Ollama embeddings for token counting.
134
+ If None, will be determined from configuration.
128
135
excluded_dirs (List[str], optional): List of directories to exclude from processing.
129
136
Overrides the default configuration if provided.
130
137
excluded_files (List[str], optional): List of file patterns to exclude from processing.
@@ -282,7 +289,7 @@ def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List
282
289
)
283
290
284
291
# Check token count
285
- token_count = count_tokens (content , local_ollama )
292
+ token_count = count_tokens (content , is_ollama_embedder )
286
293
if token_count > MAX_EMBEDDING_TOKENS * 10 :
287
294
logger .warning (f"Skipping large file { relative_path } : Token count ({ token_count } ) exceeds limit" )
288
295
continue
@@ -316,7 +323,7 @@ def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List
316
323
relative_path = os .path .relpath (file_path , path )
317
324
318
325
# Check token count
319
- token_count = count_tokens (content , local_ollama )
326
+ token_count = count_tokens (content , is_ollama_embedder )
320
327
if token_count > MAX_EMBEDDING_TOKENS :
321
328
logger .warning (f"Skipping large file { relative_path } : Token count ({ token_count } ) exceeds limit" )
322
329
continue
@@ -339,33 +346,43 @@ def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List
339
346
logger .info (f"Found { len (documents )} documents" )
340
347
return documents
341
348
342
- def prepare_data_pipeline (local_ollama : bool = False ):
349
+ def prepare_data_pipeline (is_ollama_embedder : bool = None ):
343
350
"""
344
351
Creates and returns the data transformation pipeline.
345
352
346
353
Args:
347
- local_ollama (bool): Whether to use local Ollama for embedding (default: False)
354
+ is_ollama_embedder (bool, optional): Whether to use Ollama for embedding.
355
+ If None, will be determined from configuration.
348
356
349
357
Returns:
350
358
adal.Sequential: The data transformation pipeline
351
359
"""
360
+ from api .config import get_embedder_config , is_ollama_embedder as check_ollama
361
+
362
+ # Determine if using Ollama embedder if not specified
363
+ if is_ollama_embedder is None :
364
+ is_ollama_embedder = check_ollama ()
365
+
352
366
splitter = TextSplitter (** configs ["text_splitter" ])
367
+ embedder_config = get_embedder_config ()
353
368
354
- if local_ollama :
355
- # Use Ollama embedder
356
- embedder = adal .Embedder (
357
- model_client = configs ["embedder_ollama" ]["model_client" ](),
358
- model_kwargs = configs ["embedder_ollama" ]["model_kwargs" ],
359
- )
369
+ if not embedder_config :
370
+ raise ValueError ("No embedder configuration found" )
371
+
372
+ # Create embedder based on configuration
373
+ embedder = adal .Embedder (
374
+ model_client = embedder_config ["model_client" ](),
375
+ model_kwargs = embedder_config ["model_kwargs" ],
376
+ )
377
+
378
+ if is_ollama_embedder :
379
+ # Use Ollama document processor for single-document processing
360
380
embedder_transformer = OllamaDocumentProcessor (embedder = embedder )
361
381
else :
362
- # Use OpenAI embedder
363
- embedder = adal .Embedder (
364
- model_client = configs ["embedder" ]["model_client" ](),
365
- model_kwargs = configs ["embedder" ]["model_kwargs" ],
366
- )
382
+ # Use batch processing for other embedders
383
+ batch_size = embedder_config .get ("batch_size" , 500 )
367
384
embedder_transformer = ToEmbeddings (
368
- embedder = embedder , batch_size = configs [ "embedder" ][ " batch_size" ]
385
+ embedder = embedder , batch_size = batch_size
369
386
)
370
387
371
388
data_transformer = adal .Sequential (
@@ -374,18 +391,19 @@ def prepare_data_pipeline(local_ollama: bool = False):
374
391
return data_transformer
375
392
376
393
def transform_documents_and_save_to_db (
377
- documents : List [Document ], db_path : str , local_ollama : bool = False
394
+ documents : List [Document ], db_path : str , is_ollama_embedder : bool = None
378
395
) -> LocalDB :
379
396
"""
380
397
Transforms a list of documents and saves them to a local database.
381
398
382
399
Args:
383
400
documents (list): A list of `Document` objects.
384
401
db_path (str): The path to the local database file.
385
- local_ollama (bool): Whether to use local Ollama for embedding (default: False)
402
+ is_ollama_embedder (bool, optional): Whether to use Ollama for embedding.
403
+ If None, will be determined from configuration.
386
404
"""
387
405
# Get the data transformer
388
- data_transformer = prepare_data_pipeline (local_ollama )
406
+ data_transformer = prepare_data_pipeline (is_ollama_embedder )
389
407
390
408
# Save the documents to a local database
391
409
db = LocalDB ()
@@ -642,7 +660,7 @@ def __init__(self):
642
660
self .repo_url_or_path = None
643
661
self .repo_paths = None
644
662
645
- def prepare_database (self , repo_url_or_path : str , type : str = "github" , access_token : str = None , local_ollama : bool = False ,
663
+ def prepare_database (self , repo_url_or_path : str , type : str = "github" , access_token : str = None , is_ollama_embedder : bool = None ,
646
664
excluded_dirs : List [str ] = None , excluded_files : List [str ] = None ,
647
665
included_dirs : List [str ] = None , included_files : List [str ] = None ) -> List [Document ]:
648
666
"""
@@ -651,7 +669,8 @@ def prepare_database(self, repo_url_or_path: str, type: str = "github", access_t
651
669
Args:
652
670
repo_url_or_path (str): The URL or local path of the repository
653
671
access_token (str, optional): Access token for private repositories
654
- local_ollama (bool): Whether to use local Ollama for embedding (default: False)
672
+ is_ollama_embedder (bool, optional): Whether to use Ollama for embedding.
673
+ If None, will be determined from configuration.
655
674
excluded_dirs (List[str], optional): List of directories to exclude from processing
656
675
excluded_files (List[str], optional): List of file patterns to exclude from processing
657
676
included_dirs (List[str], optional): List of directories to include exclusively
@@ -662,7 +681,7 @@ def prepare_database(self, repo_url_or_path: str, type: str = "github", access_t
662
681
"""
663
682
self .reset_database ()
664
683
self ._create_repo (repo_url_or_path , type , access_token )
665
- return self .prepare_db_index (local_ollama = local_ollama , excluded_dirs = excluded_dirs , excluded_files = excluded_files ,
684
+ return self .prepare_db_index (is_ollama_embedder = is_ollama_embedder , excluded_dirs = excluded_dirs , excluded_files = excluded_files ,
666
685
included_dirs = included_dirs , included_files = included_files )
667
686
668
687
def reset_database (self ):
@@ -734,13 +753,14 @@ def _create_repo(self, repo_url_or_path: str, type: str = "github", access_token
734
753
logger .error (f"Failed to create repository structure: { e } " )
735
754
raise
736
755
737
- def prepare_db_index (self , local_ollama : bool = False , excluded_dirs : List [str ] = None , excluded_files : List [str ] = None ,
756
+ def prepare_db_index (self , is_ollama_embedder : bool = None , excluded_dirs : List [str ] = None , excluded_files : List [str ] = None ,
738
757
included_dirs : List [str ] = None , included_files : List [str ] = None ) -> List [Document ]:
739
758
"""
740
759
Prepare the indexed database for the repository.
741
760
742
761
Args:
743
- local_ollama (bool): Whether to use local Ollama for embedding (default: False)
762
+ is_ollama_embedder (bool, optional): Whether to use Ollama for embedding.
763
+ If None, will be determined from configuration.
744
764
excluded_dirs (List[str], optional): List of directories to exclude from processing
745
765
excluded_files (List[str], optional): List of file patterns to exclude from processing
746
766
included_dirs (List[str], optional): List of directories to include exclusively
@@ -766,14 +786,14 @@ def prepare_db_index(self, local_ollama: bool = False, excluded_dirs: List[str]
766
786
logger .info ("Creating new database..." )
767
787
documents = read_all_documents (
768
788
self .repo_paths ["save_repo_dir" ],
769
- local_ollama = local_ollama ,
789
+ is_ollama_embedder = is_ollama_embedder ,
770
790
excluded_dirs = excluded_dirs ,
771
791
excluded_files = excluded_files ,
772
792
included_dirs = included_dirs ,
773
793
included_files = included_files
774
794
)
775
795
self .db = transform_documents_and_save_to_db (
776
- documents , self .repo_paths ["save_db_file" ], local_ollama = local_ollama
796
+ documents , self .repo_paths ["save_db_file" ], is_ollama_embedder = is_ollama_embedder
777
797
)
778
798
logger .info (f"Total documents: { len (documents )} " )
779
799
transformed_docs = self .db .get_transformed_data (key = "split_and_embed" )
0 commit comments