Skip to content

Commit 57c7756

Browse files
Enhanced Ollama Embedding Configuration + List View improvements (#157)
* feature/configurable-ollama-embedder
* Add search functionality and card/list view toggle to processed projects
1 parent a619ebd commit 57c7756

File tree

14 files changed

+530
-131
lines changed

14 files changed

+530
-131
lines changed

api/config.py

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,35 @@ def load_embedder_config():
108108

109109
return embedder_config
110110

111+
def get_embedder_config():
112+
"""
113+
Get the current embedder configuration.
114+
115+
Returns:
116+
dict: The embedder configuration with model_client resolved
117+
"""
118+
return configs.get("embedder", {})
119+
120+
def is_ollama_embedder():
121+
"""
122+
Check if the current embedder configuration uses OllamaClient.
123+
124+
Returns:
125+
bool: True if using OllamaClient, False otherwise
126+
"""
127+
embedder_config = get_embedder_config()
128+
if not embedder_config:
129+
return False
130+
131+
# Check if model_client is OllamaClient
132+
model_client = embedder_config.get("model_client")
133+
if model_client:
134+
return model_client.__name__ == "OllamaClient"
135+
136+
# Fallback: check client_class string
137+
client_class = embedder_config.get("client_class", "")
138+
return client_class == "OllamaClient"
139+
111140
# Load repository and file filters configuration
112141
def load_repo_config():
113142
return load_json_config("repo.json")

api/data_pipeline.py

Lines changed: 51 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -21,19 +21,25 @@
2121
# Maximum token limit for OpenAI embedding models
2222
MAX_EMBEDDING_TOKENS = 8192
2323

24-
def count_tokens(text: str, local_ollama: bool = False) -> int:
24+
def count_tokens(text: str, is_ollama_embedder: bool = None) -> int:
2525
"""
2626
Count the number of tokens in a text string using tiktoken.
2727
2828
Args:
2929
text (str): The text to count tokens for.
30-
local_ollama (bool, optional): Whether using local Ollama embeddings. Default is False.
30+
is_ollama_embedder (bool, optional): Whether using Ollama embeddings.
31+
If None, will be determined from configuration.
3132
3233
Returns:
3334
int: The number of tokens in the text.
3435
"""
3536
try:
36-
if local_ollama:
37+
# Determine if using Ollama embedder if not specified
38+
if is_ollama_embedder is None:
39+
from api.config import is_ollama_embedder as check_ollama
40+
is_ollama_embedder = check_ollama()
41+
42+
if is_ollama_embedder:
3743
encoding = tiktoken.get_encoding("cl100k_base")
3844
else:
3945
encoding = tiktoken.encoding_for_model("text-embedding-3-small")
@@ -117,14 +123,15 @@ def download_repo(repo_url: str, local_path: str, type: str = "github", access_t
117123
# Alias for backward compatibility
118124
download_github_repo = download_repo
119125

120-
def read_all_documents(path: str, local_ollama: bool = False, excluded_dirs: List[str] = None, excluded_files: List[str] = None,
126+
def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs: List[str] = None, excluded_files: List[str] = None,
121127
included_dirs: List[str] = None, included_files: List[str] = None):
122128
"""
123129
Recursively reads all documents in a directory and its subdirectories.
124130
125131
Args:
126132
path (str): The root directory path.
127-
local_ollama (bool): Whether to use local Ollama for token counting. Default is False.
133+
is_ollama_embedder (bool, optional): Whether using Ollama embeddings for token counting.
134+
If None, will be determined from configuration.
128135
excluded_dirs (List[str], optional): List of directories to exclude from processing.
129136
Overrides the default configuration if provided.
130137
excluded_files (List[str], optional): List of file patterns to exclude from processing.
@@ -282,7 +289,7 @@ def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List
282289
)
283290

284291
# Check token count
285-
token_count = count_tokens(content, local_ollama)
292+
token_count = count_tokens(content, is_ollama_embedder)
286293
if token_count > MAX_EMBEDDING_TOKENS * 10:
287294
logger.warning(f"Skipping large file {relative_path}: Token count ({token_count}) exceeds limit")
288295
continue
@@ -316,7 +323,7 @@ def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List
316323
relative_path = os.path.relpath(file_path, path)
317324

318325
# Check token count
319-
token_count = count_tokens(content, local_ollama)
326+
token_count = count_tokens(content, is_ollama_embedder)
320327
if token_count > MAX_EMBEDDING_TOKENS:
321328
logger.warning(f"Skipping large file {relative_path}: Token count ({token_count}) exceeds limit")
322329
continue
@@ -339,33 +346,43 @@ def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List
339346
logger.info(f"Found {len(documents)} documents")
340347
return documents
341348

342-
def prepare_data_pipeline(local_ollama: bool = False):
349+
def prepare_data_pipeline(is_ollama_embedder: bool = None):
343350
"""
344351
Creates and returns the data transformation pipeline.
345352
346353
Args:
347-
local_ollama (bool): Whether to use local Ollama for embedding (default: False)
354+
is_ollama_embedder (bool, optional): Whether to use Ollama for embedding.
355+
If None, will be determined from configuration.
348356
349357
Returns:
350358
adal.Sequential: The data transformation pipeline
351359
"""
360+
from api.config import get_embedder_config, is_ollama_embedder as check_ollama
361+
362+
# Determine if using Ollama embedder if not specified
363+
if is_ollama_embedder is None:
364+
is_ollama_embedder = check_ollama()
365+
352366
splitter = TextSplitter(**configs["text_splitter"])
367+
embedder_config = get_embedder_config()
353368

354-
if local_ollama:
355-
# Use Ollama embedder
356-
embedder = adal.Embedder(
357-
model_client=configs["embedder_ollama"]["model_client"](),
358-
model_kwargs=configs["embedder_ollama"]["model_kwargs"],
359-
)
369+
if not embedder_config:
370+
raise ValueError("No embedder configuration found")
371+
372+
# Create embedder based on configuration
373+
embedder = adal.Embedder(
374+
model_client=embedder_config["model_client"](),
375+
model_kwargs=embedder_config["model_kwargs"],
376+
)
377+
378+
if is_ollama_embedder:
379+
# Use Ollama document processor for single-document processing
360380
embedder_transformer = OllamaDocumentProcessor(embedder=embedder)
361381
else:
362-
# Use OpenAI embedder
363-
embedder = adal.Embedder(
364-
model_client=configs["embedder"]["model_client"](),
365-
model_kwargs=configs["embedder"]["model_kwargs"],
366-
)
382+
# Use batch processing for other embedders
383+
batch_size = embedder_config.get("batch_size", 500)
367384
embedder_transformer = ToEmbeddings(
368-
embedder=embedder, batch_size=configs["embedder"]["batch_size"]
385+
embedder=embedder, batch_size=batch_size
369386
)
370387

371388
data_transformer = adal.Sequential(
@@ -374,18 +391,19 @@ def prepare_data_pipeline(local_ollama: bool = False):
374391
return data_transformer
375392

376393
def transform_documents_and_save_to_db(
377-
documents: List[Document], db_path: str, local_ollama: bool = False
394+
documents: List[Document], db_path: str, is_ollama_embedder: bool = None
378395
) -> LocalDB:
379396
"""
380397
Transforms a list of documents and saves them to a local database.
381398
382399
Args:
383400
documents (list): A list of `Document` objects.
384401
db_path (str): The path to the local database file.
385-
local_ollama (bool): Whether to use local Ollama for embedding (default: False)
402+
is_ollama_embedder (bool, optional): Whether to use Ollama for embedding.
403+
If None, will be determined from configuration.
386404
"""
387405
# Get the data transformer
388-
data_transformer = prepare_data_pipeline(local_ollama)
406+
data_transformer = prepare_data_pipeline(is_ollama_embedder)
389407

390408
# Save the documents to a local database
391409
db = LocalDB()
@@ -642,7 +660,7 @@ def __init__(self):
642660
self.repo_url_or_path = None
643661
self.repo_paths = None
644662

645-
def prepare_database(self, repo_url_or_path: str, type: str = "github", access_token: str = None, local_ollama: bool = False,
663+
def prepare_database(self, repo_url_or_path: str, type: str = "github", access_token: str = None, is_ollama_embedder: bool = None,
646664
excluded_dirs: List[str] = None, excluded_files: List[str] = None,
647665
included_dirs: List[str] = None, included_files: List[str] = None) -> List[Document]:
648666
"""
@@ -651,7 +669,8 @@ def prepare_database(self, repo_url_or_path: str, type: str = "github", access_t
651669
Args:
652670
repo_url_or_path (str): The URL or local path of the repository
653671
access_token (str, optional): Access token for private repositories
654-
local_ollama (bool): Whether to use local Ollama for embedding (default: False)
672+
is_ollama_embedder (bool, optional): Whether to use Ollama for embedding.
673+
If None, will be determined from configuration.
655674
excluded_dirs (List[str], optional): List of directories to exclude from processing
656675
excluded_files (List[str], optional): List of file patterns to exclude from processing
657676
included_dirs (List[str], optional): List of directories to include exclusively
@@ -662,7 +681,7 @@ def prepare_database(self, repo_url_or_path: str, type: str = "github", access_t
662681
"""
663682
self.reset_database()
664683
self._create_repo(repo_url_or_path, type, access_token)
665-
return self.prepare_db_index(local_ollama=local_ollama, excluded_dirs=excluded_dirs, excluded_files=excluded_files,
684+
return self.prepare_db_index(is_ollama_embedder=is_ollama_embedder, excluded_dirs=excluded_dirs, excluded_files=excluded_files,
666685
included_dirs=included_dirs, included_files=included_files)
667686

668687
def reset_database(self):
@@ -734,13 +753,14 @@ def _create_repo(self, repo_url_or_path: str, type: str = "github", access_token
734753
logger.error(f"Failed to create repository structure: {e}")
735754
raise
736755

737-
def prepare_db_index(self, local_ollama: bool = False, excluded_dirs: List[str] = None, excluded_files: List[str] = None,
756+
def prepare_db_index(self, is_ollama_embedder: bool = None, excluded_dirs: List[str] = None, excluded_files: List[str] = None,
738757
included_dirs: List[str] = None, included_files: List[str] = None) -> List[Document]:
739758
"""
740759
Prepare the indexed database for the repository.
741760
742761
Args:
743-
local_ollama (bool): Whether to use local Ollama for embedding (default: False)
762+
is_ollama_embedder (bool, optional): Whether to use Ollama for embedding.
763+
If None, will be determined from configuration.
744764
excluded_dirs (List[str], optional): List of directories to exclude from processing
745765
excluded_files (List[str], optional): List of file patterns to exclude from processing
746766
included_dirs (List[str], optional): List of directories to include exclusively
@@ -766,14 +786,14 @@ def prepare_db_index(self, local_ollama: bool = False, excluded_dirs: List[str]
766786
logger.info("Creating new database...")
767787
documents = read_all_documents(
768788
self.repo_paths["save_repo_dir"],
769-
local_ollama=local_ollama,
789+
is_ollama_embedder=is_ollama_embedder,
770790
excluded_dirs=excluded_dirs,
771791
excluded_files=excluded_files,
772792
included_dirs=included_dirs,
773793
included_files=included_files
774794
)
775795
self.db = transform_documents_and_save_to_db(
776-
documents, self.repo_paths["save_db_file"], local_ollama=local_ollama
796+
documents, self.repo_paths["save_db_file"], is_ollama_embedder=is_ollama_embedder
777797
)
778798
logger.info(f"Total documents: {len(documents)}")
779799
transformed_docs = self.db.get_transformed_data(key="split_and_embed")

api/rag.py

Lines changed: 17 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -218,15 +218,20 @@ def __init__(self, provider="google", model=None, use_s3: bool = False): # noqa
218218

219219
self.provider = provider
220220
self.model = model
221-
self.local_ollama = provider == "ollama"
221+
222+
# Import the helper functions
223+
from api.config import get_embedder_config, is_ollama_embedder
224+
225+
# Determine if we're using Ollama embedder based on configuration
226+
self.is_ollama_embedder = is_ollama_embedder()
222227

223228
# Initialize components
224229
self.memory = Memory()
225230

226-
if self.local_ollama:
227-
embedder_config = configs["embedder_ollama"]
228-
else:
229-
embedder_config = configs["embedder"]
231+
# Get embedder configuration
232+
embedder_config = get_embedder_config()
233+
if not embedder_config:
234+
raise ValueError("No embedder configuration found")
230235

231236
# --- Initialize Embedder ---
232237
self.embedder = adal.Embedder(
@@ -242,7 +247,9 @@ def single_string_embedder(query):
242247
raise ValueError("Ollama embedder only supports a single string")
243248
query = query[0]
244249
return self.embedder(input=query)
245-
self.query_embedder = single_string_embedder
250+
251+
# Use single string embedder for Ollama, regular embedder for others
252+
self.query_embedder = single_string_embedder if self.is_ollama_embedder else self.embedder
246253

247254
self.initialize_db_manager()
248255

@@ -402,7 +409,7 @@ def prepare_retriever(self, repo_url_or_path: str, type: str = "github", access_
402409
repo_url_or_path,
403410
type,
404411
access_token,
405-
local_ollama=self.local_ollama,
412+
is_ollama_embedder=self.is_ollama_embedder,
406413
excluded_dirs=excluded_dirs,
407414
excluded_files=excluded_files,
408415
included_dirs=included_dirs,
@@ -419,10 +426,11 @@ def prepare_retriever(self, repo_url_or_path: str, type: str = "github", access_
419426
logger.info(f"Using {len(self.transformed_docs)} documents with valid embeddings for retrieval")
420427

421428
try:
422-
retreive_embedder = self.query_embedder if self.local_ollama else self.embedder
429+
# Use the appropriate embedder for retrieval
430+
retrieve_embedder = self.query_embedder if self.is_ollama_embedder else self.embedder
423431
self.retriever = FAISSRetriever(
424432
**configs["retriever"],
425-
embedder=retreive_embedder,
433+
embedder=retrieve_embedder,
426434
documents=self.transformed_docs,
427435
document_map_func=lambda doc: doc.vector,
428436
)

src/app/globals.css

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@
1414
--card-bg: #fffaf0; /* Slightly warmer than background */
1515
--highlight: #e8927c; /* Soft coral (Akane) */
1616
--muted: #a59e8c; /* Soft gray-brown (Nezumi) */
17+
--link-color: #7c5aa0; /* Slightly darker purple for links */
1718
}
1819

1920
html[data-theme='dark'] {
@@ -27,6 +28,7 @@ html[data-theme='dark'] {
2728
--card-bg: #222222; /* Slightly lighter than background */
2829
--highlight: #e57373; /* Soft red */
2930
--muted: #8c8c8c; /* Muted gray */
31+
--link-color: #b19cd9; /* Lighter purple for dark mode links */
3032
}
3133

3234
@theme inline {
@@ -117,3 +119,25 @@ html[data-theme='dark'] .paper-texture {
117119
.card-japanese:hover {
118120
box-shadow: 0 4px 12px var(--shadow-color);
119121
}
122+
123+
/* Line clamp utilities */
124+
.line-clamp-1 {
125+
overflow: hidden;
126+
display: -webkit-box;
127+
-webkit-box-orient: vertical;
128+
-webkit-line-clamp: 1;
129+
}
130+
131+
.line-clamp-2 {
132+
overflow: hidden;
133+
display: -webkit-box;
134+
-webkit-box-orient: vertical;
135+
-webkit-line-clamp: 2;
136+
}
137+
138+
.line-clamp-3 {
139+
overflow: hidden;
140+
display: -webkit-box;
141+
-webkit-box-orient: vertical;
142+
-webkit-line-clamp: 3;
143+
}

0 commit comments

Comments
 (0)