Skip to content

Commit d7afcea

Browse files
Feature: WSS. (#118)
* Feature: WSS. * Fix Lint issue. * Feature: Add include_dirs and prevent embedding errors.
1 parent ea5a9cb commit d7afcea

File tree

19 files changed

+3591
-285
lines changed

19 files changed

+3591
-285
lines changed

api/api.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import os
22
import logging
3-
from fastapi import FastAPI, HTTPException, Query, Request
3+
from fastapi import FastAPI, HTTPException, Query, Request, WebSocket
44
from fastapi.middleware.cors import CORSMiddleware
55
from fastapi.responses import JSONResponse, Response
66
from typing import List, Optional, Dict, Any, Literal
@@ -351,10 +351,14 @@ def generate_json_export(repo_url: str, pages: List[WikiPage]) -> str:
351351

352352
# Import the simplified chat implementation
353353
from api.simple_chat import chat_completions_stream
354+
from api.websocket_wiki import handle_websocket_chat
354355

355356
# Add the chat_completions_stream endpoint to the main app
356357
app.add_api_route("/chat/completions/stream", chat_completions_stream, methods=["POST"])
357358

359+
# Add the WebSocket endpoint
360+
app.add_websocket_route("/ws/chat", handle_websocket_chat)
361+
358362
# --- Wiki Cache Helper Functions ---
359363

360364
WIKI_CACHE_DIR = os.path.join(get_adalflow_default_root_path(), "wikicache")
@@ -475,7 +479,8 @@ async def root():
475479
"version": "1.0.0",
476480
"endpoints": {
477481
"Chat": [
478-
"POST /chat/completions/stream - Streaming chat completion",
482+
"POST /chat/completions/stream - Streaming chat completion (HTTP)",
483+
"WebSocket /ws/chat - WebSocket chat completion",
479484
],
480485
"Wiki": [
481486
"POST /export/wiki - Export wiki content as Markdown or JSON",

api/config/generator.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
"top_p": 0.8,
1111
"top_k": 20
1212
},
13-
"gemini-2.5-flash-preview-04-17": {
13+
"gemini-2.5-flash-preview-05-20": {
1414
"temperature": 0.7,
1515
"top_p": 0.8,
1616
"top_k": 20
@@ -56,6 +56,10 @@
5656
"temperature": 0.7,
5757
"top_p": 0.8
5858
},
59+
"deepseek/deepseek-r1": {
60+
"temperature": 0.7,
61+
"top_p": 0.8
62+
},
5963
"openai/gpt-4.1": {
6064
"temperature": 0.7,
6165
"top_p": 0.8

api/data_pipeline.py

Lines changed: 129 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,8 @@ def download_repo(repo_url: str, local_path: str, type: str = "github", access_t
117117
# Alias for backward compatibility
118118
download_github_repo = download_repo
119119

120-
def read_all_documents(path: str, local_ollama: bool = False, excluded_dirs: List[str] = None, excluded_files: List[str] = None):
120+
def read_all_documents(path: str, local_ollama: bool = False, excluded_dirs: List[str] = None, excluded_files: List[str] = None,
121+
included_dirs: List[str] = None, included_files: List[str] = None):
121122
"""
122123
Recursively reads all documents in a directory and its subdirectories.
123124
@@ -128,6 +129,10 @@ def read_all_documents(path: str, local_ollama: bool = False, excluded_dirs: Lis
128129
Overrides the default configuration if provided.
129130
excluded_files (List[str], optional): List of file patterns to exclude from processing.
130131
Overrides the default configuration if provided.
132+
included_dirs (List[str], optional): List of directories to include exclusively.
133+
When provided, only files in these directories will be processed.
134+
included_files (List[str], optional): List of file patterns to include exclusively.
135+
When provided, only files matching these patterns will be processed.
131136
132137
Returns:
133138
list: A list of Document objects with metadata.
@@ -138,52 +143,130 @@ def read_all_documents(path: str, local_ollama: bool = False, excluded_dirs: Lis
138143
".jsx", ".tsx", ".html", ".css", ".php", ".swift", ".cs"]
139144
doc_extensions = [".md", ".txt", ".rst", ".json", ".yaml", ".yml"]
140145

141-
# Always start with default excluded directories and files
142-
final_excluded_dirs = set(DEFAULT_EXCLUDED_DIRS)
143-
final_excluded_files = set(DEFAULT_EXCLUDED_FILES)
146+
# Determine filtering mode: inclusion or exclusion
147+
use_inclusion_mode = (included_dirs is not None and len(included_dirs) > 0) or (included_files is not None and len(included_files) > 0)
144148

145-
# Add any additional excluded directories from config
146-
if "file_filters" in configs and "excluded_dirs" in configs["file_filters"]:
147-
final_excluded_dirs.update(configs["file_filters"]["excluded_dirs"])
149+
if use_inclusion_mode:
150+
# Inclusion mode: only process specified directories and files
151+
final_included_dirs = set(included_dirs) if included_dirs else set()
152+
final_included_files = set(included_files) if included_files else set()
148153

149-
# Add any additional excluded files from config
150-
if "file_filters" in configs and "excluded_files" in configs["file_filters"]:
151-
final_excluded_files.update(configs["file_filters"]["excluded_files"])
154+
logger.info(f"Using inclusion mode")
155+
logger.info(f"Included directories: {list(final_included_dirs)}")
156+
logger.info(f"Included files: {list(final_included_files)}")
152157

153-
# Add any explicitly provided excluded directories and files
154-
if excluded_dirs is not None:
155-
final_excluded_dirs.update(excluded_dirs)
158+
# Convert to lists for processing
159+
included_dirs = list(final_included_dirs)
160+
included_files = list(final_included_files)
161+
excluded_dirs = []
162+
excluded_files = []
163+
else:
164+
# Exclusion mode: use default exclusions plus any additional ones
165+
final_excluded_dirs = set(DEFAULT_EXCLUDED_DIRS)
166+
final_excluded_files = set(DEFAULT_EXCLUDED_FILES)
167+
168+
# Add any additional excluded directories from config
169+
if "file_filters" in configs and "excluded_dirs" in configs["file_filters"]:
170+
final_excluded_dirs.update(configs["file_filters"]["excluded_dirs"])
171+
172+
# Add any additional excluded files from config
173+
if "file_filters" in configs and "excluded_files" in configs["file_filters"]:
174+
final_excluded_files.update(configs["file_filters"]["excluded_files"])
175+
176+
# Add any explicitly provided excluded directories and files
177+
if excluded_dirs is not None:
178+
final_excluded_dirs.update(excluded_dirs)
156179

157-
if excluded_files is not None:
158-
final_excluded_files.update(excluded_files)
180+
if excluded_files is not None:
181+
final_excluded_files.update(excluded_files)
159182

160-
# Convert back to lists for compatibility
161-
excluded_dirs = list(final_excluded_dirs)
162-
excluded_files = list(final_excluded_files)
183+
# Convert back to lists for compatibility
184+
excluded_dirs = list(final_excluded_dirs)
185+
excluded_files = list(final_excluded_files)
186+
included_dirs = []
187+
included_files = []
163188

164-
logger.info(f"Using excluded directories: {excluded_dirs}")
165-
logger.info(f"Using excluded files: {excluded_files}")
189+
logger.info(f"Using exclusion mode")
190+
logger.info(f"Excluded directories: {excluded_dirs}")
191+
logger.info(f"Excluded files: {excluded_files}")
166192

167193
logger.info(f"Reading documents from {path}")
168194

169-
# Process code files first
170-
for ext in code_extensions:
171-
files = glob.glob(f"{path}/**/*{ext}", recursive=True)
172-
for file_path in files:
173-
# Skip excluded directories and files
195+
def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List[str], included_files: List[str],
196+
excluded_dirs: List[str], excluded_files: List[str]) -> bool:
197+
"""
198+
Determine if a file should be processed based on inclusion/exclusion rules.
199+
200+
Args:
201+
file_path (str): The file path to check
202+
use_inclusion (bool): Whether to use inclusion mode
203+
included_dirs (List[str]): List of directories to include
204+
included_files (List[str]): List of files to include
205+
excluded_dirs (List[str]): List of directories to exclude
206+
excluded_files (List[str]): List of files to exclude
207+
208+
Returns:
209+
bool: True if the file should be processed, False otherwise
210+
"""
211+
file_path_parts = os.path.normpath(file_path).split(os.sep)
212+
file_name = os.path.basename(file_path)
213+
214+
if use_inclusion:
215+
# Inclusion mode: file must be in included directories or match included files
216+
is_included = False
217+
218+
# Check if file is in an included directory
219+
if included_dirs:
220+
for included in included_dirs:
221+
clean_included = included.strip("./").rstrip("/")
222+
if clean_included in file_path_parts:
223+
is_included = True
224+
break
225+
226+
# Check if file matches included file patterns
227+
if not is_included and included_files:
228+
for included_file in included_files:
229+
if file_name == included_file or file_name.endswith(included_file):
230+
is_included = True
231+
break
232+
233+
# If no inclusion rules are specified for a category, allow all files from that category
234+
if not included_dirs and not included_files:
235+
is_included = True
236+
elif not included_dirs and included_files:
237+
# Only file patterns specified, allow all directories
238+
pass # is_included is already set based on file patterns
239+
elif included_dirs and not included_files:
240+
# Only directory patterns specified, allow all files in included directories
241+
pass # is_included is already set based on directory patterns
242+
243+
return is_included
244+
else:
245+
# Exclusion mode: file must not be in excluded directories or match excluded files
174246
is_excluded = False
247+
175248
# Check if file is in an excluded directory
176-
file_path_parts = os.path.normpath(file_path).split(os.sep)
177249
for excluded in excluded_dirs:
178-
# Remove ./ prefix and trailing slash if present
179250
clean_excluded = excluded.strip("./").rstrip("/")
180-
# Check if the excluded directory is in the path components
181251
if clean_excluded in file_path_parts:
182252
is_excluded = True
183253
break
184-
if not is_excluded and any(os.path.basename(file_path) == excluded for excluded in excluded_files):
185-
is_excluded = True
186-
if is_excluded:
254+
255+
# Check if file matches excluded file patterns
256+
if not is_excluded:
257+
for excluded_file in excluded_files:
258+
if file_name == excluded_file:
259+
is_excluded = True
260+
break
261+
262+
return not is_excluded
263+
264+
# Process code files first
265+
for ext in code_extensions:
266+
files = glob.glob(f"{path}/**/*{ext}", recursive=True)
267+
for file_path in files:
268+
# Check if file should be processed based on inclusion/exclusion rules
269+
if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
187270
continue
188271

189272
try:
@@ -223,20 +306,8 @@ def read_all_documents(path: str, local_ollama: bool = False, excluded_dirs: Lis
223306
for ext in doc_extensions:
224307
files = glob.glob(f"{path}/**/*{ext}", recursive=True)
225308
for file_path in files:
226-
# Skip excluded directories and files
227-
is_excluded = False
228-
# Check if file is in an excluded directory
229-
file_path_parts = os.path.normpath(file_path).split(os.sep)
230-
for excluded in excluded_dirs:
231-
# Remove ./ prefix and trailing slash if present
232-
clean_excluded = excluded.strip("./").rstrip("/")
233-
# Check if the excluded directory is in the path components
234-
if clean_excluded in file_path_parts:
235-
is_excluded = True
236-
break
237-
if not is_excluded and any(os.path.basename(file_path) == excluded for excluded in excluded_files):
238-
is_excluded = True
239-
if is_excluded:
309+
# Check if file should be processed based on inclusion/exclusion rules
310+
if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
240311
continue
241312

242313
try:
@@ -572,7 +643,8 @@ def __init__(self):
572643
self.repo_paths = None
573644

574645
def prepare_database(self, repo_url_or_path: str, type: str = "github", access_token: str = None, local_ollama: bool = False,
575-
excluded_dirs: List[str] = None, excluded_files: List[str] = None) -> List[Document]:
646+
excluded_dirs: List[str] = None, excluded_files: List[str] = None,
647+
included_dirs: List[str] = None, included_files: List[str] = None) -> List[Document]:
576648
"""
577649
Create a new database from the repository.
578650
@@ -582,13 +654,16 @@ def prepare_database(self, repo_url_or_path: str, type: str = "github", access_t
582654
local_ollama (bool): Whether to use local Ollama for embedding (default: False)
583655
excluded_dirs (List[str], optional): List of directories to exclude from processing
584656
excluded_files (List[str], optional): List of file patterns to exclude from processing
657+
included_dirs (List[str], optional): List of directories to include exclusively
658+
included_files (List[str], optional): List of file patterns to include exclusively
585659
586660
Returns:
587661
List[Document]: List of Document objects
588662
"""
589663
self.reset_database()
590664
self._create_repo(repo_url_or_path, type, access_token)
591-
return self.prepare_db_index(local_ollama=local_ollama, excluded_dirs=excluded_dirs, excluded_files=excluded_files)
665+
return self.prepare_db_index(local_ollama=local_ollama, excluded_dirs=excluded_dirs, excluded_files=excluded_files,
666+
included_dirs=included_dirs, included_files=included_files)
592667

593668
def reset_database(self):
594669
"""
@@ -659,14 +734,17 @@ def _create_repo(self, repo_url_or_path: str, type: str = "github", access_token
659734
logger.error(f"Failed to create repository structure: {e}")
660735
raise
661736

662-
def prepare_db_index(self, local_ollama: bool = False, excluded_dirs: List[str] = None, excluded_files: List[str] = None) -> List[Document]:
737+
def prepare_db_index(self, local_ollama: bool = False, excluded_dirs: List[str] = None, excluded_files: List[str] = None,
738+
included_dirs: List[str] = None, included_files: List[str] = None) -> List[Document]:
663739
"""
664740
Prepare the indexed database for the repository.
665741
666742
Args:
667743
local_ollama (bool): Whether to use local Ollama for embedding (default: False)
668744
excluded_dirs (List[str], optional): List of directories to exclude from processing
669745
excluded_files (List[str], optional): List of file patterns to exclude from processing
746+
included_dirs (List[str], optional): List of directories to include exclusively
747+
included_files (List[str], optional): List of file patterns to include exclusively
670748
671749
Returns:
672750
List[Document]: List of Document objects
@@ -690,7 +768,9 @@ def prepare_db_index(self, local_ollama: bool = False, excluded_dirs: List[str]
690768
self.repo_paths["save_repo_dir"],
691769
local_ollama=local_ollama,
692770
excluded_dirs=excluded_dirs,
693-
excluded_files=excluded_files
771+
excluded_files=excluded_files,
772+
included_dirs=included_dirs,
773+
included_files=included_files
694774
)
695775
self.db = transform_documents_and_save_to_db(
696776
documents, self.repo_paths["save_db_file"], local_ollama=local_ollama

api/ollama_patch.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,34 @@ def __call__(self, documents: Sequence[Document]) -> Sequence[Document]:
2626
output = deepcopy(documents)
2727
logger.info(f"Processing {len(output)} documents individually for Ollama embeddings")
2828

29+
successful_docs = []
30+
expected_embedding_size = None
31+
2932
for i, doc in enumerate(tqdm(output, desc="Processing documents for Ollama embeddings")):
3033
try:
3134
# Get embedding for a single document
3235
result = self.embedder(input=doc.text)
3336
if result.data and len(result.data) > 0:
37+
embedding = result.data[0].embedding
38+
39+
# Validate embedding size consistency
40+
if expected_embedding_size is None:
41+
expected_embedding_size = len(embedding)
42+
logger.info(f"Expected embedding size set to: {expected_embedding_size}")
43+
elif len(embedding) != expected_embedding_size:
44+
file_path = getattr(doc, 'meta_data', {}).get('file_path', f'document_{i}')
45+
logger.warning(f"Document '{file_path}' has inconsistent embedding size {len(embedding)} != {expected_embedding_size}, skipping")
46+
continue
47+
3448
# Assign the embedding to the document
35-
output[i].vector = result.data[0].embedding
49+
output[i].vector = embedding
50+
successful_docs.append(output[i])
3651
else:
37-
logger.warning(f"Failed to get embedding for document {i}")
52+
file_path = getattr(doc, 'meta_data', {}).get('file_path', f'document_{i}')
53+
logger.warning(f"Failed to get embedding for document '{file_path}', skipping")
3854
except Exception as e:
39-
logger.error(f"Error processing document {i}: {e}")
55+
file_path = getattr(doc, 'meta_data', {}).get('file_path', f'document_{i}')
56+
logger.error(f"Error processing document '{file_path}': {e}, skipping")
4057

41-
return output
58+
logger.info(f"Successfully processed {len(successful_docs)}/{len(output)} documents with consistent embeddings")
59+
return successful_docs

0 commit comments

Comments
 (0)