
Commit 52ed5ef

committed
1 parent 91c27e8 commit 52ed5ef

32 files changed (+1938, -192 lines)

ai/gen-ai-agents/custom_rag_agent/agent_state.py

Lines changed: 3 additions & 0 deletions
@@ -40,6 +40,9 @@ class State(TypedDict):
     standalone_question: str = ""
 
     # similarity_search
+    # 30/06: modified, now they're a dict with
+    # page_content and metadata
+    # populated with docs_serializable (utils.py)
     retriever_docs: Optional[list] = []
     # reranker
     reranker_docs: Optional[list] = []
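
Note on the State change above: a minimal sketch of the assumed shape of the serialized documents referenced by the comment. The docs_serializable body below is illustrative only; the real helper lives in utils.py and is not part of this diff.

def docs_serializable(docs: list) -> list:
    """Assumed behavior: convert LangChain Documents into plain dicts storable in the graph State."""
    return [
        {"page_content": doc.page_content, "metadata": doc.metadata}
        for doc in docs
    ]

# Assumed shape of one entry in retriever_docs / reranker_docs:
# {"page_content": "# Doc. title: ...\n<chunk text>",
#  "metadata": {"source": "mydoc.pdf", "page_label": "3"}}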

ai/gen-ai-agents/custom_rag_agent/answer_generator.py

Lines changed: 5 additions & 6 deletions
@@ -1,7 +1,7 @@
 """
 File name: answer_generator.py
 Author: Luigi Saetta
-Date last modified: 2025-03-31
+Date last modified: 2025-04-02
 Python Version: 3.11
 
 Description:
@@ -67,10 +67,8 @@ def build_context_for_llm(self, docs: list):
 
         docs: list[Documents]
         """
-        _context = ""
-
-        for doc in docs:
-            _context += doc.page_content + "\n\n"
+        # more Pythonic
+        _context = "\n\n".join(doc["page_content"] for doc in docs)
 
         return _context
 
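
For context, a standalone sketch of the new join-based context building, using the dict shape described in agent_state.py (the sample docs below are made up):

# Sample data, not from the repo: two serialized chunks
docs = [
    {"page_content": "# Doc. title: ai_book\nFirst chunk text.", "metadata": {"source": "ai_book.pdf"}},
    {"page_content": "# Doc. title: ai_book\nSecond chunk text.", "metadata": {"source": "ai_book.pdf"}},
]

# same expression as in build_context_for_llm: chunks separated by a blank line
context = "\n\n".join(doc["page_content"] for doc in docs)
print(context)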

@@ -79,7 +77,7 @@ def invoke(self, input: State, config=None, **kwargs):
         """
         Generate the final answer
         """
-        # get the config
+        # get the model_id from config
         model_id = config["configurable"]["model_id"]
 
         if config["configurable"]["main_language"] in self.dict_languages:
@@ -102,6 +100,7 @@ def invoke(self, input: State, config=None, **kwargs):
         try:
             llm = get_llm(model_id=model_id)
 
+            # docs are returned from the reranker
             _context = self.build_context_for_llm(input["reranker_docs"])
 
             system_prompt = PromptTemplate(

ai/gen-ai-agents/custom_rag_agent/assistant_ui_langgraph.py

Lines changed: 12 additions & 10 deletions
@@ -2,7 +2,7 @@
 File name: assistant_ui.py
 Author: Luigi Saetta
 Date created: 2024-12-04
-Date last modified: 2025-03-31
+Date last modified: 2025-07-01
 Python Version: 3.11
 
 Description:
@@ -15,7 +15,7 @@
     This code is released under the MIT License.
 
 Notes:
-    This is part of a demo fro a RAG solution implemented
+    This is part of a demo for a RAG solution implemented
     using LangGraph
 
 Warnings:
@@ -38,7 +38,7 @@
 from transport import http_transport
 from utils import get_console_logger
 
-# changed to better manage ENABLE_TRACING
+# changed to better manage ENABLE_TRACING (can be enabled from UI)
 import config
 
 # Constant
@@ -142,13 +142,14 @@ def register_feedback():
 
 st.sidebar.header("Options")
 
+st.sidebar.text_input(label="Region", value=config.REGION, disabled=True)
+
 # the collection used for semantic search
 st.session_state.collection_name = st.sidebar.selectbox(
     "Collection name",
     config.COLLECTION_LIST,
 )
 
-# add the choice of LLM (not used for now)
 st.session_state.main_language = st.sidebar.selectbox(
     "Select the language for the answer",
     config.LANGUAGE_LIST,
@@ -203,11 +204,11 @@ def register_feedback():
             encoding=Encoding.V2_JSON,
             sample_rate=100,
         ) as span:
-            # loop to manage streaming
             # set the agent config
             agent_config = {
                 "configurable": {
                     "model_id": st.session_state.model_id,
+                    "embed_model_type": config.EMBED_MODEL_TYPE,
                     "enable_reranker": st.session_state.enable_reranker,
                     "enable_tracing": config.ENABLE_TRACING,
                     "main_language": st.session_state.main_language,
@@ -219,6 +220,7 @@ def register_feedback():
             if config.DEBUG:
                 logger.info("Agent config: %s", agent_config)
 
+            # loop to manage streaming
             for event in st.session_state.workflow.stream(
                 input_state,
                 config=agent_config,
@@ -248,13 +250,13 @@ def register_feedback():
             # Stream
             with st.chat_message(ASSISTANT):
                 response_container = st.empty()
-                full_response = ""
+                FULL_RESPONSE = ""
 
                 for chunk in answer_generator:
-                    full_response += chunk.content
-                    response_container.markdown(full_response + "▌")
+                    FULL_RESPONSE += chunk.content
+                    response_container.markdown(FULL_RESPONSE + "▌")
 
-                response_container.markdown(full_response)
+                response_container.markdown(FULL_RESPONSE)
 
             elapsed_time = round((time.time() - time_start), 1)
             logger.info("Elapsed time: %s sec.", elapsed_time)
@@ -268,7 +270,7 @@ def register_feedback():
 
             # Add user/assistant message to chat history
             add_to_chat_history(HumanMessage(content=question))
-            add_to_chat_history(AIMessage(content=full_response))
+            add_to_chat_history(AIMessage(content=FULL_RESPONSE))
 
             # get the feedback
             if st.session_state.get_feedback:

ai/gen-ai-agents/custom_rag_agent/bm25_search.py

Lines changed: 36 additions & 16 deletions
@@ -57,17 +57,22 @@ def fetch_text_data(self):
         cursor.execute(query)
 
         while True:
-            rows = cursor.fetchmany(self.batch_size)  # Fetch records in batches
+            # Fetch records in batches
+            rows = cursor.fetchmany(self.batch_size)
             if not rows:
-                break  # Exit loop when no more data
+                # Exit loop when no more data
+                break
 
             for row in rows:
-                lob_data = row[0]  # This is a CLOB object
+                # This is a CLOB object
+                lob_data = row[0]
 
                 if isinstance(lob_data, oracledb.LOB):
-                    _results.append(lob_data.read())  # Read LOB content
+                    # Read LOB content
+                    _results.append(lob_data.read())
                 else:
-                    _results.append(str(lob_data))  # Fallback for non-LOB data
+                    # Fallback for non-LOB data
+                    _results.append(str(lob_data))
 
         return _results
 

@@ -116,18 +121,33 @@ def search(self, query, top_n=5):
 
 # Example Usage:
 # credential are packed in CONNECT_ARGS
-table_name = "BOOKS"
-text_column = "TEXT"
 
-# create the index
-bm25_search = BM25OracleSearch(table_name, text_column)
 
-questions = ["Chi è Luigi Saetta?", "What are the main innovation produced by GPT-4?"]
+def run_test():
+    """
+    To run a quick test.
+    """
+    table_name = "BOOKS"
+    text_column = "TEXT"
+
+    # create the index
+    bm25_search = BM25OracleSearch(table_name, text_column)
+
+    questions = [
+        "Chi è Luigi Saetta?",
+        "What are the main innovation produced by GPT-4?",
+    ]
+
+    for _question in questions:
+        results = bm25_search.search(_question, top_n=2)
+
+        # Print search results
+        for text, score in results:
+            print(f"Score: {score:.2f} - Text: {text}")
+            print("")
 
-for _question in questions:
-    results = bm25_search.search(_question, top_n=2)
 
-    # Print search results
-    for text, score in results:
-        print(f"Score: {score:.2f} - Text: {text}")
-        print("")
+#
+# Main
+#
+run_test()

ai/gen-ai-agents/custom_rag_agent/chunk_index_utils.py

Lines changed: 35 additions & 13 deletions
@@ -24,6 +24,17 @@
 logger = get_console_logger()
 
 
+def get_chunk_header(file_path):
+    """
+    Generate an header for the chunk.
+    """
+    doc_name = remove_path_from_ref(file_path)
+    # split to remove the extension
+    doc_title = doc_name.split(".")[0]
+
+    return f"# Doc. title: {doc_title}\n", doc_name
+
+
 def get_recursive_text_splitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
     """
     return a recursive text splitter
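
A quick usage sketch of the new helper, assuming remove_path_from_ref strips the directory part of the path (the file name below is made up):

# Illustrative call only
chunk_header, doc_name = get_chunk_header("/data/books/oracle_ai_guide.pdf")
# chunk_header -> "# Doc. title: oracle_ai_guide\n"
# doc_name     -> "oracle_ai_guide.pdf"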
@@ -39,7 +50,15 @@ def get_recursive_text_splitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
 
 def load_and_split_pdf(book_path, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
     """
-    load a single book
+    Loads and splits a PDF document into chunks using a recursive character text splitter.
+
+    Args:
+        book_path (str): The file path of the PDF document.
+        chunk_size (int): Size of each text chunk.
+        chunk_overlap (int): Overlap between chunks.
+
+    Returns:
+        List[Document]: A list of LangChain Document objects with metadata.
     """
     text_splitter = get_recursive_text_splitter(chunk_size, chunk_overlap)
 

@@ -50,28 +69,33 @@ def load_and_split_pdf(book_path, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
     chunk_header = ""
 
     if len(docs) > 0:
-        doc_name = remove_path_from_ref(book_path)
-        # split to remove the extension
-        doc_title = doc_name.split(".")[0]
-        chunk_header = f"# Doc. title: {doc_title}\n"
+        chunk_header, _ = get_chunk_header(book_path)
 
     # remove path from source and reduce the metadata (16/03/2025)
     for doc in docs:
         # add more context to the chunk
         doc.page_content = chunk_header + doc.page_content
         doc.metadata = {
             "source": remove_path_from_ref(book_path),
-            "page_label": doc.metadata["page_label"],
+            "page_label": doc.metadata.get("page_label", ""),
         }
 
-    logger.info("Loaded %s chunks...", len(docs))
+    logger.info("Successfully loaded and split %d chunks from %s", len(docs), book_path)
 
     return docs
 
 
 def load_and_split_docx(file_path, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
     """
-    To load docx files
+    Loads and splits a docx document into chunks using a recursive character text splitter.
+
+    Args:
+        file_path (str): The file path of the document.
+        chunk_size (int): Size of each text chunk.
+        chunk_overlap (int): Overlap between chunks.
+
+    Returns:
+        List[Document]: A list of LangChain Document objects with metadata.
     """
     loader = UnstructuredLoader(file_path)
     docs = loader.load()
@@ -80,12 +104,10 @@ def load_and_split_docx(file_path, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
     grouped_text = defaultdict(list)
 
     chunk_header = ""
+    doc_name = ""
 
     if len(docs) > 0:
-        doc_name = remove_path_from_ref(file_path)
-        # split to remove the extension
-        doc_title = doc_name.split(".")[0]
-        chunk_header = f"# Doc. title: {doc_title}\n"
+        chunk_header, doc_name = get_chunk_header(file_path)
 
     for doc in docs:
         # fallback to 0 if not available
@@ -115,6 +137,6 @@ def load_and_split_docx(file_path, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
             )
         )
 
-    logger.info("Loaded %s chunks...", len(final_chunks))
+    logger.info("Successfully loaded and split %d chunks from %s", len(docs), file_path)
 
     return final_chunks
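
A hedged usage sketch tying the changes in this file together: loading a PDF and inspecting the reduced metadata kept on each chunk (the file name is made up; page_label now falls back to "" thanks to the .get change above):

# Hypothetical usage
docs = load_and_split_pdf("annual_report.pdf")

for doc in docs[:2]:
    # only "source" and "page_label" are kept in the metadata
    print(doc.metadata["source"], doc.metadata["page_label"])
    print(doc.page_content[:80])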
