@@ -48,11 +48,17 @@ async def get_report_info(index_name: str, report_id: str):
4848 f"abfs://{ sanitized_index_name } /{ COMMUNITY_REPORT_TABLE } " ,
4949 storage_options = get_pandas_storage_options (),
5050 )
51- # row = report_table[report_table["community"] == report_id]
52- # return ReportResponse(text=row["full_content"].values[0])
51+ # check if report_id exists in the index
52+ if not report_table ["community" ].isin ([report_id ]).any ():
53+ raise ValueError (f"Report '{ report_id } ' not found in index '{ index_name } '." )
54+ # check if multiple reports with the same id exist (should not happen)
55+ if len (report_table .loc [report_table ["community" ] == report_id ]) > 1 :
56+ raise ValueError (
57+ f"Multiple reports with id '{ report_id } ' found in index '{ index_name } '."
58+ )
5359 report_content = report_table .loc [
5460 report_table ["community" ] == report_id , "full_content"
55- ][0 ]
61+ ]. to_numpy () [0 ]
5662 return ReportResponse (text = report_content )
5763 except Exception :
5864 logger = LoggerSingleton ().get_instance ()
@@ -75,26 +81,37 @@ async def get_chunk_info(index_name: str, text_unit_id: str):
7581 validate_index_file_exist (sanitized_index_name , TEXT_UNITS_TABLE )
7682 validate_index_file_exist (sanitized_index_name , DOCUMENTS_TABLE )
7783 try :
78- text_unit_table = pd .read_parquet (
84+ text_units = pd .read_parquet (
7985 f"abfs://{ sanitized_index_name } /{ TEXT_UNITS_TABLE } " ,
8086 storage_options = get_pandas_storage_options (),
8187 )
8288 docs = pd .read_parquet (
8389 f"abfs://{ sanitized_index_name } /{ DOCUMENTS_TABLE } " ,
8490 storage_options = get_pandas_storage_options (),
8591 )
86- links = {
87- el ["id" ]: el ["title" ]
88- for el in docs [["id" , "title" ]].to_dict (orient = "records" )
89- }
90- text_unit_table ["source_doc" ] = text_unit_table ["document_ids" ].apply (
91- lambda x : links [x [0 ]]
92+ # rename columns for easy joining
93+ docs = docs [["id" , "title" ]].rename (
94+ columns = {"id" : "document_id" , "title" : "source_document" }
9295 )
93- row = text_unit_table .loc [
94- text_unit_table .chunk_id == text_unit_id , ["chunk" , "source_doc" ]
96+ # explode the 'document_ids' column so the format matches with 'document_id'
97+ text_units = text_units .explode ("document_ids" )
98+
99+ # verify that text_unit_id exists in the index
100+ if not text_units ["chunk_id" ].isin ([text_unit_id ]).any ():
101+ raise ValueError (
102+ f"Text unit '{ text_unit_id } ' not found in index '{ index_name } '."
103+ )
104+
105+ # combine tables to create a (chunk_id -> source_document) mapping
106+ merged_table = text_units .merge (
107+ docs , left_on = "document_ids" , right_on = "document_id" , how = "left"
108+ )
109+ row = merged_table .loc [
110+ merged_table ["chunk_id" ] == text_unit_id , ["chunk" , "source_document" ]
95111 ]
96112 return TextUnitResponse (
97- text = row ["chunk" ].values [0 ], source_document = row ["source_doc" ].values [0 ]
113+ text = row ["chunk" ].to_numpy ()[0 ],
114+ source_document = row ["source_document" ].to_numpy ()[0 ],
98115 )
99116 except Exception :
100117 logger = LoggerSingleton ().get_instance ()
0 commit comments