
Commit 4de5d8a: use synthetic dataset

1 parent 7402d28

22 files changed: +447 -397 lines

backend/manage-indexing-jobs.py

Lines changed: 1 addition & 1 deletion

@@ -20,8 +20,8 @@
 from src.api.azure_clients import AzureClientManager
 from src.api.common import sanitize_name
 from src.logger.logger_singleton import LoggerSingleton
-from src.models import PipelineJob
 from src.typing.pipeline import PipelineJobState
+from src.utils.pipeline import PipelineJob
 
 
 def schedule_indexing_job(index_name: str):

backend/src/api/index.py

Lines changed: 1 addition & 1 deletion

@@ -44,9 +44,9 @@
     BaseResponse,
     IndexNameList,
     IndexStatusResponse,
-    PipelineJob,
 )
 from src.typing.pipeline import PipelineJobState
+from src.utils.pipeline import PipelineJob
 
 index_route = APIRouter(
     prefix="/index",

backend/src/api/query.py

Lines changed: 1 addition & 1 deletion

@@ -34,10 +34,10 @@
 from src.models import (
     GraphRequest,
     GraphResponse,
-    PipelineJob,
 )
 from src.typing.pipeline import PipelineJobState
 from src.utils import query as query_helper
+from src.utils.pipeline import PipelineJob
 
 query_route = APIRouter(
     prefix="/query",

backend/src/api/source.py

Lines changed: 30 additions & 13 deletions

@@ -48,11 +48,17 @@ async def get_report_info(index_name: str, report_id: str):
             f"abfs://{sanitized_index_name}/{COMMUNITY_REPORT_TABLE}",
             storage_options=get_pandas_storage_options(),
         )
-        # row = report_table[report_table["community"] == report_id]
-        # return ReportResponse(text=row["full_content"].values[0])
+        # check if report_id exists in the index
+        if not report_table["community"].isin([report_id]).any():
+            raise ValueError(f"Report '{report_id}' not found in index '{index_name}'.")
+        # check if multiple reports with the same id exist (should not happen)
+        if len(report_table.loc[report_table["community"] == report_id]) > 1:
+            raise ValueError(
+                f"Multiple reports with id '{report_id}' found in index '{index_name}'."
+            )
         report_content = report_table.loc[
             report_table["community"] == report_id, "full_content"
-        ][0]
+        ].to_numpy()[0]
         return ReportResponse(text=report_content)
     except Exception:
         logger = LoggerSingleton().get_instance()
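
A self-contained sketch of the new lookup logic in get_report_info above, using a toy DataFrame in place of the parquet table (column names are taken from the diff; the data and report id are made up):

import pandas as pd

# Toy stand-in for the community report table read from parquet in the diff above.
report_table = pd.DataFrame(
    {"community": ["1", "2"], "full_content": ["report one", "report two"]}
)
report_id = "2"

# Fail fast if the report id is missing (mirrors the new ValueError check).
if not report_table["community"].isin([report_id]).any():
    raise ValueError(f"Report '{report_id}' not found.")

# .loc[...] returns a Series whose index labels come from the original rows, so the
# old trailing [0] lookup can raise a KeyError when the matching row is not labeled 0;
# .to_numpy()[0] always takes the first match regardless of the index labels.
report_content = report_table.loc[
    report_table["community"] == report_id, "full_content"
].to_numpy()[0]
print(report_content)  # -> "report two"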
@@ -75,26 +81,37 @@ async def get_chunk_info(index_name: str, text_unit_id: str):
     validate_index_file_exist(sanitized_index_name, TEXT_UNITS_TABLE)
     validate_index_file_exist(sanitized_index_name, DOCUMENTS_TABLE)
     try:
-        text_unit_table = pd.read_parquet(
+        text_units = pd.read_parquet(
             f"abfs://{sanitized_index_name}/{TEXT_UNITS_TABLE}",
             storage_options=get_pandas_storage_options(),
         )
         docs = pd.read_parquet(
             f"abfs://{sanitized_index_name}/{DOCUMENTS_TABLE}",
             storage_options=get_pandas_storage_options(),
         )
-        links = {
-            el["id"]: el["title"]
-            for el in docs[["id", "title"]].to_dict(orient="records")
-        }
-        text_unit_table["source_doc"] = text_unit_table["document_ids"].apply(
-            lambda x: links[x[0]]
+        # rename columns for easy joining
+        docs = docs[["id", "title"]].rename(
+            columns={"id": "document_id", "title": "source_document"}
         )
-        row = text_unit_table.loc[
-            text_unit_table.chunk_id == text_unit_id, ["chunk", "source_doc"]
+        # explode the 'document_ids' column so the format matches with 'document_id'
+        text_units = text_units.explode("document_ids")
+
+        # verify that text_unit_id exists in the index
+        if not text_units["chunk_id"].isin([text_unit_id]).any():
+            raise ValueError(
+                f"Text unit '{text_unit_id}' not found in index '{index_name}'."
+            )
+
+        # combine tables to create a (chunk_id -> source_document) mapping
+        merged_table = text_units.merge(
+            docs, left_on="document_ids", right_on="document_id", how="left"
+        )
+        row = merged_table.loc[
+            merged_table["chunk_id"] == text_unit_id, ["chunk", "source_document"]
         ]
         return TextUnitResponse(
-            text=row["chunk"].values[0], source_document=row["source_doc"].values[0]
+            text=row["chunk"].to_numpy()[0],
+            source_document=row["source_document"].to_numpy()[0],
         )
     except Exception:
         logger = LoggerSingleton().get_instance()
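
The get_chunk_info rewrite above replaces the dict-lookup approach with an explode-and-merge join. A minimal self-contained sketch of that pattern on toy data (column names from the diff; the values are invented):

import pandas as pd

# Toy stand-ins for the text-units and documents tables read from parquet above.
text_units = pd.DataFrame(
    {
        "chunk_id": ["c1", "c2"],
        "chunk": ["first chunk", "second chunk"],
        "document_ids": [["d1"], ["d1", "d2"]],  # a chunk may reference several docs
    }
)
docs = pd.DataFrame({"id": ["d1", "d2"], "title": ["doc-one.txt", "doc-two.txt"]})

# Rename document columns so the join keys line up.
docs = docs[["id", "title"]].rename(
    columns={"id": "document_id", "title": "source_document"}
)

# One row per (chunk, document) pair after exploding the list column.
text_units = text_units.explode("document_ids")

# Left-join chunks onto their source documents.
merged = text_units.merge(
    docs, left_on="document_ids", right_on="document_id", how="left"
)

row = merged.loc[merged["chunk_id"] == "c2", ["chunk", "source_document"]]
print(row["chunk"].to_numpy()[0])            # -> "second chunk"
print(row["source_document"].to_numpy()[0])  # -> "doc-one.txt" (first matching document)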

backend/src/logger/pipeline_job_workflow_callbacks.py

Lines changed: 1 addition & 1 deletion

@@ -3,8 +3,8 @@
 
 from datashaper.workflow.workflow_callbacks import NoopWorkflowCallbacks
 
-from src.models import PipelineJob
 from src.typing.pipeline import PipelineJobState
+from src.utils.pipeline import PipelineJob
 
 
 class PipelineJobWorkflowCallbacks(NoopWorkflowCallbacks):
