
Commit 4de5d8a: use synthetic dataset

1 parent 7402d28

22 files changed: +447 -397 lines

backend/manage-indexing-jobs.py

Lines changed: 1 addition & 1 deletion

@@ -20,8 +20,8 @@
 from src.api.azure_clients import AzureClientManager
 from src.api.common import sanitize_name
 from src.logger.logger_singleton import LoggerSingleton
-from src.models import PipelineJob
 from src.typing.pipeline import PipelineJobState
+from src.utils.pipeline import PipelineJob
 
 
 def schedule_indexing_job(index_name: str):

backend/src/api/index.py

Lines changed: 1 addition & 1 deletion

@@ -44,9 +44,9 @@
     BaseResponse,
     IndexNameList,
     IndexStatusResponse,
-    PipelineJob,
 )
 from src.typing.pipeline import PipelineJobState
+from src.utils.pipeline import PipelineJob
 
 index_route = APIRouter(
     prefix="/index",

backend/src/api/query.py

Lines changed: 1 addition & 1 deletion

@@ -34,10 +34,10 @@
 from src.models import (
     GraphRequest,
     GraphResponse,
-    PipelineJob,
 )
 from src.typing.pipeline import PipelineJobState
 from src.utils import query as query_helper
+from src.utils.pipeline import PipelineJob
 
 query_route = APIRouter(
     prefix="/query",

backend/src/api/source.py

Lines changed: 30 additions & 13 deletions

@@ -48,11 +48,17 @@ async def get_report_info(index_name: str, report_id: str):
             f"abfs://{sanitized_index_name}/{COMMUNITY_REPORT_TABLE}",
             storage_options=get_pandas_storage_options(),
         )
-        # row = report_table[report_table["community"] == report_id]
-        # return ReportResponse(text=row["full_content"].values[0])
+        # check if report_id exists in the index
+        if not report_table["community"].isin([report_id]).any():
+            raise ValueError(f"Report '{report_id}' not found in index '{index_name}'.")
+        # check if multiple reports with the same id exist (should not happen)
+        if len(report_table.loc[report_table["community"] == report_id]) > 1:
+            raise ValueError(
+                f"Multiple reports with id '{report_id}' found in index '{index_name}'."
+            )
         report_content = report_table.loc[
             report_table["community"] == report_id, "full_content"
-        ][0]
+        ].to_numpy()[0]
         return ReportResponse(text=report_content)
     except Exception:
         logger = LoggerSingleton().get_instance()
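
A self-contained sketch of the new lookup logic in get_report_info above, using a toy DataFrame in place of the parquet table (column names are taken from the diff; the data and report id are made up):

import pandas as pd

# Toy stand-in for the community report table read from parquet in the diff above.
report_table = pd.DataFrame(
    {"community": ["1", "2"], "full_content": ["report one", "report two"]}
)
report_id = "2"

# Fail fast if the report id is missing (mirrors the new ValueError check).
if not report_table["community"].isin([report_id]).any():
    raise ValueError(f"Report '{report_id}' not found.")

# .loc[...] returns a Series whose index labels come from the original rows, so the
# old trailing [0] lookup can raise a KeyError when the matching row is not labeled 0;
# .to_numpy()[0] always takes the first match regardless of the index labels.
report_content = report_table.loc[
    report_table["community"] == report_id, "full_content"
].to_numpy()[0]
print(report_content)  # -> "report two"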
@@ -75,26 +81,37 @@ async def get_chunk_info(index_name: str, text_unit_id: str):
     validate_index_file_exist(sanitized_index_name, TEXT_UNITS_TABLE)
     validate_index_file_exist(sanitized_index_name, DOCUMENTS_TABLE)
     try:
-        text_unit_table = pd.read_parquet(
+        text_units = pd.read_parquet(
             f"abfs://{sanitized_index_name}/{TEXT_UNITS_TABLE}",
             storage_options=get_pandas_storage_options(),
         )
         docs = pd.read_parquet(
             f"abfs://{sanitized_index_name}/{DOCUMENTS_TABLE}",
             storage_options=get_pandas_storage_options(),
         )
-        links = {
-            el["id"]: el["title"]
-            for el in docs[["id", "title"]].to_dict(orient="records")
-        }
-        text_unit_table["source_doc"] = text_unit_table["document_ids"].apply(
-            lambda x: links[x[0]]
+        # rename columns for easy joining
+        docs = docs[["id", "title"]].rename(
+            columns={"id": "document_id", "title": "source_document"}
         )
-        row = text_unit_table.loc[
-            text_unit_table.chunk_id == text_unit_id, ["chunk", "source_doc"]
+        # explode the 'document_ids' column so the format matches with 'document_id'
+        text_units = text_units.explode("document_ids")
+
+        # verify that text_unit_id exists in the index
+        if not text_units["chunk_id"].isin([text_unit_id]).any():
+            raise ValueError(
+                f"Text unit '{text_unit_id}' not found in index '{index_name}'."
+            )
+
+        # combine tables to create a (chunk_id -> source_document) mapping
+        merged_table = text_units.merge(
+            docs, left_on="document_ids", right_on="document_id", how="left"
+        )
+        row = merged_table.loc[
+            merged_table["chunk_id"] == text_unit_id, ["chunk", "source_document"]
         ]
         return TextUnitResponse(
-            text=row["chunk"].values[0], source_document=row["source_doc"].values[0]
+            text=row["chunk"].to_numpy()[0],
+            source_document=row["source_document"].to_numpy()[0],
         )
     except Exception:
         logger = LoggerSingleton().get_instance()
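
The get_chunk_info rewrite above replaces the dict-lookup approach with an explode-and-merge join. A minimal self-contained sketch of that pattern on toy data (column names from the diff; the values are invented):

import pandas as pd

# Toy stand-ins for the text-units and documents tables read from parquet above.
text_units = pd.DataFrame(
    {
        "chunk_id": ["c1", "c2"],
        "chunk": ["first chunk", "second chunk"],
        "document_ids": [["d1"], ["d1", "d2"]],  # a chunk may reference several docs
    }
)
docs = pd.DataFrame({"id": ["d1", "d2"], "title": ["doc-one.txt", "doc-two.txt"]})

# Rename document columns so the join keys line up.
docs = docs[["id", "title"]].rename(
    columns={"id": "document_id", "title": "source_document"}
)

# One row per (chunk, document) pair after exploding the list column.
text_units = text_units.explode("document_ids")

# Left-join chunks onto their source documents.
merged = text_units.merge(
    docs, left_on="document_ids", right_on="document_id", how="left"
)

row = merged.loc[merged["chunk_id"] == "c2", ["chunk", "source_document"]]
print(row["chunk"].to_numpy()[0])            # -> "second chunk"
print(row["source_document"].to_numpy()[0])  # -> "doc-one.txt" (first matching document)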

backend/src/logger/pipeline_job_workflow_callbacks.py

Lines changed: 1 addition & 1 deletion

@@ -3,8 +3,8 @@
 
 from datashaper.workflow.workflow_callbacks import NoopWorkflowCallbacks
 
-from src.models import PipelineJob
 from src.typing.pipeline import PipelineJobState
+from src.utils.pipeline import PipelineJob
 
 
 class PipelineJobWorkflowCallbacks(NoopWorkflowCallbacks):
