This repository was archived by the owner on May 27, 2025. It is now read-only.

Commit 92af333

working version of indexing endpoint
1 parent 404eed1 commit 92af333

4 files changed, +134 -91 lines changed


backend/src/api/graph.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ async def get_graphml_file(index_name: str):
     # validate index_name and graphml file existence
     azure_client_manager = AzureClientManager()
     sanitized_index_name = sanitize_name(index_name)
-    graphml_filename = "summarized_graph.graphml"
+    graphml_filename = "graph.graphml"
     blob_filepath = f"output/{graphml_filename}"  # expected file location of the graph based on the workflow
     validate_index_file_exist(sanitized_index_name, blob_filepath)
     try:

backend/src/api/index.py

Lines changed: 37 additions & 40 deletions
@@ -1,17 +1,15 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
-import asyncio
 import inspect
 import os
 import traceback
 from time import time
-from typing import cast
 
+import graphrag.api as api
 import yaml
 from azure.identity import DefaultAzureCredential
 from azure.search.documents.indexes import SearchIndexClient
-from datashaper import WorkflowCallbacksManager
 from fastapi import (
     APIRouter,
     HTTPException,
@@ -193,21 +191,16 @@ async def _start_indexing_pipeline(index_name: str):
         f"{sanitized_index_name}_description_embedding"
     )
 
-    # set prompts for entity extraction, community report, and summarize descriptions.
+    # set prompt for entity extraction
     if pipeline_job.entity_extraction_prompt:
         fname = "entity-extraction-prompt.txt"
         with open(fname, "w") as outfile:
             outfile.write(pipeline_job.entity_extraction_prompt)
         data["entity_extraction"]["prompt"] = fname
     else:
         data.pop("entity_extraction")
-    if pipeline_job.community_report_prompt:
-        fname = "community-report-prompt.txt"
-        with open(fname, "w") as outfile:
-            outfile.write(pipeline_job.community_report_prompt)
-        data["community_reports"]["prompt"] = fname
-    else:
-        data.pop("community_reports")
+
+    # set prompt for summarize descriptions
     if pipeline_job.summarize_descriptions_prompt:
         fname = "summarize-descriptions-prompt.txt"
         with open(fname, "w") as outfile:
@@ -216,15 +209,24 @@ async def _start_indexing_pipeline(index_name: str):
     else:
         data.pop("summarize_descriptions")
 
-    # generate the default pipeline and override with custom settings
+    # set prompt for community report
+    if pipeline_job.community_report_prompt:
+        fname = "community-report-prompt.txt"
+        with open(fname, "w") as outfile:
+            outfile.write(pipeline_job.community_report_prompt)
+        data["community_reports"]["prompt"] = fname
+    else:
+        data.pop("community_reports")
+
+    # generate a default GraphRagConfig and override with custom settings
     parameters = create_graphrag_config(data, ".")
-    pipeline_config = create_pipeline_config(parameters, True)
 
     # reset pipeline job details
     pipeline_job.status = PipelineJobState.RUNNING
     pipeline_job.all_workflows = []
     pipeline_job.completed_workflows = []
     pipeline_job.failed_workflows = []
+    pipeline_config = create_pipeline_config(parameters)
     for workflow in pipeline_config.workflows:
         pipeline_job.all_workflows.append(workflow.name)
 
@@ -243,49 +245,44 @@ async def _start_indexing_pipeline(index_name: str):
         reporters=loggers,
     )
 
-    # add pipeline job callback to the callback manager
-    cast(WorkflowCallbacksManager, workflow_callbacks).register(
-        PipelineJobWorkflowCallbacks(pipeline_job)
-    )
+    # add pipeline job callback to monitor job progress
+    pipeline_job_callback = PipelineJobWorkflowCallbacks(pipeline_job)
 
     # run the pipeline
    try:
-        # TODO refactor to use the new replacement for run_pipeline_with_config
-        from graphrag.index.run import run_pipeline_with_config
-        async for workflow_result in run_pipeline_with_config(
-            config_or_path=pipeline_config,
-            callbacks=workflow_callbacks,
-            progress_reporter=None,
-        ):
-            await asyncio.sleep(0)
-            if len(workflow_result.errors or []) > 0:
-                # if the workflow failed, record the failure
-                pipeline_job.failed_workflows.append(workflow_result.workflow)
-                pipeline_job.update_db()
-                # TODO: exit early if a workflow fails and add more detailed error logging
-
+        await api.build_index(
+            config=parameters,
+            callbacks=[workflow_callbacks, pipeline_job_callback],
+        )
         # if job is done, check if any workflow steps failed
         if len(pipeline_job.failed_workflows) > 0:
             pipeline_job.status = PipelineJobState.FAILED
+            workflow_callbacks.on_log(
+                message=f"Indexing pipeline encountered error for index'{index_name}'.",
+                details={
+                    "index": index_name,
+                    "storage_name": storage_name,
+                    "status_message": "indexing pipeline encountered error",
+                },
+            )
         else:
             # record the workflow completion
             pipeline_job.status = PipelineJobState.COMPLETE
             pipeline_job.percent_complete = 100
+            workflow_callbacks.on_log(
+                message=f"Indexing pipeline complete for index'{index_name}'.",
+                details={
+                    "index": index_name,
+                    "storage_name": storage_name,
+                    "status_message": "indexing pipeline complete",
+                },
+            )
 
         pipeline_job.progress = (
             f"{len(pipeline_job.completed_workflows)} out of "
             f"{len(pipeline_job.all_workflows)} workflows completed successfully."
         )
 
-        workflow_callbacks.on_log(
-            message=f"Indexing pipeline complete for index'{index_name}'.",
-            details={
-                "index": index_name,
-                "storage_name": storage_name,
-                "status_message": "indexing pipeline complete",
-            },
-        )
-
         del workflow_callbacks  # garbage collect
         if pipeline_job.status == PipelineJobState.FAILED:
             exit(1)  # signal to AKS that indexing job failed
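
Note on the change above: the indexing job no longer drives graphrag's internal run_pipeline_with_config generator; it builds a GraphRagConfig and hands it, together with a list of workflow callbacks, to the public graphrag.api.build_index entry point shown in the diff. A minimal sketch of that call pattern outside the FastAPI endpoint (the create_graphrag_config import path and the settings dict are assumptions, not part of this commit):

# sketch only: the call pattern this commit adopts, not code from the repository
import graphrag.api as api
from graphrag.config import create_graphrag_config  # import path assumed


async def run_index(settings: dict, callbacks: list) -> None:
    # build a GraphRagConfig from the settings dict (same call as in the diff above)
    parameters = create_graphrag_config(settings, ".")
    # run the full indexing pipeline; callbacks receive per-workflow progress events
    await api.build_index(config=parameters, callbacks=callbacks)


# hypothetical usage: asyncio.run(run_index(yaml_settings, [console_logger]))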

backend/src/api/pipeline-settings.yaml

Lines changed: 95 additions & 49 deletions
@@ -3,31 +3,9 @@
 
 # this yaml file serves as a configuration template for the graphrag indexing jobs
 # some values are hardcoded while others denoted by PLACEHOLDER will be dynamically set
-input:
-  type: blob
-  file_type: text
-  file_pattern: .*\.txt$
-  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
-  container_name: PLACEHOLDER
-  base_dir: .
 
-storage:
-  type: blob
-  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
-  container_name: PLACEHOLDER
-  base_dir: output
-
-reporting:
-  type: blob
-  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
-  container_name: PLACEHOLDER
-  base_dir: logs
-
-cache:
-  type: blob
-  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
-  container_name: PLACEHOLDER
-  base_dir: cache
+###################### LLM settings ######################
+encoding_model: cl100k_base # this needs to be matched to your model!
 
 llm:
   type: azure_openai_chat
@@ -37,55 +15,123 @@ llm:
   deployment_name: $GRAPHRAG_LLM_DEPLOYMENT_NAME
   cognitive_services_endpoint: $GRAPHRAG_COGNITIVE_SERVICES_ENDPOINT
   model_supports_json: True
-  tokens_per_minute: 80000
+  tokens_per_minute: 80_000
   requests_per_minute: 480
-  thread_count: 50
   concurrent_requests: 25
+  max_retries: 25
+  max_retry_wait: 60.0
+  sleep_on_rate_limit_recommendation: True
 
 parallelization:
-  stagger: 0.25
   num_threads: 10
+  stagger: 0.25
 
-async_mode: threaded
+async_mode: threaded # or asyncio
 
 embeddings:
-  async_mode: threaded
+  vector_store:
+    type: azure_ai_search
+    collection_name: PLACEHOLDER
+    title_column: name
+    overwrite: True
+    url: $AI_SEARCH_URL
+    audience: $AI_SEARCH_AUDIENCE
   llm:
     type: azure_openai_embedding
     api_base: $GRAPHRAG_API_BASE
     api_version: $GRAPHRAG_API_VERSION
-    batch_size: 16
+    batch_size: 10
     model: $GRAPHRAG_EMBEDDING_MODEL
     deployment_name: $GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME
     cognitive_services_endpoint: $GRAPHRAG_COGNITIVE_SERVICES_ENDPOINT
-    tokens_per_minute: 350000
-    concurrent_requests: 25
+    tokens_per_minute: 350_000
     requests_per_minute: 2100
-    thread_count: 50
-    max_retries: 50
-  parallelization:
-    stagger: 0.25
-    num_threads: 10
-  vector_store:
-    type: azure_ai_search
-    collection_name: PLACEHOLDER
-    title_column: name
-    overwrite: True
-    url: $AI_SEARCH_URL
-    audience: $AI_SEARCH_AUDIENCE
 
-entity_extraction:
-  prompt: PLACEHOLDER
+###################### Input settings ######################
+input:
+  type: blob
+  file_type: text
+  base_dir: .
+  file_encoding: utf-8
+  file_pattern: .*\.txt$
+  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
+  container_name: PLACEHOLDER
 
-community_reports:
+chunks:
+  size: 1_200
+  overlap: 100
+  group_by_columns: [id]
+
+###################### Storage settings ######################
+cache:
+  type: blob
+  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
+  container_name: PLACEHOLDER
+  base_dir: cache
+
+reporting:
+  type: blob
+  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
+  container_name: PLACEHOLDER
+  base_dir: logs
+
+storage:
+  type: blob
+  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
+  container_name: PLACEHOLDER
+  base_dir: output
+
+###################### Workflow settings ######################
+skip_workflows: []
+
+entity_extraction:
   prompt: PLACEHOLDER
+  entity_types: [organization, person, geo, event]
+  max_gleanings: 1
 
 summarize_descriptions:
   prompt: PLACEHOLDER
+  max_length: 500
 
-# claim extraction is disabled by default in the graphrag library so we enable it for the solution accelerator
 claim_extraction:
-  enabled: True
+  enabled: false
+  prompt: "prompts/claim_extraction.txt"
+  description: "Any claims or facts that could be relevant to information discovery."
+  max_gleanings: 1
+
+community_reports:
+  prompt: PLACEHOLDER
+  max_length: 2_000
+  max_input_length: 8_000
+
+cluster_graph:
+  max_cluster_size: 10
+
+embed_graph:
+  enabled: false
+
+umap:
+  enabled: false
 
 snapshots:
   graphml: True
+  embeddings: false
+  transient: false
+
+###################### Query settings ######################
+## The prompt locations are required here, but each search method has a number of optional knobs that can be tuned.
+## See the config docs: https://microsoft.github.io/graphrag/config/yaml/#query
+local_search:
+  prompt: PLACEHOLDER
+
+global_search:
+  map_prompt: PLACEHOLDER
+  reduce_prompt: PLACEHOLDER
+  knowledge_prompt: PLACEHOLDER
+
+drift_search:
+  prompt: PLACEHOLDER
+  reduce_prompt: PLACEHOLDER
+
+basic_search:
+  prompt: PLACEHOLDER
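
For orientation, the endpoint above loads this template with yaml, swaps in values for the PLACEHOLDER entries, and passes the resulting dict to create_graphrag_config. A rough sketch of that flow under stated assumptions (the placeholder-substitution helper below is hypothetical; only the yaml load and the create_graphrag_config(data, ".") call appear in the actual diff):

# sketch only (not from the repository): turning the template above into settings for api.build_index
import yaml

from graphrag.config import create_graphrag_config  # import path assumed


def load_pipeline_settings(path: str, container_name: str) -> dict:
    with open(path) as f:
        data = yaml.safe_load(f)
    # hypothetical substitution of the PLACEHOLDER container names used by the template
    for section in ("input", "cache", "reporting", "storage"):
        data[section]["container_name"] = container_name
    return data


# settings = load_pipeline_settings("pipeline-settings.yaml", "my-index-container")
# parameters = create_graphrag_config(settings, ".")  # root dir "." as in the diff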

backend/src/logger/load_logger.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ def load_pipeline_logger(
 
     Loggers may be configured as generic loggers or associated with a specified indexing job.
     """
-    # always register the console logger if no loggers are specified
+    # always register the console logger as a fallback option
     if Reporters.CONSOLE not in reporters:
         reporters.append(Reporters.CONSOLE)
 
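
The reworded comment describes a fallback rather than a conditional feature: the console reporter is appended whenever the caller did not request it, so every job always has at least one logger. A tiny self-contained sketch of that pattern (the Reporters enum below is a stand-in, not the project's class):

# sketch only: console-logger fallback pattern, with a stand-in Reporters enum
from enum import Enum


class Reporters(Enum):
    BLOB = "blob"
    CONSOLE = "console"


def with_console_fallback(reporters: list[Reporters]) -> list[Reporters]:
    # always include the console reporter so a job is never silent
    if Reporters.CONSOLE not in reporters:
        reporters.append(Reporters.CONSOLE)
    return reporters


# with_console_fallback([Reporters.BLOB]) -> [Reporters.BLOB, Reporters.CONSOLE]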

0 commit comments
