This repository was archived by the owner on May 27, 2025. It is now read-only.

Commit 92af333

working version of indexing endpoint
1 parent 404eed1 commit 92af333

4 files changed, +134 -91 lines changed


backend/src/api/graph.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ async def get_graphml_file(index_name: str):
     # validate index_name and graphml file existence
     azure_client_manager = AzureClientManager()
     sanitized_index_name = sanitize_name(index_name)
-    graphml_filename = "summarized_graph.graphml"
+    graphml_filename = "graph.graphml"
     blob_filepath = f"output/{graphml_filename}"  # expected file location of the graph based on the workflow
     validate_index_file_exist(sanitized_index_name, blob_filepath)
     try:

backend/src/api/index.py

Lines changed: 37 additions & 40 deletions
@@ -1,17 +1,15 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
-import asyncio
 import inspect
 import os
 import traceback
 from time import time
-from typing import cast
 
+import graphrag.api as api
 import yaml
 from azure.identity import DefaultAzureCredential
 from azure.search.documents.indexes import SearchIndexClient
-from datashaper import WorkflowCallbacksManager
 from fastapi import (
     APIRouter,
     HTTPException,
@@ -193,21 +191,16 @@ async def _start_indexing_pipeline(index_name: str):
         f"{sanitized_index_name}_description_embedding"
     )
 
-    # set prompts for entity extraction, community report, and summarize descriptions.
+    # set prompt for entity extraction
     if pipeline_job.entity_extraction_prompt:
         fname = "entity-extraction-prompt.txt"
         with open(fname, "w") as outfile:
             outfile.write(pipeline_job.entity_extraction_prompt)
         data["entity_extraction"]["prompt"] = fname
     else:
         data.pop("entity_extraction")
-    if pipeline_job.community_report_prompt:
-        fname = "community-report-prompt.txt"
-        with open(fname, "w") as outfile:
-            outfile.write(pipeline_job.community_report_prompt)
-        data["community_reports"]["prompt"] = fname
-    else:
-        data.pop("community_reports")
+
+    # set prompt for summarize descriptions
     if pipeline_job.summarize_descriptions_prompt:
         fname = "summarize-descriptions-prompt.txt"
         with open(fname, "w") as outfile:
@@ -216,15 +209,24 @@ async def _start_indexing_pipeline(index_name: str):
     else:
         data.pop("summarize_descriptions")
 
-    # generate the default pipeline and override with custom settings
+    # set prompt for community report
+    if pipeline_job.community_report_prompt:
+        fname = "community-report-prompt.txt"
+        with open(fname, "w") as outfile:
+            outfile.write(pipeline_job.community_report_prompt)
+        data["community_reports"]["prompt"] = fname
+    else:
+        data.pop("community_reports")
+
+    # generate a default GraphRagConfig and override with custom settings
     parameters = create_graphrag_config(data, ".")
-    pipeline_config = create_pipeline_config(parameters, True)
 
     # reset pipeline job details
     pipeline_job.status = PipelineJobState.RUNNING
     pipeline_job.all_workflows = []
     pipeline_job.completed_workflows = []
     pipeline_job.failed_workflows = []
+    pipeline_config = create_pipeline_config(parameters)
     for workflow in pipeline_config.workflows:
         pipeline_job.all_workflows.append(workflow.name)
 
@@ -243,49 +245,44 @@ async def _start_indexing_pipeline(index_name: str):
         reporters=loggers,
     )
 
-    # add pipeline job callback to the callback manager
-    cast(WorkflowCallbacksManager, workflow_callbacks).register(
-        PipelineJobWorkflowCallbacks(pipeline_job)
-    )
+    # add pipeline job callback to monitor job progress
+    pipeline_job_callback = PipelineJobWorkflowCallbacks(pipeline_job)
 
     # run the pipeline
    try:
-        # TODO refactor to use the new replacement for run_pipeline_with_config
-        from graphrag.index.run import run_pipeline_with_config
-        async for workflow_result in run_pipeline_with_config(
-            config_or_path=pipeline_config,
-            callbacks=workflow_callbacks,
-            progress_reporter=None,
-        ):
-            await asyncio.sleep(0)
-            if len(workflow_result.errors or []) > 0:
-                # if the workflow failed, record the failure
-                pipeline_job.failed_workflows.append(workflow_result.workflow)
-                pipeline_job.update_db()
-                # TODO: exit early if a workflow fails and add more detailed error logging
-
+        await api.build_index(
+            config=parameters,
+            callbacks=[workflow_callbacks, pipeline_job_callback],
+        )
         # if job is done, check if any workflow steps failed
         if len(pipeline_job.failed_workflows) > 0:
             pipeline_job.status = PipelineJobState.FAILED
+            workflow_callbacks.on_log(
+                message=f"Indexing pipeline encountered error for index'{index_name}'.",
+                details={
+                    "index": index_name,
+                    "storage_name": storage_name,
+                    "status_message": "indexing pipeline encountered error",
+                },
+            )
         else:
             # record the workflow completion
             pipeline_job.status = PipelineJobState.COMPLETE
             pipeline_job.percent_complete = 100
+            workflow_callbacks.on_log(
+                message=f"Indexing pipeline complete for index'{index_name}'.",
+                details={
+                    "index": index_name,
+                    "storage_name": storage_name,
+                    "status_message": "indexing pipeline complete",
+                },
+            )
 
         pipeline_job.progress = (
             f"{len(pipeline_job.completed_workflows)} out of "
             f"{len(pipeline_job.all_workflows)} workflows completed successfully."
         )
 
-        workflow_callbacks.on_log(
-            message=f"Indexing pipeline complete for index'{index_name}'.",
-            details={
-                "index": index_name,
-                "storage_name": storage_name,
-                "status_message": "indexing pipeline complete",
-            },
-        )
-
         del workflow_callbacks  # garbage collect
         if pipeline_job.status == PipelineJobState.FAILED:
             exit(1)  # signal to AKS that indexing job failed
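
Note on the change above: the indexing job no longer drives graphrag's internal run_pipeline_with_config generator; it builds a GraphRagConfig and hands it, together with a list of workflow callbacks, to the public graphrag.api.build_index entry point shown in the diff. A minimal sketch of that call pattern outside the FastAPI endpoint (the create_graphrag_config import path and the settings dict are assumptions, not part of this commit):

# sketch only: the call pattern this commit adopts, not code from the repository
import graphrag.api as api
from graphrag.config import create_graphrag_config  # import path assumed


async def run_index(settings: dict, callbacks: list) -> None:
    # build a GraphRagConfig from the settings dict (same call as in the diff above)
    parameters = create_graphrag_config(settings, ".")
    # run the full indexing pipeline; callbacks receive per-workflow progress events
    await api.build_index(config=parameters, callbacks=callbacks)


# hypothetical usage: asyncio.run(run_index(yaml_settings, [console_logger]))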

backend/src/api/pipeline-settings.yaml

Lines changed: 95 additions & 49 deletions
@@ -3,31 +3,9 @@
 
 # this yaml file serves as a configuration template for the graphrag indexing jobs
 # some values are hardcoded while others denoted by PLACEHOLDER will be dynamically set
-input:
-  type: blob
-  file_type: text
-  file_pattern: .*\.txt$
-  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
-  container_name: PLACEHOLDER
-  base_dir: .
 
-storage:
-  type: blob
-  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
-  container_name: PLACEHOLDER
-  base_dir: output
-
-reporting:
-  type: blob
-  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
-  container_name: PLACEHOLDER
-  base_dir: logs
-
-cache:
-  type: blob
-  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
-  container_name: PLACEHOLDER
-  base_dir: cache
+###################### LLM settings ######################
+encoding_model: cl100k_base # this needs to be matched to your model!
 
 llm:
   type: azure_openai_chat
@@ -37,55 +15,123 @@ llm:
   deployment_name: $GRAPHRAG_LLM_DEPLOYMENT_NAME
   cognitive_services_endpoint: $GRAPHRAG_COGNITIVE_SERVICES_ENDPOINT
   model_supports_json: True
-  tokens_per_minute: 80000
+  tokens_per_minute: 80_000
   requests_per_minute: 480
-  thread_count: 50
   concurrent_requests: 25
+  max_retries: 25
+  max_retry_wait: 60.0
+  sleep_on_rate_limit_recommendation: True
 
 parallelization:
-  stagger: 0.25
   num_threads: 10
+  stagger: 0.25
 
-async_mode: threaded
+async_mode: threaded # or asyncio
 
 embeddings:
-  async_mode: threaded
+  vector_store:
+    type: azure_ai_search
+    collection_name: PLACEHOLDER
+    title_column: name
+    overwrite: True
+    url: $AI_SEARCH_URL
+    audience: $AI_SEARCH_AUDIENCE
   llm:
     type: azure_openai_embedding
     api_base: $GRAPHRAG_API_BASE
     api_version: $GRAPHRAG_API_VERSION
-    batch_size: 16
+    batch_size: 10
     model: $GRAPHRAG_EMBEDDING_MODEL
     deployment_name: $GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME
     cognitive_services_endpoint: $GRAPHRAG_COGNITIVE_SERVICES_ENDPOINT
-    tokens_per_minute: 350000
-    concurrent_requests: 25
+    tokens_per_minute: 350_000
     requests_per_minute: 2100
-    thread_count: 50
-    max_retries: 50
-  parallelization:
-    stagger: 0.25
-    num_threads: 10
-  vector_store:
-    type: azure_ai_search
-    collection_name: PLACEHOLDER
-    title_column: name
-    overwrite: True
-    url: $AI_SEARCH_URL
-    audience: $AI_SEARCH_AUDIENCE
 
-entity_extraction:
-  prompt: PLACEHOLDER
+###################### Input settings ######################
+input:
+  type: blob
+  file_type: text
+  base_dir: .
+  file_encoding: utf-8
+  file_pattern: .*\.txt$
+  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
+  container_name: PLACEHOLDER
 
-community_reports:
+chunks:
+  size: 1_200
+  overlap: 100
+  group_by_columns: [id]
+
+###################### Storage settings ######################
+cache:
+  type: blob
+  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
+  container_name: PLACEHOLDER
+  base_dir: cache
+
+reporting:
+  type: blob
+  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
+  container_name: PLACEHOLDER
+  base_dir: logs
+
+storage:
+  type: blob
+  storage_account_blob_url: $STORAGE_ACCOUNT_BLOB_URL
+  container_name: PLACEHOLDER
+  base_dir: output
+
+###################### Workflow settings ######################
+skip_workflows: []
+
+entity_extraction:
   prompt: PLACEHOLDER
+  entity_types: [organization, person, geo, event]
+  max_gleanings: 1
 
 summarize_descriptions:
   prompt: PLACEHOLDER
+  max_length: 500
 
-# claim extraction is disabled by default in the graphrag library so we enable it for the solution accelerator
 claim_extraction:
-  enabled: True
+  enabled: false
+  prompt: "prompts/claim_extraction.txt"
+  description: "Any claims or facts that could be relevant to information discovery."
+  max_gleanings: 1
+
+community_reports:
+  prompt: PLACEHOLDER
+  max_length: 2_000
+  max_input_length: 8_000
+
+cluster_graph:
+  max_cluster_size: 10
+
+embed_graph:
+  enabled: false
+
+umap:
+  enabled: false
 
 snapshots:
   graphml: True
+  embeddings: false
+  transient: false
+
+###################### Query settings ######################
+## The prompt locations are required here, but each search method has a number of optional knobs that can be tuned.
+## See the config docs: https://microsoft.github.io/graphrag/config/yaml/#query
+local_search:
+  prompt: PLACEHOLDER
+
+global_search:
+  map_prompt: PLACEHOLDER
+  reduce_prompt: PLACEHOLDER
+  knowledge_prompt: PLACEHOLDER
+
+drift_search:
+  prompt: PLACEHOLDER
+  reduce_prompt: PLACEHOLDER
+
+basic_search:
+  prompt: PLACEHOLDER
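
For orientation, the endpoint above loads this template with yaml, swaps in values for the PLACEHOLDER entries, and passes the resulting dict to create_graphrag_config. A rough sketch of that flow under stated assumptions (the placeholder-substitution helper below is hypothetical; only the yaml load and the create_graphrag_config(data, ".") call appear in the actual diff):

# sketch only (not from the repository): turning the template above into settings for api.build_index
import yaml

from graphrag.config import create_graphrag_config  # import path assumed


def load_pipeline_settings(path: str, container_name: str) -> dict:
    with open(path) as f:
        data = yaml.safe_load(f)
    # hypothetical substitution of the PLACEHOLDER container names used by the template
    for section in ("input", "cache", "reporting", "storage"):
        data[section]["container_name"] = container_name
    return data


# settings = load_pipeline_settings("pipeline-settings.yaml", "my-index-container")
# parameters = create_graphrag_config(settings, ".")  # root dir "." as in the diff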

backend/src/logger/load_logger.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ def load_pipeline_logger(
 
     Loggers may be configured as generic loggers or associated with a specified indexing job.
     """
-    # always register the console logger if no loggers are specified
+    # always register the console logger as a fallback option
     if Reporters.CONSOLE not in reporters:
         reporters.append(Reporters.CONSOLE)
 
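
The reworded comment describes a fallback rather than a conditional feature: the console reporter is appended whenever the caller did not request it, so every job always has at least one logger. A tiny self-contained sketch of that pattern (the Reporters enum below is a stand-in, not the project's class):

# sketch only: console-logger fallback pattern, with a stand-in Reporters enum
from enum import Enum


class Reporters(Enum):
    BLOB = "blob"
    CONSOLE = "console"


def with_console_fallback(reporters: list[Reporters]) -> list[Reporters]:
    # always include the console reporter so a job is never silent
    if Reporters.CONSOLE not in reporters:
        reporters.append(Reporters.CONSOLE)
    return reporters


# with_console_fallback([Reporters.BLOB]) -> [Reporters.BLOB, Reporters.CONSOLE]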

0 commit comments
