This repository was archived by the owner on May 27, 2025. It is now read-only.

Commit a77806f
update successful response status code
1 parent 591950a

File tree: 12 files changed (+812, -444 lines)

backend/graphrag_app/api/data.py

Lines changed: 20 additions & 19 deletions
```diff
@@ -14,6 +14,7 @@
     Depends,
     HTTPException,
     UploadFile,
+    status,
 )
 from markitdown import MarkItDown, StreamInfo

@@ -98,7 +99,7 @@ async def upload_file_async(
             stream_info=stream_info,
         )

-        # clean the output and upload to blob storage
+        # remove illegal unicode characters and upload to blob storage
         cleaned_result = clean_output(result.text_content)
         await converted_blob_client.upload_blob(
             cleaned_result, overwrite=overwrite
@@ -107,11 +108,12 @@ async def upload_file_async(
         # update the file cache
         await update_cache(filename, file_stream, container_client)
     except Exception:
-        pass
+        # if any exception occurs, return the filename to indicate conversion/upload failures
+        return upload_file.filename


 def clean_output(val: str, replacement: str = ""):
-    """Remove illegal XML characters from a string."""
+    """Removes unicode characters that are invalid XML characters (not valid for graphml files at least)."""
     # fmt: off
     _illegal_xml_chars_RE = re.compile(
         "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]"
```
```diff
@@ -124,7 +126,7 @@ def clean_output(val: str, replacement: str = ""):
     "",
     summary="Upload data to a data storage container",
     response_model=BaseResponse,
-    responses={200: {"model": BaseResponse}},
+    responses={status.HTTP_201_CREATED: {"model": BaseResponse}},
 )
 async def upload_files(
     files: List[UploadFile],
@@ -133,18 +135,8 @@ async def upload_files(
     overwrite: bool = True,
 ):
     """
-    Create an Azure Storage container and upload files to it.
-
-    Args:
-        files (List[UploadFile]): A list of files to be uploaded.
-        storage_name (str): The name of the Azure Blob Storage container to which files will be uploaded.
-        overwrite (bool): Whether to overwrite existing files with the same name. Defaults to True. If False, files that already exist will be skipped.
-
-    Returns:
-        BaseResponse: An instance of the BaseResponse model with a status message indicating the result of the upload.
-
-    Raises:
-        HTTPException: If the container name is invalid or if any error occurs during the upload process.
+    Create an Azure Storage container (if needed) and upload files. Multiple file types are supported, including pdf, powerpoint, word, excel, html, csv, json, xml, etc.
+    The complete set of supported file types can be found in the MarkItDown (https://github.com/microsoft/markitdown) library.
     """
     try:
         # create the initial cache if it doesn't exist
@@ -153,16 +145,19 @@ async def upload_files(
         )
         await create_cache(blob_container_client)

-        # upload files in batches of 1000 to avoid exceeding Azure Storage API limits
-        batch_size = 1000
+        # upload files in batches of 100 to avoid exceeding Azure Storage API limits
+        processing_errors = []
+        batch_size = 100
         num_batches = ceil(len(files) / batch_size)
         for i in range(num_batches):
             batch_files = files[i * batch_size : (i + 1) * batch_size]
             tasks = [
                 upload_file_async(file, blob_container_client, overwrite)
                 for file in batch_files
             ]
-            await asyncio.gather(*tasks)
+            results = await asyncio.gather(*tasks)
+            results = [r for r in results if r is not None]
+            processing_errors.extend(results)

         # update container-store entry in cosmosDB once upload process is successful
         cosmos_container_store_client = get_cosmos_container_store_client()
@@ -171,6 +166,12 @@ async def upload_files(
             "human_readable_name": container_name,
             "type": "data",
         })
+
+        if len(processing_errors) > 0:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Error uploading files: {processing_errors}.",
+            )
         return BaseResponse(status="File upload successful.")
     except Exception as e:
         logger = load_pipeline_logger()
```
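The net effect of the data.py changes: upload_file_async now signals failure by returning the filename instead of silently passing, and upload_files collects those names per batch and turns them into a 400. Returning a sentinel instead of raising also means one bad file cannot abort the whole asyncio.gather. A self-contained sketch of that error-collection pattern, with the Azure blob details stubbed out and the failure condition purely hypothetical:

```python
import asyncio
from math import ceil

async def upload_file_async(filename: str) -> str | None:
    # stand-in for the real converter/uploader: per the diff, it returns
    # the filename on failure and (implicitly) None on success
    try:
        if filename.endswith(".bad"):  # hypothetical failure condition
            raise ValueError("conversion failed")
        return None
    except Exception:
        return filename

async def upload_files(files: list[str], batch_size: int = 100) -> list[str]:
    processing_errors: list[str] = []
    num_batches = ceil(len(files) / batch_size)
    for i in range(num_batches):
        batch_files = files[i * batch_size : (i + 1) * batch_size]
        tasks = [upload_file_async(f) for f in batch_files]
        results = await asyncio.gather(*tasks)
        processing_errors.extend(r for r in results if r is not None)
    return processing_errors

print(asyncio.run(upload_files(["a.pdf", "b.bad", "c.docx"])))  # -> ['b.bad']
```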

backend/graphrag_app/api/graph.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -8,6 +8,7 @@
     APIRouter,
     Depends,
     HTTPException,
+    status,
 )
 from fastapi.responses import StreamingResponse

@@ -31,6 +32,7 @@
     "/graphml/{container_name}",
     summary="Retrieve a GraphML file of the knowledge graph",
     response_description="GraphML file successfully downloaded",
+    status_code=status.HTTP_200_OK,
 )
 async def get_graphml_file(
     container_name, sanitized_container_name: str = Depends(sanitize_name)
```
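A note on the two decorator parameters this commit touches: in FastAPI, `status_code=...` sets the status code actually returned by the route on success, while the `responses={...}` mapping only documents codes and models in the generated OpenAPI schema. A minimal sketch showing the two together (the app and route here are hypothetical, not from this repo):

```python
from fastapi import FastAPI, status
from pydantic import BaseModel

app = FastAPI()

class BaseResponse(BaseModel):
    status: str

@app.post(
    "/upload",
    response_model=BaseResponse,
    status_code=status.HTTP_201_CREATED,  # the code actually sent on success
    responses={status.HTTP_201_CREATED: {"model": BaseResponse}},  # OpenAPI docs only
)
async def upload() -> BaseResponse:
    return BaseResponse(status="File upload successful.")
```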

backend/graphrag_app/api/index.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -12,6 +12,7 @@
     Depends,
     HTTPException,
     UploadFile,
+    status,
 )
 from kubernetes import (
     client as kubernetes_client,
@@ -49,7 +50,7 @@
     "",
     summary="Build an index",
     response_model=BaseResponse,
-    responses={200: {"model": BaseResponse}},
+    responses={status.HTTP_202_ACCEPTED: {"model": BaseResponse}},
 )
 async def schedule_index_job(
     storage_container_name: str,
@@ -142,7 +143,7 @@ async def schedule_index_job(
     "",
     summary="Get all index names",
     response_model=IndexNameList,
-    responses={200: {"model": IndexNameList}},
+    responses={status.HTTP_200_OK: {"model": IndexNameList}},
 )
 async def get_all_index_names(
     container_store_client=Depends(get_cosmos_container_store_client),
@@ -218,7 +219,7 @@ def _delete_k8s_job(job_name: str, namespace: str) -> None:
     "/{container_name}",
     summary="Delete a specified index",
     response_model=BaseResponse,
-    responses={200: {"model": BaseResponse}},
+    responses={status.HTTP_200_OK: {"model": BaseResponse}},
 )
 async def delete_index(
     container_name: str,
@@ -267,6 +268,7 @@ async def delete_index(
     "/status/{container_name}",
     summary="Track the status of an indexing job",
     response_model=IndexStatusResponse,
+    status_code=status.HTTP_200_OK,
 )
 async def get_index_status(
     container_name: str, sanitized_container_name: str = Depends(sanitize_name)
```
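The index routes now advertise 202 Accepted for scheduling (the build runs in the background, e.g. as a Kubernetes job) and 200 OK for status checks, which implies the usual schedule-then-poll flow. A rough client sketch; the base URL, the router prefix, and the IndexStatusResponse field and value names are all assumptions, not taken from this diff:

```python
import time

import requests  # simple synchronous client, purely illustrative

BASE_URL = "http://localhost:8000/index"  # assumed host and router prefix

# POST "" schedules an index build; 202 Accepted = queued, not finished
resp = requests.post(BASE_URL, params={"storage_container_name": "mydata"})
resp.raise_for_status()

# GET /status/{container_name} tracks the background job
while True:
    job = requests.get(f"{BASE_URL}/status/mydata").json()  # an IndexStatusResponse
    print(job)
    if job.get("status") in {"complete", "failed"}:  # field/value names assumed
        break
    time.sleep(10)
```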

backend/graphrag_app/api/prompt_tuning.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -11,6 +11,7 @@
     APIRouter,
     Depends,
     HTTPException,
+    status,
 )
 from graphrag.config.create_graphrag_config import create_graphrag_config

@@ -27,6 +28,7 @@
     "/prompts",
     summary="Generate custom graphrag prompts based on user-provided data.",
     description="Generating custom prompts from user-provided data may take several minutes to run based on the amount of data used.",
+    status_code=status.HTTP_200_OK,
 )
 async def generate_prompts(
     container_name: str,
```

backend/graphrag_app/api/query.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -10,6 +10,7 @@
     APIRouter,
     Depends,
     HTTPException,
+    status,
 )
 from graphrag.api.query import global_search, local_search
 from graphrag.config.create_graphrag_config import create_graphrag_config
@@ -42,7 +43,7 @@
     summary="Perform a global search across the knowledge graph index",
     description="The global query method generates answers by searching over all AI-generated community reports in a map-reduce fashion. This is a resource-intensive method, but often gives good responses for questions that require an understanding of the dataset as a whole.",
     response_model=GraphResponse,
-    responses={200: {"model": GraphResponse}},
+    responses={status.HTTP_200_OK: {"model": GraphResponse}},
 )
 async def global_query(request: GraphRequest):
     # this is a slightly modified version of the graphrag.query.cli.run_global_search method
@@ -122,7 +123,7 @@ async def global_query(request: GraphRequest):
     summary="Perform a local search across the knowledge graph index.",
     description="The local query method generates answers by combining relevant data from the AI-extracted knowledge-graph with text chunks of the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g. What are the healing properties of chamomile?).",
     response_model=GraphResponse,
-    responses={200: {"model": GraphResponse}},
+    responses={status.HTTP_200_OK: {"model": GraphResponse}},
 )
 async def local_query(request: GraphRequest):
     index_name = request.index_name
```
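With the responses mapping now keyed by status.HTTP_200_OK, a cheap regression check is to hit the route with FastAPI's TestClient and assert the code. A sketch; the app import path, the router prefix, and GraphRequest's query field are assumptions (only index_name is visible in this diff):

```python
from fastapi.testclient import TestClient

from graphrag_app.main import app  # assumed location of the FastAPI app

client = TestClient(app)

def test_global_query_returns_200():
    # "index_name" appears in this diff; "query" is assumed for illustration
    payload = {"index_name": "myindex", "query": "What themes span the dataset?"}
    response = client.post("/query/global", json=payload)  # assumed prefix
    assert response.status_code == 200
    response.json()  # should parse as the documented GraphResponse model
```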

backend/graphrag_app/api/query_streaming.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -12,6 +12,7 @@
     APIRouter,
     Depends,
     HTTPException,
+    status,
 )
 from fastapi.responses import StreamingResponse
 from graphrag.api.query import (
@@ -47,6 +48,7 @@
     "/global",
     summary="Stream a response back after performing a global search",
     description="The global query method generates answers by searching over all AI-generated community reports in a map-reduce fashion. This is a resource-intensive method, but often gives good responses for questions that require an understanding of the dataset as a whole.",
+    status_code=status.HTTP_200_OK,
 )
 async def global_search_streaming(request: GraphRequest):
     # this is a slightly modified version of graphrag_app.api.query.global_query() method
@@ -204,6 +206,7 @@ async def global_search_streaming(request: GraphRequest):
     "/local",
     summary="Stream a response back after performing a local search",
     description="The local query method generates answers by combining relevant data from the AI-extracted knowledge-graph with text chunks of the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g. What are the healing properties of chamomile?).",
+    status_code=status.HTTP_200_OK,
 )
 async def local_search_streaming(request: GraphRequest):
     # this is a slightly modified version of graphrag_app.api.query.local_query() method
```
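For the streaming routes, `status_code=status.HTTP_200_OK` pairs with the StreamingResponse imported at the top of this file: the status line and headers go out when the response starts, and body chunks follow as the search produces them. A minimal sketch of that combination, with a stand-in token generator (the real routes stream search output):

```python
from typing import AsyncGenerator

from fastapi import FastAPI, status
from fastapi.responses import StreamingResponse

app = FastAPI()

async def fake_search_stream() -> AsyncGenerator[str, None]:
    # stand-in for the incremental output of a global/local search
    for chunk in ["partial ", "answer ", "tokens"]:
        yield chunk

@app.post("/global", status_code=status.HTTP_200_OK)
async def global_search_streaming():
    # the status code is fixed before the first chunk is sent;
    # errors mid-stream can no longer change it
    return StreamingResponse(fake_search_stream(), media_type="text/plain")
```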

backend/graphrag_app/api/source.py

Lines changed: 11 additions & 6 deletions
```diff
@@ -5,7 +5,12 @@
 import traceback

 import pandas as pd
-from fastapi import APIRouter, Depends, HTTPException
+from fastapi import (
+    APIRouter,
+    Depends,
+    HTTPException,
+    status,
+)

 from graphrag_app.logger.load_logger import load_pipeline_logger
 from graphrag_app.typing.models import (
@@ -43,7 +48,7 @@
     "/report/{container_name}/{report_id}",
     summary="Return a single community report.",
     response_model=ReportResponse,
-    responses={200: {"model": ReportResponse}},
+    responses={status.HTTP_200_OK: {"model": ReportResponse}},
 )
 async def get_report_info(
     report_id: int,
@@ -88,7 +93,7 @@ async def get_report_info(
     "/text/{container_name}/{text_unit_id}",
     summary="Return a single base text unit.",
     response_model=TextUnitResponse,
-    responses={200: {"model": TextUnitResponse}},
+    responses={status.HTTP_200_OK: {"model": TextUnitResponse}},
 )
 async def get_chunk_info(
     text_unit_id: str,
@@ -148,7 +153,7 @@ async def get_chunk_info(
     "/entity/{container_name}/{entity_id}",
     summary="Return a single entity.",
     response_model=EntityResponse,
-    responses={200: {"model": EntityResponse}},
+    responses={status.HTTP_200_OK: {"model": EntityResponse}},
 )
 async def get_entity_info(
     entity_id: int,
@@ -190,7 +195,7 @@ async def get_entity_info(
     "/claim/{container_name}/{claim_id}",
     summary="Return a single claim.",
     response_model=ClaimResponse,
-    responses={200: {"model": ClaimResponse}},
+    responses={status.HTTP_200_OK: {"model": ClaimResponse}},
 )
 async def get_claim_info(
     claim_id: int,
@@ -240,7 +245,7 @@ async def get_claim_info(
     "/relationship/{container_name}/{relationship_id}",
     summary="Return a single relationship.",
     response_model=RelationshipResponse,
-    responses={200: {"model": RelationshipResponse}},
+    responses={status.HTTP_200_OK: {"model": RelationshipResponse}},
 )
 async def get_relationship_info(
     relationship_id: int,
```

backend/graphrag_app/utils/common.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -213,7 +213,7 @@ async def subscription_key_check(

 async def create_cache(container_client: ContainerClient) -> None:
     """
-    Create a file cache to track the uploaded files if it doesn't exist.
+    Create a file cache (csv) to track uploaded files.
     """
     try:
         cache_blob_client = container_client.get_blob_client("uploaded_files_cache.csv")
@@ -238,7 +238,9 @@ async def create_cache(container_client: ContainerClient) -> None:

 async def check_cache(file_stream: BinaryIO, container_client: ContainerClient) -> bool:
     """
-    Check if the file has already been uploaded.
+    Check a cache file to determine if a file has previously been uploaded.
+
+    Note: This function creates/checks a CSV file in azure storage to act as a cache of previously uploaded files.
     """
     try:
         # load the file cache
@@ -265,7 +267,7 @@ async def update_cache(
     filename: str, file_stream: BinaryIO, container_client: ContainerClient
 ) -> None:
     """
-    Update the file cache with the new file by appending a new row to the cache.
+    Update the file cache (csv) with a new file by adding a new row.
     """
     try:
         # Load the file cache
```
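The docstring updates make the cache mechanism explicit: a CSV blob (uploaded_files_cache.csv, per the create_cache hunk) living in the same storage container. A rough sketch of the check/update cycle with the async Azure SDK; note that the real check_cache takes the file stream, which suggests content-based keying (e.g. a hash), whereas this sketch keys by filename and assumes a single "filename" column for brevity:

```python
from io import BytesIO

import pandas as pd
from azure.storage.blob.aio import ContainerClient

CACHE_BLOB = "uploaded_files_cache.csv"

async def check_cache(filename: str, container_client: ContainerClient) -> bool:
    """Return True if the file is already recorded in the CSV cache."""
    blob_client = container_client.get_blob_client(CACHE_BLOB)
    downloader = await blob_client.download_blob()
    cache = pd.read_csv(BytesIO(await downloader.readall()))
    return filename in set(cache["filename"])  # column name is an assumption

async def update_cache(filename: str, container_client: ContainerClient) -> None:
    """Append a row for the new file and write the CSV back."""
    blob_client = container_client.get_blob_client(CACHE_BLOB)
    downloader = await blob_client.download_blob()
    cache = pd.read_csv(BytesIO(await downloader.readall()))
    cache = pd.concat([cache, pd.DataFrame({"filename": [filename]})])
    await blob_client.upload_blob(cache.to_csv(index=False), overwrite=True)
```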
