Skip to content
This repository was archived by the owner on May 27, 2025. It is now read-only.

Commit 83316b2

Browse files
authored
Upgrade to latest graphrag library release (#213)
1 parent 900c503 commit 83316b2

File tree

95 files changed

+3668
-3801
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

95 files changed

+3668
-3801
lines changed

.github/dependabot.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,6 @@
66
version: 2
77
updates:
88
- package-ecosystem: "pip"
9-
directory: "/"
9+
directory: "/backend"
1010
schedule:
1111
interval: "weekly"

.github/workflows/tests.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ jobs:
5555
- name: Run pytests
5656
working-directory: ${{ github.workspace }}/backend
5757
run: |
58-
pytest --cov=src --junitxml=test-results.xml tests/
58+
pytest --cov=graphrag_app --junitxml=test-results.xml tests/
5959
6060
- name: Upload test results
6161
uses: actions/upload-artifact@v4

backend/.coveragerc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
11
[run]
22
omit =
33
**/__init__.py
4-
src/models.py

backend/README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Web App
2+
This directory contains the source code for a FastAPI application that implements a REST API wrapper around the graphrag library. The app has been packaged up as a python package for a cleaner install/deployment experience.
3+
4+
## Package Layout
5+
The code has the following structure:
6+
```shell
7+
backend
8+
├── README.md
9+
├── graphrag_app # contains the main application files
10+
│   ├── __init__.py
11+
│   ├── api # endpoint definitions
12+
│   ├── logger # custom loggers designed for graphrag use
13+
│   ├── main.py # initializes the FastAPI application
14+
│   ├── typing # data validation models
15+
│   └── utils # utility/helper functions
16+
├── manifests # k8s manifest files
17+
├── poetry.lock
18+
├── pyproject.toml
19+
├── pytest.ini
20+
├── scripts # miscellaneous scripts that get executed in k8s
21+
└── tests # pytests (integration tests + unit tests)
22+
```
Lines changed: 66 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -3,28 +3,30 @@
33

44
import asyncio
55
import re
6+
import traceback
67
from math import ceil
78
from typing import List
89

9-
from azure.storage.blob import ContainerClient
10+
from azure.storage.blob.aio import ContainerClient
1011
from fastapi import (
1112
APIRouter,
13+
Depends,
1214
HTTPException,
1315
UploadFile,
1416
)
1517

16-
from src.api.azure_clients import AzureClientManager
17-
from src.api.common import (
18-
delete_blob_container,
19-
delete_cosmos_container_item,
20-
sanitize_name,
21-
validate_blob_container_name,
22-
)
23-
from src.logger import LoggerSingleton
24-
from src.models import (
18+
from graphrag_app.logger.load_logger import load_pipeline_logger
19+
from graphrag_app.typing.models import (
2520
BaseResponse,
2621
StorageNameList,
2722
)
23+
from graphrag_app.utils.common import (
24+
delete_cosmos_container_item_if_exist,
25+
delete_storage_container_if_exist,
26+
get_blob_container_client,
27+
get_cosmos_container_store_client,
28+
sanitize_name,
29+
)
2830

2931
data_route = APIRouter(
3032
prefix="/data",
@@ -34,26 +36,27 @@
3436

3537
@data_route.get(
3638
"",
37-
summary="Get all data storage containers.",
39+
summary="Get list of data containers.",
3840
response_model=StorageNameList,
3941
responses={200: {"model": StorageNameList}},
4042
)
41-
async def get_all_data_storage_containers():
43+
async def get_all_data_containers():
4244
"""
43-
Retrieve a list of all data storage containers.
45+
Retrieve a list of all data containers.
4446
"""
45-
azure_client_manager = AzureClientManager()
4647
items = []
4748
try:
48-
container_store_client = azure_client_manager.get_cosmos_container_client(
49-
database="graphrag", container="container-store"
50-
)
49+
container_store_client = get_cosmos_container_store_client()
5150
for item in container_store_client.read_all_items():
5251
if item["type"] == "data":
5352
items.append(item["human_readable_name"])
54-
except Exception:
55-
reporter = LoggerSingleton().get_instance()
56-
reporter.on_error("Error getting list of blob containers.")
53+
except Exception as e:
54+
reporter = load_pipeline_logger()
55+
reporter.error(
56+
message="Error getting list of blob containers.",
57+
cause=e,
58+
stack=traceback.format_exc(),
59+
)
5760
raise HTTPException(
5861
status_code=500, detail="Error getting list of blob containers."
5962
)
@@ -112,10 +115,13 @@ def __exit__(self, *args):
112115
responses={200: {"model": BaseResponse}},
113116
)
114117
async def upload_files(
115-
files: List[UploadFile], storage_name: str, overwrite: bool = True
118+
files: List[UploadFile],
119+
container_name: str,
120+
sanitized_container_name: str = Depends(sanitize_name),
121+
overwrite: bool = True,
116122
):
117123
"""
118-
Create a data storage container in Azure and upload files to it.
124+
Create an Azure Storage container and upload files to it.
119125
120126
Args:
121127
files (List[UploadFile]): A list of files to be uploaded.
@@ -128,80 +134,73 @@ async def upload_files(
128134
Raises:
129135
HTTPException: If the container name is invalid or if any error occurs during the upload process.
130136
"""
131-
sanitized_storage_name = sanitize_name(storage_name)
132-
# ensure container name follows Azure Blob Storage naming conventions
133-
try:
134-
validate_blob_container_name(sanitized_storage_name)
135-
except ValueError:
136-
raise HTTPException(
137-
status_code=500,
138-
detail=f"Invalid blob container name: '{storage_name}'. Please try a different name.",
139-
)
140137
try:
141-
azure_client_manager = AzureClientManager()
142-
blob_service_client = azure_client_manager.get_blob_service_client_async()
143-
container_client = blob_service_client.get_container_client(
144-
sanitized_storage_name
145-
)
146-
if not await container_client.exists():
147-
await container_client.create_container()
148-
149138
# clean files - remove illegal XML characters
150139
files = [UploadFile(Cleaner(f.file), filename=f.filename) for f in files]
151140

152141
# upload files in batches of 1000 to avoid exceeding Azure Storage API limits
142+
blob_container_client = await get_blob_container_client(
143+
sanitized_container_name
144+
)
153145
batch_size = 1000
154-
batches = ceil(len(files) / batch_size)
155-
for i in range(batches):
146+
num_batches = ceil(len(files) / batch_size)
147+
for i in range(num_batches):
156148
batch_files = files[i * batch_size : (i + 1) * batch_size]
157149
tasks = [
158-
upload_file_async(file, container_client, overwrite)
150+
upload_file_async(file, blob_container_client, overwrite)
159151
for file in batch_files
160152
]
161153
await asyncio.gather(*tasks)
162-
# update container-store in cosmosDB since upload process was successful
163-
container_store_client = azure_client_manager.get_cosmos_container_client(
164-
database="graphrag", container="container-store"
165-
)
166-
container_store_client.upsert_item({
167-
"id": sanitized_storage_name,
168-
"human_readable_name": storage_name,
154+
155+
# update container-store entry in cosmosDB once upload process is successful
156+
cosmos_container_store_client = get_cosmos_container_store_client()
157+
cosmos_container_store_client.upsert_item({
158+
"id": sanitized_container_name,
159+
"human_readable_name": container_name,
169160
"type": "data",
170161
})
171162
return BaseResponse(status="File upload successful.")
172-
except Exception:
173-
logger = LoggerSingleton().get_instance()
174-
logger.on_error("Error uploading files.", details={"files": files})
163+
except Exception as e:
164+
logger = load_pipeline_logger()
165+
logger.error(
166+
message="Error uploading files.",
167+
cause=e,
168+
stack=traceback.format_exc(),
169+
details={"files": [f.filename for f in files]},
170+
)
175171
raise HTTPException(
176172
status_code=500,
177-
detail=f"Error uploading files to container '{storage_name}'.",
173+
detail=f"Error uploading files to container '{container_name}'.",
178174
)
179175

180176

181177
@data_route.delete(
182-
"/{storage_name}",
178+
"/{container_name}",
183179
summary="Delete a data storage container",
184180
response_model=BaseResponse,
185181
responses={200: {"model": BaseResponse}},
186182
)
187-
async def delete_files(storage_name: str):
183+
async def delete_files(
184+
container_name: str, sanitized_container_name: str = Depends(sanitize_name)
185+
):
188186
"""
189187
Delete a specified data storage container.
190188
"""
191-
# azure_client_manager = AzureClientManager()
192-
sanitized_storage_name = sanitize_name(storage_name)
193189
try:
194-
# delete container in Azure Storage
195-
delete_blob_container(sanitized_storage_name)
196-
# delete entry from container-store in cosmosDB
197-
delete_cosmos_container_item("container-store", sanitized_storage_name)
198-
except Exception:
199-
logger = LoggerSingleton().get_instance()
200-
logger.on_error(
201-
f"Error deleting container {storage_name}.",
202-
details={"Container": storage_name},
190+
delete_storage_container_if_exist(sanitized_container_name)
191+
delete_cosmos_container_item_if_exist(
192+
"container-store", sanitized_container_name
193+
)
194+
except Exception as e:
195+
logger = load_pipeline_logger()
196+
logger.error(
197+
message=f"Error deleting container {container_name}.",
198+
cause=e,
199+
stack=traceback.format_exc(),
200+
details={"Container": container_name},
203201
)
204202
raise HTTPException(
205-
status_code=500, detail=f"Error deleting container '{storage_name}'."
203+
status_code=500,
204+
detail=f"Error deleting container '{container_name}'.",
206205
)
207206
return BaseResponse(status="Success")
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,21 @@
11
# Copyright (c) Microsoft Corporation.
22
# Licensed under the MIT License.
33

4+
import traceback
5+
46
from fastapi import (
57
APIRouter,
8+
Depends,
69
HTTPException,
710
)
811
from fastapi.responses import StreamingResponse
912

10-
from src.api.azure_clients import AzureClientManager
11-
from src.api.common import (
13+
from graphrag_app.logger.load_logger import load_pipeline_logger
14+
from graphrag_app.utils.azure_clients import AzureClientManager
15+
from graphrag_app.utils.common import (
1216
sanitize_name,
1317
validate_index_file_exist,
1418
)
15-
from src.logger import LoggerSingleton
1619

1720
graph_route = APIRouter(
1821
prefix="/graph",
@@ -21,31 +24,36 @@
2124

2225

2326
@graph_route.get(
24-
"/graphml/{index_name}",
27+
"/graphml/{container_name}",
2528
summary="Retrieve a GraphML file of the knowledge graph",
2629
response_description="GraphML file successfully downloaded",
2730
)
28-
async def get_graphml_file(index_name: str):
29-
# validate index_name and graphml file existence
31+
async def get_graphml_file(
32+
container_name, sanitized_container_name: str = Depends(sanitize_name)
33+
):
34+
# validate graphml file existence
3035
azure_client_manager = AzureClientManager()
31-
sanitized_index_name = sanitize_name(index_name)
32-
graphml_filename = "summarized_graph.graphml"
36+
graphml_filename = "graph.graphml"
3337
blob_filepath = f"output/{graphml_filename}" # expected file location of the graph based on the workflow
34-
validate_index_file_exist(sanitized_index_name, blob_filepath)
38+
validate_index_file_exist(sanitized_container_name, blob_filepath)
3539
try:
3640
blob_client = azure_client_manager.get_blob_service_client().get_blob_client(
37-
container=sanitized_index_name, blob=blob_filepath
41+
container=sanitized_container_name, blob=blob_filepath
3842
)
3943
blob_stream = blob_client.download_blob().chunks()
4044
return StreamingResponse(
4145
blob_stream,
4246
media_type="application/octet-stream",
4347
headers={"Content-Disposition": f"attachment; filename={graphml_filename}"},
4448
)
45-
except Exception:
46-
logger = LoggerSingleton().get_instance()
47-
logger.on_error("Could not retrieve graphml file")
49+
except Exception as e:
50+
logger = load_pipeline_logger()
51+
logger.error(
52+
message="Could not fetch graphml file",
53+
cause=e,
54+
stack=traceback.format_exc(),
55+
)
4856
raise HTTPException(
4957
status_code=500,
50-
detail=f"Could not retrieve graphml file for index '{index_name}'.",
58+
detail=f"Could not fetch graphml file for '{container_name}'.",
5159
)

0 commit comments

Comments
 (0)