Commit 79d7a70

dayesouza and natoverse authored
modify smoke tests to include csv table provider (#2229)
* add smoke tests for output csv
* change text fixture test
* change test
* add semver
* run format

Co-authored-by: Nathan Evans <github@talkswithnumbers.com>
1 parent 71445a3 commit 79d7a70

5 files changed: +41 -27

Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "add csv table smoke tests"
+}
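The commit message's "add semver" item refers to this change file. A minimal sketch of how semversioner-style change files can be inspected; the `.semversioner/next-release` directory is an assumption based on semversioner's conventions and is not shown in this commit:

```python
import json
from pathlib import Path

# Assumed location per semversioner conventions; not shown in this commit.
changes_dir = Path(".semversioner/next-release")

for change_file in sorted(changes_dir.glob("*.json")):
    change = json.loads(change_file.read_text())
    # Each file carries a bump type ("patch", "minor", "major") and a note.
    print(f"{change['type']}: {change['description']}")
```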

packages/graphrag-storage/graphrag_storage/tables/csv_table_provider.py

Lines changed: 1 addition & 1 deletion

@@ -70,7 +70,7 @@ async def read_dataframe(self, table_name: str) -> pd.DataFrame:
             # Handle empty CSV (pandas can't parse files with no columns)
             if not csv_data or csv_data.strip() == "":
                 return pd.DataFrame()
-            return pd.read_csv(StringIO(csv_data))
+            return pd.read_csv(StringIO(csv_data), keep_default_na=False)
         except Exception:
             logger.exception("error loading table from storage: %s", filename)
             raise
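The functional change is the `keep_default_na=False` flag. By default, pandas converts empty fields and tokens such as "NA" to NaN when parsing CSV, which breaks round-tripping tables that legitimately contain empty strings. A minimal sketch in plain pandas (independent of the provider code) showing the difference:

```python
from io import StringIO

import pandas as pd

csv_data = "id,description\n1,\n2,NA\n"

default_df = pd.read_csv(StringIO(csv_data))
strict_df = pd.read_csv(StringIO(csv_data), keep_default_na=False)

# Default parsing turns both the empty field and "NA" into NaN.
print(default_df["description"].isna().tolist())  # [True, True]
# keep_default_na=False preserves the original strings.
print(strict_df["description"].tolist())  # ['', 'NA']
```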

tests/fixtures/text/config.json

Lines changed: 12 additions & 9 deletions

@@ -21,9 +21,12 @@
             300
         ],
         "max_runtime": 30,
+        "nan_allowed_columns": [
+            "description"
+        ],
         "expected_artifacts": [
-            "entities.parquet",
-            "relationships.parquet"
+            "entities.csv",
+            "relationships.csv"
         ]
     },
     "create_communities": {
@@ -32,7 +35,7 @@
             30
         ],
         "max_runtime": 30,
-        "expected_artifacts": ["communities.parquet"]
+        "expected_artifacts": ["communities.csv"]
     },
     "create_community_reports_text": {
         "row_range": [
@@ -51,7 +54,7 @@
             "size"
         ],
         "max_runtime": 2000,
-        "expected_artifacts": ["community_reports.parquet"]
+        "expected_artifacts": ["community_reports.csv"]
     },
     "create_final_text_units": {
         "row_range": [
@@ -64,7 +67,7 @@
             "covariate_ids"
         ],
         "max_runtime": 30,
-        "expected_artifacts": ["text_units.parquet"]
+        "expected_artifacts": ["text_units.csv"]
     },
     "create_final_documents": {
         "row_range": [
@@ -75,7 +78,7 @@
             "raw_data"
         ],
         "max_runtime": 30,
-        "expected_artifacts": ["documents.parquet"]
+        "expected_artifacts": ["documents.csv"]
     },
     "generate_text_embeddings": {
         "row_range": [
@@ -84,9 +87,9 @@
         ],
         "max_runtime": 150,
         "expected_artifacts": [
-            "embeddings.text_unit_text.parquet",
-            "embeddings.entity_description.parquet",
-            "embeddings.community_full_content.parquet"
+            "embeddings.text_unit_text.csv",
+            "embeddings.entity_description.csv",
+            "embeddings.community_full_content.csv"
         ]
     }
 },
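For context, this is the shape of the check these fields drive. A sketch, not the actual harness: `output_dir` and the range values are illustrative, and the real logic lives in tests/smoke/test_fixtures.py below.

```python
from pathlib import Path

import pandas as pd

output_dir = Path("output")  # illustrative; not the fixture's real path
workflow = {
    "row_range": [1, 300],
    "expected_artifacts": ["entities.csv", "relationships.csv"],
}

for artifact in workflow["expected_artifacts"]:
    df = pd.read_csv(output_dir / artifact, keep_default_na=False)
    lo, hi = workflow["row_range"]
    assert lo <= len(df) <= hi, (
        f"Expected between {lo} and {hi}, found: {len(df)} for file: {artifact}"
    )
```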

tests/fixtures/text/settings.yml

Lines changed: 3 additions & 0 deletions

@@ -29,6 +29,9 @@ vector_store:
   api_key: ${AZURE_AI_SEARCH_API_KEY}
   container_name: "simple_text_ci"
 
+table_provider:
+  type: csv
+
 community_reports:
   prompt: "prompts/community_report.txt"
   max_length: 2000
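A minimal sketch of how a `table_provider` setting like this could be dispatched. The `read_table` helper is hypothetical; graphrag's actual provider classes live under packages/graphrag-storage/graphrag_storage/tables/.

```python
import pandas as pd
import yaml  # pyyaml

settings = yaml.safe_load("""
table_provider:
  type: csv
""")

def read_table(path: str, provider_type: str) -> pd.DataFrame:
    # Hypothetical dispatch mirroring the setting above.
    if provider_type == "csv":
        return pd.read_csv(path, keep_default_na=False)
    if provider_type == "parquet":
        return pd.read_parquet(path)
    msg = f"unknown table provider: {provider_type}"
    raise ValueError(msg)

# Example: read_table("output/entities.csv", settings["table_provider"]["type"])
```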

tests/smoke/test_fixtures.py

Lines changed: 21 additions & 17 deletions

@@ -178,25 +178,29 @@ def __assert_indexer_outputs(
         for artifact in workflow_artifacts:
             if artifact.endswith(".parquet"):
                 output_df = pd.read_parquet(output_path / artifact)
-
-                # Check number of rows between range
-                assert (
-                    config["row_range"][0]
-                    <= len(output_df)
-                    <= config["row_range"][1]
-                ), (
-                    f"Expected between {config['row_range'][0]} and {config['row_range'][1]}, found: {len(output_df)} for file: {artifact}"
+            elif artifact.endswith(".csv"):
+                output_df = pd.read_csv(
+                    output_path / artifact, keep_default_na=False
                 )
+            else:
+                continue
+
+            # Check number of rows between range
+            assert (
+                config["row_range"][0] <= len(output_df) <= config["row_range"][1]
+            ), (
+                f"Expected between {config['row_range'][0]} and {config['row_range'][1]}, found: {len(output_df)} for file: {artifact}"
+            )
 
-                # Get non-nan rows
-                nan_df = output_df.loc[
-                    :,
-                    ~output_df.columns.isin(config.get("nan_allowed_columns", [])),
-                ]
-                nan_df = nan_df[nan_df.isna().any(axis=1)]
-                assert len(nan_df) == 0, (
-                    f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}"
-                )
+            # Get non-nan rows
+            nan_df = output_df.loc[
+                :,
+                ~output_df.columns.isin(config.get("nan_allowed_columns", [])),
+            ]
+            nan_df = nan_df[nan_df.isna().any(axis=1)]
+            assert len(nan_df) == 0, (
+                f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}"
+            )
 
     def __run_query(self, root: Path, query_config: dict[str, str]):
         command = [
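The NaN check above first drops any columns listed in `nan_allowed_columns`, then flags rows with missing values in the remaining columns. A self-contained toy example of the same filter:

```python
import pandas as pd

df = pd.DataFrame({
    "title": ["a", "b", None],
    "description": [None, "x", "y"],  # allowed to contain NaN
})
nan_allowed_columns = ["description"]

# Drop allowed columns, then keep only rows with at least one NaN.
checked = df.loc[:, ~df.columns.isin(nan_allowed_columns)]
nan_rows = checked[checked.isna().any(axis=1)]

print(len(nan_rows))  # 1 -> only the last row has NaN in a checked column
```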