
Commit a735808

Gaudy Blanco authored and committed
Merge remote-tracking branch 'origin/main' into custom_vector_store_schema
2 parents: 2ab692d + 6c66b7c

File tree

10 files changed: +1698 -1679 lines changed
Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+{
+    "type": "minor",
+    "description": "Add config for NLP async mode."
+}
Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Add gpt-5 support by updating fnllm dependency."
+}

docs/prompt_tuning/auto_prompt_tuning.md

Lines changed: 2 additions & 2 deletions

@@ -58,13 +58,13 @@ graphrag prompt-tune [--root ROOT] [--config CONFIG] [--domain DOMAIN] [--selec
 ```bash
 python -m graphrag prompt-tune --root /path/to/project --config /path/to/settings.yaml --domain "environmental news" \
 --selection-method random --limit 10 --language English --max-tokens 2048 --chunk-size 256 --min-examples-required 3 \
---no-entity-types --output /path/to/output
+--no-discover-entity-types --output /path/to/output
 ```

 or, with minimal configuration (suggested):

 ```bash
-python -m graphrag prompt-tune --root /path/to/project --config /path/to/settings.yaml --no-entity-types
+python -m graphrag prompt-tune --root /path/to/project --config /path/to/settings.yaml --no-discover-entity-types
 ```

 ## Document Selection Methods
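To confirm the renamed flag locally, the CLI help text is the quickest check (a sketch assuming the standard --help output of the prompt-tune subcommand):

```bash
python -m graphrag prompt-tune --help | grep discover-entity-types
```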

graphrag/config/defaults.py

Lines changed: 1 addition & 0 deletions

@@ -214,6 +214,7 @@ class ExtractGraphNLPDefaults:
     normalize_edge_weights: bool = True
     text_analyzer: TextAnalyzerDefaults = field(default_factory=TextAnalyzerDefaults)
     concurrent_requests: int = 25
+    async_mode: AsyncType = AsyncType.Threaded


 @dataclass
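For context, a minimal sketch of the enum behind this new default, assuming only the two values implied elsewhere in this commit (the init template below offers `asyncio` as the alternative to the `threaded` default); the real definition lives in `graphrag.config.enums` and may differ in detail:

```python
from enum import Enum


class AsyncType(str, Enum):
    """Assumed shape of the async-mode enum referenced by the new default."""

    AsyncIO = "asyncio"    # schedule extraction coroutines directly on the event loop
    Threaded = "threaded"  # run extraction on a thread pool (the new default)
```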

graphrag/config/init_content.py

Lines changed: 1 addition & 0 deletions

@@ -112,6 +112,7 @@
 extract_graph_nlp:
   text_analyzer:
     extractor_type: {graphrag_config_defaults.extract_graph_nlp.text_analyzer.extractor_type.value} # [regex_english, syntactic_parser, cfg]
+  async_mode: {graphrag_config_defaults.extract_graph_nlp.async_mode.value} # or asyncio

 cluster_graph:
   max_cluster_size: {graphrag_config_defaults.cluster_graph.max_cluster_size}
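With the shipped defaults, the generated settings.yaml section should render roughly as below (a sketch; the exact extractor_type and async_mode strings are substituted from graphrag_config_defaults at init time):

```yaml
extract_graph_nlp:
  text_analyzer:
    extractor_type: regex_english # [regex_english, syntactic_parser, cfg]
  async_mode: threaded # or asyncio
```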

graphrag/config/models/extract_graph_nlp_config.py

Lines changed: 5 additions & 1 deletion

@@ -6,7 +6,7 @@
 from pydantic import BaseModel, Field

 from graphrag.config.defaults import graphrag_config_defaults
-from graphrag.config.enums import NounPhraseExtractorType
+from graphrag.config.enums import AsyncType, NounPhraseExtractorType


 class TextAnalyzerConfig(BaseModel):
@@ -68,3 +68,7 @@ class ExtractGraphNLPConfig(BaseModel):
         description="The number of threads to use for the extraction process.",
         default=graphrag_config_defaults.extract_graph_nlp.concurrent_requests,
     )
+    async_mode: AsyncType = Field(
+        description="The async mode to use.",
+        default=graphrag_config_defaults.extract_graph_nlp.async_mode,
+    )
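A minimal sketch of overriding the new field programmatically, assuming the import paths shown in this diff and an `AsyncIO` member on the enum (in normal use the value comes from settings.yaml instead):

```python
from graphrag.config.enums import AsyncType
from graphrag.config.models.extract_graph_nlp_config import ExtractGraphNLPConfig

# The default is AsyncType.Threaded via graphrag_config_defaults; switch to asyncio here.
nlp_config = ExtractGraphNLPConfig(async_mode=AsyncType.AsyncIO)
print(nlp_config.async_mode.value)  # "asyncio" (assuming the member names above)
```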

graphrag/index/operations/build_noun_graph/build_noun_graph.py

Lines changed: 8 additions & 2 deletions

@@ -24,12 +24,17 @@ async def build_noun_graph(
     text_analyzer: BaseNounPhraseExtractor,
     normalize_edge_weights: bool,
     num_threads: int = 4,
+    async_mode: AsyncType = AsyncType.Threaded,
     cache: PipelineCache | None = None,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Build a noun graph from text units."""
     text_units = text_unit_df.loc[:, ["id", "text"]]
     nodes_df = await _extract_nodes(
-        text_units, text_analyzer, num_threads=num_threads, cache=cache
+        text_units,
+        text_analyzer,
+        num_threads=num_threads,
+        async_mode=async_mode,
+        cache=cache,
     )
     edges_df = _extract_edges(nodes_df, normalize_edge_weights=normalize_edge_weights)
     return (nodes_df, edges_df)
@@ -39,6 +44,7 @@ async def _extract_nodes(
     text_unit_df: pd.DataFrame,
     text_analyzer: BaseNounPhraseExtractor,
     num_threads: int = 4,
+    async_mode: AsyncType = AsyncType.Threaded,
     cache: PipelineCache | None = None,
 ) -> pd.DataFrame:
     """
@@ -64,7 +70,7 @@ async def extract(row):
         text_unit_df,
         extract,
         num_threads=num_threads,
-        async_type=AsyncType.Threaded,
+        async_type=async_mode,
         progress_msg="extract noun phrases progress: ",
     )
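The row-processing helper that receives `async_type` is not shown in this hunk. Purely as an illustration of what such a switch typically selects between, here is a hypothetical `run_rows` dispatcher (not the repository's implementation), reusing the AsyncType import path from the diff above:

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor

from graphrag.config.enums import AsyncType  # import path taken from the diff above


async def run_rows(rows, fn, num_threads, async_type):
    """Hypothetical dispatcher: run an async `fn` over rows under the chosen mode."""
    if async_type == AsyncType.AsyncIO:
        # Pure asyncio: one task per row, concurrency bounded by a semaphore.
        sem = asyncio.Semaphore(num_threads)

        async def guarded(row):
            async with sem:
                return await fn(row)

        return await asyncio.gather(*(guarded(r) for r in rows))

    # Threaded (the default): run each row's coroutine to completion on a worker
    # thread with its own event loop, keeping CPU-bound NLP work off the main loop.
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=num_threads) as pool:

        def run_one(row):
            return asyncio.run(fn(row))

        futures = [loop.run_in_executor(pool, run_one, r) for r in rows]
        return await asyncio.gather(*futures)
```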

graphrag/index/workflows/extract_graph_nlp.py

Lines changed: 1 addition & 0 deletions

@@ -61,6 +61,7 @@ async def extract_graph_nlp(
         text_analyzer=text_analyzer,
         normalize_edge_weights=extraction_config.normalize_edge_weights,
         num_threads=extraction_config.concurrent_requests,
+        async_mode=extraction_config.async_mode,
         cache=cache,
     )

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@ dependencies = [
     # Async IO
     "aiofiles>=24.1.0",
     # LLM
-    "fnllm[azure,openai]>=0.3.0",
+    "fnllm[azure,openai]>=0.4.1",
     "json-repair>=0.30.3",
     "openai>=1.68.0",
     "nltk==3.9.1",

uv.lock

Lines changed: 1671 additions & 1673 deletions
Some generated files are not rendered by default.

0 commit comments
