Skip to content

Commit d31750f

Browse files
NLP graph extraction (#1652)
* Add NLP extraction workflow * Add text unit community summarization * Add CLI flag for indexing method * Regenerate poetry.lock * Fix claims loading * Merge fixes * Add workflow overrides to config * Semver * Add graph pruning config * Remove degree re-compute from pruning * Switch to percentile for edge weight pruning * Add NLP extraction config * Add new NLP extractor options * Add FGR workflows to util method * Use a generator factory for workflows * Update pruning defaults --------- Co-authored-by: Alonso Guevara <[email protected]>
1 parent eeee84e commit d31750f

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

51 files changed

+2908
-126
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "minor",
3+
"description": "Add NLP graph extraction."
4+
}

dictionary.txt

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,17 @@ cosmosdb
3333
Hnsw
3434
odata
3535

36-
# NLTK Terms
36+
# NLP Terms
3737
chunker
3838
wordnet
3939
maxent
4040
punkt
41+
punct
42+
lemmatizer
43+
PROPN
44+
Syntatic
45+
ents
46+
INTJ
4147

4248
# Libraries
4349
Langchain

graphrag/api/index.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,18 +13,19 @@
1313
from graphrag.cache.noop_pipeline_cache import NoopPipelineCache
1414
from graphrag.callbacks.reporting import create_pipeline_reporter
1515
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
16-
from graphrag.config.enums import CacheType
16+
from graphrag.config.enums import CacheType, IndexingMethod
1717
from graphrag.config.models.graph_rag_config import GraphRagConfig
18-
from graphrag.index.run.run_workflows import run_workflows
18+
from graphrag.index.run.run_pipeline import run_pipeline
1919
from graphrag.index.typing import PipelineRunResult
20+
from graphrag.index.workflows.factory import create_pipeline
2021
from graphrag.logger.base import ProgressLogger
21-
from graphrag.utils.api import get_workflows_list
2222

2323
log = logging.getLogger(__name__)
2424

2525

2626
async def build_index(
2727
config: GraphRagConfig,
28+
method: IndexingMethod = IndexingMethod.Standard,
2829
memory_profile: bool = False,
2930
callbacks: list[WorkflowCallbacks] | None = None,
3031
progress_logger: ProgressLogger | None = None,
@@ -35,6 +36,8 @@ async def build_index(
3536
----------
3637
config : GraphRagConfig
3738
The configuration.
39+
method : IndexingMethod default=IndexingMethod.Standard
40+
Styling of indexing to perform (full LLM, NLP + LLM, etc.).
3841
memory_profile : bool
3942
Whether to enable memory profiling.
4043
callbacks : list[WorkflowCallbacks] | None default=None
@@ -61,10 +64,10 @@ async def build_index(
6164
if memory_profile:
6265
log.warning("New pipeline does not yet support memory profiling.")
6366

64-
workflows = get_workflows_list(config)
67+
pipeline = create_pipeline(config, method)
6568

66-
async for output in run_workflows(
67-
workflows,
69+
async for output in run_pipeline(
70+
pipeline,
6871
config,
6972
cache=pipeline_cache,
7073
callbacks=callbacks,

graphrag/cli/index.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from pathlib import Path
1111

1212
import graphrag.api as api
13-
from graphrag.config.enums import CacheType
13+
from graphrag.config.enums import CacheType, IndexingMethod
1414
from graphrag.config.load_config import load_config
1515
from graphrag.config.logging import enable_logging_with_config
1616
from graphrag.index.validate_config import validate_config_names
@@ -63,6 +63,7 @@ def handle_signal(signum, _):
6363

6464
def index_cli(
6565
root_dir: Path,
66+
method: IndexingMethod,
6667
verbose: bool,
6768
memprofile: bool,
6869
cache: bool,
@@ -81,6 +82,7 @@ def index_cli(
8182

8283
_run_index(
8384
config=config,
85+
method=method,
8486
verbose=verbose,
8587
memprofile=memprofile,
8688
cache=cache,
@@ -92,6 +94,7 @@ def index_cli(
9294

9395
def update_cli(
9496
root_dir: Path,
97+
method: IndexingMethod,
9598
verbose: bool,
9699
memprofile: bool,
97100
cache: bool,
@@ -119,6 +122,7 @@ def update_cli(
119122

120123
_run_index(
121124
config=config,
125+
method=method,
122126
verbose=verbose,
123127
memprofile=memprofile,
124128
cache=cache,
@@ -130,6 +134,7 @@ def update_cli(
130134

131135
def _run_index(
132136
config,
137+
method,
133138
verbose,
134139
memprofile,
135140
cache,
@@ -170,6 +175,7 @@ def _run_index(
170175
outputs = asyncio.run(
171176
api.build_index(
172177
config=config,
178+
method=method,
173179
memory_profile=memprofile,
174180
progress_logger=progress_logger,
175181
)

graphrag/cli/main.py

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@
66
import os
77
import re
88
from collections.abc import Callable
9-
from enum import Enum
109
from pathlib import Path
1110
from typing import Annotated
1211

1312
import typer
1413

14+
from graphrag.config.enums import IndexingMethod, SearchMethod
1515
from graphrag.logger.types import LoggerType
1616
from graphrag.prompt_tune.defaults import (
1717
MAX_TOKEN_COUNT,
@@ -82,19 +82,6 @@ def completer(incomplete: str) -> list[str]:
8282
return completer
8383

8484

85-
class SearchType(Enum):
86-
"""The type of search to run."""
87-
88-
LOCAL = "local"
89-
GLOBAL = "global"
90-
DRIFT = "drift"
91-
BASIC = "basic"
92-
93-
def __str__(self):
94-
"""Return the string representation of the enum value."""
95-
return self.value
96-
97-
9885
@app.command("init")
9986
def _initialize_cli(
10087
root: Annotated[
@@ -141,6 +128,9 @@ def _index_cli(
141128
),
142129
),
143130
] = Path(), # set default to current directory
131+
method: Annotated[
132+
IndexingMethod, typer.Option(help="The indexing method to use.")
133+
] = IndexingMethod.Standard,
144134
verbose: Annotated[
145135
bool, typer.Option(help="Run the indexing pipeline with verbose logging")
146136
] = False,
@@ -186,6 +176,7 @@ def _index_cli(
186176
dry_run=dry_run,
187177
skip_validation=skip_validation,
188178
output_dir=output,
179+
method=method,
189180
)
190181

191182

@@ -207,6 +198,9 @@ def _update_cli(
207198
resolve_path=True,
208199
),
209200
] = Path(), # set default to current directory
201+
method: Annotated[
202+
IndexingMethod, typer.Option(help="The indexing method to use.")
203+
] = IndexingMethod.Standard,
210204
verbose: Annotated[
211205
bool, typer.Option(help="Run the indexing pipeline with verbose logging")
212206
] = False,
@@ -249,6 +243,7 @@ def _update_cli(
249243
config_filepath=config,
250244
skip_validation=skip_validation,
251245
output_dir=output,
246+
method=method,
252247
)
253248

254249

@@ -364,7 +359,7 @@ def _prompt_tune_cli(
364359

365360
@app.command("query")
366361
def _query_cli(
367-
method: Annotated[SearchType, typer.Option(help="The query algorithm to use.")],
362+
method: Annotated[SearchMethod, typer.Option(help="The query algorithm to use.")],
368363
query: Annotated[str, typer.Option(help="The query to execute.")],
369364
config: Annotated[
370365
Path | None,
@@ -433,7 +428,7 @@ def _query_cli(
433428
)
434429

435430
match method:
436-
case SearchType.LOCAL:
431+
case SearchMethod.LOCAL:
437432
run_local_search(
438433
config_filepath=config,
439434
data_dir=data,
@@ -443,7 +438,7 @@ def _query_cli(
443438
streaming=streaming,
444439
query=query,
445440
)
446-
case SearchType.GLOBAL:
441+
case SearchMethod.GLOBAL:
447442
run_global_search(
448443
config_filepath=config,
449444
data_dir=data,
@@ -454,7 +449,7 @@ def _query_cli(
454449
streaming=streaming,
455450
query=query,
456451
)
457-
case SearchType.DRIFT:
452+
case SearchMethod.DRIFT:
458453
run_drift_search(
459454
config_filepath=config,
460455
data_dir=data,
@@ -464,7 +459,7 @@ def _query_cli(
464459
response_type=response_type,
465460
query=query,
466461
)
467-
case SearchType.BASIC:
462+
case SearchMethod.BASIC:
468463
run_basic_search(
469464
config_filepath=config,
470465
data_dir=data,

graphrag/config/defaults.py

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
InputFileType,
1313
InputType,
1414
LLMType,
15+
NounPhraseExtractorType,
1516
OutputType,
1617
ReportingType,
1718
TextEmbeddingTarget,
@@ -42,8 +43,11 @@
4243
LLM_SLEEP_ON_RATE_LIMIT_RECOMMENDATION = True
4344
LLM_CONCURRENT_REQUESTS = 25
4445

46+
PARALLELIZATION_STAGGER = 0.3
47+
PARALLELIZATION_NUM_THREADS = 50
48+
4549
#
46-
# Text Embedding Parameters
50+
# Text embedding
4751
#
4852
EMBEDDING_TYPE = LLMType.OpenAIEmbedding
4953
EMBEDDING_MODEL = "text-embedding-3-small"
@@ -52,36 +56,67 @@
5256
EMBEDDING_TARGET = TextEmbeddingTarget.required
5357
EMBEDDING_MODEL_ID = DEFAULT_EMBEDDING_MODEL_ID
5458

59+
# LLM response caching
5560
CACHE_TYPE = CacheType.file
5661
CACHE_BASE_DIR = "cache"
62+
63+
# Text chunking
5764
CHUNK_SIZE = 1200
5865
CHUNK_OVERLAP = 100
5966
CHUNK_GROUP_BY_COLUMNS = ["id"]
6067
CHUNK_STRATEGY = ChunkStrategyType.tokens
68+
69+
# Claim extraction
6170
CLAIM_DESCRIPTION = (
6271
"Any claims or facts that could be relevant to information discovery."
6372
)
6473
CLAIM_MAX_GLEANINGS = 1
6574
CLAIM_EXTRACTION_ENABLED = False
6675
CLAIM_EXTRACTION_MODEL_ID = DEFAULT_CHAT_MODEL_ID
76+
77+
# Graph clustering
6778
MAX_CLUSTER_SIZE = 10
6879
USE_LCC = True
6980
CLUSTER_GRAPH_SEED = 0xDEADBEEF
81+
82+
# Community report summarization
7083
COMMUNITY_REPORT_MAX_LENGTH = 2000
7184
COMMUNITY_REPORT_MAX_INPUT_LENGTH = 8000
7285
COMMUNITY_REPORT_MODEL_ID = DEFAULT_CHAT_MODEL_ID
86+
87+
# Graph extraction via LLM
7388
ENTITY_EXTRACTION_ENTITY_TYPES = ["organization", "person", "geo", "event"]
7489
ENTITY_EXTRACTION_MAX_GLEANINGS = 1
7590
ENTITY_EXTRACTION_MODEL_ID = DEFAULT_CHAT_MODEL_ID
91+
92+
# Graph extraction via NLP
93+
NLP_NORMALIZE_EDGE_WEIGHTS = True
94+
NLP_EXTRACTOR_TYPE = NounPhraseExtractorType.RegexEnglish
95+
NLP_MAX_WORD_LENGTH = 15
96+
NLP_MODEL_NAME = "en_core_web_md"
97+
NLP_EXCLUDE_NOUNS = None
98+
NLP_WORD_DELIMITER = " "
99+
NLP_INCLUDE_NAMED_ENTITIES = True
100+
NLP_EXCLUDE_ENTITY_TAGS = ["DATE"]
101+
NLP_EXCLUDE_POS_TAGS = ["DET", "PRON", "INTJ", "X"]
102+
NLP_NOUN_PHRASE_TAGS = ["PROPN", "NOUNS"]
103+
NLP_NOUN_PHRASE_CFG = {
104+
"PROPN,PROPN": "PROPN",
105+
"NOUN,NOUN": "NOUNS",
106+
"NOUNS,NOUN": "NOUNS",
107+
"ADJ,ADJ": "ADJ",
108+
"ADJ,NOUN": "NOUNS",
109+
}
110+
111+
# Input file params
76112
INPUT_FILE_TYPE = InputFileType.text
77113
INPUT_TYPE = InputType.file
78114
INPUT_BASE_DIR = "input"
79115
INPUT_FILE_ENCODING = "utf-8"
80116
INPUT_TEXT_COLUMN = "text"
81117
INPUT_CSV_PATTERN = ".*\\.csv$"
82118
INPUT_TEXT_PATTERN = ".*\\.txt$"
83-
PARALLELIZATION_STAGGER = 0.3
84-
PARALLELIZATION_NUM_THREADS = 50
119+
85120
NODE2VEC_ENABLED = False
86121
NODE2VEC_DIMENSIONS = 1536
87122
NODE2VEC_NUM_WALKS = 10
@@ -101,6 +136,14 @@
101136
UMAP_ENABLED = False
102137
UPDATE_OUTPUT_BASE_DIR = "update_output"
103138

139+
# Graph Pruning
140+
PRUNE_MIN_NODE_FREQ = 2
141+
PRUNE_MAX_NODE_FREQ_STD = None
142+
PRUNE_MIN_NODE_DEGREE = 1
143+
PRUNE_MAX_NODE_DEGREE_STD = None
144+
PRUNE_MIN_EDGE_WEIGHT_PCT = 40
145+
PRUNE_REMOVE_EGO_NODES = False
146+
PRUNE_LCC_ONLY = False
104147

105148
VECTOR_STORE_TYPE = VectorStoreType.LanceDB.value
106149
VECTOR_STORE_DB_URI = str(Path(OUTPUT_BASE_DIR) / "lancedb")

graphrag/config/enums.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,3 +140,36 @@ class ChunkStrategyType(str, Enum):
140140
def __repr__(self):
141141
"""Get a string representation."""
142142
return f'"{self.value}"'
143+
144+
145+
class SearchMethod(Enum):
146+
"""The type of search to run."""
147+
148+
LOCAL = "local"
149+
GLOBAL = "global"
150+
DRIFT = "drift"
151+
BASIC = "basic"
152+
153+
def __str__(self):
154+
"""Return the string representation of the enum value."""
155+
return self.value
156+
157+
158+
class IndexingMethod(str, Enum):
159+
"""Enum for the type of indexing to perform."""
160+
161+
Standard = "standard"
162+
"""Traditional GraphRAG indexing, with all graph construction and summarization performed by a language model."""
163+
Fast = "fast"
164+
"""Fast indexing, using NLP for graph construction and language model for summarization."""
165+
166+
167+
class NounPhraseExtractorType(str, Enum):
168+
"""Enum for the noun phrase extractor options."""
169+
170+
RegexEnglish = "regex_english"
171+
"""Standard extractor using regex. Fastest, but limited to English."""
172+
Syntactic = "syntactic_parser"
173+
"""Noun phrase extractor based on dependency parsing and NER using SpaCy."""
174+
CFG = "cfg"
175+
"""Noun phrase extractor combining CFG-based noun-chunk extraction and NER."""

0 commit comments

Comments
 (0)