Skip to content

Commit b4b8b81

Browse files
Remove spacy model from toml (#1771)
* Remove spacy model from toml
* Semver
1 parent 716f93d commit b4b8b81

File tree

6 files changed: +35 -22 lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "patch",
3+
"description": "Remove spacy model from toml file"
4+
}

graphrag/index/operations/build_noun_graph/np_extractors/base.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,13 @@
33

44
"""Base class for noun phrase extractors."""
55

6+
import logging
67
from abc import ABCMeta, abstractmethod
78

9+
import spacy
10+
11+
log = logging.getLogger(__name__)
12+
813

914
class BaseNounPhraseExtractor(metaclass=ABCMeta):
1015
"""Abstract base class for noun phrase extractors."""
@@ -37,3 +42,20 @@ def extract(self, text: str) -> list[str]:
3742
@abstractmethod
3843
def __str__(self) -> str:
3944
"""Return string representation of the extractor, used for cache key generation."""
45+
46+
@staticmethod
47+
def load_spacy_model(
48+
model_name: str, exclude: list[str] | None = None
49+
) -> spacy.language.Language:
50+
"""Load a SpaCy model."""
51+
if exclude is None:
52+
exclude = []
53+
try:
54+
return spacy.load(model_name, exclude=exclude)
55+
except OSError:
56+
msg = f"Model `{model_name}` not found. Attempting to download..."
57+
log.info(msg)
58+
from spacy.cli.download import download
59+
60+
download(model_name)
61+
return spacy.load(model_name, exclude=exclude)

graphrag/index/operations/build_noun_graph/np_extractors/cfg_extractor.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
from typing import Any
77

8-
import spacy
98
from spacy.tokens.doc import Doc
109

1110
from graphrag.index.operations.build_noun_graph.np_extractors.base import (
@@ -57,9 +56,13 @@ def __init__(
5756
self.include_named_entities = include_named_entities
5857
self.exclude_entity_tags = exclude_entity_tags
5958
if not include_named_entities:
60-
self.nlp = spacy.load(model_name, exclude=["lemmatizer", "parser", "ner"])
59+
self.nlp = self.load_spacy_model(
60+
model_name, exclude=["lemmatizer", "parser", "ner"]
61+
)
6162
else:
62-
self.nlp = spacy.load(model_name, exclude=["lemmatizer", "parser"])
63+
self.nlp = self.load_spacy_model(
64+
model_name, exclude=["lemmatizer", "parser"]
65+
)
6366

6467
self.exclude_pos_tags = exclude_pos_tags
6568
self.noun_phrase_grammars = noun_phrase_grammars

graphrag/index/operations/build_noun_graph/np_extractors/syntactic_parsing_extractor.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
from typing import Any
77

8-
import spacy
98
from spacy.tokens.span import Span
109
from spacy.util import filter_spans
1110

@@ -55,9 +54,9 @@ def __init__(
5554
self.include_named_entities = include_named_entities
5655
self.exclude_entity_tags = exclude_entity_tags
5756
if not include_named_entities:
58-
self.nlp = spacy.load(model_name, exclude=["lemmatizer", "ner"])
57+
self.nlp = self.load_spacy_model(model_name, exclude=["lemmatizer", "ner"])
5958
else:
60-
self.nlp = spacy.load(model_name, exclude=["lemmatizer"])
59+
self.nlp = self.load_spacy_model(model_name, exclude=["lemmatizer"])
6160

6261
self.exclude_pos_tags = exclude_pos_tags
6362

poetry.lock

Lines changed: 1 addition & 15 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ format-jinja = """
4747
[tool.poetry.dependencies]
4848
python = ">=3.10,<3.13"
4949
environs = "^11.0.0"
50-
en-core-web-md = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0.tar.gz" }
5150

5251
# Vector Stores
5352
azure-search-documents = "^11.5.2"

0 commit comments

Comments
 (0)