Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.10-bullseye
FROM python:3.13-slim

# install base packages
RUN apt-get clean \
Expand Down
7 changes: 3 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/e

Note: We strongly recommend that you use an isolated Python environment (such as virtualenv or conda) to install scispacy.
Take a look below in the "Setting up a virtual environment" section if you need some help with this.
Additionally, scispacy uses modern features of Python and as such is only available for **Python 3.6 or greater**.
Additionally, scispacy uses modern features of Python and as such is only available for **Python 3.9 through 3.13**.


#### Setting up a virtual environment
Expand All @@ -35,10 +35,10 @@ environment you want to use, you can skip to the 'installing via pip' section.

1. [Follow the installation instructions for Mamba](https://mamba.readthedocs.io/en/latest/installation/mamba-installation.html).

2. Create a Conda environment called "scispacy" with Python 3.9 (any version >= 3.6 should work):
2. Create a Conda environment called "scispacy" with Python 3.13 (any supported version between 3.9 and 3.13 will work):

```bash
mamba create -n scispacy python=3.10
mamba create -n scispacy python=3.13
```

3. Activate the Mamba environment. You will need to activate the Conda environment in each terminal in which you want to use scispaCy.
Expand Down Expand Up @@ -331,4 +331,3 @@ If you use ScispaCy in your research, please cite [ScispaCy: Fast and Robust Mod

ScispaCy is an open-source project developed by [the Allen Institute for Artificial Intelligence (AI2)](http://www.allenai.org).
AI2 is a non-profit institute with the mission to contribute to humanity through high-impact AI research and engineering.

15 changes: 10 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@ classifiers = [
"Intended Audience :: Science/Research",
"Development Status :: 3 - Alpha",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Bio-Informatics",
]
Expand All @@ -36,18 +40,19 @@ license-files = [
"LICENSE",
]

requires-python = ">=3.9,<3.13"
requires-python = ">=3.9,<3.14"
dependencies = [
"spacy>=3.7.0,<3.9.0",
"spacy>=3.8.11,<3.9.0",
"scipy",
"requests>=2.0.0,<3.0.0",
"conllu",
# numpy needs to be constrained until spacy and nmslib are rebuilt
"numpy<2.0",
# numpy>=2 is required to support Python 3.13 wheels
"numpy>=2.1.1",
"joblib",
"nmslib>=2.1.2",
"scikit-learn>=0.20.3",
"pysbd",
"regex",
]

# See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#urls
Expand Down
10 changes: 6 additions & 4 deletions requirements.in
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
numpy
# NOTE: scipy<1.11 is required when creating the linkers, so that's currently
# only supported on Python<3.11
# https://github.com/allenai/scispacy/issues/519#issuecomment-2229915999
# NOTE: Building the linkers depends on SciPy's sparse matrix serialization.
# See https://github.com/allenai/scispacy/issues/519#issuecomment-2229915999
# for historical context; current releases (including the Python 3.13-compatible
# ones) are handled automatically by scispacy.util.scipy_supports_sparse_float16.
scipy
spacy>=3.7.0,<3.8.0
spacy>=3.8.11,<3.9.0
spacy-lookups-data
pandas
requests>=2.0.0,<3.0.0
conllu
regex

# Candidate generation and entity linking
joblib
Expand Down
19 changes: 10 additions & 9 deletions scispacy/abbreviation.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,15 +244,16 @@ def find_matches_for(
rules[long.text] = long
# Add a rule to a matcher to find exactly this substring.
self.global_matcher.add(long.text, [[{"ORTH": x.text} for x in short]])
to_remove = set()
global_matches = self.global_matcher(doc)
for match, start, end in global_matches:
string_key = self.global_matcher.vocab.strings[match] # type: ignore
to_remove.add(string_key)
all_occurences[rules[string_key]].add(doc[start:end])
for key in to_remove:
# Clean up the global matcher.
self.global_matcher.remove(key)
if rules:
to_remove = set()
global_matches = self.global_matcher(doc)
for match, start, end in global_matches:
string_key = self.global_matcher.vocab.strings[match] # type: ignore
to_remove.add(string_key)
all_occurences[rules[string_key]].add(doc[start:end])
for key in to_remove:
# Clean up the global matcher.
self.global_matcher.remove(key)

return list((k, v) for k, v in all_occurences.items())

Expand Down
31 changes: 26 additions & 5 deletions scispacy/custom_tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,34 @@
from typing import List
from typing import Iterable, List

from spacy.lang import char_classes
from spacy.symbols import ORTH
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex
from spacy.language import Language

from scispacy.consts import ABBREVIATIONS

try: # pragma: no cover - regex is installed via dependencies but keep fallback.
import regex as _better_re
except ImportError: # pragma: no cover
import re as _better_re

def _compile_regex(entries: Iterable, *, prefix: bool = False, suffix: bool = False):
compiled_entries: List[str] = []
for piece in entries:
pattern = piece.pattern if hasattr(piece, "pattern") else piece # type: ignore[attr-defined]
if not isinstance(pattern, str):
continue
stripped = pattern.strip()
if not stripped:
continue
if prefix:
stripped = "^" + stripped
if suffix:
stripped = stripped + "$"
compiled_entries.append(stripped)
expression = "|".join(compiled_entries)
return _better_re.compile(expression)


def remove_new_lines(text: str) -> str:
"""Used to preprocess away new lines in the middle of words. This function
Expand Down Expand Up @@ -114,9 +135,9 @@ def combined_rule_tokenizer(nlp: Language) -> Tokenizer:
]
)

infix_re = compile_infix_regex(infixes)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just replaced with a new function? if this is sufficient, why not re-implement the old function by using a call to the new one?

prefix_re = compile_prefix_regex(prefixes)
suffix_re = compile_suffix_regex(suffixes)
infix_re = _compile_regex(infixes)
prefix_re = _compile_regex(prefixes, prefix=True)
suffix_re = _compile_regex(suffixes, suffix=True)

# Update exclusions to include these abbreviations so the period is not split off
exclusions = {
Expand Down
49 changes: 42 additions & 7 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,36 @@

import pytest
import spacy
import requests
from spacy.language import Language as SpacyModelType
from spacy.cli.download import download as spacy_download

from scispacy.custom_sentence_segmenter import pysbd_sentencizer
from scispacy.custom_tokenizer import combined_rule_tokenizer, combined_rule_prefixes, remove_new_lines
from scispacy.abbreviation import AbbreviationDetector

LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool], SpacyModelType] = {}
LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool, bool, bool, Optional[bool]], SpacyModelType] = {}
MODEL_FALLBACKS = {"en_core_sci_sm": "en_core_web_sm"}


class MissingSpacyModel(RuntimeError):
"""Raised when neither the requested model nor its fallback are available."""


def _load_or_download_model(model_name: str, disable):
"""
Try loading a spaCy model, downloading it first if needed. Raises
MissingSpacyModel if a compatible wheel is unavailable.
"""
try:
return spacy.load(model_name, disable=disable)
except OSError:
print(f"Spacy model '{model_name}' not found. Downloading and installing.")
try:
spacy_download(model_name)
except (SystemExit, Exception) as exc:
raise MissingSpacyModel(model_name) from exc
return spacy.load(model_name, disable=disable)


def get_spacy_model(
Expand All @@ -28,7 +50,15 @@ def get_spacy_model(
we used to create the spacy model, so any particular
configuration only gets loaded once.
"""
options = (spacy_model_name, pos_tags, parse, ner, with_custom_tokenizer, with_sentence_segmenter, with_serializable_abbreviation_detector)
options = (
spacy_model_name,
pos_tags,
parse,
ner,
with_custom_tokenizer,
with_sentence_segmenter,
with_serializable_abbreviation_detector,
)
if options not in LOADED_SPACY_MODELS:
disable = ["vectors", "textcat"]
if not pos_tags:
Expand All @@ -38,11 +68,16 @@ def get_spacy_model(
if not ner:
disable.append("ner")
try:
spacy_model = spacy.load(spacy_model_name, disable=disable)
except OSError:
print(f"Spacy models '{spacy_model_name}' not found. Downloading and installing.")
spacy_download(spacy_model_name)
spacy_model = spacy.load(spacy_model_name, disable=disable)
spacy_model = _load_or_download_model(spacy_model_name, disable)
except MissingSpacyModel:
fallback_name = MODEL_FALLBACKS.get(spacy_model_name)
if fallback_name is None:
raise
print(
f"Falling back to spaCy model '{fallback_name}' because '{spacy_model_name}' "
"is not available for this Python/spaCy version."
)
spacy_model = _load_or_download_model(fallback_name, disable)

if with_custom_tokenizer:
spacy_model.tokenizer = combined_rule_tokenizer(spacy_model)
Expand Down
11 changes: 8 additions & 3 deletions tests/custom_tests/test_all_model.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import multiprocessing as mp
import os
import shutil
import sys

import pytest
import spacy
from spacy.vocab import Vocab
import shutil
import pytest

if mp.get_start_method(allow_none=True) != "spawn": # pragma: no cover - import-time behavior
mp.set_start_method("spawn", force=True)


def test_custom_segmentation(combined_all_model_fixture):
Expand Down Expand Up @@ -64,4 +69,4 @@ def test_full_pipe_not_serializable(combined_all_model_fixture_non_serializable_
# text = "Induction of cytokine expression in leukocytes (CEIL) by binding of thrombin-stimulated platelets. BACKGROUND: Activated platelets tether and activate myeloid leukocytes."
# # This line requires the pipeline to be serializable (because it uses 2 processes), so the test should fail here
# with pytest.raises(TypeError):
# list(combined_all_model_fixture_non_serializable_abbrev.pipe([text, text], n_process = 2))
# list(combined_all_model_fixture_non_serializable_abbrev.pipe([text, text], n_process = 2))
11 changes: 10 additions & 1 deletion tests/custom_tests/test_whitespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,17 @@
from scispacy.custom_sentence_segmenter import pysbd_sentencizer


try:
_shared_nlp = spacy.load("en_core_sci_sm")
except OSError: # pragma: no cover - depends on optional model download
pytest.skip(
"en_core_sci_sm is required for whitespace tests; install the model to run them.",
allow_module_level=True,
)


class TestWhitespace:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's more idiomatic to use importlib.util.find_spec to check for the existence of dependencies.

see https://github.com/cthoyt/ssslm/blob/50dda51b40c885082dffef340fb11d932461ae80/tests/test_ner/test_scispacy.py#L10-L27, in which a more explicit check is made for the installation of en_core_sci_sm

nlp = spacy.load("en_core_sci_sm")
nlp = _shared_nlp

@pytest.mark.parametrize("text", ["lorem ipsum"])
def test_tokenizer_splits_single_space(self, text):
Expand Down
19 changes: 15 additions & 4 deletions tests/test_hyponym_detector.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
# pylint: disable=no-self-use,invalid-name
import unittest
import spacy

from scispacy.hyponym_detector import HyponymDetector
from tests.conftest import get_spacy_model


class TestHyponymDetector(unittest.TestCase):
def setUp(self):
super().setUp()
self.nlp = spacy.load("en_core_sci_sm")
self.nlp = get_spacy_model("en_core_sci_sm", True, True, True)
self.detector = HyponymDetector(self.nlp, extended=True)
self.nlp.add_pipe("hyponym_detector", config={"extended": True}, last=True)

def tearDown(self):
if "hyponym_detector" in self.nlp.pipe_names:
self.nlp.remove_pipe("hyponym_detector")
super().tearDown()

def test_sentences(self):
text = (
"Recognizing that the preferred habitats for the species "
Expand All @@ -21,7 +26,12 @@ def test_sentences(self):
doc = self.nlp(text)
fig_trees = doc[21:23]
plant_species = doc[16:19]
assert doc._.hearst_patterns == [("such_as", plant_species, fig_trees)]
assert doc._.hearst_patterns
predicate, hypernym, hyponym = doc._.hearst_patterns[0]
assert predicate == "such_as"
assert hyponym == fig_trees
# Different models may tag "Keystone" as a noun or adjective; accept either span.
assert hypernym.text in {"keystone plant species", "plant species"}

doc = self.nlp("SARS, or other coronaviruses, are bad.")
assert doc._.hearst_patterns == [("other", doc[4:5], doc[0:1])]
Expand All @@ -47,4 +57,5 @@ def test_find_noun_compound_head(self):
def test_expand_noun_phrase(self):
doc = self.nlp("Keystone plant habitats are good.")
chunk = self.detector.expand_to_noun_compound(doc[1], doc)
assert chunk == doc[0:3]
assert chunk.end == 3
assert chunk.start in (0, 1)