Skip to content

Running Custom NER training using local en_core_web_lg in config file #10148

@iteimouri

Description

@iteimouri

I've made the following config file to make a custom NER:

[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 0

[nlp]
lang = "en"
pipeline = ["tok2vec","ner"]
batch_size = 1000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = true

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 256
depth = 8
window_size = 1
maxout_pieces = 3

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0
ents_per_type = null

[pretraining]

[initialize]
vectors = "P:\Documents\Python Scripts\newPythonEnv\Additive_NER_Spacy3\en_core_web_lg-3.2.0.tar\en_core_web_lg-3.2.0\en_core_web_lg\en_core_web_lg-3.2.0\vocab\vectors"
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]

Towards the end I had to change the vectors path from en_core_web_lg to "P:\Documents\Python Scripts\newPythonEnv\Additive_NER_Spacy3\en_core_web_lg-3.2.0.tar\en_core_web_lg-3.2.0\en_core_web_lg\en_core_web_lg-3.2.0\vocab\vectors" this is because I can not fetch the en_core_web_lg model from online servers but only can do that locally (due to work restriction). However I get the following error:

[2022-01-27 16:21:16,595] [INFO] Set up nlp object from config
[2022-01-27 16:21:16,608] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-01-27 16:21:16,613] [INFO] Created vocabulary
Traceback (most recent call last):
  File "P:\myPythonEnv\lib\site-packages\spacy\language.py", line 1283, in initialize
    init_vocab(
  File "P:\myPythonEnv\lib\site-packages\spacy\training\initialize.py", line 131, in init_vocab
    load_vectors_into_model(nlp, vectors)
  File "P:\myPythonEnv\lib\site-packages\spacy\training\initialize.py", line 152, in load_vectors_into_model
    vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude)
  File "P:\myPythonEnv\lib\site-packages\spacy\util.py", line 427, in load_model
    raise IOError(Errors.E050.format(name=name))
OSError: [E050] Can't find model '"P:\Documents\Python Scripts\newPythonEnv\Additive_NER_Spacy3\en_core_web_lg-3.2.0.tar\en_core_web_lg-3.2.0\en_core_web_lg\en_core_web_lg-3.2.0\vocab\vectors"'. It doesn't seem to be a Python package or a valid path to a data directory.

During handling of the above exception, another exception occurred:
Traceback (most recent call last):
  File "P:\myPythonEnv\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "P:\myPythonEnv\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "P:\myPythonEnv\lib\site-packages\spacy\__main__.py", line 4, in <module>
    setup_cli()
  File "P:\myPythonEnv\lib\site-packages\spacy\cli\_util.py", line 71, in setup_cli
    command(prog_name=COMMAND)
  File "P:\myPythonEnv\lib\site-packages\click\core.py", line 829, in __call__
    return self.main(*args, **kwargs)
  File "P:\myPythonEnv\lib\site-packages\click\core.py", line 782, in main
    rv = self.invoke(ctx)
  File "P:\myPythonEnv\lib\site-packages\click\core.py", line 1259, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "P:\myPythonEnv\lib\site-packages\click\core.py", line 1066, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "P:\myPythonEnv\lib\site-packages\click\core.py", line 610, in invoke
    return callback(*args, **kwargs)
  File "P:\myPythonEnv\lib\site-packages\typer\main.py", line 497, in wrapper
    return callback(**use_params)  # type: ignore
  File "P:\myPythonEnv\lib\site-packages\spacy\cli\train.py", line 45, in train_cli
    train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
  File "P:\myPythonEnv\lib\site-packages\spacy\cli\train.py", line 72, in train
    nlp = init_nlp(config, use_gpu=use_gpu)
  File "P:\myPythonEnv\lib\site-packages\spacy\training\initialize.py", line 84, in init_nlp
    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
  File "P:\myPythonEnv\lib\site-packages\spacy\language.py", line 1287, in initialize
    raise IOError(Errors.E884.format(vectors=I["vectors"]))
OSError: [E884] The pipeline could not be initialized because the vectors could not be found at '"P:\Documents\Python Scripts\newPythonEnv\Additive_NER_Spacy3\en_core_web_lg-3.2.0.tar\en_core_web_lg-3.2.0\en_core_web_lg\en_core_web_lg-3.2.0\vocab\vectors"'. If your pipeline was already initialized/trained before, call 'resume_training' instead of 'initialize', or initialize only the components that are new.

I have tried various paths and all of the gave the same error - these non-working paths are:

"P:\Documents\Python Scripts\newPythonEnv\Additive_NER_Spacy3\en_core_web_lg-3.2.0.tar\en_core_web_lg-3.2.0\en_core_web_lg\en_core_web_lg-3.2.0\vocab"

"P:\Documents\Python Scripts\newPythonEnv\Additive_NER_Spacy3\en_core_web_lg-3.2.0.tar\en_core_web_lg-3.2.0\en_core_web_lg\en_core_web_lg-3.2.0\vocab\vectors"

"P:\Documents\Python Scripts\newPythonEnv\Additive_NER_Spacy3\en_core_web_lg-3.2.0.tar\en_core_web_lg-3.2.0\en_core_web_lg\en_core_web_lg-3.2.0"

"en_core_web_lg"

I wonder if there is a way to fetch the model using some proxy in the config file or how can I make sure the vectors are loaded locally in a correct manner?

My Spacy version is 3.2.1.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions