Skip to content

Commit e2b4a51

Browse files
authored
Update NEL emerson config options (#207)
* remove ner from pipeline
* fix scoring weights
* two configs, one with entity_ruler and one with ner
* add config with NEL only, training should work with this
* restore pipeline creation
* restore vectors_model
* revert change to benchmark EL project
1 parent 1771b63 commit e2b4a51

File tree

4 files changed

+299
-8
lines changed

4 files changed

+299
-8
lines changed

tutorials/nel_emerson/configs/nel.cfg renamed to tutorials/nel_emerson/configs/nel_entityruler.cfg

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ gpu_allocator = null
1313

1414
[nlp]
1515
lang = "en"
16-
pipeline = ["sentencizer","entity_ruler","ner","entity_linker"]
16+
pipeline = ["sentencizer","entity_ruler","entity_linker"]
1717
disabled = []
1818
before_creation = null
1919
after_creation = null
@@ -30,10 +30,6 @@ punct_chars = null
3030
source = "${paths.base_nlp}"
3131
component = "entity_ruler"
3232

33-
[components.ner]
34-
source = "${paths.base_nlp}"
35-
component = "ner"
36-
3733
[components.entity_linker]
3834
factory = "entity_linker"
3935
entity_vector_length = 64
@@ -94,7 +90,7 @@ eval_frequency = 200
9490
accumulate_gradient = 2
9591
max_epochs = 0
9692
max_steps = 600
97-
frozen_components = ["sentencizer","ner"]
93+
frozen_components = []
9894
before_to_disk = null
9995

10096
[training.logger]
@@ -130,6 +126,12 @@ learn_rate = 0.001
130126
nel_micro_p = 0.0
131127
nel_micro_r = 0.0
132128
nel_micro_f = 1.0
129+
ents_f = 0.0
130+
ents_p = 0.0
131+
ents_r = 0.0
132+
sents_f = null
133+
sents_p = null
134+
sents_r = null
133135

134136
[pretraining]
135137

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
[paths]
2+
train = ""
3+
dev = ""
4+
raw = null
5+
init_tok2vec = null
6+
kb = ""
7+
base_nlp = ""
8+
vectors = "${paths.base_nlp}"
9+
10+
[system]
11+
seed = 342
12+
gpu_allocator = null
13+
14+
[nlp]
15+
lang = "en"
16+
pipeline = ["sentencizer","ner","entity_linker"]
17+
disabled = []
18+
before_creation = null
19+
after_creation = null
20+
after_pipeline_creation = null
21+
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
22+
23+
[components]
24+
25+
[components.sentencizer]
26+
factory = "sentencizer"
27+
punct_chars = null
28+
29+
[components.ner]
30+
source = "${paths.base_nlp}"
31+
component = "ner"
32+
33+
[components.entity_linker]
34+
factory = "entity_linker"
35+
entity_vector_length = 64
36+
get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
37+
incl_context = true
38+
incl_prior = true
39+
labels_discard = []
40+
use_gold_ents = true
41+
42+
[components.entity_linker.model]
43+
@architectures = "spacy.EntityLinker.v2"
44+
nO = null
45+
46+
[components.entity_linker.model.tok2vec]
47+
@architectures = "spacy.HashEmbedCNN.v1"
48+
pretrained_vectors = null
49+
width = 96
50+
depth = 2
51+
embed_size = 2000
52+
window_size = 1
53+
maxout_pieces = 3
54+
subword_features = true
55+
56+
[initialize]
57+
vectors = ${paths.vectors}
58+
init_tok2vec = ${paths.init_tok2vec}
59+
vocab_data = null
60+
lookups = null
61+
62+
[initialize.components]
63+
64+
[initialize.components.entity_linker]
65+
66+
[initialize.components.entity_linker.kb_loader]
67+
@misc = "spacy.KBFromFile.v1"
68+
kb_path = ${paths.kb}
69+
70+
[initialize.tokenizer]
71+
72+
73+
[corpora]
74+
75+
[corpora.train]
76+
@readers = "MyCorpus.v1"
77+
file = ${paths.train}
78+
79+
[corpora.dev]
80+
@readers = "MyCorpus.v1"
81+
file = ${paths.dev}
82+
83+
[training]
84+
train_corpus = "corpora.train"
85+
dev_corpus = "corpora.dev"
86+
seed = ${system.seed}
87+
gpu_allocator = ${system.gpu_allocator}
88+
dropout = 0.2
89+
patience = 10000
90+
eval_frequency = 200
91+
accumulate_gradient = 2
92+
max_epochs = 0
93+
max_steps = 600
94+
frozen_components = ["ner"]
95+
before_to_disk = null
96+
97+
[training.logger]
98+
@loggers = "spacy.ConsoleLogger.v1"
99+
progress_bar = false
100+
101+
102+
[training.batcher]
103+
@batchers = "spacy.batch_by_words.v1"
104+
discard_oversize = false
105+
tolerance = 0.2
106+
get_length = null
107+
108+
[training.batcher.size]
109+
@schedules = "compounding.v1"
110+
start = 100
111+
stop = 1000
112+
compound = 1.001
113+
t = 0.0
114+
115+
[training.optimizer]
116+
@optimizers = "Adam.v1"
117+
beta1 = 0.9
118+
beta2 = 0.999
119+
L2_is_weight_decay = true
120+
L2 = 0.01
121+
grad_clip = 1.0
122+
use_averages = false
123+
eps = 0.00000001
124+
learn_rate = 0.001
125+
126+
[training.score_weights]
127+
nel_micro_p = 0.0
128+
nel_micro_r = 0.0
129+
nel_micro_f = 1.0
130+
ents_f = 0.0
131+
ents_p = 0.0
132+
ents_r = 0.0
133+
sents_f = null
134+
sents_p = null
135+
sents_r = null
136+
137+
[pretraining]
138+
139+
[optimizer]
140+
@optimizers = "Adam.v1"
141+
learn_rate = 0.001
142+
beta1 = 0.9
143+
beta2 = 0.999
144+
L2 = 0.0
145+
eps = 0.00000001
146+
grad_clip = 1.0
147+
L2_is_weight_decay = true
148+
use_averages = true
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
[paths]
2+
train = ""
3+
dev = ""
4+
raw = null
5+
init_tok2vec = null
6+
kb = ""
7+
base_nlp = ""
8+
vectors = "${paths.base_nlp}"
9+
10+
[system]
11+
seed = 342
12+
gpu_allocator = null
13+
14+
[nlp]
15+
lang = "en"
16+
pipeline = ["sentencizer","entity_linker"]
17+
disabled = []
18+
before_creation = null
19+
after_creation = null
20+
after_pipeline_creation = null
21+
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
22+
23+
[components]
24+
25+
[components.sentencizer]
26+
factory = "sentencizer"
27+
punct_chars = null
28+
29+
[components.entity_linker]
30+
factory = "entity_linker"
31+
entity_vector_length = 64
32+
get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
33+
incl_context = true
34+
incl_prior = true
35+
labels_discard = []
36+
use_gold_ents = true
37+
38+
[components.entity_linker.model]
39+
@architectures = "spacy.EntityLinker.v2"
40+
nO = null
41+
42+
[components.entity_linker.model.tok2vec]
43+
@architectures = "spacy.HashEmbedCNN.v1"
44+
pretrained_vectors = null
45+
width = 96
46+
depth = 2
47+
embed_size = 2000
48+
window_size = 1
49+
maxout_pieces = 3
50+
subword_features = true
51+
52+
[initialize]
53+
vectors = ${paths.vectors}
54+
init_tok2vec = ${paths.init_tok2vec}
55+
vocab_data = null
56+
lookups = null
57+
58+
[initialize.components]
59+
60+
[initialize.components.entity_linker]
61+
62+
[initialize.components.entity_linker.kb_loader]
63+
@misc = "spacy.KBFromFile.v1"
64+
kb_path = ${paths.kb}
65+
66+
[initialize.tokenizer]
67+
68+
69+
[corpora]
70+
71+
[corpora.train]
72+
@readers = "MyCorpus.v1"
73+
file = ${paths.train}
74+
75+
[corpora.dev]
76+
@readers = "MyCorpus.v1"
77+
file = ${paths.dev}
78+
79+
[training]
80+
train_corpus = "corpora.train"
81+
dev_corpus = "corpora.dev"
82+
seed = ${system.seed}
83+
gpu_allocator = ${system.gpu_allocator}
84+
dropout = 0.2
85+
patience = 10000
86+
eval_frequency = 200
87+
accumulate_gradient = 2
88+
max_epochs = 0
89+
max_steps = 600
90+
frozen_components = []
91+
before_to_disk = null
92+
93+
[training.logger]
94+
@loggers = "spacy.ConsoleLogger.v1"
95+
progress_bar = false
96+
97+
98+
[training.batcher]
99+
@batchers = "spacy.batch_by_words.v1"
100+
discard_oversize = false
101+
tolerance = 0.2
102+
get_length = null
103+
104+
[training.batcher.size]
105+
@schedules = "compounding.v1"
106+
start = 100
107+
stop = 1000
108+
compound = 1.001
109+
t = 0.0
110+
111+
[training.optimizer]
112+
@optimizers = "Adam.v1"
113+
beta1 = 0.9
114+
beta2 = 0.999
115+
L2_is_weight_decay = true
116+
L2 = 0.01
117+
grad_clip = 1.0
118+
use_averages = false
119+
eps = 0.00000001
120+
learn_rate = 0.001
121+
122+
[training.score_weights]
123+
nel_micro_p = 0.0
124+
nel_micro_r = 0.0
125+
nel_micro_f = 1.0
126+
sents_f = null
127+
sents_p = null
128+
sents_r = null
129+
130+
[pretraining]
131+
132+
[optimizer]
133+
@optimizers = "Adam.v1"
134+
learn_rate = 0.001
135+
beta1 = 0.9
136+
beta2 = 0.999
137+
L2 = 0.0
138+
eps = 0.00000001
139+
grad_clip = 1.0
140+
L2_is_weight_decay = true
141+
use_averages = true

tutorials/nel_emerson/project.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@ description: "**This project was created as part of a [step-by-step video tutori
33
# Variables can be referenced across the project.yml using ${vars.var_name}
44
vars:
55
name: "nel_emerson"
6-
config: "nel.cfg"
6+
config: "nel_entityruler.cfg"
77
vectors_model: "en_core_web_md"
88
annotations: "emerson_annotated_text.jsonl"
99
entities: "entities.csv"
1010
kb: "my_kb"
1111
nlp: "my_nlp"
1212
train: "train"
1313
dev: "dev"
14-
version: "0.0.3"
14+
version: "0.0.4"
1515

1616
# These are the directories that the project needs. The project CLI will make
1717
# sure that they always exist.

0 commit comments

Comments
 (0)