Skip to content

Commit 37e9da7

Browse files
author
Jacobo Myerston
committed
Adapt greCy to spaCy 3.7.5 and add training data for the NER and lemmatizer pipelines
1 parent 691bf32 commit 37e9da7

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

45 files changed

+444
-567
lines changed

configs/large.cfg

Lines changed: 23 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ seed = 0
1111

1212
[nlp]
1313
lang = "grc"
14-
pipeline = ["tok2vec","morphologizer","tagger","parser","lemmatizer","attribute_ruler"]
14+
pipeline = ["tok2vec","morphologizer","tagger","parser","lemmatizer","ner","attribute_ruler"]
1515
batch_size = 128
1616
disabled = []
1717
before_creation = null
@@ -25,6 +25,27 @@ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
2525
source = "./training/lemmatizer/large/model-best"
2626
replace_listeners = ["model.tok2vec"]
2727

28+
[components.ner]
29+
factory = "ner"
30+
incorrect_spans_key = null
31+
moves = null
32+
scorer = {"@scorers":"spacy.ner_scorer.v1"}
33+
update_with_oracle_cut_size = 100
34+
35+
[components.ner.model]
36+
@architectures = "spacy.TransitionBasedParser.v2"
37+
state_type = "ner"
38+
extra_state_tokens = false
39+
hidden_width = 64
40+
maxout_pieces = 2
41+
use_upper = true
42+
nO = null
43+
44+
[components.ner.model.tok2vec]
45+
@architectures = "spacy.Tok2VecListener.v1"
46+
width = ${components.tok2vec.model.encode.width}
47+
upstream = "tok2vec"
48+
2849
[components.attribute_ruler]
2950
factory = "attribute_ruler"
3051
scorer = {"@scorers":"spacy.attribute_ruler_scorer.v1"}
@@ -138,7 +159,7 @@ accumulate_gradient = 1
138159
patience = 5000
139160
max_epochs = 0
140161
max_steps = 20000
141-
eval_frequency = 200
162+
eval_frequency = 1000
142163
frozen_components = ["lemmatizer"]
143164
annotating_components = []
144165
before_to_disk = null

configs/lemmatizer_sm.cfg

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -91,7 +91,7 @@ accumulate_gradient = 1
9191
patience = 5000
9292
max_epochs = 0
9393
max_steps = 20000
94-
eval_frequency = 200
94+
eval_frequency = 1000
9595
frozen_components = []
9696
annotating_components = []
9797
before_to_disk = null
@@ -148,4 +148,4 @@ after_init = null
148148

149149
[initialize.components]
150150

151-
[initialize.tokenizer]
151+
[initialize.tokenizer]

configs/lemmatizer_trf.cfg

Lines changed: 10 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -10,34 +10,35 @@ seed = 0
1010

1111
[nlp]
1212
lang = "grc"
13-
pipeline = ["transformer","lemmatizer"]
14-
batch_size = 32
13+
pipeline = ["transformer","trainable_lemmatizer"]
14+
batch_size = 128
1515
disabled = []
1616
before_creation = null
1717
after_creation = null
1818
after_pipeline_creation = null
1919
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
20+
vectors = {"@vectors":"spacy.Vectors.v1"}
2021

2122
[components]
2223

23-
[components.lemmatizer]
24+
[components.trainable_lemmatizer]
2425
factory = "trainable_lemmatizer"
2526
backoff = "orth"
2627
min_tree_freq = 1
2728
overwrite = false
2829
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
2930
top_k = 5
3031

31-
[components.lemmatizer.model]
32+
[components.trainable_lemmatizer.model]
3233
@architectures = "spacy.Tagger.v2"
3334
nO = null
3435
normalize = false
3536

36-
[components.lemmatizer.model.tok2vec]
37+
[components.trainable_lemmatizer.model.tok2vec]
3738
@architectures = "spacy-transformers.TransformerListener.v1"
3839
grad_factor = 1.0
3940
pooling = {"@layers":"reduce_mean.v1"}
40-
upstream = "transformer"
41+
upstream = "*"
4142

4243
[components.transformer]
4344
factory = "transformer"
@@ -46,7 +47,7 @@ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotati
4647

4748
[components.transformer.model]
4849
@architectures = "spacy-transformers.TransformerModel.v3"
49-
name = "Jacobo/aristoBERTo"
50+
name = "wantuta/roberta_ancient_greek_mlm"
5051
mixed_precision = false
5152

5253
[components.transformer.model.get_spans]
@@ -89,7 +90,7 @@ dropout = 0.1
8990
patience = 1600
9091
max_epochs = 0
9192
max_steps = 20000
92-
eval_frequency = 200
93+
eval_frequency = 1000
9394
frozen_components = []
9495
annotating_components = []
9596
before_to_disk = null
@@ -102,10 +103,6 @@ size = 2000
102103
buffer = 256
103104
get_length = null
104105

105-
# [training.logger]
106-
# @loggers = "spacy.ConsoleLogger.v1"
107-
# progress_bar = false
108-
109106
[training.logger]
110107
@loggers = "spacy.WandbLogger.v3"
111108
project_name = "lemmatizer"
@@ -146,4 +143,4 @@ after_init = null
146143

147144
[initialize.components]
148145

149-
[initialize.tokenizer]
146+
[initialize.tokenizer]

configs/lemmatizer_vec.cfg

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ seed = 0
1111
[nlp]
1212
lang = "grc"
1313
pipeline = ["lemmatizer"]
14-
batch_size = 64
14+
batch_size = 32
1515
disabled = []
1616
before_creation = null
1717
after_creation = null
@@ -91,7 +91,7 @@ accumulate_gradient = 1
9191
patience = 5000
9292
max_epochs = 0
9393
max_steps = 20000
94-
eval_frequency = 200
94+
eval_frequency = 1000
9595
frozen_components = []
9696
annotating_components = []
9797
before_to_disk = null
@@ -148,4 +148,4 @@ after_init = null
148148

149149
[initialize.components]
150150

151-
[initialize.tokenizer]
151+
[initialize.tokenizer]

configs/senter.cfg

Lines changed: 0 additions & 121 deletions
This file was deleted.

0 commit comments

Comments
 (0)