Skip to content

Commit c5dcee1

Browse files
committed
Make CRF and elastic setup optional
* downgrade elasticsearch-dsl version due to compatibility issues
* make elastic connection setup optional
* make CRF loading optional
* disable multiprocessing in tokenization to avoid forking in celery workers
1 parent 4c7c3f8 commit c5dcee1

File tree

4 files changed

+11
-7
lines changed

4 files changed

+11
-7
lines changed

environment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ dependencies:
1313
- python=3.7.1
1414
- pyahocorasick=1.4.0
1515
- Unidecode=1.0.23
16-
- elasticsearch-dsl=7.0.0
16+
- elasticsearch-dsl=6.3.1
1717
- ipython=7.5.0
1818
- joblib=0.13.2
1919
- python-magic=0.4.15

sota_extractor2/data/structure.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,9 +191,10 @@ def prepare_data(tables, csv_path, cache=False):
191191

192192

193193
class CellEvidenceExtractor:
194-
def __init__(self):
194+
def __init__(self, setup_connection=True):
195195
# todo: make sure can be called more than once or refactor to singleton
196-
setup_default_connection()
196+
if setup_connection:
197+
setup_default_connection()
197198

198199
def __call__(self, paper, tables, paper_limit=30, corpus_limit=10):
199200
dfs = [evidence_for_table(paper.paper_id, table, paper_limit, corpus_limit) for table in tables]

sota_extractor2/models/structure/structure_predictor.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,17 +35,19 @@ def cut_ulmfit_head(model):
3535
class TableStructurePredictor(ULMFiT_SP):
3636
step = "structure_prediction"
3737

38-
def __init__(self, path, file, crf_path=None, crf_model="crf.pkl",
38+
def __init__(self, path, file, crf_path=None, crf_model=None,
3939
sp_path=None, sp_model="spm.model", sp_vocab="spm.vocab"):
4040
super().__init__(path, file, sp_path, sp_model, sp_vocab)
4141

4242
self._full_learner = deepcopy(self.learner)
4343
self.learner.model = cut_ulmfit_head(self.learner.model)
4444
self.learner.loss_func = None
4545

46-
#todo: make CRF optional
47-
crf_path = Path(path) if crf_path is None else Path(crf_path)
48-
self.crf = load_crf(crf_path / crf_model)
46+
if crf_model is not None:
47+
crf_path = Path(path) if crf_path is None else Path(crf_path)
48+
self.crf = load_crf(crf_path / crf_model)
49+
else:
50+
self.crf = None
4951

5052
# todo: clean Experiment from older approaches
5153
self._e = ULMFiTExperiment(remove_num=False, drop_duplicates=False,

sota_extractor2/models/structure/ulmfit.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ def _fix_sp_processor(self, sp_path, sp_model, sp_vocab):
1313
if isinstance(processor, SPProcessor):
1414
processor.sp_model = sp_path / sp_model
1515
processor.sp_vocab = sp_path / sp_vocab
16+
processor.n_cpus = 1
1617

1718
#todo: see why it wasn't set on save
1819
processor.mark_fields = True

0 commit comments

Comments
 (0)