-
Notifications
You must be signed in to change notification settings - Fork 19
Expand file tree
/
Copy pathpreprocess_parser.sh
More file actions
executable file
·57 lines (51 loc) · 2.27 KB
/
preprocess_parser.sh
File metadata and controls
executable file
·57 lines (51 loc) · 2.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env bash
# ================ input arguments =========================
data_root=$1
lang=$2
parent_child_tab=$3
sorted=$4
thread_id=$5
eval=$6
# ================ default arguments =======================
# ltf source folder path
ltf_source_thread=${data_root}/ltf_minibatch/${thread_id}
# rsd source folder path
rsd_source_thread=${data_root}/rsd_minibatch/${thread_id}
# file list of ltf files (only file names)
ltf_file_list_thread=${data_root}/ltf_minibatch/${thread_id}_ltf_lst
# file list of rsd files (absolute paths, this is a temporary file)
rsd_file_list_thread=${data_root}/rsd_minibatch/${thread_id}_rsd_lst
# bio
edl_output_dir_thread=${data_root}/edl_minibatch
edl_bio_thread=${edl_output_dir_thread}/${lang}_${thread_id}.bio
# corenlp
core_nlp_output_path=${data_root}/corenlp
timetable_tab=${data_root}/time_table.tab
# ================ script =========================
# All steps run inside docker with ${data_root} bind-mounted at the same
# path, so host paths derived from ${data_root} are valid in-container too.
# Every expansion is quoted so paths containing spaces survive intact.

# Step 1: convert the thread's LTF files into a single BIO file for EDL.
docker run --rm -v "${data_root}:${data_root}" -w "$(pwd)" -i "limanling/uiuc_ie_${eval}" \
    /opt/conda/envs/py36/bin/python \
    /aida_utilities/ltf2bio.py "${ltf_source_thread}" "${edl_bio_thread}"

# Step 2: write the list of absolute RSD file paths that CoreNLP will read
# (skipping files already present in the CoreNLP output directory).
docker run --rm -v "${data_root}:${data_root}" -w "$(pwd)" -i "limanling/uiuc_ie_${eval}" \
    /opt/conda/envs/py36/bin/python \
    /aida_utilities/dir_readlink.py "${rsd_source_thread}" "${rsd_file_list_thread}" \
    --stanford_corenlp "${core_nlp_output_path}"

# Step 3: derive the document-date table CoreNLP uses for docdate NER.
docker run --rm -v "${data_root}:${data_root}" \
    -v "${parent_child_tab}:${parent_child_tab}" \
    -w "$(pwd)" -i "limanling/uiuc_ie_${eval}" \
    /opt/conda/envs/py36/bin/python \
    /aida_utilities/parent_child_util.py \
    "${parent_child_tab}" "${sorted}" "${timetable_tab}"

# Step 4: run Stanford CoreNLP (tokenize/ssplit/pos/lemma/ner) over the RSD
# file list, emitting JSON into ${core_nlp_output_path}.
# "$@" forwards this script's arguments to CoreNLP as the original did with
# $*, but without word-splitting.
# NOTE(review): this forwards ALL six positional args too (no `shift 6`
# beforehand) — presumably intentional/ignored by CoreNLP; confirm with callers.
docker run --rm -v "${data_root}:${data_root}" -w /stanford-corenlp-aida_0 -i limanling/aida-tools \
    java -mx50g -cp '/stanford-corenlp-aida_0/*' edu.stanford.nlp.pipeline.StanfordCoreNLP \
    "$@" -annotators 'tokenize,ssplit,pos,lemma,ner' \
    -outputFormat json \
    -filelist "${rsd_file_list_thread}" \
    -ner.docdate.useMappingFile "${timetable_tab}" \
    -properties "StanfordCoreNLP_${lang}.properties" \
    -outputDirectory "${core_nlp_output_path}"

# Final marker so the pipeline log shows this thread completed.
docker run --rm -v "${data_root}:${data_root}" -i "limanling/uiuc_ie_${eval}" \
    echo "finish stanford dependency parser for ${rsd_source_thread}"