-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdbnl_fix_and_split.nf
More file actions
executable file
·142 lines (116 loc) · 4.46 KB
/
dbnl_fix_and_split.nf
File metadata and controls
executable file
·142 lines (116 loc) · 4.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env nextflow
/*
vim: syntax=groovy
-*- mode: groovy;-*-
*/
// Startup banner.
log.info "-------------------------------------------"
log.info "Fix and split DBL"
log.info "-------------------------------------------"
log.info " (no ocr/normalisation/ticcl!)"

def env = System.getenv()

// Parameter defaults; any of these can be overridden on the command line.
// The Python virtualenv is inherited from the calling environment when one is active.
params.virtualenv = env.containsKey('VIRTUAL_ENV') ? env['VIRTUAL_ENV'] : ""
params.extension = "xml"
params.ignore = false

// Print usage and stop when help is requested or a mandatory parameter is missing.
if (params.containsKey('help') || !params.containsKey('inputdir') || !params.containsKey('outputdir') || !params.containsKey('datadir')) {
    log.info "Usage:"
    log.info " dbnl_fix_and_split.nf [OPTIONS]"
    log.info ""
    log.info "Mandatory parameters:"
    log.info " --inputdir DIRECTORY Input directory (FoLiA documents)"
    log.info " --outputdir DIRECTORY Output directory"
    log.info " --datadir DIRECTORY Directory where the inl/nederlab-linguistic-enrichment repository is cloned"
    log.info "Optional parameters:"
    log.info " --extension STR Extension of documents in input directory (default: xml)"
    log.info " --ignore Ignore documents that are not in the Nederlab collection, just pass them through as-is"
    log.info ""
    // Without this the pipeline would continue with undefined mandatory
    // parameters (e.g. a glob built from a null input directory).
    exit 2
}

// Recursively collect all input documents with the configured extension.
println "Reading documents from " + params.inputdir + "/**." + params.extension
inputdocuments = Channel.fromPath(params.inputdir+"/**." + params.extension)

// A second, independent channel over the same glob is used only for counting,
// so that the channel feeding the 'fix' process is not consumed here.
inputdocuments_test = Channel.fromPath(params.inputdir+"/**." + params.extension)
println "Found " + inputdocuments_test.count().val + " input documents"
// Runs the DBNL fix script (dbnl_ozt_fix.py) on every input document and
// emits the result as out/<simpleName>.folia.xml on the 'fixeddocuments' channel.
//
// Inputs:
//   inputdocument - one document from the input directory
//   datadir       - passed to the fix script via -d (params.datadir)
//   virtualenv    - Python virtualenv to activate; empty string means none
//   ignore        - params.ignore; when enabled, --ignore is passed to the fix script
process fix {
    input:
    file inputdocument from inputdocuments
    val datadir from params.datadir
    val virtualenv from params.virtualenv
    val ignore from params.ignore

    output:
    file "out/${inputdocument.simpleName}.folia.xml" into fixeddocuments

    script:
    """
    # Activation scripts may reference unset variables, so relax -u around them.
    set +u
    if [ ! -z "${virtualenv}" ]; then
        source ${virtualenv}/bin/activate
    fi
    set -u

    # The ignore parameter is interpolated by Nextflow as literal text
    # (false by default, true when the option is given). A non-empty check
    # would therefore always succeed, so compare against the falsy spellings.
    case "${ignore}" in
        ""|false|False|FALSE|0|no)
            flags=""
            ;;
        *)
            flags="--ignore"
            ;;
    esac

    mkdir -p out
    python3 \$LM_PREFIX/opt/nederlab-pipeline/scripts/dbnl/dbnl_ozt_fix.py -d ${datadir} -O out/ \$flags ${inputdocument} || exit 1

    # Normalise the name of the produced file to what the output declaration expects.
    mv out/*xml out/${inputdocument.simpleName}.folia.xml || exit 1
    """
}
// Splits each fixed FoLiA document with foliasplit (on div elements, as
// external documents with submetadata) and emits every resulting file
// individually on the 'splitdocuments' channel.
//
// Inputs:
//   inputdocument - one fixed document from the 'fix' process
//   virtualenv    - Python virtualenv to activate; empty string means none
process split {
    //publishDir params.outputdir, mode: 'copy', overwrite: true, pattern: "*_????.folia.xml"

    input:
    file inputdocument from fixeddocuments
    val virtualenv from params.virtualenv

    output:
    file "out/*.folia.xml" into splitdocuments mode flatten

    script:
    """
    # Activation scripts may reference unset variables, so relax -u around them.
    set +u
    if [ ! -z "${virtualenv}" ]; then
        source ${virtualenv}/bin/activate
    fi
    set -u

    mkdir -p out
    foliasplit -q div --submetadata --out out --external ${inputdocument}

    count=\$(ls out/*_????.folia.xml | wc -l)
    if [ \$count -eq 0 ]; then
        #there are no output documents
        #this means there was nothing
        #to split, take the input file as output (with suffix 0000 so
        #this task picks it up as valid output)
        #
        #The link target must be absolute: a relative target would be
        #resolved from inside the out directory, where the input document
        #does not exist, leaving a dangling link.
        ln -s "\$(realpath ${inputdocument})" out/${inputdocument.simpleName}_0000.folia.xml
    fi
    """
}
// Runs foliavalidator on each split document; the validator's standard output
// is stored under out/ with the same file name and emitted on the
// 'validateddocuments' channel.
//
// Inputs:
//   inputdocument - one document from the 'split' process
//   virtualenv    - Python virtualenv to activate; empty string means none
process validate {
    input:
    file inputdocument from splitdocuments
    val virtualenv from params.virtualenv

    output:
    file "out/${inputdocument}" into validateddocuments

    script:
    """
    # Unset-variable checking is switched off while the virtualenv is activated.
    set +u
    if [ -n "${virtualenv}" ]; then
        source ${virtualenv}/bin/activate
    fi
    set -u

    mkdir -p out
    foliavalidator -o ${inputdocument} > out/${inputdocument}
    """
}
// Gzips each validated document and publishes the archive to params.outputdir.
// Documents carrying the _0000 placeholder suffix get that suffix removed from
// the name of the resulting archive.
//
// Inputs:
//   inputdocument - one document from the 'validate' process
process compress {
    publishDir params.outputdir, mode: 'copy', overwrite: true, pattern: "*.folia.xml.gz"

    input:
    file inputdocument from validateddocuments

    output:
    file "${inputdocument.toString().replace('_0000.folia.xml','.folia.xml').replace('out/','/')}.gz" into outputdocuments

    script:
    """
    # By default the archive is named after the input document.
    target="${inputdocument}"
    if echo "${inputdocument}" | grep -q "_0000.folia.xml"; then
        #remove the _0000 suffix again for the final result
        target=\$(echo "${inputdocument}" | sed 's/_0000.folia.xml/.folia.xml/')
    fi

    # The staged input is a link, so compress the file it resolves to.
    gzip --best -c -k \$(realpath ${inputdocument}) > \${target}.gz
    """
}
// Report the file name of every compressed document as it is emitted.
outputdocuments.subscribe { document ->
    println "Outputted ${document.name}"
}