Skip to content

Commit e67b292

Browse files
committed
Added a string linking stage to TICCL. This adds extra markup information (t-str/t-correction) using the foliatextcontent tool, which in turn is needed by FLAT for proper visualisation. #62
1 parent d423fce commit e67b292

File tree

4 files changed

+57
-6
lines changed

4 files changed

+57
-6
lines changed

codemeta.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
"@type": "SoftwareSourceCode",
1111
"identifier": "piccl",
1212
"name": "PICCL",
13-
"version": "0.9.4",
13+
"version": "0.9.5",
1414
"description": "A set of workflows for corpus building through OCR, post-correction, and normalisation.",
1515
"license": "https://spdx.org/licenses/GPL-3.0",
1616
"url": "https://github.com/LanguageMachines/PICCL",

ticcl.nf

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ if (params.containsKey('help')) {
5656
log.info " --high INT skip entries from the anagram file longer than 'high' characters. (default=35)"
5757
log.info " --chainclean BOOLINT enable chain clean or not (1 = on, 0 = off, default)"
5858
log.info " --nofoliacorrect skip the FoLiA correct step"
59+
log.info " --nostringlinking skip the final string linking step"
5960
exit 2
6061
}
6162

@@ -508,7 +509,7 @@ if (!params.containsKey('nofoliacorrect')) {
508509
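Correct the input documents using the ranked list with FoLiA-correct; produces output documents with <str>, which the subsequent string linking step (or the plain copy step when --nostringlinking is given) turns into the final output documents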
509510
*/
510511

511-
publishDir params.outputdir, mode: 'copy', overwrite: true //publish the output for the end-user to see (this is the final output)
512+
publishDir params.outputdir, mode: 'copy', overwrite: true
512513
label "multicore"
513514

514515
input:
@@ -522,7 +523,7 @@ if (!params.containsKey('nofoliacorrect')) {
522523
val virtualenv from params.virtualenv
523524

524525
output:
525-
file "*.ticcl.folia.xml" into folia_ticcl_documents
526+
file "*.foliacorrect.folia.xml" into foliacorrect_documents
526527

527528
script:
528529
"""
@@ -540,13 +541,16 @@ if (!params.containsKey('nofoliacorrect')) {
540541
FoLiA-correct --inputclass "${inputclass}" --outputclass "${outputclass}" --nums 10 -e ${extension} -O outputdir/ --unk "${unknownfreqlist}" --punct "${punctuationmap}" --rank "${rankedlist}" -t ${task.cpus} . || exit 1
541542
542543
cd outputdir
544+
echo "output files:"
543545
ls
544546
545547
#rename files so they have *.foliacorrect.folia.xml as extension (rather than the .ticcl.xml or .ticcl.folia.xml which FoLiA-correct produces)
546548
for f in *.xml; do
547549
if [[ \$f != "*.xml" ]]; then
548550
if [[ \${f%.ticcl.xml} != \$f ]]; then
549-
newf="\${f%.ticcl.xml}.ticcl.folia.xml"
551+
newf="\${f%.ticcl.xml}.foliacorrect.folia.xml" #old folia-correct
552+
elif [[ \${f%.ticcl.folia.xml} != \$f ]]; then
553+
newf="\${f%.ticcl.folia.xml}.foliacorrect.folia.xml" #new folia-correct
550554
else
551555
newf="\$f"
552556
fi
@@ -557,6 +561,53 @@ if (!params.containsKey('nofoliacorrect')) {
557561
"""
558562
}
559563

564+
if (!params.containsKey('nostringlinking')) {
565+
process linkstrings {
566+
/*
567+
This invokes a tool (foliatextcontent -M) that adds text markup information (t-str and t-correction) linking to the substrings. It adds a level of redundancy that is needed for proper visualisation in FLAT.
It consumes the documents from the foliacorrect_documents channel and emits *.ticcl.folia.xml documents on folia_ticcl_documents. This process is only defined when --nostringlinking is not passed.
568+
*/
569+
570+
publishDir params.outputdir, mode: 'copy', overwrite: true //publish the output for the end-user to see (this is the final output)
571+
572+
input:
573+
file foliadoc from foliacorrect_documents
574+
//may be empty, in which case the script below does not activate any virtual environment
val virtualenv from params.virtualenv
575+
576+
output:
577+
//the script below writes this file as <simpleName of the input>.ticcl.folia.xml
file "*.ticcl.folia.xml" into folia_ticcl_documents
578+
579+
//The script switches unset-variable checking off (set +u) around the optional virtualenv activation and back on (set -u) afterwards;
//presumably because the activate script may reference unset variables -- TODO confirm.
//The output of foliatextcontent is redirected into the new file; a non-zero exit status of the tool aborts the task (|| exit 1).
script:
580+
"""
581+
#!/bin/bash
582+
set +u
583+
if [ ! -z "${virtualenv}" ]; then
584+
source ${virtualenv}/bin/activate
585+
fi
586+
set -u
587+
588+
foliatextcontent -M ${foliadoc} > ${foliadoc.simpleName}.ticcl.folia.xml || exit 1
589+
"""
590+
}
591+
592+
} else {
593+
process nolinkstrings {
//Counterpart of the string linking process, defined only when --nostringlinking is passed.
//It copies each document from the foliacorrect_documents channel to <simpleName>.ticcl.folia.xml without adding any markup,
//so that folia_ticcl_documents is populated with the same file naming in both cases.
594+
"""Simple file rename step"""
595+
596+
publishDir params.outputdir, mode: 'copy', overwrite: true //publish the output for the end-user to see (this is the final output)
597+
598+
input:
599+
file foliadoc from foliacorrect_documents
600+
601+
output:
602+
file "*.ticcl.folia.xml" into folia_ticcl_documents
603+
604+
//note that the input is copied (cp), not moved; a failing copy aborts the task (|| exit 1)
script:
605+
"""
606+
cp ${foliadoc} ${foliadoc.simpleName}.ticcl.folia.xml || exit 1
607+
"""
608+
}
609+
}
610+
560611
//explicitly report the final documents created to stdout
561612
folia_ticcl_documents.subscribe { println "TICCL output document written to " + params.outputdir + "/" + it.name }
562613
}

webservice/picclservice/picclservice.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
#An informative description for this system (this should be fairly short, about one paragraph, and may not contain HTML)
4949
SYSTEM_DESCRIPTION = "PICCL offers a workflow for corpus building and builds on a variety of tools. The primary component of PICCL is TICCL; a Text-induced Corpus Clean-up system, which performs spelling correction and OCR post-correction (normalisation of spelling variants etc)."
5050

51-
SYSTEM_VERSION = "0.9.4" #also change in codemeta.json and setup.py
51+
SYSTEM_VERSION = "0.9.5" #also change in codemeta.json and setup.py
5252

5353
SYSTEM_AUTHOR = "Martin Reynaert, Maarten van Gompel, Ko van der Sloot"
5454

webservice/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
setup(
1212
name = "PICCL",
13-
version = "0.9.4", #also change in codemeta.json and picclservice.py
13+
version = "0.9.5", #also change in codemeta.json and picclservice.py
1414
author = "Martin Reynaert, Maarten van Gompel",
1515
author_email = "[email protected]",
1616
description = ("Webservice for PICCL; a set of workflows for corpus building through OCR, post-correction, modernization of historic language and Natural Language Processing"),

0 commit comments

Comments
 (0)