Skip to content

Commit 6b25bd1

Browse files
committed
updated to new segmenter version
1 parent f8ca9a3 commit 6b25bd1

File tree

36 files changed

+641
-192
lines changed

36 files changed

+641
-192
lines changed

install.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/bash
22

3-
VERSION=0.7.0
3+
VERSION=0.7.1
44
DIR="topicrawler-${VERSION}"
55

66
mkdir -p ${DIR}
@@ -27,4 +27,3 @@ tar -xvzf lt.ltbot-*.tar.gz --strip-components 1 -C heritrix-3.2.0
2727

2828

2929

30-

lt.lm/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<parent>
66
<groupId>de.tudarmstadt</groupId>
77
<artifactId>lt.kd-suite</artifactId>
8-
<version>0.7.0</version>
8+
<version>0.7.1</version>
99
</parent>
1010

1111
<artifactId>lt.lm</artifactId>

lt.lm/src/main/sh/lm

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,14 @@ lib_dir=${lmhome}/lib; while IFS= read -r -d '' f; do cp=${cp}:"${f}"; done < <(
5151
cp=${cp:1} # remove heading colon
5252

5353
# skip all -D.. and -X.. parameters before the actual main class and add them later to JAVA_OPTS
54-
DX=''
54+
DX=""
5555
while [[ $1 == -D* || $1 == -X* ]]; do
56-
DX="$DX $1"
56+
DX="$DX "$(printf '%q' "${1}") # quote args
5757
shift
5858
done
59+
JAVA_OPTS="$JAVA_OPTS $DX"
60+
args=""
61+
for arg in "${@}"; do args="$args "$(printf '%q' "${arg}"); done # quote args
5962

6063
mainclass=${1:-MainFinder} # set main class if no further arg was provided
6164
if [ ! -z "$1" ]; then # if at least one arg was passed

lt.lm/src/main/sh/lm-nightly

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,9 @@ lmsrc="${kdhome}/lt.lm"
1111

1212
tgt=$(find "${lmsrc}/target" -type f -name "lm" | grep "dist/" | head -n1)
1313

14-
eval "${tgt} $@"
14+
args=""
15+
for arg in "${@}"; do args="$args "$(printf '%q' "${arg}"); done # quote args
16+
17+
cmd="${tgt} "${args}
18+
19+
eval "${cmd}"

lt.ltbot/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<parent>
66
<groupId>de.tudarmstadt</groupId>
77
<artifactId>lt.kd-suite</artifactId>
8-
<version>0.7.0</version>
8+
<version>0.7.1</version>
99
</parent>
1010

1111
<artifactId>lt.ltbot</artifactId>

lt.ltbot/src/main/java/de/tudarmstadt/lt/ltbot/writer/SentenceMakerJava8.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ public String getTargetLanguageCode(){
5959
}
6060
public void setTargetLanguageCode(String target_language_code){
6161
_target_language_code = target_language_code;
62+
_rule_splitter.get().initParam(_target_language_code, false);
6263
}
6364

6465
public Stream<String> getSentencesStream(String text){
@@ -67,7 +68,7 @@ public Stream<String> getSentencesStream(String text){
6768

6869
public Stream<String> getSentencesStream(String text, String languagecode){
6970
return _line_splitter.get().init(new StringReader(text)).stream().filter(s -> s.type == SegmentType.SENTENCE).map(Segment::asString).flatMap(line -> {
70-
return _rule_splitter.get().init(new StringReader(line), languagecode).stream().filter(s -> s.type == SegmentType.SENTENCE).sequential().map(s -> {
71+
return _rule_splitter.get().init(new StringReader(line)).stream().filter(s -> s.type == SegmentType.SENTENCE).sequential().map(s -> {
7172
final AtomicInteger c = new AtomicInteger();
7273
String r = _tokenizer.get().init(s.asString()).stream().sequential().map(t -> {
7374
if(t.isWord())

lt.ltbot/src/test/scripts/prepare_eval.sh

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,4 +59,27 @@ for t in ai-en vehicles-en plants-en; do for m in nf 1 2 3 5; do d=$b/$t-$m; cat
5959
find . -name docperps.tsv | while read f; do d=$(dirname $f); cat $f | perl -F\\t -lanE 'if($F[1] =~ /.*[0-9].*/ && $F[1]<1e4){print $_}' > $d/docperps-pr1e4.tsv & done
6060

6161

62+
dirs="ai-en-5 plants-en-5 vehicles-en-5"; for d in $dirs; do zcat $d/crawl-sentences.txt.gz | cut -f1 | head -c 11GB > $d/crawl-sentences-11GB.txt; done
63+
64+
### steffen@farnsworth:/mnt/farnsworthshare/semeval-2015-task17-texeval/wiki$ zcat wikipedia.txt.gz | wc
65+
### 109136195 1868200724 11792896391
66+
### wikipedia 11.8GB non-unique
67+
68+
dirs="ai-en-nf vehicles-en-nf plants-en-nf"
69+
for d in $dirs; do jobs=$(ls -tr $d | grep job); for j in $jobs; do files=$(ls -tr $d/$j/sentences/ | grep "HTML.*\.txt\.gz"); for f in $files; do zcat $d/$j/sentences/$f | cut -f2,5 | gzip -c >> $d/crawl-sentences.txt.gz ; done ; done & done
70+
71+
# check size
72+
fun () { zcat $1 | wc > $1.wc; }; for d in $dirs; do fun $d/crawl-sentences.txt.gz & done
73+
74+
dirs="ai-en-nf plants-en-nf vehicles-en-nf"; for d in $dirs; do zcat $d/crawl-sentences.txt.gz | cut -f1 | head -c 11GB > $d/crawl-sentences-11GB.txt; done
75+
76+
# combine function with find command
77+
fun () { echo $1; ls -lah $1; }; find . -maxdepth 1 -type d -name "*-nf" | while read d; do fun $d; done
78+
79+
# sync to farnsworth
80+
fun () { rsync -avvzhP $1/crawl-sentences* fw:data/semeval-2015-task17-texeval/$1/ ; }
81+
find . -maxdepth 1 -type d -name "*-nf" | while read d; do fun $d & done
82+
83+
84+
6285

lt.seg/README.MD

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,16 +66,18 @@ lt.seg comes with a number of parameters, run `seg -?` to get a list of options
6666
* `--normalize <level>` (`-nl`)
6767
* `0` (default): no normalization, each segment will be printed as it is in the input
6868
* `1`: reduce same consecutive non-word characters, e.g. multiple consecutive blanks will be merged to one. Example: "\t\t\n\t\t" -> "\t\n\t"
69-
* `2`: `1` + replace consecutive numbers and digits within words and number segments themselves with the symbol `0`. Example 'He11o World. I am Johnny 5.' -> 'He0o World . I am Johnny 0 .'
70-
* `3`: `2` + replace all non-word segments with its symbol.
71-
* `4`: `3` + lowercase words.
69+
* `2`: `1` + replace empty space and punctuation segments with their symbols
70+
* `3`: `2` + replace consecutive numbers and digits within words and number segments themselves with the symbol `0`. Example 'He11o World. I am Johnny 5.' -> 'He0o World . I am Johnny 0 .'
71+
* `4`: `3` + replace all non-word segments with their symbols.
72+
* `5`: `4` + lowercase words.
7273
* `--filter <level>` (`-fl`): *Note: examples below use normalization level (-nl)* `2` and DiffTokenizer
7374
* `0`: no filtering, each segment will be printed separated by blanks (this also includes emptyspace segments, in most cases you probably want to use at least `1` or `2`)
7475
* `1`: filter control character segments
7576
* `2`: (default): `1` + filter emptyspace segments
7677
* `3`: `2` + filter unclassified and non-readable segments (attention: results heavily depend on tokenizer)
77-
* `4`: `3` + filter punctuation characters. Example: "The number is 534 423 or 43. ? :-/ " -> "The number is 0 or 0"
78-
* `5`: `4` + filter numbers and words with numbers. Example: "The number is 534 423 or 43. ? :-/ " -> "The number is or" (Only useful with proper token normalization level.)
78+
* `4`: `3` + filter punctuation characters
79+
* `5`: `4` + filter meta data like URLs, file descriptors, emails, wiki markup, emoticons, etc.
80+
* `6`: `5` + filter numbers and words with numbers. Example: "The number is 534 423 or 43. ? :-/ " -> "The number is or" (Only useful with proper token normalization level.)
7981
* `--merge [<level>]` (`-ml`): *Note: examples below use normalization level (-nl)* `3`
8082
* `0`: no merging (default when not specified)
8183
* `1`: merge same consecutive token types if they are not words or words with numbers (default when just -ml specified). Example: "The number is 534 423 or 43. ? :-/ " -> "The number is 0 or 0 . "

lt.seg/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<parent>
66
<groupId>de.tudarmstadt</groupId>
77
<artifactId>lt.kd-suite</artifactId>
8-
<version>0.7.0</version>
8+
<version>0.7.1</version>
99
</parent>
1010

1111
<artifactId>lt.seg</artifactId>

lt.seg/src/main/java/de/tudarmstadt/lt/seg/Segment.java

Lines changed: 40 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
*/
1616
package de.tudarmstadt.lt.seg;
1717

18+
import java.util.EnumSet;
1819

1920
/**
2021
*
@@ -40,6 +41,11 @@ public class Segment {
4041
public boolean hasZeroLength(){
4142
return begin == end;
4243
}
44+
45+
public int length(){
46+
assert(text.length() == end - begin);
47+
return end - begin;
48+
}
4349

4450
public String asString(){
4551
return text.toString();
@@ -49,7 +55,7 @@ public String asNormalizedString(int level){
4955

5056
String result = text.toString();
5157

52-
if(level >= 1 && type == SegmentType.NON_WORD){ // reduce non-word characters
58+
if(level >= 1 && (type == SegmentType.NON_WORD || type == SegmentType.UNKNOWN)){ // reduce non-word characters
5359
StringBuilder b = text.codePoints().boxed().reduce(new StringBuilder(), (x, y) -> {
5460
if(x.length() == 0 || x.codePointBefore(x.length()) != y)
5561
return x.appendCodePoint(y);
@@ -59,8 +65,17 @@ public String asNormalizedString(int level){
5965
});
6066
result = b.toString();
6167
}
68+
69+
if(level >= 1 && type == SegmentType.CONTROL){ // replace control segments with the symbol of their type
70+
result = type.symbol();
71+
}
72+
73+
if(level >= 2 && (type == SegmentType.EMPTY_SPACE || type == SegmentType.PUNCT)){
74+
// replace punctuation and empty spaces with a single symbol, no matter how long the segment once was
75+
result = type.symbol();
76+
}
6277

63-
if(level >= 2){
78+
if(level >= 3){
6479
if(type == SegmentType.WORD_WITH_NUMBER){ // replace consecutive digits within a word
6580
StringBuilder b = text.codePoints().boxed().reduce(new StringBuilder(), (x, y) -> {
6681
if(x.length() == 0){
@@ -81,16 +96,16 @@ public String asNormalizedString(int level){
8196
});
8297
result = b.toString();
8398
}
84-
if(type == SegmentType.NUMBER)
99+
if(EnumSet.of(SegmentType.NUMBER, SegmentType.DATE, SegmentType.PHONE, SegmentType.TIME).contains(type))
85100
result = type.symbol();
86101
}
87102

88-
if(level >= 3 && (type == SegmentType.EMPTY_SPACE || type == SegmentType.PUNCTUATION)){
89-
// replace numbers, punctuation and empty spaces with a single symbol, no matter how long the number once was
90-
result = type.symbol();
103+
if(level >= 4){
104+
if(EnumSet.complementOf(EnumSet.of(SegmentType.WORD, SegmentType.WORD_LOWERCASE, SegmentType.WORD_UPPERCASE, SegmentType.WORD_WITH_NUMBER, SegmentType.SENTENCE, SegmentType.ABBRV, SegmentType.PARAGRAPH, SegmentType.TEXT)).contains(type))
105+
result = type.symbol();
91106
}
92107

93-
if(level >= 4)
108+
if(level >= 5)
94109
result = result.toLowerCase();
95110

96111
return result;
@@ -104,19 +119,31 @@ public boolean isEmpty(){
104119
return type == SegmentType.EMPTY_SPACE;
105120
}
106121

122+
public boolean isPartOfSentence(){
123+
return type == SegmentType.SENTENCE || type == SegmentType.SENTENCE_BOUNDARY;
124+
}
125+
107126
public boolean isWord(){
108127
return type == SegmentType.WORD ||
109128
type == SegmentType.WORD_UPPERCASE ||
110129
type == SegmentType.WORD_LOWERCASE;
111130
}
112131

132+
// TODO: replace with type.ordinal() in range [x,y]. Needs reordering of the SegmentType constants
113133
public boolean isReadable(){
114-
return type == SegmentType.WORD ||
115-
type == SegmentType.NUMBER ||
116-
type == SegmentType.WORD_WITH_NUMBER||
117-
type == SegmentType.WORD_UPPERCASE ||
118-
type == SegmentType.WORD_LOWERCASE ||
119-
type == SegmentType.PUNCTUATION;
134+
// SegmentType.WORD,
135+
// SegmentType.NUMBER,
136+
// SegmentType.WORD_WITH_NUMBER,
137+
// SegmentType.WORD_UPPERCASE,
138+
// SegmentType.WORD_LOWERCASE,
139+
// SegmentType.PUNCT
140+
return EnumSet.complementOf(EnumSet.of(
141+
SegmentType.CONTROL,
142+
SegmentType.UNKNOWN,
143+
SegmentType.EMPTY_SPACE
144+
)).contains(type);
145+
146+
120147
}
121148

122149
/* (non-Javadoc)

0 commit comments

Comments
 (0)