remstef
diff --git a/‎install.sh‎
Lines changed: 1 addition & 2 deletions b/‎install.sh‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎lt.lm/pom.xml‎
Lines changed: 1 addition & 1 deletion b/‎lt.lm/pom.xml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lt.lm/src/main/sh/lm‎
Lines changed: 5 additions & 2 deletions b/‎lt.lm/src/main/sh/lm‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎lt.lm/src/main/sh/lm-nightly‎
Lines changed: 6 additions & 1 deletion b/‎lt.lm/src/main/sh/lm-nightly‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎lt.ltbot/pom.xml‎
Lines changed: 1 addition & 1 deletion b/‎lt.ltbot/pom.xml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lt.ltbot/src/main/java/de/tudarmstadt/lt/ltbot/writer/SentenceMakerJava8.java‎
Lines changed: 2 additions & 1 deletion b/‎lt.ltbot/src/main/java/de/tudarmstadt/lt/ltbot/writer/SentenceMakerJava8.java‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎lt.ltbot/src/test/scripts/prepare_eval.sh‎
Lines changed: 23 additions & 0 deletions b/‎lt.ltbot/src/test/scripts/prepare_eval.sh‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎lt.seg/README.MD‎
Lines changed: 7 additions & 5 deletions b/‎lt.seg/README.MD‎
Lines changed: 7 additions & 5 deletions
diff --git a/‎lt.seg/pom.xml‎
Lines changed: 1 addition & 1 deletion b/‎lt.seg/pom.xml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lt.seg/src/main/java/de/tudarmstadt/lt/seg/Segment.java‎
Lines changed: 40 additions & 13 deletions b/‎lt.seg/src/main/java/de/tudarmstadt/lt/seg/Segment.java‎
Lines changed: 40 additions & 13 deletions
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-VERSION=0.7.0
+VERSION=0.7.1
 DIR="topicrawler-${VERSION}"
 
 mkdir -p ${DIR}
@@ -27,4 +27,3 @@ tar -xvzf lt.ltbot-*.tar.gz --strip-components 1 -C heritrix-3.2.0
 
 
 
-
@@ -5,7 +5,7 @@
 	<parent>
 	  <groupId>de.tudarmstadt</groupId>
 	  <artifactId>lt.kd-suite</artifactId>
-	  <version>0.7.0</version>
+	  <version>0.7.1</version>
 	</parent>
 
 	<artifactId>lt.lm</artifactId>
 
@@ -51,11 +51,14 @@ lib_dir=${lmhome}/lib; while IFS= read -r -d '' f; do cp=${cp}:"${f}"; done < <(
 cp=${cp:1} # remove heading colon
 
 # skip all -D.. and -X.. parameters before the actual main class and add them later to JAVA_OPTS
-DX=''
+DX=""
 while [[ $1 == -D* || $1 == -X* ]]; do
-	DX="$DX $1"
+	DX="$DX "$(printf '%q' "${1}") # quote args
 	shift
 done
+JAVA_OPTS="$JAVA_OPTS $DX"
+args=""
+for arg in "${@}"; do args="$args "$(printf '%q' "${arg}"); done # quote args
 
 mainclass=${1:-MainFinder} # set main class if no further arg was provided
 if [ ! -z "$1" ]; then # if at least one arg was passed
 
@@ -11,4 +11,9 @@ lmsrc="${kdhome}/lt.lm"
 
 tgt=$(find "${lmsrc}/target" -type f -name "lm" | grep "dist/" | head -n1)
 
-eval "${tgt} $@"
+args=""
+for arg in "${@}"; do args="$args "$(printf '%q' "${arg}"); done # quote args
+
+cmd="${tgt} "${args}
+
+eval "${cmd}"
@@ -5,7 +5,7 @@
 	<parent>
 	  <groupId>de.tudarmstadt</groupId>
 	  <artifactId>lt.kd-suite</artifactId>
-	  <version>0.7.0</version>
+	  <version>0.7.1</version>
 	</parent>
 
 	<artifactId>lt.ltbot</artifactId>
 
@@ -59,6 +59,7 @@ public String getTargetLanguageCode(){
 	}
 	public void setTargetLanguageCode(String target_language_code){
 		_target_language_code = target_language_code;
+		_rule_splitter.get().initParam(_target_language_code, false);
 	}
 
 	public Stream<String> getSentencesStream(String text){
@@ -67,7 +68,7 @@ public Stream<String> getSentencesStream(String text){
 
 	public Stream<String> getSentencesStream(String text, String languagecode){
 		return _line_splitter.get().init(new StringReader(text)).stream().filter(s -> s.type == SegmentType.SENTENCE).map(Segment::asString).flatMap(line -> {
-			return _rule_splitter.get().init(new StringReader(line), languagecode).stream().filter(s -> s.type == SegmentType.SENTENCE).sequential().map(s -> {
+			return _rule_splitter.get().init(new StringReader(line)).stream().filter(s -> s.type == SegmentType.SENTENCE).sequential().map(s -> {
 				final AtomicInteger c = new AtomicInteger();
 				String r = _tokenizer.get().init(s.asString()).stream().sequential().map(t -> {
 					if(t.isWord())
 
@@ -59,4 +59,27 @@ for t in ai-en vehicles-en plants-en; do for m in nf 1 2 3 5; do d=$b/$t-$m; cat
 find . -name docperps.tsv | while read f; do d=$(dirname $f); cat $f | perl -F\\t -lanE 'if($F[1] =~ /.*[0-9].*/ && $F[1]<1e4){print $_}' > $d/docperps-pr1e4.tsv &  done
 
 
+dirs="ai-en-5 plants-en-5 vehicles-en-5"; for d in $dirs; do zcat $d/crawl-sentences.txt.gz | cut -f1 | head -c 11GB > $d/crawl-sentences-11GB.txt; done
+
+### steffen@farnsworth:/mnt/farnsworthshare/semeval-2015-task17-texeval/wiki$ zcat wikipedia.txt.gz | wc
+### 109136195 1868200724 11792896391
+### wikipedia 11.8GB non-unique
+
+dirs="ai-en-nf vehicles-en-nf plants-en-nf"
+for d in $dirs; do jobs=$(ls -tr $d | grep job); for j in $jobs; do files=$(ls -tr $d/$j/sentences/ | grep "HTML.*\.txt\.gz"); for f in $files; do zcat $d/$j/sentences/$f | cut -f2,5 | gzip -c >> $d/crawl-sentences.txt.gz ; done ; done & done
+
+# check size
+fun () { zcat $1 | wc > $1.wc; }; for d in $dirs; do fun $d/crawl-sentences.txt.gz & done
+	
+dirs="ai-en-nf plants-en-nf vehicles-en-nf"; for d in $dirs; do zcat $d/crawl-sentences.txt.gz | cut -f1 | head -c 11GB > $d/crawl-sentences-11GB.txt; done
+
+# combine function with find command
+fun () { echo $1; ls -lah $1; }; find . -maxdepth 1 -type d -name "*-nf" | while read d; do fun $d; done
+
+# sync to farnsworth 	
+fun () { rsync -avvzhP $1/crawl-sentences* fw:data/semeval-2015-task17-texeval/$1/ ;  }
+find . -maxdepth 1 -type d -name "*-nf" | while read d; do fun $d & done
+
+	 
+
 
@@ -66,16 +66,18 @@ lt.seg comes with a number of parameters, run `seg -?` to get a list of options
 * `--normalize <level>` (`-nl`)
     * `0` (default): no normalization, each segment will be printed as it is in the input
     * `1`: reduce same consecutive non-word characters, e.g. multiple consecutive blanks will be merged to one. Example: "\t\t\n\t\t" -> "\t\n\t"
-    * `2`: `1` + replace consecutive numbers and digits within words and number segments themselves with the symbol `0`. Example 'He11o World. I am Johnny 5.' -> 'He0o World . I am Johnny 0 .'
-    * `3`: `2` + replace all non-word segments with its symbol.
-    * `4`: `3` + lowercase words.
+    * `2`: `1` + replace empty space and punctuation segments with their symbols
+    * `3`: `2` + replace consecutive numbers and digits within words and number segments themselves with the symbol `0`. Example 'He11o World. I am Johnny 5.' -> 'He0o World . I am Johnny 0 .'
+    * `4`: `3` + replace all non-word segments with its symbol.
+    * `5`: `4` + lowercase words.
 * `--filter <level>` (`-fl`): *Note: examples below use normalization level (-nl)* `2` and DiffTokenizer
     * `0`: no filtering, each segment will be printed separated by blanks (this also includes emptyspace segments, in most cases you probably want to use at least `1` or `2`)
 	* `1`: filter control character segments
     * `2`: (default): `1` + filter emptyspace segments
     * `3`: `2` + filter unclassified and non-readable segments (attention: results heavily depend on tokenizer)
-    * `4`: `3` + filter punctuation characters. Example: "The number is 534 423 or 43. ? :-/ " -> "The number is 0 or 0"
-    * `5`: `4` + filter numbers and words with numbers. Example: "The number is 534 423 or 43. ? :-/ " -> "The number is or" (Only useful with proper token normalization level.)
+    * `4`: `3` + filter punctuation characters
+    * `5`: `4` + filter meta data like URLs, file descriptors, emails, wiki markup, emoticons, etc.
+    * `6`: `4` + filter numbers and words with numbers. Example: "The number is 534 423 or 43. ? :-/ " -> "The number is or" (Only useful with proper token normalization level.)
 * `--merge [<level>]` (`-ml`): *Note: examples below use normalization level (-nl)* `3`
 	* `0`: no merging (default when not specified)
 	* `1`: merge same consecutive token types if they are not words or words with numbers (default when just -ml specified). Example: "The number is 534 423 or 43. ? :-/ " -> "The number is 0 or 0 . "
 
@@ -5,7 +5,7 @@
 	<parent>
 	  <groupId>de.tudarmstadt</groupId>
 	  <artifactId>lt.kd-suite</artifactId>
-	  <version>0.7.0</version>
+	  <version>0.7.1</version>
 	</parent>
 
 	<artifactId>lt.seg</artifactId>
 
@@ -15,6 +15,7 @@
  */
 package de.tudarmstadt.lt.seg;
 
+import java.util.EnumSet;
 
 /**
  * 
@@ -40,6 +41,11 @@ public class Segment {
 	public boolean hasZeroLength(){
 		return begin == end;
 	}
+	
+	public int length(){
+		assert(text.length() == end - begin);
+		return end - begin;
+	}
 
 	public String asString(){
 		return text.toString();
@@ -49,7 +55,7 @@ public String asNormalizedString(int level){
 
 		String result = text.toString();
 
-		if(level >= 1 && type == SegmentType.NON_WORD){ // reduce non-word characters
+		if(level >= 1 && (type == SegmentType.NON_WORD || type == SegmentType.UNKNOWN)){ // reduce non-word characters
 			StringBuilder b = text.codePoints().boxed().reduce(new StringBuilder(), (x, y) -> {
 				if(x.length() == 0 || x.codePointBefore(x.length()) != y)
 					return x.appendCodePoint(y);
@@ -59,8 +65,17 @@ public String asNormalizedString(int level){
 			});
 			result = b.toString();
 		}
+		
+		if(level >= 1 && type == SegmentType.CONTROL){ // reduce non-word characters
+			result = type.symbol();
+		}
+		
+		if(level >= 2 && (type == SegmentType.EMPTY_SPACE || type == SegmentType.PUNCT)){
+			// replace punctuation and empty-space segments with a single symbol, no matter how long the segment once was
+			result = type.symbol();
+		}
 
-		if(level >= 2){
+		if(level >= 3){
 			if(type == SegmentType.WORD_WITH_NUMBER){ // replace consecutive digits within a word
 				StringBuilder b = text.codePoints().boxed().reduce(new StringBuilder(), (x, y) -> {
 					if(x.length() == 0){
@@ -81,16 +96,16 @@ public String asNormalizedString(int level){
 				});
 				result = b.toString();
 			}
-			if(type == SegmentType.NUMBER)
+			if(EnumSet.of(SegmentType.NUMBER, SegmentType.DATE, SegmentType.PHONE, SegmentType.TIME).contains(type))
 				result = type.symbol();
 		}
 
-		if(level >= 3 && (type == SegmentType.EMPTY_SPACE || type == SegmentType.PUNCTUATION)){
-			// replace numbers, punctuation and empty spaces with a single symbol, no matter how long the number once was
-			result = type.symbol();
+		if(level >= 4){
+			if(EnumSet.complementOf(EnumSet.of(SegmentType.WORD, SegmentType.WORD_LOWERCASE, SegmentType.WORD_UPPERCASE, SegmentType.WORD_WITH_NUMBER, SegmentType.SENTENCE, SegmentType.ABBRV, SegmentType.PARAGRAPH, SegmentType.TEXT)).contains(type))
+				result = type.symbol();
 		}
 
-		if(level >= 4)
+		if(level >= 5)
 			result = result.toLowerCase();
 
 		return result;
@@ -104,19 +119,31 @@ public boolean isEmpty(){
 		return type == SegmentType.EMPTY_SPACE; 
 	}
 
+	public boolean isPartOfSentence(){
+		return type == SegmentType.SENTENCE || type == SegmentType.SENTENCE_BOUNDARY; 
+	}
+	
 	public boolean isWord(){
 		return type == SegmentType.WORD ||   
 				type == SegmentType.WORD_UPPERCASE ||
 				type == SegmentType.WORD_LOWERCASE;
 	}
 
+	// TODO: replace with type.ordinal() in range [x,y]; needs reordering of the SegmentType constants
 	public boolean isReadable(){
-		return type == SegmentType.WORD || 
-				type == SegmentType.NUMBER || 
-				type == SegmentType.WORD_WITH_NUMBER|| 
-				type == SegmentType.WORD_UPPERCASE ||
-				type == SegmentType.WORD_LOWERCASE ||
-				type == SegmentType.PUNCTUATION;  
+//		SegmentType.WORD,
+//		SegmentType.NUMBER,
+//		SegmentType.WORD_WITH_NUMBER, 
+//		SegmentType.WORD_UPPERCASE,
+//		SegmentType.WORD_LOWERCASE,
+//		SegmentType.PUNCT
+		return EnumSet.complementOf(EnumSet.of(
+					SegmentType.CONTROL,
+					SegmentType.UNKNOWN,
+					SegmentType.EMPTY_SPACE
+				)).contains(type); 
+		
+
 	}
 
 	/* (non-Javadoc)
Original file line number	Diff line number	Diff line change
`@@ -59,6 +59,7 @@ public String getTargetLanguageCode(){`
`59`	`59`	`}`
`60`	`60`	`public void setTargetLanguageCode(String target_language_code){`
`61`	`61`	`_target_language_code = target_language_code;`
	`62`	`+ _rule_splitter.get().initParam(_target_language_code, false);`
`62`	`63`	`}`
`63`	`64`
`64`	`65`	`public Stream<String> getSentencesStream(String text){`
`@@ -67,7 +68,7 @@ public Stream<String> getSentencesStream(String text){`
`67`	`68`
`68`	`69`	`public Stream<String> getSentencesStream(String text, String languagecode){`
`69`	`70`	`return _line_splitter.get().init(new StringReader(text)).stream().filter(s -> s.type == SegmentType.SENTENCE).map(Segment::asString).flatMap(line -> {`
`70`		`- return _rule_splitter.get().init(new StringReader(line), languagecode).stream().filter(s -> s.type == SegmentType.SENTENCE).sequential().map(s -> {`
	`71`	`+ return _rule_splitter.get().init(new StringReader(line)).stream().filter(s -> s.type == SegmentType.SENTENCE).sequential().map(s -> {`
`71`	`72`	`final AtomicInteger c = new AtomicInteger();`
`72`	`73`	`String r = _tokenizer.get().init(s.asString()).stream().sequential().map(t -> {`
`73`	`74`	`if(t.isWord())`