Skip to content

Commit 9344791

Browse files
committed
project refactoring
1 parent 49d8859 commit 9344791

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

46 files changed

+622
-232
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@
33

44
**/.cache-main
55
**/.cache-tests
6+
**/.pydevproject

lt.lm/pom.xml

Lines changed: 10 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,17 @@
11
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
22
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
33
<modelVersion>4.0.0</modelVersion>
4-
<groupId>de.tudarmstadt</groupId>
4+
5+
<parent>
6+
<groupId>de.tudarmstadt</groupId>
7+
<artifactId>lt.kd-suite</artifactId>
8+
<version>0.7.0</version>
9+
</parent>
10+
511
<artifactId>lt.lm</artifactId>
6-
<version>0.4.1h</version>
7-
<properties>
8-
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
9-
</properties>
10-
<repositories>
11-
<repository>
12-
<id>lt.lm-local-repository</id>
13-
<url>file://${project.basedir}/repo</url>
14-
</repository>
15-
</repositories>
12+
1613
<build>
1714
<plugins>
18-
<plugin>
19-
<groupId>org.apache.maven.plugins</groupId>
20-
<artifactId>maven-compiler-plugin</artifactId>
21-
<version>2.4</version>
22-
<configuration>
23-
<source>1.8</source>
24-
<target>1.8</target>
25-
</configuration>
26-
</plugin>
2715
<plugin>
2816
<groupId>org.apache.maven.plugins</groupId>
2917
<artifactId>maven-dependency-plugin</artifactId>
@@ -108,16 +96,10 @@
10896
<artifactId>berkeleylm</artifactId>
10997
<version>1.1.6</version>
11098
</dependency>
111-
<dependency>
112-
<groupId>junit</groupId>
113-
<artifactId>junit</artifactId>
114-
<version>4.11</version><!--$NO-MVN-MAN-VER$ -->
115-
<scope>test</scope>
116-
</dependency>
11799
<dependency>
118100
<groupId>de.tudarmstadt</groupId>
119101
<artifactId>lt.utilities</artifactId>
120-
<version>0.3.7</version>
102+
<version>${project.version}</version>
121103
</dependency>
122104
<dependency>
123105
<groupId>commons-collections</groupId>
@@ -142,7 +124,7 @@
142124
<dependency>
143125
<groupId>de.tudarmstadt</groupId>
144126
<artifactId>lt.seg</artifactId>
145-
<version>0.5.1</version>
127+
<version>${project.version}</version>
146128
</dependency>
147129
<!-- <dependency>
148130
<groupId>org.codehaus.janino</groupId>

lt.lm/src/main/java/de/tudarmstadt/lt/lm/app/PerpDoc.java

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,8 @@ void run(Reader r) {
198198
_oov_terms = 0;
199199
_num_ngrams = 0;
200200
long l = 0;
201+
String ts = null;
202+
String s = null;
201203
for(LineIterator liter = new LineIterator(r); liter.hasNext(); ){
202204
if(++l % 5000 == 0)
203205
LOG.info("{}: processing line {}.", _rmi_string, l);
@@ -210,30 +212,34 @@ void run(Reader r) {
210212
String[] splits = line.split("\t");
211213
if(splits.length < 3)
212214
continue;
213-
if(docid == null)
215+
if(docid == null){
214216
docid = splits[2];
217+
ts = splits[0];
218+
}
215219

216220
if(!splits[2].equals(docid)){
217221
double perplexity = _perplexity_doc.get();
218222
if(perplexity > _max_perp)
219223
_max_perp = perplexity;
220224
if(perplexity < _min_perp)
221225
_min_perp = perplexity;
222-
String o = String.format("%s\t%s\tPerplexity: %6.3e \tMax: %6.3e \tMin: %6.3e \tngrams: %d \tOov-terms: %d \tOov-ngrams: %d",
223-
_rmi_string, docid, perplexity, _max_perp, _min_perp,
226+
String o = String.format("%s\t%s\t%s\tPerplexity: %6.3e \tMax: %6.3e \tMin: %6.3e \tngrams: %d \tOov-terms: %d \tOov-ngrams: %d",
227+
_rmi_string, ts, docid, perplexity, _max_perp, _min_perp,
224228
_num_ngrams, _oov_terms, _oov_ngrams);
225229
LOG.info(o);
226230
if(!_quiet)
227231
write(String.format("%s%n", o));
228232
else
229-
write(String.format("%s\t%s\t%6.3e%n", _rmi_string, docid, perplexity));
233+
write(String.format("%s\t%s\t%s\t%6.3e%n", _rmi_string, ts, docid, perplexity));
230234
_perplexity_doc.reset();
231235
docid = splits[2];
236+
ts = splits[0];
232237
}
233238

239+
s = splits[1];
234240
List<String>[] ngrams;
235241
try {
236-
ngrams = _lm_prvdr.getNgrams(line);
242+
ngrams = _lm_prvdr.getNgrams(s);
237243
if(ngrams == null || ngrams.length == 0)
238244
continue;
239245
} catch (Exception e) {
@@ -268,14 +274,14 @@ void run(Reader r) {
268274
_max_perp = perplexity;
269275
if(perplexity < _min_perp)
270276
_min_perp = perplexity;
271-
String o = String.format("%s\t%s\tPerplexity: %6.3e \tMax: %6.3e \tMin: %6.3e \tngrams: %d \tOov-terms: %d \tOov-ngrams: %d",
272-
_rmi_string, docid, perplexity, _max_perp, _min_perp,
277+
String o = String.format("%s\t%s\t%s\tPerplexity: %6.3e \tMax: %6.3e \tMin: %6.3e \tngrams: %d \tOov-terms: %d \tOov-ngrams: %d",
278+
_rmi_string, ts, docid, perplexity, _max_perp, _min_perp,
273279
_num_ngrams, _oov_terms, _oov_ngrams);
274280
LOG.info(o);
275281
if(!_quiet)
276282
write(String.format("%s%n", o));
277283
else
278-
write(String.format("%s\t%s\t%6.3e%n", _rmi_string, docid, _perplexity_doc.get()));
284+
write(String.format("%s\t%s\t%s\t%6.3e%n", _rmi_string, ts, docid, _perplexity_doc.get()));
279285
}
280286

281287
}

lt.lm/src/main/java/de/tudarmstadt/lt/lm/app/PerplexityClient.java

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,8 @@ public PerplexityClient(String args[]) {
6868
opts.addOption(OptionBuilder.withLongOpt("port").withArgName("port-number").hasArg().withDescription(String.format("Specifies the port on which the rmi registry listens (default: %d).", Registry.REGISTRY_PORT)).create("p"));
6969
opts.addOption(OptionBuilder.withLongOpt("selftest").withDescription("Run a selftest, compute perplexity of ngrams in specified LM.").create("s"));
7070
opts.addOption(OptionBuilder.withLongOpt("quiet").withDescription("Run with minimum outout on stdout.").create("q"));
71-
opts.addOption(OptionBuilder.withLongOpt("noov").hasOptionalArg().withArgName("{true|false}").withDescription("Do not consider oov terms, i.e. ngrams that end in an oov term. (default: false)").create());
71+
opts.addOption(OptionBuilder.withLongOpt("skipoov").hasOptionalArg().withArgName("{true|false}").withDescription("Do not consider oov terms, i.e. ngrams that end in an oov term. (default: false)").create());
72+
opts.addOption(OptionBuilder.withLongOpt("skipoovreflm").hasOptionalArg().withArgName("{true|false}").withDescription("Do not consider oov terms regarding the oovreflm, i.e. ngrams that end in an oov term. (default: false)").create());
7273
opts.addOption(OptionBuilder.withLongOpt("oovreflm").withArgName("identifier").hasArg().withDescription("Do not consider oov terms with respect to the provided lm, i.e. ngrams that end in an oov term in the referenced lm. (default use current lm)").create());
7374
opts.addOption(OptionBuilder.withLongOpt("host").withArgName("hostname").hasArg().withDescription("Specifies the hostname on which the rmi registry listens (default: localhost).").create("h"));
7475
opts.addOption(OptionBuilder.withLongOpt("file").withArgName("name").hasArg().withDescription("Specify the file or directory that contains '.txt' files that are used as source for testing perplexity with the specified language model. Specify '-' to pipe from stdin. (default: '-').").create("f"));
@@ -89,9 +90,13 @@ public PerplexityClient(String args[]) {
8990
_host = cmd.getOptionValue("host", "localhost");
9091
_selftest = cmd.hasOption("selftest");
9192
_quiet = cmd.hasOption("quiet");
92-
_no_oov = cmd.hasOption("noov");
93-
if(_no_oov && cmd.getOptionValue("noov") != null)
94-
_no_oov = Boolean.parseBoolean(cmd.getOptionValue("noov"));
93+
_no_oov = cmd.hasOption("skipoov");
94+
if(_no_oov && cmd.getOptionValue("skipoov") != null)
95+
_no_oov = Boolean.parseBoolean(cmd.getOptionValue("skipoov"));
96+
_no_oov_reflm = cmd.hasOption("skipoovreflm");
97+
if(_no_oov_reflm && cmd.getOptionValue("skipoovreflm") != null)
98+
_no_oov_reflm = Boolean.parseBoolean(cmd.getOptionValue("skipoovreflm"));
99+
95100
_one_ngram_per_line = cmd.hasOption("one_ngram_per_line");
96101
if(_one_ngram_per_line && cmd.getOptionValue("one_ngram_per_line") != null)
97102
_one_ngram_per_line = Boolean.parseBoolean(cmd.getOptionValue("one_ngram_per_line"));
@@ -114,6 +119,7 @@ public PerplexityClient(String args[]) {
114119
boolean _selftest;
115120
boolean _quiet;
116121
boolean _no_oov;
122+
boolean _no_oov_reflm;
117123
boolean _one_ngram_per_line;
118124
PrintStream _pout;
119125

@@ -126,6 +132,8 @@ public PerplexityClient(String args[]) {
126132
ModelPerplexity<String> _perplexity_all = null;
127133
ModelPerplexity<String> _perplexity_file = null;
128134

135+
long _oovreflm_oov_terms = 0;
136+
long _oovreflm_oov_ngrams = 0;
129137
long _oov_terms = 0;
130138
long _oov_ngrams = 0;
131139
long _num_ngrams = 0;
@@ -192,22 +200,30 @@ public boolean accept(File f) {
192200
try{ run(new InputStreamReader(new FileInputStream(f), "UTF-8")); }catch(Exception e){LOG.error("{}: Could not compute perplexity from file '{}'.", _rmi_string, f.getAbsolutePath(), e);}
193201
String o = String.format("%s: (intermediate results) \t %s \tPerplexity (file): %6.3e \tPerplexity (cum): %6.3e \tMax: log_10(p(%s))=%6.3e \tMin: log_10(p(%s))=%6.3e \tngrams (cum): %d \tOov-terms (cum): %d \tOov-ngrams (cum): %d",
194202
_rmi_string, f.getAbsoluteFile(), _perplexity_file.get(), _perplexity_all.get(), _max_ngram, _max_prob, _min_ngram, _min_prob,
195-
_num_ngrams, _oov_terms, _oov_ngrams);
203+
_num_ngrams, _oovreflm_oov_terms, _oovreflm_oov_ngrams);
196204
LOG.info(o);
197205
if(!_quiet)
198206
write(String.format("%s%n", o));
199207
}
200208
}
201209
}
202210

203-
String o = String.format("%s\t%s\tPerplexity: %6.3e \tMax: log_10(p(%s))=%6.3e \tMin: log_10(p(%s))=%6.3e \tngrams: %d \tOov-terms: %d \tOov-ngrams: %d",
211+
String o = String.format("%s\t%s\tPerplexity: %6.3e \tMax: log_10(p(%s))=%6.3e \tMin: log_10(p(%s))=%6.3e \tngrams: %d \toov-handling: %s \tOov-terms: %d \tOov-ngrams: %d \toov-reflm-handling: %s \tOov-reflm-terms: %d \tOov-reflm-ngrams: %d",
204212
_rmi_string, _file, _perplexity_all.get(), _max_ngram, _max_prob, _min_ngram, _min_prob,
205-
_num_ngrams, _oov_terms, _oov_ngrams);
213+
_num_ngrams,
214+
_no_oov ? "oov excluded" : "oov included", _oov_terms, _oov_ngrams,
215+
_no_oov_reflm ? "oov-reflm excluded" : "oov-reflm included", _oovreflm_oov_terms, _oovreflm_oov_ngrams);
206216
LOG.info(o);
207217
if(!_quiet)
208218
write(String.format("%s%n", o));
209219
else
210-
write(String.format("%s\t%s\t%6.3e%n", _rmi_string, _file, _perplexity_all.get()));
220+
write(String.format("%s\t%s\t%6.3e\t%d\t%s\t%d\t%d\t%s\t%d\t%d%n", _rmi_string, _file, _perplexity_all.get(), _num_ngrams,
221+
_no_oov ? "oov excluded" : "oov included",
222+
_oov_ngrams,
223+
_oov_terms,
224+
_no_oov_reflm ? "oov-reflm excluded" : "oov-reflm included",
225+
_oovreflm_oov_ngrams,
226+
_oovreflm_oov_terms));
211227
}
212228

213229
@SuppressWarnings("unchecked")
@@ -239,12 +255,20 @@ void run(Reader r) {
239255
continue;
240256
_num_ngrams++;
241257
try{
258+
boolean oov = false;
259+
if(_lm_prvdr.ngramContainsOOV(ngram)){
260+
_oov_ngrams++;
261+
if(_lm_prvdr.ngramEndsWithOOV(ngram)){
262+
_oov_terms++;
263+
oov = true;
264+
}
265+
}
242266

243267
if(_lm_prvdr_oovref.ngramContainsOOV(ngram)){
244-
_oov_ngrams++;
268+
_oovreflm_oov_ngrams++;
245269
if(_lm_prvdr_oovref.ngramEndsWithOOV(ngram)){
246-
_oov_terms++;
247-
if(_no_oov)
270+
_oovreflm_oov_terms++;
271+
if(_no_oov_reflm || (_no_oov && oov))
248272
continue;
249273
}
250274
}

lt.lm/src/main/sh/lm

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@ cp=${cp:1} # remove heading colon
5252

5353
# skip all -D.. and -X.. parameters before the actual main class and add them later to JAVA_OPTS
5454
DX=''
55-
while [[ "${1}" == -D* || "${1}" == -X* ]]; do
56-
DX="${DX} \"${1}\""
55+
while [[ $1 == -D* || $1 == -X* ]]; do
56+
DX="$DX $1"
5757
shift
5858
done
5959

@@ -75,7 +75,8 @@ if [[ ! $JAVA_OPTS == *"-Dproject.properties="* && -e ${lmhome}/project.properti
7575
# add logback.xml if not already in JAVA_OPTS
7676
if [[ ! $JAVA_OPTS == *"-Dlogback.configurationFile="* && -e ${lmhome}/logback.xml ]]; then JAVA_OPTS="$JAVA_OPTS -Dlogback.configurationFile=\"${lmhome}/logback.xml\"" ; fi
7777
# add Xmx and Xms if not set
78-
if [[ ! $JAVA_OPTS == *"-Xmx"* && ! $JAVA_OPTS == *"-Xms"* ]]; then JAVA_OPTS="$JAVA_OPTS -Xmx2g -Xms2g" ; fi
78+
if [[ ! $JAVA_OPTS == *"-Xmx"* ]]; then JAVA_OPTS="$JAVA_OPTS -Xmx2g" ; fi
79+
if [[ ! $JAVA_OPTS == *"-Xms"* ]]; then JAVA_OPTS="$JAVA_OPTS -Xms2g" ; fi
7980

8081
# try to find JAVA_HOME and set it accordingly
8182
if [ -z ${JAVA_HOME} ]; then
@@ -110,7 +111,13 @@ fi
110111
if [ $test_var == 'y' -o $test_var == 'Y' ]
111112
then
112113
echo "Start: `date`." >&2
113-
eval "time ${command}"
114+
if [ $(which rlwrap) ]; then
115+
echo "found rlwrap" >&2
116+
eval "time rlwrap ${command}"
117+
else
118+
echo "rlwrap not found." >&2
119+
eval "time ${command}"
120+
fi
114121
echo "Finished: `date`." >&2
115122
else
116123
echo "Command execution cancelled." >&2

lt.lm/src/main/sh/lm-nightly

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#!/bin/bash
2+
##
3+
#
4+
# Copy this file into your PATH
5+
#
6+
##
7+
8+
[ -z ${KD_SUITE_HOME} ] && kdhome="${HOME}/git/lt.kd" || kdhome=${KD_SUITE_HOME}
9+
10+
lmsrc="${kdhome}/lt.lm"
11+
12+
tgt=$(find "${lmsrc}/target" -type f -name "lm" | grep "dist/" | head -n1)
13+
14+
eval "${tgt} $@"

lt.ltbot/.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ target
1313
**/*.pyc
1414

1515
# ipython notebook related
16-
**/.ipynb_checkpoints*
16+
.ipynb_checkpoints
17+
__pycache__
1718

1819
# java compiled
1920
**/*.class

lt.ltbot/pom.xml

Lines changed: 11 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,17 @@
11
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
22
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
33
<modelVersion>4.0.0</modelVersion>
4-
<groupId>de.tudarmstadt</groupId>
4+
5+
<parent>
6+
<groupId>de.tudarmstadt</groupId>
7+
<artifactId>lt.kd-suite</artifactId>
8+
<version>0.7.0</version>
9+
</parent>
10+
511
<artifactId>lt.ltbot</artifactId>
6-
<version>0.4.1a</version>
7-
<properties>
8-
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
9-
</properties>
10-
<repositories>
11-
<repository>
12-
<id>local-repository</id>
13-
<url>file:///${project.basedir}/repo</url>
14-
</repository>
15-
</repositories>
12+
1613
<build>
1714
<plugins>
18-
<plugin>
19-
<groupId>org.apache.maven.plugins</groupId>
20-
<artifactId>maven-compiler-plugin</artifactId>
21-
<version>3.1</version>
22-
<configuration>
23-
<source>1.8</source>
24-
<target>1.8</target>
25-
</configuration>
26-
</plugin>
2715
<plugin>
2816
<groupId>org.apache.maven.plugins</groupId>
2917
<artifactId>maven-dependency-plugin</artifactId>
@@ -140,19 +128,13 @@
140128
<dependency>
141129
<groupId>de.tudarmstadt</groupId>
142130
<artifactId>lt.lm</artifactId>
143-
<version>0.4.1h</version>
131+
<version>${project.version}</version>
144132
</dependency>
145133
<dependency>
146134
<groupId>org.jsoup</groupId>
147135
<artifactId>jsoup</artifactId>
148136
<version>1.7.3</version>
149137
</dependency>
150-
<dependency>
151-
<groupId>junit</groupId>
152-
<artifactId>junit</artifactId>
153-
<version>4.11</version>
154-
<scope>test</scope>
155-
</dependency>
156138
<!-- <dependency>
157139
<groupId>de.tudarmstadt.ukp.dkpro.core</groupId>
158140
<artifactId>de.tudarmstadt.ukp.dkpro.core.treetagger-asl</artifactId>
@@ -253,12 +235,12 @@
253235
<dependency>
254236
<groupId>de.tudarmstadt</groupId>
255237
<artifactId>lt.seg</artifactId>
256-
<version>0.5.1</version>
238+
<version>${project.version}</version>
257239
</dependency>
258240
<dependency>
259241
<groupId>de.tudarmstadt</groupId>
260242
<artifactId>lt.utilities</artifactId>
261-
<version>0.3.7</version>
243+
<version>${project.version}</version>
262244
</dependency>
263245
<dependency>
264246
<groupId>com.syncthemall</groupId>

lt.ltbot/src/main/java/de/tudarmstadt/lt/ltbot/writer/PlainTextDocumentWriter.java

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -188,11 +188,14 @@ protected void innerProcess(CrawlURI curi) throws InterruptedException {
188188
*/
189189
protected void writeplaintext(CrawlURI curi, String cleaned_plaintext) {
190190
String perplexity_value_as_string = "null";
191-
if(curi != null && curi.getData() != null){
192-
Object obj = curi.getData().get(SharedConstants.EXTRA_INFO_PERPLEXITY);
193-
if(obj != null)
194-
perplexity_value_as_string = (String)obj;
195-
}
191+
if(curi == null || curi.getData() == null)
192+
return;
193+
if(StringUtils.isEmpty(cleaned_plaintext))
194+
return;
195+
196+
Object obj = curi.getData().get(SharedConstants.EXTRA_INFO_PERPLEXITY);
197+
if(obj != null)
198+
perplexity_value_as_string = (String)obj;
196199

197200
String time = TimeUtils.get_ISO_8601_UTC();
198201
synchronized (_lck) {

0 commit comments

Comments
 (0)