Skip to content

Commit ba19a16

Browse files
committed
several bugfixes
1 parent 0077b31 commit ba19a16

File tree

21 files changed

+452
-51
lines changed

21 files changed

+452
-51
lines changed

lt.lm/src/test/java/META-INF/MANIFEST.MF

Lines changed: 0 additions & 3 deletions
This file was deleted.

lt.ltbot/jobs/profile-ltbot-default-seedfile/profile-crawler-beans-ltbot.cxml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ seeds.textSource.path=seed.txt
108108

109109
<bean id="metadata" class="org.archive.modules.CrawlMetadata" autowire="byName" />
110110

111-
<bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">
111+
<bean id="seeds" class="de.tudarmstadt.lt.ltbot.seed.TextPrioSeedModule">
112112
<property name="textSource">
113113
<bean class="org.archive.spring.ConfigFile">
114114
<property name="path">
@@ -204,7 +204,7 @@ seeds.textSource.path=seed.txt
204204

205205
<!-- PRIORITY -->
206206
<bean id="sentenceMaker" class="de.tudarmstadt.lt.ltbot.writer.SentenceMaker">
207-
<property name="minLength" value="5" />
207+
<property name="minLength" value="3" />
208208
<property name="targetLanguageCode" value="default" />
209209
</bean>
210210
<bean id="perplexityProducer" class="de.tudarmstadt.lt.ltbot.postprocessor.DecesiveValueProducerPerplexity" autowire="byName">
@@ -246,7 +246,7 @@ seeds.textSource.path=seed.txt
246246
</bean>
247247
</property>
248248
</bean>
249-
<bean id="perplexityPrioritizer" class="de.tudarmstadt.lt.ltbot.postprocessor.DecesiveValuePrioritizer" />
249+
<bean id="perplexityPrioritizer" class="de.tudarmstadt.lt.ltbot.prefetch.DecesiveValuePrioritizer" />
250250
<bean id="perplexityLoggerDispositionChain" class="de.tudarmstadt.lt.ltbot.postprocessor.DecesiveValueLogger" />
251251

252252
<!-- CANDIDATE CHAIN -->

lt.ltbot/jobs/profile-ltbot-default-seedlist/profile-crawler-beans-ltbot.cxml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ http://127.0.0.1/test
108108

109109
<!-- BEANS BEANS BEANS -->
110110
<bean id="metadata" class="org.archive.modules.CrawlMetadata" autowire="byName" />
111-
<bean id="seeds" class="org.archive.modules.seeds.TextSeedModule">
111+
<bean id="seeds" class="de.tudarmstadt.lt.ltbot.seed.TextPrioSeedModule">
112112
<property name="textSource">
113113
<bean class="org.archive.spring.ConfigString">
114114
<property name="value">
@@ -203,7 +203,7 @@ http://127.0.0.1/test
203203

204204
<!-- PRIORITY -->
205205
<bean id="sentenceMaker" class="de.tudarmstadt.lt.ltbot.writer.SentenceMaker">
206-
<property name="minLength" value="5" />
206+
<property name="minLength" value="3" />
207207
<property name="targetLanguageCode" value="default" />
208208
</bean>
209209
<bean id="perplexityProducer" class="de.tudarmstadt.lt.ltbot.postprocessor.DecesiveValueProducerPerplexity" autowire="byName">
@@ -245,7 +245,7 @@ http://127.0.0.1/test
245245
</bean>
246246
</property>
247247
</bean>
248-
<bean id="perplexityPrioritizer" class="de.tudarmstadt.lt.ltbot.postprocessor.DecesiveValuePrioritizer" />
248+
<bean id="perplexityPrioritizer" class="de.tudarmstadt.lt.ltbot.prefetch.DecesiveValuePrioritizer" />
249249
<bean id="perplexityLoggerDispositionChain" class="de.tudarmstadt.lt.ltbot.postprocessor.DecesiveValueLogger" />
250250

251251
<!-- CANDIDATE CHAIN -->

lt.ltbot/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<modelVersion>4.0.0</modelVersion>
44
<groupId>de.tudarmstadt</groupId>
55
<artifactId>lt.ltbot</artifactId>
6-
<version>0.4.0d</version>
6+
<version>0.4.1</version>
77
<properties>
88
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
99
</properties>

lt.ltbot/src/main/java/de/tudarmstadt/lt/ltbot/postprocessor/DecesiveValueLogger.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ protected void innerProcess(CrawlURI uri) throws InterruptedException {
107107

108108
private String getLogString(CrawlURI curi){
109109
String timestamp = TimeUtils.get_ISO_8601_UTC();
110-
String value_as_str = curi.getData().containsKey(getExtraInfoValueFieldName()) ? curi.getData().get(getExtraInfoValueFieldName()).toString() : "null";
110+
String value_as_str = curi.getData().get(getExtraInfoValueFieldName()) != null ? curi.getData().get(getExtraInfoValueFieldName()).toString() : "null";
111111
String current_scheduling_directive = String.valueOf(curi.getSchedulingDirective());
112112
String current_precedence = String.valueOf(curi.getPrecedence());
113113
String assigned_scheduling_directive = "_";

lt.ltbot/src/main/java/de/tudarmstadt/lt/ltbot/postprocessor/DecesiveValueProducerPerplexity.java

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,8 @@ public double computePerplexity(String text) throws Exception{
155155
ModelPerplexity<String> perp = new ModelPerplexity<String>(_lmprvdr.get());
156156
for(String sentence : _sentenceMakerInstance.getSentences(text)){
157157
List<String>[] ngrams = _lmprvdr.get().getNgrams(sentence);
158-
if(ngrams.length <= 1) // at least 2 ngrams
158+
// LOG.finest(String.format("ngrams: %s", ngrams));
159+
if(ngrams.length < 1) // at least 1 ngrams
159160
continue;
160161
if(ngrams[ngrams.length-1].size() < _lmprvdr.get().getLmOrder()) // at least one ngram with cardinality of lm
161162
continue;
@@ -264,7 +265,7 @@ protected void innerProcess(CrawlURI uri) throws InterruptedException {
264265

265266

266267
synchronized (_lck) {
267-
if(Double.isInfinite(perplexity)){
268+
if(!Double.isFinite(perplexity) || perplexity <= 1){
268269
_num_inf_values.incrementAndGet();
269270
}else{
270271
double temp = (_perplexity_avg * _num_values) + perplexity;
@@ -306,14 +307,14 @@ static void addExtraInfo(CrawlURI uri, String key, Object value) {
306307
double perplexity = Double.POSITIVE_INFINITY;
307308
try {
308309
String docid = "#" + Integer.toHexString(cleaned_plaintext.hashCode());
309-
LOG.fine(String.format("Sending text with id '%s' to StringProvider: '%s' (length %d).", docid, cleaned_plaintext_abbr, cleaned_plaintext.length()));
310+
LOG.finest(String.format("Sending text with id '%s' to StringProvider: '%s' (length %d).", docid, cleaned_plaintext_abbr, cleaned_plaintext.length()));
310311
perplexity = computePerplexity(cleaned_plaintext);
311312
// if (Double.isNaN(perplexity)) {
312313
// double perplexity_new = -1d;
313314
// LOG.log(Level.WARNING, String.format("[%s '%s'] failed to get meaningful perplexity: %g. Setting perplexity to %g.", uri.toString(), cleaned_plaintext_abbr, perplexity, perplexity_new));
314315
// perplexity = perplexity_new;
315316
// }
316-
LOG.fine(String.format("[%s, '%s'] perplexity: %g.", uri.toString(), cleaned_plaintext_abbr, perplexity));
317+
LOG.finest(String.format("[%s, '%s'] perplexity: %g.", uri.toString(), cleaned_plaintext_abbr, perplexity));
317318
} catch (Throwable t) {
318319
for (int i = 1; t != null && i < 10; i++) {
319320
LOG.log(Level.SEVERE,
@@ -330,7 +331,7 @@ static void addExtraInfo(CrawlURI uri, String key, Object value) {
330331
_paused_due_to_error = true;
331332
}
332333
}
333-
if(Double.isInfinite(perplexity)){
334+
if(!Double.isFinite(perplexity) || perplexity <= 1){
334335
LOG.log(Level.FINE, String.format("[%s '%s'] resetting infinite perplexity to predefined maximum perplexity value (-1).", uri.toString(), cleaned_plaintext_abbr));
335336
perplexity = -1;
336337
}

lt.ltbot/src/main/java/de/tudarmstadt/lt/ltbot/postprocessor/SharedConstants.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ public class SharedConstants {
2424
private SharedConstants(){ /* DO NOT INSTANTIATE */ }
2525

2626
public final static String EXTRA_INFO_PERPLEXITY = "perp";
27-
public final static String EXTRA_INFO_PERPLEXITY_VIA = "perp-via";
2827
public final static String EXTRA_INFO_ASSIGNED_SCHEDULING_DIRECTIVE = "asgnd-sched-drctve";
2928
public final static String EXTRA_INFO_ASSIGNED_COST_PRECEDENCE = "asgnd-cost-precedence";
3029
public final static String EXTRA_INFO_PLAINTEXT_ABBREVIATED = "plain-abbrv";

0 commit comments

Comments
 (0)