Skip to content

Commit a5418b0

Browse files
committed
use at least 256 bytes for character estimator (was 1024)
1 parent 6e41cbc commit a5418b0

File tree

2 files changed

+22
-27
lines changed

2 files changed

+22
-27
lines changed

lib/src/main/scala/com/worksap/nlp/uzushio/lib/lang/LangEstimation.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ case class ProbableLanguage(lang: String) extends EstimationResult {
1515
override def str: String = lang
1616
}
1717

18-
class LangEstimation(private val minBytes: Int = 1024) {
18+
class LangEstimation(private val minBytes: Int = 256) {
1919
private val internalBuffer = CharBuffer.allocate(5 * 1024)
2020
private val decodeBuffer = CharBuffer.allocate(4 * 1024)
2121
private def langDetector = LangEstimation.cachedDetector

lib/src/main/scala/com/worksap/nlp/uzushio/lib/warc/WarcEntryParser.scala

Lines changed: 21 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,11 @@
11
package com.worksap.nlp.uzushio.lib.warc
22

33
import com.worksap.nlp.uzushio.lib.html.{AllTagMapper, ParagraphExtractor, ParseAbortException}
4-
import com.worksap.nlp.uzushio.lib.lang.{
5-
EstimationFailure,
6-
LangEstimation,
7-
LangTagSniffer,
8-
ProbableLanguage
9-
}
4+
import com.worksap.nlp.uzushio.lib.lang.{EstimationFailure, LangEstimation, LangTagSniffer, ProbableLanguage}
105
import com.worksap.nlp.uzushio.lib.warc.WarcEntryParser.{logger, resolveEarliestDate}
6+
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap
117
import org.apache.commons.lang3.StringUtils
12-
import org.apache.hc.core5.http.impl.nio.{
13-
DefaultHttpResponseFactory,
14-
DefaultHttpResponseParser,
15-
SessionBufferAccess
16-
}
8+
import org.apache.hc.core5.http.impl.nio.{DefaultHttpResponseFactory, DefaultHttpResponseParser, SessionBufferAccess}
179
import org.apache.hc.core5.http.{HttpException, HttpMessage, MessageHeaders}
1810
import org.apache.tika.detect.EncodingDetector
1911
import org.apache.tika.exception.TikaException
@@ -25,14 +17,10 @@ import org.mozilla.universalchardet.UniversalDetector
2517
import org.slf4j.LoggerFactory
2618

2719
import java.io.{ByteArrayInputStream, IOException, InputStream}
28-
import java.nio.charset.{
29-
Charset,
30-
IllegalCharsetNameException,
31-
StandardCharsets,
32-
UnsupportedCharsetException
33-
}
20+
import java.nio.charset.{Charset, IllegalCharsetNameException, StandardCharsets, UnsupportedCharsetException}
3421
import java.time.format.{DateTimeFormatter, DateTimeParseException}
3522
import java.time.{LocalDateTime, ZoneId, ZonedDateTime}
23+
import java.util
3624
import java.util.{Locale, UUID}
3725
import scala.collection.mutable.ArrayBuffer
3826
import scala.xml.SAXException
@@ -58,6 +46,8 @@ class WarcEntryParser(
5846
private val sessionInputBuffer = SessionBufferAccess.instance(4 * 1024, 1024)
5947
private val langEstimation = new LangEstimation()
6048

49+
private val failures = new Object2IntOpenHashMap[String]()
50+
6151
private def detectCharsetFromBytes(
6252
data: Array[Byte],
6353
offset: Int,
@@ -70,6 +60,11 @@ class WarcEntryParser(
7060
detector.getDetectedCharset
7161
}
7262

63+
private def skipDoc(reason: String): None.type = {
64+
failures.addTo(reason, 1)
65+
None
66+
}
67+
7368
def parseHttpHeader(bytes: Array[Byte]): Option[(HttpMessage, Int)] = {
7469
sessionInputBuffer.clear()
7570
sessionInputBuffer.putBytes(bytes)
@@ -84,9 +79,9 @@ class WarcEntryParser(
8479
}
8580
Some((resp, sessionInputBuffer.position()))
8681
} catch {
87-
case _: HttpException => None
88-
case _: IOException => None
89-
case _: IllegalArgumentException => None
82+
case _: HttpException => skipDoc("http.parse")
83+
case _: IOException => skipDoc("http.ioex")
84+
case _: IllegalArgumentException => skipDoc("http.ia")
9085
}
9186
}
9287

@@ -95,8 +90,8 @@ class WarcEntryParser(
9590
try {
9691
Some(Charset.forName(cleanName))
9792
} catch {
98-
case _: IllegalCharsetNameException => None
99-
case _: UnsupportedCharsetException => None
93+
case _: IllegalCharsetNameException => skipDoc("charset.illegal")
94+
case _: UnsupportedCharsetException => skipDoc("charset.unsupported")
10095
}
10196
}
10297

@@ -163,22 +158,22 @@ class WarcEntryParser(
163158
if (c1.isDefined) {
164159
langEstimation.estimateLang(data, offset, c1.get) match {
165160
case ProbableLanguage(lang) => return Some((c1.get, lang))
166-
case EstimationFailure => return None
161+
case EstimationFailure => return skipDoc("lang.c1")
167162
case _ => // do nothing
168163
}
169164
}
170165
val c2 = guessCharsetFromHeader(headers)
171166
if (c2.isDefined) {
172167
langEstimation.estimateLang(data, offset, c2.get) match {
173168
case ProbableLanguage(lang) => return Some((c2.get, lang))
174-
case EstimationFailure => return None
169+
case EstimationFailure => return skipDoc("lang.c2")
175170
case _ => // do nothing
176171
}
177172
}
178173
val c3 = guessCharsetFromBytes(data, offset)
179174
langEstimation.estimateLang(data, offset, c3) match {
180175
case ProbableLanguage(lang) => Some((c3, lang))
181-
case _ => None
176+
case _ => skipDoc("lang.c3")
182177
}
183178
}
184179

@@ -257,7 +252,7 @@ class WarcEntryParser(
257252
date = resolveEarliestDate(item.accessDate, header)
258253
)
259254
)
260-
case _ => None
255+
case (_, lang) => skipDoc(s"lang.$lang")
261256
}
262257
}
263258
}

0 commit comments

Comments
 (0)