11package com .worksap .nlp .uzushio .lib .warc
22
33import com .worksap .nlp .uzushio .lib .html .{AllTagMapper , ParagraphExtractor , ParseAbortException }
4- import com .worksap .nlp .uzushio .lib .lang .{
5- EstimationFailure ,
6- LangEstimation ,
7- LangTagSniffer ,
8- ProbableLanguage
9- }
4+ import com .worksap .nlp .uzushio .lib .lang .{EstimationFailure , LangEstimation , LangTagSniffer , ProbableLanguage }
105import com .worksap .nlp .uzushio .lib .warc .WarcEntryParser .{logger , resolveEarliestDate }
6+ import it .unimi .dsi .fastutil .objects .Object2IntOpenHashMap
117import org .apache .commons .lang3 .StringUtils
12- import org .apache .hc .core5 .http .impl .nio .{
13- DefaultHttpResponseFactory ,
14- DefaultHttpResponseParser ,
15- SessionBufferAccess
16- }
8+ import org .apache .hc .core5 .http .impl .nio .{DefaultHttpResponseFactory , DefaultHttpResponseParser , SessionBufferAccess }
179import org .apache .hc .core5 .http .{HttpException , HttpMessage , MessageHeaders }
1810import org .apache .tika .detect .EncodingDetector
1911import org .apache .tika .exception .TikaException
@@ -25,14 +17,10 @@ import org.mozilla.universalchardet.UniversalDetector
2517import org .slf4j .LoggerFactory
2618
2719import java .io .{ByteArrayInputStream , IOException , InputStream }
28- import java .nio .charset .{
29- Charset ,
30- IllegalCharsetNameException ,
31- StandardCharsets ,
32- UnsupportedCharsetException
33- }
20+ import java .nio .charset .{Charset , IllegalCharsetNameException , StandardCharsets , UnsupportedCharsetException }
3421import java .time .format .{DateTimeFormatter , DateTimeParseException }
3522import java .time .{LocalDateTime , ZoneId , ZonedDateTime }
23+ import java .util
3624import java .util .{Locale , UUID }
3725import scala .collection .mutable .ArrayBuffer
3826import scala .xml .SAXException
@@ -58,6 +46,8 @@ class WarcEntryParser(
5846 private val sessionInputBuffer = SessionBufferAccess .instance(4 * 1024 , 1024 )
5947 private val langEstimation = new LangEstimation ()
6048
49+ private val failures = new Object2IntOpenHashMap [String ]()
50+
6151 private def detectCharsetFromBytes (
6252 data : Array [Byte ],
6353 offset : Int ,
@@ -70,6 +60,11 @@ class WarcEntryParser(
7060 detector.getDetectedCharset
7161 }
7262
/**
 * Counts one occurrence of `reason` and yields `None`.
 *
 * Adds 1 to the entry stored for `reason` in the `failures` counter map, then returns `None`.
 * Because the declared result type is `None.type`, a call to this method can be used in place
 * of a bare `None` in any branch that must produce an `Option`.
 *
 * @param reason key under which this occurrence is tallied in `failures`
 * @return always `None`
 */
63+ private def skipDoc (reason : String ): None .type = {
64+ failures.addTo(reason, 1 )
65+ None
66+ }
67+
7368 def parseHttpHeader (bytes : Array [Byte ]): Option [(HttpMessage , Int )] = {
7469 sessionInputBuffer.clear()
7570 sessionInputBuffer.putBytes(bytes)
@@ -84,9 +79,9 @@ class WarcEntryParser(
8479 }
8580 Some ((resp, sessionInputBuffer.position()))
8681 } catch {
87- case _ : HttpException => None
88- case _ : IOException => None
89- case _ : IllegalArgumentException => None
82+ case _ : HttpException => skipDoc( " http.parse " )
83+ case _ : IOException => skipDoc( " http.ioex " )
84+ case _ : IllegalArgumentException => skipDoc( " http.ia " )
9085 }
9186 }
9287
@@ -95,8 +90,8 @@ class WarcEntryParser(
9590 try {
9691 Some (Charset .forName(cleanName))
9792 } catch {
98- case _ : IllegalCharsetNameException => None
99- case _ : UnsupportedCharsetException => None
93+ case _ : IllegalCharsetNameException => skipDoc( " charset.illegal " )
94+ case _ : UnsupportedCharsetException => skipDoc( " charset.unsupported " )
10095 }
10196 }
10297
@@ -163,22 +158,22 @@ class WarcEntryParser(
163158 if (c1.isDefined) {
164159 langEstimation.estimateLang(data, offset, c1.get) match {
165160 case ProbableLanguage (lang) => return Some ((c1.get, lang))
166- case EstimationFailure => return None
161+ case EstimationFailure => return skipDoc( " lang.c1 " )
167162 case _ => // do nothing
168163 }
169164 }
170165 val c2 = guessCharsetFromHeader(headers)
171166 if (c2.isDefined) {
172167 langEstimation.estimateLang(data, offset, c2.get) match {
173168 case ProbableLanguage (lang) => return Some ((c2.get, lang))
174- case EstimationFailure => return None
169+ case EstimationFailure => return skipDoc( " lang.c2 " )
175170 case _ => // do nothing
176171 }
177172 }
178173 val c3 = guessCharsetFromBytes(data, offset)
179174 langEstimation.estimateLang(data, offset, c3) match {
180175 case ProbableLanguage (lang) => Some ((c3, lang))
181- case _ => None
176+ case _ => skipDoc( " lang.c3 " )
182177 }
183178 }
184179
@@ -257,7 +252,7 @@ class WarcEntryParser(
257252 date = resolveEarliestDate(item.accessDate, header)
258253 )
259254 )
260- case _ => None
255+ case (_, lang) => skipDoc( s " lang. $lang " )
261256 }
262257 }
263258 }
0 commit comments