Skip to content

Commit e48831a

Browse files
Enable Html stripping (#478)
1 parent eba38a0 commit e48831a

File tree

8 files changed

+43
-7
lines changed

8 files changed

+43
-7
lines changed

core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,7 @@ trait RichMapFeature {
277277
* options are from the full entry or from the tokens
278278
* @param minLengthStdDev minimum standard deviation of the lengths of tokens in a text field for it to
279279
* be hashed instead of ignored
280+
* @param stripHtml indicates whether to strip HTML tags from the text or not before analyzing
280281
* @param others additional text features
281282
* @return result feature of type Vector
282283
*/
@@ -304,6 +305,7 @@ trait RichMapFeature {
304305
hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm,
305306
textLengthType: TextLengthType = SmartTextVectorizer.LengthType,
306307
minLengthStdDev: Double = SmartTextVectorizer.MinTextLengthStdDev,
308+
stripHtml: Boolean = TextTokenizer.StripHtml,
307309
others: Array[FeatureLike[TextMap]] = Array.empty
308310
): FeatureLike[OPVector] = {
309311
// scalastyle:on parameter.number
@@ -318,6 +320,7 @@ trait RichMapFeature {
318320
.setAutoDetectThreshold(autoDetectThreshold)
319321
.setDefaultLanguage(defaultLanguage)
320322
.setMinTokenLength(minTokenLength)
323+
.setStripHtml(stripHtml)
321324
.setToLowercase(toLowercase)
322325
.setTopK(topK)
323326
.setMinSupport(minSupport)
@@ -426,10 +429,9 @@ trait RichMapFeature {
426429
* @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or
427430
* failed to make a good enough prediction.
428431
* @param hashAlgorithm hash algorithm to use
429-
* @param tokenizeForLengths If true, then the length counts will be lengths of the tokens in the entries.
430-
* If false, then the length counts will be the lengths of the entire entries
431432
* @param minLengthStdDev minimum standard deviation of the lengths of tokens in a text field for it to
432433
* be hashed instead of ignored
434+
* @param stripHtml indicates whether to strip HTML tags from the text or not before analyzing
433435
* @param others additional text features
434436
* @return result feature of type Vector
435437
*/
@@ -457,6 +459,7 @@ trait RichMapFeature {
457459
hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm,
458460
textLengthType: TextLengthType = SmartTextVectorizer.LengthType,
459461
minLengthStdDev: Double = SmartTextVectorizer.MinTextLengthStdDev,
462+
stripHtml: Boolean = TextTokenizer.StripHtml,
460463
others: Array[FeatureLike[TextAreaMap]] = Array.empty
461464
): FeatureLike[OPVector] = {
462465
// scalastyle:on parameter.number
@@ -471,6 +474,7 @@ trait RichMapFeature {
471474
.setAutoDetectThreshold(autoDetectThreshold)
472475
.setDefaultLanguage(defaultLanguage)
473476
.setMinTokenLength(minTokenLength)
477+
.setStripHtml(stripHtml)
474478
.setToLowercase(toLowercase)
475479
.setTopK(topK)
476480
.setMinSupport(minSupport)

core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ trait RichTextFeature {
115115
* confidence greater than the threshold then defaultLanguage is used.
116116
* @param hashSpaceStrategy strategy to determine whether to use shared hash space for all included features
117117
* @param minTokenLength minimum token length, >= 1.
118+
* @param stripHtml indicates whether to strip HTML tags from the text or not before analyzing
118119
* @param trackNulls indicates whether or not to track null values in a separate column.
119120
* Since features may be combined into a shared hash space here, the null value
120121
* should be tracked separately
@@ -137,6 +138,7 @@ trait RichTextFeature {
137138
autoDetectLanguage: Boolean,
138139
minTokenLength: Int,
139140
toLowercase: Boolean,
141+
stripHtml: Boolean = TextTokenizer.StripHtml,
140142
trackNulls: Boolean = TransmogrifierDefaults.TrackNulls,
141143
trackTextLen: Boolean = TransmogrifierDefaults.TrackTextLen,
142144
hashWithIndex: Boolean = TransmogrifierDefaults.HashWithIndex,
@@ -153,7 +155,7 @@ trait RichTextFeature {
153155
// scalastyle:on parameter.number
154156
val tokenized = (f +: others).map(_.tokenize(
155157
languageDetector = languageDetector,
156-
analyzer = analyzer,
158+
analyzer = if (stripHtml) TextTokenizer.AnalyzerHtmlStrip else analyzer,
157159
autoDetectLanguage = autoDetectLanguage,
158160
autoDetectThreshold = autoDetectThreshold,
159161
defaultLanguage = defaultLanguage,
@@ -241,6 +243,7 @@ trait RichTextFeature {
241243
hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm,
242244
textLengthType: TextLengthType = SmartTextVectorizer.LengthType,
243245
minLengthStdDev: Double = SmartTextVectorizer.MinTextLengthStdDev,
246+
stripHtml: Boolean = TextTokenizer.StripHtml,
244247
others: Array[FeatureLike[T]] = Array.empty
245248
): FeatureLike[OPVector] = {
246249
// scalastyle:on parameter.number
@@ -254,6 +257,7 @@ trait RichTextFeature {
254257
.setAutoDetectThreshold(autoDetectThreshold)
255258
.setDefaultLanguage(defaultLanguage)
256259
.setMinTokenLength(minTokenLength)
260+
.setStripHtml(stripHtml)
257261
.setToLowercase(toLowercase)
258262
.setTopK(topK)
259263
.setMinSupport(minSupport)
@@ -375,7 +379,7 @@ trait RichTextFeature {
375379
minTokenLength: Int = TextTokenizer.MinTokenLength,
376380
toLowercase: Boolean = TextTokenizer.ToLowercase
377381
): FeatureLike[TextList] = {
378-
382+
// HTML stripping won't work here since a LuceneRegexTextAnalyzer is used as the analyzer
379383
tokenize(
380384
languageDetector = TextTokenizer.LanguageDetector,
381385
analyzer = new LuceneRegexTextAnalyzer(pattern, group),

core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ class SmartTextMapVectorizer[T <: OPMap[String]]
221221
.setMinTokenLength(getMinTokenLength)
222222
.setToLowercase(getToLowercase)
223223
.setTrackTextLen($(trackTextLen))
224+
.setStripHtml(getStripHtml)
224225
}
225226
}
226227

core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ class SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])(
148148
.setMinTokenLength(getMinTokenLength)
149149
.setToLowercase(getToLowercase)
150150
.setTrackTextLen($(trackTextLen))
151+
.setStripHtml(getStripHtml)
151152
}
152153

153154
private def makeVectorMetadata(smartTextParams: SmartTextVectorizerModelArgs): OpVectorMetadata = {

core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,18 +83,24 @@ trait TextTokenizerParams extends LanguageDetectionParams with TextMatchingParam
8383
def setMinTokenLength(value: Int): this.type = set(minTokenLength, value)
8484
def getMinTokenLength: Int = $(minTokenLength)
8585

86+
final val stripHtml =
87+
new BooleanParam(this, "stripHtml", "enable html stripping")
88+
def setStripHtml(value: Boolean): this.type = set(stripHtml, value)
89+
def getStripHtml: Boolean = $(stripHtml)
90+
8691
setDefault(
8792
minTokenLength -> TextTokenizer.MinTokenLength,
8893
toLowercase -> TextTokenizer.ToLowercase,
8994
autoDetectLanguage -> TextTokenizer.AutoDetectLanguage,
9095
autoDetectThreshold -> TextTokenizer.AutoDetectThreshold,
91-
defaultLanguage -> TextTokenizer.DefaultLanguage.entryName
96+
defaultLanguage -> TextTokenizer.DefaultLanguage.entryName,
97+
stripHtml -> TextTokenizer.StripHtml
9298
)
9399

94100
def tokenize(
95101
text: Text,
96102
languageDetector: LanguageDetector = TextTokenizer.LanguageDetector,
97-
analyzer: TextAnalyzer = TextTokenizer.Analyzer
103+
analyzer: TextAnalyzer = if (getStripHtml) TextTokenizer.AnalyzerHtmlStrip else TextTokenizer.Analyzer
98104
): TextTokenizerResult = TextTokenizer.tokenize(
99105
text = text,
100106
languageDetector = languageDetector,

core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ private[op] trait TransmogrifierDefaults {
5353
val NullString: String = OpVectorColumnMetadata.NullString
5454
val OtherString: String = OpVectorColumnMetadata.OtherString
5555
val DefaultNumOfFeatures: Int = 512
56-
val MaxNumOfFeatures: Int = 16384
56+
val MaxNumOfFeatures: Int = 1 << 17 // 2^17
5757
val DateListDefault: DateListPivot = DateListPivot.SinceLast
5858
val ReferenceDate: org.joda.time.DateTime = DateTimeUtils.now()
5959
val TopK: Int = 20

core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -772,6 +772,16 @@ class SmartTextMapVectorizerTest
772772
checkDerivedQuantities(res, "f2", Seq(4, 5, 5, 5, 3).map(_.toLong))
773773
}
774774

775+
it should "turn on stripHTML flag is equivalent to passing in a custom AnalyzerHtmlStrip" +
776+
"inside SmartTextMapVectorizer" in {
777+
val exampleHTML = "<body>Big ones, small <h1>ones</h1>, some as big as your head</body>".toText
778+
val tokensWithFlag = new SmartTextMapVectorizer()
779+
.setStripHtml(true).setInput(m1).tokenize(exampleHTML).tokens.value
780+
val tokensWithAnalyzer = new SmartTextMapVectorizer().setInput(m1)
781+
.tokenize(exampleHTML, analyzer = TextTokenizer.AnalyzerHtmlStrip).tokens.value
782+
tokensWithFlag should contain theSameElementsInOrderAs tokensWithAnalyzer
783+
}
784+
775785
private[op] def assertVectorLength(df: DataFrame, output: FeatureLike[OPVector],
776786
expectedLength: Int, textVectorizationMethod: TextVectorizationMethod): Unit = {
777787
val result = df.collect(output)

core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,4 +712,14 @@ class SmartTextVectorizerTest
712712
ts.lengthStdDev.isNaN shouldBe true
713713
}
714714

715+
it should "turn on stripHTML flag is equivalent to passing in a custom AnalyzerHtmlStrip" +
716+
"inside SmartTextVectorizer" in {
717+
val exampleHTML = "<body>Big ones, small <h1>ones</h1>, some as big as your head</body>".toText
718+
val tokensWithFlag = new SmartTextVectorizer()
719+
.setStripHtml(true).setInput(f1).tokenize(exampleHTML).tokens.value
720+
val tokensWithAnalyzer = new SmartTextVectorizer().setInput(f1)
721+
.tokenize(exampleHTML, analyzer = TextTokenizer.AnalyzerHtmlStrip).tokens.value
722+
tokensWithFlag should contain theSameElementsInOrderAs tokensWithAnalyzer
723+
}
724+
715725
}

0 commit comments

Comments
 (0)