Skip to content

Commit db238cc

Browse files
committed
Handle mismatched HTML tags in content parsing and improve text extraction logic
1 parent 1859bf1 commit db238cc

File tree

17 files changed: 473 additions and 339 deletions.

rssparser/src/androidMain/kotlin/com/prof18/rssparser/internal/XmlPullParser+.kt

Lines changed: 35 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,42 @@
11
package com.prof18.rssparser.internal
22

33
import org.xmlpull.v1.XmlPullParser
4+
import org.xmlpull.v1.XmlPullParserException
45

5-
internal fun XmlPullParser.nextTrimmedText(): String? = this.nextText()?.trim()
6+
internal fun XmlPullParser.nextTrimmedText(): String? {
7+
return try {
8+
this.nextText()?.trim()
9+
} catch (_: XmlPullParserException) {
10+
// Fallback for content that nextText() rejects, e.g. HTML tags inside the element or mismatched tag cases like <em>...</EM>
11+
// Instead of failing, walk the following parser events manually and collect every TEXT/CDSECT chunk (nested tag markup itself is not kept)
12+
// Note: nextText() may have consumed some text before throwing; that text is not recovered here, collection starts empty at the parser's current event
13+
14+
val result = StringBuilder()
15+
var depth = 0
16+
var eventType = this.eventType
17+
18+
// Process events from the current position until END_DOCUMENT or an END_TAG seen at depth 0
19+
while (eventType != XmlPullParser.END_DOCUMENT) {
20+
when (eventType) {
21+
XmlPullParser.TEXT, XmlPullParser.CDSECT -> {
22+
result.append(this.text)
23+
}
24+
XmlPullParser.START_TAG -> {
25+
depth++ // entering a nested tag; its matching END_TAG must not end the loop
26+
}
27+
XmlPullParser.END_TAG -> {
28+
if (depth == 0) {
29+
// An END_TAG at depth 0 is treated as the end tag of the original element; stop without advancing past it
30+
break
31+
}
32+
depth--
33+
}
34+
}
35+
eventType = this.next()
36+
}
37+
result.toString().trim().takeIf { it.isNotEmpty() } // null when nothing but whitespace was collected
38+
}
39+
}
640

741
internal fun XmlPullParser.contains(key: RssKeyword): Boolean {
842
return this.name.equals(key.value, ignoreCase = true)

rssparser/src/androidMain/kotlin/com/prof18/rssparser/internal/atom/AtomParser.kt

Lines changed: 1 addition & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -97,12 +97,7 @@ internal fun CoroutineScope.extractAtomContent(
9797

9898
xmlPullParser.contains(AtomKeyword.ENTRY_CONTENT) -> {
9999
if (insideItem) {
100-
val content = try {
101-
xmlPullParser.nextTrimmedText()
102-
} catch (_: XmlPullParserException) {
103-
// If there's some html not escaped, the parsing is going to fail
104-
null
105-
}
100+
val content = xmlPullParser.nextTrimmedText()
106101
channelFactory.articleBuilder.content(content)
107102
channelFactory.setImageFromContent(content)
108103
}

rssparser/src/appleMain/kotlin/com/prof18/rssparser/internal/FeedHandler.kt

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,12 @@ import com.prof18.rssparser.model.RssChannel
44

55
internal interface FeedHandler {
66
fun didStartElement(startElement: String, attributes: Map<Any?, *>)
7-
fun foundCharacters(characters: String)
8-
fun didEndElement(endElement: String)
7+
fun didEndElement(endElement: String, text: String)
98
fun buildRssChannel(): RssChannel
9+
10+
/**
11+
* Determines if the text builder should be cleared when starting a new element.
12+
* Returns true for known RSS/Atom/RDF tags, false for HTML tags within content.
13+
*/
14+
fun shouldClearTextBuilder(qName: String): Boolean
1015
}

rssparser/src/appleMain/kotlin/com/prof18/rssparser/internal/IosXmlParser.kt

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -68,6 +68,7 @@ private class NSXMLParserDelegate(
6868
) : NSObject(), NSXMLParserDelegateProtocol {
6969

7070
private var feedHandler: FeedHandler? = null
71+
private val textBuilder: StringBuilder = StringBuilder()
7172

7273
override fun parser(
7374
parser: NSXMLParser,
@@ -95,13 +96,18 @@ private class NSXMLParserDelegate(
9596
"The provided XML is not supported. Only RSS and Atom feeds are supported",
9697
)
9798
}
99+
// Clear text builder only for known RSS/Atom/RDF tags
100+
// Don't clear for HTML tags within content to handle mismatched tag cases
101+
if (feedHandler?.shouldClearTextBuilder(didStartElement) == true) {
102+
textBuilder.clear()
103+
}
98104
feedHandler?.didStartElement(didStartElement, attributes)
99105
}
100106
}
101107
}
102108

103109
override fun parser(parser: NSXMLParser, foundCharacters: String) {
104-
feedHandler?.foundCharacters(foundCharacters)
110+
textBuilder.append(foundCharacters)
105111
}
106112

107113
override fun parser(
@@ -110,7 +116,8 @@ private class NSXMLParserDelegate(
110116
namespaceURI: String?,
111117
qualifiedName: String?,
112118
) {
113-
feedHandler?.didEndElement(didEndElement)
119+
val text = textBuilder.toString().trim()
120+
feedHandler?.didEndElement(didEndElement, text)
114121
}
115122

116123
override fun parserDidEndDocument(parser: NSXMLParser) {

rssparser/src/appleMain/kotlin/com/prof18/rssparser/internal/atom/AtomFeedHandler.kt

Lines changed: 87 additions & 70 deletions
Original file line number | Diff line number | Diff line change
@@ -10,19 +10,13 @@ internal class AtomFeedHandler(
1010
private val baseFeedUrl: String?,
1111
) : FeedHandler {
1212

13-
private var currentElement: String? = null
14-
1513
private var channelFactory = ChannelFactory()
16-
private var itemData: MutableMap<String, String> = mutableMapOf()
17-
private var channelData: MutableMap<String, String> = mutableMapOf()
1814

1915
private var isInsideItem = false
2016
private var isInsideChannel = true
2117

2218
override fun didStartElement(startElement: String, attributes: Map<Any?, *>) {
23-
currentElement = startElement
24-
25-
when (currentElement) {
19+
when (startElement) {
2620
AtomKeyword.ATOM.value -> isInsideChannel = true
2721
AtomKeyword.ENTRY_ITEM.value -> isInsideItem = true
2822

@@ -87,86 +81,109 @@ internal class AtomFeedHandler(
8781
}
8882
}
8983

90-
override fun foundCharacters(characters: String) {
91-
val element = currentElement ?: return
92-
93-
when {
94-
isInsideItem -> itemData[element] = (itemData[element].orEmpty()) + characters
95-
isInsideChannel -> channelData[element] = (channelData[element].orEmpty()) + characters
96-
}
97-
}
98-
99-
override fun didEndElement(endElement: String) {
84+
override fun didEndElement(endElement: String, text: String) {
10085
when (endElement) {
10186
AtomKeyword.ATOM.value -> {
102-
channelFactory.channelImageBuilder.url(
103-
channelData[AtomKeyword.ICON.value]?.trim()
104-
)
105-
channelFactory.channelBuilder.lastBuildDate(
106-
channelData[AtomKeyword.UPDATED.value]?.trim()
107-
)
108-
channelFactory.channelBuilder.description(
109-
channelData[AtomKeyword.SUBTITLE.value]?.trim()
110-
)
111-
channelFactory.channelBuilder.title(
112-
channelData[AtomKeyword.TITLE.value]?.trim()
113-
)
114-
11587
isInsideChannel = false
11688
}
11789

11890
AtomKeyword.ENTRY_ITEM.value -> {
119-
val pubDate = if (itemData[AtomKeyword.ENTRY_PUBLISHED.value] != null) {
120-
itemData[AtomKeyword.ENTRY_PUBLISHED.value]?.trim()
121-
} else {
122-
itemData[AtomKeyword.UPDATED.value]?.trim()
123-
}
124-
channelFactory.articleBuilder.pubDate(
125-
pubDate
126-
)
127-
channelFactory.articleBuilder.title(
128-
itemData[AtomKeyword.TITLE.value]?.trim()
129-
)
130-
channelFactory.articleBuilder.author(
131-
itemData[AtomKeyword.ENTRY_AUTHOR.value]?.trim()
132-
)
133-
channelFactory.articleBuilder.guid(
134-
itemData[AtomKeyword.ENTRY_GUID.value]?.trim()
135-
)
136-
137-
val content = itemData[AtomKeyword.ENTRY_CONTENT.value]?.trim()
138-
channelFactory.articleBuilder.content(content)
139-
channelFactory.setImageFromContent(content)
140-
141-
val description = itemData[AtomKeyword.ENTRY_DESCRIPTION.value]?.trim()
142-
channelFactory.articleBuilder.description(description)
143-
channelFactory.setImageFromContent(description)
144-
145-
val category = itemData[AtomKeyword.ENTRY_CATEGORY.value]?.trim()
146-
if (!category.isNullOrEmpty()) {
147-
channelFactory.articleBuilder.addCategory(category)
91+
channelFactory.buildArticle()
92+
isInsideItem = false
93+
}
94+
95+
AtomKeyword.ICON.value -> {
96+
if (isInsideChannel) {
97+
channelFactory.channelImageBuilder.url(text)
14898
}
99+
}
149100

150-
// Youtube
101+
AtomKeyword.ENTRY_PUBLISHED.value -> {
102+
if (isInsideItem) {
103+
channelFactory.articleBuilder.pubDate(text)
104+
}
105+
}
151106

152-
val channelId = itemData[AtomKeyword.YOUTUBE_CHANNEL_ID.value]?.trim()
153-
channelFactory.youtubeChannelDataBuilder.channelId(channelId)
107+
AtomKeyword.UPDATED.value -> {
108+
when {
109+
isInsideItem -> channelFactory.articleBuilder.pubDateIfNull(text)
110+
isInsideChannel -> channelFactory.channelBuilder.lastBuildDate(text)
111+
}
112+
}
154113

155-
val videoId = itemData[AtomKeyword.YOUTUBE_VIDEO_ID.value]?.trim()
156-
channelFactory.youtubeItemDataBuilder.videoId(videoId)
114+
AtomKeyword.SUBTITLE.value -> {
115+
if (isInsideChannel) {
116+
channelFactory.channelBuilder.description(text)
117+
}
118+
}
157119

158-
val title = itemData[AtomKeyword.YOUTUBE_MEDIA_GROUP_TITLE.value]?.trim()
159-
channelFactory.youtubeItemDataBuilder.title(title)
120+
AtomKeyword.TITLE.value -> {
121+
when {
122+
isInsideItem -> channelFactory.articleBuilder.title(text)
123+
isInsideChannel -> channelFactory.channelBuilder.title(text)
124+
}
125+
}
160126

161-
val videoDescription = itemData[AtomKeyword.YOUTUBE_MEDIA_GROUP_DESCRIPTION.value]?.trim()
162-
channelFactory.youtubeItemDataBuilder.description(videoDescription)
127+
AtomKeyword.ENTRY_AUTHOR.value -> {
128+
if (isInsideItem) {
129+
channelFactory.articleBuilder.author(text)
130+
}
131+
}
163132

164-
channelFactory.buildArticle()
165-
itemData.clear()
133+
AtomKeyword.ENTRY_GUID.value -> {
134+
if (isInsideItem) {
135+
channelFactory.articleBuilder.guid(text)
136+
}
137+
}
138+
139+
AtomKeyword.ENTRY_CONTENT.value -> {
140+
if (isInsideItem) {
141+
channelFactory.articleBuilder.content(text)
142+
channelFactory.setImageFromContent(text)
143+
}
144+
}
145+
146+
AtomKeyword.ENTRY_DESCRIPTION.value -> {
147+
if (isInsideItem) {
148+
channelFactory.articleBuilder.description(text)
149+
channelFactory.setImageFromContent(text)
150+
}
151+
}
152+
153+
AtomKeyword.ENTRY_CATEGORY.value -> {
154+
if (isInsideItem && text.isNotEmpty()) {
155+
channelFactory.articleBuilder.addCategory(text)
156+
}
157+
}
158+
159+
AtomKeyword.YOUTUBE_CHANNEL_ID.value -> {
160+
channelFactory.youtubeChannelDataBuilder.channelId(text)
161+
}
162+
163+
AtomKeyword.YOUTUBE_VIDEO_ID.value -> {
164+
if (isInsideItem) {
165+
channelFactory.youtubeItemDataBuilder.videoId(text)
166+
}
167+
}
168+
169+
AtomKeyword.YOUTUBE_MEDIA_GROUP_TITLE.value -> {
170+
if (isInsideItem) {
171+
channelFactory.youtubeItemDataBuilder.title(text)
172+
}
173+
}
174+
175+
AtomKeyword.YOUTUBE_MEDIA_GROUP_DESCRIPTION.value -> {
176+
if (isInsideItem) {
177+
channelFactory.youtubeItemDataBuilder.description(text)
178+
}
166179
}
167180
}
168181
}
169182

170183
override fun buildRssChannel(): RssChannel =
171184
channelFactory.build()
185+
186+
override fun shouldClearTextBuilder(qName: String): Boolean {
187+
return AtomKeyword.isValid(qName)
188+
}
172189
}

0 commit comments

Comments
 (0)