Skip to content

Commit 8b7f0f5

Browse files
authored
Added created and modified dates to metadata extractor. (#23)
* Added created and modified dates to metadata extractor.
* Add a sample test for golden files using the new Resource API
1 parent 07db8c8 commit 8b7f0f5

File tree

4 files changed

+72
-3
lines changed

4 files changed

+72
-3
lines changed

src/main/kotlin/com/chimbori/crux/api/Fields.kt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ public object Fields {
88
public const val LANGUAGE: String = "language"
99
public const val DISPLAY: String = "display"
1010
public const val ORIENTATION: String = "orientation"
11+
public const val PUBLISHED_AT: String = "published_at"
12+
public const val MODIFIED_AT: String = "modified_at"
1113

1214
public const val THEME_COLOR_HEX: String = "theme-color-hex"
1315
public const val THEME_COLOR_HTML: String = "theme-color-html" // Named colors like "aliceblue"

src/main/kotlin/com/chimbori/crux/extractors/MetadataHelpers.kt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,16 @@ public fun Document.extractSiteName(): String? = (
4242
/**
 * Reads the `content` attribute of the elements matched by `meta[name=theme-color]`.
 *
 * The attribute text is passed through `nullIfBlank()`, a helper defined outside this view
 * (presumably it maps blank strings to `null` — confirm against its definition).
 *
 * @return the theme color text declared by the page, or `null` when no usable value is found.
 */
public fun Document.extractThemeColor(): String? =
4343
select("meta[name=theme-color]").attr("content").nullIfBlank()
4444

45+
/**
 * Looks up the document's publication timestamp in its `<meta>` tags.
 *
 * Two selectors are tried in order: `meta[itemprop=dateCreated]` first, then
 * `meta[property=article:published_time]` as a fallback when the first yields no usable value.
 * The chosen `content` value is then passed through `removeWhiteSpace()` and `nullIfBlank()`
 * (helpers defined outside this view). The result is returned as text; no date parsing or
 * validation happens here.
 *
 * @return the timestamp text taken from the page, or `null` if neither tag supplies one.
 */
public fun Document.extractPublishedAt(): String? = (
46+
select("meta[itemprop=dateCreated]").attr("content").nullIfBlank()
47+
?: select("meta[property=article:published_time]").attr("content").nullIfBlank()
48+
)?.removeWhiteSpace()?.nullIfBlank()
49+
50+
/**
 * Looks up the document's last-modified timestamp in its `<meta>` tags.
 *
 * Two selectors are tried in order: `meta[itemprop=dateModified]` first, then
 * `meta[property=article:modified_time]` as a fallback when the first yields no usable value.
 * The chosen `content` value is then passed through `removeWhiteSpace()` and `nullIfBlank()`
 * (helpers defined outside this view). The result is returned as text; no date parsing or
 * validation happens here.
 *
 * @return the timestamp text taken from the page, or `null` if neither tag supplies one.
 */
public fun Document.extractModifiedAt(): String? = (
51+
select("meta[itemprop=dateModified]").attr("content").nullIfBlank()
52+
?: select("meta[property=article:modified_time]").attr("content").nullIfBlank()
53+
)?.removeWhiteSpace()?.nullIfBlank()
54+
4555
public fun Document.extractKeywords(): List<String> =
4656
select("meta[name=keywords]").attr("content")
4757
.removeWhiteSpace()

src/main/kotlin/com/chimbori/crux/plugins/HtmlMetadataExtractor.kt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@ import com.chimbori.crux.api.Extractor
44
import com.chimbori.crux.api.Fields.AMP_URL
55
import com.chimbori.crux.api.Fields.BANNER_IMAGE_URL
66
import com.chimbori.crux.api.Fields.CANONICAL_URL
7+
import com.chimbori.crux.api.Fields.PUBLISHED_AT
78
import com.chimbori.crux.api.Fields.DESCRIPTION
89
import com.chimbori.crux.api.Fields.FEED_URL
910
import com.chimbori.crux.api.Fields.KEYWORDS_CSV
11+
import com.chimbori.crux.api.Fields.MODIFIED_AT
1012
import com.chimbori.crux.api.Fields.NEXT_PAGE_URL
1113
import com.chimbori.crux.api.Fields.PREVIOUS_PAGE_URL
1214
import com.chimbori.crux.api.Fields.SITE_NAME
@@ -18,10 +20,12 @@ import com.chimbori.crux.common.fetchFromUrl
1820
import com.chimbori.crux.common.isLikelyArticle
1921
import com.chimbori.crux.extractors.extractAmpUrl
2022
import com.chimbori.crux.extractors.extractCanonicalUrl
23+
import com.chimbori.crux.extractors.extractPublishedAt
2124
import com.chimbori.crux.extractors.extractDescription
2225
import com.chimbori.crux.extractors.extractFeedUrl
2326
import com.chimbori.crux.extractors.extractImageUrl
2427
import com.chimbori.crux.extractors.extractKeywords
28+
import com.chimbori.crux.extractors.extractModifiedAt
2529
import com.chimbori.crux.extractors.extractPaginationUrl
2630
import com.chimbori.crux.extractors.extractSiteName
2731
import com.chimbori.crux.extractors.extractThemeColor
@@ -61,6 +65,8 @@ public class HtmlMetadataExtractor(private val okHttpClient: OkHttpClient) : Ext
6165
DESCRIPTION to resourceToUse.document?.extractDescription(),
6266
SITE_NAME to resourceToUse.document?.extractSiteName(),
6367
THEME_COLOR_HEX to resourceToUse.document?.extractThemeColor(),
68+
PUBLISHED_AT to resourceToUse.document?.extractPublishedAt(),
69+
MODIFIED_AT to resourceToUse.document?.extractModifiedAt(),
6470
KEYWORDS_CSV to resourceToUse.document?.extractKeywords()?.joinToString(separator = ","),
6571
),
6672
urls = mapOf(

src/test/kotlin/com/chimbori/crux/articles/GoldenFilesTest.kt

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ package com.chimbori.crux.articles
22

33
import com.chimbori.crux.api.Fields.AMP_URL
44
import com.chimbori.crux.api.Fields.BANNER_IMAGE_URL
5+
import com.chimbori.crux.api.Fields.MODIFIED_AT
6+
import com.chimbori.crux.api.Fields.PUBLISHED_AT
57
import com.chimbori.crux.api.Fields.SITE_NAME
68
import com.chimbori.crux.api.Fields.TITLE
79
import com.chimbori.crux.common.assertContains
@@ -70,7 +72,27 @@ class GoldenFilesTest {
7072
}
7173

7274
@Test
73-
fun testBBC_AMP() {
75+
/**
 * Golden-file check for `bbc-amp.html` run through `extractFromFile`.
 *
 * Verifies the site name, title, resolved URL, banner image URL, the opening text of the
 * extracted article, and the published/modified timestamps. In this fixture both timestamps
 * are expected to carry the same value.
 */
fun testBBC_AMP_resourceApi() {
76+
extractFromFile("http://www.bbc.co.uk/news/amp/37341871".toHttpUrl(), "bbc-amp.html").run {
77+
assertEquals("BBC News", fields[SITE_NAME])
78+
assertEquals("Baby born on Mediterranean rescue ship", fields[TITLE])
79+
assertEquals("http://www.bbc.co.uk/news/amp/37341871".toHttpUrl(), url)
80+
assertEquals(
81+
"http://ichef.bbci.co.uk/news/999/cpsprodpb/146E6/production/_91168638_baby070012-9-20162-1photocreditalvawhitemsf.jpg".toHttpUrl(),
82+
urls[BANNER_IMAGE_URL]
83+
)
84+
assertStartsWith(
85+
"A Nigerian woman has given birth to a boy on board a rescue ship in the Mediterranean after being plucked from an overcrowded rubber dinghy.",
86+
article?.text()
87+
)
88+
89+
assertEquals("2016-09-12T14:31:25+00:00", fields[PUBLISHED_AT])
90+
assertEquals("2016-09-12T14:31:25+00:00", fields[MODIFIED_AT])
91+
}
92+
}
93+
94+
@Test
95+
fun testBBC_AMP_articleApi() {
7496
fromFile("http://www.bbc.co.uk/news/amp/37341871", "bbc-amp.html").run {
7597
assertEquals("BBC News", siteName)
7698
assertEquals("Baby born on Mediterranean rescue ship", title)
@@ -125,7 +147,19 @@ class GoldenFilesTest {
125147
}
126148

127149
@Test
128-
fun testBoingBoing() {
150+
/**
 * Golden-file check for `boingboing.html` run through `extractFromFile`.
 *
 * Verifies the opening text of the extracted article and that the published and modified
 * timestamps are read as two distinct values.
 */
fun testBoingBoing_resourceApi() {
151+
extractFromFile("http://www.boingboing.net/2010/08/18/dr-laura-criticism-o.html".toHttpUrl(), "boingboing.html").run {
152+
assertStartsWith(
153+
"Dr. Laura: criticism of me infringes my first amendment rights Dr. Laura Schlessinger is leaving radio to regain her \"first amendment\" rights on the internet.",
154+
article?.text()
155+
)
156+
assertEquals("2010-08-18T01:57:27+00:00", fields[PUBLISHED_AT])
157+
assertEquals("2010-08-18T09:43:25+00:00", fields[MODIFIED_AT])
158+
}
159+
}
160+
161+
@Test
162+
fun testBoingBoing_articleApi() {
129163
fromFile("http://www.boingboing.net/2010/08/18/dr-laura-criticism-o.html", "boingboing.html").run {
130164
assertStartsWith(
131165
"Dr. Laura: criticism of me infringes my first amendment rights Dr. Laura Schlessinger is leaving radio to regain her \"first amendment\" rights on the internet.",
@@ -1108,7 +1142,24 @@ class GoldenFilesTest {
11081142
}
11091143

11101144
@Test
1111-
fun testWallStreetJournal() {
1145+
/**
 * Golden-file check for `wsj.html` run through `extractFromFile`.
 *
 * Verifies the banner image URL, the opening text of the extracted article, and the
 * published/modified timestamps (expected here in the `...000Z` form found in the fixture).
 */
fun testWallStreetJournal_resourceApi() {
1146+
extractFromFile("http://www.wsj.com/articles/SB10001424052748704532204575397061414483040".toHttpUrl(), "wsj.html").run {
1147+
assertEquals(
1148+
"https://si.wsj.net/public/resources/images/OB-JO759_0814st_D_20100814143158.jpg".toHttpUrl(),
1149+
urls[BANNER_IMAGE_URL]
1150+
)
1151+
assertStartsWith(
1152+
"The Obama administration has paid out less than a third of the nearly $230 billion",
1153+
article?.text()
1154+
)
1155+
1156+
assertEquals("2010-08-14T15:14:00.000Z", fields[PUBLISHED_AT])
1157+
assertEquals("2010-08-16T04:01:00.000Z", fields[MODIFIED_AT])
1158+
}
1159+
}
1160+
1161+
@Test
1162+
fun testWallStreetJournal_articleApi() {
11121163
fromFile("http://www.wsj.com/articles/SB10001424052748704532204575397061414483040", "wsj.html").run {
11131164
assertStartsWith(
11141165
"The Obama administration has paid out less than a third of the nearly $230 billion",

0 commit comments

Comments
 (0)