Skip to content

Commit 5421eec

Browse files
committed
Added Swedish as a language and added Expressen as a publisher.
1 parent 716b827 commit 5421eec

File tree

7 files changed

+184
-0
lines changed

7 files changed

+184
-0
lines changed

docs/supported_publishers.md

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2766,6 +2766,44 @@
27662766
</table>
27672767

27682768

2769+
## SE-Publishers
2770+
2771+
<table class="publishers se">
2772+
<thead>
2773+
<tr>
2774+
<th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
2775+
<th>Name&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
2776+
<th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
2777+
<th>Languages</th>
2778+
<th>Missing&#160;Attributes</th>
2779+
<th>Deprecated&#160;Attributes</th>
2780+
<th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
2781+
</tr>
2782+
</thead>
2783+
<tbody>
2784+
<tr>
2785+
<td>
2786+
<code>Expressen</code>
2787+
</td>
2788+
<td>
2789+
<div>Expressen</div>
2790+
</td>
2791+
<td>
2792+
<a href="https://www.expressen.se/">
2793+
<span>www.expressen.se</span>
2794+
</a>
2795+
</td>
2796+
<td>
2797+
<code>sv</code>
2798+
</td>
2799+
<td>&#160;</td>
2800+
<td>&#160;</td>
2801+
<td>&#160;</td>
2802+
</tr>
2803+
</tbody>
2804+
</table>
2805+
2806+
27692807
## TR-Publishers
27702808

27712809
<table class="publishers tr">

src/fundus/publishers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from fundus.publishers.pt import PT
3333
from fundus.publishers.py import PY
3434
from fundus.publishers.ru import RU
35+
from fundus.publishers.se import SE
3536
from fundus.publishers.tr import TR
3637
from fundus.publishers.tw import TW
3738
from fundus.publishers.tz import TZ
@@ -101,6 +102,7 @@ class PublisherCollection(metaclass=PublisherCollectionMeta):
101102
pt = PT
102103
py = PY
103104
ru = RU
105+
se = SE
104106
tr = TR
105107
tw = TW
106108
tz = TZ
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from fundus.publishers.base_objects import PublisherGroup,Publisher
2+
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap
3+
4+
from .expressen import ExpressenParser
5+
6+
class SE(metaclass=PublisherGroup):
    # Default language code for this group ("sv").
    # NOTE(review): presumably applied to publishers that do not declare
    # their own languages -- confirm against PublisherGroup.
    default_language = "sv"

    # Expressen is crawled through three RSS feeds (nyheter, sport, noje)
    # and the site-wide sitemap; the order of the sources is kept as declared.
    Expressen = Publisher(
        parser=ExpressenParser,
        name="Expressen",
        domain="https://www.expressen.se/",
        sources=[
            RSSFeed("https://feeds.expressen.se/nyheter/"),
            RSSFeed("https://feeds.expressen.se/sport/"),
            RSSFeed("https://feeds.expressen.se/noje/"),
            Sitemap("https://www.expressen.se/sitemap.xml"),
        ],
    )
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import datetime
2+
from typing import List, Optional
3+
4+
from lxml.cssselect import CSSSelector
5+
from lxml.etree import XPath
6+
7+
from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
8+
from fundus.parser.data import Image
9+
from fundus.parser.utility import (
10+
extract_article_body_with_selector,
11+
generic_author_parsing,
12+
generic_date_parsing,
13+
image_extraction,
14+
generic_topic_parsing,
15+
)
16+
17+
18+
class ExpressenParser(ParserProxy):
    """Parser proxy bundling the versioned parsers for Expressen articles."""

    class V1(BaseParser):
        """Version 1 parser.

        Body and images are read from the parsed document; title, publishing
        date, authors and topics come from the precomputed meta / linked-data
        entries.
        """

        # Selectors are built once when the class is created and then reused
        # by every attribute below.
        _paragraph_selector = CSSSelector("div.article__body-text p")
        _summary_selector = CSSSelector("div.article__preamble")
        _image_selector = XPath("//figure//img")
        _upper_boundary_selector = CSSSelector("div.article__body-text")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Extract the article body using the summary and paragraph selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                summary_selector=self._summary_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``og:title`` meta entry, or ``None`` when it is absent."""
            meta = self.precomputed.meta
            return meta.get("og:title")

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the ``datePublished`` value found in the linked data."""
            published = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(published)

        @attribute
        def authors(self) -> List[str]:
            """Parse the ``author`` value found in the linked data."""
            raw_authors = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(raw_authors)

        @attribute
        def images(self) -> List[Image]:
            """Extract images matched by ``//figure//img``.

            ``div.article__body-text`` is passed as the upper boundary selector.
            """
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                image_selector=self._image_selector,
                upper_boundary_selector=self._upper_boundary_selector,
            )

        @attribute
        def topics(self) -> List[str]:
            """Parse the ``articleSection`` value found in the linked data."""
            sections = self.precomputed.ld.bf_search("articleSection")
            return generic_topic_parsing(sections)
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
{
2+
"V1": {
3+
"authors": [
4+
"stefan soxbo"
5+
],
6+
"body": {
7+
"summary": [
8+
"Gymnasieeleven Carson Ryan ”misstogs för en ekorre” när han var ute och jagade.17-åringen sköts i bakhuvudet och dog, skriver People.Över en halv miljon kronor har samlats in för att hjälpa familjen.Senaste nyhetsklippen från Expressen."
9+
],
10+
"sections": [
11+
{
12+
"headline": [],
13+
"paragraphs": [
14+
"17-årige amerikanen Carson Ryan gick sista året på Washington High School och spelade i skolans fotbollslag.",
15+
"– Carson var en riktig tävlingsmänniska i allt han gjorde. Han älskade fiske och att vara med sina vänner. Men viktigast av allt så var Carson en person med en otroligt stark tro, säger lagets assisterande tränare Nic Williams.",
16+
"Under lördagen var Carson ute på en ekorrjakt i delstaten Iowa. Då blev han av misstag skjuten av en annan jägare, enligt Iowas naturresursmyndighet.",
17+
"”Han misstogs för en ekorre av en medlem i sitt jaktlag och träffades i bakhuvudet”, skriver myndigheten.",
18+
"Carson fördes till sjukhus.",
19+
"”Han avled senare av sina skador”, skriver myndigheten.",
20+
"Händelsen utreds.",
21+
"Skolan höll en minnesstund för Carson på söndagskvällen. Elever uppmanades att bära skolans svarta och orange färger på måndagen till minne av Carson.",
22+
"En insamling har startats för att hjälpa familjen och 600 000 kronor har donerats.",
23+
"”Carson var en son, en vän och ett ljus för alla som hade förmånen att känna honom. Hans vänlighet, humor och genuina personlighet berörde otaliga liv. Hans död lämnar ett omätbart tomrum”, står det i insamlingens beskrivning."
24+
]
25+
}
26+
]
27+
},
28+
"images": [
29+
{
30+
"versions": [
31+
{
32+
"url": "https://static.bonniernews.se/images/1c/25/1c258a8ea4d549588f1a6d80400f4982/16x9/640.png",
33+
"query_width": null,
34+
"size": {
35+
"width": 640,
36+
"height": 0
37+
},
38+
"type": "image/png"
39+
},
40+
{
41+
"url": "https://static.bonniernews.se/images/1c/25/1c258a8ea4d549588f1a6d80400f4982/16x9/741.png",
42+
"query_width": null,
43+
"size": {
44+
"width": 741,
45+
"height": 0
46+
},
47+
"type": "image/png"
48+
}
49+
],
50+
"is_cover": true,
51+
"description": null,
52+
"caption": "Carson Ryan. Foto: Go Fund Me",
53+
"authors": [
54+
"Go Fund Me"
55+
],
56+
"position": 907
57+
}
58+
],
59+
"publishing_date": "2025-10-01 00:11:03.035000+00:00",
60+
"title": "Därför dog 17-åringen:\n”Misstogs för ekorre”",
61+
"topics": [
62+
"nyheter"
63+
]
64+
}
65+
}
48.4 KB
Binary file not shown.
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"Expressen_2025_10_20.html.gz": {
3+
"url": "https://www.expressen.se/nyheter/varlden/17-aringens-dodsorsak-misstogs-for-ekorre/",
4+
"crawl_date": "2025-10-20 23:46:55.070865"
5+
}
6+
}

0 commit comments

Comments
 (0)