Skip to content

Commit 0cd07c0

Browse files
authored
Merge pull request #790 from flairNLP/update-sources-pt
Update sources ThePortugalNews
2 parents d03abd7 + ffb5a25 commit 0cd07c0

File tree

2 files changed

+92
-4
lines changed

2 files changed

+92
-4
lines changed

docs/supported_publishers.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2661,7 +2661,22 @@
26612661
</a>
26622662
</td>
26632663
<td>
2664+
<code>ar</code>
2665+
<code>br</code>
2666+
<code>de</code>
26642667
<code>en</code>
2668+
<code>es</code>
2669+
<code>fi</code>
2670+
<code>fr</code>
2671+
<code>he</code>
2672+
<code>it</code>
2673+
<code>nl</code>
2674+
<code>pl</code>
2675+
<code>pt</code>
2676+
<code>ru</code>
2677+
<code>se</code>
2678+
<code>tr</code>
2679+
<code>zh</code>
26652680
</td>
26662681
<td>
26672682
<code>topics</code>

src/fundus/publishers/pt/__init__.py

Lines changed: 77 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from fundus.publishers.base_objects import Publisher, PublisherGroup
2-
from fundus.scraping.filter import inverse, lor, regex_filter
2+
from fundus.scraping.filter import inverse, regex_filter
33
from fundus.scraping.url import Sitemap
44

55
from .the_portugal_news import ThePortugalNewsParser
@@ -12,12 +12,85 @@ class PT(metaclass=PublisherGroup):
1212
name="The Portugal News",
1313
domain="https://www.theportugalnews.com/",
1414
parser=ThePortugalNewsParser,
15-
# There are more languages in the sitemap that could be added in the future
1615
sources=[
1716
Sitemap(
18-
"https://www.theportugalnews.com/sitemap.xml",
19-
sitemap_filter=lor(regex_filter("category-pages"), inverse(regex_filter("/en/"))),
17+
"https://www.theportugalnews.com/sitemap-news.xml",
18+
sitemap_filter=inverse(regex_filter("news-en.xml")),
2019
languages={"en"},
2120
),
21+
Sitemap(
22+
"https://www.theportugalnews.com/sitemap-news.xml",
23+
sitemap_filter=inverse(regex_filter("news-de.xml")),
24+
languages={"de"},
25+
),
26+
Sitemap(
27+
"https://www.theportugalnews.com/sitemap-news.xml",
28+
sitemap_filter=inverse(regex_filter("news-nl.xml")),
29+
languages={"nl"},
30+
),
31+
Sitemap(
32+
"https://www.theportugalnews.com/sitemap-news.xml",
33+
sitemap_filter=inverse(regex_filter("news-fr.xml")),
34+
languages={"fr"},
35+
),
36+
Sitemap(
37+
"https://www.theportugalnews.com/sitemap-news.xml",
38+
sitemap_filter=inverse(regex_filter("news-es.xml")),
39+
languages={"es"},
40+
),
41+
Sitemap(
42+
"https://www.theportugalnews.com/sitemap-news.xml",
43+
sitemap_filter=inverse(regex_filter("news-it.xml")),
44+
languages={"it"},
45+
),
46+
Sitemap(
47+
"https://www.theportugalnews.com/sitemap-news.xml",
48+
sitemap_filter=inverse(regex_filter("news-se.xml")),
49+
languages={"se"},
50+
),
51+
Sitemap(
52+
"https://www.theportugalnews.com/sitemap-news.xml",
53+
sitemap_filter=inverse(regex_filter("news-ru.xml")),
54+
languages={"ru"},
55+
),
56+
Sitemap(
57+
"https://www.theportugalnews.com/sitemap-news.xml",
58+
sitemap_filter=inverse(regex_filter("news-zh.xml")),
59+
languages={"zh"},
60+
),
61+
Sitemap(
62+
"https://www.theportugalnews.com/sitemap-news.xml",
63+
sitemap_filter=inverse(regex_filter("news-tr.xml")),
64+
languages={"tr"},
65+
),
66+
Sitemap(
67+
"https://www.theportugalnews.com/sitemap-news.xml",
68+
sitemap_filter=inverse(regex_filter("news-pt.xml")),
69+
),
70+
Sitemap(
71+
"https://www.theportugalnews.com/sitemap-news.xml",
72+
sitemap_filter=inverse(regex_filter("news-ar.xml")),
73+
languages={"ar"},
74+
),
75+
Sitemap(
76+
"https://www.theportugalnews.com/sitemap-news.xml",
77+
sitemap_filter=inverse(regex_filter("news-he.xml")),
78+
languages={"he"},
79+
),
80+
Sitemap(
81+
"https://www.theportugalnews.com/sitemap-news.xml",
82+
sitemap_filter=inverse(regex_filter("news-pl.xml")),
83+
languages={"pl"},
84+
),
85+
Sitemap(
86+
"https://www.theportugalnews.com/sitemap-news.xml",
87+
sitemap_filter=inverse(regex_filter("news-fi.xml")),
88+
languages={"fi"},
89+
),
90+
Sitemap(
91+
"https://www.theportugalnews.com/sitemap-news.xml",
92+
sitemap_filter=inverse(regex_filter("news-br.xml")),
93+
languages={"br"},
94+
),
2295
],
2396
)

0 commit comments

Comments
 (0)