11from fundus .publishers .base_objects import Publisher , PublisherGroup
2- from fundus .scraping .filter import inverse , lor , regex_filter
2+ from fundus .scraping .filter import inverse , regex_filter
33from fundus .scraping .url import Sitemap
44
55from .the_portugal_news import ThePortugalNewsParser
@@ -12,12 +12,85 @@ class PT(metaclass=PublisherGroup):
1212 name = "The Portugal News" ,
1313 domain = "https://www.theportugalnews.com/" ,
1414 parser = ThePortugalNewsParser ,
15- # There are more languages in the sitemap that could be added in the future
1615 sources = [
1716 Sitemap (
18- "https://www.theportugalnews.com/sitemap.xml" ,
19- sitemap_filter = lor ( regex_filter ( "category-pages" ), inverse (regex_filter ("/en/" ) )),
17+ "https://www.theportugalnews.com/sitemap-news .xml" ,
18+ sitemap_filter = inverse (regex_filter ("news-en.xml" )),
2019 languages = {"en" },
2120 ),
21+ Sitemap (
22+ "https://www.theportugalnews.com/sitemap-news.xml" ,
23+ sitemap_filter = inverse (regex_filter ("news-de.xml" )),
24+ languages = {"de" },
25+ ),
26+ Sitemap (
27+ "https://www.theportugalnews.com/sitemap-news.xml" ,
28+ sitemap_filter = inverse (regex_filter ("news-nl.xml" )),
29+ languages = {"nl" },
30+ ),
31+ Sitemap (
32+ "https://www.theportugalnews.com/sitemap-news.xml" ,
33+ sitemap_filter = inverse (regex_filter ("news-fr.xml" )),
34+ languages = {"fr" },
35+ ),
36+ Sitemap (
37+ "https://www.theportugalnews.com/sitemap-news.xml" ,
38+ sitemap_filter = inverse (regex_filter ("news-es.xml" )),
39+ languages = {"es" },
40+ ),
41+ Sitemap (
42+ "https://www.theportugalnews.com/sitemap-news.xml" ,
43+ sitemap_filter = inverse (regex_filter ("news-it.xml" )),
44+ languages = {"it" },
45+ ),
46+ Sitemap (
47+ "https://www.theportugalnews.com/sitemap-news.xml" ,
48+ sitemap_filter = inverse (regex_filter ("news-se.xml" )),
49+ languages = {"se" },
50+ ),
51+ Sitemap (
52+ "https://www.theportugalnews.com/sitemap-news.xml" ,
53+ sitemap_filter = inverse (regex_filter ("news-ru.xml" )),
54+ languages = {"ru" },
55+ ),
56+ Sitemap (
57+ "https://www.theportugalnews.com/sitemap-news.xml" ,
58+ sitemap_filter = inverse (regex_filter ("news-zh.xml" )),
59+ languages = {"zh" },
60+ ),
61+ Sitemap (
62+ "https://www.theportugalnews.com/sitemap-news.xml" ,
63+ sitemap_filter = inverse (regex_filter ("news-tr.xml" )),
64+ languages = {"tr" },
65+ ),
66+ Sitemap (
67+ "https://www.theportugalnews.com/sitemap-news.xml" ,
68+ sitemap_filter = inverse (regex_filter ("news-pt.xml" )),
69+ ),
70+ Sitemap (
71+ "https://www.theportugalnews.com/sitemap-news.xml" ,
72+ sitemap_filter = inverse (regex_filter ("news-ar.xml" )),
73+ languages = {"ar" },
74+ ),
75+ Sitemap (
76+ "https://www.theportugalnews.com/sitemap-news.xml" ,
77+ sitemap_filter = inverse (regex_filter ("news-he.xml" )),
78+ languages = {"he" },
79+ ),
80+ Sitemap (
81+ "https://www.theportugalnews.com/sitemap-news.xml" ,
82+ sitemap_filter = inverse (regex_filter ("news-pl.xml" )),
83+ languages = {"pl" },
84+ ),
85+ Sitemap (
86+ "https://www.theportugalnews.com/sitemap-news.xml" ,
87+ sitemap_filter = inverse (regex_filter ("news-fi.xml" )),
88+ languages = {"fi" },
89+ ),
90+ Sitemap (
91+ "https://www.theportugalnews.com/sitemap-news.xml" ,
92+ sitemap_filter = inverse (regex_filter ("news-br.xml" )),
93+ languages = {"br" },
94+ ),
2295 ],
2396 )
0 commit comments