Skip to content

Commit 0c47617

Browse files
committed
new arte crawler
zdf: handle null-elements add topic to deep search
2 parents bc9a81b + 388cea7 commit 0c47617

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+1284
-1705
lines changed

build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ sourceCompatibility = JavaVersion.VERSION_17
2727
targetCompatibility = JavaVersion.VERSION_17
2828
group = 'de.mediathekview'
2929
archivesBaseName = "MServer"
30-
version = '3.1.272'
30+
version = '3.1.273'
3131

3232
def jarName = 'MServer.jar'
3333
def mainClass = 'mServer.Main'

src/main/java/de/mediathekview/mlib/Const.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,11 @@ public class Const {
4444
public static final String ARD = "ARD";
4545
public static final String ARD_ALPHA = "ARD-alpha";
4646
public static final String ARTE_DE = "ARTE.DE";
47+
public static final String ARTE_EN = "ARTE.EN";
48+
public static final String ARTE_ES = "ARTE.ES";
4749
public static final String ARTE_FR = "ARTE.FR";
50+
public static final String ARTE_IT = "ARTE.IT";
51+
public static final String ARTE_PL = "ARTE.PL";
4852
public static final String BR = "BR";
4953
public static final String DW = "DW";
5054
public static final String HR = "HR";

src/main/java/mServer/crawler/FilmeSuchen.java

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
import de.mediathekview.mlib.tool.Log;
2828
import mServer.crawler.sender.*;
2929
import mServer.crawler.sender.ard.ArdCrawler;
30-
import mServer.crawler.sender.arte.MediathekArte;
30+
import mServer.crawler.sender.arte.*;
3131
import mServer.crawler.sender.dreisat.DreiSatCrawler;
3232
import mServer.crawler.sender.dw.DwCrawler;
3333
import mServer.crawler.sender.kika.KikaApiCrawler;
@@ -88,7 +88,12 @@ public FilmeSuchen() {
8888
mediathekListe.add(new ZdfCrawler(this, 0));
8989
}
9090
if (crawlerList.contains("ARTE")) {
91-
mediathekListe.add(new MediathekArte(this, 0));
91+
mediathekListe.add(new ArteCrawler(this, 0));
92+
mediathekListe.add(new ArteCrawler_FR(this, 0));
93+
mediathekListe.add(new ArteCrawler_EN(this, 1));
94+
mediathekListe.add(new ArteCrawler_ES(this, 1));
95+
mediathekListe.add(new ArteCrawler_PL(this, 1));
96+
mediathekListe.add(new ArteCrawler_IT(this, 1));
9297
}
9398
if (crawlerList.contains("DW")) {
9499
mediathekListe.add(new DwCrawler(this, 0));
@@ -97,7 +102,7 @@ public FilmeSuchen() {
97102
mediathekListe.add(new KikaApiCrawler(this, 0));
98103
}
99104
if (crawlerList.contains("3SAT")) {
100-
mediathekListe.add(new DreiSatCrawler(this, 1));
105+
mediathekListe.add(new DreiSatCrawler(this, 0));
101106
}
102107
if (crawlerList.contains("SR")) {
103108
mediathekListe.add(new SrCrawler(this, 1));
@@ -112,7 +117,7 @@ public FilmeSuchen() {
112117
mediathekListe.add(new OrfOnCrawler(this, 1));
113118
}
114119
if (crawlerList.contains("PHONIX")) {
115-
mediathekListe.add(new PhoenixCrawler(this, 1));
120+
mediathekListe.add(new PhoenixCrawler(this, 0));
116121
}
117122

118123
}

src/main/java/mServer/crawler/sender/ard/tasks/ArdTopicPageTask.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ public class ArdTopicPageTask extends ArdTaskBase<ArdFilmInfoDto, CrawlerUrlDTO>
106106
TOPICS_LOAD_ALL_PAGES.add("Y3JpZDovL2JyLmRlL2Jyb2FkY2FzdFNlcmllcy9ici1odWJlcnQtb2huZS1zdGFsbGVy");
107107
// lohnt sich das
108108
TOPICS_LOAD_ALL_PAGES.add("Y3JpZDovL2JyLmRlL2Jyb2FkY2FzdFNlcmllcy9mYzkyMWQxMC1iY2VjLTQxZmYtOWY3ZC05ODI0YzM0ZDY1MmY");
109+
// Morden im Norden
110+
TOPICS_LOAD_ALL_PAGES.add("Y3JpZDovL25kci5kZS80OTU1");
109111
}
110112

111113
public ArdTopicPageTask(MediathekReader aCrawler,

src/main/java/mServer/crawler/sender/arte/ArteCategoryFilmListDeserializer.java

Lines changed: 0 additions & 46 deletions
This file was deleted.

src/main/java/mServer/crawler/sender/arte/ArteCategoryFilmsDTO.java

Lines changed: 0 additions & 37 deletions
This file was deleted.

src/main/java/mServer/crawler/sender/arte/ArteCollectionChildDeserializer.java

Lines changed: 0 additions & 42 deletions
This file was deleted.

src/main/java/mServer/crawler/sender/arte/ArteCollectionParentDeserializer.java

Lines changed: 0 additions & 41 deletions
This file was deleted.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
package mServer.crawler.sender.arte;
2+
3+
/**
 * Endpoint templates and credentials used by the ARTE crawler.
 *
 * <p>All URL templates are {@link String#format(String, Object...)} patterns; the
 * placeholders are documented on each constant.
 */
public class ArteConstants {
  /** Common part of both video list endpoints: 100 entries per page, placeholder 1 = page number. */
  private static final String OPA_VIDEOS_PAGE = "https://api.arte.tv/api/opa/v3/videos?limit=100&page=%s";

  /** Trailing language query parameter shared by both video list endpoints. */
  private static final String LANGUAGE_PARAM = "&language=%s";

  /**
   * Video list sorted by {@code -creationDate} (presumably newest first - confirm against the API).
   * Placeholders: page number, language code.
   */
  public static final String VIDEOS_URL = OPA_VIDEOS_PAGE + "&sort=-creationDate" + LANGUAGE_PARAM;

  /**
   * Video list sorted by {@code creationDate}, i.e. the opposite sort key of {@link #VIDEOS_URL}.
   * Placeholders: page number, language code.
   */
  public static final String VIDEOS_URL_ALT = OPA_VIDEOS_PAGE + "&sort=creationDate" + LANGUAGE_PARAM;

  /** Stream lookup for a single video. Placeholders: PROGRAMID / KIND / LANG. */
  public static final String VIDEO_URL =
      "https://www.arte.tv/hbbtvv2/services/web/index.php/OPA/v3/streams/%s/%s/%s";

  /** Complete {@code Authorization} header value (scheme plus token) for the OPA API. */
  public static final String API_TOKEN =
      "Bearer Nzc1Yjc1ZjJkYjk1NWFhN2I2MWEwMmRlMzAzNjI5NmU3NWU3ODg4ODJjOWMxNTMxYzEzZGRjYjg2ZGE4MmIwOA";

  private ArteConstants() {
    // constants holder - never instantiated
  }
}
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
package mServer.crawler.sender.arte;
2+
3+
import com.google.gson.JsonElement;
4+
import de.mediathekview.mlib.Const;
5+
import de.mediathekview.mlib.daten.DatenFilm;
6+
import de.mediathekview.mlib.tool.Log;
7+
import mServer.crawler.CrawlerTool;
8+
import mServer.crawler.FilmeSuchen;
9+
import mServer.crawler.sender.MediathekCrawler;
10+
import mServer.crawler.sender.arte.json.ArteVideoInfoDto;
11+
import mServer.crawler.sender.arte.tasks.ArteDtoVideo2FilmTask;
12+
import mServer.crawler.sender.arte.tasks.ArteVideoInfoTask;
13+
import mServer.crawler.sender.arte.tasks.ArteVideoLinkTask;
14+
import mServer.crawler.sender.base.JsonUtils;
15+
import mServer.crawler.sender.base.JsoupConnection;
16+
import mServer.crawler.sender.base.TopicUrlDTO;
17+
import org.apache.logging.log4j.LogManager;
18+
import org.apache.logging.log4j.Logger;
19+
20+
import java.io.IOException;
21+
import java.util.Map;
22+
import java.util.Optional;
23+
import java.util.Set;
24+
import java.util.concurrent.ConcurrentLinkedQueue;
25+
import java.util.concurrent.RecursiveTask;
26+
27+
public class ArteCrawler extends MediathekCrawler {
28+
private static final Logger LOG = LogManager.getLogger(ArteCrawler.class);
29+
private final JsoupConnection jsoupConnection;
30+
31+
public ArteCrawler(FilmeSuchen ssearch, int startPrio) {
32+
this(ssearch, startPrio, Const.ARTE_DE);
33+
}
34+
35+
protected ArteCrawler(FilmeSuchen ssearch, int startPrio, String sender) {
36+
super(ssearch, sender,/* threads */ 1, /* urlWarten */ 200, startPrio);
37+
this.jsoupConnection = new JsoupConnection(60, 4);
38+
}
39+
40+
protected ArteLanguage getLanguage() {
41+
return ArteLanguage.DE;
42+
}
43+
44+
@Override
45+
protected RecursiveTask<Set<DatenFilm>> createCrawlerTask() {
46+
47+
try {
48+
final ConcurrentLinkedQueue<TopicUrlDTO> videoUrls = new ConcurrentLinkedQueue<>();
49+
videoUrls.addAll(createVideosQueue(getLanguage().toString().toLowerCase()));
50+
51+
final ArteVideoInfoTask aArteRestVideoInfoTask;
52+
// DO NOT overload - maximumUrlsPerTask used to reduce threads to 4
53+
aArteRestVideoInfoTask = new ArteVideoInfoTask(this, videoUrls);
54+
final ConcurrentLinkedQueue<ArteVideoInfoDto> videos = new ConcurrentLinkedQueue<>();
55+
videos.addAll(aArteRestVideoInfoTask.fork().join());
56+
//
57+
Log.sysLog(getSendername() + " Anzahl video info: " + videos.size());
58+
//
59+
final ConcurrentLinkedQueue<ArteVideoInfoDto> videosWithLink = new ConcurrentLinkedQueue<>();
60+
final ArteVideoLinkTask aArteRestVideosTask = new ArteVideoLinkTask(this, videos);
61+
videosWithLink.addAll(aArteRestVideosTask.fork().join());
62+
//
63+
Log.sysLog(getSendername() + " Anzahl video links: " + videosWithLink.size());
64+
//
65+
return new ArteDtoVideo2FilmTask(this, new ConcurrentLinkedQueue<>(videosWithLink), getSendername());
66+
67+
} catch (final Exception ex) {
68+
LOG.fatal("Exception in {} crawler.", getSendername(), ex);
69+
}
70+
return null;
71+
}
72+
73+
private ConcurrentLinkedQueue<TopicUrlDTO> createVideosQueue(String language) {
74+
int maxPages = getMaxPagesForOverview(language);
75+
final ConcurrentLinkedQueue<TopicUrlDTO> root = new ConcurrentLinkedQueue<>();
76+
String rootUrl = String.format(ArteConstants.VIDEOS_URL, 1, language);
77+
root.add(new TopicUrlDTO("all videos1", rootUrl));
78+
if (maxPages >= 100) {
79+
String rootUrl2 = String.format(ArteConstants.VIDEOS_URL_ALT, 1, language);
80+
root.add(new TopicUrlDTO("all videos2", rootUrl2));
81+
}
82+
return root;
83+
}
84+
85+
private int getMaxPagesForOverview(String lang) {
86+
final int maxAvailablePages = getNumberOfAvailablePages(lang);
87+
final int configuredMaxPages = getMaximumSubpages();
88+
if (configuredMaxPages > maxAvailablePages) {
89+
return Math.min(configuredMaxPages, maxAvailablePages / 2);
90+
} else {
91+
return Math.min(configuredMaxPages, configuredMaxPages / 2);
92+
}
93+
}
94+
95+
private int getNumberOfAvailablePages(String lang) {
96+
final int naturalLimit = Math.min(100, getMaximumSubpages());
97+
try {
98+
String rootUrl = String.format(ArteConstants.VIDEOS_URL, 1, lang);
99+
String[] path= {"meta", "videos", "pages"};
100+
final Map<String, String> headers = Map.of(
101+
"Accept", "application/json",
102+
"Content-Type", "application/json",
103+
"Authorization", ArteConstants.API_TOKEN
104+
);
105+
JsonElement element = jsoupConnection.requestBodyAsJsonElement(rootUrl, headers);
106+
Optional<Integer> pages = JsonUtils.getElementValueAsInteger(element, path);
107+
if (pages.isPresent()) {
108+
return pages.get();
109+
}
110+
} catch (IOException e) {
111+
LOG.error("getMaxPagesForOverview", e);
112+
}
113+
return naturalLimit;
114+
}
115+
116+
private int getMaximumSubpages() {
117+
if (CrawlerTool.loadLongMax()) {
118+
return 10;
119+
} else {
120+
return 1;
121+
}
122+
}
123+
}
124+
125+

0 commit comments

Comments
 (0)