Skip to content

Commit ace2ab8

Browse files
authored
Merge pull request #644 from adam-miller/adam/parameterize-yt-dlp-process-command
feat: Parameterize yt-dlp process command arguments
2 parents cc1a668 + 710662f commit ace2ab8

File tree

1 file changed

+30
-18
lines changed

1 file changed

+30
-18
lines changed

contrib/src/main/java/org/archive/modules/extractor/ExtractorYoutubeDL.java

Lines changed: 30 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import java.net.URI;
3535
import java.nio.channels.Channels;
3636
import java.util.ArrayList;
37+
import java.util.Arrays;
3738
import java.util.List;
3839
import java.util.concurrent.Callable;
3940
import java.util.concurrent.ExecutionException;
@@ -67,7 +68,7 @@
6768
* Extracts links to media by running yt-dlp in a subprocess. Runs only on html.
6869
*
6970
* <p>
70-
* Also implements {@link WARCRecordBuilder} to write youtube-dl json to the
71+
* Also implements {@link WARCRecordBuilder} to write yt-dlp json to the
7172
* warc.
7273
*
7374
* <p>
@@ -81,7 +82,7 @@
8182
* fetch chain, and to the end of the warc writer chain.
8283
*
8384
* <p>
84-
* Keeps a log of containing pages and media captured as a result of youtube-dl
85+
* Keeps a log of containing pages and media captured as a result of yt-dlp
8586
* extraction. The format of the log is as follows:
8687
*
8788
* <pre>
@@ -140,7 +141,7 @@ protected RandomAccessFile getLocalTempFile() {
140141
localTemp = openNewTempFile();
141142
tempfile.set(localTemp);
142143
}
143-
logger.info("Getting youtube-dl temp file ");
144+
logger.info("Getting yt-dlp temp file ");
144145
return localTemp;
145146
}
146147
protected boolean isOpen(RandomAccessFile f) {
@@ -149,12 +150,12 @@ protected boolean isOpen(RandomAccessFile f) {
149150
return true;
150151
}
151152
catch (IOException e) {
152-
logger.info("youtube-dl temp file is not open");
153+
logger.info("yt-dlp temp file is not open");
153154
return false ;
154155
}
155156
}
156157
protected RandomAccessFile openNewTempFile() {
157-
logger.info("Opening New youtube-dl temp file ");
158+
logger.info("Opening New yt-dlp temp file ");
158159
File t;
159160
try {
160161
t = File.createTempFile("ydl", ".json");
@@ -194,6 +195,18 @@ public void setLogMetadataRecord(boolean logMetadataRecord) {
194195
kp.put("logMetadataRecord",logMetadataRecord);
195196
}
196197

198+
// Instance initializer: installs the default subprocess command line by
// calling setProcessArguments. The list holds the program name ("yt-dlp")
// followed by its options; the URI to examine is not part of it
// (runYoutubeDL appends uri.toString() to a copy of the list at launch time).
// NOTE(review): "-S vcodec:h264,res:720,acodec:aac" is a single list element,
// so the flag and its value reach the subprocess as one argument — confirm
// yt-dlp accepts this form if the default is ever edited.
{
199+
setProcessArguments(Arrays.asList("yt-dlp", "--ignore-config",
200+
"--simulate", "--dump-single-json", "-S vcodec:h264,res:720,acodec:aac",
201+
"--no-cache-dir", "--no-playlist", "--playlist-end=" + MAX_VIDEOS_PER_PAGE ));
202+
}
203+
/**
 * Returns the command line used to launch the extraction subprocess, as
 * stored under the {@code "processArguments"} key of {@code kp}.
 *
 * @return the stored value, cast (unchecked) to {@code List<String>}
 */
public List<String> getProcessArguments() {
204+
return (List<String>) kp.get("processArguments");
205+
}
206+
/**
 * Sets the command line used to launch the extraction subprocess by storing
 * it under the {@code "processArguments"} key of {@code kp}. The list is
 * stored as given; no copy is made here.
 *
 * @param processArguments program name followed by its options. The URI
 *        being processed should not be included: runYoutubeDL appends
 *        {@code uri.toString()} to a copy of this list before starting the
 *        process.
 */
public void setProcessArguments(List<String> processArguments) {
207+
kp.put("processArguments", processArguments);
208+
}
209+
197210
@Override
198211
public void start() {
199212
if (!isRunning) {
@@ -229,7 +242,7 @@ protected String readToEnd(Reader r) throws IOException {
229242
* - If {@code uri} is annotated "youtube-dl" and is an actual video
230243
* download, log a line to ExtractorYoutubeDL.log
231244
*
232-
* - If {@link #shouldExtract(CrawlURI)}, do youtube-dl extraction.
245+
* - If {@link #shouldExtract(CrawlURI)}, do yt-dlp extraction.
233246
*/
234247
@Override
235248
protected void extract(CrawlURI uri) {
@@ -400,7 +413,7 @@ public void write(int b) throws IOException {}
400413
}
401414

402415
/**
403-
* Streams through youtube-dl json output. Sticks video urls in
416+
* Streams through yt-dlp json output. Sticks video urls in
404417
* <code>results.videoUrls</code>, web page urls in
405418
* <code>results.pageUrls</code>, and saves the json in anonymous temp file
406419
* <code>results.jsonFile</code>.
@@ -468,17 +481,16 @@ protected YoutubeDLResults runYoutubeDL(CrawlURI uri) {
468481
* the best audio with best acodec no better than aac and
469482
* with the smallest dimension no larger than 720.
470483
*/
471-
ProcessBuilder pb = new ProcessBuilder("yt-dlp", "--ignore-config",
472-
"--simulate", "--dump-single-json", "-S vcodec:h264,res:720,acodec:aac",
473-
"--no-cache-dir", "--no-playlist",
474-
"--playlist-end=" + MAX_VIDEOS_PER_PAGE, uri.toString());
484+
ArrayList<String> processArguments = new ArrayList<String>(getProcessArguments());
485+
processArguments.add(uri.toString());
486+
ProcessBuilder pb = new ProcessBuilder(processArguments);
475487
logger.info("running: " + String.join(" ", pb.command()));
476488

477489
Process proc = null;
478490
try {
479491
proc = pb.start();
480492
} catch (IOException e) {
481-
logger.log(Level.WARNING, "youtube-dl failed " + pb.command(), e);
493+
logger.log(Level.WARNING, "yt-dlp failed " + pb.command(), e);
482494
return null;
483495
}
484496

@@ -506,7 +518,7 @@ public String call() throws IOException {
506518
try {
507519
// this happens when there was no json output, which means no videos
508520
// were found, totally normal
509-
logger.log(Level.FINE, "problem parsing json from youtube-dl " + pb.command() + " " + future.get());
521+
logger.log(Level.FINE, "problem parsing json from yt-dlp " + pb.command() + " " + future.get());
510522
} catch (InterruptedException e1) {
511523
throw new IOException(e1);
512524
} catch (ExecutionException e1) {
@@ -515,15 +527,15 @@ public String call() throws IOException {
515527
}
516528
} catch (IOException e) {
517529
logger.log(Level.WARNING,
518-
"problem reading output from youtube-dl " + pb.command(),
530+
"problem reading output from yt-dlp " + pb.command(),
519531
e);
520532
return null;
521533
} finally {
522534
try {
523535
// the process should already have completed
524536
proc.waitFor(1, TimeUnit.SECONDS);
525537
} catch (InterruptedException e) {
526-
logger.warning("youtube-dl still running? killing it");
538+
logger.warning("yt-dlp still running? killing it");
527539
proc.destroyForcibly();
528540
}
529541
threadPool.shutdown();
@@ -540,13 +552,13 @@ protected boolean shouldProcess(CrawlURI uri) {
540552
return true;
541553
}
542554

543-
// Otherwise, check if we want to run youtube-dl on the url.
555+
// Otherwise, check if we want to run yt-dlp on the url.
544556
return shouldExtract(uri);
545557
}
546558

547559
/**
548-
* Returns {@code true} if we should run youtube-dl on this url. We run
549-
* youtube-dl on html 200s that are not too huge.
560+
* Returns {@code true} if we should run yt-dlp on this url. We run
561+
* yt-dlp on html 200s that are not too huge.
550562
*/
551563
protected boolean shouldExtract(CrawlURI uri) {
552564
if (uri.getFetchStatus() != 200) {

0 commit comments

Comments
 (0)