3434import java .net .URI ;
3535import java .nio .channels .Channels ;
3636import java .util .ArrayList ;
37+ import java .util .Arrays ;
3738import java .util .List ;
3839import java .util .concurrent .Callable ;
3940import java .util .concurrent .ExecutionException ;
6768 * Extracts links to media by running yt-dlp in a subprocess. Runs only on html.
6869 *
6970 * <p>
70- * Also implements {@link WARCRecordBuilder} to write youtube-dl json to the
71+ * Also implements {@link WARCRecordBuilder} to write yt-dlp json to the
7172 * warc.
7273 *
7374 * <p>
8182 * fetch chain, and to the end of the warc writer chain.
8283 *
8384 * <p>
84- * Keeps a log of containing pages and media captured as a result of youtube-dl
85+ * Keeps a log of containing pages and media captured as a result of yt-dlp
8586 * extraction. The format of the log is as follows:
8687 *
8788 * <pre>
@@ -140,7 +141,7 @@ protected RandomAccessFile getLocalTempFile() {
140141 localTemp = openNewTempFile ();
141142 tempfile .set (localTemp );
142143 }
143- logger .info ("Getting youtube-dl temp file " );
144+ logger .info ("Getting yt-dlp temp file " );
144145 return localTemp ;
145146 }
146147 protected boolean isOpen (RandomAccessFile f ) {
@@ -149,12 +150,12 @@ protected boolean isOpen(RandomAccessFile f) {
149150 return true ;
150151 }
151152 catch (IOException e ) {
152- logger .info ("youtube-dl temp file is not open" );
153+ logger .info ("yt-dlp temp file is not open" );
153154 return false ;
154155 }
155156 }
156157 protected RandomAccessFile openNewTempFile () {
157- logger .info ("Opening New youtube-dl temp file " );
158+ logger .info ("Opening New yt-dlp temp file " );
158159 File t ;
159160 try {
160161 t = File .createTempFile ("ydl" , ".json" );
@@ -194,6 +195,18 @@ public void setLogMetadataRecord(boolean logMetadataRecord) {
194195 kp .put ("logMetadataRecord" ,logMetadataRecord );
195196 }
196197
// Instance initializer: installs the default yt-dlp command line through
// setProcessArguments. The target URI is not part of this list; runYoutubeDL
// copies the configured arguments and appends the URI before starting the
// process.
{
    List<String> defaultArguments = Arrays.asList(
            "yt-dlp",
            "--ignore-config",
            "--simulate",
            "--dump-single-json",
            "-S vcodec:h264,res:720,acodec:aac",
            "--no-cache-dir",
            "--no-playlist",
            "--playlist-end=" + MAX_VIDEOS_PER_PAGE);
    setProcessArguments(defaultArguments);
}
203+ public List <String > getProcessArguments () {
204+ return (List <String >) kp .get ("processArguments" );
205+ }
206+ public void setProcessArguments (List <String > processArguments ) {
207+ kp .put ("processArguments" , processArguments );
208+ }
209+
197210 @ Override
198211 public void start () {
199212 if (!isRunning ) {
@@ -229,7 +242,7 @@ protected String readToEnd(Reader r) throws IOException {
229242 * - If {@code uri} is annotated "youtube-dl" and is an actual video
230243 * download, log a line to ExtractorYoutubeDL.log
231244 *
232- * - If {@link #shouldExtract(CrawlURI)}, do youtube-dl extraction.
245+ * - If {@link #shouldExtract(CrawlURI)}, do yt-dlp extraction.
233246 */
234247 @ Override
235248 protected void extract (CrawlURI uri ) {
@@ -400,7 +413,7 @@ public void write(int b) throws IOException {}
400413 }
401414
402415 /**
403- * Streams through youtube-dl json output. Sticks video urls in
416+ * Streams through yt-dlp json output. Sticks video urls in
404417 * <code>results.videoUrls</code>, web page urls in
405418 * <code>results.pageUrls</code>, and saves the json in anonymous temp file
406419 * <code>results.jsonFile</code>.
@@ -468,17 +481,16 @@ protected YoutubeDLResults runYoutubeDL(CrawlURI uri) {
468481 * the best audio with best acodec no better than aac and
469482 * with the smallest dimension no larger than 720.
470483 */
471- ProcessBuilder pb = new ProcessBuilder ("yt-dlp" , "--ignore-config" ,
472- "--simulate" , "--dump-single-json" , "-S vcodec:h264,res:720,acodec:aac" ,
473- "--no-cache-dir" , "--no-playlist" ,
474- "--playlist-end=" + MAX_VIDEOS_PER_PAGE , uri .toString ());
484+ ArrayList <String > processArguments = new ArrayList <String >(getProcessArguments ());
485+ processArguments .add (uri .toString ());
486+ ProcessBuilder pb = new ProcessBuilder (processArguments );
475487 logger .info ("running: " + String .join (" " , pb .command ()));
476488
477489 Process proc = null ;
478490 try {
479491 proc = pb .start ();
480492 } catch (IOException e ) {
481- logger .log (Level .WARNING , "youtube-dl failed " + pb .command (), e );
493+ logger .log (Level .WARNING , "yt-dlp failed " + pb .command (), e );
482494 return null ;
483495 }
484496
@@ -506,7 +518,7 @@ public String call() throws IOException {
506518 try {
507519 // this happens when there was no json output, which means no videos
508520 // were found, totally normal
509- logger .log (Level .FINE , "problem parsing json from youtube-dl " + pb .command () + " " + future .get ());
521+ logger .log (Level .FINE , "problem parsing json from yt-dlp " + pb .command () + " " + future .get ());
510522 } catch (InterruptedException e1 ) {
511523 throw new IOException (e1 );
512524 } catch (ExecutionException e1 ) {
@@ -515,15 +527,15 @@ public String call() throws IOException {
515527 }
516528 } catch (IOException e ) {
517529 logger .log (Level .WARNING ,
518- "problem reading output from youtube-dl " + pb .command (),
530+ "problem reading output from yt-dlp " + pb .command (),
519531 e );
520532 return null ;
521533 } finally {
522534 try {
523535 // the process should already have completed
524536 proc .waitFor (1 , TimeUnit .SECONDS );
525537 } catch (InterruptedException e ) {
526- logger .warning ("youtube-dl still running? killing it" );
538+ logger .warning ("yt-dlp still running? killing it" );
527539 proc .destroyForcibly ();
528540 }
529541 threadPool .shutdown ();
@@ -540,13 +552,13 @@ protected boolean shouldProcess(CrawlURI uri) {
540552 return true ;
541553 }
542554
543- // Otherwise, check if we want to run youtube-dl on the url.
555+ // Otherwise, check if we want to run yt-dlp on the url.
544556 return shouldExtract (uri );
545557 }
546558
547559 /**
548- * Returns {@code true} if we should run youtube-dl on this url. We run
549- * youtube-dl on html 200s that are not too huge.
560+ * Returns {@code true} if we should run yt-dlp on this url. We run
561+ * yt-dlp on html 200s that are not too huge.
550562 */
551563 protected boolean shouldExtract (CrawlURI uri ) {
552564 if (uri .getFetchStatus () != 200 ) {
0 commit comments