@@ -740,111 +740,6 @@ def __adjust_durations(self, subs: List[SubRipItem], audio_file_path: str, stret
740740 if task .sync_map_file_path_absolute is not None and os .path .exists (task .sync_map_file_path_absolute ):
741741 os .remove (task .sync_map_file_path_absolute )
742742
def __compress_and_stretch(
    self,
    subs: List[SubRipItem],
    audio_file_path: str,
    stretch_in_lang: str,
    lock: threading.RLock,
) -> List[SubRipItem]:
    """Re-time subtitle cues by aligning synthesised speech against the real audio.

    The audio between the first cue's start and the last cue's end is extracted,
    every cue text is synthesised with ``espeak`` (resampled through ``ffmpeg``),
    and the MFCC features of the concatenated synthetic speech are aligned to the
    MFCC features of the extracted audio with dynamic time warping. The span of
    reference frames matched to each cue becomes that cue's new start/end time.

    Arguments:
        subs: Subtitle cues to re-time. Must be non-empty, since the first and
            last cues delimit the audio segment.
        audio_file_path: Path of the audio file the segment is extracted from.
        stretch_in_lang: Key into ``Language.LANGUAGE_TO_VOICE_CODE`` selecting
            the espeak voice.
        lock: Re-entrant lock held while the audio segment is extracted.

    Returns:
        The re-timed cues as loaded by ``Subtitle._get_srt_subs``, carrying the
        original cue indices and shifted by the offset of the first original cue.

    Raises:
        TerminalException: If the user interrupts the process, or if ffmpeg
            fails to produce the synthesised audio for a cue.
    """
    # Function-scope imports, matching the existing lazy import of ``dtw``.
    import subprocess

    from dtw import dtw

    # Bound before the ``try`` so that the ``finally`` clause can always inspect
    # it, even when audio extraction fails before the path has been computed.
    text_file_path = None
    try:
        # NOTE(review): the lock is held for the extraction only; this scope was
        # reconstructed from the statement layout -- confirm against the file.
        with lock:
            segment_path, _ = self.__media_helper.extract_audio_from_start_to_end(
                audio_file_path,
                str(subs[0].start),
                str(subs[len(subs) - 1].end),
            )

        # Create a text file for DTW alignments
        root, _ = os.path.splitext(segment_path)
        text_file_path = "{}.txt".format(root)

        with open(text_file_path, "w", encoding="utf8") as text_file:
            text_file.write("*****".join([sub_new.text for sub_new in subs]))

        sample_rate = self.__feature_embedder.frequency
        hop_length = self.__feature_embedder.hop_len
        n_mfcc = self.__feature_embedder.n_mfcc

        line_durations = []
        with tempfile.TemporaryDirectory() as temp_dir:
            # Read back with the same encoding the file was written with, so
            # non-ASCII cue text survives on platforms whose locale is not UTF-8.
            with open(text_file_path, "r", encoding="utf8") as f:
                script_lines = f.read().split("*****")

            voice_code = Language.LANGUAGE_TO_VOICE_CODE[stretch_in_lang]
            wav_data = []
            for i, line in enumerate(script_lines):
                normalised_line = line.replace('"', "'")
                espeak_output_path = os.path.join(temp_dir, f"espeak_part_{i}.wav")

                # Run "espeak | ffmpeg" as an argument-list pipeline rather than
                # a shell string: cue text is arbitrary input and must never be
                # interpreted by a shell ($(...), backticks, backslashes, ...).
                espeak_process = subprocess.Popen(
                    ["espeak", "-v", voice_code, "--stdout", "--", normalised_line],
                    stdout=subprocess.PIPE,
                )
                try:
                    ffmpeg_result = subprocess.run(
                        [
                            "ffmpeg",
                            "-y",
                            "-i",
                            "-",
                            "-af",
                            f"aresample={sample_rate}",
                            espeak_output_path,
                        ],
                        stdin=espeak_process.stdout,
                    )
                finally:
                    # Drop our handle on the pipe so espeak cannot stay blocked
                    # on it if ffmpeg exited early, then reap the process.
                    espeak_process.stdout.close()
                    espeak_process.wait()
                if ffmpeg_result.returncode != 0:
                    raise TerminalException(
                        "Cannot synthesise speech for subtitle line {}".format(i + 1)
                    )

                y, sr = librosa.load(espeak_output_path, sr=None)
                wav_data.append(y)
                line_durations.append(librosa.get_duration(y=y, sr=sr))

            # Every part was resampled to the same rate, so the rate of the last
            # loaded part is used for the concatenated signal.
            data = np.concatenate(wav_data)
            combined_path = os.path.join(temp_dir, "espeak-all.wav")
            sf.write(combined_path, data, sr)

            y_query, sr_query = librosa.load(combined_path, sr=None)
            query_mfcc_features = librosa.feature.mfcc(
                y=y_query, sr=sr_query, n_mfcc=n_mfcc, hop_length=hop_length
            ).T
            y_reference, sr_reference = librosa.load(segment_path, sr=sample_rate)
            reference_mfcc_features = librosa.feature.mfcc(
                y=y_reference, sr=sr_reference, n_mfcc=n_mfcc, hop_length=hop_length
            ).T

            alignment = dtw(query_mfcc_features, reference_mfcc_features, keep_internals=False)
            assert len(alignment.index1) == len(alignment.index2), "Mismatch in lengths of alignment indices"
            assert sr_query == sr_reference
            frame_duration = hop_length / sr_query

            mapped_times = []
            start_frame_index = 0
            for duration in line_durations:
                num_frames_in_query = int(np.ceil(duration / frame_duration))

                # Inclusive range of synthetic-speech frames belonging to this cue.
                query_start_frame = start_frame_index
                query_end_frame = start_frame_index + num_frames_in_query - 1
                reference_frame_indices = [
                    r
                    for q, r in zip(alignment.index1, alignment.index2)
                    if query_start_frame <= q <= query_end_frame
                ]
                # TODO: Handle cases where mapped frames are not found in the reference audio
                # (min()/max() raise ValueError when no frame was matched).
                reference_start_frame = min(reference_frame_indices)
                reference_end_frame = max(reference_frame_indices)

                mapped_times.append({
                    "new_reference_start_time": reference_start_frame * frame_duration,
                    # +1 so the end time covers the whole of the last matched frame.
                    "new_reference_end_time": (reference_end_frame + 1) * frame_duration,
                })

                start_frame_index = query_end_frame + 1

            # Written as UTF-8 because it is parsed back as UTF-8 just below.
            synced_srt_path = os.path.join(temp_dir, "synced_subtitles.srt")
            with open(synced_srt_path, "w", encoding="utf-8") as f:
                for index, entry in enumerate(mapped_times):
                    start_srt = Utils.format_timestamp(entry["new_reference_start_time"])
                    end_srt = Utils.format_timestamp(entry["new_reference_end_time"])
                    f.write(f"{index + 1}\n")
                    f.write(f"{start_srt} --> {end_srt}\n")
                    f.write(f"{script_lines[index]}\n")
                    f.write("\n")

            adjusted_subs = Subtitle._get_srt_subs(
                subrip_file_path=synced_srt_path,
                encoding="utf-8"
            )

            # The temporary SRT is numbered from 1; restore the original indices.
            for index, sub_new_loaded in enumerate(adjusted_subs):
                sub_new_loaded.index = subs[index].index

            # New times are relative to the extracted segment; move them back by
            # the offset of the first original cue.
            adjusted_subs.shift(
                seconds=self.__media_helper.get_duration_in_seconds(
                    start=None, end=str(subs[0].start)
                )
            )
            return adjusted_subs
    except KeyboardInterrupt:
        raise TerminalException("Subtitle compress and stretch interrupted by the user")
    finally:
        # Housekeep intermediate files
        # NOTE(review): the extracted segment at ``segment_path`` is not removed
        # here -- confirm whether the caller or the media helper owns its cleanup.
        if text_file_path is not None and os.path.exists(text_file_path):
            os.remove(text_file_path)
848743 def __predict (
849744 self ,
850745 video_file_path : Optional [str ],
0 commit comments