Skip to content

Commit a3be0b4

Browse files
committed
utilise Whisper HF hub models for transcription
1 parent fc18afa commit a3be0b4

File tree

14 files changed

+1332
-286
lines changed

14 files changed

+1332
-286
lines changed

.github/workflows/ci-pipeline.yml

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ on:
1010
jobs:
1111
main:
1212

13-
runs-on: ubuntu-latest
13+
runs-on: ubuntu-22.04
1414
strategy:
1515
matrix:
1616
python-version: ["3.10"]
@@ -29,11 +29,12 @@ jobs:
2929
sudo apt-get -y install libsndfile-dev
3030
sudo apt-get -y install build-essential python3-dev
3131
python -m pip install --upgrade pip
32-
pip install -e .
33-
pip install -e ".[stretch]"
34-
pip install -e ".[llm]"
35-
pip install -e ".[dev]"
36-
pip install -e ".[site]"
32+
python -m pip install "setuptools<65.0.0" wheel
33+
python -m pip install --no-build-isolation -e .
34+
python -m pip install --no-build-isolation -e ".[stretch]"
35+
python -m pip install --no-build-isolation -e ".[llm]"
36+
python -m pip install --no-build-isolation -e ".[dev]"
37+
python -m pip install --no-build-isolation -e ".[site]"
3738
- name: Type checking
3839
run: |
3940
python -m mypy --follow-imports=normal subaligner

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ Assuming your media assets are stored under `d:\media`, open built-in command pr
162162
<pre><code>subaligner -m dual -v video.mp4 -s subtitle.srt -tr helsinki-nlp -o subtitle_aligned.srt -t src,tgt</code></pre>
163163
<pre><code>subaligner -m dual -v video.mp4 -s subtitle.srt -tr facebook-mbart -tf large -o subtitle_aligned.srt -t src,tgt</code></pre>
164164
<pre><code>subaligner -m dual -v video.mp4 -s subtitle.srt -tr facebook-m2m100 -tf small -o subtitle_aligned.srt -t src,tgt</code></pre>
165-
<pre><code>subaligner -m dual -v video.mp4 -s subtitle.srt -tr whisper -tf small -o subtitle_aligned.srt -t src,eng</code></pre>
165+
<pre><code>subaligner -m dual -v video.mp4 -s subtitle.srt -tr whisper -tf small -o subtitle_aligned.srt -t src,tgt</code></pre>
166166
</details>
167167

168168
<details>
@@ -251,6 +251,6 @@ This tool wouldn't be possible without the following packages:
251251
[pysubs2](https://github.com/tkarabela/pysubs2)
252252
[aeneas](https://www.readbeyond.it/aeneas/)
253253
[transformers](https://huggingface.co/transformers/)
254-
[openai-whisper](https://github.com/openai/whisper).
254+
[whisper](https://openai.com/index/whisper/).
255255

256256
Thanks to Alan Robinson and Nigel Megitt for their invaluable feedback.

docker/Dockerfile-Dev

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,5 @@ RUN ["/bin/bash", "-c", "apt-get -y update &&\
2222
apt-get -y install python3-pip &&\
2323
python3 -m pip install --upgrade pip &&\
2424
python3 -m pip install --upgrade \"setuptools<65.0.0\" wheel &&\
25-
python3 -m pip install -e . &&\
26-
python3 -m pip install -e \".[harmony]\""]
25+
python3 -m pip install --no-build-isolation -e . &&\
26+
python3 -m pip install --no-build-isolation -e \".[harmony]\""]

pyproject.toml

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ classifiers = [
2525
"Programming Language :: Python :: 3.9",
2626
"Programming Language :: Python :: 3.8",
2727
"Intended Audience :: Developers",
28-
"Topic :: Utilities"
28+
"Topic :: Utilities",
2929
]
3030
dependencies = [
3131
"astor==0.7.1",
@@ -90,33 +90,37 @@ dependencies = [
9090
"urllib3~=1.26.5",
9191
"wrapt==1.14.0",
9292
"Werkzeug>=0.15.3",
93-
"zict==0.1.3"
93+
"zict==0.1.3",
9494
]
9595

9696
[project.optional-dependencies]
9797
harmony = [
9898
"aeneas~=1.7.3.0; python_version < '3.12'",
9999
# "aeneas @ git+https://github.com/baxtree/aeneas.git@v1.7.3.1#egg=aeneas ; python_version >= '3.12'",
100-
"dtw-python~=1.5.3",
101100
"safetensors~=0.5.3",
102101
"sentencepiece~=0.1.95; python_version < '3.12'",
103102
"sentencepiece~=0.2.0; python_version >= '3.12'",
104-
"torch<2.3.0",
105-
"torchaudio<2.3.0",
106-
"transformers<4.37.0",
107-
"openai-whisper==20250625"
103+
"torch~=2.9.1",
104+
"torchaudio~=2.9.1",
105+
"transformers==4.57.3",
106+
"accelerate~=1.12.0",
107+
"soxr==1.0.0",
108+
"webrtcvad~=2.0.10",
109+
"silero-vad~=6.2.0",
108110
]
109111
dev = [
110112
"aeneas~=1.7.3.0; python_version < '3.12'",
111113
# "aeneas @ git+https://github.com/baxtree/aeneas.git@v1.7.3.1#egg=aeneas ; python_version >= '3.12'",
112-
"dtw-python~=1.5.3",
113114
"safetensors~=0.5.3",
114115
"sentencepiece~=0.1.95; python_version < '3.12'",
115116
"sentencepiece~=0.2.0; python_version >= '3.12'",
116-
"torch<2.3.0",
117-
"torchaudio<2.3.0",
118-
"transformers<4.37.0",
119-
"openai-whisper==20250625",
117+
"torch~=2.9.1",
118+
"torchaudio~=2.9.1",
119+
"transformers==4.57.3",
120+
"soxr==1.0.0",
121+
"accelerate~=1.12.0",
122+
"webrtcvad~=2.0.10",
123+
"silero-vad~=6.2.0",
120124
"mock==4.0.3",
121125
"coverage==5.5",
122126
"tox~=3.23.0",
@@ -130,11 +134,11 @@ dev = [
130134
"mypy==1.3.0",
131135
"types-requests==2.27.9",
132136
"types-setuptools==64.0.1",
133-
"typing-extensions==4.8.0",
137+
"typing-extensions==4.10.0",
134138
"parameterized==0.8.1",
135139
"pylint~=2.17.2",
136140
"pygments<3.0.0",
137-
"darglint~=1.8.1"
141+
"darglint~=1.8.1",
138142
]
139143
docs = [
140144
"sphinx==6.2.1",
@@ -144,16 +148,18 @@ docs = [
144148
stretch = [
145149
"aeneas~=1.7.3.0; python_version < '3.12'",
146150
# "aeneas @ git+https://github.com/baxtree/aeneas.git@v1.7.3.1#egg=aeneas ; python_version >= '3.12'",
147-
"dtw-python~=1.5.3"
148151
]
149152
llm = [
150153
"safetensors~=0.5.3",
151154
"sentencepiece~=0.1.95; python_version < '3.12'",
152155
"sentencepiece~=0.2.0; python_version >= '3.12'",
153-
"torch<2.3.0",
154-
"torchaudio<2.3.0",
155-
"transformers<4.37.0",
156-
"openai-whisper==20250625"
156+
"torch~=2.9.1",
157+
"torchaudio~=2.9.1",
158+
"transformers==4.57.3",
159+
"accelerate~=1.12.0",
160+
"soxr==1.0.0",
161+
"webrtcvad~=2.0.10",
162+
"silero-vad~=6.2.0",
157163
]
158164

159165
[project.scripts]

site/source/acknowledgement.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@ Acknowledgement
1212
- `pysubs2 <https://github.com/tkarabela/pysubs2>`_
1313
- `aeneas <https://www.readbeyond.it/aeneas/>`_
1414
- `transformers <https://huggingface.co/transformers/>`_
15-
- `openai-whisper <https://github.com/openai/whisper>`_
15+
- `whisper <https://openai.com/index/whisper/>`_
1616

1717
Thanks to Alan Robinson and Nigel Megitt for their invaluable feedback.

subaligner/__main__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
Target video's main language as an ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes]
3333
-mr {whisper}, --transcription_recipe {whisper}
3434
LLM recipe used for transcribing video files
35-
-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large-v3,large,turbo}, --transcription_flavour {tiny,tiny.en,small,medium,medium.en,base,base.en,large-v1,large-v2,large-v3,large,turbo}
35+
-mf {tiny,tiny.en,small,medium,medium.en,base,base.en,large,large-v2,large-v3,large-v3-turbo}, --transcription_flavour {tiny,tiny.en,small,medium,medium.en,base,base.en,large,large-v2,large-v3,large-v3-turbo}
3636
Flavour variation for a specific LLM recipe supporting transcription
3737
-ip INITIAL_PROMPT, --initial_prompt INITIAL_PROMPT
3838
Optional text to provide the transcribing context or specific phrases
@@ -197,7 +197,7 @@ def main():
197197
"-mcl",
198198
"--max_char_length",
199199
type=int,
200-
default=None,
200+
default=37,
201201
help="Maximum number of characters for each generated subtitle segment"
202202
)
203203
from subaligner.llm import TranslationRecipe

subaligner/llm.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,10 @@ class WhisperFlavour(Enum):
2020
MEDIUM_EN = "medium.en"
2121
BASE = "base"
2222
BASE_EN = "base.en"
23-
LARGE_V1 = "large-v1"
23+
LARGE = "large"
2424
LARGE_V2 = "large-v2"
2525
LARGE_V3 = "large-v3"
26-
LARGE = "large"
27-
TURBO = "turbo"
26+
TURBO = "large-v3-turbo"
2827

2928

3029
class HelsinkiNLPFlavour(Enum):

subaligner/predictor.py

Lines changed: 0 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -740,111 +740,6 @@ def __adjust_durations(self, subs: List[SubRipItem], audio_file_path: str, stret
740740
if task.sync_map_file_path_absolute is not None and os.path.exists(task.sync_map_file_path_absolute):
741741
os.remove(task.sync_map_file_path_absolute)
742742

743-
def __compress_and_stretch(self, subs: List[SubRipItem], audio_file_path: str, stretch_in_lang: str, lock: threading.RLock) -> List[SubRipItem]:
744-
from dtw import dtw
745-
try:
746-
with lock:
747-
segment_path, _ = self.__media_helper.extract_audio_from_start_to_end(
748-
audio_file_path,
749-
str(subs[0].start),
750-
str(subs[len(subs) - 1].end),
751-
)
752-
753-
# Create a text file for DTW alignments
754-
root, _ = os.path.splitext(segment_path)
755-
text_file_path = "{}.txt".format(root)
756-
757-
with open(text_file_path, "w", encoding="utf8") as text_file:
758-
text_file.write("*****".join([sub_new.text for sub_new in subs]))
759-
760-
sample_rate = self.__feature_embedder.frequency
761-
hop_length = self.__feature_embedder.hop_len
762-
n_mfcc = self.__feature_embedder.n_mfcc
763-
764-
file_script_duration_mapping = []
765-
with tempfile.TemporaryDirectory() as temp_dir:
766-
with open(text_file_path, "r") as f:
767-
script_lines = f.read().split("*****")
768-
wav_data = []
769-
for i, line in enumerate(script_lines):
770-
normalised_line = line.replace('"', "'")
771-
espeak_output_file = f"espeak_part_{i}.wav"
772-
espeak_cmd = f"espeak -v {Language.LANGUAGE_TO_VOICE_CODE[stretch_in_lang]} --stdout -- \"{normalised_line}\" | ffmpeg -y -i - -af 'aresample={sample_rate}' {os.path.join(temp_dir, espeak_output_file)}"
773-
os.system(espeak_cmd)
774-
y, sr = librosa.load(os.path.join(temp_dir, espeak_output_file), sr=None)
775-
wav_data.append(y)
776-
duration = librosa.get_duration(y=y, sr=sr)
777-
file_script_duration_mapping.append((os.path.join(temp_dir, espeak_output_file), line, duration))
778-
data = np.concatenate(wav_data)
779-
sf.write(os.path.join(temp_dir, "espeak-all.wav"), data, sr)
780-
781-
y_query, sr_query = librosa.load(os.path.join(temp_dir, "espeak-all.wav"), sr=None)
782-
query_mfcc_features = librosa.feature.mfcc(y=y_query, sr=sr_query, n_mfcc=n_mfcc, hop_length=hop_length).T
783-
y_reference, sr_reference = librosa.load(segment_path, sr=sample_rate)
784-
reference_mfcc_features = librosa.feature.mfcc(y=y_reference, sr=sr_reference, n_mfcc=n_mfcc, hop_length=hop_length).T
785-
786-
alignment = dtw(query_mfcc_features, reference_mfcc_features, keep_internals=False)
787-
assert len(alignment.index1) == len(alignment.index2), "Mismatch in lengths of alignment indices"
788-
assert sr_query == sr_reference
789-
frame_duration = hop_length / sr_query
790-
791-
mapped_times = []
792-
start_frame_index = 0
793-
for index, (wav_file, line_text, duration) in enumerate(file_script_duration_mapping):
794-
num_frames_in_query = int(np.ceil(duration / frame_duration))
795-
796-
query_start_frame = start_frame_index
797-
query_end_frame = start_frame_index + num_frames_in_query - 1
798-
reference_frame_indices = [r for q, r in zip(alignment.index1, alignment.index2) if
799-
query_start_frame <= q <= query_end_frame]
800-
reference_start_frame = min(reference_frame_indices)
801-
reference_end_frame = max(reference_frame_indices)
802-
803-
# TODO: Handle cases where mapped frames are not found in the reference audio
804-
805-
new_reference_start_time = reference_start_frame * frame_duration
806-
new_reference_end_time = (reference_end_frame + 1) * frame_duration
807-
808-
mapped_times.append({
809-
"new_reference_start_time": new_reference_start_time,
810-
"new_reference_end_time": new_reference_end_time
811-
})
812-
813-
start_frame_index = query_end_frame + 1
814-
815-
with open(os.path.join(temp_dir, "synced_subtitles.srt"), "w") as f:
816-
for index, entry in enumerate(mapped_times):
817-
start_srt = Utils.format_timestamp(entry["new_reference_start_time"])
818-
end_srt = Utils.format_timestamp(entry["new_reference_end_time"])
819-
f.write(f"{index + 1}\n")
820-
f.write(f"{start_srt} --> {end_srt}\n")
821-
f.write(f"{script_lines[index]}\n")
822-
f.write(f"\n")
823-
f.flush()
824-
825-
adjusted_subs = Subtitle._get_srt_subs(
826-
subrip_file_path=os.path.join(temp_dir, "synced_subtitles.srt"),
827-
encoding="utf-8"
828-
)
829-
830-
for index, sub_new_loaded in enumerate(adjusted_subs):
831-
sub_new_loaded.index = subs[index].index
832-
833-
adjusted_subs.shift(
834-
seconds=self.__media_helper.get_duration_in_seconds(
835-
start=None, end=str(subs[0].start)
836-
)
837-
)
838-
return adjusted_subs
839-
except KeyboardInterrupt:
840-
raise TerminalException("Subtitle compress and stretch interrupted by the user")
841-
finally:
842-
# Housekeep intermediate files
843-
if text_file_path is not None and os.path.exists(
844-
text_file_path
845-
):
846-
os.remove(text_file_path)
847-
848743
def __predict(
849744
self,
850745
video_file_path: Optional[str],

0 commit comments

Comments
 (0)