1+ """Subtitle to plain Text converter: Handles .srt, .vtt, .ssa, .ass files."""
2+
13# cSpell:disable
2- # SRT or WEBVTT to plain Text
4+ # SRT, ASS/SSA or WEBVTT to plain Text
35# Author: NebularNerd
46# Version: 2025-02-03
57# https://github.com/NebularNerd/subtotxt
1517
1618
1719def missing_modules_installer (required_modules ):
20+ """Auto module installer, fairly clever, will run if it finds modules are missing."""
1821 import platform
1922
2023 if float (platform .python_version ().rsplit ("." , 1 )[0 ].strip ()) < 3.12 : # pkg_resources method
@@ -65,13 +68,17 @@ def missing_modules_installer(required_modules):
6568
6669
6770class file_handler :
71+ """Get the file ready for action"""
72+
6873 def __init__ (self ):
74+ """Variables have the following purposes."""
6975 self .i = None # Input file
7076 self .o = None # Output file
7177 self .c = None # Copy file
7278 self .overw = None # Overwrite
7379
7480 def set_file (self , i ):
81+ """Set file input, then create output names."""
7582 i = Path (i )
7683 if i .is_file ():
7784 self .i = i
@@ -82,30 +89,39 @@ def set_file(self, i):
8289 raise FileNotFoundError (f"File '{ i } ' not found." )
8390
8491 def set_over (self , x ):
92+ """Overwrite existing output file without asking."""
8593 self .overw = x
8694
8795
class encoding:
    """Figure out what encoding the subtitle has, override output encoding if desired."""

    def __init__(self):
        """Start with nothing detected; the methods below fill these in."""
        self.res = None  # Best charset_normalizer match for the input file
        self.enc = None  # Detected encoding name
        self.out = None  # Output encoding

    def check_encoding(self):
        """
        Guess the input file's encoding with charset_normalizer.

        Stores the best match in `self.res` and its encoding name in `self.enc`
        (with `_sig` appended for UTF-8 files that carry a byte order mark), then
        prints the detected encoding and a confidence percentage derived from the
        match's chaos score.

        Raises:
            ValueError: If charset_normalizer cannot find any plausible encoding.
        """
        self.res = from_path(file.i).best()
        if self.res is None:
            # best() yields None when no candidate fits. Fail with a clear message
            # here rather than an AttributeError when reading `.encoding` below.
            raise ValueError(f"Unable to detect the character encoding of '{file.i}'.")
        self.enc = self.res.encoding
        if self.enc == "utf_8" and self.res.bom:
            self.enc += "_sig"  # adds sig for utf_8_sig/bom files
        print(f"Detected Character Encoding: {self.enc}")
        print(f"Confidence of encoding: {int((1.0 - self.res.chaos) * 100)}%")

    def force_utf8(self, x):
        """
        Choose the output encoding.

        Args:
            x: When truthy the output is written as UTF-8, otherwise the detected
                input encoding (`self.enc`) is reused.
        """
        print("Output encoding forced to UTF-8" if x else "Output will use input encoding")
        self.out = "utf_8" if x else self.enc
105118
106119
107120class subtitle :
121+ """Wrangle and mangle the file into nice readable text."""
122+
108123 def __init__ (self ):
124+ """Variables have the following purposes."""
109125 self .format = None # Which subtitle format
110126 self .text = "" # The output text
111127 self .text_finished = "" # The output text after a final check
@@ -117,6 +133,17 @@ def __init__(self):
117133 self .oneline = False # If True attempts to join longer lines
118134
119135 def testsub (self ):
136+ """
137+ Opens subtitle file and attempts to detect encoding used.
138+
139+ Notes:
140+ A file may appear as `UTF8` in some programs but be detected as `ascii` here,
141+ this is not a bug. `ascii` just means there are no characters in the file beyond the
142+ standard character set.
143+
144+ Chinese and near neighbours/dialects have many many encodings, sometimes the wrong one may
145+ be chosen but it should not affect output.
146+ """
120147 with open (file .i , "r" , encoding = enc .enc ) as ts :
121148 for line in ts :
122149 if "WEBVTT" in line :
@@ -127,32 +154,42 @@ def testsub(self):
127154 self .format = "ass"
128155
129156 def junklist (self ):
130- # This list will grow
131- # Escaping and r(raw) tag needed for special characters
157+ """
158+ List of junk strings, characters, control codes we wish to remove.
159+
160+ This list will grow/adapt over time.
161+ Escaping and r(raw) tag needed for special characters
162+ """
132163 j = ["<.*?>" , r"\{.*?\}" , r"\[.*\]" , r"\(.*\)" , r"^-\s" ]
133164 if self .no_names :
134165 j .append ("^.*?:" )
135166 return j
136167
137168 def set_no_names (self , x ):
169+ """If True: Strip names from lines, e.g.: `Blackadder: You're name is Bob?`."""
138170 self .no_names = x
139171 self .junk = self .junklist ()
140172
141173 def set_no_sort (self , x ):
174+ """If True: Prevents .ass/.ssa subs from being sorted by timecode"""
142175 self .nosrt = x
143176
144177 def screen_output (self , x ):
178+ """If True: Outputs processed content to screen/console."""
145179 self .scr = x
146180
147181 def one_line (self , x ):
182+ """If True: Sets one line function, attempts to join split sentences."""
148183 self .oneline = x
149184
150185
def cls():
    """Clear the screen using `cls` on Windows (`os.name == "nt"`) and `clear` elsewhere."""
    command = "cls" if os.name == "nt" else "clear"
    os.system(command)
153189
154190
155- def yn (yn ): # Simple Y/N selector, use yn(text_for_choice)
191+ def yn (yn ):
192+ """Simple Y/N selector, use yn(text_for_choice)."""
156193 while True :
157194 print (f"{ yn } [Y/N]" )
158195 choice = input ().lower ()
@@ -165,6 +202,7 @@ def yn(yn): # Simple Y/N selector, use yn(text_for_choice)
165202
166203
167204def arguments ():
205+ """Everyone loves arguments, here's a list of them."""
168206 parser = argparse .ArgumentParser (
169207 formatter_class = argparse .RawDescriptionHelpFormatter ,
170208 description = "Quickly convert SRT, SSA or WEBVTT subtitles into plain text file." ,
@@ -261,6 +299,7 @@ def arguments():
261299
262300
263301def overwrite_old_file (f ):
302+ """Politely check if there is an existing file before moving forward."""
264303 if f .is_file ():
265304 if (not file .overw and yn ("Output file already exists, delete and make a new one?" )) or file .overw :
266305 print ("Overwriting old file" )
@@ -270,6 +309,7 @@ def overwrite_old_file(f):
270309
271310
272311def copy ():
312+ """This just copies a file line by line, handy for checking encoding issues without processing the file."""
273313 overwrite_old_file (file .c )
274314 with open (file .i , "r" , encoding = enc .enc ) as original , open (file .c , "w" , encoding = enc .out ) as new :
275315 for line in original :
@@ -280,8 +320,7 @@ def copy():
280320
281321
282322def junk_strip (line ):
283- # Based on PR #4 by eMPee584
284- # Looping is terrible, but, a required evil it seems
323+ """Based on PR #4 by eMPee584. Looping is terrible, but, a required evil it seems."""
285324 for junk in sub .junk :
286325 try :
287326 line = re .sub (rf"{ junk } " , "" , line )
@@ -291,6 +330,7 @@ def junk_strip(line):
291330
292331
293332def process_line (line ):
333+ """Process each line, remove formatting junk, check for duplicates, store for writing later."""
294334 # Strip formatting junk from line
295335 # We do this before checking for duplicates
296336 line = junk_strip (line ).strip ()
@@ -316,9 +356,11 @@ def process_line(line):
316356
317357
318358def do_srt ():
319- # SubRip subtitle file .srt
320- # https://en.wikipedia.org/wiki/SubRip
321- # Format has a line number followed by a timecode on the next line, then text.
359+ """
360+ SubRip subtitle file .srt format.
361+ https://en.wikipedia.org/wiki/SubRip
362+ Format has a line number followed by a timecode on the next line, then text.
363+ """
322364 print ("Processing file as SubRip subtitles [.srt]" )
323365 with open (file .i , "r" , encoding = enc .enc ) as original :
324366 subnum = 1
@@ -331,12 +373,15 @@ def do_srt():
331373
332374
333375def do_vtt ():
334- # WebVTT (Web Video Text Tracks) subtitle file .vtt
335- # https://en.wikipedia.org/wiki/WebVTT
336- # https://www.checksub.com/blog/guide-use-webvtt-subtitles-format
337- # This format has a few differing 'standards', you have:
338- # Metadata, notes, styles, timceodes with optional hours, and optional line numbers,
339- # almost none of which are actually used it seems. But we need to handle them
376+ """
377+ WebVTT (Web Video Text Tracks) subtitle file .vtt format.
378+
379+ https://en.wikipedia.org/wiki/WebVTT
380+ https://www.checksub.com/blog/guide-use-webvtt-subtitles-format
381+ This format has a few differing `standards`, you have:
382+ Metadata, notes, styles, timecodes with optional hours, and optional line numbers,
383+ almost none of which are actually used it seems. But we need to handle them.
384+ """
340385 print ("Processing file as WebVTT (Web Video Text Tracks) [.vtt]" )
341386 with open (file .i , "r" , encoding = enc .enc ) as original :
342387 subnum = 1
@@ -355,13 +400,16 @@ def do_vtt():
355400
356401
357402def do_ass ():
358- # SubStation Alpha subtitle file .ssa/.ass
359- # https://wiki.multimedia.cx/index.php?title=SubStation_Alpha
360- # http://www.tcax.org/docs/ass-specs.htm Browser may complain as not https site.
361- # This format has different version, later ones include more metadata and sections,
362- # this should not be a big problem as the text is always on a `Dialog:` line.
363- # Two keys issues are; lines may not be in timecode order,
364- # text may be for labelling things and not part of the script.
403+ """
404+ SubStation Alpha subtitle file .ssa/.ass format.
405+
406+ https://wiki.multimedia.cx/index.php?title=SubStation_Alpha
407+ http://www.tcax.org/docs/ass-specs.htm Browser may complain as not https site.
408+ This format has different versions, later ones include more metadata and sections,
409+ this should not be a big problem as the text is always on a `Dialogue:` line.
410+ Two key issues are: lines may not be in timecode order,
411+ text may be for labelling objects and not part of the script.
412+ """
365413 print ("Processing file as SubStation Alpha subtitle [.ssa/.ass]" )
366414 with open (file .i , "r" , encoding = enc .enc ) as original :
367415 # Try and get version
@@ -393,6 +441,7 @@ def do_ass():
393441
394442
395443def write_to_file ():
444+ """Outputs finished work to a new file in the selected encoding."""
396445 with open (file .o , "w" , encoding = enc .out ) as new :
397446 # We check for junk again because it can get split over two lines and we can't find it until now.
398447 for line in sub .text .splitlines ():
@@ -401,6 +450,7 @@ def write_to_file():
401450
402451
403452def do_work ():
453+ """Process file based on sub.format, additionally check if there is a file from a previous run."""
404454 overwrite_old_file (file .o )
405455 if sub .format == "srt" :
406456 do_srt ()
@@ -413,6 +463,7 @@ def do_work():
413463
414464
415465def check_it_works (in_file ): # Pytest runner
466+ """This is for running Pytests, as we need to be able to set various variables."""
416467 try :
417468 file .set_file (in_file ["test_file" ])
418469 file .o = Path (in_file ["test_outf" ]) # Override normal output file
0 commit comments