Skip to content

Commit 3e4a6da

Browse files
committed
Added initial pytest, fixed line bug
Added a basic pytest, will build more later if this works. Fixed a line duplication bug for certain files.
1 parent abd1ffc commit 3e4a6da

23 files changed

+24995
-34
lines changed
125 Bytes
Binary file not shown.
12.4 KB
Binary file not shown.

subtotxt.py

Lines changed: 110 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import re
1111
from pathlib import Path
1212

13+
1314
version = "2025-02-03"
1415

1516

@@ -64,54 +65,88 @@ def missing_modules_installer(required_modules):
6465

6566

6667
class file_handler:
67-
def __init__(self, i):
68+
def __init__(self):
69+
self.i = None # Input file
70+
self.o = None # Output file
71+
self.c = None # Copy file
72+
self.overw = None # Overwrite
73+
74+
def set_file(self, i):
75+
i = Path(i)
6876
if i.is_file():
6977
self.i = i
7078
self.o = i.with_suffix(".txt")
7179
self.c = i.with_stem(f"{Path(i).stem}-copy")
7280
print(f"Input file: {i}")
7381
else:
74-
raise Exception(f"File {i} not found.")
82+
raise FileNotFoundError(f"File '{i}' not found.")
83+
84+
def set_over(self, x):
85+
self.overw = x
7586

7687

7788
class encoding:
78-
def __init__(self, i):
79-
self.res = from_path(i).best() # charset_normalizer guess encoding
89+
def __init__(self):
90+
self.res = None # Check encoding
91+
self.enc = None # Detected encoding
92+
self.out = None # Output encoding
93+
94+
def check_encoding(self):
95+
self.res = from_path(file.i).best() # charset_normalizer guess encoding
8096
self.enc = self.res.encoding
81-
self.out = "utf_8" if args.utf8 else self.enc
8297
if self.res is not None and self.enc == "utf_8" and self.res.bom:
8398
self.enc += "_sig" # adds sig for utf_8_sig/bom files
8499
print(f"Detected Character Encoding: {self.enc}")
85100
print(f"Confidence of encoding: {int((1.0 - self.res.chaos) * 100)}%")
86-
print("Output encoding forced to UTF-8" if args.utf8 else "Output will use input encoding")
101+
102+
def force_utf8(self, x):
103+
print("Output encoding forced to UTF-8" if x else "Output will use input encoding")
104+
self.out = "utf_8" if x else self.enc
87105

88106

89107
class subtitle:
90108
def __init__(self):
91-
self.format = self.testsub() # Which subtitle format
109+
self.format = None # Which subtitle format
92110
self.text = "" # The output text
93111
self.text_finished = "" # The output text after a final check
94112
self.prev = "" # Previously read line, prevents duplicates
95-
self.junk = self.junklist()
113+
self.junk = None # Junk remover list, set below
114+
self.no_names = False # If True removes names from subtitles
115+
self.nosrt = False # If True leaves subs in file order, not timecode order
116+
self.scr = False # If True outputs to screen as each line processed
117+
self.oneline = False # If True attempts to join longer lines
96118

97119
def testsub(self):
98120
with open(file.i, "r", encoding=enc.enc) as ts:
99121
for line in ts:
100122
if "WEBVTT" in line:
101-
return "vtt"
123+
self.format = "vtt"
102124
if line.strip("\n") == "1" and re.search("(.*:.*:.*-->.*:.*:.*)", next(ts)):
103-
return "srt"
125+
self.format = "srt"
104126
if any(s in line for s in ["!:", "Timer:", "Style:", "Comment:", "Dialogue:", "ScriptType:"]):
105-
return "ass"
127+
self.format = "ass"
106128

107129
def junklist(self):
108130
# This list will grow
109131
# Escaping and r(raw) tag needed for special characters
110132
j = ["<.*?>", r"\{.*?\}", r"\[.*\]", r"\(.*\)", r"^-\s"]
111-
if args.nonames:
133+
if self.no_names:
112134
j.append("^.*?:")
113135
return j
114136

137+
def set_no_names(self, x):
138+
self.no_names = x
139+
self.junk = self.junklist()
140+
141+
def set_no_sort(self, x):
142+
self.nosrt = x
143+
144+
def screen_output(self, x):
145+
self.scr = x
146+
147+
def one_line(self, x):
148+
self.oneline = x
149+
115150

116151
def cls(): # Clear screen win/*nix friendly
117152
os.system("cls" if os.name == "nt" else "clear")
@@ -214,20 +249,28 @@ def arguments():
214249
required=False,
215250
help="For SubStation Alpha (.ssa/.ass), do not sort by timecode.",
216251
)
252+
parser.add_argument(
253+
"--debug",
254+
"-db",
255+
default=False,
256+
action="store_true",
257+
required=False,
258+
help="Give Traceback output if the script fails",
259+
)
217260
return parser.parse_args()
218261

219262

220-
def overwrite(f):
263+
def overwrite_old_file(f):
221264
if f.is_file():
222-
if (not args.overwrite and yn("Output file already exists, delete and make a new one?")) or args.overwrite:
265+
if (not file.overw and yn("Output file already exists, delete and make a new one?")) or file.overw:
223266
print("Overwriting old file")
224267
send2trash(f)
225268
else:
226269
raise Exception("Output file already exists.")
227270

228271

229272
def copy():
230-
overwrite(file.c)
273+
overwrite_old_file(file.c)
231274
with open(file.i, "r", encoding=enc.enc) as original, open(file.c, "w", encoding=enc.out) as new:
232275
for line in original:
233276
if args.screen:
@@ -237,7 +280,7 @@ def copy():
237280

238281

239282
def junk_strip(line):
240-
# Based on PR#4 by eMPee584
283+
# Based on PR #4 by eMPee584
241284
# Looping is terrible, but, a required evil it seems
242285
for junk in sub.junk:
243286
try:
@@ -250,13 +293,13 @@ def junk_strip(line):
250293
def process_line(line):
251294
# Strip formatting junk from line
252295
# We do this before checking for duplicates
253-
line = junk_strip(line)
296+
line = junk_strip(line).strip()
254297
# Process line if it's not a duplicate of the previous one, or empty.
255-
# Based on PR#4 by eMPee584
256-
line = line.strip()
257-
if not line == sub.prev and line != "":
258-
# One liners based on PR#2 by adam-sierakowski
259-
if args.oneliners:
298+
# Based on PR #4 by eMPee584
299+
# Fix for live translations giving duplicates from Issue #9 by rajibando
300+
if line.strip() and line.strip() != sub.prev.strip():
301+
# One liners based on PR #2 by adam-sierakowski
302+
if sub.oneline:
260303
if line[-1] in [".", "?", "!", "…"]:
261304
ln = f"{line}\n"
262305
sub.text += ln
@@ -267,7 +310,7 @@ def process_line(line):
267310
ln = f"{line}\n"
268311
sub.text += ln
269312

270-
if args.screen:
313+
if sub.scr:
271314
print(ln, end="")
272315
sub.prev = ln
273316

@@ -316,7 +359,7 @@ def do_ass():
316359
# https://wiki.multimedia.cx/index.php?title=SubStation_Alpha
317360
# http://www.tcax.org/docs/ass-specs.htm (browser may complain because it is not an HTTPS site).
318361
# This format has different versions; later ones include more metadata and sections,
319-
# this should not be a big problem as teh text is always on a `Dialog:` line.
362+
# this should not be a big problem as the text is always on a `Dialogue:` line.
320363
# Two key issues are: lines may not be in timecode order,
321364
# text may be for labelling things and not part of the script.
322365
print("Processing file as SubStation Alpha subtitle [.ssa/.ass]")
@@ -342,9 +385,9 @@ def do_ass():
342385
stc = x[0][0] # Start timecode
343386
nom = x[0][1] # Character speaking
344387
txt = x[0][2] # Text
345-
text = txt if (args.nonames or nom == "") else f"{nom}: {txt}"
388+
text = txt if (sub.no_names or nom == "") else f"{nom}: {txt}"
346389
d.update({stc: {"dialog": text}})
347-
for t in [v["dialog"] for k, v in sorted(d.items())] if not args.nosort else [v["dialog"] for v in d.values()]:
390+
for t in [v["dialog"] for k, v in sorted(d.items())] if not sub.nosrt else [v["dialog"] for v in d.values()]:
348391
process_line(t.replace(r"\n", " ").replace(r"\N", " ")) # Fixes odd newline in .ass
349392
write_to_file()
350393

@@ -358,7 +401,7 @@ def write_to_file():
358401

359402

360403
def do_work():
361-
overwrite(file.o)
404+
overwrite_old_file(file.o)
362405
if sub.format == "srt":
363406
do_srt()
364407
elif sub.format == "vtt":
@@ -369,35 +412,68 @@ def do_work():
369412
raise Exception("Unable to determine Subtitle format.")
370413

371414

415+
def check_it_works(in_file): # Pytest runner
416+
try:
417+
file.set_file(in_file["test_file"])
418+
file.o = Path(in_file["test_outf"]) # Override normal output file
419+
file.set_over(True) # Always overwrite (although unlikely when Pytesting)
420+
enc.check_encoding()
421+
enc.force_utf8(in_file["test_force"]) # True/False
422+
sub.set_no_names(in_file["test_names"]) # True/False
423+
sub.set_no_sort(in_file["test_sort"])
424+
sub.screen_output(False) # Pytest never needs to output to screen
425+
sub.one_line(in_file["test_onel"])
426+
sub.testsub()
427+
do_work()
428+
return
429+
except Exception as error:
430+
return f"Testing failed: {error}"
431+
432+
433+
# Init classes
434+
file = file_handler()
435+
enc = encoding()
436+
sub = subtitle()
437+
438+
# Do things
372439
if __name__ == "__main__":
373440
args = arguments()
374441
cls()
375442
try:
376443
print(f"SUB to TXT v{version}\n{'-' * 22}")
377444
if args.file or args.copy:
378-
file = file_handler(Path(args.file))
379-
enc = encoding(file.i)
445+
file.set_file(args.file)
446+
file.set_over(args.overwrite)
447+
enc.check_encoding()
448+
enc.force_utf8(args.utf8) # True/False
449+
sub.set_no_names(args.nonames) # True/False
450+
sub.set_no_sort(args.nosort) # True/False
451+
sub.screen_output(args.screen) # True/False
452+
sub.one_line(args.oneliners) # True/False
453+
sub.testsub()
380454
if args.pause and not yn("Ready to start?"):
381455
raise Exception("User exited at pause before start")
382456
if args.copy:
383457
copy()
384458
else:
385-
sub = subtitle()
386459
do_work()
387460
if args.dir:
388461
files = list(filter(lambda p: p.suffix in {".srt", ".vtt", ".ssa", ".ass"}, Path(args.dir).glob("*")))
389462
how_many = len(files)
390463
c = 0
391464
print(f"Multi file mode. Found {how_many} files.")
392465
print("-" * 22)
393-
for file in files:
394-
file = file_handler(Path(file))
395-
enc = encoding(file.i)
396-
sub = subtitle()
466+
for f in files:
467+
file.set(f)
468+
enc.force_utf8(args.utf8)
397469
do_work()
398470
print("-" * 22)
399471
c += 1
400472
print(f"Processed {c}/{how_many} files.")
401473
print("\nFinished!\n")
402474
except Exception as error:
403475
print(f"Script execution stopped because:\n{error}")
476+
if args.debug:
477+
import traceback
478+
479+
print(traceback.format_exc())
131 Bytes
Binary file not shown.
131 Bytes
Binary file not shown.
714 Bytes
Binary file not shown.
2.24 KB
Binary file not shown.

tests/conftest.py

Whitespace-only changes.

0 commit comments

Comments
 (0)