Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 13 additions & 10 deletions sms_wsj/database/wsj/create_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import os
import re
import tempfile
import subprocess
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`import sh` is now unused and can be removed.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also fix the imports for the scipy window functions (in the stft)? Currently, some import tests still fail.

from pathlib import Path

import sacred
Expand Down Expand Up @@ -182,18 +183,20 @@ def normalize_transcription(transcriptions, wsj_root: Path):
:param wsj_root: Path to WSJ database

:return result: Clean transcription dictionary

>>> transcriptions = {'ID1': 'Hello World, and bye!?', 'ID2': 'What? Yes.'}
>>> normalize_transcription(transcriptions, '')
{'ID1': 'HELLO WORLD, AND BYE!?', 'ID2': 'WHAT? YES.'}

"""
assert len(transcriptions) > 0, 'No transcriptions to clean up.'
with tempfile.TemporaryDirectory() as temporary_directory:
temporary_directory = Path(temporary_directory).absolute()
with open(temporary_directory / 'dirty.txt', 'w') as f:
for key, value in transcriptions.items():
f.write('{} {}\n'.format(key, value))
result = sh.perl(
sh.cat(str(temporary_directory / 'dirty.txt')),
kaldi_wsj_tools / 'normalize_transcript.pl',
'<NOISE>'
)

text = ''.join([f'{key} {value}\n' for key, value in transcriptions.items()])
cp = subprocess.run(
['perl', kaldi_wsj_tools / 'normalize_transcript.pl', '<NOISE>'],
input=text, stdout=subprocess.PIPE, check=True, universal_newlines=True)
result = cp.stdout

result = [line.split(maxsplit=1) for line in result.strip().split('\n')]
result = {k: v for k, v in result}
return result
Expand Down