python/AzureSpeechDetection/README.md
2 changes: 1 addition & 1 deletion

```diff
@@ -36,8 +36,8 @@ Returned `AudioTrack` objects have the following members in their `detection_properties`:
 
 | Property Key | Description |
 |--------------|-------------|
-| `SPEAKER_ID` | An integer speaker identifier, indexed from 1. When a job has been segmented by the Workflow Manager, the ID for all utterances will be overwritten with zero to avoid confusion (as speaker IDs are not consistent between subjobs). |
 | `LONG_SPEAKER_ID` | A unique speaker identifier of the form `<start_offset>-<stop_offset>-<#>`, where `<start_offset>` and `<stop_offset>` are integers indicating the segment range (in frame counts for video jobs, milliseconds for audio jobs) for sub-jobs when a job has been segmented by the Workflow Manager. The final `<#>` portion of the ID is a 1-indexed counter for speaker identity within the indicated segment range. When jobs are not segmented, or not submitted through the Workflow Manager at all, `<stop_offset>` may instead be `EOF`, indicating that the job extends to the end of the file. |
+| `SPEAKER_ID` | A dummy field set to "0". |
 | `GENDER` | Only present if supplied by an upstream component. The gender of the speaker. |
 | `GENDER_CONFIDENCE` | Only present if supplied by an upstream component. The confidence of the gender classification. |
 | `TRANSCRIPT` | The text of the utterance transcript. Words are space-separated. |
```
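Downstream consumers that need the segment offsets or the per-segment speaker number can recover them by splitting the ID on hyphens. Below is a minimal sketch of such a parser; the helper and example values are hypothetical illustrations of the format documented above, not part of the component:

```python
# Hypothetical parser for the LONG_SPEAKER_ID format documented above;
# not part of the component itself.
from typing import NamedTuple, Optional


class LongSpeakerId(NamedTuple):
    start_offset: int
    stop_offset: Optional[int]  # None when the segment runs to EOF
    speaker_num: int            # 1-indexed within the segment range


def parse_long_speaker_id(long_id: str) -> LongSpeakerId:
    """Split an ID of the form '<start_offset>-<stop_offset>-<#>'."""
    start, stop, num = long_id.split('-')
    return LongSpeakerId(
        start_offset=int(start),
        stop_offset=None if stop == 'EOF' else int(stop),
        speaker_num=int(num))


assert parse_long_speaker_id('0-EOF-2') == LongSpeakerId(0, None, 2)
assert parse_long_speaker_id('12000-15000-1') == LongSpeakerId(12000, 15000, 1)
```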
```diff
@@ -70,14 +70,10 @@ def get_detections_from_job(
         logger.exception(f'Exception raised while processing audio: {e}')
         raise
 
+    # Remove this block to drop LONG_SPEAKER_ID
     for track in audio_tracks:
-        sid = track.detection_properties['SPEAKER_ID']
-        if job_config.is_triggered_job:
-            track.detection_properties['LONG_SPEAKER_ID'] = sid
-        else:
-            track.detection_properties['LONG_SPEAKER_ID'] = job_config.speaker_id_prefix + sid
-        if job_config.overwrite_ids:
-            track.detection_properties['SPEAKER_ID'] = '0'
+        track.detection_properties['LONG_SPEAKER_ID'] = track.detection_properties['SPEAKER_ID']
+        track.detection_properties['SPEAKER_ID'] = '0'
 
    logger.info('Processing complete. Found %d tracks.' % len(audio_tracks))
    return audio_tracks
```
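The branching on `is_triggered_job` and `overwrite_ids` disappears because the prefix is now applied to `SPEAKER_ID` inside `convert_word_timing` (see the next hunks), so this block only has to copy the full ID aside and reset `SPEAKER_ID` to the dummy value. A sketch of the net effect, using a made-up property dict rather than a real `AudioTrack`:

```python
# Illustration of the simplified block above; the property values are
# hypothetical, not actual component output.
props = {'SPEAKER_ID': '0-EOF-2', 'TRANSCRIPT': 'hello world'}

# The two lines added in the diff:
props['LONG_SPEAKER_ID'] = props['SPEAKER_ID']
props['SPEAKER_ID'] = '0'

assert props == {
    'SPEAKER_ID': '0',
    'LONG_SPEAKER_ID': '0-EOF-2',
    'TRANSCRIPT': 'hello world',
}
```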
```diff
@@ -73,6 +73,7 @@ def __init__(self):
     @staticmethod
     def convert_word_timing(
             recognized_phrases: Iterable[Mapping[str, Any]],
+            job_config: AzureJobConfig,
             speaker: Optional[mpf_util.SpeakerInfo] = None
     ) -> Iterable[Utterance]:
         """ Convert ACS recognized_phrases structure to utterances with correct
@@ -96,7 +97,7 @@ def convert_word_timing(
             confidence = phrase['nBest'][0]['confidence']
             word_segments = list(map(get_seg, phrase['nBest'][0]['words']))
             word_confidences = [w['confidence'] for w in phrase['nBest'][0]['words']]
-            speaker_id = str(phrase.get('speaker', '0'))
+            speaker_id = job_config.speaker_id_prefix + str(phrase.get('speaker', '0'))
 
             # Ensure display text tokens are one-to-one with word segments
             # If not, replace with bare words. This loses punctuation and
```
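This changed line is where segment information now enters the speaker ID. A minimal sketch of the resulting values, assuming a `speaker_id_prefix` such as `'12000-15000-'` (both the prefix value and the phrase dict below are hypothetical):

```python
# Hypothetical inputs: ACS includes a 'speaker' key in each recognized
# phrase when diarization is enabled, and omits it otherwise.
prefix = '12000-15000-'  # stand-in for job_config.speaker_id_prefix
phrase = {'speaker': 2}

speaker_id = prefix + str(phrase.get('speaker', '0'))
assert speaker_id == '12000-15000-2'

# Without diarization, the ID falls back to speaker 0:
assert prefix + str({}.get('speaker', '0')) == '12000-15000-0'
```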
```diff
@@ -336,6 +337,7 @@ def process_audio(self, job_config: AzureJobConfig) -> List[mpf.AudioTrack]:
         recognized_phrases = transcription['recognizedPhrases']
         utterances = self.convert_word_timing(
             recognized_phrases=recognized_phrases,
+            job_config=job_config,
             speaker=job_config.speaker)
 
         logger.info('Completed process audio')
```
python/AzureSpeechDetection/tests/test_acs_speech.py
12 changes: 0 additions & 12 deletions

```diff
@@ -192,18 +192,6 @@ def test_diarization(self):
         self.assertEqual(1, len_raw)
         self.assertEqual(2, len_dia)
 
-        # A nonzero start_time indicates to the component that this is a
-        # subjob, so all SPEAKER_IDs should be equal to 0
-        ids_raw, ids_dia = [
-            set([
-                track.detection_properties['SPEAKER_ID']
-                for track in result
-            ])
-            for result in results
-        ]
-        self.assertEqual({'0'}, ids_raw)
-        self.assertEqual({'0'}, ids_dia)
-
     def test_language(self):
         job_en = mpf.AudioJob(
             job_name='test_bilingual_english',
```
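Under the new scheme these assertions would pass trivially, since `SPEAKER_ID` is now always the dummy value "0"; the rationale in the removed comment (detecting subjobs via a nonzero start time) no longer applies. A test targeting the new behavior would instead inspect `LONG_SPEAKER_ID`, along these hypothetical lines (not part of this change):

```python
# Hypothetical follow-up assertions, not included in this change:
# SPEAKER_ID is always the dummy value, while LONG_SPEAKER_ID carries
# the real (possibly prefixed) speaker identity.
for result in results:
    for track in result:
        self.assertEqual('0', track.detection_properties['SPEAKER_ID'])
        self.assertIn('LONG_SPEAKER_ID', track.detection_properties)
```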