Skip to content

Commit d953dc4

Browse files
authored
Refactoring and fixing the bug. Now it returns the first item with an empty transcription (#229)
1 parent f9b3e2f commit d953dc4

File tree

2 files changed

+61
-41
lines changed

2 files changed

+61
-41
lines changed

packages/components/timed-text-editor/stories/index.stories.js

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ import { action } from '@storybook/addon-actions';
55
import { withKnobs, text, number, boolean } from '@storybook/addon-knobs';
66

77
import bbcKaldiTranscript from './fixtures/bbc-kaldi.json';
8-
98
import TimedTextEditor from '../index.js';
109

1110
storiesOf('TimedTextEditor', module)
@@ -32,6 +31,33 @@ storiesOf('TimedTextEditor', module)
3231
fileName: text('fileName', 'KateDarling_2018S-950k.mp4')
3332
};
3433

34+
return (
35+
<TimedTextEditor { ...fixtureProps } />
36+
);
37+
})
38+
.add('empty dpe', () => {
39+
const mediaUrl = 'https://download.ted.com/talks/KateDarling_2018S-950k.mp4';
40+
const emptyTranscriptData = { 'paragraphs': [], 'words': [] };
41+
42+
const fixtureProps = {
43+
transcriptData: emptyTranscriptData,
44+
mediaUrl: text('mediaUrl', mediaUrl),
45+
isEditable: boolean('isEditable', true),
46+
spellCheck: boolean('spellCheck', false),
47+
onWordClick: action('onWordClick'),
48+
sttJsonType: text('sttJsonType', 'digitalpaperedit'),
49+
isPlaying: action('isPlaying'),
50+
playMedia: action('playMedia'),
51+
currentTime: number('currentTime', 0),
52+
isScrollIntoViewOn: boolean('isScrollIntoViewOn', true),
53+
isPauseWhileTypingOn: boolean('isPauseWhileTypingOn', true),
54+
timecodeOffset: number('timecodeOffset', 0),
55+
handleAnalyticsEvents: action('handleAnalyticsEvents'),
56+
showSpeakers: boolean('showSpeakers', true),
57+
showTimecodes: boolean('showTimecodes', true),
58+
fileName: text('fileName', 'KateDarling_2018S-950k.mp4')
59+
};
60+
3561
return (
3662
<TimedTextEditor { ...fixtureProps } />
3763
);

packages/stt-adapters/digital-paper-edit/index.js

Lines changed: 34 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -3,74 +3,68 @@
33
* More details see
44
* https://github.com/bbc/digital-paper-edit
55
*/
6-
import generateEntitiesRanges from '../generate-entities-ranges/index.js';
7-
import groupWordsInParagraphsBySpeakers from './group-words-by-speakers.js';
6+
import generateEntitiesRanges from '../generate-entities-ranges';
7+
import groupWordsInParagraphsBySpeakers from './group-words-by-speakers';
88
/**
99
* groups words list from kaldi transcript based on punctuation.
1010
* @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
1111
* @param {array} words - array of word objects from kaldi transcript
1212
*/
13-
const groupWordsInParagraphs = words => {
13+
const groupWordsInParagraphs = (words) => {
1414
const results = [];
1515
let paragraph = { words: [], text: [] };
1616

17-
words.forEach(word => {
17+
words.forEach((word) => {
18+
paragraph.words.push(word);
19+
paragraph.text.push(word.text);
20+
1821
// if word contains punctuation
1922
if (/[.?!]/.test(word.text)) {
20-
paragraph.words.push(word);
21-
paragraph.text.push(word.text);
2223
paragraph.text = paragraph.text.join(' ');
2324
results.push(paragraph);
2425
// reset paragraph
2526
paragraph = { words: [], text: [] };
26-
} else {
27-
paragraph.words.push(word);
28-
paragraph.text.push(word.text);
2927
}
3028
});
3129

3230
return results;
3331
};
3432

33+
const generateDraftJsContentBlock = (paragraph) => {
34+
const { words, text, speaker } = paragraph;
35+
const start = words.length > 0 ? words[0].start : 0;
36+
37+
return {
38+
text: text,
39+
type: 'paragraph',
40+
data: {
41+
speaker: speaker,
42+
words: words,
43+
start: start,
44+
},
45+
// the entities as ranges are each word in the space-joined text,
46+
// so for each word we need to compute its offset from the beginning of the paragraph and its length
47+
entityRanges: generateEntitiesRanges(words, 'text'), // wordAttributeName
48+
};
49+
};
50+
3551
const digitalPaperEditToDraft = (digitalPaperEditTranscriptJson) => {
36-
const results = [];
37-
let speakerSegmentation = null;
3852
let wordsByParagraphs = [];
39-
const tmpWords = digitalPaperEditTranscriptJson.words;
4053

41-
if (digitalPaperEditTranscriptJson.paragraphs) {
42-
speakerSegmentation = digitalPaperEditTranscriptJson.paragraphs;
43-
}
54+
const { words, paragraphs } = digitalPaperEditTranscriptJson;
4455

45-
if (!speakerSegmentation) {
46-
wordsByParagraphs = groupWordsInParagraphs(tmpWords);
56+
if (!paragraphs) {
57+
wordsByParagraphs = groupWordsInParagraphs(words);
4758
} else {
48-
wordsByParagraphs = groupWordsInParagraphsBySpeakers(tmpWords, digitalPaperEditTranscriptJson.paragraphs );
59+
wordsByParagraphs = groupWordsInParagraphsBySpeakers(words, paragraphs);
4960
}
5061

51-
wordsByParagraphs.forEach((paragraph, i) => {
52-
// if paragraph contain words
53-
// eg sometimes the speaker segmentation might not contain words :man-shrugging:
54-
if (paragraph.words[0]) {
55-
let speakerLabel = `TBC ${ i }`;
56-
if (speakerSegmentation) {
57-
speakerLabel = paragraph.speaker;
58-
}
59-
60-
const draftJsContentBlockParagraph = {
61-
text: paragraph.text,
62-
type: 'paragraph',
63-
data: {
64-
speaker: speakerLabel,
65-
words: paragraph.words,
66-
start: paragraph.words[0].start
67-
},
68-
// the entities as ranges are each word in the space-joined text,
69-
// so it needs to be compute for each the offset from the beginning of the paragraph and the length
70-
entityRanges: generateEntitiesRanges(paragraph.words, 'text') // wordAttributeName
71-
};
72-
results.push(draftJsContentBlockParagraph);
62+
const results = wordsByParagraphs.map((paragraph, i) => {
63+
if (!paragraph.speaker) {
64+
paragraph.speaker = `TBC ${ i }`;
7365
}
66+
67+
return generateDraftJsContentBlock(paragraph);
7468
});
7569

7670
return results;

0 commit comments

Comments
 (0)