|
3 | 3 | * More details see |
4 | 4 | * https://github.com/bbc/digital-paper-edit |
5 | 5 | */ |
6 | | -import generateEntitiesRanges from '../generate-entities-ranges/index.js'; |
7 | | -import groupWordsInParagraphsBySpeakers from './group-words-by-speakers.js'; |
| 6 | +import generateEntitiesRanges from '../generate-entities-ranges'; |
| 7 | +import groupWordsInParagraphsBySpeakers from './group-words-by-speakers'; |
8 | 8 | /** |
9 | 9 | * groups words list from kaldi transcript based on punctuation. |
10 | 10 | * @todo To be more accurate, should introduce an honorifics library to do the splitting of the words. |
11 | 11 | * @param {array} words - array of word objects from kaldi transcript |
12 | 12 | */ |
13 | | -const groupWordsInParagraphs = words => { |
| 13 | +const groupWordsInParagraphs = (words) => { |
14 | 14 | const results = []; |
15 | 15 | let paragraph = { words: [], text: [] }; |
16 | 16 |
|
17 | | - words.forEach(word => { |
| 17 | + words.forEach((word) => { |
| 18 | + paragraph.words.push(word); |
| 19 | + paragraph.text.push(word.text); |
| 20 | + |
18 | 21 | // if word contains punctuation |
19 | 22 | if (/[.?!]/.test(word.text)) { |
20 | | - paragraph.words.push(word); |
21 | | - paragraph.text.push(word.text); |
22 | 23 | paragraph.text = paragraph.text.join(' '); |
23 | 24 | results.push(paragraph); |
24 | 25 | // reset paragraph |
25 | 26 | paragraph = { words: [], text: [] }; |
26 | | - } else { |
27 | | - paragraph.words.push(word); |
28 | | - paragraph.text.push(word.text); |
29 | 27 | } |
30 | 28 | }); |
31 | 29 |
|
32 | 30 | return results; |
33 | 31 | }; |
34 | 32 |
|
| 33 | +const generateDraftJsContentBlock = (paragraph) => { |
| 34 | + const { words, text, speaker } = paragraph; |
| 35 | + const start = words.length > 0 ? words[0].start : 0; |
| 36 | + |
| 37 | + return { |
| 38 | + text: text, |
| 39 | + type: 'paragraph', |
| 40 | + data: { |
| 41 | + speaker: speaker, |
| 42 | + words: words, |
| 43 | + start: start, |
| 44 | + }, |
| 45 | + // each entity range corresponds to one word in the space-joined text, |
| 46 | + // so for each word we need to compute its offset from the beginning of the paragraph and its length |
| 47 | + entityRanges: generateEntitiesRanges(words, 'text'), // wordAttributeName |
| 48 | + }; |
| 49 | +}; |
| 50 | + |
35 | 51 | const digitalPaperEditToDraft = (digitalPaperEditTranscriptJson) => { |
36 | | - const results = []; |
37 | | - let speakerSegmentation = null; |
38 | 52 | let wordsByParagraphs = []; |
39 | | - const tmpWords = digitalPaperEditTranscriptJson.words; |
40 | 53 |
|
41 | | - if (digitalPaperEditTranscriptJson.paragraphs) { |
42 | | - speakerSegmentation = digitalPaperEditTranscriptJson.paragraphs; |
43 | | - } |
| 54 | + const { words, paragraphs } = digitalPaperEditTranscriptJson; |
44 | 55 |
|
45 | | - if (!speakerSegmentation) { |
46 | | - wordsByParagraphs = groupWordsInParagraphs(tmpWords); |
| 56 | + if (!paragraphs) { |
| 57 | + wordsByParagraphs = groupWordsInParagraphs(words); |
47 | 58 | } else { |
48 | | - wordsByParagraphs = groupWordsInParagraphsBySpeakers(tmpWords, digitalPaperEditTranscriptJson.paragraphs ); |
| 59 | + wordsByParagraphs = groupWordsInParagraphsBySpeakers(words, paragraphs); |
49 | 60 | } |
50 | 61 |
|
51 | | - wordsByParagraphs.forEach((paragraph, i) => { |
52 | | - // if paragraph contain words |
53 | | - // eg sometimes the speaker segmentation might not contain words :man-shrugging: |
54 | | - if (paragraph.words[0]) { |
55 | | - let speakerLabel = `TBC ${ i }`; |
56 | | - if (speakerSegmentation) { |
57 | | - speakerLabel = paragraph.speaker; |
58 | | - } |
59 | | - |
60 | | - const draftJsContentBlockParagraph = { |
61 | | - text: paragraph.text, |
62 | | - type: 'paragraph', |
63 | | - data: { |
64 | | - speaker: speakerLabel, |
65 | | - words: paragraph.words, |
66 | | - start: paragraph.words[0].start |
67 | | - }, |
68 | | - // the entities as ranges are each word in the space-joined text, |
69 | | - // so it needs to be compute for each the offset from the beginning of the paragraph and the length |
70 | | - entityRanges: generateEntitiesRanges(paragraph.words, 'text') // wordAttributeName |
71 | | - }; |
72 | | - results.push(draftJsContentBlockParagraph); |
| 62 | + const results = wordsByParagraphs.map((paragraph, i) => { |
| 63 | + if (!paragraph.speaker) { |
| 64 | + paragraph.speaker = `TBC ${ i }`; |
73 | 65 | } |
| 66 | + |
| 67 | + return generateDraftJsContentBlock(paragraph); |
74 | 68 | }); |
75 | 69 |
|
76 | 70 | return results; |
|
0 commit comments