Skip to content

Commit 5e5d227

Browse files
author
czyt
committed
use struct for creating ssml
1 parent 38aaff9 commit 5e5d227

File tree

2 files changed

+94
-104
lines changed

2 files changed

+94
-104
lines changed

internal/communicate/communicate.go

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"crypto/tls"
77
"encoding/binary"
88
"encoding/json"
9+
"encoding/xml"
910
"fmt"
1011
"html"
1112
"io"
@@ -29,7 +30,6 @@ import (
2930

3031
const (
3132
ssmlHeaderTemplate = "X-RequestId:%s\r\nContent-Type:application/ssml+xml\r\nX-Timestamp:%sZ\r\nPath:ssml\r\n\r\n"
32-
ssmlTemplate = "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'><voice name='%s'><prosody pitch='%s' rate='%s' volume='%s'>%s</prosody></voice></speak>"
3333
)
3434

3535
var (
@@ -464,14 +464,27 @@ func splitTextByByteLength(text string, byteLength int) [][]byte {
464464
}
465465

466466
func makeSsml(text string, pitch, voice string, rate string, volume string) string {
467-
ssml := fmt.Sprintf(
468-
ssmlTemplate,
469-
voice,
470-
pitch,
471-
rate,
472-
volume,
473-
text)
474-
return ssml
467+
ssml := &Speak{
468+
XMLName: xml.Name{Local: "speak"},
469+
Version: "1.0",
470+
Xmlns: "http://www.w3.org/2001/10/synthesis",
471+
Lang: "en-US",
472+
Voice: []Voice{{
473+
Name: voice,
474+
Prosody: Prosody{
475+
Pitch: pitch,
476+
Rate: rate,
477+
Volume: volume,
478+
Text: text,
479+
},
480+
}},
481+
}
482+
483+
output, err := xml.MarshalIndent(ssml, "", " ")
484+
if err != nil {
485+
return ""
486+
}
487+
return string(output)
475488
}
476489

477490
func currentTimeInMST() string {

internal/communicate/ssml.go

Lines changed: 72 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -2,107 +2,84 @@ package communicate
22

33
import "encoding/xml"
44

5-
// reference document at: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-structure
6-
75
type Speak struct {
8-
XMLName xml.Name `xml:"speak"`
9-
Version string `xml:"version,attr"`
10-
Xmlns string `xml:"xmlns,attr"`
11-
Mstts string `xml:"mstts,attr"`
12-
Lang string `xml:"xml:lang,attr"`
13-
Backgroundaudio Backgroundaudio `xml:"mstts:backgroundaudio"`
14-
Voice Voice `xml:"voice"`
15-
}
16-
17-
type Backgroundaudio struct {
18-
Src string `xml:"src,attr"`
19-
Volume string `xml:"volume,attr"`
20-
Fadein string `xml:"fadein,attr"`
21-
Fadeout string `xml:"fadeout,attr"`
6+
XMLName xml.Name `xml:"speak"`
7+
Version string `xml:"version,attr"`
8+
Xmlns string `xml:"xmlns,attr"`
9+
Lang string `xml:"xml:lang,attr"`
10+
Voice []Voice `xml:"voice"`
2211
}
2312

2413
type Voice struct {
25-
Name string `xml:"name,attr"`
26-
Effect string `xml:"effect,attr"`
27-
Audio Audio `xml:"audio"`
28-
Bookmark string `xml:"bookmark,omitempty"`
29-
Break Break `xml:"break,omitempty"`
30-
Emphasis Emphasis `xml:"emphasis,omitempty"`
31-
Lang Lang `xml:"lang"`
32-
Lexicon Lexicon `xml:"lexicon,omitempty"`
33-
Math string `xml:"math,omitempty"`
34-
Mstts Mstts `xml:"mstts,omitempty"`
35-
P string `xml:"p,omitempty"`
36-
Phoneme Phoneme `xml:"phoneme,omitempty"`
37-
Prosody Prosody `xml:"prosody"`
38-
SayAs SayAs `xml:"say-as,omitempty"`
39-
Sub string `xml:"sub,omitempty"`
40-
}
41-
42-
type Audio struct {
43-
Src string `xml:"src"`
44-
}
45-
46-
type Break struct {
47-
Strength string `xml:"strength,attr"`
48-
Time string `xml:"time,attr"`
49-
}
50-
51-
type Emphasis struct {
52-
Level string `xml:"level,attr"`
53-
}
54-
55-
type Lang struct {
56-
XmlLang string `xml:"xml:lang,attr"`
57-
}
58-
59-
type Lexicon struct {
60-
URI string `xml:"uri,attr"`
61-
}
62-
63-
type Mstts struct {
64-
Backgroundaudio string `xml:"backgroundaudio"`
65-
Ttsembedding TtsEmbedding `xml:"ttsembedding"`
66-
ExpressAs ExpressAs `xml:"express-as"`
67-
Silence Silence `xml:"silence"`
68-
Viseme Viseme `xml:"viseme"`
69-
Audioduration string `xml:"audioduration"`
70-
}
71-
72-
type TtsEmbedding struct {
73-
SpeakerProfileId string `xml:"speakerProfileId,attr"`
74-
}
75-
76-
type ExpressAs struct {
77-
Style string `xml:"style,attr"`
78-
Styledegree string `xml:"styledegree,attr"`
79-
Role string `xml:"role,attr"`
80-
}
81-
82-
type Silence struct {
83-
Type string `xml:"type,attr"`
84-
Value string `xml:"value,attr"`
85-
}
86-
87-
type Viseme struct {
88-
Type string `xml:"type,attr"`
89-
}
90-
91-
type Phoneme struct {
92-
Alphabet string `xml:"alphabet,attr"`
93-
Ph string `xml:"ph,attr"`
14+
Name string `xml:"name,attr"`
15+
Prosody Prosody `xml:"prosody"`
9416
}
9517

9618
type Prosody struct {
97-
Pitch string `xml:"pitch,attr"`
19+
// Contour represents changes in pitch. These changes are represented as an array of targets at specified time
20+
// positions in the speech output. Sets of parameter pairs define each target. For example:
21+
//
22+
//<prosody contour="(0%,+20Hz) (10%,-2st) (40%,+10Hz)">
23+
//
24+
//The first value in each set of parameters specifies the location of the pitch change as a percentage of the
25+
//duration of the text. The second value specifies the amount to raise or lower the pitch by using a relative
26+
//value or an enumeration value for pitch (see pitch).
9827
Contour string `xml:"contour,attr,omitempty"`
99-
Range string `xml:"range,attr,omitempty"`
100-
Rate string `xml:"rate,attr"`
101-
Volume string `xml:"volume,attr"`
102-
}
103-
104-
type SayAs struct {
105-
InterpretAs string `xml:"interpret-as,attr"`
106-
Format string `xml:"format,attr"`
107-
Detail string `xml:"detail,attr"`
28+
// Pitch indicates the baseline pitch for the text. Pitch changes can be applied at the sentence level. The pitch changes
29+
//should be within 0.5 to 1.5 times the original audio. You can express the pitch as:
30+
//An absolute value:
31+
//Expressed as a number followed by "Hz" (Hertz). For example, <prosody pitch="600Hz">some text</prosody>.
32+
//A relative value:
33+
// As a relative number: Expressed as a number preceded by "+" or "-" and followed by "Hz" or "st" that specifies
34+
// an amount to change the pitch. For example:
35+
// <prosody pitch="+80Hz">some text</prosody> or <prosody pitch="-2st">some text</prosody>.
36+
// The "st" indicates the change unit is semitone, which is half of a tone (a half step) on the standard diatonic scale.
37+
//As a percentage: Expressed as a number preceded by "+" (optionally) or "-" and followed by "%", indicating the
38+
//relative change. For example: <prosody pitch="50%">some text</prosody> or <prosody pitch="-50%">some text</prosody>.
39+
// A constant value:
40+
// x-low
41+
// low
42+
// medium
43+
// high
44+
// x-high
45+
// default
46+
Pitch string `xml:"pitch,attr"`
47+
// Rate indicates the speaking rate of the text. Speaking rate can be applied at the word or sentence level. The rate changes
48+
//should be within 0.5 to 2 times the original audio. You can express rate as:
49+
//A relative value:
50+
// As a relative number: Expressed as a number that acts as a multiplier of the default. For example, a value of 1 results
51+
// in no change in the original rate. A value of 0.5 results in a halving of the original rate. A value of 2 results in
52+
// twice the original rate.
53+
// As a percentage: Expressed as a number preceded by "+" (optionally) or "-" and followed by "%", indicating the relative
54+
// change. For example:
55+
// <prosody rate="50%">some text</prosody> or <prosody rate="-50%">some text</prosody>.
56+
// A constant value:
57+
// x-slow
58+
// slow
59+
// medium
60+
// fast
61+
// x-fast
62+
// default
63+
Rate string `xml:"rate,attr"`
64+
// Volume indicates the volume level of the speaking voice. Volume changes can be applied at the sentence level. You can express
65+
//the volume as:
66+
// An absolute value: Expressed as a number in the range of 0.0 to 100.0, from quietest to loudest, such as 75.
67+
//The default value is 100.0.
68+
// A relative value:
69+
// As a relative number: Expressed as a number preceded by "+" or "-" that specifies an amount to change the volume.
70+
//Examples are +10 or -5.5.
71+
// As a percentage: Expressed as a number preceded by "+" (optionally) or "-" and followed by "%", indicating the
72+
//relative change. For example:
73+
// <prosody volume="50%">some text</prosody> or <prosody volume="+3%">some text</prosody>.
74+
//
75+
// A constant value:
76+
// silent
77+
// x-soft
78+
// soft
79+
// medium
80+
// loud
81+
// x-loud
82+
// default
83+
Volume string `xml:"volume,attr"`
84+
Text string `xml:",chardata"`
10885
}

0 commit comments

Comments
 (0)