@@ -2,107 +2,84 @@ package communicate
22
33import "encoding/xml"
44
5- // reference document at: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-structure
6-
75type Speak struct {
8- XMLName xml.Name `xml:"speak"`
9- Version string `xml:"version,attr"`
10- Xmlns string `xml:"xmlns,attr"`
11- Mstts string `xml:"mstts,attr"`
12- Lang string `xml:"xml:lang,attr"`
13- Backgroundaudio Backgroundaudio `xml:"mstts:backgroundaudio"`
14- Voice Voice `xml:"voice"`
15- }
16-
17- type Backgroundaudio struct {
18- Src string `xml:"src,attr"`
19- Volume string `xml:"volume,attr"`
20- Fadein string `xml:"fadein,attr"`
21- Fadeout string `xml:"fadeout,attr"`
6+ XMLName xml.Name `xml:"speak"`
7+ Version string `xml:"version,attr"`
8+ Xmlns string `xml:"xmlns,attr"`
9+ Lang string `xml:"xml:lang,attr"`
10+ Voice []Voice `xml:"voice"`
2211}
2312
2413type Voice struct {
25- Name string `xml:"name,attr"`
26- Effect string `xml:"effect,attr"`
27- Audio Audio `xml:"audio"`
28- Bookmark string `xml:"bookmark,omitempty"`
29- Break Break `xml:"break,omitempty"`
30- Emphasis Emphasis `xml:"emphasis,omitempty"`
31- Lang Lang `xml:"lang"`
32- Lexicon Lexicon `xml:"lexicon,omitempty"`
33- Math string `xml:"math,omitempty"`
34- Mstts Mstts `xml:"mstts,omitempty"`
35- P string `xml:"p,omitempty"`
36- Phoneme Phoneme `xml:"phoneme,omitempty"`
37- Prosody Prosody `xml:"prosody"`
38- SayAs SayAs `xml:"say-as,omitempty"`
39- Sub string `xml:"sub,omitempty"`
40- }
41-
42- type Audio struct {
43- Src string `xml:"src"`
44- }
45-
46- type Break struct {
47- Strength string `xml:"strength,attr"`
48- Time string `xml:"time,attr"`
49- }
50-
51- type Emphasis struct {
52- Level string `xml:"level,attr"`
53- }
54-
55- type Lang struct {
56- XmlLang string `xml:"xml:lang,attr"`
57- }
58-
59- type Lexicon struct {
60- URI string `xml:"uri,attr"`
61- }
62-
63- type Mstts struct {
64- Backgroundaudio string `xml:"backgroundaudio"`
65- Ttsembedding TtsEmbedding `xml:"ttsembedding"`
66- ExpressAs ExpressAs `xml:"express-as"`
67- Silence Silence `xml:"silence"`
68- Viseme Viseme `xml:"viseme"`
69- Audioduration string `xml:"audioduration"`
70- }
71-
72- type TtsEmbedding struct {
73- SpeakerProfileId string `xml:"speakerProfileId,attr"`
74- }
75-
76- type ExpressAs struct {
77- Style string `xml:"style,attr"`
78- Styledegree string `xml:"styledegree,attr"`
79- Role string `xml:"role,attr"`
80- }
81-
82- type Silence struct {
83- Type string `xml:"type,attr"`
84- Value string `xml:"value,attr"`
85- }
86-
87- type Viseme struct {
88- Type string `xml:"type,attr"`
89- }
90-
91- type Phoneme struct {
92- Alphabet string `xml:"alphabet,attr"`
93- Ph string `xml:"ph,attr"`
14+ Name string `xml:"name,attr"`
15+ Prosody Prosody `xml:"prosody"`
9416}
9517
// Prosody is a <prosody> element. Every setting is an XML attribute and the
// text to be spoken is the element's character data.
type Prosody struct {
	// Contour represents changes in pitch. These changes are represented as
	// an array of targets at specified time positions in the speech output.
	// Sets of parameter pairs define each target. For example:
	//
	//	<prosody contour="(0%,+20Hz) (10%,-2st) (40%,+10Hz)">
	//
	// The first value in each set of parameters specifies the location of the
	// pitch change as a percentage of the duration of the text. The second
	// value specifies the amount to raise or lower the pitch by using a
	// relative value or an enumeration value for pitch (see Pitch).
	//
	// The attribute is omitted when the field is empty.
	Contour string `xml:"contour,attr,omitempty"`

	// Pitch indicates the baseline pitch for the text. Pitch changes can be
	// applied at the sentence level and should be within 0.5 to 1.5 times the
	// original audio. The pitch can be expressed as:
	//
	//   - An absolute value: a number followed by "Hz" (Hertz). For example,
	//     <prosody pitch="600Hz">some text</prosody>.
	//   - A relative number: a number preceded by "+" or "-" and followed by
	//     "Hz" or "st" that specifies an amount to change the pitch. For
	//     example, <prosody pitch="+80Hz">some text</prosody> or
	//     <prosody pitch="-2st">some text</prosody>. The "st" indicates the
	//     change unit is semitone, which is half of a tone (a half step) on
	//     the standard diatonic scale.
	//   - A percentage: a number preceded by "+" (optionally) or "-" and
	//     followed by "%", indicating the relative change. For example,
	//     <prosody pitch="50%">some text</prosody> or
	//     <prosody pitch="-50%">some text</prosody>.
	//   - A constant value: x-low, low, medium, high, x-high, or default.
	Pitch string `xml:"pitch,attr"`

	// Rate indicates the speaking rate of the text. Speaking rate can be
	// applied at the word or sentence level and should be within 0.5 to 2
	// times the original audio. The rate can be expressed as:
	//
	//   - A relative number: a number that acts as a multiplier of the
	//     default. A value of 1 results in no change in the original rate,
	//     0.5 results in a halving of the original rate, and 2 results in
	//     twice the original rate.
	//   - A percentage: a number preceded by "+" (optionally) or "-" and
	//     followed by "%", indicating the relative change. For example,
	//     <prosody rate="50%">some text</prosody> or
	//     <prosody rate="-50%">some text</prosody>.
	//   - A constant value: x-slow, slow, medium, fast, x-fast, or default.
	Rate string `xml:"rate,attr"`

	// Volume indicates the volume level of the speaking voice. Volume changes
	// can be applied at the sentence level. The volume can be expressed as:
	//
	//   - An absolute value: a number in the range of 0.0 to 100.0, from
	//     quietest to loudest, such as 75. The default value is 100.0.
	//   - A relative number: a number preceded by "+" or "-" that specifies
	//     an amount to change the volume. Examples are +10 or -5.5.
	//   - A percentage: a number preceded by "+" (optionally) or "-" and
	//     followed by "%", indicating the relative change. For example,
	//     <prosody volume="50%">some text</prosody> or
	//     <prosody volume="+3%">some text</prosody>.
	//   - A constant value: silent, x-soft, soft, medium, loud, x-loud, or
	//     default.
	Volume string `xml:"volume,attr"`

	// Text is the character data of the element, i.e. the text the prosody
	// settings apply to.
	Text string `xml:",chardata"`
}
0 commit comments