Use structs for creating SSML

czyt · czyt · commit 5e5d2272d16d · 2024-04-14T15:06:49.000+08:00
diff --git a/internal/communicate/communicate.go b/internal/communicate/communicate.go
@@ -6,6 +6,7 @@ import (
 	"crypto/tls"
 	"encoding/binary"
 	"encoding/json"
+	"encoding/xml"
 	"fmt"
 	"html"
 	"io"
@@ -29,7 +30,6 @@ import (
 
 const (
 	ssmlHeaderTemplate = "X-RequestId:%s\r\nContent-Type:application/ssml+xml\r\nX-Timestamp:%sZ\r\nPath:ssml\r\n\r\n"
-	ssmlTemplate       = "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'><voice name='%s'><prosody pitch='%s' rate='%s' volume='%s'>%s</prosody></voice></speak>"
 )
 
 var (
@@ -464,14 +464,27 @@ func splitTextByByteLength(text string, byteLength int) [][]byte {
 }
 
 func makeSsml(text string, pitch, voice string, rate string, volume string) string {
-	ssml := fmt.Sprintf(
-		ssmlTemplate,
-		voice,
-		pitch,
-		rate,
-		volume,
-		text)
-	return ssml
+	ssml := &Speak{
+		XMLName: xml.Name{Local: "speak"},
+		Version: "1.0",
+		Xmlns:   "http://www.w3.org/2001/10/synthesis",
+		Lang:    "en-US",
+		Voice: []Voice{{
+			Name: voice,
+			Prosody: Prosody{
+				Pitch:  pitch,
+				Rate:   rate,
+				Volume: volume,
+				Text:   text,
+			},
+		}},
+	}
+
+	output, err := xml.MarshalIndent(ssml, "", "  ")
+	if err != nil {
+		return ""
+	}
+	return string(output)
 }
 
 func currentTimeInMST() string {
diff --git a/internal/communicate/ssml.go b/internal/communicate/ssml.go
@@ -2,107 +2,84 @@ package communicate
 
 import "encoding/xml"
 
-// reference document at: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-structure
-
 type Speak struct {
-	XMLName         xml.Name        `xml:"speak"`
-	Version         string          `xml:"version,attr"`
-	Xmlns           string          `xml:"xmlns,attr"`
-	Mstts           string          `xml:"mstts,attr"`
-	Lang            string          `xml:"xml:lang,attr"`
-	Backgroundaudio Backgroundaudio `xml:"mstts:backgroundaudio"`
-	Voice           Voice           `xml:"voice"`
-}
-
-type Backgroundaudio struct {
-	Src     string `xml:"src,attr"`
-	Volume  string `xml:"volume,attr"`
-	Fadein  string `xml:"fadein,attr"`
-	Fadeout string `xml:"fadeout,attr"`
+	XMLName xml.Name `xml:"speak"`
+	Version string   `xml:"version,attr"`
+	Xmlns   string   `xml:"xmlns,attr"`
+	Lang    string   `xml:"xml:lang,attr"`
+	Voice   []Voice  `xml:"voice"`
 }
 
 type Voice struct {
-	Name     string   `xml:"name,attr"`
-	Effect   string   `xml:"effect,attr"`
-	Audio    Audio    `xml:"audio"`
-	Bookmark string   `xml:"bookmark,omitempty"`
-	Break    Break    `xml:"break,omitempty"`
-	Emphasis Emphasis `xml:"emphasis,omitempty"`
-	Lang     Lang     `xml:"lang"`
-	Lexicon  Lexicon  `xml:"lexicon,omitempty"`
-	Math     string   `xml:"math,omitempty"`
-	Mstts    Mstts    `xml:"mstts,omitempty"`
-	P        string   `xml:"p,omitempty"`
-	Phoneme  Phoneme  `xml:"phoneme,omitempty"`
-	Prosody  Prosody  `xml:"prosody"`
-	SayAs    SayAs    `xml:"say-as,omitempty"`
-	Sub      string   `xml:"sub,omitempty"`
-}
-
-type Audio struct {
-	Src string `xml:"src"`
-}
-
-type Break struct {
-	Strength string `xml:"strength,attr"`
-	Time     string `xml:"time,attr"`
-}
-
-type Emphasis struct {
-	Level string `xml:"level,attr"`
-}
-
-type Lang struct {
-	XmlLang string `xml:"xml:lang,attr"`
-}
-
-type Lexicon struct {
-	URI string `xml:"uri,attr"`
-}
-
-type Mstts struct {
-	Backgroundaudio string       `xml:"backgroundaudio"`
-	Ttsembedding    TtsEmbedding `xml:"ttsembedding"`
-	ExpressAs       ExpressAs    `xml:"express-as"`
-	Silence         Silence      `xml:"silence"`
-	Viseme          Viseme       `xml:"viseme"`
-	Audioduration   string       `xml:"audioduration"`
-}
-
-type TtsEmbedding struct {
-	SpeakerProfileId string `xml:"speakerProfileId,attr"`
-}
-
-type ExpressAs struct {
-	Style       string `xml:"style,attr"`
-	Styledegree string `xml:"styledegree,attr"`
-	Role        string `xml:"role,attr"`
-}
-
-type Silence struct {
-	Type  string `xml:"type,attr"`
-	Value string `xml:"value,attr"`
-}
-
-type Viseme struct {
-	Type string `xml:"type,attr"`
-}
-
-type Phoneme struct {
-	Alphabet string `xml:"alphabet,attr"`
-	Ph       string `xml:"ph,attr"`
+	Name    string  `xml:"name,attr"`
+	Prosody Prosody `xml:"prosody"`
 }
 
 type Prosody struct {
-	Pitch   string `xml:"pitch,attr"`
+	// Contour represents changes in pitch. These changes are represented as an array of targets at specified time
+	// positions in the speech output. Sets of parameter pairs define each target. For example:
+	//
+	//	<prosody contour="(0%,+20Hz) (10%,-2st) (40%,+10Hz)">
+	//
+	// The first value in each set of parameters specifies the location of the pitch change as a percentage of the
+	// duration of the text. The second value specifies the amount to raise or lower the pitch by using a relative
+	// value or an enumeration value for pitch (see Pitch).
 	Contour string `xml:"contour,attr,omitempty"`
-	Range   string `xml:"range,attr,omitempty"`
-	Rate    string `xml:"rate,attr"`
-	Volume  string `xml:"volume,attr"`
-}
-
-type SayAs struct {
-	InterpretAs string `xml:"interpret-as,attr"`
-	Format      string `xml:"format,attr"`
-	Detail      string `xml:"detail,attr"`
+	// Pitch indicates the baseline pitch for the text. Pitch changes can be applied at the sentence level. The pitch
+	// changes should be within 0.5 to 1.5 times the original audio. You can express the pitch as:
+	// An absolute value:
+	//	Expressed as a number followed by "Hz" (Hertz). For example, <prosody pitch="600Hz">some text</prosody>.
+	// A relative value:
+	//	As a relative number: Expressed as a number preceded by "+" or "-" and followed by "Hz" or "st" that specifies
+	//	an amount to change the pitch. For example:
+	//	<prosody pitch="+80Hz">some text</prosody> or <prosody pitch="-2st">some text</prosody>.
+	//	The "st" indicates the change unit is semitone, which is half of a tone (a half step) on the standard diatonic scale.
+	//	As a percentage: Expressed as a number preceded by "+" (optionally) or "-" and followed by "%", indicating the
+	//	relative change. For example: <prosody pitch="50%">some text</prosody> or <prosody pitch="-50%">some text</prosody>.
+	// A constant value:
+	//	x-low
+	//	low
+	//	medium
+	//	high
+	//	x-high
+	//	default
+	Pitch string `xml:"pitch,attr"`
+	// Rate indicates the speaking rate of the text. Speaking rate can be applied at the word or sentence level. The rate
+	// changes should be within 0.5 to 2 times the original audio. You can express rate as:
+	// A relative value:
+	//	As a relative number: Expressed as a number that acts as a multiplier of the default. For example, a value of 1 results
+	//	in no change in the original rate. A value of 0.5 results in a halving of the original rate. A value of 2 results in
+	//	twice the original rate.
+	//	As a percentage: Expressed as a number preceded by "+" (optionally) or "-" and followed by "%", indicating the relative
+	//	change. For example:
+	//	<prosody rate="50%">some text</prosody> or <prosody rate="-50%">some text</prosody>.
+	// A constant value:
+	//	x-slow
+	//	slow
+	//	medium
+	//	fast
+	//	x-fast
+	//	default
+	Rate string `xml:"rate,attr"`
+	// Volume indicates the volume level of the speaking voice. Volume changes can be applied at the sentence level. You
+	// can express the volume as:
+	// An absolute value: Expressed as a number in the range of 0.0 to 100.0, from quietest to loudest, such as 75.
+	//	The default value is 100.0.
+	// A relative value:
+	//	As a relative number: Expressed as a number preceded by "+" or "-" that specifies an amount to change the volume.
+	//	Examples are +10 or -5.5.
+	//	As a percentage: Expressed as a number preceded by "+" (optionally) or "-" and followed by "%", indicating the
+	//	relative change. For example:
+	//	<prosody volume="50%">some text</prosody> or <prosody volume="+3%">some text</prosody>.
+	//
+	// A constant value:
+	//	silent
+	//	x-soft
+	//	soft
+	//	medium
+	//	loud
+	//	x-loud
+	//	default
+	Volume string `xml:"volume,attr"`
+	Text   string `xml:",chardata"`
 }