feat(texttospeech): add SynthesisInput.markup and new phoneticEncoding values

yoshi-automation · yoshi-automation · commit a8dde3ed3f92 · 2025-05-28T01:37:34.000Z
#### texttospeech:v1beta1

The following keys were added:
- schemas.SynthesisInput.properties.markup.description
- schemas.SynthesisInput.properties.markup.type

The following keys were changed:
- schemas.CustomPronunciationParams.properties.phoneticEncoding.enum
- schemas.CustomPronunciationParams.properties.phoneticEncoding.enumDescriptions

#### texttospeech:v1

The following keys were added:
- schemas.SynthesisInput.properties.markup.description
- schemas.SynthesisInput.properties.markup.type

The following keys were changed:
- schemas.CustomPronunciationParams.properties.phoneticEncoding.enum
- schemas.CustomPronunciationParams.properties.phoneticEncoding.enumDescriptions
diff --git a/discovery/texttospeech-v1.json b/discovery/texttospeech-v1.json
@@ -318,7 +318,7 @@
       }
     }
   },
-  "revision": "20250415",
+  "revision": "20250424",
   "rootUrl": "https://texttospeech.googleapis.com/",
   "schemas": {
     "AdvancedVoiceOptions": {
@@ -403,12 +403,16 @@
           "enum": [
             "PHONETIC_ENCODING_UNSPECIFIED",
             "PHONETIC_ENCODING_IPA",
-            "PHONETIC_ENCODING_X_SAMPA"
+            "PHONETIC_ENCODING_X_SAMPA",
+            "PHONETIC_ENCODING_JAPANESE_YOMIGANA",
+            "PHONETIC_ENCODING_PINYIN"
           ],
           "enumDescriptions": [
             "Not specified.",
             "IPA, such as apple -> ˈæpəl. https://en.wikipedia.org/wiki/International_Phonetic_Alphabet",
-            "X-SAMPA, such as apple -> \"{p@l\". https://en.wikipedia.org/wiki/X-SAMPA"
+            "X-SAMPA, such as apple -> \"{p@l\". https://en.wikipedia.org/wiki/X-SAMPA",
+            "For reading-to-pron conversion to work well, the `pronunciation` field should only contain Kanji, Hiragana, and Katakana. The pronunciation can also contain pitch accents. The start of a pitch phrase is specified with `^` and the down-pitch position is specified with `!`, for example: phrase:端 pronunciation:^はし phrase:箸 pronunciation:^は!し phrase:橋 pronunciation:^はし! We currently only support the Tokyo dialect, which allows at most one down-pitch per phrase (i.e. at most one `!` between `^`).",
+            "Used to specify pronunciations for Mandarin words. See https://en.wikipedia.org/wiki/Pinyin. For example: 朝阳, the pronunciation is \"chao2 yang2\". The number represents the tone, and there is a space between syllables. Neutral tones are represented by 5, for example 孩子 \"hai2 zi5\"."
           ],
           "type": "string"
         },
@@ -608,6 +612,10 @@
           "$ref": "CustomPronunciations",
           "description": "Optional. The pronunciation customizations are applied to the input. If this is set, the input is synthesized using the given pronunciation customizations. The initial support is for en-us, with plans to expand to other locales in the future. Instant Clone voices aren't supported. In order to customize the pronunciation of a phrase, there must be an exact match of the phrase in the input types. If using SSML, the phrase must not be inside a phoneme tag."
         },
+        "markup": {
+          "description": "Markup for HD voices specifically. This field may not be used with any other voices.",
+          "type": "string"
+        },
         "multiSpeakerMarkup": {
           "$ref": "MultiSpeakerMarkup",
           "description": "The multi-speaker input to be synthesized. Only applicable for multi-speaker synthesis."
diff --git a/discovery/texttospeech-v1beta1.json b/discovery/texttospeech-v1beta1.json
@@ -261,7 +261,7 @@
       }
     }
   },
-  "revision": "20250415",
+  "revision": "20250424",
   "rootUrl": "https://texttospeech.googleapis.com/",
   "schemas": {
     "AdvancedVoiceOptions": {
@@ -342,12 +342,16 @@
           "enum": [
             "PHONETIC_ENCODING_UNSPECIFIED",
             "PHONETIC_ENCODING_IPA",
-            "PHONETIC_ENCODING_X_SAMPA"
+            "PHONETIC_ENCODING_X_SAMPA",
+            "PHONETIC_ENCODING_JAPANESE_YOMIGANA",
+            "PHONETIC_ENCODING_PINYIN"
           ],
           "enumDescriptions": [
             "Not specified.",
             "IPA, such as apple -> ˈæpəl. https://en.wikipedia.org/wiki/International_Phonetic_Alphabet",
-            "X-SAMPA, such as apple -> \"{p@l\". https://en.wikipedia.org/wiki/X-SAMPA"
+            "X-SAMPA, such as apple -> \"{p@l\". https://en.wikipedia.org/wiki/X-SAMPA",
+            "For reading-to-pron conversion to work well, the `pronunciation` field should only contain Kanji, Hiragana, and Katakana. The pronunciation can also contain pitch accents. The start of a pitch phrase is specified with `^` and the down-pitch position is specified with `!`, for example: phrase:端 pronunciation:^はし phrase:箸 pronunciation:^は!し phrase:橋 pronunciation:^はし! We currently only support the Tokyo dialect, which allows at most one down-pitch per phrase (i.e. at most one `!` between `^`).",
+            "Used to specify pronunciations for Mandarin words. See https://en.wikipedia.org/wiki/Pinyin. For example: 朝阳, the pronunciation is \"chao2 yang2\". The number represents the tone, and there is a space between syllables. Neutral tones are represented by 5, for example 孩子 \"hai2 zi5\"."
           ],
           "type": "string"
         },
@@ -541,6 +545,10 @@
           "$ref": "CustomPronunciations",
           "description": "Optional. The pronunciation customizations are applied to the input. If this is set, the input is synthesized using the given pronunciation customizations. The initial support is for en-us, with plans to expand to other locales in the future. Instant Clone voices aren't supported. In order to customize the pronunciation of a phrase, there must be an exact match of the phrase in the input types. If using SSML, the phrase must not be inside a phoneme tag."
         },
+        "markup": {
+          "description": "Markup for HD voices specifically. This field may not be used with any other voices.",
+          "type": "string"
+        },
         "multiSpeakerMarkup": {
           "$ref": "MultiSpeakerMarkup",
           "description": "The multi-speaker input to be synthesized. Only applicable for multi-speaker synthesis."
diff --git a/src/apis/texttospeech/v1.ts b/src/apis/texttospeech/v1.ts
@@ -313,6 +313,10 @@ export namespace texttospeech_v1 {
      * Optional. The pronunciation customizations are applied to the input. If this is set, the input is synthesized using the given pronunciation customizations. The initial support is for en-us, with plans to expand to other locales in the future. Instant Clone voices aren't supported. In order to customize the pronunciation of a phrase, there must be an exact match of the phrase in the input types. If using SSML, the phrase must not be inside a phoneme tag.
      */
     customPronunciations?: Schema$CustomPronunciations;
+    /**
+     * Markup for HD voices specifically. This field may not be used with any other voices.
+     */
+    markup?: string | null;
     /**
      * The multi-speaker input to be synthesized. Only applicable for multi-speaker synthesis.
      */
diff --git a/src/apis/texttospeech/v1beta1.ts b/src/apis/texttospeech/v1beta1.ts
@@ -303,6 +303,10 @@ export namespace texttospeech_v1beta1 {
      * Optional. The pronunciation customizations are applied to the input. If this is set, the input is synthesized using the given pronunciation customizations. The initial support is for en-us, with plans to expand to other locales in the future. Instant Clone voices aren't supported. In order to customize the pronunciation of a phrase, there must be an exact match of the phrase in the input types. If using SSML, the phrase must not be inside a phoneme tag.
      */
     customPronunciations?: Schema$CustomPronunciations;
+    /**
+     * Markup for HD voices specifically. This field may not be used with any other voices.
+     */
+    markup?: string | null;
     /**
      * The multi-speaker input to be synthesized. Only applicable for multi-speaker synthesis.
      */