onvif · Peggy0422 · Nov 10, 2025 · Nov 10, 2025 · Nov 10, 2025 · Nov 11, 2025
diff --git a/doc/Media2.xml b/doc/Media2.xml
@@ -2355,7 +2355,52 @@
             <para>When the size of the audio clip upload exceeds the MaxAudioClipSize parameter in KB, the device should return an HTTP 413, Request Entity Too Large error to the client.</para>
           </listitem>
         </itemizedlist>              
-      </section>    
+      </section>  
+	  <section xml:id="section_AddTTSAudioClip">
+        <title>AddTTSAudioClip</title>
+        <para>This operation adds a text, together with an audio clip configuration and a TTS configuration, to the device so that the device can convert the text to an audio clip based on the TTS configuration.
+			The response to the command includes a unique token for the converted audio clip.
+			If the device does not support the language specified in the TTS configuration, the associated configuration will be deleted from the device.</para>
+        <variablelist role="op">
+          <varlistentry>
+            <term>request</term>
+            <listitem>
+              <para role="param">Token - optional [tt:ReferenceToken]</para>
+              <para role="text"> Optional token associated with the audio clip.</para>
+              <para role="param">Configuration - 
+                [tr2:AudioClip]</para>
+              <para role="text"> Audio clip configuration to add.</para>
+              <para role="param">TTSConfiguration - 
+                [tr2:TTSAudio]</para>
+              <para role="text"> TTS configuration for converting a text to an audio clip.</para>
+            </listitem>
+          </varlistentry>
+          <varlistentry>
+            <term>response</term>
+            <listitem>
+              <para role="param">Token - [tt:ReferenceToken]</para>
+              <para role="text">Unique token assigned by device for the TTS audio clip.</para>             
+            </listitem>
+          </varlistentry>
+          <varlistentry>
+            <term>faults</term>
+            <listitem>
+              <para role="param">env:Receiver - ter:Action - ter:MaxAudioClipLimit</para>
+              <para role="text">The maximum number of audio clip configurations supported by the device has been reached.</para>
+              <para role="param">env:Sender - ter:InvalidArgVal - ter:InvalidConfig</para>
+              <para role="text">The configuration parameters are not possible to set.</para>  
+              <para role="param">env:Sender - ter:InvalidArgVal - ter:InvalidLanguage</para>
+              <para role="text">The language is not supported.</para>  
+            </listitem>
+          </varlistentry>
+          <varlistentry>
+            <term>access class</term>
+            <listitem>
+              <para role="access">WRITE_SYSTEM</para>
+            </listitem>
+          </varlistentry>
+        </variablelist>          
+      </section>       
 	  <section xml:id="section_wvd_dzg_rye">
         <title>SetAudioClip</title>
         <para>This operation modifies the existing audio clip configuration on the device.</para>
@@ -2765,6 +2810,10 @@
 		<varlistentry>
           <term>SupportedAudioClipFormat</term>
           <listitem><para>Enumerates the supported audio clip formats. See tr2: SupportedAudioClipFormat.</para></listitem>
+        </varlistentry>
+		<varlistentry>
+          <term>TTSCapabilities</term>
+          <listitem><para>Indicates that the device supports the TTS function and describes its TTS configuration capabilities. See tr2: TTSCapabilities.</para></listitem>
         </varlistentry>
 	</variablelist>
 	</section>

diff --git a/wsdl/ver20/media/wsdl/media.wsdl b/wsdl/ver20/media/wsdl/media.wsdl
@@ -202,6 +202,13 @@ IN NO EVENT WILL THE CORPORATION OR ITS MEMBERS OR THEIR AFFILIATES BE LIABLE FO
 			<!--===============================-->
 			<xs:complexType name="AudioClipCapabilities">
 				<xs:sequence>
+					<!-- Optional TTS capabilities of the device -->
+					<xs:element name="TTSCapabilities" type="tr2:TTSCapabilities" minOccurs="0">
+						<xs:annotation>
+							<xs:documentation>Indicates that the device supports TTS and describes its TTS capabilities.</xs:documentation>
+						</xs:annotation>
+					</xs:element>
+					<!-- End of TTS capabilities -->
 					<xs:any namespace="##any" processContents="lax" minOccurs="0" maxOccurs="unbounded"/>   <!-- first ONVIF then Vendor -->
 				</xs:sequence>
 				<xs:attribute name="MaxAudioClipLimit" type="xs:int">
@@ -222,6 +229,40 @@ IN NO EVENT WILL THE CORPORATION OR ITS MEMBERS OR THEIR AFFILIATES BE LIABLE FO
 				<xs:anyAttribute processContents="lax"/>
 			</xs:complexType>
 			<!--===============================-->
+			<!-- TTS capabilities advertised by the device -->
+			<xs:complexType name="TTSCapabilities">
+				<xs:sequence>
+					<xs:any namespace="##any" processContents="lax" minOccurs="0" maxOccurs="unbounded"/>   <!-- first ONVIF then Vendor -->
+				</xs:sequence>
+				<xs:attribute name="MaxContentLength" type="xs:int">
+					<xs:annotation>
+						<xs:documentation>Indicates the maximum length of the text content that the device can convert to an audio clip.</xs:documentation>
+					</xs:annotation>
+				</xs:attribute>
+				<xs:attribute name="TTSLanguage" type="tt:StringAttrList">
+					<xs:annotation>
+						<xs:documentation>
+							List of supported languages. Uses ISO 639-1 alpha-2 language codes, such as "en" for English. See <a href="https://www.loc.gov/standards/iso639-2/php/English_list.php">Codes for the Representation of Names of Languages</a>.
+							Optionally combined with ISO 3166-1 alpha-2 country codes using the "language-country" format to specify regional variations, such as "en-US" for American English. For country codes, see <a href="https://www.iso.org/obp/ui/">ISO 3166 Country Codes</a>.
+						</xs:documentation>
+					</xs:annotation>
+				</xs:attribute>
+				<xs:attribute name="TTSVoiceType" type="tt:StringAttrList">
+					<xs:annotation>
+						<xs:documentation>List of supported voice types. See tr2:TTSVoiceType.</xs:documentation>
+					</xs:annotation>
+				</xs:attribute>
+				<xs:anyAttribute processContents="lax"/>
+			</xs:complexType>
+			<!-- Voice types that can be selected for TTS audio -->
+			<xs:simpleType name="TTSVoiceType">
+				<xs:restriction base="xs:string">
+					<xs:enumeration value="male"/>
+					<xs:enumeration value="female"/>
+				</xs:restriction>
+			</xs:simpleType>
+			<!-- End of TTS capability types -->
+			<!--===============================-->
 			<xs:simpleType name="SupportedAudioClipFormat">
 				<xs:restriction base="xs:string">
 					<xs:enumeration value="audio/vnd.wave;codec=1"/>
@@ -1445,8 +1486,34 @@ IN NO EVENT WILL THE CORPORATION OR ITS MEMBERS OR THEIR AFFILIATES BE LIABLE FO
 				</xs:sequence>
 				<xs:anyAttribute processContents="lax"/>
 			</xs:complexType>
-
-			<!--===============================-->
+			<!-- TTS audio: text and voice settings used to generate a TTS audio clip -->
+			<xs:complexType name="TTSAudio">
+				<xs:sequence>
+					<xs:element name="Content" type="xs:string">
+						<xs:annotation>
+							<xs:documentation>Text content to be converted to an audio clip.</xs:documentation>
+						</xs:annotation>
+					</xs:element>
+					<xs:element name="Language" type="xs:string">
+						<xs:annotation>
+							<xs:documentation>
+								The language that is supported by the device and used for TTS audio clip playback.
+								Uses ISO 639-1 alpha-2 language codes for definition, such as "en" for English. See <a href="https://www.loc.gov/standards/iso639-2/php/English_list.php">Codes for the Representation of Names of Languages</a>.
+								Optionally combined with ISO 3166-1 alpha-2 country codes using the "language-country" format to specify regional variations, such as "en-US" for American English. For country codes, see <a href="https://www.iso.org/obp/ui/">ISO 3166 Country Codes</a>.
+							</xs:documentation>
+						</xs:annotation>
+					</xs:element>
+					<xs:element name="VoiceType" type="xs:string">
+						<xs:annotation>
+							<xs:documentation>The voice type that is supported by the device and used for TTS audio clip playback. See tr2:TTSVoiceType.</xs:documentation>
+						</xs:annotation>
+					</xs:element>
+					<xs:any namespace="##any" processContents="lax" minOccurs="0" maxOccurs="unbounded"/>   <!-- first ONVIF then Vendor -->
+				</xs:sequence>
+				<xs:anyAttribute processContents="lax"/>
+			</xs:complexType>
+			<!-- End of TTS audio -->
+            <!--===============================-->
 			<xs:complexType name="GetAudioClipsResponseItem">
 				<xs:sequence>
 					<xs:element name="Token" type="tt:ReferenceToken">
@@ -1579,7 +1646,40 @@ IN NO EVENT WILL THE CORPORATION OR ITS MEMBERS OR THEIR AFFILIATES BE LIABLE FO
 					</xs:sequence>
 				</xs:complexType>
 			</xs:element>
-
+			<!-- TTS: AddTTSAudioClip request and response elements -->
+			<xs:element name="AddTTSAudioClip">
+				<xs:complexType>
+					<xs:sequence>
+						<xs:element name="Token" type="tt:ReferenceToken" minOccurs="0">
+							<xs:annotation>
+								<xs:documentation>Optional token associated with the audio clip.</xs:documentation>
+							</xs:annotation>
+						</xs:element>
+						<xs:element name="Configuration" type="tr2:AudioClip">
+							<xs:annotation>
+								<xs:documentation>Audio clip configuration to add.</xs:documentation>
+							</xs:annotation>
+						</xs:element>
+						<xs:element name="TTSConfiguration" type="tr2:TTSAudio">
+							<xs:annotation>
+								<xs:documentation>TTS configuration for converting a text to an audio clip.</xs:documentation>
+							</xs:annotation>
+						</xs:element>
+					</xs:sequence>
+				</xs:complexType>
+			</xs:element>
+			<xs:element name="AddTTSAudioClipResponse">
+				<xs:complexType>
+					<xs:sequence>
+						<xs:element name="Token" type="tt:ReferenceToken">
+							<xs:annotation>
+								<xs:documentation>Unique token assigned by device for the TTS audio clip.</xs:documentation>
+							</xs:annotation>
+						</xs:element>
+					</xs:sequence>
+				</xs:complexType>
+			</xs:element>
+			<!-- End of TTS request and response elements -->
 			<xs:element name="DeleteAudioClip">
 				<xs:complexType>
 					<xs:sequence>
@@ -2018,6 +2118,14 @@ IN NO EVENT WILL THE CORPORATION OR ITS MEMBERS OR THEIR AFFILIATES BE LIABLE FO
 	<wsdl:message name="AddAudioClipResponse">
 		<wsdl:part name="parameters" element="tr2:AddAudioClipResponse"/>
 	</wsdl:message>
+	<!-- TTS: messages for the AddTTSAudioClip operation -->
+	<wsdl:message name="AddTTSAudioClipRequest">
+		<wsdl:part name="parameters" element="tr2:AddTTSAudioClip"/>
+	</wsdl:message>
+	<wsdl:message name="AddTTSAudioClipResponse">
+		<wsdl:part name="parameters" element="tr2:AddTTSAudioClipResponse"/>
+	</wsdl:message>
+	<!-- End of TTS messages -->
 	<wsdl:message name="SetAudioClipRequest">
 		<wsdl:part name="parameters" element="tr2:SetAudioClip"/>
 	</wsdl:message>
@@ -2412,6 +2520,13 @@ image will be updated automatically and independent from calls to GetSnapshotUri
 			<wsdl:input message="tr2:AddAudioClipRequest"/>
 			<wsdl:output message="tr2:AddAudioClipResponse"/>
 		</wsdl:operation>
+		<!-- TTS: port type operation AddTTSAudioClip -->
+		<wsdl:operation name="AddTTSAudioClip">
+			<wsdl:documentation>This operation sends a text and its configuration to a device that supports the TTS function, so that the device can convert the text into an audio clip and play it according to the audio clip configuration and the TTS configuration.</wsdl:documentation>
+			<wsdl:input message="tr2:AddTTSAudioClipRequest"/>
+			<wsdl:output message="tr2:AddTTSAudioClipResponse"/>
+		</wsdl:operation>
+		<!-- End of TTS port type operation -->
 		<wsdl:operation name="SetAudioClip">
 			<wsdl:documentation>This operation modifies the existing audio clip configuration on the device.</wsdl:documentation>
 			<wsdl:input message="tr2:SetAudioClipRequest"/>
@@ -2940,6 +3055,17 @@ image will be updated automatically and independent from calls to GetSnapshotUri
 				<soap:body use="literal"/>
 			</wsdl:output>
 		</wsdl:operation>
+		<!-- TTS: SOAP binding for the AddTTSAudioClip operation -->
+		<wsdl:operation name="AddTTSAudioClip">
+			<soap:operation soapAction="http://www.onvif.org/ver20/media/wsdl/AddTTSAudioClip"/>
+			<wsdl:input>
+				<soap:body use="literal"/>
+			</wsdl:input>
+			<wsdl:output>
+				<soap:body use="literal"/>
+			</wsdl:output>
+		</wsdl:operation>
+		<!-- End of TTS SOAP binding -->
 		<wsdl:operation name="SetAudioClip">
 			<soap:operation soapAction="http://www.onvif.org/ver20/media/wsdl/SetAudioClip"/>
 			<wsdl:input>