Skip to content

Commit 428851e

Browse files
authored
Merge pull request #261 from egorky/azure-provider
Azure provider STT and TTS
2 parents 987d061 + 493ab13 commit 428851e

File tree

17 files changed

+2373
-36
lines changed

17 files changed

+2373
-36
lines changed

admin_ui/backend/api/config.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1014,7 +1014,11 @@ def get_env_key(key_name: str) -> str:
10141014
for line in f:
10151015
line = line.strip()
10161016
if line.startswith(f"{key_name}="):
1017-
return line.split('=', 1)[1].strip()
1017+
value = line.split('=', 1)[1].strip()
1018+
# Strip surrounding single or double quotes (common .env convention)
1019+
if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
1020+
value = value[1:-1]
1021+
return value
10181022
return ''
10191023

10201024
# Helper to substitute environment variables in config values
@@ -1409,6 +1413,40 @@ def _telnyx_error_summary(resp: httpx.Response) -> str:
14091413
# If it's pure local without WS (e.g. wrapper), assume success if file paths exist?
14101414
return {"success": True, "message": "Provider configuration valid (No specific connection test available)"}
14111415

1416+
# ============================================================
1417+
# AZURE SPEECH SERVICE (STT / TTS)
1418+
# ============================================================
1419+
if provider_config.get('type') == 'azure' or 'azure' in provider_name:
1420+
api_key = get_env_key('AZURE_SPEECH_KEY') or os.getenv('AZURE_SPEECH_KEY') or ''
1421+
if not api_key:
1422+
return {"success": False, "message": "AZURE_SPEECH_KEY not set in .env file"}
1423+
region = provider_config.get('region', 'eastus')
1424+
# Validate region to prevent SSRF via crafted region values
1425+
import re
1426+
_azure_region_re = re.compile(r"^[a-z][a-z0-9-]{0,48}[a-z0-9]$")
1427+
region = str(region).strip().lower()
1428+
if not region or not _azure_region_re.match(region):
1429+
return {"success": False, "message": f"Invalid Azure region '{region}'. Expected lowercase alphanumeric (e.g. 'eastus')."}
1430+
# Hit the token endpoint — a 200 response proves the key is valid; 401 means the key was rejected, and any other status is reported as a failure
1431+
token_url = f"https://{region}.api.cognitive.microsoft.com/sts/v1.0/issueToken"
1432+
try:
1433+
async with httpx.AsyncClient() as client:
1434+
response = await client.post(
1435+
token_url,
1436+
headers={"Ocp-Apim-Subscription-Key": api_key},
1437+
timeout=10.0,
1438+
)
1439+
if response.status_code == 200:
1440+
capabilities = provider_config.get('capabilities', [])
1441+
cap_str = '/'.join(str(c).upper() for c in capabilities) if capabilities else 'Speech'
1442+
return {"success": True, "message": f"Connected to Azure Speech Service ({region}). {cap_str} key valid."}
1443+
if response.status_code == 401:
1444+
return {"success": False, "message": "Invalid AZURE_SPEECH_KEY (401 Unauthorized)"}
1445+
return {"success": False, "message": f"Azure Speech API returned HTTP {response.status_code} for region '{region}'"}
1446+
except Exception as e:
1447+
logger.debug("Azure Speech provider validation failed", error=str(e), exc_info=True)
1448+
return {"success": False, "message": f"Cannot connect to Azure Speech Service at region '{region}' (see server logs)"}
1449+
14121450
return {"success": False, "message": "Unknown provider type - cannot test"}
14131451

14141452
except httpx.TimeoutException:

admin_ui/backend/api/local_ai.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ class AvailableModels(BaseModel):
140140

141141
class SwitchModelRequest(BaseModel):
142142
"""Request to switch model."""
143+
model_config = {"protected_namespaces": ()}
143144
model_type: str # stt, tts, llm
144145
backend: Optional[str] = None # For STT/TTS: vosk, sherpa, kroko, piper, kokoro
145146
model_path: Optional[str] = None # For models with paths

admin_ui/frontend/src/components/config/PipelineForm.tsx

Lines changed: 108 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ const PipelineForm: React.FC<PipelineFormProps> = ({ config, providers, onChange
3434
const [statusLoading, setStatusLoading] = useState(false);
3535
const [showAdvancedSTT, setShowAdvancedSTT] = useState(false);
3636
const [showLlmExpert, setShowLlmExpert] = useState<boolean>(
37-
() => config?.options?.llm?.tools_enabled !== undefined || Boolean(config?.options?.llm?.realtime_model)
37+
() => config?.options?.llm?.tools_enabled !== undefined || Boolean(config?.options?.llm?.realtime_model) || config?.options?.llm?.aggregation_min_words !== undefined || config?.options?.llm?.aggregation_min_chars !== undefined
3838
);
3939
const [showSttExpert, setShowSttExpert] = useState<boolean>(
4040
() => Array.isArray(config?.options?.stt?.timestamp_granularities) && config.options.stt.timestamp_granularities.length > 0
@@ -67,16 +67,19 @@ const PipelineForm: React.FC<PipelineFormProps> = ({ config, providers, onChange
6767
}, [config]);
6868

6969
useEffect(() => {
70-
if (config?.options?.llm?.tools_enabled !== undefined || config?.options?.llm?.realtime_model) {
70+
if (config?.options?.llm?.tools_enabled !== undefined || config?.options?.llm?.realtime_model || config?.options?.llm?.aggregation_min_words !== undefined || config?.options?.llm?.aggregation_min_chars !== undefined) {
7171
setShowLlmExpert(true);
7272
}
73-
}, [config?.options?.llm?.tools_enabled, config?.options?.llm?.realtime_model]);
73+
}, [config?.options?.llm?.tools_enabled, config?.options?.llm?.realtime_model, config?.options?.llm?.aggregation_min_words, config?.options?.llm?.aggregation_min_chars]);
7474

7575
useEffect(() => {
76-
if (Array.isArray(config?.options?.stt?.timestamp_granularities) && config.options.stt.timestamp_granularities.length > 0) {
76+
if ((Array.isArray(config?.options?.stt?.timestamp_granularities) && config.options.stt.timestamp_granularities.length > 0)
77+
|| config?.options?.stt?.vad_silence_ms !== undefined
78+
|| config?.options?.stt?.variant !== undefined
79+
|| config?.options?.stt?.vad_silence_timeout_ms !== undefined) {
7780
setShowSttExpert(true);
7881
}
79-
}, [config?.options?.stt?.timestamp_granularities]);
82+
}, [config?.options?.stt?.timestamp_granularities, config?.options?.stt?.vad_silence_ms, config?.options?.stt?.variant, config?.options?.stt?.vad_silence_timeout_ms]);
8083

8184
useEffect(() => {
8285
if (config?.options?.tts?.response_format !== undefined || config?.options?.tts?.max_input_chars !== undefined) {
@@ -221,6 +224,8 @@ const PipelineForm: React.FC<PipelineFormProps> = ({ config, providers, onChange
221224
const isGroqStt = sttKey.includes('groq');
222225
const isGroqTts = ttsKey.includes('groq');
223226
const isOllamaLlm = llmKey.includes('ollama');
227+
const isAzureStt = sttKey.includes('azure');
228+
const isAzureTts = ttsKey.includes('azure');
224229

225230
const timestampGranularities = Array.isArray(localConfig.options?.stt?.timestamp_granularities)
226231
? localConfig.options?.stt?.timestamp_granularities
@@ -425,6 +430,38 @@ const PipelineForm: React.FC<PipelineFormProps> = ({ config, providers, onChange
425430
disabled={!showLlmExpert}
426431
/>
427432
)}
433+
<FormInput
434+
label="LLM Min Words Threshold"
435+
type="number"
436+
min={1}
437+
step={1}
438+
value={localConfig.options?.llm?.aggregation_min_words ?? ''}
439+
onChange={(e) => {
440+
const raw = e.target.value;
441+
if (!raw) { updateRoleOptions('llm', { aggregation_min_words: undefined }); return; }
442+
const parsed = parseInt(raw, 10);
443+
if (Number.isFinite(parsed)) { updateRoleOptions('llm', { aggregation_min_words: Math.max(1, parsed) }); }
444+
}}
445+
placeholder="Auto"
446+
tooltip="Minimum words to wait before sending transcript to LLM."
447+
disabled={!showLlmExpert}
448+
/>
449+
<FormInput
450+
label="LLM Min Chars Threshold"
451+
type="number"
452+
min={1}
453+
step={1}
454+
value={localConfig.options?.llm?.aggregation_min_chars ?? ''}
455+
onChange={(e) => {
456+
const raw = e.target.value;
457+
if (!raw) { updateRoleOptions('llm', { aggregation_min_chars: undefined }); return; }
458+
const parsed = parseInt(raw, 10);
459+
if (Number.isFinite(parsed)) { updateRoleOptions('llm', { aggregation_min_chars: Math.max(1, parsed) }); }
460+
}}
461+
placeholder="Auto"
462+
tooltip="Minimum characters to wait before sending transcript to LLM."
463+
disabled={!showLlmExpert}
464+
/>
428465
</div>
429466
<div className="mt-2 border-t border-amber-300/30 pt-3 space-y-3">
430467
<p className="text-xs text-muted-foreground">
@@ -518,11 +555,11 @@ const PipelineForm: React.FC<PipelineFormProps> = ({ config, providers, onChange
518555
</div>
519556
)}
520557

521-
{(isOpenAIStt || isGroqStt) && (
558+
{(isOpenAIStt || isGroqStt || isAzureStt) && (
522559
<div className="space-y-3 border border-amber-300/40 rounded-lg p-4 bg-amber-500/5">
523560
<FormSwitch
524561
label="STT Expert Settings"
525-
description="Expose advanced STT adapter timestamp options."
562+
description="Expose advanced STT adapter options."
526563
checked={showSttExpert}
527564
onChange={(e) => setShowSttExpert(e.target.checked)}
528565
className="mb-0 border-0 p-0 bg-transparent"
@@ -533,26 +570,54 @@ const PipelineForm: React.FC<PipelineFormProps> = ({ config, providers, onChange
533570
: 'Expert values are visible and read-only until STT expert mode is enabled.'}
534571
</p>
535572
<div className="grid grid-cols-1 md:grid-cols-2 gap-4">
536-
<FormInput
537-
label="STT Timestamp Granularities"
538-
value={timestampGranularitiesText}
539-
onChange={(e) =>
540-
updateRoleOptions('stt', {
541-
timestamp_granularities: (e.target.value || '')
542-
.split(',')
543-
.map((v) => v.trim())
544-
.filter(Boolean),
545-
})
546-
}
547-
placeholder="segment, word"
548-
tooltip="Comma-separated; only supported on specific models/endpoints."
549-
disabled={!showSttExpert}
550-
/>
573+
{(isOpenAIStt || isGroqStt) && (
574+
<FormInput
575+
label="STT Timestamp Granularities"
576+
value={timestampGranularitiesText}
577+
onChange={(e) =>
578+
updateRoleOptions('stt', {
579+
timestamp_granularities: (e.target.value || '')
580+
.split(',')
581+
.map((v) => v.trim())
582+
.filter(Boolean),
583+
})
584+
}
585+
placeholder="segment, word"
586+
tooltip="Comma-separated; only supported on specific models/endpoints."
587+
disabled={!showSttExpert}
588+
/>
589+
)}
590+
{isAzureStt && (
591+
<>
592+
<div className="space-y-1">
593+
<label className="text-sm font-medium">Azure STT Variant Override</label>
594+
<select
595+
className="w-full p-2 rounded border border-input bg-background text-sm"
596+
value={localConfig.options?.stt?.variant || ''}
597+
onChange={(e) => updateRoleOptions('stt', { variant: e.target.value || undefined })}
598+
disabled={!showSttExpert}
599+
>
600+
<option value="">Use provider default</option>
601+
<option value="realtime">realtime</option>
602+
<option value="fast">fast</option>
603+
</select>
604+
<p className="text-xs text-muted-foreground">Override the variant set on the provider.</p>
605+
</div>
606+
<FormInput
607+
label="Azure STT Language Override"
608+
value={localConfig.options?.stt?.language || ''}
609+
onChange={(e) => updateRoleOptions('stt', { language: e.target.value || undefined })}
610+
placeholder="en-US"
611+
tooltip="Override the BCP-47 locale for this pipeline slot."
612+
disabled={!showSttExpert}
613+
/>
614+
</>
615+
)}
551616
</div>
552617
</div>
553618
)}
554619

555-
{(isOpenAITts || isGroqTts) && (
620+
{(isOpenAITts || isGroqTts || isAzureTts) && (
556621
<div className="space-y-3 border border-amber-300/40 rounded-lg p-4 bg-amber-500/5">
557622
<FormSwitch
558623
label="TTS Expert Settings"
@@ -587,6 +652,26 @@ const PipelineForm: React.FC<PipelineFormProps> = ({ config, providers, onChange
587652
disabled={!showTtsExpert}
588653
/>
589654
)}
655+
{isAzureTts && (
656+
<>
657+
<FormInput
658+
label="Azure TTS Voice Name Override"
659+
value={localConfig.options?.tts?.voice_name || ''}
660+
onChange={(e) => updateRoleOptions('tts', { voice_name: e.target.value || undefined })}
661+
placeholder="en-US-JennyNeural"
662+
tooltip="Override the neural voice name for this pipeline slot."
663+
disabled={!showTtsExpert}
664+
/>
665+
<FormInput
666+
label="Azure TTS Output Format Override"
667+
value={localConfig.options?.tts?.output_format || ''}
668+
onChange={(e) => updateRoleOptions('tts', { output_format: e.target.value || undefined })}
669+
placeholder="riff-8khz-16bit-mono-pcm"
670+
tooltip="Override the Azure output format for this pipeline slot."
671+
disabled={!showTtsExpert}
672+
/>
673+
</>
674+
)}
590675
</div>
591676
</div>
592677
)}

0 commit comments

Comments
 (0)