simstudioai
diff --git a/apps/docs/components/icons.tsx b/apps/docs/components/icons.tsx
Lines changed: 24 additions & 0 deletions
diff --git a/apps/docs/components/ui/icon-mapping.ts b/apps/docs/components/ui/icon-mapping.ts
Lines changed: 2 additions & 0 deletions
diff --git a/apps/docs/content/docs/en/tools/calendly.mdx b/apps/docs/content/docs/en/tools/calendly.mdx
Lines changed: 14 additions & 0 deletions
diff --git a/apps/docs/content/docs/en/tools/meta.json b/apps/docs/content/docs/en/tools/meta.json
Lines changed: 1 addition & 0 deletions
diff --git a/apps/docs/content/docs/en/tools/stt.mdx b/apps/docs/content/docs/en/tools/stt.mdx
Lines changed: 122 additions & 0 deletions
diff --git a/apps/sim/app/api/files/upload/route.ts b/apps/sim/app/api/files/upload/route.ts
Lines changed: 20 additions & 4 deletions
@@ -4084,3 +4084,27 @@ export function CalendlyIcon(props: SVGProps<SVGSVGElement>) {
     </svg>
   )
 }
+
+export function AudioWaveformIcon(props: SVGProps<SVGSVGElement>) { // Waveform glyph: six vertical strokes of varying length in a 24x24 viewBox.
+  return (
+    <svg
+      {...props} // Spread first: the attributes listed below take precedence over same-named props from the caller.
+      xmlns='http://www.w3.org/2000/svg'
+      width='24'
+      height='24'
+      viewBox='0 0 24 24'
+      fill='none'
+      stroke='currentColor'
+      strokeWidth='2'
+      strokeLinecap='round'
+      strokeLinejoin='round'
+    >
+      <path d='M2 10v3' />
+      <path d='M6 6v11' />
+      <path d='M10 3v18' />
+      <path d='M14 8v7' />
+      <path d='M18 5v13' />
+      <path d='M22 10v3' />
+    </svg>
+  )
+}
@@ -8,6 +8,7 @@ import {
   ApolloIcon,
   ArxivIcon,
   AsanaIcon,
+  AudioWaveformIcon,
   BrainIcon,
   BrowserUseIcon,
   CalendlyIcon,
@@ -100,6 +101,7 @@ export const blockTypeToIconMap: Record<string, IconComponent> = {
   telegram: TelegramIcon,
   tavily: TavilyIcon,
   supabase: SupabaseIcon,
+  stt: AudioWaveformIcon,
   stripe: StripeIcon,
   stagehand_agent: StagehandIcon,
   stagehand: StagehandIcon,
 
@@ -10,6 +10,20 @@ import { BlockInfoCard } from "@/components/ui/block-info-card"
   color="#FFFFFF"
 />
 
+{/* MANUAL-CONTENT-START:intro */}
+[Calendly](https://calendly.com/) is a popular scheduling automation platform that helps you book meetings, events, and appointments with ease. With Calendly, teams and individuals can streamline scheduling, reduce back-and-forth emails, and automate tasks around events.
+
+With the Sim Calendly integration, your agents can:
+
+- **Retrieve information about your account and scheduled events**: Use tools to fetch user info, event types, and scheduled events for analysis or automation.
+- **Manage event types and scheduling**: Access and list available event types for users or organizations, retrieve details about specific event types, and monitor scheduled meetings and invitee data.
+- **Automate follow-ups and workflows**: When users schedule, reschedule, or cancel meetings, Sim agents can automatically trigger corresponding workflows—such as sending reminders, updating CRMs, or notifying participants.
+- **Integrate easily using webhooks**: Set up Sim workflows to respond to real-time Calendly webhook events, including when invitees schedule, cancel, or interact with routing forms.
+
+Whether you want to automate meeting prep, manage invites, or run custom workflows in response to scheduling activity, the Calendly tools in Sim give you flexible and secure access. Unlock new automation by reacting instantly to scheduling changes—streamlining your team's operations and communications.
+{/* MANUAL-CONTENT-END */}
+
+
 ## Usage Instructions
 
 Integrate Calendly into your workflow. Manage event types, scheduled events, invitees, and webhooks. Can also trigger workflows based on Calendly webhook events (invitee scheduled, invitee canceled, routing form submitted). Requires Personal Access Token.
 
@@ -61,6 +61,7 @@
     "stagehand",
     "stagehand_agent",
     "stripe",
+    "stt",
     "supabase",
     "tavily",
     "telegram",
 
@@ -0,0 +1,122 @@
+---
+title: Speech-to-Text
+description: Convert speech to text using AI
+---
+
+import { BlockInfoCard } from "@/components/ui/block-info-card"
+
+<BlockInfoCard 
+  type="stt"
+  color="#181C1E"
+/>
+
+{/* MANUAL-CONTENT-START:intro */}
+Transcribe speech to text using state-of-the-art AI models from leading providers. The Sim Speech-to-Text (STT) tools allow you to convert audio and video files into accurate transcripts, supporting multiple languages, timestamps, and optional translation.
+
+Supported providers:
+
+- **[OpenAI Whisper](https://platform.openai.com/docs/guides/speech-to-text/overview)**: Advanced open-source STT model from OpenAI. Supports models such as `whisper-1` and handles a wide variety of languages and audio formats.
+- **[Deepgram](https://deepgram.com/)**: Real-time and batch STT API with deep learning models like `nova-3`, `nova-2`, and `whisper-large`. Offers features like diarization, intent recognition, and industry-specific tuning.
+- **[ElevenLabs](https://elevenlabs.io/)**: Known for high-quality speech AI, ElevenLabs provides STT models focused on accuracy and natural language understanding for numerous languages and dialects.
+
+Choose the provider and model best suited to your task—whether fast, production-grade transcription (Deepgram), highly accurate multi-language capability (Whisper), or advanced understanding and language coverage (ElevenLabs).
+{/* MANUAL-CONTENT-END */}
+
+
+## Usage Instructions
+
+Transcribe audio and video files to text using leading AI providers. Supports multiple languages, timestamps, and speaker diarization (diarization is exposed by the `stt_deepgram` tool).
+
+
+
+## Tools
+
+### `stt_whisper`
+
+Transcribe audio to text using OpenAI Whisper
+
+#### Input
+
+| Parameter | Type | Required | Description |
+| --------- | ---- | -------- | ----------- |
+| `provider` | string | Yes | STT provider \(whisper\) |
+| `apiKey` | string | Yes | OpenAI API key |
+| `model` | string | No | Whisper model to use \(default: whisper-1\) |
+| `audioFile` | file | No | Audio or video file to transcribe |
+| `audioFileReference` | file | No | Reference to audio/video file from previous blocks |
+| `audioUrl` | string | No | URL to audio or video file |
+| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection |
+| `timestamps` | string | No | Timestamp granularity: none, sentence, or word |
+| `translateToEnglish` | boolean | No | Translate audio to English |
+
+#### Output
+
+| Parameter | Type | Description |
+| --------- | ---- | ----------- |
+| `transcript` | string | Full transcribed text |
+| `segments` | array | Timestamped segments |
+| `language` | string | Detected or specified language |
+| `duration` | number | Audio duration in seconds |
+| `confidence` | number | Overall confidence score |
+
+### `stt_deepgram`
+
+Transcribe audio to text using Deepgram
+
+#### Input
+
+| Parameter | Type | Required | Description |
+| --------- | ---- | -------- | ----------- |
+| `provider` | string | Yes | STT provider \(deepgram\) |
+| `apiKey` | string | Yes | Deepgram API key |
+| `model` | string | No | Deepgram model to use \(nova-3, nova-2, whisper-large, etc.\) |
+| `audioFile` | file | No | Audio or video file to transcribe |
+| `audioFileReference` | file | No | Reference to audio/video file from previous blocks |
+| `audioUrl` | string | No | URL to audio or video file |
+| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection |
+| `timestamps` | string | No | Timestamp granularity: none, sentence, or word |
+| `diarization` | boolean | No | Enable speaker diarization |
+
+#### Output
+
+| Parameter | Type | Description |
+| --------- | ---- | ----------- |
+| `transcript` | string | Full transcribed text |
+| `segments` | array | Timestamped segments with speaker labels |
+| `language` | string | Detected or specified language |
+| `duration` | number | Audio duration in seconds |
+| `confidence` | number | Overall confidence score |
+
+### `stt_elevenlabs`
+
+Transcribe audio to text using ElevenLabs
+
+#### Input
+
+| Parameter | Type | Required | Description |
+| --------- | ---- | -------- | ----------- |
+| `provider` | string | Yes | STT provider \(elevenlabs\) |
+| `apiKey` | string | Yes | ElevenLabs API key |
+| `model` | string | No | ElevenLabs model to use \(scribe_v1, scribe_v1_experimental\) |
+| `audioFile` | file | No | Audio or video file to transcribe |
+| `audioFileReference` | file | No | Reference to audio/video file from previous blocks |
+| `audioUrl` | string | No | URL to audio or video file |
+| `language` | string | No | Language code \(e.g., "en", "es", "fr"\) or "auto" for auto-detection |
+| `timestamps` | string | No | Timestamp granularity: none, sentence, or word |
+
+#### Output
+
+| Parameter | Type | Description |
+| --------- | ---- | ----------- |
+| `transcript` | string | Full transcribed text |
+| `segments` | array | Timestamped segments |
+| `language` | string | Detected or specified language |
+| `duration` | number | Audio duration in seconds |
+| `confidence` | number | Overall confidence score |
+
+
+
+## Notes
+
+- Category: `tools`
+- Type: `stt`
@@ -13,21 +13,37 @@ import {
 } from '@/app/api/files/utils'
 
 const ALLOWED_EXTENSIONS = new Set([
+  // Documents and structured data (text, spreadsheets, JSON/YAML)
   'pdf',
   'doc',
   'docx',
   'txt',
   'md',
-  'png',
-  'jpg',
-  'jpeg',
-  'gif',
   'csv',
   'xlsx',
   'xls',
   'json',
   'yaml',
   'yml',
+  // Images
+  'png',
+  'jpg',
+  'jpeg',
+  'gif',
+  // Audio ('webm' is listed only here, although the container can also hold video)
+  'mp3',
+  'm4a',
+  'wav',
+  'webm',
+  'ogg',
+  'flac',
+  'aac',
+  'opus',
+  // Video
+  'mp4',
+  'mov',
+  'avi',
+  'mkv',
 ])
 
 function validateFileExtension(filename: string): boolean {