1+ {
2+ "cells" : [
3+ {
4+ "cell_type" : " markdown" ,
5+ "id" : " a050dfc0" ,
6+ "metadata" : {
7+ "id" : " a050dfc0"
8+ },
9+ "source" : [
10+ " # Sesame CSM (1B) - Text to Speech"
11+ ]
12+ },
13+ {
14+ "cell_type" : " markdown" ,
15+ "id" : " 7c2c61ec" ,
16+ "metadata" : {
17+ "id" : " 7c2c61ec"
18+ },
19+ "source" : [
20+ " This notebook showcases how to generate speech from text using the Sesame CSM 1B model. This is ideal for converting instructional or conversational content into natural audio output."
21+ ]
22+ },
23+ {
24+ "cell_type" : " markdown" ,
25+ "source" : [
26+ " [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhivyaBharathy-web/PraisonAI/blob/main/examples/cookbooks/Sesame_CSM_1B_TTS.ipynb)\n "
27+ ],
28+ "metadata" : {
29+ "id" : " 5SA94wTb9AH-"
30+ },
31+ "id" : " 5SA94wTb9AH-"
32+ },
33+ {
34+ "cell_type" : " markdown" ,
35+ "id" : " ab27fd8d" ,
36+ "metadata" : {
37+ "id" : " ab27fd8d"
38+ },
39+ "source" : [
40+ " ## 🧩 Dependencies"
41+ ]
42+ },
43+ {
44+ "cell_type" : " code" ,
45+ "execution_count" : null ,
46+ "id" : " ac3a627f" ,
47+ "metadata" : {
48+ "id" : " ac3a627f"
49+ },
50+ "outputs" : [],
51+ "source" : [
52+ "%pip install -q \"transformers>=4.52.1\"\n",
53+ "%pip install -q torchaudio\n",
54+ "%pip install -q soundfile"
55+ ]
56+ },
57+ {
58+ "cell_type" : " markdown" ,
59+ "id" : " a382e4d0" ,
60+ "metadata" : {
61+ "id" : " a382e4d0"
62+ },
63+ "source" : [
64+ " ## 🛠️ Tools\n " ,
65+ " - transformers for loading the model\n " ,
66+ " - torchaudio for audio processing\n " ,
67+ " - soundfile for writing the generated audio to a WAV file"
68+ ]
69+ },
70+ {
71+ "cell_type" : " markdown" ,
72+ "id" : " 7ff6eb6d" ,
73+ "metadata" : {
74+ "id" : " 7ff6eb6d"
75+ },
76+ "source" : [
77+ " ## 🧾 YAML Prompt\n " ,
78+ " ```yaml\n " ,
79+ " task: \" Text to Speech\"\n " ,
80+ " style: \" Clear, educational\"\n " ,
81+ " language: \" en\"\n " ,
82+ " ```"
83+ ]
84+ },
85+ {
86+ "cell_type" : " markdown" ,
87+ "id" : " 0ab9cda8" ,
88+ "metadata" : {
89+ "id" : " 0ab9cda8"
90+ },
91+ "source" : [
92+ " ## 🧠 Main"
93+ ]
94+ },
95+ {
96+ "cell_type" : " code" ,
97+ "execution_count" : null ,
98+ "id" : " 6459f8bd" ,
99+ "metadata" : {
100+ "id" : " 6459f8bd"
101+ },
102+ "outputs" : [],
103+ "source" : [
104+ "import torch\n",
105+ "from transformers import AutoProcessor, CsmForConditionalGeneration\n",
106+ "\n",
107+ "# sesame/csm-1b is a gated checkpoint: accept its license on the Hugging Face Hub and log in first.\n",
108+ "MODEL_ID = \"sesame/csm-1b\"\n",
109+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
110+ "processor = AutoProcessor.from_pretrained(MODEL_ID)\n",
111+ "model = CsmForConditionalGeneration.from_pretrained(MODEL_ID).to(device)\n",
112+ "\n",
113+ "# The \"[0]\" prefix is the speaker id that the CSM processor expects in front of the text.\n",
114+ "text = \"[0]Welcome to the world of voice synthesis using open models!\"\n",
115+ "inputs = processor(text, add_special_tokens=True).to(device)\n",
116+ "\n",
117+ "# output_audio=True makes generate() return decoded waveforms instead of codebook tokens.\n",
118+ "audio = model.generate(**inputs, output_audio=True)\n",
119+ "processor.save_audio(audio, \"output.wav\")"
120+ ]
121+ },
122+ {
123+ "cell_type" : " markdown" ,
124+ "id" : " 376fe112" ,
125+ "metadata" : {
126+ "id" : " 376fe112"
127+ },
128+ "source" : [
129+ " ## 📤 Output\n " ,
130+ " 🖼️ Output Preview (Text Summary):\n " ,
131+ " \n " ,
132+ " Prompt: A clear educational text is converted to a .wav file.\n " ,
133+ " \n " ,
134+ " 🎧 The output audio will say: 'Welcome to the world of voice synthesis using open models!'\n " ,
135+ " This demonstrates how Sesame CSM can be used to build TTS applications."
136+ ]
137+ }
138+ ],
139+ "metadata" : {
140+ "colab" : {
141+ "provenance" : []
142+ }
143+ },
144+ "nbformat" : 4 ,
145+ "nbformat_minor" : 5
146+ }
0 commit comments