diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index 63bb56e77..6655369d2 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -84,3 +84,7 @@ FSMN fsmn subarray topp +kokoro +phonemizer +phonemizers +phonemis \ No newline at end of file diff --git a/apps/speech-to-text/App.tsx b/apps/speech-to-text/App.tsx deleted file mode 100644 index 53a7464e0..000000000 --- a/apps/speech-to-text/App.tsx +++ /dev/null @@ -1,6 +0,0 @@ -import React from 'react'; -import { SpeechToTextScreen } from './screens/SpeechToTextScreen'; - -export default function App() { - return ; -} diff --git a/apps/speech-to-text/.gitignore b/apps/speech/.gitignore similarity index 100% rename from apps/speech-to-text/.gitignore rename to apps/speech/.gitignore diff --git a/apps/speech/App.tsx b/apps/speech/App.tsx new file mode 100644 index 000000000..bb5196e63 --- /dev/null +++ b/apps/speech/App.tsx @@ -0,0 +1,92 @@ +import React, { useState } from 'react'; +import { View, Text, StyleSheet, TouchableOpacity } from 'react-native'; +import { TextToSpeechScreen } from './screens/TextToSpeechScreen'; +import { SpeechToTextScreen } from './screens/SpeechToTextScreen'; +import ColorPalette from './colors'; +import ExecutorchLogo from './assets/executorch.svg'; +import { Quiz } from './screens/Quiz'; + +export default function App() { + const [currentScreen, setCurrentScreen] = useState< + 'menu' | 'speech-to-text' | 'text-to-speech' | 'quiz' + >('menu'); + + if (currentScreen === 'text-to-speech') { + return ; + } + + if (currentScreen === 'speech-to-text') { + return ; + } + + if (currentScreen === 'quiz') { + return ; + } + + return ( + + + Select a demo model + + setCurrentScreen('speech-to-text')} + > + Speech to Text + + setCurrentScreen('text-to-speech')} + > + Text to Speech + + setCurrentScreen('quiz')} + > + Text to Speech - Quiz + + + + ); +} + +export const fontSizes = { + xxl: 34, + xl: 22, + lg: 18, + md: 16, + sm: 14, + xs: 12, + xxs: 10, +}; + +const styles = 
StyleSheet.create({ + container: { + flex: 1, + justifyContent: 'center', + alignItems: 'center', + backgroundColor: '#fff', + }, + headerText: { + fontSize: fontSizes.lg, + color: ColorPalette.strongPrimary, + margin: 20, + }, + buttonContainer: { + width: '80%', + justifyContent: 'space-evenly', + marginBottom: 20, + }, + button: { + backgroundColor: ColorPalette.strongPrimary, + borderRadius: 8, + padding: 10, + alignItems: 'center', + marginBottom: 10, + }, + buttonText: { + color: 'white', + fontSize: fontSizes.md, + }, +}); diff --git a/apps/speech-to-text/app.json b/apps/speech/app.json similarity index 81% rename from apps/speech-to-text/app.json rename to apps/speech/app.json index 80e133d16..693c815cb 100644 --- a/apps/speech-to-text/app.json +++ b/apps/speech/app.json @@ -1,7 +1,7 @@ { "expo": { - "name": "speech-to-text", - "slug": "speech-to-text", + "name": "speech", + "slug": "speech", "version": "1.0.0", "orientation": "portrait", "icon": "./assets/icon.png", @@ -14,7 +14,7 @@ }, "ios": { "supportsTablet": true, - "bundleIdentifier": "com.anonymous.speechtotext", + "bundleIdentifier": "com.anonymous.speech", "infoPlist": { "NSMicrophoneUsageDescription": "This app needs access to your microphone to record audio." 
} @@ -24,7 +24,7 @@ "foregroundImage": "./assets/adaptive-icon.png", "backgroundColor": "#ffffff" }, - "package": "com.anonymous.speechtotext" + "package": "com.anonymous.speech" }, "web": { "favicon": "./assets/favicon.png" diff --git a/apps/speech-to-text/assets/adaptive-icon.png b/apps/speech/assets/adaptive-icon.png similarity index 100% rename from apps/speech-to-text/assets/adaptive-icon.png rename to apps/speech/assets/adaptive-icon.png diff --git a/apps/speech/assets/executorch.svg b/apps/speech/assets/executorch.svg new file mode 100644 index 000000000..e548ea420 --- /dev/null +++ b/apps/speech/assets/executorch.svg @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/apps/speech-to-text/assets/favicon.png b/apps/speech/assets/favicon.png similarity index 100% rename from apps/speech-to-text/assets/favicon.png rename to apps/speech/assets/favicon.png diff --git a/apps/speech-to-text/assets/icon.png b/apps/speech/assets/icon.png similarity index 100% rename from apps/speech-to-text/assets/icon.png rename to apps/speech/assets/icon.png diff --git a/apps/speech/assets/quiz-data.ts b/apps/speech/assets/quiz-data.ts new file mode 100644 index 000000000..71cfdc544 --- /dev/null +++ b/apps/speech/assets/quiz-data.ts @@ -0,0 +1,864 @@ +// --- Data --- +// cspell:ignoreRegExp /export const QUESTIONS = \[[\s\S]*?\];/ +export const QUESTIONS = [ + { + q: 'What is the capital of Japan?', + a: ['Beijing', 'Seoul', 'Tokyo', 'Kyoto'], + c: 2, + e: 'Tokyo is the capital of Japan.', + context: + "Tokyo, formerly known as Edo, became the capital of Japan in 1868. It is one of the world's most populous cities and a major center for finance, culture, and technology. 
Tokyo hosts the Imperial Palace and is famous for its blend of traditional and modern architecture.", + }, + { + q: 'Who wrote "Romeo and Juliet"?', + a: ['Mark Twain', 'William Shakespeare', 'Charles Dickens', 'Jane Austen'], + c: 1, + e: 'William Shakespeare wrote Romeo and Juliet.', + context: + 'William Shakespeare was an English playwright and poet, widely regarded as the greatest writer in the English language. "Romeo and Juliet" is one of his most famous tragedies, exploring themes of love, fate, and family conflict in Renaissance Italy.', + }, + { + q: 'Which planet has the most moons in our solar system?', + a: ['Mars', 'Earth', 'Jupiter', 'Venus'], + c: 2, + e: 'Jupiter has the most known moons.', + context: + 'Jupiter has over 90 known moons, including Ganymede, the largest moon in the solar system. Its strong gravity allows it to capture many objects as moons. Saturn also has many moons, but Jupiter currently holds the record.', + }, + { + q: 'What gas do plants use for photosynthesis?', + a: ['Oxygen', 'Carbon dioxide', 'Nitrogen', 'Helium'], + c: 1, + e: 'Plants use carbon dioxide for photosynthesis.', + context: + 'Photosynthesis is the process by which plants convert carbon dioxide and water into glucose and oxygen using sunlight. This process is essential for life on Earth, as it provides oxygen and food for many organisms.', + }, + { + q: 'Who painted the Sistine Chapel ceiling?', + a: ['Raphael', 'Leonardo da Vinci', 'Michelangelo', 'Donatello'], + c: 2, + e: 'Michelangelo painted the Sistine Chapel ceiling.', + context: + 'Michelangelo was an Italian Renaissance artist known for his sculptures and paintings. 
The Sistine Chapel ceiling, painted between 1508 and 1512, features scenes from the Book of Genesis and is considered a masterpiece of Western art.', + }, + { + q: 'What is the largest continent by land area?', + a: ['Africa', 'Asia', 'Europe', 'Antarctica'], + c: 1, + e: 'Asia is the largest continent by land area.', + context: + "Asia covers about 30% of Earth's land area and is home to more than half of the world's population. It includes diverse regions such as the Middle East, South Asia, East Asia, and Siberia.", + }, + { + q: 'Which element has the chemical symbol "O"?', + a: ['Gold', 'Oxygen', 'Silver', 'Iron'], + c: 1, + e: 'O is the symbol for oxygen.', + context: + "Oxygen is a vital element for life, making up about 21% of Earth's atmosphere. It is essential for respiration in most living organisms and is highly reactive, forming compounds with many other elements.", + }, + { + q: 'In which year did the first man land on the moon?', + a: ['1969', '1959', '1979', '1965'], + c: 0, + e: 'The first moon landing was in 1969.', + context: + 'Apollo 11 was the mission that first landed humans on the Moon. Neil Armstrong and Buzz Aldrin walked on the lunar surface, while Michael Collins orbited above. The event marked a major milestone in space exploration.', + }, + { + q: 'What is the smallest prime number?', + a: ['0', '1', '2', '3'], + c: 2, + e: '2 is the smallest prime number.', + context: + 'A prime number is a natural number greater than 1 that has no positive divisors other than 1 and itself. 2 is the only even prime number, as all other even numbers are divisible by 2.', + }, + { + q: 'Which ocean lies on the east coast of the United States?', + a: ['Pacific', 'Atlantic', 'Indian', 'Arctic'], + c: 1, + e: 'The Atlantic Ocean is on the east coast.', + context: + 'The Atlantic Ocean is the second-largest ocean and separates North America from Europe and Africa. 
Major cities like New York, Miami, and Boston are located along its coast.', + }, + { + q: 'What currency is used in the United Kingdom?', + a: ['Euro', 'Pound', 'Dollar', 'Franc'], + c: 1, + e: 'The British pound is the currency of the UK.', + context: + 'The pound sterling, commonly known as the pound, is the official currency of the United Kingdom. It is one of the oldest currencies still in use and is subdivided into 100 pence.', + }, + { + q: 'Who discovered penicillin?', + a: ['Alexander Fleming', 'Marie Curie', 'Louis Pasteur', 'Gregor Mendel'], + c: 0, + e: 'Alexander Fleming discovered penicillin.', + context: + 'Penicillin was the first true antibiotic discovered by Alexander Fleming in 1928. It has saved countless lives by effectively treating bacterial infections.', + }, + { + q: 'What is the human body organ that pumps blood?', + a: ['Liver', 'Lung', 'Heart', 'Kidney'], + c: 2, + e: 'The heart pumps blood in the body.', + context: + 'The heart is a muscular organ about the size of a fist, located slightly left of the center of the chest. It pumps blood through the circulatory system, supplying oxygen and nutrients to the body.', + }, + { + q: 'Which country is known for the pyramids at Giza?', + a: ['Mexico', 'Peru', 'Egypt', 'Sudan'], + c: 2, + e: 'Egypt is famous for the pyramids at Giza.', + context: + 'The Giza pyramid complex is one of the most famous archaeological sites in the world. The Great Pyramid of Giza is the largest pyramid in Egypt and one of the Seven Wonders of the Ancient World.', + }, + { + q: 'Which language has the most native speakers worldwide?', + a: ['English', 'Spanish', 'Mandarin', 'Hindi'], + c: 2, + e: 'Mandarin Chinese has the most native speakers.', + context: + 'Mandarin is the most widely spoken language in the world, with over a billion native speakers. 
It is the official language of China and Taiwan, and one of the official languages of Singapore.', + }, + { + q: 'What is H2O commonly called?', + a: ['Salt', 'Hydrogen', 'Water', 'Oxygen'], + c: 2, + e: 'H2O is the chemical formula for water.', + context: + "Water is essential for all known forms of life. It covers about 71% of Earth's surface and is vital for drinking, agriculture, and industry.", + }, + { + q: 'Which metal is liquid at room temperature?', + a: ['Mercury', 'Gold', 'Silver', 'Copper'], + c: 0, + e: 'Mercury is liquid at room temperature.', + context: + 'Mercury is the only metal that is liquid at standard conditions for temperature and pressure. It is used in thermometers, barometers, and some electrical switches.', + }, + { + q: 'What is the fastest land animal?', + a: ['Lion', 'Cheetah', 'Horse', 'Kangaroo'], + c: 1, + e: 'The cheetah is the fastest land animal.', + context: + 'The cheetah can reach speeds of up to 75 miles per hour (120 kilometers per hour) in short bursts covering distances up to 500 meters. It is built for speed with a lightweight body and long legs.', + }, + { + q: 'Which organ in plants makes food using sunlight?', + a: ['Root', 'Stem', 'Leaf', 'Flower'], + c: 2, + e: 'Leaves perform photosynthesis to make food.', + context: + 'Photosynthesis occurs in the chloroplasts of plant cells, which contain chlorophyll that captures sunlight. This process converts carbon dioxide and water into glucose and oxygen.', + }, + { + q: 'Who composed the Fifth Symphony known as "fate"?', + a: ['Mozart', 'Beethoven', 'Bach', 'Chopin'], + c: 1, + e: 'Beethoven composed the Fifth Symphony.', + context: + 'Ludwig van Beethoven was a German composer and pianist. 
His Fifth Symphony, composed between 1804 and 1808, is one of the most performed symphonies and is known for its distinctive four-note motif.', + }, + { + q: 'What is the boiling point of water at sea level in Celsius?', + a: ['90', '95', '100', '105'], + c: 2, + e: 'Water boils at 100 degrees Celsius at sea level.', + context: + 'The boiling point of water can change depending on the atmospheric pressure. At higher altitudes, water boils at a lower temperature due to reduced pressure.', + }, + { + q: 'Which city is known as the Big Apple?', + a: ['Los Angeles', 'Chicago', 'New York', 'Miami'], + c: 2, + e: 'New York City is nicknamed the Big Apple.', + context: + 'The nickname "The Big Apple" originally referred to New York City\'s horse racing tracks. It later became popularized in the 1970s and is now a widely recognized nickname for the city.', + }, + { + q: 'What is the longest river in the world by length?', + a: ['Nile', 'Amazon', 'Yangtze', 'Mississippi'], + c: 0, + e: 'The Nile has long been considered the longest.', + context: + 'The Nile River in Africa is approximately 6650 kilometers (4130 miles) long. It flows through eleven countries and is essential for agriculture and water supply in the region.', + }, + { + q: 'Which planet is closest to the Sun?', + a: ['Venus', 'Mercury', 'Earth', 'Mars'], + c: 1, + e: 'Mercury is the planet closest to the Sun.', + context: + 'Mercury is the smallest planet in our solar system and orbits the Sun at an average distance of about 57.91 million kilometers (36 million miles).', + }, + { + q: 'Who developed the theory of relativity?', + a: ['Isaac Newton', 'Albert Einstein', 'Niels Bohr', 'Galileo'], + c: 1, + e: 'Albert Einstein developed the theory of relativity.', + context: + "Einstein's theory of relativity, published in 1905, revolutionized our understanding of space, time, and gravity. 
It introduced the famous equation E=mc², linking energy and mass.", + }, + { + q: 'What is the main language spoken in Brazil?', + a: ['Spanish', 'Portuguese', 'French', 'English'], + c: 1, + e: 'Portuguese is the main language in Brazil.', + context: + "Brazil is the largest country in South America and the only one in the region where Portuguese is the official language. This is due to Brazil's colonization by Portugal in the 16th century.", + }, + { + q: 'What instrument has keys, pedals, and strings?', + a: ['Guitar', 'Violin', 'Piano', 'Flute'], + c: 2, + e: 'The piano has keys, pedals, and strings.', + context: + 'The piano is a musical instrument played by pressing keys that cause hammers to strike strings, producing sound. It is widely used in classical and popular music.', + }, + { + q: 'Which country gifted the Statue of Liberty to the USA?', + a: ['Germany', 'France', 'Italy', 'Spain'], + c: 1, + e: 'France gifted the Statue of Liberty to the USA.', + context: + 'The Statue of Liberty was a gift from the people of France to the United States, dedicated in 1886. It was designed by French sculptor Frédéric Auguste Bartholdi and symbolizes freedom and democracy.', + }, + { + q: "Which gas is most abundant in Earth's atmosphere?", + a: ['Oxygen', 'Carbon dioxide', 'Nitrogen', 'Argon'], + c: 2, + e: 'Nitrogen is the most abundant gas in the atmosphere.', + context: + "Nitrogen makes up about 78% of Earth's atmosphere by volume. It is a colorless, odorless gas that is essential for life, as it is a key component of amino acids and nucleic acids.", + }, + { + q: 'What is the chemical symbol for gold?', + a: ['Au', 'Ag', 'Gd', 'Go'], + c: 0, + e: 'Au is the chemical symbol for gold.', + context: + 'Gold is a dense, malleable metal with the chemical symbol Au (from Latin: aurum) and atomic number 79. 
It is highly valued for its use in jewelry, currency, and other arts.', + }, + { + q: 'Who painted "The Starry Night"?', + a: ['Paul Cezanne', 'Vincent van Gogh', 'Pablo Picasso', 'Claude Monet'], + c: 1, + e: 'Vincent van Gogh painted The Starry Night.', + context: + "The Starry Night is one of Vincent van Gogh's most famous paintings, created in 1889. It depicts a swirling night sky over a quiet town, expressing van Gogh's emotional turmoil and fascination with the night.", + }, + { + q: 'Which year did World War 2 end?', + a: ['1944', '1945', '1946', '1947'], + c: 1, + e: 'World War 2 ended in 1945.', + context: + "World War 2 was a global conflict that lasted from 1939 to 1945. It involved most of the world's nations and resulted in significant changes to the global political and social landscape.", + }, + { + q: 'What is the largest mammal?', + a: ['Elephant', 'Blue whale', 'Giraffe', 'Hippopotamus'], + c: 1, + e: 'The blue whale is the largest mammal.', + context: + 'The blue whale is the largest animal known to have ever existed, reaching lengths of up to 100 feet (30 meters) and weights of up to 200 tons. They are found in oceans worldwide and primarily eat small shrimp-like animals called krill.', + }, + { + q: 'Which element is needed to make steel?', + a: ['Carbon', 'Helium', 'Nitrogen', 'Neon'], + c: 0, + e: 'Carbon is combined with iron to make steel.', + context: + 'Steel is an alloy made primarily of iron and carbon. The carbon content determines the hardness and strength of the steel. Other elements may also be added to create different types of steel.', + }, + { + q: 'Who is the author of the Harry Potter series?', + a: ['C S Lewis', 'J R R Tolkien', 'J K Rowling', 'Philip Pullman'], + c: 2, + e: 'J K Rowling wrote the Harry Potter series.', + context: + 'The Harry Potter series is a globally popular fantasy book series written by J.K. Rowling. 
It follows the life and adventures of a young wizard, Harry Potter, and his friends.', + }, + { + q: 'Which country uses the Yen as its currency?', + a: ['China', 'Japan', 'South Korea', 'Vietnam'], + c: 1, + e: 'Japan uses the Yen as its currency.', + context: + 'The yen is the official currency of Japan, introduced in 1871. It is one of the most traded currencies in the world and is known for its stability.', + }, + { + q: 'Which vitamin is produced when skin is exposed to sunlight?', + a: ['Vitamin A', 'Vitamin B', 'Vitamin C', 'Vitamin D'], + c: 3, + e: 'Vitamin D is produced in skin after sunlight exposure.', + context: + 'Vitamin D is essential for maintaining healthy bones and teeth, and it plays a role in immune system function. The body produces vitamin D in response to skin being exposed to sunlight.', + }, + { + q: 'What is the tallest mountain in the world above sea level?', + a: ['K2', 'Kangchenjunga', 'Mount Everest', 'Lhotse'], + c: 2, + e: 'Mount Everest is the tallest above sea level.', + context: + "Mount Everest, located in the Himalayas on the border of Nepal and the Tibet Autonomous Region of China, is the Earth's highest mountain above sea level, with a peak at 8848.86 meters (29031.7 ft).", + }, + { + q: 'What device converts alternating current to direct current?', + a: ['Transformer', 'Rectifier', 'Generator', 'Inverter'], + c: 1, + e: 'A rectifier converts AC to DC.', + context: + 'A rectifier is an electrical device that converts alternating current (AC) to direct current (DC). It allows current to flow in one direction only, effectively converting the AC waveform to a DC waveform.', + }, + { + q: 'Which two colors make green when mixed in paint?', + a: ['Red and Blue', 'Blue and Yellow', 'Red and Yellow', 'Blue and Green'], + c: 1, + e: 'Blue and yellow mix to make green.', + context: + 'In color theory, blue and yellow are primary colors that, when mixed together, create green, which is a secondary color. 
This is due to the way our eyes perceive color and the way light wavelengths combine.', + }, + { + q: 'Who was the first president of the United States?', + a: [ + 'Abraham Lincoln', + 'George Washington', + 'Thomas Jefferson', + 'John Adams', + ], + c: 1, + e: 'George Washington was the first president of the United States.', + context: + 'George Washington was unanimously elected as the first President of the United States in 1788. He served two terms from 1789 to 1797 and is often called the "Father of His Country".', + }, + { + q: 'Which organ breaks down food and absorbs nutrients?', + a: ['Lung', 'Kidney', 'Stomach and intestine', 'Heart'], + c: 2, + e: 'Stomach and intestines digest and absorb nutrients.', + context: + 'The digestive system breaks down food into smaller molecules, which are then absorbed into the bloodstream through the walls of the intestines. The stomach and intestines play key roles in this process.', + }, + { + q: 'Which bird is known for its ability to mimic human speech?', + a: ['Eagle', 'Parrot', 'Sparrow', 'Ostrich'], + c: 1, + e: 'Parrots can mimic human speech.', + context: + 'Some species of parrots are known for their ability to imitate human speech and other sounds. This ability varies among individual birds and is thought to be a form of social learning.', + }, + { + q: 'What is the study of past human activity called?', + a: ['Anthropology', 'Archaeology', 'Sociology', 'Geology'], + c: 1, + e: 'Archaeology studies past human activity.', + context: + 'Archaeology is the scientific study of ancient cultures and human activity through the examination of artifacts, structures, and other physical remains.', + }, + { + q: 'Which substance makes up the majority of the Sun?', + a: ['Iron', 'Hydrogen', 'Carbon', 'Silicon'], + c: 1, + e: 'Hydrogen is the main element in the Sun.', + context: + 'The Sun is composed of about 74% hydrogen, 24% helium, and 2% heavier elements. 
Hydrogen is the primary fuel for the nuclear fusion reactions that power the Sun.', + }, + { + q: 'Who invented the telephone?', + a: [ + 'Thomas Edison', + 'Alexander Graham Bell', + 'Nikola Tesla', + 'Guglielmo Marconi', + ], + c: 1, + e: 'Alexander Graham Bell is credited with the telephone.', + context: + 'Alexander Graham Bell was a Scottish-born inventor, scientist, and teacher who is credited with inventing the first practical telephone. He was awarded the first US patent for the invention of the telephone.', + }, + { + q: 'What is the capital of Canada?', + a: ['Toronto', 'Montreal', 'Vancouver', 'Ottawa'], + c: 3, + e: 'Ottawa is the capital of Canada.', + context: + 'Ottawa is the capital city of Canada, located in the province of Ontario. It became the capital in 1857 and is home to many national institutions, including the Parliament of Canada.', + }, + { + q: 'Which planet is known for its rings?', + a: ['Mars', 'Jupiter', 'Saturn', 'Uranus'], + c: 2, + e: 'Saturn is famous for its rings.', + context: + "Saturn is the sixth planet from the Sun and is known for its prominent ring system, which is made up of ice particles, rocky debris, and dust. The rings are thought to be remnants of moons or comets that were torn apart by Saturn's gravity.", + }, + { + q: 'Which chemical is used as table salt?', + a: ['Sodium chloride', 'Potassium', 'Magnesium', 'Calcium'], + c: 0, + e: 'Table salt is sodium chloride.', + context: + 'Table salt is chemically known as sodium chloride (NaCl). It is composed of sodium ions and chloride ions and is used in food preparation and preservation.', + }, + { + q: 'Who painted "Guernica"?', + a: ['Salvador Dali', 'Pablo Picasso', 'Henri Matisse', 'Jackson Pollock'], + c: 1, + e: 'Pablo Picasso painted Guernica.', + context: + 'Guernica is a mural-sized oil painting on canvas by Spanish artist Pablo Picasso, completed in 1937. 
It is one of the most famous anti-war artworks, depicting the suffering caused by war and violence.', + }, + { + q: 'What is the largest desert in the world?', + a: ['Sahara', 'Gobi', 'Arabian', 'Antarctic desert'], + c: 3, + e: 'The Antarctic is the largest desert by area.', + context: + 'The Antarctic Desert is the largest desert in the world, covering an area of about 14 million square kilometers (5.5 million square miles). It is classified as a desert due to its extremely low humidity and precipitation.', + }, + { + q: 'Which sport uses a shuttlecock?', + a: ['Tennis', 'Badminton', 'Squash', 'Table tennis'], + c: 1, + e: 'Badminton uses a shuttlecock.', + context: + "Badminton is a racquet sport played using shuttlecocks and a lightweight racquet. The game is played on a rectangular court divided by a net, and the objective is to hit the shuttlecock over the net and into the opponent's court.", + }, + { + q: 'What is the freezing point of water in Fahrenheit?', + a: ['0', '32', '100', '212'], + c: 1, + e: 'Water freezes at 32 degrees Fahrenheit.', + context: + 'The freezing point of water is 32 degrees Fahrenheit (0 degrees Celsius) at standard atmospheric pressure. At this temperature, water molecules slow down and form a crystalline structure, resulting in ice.', + }, + { + q: 'Which continent has the most countries?', + a: ['Asia', 'Africa', 'Europe', 'South America'], + c: 1, + e: 'Africa has the most countries of any continent.', + context: + 'Africa is the second-largest and second-most populous continent, with 54 recognized sovereign states. It has a diverse range of cultures, languages, and ecosystems.', + }, + { + q: 'What is the main ingredient in traditional sushi?', + a: ['Beef', 'Rice', 'Potatoes', 'Cheese'], + c: 1, + e: 'Rice is the main ingredient in sushi.', + context: + 'Sushi is a Japanese dish typically made with vinegared rice, raw fish, and other ingredients like vegetables and seaweed. 
The rice is the essential component that defines sushi.', + }, + { + q: 'Which famous physicist wrote "A Brief History of Time"?', + a: ['Richard Feynman', 'Stephen Hawking', 'Carl Sagan', 'Brian Cox'], + c: 1, + e: 'Stephen Hawking wrote A Brief History of Time.', + context: + 'A Brief History of Time is a popular science book written by physicist Stephen Hawking. It explains complex concepts in cosmology, such as the Big Bang, black holes, and light cones, in accessible language.', + }, + { + q: 'Which city is the capital of Australia?', + a: ['Sydney', 'Melbourne', 'Canberra', 'Perth'], + c: 2, + e: 'Canberra is the capital of Australia.', + context: + 'Canberra is the capital city of Australia, located in the Australian Capital Territory. It was selected as the capital in 1908 as a compromise between rivals Sydney and Melbourne.', + }, + { + q: 'What is the largest organ in the human body?', + a: ['Liver', 'Skin', 'Heart', 'Brain'], + c: 1, + e: 'Skin is the largest organ of the human body.', + context: + "The skin is the body's largest organ, covering an area of about 2 square meters (22 square feet) in adults. It protects internal organs, regulates temperature, and enables the sense of touch.", + }, + { + q: 'Which gas do humans inhale to survive?', + a: ['Carbon dioxide', 'Nitrogen', 'Oxygen', 'Helium'], + c: 2, + e: 'Humans inhale oxygen to survive.', + context: + 'Oxygen is essential for human survival as it is required for cellular respiration, the process by which cells produce energy. Humans inhale air containing oxygen through the respiratory system.', + }, + { + q: 'Who developed the theory of evolution by natural selection?', + a: ['Gregor Mendel', 'Charles Darwin', 'Louis Pasteur', 'Alfred Wallace'], + c: 1, + e: 'Charles Darwin proposed natural selection.', + context: + 'Charles Darwin was an English naturalist, geologist, and biologist best known for his contributions to the science of evolution. 
He proposed the theory of natural selection as the mechanism of evolution.', + }, + { + q: 'What instrument measures temperature?', + a: ['Barometer', 'Thermometer', 'Hygrometer', 'Ammeter'], + c: 1, + e: 'A thermometer measures temperature.', + context: + 'A thermometer is a device that measures temperature, typically using a glass tube filled with mercury or alcohol that expands and contracts with temperature changes.', + }, + { + q: 'Which country has the largest population?', + a: ['India', 'United States', 'China', 'Russia'], + c: 2, + e: 'China has the largest population.', + context: + 'China is the most populous country in the world, with a population of over 1.4 billion people. It is followed by India, the United States, and Indonesia.', + }, + { + q: 'What is the chemical formula for table sugar (sucrose)?', + a: ['C6H12O6', 'C12H22O11', 'H2O', 'CO2'], + c: 1, + e: 'Sucrose has formula C12H22O11.', + context: + 'Table sugar, or sucrose, is a carbohydrate composed of glucose and fructose units. It is commonly used as a sweetener in food and drinks.', + }, + { + q: 'Which author wrote "Pride and Prejudice"?', + a: ['Emily Bronte', 'Charlotte Bronte', 'Jane Austen', 'Mary Shelley'], + c: 2, + e: 'Jane Austen wrote Pride and Prejudice.', + context: + 'Pride and Prejudice is a novel written by Jane Austen, published in 1813. It is a romantic fiction that critiques the British landed gentry at the end of the 18th century.', + }, + { + q: 'What is the largest island in the world?', + a: ['Greenland', 'Madagascar', 'Borneo', 'New Guinea'], + c: 0, + e: 'Greenland is the largest island.', + context: + "Greenland is the world's largest island that is not a continent. 
It is located between the Arctic and Atlantic Oceans and is an autonomous territory within the Kingdom of Denmark.", + }, + { + q: 'Which planet is known as the Red Planet?', + a: ['Earth', 'Mars', 'Venus', 'Mercury'], + c: 1, + e: 'Mars is known as the Red Planet.', + context: + 'Mars is often called the Red Planet because of its reddish appearance, which is due to iron oxide (rust) on its surface. It is the fourth planet from the Sun and has the largest dust storms in the solar system.', + }, + { + q: 'Who is credited with inventing the light bulb?', + a: ['Nikola Tesla', 'Thomas Edison', 'Alexander Graham Bell', 'James Watt'], + c: 1, + e: 'Thomas Edison is commonly credited for the light bulb.', + context: + 'Thomas Edison was an American inventor and businessman who is credited with developing the first commercially successful incandescent light bulb.', + }, + { + q: 'What is the capital of Italy?', + a: ['Milan', 'Naples', 'Rome', 'Florence'], + c: 2, + e: 'Rome is the capital of Italy.', + context: + 'Rome, the "Eternal City", is the capital of Italy and of the Lazio region. It is known for its nearly 3000 years of globally influential art, architecture, and culture.', + }, + { + q: 'Which metal has the highest electrical conductivity?', + a: ['Gold', 'Silver', 'Copper', 'Aluminum'], + c: 1, + e: 'Silver has the highest conductivity of common metals.', + context: + 'Silver is a metal known for its high electrical conductivity, thermal conductivity, and reflectivity. It is used in electrical contacts, conductors, and in various electronic devices.', + }, + { + q: 'What is the largest city in India by population?', + a: ['Delhi', 'Mumbai', 'Bangalore', 'Kolkata'], + c: 1, + e: 'Mumbai is the largest city by population in India.', + context: + 'Mumbai, formerly known as Bombay, is the most populous city in India and the seventh most populous city in the world. 
It is the financial, commercial, and entertainment hub of India.', + }, + { + q: 'Which mountain range includes Mount Kilimanjaro?', + a: [ + 'Andes', + 'Himalayas', + 'Kilimanjaro is a standalone mountain', + 'Rockies', + ], + c: 2, + e: 'Kilimanjaro is a free standing mountain, not part of a range.', + context: + 'Mount Kilimanjaro is a dormant stratovolcano located in Tanzania. It is the highest mountain in Africa, standing at 5895 meters (19341 feet) above sea level.', + }, + { + q: 'Which animal is known as the king of the jungle?', + a: ['Tiger', 'Elephant', 'Lion', 'Bear'], + c: 2, + e: 'The lion is often called the king of the jungle.', + context: + 'The lion is a large cat species found in Africa and India. It is known for its strength, courage, and majestic appearance. The term "king of the jungle" is a colloquial expression and lions actually inhabit grasslands and savannas.', + }, + { + q: 'What is the primary language of Egypt?', + a: ['Arabic', 'English', 'French', 'Greek'], + c: 0, + e: 'Arabic is the primary language in Egypt.', + context: + 'Arabic is the official language of Egypt and is spoken by the vast majority of the population. Egypt is also known for its ancient civilization and historical monuments.', + }, + { + q: 'Which country has the largest area in the world?', + a: ['United States', 'China', 'Russia', 'Canada'], + c: 2, + e: 'Russia is the largest country by area.', + context: + 'Russia is the largest country in the world by land area, covering more than 17 million square kilometers. It spans Eastern Europe and northern Asia, and has a wide range of environments and landscapes.', + }, + { + q: 'Who painted the Mona Lisa?', + a: ['Michelangelo', 'Leonardo da Vinci', 'Rembrandt', 'Raphael'], + c: 1, + e: 'Leonardo da Vinci painted the Mona Lisa.', + context: + 'The Mona Lisa is a half-length portrait painting by the Italian Renaissance artist Leonardo da Vinci. 
It is considered an archetypal masterpiece of the Italian Renaissance and is one of the most famous paintings in the world.', + }, + { + q: 'Which chemical element has atomic number 1?', + a: ['Helium', 'Hydrogen', 'Oxygen', 'Lithium'], + c: 1, + e: 'Hydrogen has atomic number 1.', + context: + 'Hydrogen is the chemical element with the symbol H and atomic number 1. It is the lightest and most abundant element in the universe, making up about 75% of its elemental mass.', + }, + { + q: 'What is the capital of Germany?', + a: ['Munich', 'Frankfurt', 'Berlin', 'Hamburg'], + c: 2, + e: 'Berlin is the capital of Germany.', + context: + 'Berlin is the capital and largest city of Germany, located in the northeastern part of the country. It is known for its cultural heritage, modern architecture, and vibrant arts scene.', + }, + { + q: 'Which famous scientist formulated the laws of motion?', + a: ['Albert Einstein', 'Isaac Newton', 'Galileo Galilei', 'Max Planck'], + c: 1, + e: 'Isaac Newton formulated the classical laws of motion.', + context: + 'Isaac Newton was an English mathematician, physicist, and astronomer who is widely recognized for formulating the laws of motion and universal gravitation.', + }, + { + q: 'What is the main ingredient in hummus?', + a: ['Lentils', 'Chickpeas', 'Beans', 'Peas'], + c: 1, + e: 'Hummus is made primarily from chickpeas.', + context: + 'Hummus is a spread made from cooked, mashed chickpeas or other beans, and is a common part of Levantine and Middle Eastern cuisines. It is often served with pita bread.', + }, + { + q: 'What is the largest lake by area in Africa?', + a: ['Lake Victoria', 'Lake Tanganyika', 'Lake Malawi', 'Lake Turkana'], + c: 0, + e: 'Lake Victoria is the largest lake in Africa by area.', + context: + 'Lake Victoria is the largest lake in Africa and the second-largest freshwater lake in the world by surface area. 
It is bordered by three countries: Tanzania, Uganda, and Kenya.', + }, + { + q: 'Which composer wrote the opera La Traviata?', + a: ['Wagner', 'Verdi', 'Puccini', 'Mozart'], + c: 1, + e: 'Giuseppe Verdi composed La Traviata.', + context: + 'La Traviata is an opera in three acts by Giuseppe Verdi, premiered in 1853. It is based on the novel "La Dame aux Camélias" by Alexandre Dumas fils and is one of the most performed operas worldwide.', + }, + { + q: 'Which city is home to the Colosseum?', + a: ['Athens', 'Rome', 'Istanbul', 'Naples'], + c: 1, + e: 'The Colosseum is located in Rome.', + context: + 'The Colosseum, also known as the Flavian Amphitheatre, is an ancient oval amphitheatre located in the center of Rome. It is the largest amphitheatre ever built and is considered one of the greatest works of Roman architecture and engineering.', + }, + { + q: 'What is the capital of Spain?', + a: ['Valencia', 'Seville', 'Madrid', 'Barcelona'], + c: 2, + e: 'Madrid is the capital of Spain.', + context: + 'Madrid is the capital and largest city of Spain, located in the center of the country. It is known for its cultural and artistic heritage, as well as its vibrant nightlife.', + }, + { + q: 'Which planet has a day longer than its year?', + a: ['Mercury', 'Venus', 'Mars', 'Earth'], + c: 1, + e: 'Venus rotates slowly so its day is longer than its year.', + context: + 'Venus is the second planet from the Sun and has a very slow rotation on its axis, taking about 243 Earth days to complete one rotation. However, its orbit around the Sun takes only about 225 Earth days.', + }, + { + q: 'Who wrote "The Odyssey"?', + a: ['Homer', 'Virgil', 'Sophocles', 'Plato'], + c: 0, + e: 'Homer is attributed as the author of The Odyssey.', + context: + 'The Odyssey is an ancient Greek epic poem attributed to Homer. 
It is one of the two major ancient Greek epic poems, the other being the Iliad, and it consists of 24 books.', + }, + { + q: 'Which organ is primarily responsible for detoxifying chemicals?', + a: ['Heart', 'Lung', 'Liver', 'Spleen'], + c: 2, + e: 'The liver detoxifies many chemicals in the body.', + context: + 'The liver is a vital organ that plays a key role in metabolism, digestion, and detoxification. It filters blood coming from the digestive tract and detoxifies chemicals, metabolizes drugs, and secretes bile.', + }, + { + q: 'Which country is famous for maple syrup?', + a: ['United States', 'Canada', 'Norway', 'Sweden'], + c: 1, + e: 'Canada is famous for maple syrup.', + context: + 'Canada is the largest producer of maple syrup in the world, accounting for about 71% of the global market share. Maple syrup is a traditional Canadian sweetener made from the sap of sugar maple trees.', + }, + { + q: 'What is the common name for sodium bicarbonate?', + a: ['Baking soda', 'Table salt', 'Vinegar', 'Baking powder'], + c: 0, + e: 'Sodium bicarbonate is known as baking soda.', + context: + 'Sodium bicarbonate, commonly known as baking soda, is a chemical compound with the formula NaHCO₃. It is used in baking as a leavening agent, and also has various household and industrial uses.', + }, + { + q: 'Which sea creature has eight arms?', + a: ['Shark', 'Octopus', 'Dolphin', 'Jellyfish'], + c: 1, + e: 'An octopus has eight arms.', + context: + 'Octopuses are marine animals known for their eight arms, which are lined with sensitive suckers. They are intelligent creatures and can change color and texture to blend in with their surroundings.', + }, + { + q: 'What is the capital of Russia?', + a: ['Saint Petersburg', 'Moscow', 'Novosibirsk', 'Sochi'], + c: 1, + e: 'Moscow is the capital of Russia.', + context: + 'Moscow is the capital and largest city of Russia, located in the western part of the country. 
It is known for its rich cultural history, architecture, and as a major political, economic, and scientific center.', + }, + { + q: 'Which instrument is primarily used in jazz as a brass reed instrument?', + a: ['Violin', 'Saxophone', 'Clarinet', 'Oboe'], + c: 1, + e: 'The saxophone is a common brass reed instrument in jazz.', + context: + 'The saxophone is a musical instrument invented by Adolphe Sax in the 1840s. It is a key instrument in jazz music, known for its expressive range and timbre.', + }, + { + q: 'What is the largest planet in our solar system?', + a: ['Earth', 'Saturn', 'Jupiter', 'Neptune'], + c: 2, + e: 'Jupiter is the largest planet in the solar system.', + context: + 'Jupiter is the fifth planet from the Sun and is more than twice as massive as all the other planets in the solar system combined. It has a thick atmosphere made up mostly of hydrogen and helium.', + }, + { + q: 'Which chemical element is liquid and used in thermometers?', + a: ['Mercury', 'Lead', 'Sodium', 'Iron'], + c: 0, + e: 'Mercury is used in some thermometers.', + context: + 'Mercury is a chemical element with the symbol Hg and atomic number 80. It is a heavy, silvery-white liquid metal that is used in thermometers, barometers, and other scientific instruments.', + }, + { + q: 'Who is known as the father of modern physics and developed the laws of motion?', + a: ['Benjamin Franklin', 'Isaac Newton', 'Albert Einstein', 'Nikola Tesla'], + c: 1, + e: 'Isaac Newton developed the laws of motion.', + context: + 'Isaac Newton is often referred to as the father of modern physics for his groundbreaking work in the 17th century. 
He formulated the laws of motion and universal gravitation, laying the foundation for classical mechanics.', + }, + { + q: 'Which country is famous for the tango dance?', + a: ['Brazil', 'Argentina', 'Mexico', 'Chile'], + c: 1, + e: 'Argentina is famous for the tango.', + context: + 'The tango is a partner dance that originated in the 1880s along the River Plate, the natural border between Argentina and Uruguay. It is now popular worldwide and is known for its passionate and dramatic style.', + }, + { + q: 'What is the capital of South Africa (one of them)?', + a: ['Cape Town', 'Pretoria', 'Johannesburg', 'Durban'], + c: 1, + e: "Pretoria is one of South Africa's capitals (administrative).", + context: + 'South Africa has three capital cities: Pretoria (administrative), Cape Town (legislative), and Bloemfontein (judicial). Pretoria is known for its diplomatic missions and embassies.', + }, + { + q: 'Which substance is needed for combustion?', + a: ['Oxygen', 'Helium', 'Nitrogen', 'Carbon dioxide'], + c: 0, + e: 'Oxygen supports combustion.', + context: + 'Combustion is a chemical process that occurs when a substance reacts rapidly with oxygen and releases energy in the form of light and heat. Oxygen is essential for combustion to occur.', + }, + { + q: 'Who wrote the novel "Moby Dick"?', + a: [ + 'Herman Melville', + 'Ernest Hemingway', + 'F Scott Fitzgerald', + 'Mark Twain', + ], + c: 0, + e: 'Herman Melville is the author of Moby Dick.', + context: + "Moby Dick is an 1851 novel by Herman Melville. The book is the sailor Ishmael's narrative of the obsessive quest of Ahab, captain of the whaling ship Pequod, for revenge on Moby Dick.", + }, + { + q: 'What is the primary material used to make glass?', + a: ['Iron', 'Sand', 'Wood', 'Clay'], + c: 1, + e: 'Glass is primarily made from silica sand.', + context: + 'Glass is a solid material that is typically made from silica (silicon dioxide) sand, soda ash, and limestone. 
It is used in windows, bottles, and many other applications.', + }, + { + q: 'Which famous tower leans and is in Italy?', + a: ['Eiffel Tower', 'Leaning Tower of Pisa', 'Big Ben', 'CN Tower'], + c: 1, + e: 'The Leaning Tower of Pisa is in Italy.', + context: + 'The Leaning Tower of Pisa is a freestanding bell tower located in Pisa, Italy. It is famous for its unintended tilt, which began during its construction in the 12th century.', + }, + { + q: 'Which organ controls the nervous system?', + a: ['Heart', 'Brain', 'Liver', 'Kidney'], + c: 1, + e: 'The brain controls the nervous system.', + context: + 'The brain is the central organ of the human nervous system, and with the spinal cord, it makes up the central nervous system (CNS). It is responsible for processing sensory information and coordinating bodily functions.', + }, + { + q: 'What is the capital of Turkey?', + a: ['Istanbul', 'Ankara', 'Izmir', 'Bursa'], + c: 1, + e: 'Ankara is the capital of Turkey.', + context: + 'Ankara is the capital of Turkey, located in the central part of the country. It became the capital in 1923, replacing Istanbul as the capital of the Republic of Turkey.', + }, + { + q: 'Which sport is played at Wimbledon?', + a: ['Cricket', 'Tennis', 'Football', 'Rugby'], + c: 1, + e: 'Wimbledon is a major tennis tournament.', + context: + 'The Wimbledon Championships is the oldest tennis tournament in the world, and is considered the most prestigious. It has been held at the All England Club in Wimbledon, London, since 1877.', + }, + { + q: 'What metal is primarily used to make aircraft bodies due to its light weight?', + a: ['Steel', 'Titanium', 'Aluminum', 'Copper'], + c: 2, + e: 'Aluminum is widely used for aircraft bodies.', + context: + 'Aluminum is a lightweight, durable metal that is resistant to corrosion, making it ideal for aircraft construction. 
It is used in the manufacture of aircraft bodies, wings, and other components.', + }, + { + q: 'Which ocean is between Africa and Australia?', + a: ['Atlantic', 'Pacific', 'Indian', 'Arctic'], + c: 2, + e: 'The Indian Ocean lies between Africa and Australia.', + context: + "The Indian Ocean is the third-largest ocean, covering about 20% of the Earth's water surface. It is bounded by Africa, Asia, Australia, and the Indian subcontinent.", + }, +]; diff --git a/apps/speech-to-text/assets/splash-icon.png b/apps/speech/assets/splash-icon.png similarity index 100% rename from apps/speech-to-text/assets/splash-icon.png rename to apps/speech/assets/splash-icon.png diff --git a/apps/speech-to-text/assets/swm_icon.svg b/apps/speech/assets/swm_icon.svg similarity index 100% rename from apps/speech-to-text/assets/swm_icon.svg rename to apps/speech/assets/swm_icon.svg diff --git a/apps/speech/colors.ts b/apps/speech/colors.ts new file mode 100644 index 000000000..feb75ac33 --- /dev/null +++ b/apps/speech/colors.ts @@ -0,0 +1,6 @@ +const ColorPalette = { + primary: '#001A72', + strongPrimary: '#020F3C', +}; + +export default ColorPalette; diff --git a/apps/speech-to-text/declarations.d.ts b/apps/speech/declarations.d.ts similarity index 100% rename from apps/speech-to-text/declarations.d.ts rename to apps/speech/declarations.d.ts diff --git a/apps/speech-to-text/index.ts b/apps/speech/index.ts similarity index 100% rename from apps/speech-to-text/index.ts rename to apps/speech/index.ts diff --git a/apps/speech-to-text/metro.config.js b/apps/speech/metro.config.js similarity index 100% rename from apps/speech-to-text/metro.config.js rename to apps/speech/metro.config.js diff --git a/apps/speech-to-text/package.json b/apps/speech/package.json similarity index 97% rename from apps/speech-to-text/package.json rename to apps/speech/package.json index b0ca00639..094fa2b78 100644 --- a/apps/speech-to-text/package.json +++ b/apps/speech/package.json @@ -1,5 +1,5 @@ { - "name": 
"speech-to-text", + "name": "speech", "version": "1.0.0", "main": "index.ts", "scripts": { diff --git a/apps/speech/screens/Quiz.tsx b/apps/speech/screens/Quiz.tsx new file mode 100644 index 000000000..8ff6e9126 --- /dev/null +++ b/apps/speech/screens/Quiz.tsx @@ -0,0 +1,502 @@ +import React, { useEffect, useRef, useState, useCallback } from 'react'; +import { + Text, + View, + StyleSheet, + TouchableOpacity, + Platform, + ScrollView, + KeyboardAvoidingView, +} from 'react-native'; +import Animated, { + useSharedValue, + useAnimatedStyle, + withTiming, + withSequence, + withDelay, + runOnJS, +} from 'react-native-reanimated'; +import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; +import { + KOKORO_EN, + KOKORO_VOICE_AF_HEART, + useTextToSpeech, +} from 'react-native-executorch'; +import { + AudioManager, + AudioContext, + AudioBuffer, +} from 'react-native-audio-api'; +import FontAwesome from '@expo/vector-icons/FontAwesome'; +import SWMIcon from '../assets/swm_icon.svg'; +import { QUESTIONS } from '../assets/quiz-data'; + +// Shuffle helper +function shuffleArray(array: T[]): T[] { + const arr = array.slice(); + for (let i = arr.length - 1; i > 0; i--) { + const j = Math.floor(Math.random() * (i + 1)); + [arr[i], arr[j]] = [arr[j], arr[i]]; + } + return arr; +} + +// --- Audio Helper --- +const createAudioBufferFromVector = ( + audioVector: Float32Array, + audioContext: AudioContext | null = null, + sampleRate: number = 24000 +): AudioBuffer => { + if (audioContext == null) audioContext = new AudioContext({ sampleRate }); + const audioBuffer = audioContext.createBuffer( + 1, + audioVector.length, + sampleRate + ); + const channelData = audioBuffer.getChannelData(0); + channelData.set(audioVector); + return audioBuffer; +}; + +export const Quiz = () => { + // --- Hooks & State --- + const model = useTextToSpeech({ + model: KOKORO_EN, + voice: KOKORO_VOICE_AF_HEART, + }); + + const [shuffledQuestions] = useState(() => 
shuffleArray(QUESTIONS)); + const [currentIndex, setCurrentIndex] = useState(0); + const [selectedAnswer, setSelectedAnswer] = useState(null); + const [isAnswerCorrect, setIsAnswerCorrect] = useState(null); + const [showNext, setShowNext] = useState(false); + const [isSpeaking, setIsSpeaking] = useState(false); + const fadeAnim = useSharedValue(1); + const feedbackAnim = useSharedValue(0); + const nextButtonAnim = useSharedValue(0); + const buttonsInactiveAnim = useSharedValue(1); + + const audioContextRef = useRef(null); + const currentSourceRef = useRef(null); + const isTransitioningRef = useRef(false); + const autoSpeakRef = useRef(true); + + // Animated Styles + const containerStyle = useAnimatedStyle(() => ({ + opacity: fadeAnim.value, + })); + + const feedbackStyle = useAnimatedStyle(() => ({ + opacity: feedbackAnim.value, + })); + + const nextButtonStyle = useAnimatedStyle(() => ({ + opacity: nextButtonAnim.value * buttonsInactiveAnim.value, + transform: [ + { + translateY: (1 - nextButtonAnim.value) * 12, + }, + ], + })); + + // --- Audio Setup --- + useEffect(() => { + AudioManager.setAudioSessionOptions({ + iosCategory: 'playAndRecord', + iosMode: 'spokenAudio', + iosOptions: ['defaultToSpeaker'], + }); + + audioContextRef.current = new AudioContext({ sampleRate: 24000 }); + audioContextRef.current.suspend(); + + return () => { + audioContextRef.current?.close(); + audioContextRef.current = null; + }; + }, []); + + // --- TTS Function --- + const speak = useCallback( + async (text: string) => { + if (!text.trim() || !model.isReady) return; + + // Stop previous audio if any + if (currentSourceRef.current) { + try { + currentSourceRef.current.stop(); + } catch (e) {} + } + + setIsSpeaking(true); + try { + const audioContext = audioContextRef.current; + if (!audioContext) return; + if (audioContext.state === 'suspended') await audioContext.resume(); + + const onNext = async (audioVec: Float32Array) => { + return new Promise((resolve) => { + const audioBuffer 
= createAudioBufferFromVector( + audioVec, + audioContext, + 24000 + ); + const source = audioContext.createBufferSource(); + source.buffer = audioBuffer; + source.connect(audioContext.destination); + currentSourceRef.current = source; + source.onEnded = () => resolve(); + source.start(); + }); + }; + + await model.stream({ text, onNext, onEnd: async () => {} }); + } catch (e) { + console.error(e); + } finally { + setIsSpeaking(false); + } + }, + [model] + ); + + // --- Game Logic --- + const currentQ = shuffledQuestions[currentIndex]; + + // Speak question on load + useEffect(() => { + if (!model.isReady) return; + if (!autoSpeakRef.current) { + autoSpeakRef.current = true; + return; + } + const t = setTimeout(() => speak(currentQ.q), 500); + return () => clearTimeout(t); + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [currentIndex, model.isReady]); + + const handleAnswer = async (index: number) => { + if (selectedAnswer !== null) return; // Prevent double taps + + setSelectedAnswer(index); + const correct = index === currentQ.c; + setIsAnswerCorrect(correct); + + // 1. Visual Feedback Animation (1s) + feedbackAnim.value = withSequence( + withTiming(1, { duration: 200 }), + withDelay(1000, withTiming(0, { duration: 200 })) + ); + + // 2. Audio Feedback + if (correct) { + await speak('Correct!'); + } else { + // Play "Incorrect" and explanation as one string + await speak(`Incorrect. ${currentQ.e}`); + } + + // 3. 
Show Next Button + setShowNext(true); + }; + + const updateQuestionState = (nextIdx: number) => { + setSelectedAnswer(null); + setIsAnswerCorrect(null); + setCurrentIndex(nextIdx); + + setTimeout(() => { + isTransitioningRef.current = false; + fadeAnim.value = withTiming(1, { duration: 500 }); + }, 500); + }; + + const handleNext = () => { + const nextIdx = (currentIndex + 1) % shuffledQuestions.length; + buttonsInactiveAnim.value = 1; // Ensure buttons are active for next round + autoSpeakRef.current = false; + speak(shuffledQuestions[nextIdx].q); + + isTransitioningRef.current = true; + setShowNext(false); + + fadeAnim.value = withTiming(0, { duration: 500 }, (finished) => { + if (finished) { + runOnJS(updateQuestionState)(nextIdx); + } + }); + }; + + const handleLearnMore = async () => { + if (isSpeaking) return; + + buttonsInactiveAnim.value = withTiming(0.5, { duration: 800 }); + + // Play the context for the current question + await speak(currentQ.context); + + buttonsInactiveAnim.value = withTiming(1, { duration: 800 }); + }; + + const getButtonColor = (index: number) => { + if (selectedAnswer === null) return styles.optionButton; + + if (index === currentQ.c) return styles.correctButton; // Highlight correct answer always if answered + if (index === selectedAnswer && !isAnswerCorrect) return styles.wrongButton; // Highlight mistake + + return styles.disabledOption; + }; + + // Animate next button appearance + useEffect(() => { + nextButtonAnim.value = withTiming(showNext ? 1 : 0, { + duration: 450, + }); + }, [showNext, nextButtonAnim]); + + return ( + + + + + Text to Speech - Quiz + + + {!model.isReady ? ( + + + Loading Model: {Math.round(model.downloadProgress * 100)}% + + + ) : ( + + + + + + Question {currentIndex + 1} + + {currentQ.q} + + + + {currentQ.a.map((opt, idx) => ( + handleAnswer(idx)} + disabled={selectedAnswer !== null} + > + {opt} + + ))} + + + {/* Feedback Animation Overlay Text */} + {selectedAnswer !== null && ( + + + {isAnswerCorrect ? 
'Correct!' : 'Incorrect'} + + + )} + + + + {showNext && ( + + + Next Question + + + + + + Learn More + + + + + )} + + )} + + + ); +}; + +// Reuse stylistic approach from TextToSpeechScreen +const styles = StyleSheet.create({ + container: { + flex: 1, + backgroundColor: 'white', + }, + centerContainer: { + flex: 1, + justifyContent: 'center', + alignItems: 'center', + }, + loadingText: { + fontSize: 18, + color: '#0f186e', + }, + header: { + flexDirection: 'row', + alignItems: 'center', + padding: 16, + borderBottomWidth: 1, + borderBottomColor: '#eee', + }, + headerText: { + fontSize: 20, + fontWeight: 'bold', + color: '#0f186e', + marginLeft: 10, + }, + scrollContent: { + padding: 20, + paddingBottom: 160, // Space for bottom container + flexGrow: 1, // Ensures content scales to screen height + justifyContent: 'center', + }, + quizContainer: { + width: '100%', + }, + questionCard: { + backgroundColor: '#0f186e', + borderRadius: 16, + padding: 24, + marginBottom: 24, + minHeight: 150, + justifyContent: 'center', + }, + questionIndex: { + color: 'rgba(255,255,255,0.7)', + fontSize: 14, + marginBottom: 8, + textTransform: 'uppercase', + fontWeight: 'bold', + }, + questionText: { + color: 'white', + fontSize: 22, + fontWeight: '600', + lineHeight: 30, + }, + optionsContainer: { + gap: 12, + }, + baseOption: { + padding: 16, + borderRadius: 12, + borderWidth: 2, + flexDirection: 'row', + alignItems: 'center', + }, + // States of options + optionButton: { + backgroundColor: 'white', + borderColor: '#e0e0e0', + }, + correctButton: { + backgroundColor: '#E8F5E9', + borderColor: '#4CAF50', + }, + wrongButton: { + backgroundColor: '#FFEBEE', + borderColor: '#F44336', + }, + disabledOption: { + backgroundColor: '#f5f5f5', + borderColor: '#eee', + opacity: 0.6, + }, + + optionText: { + fontSize: 18, + color: '#333', + fontWeight: '500', + }, + feedbackContainer: { + alignItems: 'center', + marginVertical: 10, + }, + feedbackText: { + fontSize: 24, + fontWeight: 'bold', + 
marginVertical: 10, + }, + feedbackTextCorrect: { + color: '#4CAF50', + }, + feedbackTextIncorrect: { + color: '#F44336', + }, + nextButton: { + // Removed marginTop to rely on container padding + backgroundColor: '#0f186e', + paddingVertical: 16, + paddingHorizontal: 32, + borderRadius: 30, + flexDirection: 'row', + justifyContent: 'center', + alignItems: 'center', + gap: 10, + alignSelf: 'center', + width: '100%', + }, + learnMoreButton: { + marginTop: 0, + backgroundColor: 'white', + borderWidth: 2, + borderColor: '#0f186e', + }, + nextButtonText: { + color: 'white', + fontSize: 18, + fontWeight: 'bold', + }, + learnMoreButtonText: { + color: '#0f186e', + }, + bottomContainer: { + position: 'absolute', + bottom: 0, + left: 0, + right: 0, + padding: 20, + gap: 12, + alignItems: 'center', + backgroundColor: 'rgba(255,255,255,0.95)', + }, + flex1: { + flex: 1, + }, +}); diff --git a/apps/speech-to-text/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx similarity index 100% rename from apps/speech-to-text/screens/SpeechToTextScreen.tsx rename to apps/speech/screens/SpeechToTextScreen.tsx diff --git a/apps/speech/screens/TextToSpeechScreen.tsx b/apps/speech/screens/TextToSpeechScreen.tsx new file mode 100644 index 000000000..455c5a2b7 --- /dev/null +++ b/apps/speech/screens/TextToSpeechScreen.tsx @@ -0,0 +1,261 @@ +import React, { useEffect, useRef, useState } from 'react'; +import { + Text, + View, + StyleSheet, + TouchableOpacity, + TextInput, + KeyboardAvoidingView, + Platform, +} from 'react-native'; +import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; +import { + KOKORO_EN, + KOKORO_VOICE_AF_HEART, + useTextToSpeech, +} from 'react-native-executorch'; +import FontAwesome from '@expo/vector-icons/FontAwesome'; +import { + AudioManager, + AudioContext, + AudioBuffer, +} from 'react-native-audio-api'; +import SWMIcon from '../assets/swm_icon.svg'; + +/** + * Converts an audio vector (Float32Array) to an 
AudioBuffer for playback + * @param audioVector - The generated audio samples from the model + * @param sampleRate - The sample rate (default: 24000 Hz for Kokoro) + * @returns AudioBuffer ready for playback + */ +const createAudioBufferFromVector = ( + audioVector: Float32Array, + audioContext: AudioContext | null = null, + sampleRate: number = 24000 +): AudioBuffer => { + if (audioContext == null) audioContext = new AudioContext({ sampleRate }); + + const audioBuffer = audioContext.createBuffer( + 1, + audioVector.length, + sampleRate + ); + const channelData = audioBuffer.getChannelData(0); + channelData.set(audioVector); + + return audioBuffer; +}; + +export const TextToSpeechScreen = () => { + const model = useTextToSpeech({ + model: KOKORO_EN, + voice: KOKORO_VOICE_AF_HEART, + options: { + // This allows to minimize the memory usage by utilizing only one of the models. + // However, it either increases the latency (in case of the largest model) or + // decreases the quality of the results (in case of the smaller models). 
+ // fixedModel: "large" + }, + }); + + const [inputText, setInputText] = useState(''); + const [isPlaying, setIsPlaying] = useState(false); + const [readyToGenerate, setReadyToGenerate] = useState(false); + + const audioContextRef = useRef(null); + const sourceRef = useRef(null); + + useEffect(() => { + AudioManager.setAudioSessionOptions({ + iosCategory: 'playAndRecord', + iosMode: 'spokenAudio', + iosOptions: ['defaultToSpeaker'], + }); + + // Initialize context once + audioContextRef.current = new AudioContext({ sampleRate: 24000 }); + audioContextRef.current.suspend(); + + return () => { + audioContextRef.current?.close(); + audioContextRef.current = null; + }; + }, []); + + useEffect(() => { + setReadyToGenerate(!model.isGenerating && model.isReady && !isPlaying); + }, [model.isGenerating, model.isReady, isPlaying]); + + const handlePlayAudio = async () => { + if (!inputText.trim()) { + return; + } + + setIsPlaying(true); + + try { + const audioContext = audioContextRef.current; + if (!audioContext) return; + + if (audioContext.state === 'suspended') { + await audioContext.resume(); + } + + const onNext = async (audioVec: Float32Array) => { + return new Promise((resolve) => { + const audioBuffer = createAudioBufferFromVector( + audioVec, + audioContext, + 24000 + ); + + const source = (sourceRef.current = + audioContext.createBufferSource()); + source.buffer = audioBuffer; + source.connect(audioContext.destination); + + source.onEnded = () => resolve(); + + source.start(); + }); + }; + + const onEnd = async () => { + setIsPlaying(false); + setReadyToGenerate(true); + await audioContext.suspend(); + }; + + await model.stream({ + text: inputText, + onNext, + onEnd, + }); + } catch (error) { + console.error('Error generating or playing audio:', error); + setIsPlaying(false); + } + }; + + const getModelStatus = () => { + if (model.error) return `${model.error}`; + if (model.isGenerating) return 'Generating audio...'; + if (model.isReady) return 'Ready to 
synthesize'; + return `Loading model: ${(100 * model.downloadProgress).toFixed(2)}%`; + }; + + return ( + + + + + + React Native ExecuTorch + Text to Speech + + + + Status: {getModelStatus()} + + + + Enter text to synthesize + + + + + + + + {isPlaying ? 'Playing...' : 'Generate & Play'} + + + + + + + ); +}; + +const styles = StyleSheet.create({ + container: { + flex: 1, + alignItems: 'center', + backgroundColor: 'white', + paddingHorizontal: 16, + }, + keyboardAvoidingView: { + flex: 1, + width: '100%', + }, + header: { + alignItems: 'center', + }, + headerText: { + fontSize: 22, + fontWeight: 'bold', + color: '#0f186e', + }, + statusContainer: { + marginTop: 12, + alignItems: 'center', + }, + inputContainer: { + width: '100%', + marginTop: 24, + }, + inputLabel: { + marginLeft: 12, + marginBottom: 4, + color: '#0f186e', + fontWeight: '600', + }, + textInput: { + borderRadius: 12, + borderWidth: 1, + borderColor: '#0f186e', + padding: 12, + minHeight: 120, + fontSize: 16, + }, + buttonContainer: { + marginTop: 24, + }, + playButton: { + backgroundColor: '#0f186e', + flexDirection: 'row', + justifyContent: 'center', + alignItems: 'center', + padding: 12, + borderRadius: 12, + gap: 8, + }, + buttonText: { + color: 'white', + fontWeight: '600', + letterSpacing: -0.5, + fontSize: 16, + }, + disabled: { + opacity: 0.5, + }, +}); diff --git a/apps/speech-to-text/tsconfig.json b/apps/speech/tsconfig.json similarity index 100% rename from apps/speech-to-text/tsconfig.json rename to apps/speech/tsconfig.json diff --git a/packages/react-native-executorch/android/src/main/cpp/CMakeLists.txt b/packages/react-native-executorch/android/src/main/cpp/CMakeLists.txt index 20a3bb017..9452914c5 100644 --- a/packages/react-native-executorch/android/src/main/cpp/CMakeLists.txt +++ b/packages/react-native-executorch/android/src/main/cpp/CMakeLists.txt @@ -91,6 +91,13 @@ set(TOKENIZERS_LIBS "${LIBS_DIR}/tokenizers-cpp/${ANDROID_ABI}/libtokenizers_cpp.a" 
"${LIBS_DIR}/tokenizers-cpp/${ANDROID_ABI}/libsentencepiece.a" ) + +# ------- phonemis ------- + +set(PHONEMIS_LIBS + "${LIBS_DIR}/phonemis/${ANDROID_ABI}/libphonemis.a" +) + # -------------- target_link_options(react-native-executorch PRIVATE -fopenmp -static-openmp) @@ -103,6 +110,7 @@ target_link_libraries( ${OPENCV_THIRD_PARTY_LIBS} ${TOKENIZERS_LIBS} ${TOKENIZERS_THIRD_PARTY_LIBS} + ${PHONEMIS_LIBS} executorch ${EXECUTORCH_LIBS} z diff --git a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp index c25fbd13f..7a4426e06 100644 --- a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -100,6 +101,10 @@ void RnExecutorchInstaller::injectJSIBindings( *jsiRuntime, "loadSpeechToText", RnExecutorchInstaller::loadModel( jsiRuntime, jsCallInvoker, "loadSpeechToText")); + jsiRuntime->global().setProperty( + *jsiRuntime, "loadTextToSpeechKokoro", + RnExecutorchInstaller::loadModel( + jsiRuntime, jsCallInvoker, "loadTextToSpeechKokoro")); jsiRuntime->global().setProperty( *jsiRuntime, "loadVAD", RnExecutorchInstaller::loadModel< diff --git a/packages/react-native-executorch/common/rnexecutorch/data_processing/Sequential.h b/packages/react-native-executorch/common/rnexecutorch/data_processing/Sequential.h new file mode 100644 index 000000000..a872352f6 --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/data_processing/Sequential.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include +#include +#include + +/** + * @namespace rnexecutorch::sequential + * @brief Namespace for non-modifying sequential operations + */ +namespace rnexecutorch::sequential { + +/** + * @brief Repeats each element of a one-dimensional data vector according to the + 
* specified repetition counts. + * + * This function operates on one-dimensional input data and a corresponding + * vector of repetition counts. For each i-th element in `data`, the function + * appends it to the output vector `repetitions[i]` times. For example, given + * `data = [0, 1, 2]` and `repetitions = [2, 1, 2]`, the result will be `[0, 0, + * 1, 2, 2]`. + * + * @param data A span of input elements to be repeated. + * @param repetitions A span of integral values specifying how many times to + * repeat each corresponding element in `data`. + * @return A std::vector containing the repeated elements in order. + */ +template +std::vector repeatInterleave(std::span data, + std::span repetitions) { + if (data.size() != repetitions.size()) { + throw std::invalid_argument( + "repeatInterleave(): repetitions vector must be the same size as data," + " expected " + + std::to_string(data.size()) + " but got " + + std::to_string(repetitions.size())); + } + + IType totalReps = std::reduce(repetitions.begin(), repetitions.end()); + std::vector result(totalReps); + + IType filled = 0; + for (size_t i = 0; i < data.size(); i++) { + std::fill_n(result.begin() + filled, repetitions[i], data[i]); + filled += repetitions[i]; + } + + return result; +} + +} // namespace rnexecutorch::sequential \ No newline at end of file diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index 7feb497dc..97e8d91fb 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -44,6 +45,15 @@ inline std::string getValue(const jsi::Value &val, return val.getString(runtime).utf8(runtime); } +template <> +inline std::u32string getValue(const jsi::Value &val, + jsi::Runtime &runtime) 
{ + std::string utf8 = getValue(val, runtime); + std::wstring_convert, char32_t> conv; + + return conv.from_bytes(utf8); +} + template <> inline std::shared_ptr getValue>(const jsi::Value &val, @@ -283,6 +293,15 @@ inline jsi::Value getJsiValue(const std::vector &vec, return {runtime, array}; } +inline jsi::Value getJsiValue(const std::vector &vec, + jsi::Runtime &runtime) { + jsi::Array array(runtime, vec.size()); + for (size_t i = 0; i < vec.size(); i++) { + array.setValueAtIndex(runtime, i, jsi::Value(vec[i])); + } + return {runtime, array}; +} + inline jsi::Value getJsiValue(const std::vector &vec, jsi::Runtime &runtime) { jsi::Array array(runtime, vec.size()); diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index 3be8e6f7a..083eef285 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -156,6 +157,17 @@ template class ModelHostObject : public JsiHostObject { addFunctions( JSI_EXPORT_FUNCTION(ModelHostObject, unload, "unload")); } + + if constexpr (meta::SameAs) { + addFunctions( + JSI_EXPORT_FUNCTION(ModelHostObject, unload, "unload")); + addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject, + promiseHostFunction<&Model::stream>, + "stream")); + addFunctions(JSI_EXPORT_FUNCTION( + ModelHostObject, promiseHostFunction<&Model::setFixedModel>, + "setFixedModel")); + } } // A generic host function that runs synchronously, works analogously to the diff --git a/packages/react-native-executorch/common/rnexecutorch/metaprogramming/ContainerHelpers.h b/packages/react-native-executorch/common/rnexecutorch/metaprogramming/ContainerHelpers.h new file mode 100644 index 000000000..5f7ae124e --- /dev/null +++ 
b/packages/react-native-executorch/common/rnexecutorch/metaprogramming/ContainerHelpers.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include +#include +#include + +#include + +/** + * @brief A helper macro to check if a container has the expected size. + * Prints an error message with the container's name, file, and line number if + * the size does not match, and throws a std::runtime_error. + * + * @param container The container whose size will be checked (must have a + * .size() method). + * @param expected The expected size of the container. + * @note The macro prints the variable name, file, and line for easier + * debugging. + */ +#define CHECK_SIZE(container, expected) \ + if ((container).size() != (expected)) { \ + rnexecutorch::log(rnexecutorch::LOG_LEVEL::Error, \ + "Unexpected size for " #container " at ", \ + std::filesystem::path(__FILE__).filename().string(), \ + ":", __LINE__, ": expected ", (expected), " but got ", \ + (container).size()); \ + throw std::runtime_error("Invalid input shape for " #container); \ + } \ No newline at end of file diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/TextToSpeech.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/TextToSpeech.h new file mode 100644 index 000000000..2fb17c95f --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/TextToSpeech.h @@ -0,0 +1,3 @@ +#pragma once + +#include "kokoro/Kokoro.h" \ No newline at end of file diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h new file mode 100644 index 000000000..d7af178a8 --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Constants.h @@ -0,0 +1,77 @@ +#pragma once + +#include +#include +#include +#include + +#include "Types.h" + +namespace 
rnexecutorch::models::text_to_speech::kokoro::constants { +// Hyperparameters which determine the behavior of the model & algorithms. +inline constexpr size_t kMaxTextSize = + 2048; // An input which exceedes this value causes an exception to be thrown +inline constexpr int32_t kAudioCroppingSteps = 20; +inline constexpr float kAudioSilenceThreshold = 0.01F; +inline const std::unordered_map kPauseValues = { + {U'.', 250}, {U'?', 350}, {'!', 180}, {';', 300}, + {U'…', 500}, {U',', 125}, {U':', 175}, {U'-', 175}}; // [ms] +inline constexpr int32_t kDefaultPause = 5; // [ms] + +// Model input sizes - input tokens & max (expected) durations +inline constexpr Configuration kInputSmall = {.noTokens = 32, .duration = 92}; +inline constexpr Configuration kInputMedium = {.noTokens = 64, .duration = 164}; +inline constexpr Configuration kInputLarge = {.noTokens = 128, .duration = 296}; +inline const std::unordered_map kInputs = { + {"small", kInputSmall}, {"medium", kInputMedium}, {"large", kInputLarge}}; + +// Model input sizes - voice reference vector +inline constexpr int32_t kVoiceRefSize = + 256; // Always a fixed size, regardless of number of input tokens +inline constexpr int32_t kVoiceRefHalfSize = kVoiceRefSize / 2; + +// Duration mappings +// This corresponds to a number of elements in resulting audio vector per each +// duration point. 
+inline constexpr int32_t kTicksPerDuration = 600; +inline constexpr int32_t kSamplingRate = + 24000; // Corresponds to Kokoro's model audio frequency +inline constexpr int32_t kSamplesPerMilisecond = kSamplingRate / 1000; + +// Special phonemes +inline const std::unordered_set kEndOfSentencePhonemes = { + U'.', U'?', U'!', U';', U'…'}; +inline const std::unordered_set kPausePhonemes = {U',', U':', U'-'}; + +// Phoneme to token mappings +inline constexpr int32_t kVocabSize = 178; +inline const std::unordered_map kVocab = { + {U';', 1}, {U':', 2}, {U',', 3}, {U'.', 4}, {U'!', 5}, + {U'?', 6}, {U'—', 9}, {U'…', 10}, {U'"', 11}, {U'(', 12}, + {U')', 13}, {U'“', 14}, {U'”', 15}, {U' ', 16}, {U'\u0303', 17}, + {U'ʣ', 18}, {U'ʥ', 19}, {U'ʦ', 20}, {U'ʨ', 21}, {U'ᵝ', 22}, + {U'\uAB67', 23}, {U'A', 24}, {U'I', 25}, {U'O', 31}, {U'Q', 33}, + {U'S', 35}, {U'T', 36}, {U'W', 39}, {U'Y', 41}, {U'ᵊ', 42}, + {U'a', 43}, {U'b', 44}, {U'c', 45}, {U'd', 46}, {U'e', 47}, + {U'f', 48}, {U'h', 50}, {U'i', 51}, {U'j', 52}, {U'k', 53}, + {U'l', 54}, {U'm', 55}, {U'n', 56}, {U'o', 57}, {U'p', 58}, + {U'q', 59}, {U'r', 60}, {U's', 61}, {U't', 62}, {U'u', 63}, + {U'v', 64}, {U'w', 65}, {U'x', 66}, {U'y', 67}, {U'z', 68}, + {U'ɑ', 69}, {U'ɐ', 70}, {U'ɒ', 71}, {U'æ', 72}, {U'β', 75}, + {U'ɔ', 76}, {U'ɕ', 77}, {U'ç', 78}, {U'ɖ', 80}, {U'ð', 81}, + {U'ʤ', 82}, {U'ə', 83}, {U'ɚ', 85}, {U'ɛ', 86}, {U'ɜ', 87}, + {U'ɟ', 90}, {U'ɡ', 92}, {U'ɥ', 99}, {U'ɨ', 101}, {U'ɪ', 102}, + {U'ʝ', 103}, {U'ɯ', 110}, {U'ɰ', 111}, {U'ŋ', 112}, {U'ɳ', 113}, + {U'ɲ', 114}, {U'ɴ', 115}, {U'ø', 116}, {U'ɸ', 118}, {U'θ', 119}, + {U'œ', 120}, {U'ɹ', 123}, {U'ɾ', 125}, {U'ɻ', 126}, {U'ʁ', 128}, + {U'ɽ', 129}, {U'ʂ', 130}, {U'ʃ', 131}, {U'ʈ', 132}, {U'ʧ', 133}, + {U'ʊ', 135}, {U'ʋ', 136}, {U'ʌ', 138}, {U'ɣ', 139}, {U'ɤ', 140}, + {U'χ', 142}, {U'ʎ', 143}, {U'ʒ', 147}, {U'ʔ', 148}, {U'ˈ', 156}, + {U'ˌ', 157}, {U'ː', 158}, {U'ʰ', 162}, {U'ʲ', 164}, {U'↓', 169}, + {U'→', 171}, {U'↗', 172}, {U'↘', 173}, {U'ᵻ', 177}}; + +// 
Special tokens +inline constexpr Token kInvalidToken = -1; +inline constexpr Token kPadToken = 0; + +} // namespace rnexecutorch::models::text_to_speech::kokoro::constants diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Decoder.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Decoder.cpp new file mode 100644 index 000000000..a53b405d0 --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Decoder.cpp @@ -0,0 +1,56 @@ +#include "Decoder.h" +#include +#include + +namespace rnexecutorch::models::text_to_speech::kokoro { + +using ::executorch::aten::ScalarType; +using ::executorch::extension::make_tensor_ptr; +using ::executorch::extension::TensorPtr; + +Decoder::Decoder(const std::string &modelSource, + std::shared_ptr callInvoker) + : BaseModel(modelSource, callInvoker) { + std::string testMethod = + "forward_" + std::to_string(constants::kInputSmall.noTokens); + auto inputTensors = getAllInputShapes(testMethod); + + // Perform checks to validate model's compatibility with native code + CHECK_SIZE(inputTensors, 4); +} + +Result> +Decoder::generate(const std::string &method, const Configuration &inputConfig, + std::span asr, std::span f0Pred, + std::span nPred, std::span ref_ls) { + // Perform input shape checks + // Both F0 and N vectors should be twice as long as duration + CHECK_SIZE(f0Pred, 2 * inputConfig.duration); + CHECK_SIZE(nPred, 2 * inputConfig.duration); + CHECK_SIZE(ref_ls, constants::kVoiceRefHalfSize); + + // Convert input data to ExecuTorch tensors + auto asrTensor = make_tensor_ptr({1, 512, inputConfig.duration}, asr.data(), + ScalarType::Float); + auto f0Tensor = make_tensor_ptr({1, 2 * inputConfig.duration}, f0Pred.data(), + ScalarType::Float); + auto nTensor = make_tensor_ptr({1, 2 * inputConfig.duration}, nPred.data(), + ScalarType::Float); + auto voiceRefTensor = make_tensor_ptr({1, constants::kVoiceRefHalfSize}, + 
ref_ls.data(), ScalarType::Float); + + // Execute the appropriate "forward_xyz" method, based on given method name + auto results = + execute(method, {asrTensor, f0Tensor, nTensor, voiceRefTensor}); + + if (!results.ok()) { + throw std::runtime_error( + "[Kokoro::Decoder] Failed to execute method " + method + + ", error: " + std::to_string(static_cast(results.error()))); + } + + // [audio] + return results; +} + +} // namespace rnexecutorch::models::text_to_speech::kokoro \ No newline at end of file diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Decoder.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Decoder.h new file mode 100644 index 000000000..2d206724f --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Decoder.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include +#include + +#include + +#include "Constants.h" +#include + +namespace rnexecutorch::models::text_to_speech::kokoro { + +class Decoder : public BaseModel { +public: + explicit Decoder(const std::string &modelSource, + std::shared_ptr callInvoker); + + Result> + generate(const std::string &method, const Configuration &inputConfig, + std::span asr, std::span f0Pred, + std::span nPred, std::span ref_ls); +}; + +} // namespace rnexecutorch::models::text_to_speech::kokoro diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.cpp new file mode 100644 index 000000000..ad4d69627 --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.cpp @@ -0,0 +1,144 @@ +#include "DurationPredictor.h" +#include +#include +#include +#include +#include +#include +#include + +namespace rnexecutorch::models::text_to_speech::kokoro { + +using 
::executorch::aten::ScalarType; +using ::executorch::extension::make_tensor_ptr; +using ::executorch::extension::TensorPtr; + +DurationPredictor::DurationPredictor( + const std::string &modelSource, + std::shared_ptr callInvoker) + : BaseModel(modelSource, callInvoker) { + std::string testMethod = + "forward_" + std::to_string(constants::kInputSmall.noTokens); + auto inputTensors = getAllInputShapes(testMethod); + + // Perform checks to validate model's compatibility with native code + CHECK_SIZE(inputTensors, 4); +} + +std::tuple, int32_t> +DurationPredictor::generate(const std::string &method, + const Configuration &inputConfig, + std::span tokens, std::span textMask, + std::span ref_hs, float speed) { + // Perform input shape checks + // Since every bit in text mask corresponds to exactly one of the tokens, both + // vectors should be the same length + CHECK_SIZE(tokens, textMask.size()); + CHECK_SIZE(ref_hs, constants::kVoiceRefHalfSize); + + // Convert input data to ExecuTorch tensors + auto tokensTensor = make_tensor_ptr({1, static_cast(tokens.size())}, + tokens.data(), ScalarType::Long); + auto textMaskTensor = + make_tensor_ptr({1, static_cast(textMask.size())}, + textMask.data(), ScalarType::Bool); + auto voiceRefTensor = make_tensor_ptr({1, constants::kVoiceRefHalfSize}, + ref_hs.data(), ScalarType::Float); + auto speedTensor = make_tensor_ptr({1}, &speed, ScalarType::Float); + + // Execute the appropriate "forward_xyz" method, based on given method name + auto results = execute( + method, {tokensTensor, textMaskTensor, voiceRefTensor, speedTensor}); + + if (!results.ok()) { + throw std::runtime_error( + "[Kokoro::DurationPredictor] Failed to execute method " + method + + ", error: " + std::to_string(static_cast(results.error()))); + } + + // Unpack the result + auto predDur = results->at(0).toTensor(); + auto d = results->at(1).toTensor(); + + // Scale output durations to match the value from model's config + scaleDurations(predDur, 
inputConfig.duration); + + // Create indices tensor by repetitions according to durations vector + std::vector idxs(inputConfig.noTokens); + std::iota(idxs.begin(), idxs.end(), 0LL); + std::vector indices = rnexecutorch::sequential::repeatInterleave( + std::span(idxs), + std::span(predDur.const_data_ptr(), + predDur.numel())); + + // Calculate the effective duration + // Note that we lower effective duration even further, to remove + // some of the side-effects at the end of the audio. + int32_t originalLength = + std::distance(tokens.begin(), + std::find(tokens.begin() + 1, tokens.end(), 0)) + + 1; + int32_t effDuration = std::distance( + indices.begin(), + std::lower_bound(indices.begin(), indices.end(), originalLength)); + + if (effDuration < inputConfig.duration) + effDuration *= 0.95; + + return std::make_tuple(std::move(d), std::move(indices), + std::move(effDuration)); +} + +void DurationPredictor::scaleDurations(Tensor &durations, + int32_t targetDuration) const { + // We expect durations tensor to be a Long tensor of a shape [1, n_tokens] + if (durations.dtype() != ScalarType::Long && + durations.dtype() != ScalarType::Int) { + throw std::runtime_error( + "[Kokoro::DurationPredictor] Attempted to scale a non-integer tensor"); + } + + auto shape = durations.sizes(); + if (shape.size() != 1) { + throw std::runtime_error( + "[Kokoro::DurationPredictor] Attempted to scale an ill-shaped tensor"); + } + + int32_t nTokens = shape[0]; + int64_t *durationsPtr = durations.data_ptr(); + int64_t totalDur = std::reduce(durationsPtr, durationsPtr + nTokens); + + float scaleFactor = static_cast(targetDuration) / totalDur; + bool shrinking = scaleFactor < 1.F; + + // We need to scale partial durations (integers) corresponding to each token + // in a way that they all sum up to target duration, while keeping the balance + // between the values. 
+ std::priority_queue> + remainders; // Sorted by the first value + int64_t scaledSum = 0; + for (int i = 0; i < nTokens; i++) { + float scaled = scaleFactor * durationsPtr[i]; + float remainder = + shrinking ? std::ceil(scaled) - scaled : scaled - std::floor(scaled); + + durationsPtr[i] = static_cast(shrinking ? std::ceil(scaled) + : std::floor(scaled)); + scaledSum += durationsPtr[i]; + + // Keeps the entries sorted by the remainders + remainders.emplace(remainder, i); + } + + // The initial processing scales durations to at least (targetDuration - + // nTokens) - the next part is to round the remaining values sorted by their + // remainders size. + int32_t diff = std::abs(targetDuration - scaledSum); + for (int i = 0; i < diff; i++) { + auto [remainder, idx] = remainders.top(); + durationsPtr[idx] += shrinking ? -1 : 1; + remainders.pop(); + } +} + +} // namespace rnexecutorch::models::text_to_speech::kokoro \ No newline at end of file diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.h new file mode 100644 index 000000000..6d41be604 --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.h @@ -0,0 +1,43 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include "Constants.h" +#include + +namespace rnexecutorch::models::text_to_speech::kokoro { + +using executorch::aten::Tensor; + +class DurationPredictor : public BaseModel { +public: + explicit DurationPredictor(const std::string &modelSource, + std::shared_ptr callInvoker); + + // Returns a tuple (d, indices, effectiveDuration) + std::tuple, int32_t> + generate(const std::string &method, const Configuration &inputConfig, + std::span tokens, std::span textMask, + std::span ref_hs, float speed = 1.F); + +private: + // Helper function - duration 
scalling + // Performs integer scaling on the durations tensor to ensure the sum of + // durations matches the given target duration + void scaleDurations(Tensor &durations, int32_t targetDuration) const; + + // Helper function - calculating effective duration based on duration tensor + // Since we apply padding to the input, the effective duration is + // usually a little bit lower than the max duration defined by static input + // size. + int32_t calculateEffectiveDuration(const Tensor &d, + const std::vector &indices) const; +}; + +} // namespace rnexecutorch::models::text_to_speech::kokoro diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Encoder.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Encoder.cpp new file mode 100644 index 000000000..a06e85e07 --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Encoder.cpp @@ -0,0 +1,57 @@ +#include "Encoder.h" +#include +#include + +namespace rnexecutorch::models::text_to_speech::kokoro { + +using ::executorch::aten::ScalarType; +using ::executorch::extension::make_tensor_ptr; +using ::executorch::extension::TensorPtr; + +Encoder::Encoder(const std::string &modelSource, + std::shared_ptr callInvoker) + : BaseModel(modelSource, callInvoker) { + std::string testMethod = + "forward_" + std::to_string(constants::kInputSmall.noTokens); + auto inputTensors = getAllInputShapes(testMethod); + + // Perform checks to validate model's compatibility with native code + CHECK_SIZE(inputTensors, 3); +} + +Result> Encoder::generate(const std::string &method, + const Configuration &inputConfig, + std::span tokens, + std::span textMask, + std::span pred_aln_trg) { + // Perform input shape checks + // Since every bit in text mask corresponds to exactly one of the tokens, both + // vectors should be the same length + CHECK_SIZE(tokens, textMask.size()); + + // Convert input data to ExecuTorch tensors + 
int32_t noTokens = static_cast(tokens.size()); + auto tokensTensor = make_tensor_ptr({1, static_cast(tokens.size())}, + tokens.data(), ScalarType::Long); + auto textMaskTensor = + make_tensor_ptr({1, static_cast(textMask.size())}, + textMask.data(), ScalarType::Bool); + auto predAlnTrgTensor = + make_tensor_ptr({1, noTokens, inputConfig.duration}, pred_aln_trg.data(), + ScalarType::Float); + + // Execute the appropriate "forward_xyz" method, based on given method name + auto results = + execute(method, {tokensTensor, textMaskTensor, predAlnTrgTensor}); + + if (!results.ok()) { + throw std::runtime_error( + "[Kokoro::Encoder] Failed to execute method " + method + + ", error: " + std::to_string(static_cast(results.error()))); + } + + // [asr] + return results; +} + +} // namespace rnexecutorch::models::text_to_speech::kokoro \ No newline at end of file diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Encoder.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Encoder.h new file mode 100644 index 000000000..10ef3d90f --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Encoder.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include +#include +#include + +#include + +#include "Constants.h" +#include + +namespace rnexecutorch::models::text_to_speech::kokoro { + +class Encoder : public BaseModel { +public: + explicit Encoder(const std::string &modelSource, + std::shared_ptr callInvoker); + + Result> generate(const std::string &method, + const Configuration &inputConfig, + std::span tokens, + std::span textMask, + std::span pred_aln_trg); +}; + +} // namespace rnexecutorch::models::text_to_speech::kokoro diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/F0NPredictor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/F0NPredictor.cpp new file mode 100644 index 
000000000..4864300cc --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/F0NPredictor.cpp @@ -0,0 +1,50 @@ +#include "F0NPredictor.h" +#include +#include + +namespace rnexecutorch::models::text_to_speech::kokoro { + +using ::executorch::aten::ScalarType; +using ::executorch::extension::make_tensor_ptr; +using ::executorch::extension::TensorPtr; + +F0NPredictor::F0NPredictor(const std::string &modelSource, + std::shared_ptr callInvoker) + : BaseModel(modelSource, callInvoker) { + std::string testMethod = + "forward_" + std::to_string(constants::kInputSmall.noTokens); + auto inputTensors = getAllInputShapes(testMethod); + + // Perform checks to validate model's compatibility with native code + CHECK_SIZE(inputTensors, 3); +} + +Result> F0NPredictor::generate( + const std::string &method, const Configuration &inputConfig, + std::span indices, std::span dur, std::span ref_hs) { + // Perform input shape checks + // s vector should be half of a voice reference vector size + CHECK_SIZE(ref_hs, constants::kVoiceRefHalfSize); + + // Convert input data to ExecuTorch tensors + auto indicesTensor = + make_tensor_ptr({inputConfig.duration}, indices.data(), ScalarType::Long); + auto durTensor = make_tensor_ptr({1, inputConfig.noTokens, 640}, dur.data(), + ScalarType::Float); + auto voiceRefTensor = make_tensor_ptr({1, constants::kVoiceRefHalfSize}, + ref_hs.data(), ScalarType::Float); + + // Execute the appropriate "forward_xyz" method, based on given method name + auto results = execute(method, {indicesTensor, durTensor, voiceRefTensor}); + + if (!results.ok()) { + throw std::runtime_error( + "[Kokoro::DurationPredictor] Failed to execute method " + method + + ", error: " + std::to_string(static_cast(results.error()))); + } + + // [F0_pred, N_pred, en, pred_alg_trn] + return results; +} + +} // namespace rnexecutorch::models::text_to_speech::kokoro \ No newline at end of file diff --git 
a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/F0NPredictor.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/F0NPredictor.h new file mode 100644 index 000000000..a7d55fdc8 --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/F0NPredictor.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include +#include +#include + +#include + +#include "Constants.h" +#include + +namespace rnexecutorch::models::text_to_speech::kokoro { + +class F0NPredictor : public BaseModel { +public: + explicit F0NPredictor(const std::string &modelSource, + std::shared_ptr callInvoker); + + Result> generate(const std::string &method, + const Configuration &inputConfig, + std::span indices, + std::span dur, + std::span ref_hs); +}; + +} // namespace rnexecutorch::models::text_to_speech::kokoro diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp new file mode 100644 index 000000000..a365feb67 --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp @@ -0,0 +1,245 @@ +#include "Kokoro.h" +#include "Utils.h" + +#include +#include +#include +#include +#include + +#include +#include + +namespace rnexecutorch::models::text_to_speech::kokoro { + +Kokoro::Kokoro(int language, const std::string &taggerDataSource, + const std::string &phonemizerDataSource, + const std::string &durationPredictorSource, + const std::string &f0nPredictorSource, + const std::string &encoderSource, + const std::string &decoderSource, const std::string &voiceSource, + std::shared_ptr callInvoker) + : callInvoker_(std::move(callInvoker)), + durationPredictor_(durationPredictorSource, callInvoker_), + f0nPredictor_(f0nPredictorSource, callInvoker_), + encoder_(encoderSource, callInvoker_), + 
decoder_(decoderSource, callInvoker_), + phonemizer_(static_cast(language), taggerDataSource, + phonemizerDataSource) { + // Populate the voice array by reading given file + loadVoice(voiceSource); +} + +void Kokoro::loadVoice(const std::string &voiceSource) { + constexpr size_t rows = static_cast(constants::kInputLarge.noTokens); + constexpr size_t cols = static_cast(constants::kVoiceRefSize); // 256 + const size_t expectedCount = rows * cols; + const std::streamsize expectedBytes = + static_cast(expectedCount * sizeof(float)); + + std::ifstream in(voiceSource, std::ios::binary); + if (!in) { + throw std::runtime_error("[Kokoro::loadSingleVoice]: cannot open file: " + + voiceSource); + } + + // Check the file size + in.seekg(0, std::ios::end); + const std::streamsize fileSize = in.tellg(); + in.seekg(0, std::ios::beg); + if (fileSize < expectedBytes) { + throw std::runtime_error( + "[Kokoro::loadSingleVoice]: file too small: expected at least " + + std::to_string(expectedBytes) + " bytes, got " + + std::to_string(fileSize)); + } + + // Read [rows, 1, cols] as contiguous floats directly into voice_ + // ([rows][cols]) + if (!in.read(reinterpret_cast(voice_.data()->data()), + expectedBytes)) { + throw std::runtime_error( + "[Kokoro::loadSingleVoice]: failed to read voice weights"); + } +} + +std::vector Kokoro::generate(std::string text, float speed) { + if (text.size() > constants::kMaxTextSize) { + throw std::invalid_argument("Kokoro: maximum input text size exceeded"); + } + + // G2P (Grapheme to Phoneme) conversion + auto phonemes = phonemizer_.process(text); + + // Divide the phonemes string intro substrings. + // Affects the further calculations only in case of string size + // exceeding the biggest model's input. 
+ auto subsentences = + partitioner_.divide(phonemes); + + std::vector audio = {}; + for (const auto &subsentence : subsentences) { + size_t inputSize = subsentence.size() + 2; + const auto &config = selectConfig(inputSize); + + auto audioPart = generateForConfig(subsentence, config, speed); + + // Calculate a pause between the sentences + char32_t lastPhoneme = subsentence.back(); + size_t pauseMs = constants::kPauseValues.contains(lastPhoneme) + ? constants::kPauseValues.at(lastPhoneme) + : constants::kDefaultPause; + std::vector pause(pauseMs * constants::kSamplesPerMilisecond, 0.F); + + // Add audio part and pause to the main audio vector + audio.insert(audio.end(), std::make_move_iterator(audioPart.begin()), + std::make_move_iterator(audioPart.end())); + audio.insert(audio.end(), std::make_move_iterator(pause.begin()), + std::make_move_iterator(pause.end())); + } + + return audio; +} + +void Kokoro::stream(std::string text, float speed, + std::shared_ptr callback) { + if (text.size() > constants::kMaxTextSize) { + throw std::invalid_argument("Kokoro: maximum input text size exceeded"); + } + + // Build a full callback function + auto nativeCallback = [this, callback](const std::vector &audioVec) { + this->callInvoker_->invokeAsync([callback, audioVec](jsi::Runtime &rt) { + callback->call(rt, + rnexecutorch::jsi_conversion::getJsiValue(audioVec, rt)); + }); + }; + + // G2P (Grapheme to Phoneme) conversion + auto phonemes = phonemizer_.process(text); + + // Divide the phonemes string intro substrings. + // Use specialized implementation to minimize the latency between the + // sentences. + auto subsentences = + partitioner_.divide(phonemes); + + // We follow the implementation of generate() method, but + // instead of accumulating results in a vector, we push them + // back to the JS side with the callback. 
+ for (const auto &subsentence : subsentences) { + size_t inputSize = subsentence.size() + 2; + const auto &config = selectConfig(inputSize); + + auto audioPart = generateForConfig(subsentence, config, speed); + + // Calculate a pause between the sentences + char32_t lastPhoneme = subsentence.back(); + size_t pauseMs = constants::kPauseValues.contains(lastPhoneme) + ? constants::kPauseValues.at(lastPhoneme) + : constants::kDefaultPause; + std::vector pause(pauseMs * constants::kSamplesPerMilisecond, 0.F); + + // Add the pause to the audio vector + audioPart.insert(audioPart.end(), std::make_move_iterator(pause.begin()), + std::make_move_iterator(pause.end())); + + // Push the audio right away to the JS side + nativeCallback(audioPart); + } +} + +std::vector Kokoro::generateForConfig(const std::u32string &phonemes, + const Configuration &config, + float speed) { + // Determine the appropriate method for given input configuration + std::string method = "forward_" + std::to_string(config.noTokens); + + // Map phonemes to tokens + auto tokens = utils::tokenize(phonemes, {config.noTokens}); + + // Select the appropriate voice vector + auto voiceId = std::clamp(static_cast(phonemes.size()) - 1, 0, + config.noTokens - 2); + auto &voice = voice_[voiceId]; + auto ref_ls = std::span(voice).first(constants::kVoiceRefHalfSize); + auto ref_hs = std::span(voice).last(constants::kVoiceRefHalfSize); + + // Initialize text mask + // Exlude all the paddings apart from first and last one. 
+ int32_t inputLength = + std::min(static_cast(phonemes.size()) + 2, config.noTokens); + std::vector textMask(config.noTokens, false); + std::fill(textMask.begin(), textMask.begin() + inputLength, true); + + // Inference 1 - DurationPredictor + // The resulting duration vector is already scalled at this point + auto [d, indices, effectiveDuration] = durationPredictor_.generate( + method, config, std::span(tokens), + std::span(reinterpret_cast(textMask.data()), textMask.size()), + ref_hs, speed); + + // Inference 2 - F0NPredictor + auto f0nPrediction = f0nPredictor_.generate( + method, config, std::span(indices), + std::span(d.data_ptr(), d.numel()), ref_hs); + auto F0_pred = f0nPrediction->at(0).toTensor(); + auto N_pred = f0nPrediction->at(1).toTensor(); + auto en = f0nPrediction->at(2).toTensor(); + auto pred_aln_trg = f0nPrediction->at(3).toTensor(); + + // Inference 3 - Encoder + auto encoding = encoder_.generate( + method, config, std::span(tokens), + std::span(reinterpret_cast(textMask.data()), textMask.size()), + std::span(pred_aln_trg.data_ptr(), pred_aln_trg.numel())); + auto asr = encoding->at(0).toTensor(); + + // Inference 4 - Decoder + auto decoding = decoder_.generate( + method, config, std::span(asr.data_ptr(), asr.numel()), + std::span(F0_pred.data_ptr(), F0_pred.numel()), + std::span(N_pred.data_ptr(), N_pred.numel()), ref_ls); + auto audioTensor = decoding->at(0).toTensor(); + + // Cut the resulting audio vector according to the effective duration + int32_t effLength = constants::kTicksPerDuration * effectiveDuration; + auto audio = + std::span(audioTensor.const_data_ptr(), effLength); + auto croppedAudio = + utils::stripAudio(audio, constants::kSamplesPerMilisecond * 50); + + std::vector result(croppedAudio.begin(), croppedAudio.end()); + + return result; +} + +const Configuration &Kokoro::selectConfig(size_t inputSize) const { + std::string modelLabel = + fixedModel_.has_value() ? 
fixedModel_.value() + : inputSize <= constants::kInputSmall.noTokens ? "small" + : inputSize <= constants::kInputMedium.noTokens ? "medium" + : "large"; + + return constants::kInputs.at(modelLabel); +} + +std::size_t Kokoro::getMemoryLowerBound() const noexcept { + return durationPredictor_.getMemoryLowerBound() + + f0nPredictor_.getMemoryLowerBound() + encoder_.getMemoryLowerBound() + + decoder_.getMemoryLowerBound() + sizeof(voice_) + sizeof(phonemizer_); +} + +void Kokoro::unload() noexcept { + durationPredictor_.unload(); + f0nPredictor_.unload(); + encoder_.unload(); + decoder_.unload(); +} + +void Kokoro::setFixedModel(std::string modelLabel) { + partitioner_.setFixedModel(modelLabel); + fixedModel_ = {modelLabel}; +} + +} // namespace rnexecutorch::models::text_to_speech::kokoro diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h new file mode 100644 index 000000000..e534e579a --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h @@ -0,0 +1,86 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "Decoder.h" +#include "DurationPredictor.h" +#include "Encoder.h" +#include "F0NPredictor.h" +#include "Partitioner.h" +#include +#include + +namespace rnexecutorch { +namespace models::text_to_speech::kokoro { + +class Kokoro { +public: + Kokoro(int language, const std::string &taggerDataSource, + const std::string &phonemizerDataSource, + const std::string &durationPredictorSource, + const std::string &f0nPredictorSource, + const std::string &encoderSource, const std::string &decoderSource, + const std::string &voiceSource, + std::shared_ptr callInvoker); + + // Processes the entire text at once, before sending back to the JS side. 
+  std::vector generate(std::string text, float speed = 1.F);
+
+  // Processes text in chunks, sending each chunk individually to the JS side
+  // with asynchronous callbacks.
+  void stream(std::string text, float speed,
+              std::shared_ptr callback);
+
+  std::size_t getMemoryLowerBound() const noexcept;
+  void unload() noexcept;
+
+  // Extra options setters
+  void setFixedModel(std::string modelLabel);
+
+private:
+  // Helper function - loading voice array
+  void loadVoice(const std::string &voiceSource);
+
+  // Helper function - selecting the appropriate input config for given input
+  // size
+  const Configuration &selectConfig(size_t inputSize) const;
+
+  // Helper function - generate specialization for given input size
+  std::vector generateForConfig(const std::u32string &phonemes,
+                                const Configuration &config,
+                                float speed);
+
+  // JS callback handle
+  std::shared_ptr callInvoker_;
+
+  // Submodules
+  Partitioner partitioner_; // Not a part of the original Kokoro inference
+  DurationPredictor durationPredictor_;
+  F0NPredictor f0nPredictor_;
+  Encoder encoder_;
+  Decoder decoder_;
+
+  // Voice array
+  // There is a separate voice vector for each of the possible numbers of input
+  // tokens.
+ std::array, + constants::kInputLarge.noTokens> + voice_; + + // Phonemizer pipeline + phonemis::Pipeline phonemizer_; + + // Extra options + std::optional fixedModel_; +}; +} // namespace models::text_to_speech::kokoro + +REGISTER_CONSTRUCTOR(models::text_to_speech::kokoro::Kokoro, int, std::string, + std::string, std::string, std::string, std::string, + std::string, std::string, + std::shared_ptr); +} // namespace rnexecutorch \ No newline at end of file diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.cpp new file mode 100644 index 000000000..4aef1f8ca --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.cpp @@ -0,0 +1,187 @@ +#include "Partitioner.h" +#include "Constants.h" +#include +#include +#include +#include + +namespace rnexecutorch::models::text_to_speech::kokoro { + +// Custom infinity definition +constexpr Partitioner::Cost INF = 1e5; + +template <> +std::vector +Partitioner::divide( + const std::u32string &phonemes) { + // Update the small model cost back to normal + modelCosts_.at("small") = 4; + + return divide(phonemes, [](Cost a, Cost b) { return a + b; }); +} + +template <> +std::vector Partitioner::divide( + const std::u32string &phonemes) { + // In streaming mode, we particularly want to avoid using + // small model, since it might introduce a bigger latency + // if followed by the large model. 
+  modelCosts_.at("small") = 1000;
+
+  if (phonemes.size() <= constants::kInputMedium.noTokens - 2) {
+    return {phonemes};
+  }
+
+  // Try to start with a medium-sized model
+  auto begPart = phonemes.substr(0, constants::kInputMedium.noTokens - 2);
+  auto lastEosIt =
+      std::find_if(begPart.rbegin(), begPart.rend(), [](char32_t c) {
+        return constants::kEndOfSentencePhonemes.contains(c);
+      });
+  auto lastPauseIt =
+      std::find_if(begPart.rbegin(), begPart.rend(), [](char32_t c) {
+        return constants::kPausePhonemes.contains(c);
+      });
+  int32_t lastEos = lastEosIt != begPart.rend()
+                        ? static_cast(std::distance(
+                              begPart.begin(), lastEosIt.base())) -
+                              1
+                        : -1;
+  int32_t lastPause = lastPauseIt != begPart.rend()
+                          ? static_cast(std::distance(
+                                begPart.begin(), lastPauseIt.base())) -
+                                1
+                          : -1;
+
+  if (!fixedModel_.has_value() &&
+      std::max(lastEos, lastPause) > constants::kInputSmall.noTokens - 2) {
+    int32_t breakpoint =
+        lastEos > constants::kInputSmall.noTokens - 2 ? lastEos : lastPause;
+
+    std::vector result = {phonemes.substr(0, breakpoint + 1)};
+    auto rest = divide(
+        phonemes.substr(breakpoint + 1, phonemes.size() - breakpoint - 1),
+        [](Cost a, Cost b) { return a + b; });
+    result.insert(result.end(), std::make_move_iterator(rest.begin()),
+                  std::make_move_iterator(rest.end()));
+    return result;
+  }
+
+  return divide(phonemes, [](Cost a, Cost b) { return a + b; });
+}
+
+void Partitioner::setFixedModel(const std::string &modelLabel) {
+  if (!constants::kInputs.contains(modelLabel))
+    throw std::invalid_argument("Partitioner: invalid fixed model label");
+
+  fixedModel_ = {modelLabel};
+}
+
+void Partitioner::resetOptions() { fixedModel_ = std::nullopt; }
+
+// Helper function - partitioning
+// A template which is controlled by concrete operator instead of
+// an abstract Strategy argument.
+// Utilizes dynamic programming approach for finding the
+// optimal solution.
+std::vector
+Partitioner::divide(const std::u32string &phonemes,
+                    const std::function &op) {
+  // DP array
+  // (cost, prev_breakpoint_idx) pairs
+  std::vector> mem(phonemes.size(), {INF, -1});
+
+  // Keep the potential break point indices to speed up the calculation.
+  std::deque eosPoints, pausePoints, whitePoints;
+
+  for (int32_t i = 0; i < phonemes.size(); i++) {
+    auto &[estimation, prevBreakIdx] = mem[i];
+
+    // We assume that phonemes[i] is the last character of currently analyzed
+    // substring. First, estimate for the entire substring without further
+    // division.
+    estimation = cost(i + 1);
+
+    // Now, try to divide into 2 substrings and utilize already calculated
+    // values for left-side substring.
+    for (auto *q : {&eosPoints, &pausePoints, &whitePoints}) {
+      // First, clear the queues from useless entries (out of even largest
+      // model bounds).
+      while (!q->empty() && q->front() < i - constants::kInputLarge.noTokens) {
+        q->pop_front();
+      }
+
+      // Now iterate through the remaining positions.
+      Cost penalty = q == &eosPoints     ? eosPenalty
+                     : q == &pausePoints ? pausePenalty
+                                         : whitePenalty;
+      for (int32_t breakIdx : (*q)) {
+        Cost newEstimation =
+            op(mem[breakIdx].first, cost(i - breakIdx)) + penalty;
+        if (newEstimation < estimation && breakIdx > 0) {
+          estimation = newEstimation;
+          prevBreakIdx = breakIdx;
+        }
+      }
+    }
+
+    // Add current phoneme to the appropriate queue.
+    char32_t phoneme = phonemes[i];
+    if (constants::kEndOfSentencePhonemes.contains(phoneme)) {
+      eosPoints.push_back(i);
+    } else if (constants::kPausePhonemes.contains(phoneme)) {
+      pausePoints.push_back(i);
+    } else if (phoneme < 256 && std::isspace(static_cast(phoneme))) {
+      whitePoints.push_back(i);
+    }
+  }
+
+  std::vector result = {};
+
+  // Perform backtracking to obtain all the substrings.
+  // Note that because of backtracking, the order is reversed.
+ int32_t end = phonemes.size() - 1; + while (end != -1) { + int32_t begin = mem[end].second + 1; + result.push_back(phonemes.substr(begin, end - begin + 1)); + end = mem[end].second; + } + + std::ranges::reverse(result); + + return result; +} + +// Helper function - cost estimation (by string size) +Partitioner::Cost Partitioner::cost(size_t stringSize) { + size_t effSize = stringSize + 2; + + // If fixed model is set, we are limited to using only one of the models. + std::string activeModel = + fixedModel_.has_value() ? fixedModel_.value() + : effSize <= constants::kInputSmall.noTokens ? "small" + : effSize <= constants::kInputMedium.noTokens ? "medium" + : "large"; + + const Configuration &modelConfig = constants::kInputs.at(activeModel); + Cost baseCost = + effSize <= modelConfig.noTokens ? modelCosts_.at(activeModel) : INF; + + // Scale the cost according to sentence length / input proportion. + // The idea is to penalize creating sentences much shorter than + // corresponding input length. 
+ if (effSize < modelConfig.noTokens) { + baseCost += baseCost * (modelConfig.noTokens - effSize) * + (modelConfig.noTokens - effSize) / + (modelConfig.noTokens * modelConfig.noTokens); + } + + return baseCost; +} + +// Helper function - cost estimation (by string) +Partitioner::Cost Partitioner::cost(const std::u32string &phonemes) { + return cost(phonemes.size()); +} + +} // namespace rnexecutorch::models::text_to_speech::kokoro \ No newline at end of file diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.h new file mode 100644 index 000000000..e992d28b4 --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.h @@ -0,0 +1,57 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace rnexecutorch::models::text_to_speech::kokoro { + +class Partitioner { +public: + // Partition strategy + // Defines how to divide phoneme string into substrings, by minimizing + // one of the selected properties. + enum class Strategy { + TOTAL_TIME = 0, // Only minimizes the estimated total time of processing + LATENCY, // Minimizes the streaming latency by dividing into small and + // similar length parts + }; + + // Cost definition + using Cost = int32_t; + + // Partition function + // Performs a division of the input phoneme string according to + // given strategy. 
+ template + std::vector divide(const std::u32string &phonemes); + + // Extra options setters + void setFixedModel(const std::string &modelLabel); + void resetOptions(); + +private: + // Helper function - partitioning + std::vector divide(const std::u32string &phonemes, + const std::function &op); + // Helper function - cost estimation (by string size) + Cost cost(size_t stringSize); + // Helper function - cost estimation (by string) + Cost cost(const std::u32string &phonemes); + + // Predefined costs + // Affect the algorithm behavior in selecting break points and + // therefore partitioning the strings. + std::unordered_map modelCosts_ = { + {"small", 40}, {"medium", 70}, {"large", 100}}; + Cost eosPenalty = 0; + Cost pausePenalty = 30; + Cost whitePenalty = 80; + + // Extra settings + std::optional fixedModel_ = std::nullopt; +}; + +} // namespace rnexecutorch::models::text_to_speech::kokoro \ No newline at end of file diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Types.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Types.h new file mode 100644 index 000000000..9c9d1ffcf --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Types.h @@ -0,0 +1,19 @@ +#pragma once + +#include + +namespace rnexecutorch::models::text_to_speech::kokoro { +// Type definitions - model (input) configuration +// Since all parts of the Kokoro model are exported with static input shapes, +// it operates on 3 levels of input size - defined by the configuration below. 
+struct Configuration {
+  int32_t noTokens; // Number of input tokens
+  int32_t duration; // Expected (maximal) duration (80 ~ 2 seconds of audio)
+};
+
+// Type definitions - model input tokens
+// TODO: It's possible to switch to int32_t after reexporting the models with
+// dtype=torch.int
+using Token = int64_t;
+
+} // namespace rnexecutorch::models::text_to_speech::kokoro
\ No newline at end of file
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.cpp
new file mode 100644
index 000000000..88e0cd68c
--- /dev/null
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.cpp
@@ -0,0 +1,93 @@
+#include "Utils.h"
+#include "Constants.h"
+#include
+#include
+
+namespace rnexecutorch::models::text_to_speech::kokoro::utils {
+
+// Helper functions
+namespace {
+// Normalizes an audio sample
+float normalize(float sample) {
+  float v = std::abs(sample);
+  return v >= constants::kAudioSilenceThreshold ? v : 0.F;
+}
+
+// Returns an index corresponding to the first (or last - if reverse=true)
+// non-quiet part of an audio.
+// Utilizes a moving average controlled by hyperparameters from Constants.h.
+template size_t findAudioBound(std::span audio) {
+  if (audio.empty()) {
+    return 0;
+  }
+
+  size_t length = audio.size();
+
+  float sum = 0.F;
+  size_t count = 0;
+  size_t i = reverse ? length - 1 : 0;
+
+  while (count < length) {
+    count++;
+    sum += normalize(audio[i]);
+    if (count > constants::kAudioCroppingSteps) {
+      sum -= normalize(audio[reverse ? i + constants::kAudioCroppingSteps
+                                     : i - constants::kAudioCroppingSteps]);
+    }
+
+    if (count >= constants::kAudioCroppingSteps &&
+        sum / constants::kAudioCroppingSteps >=
+            constants::kAudioSilenceThreshold) {
+      return i;
+    }
+
+    i = reverse ? i - 1 : i + 1;
+  }
+
+  return reverse ?
0 : length - 1; +} +} // namespace + +std::span stripAudio(std::span audio, size_t margin) { + auto lbound = findAudioBound(audio); + auto rbound = findAudioBound(audio); + + lbound = std::max(lbound - margin, size_t(0)); + rbound = std::min(rbound + margin, audio.size() - 1); + + return audio.subspan(lbound, rbound >= lbound ? rbound - lbound + 1 : 0); +} + +std::vector tokenize(const std::u32string &phonemes, + std::optional expectedSize) { + if (expectedSize.has_value() && expectedSize.value() < 2) { + throw std::invalid_argument( + "expected number of tokens cannot be lower than 2"); + } + + // Number of tokens to populate, with and without edge pad tokens + size_t lengthWithPadding = + expectedSize.has_value() ? expectedSize.value() : phonemes.size() + 2; + size_t lengthWithoutPadding = lengthWithPadding - 2; + size_t effNoTokens = std::min(lengthWithoutPadding, phonemes.size()); + + // Note that we populate tokens[1:noTokens - 1], since first and last tokens + // are zeros (padding). Input could still contain unrecognized tokens, and + // that's why we use partition() at the end. + std::vector tokens(lengthWithPadding, constants::kPadToken); + std::transform(phonemes.begin(), phonemes.begin() + effNoTokens, + tokens.begin() + 1, [](char32_t p) -> Token { + return constants::kVocab.contains(p) + ? 
constants::kVocab.at(p)
+                               : constants::kInvalidToken;
+                 });
+  auto validSeqEnd = std::partition(
+      tokens.begin() + 1, tokens.begin() + effNoTokens + 1,
+      [](Token t) -> bool { return t != constants::kInvalidToken; });
+  std::fill(validSeqEnd, tokens.begin() + effNoTokens + 1,
+            constants::kPadToken);
+
+  return tokens;
+}
+
+} // namespace rnexecutorch::models::text_to_speech::kokoro::utils
\ No newline at end of file
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.h
new file mode 100644
index 000000000..081d40c14
--- /dev/null
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "Types.h"
+#include
+#include
+#include
+#include
+
+namespace rnexecutorch::models::text_to_speech::kokoro::utils {
+
+// Removes silence from the beginning and the end of an audio (with some
+// margin).
+// Returns a [l - m, r + m] range of audio samples, where m is the margin,
+// l and r correspond to lower and upper audio bound respectively.
+std::span stripAudio(std::span audio,
+                     size_t margin = 0);
+
+// Tokenizes given phoneme string.
+// Each phoneme corresponds to exactly one token, with 2 additional pad
+// tokens added at both ends.
+// If the expected number of tokens is provided, pads the token vector
+// with pad tokens to match the given length.
+std::vector tokenize(const std::u32string &phonemes, + std::optional expectedSize = std::nullopt); + +} // namespace rnexecutorch::models::text_to_speech::kokoro::utils \ No newline at end of file diff --git a/packages/react-native-executorch/react-native-executorch.podspec b/packages/react-native-executorch/react-native-executorch.podspec index f5e1b33c2..e7c72b92c 100644 --- a/packages/react-native-executorch/react-native-executorch.podspec +++ b/packages/react-native-executorch/react-native-executorch.podspec @@ -17,6 +17,7 @@ Pod::Spec.new do |s| pthreadpool_binaries_path = File.expand_path('$(PODS_TARGET_SRCROOT)/third-party/ios/libs/pthreadpool', __dir__) cpuinfo_binaries_path = File.expand_path('$(PODS_TARGET_SRCROOT)/third-party/ios/libs/cpuinfo', __dir__) + phonemis_binaries_path = File.expand_path('$(PODS_TARGET_SRCROOT)/third-party/ios/libs/phonemis', __dir__) s.user_target_xcconfig = { "HEADER_SEARCH_PATHS" => "$(PODS_TARGET_SRCROOT)/third-party/include", @@ -27,7 +28,9 @@ Pod::Spec.new do |s| "\"#{tokenizers_binaries_path}/physical-arm64-release/libsentencepiece.a\"", "\"#{tokenizers_binaries_path}/physical-arm64-release/libtokenizers_c.a\"", "\"#{pthreadpool_binaries_path}/physical-arm64-release/libpthreadpool.a\"", - "\"#{cpuinfo_binaries_path}/libcpuinfo.a\"" + "\"#{cpuinfo_binaries_path}/libcpuinfo.a\"", + "\"#{phonemis_binaries_path}/physical-arm64-release/libphonemis.a\"", + ].join(' '), "OTHER_LDFLAGS[sdk=iphonesimulator*]" => [ @@ -36,7 +39,8 @@ Pod::Spec.new do |s| "\"#{tokenizers_binaries_path}/simulator-arm64-debug/libsentencepiece.a\"", "\"#{tokenizers_binaries_path}/simulator-arm64-debug/libtokenizers_c.a\"", "\"#{pthreadpool_binaries_path}/simulator-arm64-debug/libpthreadpool.a\"", - "\"#{cpuinfo_binaries_path}/libcpuinfo.a\"" + "\"#{cpuinfo_binaries_path}/libcpuinfo.a\"", + "\"#{phonemis_binaries_path}/simulator-arm64-debug/libphonemis.a\"", ].join(' '), 'EXCLUDED_ARCHS[sdk=iphonesimulator*]' => 'x86_64', diff --git 
a/packages/react-native-executorch/scripts/create-package.sh b/packages/react-native-executorch/scripts/create-package.sh deleted file mode 100755 index 8b77cc20b..000000000 --- a/packages/react-native-executorch/scripts/create-package.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -yarn install --immutable - -if [ $# -ge 1 ] && [ "$1" = "generate_nightly_version" ]; then - VERSION=$(jq -r '.version' package.json) - IFS='.' read -r MAJOR MINOR PATCH <<<"$VERSION" - GIT_COMMIT=$(git rev-parse HEAD) - DATE=$(date +%Y%m%d) - NIGHTLY_UNIQUE_NAME="${GIT_COMMIT:0:7}-$DATE" - if [[ "$OSTYPE" == "darwin"* ]]; then - sed -i '' "3s/.*/ \"version\": \"$MAJOR.$MINOR.$PATCH-nightly-$NIGHTLY_UNIQUE_NAME\",/" package.json - else - sed -i "3s/.*/ \"version\": \"$MAJOR.$MINOR.$PATCH-nightly-$NIGHTLY_UNIQUE_NAME\",/" package.json - fi -fi - -yarn bob build - -npm pack - -if [ $# -ge 1 ] && [ "$1" = "generate_nightly_version" ]; then - if [[ "$OSTYPE" == "darwin"* ]]; then - sed -i '' "3s/.*/ \"version\": \"$MAJOR.$MINOR.$PATCH\",/" package.json - else - sed -i "3s/.*/ \"version\": \"$MAJOR.$MINOR.$PATCH\",/" package.json - fi -fi - -echo "Done!" 
diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index 50e7ef5a8..40fdcb5d0 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -1,9 +1,9 @@ import { Platform } from 'react-native'; -const URL_PREFIX = +export const URL_PREFIX = 'https://huggingface.co/software-mansion/react-native-executorch'; -const VERSION_TAG = 'resolve/v0.6.0'; -// const NEXT_VERSION_TAG = 'resolve/v0.7.0'; +export const VERSION_TAG = 'resolve/v0.6.0'; +export const NEXT_VERSION_TAG = 'resolve/v0.7.0'; // LLMs diff --git a/packages/react-native-executorch/src/constants/tts/models.ts b/packages/react-native-executorch/src/constants/tts/models.ts new file mode 100644 index 000000000..c360c79f6 --- /dev/null +++ b/packages/react-native-executorch/src/constants/tts/models.ts @@ -0,0 +1,14 @@ +import { URL_PREFIX, NEXT_VERSION_TAG } from '../modelUrls'; + +// Text to speech (tts) - Kokoro model(s) +const KOKORO_EN_DURATION_PREDICTOR = `${URL_PREFIX}-kokoro/${NEXT_VERSION_TAG}/xnnpack/duration_predictor.pte`; +const KOKORO_EN_F0N_PREDICTOR = `${URL_PREFIX}-kokoro/${NEXT_VERSION_TAG}/xnnpack/f0n_predictor.pte`; +const KOKORO_EN_TEXT_ENCODER = `${URL_PREFIX}-kokoro/${NEXT_VERSION_TAG}/xnnpack/text_encoder.pte`; +const KOKORO_EN_TEXT_DECODER = `${URL_PREFIX}-kokoro/${NEXT_VERSION_TAG}/xnnpack/text_decoder.pte`; + +export const KOKORO_EN = { + durationPredictorSource: KOKORO_EN_DURATION_PREDICTOR, + f0nPredictorSource: KOKORO_EN_F0N_PREDICTOR, + textEncoderSource: KOKORO_EN_TEXT_ENCODER, + textDecoderSource: KOKORO_EN_TEXT_DECODER, +}; diff --git a/packages/react-native-executorch/src/constants/tts/voices.ts b/packages/react-native-executorch/src/constants/tts/voices.ts new file mode 100644 index 000000000..c1a92975e --- /dev/null +++ b/packages/react-native-executorch/src/constants/tts/voices.ts @@ -0,0 +1,60 @@ +import { 
TextToSpeechLanguage } from '../../types/tts'; +import { URL_PREFIX, NEXT_VERSION_TAG } from '../modelUrls'; + +// Kokoro voices - phonemizers +const KOKORO_PHONEMIZER_PREFIX = `${URL_PREFIX}-kokoro/${NEXT_VERSION_TAG}/phonemizer`; +const KOKORO_PHONEMIZER_TAGGER_DATA = `${KOKORO_PHONEMIZER_PREFIX}/tags.json`; +const KOKORO_PHONEMIZER_LEXICON_EN_US_DATA = `${KOKORO_PHONEMIZER_PREFIX}/us_merged.json`; +const KOKORO_PHONEMIZER_LEXICON_EN_GB_DATA = `${KOKORO_PHONEMIZER_PREFIX}/gb_merged.json`; + +const EN_US_RESOURCES = { + tagger: KOKORO_PHONEMIZER_TAGGER_DATA, + lexicon: KOKORO_PHONEMIZER_LEXICON_EN_US_DATA, +}; +const EN_GB_RESOURCES = { + tagger: KOKORO_PHONEMIZER_TAGGER_DATA, + lexicon: KOKORO_PHONEMIZER_LEXICON_EN_GB_DATA, +}; + +// Kokoro voices +const KOKORO_VOICE_PREFIX = `${URL_PREFIX}-kokoro/${NEXT_VERSION_TAG}/voices`; +export const KOKORO_VOICE_AF_HEART = { + language: TextToSpeechLanguage.EN_US, + data: `${KOKORO_VOICE_PREFIX}/af_heart.bin`, + extra: EN_US_RESOURCES, +}; +export const KOKORO_VOICE_AF_RIVER = { + language: TextToSpeechLanguage.EN_US, + data: `${KOKORO_VOICE_PREFIX}/af_river.bin`, + extra: EN_US_RESOURCES, +}; +export const KOKORO_VOICE_AF_SARAH = { + language: TextToSpeechLanguage.EN_US, + data: `${KOKORO_VOICE_PREFIX}/af_sarah.bin`, + extra: EN_US_RESOURCES, +}; +export const KOKORO_VOICE_AM_ADAM = { + language: TextToSpeechLanguage.EN_US, + data: `${KOKORO_VOICE_PREFIX}/am_adam.bin`, + extra: EN_US_RESOURCES, +}; +export const KOKORO_VOICE_AM_MICHAEL = { + language: TextToSpeechLanguage.EN_US, + data: `${KOKORO_VOICE_PREFIX}/am_michael.bin`, + extra: EN_US_RESOURCES, +}; +export const KOKORO_VOICE_AM_SANTA = { + language: TextToSpeechLanguage.EN_US, + data: `${KOKORO_VOICE_PREFIX}/am_santa.bin`, + extra: EN_US_RESOURCES, +}; +export const KOKORO_VOICE_BF_EMMA = { + language: TextToSpeechLanguage.EN_GB, + data: `${KOKORO_VOICE_PREFIX}/bf_emma.bin`, + extra: EN_GB_RESOURCES, +}; +export const KOKORO_VOICE_BM_DANIEL = { + language: 
TextToSpeechLanguage.EN_GB, + data: `${KOKORO_VOICE_PREFIX}/bm_daniel.bin`, + extra: EN_GB_RESOURCES, +}; diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts new file mode 100644 index 000000000..454c206c9 --- /dev/null +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts @@ -0,0 +1,101 @@ +import { useEffect, useState } from 'react'; +import { TextToSpeechModule } from '../../modules/natural_language_processing/TextToSpeechModule'; +import { + TextToSpeechConfig, + TextToSpeechInput, + TextToSpeechStreamingInput, +} from '../../types/tts'; +import { ETError, getError } from '../../Error'; + +interface Props extends TextToSpeechConfig { + preventLoad?: boolean; +} + +export const useTextToSpeech = ({ + model, + voice, + options, + preventLoad = false, +}: Props) => { + const [error, setError] = useState(null); + const [isReady, setIsReady] = useState(false); + const [isGenerating, setIsGenerating] = useState(false); + const [downloadProgress, setDownloadProgress] = useState(0); + + const [moduleInstance] = useState(() => new TextToSpeechModule()); + + // Stabilize options to prevent unnecessary reloads when new object references are passed + const optionsJson = JSON.stringify(options); + + useEffect(() => { + if (preventLoad) return; + + (async () => { + setDownloadProgress(0); + setError(null); + try { + setIsReady(false); + await moduleInstance.load( + { + model, + voice, + options, + }, + setDownloadProgress + ); + setIsReady(true); + } catch (err) { + setError((err as Error).message); + } + })(); + + return () => { + moduleInstance.delete(); + }; + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [ + moduleInstance, + model.durationPredictorSource, + model.f0nPredictorSource, + model.textEncoderSource, + model.textDecoderSource, + voice?.data, + voice?.extra, + 
optionsJson, + preventLoad, + ]); + + const forward = async (input: TextToSpeechInput) => { + if (!isReady) throw new Error(getError(ETError.ModuleNotLoaded)); + if (isGenerating) throw new Error(getError(ETError.ModelGenerating)); + try { + setIsGenerating(true); + return await moduleInstance.forward(input.text, input.speed ?? 1.0); + } finally { + setIsGenerating(false); + } + }; + + const stream = async (input: TextToSpeechStreamingInput) => { + if (!isReady) throw new Error(getError(ETError.ModuleNotLoaded)); + if (isGenerating) throw new Error(getError(ETError.ModelGenerating)); + try { + setIsGenerating(true); + await moduleInstance.stream({ + ...input, + speed: input.speed ?? 1.0, + }); + } finally { + setIsGenerating(false); + } + }; + + return { + error, + isReady, + isGenerating, + forward, + stream, + downloadProgress, + }; +}; diff --git a/packages/react-native-executorch/src/index.ts b/packages/react-native-executorch/src/index.ts index cddc6f595..5df0daf40 100644 --- a/packages/react-native-executorch/src/index.ts +++ b/packages/react-native-executorch/src/index.ts @@ -27,6 +27,16 @@ declare global { decoderSource: string, modelName: string ) => any; + var loadTextToSpeechKokoro: ( + language: number, + taggerData: string, + phonemizerData: string, + durationPredictorSource: string, + f0nPredictorSource: string, + textEncoderSource: string, + textDecoderSource: string, + voice: string + ) => any; var loadOCR: ( detectorSource: string, recognizerLarge: string, @@ -56,6 +66,7 @@ if ( global.loadVAD == null || global.loadLLM == null || global.loadSpeechToText == null || + global.loadTextToSpeechKokoro == null || global.loadOCR == null || global.loadVerticalOCR == null ) { @@ -79,6 +90,7 @@ export * from './hooks/computer_vision/useTextToImage'; export * from './hooks/natural_language_processing/useLLM'; export * from './hooks/natural_language_processing/useSpeechToText'; +export * from './hooks/natural_language_processing/useTextToSpeech'; export * from 
'./hooks/natural_language_processing/useTextEmbeddings'; export * from './hooks/natural_language_processing/useTokenizer'; export * from './hooks/natural_language_processing/useVAD'; @@ -123,4 +135,6 @@ export { // constants export * from './constants/modelUrls'; export * from './constants/ocr/models'; +export * from './constants/tts/models'; +export * from './constants/tts/voices'; export * from './constants/llmDefaults'; diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts new file mode 100644 index 000000000..ec678f972 --- /dev/null +++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts @@ -0,0 +1,121 @@ +import { ResourceFetcher } from '../../utils/ResourceFetcher'; +import { ETError, getError } from '../../Error'; +import { BaseModule } from '../BaseModule'; +import { + KokoroConfig, + TextToSpeechConfig, + TextToSpeechStreamingInput, + VoiceConfig, +} from '../../types/tts'; + +export class TextToSpeechModule extends BaseModule { + public async load( + config: TextToSpeechConfig, + onDownloadProgressCallback: (progress: number) => void = () => {} + ): Promise { + const anySourceKey = Object.keys(config.model).find((key) => + key.includes('Source') + ); + if (anySourceKey === undefined) { + throw new Error('No model source provided.'); + } + + // Select the text to speech model based on the input URL + // TODO: this check is pretty dubious and should be replaced with something better. + const uri = (config.model as any)[anySourceKey]; + if (uri.includes('kokoro')) { + await this.loadKokoro( + config.model, + config.voice!, + onDownloadProgressCallback, + config.options + ); + } + // ... more models? ... 
+ } + + // Specialized loader - Kokoro model + private async loadKokoro( + model: KokoroConfig, + voice: VoiceConfig, + onDownloadProgressCallback: (progress: number) => void, + options?: any + ): Promise { + if (!voice.extra || !voice.extra.tagger || !voice.extra.lexicon) { + throw new Error( + 'Kokoro: voice config is missing required extra fields: tagger and/or lexicon.' + ); + } + + const paths = await ResourceFetcher.fetch( + onDownloadProgressCallback, + ...Object.values(model), + voice.data, + voice.extra!.tagger, + voice.extra!.lexicon + ); + + if (paths === null || paths.length < 7) { + throw new Error('Download interrupted.'); + } + + const modelPaths = paths.slice(0, 4); + const voiceDataPath = paths[4]; + const phonemizerPaths = paths.slice(5, 7); + + this.nativeModule = global.loadTextToSpeechKokoro( + voice.language, + phonemizerPaths[0]!, + phonemizerPaths[1]!, + modelPaths[0]!, + modelPaths[1]!, + modelPaths[2]!, + modelPaths[3]!, + voiceDataPath! + ); + + // Handle extra options + if (options && 'fixedModel' in options) { + const allowedModels = ['small', 'medium', 'large']; + const fixedModelValue = options.fixedModel; + if (!allowedModels.includes(fixedModelValue)) { + throw new Error( + `Invalid fixedModel value: ${fixedModelValue}. 
Allowed values are: ${allowedModels.join(', ')}.` + ); + } + this.nativeModule.setFixedModel(fixedModelValue); + } + } + + public async forward(text: string, speed: number = 1.0) { + if (this.nativeModule == null) + throw new Error(getError(ETError.ModuleNotLoaded)); + return await this.nativeModule.generate(text, speed); + } + + public async stream({ + text, + onBegin, + onNext, + onEnd, + speed, + }: TextToSpeechStreamingInput) { + let queue = Promise.resolve(); + + onBegin?.(); + + try { + await this.nativeModule.stream(text, speed, (audio: number[]) => { + queue = queue.then(() => + Promise.resolve(onNext?.(new Float32Array(audio))) + ); + }); + + await queue; + } catch (e) { + throw e; + } finally { + onEnd?.(); + } + } +} diff --git a/packages/react-native-executorch/src/types/tts.ts b/packages/react-native-executorch/src/types/tts.ts new file mode 100644 index 000000000..b119ebf68 --- /dev/null +++ b/packages/react-native-executorch/src/types/tts.ts @@ -0,0 +1,44 @@ +import { ResourceSource } from './common'; + +// List all the languages available in TTS models +// The values should match the one used within the native side. +export enum TextToSpeechLanguage { + EN_US = 0, + EN_GB = 1, +} + +// Voice configuration +// So far in Kokoro, each voice is directly associated with a language. +// The 'data' field corresponds to (usually) binary file with voice tensor. +export interface VoiceConfig { + language: TextToSpeechLanguage; + data: ResourceSource; + extra?: Record; +} + +// Individual model configurations +// - Kokoro Configuration (including Phonemis tagger resource) +export interface KokoroConfig { + durationPredictorSource: ResourceSource; + f0nPredictorSource: ResourceSource; + textEncoderSource: ResourceSource; + textDecoderSource: ResourceSource; +} + +// Model + voice configurations +export interface TextToSpeechConfig { + model: KokoroConfig; // ... 
add other model types in the future + voice?: VoiceConfig; + options?: any; // A completely optional model-specific configuration +} + +export interface TextToSpeechInput { + text: string; + speed?: number; +} + +export interface TextToSpeechStreamingInput extends TextToSpeechInput { + onBegin?: () => void | Promise; + onNext?: (audio: Float32Array) => void | Promise; + onEnd?: () => void | Promise; +} diff --git a/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a b/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a new file mode 100644 index 000000000..a40254dcc Binary files /dev/null and b/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a differ diff --git a/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a b/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a new file mode 100644 index 000000000..c08d890ef Binary files /dev/null and b/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a differ diff --git a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/constants.h b/packages/react-native-executorch/third-party/include/phonemis/phonemizer/constants.h new file mode 100644 index 000000000..f2f309976 --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/phonemizer/constants.h @@ -0,0 +1,65 @@ +#pragma once + +#include +#include +#include + +namespace phonemis::phonemizer::constants { + +// Control constants & hyperparameters +// Determine the behavior of the phonemization algorithms. 
+inline constexpr int32_t kMaxSyllabeLength = + 6; // See the fallback phonemization mechanism +inline constexpr int32_t kVowelSyllabePenalty = + 2; // See the fallback phonemization mechanism + +// Alphabet-related constants +namespace alphabet { +inline const std::string kVowels = "aeiouy"; // Written vowels +inline const std::string kConsosants = + "bcdfghjklmnpqrstvwxz"; // Written consosants + +// Acceptable number suffixes +// Cause numbers to be converted into ordinal instead of cardinal representation +inline const std::unordered_set kOrdinalSuffixes = {"st", "nd", + "rd", "th"}; + +inline const std::unordered_map kAddSymbols = { + {'.', "dot"}, {'/', "slash"}}; + +inline const std::unordered_map kSymbols = {{'%', "percent"}, + {'&', "and"}, + {'+', "plus"}, + {'@', "at"}, + {'=', "equals"}}; + +inline const std::unordered_set kPunctations = {';', ':', ',', '.', '!', + '?', '-', '"', '\''}; + +inline const std::unordered_set kNonQuotePunctations = { + ';', ':', ',', '.', '!', '?', '-', '\''}; + +// Acceptable currencies (with spoken text representation) +// Maps currency signatures to it's spoken representation for both main and +// fractional units +inline const std::unordered_map> + kCurrencies = {{U'$', {"dolar", "cent"}}, + {U'£', {"pound", "pence"}}, + {U'€', {"euro", "cent"}}}; +} // namespace alphabet + +// Language (spoken) constants +namespace language { +inline const std::u32string kVowels = U"AIOQWYaiuæɑɒɔəɛɜɪʊʌᵻ"; // Spoken vowels +inline const std::u32string kConsonants = + U"bdfhjklmnpstvwzðŋɡɹɾʃʒʤʧθ"; // Spoken consosants +inline const std::u32string kUSTaus = U"AIOWYiuæɑəɛɪɹʊʌ"; +} // namespace language + +// Stress calculation constants +namespace stress { +inline constexpr char32_t kPrimary = U'ˈ'; +inline constexpr char32_t kSecondary = U'ˌ'; +} // namespace stress + +} // namespace phonemis::phonemizer::constants \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/lexicon.h 
b/packages/react-native-executorch/third-party/include/phonemis/phonemizer/lexicon.h new file mode 100644 index 000000000..3af426821 --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/phonemizer/lexicon.h @@ -0,0 +1,60 @@ +#pragma once + +#include "../tagger/tag.h" +#include "types.h" +#include +#include +#include + +namespace phonemis::phonemizer { + +// Lexicon class +// Provides phonemization of extracted tokens. +// Wrapps a dictionary lookup for given word with additional +// pre/post-processing. +class Lexicon { +public: + Lexicon(Lang language, const std::string &dict_filepath); + + // Checks if given world exists in the lexicon in any form + bool is_known(const std::string &word) const; + + // Returns the phonemization for given word, or "" if the phonemization failed + std::u32string get(const std::string &word, const tagger::Tag &tag, + std::optional base_stress = std::nullopt, + std::optional vowel_next = std::nullopt); + +private: + // Helper functions - extract phonemes without stressing + std::u32string get_word(const std::string &word, const tagger::Tag &tag, + std::optional stress, + std::optional vowel_next) const; + + // Helper functions - word+suffix phonemization + // Phonemizes word ending with popular english suffixes, example: -ed, -s, + // -ing. + std::u32string stem_s(const std::string &word, const tagger::Tag &tag, + std::optional stress) const; + std::u32string stem_ed(const std::string &word, const tagger::Tag &tag, + std::optional stress) const; + std::u32string stem_ing(const std::string &word, const tagger::Tag &tag, + std::optional stress) const; + + // Helper functions - dictionary lookup with stressing + // Returns an empty phoneme string if failed to extract phonemes. 
+ std::u32string lookup(const std::string &word, const tagger::Tag &tag, + std::optional stress) const; + std::u32string lookup_nnp(const std::string &word) const; + std::u32string lookup_special(const std::string &word, const tagger::Tag &tag, + std::optional stress, + std::optional vowel_next) const; + + // Resolved language + Lang language_; + + // Lookup dictionary: text -> phonemes + // Provide quick and direct phonemization for popular words. + std::unordered_map dict_ = {}; +}; + +} // namespace phonemis::phonemizer \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/phonemizer.h b/packages/react-native-executorch/third-party/include/phonemis/phonemizer/phonemizer.h new file mode 100644 index 000000000..0869a3bfb --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/phonemizer/phonemizer.h @@ -0,0 +1,28 @@ +#pragma once + +#include "lexicon.h" +#include + +namespace phonemis::phonemizer { + +// Phonemizer class +// Combines lexicon lookup-style phonemization with rule-based fallback +class Phonemizer { +public: + Phonemizer(Lang language, const std::string &lexicon_filepath = ""); + + // Main phonemization method + std::u32string phonemize(const std::string &word, const tagger::Tag &tag, + std::optional base_stress = std::nullopt, + std::optional vowel_next = std::nullopt) const; + +private: + // Helper functions - rule-based fallback methods + std::u32string fallback(const std::string &word, + const tagger::Tag &tag) const; + + // Lexicon component + std::unique_ptr lexicon_ = nullptr; +}; + +} // namespace phonemis::phonemizer \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/stress.h b/packages/react-native-executorch/third-party/include/phonemis/phonemizer/stress.h new file mode 100644 index 000000000..bb7a6bb81 --- /dev/null +++ 
b/packages/react-native-executorch/third-party/include/phonemis/phonemizer/stress.h @@ -0,0 +1,14 @@ +#pragma once + +#include "constants.h" + +namespace phonemis::phonemizer { + +// Applies given amount of stress to the phonemized string +std::u32string apply_stress(const std::u32string &phonemes, float stress); + +// Moves the stress mark so that the stress is placed directly before the +// nearest vowel +std::u32string restress(const std::u32string &phonemes); + +} // namespace phonemis::phonemizer \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/types.h b/packages/react-native-executorch/third-party/include/phonemis/phonemizer/types.h new file mode 100644 index 000000000..cf7dcc6fd --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/phonemizer/types.h @@ -0,0 +1,8 @@ +#pragma once + +namespace phonemis::phonemizer { + +// Available languages (english variants) +enum class Lang { EN_US, EN_GB }; + +} // namespace phonemis::phonemizer \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/pipeline.h b/packages/react-native-executorch/third-party/include/phonemis/pipeline.h new file mode 100644 index 000000000..e8fdf35e3 --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/pipeline.h @@ -0,0 +1,35 @@ +#pragma once + +#include "phonemizer/phonemizer.h" +#include "preprocessor/tools.h" +#include "tagger/tagger.h" +#include "tokenizer/tokenize.h" +#include + +namespace phonemis { + +using phonemizer::Lang; +using phonemizer::Phonemizer; +using tagger::Tagger; + +// #### Main phonemization pipeline +// Manages all the phonemization parts, from preprocessing, through +// tokenization and tagging to final Phonemizer call. +// Tagger and Lexicon .json data files are theoretically optional, but +// skipping these arguments will significantly impact the phonemization quality. 
+class Pipeline { +public: + Pipeline(Lang language, const std::string &tagger_data_filepath = "", + const std::string &lexicon_data_filepath = ""); + + std::u32string process(const std::string &text); + +private: + Lang language_; + + // Pipeline subcomponents + std::unique_ptr phonemizer_ = nullptr; + std::unique_ptr tagger_ = nullptr; +}; + +} // namespace phonemis \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/preprocessor/constants.h b/packages/react-native-executorch/third-party/include/phonemis/preprocessor/constants.h new file mode 100644 index 000000000..7fc25c0d0 --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/preprocessor/constants.h @@ -0,0 +1,48 @@ +#pragma once + +#include +#include +#include +#include + +namespace phonemis::preprocessor { + +// ------------------- +// num2words constants +// ------------------- +namespace num2words::constants { +// Cards map: basic number -> word +inline const std::unordered_map kCardinals = { + {0, "zero"}, {1, "one"}, {2, "two"}, {3, "three"}, + {4, "four"}, {5, "five"}, {6, "six"}, {7, "seven"}, + {8, "eight"}, {9, "nine"}, {10, "ten"}, {11, "eleven"}, + {12, "twelve"}, {13, "thirteen"}, {14, "fourteen"}, {15, "fifteen"}, + {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"}, + {20, "twenty"}, {30, "thirty"}, {40, "forty"}, {50, "fifty"}, + {60, "sixty"}, {70, "seventy"}, {80, "eighty"}, {90, "ninety"}}; + +// Ordinal exceptions: cardinal word -> ordinal word +inline const std::unordered_map kOrdinals = { + {"one", "first"}, {"two", "second"}, {"three", "third"}, + {"five", "fifth"}, {"eight", "eighth"}, {"nine", "ninth"}, + {"twelve", "twelfth"}}; + +// Large scale names: scale value -> name +inline const std::unordered_map kLargeCardinals = { + {100, "hundred"}, + {1000, "thousand"}, + {1000000, "million"}, + {1000000000LL, "billion"}, + {1000000000000LL, "trillion"}}; +} // namespace num2words::constants 
+ +// --------------- +// other constants +// --------------- +namespace constants { +// These are all characters that should end a correct english sentence +inline const std::unordered_set kEndOfSentenceCharacters = {'.', '?', '!', + ';'}; +} // namespace constants + +} // namespace phonemis::preprocessor \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/preprocessor/num2word.h b/packages/react-native-executorch/third-party/include/phonemis/preprocessor/num2word.h new file mode 100644 index 000000000..3807de29d --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/preprocessor/num2word.h @@ -0,0 +1,58 @@ +#pragma once + +#include "types.h" +#include +#include +#include +#include +#include +#include +#include + +namespace phonemis::preprocessor::num2words { + +// Specialized conversions +// In order to cover different numeric types, we use capable +// 64-bit long long and double types in conversions. +std::string to_cardinal_int(long long value); +std::string to_cardinal_float(double value); +std::string to_ordinal(long long value); +std::string to_year(long long value); + +// Generic conversion - from number +// Converts from numerical to spoken text representation. +// Example: 15 -> fifteen. +template + requires std::is_arithmetic_v +std::string convert(T value) { + constexpr bool is_float = std::is_floating_point_v; + + if constexpr (mode == ConversionMode::CARDINAL) + return is_float ? to_cardinal_float(static_cast(value)) + : to_cardinal_int(static_cast(value)); + if constexpr (mode == ConversionMode::ORDINAL) + return to_ordinal(static_cast(value)); + if constexpr (mode == ConversionMode::YEAR) + return to_year(static_cast(value)); + + // Fallback - conversion to non-spoken text representation + return std::to_string(value); +} + +// Generic conversion - from a text +// Similar to the one above, but takes non-spoken text representation +// of a number instead. 
+template +std::string convert(std::string numText) { + bool is_float = numText.find('.') != std::string::npos; + + if (is_float) { + double float_value = std::stod(numText); + return convert(float_value); + } else { + long long int_value = std::stoll(numText); + return convert(int_value); + } +} + +} // namespace phonemis::preprocessor::num2words \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/preprocessor/tools.h b/packages/react-native-executorch/third-party/include/phonemis/preprocessor/tools.h new file mode 100644 index 000000000..5b19e2685 --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/preprocessor/tools.h @@ -0,0 +1,17 @@ +#pragma once + +#include "constants.h" +#include + +namespace phonemis::preprocessor { + +// Divides a monolit text into multiple sentences. +// A sentence always ends with a end of sentence character (defined in +// constants.h). +std::vector split_sentences(const std::string &text); + +// Converts all the numbers in the text to spoken representations. +// Usually expands the size of the text. +std::string verbalize_numbers(const std::string &text); + +} // namespace phonemis::preprocessor \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/preprocessor/types.h b/packages/react-native-executorch/third-party/include/phonemis/preprocessor/types.h new file mode 100644 index 000000000..e9afac474 --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/preprocessor/types.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +namespace phonemis::preprocessor { + +// ------------------------- +// num2word type definitions +// ------------------------- +namespace num2words { +// Conversion type +// Either cardinal (example: five), ordinal (example: fifth) or special (year, +// etc.) 
+enum class ConversionMode { + CARDINAL = 1, + ORDINAL, + YEAR, + + UNDEFINED +}; +} // namespace num2words + +} // namespace phonemis::preprocessor \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/tagger/constants.h b/packages/react-native-executorch/third-party/include/phonemis/tagger/constants.h new file mode 100644 index 000000000..7048bdc41 --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/tagger/constants.h @@ -0,0 +1,13 @@ +#pragma once + +#include "tag.h" +#include + +namespace phonemis::tagger::constants { + +// Punctuation and special symbol tags +inline const std::unordered_set kPunctationTags = { + Tag("."), Tag(","), Tag("-LRB-"), Tag("-RRB-"), Tag("``"), Tag("\"\""), + Tag("''"), Tag(":"), Tag("$"), Tag("#"), Tag("NFP")}; + +} // namespace phonemis::tagger::constants \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/tagger/tag.h b/packages/react-native-executorch/third-party/include/phonemis/tagger/tag.h new file mode 100644 index 000000000..ba59af4e9 --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/tagger/tag.h @@ -0,0 +1,49 @@ +#pragma once + +#include "../utilities/string_utils.h" +#include + +namespace phonemis::tagger { + +using namespace utilities; + +// Tag class definition +// An abstraction layer which wrapps a simple string-based tag definition +// with some additional logic. 
+class Tag : public std::string { +public: + // Inherit constructors and assignment from std::string + using std::string::string; + using std::string::operator=; + Tag(std::string const &s) : std::string(s) {} + Tag(std::string &&s) : std::string(std::move(s)) {} + + // Extra logic + Tag parent_tag() const { + auto this_tag = static_cast(*this); + if (this_tag == "VERB" || string_utils::starts_with(this_tag, "VB")) + return {"VERB"}; + if (this_tag == "NOUN" || string_utils::starts_with(this_tag, "NN")) + return {"NOUN"}; + if (string_utils::starts_with(this_tag, "ADV") || + string_utils::starts_with(this_tag, "RB")) + return {"ADV"}; + if (string_utils::starts_with(this_tag, "ADJ") || + string_utils::starts_with(this_tag, "JJ")) + return {"ADJ"}; + return (*this); + } +}; + +} // namespace phonemis::tagger + +// Hash definition +// Required to use Tag objects as map keys. +namespace std { +template <> struct hash { + size_t operator()(phonemis::tagger::Tag const &t) const noexcept { + // Use std::string's hash implementation + return std::hash()(static_cast(t)); + } +}; +} // namespace std \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/tagger/tagger.h b/packages/react-native-executorch/third-party/include/phonemis/tagger/tagger.h new file mode 100644 index 000000000..c5ef085b7 --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/tagger/tagger.h @@ -0,0 +1,37 @@ +#pragma once + +#include "../tokenizer/tokens.h" +#include "tag.h" +#include +#include +#include +#include + +namespace phonemis::tagger { + +// Tagger class +// Provides PoS (Part of Speech) tagging functionality. +// Requires a previous tokenization of the text (tokenizer module). +// A modification of the Viterbi algorithm for bigram HMM (Hidden Markov Model) +// tagger. 
+class Tagger { +public: + explicit Tagger(const std::string &hmm_data_path); + + // Main tagging method - a modified Viterbi algorithm + // Works in place bo modyfing the 'tag' fields. + void tag(std::vector &sentence) const; + +private: + // Set of possible tags (states) + std::unordered_set tags_; + + // Probability maps - loaded from the input json file. + std::unordered_map start_probs_ = {}; + std::unordered_map> + emission_probs_ = {}; + std::unordered_map> transition_probs_ = + {}; +}; + +} // namespace phonemis::tagger \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/tokenizer/constants.h b/packages/react-native-executorch/third-party/include/phonemis/tokenizer/constants.h new file mode 100644 index 000000000..f29366ed5 --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/tokenizer/constants.h @@ -0,0 +1,43 @@ +#pragma once + +#include "types.h" +#include +#include +#include + +namespace phonemis::tokenizer::constants { + +// Special characters and their separation rules +inline constexpr std::array kSpecialCharacters = { + SpecialCharacter{'\'', + rules::Separation::JOIN_LEFT}, // Apostrophe joins left + SpecialCharacter{'-', + rules::Separation::TOTAL_DIVIDE}, // Hyphen always divides + SpecialCharacter{'.', + rules::Separation::TOTAL_DIVIDE}, // Dot always divides + SpecialCharacter{':', rules::Separation::TOTAL_JOIN} // Colon always joins +}; + +// A set of special words, which can contain special characters as +// an integral part. +// Note that all of the words are lower case. 
+inline const std::unordered_set kSpecialWords = { + // Contractions + "'bout", "'d", "'em", "'ll", "'m", "'re", "'s", "'ve", "can't", "cain't", + "goin'", "let's", "ma'am", "musn't", "n't", "nothin'", "o'clock", "o'er", + "out'n", "po'k", "pop'lar", "somethin'", "tain't", "we'uns", "what's", + "y'know", "yesterday's", "you'uns", "y'all", + + // Abbreviations and acronyms + "a.m.", "aug.", "b.c.", "bros.", "co.", "corp.", "dr.", "e.g.", "feb.", + "f.b.i.", "f.d.r.", "fla.", "gen.", "gov.", "inc.", "jan.", "jr.", "ltd.", + "mar.", "mass.", "md.", "mich.", "minn.", "miss.", "mo.", "mont.", "mr.", + "mrs.", "mt.", "n.a.", "n.c.", "n.j.", "n.y.", "nov.", "oct.", "okla.", + "ore.", "pa.", "prof.", "rev.", "sept.", "st.", "tenn.", "u.n.", "u.s.", + "u.s.a.", "u.s.s.r.", "va.", "wash.", "wis.", "p.m.", "vs.", + + // Hyphenated and special forms + "and/or", "aujourd'hui", "cap'n", "i.e.", "mid-19th", "mid-20th", + "mid-21st", "pre-1960", "rock'n'roll", "state's", "year-'round"}; + +} // namespace phonemis::tokenizer::constants \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokenize.h b/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokenize.h new file mode 100644 index 000000000..ab52e6946 --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokenize.h @@ -0,0 +1,14 @@ +#pragma once + +#include "tokens.h" +#include "types.h" +#include +#include + +namespace phonemis::tokenizer { + +// Tokenizes the input text into a vector of strings (tokens). +// Follows specific rules for special characters and special words. 
+std::vector tokenize(const std::string &text); + +} // namespace phonemis::tokenizer diff --git a/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokens.h b/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokens.h new file mode 100644 index 000000000..0f1c0d5f4 --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokens.h @@ -0,0 +1,22 @@ +#pragma once + +#include "../tagger/tag.h" +#include +#include + +namespace phonemis::tokenizer { + +// A main structure representing a single token extracted from text +// Mandatory fields are extracted during the tokenization stage, while +// extra fields might be processed later (for example, during the tagging stage) +struct Token { + std::string text; + std::string whitespace = ""; // Following whitespace + bool is_first = false; // Whether it is a first token in the sentence + + // Extras + std::optional tag = + std::nullopt; // A PoS (Part of Speech) tag, example: NN (noun) +}; + +} // namespace phonemis::tokenizer \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/tokenizer/types.h b/packages/react-native-executorch/third-party/include/phonemis/tokenizer/types.h new file mode 100644 index 000000000..45e84a873 --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/tokenizer/types.h @@ -0,0 +1,22 @@ +#pragma once + +#include + +namespace phonemis::tokenizer { + +namespace rules { +// Separation rules for special characters +enum class Separation { + JOIN_LEFT, // Join to the word on its left + JOIN_RIGHT, // Join to the word on its right + TOTAL_DIVIDE, // Always separate from both sides + TOTAL_JOIN // Always join both sides +}; +} // namespace rules + +struct SpecialCharacter { + char character; + rules::Separation sep_rule; +}; + +} // namespace phonemis::tokenizer \ No newline at end of file diff --git 
a/packages/react-native-executorch/third-party/include/phonemis/utilities/string_utils.h b/packages/react-native-executorch/third-party/include/phonemis/utilities/string_utils.h new file mode 100644 index 000000000..481212cbe --- /dev/null +++ b/packages/react-native-executorch/third-party/include/phonemis/utilities/string_utils.h @@ -0,0 +1,155 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace phonemis::utilities::string_utils { + +// ------------------------------------- +// String utils - byte format conversion +// ------------------------------------- + +// TODO: deprecated, replace with something else + +inline std::string char32_to_utf8(char32_t c) { + std::wstring_convert, char32_t> convert; + return convert.to_bytes(&c, &c + 1); +} + +inline std::u32string utf8_to_u32string(const std::string &utf8) { + std::wstring_convert, char32_t> convert; + return convert.from_bytes(utf8); +} + +inline std::string u32string_to_utf8(const std::u32string &u32) { + std::wstring_convert, char32_t> convert; + return convert.to_bytes(u32); +} + +// ---------------------------------------- +// String utils - capitalizing & lowerizing +// ---------------------------------------- + +// Capitalization (first letter only) +template inline void capitalize__(StringT &str) { + if (!str.empty()) + str[0] = std::toupper(str[0]); +} + +// Capitalization (an entire string) +template inline void to_upper__(StringT &str) { + std::transform(str.cbegin(), str.cend(), str.begin(), + [](auto c) { return std::toupper(c); }); +} + +// Lowerization (an entire string) +template inline void to_lower__(StringT &str) { + std::transform(str.cbegin(), str.cend(), str.begin(), + [](auto c) { return std::tolower(c); }); +} + +// ------------------------------------ +// String utils - other transformations +// ------------------------------------ + +// Filters a given string and omits all the characters which +// do not pass given predicate. 
+template +inline void filter__(StringT &str, Pred pred) { + str.erase(std::remove_if(str.begin(), str.end(), pred), str.end()); +} + +// Replaces all the occurances of a character `a` with a character `b`. +// If `b` is not specified, then it removes all occurances of `a` without +// replacement. +template +inline void replace__(StringT &str, CharT a, std::optional b) { + if (b.has_value()) + std::replace(str.begin(), str.end(), a, b.value()); + else + str.erase(std::remove(str.begin(), str.end(), a), str.end()); +} + +// Splits the string by the given character. +template +inline std::vector split(const StringT &str, CharT bpoint) { + std::vector result = {}; + + auto it = str.begin(); + while (it != str.end()) { + auto next = std::find(it, str.end(), bpoint); + result.emplace_back(it, next); + + it = next; + if (it != str.end()) + it++; + } + + return result; +} + +// Removes the leading and trailing characters equals to given character. +// If the character is not specified, it removes white spaces instead. +template +inline StringT strip(const StringT &str, + std::optional c = std::nullopt) { + auto lbound = std::find_if(str.cbegin(), str.cend(), [&c](CharT a) -> bool { + return c.has_value() ? a != c : !std::isspace(a); + }); + auto rbound = std::find_if(str.crbegin(), str.crend(), [&c](CharT a) -> bool { + return c.has_value() ? a != c : !std::isspace(a); + }); + + return lbound != str.end() ? StringT(lbound, std::prev(rbound.base())) + : StringT(); +} + +// ------------------------- +// String utils - predicates +// ------------------------- + +// Returns true if the string contains only alphabetic characters. 
+template inline bool is_alpha(const StringT &str) { + return std::all_of(str.cbegin(), str.cend(), + [](char c) -> bool { return std::isalpha(c); }); +} + +// Returns true if the string starts with given suffix and false otherwise +template +inline bool starts_with(const StringT &str, std::string_view prefix) { + return str.size() >= prefix.size() && str.substr(0, prefix.size()) == prefix; +} + +// Returns true if the string ends with given suffix and false otherwise +template +inline bool ends_with(const StringT &str, std::string_view suffix) { + return str.size() >= suffix.size() && + str.substr(str.size() - suffix.size()) == suffix; +} + +// -------------------------------------- +// String utils - (non)in-place resolving +// -------------------------------------- + +// Generates non-mutating wrapper `name(...)` that calls `name__(...)` +// Used to create a non-inplace versions of the above functions. +#define MAKE_NON_INPLACE(name) \ + template \ + inline StringT name(const StringT &str, Args &&...args) { \ + StringT tmp = str; \ + name##__(tmp, std::forward(args)...); \ + return tmp; \ + } + +MAKE_NON_INPLACE(capitalize) +MAKE_NON_INPLACE(to_lower) +MAKE_NON_INPLACE(to_upper) +MAKE_NON_INPLACE(filter) +MAKE_NON_INPLACE(replace) + +} // namespace phonemis::utilities::string_utils \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a b/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a new file mode 100644 index 000000000..41fd60d02 Binary files /dev/null and b/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a differ diff --git a/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a b/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a new file mode 100644 index 000000000..0a25f2b61 
Binary files /dev/null and b/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a differ diff --git a/yarn.lock b/yarn.lock index 37f9a45e5..1ca8d5d29 100644 --- a/yarn.lock +++ b/yarn.lock @@ -14075,9 +14075,9 @@ __metadata: languageName: node linkType: hard -"speech-to-text@workspace:apps/speech-to-text": +"speech@workspace:apps/speech": version: 0.0.0-use.local - resolution: "speech-to-text@workspace:apps/speech-to-text" + resolution: "speech@workspace:apps/speech" dependencies: "@babel/core": "npm:^7.25.2" "@react-native/metro-config": "npm:^0.76.3"