diff --git a/.gitignore b/.gitignore index 8488653..7dd9d97 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,11 @@ -# Byte-compiled / optimized / DLL files +# Environment variables (contains API keys!) +.env + +# Python __pycache__/ *.py[cod] *$py.class - -# C extensions *.so - -# Distribution / packaging .Python build/ develop-eggs/ @@ -20,114 +19,30 @@ parts/ sdist/ var/ wheels/ -pip-wheel-metadata/ -share/python-wheels/ *.egg-info/ .installed.cfg *.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ +# Audio files +audio/ +*.wav +*.mp3 -# Translations -*.mo -*.pot - -# Django stuff: +# Logs and data +status.txt +conv.txt *.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ -# Pyre type checker -.pyre/ +# OS +.DS_Store +Thumbs.db -# Common -temp.py -tmp \ No newline at end of file +# User data +user_data.json \ No newline at end of file diff --git a/DEVELOPMENT_LOG.md b/DEVELOPMENT_LOG.md new file mode 100644 index 0000000..f8ad072 --- /dev/null +++ b/DEVELOPMENT_LOG.md @@ -0,0 +1,398 @@ +\# šŸ“š DEVELOPMENT LOG - IRIS Voice Assistant + + + +\## Project Overview + +IRIS is a voice-controlled AI assistant built by forking and extensively customizing the open-source JARVIS project. The assistant integrates multiple APIs to provide speech-to-text, natural language processing, text-to-speech, and system automation capabilities. + + + +\*\*Original Repository:\*\* \[JARVIS by AlexandreSajus](https://github.com/AlexandreSajus/JARVIS) + +\*\*Tech Stack:\*\* Python 3.11, Deepgram API, Groq (Llama 3.3), ElevenLabs API, Pygame, Taipy + +\*\*Development Period:\*\* October 25-27, 2025 + + + +--- + + + +\## šŸŽ“ Technical Skills Developed + + + +\### 1. Multi-API Integration + +Successfully integrated three distinct APIs with different authentication mechanisms: + +\- \*\*Deepgram:\*\* Real-time speech-to-text transcription + +\- \*\*Groq:\*\* Large language model inference (Llama 3.3 70B) + +\- \*\*ElevenLabs:\*\* Neural text-to-speech synthesis + + + +Implemented secure credential management using environment variables and the `python-dotenv` library. + + + +\### 2. Dependency Resolution \& Package Management + +\- Resolved complex dependency conflicts in a legacy codebase + +\- Managed version-specific package requirements (`deepgram-sdk==0.3.0`) + +\- Worked around native compilation requirements using pre-built wheels + +\- Handled cross-platform compatibility issues (Windows-specific solutions) + + + +\### 3. Legacy Code Modernization + +Adapted a 2-year-old codebase to current API standards: + +\- Migrated from deprecated API methods to modern implementations + +\- Updated authentication patterns across multiple services + +\- Refactored synchronous code patterns while maintaining async operations + +\- Implemented proper error handling for external service calls + + + +\### 4. System Integration \& Automation + +Developed local command routing system using Python's standard library: + +\- Process control with `subprocess` module + +\- Cross-platform compatibility via `platform` detection + +\- Browser automation using `webbrowser` module + +\- Natural language intent parsing for command extraction + + + +\### 5. AI Prompt Engineering + +Designed system prompts to control AI behavior: + +\- Personality customization through context engineering + +\- Response format constraints + +\- Tone and verbosity control + + + +--- + + + +\## šŸ› Critical Issues Resolved + + + +\### Issue #1: Python Version Incompatibility + +\*\*Error:\*\* `ModuleNotFoundError: No module named 'distutils.msvccompiler'` + +\*\*Cause:\*\* Python 3.14 removed the `distutils` module required by Pygame + +\*\*Resolution:\*\* Downgraded to Python 3.11.9 (project requirement: Python 3.8-3.11) + +\*\*Impact:\*\* Highlighted importance of checking compatibility matrices before setup + + + +--- + + + +\### Issue #2: Native Dependency Compilation Failure + +\*\*Error:\*\* `error: Microsoft Visual C++ 14.0 or greater is required` + +\*\*Cause:\*\* `webrtcvad` package requires C++ compilation, build tools not installed + +\*\*Resolution:\*\* Used `webrtcvad-wheels` (pre-compiled binary) and `rhasspy-silence --no-deps` + +\*\*Impact:\*\* Learned to identify when pre-built alternatives exist for complex dependencies + + + +--- + + + +\### Issue #3: API Version Mismatch + +\*\*Error:\*\* `ImportError: cannot import name 'Deepgram'` followed by `401 Unauthorized` + +\*\*Cause:\*\* Code written for Deepgram SDK v0.x, attempted installation of v2.12+ + +\*\*Resolution:\*\* + +\- Downgraded to `deepgram-sdk==0.3.0` + +\- Generated fresh API credentials from correct project scope + +\*\*Impact:\*\* Reinforced need to match documentation versions with installed packages + + + +--- + + + +\### Issue #4: API Rate Limiting + +\*\*Error:\*\* `openai.RateLimitError: 429 - insufficient\_quota` + +\*\*Cause:\*\* OpenAI free tier credits exhausted + +\*\*Resolution:\*\* Migrated to Groq API (free tier, Llama 3.3 70B model) + +\*\*Benefits:\*\* + +\- Zero cost + +\- Faster inference (0.3-0.5s vs 1-2s) + +\- API-compatible interface minimized code changes + + + +--- + + + +\### Issue #5: Breaking Changes in ElevenLabs SDK + +\*\*Error:\*\* `AttributeError: 'ElevenLabs' object has no attribute 'generate'` + +\*\*Cause:\*\* ElevenLabs SDK v2.0+ restructured API interface + +\*\*Resolution:\*\* Updated initialization and method calls: + +```python + +\# Before + +elevenlabs.set\_api\_key(key) + +audio = elevenlabs.generate(text=response, voice="Adam") + + + +\# After + +client = ElevenLabs(api\_key=key) + +audio = client.text\_to\_speech.convert( + +  text=response, + +  voice\_id="pNInz6obpgDQGcFmaJgB", + +  model\_id="eleven\_monolingual\_v1" + +) + +``` + + + +--- + + + +\### Issue #6: Silent Failure Loop + +\*\*Error:\*\* Application continued listening without providing audio responses + +\*\*Cause:\*\* ElevenLabs API call failing without raising exceptions + +\*\*Resolution:\*\* Implemented comprehensive error handling with try-except blocks and logging + +\*\*Impact:\*\* Emphasized importance of defensive programming for external service dependencies + + + +--- + + + +\## šŸ”§ Technical Decisions \& Rationale + + + +\### Groq vs OpenAI + +\*\*Decision:\*\* Use Groq API instead of OpenAI + +\*\*Reasoning:\*\* + +\- Cost: $0 vs pay-per-token + +\- Performance: Comparable quality with Llama 3.3 70B + +\- Speed: Faster inference times + +\- Compatibility: Drop-in replacement for OpenAI client + + + +\### Legacy SDK Version + +\*\*Decision:\*\* Maintain Deepgram SDK v0.3.0 instead of migrating to v2.0+ + +\*\*Reasoning:\*\* + +\- Significant refactoring required for migration + +\- Current version fully functional + +\- Focus resources on feature development rather than migration + + + +\### Command Routing Architecture + +\*\*Decision:\*\* Implement local command processing before LLM inference + +\*\*Reasoning:\*\* + +\- Performance: Instant responses for deterministic queries + +\- Cost efficiency: Reduced API call volume + +\- Reliability: No dependency on external services for simple commands + +\- User experience: Predictable behavior for common tasks + + + +--- + + + +\## šŸ“Š Project Metrics + +| Metric | Value | + +|--------|-------| + +| Development Time | ~10 hours | + +| Issues Resolved | 15+ | + +| APIs Integrated | 3 | + +| Total Code Lines | ~280 | + +| Dependencies Managed | 50+ | + +| Cost | $0 (free tier APIs) | + + + +--- + + + +\## šŸš€ Implemented Features + + + +\### Core Functionality + +\- āœ… Voice input processing (Deepgram) + +\- āœ… Natural language understanding (Groq/Llama 3.3) + +\- āœ… Voice synthesis (ElevenLabs) + +\- āœ… Continuous conversation loop + +\- āœ… Error handling and logging + + + +\### Custom Enhancements + +\- āœ… Assistant rebranding (JARVIS → IRIS) + +\- āœ… Personality customization (humble, approachable tone) + +\- āœ… Local command routing system + +\- āœ… System automation (10+ commands): + +  - Time/date queries + +  - Application launching + +  - Web search integration + +  - YouTube playback + +  - Random utilities (coin flip, dice roll) + + + +--- + + + +\## šŸ’” Key Takeaways + + + +1\. \*\*Version Management:\*\* Always verify compatibility requirements before installation + +2\. \*\*API Economics:\*\* Free alternatives often exist with comparable quality + +3\. \*\*Error Interpretation:\*\* Stack traces provide specific guidance for resolution + +4\. \*\*Documentation Hygiene:\*\* Match documentation version to installed packages + +5\. \*\*Optimization Strategy:\*\* Local processing reduces latency and costs + +6\. \*\*Open Source Ethics:\*\* Forking with substantial customization demonstrates learning + + + +--- + + + +\## šŸŽÆ Planned Enhancements + +\- \[ ] Persistent user memory system (JSON-based storage) + +\- \[ ] Voice-controlled code generation + +\- \[ ] Sentiment analysis for adaptive responses + +\- \[ ] Interrupt-driven conversation flow + +\- \[ ] Multi-language support + +\- \[ ] Enhanced web dashboard with real-time visualizations + + + +--- + + + +This development log documents the complete journey from initial setup challenges through to a fully functional, customized voice assistant with production-ready error handling and system integration capabilities. + diff --git a/README.md b/README.md index 385f545..f632a3f 100644 --- a/README.md +++ b/README.md @@ -1,97 +1,444 @@ -# JARVIS +# IRIS - Intelligent Responsive Interactive System -

- JARVIS helping me choose a firearm -

+**A multi-mode AI voice assistant designed for developers, students, and productivity enthusiasts** -Your own voice personal assistant: Voice to Text to LLM to Speech, displayed in a web interface. +![Python](https://img.shields.io/badge/python-3.11-blue.svg) +![License](https://img.shields.io/badge/license-MIT-green.svg) +![Status](https://img.shields.io/badge/status-active%20development-orange.svg) +![Modes](https://img.shields.io/badge/modes-3-brightgreen.svg) -## How it works +--- +## ABOUT +## šŸŽÆ What is IRIS? -1. :microphone: The user speaks into the microphone -2. :keyboard: Voice is converted to text using Deepgram -3. :robot: Text is sent to OpenAI's GPT-3 API to generate a response -4. :loudspeaker: Response is converted to speech using ElevenLabs -5. :loud_sound: Speech is played using Pygame -6. :computer: Conversation is displayed in a webpage using Taipy +IRIS is not just another voice assistant - it's a **specialized AI companion** with three distinct modes, each optimized for specific tasks: -## Video Demo +1. **šŸ‘Øā€šŸ’» Developer Mode** - Your AI pair programmer +2. **šŸ“… Personal Mode** - Your daily life manager +3. **šŸ“š Learning Mode** - Your study companion -

- - Youtube Devlog - -

+Instead of doing random tasks poorly, IRIS excels at what it's designed for: Switch modes based on what you're doing, and get context-aware, intelligent assistance. -## Requirements +--- -**Python 3.8 - 3.11** +## ✨ Why IRIS is Different? -Make sure you have the following API keys: -- Deepgram -- OpenAI -- Elevenlabs +| Feature | Generic Voice Assistants | IRIS | +|---------|-------------------------|------| +| Purpose | One-size-fits-all | Specialized modes | +| Code Generation | Basic snippets | Full functions with tests | +| Context | Forgets quickly | Persistent project memory | +| Learning | Generic answers | Study-optimized responses | +| Cost | Subscription required | 100% free APIs | +| Customization | Limited | Fully open source | -## How to install +--- -1. Clone the repository +## šŸŽ­ The Three Modes +### šŸ”§ **Mode 1: Developer Assistant** + +Your AI pair programmer. Code faster, debug smarter. + +**What it does:** +- šŸ’» **Voice-controlled code generation** - "Create a function to sort users by age" +- šŸ“ **Code explanation** - "Explain this code" (reads from clipboard) +- ⚔ **Code improvement** - "Optimize this function" +- šŸ” **Smart search** - "Search Stack Overflow for async errors" +- šŸ“š **Documentation lookup** - "Python docs for decorators" +- šŸ› ļø **Tool integration** - "Open VS Code with my project" + +**Perfect for:** +- Writing functions and classes by voice +- Understanding unfamiliar code +- Quick Stack Overflow/documentation access +- Hands-free coding while thinking out loud + +--- + +### šŸ“… **Mode 2: Personal Assistant** + +Your daily life manager. Never miss a thing. + +**What it does:** +- ā° **Reminders & timers** - "Remind me to call mom at 6 PM" +- šŸŒ¤ļø **Weather & news** - "What's the weather in Hyderabad?" +- šŸ—“ļø **Schedule management** - "What's on my schedule today?" +- 🧮 **Quick calculations** - "Calculate 15% tip on 850 rupees" +- šŸ“ **Time zones** - "What time is it in Tokyo?" +- 🧠 **Personal memory** - Learns your preferences and habits + +**Perfect for:** +- Managing your daily schedule +- Staying informed about weather/news +- Setting reminders hands-free +- Quick information lookup + +--- + +### šŸ“š **Mode 3: Learning Assistant** + +Your study companion. Learn faster, retain longer. + +**What it does:** +- šŸŽ“ **Concept explanations** - "Explain quantum computing" +- ā±ļø **Study timer (Pomodoro)** - "Start a 25-minute study session" +- šŸ“ **Voice notes** - "Take a note: Machine learning uses neural networks" +- šŸ—£ļø **Topic summaries** - "Summarize the French Revolution" +- šŸ“Š **Quiz generation** *(planned)* - Test your knowledge +- šŸ“„ **Document summarization** *(planned)* - Summarize PDFs/articles + +**Perfect for:** +- Studying complex topics +- Taking quick voice notes +- Structured study sessions +- Understanding difficult concepts + +--- + +## šŸš€ Quick Start + +### Prerequisites +- Python 3.11 (versions 3.8-3.11 supported) +- Microphone +- Internet connection +- Clipboard access (for code features) + +### Installation + +1. **Clone the repository** ```bash -git clone https://github.com/AlexandreSajus/JARVIS.git +git clone https://github.com/balagamrisha/IRIS.git +cd IRIS ``` -2. Install the requirements - +2. **Install dependencies** ```bash pip install -r requirements.txt ``` -3. Create a `.env` file in the root directory and add the following variables: +3. **Get FREE API Keys** -```bash -DEEPGRAM_API_KEY=XXX...XXX -OPENAI_API_KEY=sk-XXX...XXX -ELEVENLABS_API_KEY=XXX...XXX -``` +| Service | Free Tier | Purpose | Sign Up | +|---------|-----------|---------|---------| +| **Deepgram** | $200 credit | Speech-to-text | [console.deepgram.com](https://console.deepgram.com/) | +| **Groq** | Unlimited | AI brain (Llama 3.3) | [console.groq.com](https://console.groq.com/) | +| **ElevenLabs** | 10k chars/month | Text-to-speech | [elevenlabs.io](https://elevenlabs.io/) | -## How to use +4. **Configure environment** -1. Run `display.py` to start the web interface +Create a `.env` file in the project root: +```env +DEEPGRAM_API_KEY=your_deepgram_key_here +GROQ_API_KEY=your_groq_key_here +ELEVENLABS_API_KEY=your_elevenlabs_key_here +``` +5. **Run IRIS** ```bash +# Terminal 1: Web interface (optional) python display.py + +# Terminal 2: Voice assistant +python main.py ``` -2. In another terminal, run `jarvis.py` to start the voice assistant +--- -```bash -python main.py +## šŸ’¬ Usage Examples + +### šŸ‘Øā€šŸ’» Developer Mode Commands + +**Code Generation:** +``` +"Create a Python function to validate email addresses" +"Write a REST API endpoint for user login" +"Generate a React component for a button" ``` -- Once ready, both the web interface and the terminal will show `Listening...` -- You can now speak into the microphone -- Once you stop speaking, it will show `Stopped listening` -- It will then start processing your request -- Once the response is ready, it will show `Speaking...` -- The response will be played and displayed in the web interface. +**Code Analysis:** +``` +*Copy code to clipboard* +"Explain this code" +"What does this function do?" +"Improve this code" +"Add error handling to this" +``` -Here is an example: +**Development Workflow:** +``` +"Open VS Code" +"Search Stack Overflow for Python asyncio" +"Python documentation for file handling" +"Open my GitHub" +``` +### šŸ“… Personal Mode Commands +``` +"What time is it?" +"What's the weather in Hyderabad?" +"Remind me to submit assignment at 5 PM" +"Set a timer for 25 minutes" +"What's on my schedule today?" ``` -Listening... -Done listening -Finished transcribing in 1.21 seconds. -Finished generating response in 0.72 seconds. -Finished generating audio in 1.85 seconds. -Speaking... - --- USER: good morning jarvis - --- JARVIS: Good morning, Alex! How can I assist you today? +### šŸ“š Learning Mode Commands +``` +"Explain machine learning like I'm 5" +"Start a study session" (Pomodoro timer) +"Take a note: Neural networks have multiple layers" +"Tell me about the Renaissance" +``` -Listening... -... +### šŸŽ›ļø Mode Switching +``` +"Switch to developer mode" +"Switch to personal mode" +"Switch to study mode" +"What mode am I in?" ``` -

- Saying good morning -

\ No newline at end of file +--- + +## šŸ—ļø Project Structure +``` +IRIS/ +ā”œā”€ā”€ main.py # Core assistant logic +ā”œā”€ā”€ display.py # Web interface (Taipy) +ā”œā”€ā”€ record.py # Audio recording module +ā”œā”€ā”€ requirements.txt # Python dependencies +ā”œā”€ā”€ .env # API keys (not committed) +│ +ā”œā”€ā”€ generated_code/ # AI-generated code files +ā”œā”€ā”€ audio/ # Audio recordings +ā”œā”€ā”€ user_data.json # Personal memory +ā”œā”€ā”€ mode_config.json # Current mode settings +│ +ā”œā”€ā”€ README.md # This file +└── DEVELOPMENT_LOG.md # Technical notes +``` + +--- + +## šŸŽØ Customization + +### Change Assistant Personality +Edit `main.py` line 30: +```python +context = "You are IRIS, a [your description]..." +``` + +### Add Custom Commands +In the appropriate mode handler function: +```python +def handle_developer_commands(text: str): + if "your custom command" in text.lower(): + # Your code here + return True, "Your response" +``` + +### Change Voice +Modify ElevenLabs voice ID (line ~120): +```python +voice_id="pNInz6obpgDQGcFmaJgB" # Try different voice IDs +``` + +### Add New Mode +Create a new mode handler function and integrate into the mode system! + +--- + +## šŸ“Š Current Status + +**Version:** 2.0-dev +**Active Development:** Yes 🚧 + +### āœ… Completed Features + +**Core System:** +- āœ… Multi-API integration (Deepgram, Groq, ElevenLabs) +- āœ… Continuous voice interaction loop +- āœ… Error handling and logging +- āœ… User memory system (remembers name/preferences) +- āœ… Smart command routing + +**Developer Mode:** +- āœ… Voice-controlled code generation +- āœ… Clipboard integration (explain/improve code) +- āœ… Stack Overflow search integration +- āœ… Python documentation quick access +- āœ… File creation with smart naming +- āœ… Multi-language support (Python, JS, Java, C++) + +**Personal Mode:** +- āœ… Time/date queries +- āœ… Application control (Chrome, VS Code, Spotify) +- āœ… Web search integration +- āœ… YouTube playback +- āœ… Random utilities (coin flip, dice roll) + +**Learning Mode:** +- ā³ In development + +### šŸ”„ In Progress + +**Developer Mode:** +- šŸ”Ø Git voice commands +- šŸ”Ø Project context memory +- šŸ”Ø Code template library + +**Personal Mode:** +- šŸ”Ø Weather API integration +- šŸ”Ø Reminder system +- šŸ”Ø News briefing + +**Learning Mode:** +- šŸ”Ø Wikipedia integration +- šŸ”Ø Pomodoro timer +- šŸ”Ø Voice note taking + +### šŸ“‹ Planned Features + +- [ ] Mode switching system +- [ ] Enhanced web dashboard +- [ ] Emotion detection +- [ ] Real-time conversation (interrupt capability) +- [ ] Multi-language support +- [ ] Quiz generation (Learning Mode) +- [ ] Calendar integration (Personal Mode) +- [ ] Code review assistant (Developer Mode) + +--- + +## šŸ› ļø Tech Stack + +### Core Technologies +- **Python 3.11** - Primary language +- **Deepgram API** - Speech recognition ($200 free credit) +- **Groq API** - LLM inference with Llama 3.3 70B (free) +- **ElevenLabs API** - Neural voice synthesis (10k chars/month) +- **Pygame** - Audio playback +- **Taipy** - Web interface +- **Pyperclip** - Clipboard integration + +### Why These Technologies? + +| Technology | Why We Chose It | +|-----------|-----------------| +| **Groq over OpenAI** | Free, faster inference, comparable quality | +| **Deepgram** | Most accurate STT, generous free tier | +| **ElevenLabs** | Most natural-sounding voices | +| **Python 3.11** | Best library support, async capabilities | + +--- + +## šŸŽÆ Use Cases + +### For Developers +- Code while walking/exercising +- Quickly generate boilerplate code +- Understand unfamiliar codebases +- Access documentation hands-free +- Debug with voice explanations + +### For Students +- Take voice notes during lectures +- Study with Pomodoro technique +- Get concept explanations +- Quiz yourself on topics +- Manage study schedules + +### For Everyone +- Manage daily tasks +- Set reminders +- Get weather updates +- Quick calculations +- Hands-free productivity + +--- + +## šŸ¤ Contributing + +This is a personal learning project as I'm experimenting with tools and ideas, but contributions are welcome! + +**Areas for contribution:** +- New mode handlers +- Additional voice commands +- UI improvements +- Documentation +- Bug fixes + +--- + +## šŸ“ Development Philosophy + +**Why did I want to structure IRIS this way?** + +1. **Specialized > Generic** - Three focused modes are integrated +2. **Free > Paid** - Students shouldn't pay for learning tools +3. **Practical > Flashy** - Features that solve real problems +4. **Open > Closed** - Fully customizable and transparent +5. **Learning-First** - Built to teach AI integration concepts + +--- + +## šŸ“– Documentation + +- **[DEVELOPMENT_LOG.md](DEVELOPMENT_LOG.md)** - Technical decisions, issues solved, learning notes +- **[API_REFERENCE.md](API_REFERENCE.md)** - Function documentation *(coming soon)* +- **[MODE_GUIDE.md](MODE_GUIDE.md)** - Detailed guide for each mode *(coming soon)* + +--- + +## šŸ™ Acknowledgments + +**Built upon:** +- [JARVIS](https://github.com/AlexandreSajus/JARVIS) by Alexandre Sajus - Original foundation +- [Groq](https://groq.com/) - Lightning-fast LLM inference +- [Deepgram](https://deepgram.com/) - Industry-leading STT +- [ElevenLabs](https://elevenlabs.io/) - Realistic voice synthesis + +**Inspired by:** +- A thought to make useful AI assistant which isn't generic +- Making AI accessible to students + +--- + +## šŸ“„ License +The same one as the forked project. + +--- + +## šŸ“¬ Contact + +**Developer:** Balagam Risha Raj +**Project Link:**(https://github.com/balagamrisha/IRIS) + +--- + +## 🚦 Project Roadmap + +### Phase 1: Foundation āœ… (Completed) +- Core voice pipeline +- Basic command system +- User memory + +### Phase 2: Developer Mode šŸ”„ (Current) +- Code generation +- Clipboard integration +- Tool integration + +### Phase 3: Personal Mode ā³ (Next) +- Weather API +- Reminders +- Calendar + +### Phase 4: Learning Mode ā³ +- Study timer +- Note taking +- Wikipedia + +### Phase 5: Polish ā³ +- Mode switching +- Enhanced UI +- Performance optimization diff --git a/main.py b/main.py index b8a60dd..beca5cf 100644 --- a/main.py +++ b/main.py @@ -3,128 +3,286 @@ from os import PathLike from time import time import asyncio +import webbrowser +import subprocess +import platform +import json +from datetime import datetime from typing import Union from dotenv import load_dotenv -import openai +from groq import Groq from deepgram import Deepgram import pygame from pygame import mixer -import elevenlabs +from elevenlabs.client import ElevenLabs from record import speech_to_text # Load API keys load_dotenv() -OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +GROQ_API_KEY = os.getenv("GROQ_API_KEY") DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY") -elevenlabs.set_api_key(os.getenv("ELEVENLABS_API_KEY")) +ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY") # Initialize APIs -gpt_client = openai.Client(api_key=OPENAI_API_KEY) +gpt_client = Groq(api_key=GROQ_API_KEY) deepgram = Deepgram(DEEPGRAM_API_KEY) +elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY) + # mixer is a pygame module for playing audio mixer.init() -# Change the context if you want to change Jarvis' personality -context = "You are Jarvis, Alex's human assistant. You are witty and full of personality. Your answers should be limited to 1-2 short sentences." +# Change the context if you want to change Iris' personality +context = ( + "You are Iris, a humble and approachable AI assistant. " + "You are friendly, helpful, and speak naturally like a supportive friend. " + "Keep your answers brief and easy to understand, limited to 1-2 short sentences." +) conversation = {"Conversation": []} RECORDING_PATH = "audio/recording.wav" def request_gpt(prompt: str) -> str: - """ - Send a prompt to the GPT-3 API and return the response. - - Args: - - state: The current state of the app. - - prompt: The prompt to send to the API. - - Returns: - The response from the API. - """ + """Send a prompt to the Groq API and return the response.""" response = gpt_client.chat.completions.create( - messages=[ - { - "role": "user", - "content": f"{prompt}", - } - ], - model="gpt-3.5-turbo", + messages=[{"role": "user", "content": f"{prompt}"}], + model="llama-3.3-70b-versatile", ) return response.choices[0].message.content -async def transcribe( - file_name: Union[Union[str, bytes, PathLike[str], PathLike[bytes]], int] -): - """ - Transcribe audio using Deepgram API. - - Args: - - file_name: The name of the file to transcribe. - - Returns: - The response from the API. - """ +async def transcribe(file_name: Union[Union[str, bytes, PathLike[str], PathLike[bytes]], int]): + """Transcribe audio using Deepgram API.""" with open(file_name, "rb") as audio: source = {"buffer": audio, "mimetype": "audio/wav"} response = await deepgram.transcription.prerecorded(source) return response["results"]["channels"][0]["alternatives"][0]["words"] -def log(log: str): - """ - Print and write to status.txt - """ - print(log) +def log(log_text: str): + """Print and write to status.txt""" + print(log_text) with open("status.txt", "w") as f: - f.write(log) + f.write(log_text) + + +def handle_local_commands(text: str) -> tuple[bool, str]: + """Check if the user input is a local command and handle it.""" + text_lower = text.lower() + + # Time command + if "what time" in text_lower or "current time" in text_lower: + current_time = datetime.now().strftime("%I:%M %p") + return True, f"It's currently {current_time}" + + # Date command + if "what date" in text_lower or "today's date" in text_lower or "what day" in text_lower: + current_date = datetime.now().strftime("%B %d, %Y") + return True, f"Today is {current_date}" + + # Open Chrome + if "open chrome" in text_lower or "open google chrome" in text_lower: + try: + if platform.system() == "Windows": + try: + subprocess.Popen(["chrome"]) + except: + try: + subprocess.Popen(["C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"]) + except: + subprocess.Popen(["C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe"]) + elif platform.system() == "Darwin": # macOS + subprocess.Popen(["open", "-a", "Google Chrome"]) + else: # Linux + subprocess.Popen(["google-chrome"]) + return True, "Opening Chrome" + except Exception as e: + return True, f"Sorry, I couldn't open Chrome: {e}" + + # Open VSCode + if "open vscode" in text_lower or "open vs code" in text_lower or "open visual studio code" in text_lower: + try: + if platform.system() == "Windows": + try: + subprocess.Popen(["code"]) + except: + subprocess.Popen(["C:\\Users\\archa\\AppData\\Local\\Programs\\Microsoft VS Code\\Code.exe"]) + else: + subprocess.Popen(["code"]) + return True, "Opening VS Code" + except Exception as e: + return True, f"Sorry, I couldn't open VS Code: {e}" + + # Open Spotify + if "open spotify" in text_lower: + try: + if platform.system() == "Windows": + try: + subprocess.Popen(["spotify"]) + except: + subprocess.Popen(["C:\\Users\\archa\\AppData\\Roaming\\Spotify\\Spotify.exe"]) + elif platform.system() == "Darwin": # macOS + subprocess.Popen(["open", "-a", "Spotify"]) + else: + subprocess.Popen(["spotify"]) + return True, "Opening Spotify" + except Exception as e: + return True, f"Sorry, I couldn't open Spotify: {e}" + + # Search Google + if "search google for" in text_lower or "google search" in text_lower: + query = text_lower.split("for")[-1].strip() if "for" in text_lower else text_lower.replace("google search", "").strip() + url = f"https://www.google.com/search?q={query.replace(' ', '+')}" + webbrowser.open(url) + return True, f"Searching Google for {query}" + + # Play on YouTube + if "play" in text_lower and "youtube" in text_lower: + query = text_lower.replace("play", "").replace("on youtube", "").replace("youtube", "").strip() + url = f"https://www.youtube.com/results?search_query={query.replace(' ', '+')}" + webbrowser.open(url) + return True, f"Playing {query} on YouTube" + + # Open GitHub + if "open github" in text_lower or "open my github" in text_lower: + webbrowser.open("https://github.com") + return True, "Opening GitHub" + + # Flip coin + if "flip a coin" in text_lower or "flip coin" in text_lower: + import random + result = random.choice(["Heads", "Tails"]) + return True, f"It's {result}!" + + # Roll dice + if "roll dice" in text_lower or "roll a dice" in text_lower: + import random + result = random.randint(1, 6) + return True, f"You rolled a {result}" + + # No local command found + return False, "" +# === USER MEMORY FUNCTIONS === +def load_user_data(): + """Load user data from JSON file.""" + try: + with open("user_data.json", "r") as f: + return json.load(f) + except FileNotFoundError: + return {"name": None, "preferences": {}} + + +def save_user_data(data): + """Save user data to JSON file.""" + with open("user_data.json", "w") as f: + json.dump(data, f, indent=4) + + +def check_for_name_in_input(text: str, user_data: dict) -> tuple[bool, str, dict]: + """Check if user is introducing themselves.""" + text_lower = text.lower() + + if any(phrase in text_lower for phrase in ["my name is", "i am", "i'm", "call me"]): + if "my name is" in text_lower: + name = text_lower.split("my name is")[-1].strip() + elif "i am" in text_lower: + name = text_lower.split("i am")[-1].strip() + elif "i'm" in text_lower: + name = text_lower.split("i'm")[-1].strip() + elif "call me" in text_lower: + name = text_lower.split("call me")[-1].strip() + else: + name = "" + + name = name.split()[0].capitalize() if name else "" + if name: + user_data["name"] = name + save_user_data(user_data) + return True, f"Nice to meet you, {name}! I'll remember that. How can I help you today?", user_data + + return False, "", user_data + + +# === MAIN LOOP === if __name__ == "__main__": + # Load user data at startup + user_data = load_user_data() + + # Greet user by name if known + if user_data.get("name"): + print(f"\nšŸ‘‹ Welcome back, {user_data['name']}!\n") + else: + print("\nšŸ‘‹ Hello! I'm IRIS. What's your name?\n") + while True: - # Record audio log("Listening...") speech_to_text() log("Done listening") - # Transcribe audio + # Transcribe current_time = time() loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) words = loop.run_until_complete(transcribe(RECORDING_PATH)) - string_words = " ".join( - word_dict.get("word") for word_dict in words if "word" in word_dict - ) + string_words = " ".join(word_dict.get("word") for word_dict in words if "word" in word_dict) with open("conv.txt", "a") as f: f.write(f"{string_words}\n") + transcription_time = time() - current_time log(f"Finished transcribing in {transcription_time:.2f} seconds.") - # Get response from GPT-3 - current_time = time() - context += f"\nAlex: {string_words}\nJarvis: " - response = request_gpt(context) - context += response - gpt_time = time() - current_time - log(f"Finished generating response in {gpt_time:.2f} seconds.") + # Check for name introduction + is_name_intro, name_response, user_data = check_for_name_in_input(string_words, user_data) + + if is_name_intro: + response = name_response + log("Learned user's name") + else: + # Check local command + is_local_command, local_response = handle_local_commands(string_words) + if is_local_command: + response = local_response + log("Handled as local command") + else: + # Get AI response + current_time = time() + if user_data.get("name"): + context_with_name = f"You are talking to {user_data['name']}. {context}" + else: + context_with_name = context + context_with_name += f"\nUser: {string_words}\nIris: " + response = request_gpt(context_with_name) + gpt_time = time() - current_time + log(f"Finished generating response in {gpt_time:.2f} seconds.") # Convert response to audio current_time = time() - audio = elevenlabs.generate( - text=response, voice="Adam", model="eleven_monolingual_v1" - ) - elevenlabs.save(audio, "audio/response.wav") - audio_time = time() - current_time - log(f"Finished generating audio in {audio_time:.2f} seconds.") - - # Play response + try: + audio_generator = elevenlabs_client.text_to_speech.convert( + text=response, + voice_id="pNInz6obpgDQGcFmaJgB", + model_id="eleven_monolingual_v1", + ) + with open("audio/response.wav", "wb") as f: + for chunk in audio_generator: + f.write(chunk) + + audio_time = time() - current_time + log(f"Finished generating audio in {audio_time:.2f} seconds.") + except Exception as e: + log(f"Error generating audio: {e}") + continue + + # Play audio log("Speaking...") sound = mixer.Sound("audio/response.wav") - # Add response as a new line to conv.txt with open("conv.txt", "a") as f: f.write(f"{response}\n") sound.play() pygame.time.wait(int(sound.get_length() * 1000)) - print(f"\n --- USER: {string_words}\n --- JARVIS: {response}\n") + + user_display = user_data.get("name", "USER") + print(f"\n --- {user_display}: {string_words}\n --- IRIS: {response}\n")