Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,15 @@

GROQ_API_KEY=your_groq_api_key_here
DEEPGRAM_API_KEY=your_deepgram_api_key_here

# Optional: 60db cloud (TTS + STT). Defaults keep Deepgram.
# Flip switches independently:
# STT_PROVIDER=sixtydb -> 60db /ws/stt
# TTS_PROVIDER=sixtydb -> 60db /tts-stream -> ffplay
STT_PROVIDER=
TTS_PROVIDER=
SIXTYDB_API_KEY=
# SIXTYDB_API_BASE=https://api.60db.ai
# SIXTYDB_TTS_VOICE_ID=fbb75ed2-975a-40c7-9e06-38e30524a9a1
# SIXTYDB_TTS_TRANSPORT=stream # stream (default) | sync | ws
# SIXTYDB_STT_LANGUAGE=en
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,31 @@ Voice-Chat-Bot/
4. Add tests if applicable
5. Submit a pull request

## 60db Provider (alongside Deepgram)

This fork adds [60db](https://docs.60db.ai) as a peer of Deepgram for both TTS and STT. Each service has its own independent env switch, and the defaults preserve the Deepgram + Groq path.

| Concern | File | Switch | Endpoint |
|---|---|---|---|
| TTS | `sixtydb_tts.py` (`SixtyDbSpeechSynthesizer`) | `TTS_PROVIDER=sixtydb` + `SIXTYDB_TTS_TRANSPORT=stream\|sync\|ws` | `POST /tts-stream` (default, NDJSON mp3 chunks), `POST /tts-synthesize` (one-shot mp3), or `wss://api.60db.ai/ws/tts` (LINEAR16 PCM 24k, ffplay invoked with `-f s16le -ar 24000 -ac 1`) |
| STT | `sixtydb_stt.py` (`SixtyDbLiveTranscriber`) | `STT_PROVIDER=sixtydb` | `wss://api.60db.ai/ws/stt` browser mode (linear PCM 16k) |
| LLM | _no new file_ | manual edit | replace `ChatGroq(...)` in `Voice_Bot.py:113` with `ChatOpenAI(model="60db-tiny", base_url="https://api.60db.ai/v1", api_key=os.getenv("SIXTYDB_API_KEY"))` |

Both 60db classes match their Deepgram counterparts' interfaces exactly (`speak(text)` and `async listen() -> str`), so `VoiceAssistant.__init__` only branches on env — no body changes elsewhere.

```env
SIXTYDB_API_KEY=sk_live_...
TTS_PROVIDER=sixtydb
STT_PROVIDER=sixtydb
SIXTYDB_API_BASE=https://api.60db.ai
SIXTYDB_TTS_VOICE_ID=fbb75ed2-975a-40c7-9e06-38e30524a9a1
SIXTYDB_STT_LANGUAGE=en
```

Extra Python deps for the 60db path: `websockets`, `pyaudio` (already required by Deepgram's `Microphone`).

Reference: [docs.60db.ai](https://docs.60db.ai).

In case of any queries, please leave a message or contact me via the email provided in my profile.

<p align="center">
Expand Down
23 changes: 21 additions & 2 deletions Voice_Bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,28 @@ class VoiceAssistant:
TERMINATION_PHRASE = "goodbye"

def __init__(self, config: Config):
    """Build the STT, LLM and TTS stages, choosing providers from the env.

    Each provider switch defaults to the original Deepgram/Groq path so
    existing setups keep working with no .env edits:

        STT_PROVIDER=sixtydb → 60db /ws/stt
        TTS_PROVIDER=sixtydb → 60db /tts-stream → ffplay

    The LLM stays on Groq here; it can also be routed through 60db by
    editing LLMProcessor's ChatGroq to ChatOpenAI(base_url=...).

    Args:
        config: Shared configuration object handed to every stage.
    """
    # Both spellings select the 60db implementation.
    sixtydb_aliases = ("sixtydb", "60db")
    stt_provider = os.getenv("STT_PROVIDER", "deepgram").strip().lower()
    tts_provider = os.getenv("TTS_PROVIDER", "deepgram").strip().lower()

    # Construct exactly one transcriber: building the Deepgram one up front
    # and then overwriting it would do its setup work for nothing and could
    # fail even though 60db was selected.
    if stt_provider in sixtydb_aliases:
        # Imported lazily so the default path never loads the 60db module.
        from sixtydb_stt import SixtyDbLiveTranscriber
        self.transcriber = SixtyDbLiveTranscriber(config)
    else:
        self.transcriber = LiveTranscriber(config)

    self.llm_processor = LLMProcessor(config)

    # Same rule for the synthesizer: one instance, picked by the switch.
    if tts_provider in sixtydb_aliases:
        from sixtydb_tts import SixtyDbSpeechSynthesizer
        self.synthesizer = SixtyDbSpeechSynthesizer(config)
    else:
        self.synthesizer = SpeechSynthesizer(config)

async def run(self):
"""The main loop for the voice assistant."""
Expand Down
148 changes: 148 additions & 0 deletions sixtydb_stt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
"""60db STT — peer of Voice_Bot.LiveTranscriber.

Exposes `SixtyDbLiveTranscriber.listen()` async coroutine returning a
final transcript string, matching the LiveTranscriber interface in
Voice_Bot.py. Uses 60db /ws/stt browser mode (linear PCM 16k JSON
envelopes) and PyAudio for mic capture.

The session closes after the first canonical final (is_final +
speech_final), mirroring how LiveTranscriber resolves its Future per
turn so the main loop advances identically.

Reference: https://docs.60db.ai/websocket-api/stt
"""

from __future__ import annotations

import asyncio
import base64
import json
import os

from dotenv import load_dotenv

load_dotenv()

# Base URL used when SIXTYDB_API_BASE is unset or empty.
DEFAULT_API_BASE = "https://api.60db.ai"
# Microphone capture rate in Hz; the same value is sent to the server as
# `sample_rate` in the start message and in every audio envelope.
SAMPLE_RATE = 16000
# Size in bytes of one captured chunk. PyAudio is given CHUNK_BYTES // 2
# frames because each 16-bit mono frame occupies two bytes.
CHUNK_BYTES = int(SAMPLE_RATE * 2 * 0.06) # 60 ms of 16-bit mono


class SixtyDbLiveTranscriber:
    """Drop-in peer of Voice_Bot.LiveTranscriber backed by 60db /ws/stt.

    Each call to `listen()` opens the microphone and a fresh WebSocket
    session, streams base64-encoded 16-bit PCM chunks as JSON envelopes,
    and returns the first transcript flagged both `is_final` and
    `speech_final`.
    """

    def __init__(self, _config=None):
        """Read the 60db settings from the environment.

        Args:
            _config: Accepted so the constructor can be called the same way
                as LiveTranscriber(config); it is not used.

        Raises:
            ValueError: If SIXTYDB_API_KEY is unset or empty.
        """
        self.api_key = os.getenv("SIXTYDB_API_KEY")
        if not self.api_key:
            raise ValueError("SIXTYDB_API_KEY is not set in the environment.")
        self.api_base = (os.getenv("SIXTYDB_API_BASE") or DEFAULT_API_BASE).rstrip("/")
        # `or` rather than a getenv default: a blank `SIXTYDB_STT_LANGUAGE=`
        # line must fall back to "en" instead of sending an empty language,
        # the same way a blank SIXTYDB_API_BASE falls back above.
        self.language = os.getenv("SIXTYDB_STT_LANGUAGE") or "en"

    async def listen(self) -> str:
        """Capture microphone audio and return one final transcript.

        Returns:
            The stripped text of the first `transcription` message with both
            `is_final` and `speech_final` set, or "" if the session stops or
            the socket closes before one arrives.

        Raises:
            RuntimeError: If `websockets` or `pyaudio` is not installed.
        """
        try:
            import websockets
            import pyaudio
        except ImportError as e:
            raise RuntimeError(
                "60db STT requires 'websockets' and 'pyaudio': " + str(e)
            ) from e
        from urllib.parse import quote

        ws_base = self.api_base.replace("https://", "wss://").replace("http://", "ws://")
        # Percent-encode the key so characters such as '+', '&' or '=' cannot
        # corrupt the query string.
        api_key_param = quote(self.api_key, safe="")
        url = f"{ws_base}/ws/stt?apiKey={api_key_param}"

        final_text = ""
        transcription_complete = asyncio.Event()
        frames_per_chunk = CHUNK_BYTES // 2  # 16-bit mono: two bytes per frame

        pa = pyaudio.PyAudio()
        stream = None
        try:
            # Opened inside the try so a failing device open still reaches
            # pa.terminate() below.
            stream = pa.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=SAMPLE_RATE,
                input=True,
                frames_per_buffer=frames_per_chunk,
            )

            async with websockets.connect(url, max_size=None) as ws:
                # Skip past `connecting`, await `connection_established`.
                while True:
                    probe = json.loads(await ws.recv())
                    if "connection_established" in probe:
                        break

                await ws.send(json.dumps({
                    "type": "start",
                    "languages": [self.language],
                    "config": {
                        "encoding": "linear",
                        "sample_rate": SAMPLE_RATE,
                        "utterance_end_ms": 500,
                        "continuous_mode": False,
                        "interim_results_frequency": 300,
                        "audio_enhancement": "adaptive",
                    },
                }))

                async def _send_audio():
                    """Pump microphone chunks to the socket until told to stop."""
                    loop = asyncio.get_running_loop()
                    while not transcription_complete.is_set():
                        try:
                            # Blocking read runs in the default executor so
                            # the event loop stays responsive; the trailing
                            # False is passed to stream.read as its second
                            # positional argument.
                            chunk = await loop.run_in_executor(
                                None, stream.read, frames_per_chunk, False
                            )
                        except OSError:
                            break
                        if not chunk:
                            continue
                        try:
                            await ws.send(json.dumps({
                                "type": "audio",
                                "audio": base64.b64encode(chunk).decode(),
                                "encoding": "linear",
                                "sample_rate": SAMPLE_RATE,
                            }))
                        except Exception:
                            # Socket is closing or closed; stop sending.
                            break

                sender_task = None
                try:
                    async for raw in ws:
                        msg = json.loads(raw)
                        mtype = msg.get("type")
                        if mtype == "connected" and sender_task is None:
                            # Audio only starts flowing once the server
                            # acknowledges the start message.
                            sender_task = asyncio.create_task(_send_audio())
                            continue
                        if mtype == "session_stopped":
                            break
                        if mtype != "transcription":
                            continue
                        text = (msg.get("text") or "").strip()
                        if not text:
                            continue
                        if msg.get("is_final") and msg.get("speech_final"):
                            final_text = text
                            transcription_complete.set()
                            try:
                                await ws.send(json.dumps({"type": "stop"}))
                            except Exception:
                                pass
                            # Give the server up to two seconds to answer the
                            # stop request; the reply itself is not needed.
                            try:
                                await asyncio.wait_for(ws.recv(), timeout=2.0)
                            except Exception:
                                pass
                            break
                finally:
                    # Runs on every exit path, including exceptions from the
                    # receive loop, so the sender never outlives the socket.
                    transcription_complete.set()
                    if sender_task is not None:
                        sender_task.cancel()
                        # NOTE(review): cancellation does not interrupt a
                        # stream.read already running in the executor thread.
                        try:
                            await sender_task
                        except (asyncio.CancelledError, Exception):
                            pass
        finally:
            if stream is not None:
                try:
                    stream.stop_stream()
                    stream.close()
                except Exception:
                    pass
            pa.terminate()

        return final_text
Loading