import os
import aiohttp
import asyncio
import base64
import logging
from typing import AsyncGenerator
from app.core.providers.base import TTSProvider
from aiohttp import ClientResponseError
from fastapi import HTTPException
# Module-level logger named after this module; it is only obtained here, not configured.
logger = logging.getLogger(__name__)
# Concrete TTSProvider implementation that generates speech through the Gemini TTS API.
class GeminiTTSProvider(TTSProvider):
    """TTS provider that requests audio from the Gemini ``generateContent`` endpoint.

    The provider is configured once with an API key, a prebuilt voice name and
    a model name; ``generate_speech`` then performs one HTTP POST per call and
    returns the base64-decoded audio payload from the response.
    """

    # Voice names accepted by ``__init__``; any other name is rejected.
    AVAILABLE_VOICES = [
        "Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda",
        "Orus", "Aoede", "Callirrhoe", "Autonoe", "Enceladus",
        "Iapetus", "Umbriel", "Algieba", "Despina", "Erinome",
        "Algenib", "Rasalgethi", "Laomedeia", "Achernar", "Alnilam",
        "Schedar", "Gacrux", "Pulcherrima", "Achird", "Zubenelgenubi",
        "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat"
    ]

    def __init__(self, api_key: str, voice_name: str = "Kore", model_name: str = "gemini-2.5-flash-preview-tts"):
        """Store the credentials and build the request URL.

        Args:
            api_key: Key sent in the ``x-goog-api-key`` request header.
            voice_name: One of ``AVAILABLE_VOICES``.
            model_name: Model identifier interpolated into the endpoint URL
                and also sent in the request body.

        Raises:
            ValueError: If ``voice_name`` is not in ``AVAILABLE_VOICES``.
        """
        if voice_name not in self.AVAILABLE_VOICES:
            raise ValueError(f"Invalid voice name: {voice_name}. Choose from {self.AVAILABLE_VOICES}")
        self.api_key = api_key
        # The endpoint URL embeds the configurable model name.
        self.api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{model_name}:generateContent"
        self.voice_name = voice_name
        self.model_name = model_name
        logger.debug(
            "Initialized GeminiTTSProvider with model: %s, voice: %s",
            self.model_name,
            self.voice_name,
        )

    @staticmethod
    def _extract_audio(response_json) -> bytes:
        """Pull the base64 audio payload out of a parsed response and decode it.

        Raises:
            HTTPException: 500 when the expected
                ``candidates[0].content.parts[0].inlineData.data`` path is
                missing, empty, of the wrong type, or not valid base64.
        """
        try:
            inline_data = response_json['candidates'][0]['content']['parts'][0]['inlineData']['data']
            logger.debug("Successfully extracted audio data from JSON response.")
            # binascii.Error raised by b64decode is a ValueError subclass.
            audio_bytes = base64.b64decode(inline_data)
        except (KeyError, IndexError, TypeError, ValueError) as e:
            logger.error(
                "Malformed API response (%s: %s). Full response: %s",
                type(e).__name__,
                e,
                response_json,
            )
            raise HTTPException(status_code=500, detail="Malformed API response from Gemini.") from e
        logger.debug("Decoded audio data, size: %d bytes.", len(audio_bytes))
        return audio_bytes

    async def generate_speech(self, text: str) -> bytes:
        """Synthesize ``text`` with the configured voice and return the audio bytes.

        Args:
            text: The text to convert to speech.

        Returns:
            The base64-decoded audio data taken from the first part of the
            first candidate in the API response.

        Raises:
            HTTPException: 429 when the API answers with HTTP 429; 500 for any
                other HTTP error status, a malformed response body, or any
                unexpected failure while performing the request.
        """
        logger.debug("Starting speech generation for text: '%s...'", text[:50])
        headers = {
            "x-goog-api-key": self.api_key,
            "Content-Type": "application/json"
        }
        json_data = {
            "contents": [{
                "parts": [{
                    "text": text
                }]
            }],
            "generationConfig": {
                "responseModalities": ["AUDIO"],
                "speechConfig": {
                    "voiceConfig": {
                        "prebuiltVoiceConfig": {
                            "voiceName": self.voice_name
                        }
                    }
                }
            },
            # The model is configurable via the instance attribute.
            "model": self.model_name
        }
        logger.debug("API Request URL: %s", self.api_url)
        # Never write the API key to the logs: log a redacted copy of the headers.
        loggable_headers = {**headers, "x-goog-api-key": "***REDACTED***"}
        logger.debug("Request Headers: %s", loggable_headers)
        logger.debug("Request Payload: %s", json_data)

        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(self.api_url, headers=headers, json=json_data) as response:
                    logger.debug("Received API response with status code: %s", response.status)
                    response.raise_for_status()
                    response_json = await response.json()
                    logger.debug("Successfully parsed API response JSON.")
        except ClientResponseError as e:
            if e.status == 429:
                logger.error("Rate limit exceeded on Gemini TTS API.")
                raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.") from e
            logger.error("Aiohttp client error occurred: %s", e)
            raise HTTPException(status_code=500, detail=f"API request failed: {e}") from e
        except Exception as e:
            # Boundary handler: log with traceback and convert to an HTTP error.
            logger.exception("An unexpected error occurred during speech generation: %s", e)
            raise HTTPException(status_code=500, detail=f"Failed to generate speech: {e}") from e

        # Parsing happens outside the request ``try`` so that the HTTPException
        # raised for a malformed body is not re-wrapped by the generic handler.
        return self._extract_audio(response_json)