diff --git a/ai-hub/app/core/providers/tts/gcloud_tts.py b/ai-hub/app/core/providers/tts/gcloud_tts.py index 0ae78a6..5b24bbb 100644 --- a/ai-hub/app/core/providers/tts/gcloud_tts.py +++ b/ai-hub/app/core/providers/tts/gcloud_tts.py @@ -11,11 +11,12 @@ # Configure logging logger = logging.getLogger(__name__) -# New concrete class for the Google Cloud Text-to-Speech API +import re + class GCloudTTSProvider(TTSProvider): - # This provider uses Google's dedicated TTS API. The voices are different from Gemini. - # Updated with the full list of available Chirp3 HD voices you provided. - AVAILABLE_VOICES = [ + # English voices + # English voices + AVAILABLE_VOICES_EN = [ "en-US-Chirp3-HD-Achernar", "en-US-Chirp3-HD-Achird", "en-US-Chirp3-HD-Algenib", "en-US-Chirp3-HD-Algieba", "en-US-Chirp3-HD-Alnilam", "en-US-Chirp3-HD-Aoede", "en-US-Chirp3-HD-Autonoe", "en-US-Chirp3-HD-Callirrhoe", "en-US-Chirp3-HD-Charon", @@ -27,36 +28,72 @@ "en-US-Chirp3-HD-Schedar", "en-US-Chirp3-HD-Sulafat", "en-US-Chirp3-HD-Umbriel", "en-US-Chirp3-HD-Vindemiatrix", "en-US-Chirp3-HD-Zephyr", "en-US-Chirp3-HD-Zubenelgenubi" ] + DEFAULT_VOICE_EN = "en-US-Chirp3-HD-Kore" - def __init__(self, api_key: str, voice_name: str = "en-US-Chirp3-HD-Kore"): - if voice_name not in self.AVAILABLE_VOICES: - raise ValueError(f"Invalid voice name: {voice_name}. Choose from {self.AVAILABLE_VOICES}") - + # Chinese voices + AVAILABLE_VOICES_CMN = [ + "cmn-CN-Chirp3-HD-Achernar", "cmn-CN-Chirp3-HD-Achird", "cmn-CN-Chirp3-HD-Algenib", + "cmn-CN-Chirp3-HD-Algieba", "cmn-CN-Chirp3-HD-Alnilam", "cmn-CN-Chirp3-HD-Aoede", + "cmn-CN-Chirp3-HD-Autonoe", "cmn-CN-Chirp3-HD-Callirrhoe", "cmn-CN-Chirp3-HD-Charon", + "cmn-CN-Chirp3-HD-Despina", "cmn-CN-Chirp3-HD-Enceladus", "cmn-CN-Chirp3-HD-Erinome", + "cmn-CN-Chirp3-HD-Fenrir", "cmn-CN-Chirp3-HD-Gacrux", "cmn-CN-Chirp3-HD-Iapetus", + "cmn-CN-Chirp3-HD-Kore", "cmn-CN-Chirp3-HD-Laomedeia", "cmn-CN-Chirp3-HD-Leda", + "cmn-CN-Chirp3-HD-Orus", "cmn-CN-Chirp3-HD-Puck", "cmn-CN-Chirp3-HD-Pulcherrima", + "cmn-CN-Chirp3-HD-Rasalgethi", "cmn-CN-Chirp3-HD-Sadachbia", "cmn-CN-Chirp3-HD-Sadaltager", + "cmn-CN-Chirp3-HD-Schedar", "cmn-CN-Chirp3-HD-Sulafat", "cmn-CN-Chirp3-HD-Umbriel", + "cmn-CN-Chirp3-HD-Vindemiatrix", "cmn-CN-Chirp3-HD-Zephyr", "cmn-CN-Chirp3-HD-Zubenelgenubi" + ] + DEFAULT_VOICE_CMN = "cmn-CN-Chirp3-HD-Achernar" + + def __init__(self, api_key: str, voice_name: str = DEFAULT_VOICE_EN): + all_voices = self.AVAILABLE_VOICES_EN + self.AVAILABLE_VOICES_CMN + if voice_name not in all_voices: + raise ValueError(f"Invalid voice name: {voice_name}. Choose from {all_voices}") + self.api_key = api_key - # The new API URL for the Cloud Text-to-Speech service self.api_url = f"https://texttospeech.googleapis.com/v1/text:synthesize?key={self.api_key}" self.voice_name = voice_name logger.debug(f"Initialized GCloudTTSProvider with voice: {self.voice_name}") + def _detect_language(self, text: str) -> str: + # Simple heuristic: count Chinese characters vs. total chars + chinese_chars = re.findall(r'[\u4e00-\u9fff]', text) + if len(chinese_chars) / max(len(text), 1) > 0.5: + return "cmn-CN" + return "en-US" + async def generate_speech(self, text: str) -> bytes: - logger.debug(f"Starting speech generation for text: '{text[:50]}...'") - + language = self._detect_language(text) + logger.debug(f"Detected language '{language}' for text: '{text[:50]}...'") + + if language == "cmn-CN": + valid_voices = self.AVAILABLE_VOICES_CMN + default_voice = self.DEFAULT_VOICE_CMN + else: + language = "en-US" + valid_voices = self.AVAILABLE_VOICES_EN + default_voice = self.DEFAULT_VOICE_EN + + if self.voice_name not in valid_voices: + logger.warning(f"Voice '{self.voice_name}' not compatible with language '{language}'. Using default voice '{default_voice}'.") + voice_to_use = default_voice + else: + voice_to_use = self.voice_name + headers = { "Content-Type": "application/json" } json_data = { - "input": { - "text": text - }, + "input": {"text": text}, "voice": { - "languageCode": "en-US", - "name": self.voice_name + "languageCode": language, + "name": voice_to_use }, "audioConfig": { "audioEncoding": "LINEAR16" } } - + logger.debug(f"API Request URL: {self.api_url}") logger.debug(f"Request Payload: {json_data}") @@ -65,18 +102,17 @@ async with session.post(self.api_url, headers=headers, json=json_data) as response: logger.debug(f"Received API response with status code: {response.status}") response.raise_for_status() - + response_json = await response.json() logger.debug("Successfully parsed API response JSON.") - - # The audio data is now under the 'audioContent' key + audio_base64 = response_json.get('audioContent') if not audio_base64: raise KeyError("audioContent key not found in the response.") - + audio_bytes = base64.b64decode(audio_base64) logger.debug(f"Decoded audio data, size: {len(audio_bytes)} bytes.") - + return audio_bytes except ClientResponseError as e: if e.status == 429: @@ -90,4 +126,4 @@ raise HTTPException(status_code=500, detail="Malformed API response from Cloud TTS.") except Exception as e: logger.error(f"An unexpected error occurred during speech generation: {e}") - raise HTTPException(status_code=500, detail=f"Failed to generate speech: {e}") \ No newline at end of file + raise HTTPException(status_code=500, detail=f"Failed to generate speech: {e}") diff --git a/ai-hub/app/core/providers/tts/gcloud_tts.py b/ai-hub/app/core/providers/tts/gcloud_tts.py index 0ae78a6..5b24bbb 100644 --- a/ai-hub/app/core/providers/tts/gcloud_tts.py +++ b/ai-hub/app/core/providers/tts/gcloud_tts.py @@ -11,11 +11,12 @@ # Configure logging logger = logging.getLogger(__name__) -# New concrete class for the Google Cloud Text-to-Speech API +import re + class GCloudTTSProvider(TTSProvider): - # This provider uses Google's dedicated TTS API. The voices are different from Gemini. - # Updated with the full list of available Chirp3 HD voices you provided. - AVAILABLE_VOICES = [ + # English voices + # English voices + AVAILABLE_VOICES_EN = [ "en-US-Chirp3-HD-Achernar", "en-US-Chirp3-HD-Achird", "en-US-Chirp3-HD-Algenib", "en-US-Chirp3-HD-Algieba", "en-US-Chirp3-HD-Alnilam", "en-US-Chirp3-HD-Aoede", "en-US-Chirp3-HD-Autonoe", "en-US-Chirp3-HD-Callirrhoe", "en-US-Chirp3-HD-Charon", @@ -27,36 +28,72 @@ "en-US-Chirp3-HD-Schedar", "en-US-Chirp3-HD-Sulafat", "en-US-Chirp3-HD-Umbriel", "en-US-Chirp3-HD-Vindemiatrix", "en-US-Chirp3-HD-Zephyr", "en-US-Chirp3-HD-Zubenelgenubi" ] + DEFAULT_VOICE_EN = "en-US-Chirp3-HD-Kore" - def __init__(self, api_key: str, voice_name: str = "en-US-Chirp3-HD-Kore"): - if voice_name not in self.AVAILABLE_VOICES: - raise ValueError(f"Invalid voice name: {voice_name}. Choose from {self.AVAILABLE_VOICES}") - + # Chinese voices + AVAILABLE_VOICES_CMN = [ + "cmn-CN-Chirp3-HD-Achernar", "cmn-CN-Chirp3-HD-Achird", "cmn-CN-Chirp3-HD-Algenib", + "cmn-CN-Chirp3-HD-Algieba", "cmn-CN-Chirp3-HD-Alnilam", "cmn-CN-Chirp3-HD-Aoede", + "cmn-CN-Chirp3-HD-Autonoe", "cmn-CN-Chirp3-HD-Callirrhoe", "cmn-CN-Chirp3-HD-Charon", + "cmn-CN-Chirp3-HD-Despina", "cmn-CN-Chirp3-HD-Enceladus", "cmn-CN-Chirp3-HD-Erinome", + "cmn-CN-Chirp3-HD-Fenrir", "cmn-CN-Chirp3-HD-Gacrux", "cmn-CN-Chirp3-HD-Iapetus", + "cmn-CN-Chirp3-HD-Kore", "cmn-CN-Chirp3-HD-Laomedeia", "cmn-CN-Chirp3-HD-Leda", + "cmn-CN-Chirp3-HD-Orus", "cmn-CN-Chirp3-HD-Puck", "cmn-CN-Chirp3-HD-Pulcherrima", + "cmn-CN-Chirp3-HD-Rasalgethi", "cmn-CN-Chirp3-HD-Sadachbia", "cmn-CN-Chirp3-HD-Sadaltager", + "cmn-CN-Chirp3-HD-Schedar", "cmn-CN-Chirp3-HD-Sulafat", "cmn-CN-Chirp3-HD-Umbriel", + "cmn-CN-Chirp3-HD-Vindemiatrix", "cmn-CN-Chirp3-HD-Zephyr", "cmn-CN-Chirp3-HD-Zubenelgenubi" + ] + DEFAULT_VOICE_CMN = "cmn-CN-Chirp3-HD-Achernar" + + def __init__(self, api_key: str, voice_name: str = DEFAULT_VOICE_EN): + all_voices = self.AVAILABLE_VOICES_EN + self.AVAILABLE_VOICES_CMN + if voice_name not in all_voices: + raise ValueError(f"Invalid voice name: {voice_name}. Choose from {all_voices}") + self.api_key = api_key - # The new API URL for the Cloud Text-to-Speech service self.api_url = f"https://texttospeech.googleapis.com/v1/text:synthesize?key={self.api_key}" self.voice_name = voice_name logger.debug(f"Initialized GCloudTTSProvider with voice: {self.voice_name}") + def _detect_language(self, text: str) -> str: + # Simple heuristic: count Chinese characters vs. total chars + chinese_chars = re.findall(r'[\u4e00-\u9fff]', text) + if len(chinese_chars) / max(len(text), 1) > 0.5: + return "cmn-CN" + return "en-US" + async def generate_speech(self, text: str) -> bytes: - logger.debug(f"Starting speech generation for text: '{text[:50]}...'") - + language = self._detect_language(text) + logger.debug(f"Detected language '{language}' for text: '{text[:50]}...'") + + if language == "cmn-CN": + valid_voices = self.AVAILABLE_VOICES_CMN + default_voice = self.DEFAULT_VOICE_CMN + else: + language = "en-US" + valid_voices = self.AVAILABLE_VOICES_EN + default_voice = self.DEFAULT_VOICE_EN + + if self.voice_name not in valid_voices: + logger.warning(f"Voice '{self.voice_name}' not compatible with language '{language}'. Using default voice '{default_voice}'.") + voice_to_use = default_voice + else: + voice_to_use = self.voice_name + headers = { "Content-Type": "application/json" } json_data = { - "input": { - "text": text - }, + "input": {"text": text}, "voice": { - "languageCode": "en-US", - "name": self.voice_name + "languageCode": language, + "name": voice_to_use }, "audioConfig": { "audioEncoding": "LINEAR16" } } - + logger.debug(f"API Request URL: {self.api_url}") logger.debug(f"Request Payload: {json_data}") @@ -65,18 +102,17 @@ async with session.post(self.api_url, headers=headers, json=json_data) as response: logger.debug(f"Received API response with status code: {response.status}") response.raise_for_status() - + response_json = await response.json() logger.debug("Successfully parsed API response JSON.") - - # The audio data is now under the 'audioContent' key + audio_base64 = response_json.get('audioContent') if not audio_base64: raise KeyError("audioContent key not found in the response.") - + audio_bytes = base64.b64decode(audio_base64) logger.debug(f"Decoded audio data, size: {len(audio_bytes)} bytes.") - + return audio_bytes except ClientResponseError as e: if e.status == 429: @@ -90,4 +126,4 @@ raise HTTPException(status_code=500, detail="Malformed API response from Cloud TTS.") except Exception as e: logger.error(f"An unexpected error occurred during speech generation: {e}") - raise HTTPException(status_code=500, detail=f"Failed to generate speech: {e}") \ No newline at end of file + raise HTTPException(status_code=500, detail=f"Failed to generate speech: {e}") diff --git a/ui/.gitignore b/ui/.gitignore index c2658d7..d570088 100644 --- a/ui/.gitignore +++ b/ui/.gitignore @@ -1 +1,2 @@ node_modules/ + diff --git a/ai-hub/app/core/providers/tts/gcloud_tts.py b/ai-hub/app/core/providers/tts/gcloud_tts.py index 0ae78a6..5b24bbb 100644 --- a/ai-hub/app/core/providers/tts/gcloud_tts.py +++ b/ai-hub/app/core/providers/tts/gcloud_tts.py @@ -11,11 +11,12 @@ # Configure logging logger = logging.getLogger(__name__) -# New concrete class for the Google Cloud Text-to-Speech API +import re + class GCloudTTSProvider(TTSProvider): - # This provider uses Google's dedicated TTS API. The voices are different from Gemini. - # Updated with the full list of available Chirp3 HD voices you provided. - AVAILABLE_VOICES = [ + # English voices + # English voices + AVAILABLE_VOICES_EN = [ "en-US-Chirp3-HD-Achernar", "en-US-Chirp3-HD-Achird", "en-US-Chirp3-HD-Algenib", "en-US-Chirp3-HD-Algieba", "en-US-Chirp3-HD-Alnilam", "en-US-Chirp3-HD-Aoede", "en-US-Chirp3-HD-Autonoe", "en-US-Chirp3-HD-Callirrhoe", "en-US-Chirp3-HD-Charon", @@ -27,36 +28,72 @@ "en-US-Chirp3-HD-Schedar", "en-US-Chirp3-HD-Sulafat", "en-US-Chirp3-HD-Umbriel", "en-US-Chirp3-HD-Vindemiatrix", "en-US-Chirp3-HD-Zephyr", "en-US-Chirp3-HD-Zubenelgenubi" ] + DEFAULT_VOICE_EN = "en-US-Chirp3-HD-Kore" - def __init__(self, api_key: str, voice_name: str = "en-US-Chirp3-HD-Kore"): - if voice_name not in self.AVAILABLE_VOICES: - raise ValueError(f"Invalid voice name: {voice_name}. Choose from {self.AVAILABLE_VOICES}") - + # Chinese voices + AVAILABLE_VOICES_CMN = [ + "cmn-CN-Chirp3-HD-Achernar", "cmn-CN-Chirp3-HD-Achird", "cmn-CN-Chirp3-HD-Algenib", + "cmn-CN-Chirp3-HD-Algieba", "cmn-CN-Chirp3-HD-Alnilam", "cmn-CN-Chirp3-HD-Aoede", + "cmn-CN-Chirp3-HD-Autonoe", "cmn-CN-Chirp3-HD-Callirrhoe", "cmn-CN-Chirp3-HD-Charon", + "cmn-CN-Chirp3-HD-Despina", "cmn-CN-Chirp3-HD-Enceladus", "cmn-CN-Chirp3-HD-Erinome", + "cmn-CN-Chirp3-HD-Fenrir", "cmn-CN-Chirp3-HD-Gacrux", "cmn-CN-Chirp3-HD-Iapetus", + "cmn-CN-Chirp3-HD-Kore", "cmn-CN-Chirp3-HD-Laomedeia", "cmn-CN-Chirp3-HD-Leda", + "cmn-CN-Chirp3-HD-Orus", "cmn-CN-Chirp3-HD-Puck", "cmn-CN-Chirp3-HD-Pulcherrima", + "cmn-CN-Chirp3-HD-Rasalgethi", "cmn-CN-Chirp3-HD-Sadachbia", "cmn-CN-Chirp3-HD-Sadaltager", + "cmn-CN-Chirp3-HD-Schedar", "cmn-CN-Chirp3-HD-Sulafat", "cmn-CN-Chirp3-HD-Umbriel", + "cmn-CN-Chirp3-HD-Vindemiatrix", "cmn-CN-Chirp3-HD-Zephyr", "cmn-CN-Chirp3-HD-Zubenelgenubi" + ] + DEFAULT_VOICE_CMN = "cmn-CN-Chirp3-HD-Achernar" + + def __init__(self, api_key: str, voice_name: str = DEFAULT_VOICE_EN): + all_voices = self.AVAILABLE_VOICES_EN + self.AVAILABLE_VOICES_CMN + if voice_name not in all_voices: + raise ValueError(f"Invalid voice name: {voice_name}. Choose from {all_voices}") + self.api_key = api_key - # The new API URL for the Cloud Text-to-Speech service self.api_url = f"https://texttospeech.googleapis.com/v1/text:synthesize?key={self.api_key}" self.voice_name = voice_name logger.debug(f"Initialized GCloudTTSProvider with voice: {self.voice_name}") + def _detect_language(self, text: str) -> str: + # Simple heuristic: count Chinese characters vs. total chars + chinese_chars = re.findall(r'[\u4e00-\u9fff]', text) + if len(chinese_chars) / max(len(text), 1) > 0.5: + return "cmn-CN" + return "en-US" + async def generate_speech(self, text: str) -> bytes: - logger.debug(f"Starting speech generation for text: '{text[:50]}...'") - + language = self._detect_language(text) + logger.debug(f"Detected language '{language}' for text: '{text[:50]}...'") + + if language == "cmn-CN": + valid_voices = self.AVAILABLE_VOICES_CMN + default_voice = self.DEFAULT_VOICE_CMN + else: + language = "en-US" + valid_voices = self.AVAILABLE_VOICES_EN + default_voice = self.DEFAULT_VOICE_EN + + if self.voice_name not in valid_voices: + logger.warning(f"Voice '{self.voice_name}' not compatible with language '{language}'. Using default voice '{default_voice}'.") + voice_to_use = default_voice + else: + voice_to_use = self.voice_name + headers = { "Content-Type": "application/json" } json_data = { - "input": { - "text": text - }, + "input": {"text": text}, "voice": { - "languageCode": "en-US", - "name": self.voice_name + "languageCode": language, + "name": voice_to_use }, "audioConfig": { "audioEncoding": "LINEAR16" } } - + logger.debug(f"API Request URL: {self.api_url}") logger.debug(f"Request Payload: {json_data}") @@ -65,18 +102,17 @@ async with session.post(self.api_url, headers=headers, json=json_data) as response: logger.debug(f"Received API response with status code: {response.status}") response.raise_for_status() - + response_json = await response.json() logger.debug("Successfully parsed API response JSON.") - - # The audio data is now under the 'audioContent' key + audio_base64 = response_json.get('audioContent') if not audio_base64: raise KeyError("audioContent key not found in the response.") - + audio_bytes = base64.b64decode(audio_base64) logger.debug(f"Decoded audio data, size: {len(audio_bytes)} bytes.") - + return audio_bytes except ClientResponseError as e: if e.status == 429: @@ -90,4 +126,4 @@ raise HTTPException(status_code=500, detail="Malformed API response from Cloud TTS.") except Exception as e: logger.error(f"An unexpected error occurred during speech generation: {e}") - raise HTTPException(status_code=500, detail=f"Failed to generate speech: {e}") \ No newline at end of file + raise HTTPException(status_code=500, detail=f"Failed to generate speech: {e}") diff --git a/ui/.gitignore b/ui/.gitignore index c2658d7..d570088 100644 --- a/ui/.gitignore +++ b/ui/.gitignore @@ -1 +1,2 @@ node_modules/ + diff --git a/ui/run_web.sh b/ui/run_web.sh index 8824f9b..d4e2a17 100644 --- a/ui/run_web.sh +++ b/ui/run_web.sh @@ -1,59 +1,82 @@ #!/bin/bash +# Enable strict mode +set -euo pipefail + +# Default to HTTP +USE_HTTPS=false + + + +# Parse arguments +for arg in "$@"; do + if [[ "$arg" == "--https" ]]; then + USE_HTTPS=true + fi +done + +# Resolve script directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +AI_HUB_DIR="$(realpath "$SCRIPT_DIR/../ai-hub")" +TTS_CLIENT_DIR="$SCRIPT_DIR/tts-client-app" + AI_HUB_HOST="0.0.0.0" AI_HUB_PORT="8001" APP_MODULE="app.main:app" -AI_HUB_DIR="/app/project/cortex-hub/ai-hub" -TTS_CLIENT_DIR="/app/project/cortex-hub/ui/tts-client-app" - echo "--- Cleaning up existing processes ---" - # Kill existing uvicorn processes on the expected port -EXISTING_UVICORN_PID=$(lsof -ti tcp:${AI_HUB_PORT}) +EXISTING_UVICORN_PID=$(lsof -ti tcp:${AI_HUB_PORT} || true) if [ -n "$EXISTING_UVICORN_PID" ]; then echo "Killing existing process on port ${AI_HUB_PORT} (PID: $EXISTING_UVICORN_PID)" kill -9 "$EXISTING_UVICORN_PID" fi # Kill existing React frontend on port 8000 -EXISTING_REACT_PID=$(lsof -ti tcp:8000) +EXISTING_REACT_PID=$(lsof -ti tcp:8000 || true) if [ -n "$EXISTING_REACT_PID" ]; then echo "Killing existing frontend process on port 8000 (PID: $EXISTING_REACT_PID)" kill -9 "$EXISTING_REACT_PID" fi - pushd "$AI_HUB_DIR" > /dev/null pip install -e . -echo "--- Generating self-signed SSL certificates ---" +SSL_ARGS="" +FRONTEND_ENV="" -# Create a temporary directory for certs -SSL_TEMP_DIR=$(mktemp -d) -SSL_KEYFILE="${SSL_TEMP_DIR}/key.pem" -SSL_CERTFILE="${SSL_TEMP_DIR}/cert.pem" +if [ "$USE_HTTPS" = true ]; then + echo "--- Generating self-signed SSL certificates ---" -# Generate self-signed certificate -openssl req -x509 -nodes -days 1 -newkey rsa:2048 \ - -keyout "$SSL_KEYFILE" \ - -out "$SSL_CERTFILE" \ - -subj "/CN=localhost" + # Create a temporary directory for certs + SSL_TEMP_DIR=$(mktemp -d) + SSL_KEYFILE="${SSL_TEMP_DIR}/key.pem" + SSL_CERTFILE="${SSL_TEMP_DIR}/cert.pem" -# Cleanup function to remove certs on exit -cleanup() { - echo "--- Cleaning up SSL certificates ---" - rm -rf "$SSL_TEMP_DIR" -} -trap cleanup EXIT + # Generate self-signed certificate + openssl req -x509 -nodes -days 1 -newkey rsa:2048 \ + -keyout "$SSL_KEYFILE" \ + -out "$SSL_CERTFILE" \ + -subj "/CN=localhost" + + # Cleanup function to remove certs on exit + cleanup() { + echo "--- Cleaning up SSL certificates ---" + rm -rf "$SSL_TEMP_DIR" + } + trap cleanup EXIT + + SSL_ARGS="--ssl-keyfile $SSL_KEYFILE --ssl-certfile $SSL_CERTFILE" + FRONTEND_ENV="HTTPS=true" +fi echo "--- Starting AI Hub Server, React frontend, and backend proxy ---" -# Run AI Hub backend (HTTPS), React frontend (HTTPS), concurrently +# Run backend and frontend concurrently concurrently \ --prefix "[{name}]" \ --names "aihub,tts-frontend" \ - "uvicorn $APP_MODULE --host $AI_HUB_HOST --port $AI_HUB_PORT --ssl-keyfile $SSL_KEYFILE --ssl-certfile $SSL_CERTFILE" \ - "cd $TTS_CLIENT_DIR && HTTPS=true HOST=0.0.0.0 PORT=8000 npm start" + "LOG_LEVEL=DEBUG uvicorn $APP_MODULE --host $AI_HUB_HOST --log-level debug --port $AI_HUB_PORT $SSL_ARGS" \ + "cd $TTS_CLIENT_DIR && $FRONTEND_ENV HOST=0.0.0.0 PORT=8000 npm start" popd > /dev/null diff --git a/ai-hub/app/core/providers/tts/gcloud_tts.py b/ai-hub/app/core/providers/tts/gcloud_tts.py index 0ae78a6..5b24bbb 100644 --- a/ai-hub/app/core/providers/tts/gcloud_tts.py +++ b/ai-hub/app/core/providers/tts/gcloud_tts.py @@ -11,11 +11,12 @@ # Configure logging logger = logging.getLogger(__name__) -# New concrete class for the Google Cloud Text-to-Speech API +import re + class GCloudTTSProvider(TTSProvider): - # This provider uses Google's dedicated TTS API. The voices are different from Gemini. - # Updated with the full list of available Chirp3 HD voices you provided. - AVAILABLE_VOICES = [ + # English voices + # English voices + AVAILABLE_VOICES_EN = [ "en-US-Chirp3-HD-Achernar", "en-US-Chirp3-HD-Achird", "en-US-Chirp3-HD-Algenib", "en-US-Chirp3-HD-Algieba", "en-US-Chirp3-HD-Alnilam", "en-US-Chirp3-HD-Aoede", "en-US-Chirp3-HD-Autonoe", "en-US-Chirp3-HD-Callirrhoe", "en-US-Chirp3-HD-Charon", @@ -27,36 +28,72 @@ "en-US-Chirp3-HD-Schedar", "en-US-Chirp3-HD-Sulafat", "en-US-Chirp3-HD-Umbriel", "en-US-Chirp3-HD-Vindemiatrix", "en-US-Chirp3-HD-Zephyr", "en-US-Chirp3-HD-Zubenelgenubi" ] + DEFAULT_VOICE_EN = "en-US-Chirp3-HD-Kore" - def __init__(self, api_key: str, voice_name: str = "en-US-Chirp3-HD-Kore"): - if voice_name not in self.AVAILABLE_VOICES: - raise ValueError(f"Invalid voice name: {voice_name}. Choose from {self.AVAILABLE_VOICES}") - + # Chinese voices + AVAILABLE_VOICES_CMN = [ + "cmn-CN-Chirp3-HD-Achernar", "cmn-CN-Chirp3-HD-Achird", "cmn-CN-Chirp3-HD-Algenib", + "cmn-CN-Chirp3-HD-Algieba", "cmn-CN-Chirp3-HD-Alnilam", "cmn-CN-Chirp3-HD-Aoede", + "cmn-CN-Chirp3-HD-Autonoe", "cmn-CN-Chirp3-HD-Callirrhoe", "cmn-CN-Chirp3-HD-Charon", + "cmn-CN-Chirp3-HD-Despina", "cmn-CN-Chirp3-HD-Enceladus", "cmn-CN-Chirp3-HD-Erinome", + "cmn-CN-Chirp3-HD-Fenrir", "cmn-CN-Chirp3-HD-Gacrux", "cmn-CN-Chirp3-HD-Iapetus", + "cmn-CN-Chirp3-HD-Kore", "cmn-CN-Chirp3-HD-Laomedeia", "cmn-CN-Chirp3-HD-Leda", + "cmn-CN-Chirp3-HD-Orus", "cmn-CN-Chirp3-HD-Puck", "cmn-CN-Chirp3-HD-Pulcherrima", + "cmn-CN-Chirp3-HD-Rasalgethi", "cmn-CN-Chirp3-HD-Sadachbia", "cmn-CN-Chirp3-HD-Sadaltager", + "cmn-CN-Chirp3-HD-Schedar", "cmn-CN-Chirp3-HD-Sulafat", "cmn-CN-Chirp3-HD-Umbriel", + "cmn-CN-Chirp3-HD-Vindemiatrix", "cmn-CN-Chirp3-HD-Zephyr", "cmn-CN-Chirp3-HD-Zubenelgenubi" + ] + DEFAULT_VOICE_CMN = "cmn-CN-Chirp3-HD-Achernar" + + def __init__(self, api_key: str, voice_name: str = DEFAULT_VOICE_EN): + all_voices = self.AVAILABLE_VOICES_EN + self.AVAILABLE_VOICES_CMN + if voice_name not in all_voices: + raise ValueError(f"Invalid voice name: {voice_name}. Choose from {all_voices}") + self.api_key = api_key - # The new API URL for the Cloud Text-to-Speech service self.api_url = f"https://texttospeech.googleapis.com/v1/text:synthesize?key={self.api_key}" self.voice_name = voice_name logger.debug(f"Initialized GCloudTTSProvider with voice: {self.voice_name}") + def _detect_language(self, text: str) -> str: + # Simple heuristic: count Chinese characters vs. total chars + chinese_chars = re.findall(r'[\u4e00-\u9fff]', text) + if len(chinese_chars) / max(len(text), 1) > 0.5: + return "cmn-CN" + return "en-US" + async def generate_speech(self, text: str) -> bytes: - logger.debug(f"Starting speech generation for text: '{text[:50]}...'") - + language = self._detect_language(text) + logger.debug(f"Detected language '{language}' for text: '{text[:50]}...'") + + if language == "cmn-CN": + valid_voices = self.AVAILABLE_VOICES_CMN + default_voice = self.DEFAULT_VOICE_CMN + else: + language = "en-US" + valid_voices = self.AVAILABLE_VOICES_EN + default_voice = self.DEFAULT_VOICE_EN + + if self.voice_name not in valid_voices: + logger.warning(f"Voice '{self.voice_name}' not compatible with language '{language}'. Using default voice '{default_voice}'.") + voice_to_use = default_voice + else: + voice_to_use = self.voice_name + headers = { "Content-Type": "application/json" } json_data = { - "input": { - "text": text - }, + "input": {"text": text}, "voice": { - "languageCode": "en-US", - "name": self.voice_name + "languageCode": language, + "name": voice_to_use }, "audioConfig": { "audioEncoding": "LINEAR16" } } - + logger.debug(f"API Request URL: {self.api_url}") logger.debug(f"Request Payload: {json_data}") @@ -65,18 +102,17 @@ async with session.post(self.api_url, headers=headers, json=json_data) as response: logger.debug(f"Received API response with status code: {response.status}") response.raise_for_status() - + response_json = await response.json() logger.debug("Successfully parsed API response JSON.") - - # The audio data is now under the 'audioContent' key + audio_base64 = response_json.get('audioContent') if not audio_base64: raise KeyError("audioContent key not found in the response.") - + audio_bytes = base64.b64decode(audio_base64) logger.debug(f"Decoded audio data, size: {len(audio_bytes)} bytes.") - + return audio_bytes except ClientResponseError as e: if e.status == 429: @@ -90,4 +126,4 @@ raise HTTPException(status_code=500, detail="Malformed API response from Cloud TTS.") except Exception as e: logger.error(f"An unexpected error occurred during speech generation: {e}") - raise HTTPException(status_code=500, detail=f"Failed to generate speech: {e}") \ No newline at end of file + raise HTTPException(status_code=500, detail=f"Failed to generate speech: {e}") diff --git a/ui/.gitignore b/ui/.gitignore index c2658d7..d570088 100644 --- a/ui/.gitignore +++ b/ui/.gitignore @@ -1 +1,2 @@ node_modules/ + diff --git a/ui/run_web.sh b/ui/run_web.sh index 8824f9b..d4e2a17 100644 --- a/ui/run_web.sh +++ b/ui/run_web.sh @@ -1,59 +1,82 @@ #!/bin/bash +# Enable strict mode +set -euo pipefail + +# Default to HTTP +USE_HTTPS=false + + + +# Parse arguments +for arg in "$@"; do + if [[ "$arg" == "--https" ]]; then + USE_HTTPS=true + fi +done + +# Resolve script directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +AI_HUB_DIR="$(realpath "$SCRIPT_DIR/../ai-hub")" +TTS_CLIENT_DIR="$SCRIPT_DIR/tts-client-app" + AI_HUB_HOST="0.0.0.0" AI_HUB_PORT="8001" APP_MODULE="app.main:app" -AI_HUB_DIR="/app/project/cortex-hub/ai-hub" -TTS_CLIENT_DIR="/app/project/cortex-hub/ui/tts-client-app" - echo "--- Cleaning up existing processes ---" - # Kill existing uvicorn processes on the expected port -EXISTING_UVICORN_PID=$(lsof -ti tcp:${AI_HUB_PORT}) +EXISTING_UVICORN_PID=$(lsof -ti tcp:${AI_HUB_PORT} || true) if [ -n "$EXISTING_UVICORN_PID" ]; then echo "Killing existing process on port ${AI_HUB_PORT} (PID: $EXISTING_UVICORN_PID)" kill -9 "$EXISTING_UVICORN_PID" fi # Kill existing React frontend on port 8000 -EXISTING_REACT_PID=$(lsof -ti tcp:8000) +EXISTING_REACT_PID=$(lsof -ti tcp:8000 || true) if [ -n "$EXISTING_REACT_PID" ]; then echo "Killing existing frontend process on port 8000 (PID: $EXISTING_REACT_PID)" kill -9 "$EXISTING_REACT_PID" fi - pushd "$AI_HUB_DIR" > /dev/null pip install -e . -echo "--- Generating self-signed SSL certificates ---" +SSL_ARGS="" +FRONTEND_ENV="" -# Create a temporary directory for certs -SSL_TEMP_DIR=$(mktemp -d) -SSL_KEYFILE="${SSL_TEMP_DIR}/key.pem" -SSL_CERTFILE="${SSL_TEMP_DIR}/cert.pem" +if [ "$USE_HTTPS" = true ]; then + echo "--- Generating self-signed SSL certificates ---" -# Generate self-signed certificate -openssl req -x509 -nodes -days 1 -newkey rsa:2048 \ - -keyout "$SSL_KEYFILE" \ - -out "$SSL_CERTFILE" \ - -subj "/CN=localhost" + # Create a temporary directory for certs + SSL_TEMP_DIR=$(mktemp -d) + SSL_KEYFILE="${SSL_TEMP_DIR}/key.pem" + SSL_CERTFILE="${SSL_TEMP_DIR}/cert.pem" -# Cleanup function to remove certs on exit -cleanup() { - echo "--- Cleaning up SSL certificates ---" - rm -rf "$SSL_TEMP_DIR" -} -trap cleanup EXIT + # Generate self-signed certificate + openssl req -x509 -nodes -days 1 -newkey rsa:2048 \ + -keyout "$SSL_KEYFILE" \ + -out "$SSL_CERTFILE" \ + -subj "/CN=localhost" + + # Cleanup function to remove certs on exit + cleanup() { + echo "--- Cleaning up SSL certificates ---" + rm -rf "$SSL_TEMP_DIR" + } + trap cleanup EXIT + + SSL_ARGS="--ssl-keyfile $SSL_KEYFILE --ssl-certfile $SSL_CERTFILE" + FRONTEND_ENV="HTTPS=true" +fi echo "--- Starting AI Hub Server, React frontend, and backend proxy ---" -# Run AI Hub backend (HTTPS), React frontend (HTTPS), concurrently +# Run backend and frontend concurrently concurrently \ --prefix "[{name}]" \ --names "aihub,tts-frontend" \ - "uvicorn $APP_MODULE --host $AI_HUB_HOST --port $AI_HUB_PORT --ssl-keyfile $SSL_KEYFILE --ssl-certfile $SSL_CERTFILE" \ - "cd $TTS_CLIENT_DIR && HTTPS=true HOST=0.0.0.0 PORT=8000 npm start" + "LOG_LEVEL=DEBUG uvicorn $APP_MODULE --host $AI_HUB_HOST --log-level debug --port $AI_HUB_PORT $SSL_ARGS" \ + "cd $TTS_CLIENT_DIR && $FRONTEND_ENV HOST=0.0.0.0 PORT=8000 npm start" popd > /dev/null diff --git a/ui/tts-client-app/src/App.js b/ui/tts-client-app/src/App.js index b1f9514..6cfd356 100644 --- a/ui/tts-client-app/src/App.js +++ b/ui/tts-client-app/src/App.js @@ -22,7 +22,9 @@ // State for managing the chat session const [sessionId, setSessionId] = useState(null); - const [userId, setUserId] = useState(null); + // FIX: The userId was being assigned but not used in the rest of the component. + // It's already handled inside the useEffect, so we don't need a state variable for it. + // The previous state variable has been removed. // State to toggle between manual and automatic recording modes const [isAutoMode, setIsAutoMode] = useState(false); @@ -56,10 +58,10 @@ // --- Configuration --- // Please replace with your actual endpoints - const STT_ENDPOINT = "https://192.168.68.113:8001/stt/transcribe"; - const SESSIONS_CREATE_ENDPOINT = "https://192.168.68.113:8001/sessions"; - const SESSIONS_CHAT_ENDPOINT = (id) => `https://192.168.68.113:8001/sessions/${id}/chat`; - const TTS_ENDPOINT = "https://192.168.68.113:8001/speech"; + const STT_ENDPOINT = "http://localhost:8001/stt/transcribe"; + const SESSIONS_CREATE_ENDPOINT = "http://localhost:8001/sessions"; + const SESSIONS_CHAT_ENDPOINT = (id) => `http://localhost:8001/sessions/${id}/chat`; + const TTS_ENDPOINT = "http://localhost:8001/speech"; // Configuration for Voice Activity Detection const VAD_THRESHOLD = 0.01; // Adjust this value to control sensitivity (0 to 1) @@ -77,7 +79,9 @@ console.log("Attempting to create a new session."); try { const generatedUserId = crypto.randomUUID(); - setUserId(generatedUserId); + // FIX: The `setUserId(generatedUserId)` call has been removed + // because the state variable `userId` is no longer needed. + // The `generatedUserId` is still used in the API call. const response = await fetch(SESSIONS_CREATE_ENDPOINT, { method: "POST", @@ -645,10 +649,9 @@ } }; - // Determine the icon and status text based on the current state - const microphoneButtonState = isAutoMode - ? (isAutoListening || isRecording) - : isRecording; + // FIX: The microphoneButtonState variable was assigned but never used. + // The logic has been moved directly into the JSX below. + // The previous variable declaration has been removed. const micButtonColorClass = isRecording ? "bg-red-600 hover:bg-red-700 active:bg-red-800" @@ -656,123 +659,123 @@ return (
- Ask me anything and I'll respond! -
-+ Ask me anything and I'll respond! +
+{errorMessage}
+{errorMessage}
- -