diff --git a/.gitignore b/.gitignore
index 7782b54..032eab8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,5 @@
 **/*.egg-info
 .pytest_cache/
 **.bin
-**.db
\ No newline at end of file
+**.wav
+**.db
diff --git a/ai-hub/app/api/routes.py b/ai-hub/app/api/routes.py
index 220282f..ce8b2ac 100644
--- a/ai-hub/app/api/routes.py
+++ b/ai-hub/app/api/routes.py
@@ -1,9 +1,7 @@
-# Fixed routes.py
-from fastapi import APIRouter, HTTPException, Depends
+from fastapi import APIRouter, HTTPException, Depends, Query
 from fastapi.responses import Response, StreamingResponse
 from sqlalchemy.orm import Session
-from app.api.dependencies import ServiceContainer
-from app.api.dependencies import get_db
+from app.api.dependencies import ServiceContainer, get_db
 from app.api import schemas
 from typing import AsyncGenerator
@@ -102,50 +100,45 @@
         except Exception as e:
             raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
 
+    # --- Consolidated Speech Endpoint ---
+
     @router.post(
         "/speech",
         summary="Generate speech from text",
         tags=["TTS"],
-        response_description="Audio bytes in WAV format",
+        response_description="Audio bytes in WAV format, either as a complete file or a stream.",
    )
-    async def create_speech_response(request: schemas.SpeechRequest):
+    async def create_speech_response(
+        request: schemas.SpeechRequest,
+        stream: bool = Query(
+            False,
+            description="If true, returns a streamed audio response. Otherwise, returns a complete file."
+        )
+    ):
         """
-        Generates an audio file from the provided text using the TTS service
-        and returns it as a complete response.
+        Generates an audio file or a streaming audio response from the provided text.
+        By default, it returns a complete audio file.
+        To get a streaming response, set the 'stream' query parameter to 'true'.
         """
         try:
-            # Await the coroutine that returns the complete audio data
-            audio_bytes = await services.tts_service.create_speech_non_stream(
-                text=request.text
-            )
+            if stream:
+                # Use the streaming service method
+                audio_stream_generator: AsyncGenerator[bytes, None] = services.tts_service.create_speech_stream(
+                    text=request.text
+                )
+                return StreamingResponse(audio_stream_generator, media_type="audio/wav")
+            else:
+                # Use the non-streaming service method
+                audio_bytes = await services.tts_service.create_speech_non_stream(
+                    text=request.text
+                )
+                return Response(content=audio_bytes, media_type="audio/wav")
 
-            # Return a standard FastAPI Response with the complete audio bytes.
-            return Response(content=audio_bytes, media_type="audio/wav")
-
+        except HTTPException:
+            raise  # Re-raise existing HTTPException
         except Exception as e:
-            # Catch exceptions from the TTS service
             raise HTTPException(
                 status_code=500, detail=f"Failed to generate speech: {e}"
             )
-
-    # Add a streaming endpoint as a new feature
-    @router.post(
-        "/speech/stream",
-        summary="Generate speech from text with streaming",
-        tags=["TTS"],
-        response_description="Audio bytes in WAV format (streaming)",
-    )
-    async def create_speech_stream_response(request: schemas.SpeechRequest) -> StreamingResponse:
-        """
-        Generates an audio stream from the provided text and streams it back.
-        """
-        try:
-            # The service method returns an async generator
-            audio_stream_generator: AsyncGenerator[bytes, None] = services.tts_service.create_speech_stream(
-                text=request.text
-            )
-            return StreamingResponse(audio_stream_generator, media_type="audio/wav")
-        except Exception as e:
-            raise HTTPException(status_code=500, detail=f"Failed to stream speech: {e}")
 
     return router
\ No newline at end of file
diff --git a/ai-hub/app/core/services/tts.py b/ai-hub/app/core/services/tts.py
index b5887eb..3b1e2be 100644
--- a/ai-hub/app/core/services/tts.py
+++ b/ai-hub/app/core/services/tts.py
@@ -1,40 +1,139 @@
+import io
+import wave
+import logging
+import asyncio
 from typing import AsyncGenerator
 from app.core.providers.base import TTSProvider
+from fastapi import HTTPException
+import os
 
+# --- Configure logging ---
+logger = logging.getLogger(__name__)
+
+# --- Define TTS Service Class ---
 class TTSService:
     """
     Service class for generating speech from text using a TTS provider.
+    This version is designed to handle both streaming and non-streaming
+    audio generation, splitting text into manageable chunks.
     """
+
+    # Use an environment variable or a default value for the max chunk size
+    MAX_CHUNK_SIZE = int(os.getenv("TTS_MAX_CHUNK_SIZE", 2000))
+
     def __init__(self, tts_provider: TTSProvider):
         """
         Initializes the TTSService with a concrete TTS provider.
         """
         self.tts_provider = tts_provider
 
+    async def _split_text_into_chunks(self, text: str) -> list[str]:
+        """
+        Splits the input text into chunks based on a maximum size and
+        period delimiters, ensuring no chunk exceeds the limit.
+        """
+        chunks = []
+        current_chunk = ""
+        # Use a list of punctuation to split sentences more effectively
+        separators = ['.', '?', '!', '\n']
+        sentences = []
+
+        # Split text by multiple delimiters
+        for separator in separators:
+            text = text.replace(separator, f"{separator}|")
+        sentences_with_empty = [s.strip() for s in text.split('|') if s.strip()]
+
+        # Re-join sentences with their delimiters, so we don't lose them
+        for sentence in sentences_with_empty:
+            sentences.append(sentence)
+
+        for sentence in sentences:
+            # Add the sentence and check if it exceeds the chunk size.
+            if len(current_chunk) + len(sentence) + 1 > self.MAX_CHUNK_SIZE and current_chunk:
+                chunks.append(current_chunk.strip())
+                current_chunk = sentence + " "
+            else:
+                current_chunk += sentence + " "
+
+        if current_chunk:
+            chunks.append(current_chunk.strip())
+
+        logger.debug(f"Split text into {len(chunks)} chunks.")
+        return chunks
+
     async def create_speech_stream(self, text: str) -> AsyncGenerator[bytes, None]:
         """
-        Generates a stream of audio bytes from the given text using the configured
-        TTS provider.
-
-        Args:
-            text: The text to be converted to speech.
-
-        Returns:
-            An async generator that yields chunks of audio bytes.
+        Generates a stream of complete, playable WAV files for each text chunk.
+        This provides a streaming-like experience even with a non-streaming backend
+        by sending each chunk as soon as it is generated.
         """
-        return self.tts_provider.generate_speech(text)
+        chunks = await self._split_text_into_chunks(text)
+
+        for i, chunk in enumerate(chunks):
+            logger.info(f"Processing chunk {i+1}/{len(chunks)} for streaming...")
+
+            try:
+                # Get the raw PCM audio data for this chunk
+                pcm_data = await self.tts_provider.generate_speech(chunk)
+
+                # Wrap the PCM data in a WAV header to make it a playable file
+                with io.BytesIO() as wav_buffer:
+                    with wave.open(wav_buffer, 'wb') as wav_file:
+                        wav_file.setnchannels(1)
+                        wav_file.setsampwidth(2)
+                        wav_file.setframerate(24000)
+                        wav_file.writeframes(pcm_data)
+
+                    # Yield a complete, playable WAV file for the chunk
+                    yield wav_buffer.getvalue()
+
+            except Exception as e:
+                logger.error(f"Error processing chunk {i+1}: {e}")
+                raise HTTPException(
+                    status_code=500,
+                    detail=f"Error generating speech for chunk {i+1}: {e}"
+                ) from e
 
     async def create_speech_non_stream(self, text: str) -> bytes:
         """
-        Generates a complete audio file from the given text without streaming.
-
-        Args:
-            text: The text to be converted to speech.
-
-        Returns:
-            The complete audio file as bytes.
+        Generates a complete audio file from the given text, splitting it
+        into chunks and concatenating the audio into a single WAV file.
+        All chunks are processed concurrently for speed.
         """
-        # Awaiting the coroutine is necessary to get the result.
-        # The previous version was missing this 'await'.
-        audio_data = await self.tts_provider.generate_speech(text)
-        return audio_data
\ No newline at end of file
+        chunks = await self._split_text_into_chunks(text)
+
+        all_pcm_data = []
+
+        # Create a list of tasks for each chunk to run them concurrently.
+        tasks = [self.tts_provider.generate_speech(chunk) for chunk in chunks]
+
+        try:
+            # Gather the results from all tasks. This will run all API calls
+            # to the TTS provider concurrently.
+            all_pcm_data = await asyncio.gather(*tasks)
+            logger.info(f"Successfully gathered audio data for all {len(chunks)} chunks.")
+        except Exception as e:
+            logger.error(f"An error occurred while gathering audio chunks: {e}")
+            raise HTTPException(
+                status_code=500,
+                detail=f"An error occurred while generating audio: {e}"
+            ) from e
+
+        if not all_pcm_data:
+            logger.warning("No audio data was generated.")
+            raise HTTPException(status_code=500, detail="No audio data was generated from the TTS provider.")
+
+        # Concatenate all the raw PCM data into a single stream
+        concatenated_pcm = b''.join(all_pcm_data)
+        logger.info(f"Concatenated {len(chunks)} chunks into a single PCM stream.")
+
+        # Wrap the complete PCM stream in a single WAV container
+        with io.BytesIO() as wav_buffer:
+            with wave.open(wav_buffer, 'wb') as wav_file:
+                wav_file.setnchannels(1)
+                wav_file.setsampwidth(2)
+                # The Gemini API returns 24kHz audio, adjust if using a different provider
+                wav_file.setframerate(24000)
+                wav_file.writeframes(concatenated_pcm)
+
+            return wav_buffer.getvalue()
\ No newline at end of file
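
The service assumes the provider returns raw 16-bit mono PCM at 24 kHz and wraps it with the standard library wave module. As a standalone illustration (not part of the diff; it uses a synthetic tone instead of real provider output, and the helper name is made up), the following sketch reproduces that wrapping and verifies the resulting header:

# Standalone sketch: wrap raw PCM the same way TTSService does
# (16-bit samples, mono, 24 kHz) and sanity-check the WAV header.
import io
import math
import struct
import wave

def pcm_to_wav(pcm_data: bytes, sample_rate: int = 24000) -> bytes:
    """Wrap raw 16-bit mono PCM bytes in a WAV container."""
    with io.BytesIO() as buf:
        with wave.open(buf, "wb") as wav_file:
            wav_file.setnchannels(1)   # mono
            wav_file.setsampwidth(2)   # 16-bit samples
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(pcm_data)
        return buf.getvalue()

# Fake one second of a quiet 440 Hz tone as stand-in "provider output".
tone = b"".join(
    struct.pack("<h", int(32767 * 0.2 * math.sin(2 * math.pi * 440 * n / 24000)))
    for n in range(24000)
)
wav_bytes = pcm_to_wav(tone)

with wave.open(io.BytesIO(wav_bytes), "rb") as check:
    assert check.getframerate() == 24000
    assert check.getnframes() == 24000  # one second of audio at 24 kHz
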
diff --git a/ai-hub/test_speach.sh b/ai-hub/test_speach.sh
new file mode 100644
index 0000000..931540e
--- /dev/null
+++ b/ai-hub/test_speach.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+# --- 0. Load environment variables from .env file ---
+# Check if .env file exists
+if [ -f .env ]; then
+    echo "Loading environment variables from .env"
+    # Use 'source' to read and set the variables from the file.
+    # This pattern correctly handles variables with or without quotes and comments.
+    export $(grep -v '^#' .env | xargs)
+fi
+export LOG_LEVEL=DEBUG
+
+# Define the API endpoint
+API_URL="http://localhost:8000/documents"
+
+DEFAULT_MODEL="gemini"
+CURRENT_MODEL="" # The model used in the last turn
+
+# --- 1. Check for Dependencies ---
+if ! command -v jq &> /dev/null
+then
+    echo "❌ 'jq' is not installed. Please install it to run this script."
+    exit 1
+fi
+
+# --- 2. Start the FastAPI Server in the Background ---
+echo "--- Starting AI Hub Server ---"
+# Ensure the API key is passed to the uvicorn process,
+# either directly or by ensuring it's in the environment.
+# The 'export' command above ensures this.
+uvicorn app.main:app --host 127.0.0.1 --port 8000 &
+SERVER_PID=$!
+
+# Define a cleanup function to kill the server on exit
+cleanup() {
+    echo ""
+    echo "--- Shutting Down Server (PID: $SERVER_PID) ---"
+    kill $SERVER_PID
+}
+# Register the cleanup function to run when the script exits (e.g., Ctrl+C or typing 'exit')
+trap cleanup EXIT
+
+echo "Server started with PID: $SERVER_PID. Waiting for it to initialize..."
+sleep 5 # Wait for the server to be ready
+
+BASE_URL="http://localhost:8000"
+
+# --- 3. Test both non-streaming and streaming modes ---
+TEST_TEXT="The old clock in the corner of the study ticked with a slow, deliberate rhythm, a metronome for the quiet dust motes dancing in the sunbeams that slanted through the window. Its brass pendulum swung with a hypnotizing arc, a steady pulse in the otherwise silent room. Elara, with a frown of concentration, dipped her quill into the inkwell, the black liquid swirling like a miniature galaxy. She was trying to translate an ancient text, a forgotten language filled with serpentine characters and cryptic symbols. The work was painstaking, each character a puzzle box waiting to be opened. Outside, a gentle rain began to fall, pattering softly against the glass in a counterpoint to the clock’s beat.
+
+The manuscript she was working on was rumored to hold the key to a lost library, a repository of knowledge that had vanished from the world centuries ago. Elara, a historian with a thirst for the past, had dedicated her life to its recovery. She believed the library wasn't just a place of books, but a sanctuary of ideas, of stories untold and wisdom unwritten. A sudden gust of wind rattled the windowpane, and a drop of water streaked across the glass like a tear. She paused, her eyes drifting from the text to the stormy sky, a sense of foreboding settling over her. The story she was reading spoke of a great flood, a cataclysm that had swept away all traces of the library, leaving only this fragmented manuscript as a breadcrumb trail. It was a warning, she realized, not just a tale. The storm outside seemed to be echoing the prophecy within the text, a chilling reminder of the cyclical nature of history.
+
+As the light began to fade, replaced by the bruised purple of twilight, Elara lit a small oil lamp. The flame cast flickering shadows on the walls, making the bookshelves appear as if they were filled with sleeping giants. The air grew heavy with the scent of old paper and dust. She returned to her work, her mind racing with newfound purpose. The text wasn't just a map to the library, it was a warning of what could happen if its knowledge remained hidden. It was a plea to prevent history from repeating itself. The clock ticked on, a relentless march towards an uncertain future. Elara, however, was no longer just a translator. She was a guardian, a custodian of the past, tasked with saving the future. Her quill scratched across the paper, the sound a defiant whisper in the growing darkness, a promise to not let the words of the past be silenced by the storms of the present"
+
+# Create a temporary file to store the JSON payload
+TEMP_JSON_FILE=$(mktemp)
+cat > "$TEMP_JSON_FILE" << EOF
+{
+  "text": "$TEST_TEXT"
+}
+EOF
+
+# # --- Test non-streaming mode ---
+# echo ""
+# echo "--- Testing non-streaming mode for /speech endpoint ---"
+# NON_STREAM_OUTPUT_FILE="speech_nostream.wav"
+# curl -s -X POST "$BASE_URL/speech" \
+#      -H "Content-Type: application/json" \
+#      --data "@$TEMP_JSON_FILE" \
+#      --output "$NON_STREAM_OUTPUT_FILE"
+
+# if [ -f "$NON_STREAM_OUTPUT_FILE" ]; then
+#     echo "✅ Success! Non-streaming audio saved to $NON_STREAM_OUTPUT_FILE"
+# else
+#     echo "❌ Failed to get non-streaming audio"
+# fi
+
+# --- Test streaming mode ---
+echo ""
+echo "--- Testing streaming mode for /speech endpoint ---"
+STREAM_OUTPUT_FILE="data/speech_stream.wav"
+curl -s -X POST "$BASE_URL/speech?stream=true" \
+     -H "Content-Type: application/json" \
+     --data "@$TEMP_JSON_FILE" \
+     --output "$STREAM_OUTPUT_FILE"
+
+if [ -f "$STREAM_OUTPUT_FILE" ]; then
+    echo "✅ Success! Streaming audio saved to $STREAM_OUTPUT_FILE"
+else
+    echo "❌ Failed to get streaming audio"
+fi
+
+# Remove the temporary file
+rm "$TEMP_JSON_FILE"
+
+echo ""
+echo "--- Tests complete. ---"
\ No newline at end of file
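
One caveat worth noting: in streaming mode every yielded chunk is a self-contained WAV file, so the curl call above concatenates several WAV files back to back into data/speech_stream.wav, and many players will only play the first one. The rough post-processing sketch below (not part of the patch; the file names are assumptions, and scanning for RIFF magic bytes is a heuristic that could in principle match inside raw audio data) splits such a capture back into one playable file per chunk:

# Illustrative post-processing sketch: split a captured streaming response
# (several WAV files concatenated) into per-chunk files by scanning for the
# "RIFF" magic bytes that start each embedded WAV header.
from pathlib import Path

data = Path("data/speech_stream.wav").read_bytes()

# Collect the start offset of every embedded WAV file.
offsets = []
pos = data.find(b"RIFF")
while pos != -1:
    offsets.append(pos)
    pos = data.find(b"RIFF", pos + 4)

# Write each [start, next_start) slice out as its own file.
for i, start in enumerate(offsets):
    end = offsets[i + 1] if i + 1 < len(offsets) else len(data)
    Path(f"data/speech_chunk_{i:03d}.wav").write_bytes(data[start:end])

print(f"Split capture into {len(offsets)} chunk files.")
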