diff --git a/ai-hub/app/api/routes/README.md b/ai-hub/app/api/routes/README.md index 5fb40c6..b330071 100644 --- a/ai-hub/app/api/routes/README.md +++ b/ai-hub/app/api/routes/README.md @@ -1,683 +1,243 @@ -# Invoking the Text-to-Speech (TTS) API Endpoint +# API Documentation -This guide explains how a frontend application can interact with the FastAPI `/speech` endpoint for text-to-speech conversion. The endpoint supports both **non-streaming** and **streaming** audio responses. +This repository provides frontend integration guides for the FastAPI backend's main endpoints: + +1. **Text-to-Speech (TTS) API** +2. **Speech-to-Text (STT) API** +3. **Chat Sessions API** +4. **Documents API** --- -## 1. Endpoint Details +## 1. Text-to-Speech (TTS) API -* **HTTP Method:** `POST` -* **Path:** `/speech` -* **Purpose:** Convert a given text string into audio. +This API converts text into audio, supporting both **non-streaming** and **streaming** modes. + +### 1.1 Endpoint Details + +| Method | Path | Purpose | +| ------ | --------- | ---------------------------------- | +| POST | `/speech` | Convert a given text string to audio | --- -## 2. Request Structure +### 1.2 Request Structure -### 2.1 Request Body +#### Request Body (JSON) -The POST request must include a JSON object matching the `SpeechRequest` schema. +| Field | Type | Description | Example | +| ------ | ------ | ---------------------------- | ---------------------------------------------- | +| text | string | Text to convert to speech | `"Hello, this is a test message."` | -| Field | Type | Description | Example | -| ----- | ------ | ------------------------------ | ---------------------------------- | -| text | string | Text to be converted to speech | `"Hello, this is a test message."` | - -**Example JSON body:** - +**Example:** ```json { "text": "The quick brown fox jumps over the lazy dog." } -``` +```` ---- +#### Query Parameters -### 2.2 Query Parameter - -| Parameter | Type | Default | Description | -| --------- | ------- | ------- | -------------------------------------------------------------------------------------- | -| stream | boolean | false | If `true`, returns a continuous audio stream. If `false`, returns the full audio file. | +| Parameter | Type | Default | Description | +| --------- | ------- | ------- | --------------------------------------------------------------------------- | +| stream | boolean | false | If true, returns continuous audio stream. | +| as\_wav | boolean | true | **Streaming only**: If true, returns WAV chunks; if false, returns raw PCM. | **Example URLs:** -* Non-streaming (Default): - - ``` - http://[your-api-server]/speech - ``` - -* Streaming: - - ``` - http://[your-api-server]/speech?stream=true - ``` +``` +Non-streaming: http://[your-api-server]/speech +Streaming WAV: http://[your-api-server]/speech?stream=true +Streaming PCM: http://[your-api-server]/speech?stream=true&as_wav=false +``` --- -## 3. Frontend Implementation (JavaScript) +### 1.3 Frontend Examples (JavaScript) -Below are two implementations using the `fetch` API. - ---- - -### Example 1: Non-Streaming Response - -Downloads the complete WAV file before playing. Suitable for short messages. 
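Before the JavaScript examples, the endpoint can also be exercised directly from a terminal. The sketch below uses `curl` against the placeholder server URL; the raw PCM variant is 16-bit mono audio at 24 kHz, matching the WAV parameters the backend applies elsewhere in this diff.

```bash
# Non-streaming: download the complete WAV file
curl -X POST "http://[your-api-server]/speech" \
  -H "Content-Type: application/json" \
  -d '{"text": "Hello, this is a test message."}' \
  --output speech.wav

# Streaming WAV chunks (-N disables curl's output buffering)
curl -N -X POST "http://[your-api-server]/speech?stream=true" \
  -H "Content-Type: application/json" \
  -d '{"text": "Hello, this is a test message."}' \
  --output speech_stream.wav

# Streaming raw PCM (16-bit mono, 24 kHz)
curl -N -X POST "http://[your-api-server]/speech?stream=true&as_wav=false" \
  -H "Content-Type: application/json" \
  -d '{"text": "Hello, this is a test message."}' \
  --output speech_stream.pcm
```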
+#### Example 1: Non-Streaming ```javascript -// Generate and play non-streaming audio async function getSpeechAudio(text) { - const url = 'http://[your-api-server]/speech'; // Replace with your API URL - - try { - const response = await fetch(url, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ text }) - }); - - if (!response.ok) { - throw new Error(`HTTP error! status: ${response.status}`); - } - - const audioBlob = await response.blob(); - const audioUrl = URL.createObjectURL(audioBlob); - - const audio = new Audio(audioUrl); - audio.play(); - - console.log("Audio file received and is now playing."); - } catch (error) { - console.error("Failed to generate speech:", error); - } + const response = await fetch('http://[your-api-server]/speech', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ text }) + }); + const audioBlob = await response.blob(); + const audioUrl = URL.createObjectURL(audioBlob); + new Audio(audioUrl).play(); } - -// Example: -// getSpeechAudio("This is an example of a non-streaming response."); ``` ---- - -### Example 2: Streaming Response - -Plays audio as it arrives using the **MediaSource API**. Ideal for long texts. +#### Example 2: Streaming WAV ```javascript -// Stream audio and play as it arrives async function streamSpeechAudio(text) { - const url = 'http://[your-api-server]/speech?stream=true'; // Replace with your API URL - - try { - const response = await fetch(url, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ text }) - }); - - if (!response.ok || !response.body) { - throw new Error(`HTTP error! status: ${response.status}`); + const response = await fetch('http://[your-api-server]/speech?stream=true', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ text }) + }); + const mediaSource = new MediaSource(); + const audio = new Audio(URL.createObjectURL(mediaSource)); + mediaSource.addEventListener('sourceopen', async () => { + const sourceBuffer = mediaSource.addSourceBuffer('audio/wav'); + const reader = response.body.getReader(); + while (true) { + const { done, value } = await reader.read(); + if (done) { mediaSource.endOfStream(); break; } + sourceBuffer.appendBuffer(value); } + }); + audio.play(); +} +``` - const mediaSource = new MediaSource(); - const audio = new Audio(); - audio.src = URL.createObjectURL(mediaSource); +#### Example 3: Streaming PCM (Web Audio API) - mediaSource.addEventListener('sourceopen', async () => { - const sourceBuffer = mediaSource.addSourceBuffer('audio/wav'); - const reader = response.body.getReader(); - - while (true) { - const { done, value } = await reader.read(); - if (done) { - mediaSource.endOfStream(); - break; - } - sourceBuffer.appendBuffer(value); - } - }); +```javascript +async function streamPcmAudio(text) { + const response = await fetch('http://[your-api-server]/speech?stream=true&as_wav=false', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ text }) + }); + const audioContext = new AudioContext({ sampleRate: 24000 }); + const reader = response.body.getReader(); + let currentOffset = 0; - audio.play(); - console.log("Streaming audio is starting..."); - } catch (error) { - console.error("Failed to stream speech:", error); + function pcmToFloat32(pcm) { + const int16 = new Int16Array(pcm.buffer); + const float32 = new Float32Array(int16.length); + for (let i = 0; i < int16.length; 
i++) float32[i] = int16[i] / 32768.0; + return float32; + } + + while (true) { + const { done, value } = await reader.read(); + if (done) break; + const data = pcmToFloat32(value); + const buffer = audioContext.createBuffer(1, data.length, audioContext.sampleRate); + buffer.copyToChannel(data, 0); + const source = audioContext.createBufferSource(); + source.buffer = buffer; + source.connect(audioContext.destination); + source.start(currentOffset); + currentOffset += buffer.duration; } } - -// Example: -// streamSpeechAudio("This is an example of a streaming response, which begins playing before the entire audio file is received."); ``` -# Invoking the Speech-to-Text (STT) API Endpoint +--- -This document explains how a frontend application can interact with the FastAPI `/stt/transcribe` endpoint to transcribe an uploaded audio file into text. +## 2. Speech-to-Text (STT) API + +Converts uploaded audio into text. + +### 2.1 Endpoint Details + +| Method | Path | Purpose | Content-Type | +| ------ | ----------------- | ---------------- | ------------------- | +| POST | `/stt/transcribe` | Transcribe audio | multipart/form-data | --- -## 1. Endpoint Details +### 2.2 Request Structure -* **HTTP Method:** `POST` -* **Path:** `/stt/transcribe` -* **Purpose:** Transcribe an uploaded audio file into text. -* **Content Type:** `multipart/form-data` +| Field | Type | Description | +| ----------- | ---- | ------------------------ | +| audio\_file | File | Audio file to transcribe | --- -## 2. Request Structure - -### 2.1 Request Body - -The POST request must include a `multipart/form-data` object with a single file field named `audio_file`. - -| Field | Type | Description | -| ----------- | ---- | -------------------------------- | -| audio\_file | File | The audio file to be transcribed | - ---- - -## 3. Frontend Implementation (JavaScript + HTML) - -Below is a complete working example using `fetch` to send the file and display the transcription result. +### 2.3 Example Frontend (HTML + JS) ```html - - - - - - STT API Example - - - -
-    Speech-to-Text (STT) Transcription
-    Transcribing...
-    Your transcribed text will appear here.
+<!-- Minimal markup sketch reconstructed by the editor; element IDs are illustrative -->
+<input type="file" id="audioFile" accept="audio/*">
+<button id="transcribeBtn">Transcribe</button>
+<div id="result">Your transcribed text will appear here.</div>
+<script>
+document.getElementById('transcribeBtn').addEventListener('click', async () => {
+  const formData = new FormData();
+  formData.append('audio_file', document.getElementById('audioFile').files[0]);
- - - + const res = await fetch('http://[your-api-server]/stt/transcribe', { method: 'POST', body: formData }); + const data = await res.json(); + document.getElementById('result').textContent = data.transcript; +}); + ``` -Here’s your Chat Sessions API documentation reformatted for clarity, structure, and consistency: +--- + +## 3. Chat Sessions API + +Manages conversational sessions with the AI. + +### 3.1 Endpoints + +| Method | Path | Purpose | +| ------ | --------------------------------- | --------------------------- | +| POST | `/sessions/` | Create a new chat session | +| POST | `/sessions/{session_id}/chat` | Send a message in a session | +| GET | `/sessions/{session_id}/messages` | Retrieve chat history | --- -# Invoking the Chat Sessions API Endpoint +## 4. Documents API -This document describes how a frontend application can interact with the FastAPI `/sessions` endpoints. These endpoints allow you to: +Add, list, and delete documents. -* Create new chat sessions -* Send messages within a session -* Retrieve chat history +### 4.1 Endpoints + +| Method | Path | Purpose | +| ------ | -------------------------- | ------------------ | +| POST | `/documents/` | Add a document | +| GET | `/documents/` | List all documents | +| DELETE | `/documents/{document_id}` | Delete a document | --- -## 1. Endpoint Details - -| HTTP Method | Path | Purpose | Request Type | -| ----------- | --------------------------------- | ------------------------------------------------------------- | ------------------ | -| **POST** | `/sessions/` | Creates a new chat session | `application/json` | -| **POST** | `/sessions/{session_id}/chat` | Sends a message and receives a response in a specific session | `application/json` | -| **GET** | `/sessions/{session_id}/messages` | Retrieves the message history for a given session | N/A | - ---- - -## 2. 
Request & Response Structures - -### 2.1 Create a New Chat Session - -**POST** `/sessions/` - -**Request Body:** - -| Field | Type | Description | -| -------- | ------ | ----------------------------------- | -| user\_id | string | ID of the user creating the session | -| model | string | Model to use for the session | - -**Example Request:** - -```json -{ - "user_id": "user-1234", - "model": "gemini" -} -``` - -**Response Body:** - -| Field | Type | Description | -| ----------- | ------- | -------------------------- | -| id | integer | Session ID | -| user\_id | string | User ID | -| created\_at | string | Session creation timestamp | -| model | string | Model used | - ---- - -### 2.2 Send a Message in a Session - -**POST** `/sessions/{session_id}/chat` - -**Path Parameter:** - -| Name | Type | Description | -| ----------- | ------- | ----------------- | -| session\_id | integer | Unique session ID | - -**Request Body:** - -| Field | Type | Description | -| ---------------------- | ------- | ----------------------------------------------------- | -| prompt | string | User message | -| model | string | Model for this message (can override session default) | -| load\_faiss\_retriever | boolean | Whether to use FAISS retriever | - -**Example Request:** - -```json -{ - "prompt": "What is the capital of France?", - "model": "gemini", - "load_faiss_retriever": false -} -``` - -**Response Body:** - -| Field | Type | Description | -| ----------- | ------ | --------------------------- | -| answer | string | Model's answer | -| model\_used | string | Model used for the response | - ---- - -### 2.3 Get Session Chat History - -**GET** `/sessions/{session_id}/messages` - -**Path Parameter:** - -| Name | Type | Description | -| ----------- | ------- | ----------------- | -| session\_id | integer | Unique session ID | - -**Response Body:** - -| Field | Type | Description | -| ----------- | ------- | -------------------------------------------------------- | -| session\_id | integer | Session ID | -| messages | array | List of message objects (`role`, `content`, `timestamp`) | - ---- - -## 3. Frontend Implementation (HTML + JavaScript) - -Below is a complete example that: - -1. Creates a new chat session -2. Sends a message in the session -3. Retrieves the chat history +### 4.2 Example Frontend (HTML + JS) ```html - - - - - - Chat Sessions API Example - - - -
-    Chat Sessions API Example
-    This page demonstrates creating a session, sending a message, and retrieving the history.
-    Workflow Log
+<!-- Minimal markup sketch reconstructed by the editor; element IDs match the script below -->
+<form id="addDoc">
+  <input id="title" placeholder="Title">
+  <input id="content" placeholder="Content">
+  <button type="submit">Add Document</button>
+</form>
+<div id="docs"></div>
+<script>
+const API = 'http://[your-api-server]'; // assumed base URL constant used by the script below
- - - -``` - -# **Invoking the Documents API Endpoint** - -This guide explains how a frontend application can interact with the FastAPI `/documents` endpoints. -These endpoints allow you to **add**, **list**, and **delete** documents. - ---- - -## **Endpoint Summary** - -| HTTP Method | Path | Purpose | Request Type | -| ----------- | -------------------------- | ---------------------------------- | ------------------ | -| **POST** | `/documents/` | Adds a new document. | `application/json` | -| **GET** | `/documents/` | Lists all documents. | N/A | -| **DELETE** | `/documents/{document_id}` | Deletes a specific document by ID. | N/A | - ---- - -## **Request & Response Structures** - -### **1. Add a New Document** - -**POST** `/documents/` - -**Request Body** (JSON): - -* `title` *(string)* – The title of the document. -* `content` *(string)* – The content of the document. - -**Example Request:** - -```json -{ - "title": "My First Document", - "content": "This is the content of my very first document." +async function fetchDocs() { + const res = await fetch(`${API}/documents/`); + const data = await res.json(); + document.getElementById('docs').innerHTML = data.documents.map(doc => + `
${doc.title}
` + ).join(''); } + +document.getElementById('addDoc').onsubmit = async e => { + e.preventDefault(); + const title = document.getElementById('title').value; + const content = document.getElementById('content').value; + await fetch(`${API}/documents/`, { + method: 'POST', headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ title, content }) + }); + fetchDocs(); +}; + +async function delDoc(id) { + await fetch(`${API}/documents/${id}`, { method: 'DELETE' }); + fetchDocs(); +} + +fetchDocs(); + ``` - -**Response Body**: - -* `message` *(string)* – Success message. - ---- - -### **2. List All Documents** - -**GET** `/documents/` - -**Request Body:** None. - -**Response Body**: - -* `documents` *(array)* – List of documents. Each object contains: - - * `id` *(integer)* - * `title` *(string)* - * `content` *(string)* - * `created_at` *(timestamp)* - ---- - -### **3. Delete a Document** - -**DELETE** `/documents/{document_id}` - -**Path Parameters:** - -* `document_id` *(integer)* – Unique ID of the document to be deleted. - -**Response Body**: - -* `message` *(string)* – Success message. -* `document_id` *(integer)* – ID of the deleted document. - ---- - -## **Frontend Implementation (JavaScript Example)** - -Below is a complete HTML + JavaScript example showing how to **add**, **list**, and **delete** documents using the API. - -```html - - - - - - Documents API Example - - - -
-    Documents API Example
-    Add a New Document
-    Documents List
-    Log
- - - - -``` \ No newline at end of file diff --git a/ai-hub/app/api/routes/tts.py b/ai-hub/app/api/routes/tts.py index 557924a..e305ee2 100644 --- a/ai-hub/app/api/routes/tts.py +++ b/ai-hub/app/api/routes/tts.py @@ -10,22 +10,33 @@ @router.post( "", summary="Generate speech from text", - response_description="Audio bytes in WAV format, either as a complete file or a stream.", + response_description="Audio bytes in WAV or PCM format, either as a complete file or a stream.", ) async def create_speech_response( request: schemas.SpeechRequest, stream: bool = Query( False, description="If true, returns a streamed audio response. Otherwise, returns a complete file." + ), + as_wav: bool = Query( + True, + description="If true, returns WAV format audio. If false, returns raw PCM audio data. Only applies when stream is true." ) ): try: if stream: + # Pass the new as_wav parameter to the streaming function audio_stream_generator: AsyncGenerator[bytes, None] = services.tts_service.create_speech_stream( - text=request.text + text=request.text, + as_wav=as_wav ) - return StreamingResponse(audio_stream_generator, media_type="audio/wav") + + # Dynamically set the media_type based on the as_wav flag + media_type = "audio/wav" if as_wav else "audio/pcm" + + return StreamingResponse(audio_stream_generator, media_type=media_type) else: + # The non-streaming function only returns WAV, so this part remains the same audio_bytes = await services.tts_service.create_speech_non_stream( text=request.text ) diff --git a/ai-hub/app/app.py b/ai-hub/app/app.py index 150a1ac..4afaa3f 100644 --- a/ai-hub/app/app.py +++ b/ai-hub/app/app.py @@ -83,6 +83,7 @@ tts_provider = get_tts_provider( provider_name=settings.TTS_PROVIDER, api_key=settings.TTS_API_KEY, + model_name = settings.TTS_MODEL_NAME, voice_name=settings.TTS_VOICE_NAME ) diff --git a/ai-hub/app/config.py b/ai-hub/app/config.py index 858c8ad..390c990 100644 --- a/ai-hub/app/config.py +++ b/ai-hub/app/config.py @@ -18,11 +18,12 @@ class TTSProvider(str, Enum): """An enum for supported Text-to-Speech (TTS) providers.""" GOOGLE_GEMINI = "google_gemini" + GCLOUD_TTS = "gcloud_tts" # NEW: Add Google Cloud TTS as a supported provider class STTProvider(str, Enum): """An enum for supported Speech-to-Text (STT) providers.""" GOOGLE_GEMINI = "google_gemini" - OPENAI = "openai" # NEW: Add OpenAI as a supported provider + OPENAI = "openai" class ApplicationSettings(BaseModel): project_name: str = "Cortex Hub" @@ -45,6 +46,7 @@ class TTSProviderSettings(BaseModel): provider: TTSProvider = Field(default=TTSProvider.GOOGLE_GEMINI) + # The default values are kept as originally requested voice_name: str = "Kore" model_name: str = "gemini-2.5-flash-preview-tts" api_key: Optional[SecretStr] = None @@ -53,9 +55,6 @@ provider: STTProvider = Field(default=STTProvider.GOOGLE_GEMINI) model_name: str = "gemini-2.5-flash" api_key: Optional[SecretStr] = None - # NOTE: OpenAI provider requires a different model name (e.g., 'whisper-1') - # but we will handle this dynamically or through configuration. - # The BaseModel is for schema validation, not for provider-specific logic. 
class VectorStoreSettings(BaseModel): index_path: str = "data/faiss_index.bin" @@ -125,22 +124,22 @@ # --- API Keys & Models --- self.DEEPSEEK_API_KEY: Optional[str] = os.getenv("DEEPSEEK_API_KEY") self.GEMINI_API_KEY: Optional[str] = os.getenv("GEMINI_API_KEY") - self.OPENAI_API_KEY: Optional[str] = os.getenv("OPENAI_API_KEY") # NEW: Add dedicated OpenAI API key + self.OPENAI_API_KEY: Optional[str] = os.getenv("OPENAI_API_KEY") self.DEEPSEEK_MODEL_NAME: str = os.getenv("DEEPSEEK_MODEL_NAME") or \ - get_from_yaml(["llm_providers", "deepseek_model_name"]) or \ - config_from_pydantic.llm_providers.deepseek_model_name + get_from_yaml(["llm_providers", "deepseek_model_name"]) or \ + config_from_pydantic.llm_providers.deepseek_model_name self.GEMINI_MODEL_NAME: str = os.getenv("GEMINI_MODEL_NAME") or \ - get_from_yaml(["llm_providers", "gemini_model_name"]) or \ - config_from_pydantic.llm_providers.gemini_model_name + get_from_yaml(["llm_providers", "gemini_model_name"]) or \ + config_from_pydantic.llm_providers.gemini_model_name # --- Vector Store Settings --- self.FAISS_INDEX_PATH: str = os.getenv("FAISS_INDEX_PATH") or \ - get_from_yaml(["vector_store", "index_path"]) or \ - config_from_pydantic.vector_store.index_path + get_from_yaml(["vector_store", "index_path"]) or \ + config_from_pydantic.vector_store.index_path dimension_str = os.getenv("EMBEDDING_DIMENSION") or \ - get_from_yaml(["vector_store", "embedding_dimension"]) or \ - config_from_pydantic.vector_store.embedding_dimension + get_from_yaml(["vector_store", "embedding_dimension"]) or \ + config_from_pydantic.vector_store.embedding_dimension self.EMBEDDING_DIMENSION: int = int(dimension_str) # --- Embedding Provider Settings --- @@ -149,15 +148,15 @@ embedding_provider_env = embedding_provider_env.lower() self.EMBEDDING_PROVIDER: EmbeddingProvider = EmbeddingProvider(embedding_provider_env or \ - get_from_yaml(["embedding_provider", "provider"]) or \ - config_from_pydantic.embedding_provider.provider) + get_from_yaml(["embedding_provider", "provider"]) or \ + config_from_pydantic.embedding_provider.provider) self.EMBEDDING_MODEL_NAME: str = os.getenv("EMBEDDING_MODEL_NAME") or \ - get_from_yaml(["embedding_provider", "model_name"]) or \ - config_from_pydantic.embedding_provider.model_name + get_from_yaml(["embedding_provider", "model_name"]) or \ + config_from_pydantic.embedding_provider.model_name self.EMBEDDING_API_KEY: Optional[str] = os.getenv("EMBEDDING_API_KEY") or \ - get_from_yaml(["embedding_provider", "api_key"]) or \ - self.GEMINI_API_KEY + get_from_yaml(["embedding_provider", "api_key"]) or \ + self.GEMINI_API_KEY # --- TTS Provider Settings --- tts_provider_env = os.getenv("TTS_PROVIDER") @@ -168,17 +167,23 @@ get_from_yaml(["tts_provider", "provider"]) or \ config_from_pydantic.tts_provider.provider) self.TTS_VOICE_NAME: str = os.getenv("TTS_VOICE_NAME") or \ - get_from_yaml(["tts_provider", "voice_name"]) or \ - config_from_pydantic.tts_provider.voice_name + get_from_yaml(["tts_provider", "voice_name"]) or \ + config_from_pydantic.tts_provider.voice_name + self.TTS_MODEL_NAME: str = os.getenv("TTS_MODEL_NAME") or \ - get_from_yaml(["tts_provider", "model_name"]) or \ - config_from_pydantic.tts_provider.model_name + get_from_yaml(["tts_provider", "model_name"]) or \ + config_from_pydantic.tts_provider.model_name + + # API Key logic for TTS + tts_api_key_env = os.getenv("TTS_API_KEY") or get_from_yaml(["tts_provider", "api_key"]) - self.TTS_API_KEY: Optional[str] = os.getenv("TTS_API_KEY") or \ - 
get_from_yaml(["tts_provider", "api_key"]) or \ - self.GEMINI_API_KEY + if tts_api_key_env: + self.TTS_API_KEY: Optional[str] = tts_api_key_env + else: + # If no specific TTS key is set, use the Gemini key as a fallback + self.TTS_API_KEY: Optional[str] = self.GEMINI_API_KEY - # --- NEW STT Provider Settings --- + # --- STT Provider Settings --- stt_provider_env = os.getenv("STT_PROVIDER") if stt_provider_env: stt_provider_env = stt_provider_env.lower() @@ -187,11 +192,10 @@ get_from_yaml(["stt_provider", "provider"]) or \ config_from_pydantic.stt_provider.provider) self.STT_MODEL_NAME: str = os.getenv("STT_MODEL_NAME") or \ - get_from_yaml(["stt_provider", "model_name"]) or \ - config_from_pydantic.stt_provider.model_name + get_from_yaml(["stt_provider", "model_name"]) or \ + config_from_pydantic.stt_provider.model_name # Logic for STT_API_KEY: Prioritize a dedicated STT_API_KEY. - # Fallback to OPENAI_API_KEY if the provider is OpenAI, otherwise use GEMINI_API_KEY. explicit_stt_api_key = os.getenv("STT_API_KEY") or get_from_yaml(["stt_provider", "api_key"]) if explicit_stt_api_key: diff --git a/ai-hub/app/config.yaml b/ai-hub/app/config.yaml index 99ec019..0f9028c 100644 --- a/ai-hub/app/config.yaml +++ b/ai-hub/app/config.yaml @@ -34,11 +34,12 @@ tts_provider: # The provider for the TTS service. - provider: "google_gemini" + # Check more at https://cloud.google.com/text-to-speech + provider: "gcloud_tts" # The name of the voice to use for TTS. - voice_name: "Zephyr" + voice_name: "en-US-Chirp3-HD-Achernar" # The model name for the TTS service. - model_name: "gemini-2.5-flash-preview-tts" + model_name: "gemini-2.5-pro-preview-tts" # The provider for the Speech-to-Text (STT) service. stt_provider: diff --git a/ai-hub/app/core/providers/factory.py b/ai-hub/app/core/providers/factory.py index 6cfbed0..725f871 100644 --- a/ai-hub/app/core/providers/factory.py +++ b/ai-hub/app/core/providers/factory.py @@ -3,6 +3,7 @@ from .llm.deepseek import DeepSeekProvider from .llm.gemini import GeminiProvider from .tts.gemini import GeminiTTSProvider +from .tts.gcloud_tts import GCloudTTSProvider from .stt.gemini import GoogleSTTProvider from openai import AsyncOpenAI @@ -24,10 +25,12 @@ raise ValueError(f"Unsupported model provider: '{model_name}'. Supported providers are: {list(_llm_providers.keys())}") return provider -def get_tts_provider(provider_name: str, api_key: str, voice_name: str) -> TTSProvider: +def get_tts_provider(provider_name: str, api_key: str, model_name: str, voice_name: str) -> TTSProvider: if provider_name == "google_gemini": - return GeminiTTSProvider(api_key=api_key, voice_name = voice_name) - raise ValueError(f"Unsupported TTS provider: '{provider_name}'. Supported providers are: ['google_gemini']") + return GeminiTTSProvider(api_key=api_key,model_name = model_name, voice_name = voice_name) + elif provider_name == "gcloud_tts": + return GCloudTTSProvider(api_key=api_key, voice_name = voice_name) + raise ValueError(f"Unsupported TTS provider: '{provider_name}'. 
Supported providers are: ['google_gemini', 'gcloud_tts']") def get_stt_provider(provider_name: str, api_key: str, model_name: str) -> STTProvider: if provider_name == "google_gemini": diff --git a/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_bearer.sh b/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_bearer.sh new file mode 100644 index 0000000..d753b84 --- /dev/null +++ b/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_bearer.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Bearer token (best to set this as an environment variable) +BEARER_TOKEN="" + +# Google TTS API endpoint +TTS_API_URL="https://texttospeech.googleapis.com/v1/text:synthesize" + +# Request payload +read -r -d '' PAYLOAD < output.mp3 + +echo "✅ MP3 file saved to: output.mp3" diff --git a/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_key.sh b/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_key.sh new file mode 100644 index 0000000..b62fc65 --- /dev/null +++ b/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_key.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Your Google Cloud API key +API_KEY="" + +# Google TTS API endpoint with API key as query param +TTS_API_URL="https://texttospeech.googleapis.com/v1/text:synthesize?key=$API_KEY" + +# Request payload +read -r -d '' PAYLOAD < output.mp3 + +echo "✅ MP3 file saved to: output.mp3" diff --git a/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_key_pcm.sh b/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_key_pcm.sh new file mode 100644 index 0000000..5149058 --- /dev/null +++ b/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_key_pcm.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Your Google Cloud API key +API_KEY="" + +# Google TTS API endpoint with API key as query param +TTS_API_URL="https://texttospeech.googleapis.com/v1/text:synthesize?key=$API_KEY" + +# Request payload (set audioEncoding to LINEAR16 for PCM) +read -r -d '' PAYLOAD < output.pcm + +echo "✅ PCM audio saved to: output.pcm" diff --git a/ai-hub/app/core/providers/tts/gcloud_tts.py b/ai-hub/app/core/providers/tts/gcloud_tts.py new file mode 100644 index 0000000..bfdb1b1 --- /dev/null +++ b/ai-hub/app/core/providers/tts/gcloud_tts.py @@ -0,0 +1,86 @@ +import os +import aiohttp +import asyncio +import base64 +import logging +from typing import AsyncGenerator +from app.core.providers.base import TTSProvider +from aiohttp import ClientResponseError +from fastapi import HTTPException + +# Configure logging +logger = logging.getLogger(__name__) + +# New concrete class for the Google Cloud Text-to-Speech API +class GCloudTTSProvider(TTSProvider): + # This provider uses Google's dedicated TTS API. The voices are different from Gemini. + # Here is a small, representative list of available WaveNet voices. + # The full list is much larger and can be found in the official documentation. + AVAILABLE_VOICES = [ + "en-US-Wavenet-A", "en-US-Wavenet-B", "en-US-Wavenet-C", "en-US-Wavenet-D", + "en-US-Wavenet-E", "en-US-Wavenet-F", "en-US-Wavenet-G", "en-US-Wavenet-H" + ] + + def __init__(self, api_key: str, voice_name: str = "en-US-Wavenet-D"): + if voice_name not in self.AVAILABLE_VOICES: + raise ValueError(f"Invalid voice name: {voice_name}. 
Choose from {self.AVAILABLE_VOICES}") + + self.api_key = api_key + # The new API URL for the Cloud Text-to-Speech service + self.api_url = f"https://texttospeech.googleapis.com/v1/text:synthesize?key={self.api_key}" + self.voice_name = voice_name + logger.debug(f"Initialized GCloudTTSProvider with voice: {self.voice_name}") + + async def generate_speech(self, text: str) -> bytes: + logger.debug(f"Starting speech generation for text: '{text[:50]}...'") + + headers = { + "Content-Type": "application/json" + } + json_data = { + "input": { + "text": text + }, + "voice": { + "languageCode": "en-US", + "name": self.voice_name + }, + "audioConfig": { + "audioEncoding": "LINEAR16" + } + } + + logger.debug(f"API Request URL: {self.api_url}") + logger.debug(f"Request Payload: {json_data}") + + try: + async with aiohttp.ClientSession() as session: + async with session.post(self.api_url, headers=headers, json=json_data) as response: + logger.debug(f"Received API response with status code: {response.status}") + response.raise_for_status() + + response_json = await response.json() + logger.debug("Successfully parsed API response JSON.") + + # The audio data is now under the 'audioContent' key + audio_base64 = response_json.get('audioContent') + if not audio_base64: + raise KeyError("audioContent key not found in the response.") + + audio_bytes = base64.b64decode(audio_base64) + logger.debug(f"Decoded audio data, size: {len(audio_bytes)} bytes.") + + return audio_bytes + except ClientResponseError as e: + if e.status == 429: + logger.error("Rate limit exceeded on Cloud TTS API.") + raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.") + else: + logger.error(f"Aiohttp client error occurred: {e}") + raise HTTPException(status_code=500, detail=f"API request failed: {e}") + except KeyError as e: + logger.error(f"Key error in API response: {e}. Full response: {await response.json()}") + raise HTTPException(status_code=500, detail="Malformed API response from Cloud TTS.") + except Exception as e: + logger.error(f"An unexpected error occurred during speech generation: {e}") + raise HTTPException(status_code=500, detail=f"Failed to generate speech: {e}") \ No newline at end of file diff --git a/ai-hub/app/core/providers/tts/gemini.py b/ai-hub/app/core/providers/tts/gemini.py index 264557c..89d59e1 100644 --- a/ai-hub/app/core/providers/tts/gemini.py +++ b/ai-hub/app/core/providers/tts/gemini.py @@ -5,6 +5,9 @@ import logging from typing import AsyncGenerator from app.core.providers.base import TTSProvider +from aiohttp import ClientResponseError +from fastapi import HTTPException + # Configure logging logger = logging.getLogger(__name__) @@ -79,9 +82,13 @@ logger.debug(f"Decoded audio data, size: {len(audio_bytes)} bytes.") return audio_bytes - except aiohttp.ClientError as e: - logger.error(f"Aiohttp client error occurred: {e}") - raise HTTPException(status_code=500, detail=f"API request failed: {e}") + except ClientResponseError as e: + if e.status == 429: + logger.error("Rate limit exceeded on Gemini TTS API.") + raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.") + else: + logger.error(f"Aiohttp client error occurred: {e}") + raise HTTPException(status_code=500, detail=f"API request failed: {e}") except KeyError as e: logger.error(f"Key error in API response: {e}. 
Full response: {response_json}") raise HTTPException(status_code=500, detail="Malformed API response from Gemini.") diff --git a/ai-hub/app/core/services/tts.py b/ai-hub/app/core/services/tts.py index 63298e4..d658e6c 100644 --- a/ai-hub/app/core/services/tts.py +++ b/ai-hub/app/core/services/tts.py @@ -10,6 +10,19 @@ # --- Configure logging --- logger = logging.getLogger(__name__) +# --- Helper Functions --- +def _create_wav_file(pcm_data: bytes) -> bytes: + """ + Wraps raw 16-bit PCM audio data in a WAV header. + """ + with io.BytesIO() as wav_buffer: + with wave.open(wav_buffer, 'wb') as wav_file: + wav_file.setnchannels(1) + wav_file.setsampwidth(2) + wav_file.setframerate(24000) + wav_file.writeframes(pcm_data) + return wav_buffer.getvalue() + # --- Define TTS Service Class --- class TTSService: """ @@ -18,37 +31,25 @@ audio generation, splitting text into manageable chunks. """ - # Use an environment variable or a default value for the max chunk size - MAX_CHUNK_SIZE = int(os.getenv("TTS_MAX_CHUNK_SIZE", 200)) + MAX_CHUNK_SIZE = int(os.getenv("TTS_MAX_CHUNK_SIZE", 600)) def __init__(self, tts_provider: TTSProvider): - """ - Initializes the TTSService with a concrete TTS provider. - """ self.tts_provider = tts_provider async def _split_text_into_chunks(self, text: str) -> list[str]: - """ - Splits the input text into chunks based on a maximum size and - period delimiters, ensuring no chunk exceeds the limit. - """ chunks = [] current_chunk = "" - # Use a list of punctuation to split sentences more effectively separators = ['.', '?', '!', '\n'] sentences = [] - # Split text by multiple delimiters for separator in separators: text = text.replace(separator, f"{separator}|") sentences_with_empty = [s.strip() for s in text.split('|') if s.strip()] - # Re-join sentences with their delimiters, so we don't lose them for sentence in sentences_with_empty: sentences.append(sentence) for sentence in sentences: - # Add the sentence and check if it exceeds the chunk size. if len(current_chunk) + len(sentence) + 1 > self.MAX_CHUNK_SIZE and current_chunk: chunks.append(current_chunk.strip()) current_chunk = sentence + " " @@ -61,32 +62,14 @@ logger.debug(f"Split text into {len(chunks)} chunks.") return chunks - async def create_speech_stream(self, text: str) -> AsyncGenerator[bytes, None]: - """ - Generates a stream of complete, playable WAV files for each text chunk. - This provides a streaming-like experience even with a non-streaming backend - by sending each chunk as soon as it is generated. 
- """ + async def _generate_pcm_chunks(self, text: str) -> AsyncGenerator[bytes, None]: chunks = await self._split_text_into_chunks(text) - + for i, chunk in enumerate(chunks): - logger.info(f"Processing chunk {i+1}/{len(chunks)} for streaming...") - + logger.info(f"Generating PCM for chunk {i+1}/{len(chunks)}: '{chunk[:30]}...'") try: - # Get the raw PCM audio data for this chunk pcm_data = await self.tts_provider.generate_speech(chunk) - - # Wrap the PCM data in a WAV header to make it a playable file - with io.BytesIO() as wav_buffer: - with wave.open(wav_buffer, 'wb') as wav_file: - wav_file.setnchannels(1) - wav_file.setsampwidth(2) - wav_file.setframerate(24000) - wav_file.writeframes(pcm_data) - - # Yield a complete, playable WAV file for the chunk - yield wav_buffer.getvalue() - + yield pcm_data except Exception as e: logger.error(f"Error processing chunk {i+1}: {e}") raise HTTPException( @@ -94,22 +77,36 @@ detail=f"Error generating speech for chunk {i+1}: {e}" ) from e + async def create_speech_stream(self, text: str, as_wav: bool = True) -> AsyncGenerator[bytes, None]: + async for pcm_data in self._generate_pcm_chunks(text): + if as_wav: + yield _create_wav_file(pcm_data) + else: + yield pcm_data + async def create_speech_non_stream(self, text: str) -> bytes: - """ - Generates a complete audio file from the given text, splitting it - into chunks and concatenating the audio into a single WAV file. - All chunks are processed concurrently for speed. - """ chunks = await self._split_text_into_chunks(text) - - all_pcm_data = [] - - # Create a list of tasks for each chunk to run them concurrently. - tasks = [self.tts_provider.generate_speech(chunk) for chunk in chunks] - + semaphore = asyncio.Semaphore(3) # Limit concurrency to 3 requests + + async def generate_with_limit(chunk): + retries = 3 + delay = 1 + async with semaphore: + for attempt in range(retries): + try: + return await self.tts_provider.generate_speech(chunk) + except HTTPException as e: + if e.status_code == 429: + logger.warning(f"429 Too Many Requests for chunk, retrying in {delay}s (attempt {attempt+1}/{retries})...") + await asyncio.sleep(delay) + delay *= 2 # exponential backoff + else: + raise + raise HTTPException(status_code=429, detail="Too many requests after retries.") + + tasks = [generate_with_limit(chunk) for chunk in chunks] + try: - # Gather the results from all tasks. This will run all API calls - # to the TTS provider concurrently. 
all_pcm_data = await asyncio.gather(*tasks) logger.info(f"Successfully gathered audio data for all {len(chunks)} chunks.") except Exception as e: @@ -123,17 +120,7 @@ logger.warning("No audio data was generated.") raise HTTPException(status_code=500, detail="No audio data was generated from the TTS provider.") - # Concatenate all the raw PCM data into a single stream concatenated_pcm = b''.join(all_pcm_data) logger.info(f"Concatenated {len(chunks)} chunks into a single PCM stream.") - # Wrap the complete PCM stream in a single WAV container - with io.BytesIO() as wav_buffer: - with wave.open(wav_buffer, 'wb') as wav_file: - wav_file.setnchannels(1) - wav_file.setsampwidth(2) - # The Gemini API returns 24kHz audio, adjust if using a different provider - wav_file.setframerate(24000) - wav_file.writeframes(concatenated_pcm) - - return wav_buffer.getvalue() \ No newline at end of file + return _create_wav_file(concatenated_pcm) diff --git a/ai-hub/run_integration_tests.sh b/ai-hub/run_integration_tests.sh index 46489d2..e0f94d5 100644 --- a/ai-hub/run_integration_tests.sh +++ b/ai-hub/run_integration_tests.sh @@ -7,15 +7,15 @@ # You can define aliases for your test file paths here. TEST_SUITES=( "All tests" - "integration_tests/test_sessions.py" - "integration_tests/test_documents.py" - "integration_tests/test_misc.py" + "integration_tests/test_sessions_api.py" + "integration_tests/test_documents_api.py" + "integration_tests/test_misc_api.py" ) TEST_PATHS=( "integration_tests/" - "integration_tests/test_sessions.py" - "integration_tests/test_documents.py" - "integration_tests/test_misc.py" + "integration_tests/test_sessions_api.py" + "integration_tests/test_documents_api.py" + "integration_tests/test_misc_api.py" ) export DB_MODE=sqlite diff --git a/ai-hub/tests/api/routes/test_tts.py b/ai-hub/tests/api/routes/test_tts.py index cd4f14e..0eb2bbb 100644 --- a/ai-hub/tests/api/routes/test_tts.py +++ b/ai-hub/tests/api/routes/test_tts.py @@ -19,20 +19,20 @@ mock_services.tts_service.create_speech_non_stream.assert_called_once_with(text="Hello, this is a test") @pytest.mark.asyncio -async def test_create_speech_stream_response(async_client): - """Test the /speech endpoint with stream=true returns a streaming response.""" +async def test_create_speech_stream_wav_response(async_client): + """Test the /speech endpoint with stream=true and as_wav=true returns a streamed WAV response.""" test_client, mock_services = await anext(async_client) mock_audio_bytes_chunks = [b"chunk1", b"chunk2", b"chunk3"] - # This async generator mock correctly simulates the streaming service async def mock_async_generator(): for chunk in mock_audio_bytes_chunks: yield chunk - # We mock `create_speech_stream` with a MagicMock returning the async generator + # Mock `create_speech_stream` with a MagicMock returning the async generator mock_services.tts_service.create_speech_stream = MagicMock(return_value=mock_async_generator()) - response = await test_client.post("/speech?stream=true", json={"text": "Hello, this is a test"}) + # Explicitly set stream=true and as_wav=true + response = await test_client.post("/speech?stream=true&as_wav=true", json={"text": "Hello, this is a test"}) assert response.status_code == 200 assert response.headers["content-type"] == "audio/wav" @@ -43,4 +43,29 @@ streamed_content += chunk assert streamed_content == b"".join(mock_audio_bytes_chunks) - mock_services.tts_service.create_speech_stream.assert_called_once_with(text="Hello, this is a test") \ No newline at end of file + 
mock_services.tts_service.create_speech_stream.assert_called_once_with(text="Hello, this is a test", as_wav=True) + +@pytest.mark.asyncio +async def test_create_speech_stream_pcm_response(async_client): + """Test the /speech endpoint with stream=true and as_wav=false returns a streamed PCM response.""" + test_client, mock_services = await anext(async_client) + mock_audio_bytes_chunks = [b"pcm_chunk1", b"pcm_chunk2", b"pcm_chunk3"] + + async def mock_async_generator(): + for chunk in mock_audio_bytes_chunks: + yield chunk + + mock_services.tts_service.create_speech_stream = MagicMock(return_value=mock_async_generator()) + + # Set stream=true and as_wav=false + response = await test_client.post("/speech?stream=true&as_wav=false", json={"text": "Hello, this is a test"}) + + assert response.status_code == 200 + assert response.headers["content-type"] == "audio/pcm" + + streamed_content = b"" + async for chunk in response.aiter_bytes(): + streamed_content += chunk + + assert streamed_content == b"".join(mock_audio_bytes_chunks) + mock_services.tts_service.create_speech_stream.assert_called_once_with(text="Hello, this is a test", as_wav=False) \ No newline at end of file diff --git a/ai-hub/tests/core/providers/test_factory.py b/ai-hub/tests/core/providers/test_factory.py index 3d48c03..e00e71b 100644 --- a/ai-hub/tests/core/providers/test_factory.py +++ b/ai-hub/tests/core/providers/test_factory.py @@ -27,11 +27,11 @@ def test_get_tts_provider_returns_gemini_tts_provider(): """Tests that the factory returns a GeminiTTSProvider instance for 'google_gemini'.""" - # Use a valid voice from AVAILABLE_VOICES to avoid ValueError valid_voice = GeminiTTSProvider.AVAILABLE_VOICES[0] provider = get_tts_provider( "google_gemini", api_key="dummy_key", + model_name="dummy-model", voice_name=valid_voice ) assert isinstance(provider, GeminiTTSProvider) @@ -45,6 +45,7 @@ get_tts_provider( "unknown", api_key="dummy_key", + model_name="dummy-model", voice_name=valid_voice )
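
The rewritten README lists the Chat Sessions endpoints (section 3) but no longer ships a frontend example. A minimal sketch of the create, chat, and history workflow, reusing the request and response field names from the legacy docs removed in this diff (`user_id`, `model`, `id`, `prompt`, `load_faiss_retriever`, `answer`, `messages`), could look like this:

```javascript
const API = 'http://[your-api-server]'; // placeholder base URL

async function chatWorkflow() {
  // 1. Create a new session for a user with a default model
  const session = await (await fetch(`${API}/sessions/`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ user_id: 'user-1234', model: 'gemini' })
  })).json();

  // 2. Send a message within that session
  const reply = await (await fetch(`${API}/sessions/${session.id}/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      prompt: 'What is the capital of France?',
      model: 'gemini',
      load_faiss_retriever: false
    })
  })).json();
  console.log(reply.answer);

  // 3. Retrieve the session's message history
  const history = await (await fetch(`${API}/sessions/${session.id}/messages`)).json();
  console.log(history.messages);
}

chatWorkflow().catch(console.error);
```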