diff --git a/ai-hub/app/api/routes/README.md b/ai-hub/app/api/routes/README.md
index 5fb40c6..b330071 100644
--- a/ai-hub/app/api/routes/README.md
+++ b/ai-hub/app/api/routes/README.md
@@ -1,683 +1,243 @@
-# Invoking the Text-to-Speech (TTS) API Endpoint
+# API Documentation
-This guide explains how a frontend application can interact with the FastAPI `/speech` endpoint for text-to-speech conversion. The endpoint supports both **non-streaming** and **streaming** audio responses.
+This document provides frontend integration guides for the FastAPI backend's main endpoints:
+
+1. **Text-to-Speech (TTS) API**
+2. **Speech-to-Text (STT) API**
+3. **Chat Sessions API**
+4. **Documents API**
---
-## 1. Endpoint Details
+## 1. Text-to-Speech (TTS) API
-* **HTTP Method:** `POST`
-* **Path:** `/speech`
-* **Purpose:** Convert a given text string into audio.
+This API converts text into audio, supporting both **non-streaming** and **streaming** modes.
+
+### 1.1 Endpoint Details
+
+| Method | Path | Purpose |
+| ------ | --------- | ---------------------------------- |
+| POST | `/speech` | Convert a given text string to audio |
---
-## 2. Request Structure
+### 1.2 Request Structure
-### 2.1 Request Body
+#### Request Body (JSON)
-The POST request must include a JSON object matching the `SpeechRequest` schema.
+| Field | Type | Description | Example |
+| ------ | ------ | ---------------------------- | ---------------------------------------------- |
+| text | string | Text to convert to speech | `"Hello, this is a test message."` |
-| Field | Type | Description | Example |
-| ----- | ------ | ------------------------------ | ---------------------------------- |
-| text | string | Text to be converted to speech | `"Hello, this is a test message."` |
-
-**Example JSON body:**
-
+**Example:**
```json
{
"text": "The quick brown fox jumps over the lazy dog."
}
-```
+```
----
+#### Query Parameters
-### 2.2 Query Parameter
-
-| Parameter | Type | Default | Description |
-| --------- | ------- | ------- | -------------------------------------------------------------------------------------- |
-| stream | boolean | false | If `true`, returns a continuous audio stream. If `false`, returns the full audio file. |
+| Parameter | Type | Default | Description |
+| --------- | ------- | ------- | --------------------------------------------------------------------------- |
+| stream    | boolean | false   | If `true`, returns a continuous audio stream; otherwise, the complete WAV file. |
+| as\_wav | boolean | true | **Streaming only**: If true, returns WAV chunks; if false, returns raw PCM. |
**Example URLs:**
-* Non-streaming (Default):
-
- ```
- http://[your-api-server]/speech
- ```
-
-* Streaming:
-
- ```
- http://[your-api-server]/speech?stream=true
- ```
+```
+Non-streaming: http://[your-api-server]/speech
+Streaming WAV: http://[your-api-server]/speech?stream=true
+Streaming PCM: http://[your-api-server]/speech?stream=true&as_wav=false
+```
---
-## 3. Frontend Implementation (JavaScript)
+### 1.3 Frontend Examples (JavaScript)
-Below are two implementations using the `fetch` API.
-
----
-
-### Example 1: Non-Streaming Response
-
-Downloads the complete WAV file before playing. Suitable for short messages.
+#### Example 1: Non-Streaming
```javascript
-// Generate and play non-streaming audio
async function getSpeechAudio(text) {
- const url = 'http://[your-api-server]/speech'; // Replace with your API URL
-
- try {
- const response = await fetch(url, {
- method: 'POST',
- headers: { 'Content-Type': 'application/json' },
- body: JSON.stringify({ text })
- });
-
- if (!response.ok) {
- throw new Error(`HTTP error! status: ${response.status}`);
- }
-
- const audioBlob = await response.blob();
- const audioUrl = URL.createObjectURL(audioBlob);
-
- const audio = new Audio(audioUrl);
- audio.play();
-
- console.log("Audio file received and is now playing.");
- } catch (error) {
- console.error("Failed to generate speech:", error);
- }
+ const response = await fetch('http://[your-api-server]/speech', {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify({ text })
+ });
+ const audioBlob = await response.blob();
+ const audioUrl = URL.createObjectURL(audioBlob);
+ new Audio(audioUrl).play();
}
-
-// Example:
-// getSpeechAudio("This is an example of a non-streaming response.");
```
----
-
-### Example 2: Streaming Response
-
-Plays audio as it arrives using the **MediaSource API**. Ideal for long texts.
+#### Example 2: Streaming WAV
```javascript
-// Stream audio and play as it arrives
async function streamSpeechAudio(text) {
- const url = 'http://[your-api-server]/speech?stream=true'; // Replace with your API URL
-
- try {
- const response = await fetch(url, {
- method: 'POST',
- headers: { 'Content-Type': 'application/json' },
- body: JSON.stringify({ text })
- });
-
- if (!response.ok || !response.body) {
- throw new Error(`HTTP error! status: ${response.status}`);
+ const response = await fetch('http://[your-api-server]/speech?stream=true', {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify({ text })
+ });
+ const mediaSource = new MediaSource();
+ const audio = new Audio(URL.createObjectURL(mediaSource));
+ mediaSource.addEventListener('sourceopen', async () => {
+ const sourceBuffer = mediaSource.addSourceBuffer('audio/wav');
+ const reader = response.body.getReader();
+ while (true) {
+ const { done, value } = await reader.read();
+ if (done) { mediaSource.endOfStream(); break; }
+ sourceBuffer.appendBuffer(value);
+ // Wait for this append to finish before reading the next chunk
+ await new Promise(r => sourceBuffer.addEventListener('updateend', r, { once: true }));
}
+ });
+ audio.play();
+}
+```
- const mediaSource = new MediaSource();
- const audio = new Audio();
- audio.src = URL.createObjectURL(mediaSource);
+#### Example 3: Streaming PCM (Web Audio API)
- mediaSource.addEventListener('sourceopen', async () => {
- const sourceBuffer = mediaSource.addSourceBuffer('audio/wav');
- const reader = response.body.getReader();
-
- while (true) {
- const { done, value } = await reader.read();
- if (done) {
- mediaSource.endOfStream();
- break;
- }
- sourceBuffer.appendBuffer(value);
- }
- });
+```javascript
+async function streamPcmAudio(text) {
+ const response = await fetch('http://[your-api-server]/speech?stream=true&as_wav=false', {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify({ text })
+ });
+ const audioContext = new AudioContext({ sampleRate: 24000 });
+ const reader = response.body.getReader();
+ let currentOffset = 0;
- audio.play();
- console.log("Streaming audio is starting...");
- } catch (error) {
- console.error("Failed to stream speech:", error);
+ function pcmToFloat32(pcm) {
+ const int16 = new Int16Array(pcm.buffer);
+ const float32 = new Float32Array(int16.length);
+ for (let i = 0; i < int16.length; i++) float32[i] = int16[i] / 32768.0;
+ return float32;
+ }
+
+ while (true) {
+ const { done, value } = await reader.read();
+ if (done) break;
+ const data = pcmToFloat32(value);
+ const buffer = audioContext.createBuffer(1, data.length, audioContext.sampleRate);
+ buffer.copyToChannel(data, 0);
+ const source = audioContext.createBufferSource();
+ source.buffer = buffer;
+ source.connect(audioContext.destination);
+ source.start(currentOffset);
+ currentOffset += buffer.duration;
}
}
-
-// Example:
-// streamSpeechAudio("This is an example of a streaming response, which begins playing before the entire audio file is received.");
```
-# Invoking the Speech-to-Text (STT) API Endpoint
+---
-This document explains how a frontend application can interact with the FastAPI `/stt/transcribe` endpoint to transcribe an uploaded audio file into text.
+## 2. Speech-to-Text (STT) API
+
+Converts uploaded audio into text.
+
+### 2.1 Endpoint Details
+
+| Method | Path | Purpose | Content-Type |
+| ------ | ----------------- | ---------------- | ------------------- |
+| POST | `/stt/transcribe` | Transcribe audio | multipart/form-data |
---
-## 1. Endpoint Details
+### 2.2 Request Structure
-* **HTTP Method:** `POST`
-* **Path:** `/stt/transcribe`
-* **Purpose:** Transcribe an uploaded audio file into text.
-* **Content Type:** `multipart/form-data`
+| Field | Type | Description |
+| ----------- | ---- | ------------------------ |
+| audio\_file | File | Audio file to transcribe |
---
-## 2. Request Structure
-
-### 2.1 Request Body
-
-The POST request must include a `multipart/form-data` object with a single file field named `audio_file`.
-
-| Field | Type | Description |
-| ----------- | ---- | -------------------------------- |
-| audio\_file | File | The audio file to be transcribed |
-
----
-
-## 3. Frontend Implementation (JavaScript + HTML)
-
-Below is a complete working example using `fetch` to send the file and display the transcription result.
+### 2.3 Example Frontend (HTML + JS)
```html
+<h1>Speech-to-Text (STT) Transcription</h1>
+
+<!-- Element IDs below are illustrative; adapt them to your page -->
+<form id="sttForm">
+  <input type="file" id="audioFile" accept="audio/*" required />
+  <button type="submit">Transcribe</button>
+</form>
+<p id="status" hidden>Transcribing...</p>
+<p id="result">Your transcribed text will appear here.</p>
+
+<script>
+document.getElementById('sttForm').addEventListener('submit', async (e) => {
+  e.preventDefault();
+  document.getElementById('status').hidden = false;
+  const formData = new FormData();
+  formData.append('audio_file', document.getElementById('audioFile').files[0]);
+  const res = await fetch('http://[your-api-server]/stt/transcribe', { method: 'POST', body: formData });
+  const data = await res.json();
+  document.getElementById('status').hidden = true;
+  document.getElementById('result').textContent = data.transcript;
+});
+</script>
```
-Here’s your Chat Sessions API documentation reformatted for clarity, structure, and consistency:
+---
+
+## 3. Chat Sessions API
+
+Manages conversational sessions with the AI.
+
+### 3.1 Endpoints
+
+| Method | Path | Purpose |
+| ------ | --------------------------------- | --------------------------- |
+| POST | `/sessions/` | Create a new chat session |
+| POST | `/sessions/{session_id}/chat` | Send a message in a session |
+| GET | `/sessions/{session_id}/messages` | Retrieve chat history |
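+
+### 3.2 Example Frontend (JavaScript)
+
+A minimal `fetch` sketch that creates a session, sends a message, and retrieves the chat history. The base URL and IDs are placeholders; field names (`user_id`, `model`, `prompt`, `load_faiss_retriever`, `answer`, `messages`) follow the backend schemas.
+
+```javascript
+const API = 'http://[your-api-server]'; // Replace with your API base URL
+
+async function chatExample() {
+  // 1. Create a new session
+  const sessionRes = await fetch(`${API}/sessions/`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({ user_id: 'user-1234', model: 'gemini' })
+  });
+  const session = await sessionRes.json();
+
+  // 2. Send a message in the session
+  const chatRes = await fetch(`${API}/sessions/${session.id}/chat`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({
+      prompt: 'What is the capital of France?',
+      model: 'gemini',
+      load_faiss_retriever: false
+    })
+  });
+  const { answer } = await chatRes.json();
+  console.log(answer);
+
+  // 3. Retrieve the chat history
+  const historyRes = await fetch(`${API}/sessions/${session.id}/messages`);
+  const { messages } = await historyRes.json();
+  console.log(messages);
+}
+```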
---
-# Invoking the Chat Sessions API Endpoint
+## 4. Documents API
-This document describes how a frontend application can interact with the FastAPI `/sessions` endpoints. These endpoints allow you to:
+Add, list, and delete documents.
-* Create new chat sessions
-* Send messages within a session
-* Retrieve chat history
+### 4.1 Endpoints
+
+| Method | Path | Purpose |
+| ------ | -------------------------- | ------------------ |
+| POST | `/documents/` | Add a document |
+| GET | `/documents/` | List all documents |
+| DELETE | `/documents/{document_id}` | Delete a document |
---
-## 1. Endpoint Details
-
-| HTTP Method | Path | Purpose | Request Type |
-| ----------- | --------------------------------- | ------------------------------------------------------------- | ------------------ |
-| **POST** | `/sessions/` | Creates a new chat session | `application/json` |
-| **POST** | `/sessions/{session_id}/chat` | Sends a message and receives a response in a specific session | `application/json` |
-| **GET** | `/sessions/{session_id}/messages` | Retrieves the message history for a given session | N/A |
-
----
-
-## 2. Request & Response Structures
-
-### 2.1 Create a New Chat Session
-
-**POST** `/sessions/`
-
-**Request Body:**
-
-| Field | Type | Description |
-| -------- | ------ | ----------------------------------- |
-| user\_id | string | ID of the user creating the session |
-| model | string | Model to use for the session |
-
-**Example Request:**
-
-```json
-{
- "user_id": "user-1234",
- "model": "gemini"
-}
-```
-
-**Response Body:**
-
-| Field | Type | Description |
-| ----------- | ------- | -------------------------- |
-| id | integer | Session ID |
-| user\_id | string | User ID |
-| created\_at | string | Session creation timestamp |
-| model | string | Model used |
-
----
-
-### 2.2 Send a Message in a Session
-
-**POST** `/sessions/{session_id}/chat`
-
-**Path Parameter:**
-
-| Name | Type | Description |
-| ----------- | ------- | ----------------- |
-| session\_id | integer | Unique session ID |
-
-**Request Body:**
-
-| Field | Type | Description |
-| ---------------------- | ------- | ----------------------------------------------------- |
-| prompt | string | User message |
-| model | string | Model for this message (can override session default) |
-| load\_faiss\_retriever | boolean | Whether to use FAISS retriever |
-
-**Example Request:**
-
-```json
-{
- "prompt": "What is the capital of France?",
- "model": "gemini",
- "load_faiss_retriever": false
-}
-```
-
-**Response Body:**
-
-| Field | Type | Description |
-| ----------- | ------ | --------------------------- |
-| answer | string | Model's answer |
-| model\_used | string | Model used for the response |
-
----
-
-### 2.3 Get Session Chat History
-
-**GET** `/sessions/{session_id}/messages`
-
-**Path Parameter:**
-
-| Name | Type | Description |
-| ----------- | ------- | ----------------- |
-| session\_id | integer | Unique session ID |
-
-**Response Body:**
-
-| Field | Type | Description |
-| ----------- | ------- | -------------------------------------------------------- |
-| session\_id | integer | Session ID |
-| messages | array | List of message objects (`role`, `content`, `timestamp`) |
-
----
-
-## 3. Frontend Implementation (HTML + JavaScript)
-
-Below is a complete example that:
-
-1. Creates a new chat session
-2. Sends a message in the session
-3. Retrieves the chat history
+### 4.2 Example Frontend (HTML + JS)
```html
+<h1>Documents API Example</h1>
+
+<!-- Element IDs below are illustrative; adapt them to your page -->
+<h2>Add a New Document</h2>
+<form id="addDoc">
+  <input type="text" id="title" placeholder="Title" required />
+  <textarea id="content" placeholder="Content" required></textarea>
+  <button type="submit">Add</button>
+</form>
+
+<h2>Documents List</h2>
+<ul id="docs"></ul>
+
+<script>
+const API = 'http://[your-api-server]'; // Replace with your API base URL
-# **Invoking the Documents API Endpoint**
-
-This guide explains how a frontend application can interact with the FastAPI `/documents` endpoints.
-These endpoints allow you to **add**, **list**, and **delete** documents.
-
----
-
-## **Endpoint Summary**
-
-| HTTP Method | Path | Purpose | Request Type |
-| ----------- | -------------------------- | ---------------------------------- | ------------------ |
-| **POST** | `/documents/` | Adds a new document. | `application/json` |
-| **GET** | `/documents/` | Lists all documents. | N/A |
-| **DELETE** | `/documents/{document_id}` | Deletes a specific document by ID. | N/A |
-
----
-
-## **Request & Response Structures**
-
-### **1. Add a New Document**
-
-**POST** `/documents/`
-
-**Request Body** (JSON):
-
-* `title` *(string)* – The title of the document.
-* `content` *(string)* – The content of the document.
-
-**Example Request:**
-
-```json
-{
- "title": "My First Document",
- "content": "This is the content of my very first document."
+async function fetchDocs() {
+ const res = await fetch(`${API}/documents/`);
+ const data = await res.json();
+ document.getElementById('docs').innerHTML = data.documents.map(doc =>
+    // <li> markup is illustrative; the Delete button calls delDoc(id) defined below
+    `<li>${doc.title} <button onclick="delDoc(${doc.id})">Delete</button></li>`
+ ).join('');
}
+
+document.getElementById('addDoc').onsubmit = async e => {
+ e.preventDefault();
+ const title = document.getElementById('title').value;
+ const content = document.getElementById('content').value;
+ await fetch(`${API}/documents/`, {
+ method: 'POST', headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify({ title, content })
+ });
+ fetchDocs();
+};
+
+async function delDoc(id) {
+ await fetch(`${API}/documents/${id}`, { method: 'DELETE' });
+ fetchDocs();
+}
+
+fetchDocs();
+</script>
```
-
-**Response Body**:
-
-* `message` *(string)* – Success message.
-
----
-
-### **2. List All Documents**
-
-**GET** `/documents/`
-
-**Request Body:** None.
-
-**Response Body**:
-
-* `documents` *(array)* – List of documents. Each object contains:
-
- * `id` *(integer)*
- * `title` *(string)*
- * `content` *(string)*
- * `created_at` *(timestamp)*
-
----
-
-### **3. Delete a Document**
-
-**DELETE** `/documents/{document_id}`
-
-**Path Parameters:**
-
-* `document_id` *(integer)* – Unique ID of the document to be deleted.
-
-**Response Body**:
-
-* `message` *(string)* – Success message.
-* `document_id` *(integer)* – ID of the deleted document.
-
----
-
-## **Frontend Implementation (JavaScript Example)**
-
-Below is a complete HTML + JavaScript example showing how to **add**, **list**, and **delete** documents using the API.
-
-```html
-Documents API Example
-Add a New Document
-Documents List
-Log
-```
\ No newline at end of file
diff --git a/ai-hub/app/api/routes/tts.py b/ai-hub/app/api/routes/tts.py
index 557924a..e305ee2 100644
--- a/ai-hub/app/api/routes/tts.py
+++ b/ai-hub/app/api/routes/tts.py
@@ -10,22 +10,33 @@
@router.post(
"",
summary="Generate speech from text",
- response_description="Audio bytes in WAV format, either as a complete file or a stream.",
+ response_description="Audio bytes in WAV or PCM format, either as a complete file or a stream.",
)
async def create_speech_response(
request: schemas.SpeechRequest,
stream: bool = Query(
False,
description="If true, returns a streamed audio response. Otherwise, returns a complete file."
+ ),
+ as_wav: bool = Query(
+ True,
+ description="If true, returns WAV format audio. If false, returns raw PCM audio data. Only applies when stream is true."
)
):
try:
if stream:
+ # Pass the new as_wav parameter to the streaming function
audio_stream_generator: AsyncGenerator[bytes, None] = services.tts_service.create_speech_stream(
- text=request.text
+ text=request.text,
+ as_wav=as_wav
)
- return StreamingResponse(audio_stream_generator, media_type="audio/wav")
+
+ # Dynamically set the media_type based on the as_wav flag
+ media_type = "audio/wav" if as_wav else "audio/pcm"
+
+ return StreamingResponse(audio_stream_generator, media_type=media_type)
else:
+ # The non-streaming function only returns WAV, so this part remains the same
audio_bytes = await services.tts_service.create_speech_non_stream(
text=request.text
)
diff --git a/ai-hub/app/app.py b/ai-hub/app/app.py
index 150a1ac..4afaa3f 100644
--- a/ai-hub/app/app.py
+++ b/ai-hub/app/app.py
@@ -83,6 +83,7 @@
tts_provider = get_tts_provider(
provider_name=settings.TTS_PROVIDER,
api_key=settings.TTS_API_KEY,
+ model_name=settings.TTS_MODEL_NAME,
voice_name=settings.TTS_VOICE_NAME
)
diff --git a/ai-hub/app/config.py b/ai-hub/app/config.py
index 858c8ad..390c990 100644
--- a/ai-hub/app/config.py
+++ b/ai-hub/app/config.py
@@ -18,11 +18,12 @@
class TTSProvider(str, Enum):
"""An enum for supported Text-to-Speech (TTS) providers."""
GOOGLE_GEMINI = "google_gemini"
+ GCLOUD_TTS = "gcloud_tts" # NEW: Add Google Cloud TTS as a supported provider
class STTProvider(str, Enum):
"""An enum for supported Speech-to-Text (STT) providers."""
GOOGLE_GEMINI = "google_gemini"
- OPENAI = "openai" # NEW: Add OpenAI as a supported provider
+ OPENAI = "openai"
class ApplicationSettings(BaseModel):
project_name: str = "Cortex Hub"
@@ -45,6 +46,7 @@
class TTSProviderSettings(BaseModel):
provider: TTSProvider = Field(default=TTSProvider.GOOGLE_GEMINI)
+ # Default voice and model for the Gemini TTS provider
voice_name: str = "Kore"
model_name: str = "gemini-2.5-flash-preview-tts"
api_key: Optional[SecretStr] = None
@@ -53,9 +55,6 @@
provider: STTProvider = Field(default=STTProvider.GOOGLE_GEMINI)
model_name: str = "gemini-2.5-flash"
api_key: Optional[SecretStr] = None
- # NOTE: OpenAI provider requires a different model name (e.g., 'whisper-1')
- # but we will handle this dynamically or through configuration.
- # The BaseModel is for schema validation, not for provider-specific logic.
class VectorStoreSettings(BaseModel):
index_path: str = "data/faiss_index.bin"
@@ -125,22 +124,22 @@
# --- API Keys & Models ---
self.DEEPSEEK_API_KEY: Optional[str] = os.getenv("DEEPSEEK_API_KEY")
self.GEMINI_API_KEY: Optional[str] = os.getenv("GEMINI_API_KEY")
- self.OPENAI_API_KEY: Optional[str] = os.getenv("OPENAI_API_KEY") # NEW: Add dedicated OpenAI API key
+ self.OPENAI_API_KEY: Optional[str] = os.getenv("OPENAI_API_KEY")
self.DEEPSEEK_MODEL_NAME: str = os.getenv("DEEPSEEK_MODEL_NAME") or \
- get_from_yaml(["llm_providers", "deepseek_model_name"]) or \
- config_from_pydantic.llm_providers.deepseek_model_name
+ get_from_yaml(["llm_providers", "deepseek_model_name"]) or \
+ config_from_pydantic.llm_providers.deepseek_model_name
self.GEMINI_MODEL_NAME: str = os.getenv("GEMINI_MODEL_NAME") or \
- get_from_yaml(["llm_providers", "gemini_model_name"]) or \
- config_from_pydantic.llm_providers.gemini_model_name
+ get_from_yaml(["llm_providers", "gemini_model_name"]) or \
+ config_from_pydantic.llm_providers.gemini_model_name
# --- Vector Store Settings ---
self.FAISS_INDEX_PATH: str = os.getenv("FAISS_INDEX_PATH") or \
- get_from_yaml(["vector_store", "index_path"]) or \
- config_from_pydantic.vector_store.index_path
+ get_from_yaml(["vector_store", "index_path"]) or \
+ config_from_pydantic.vector_store.index_path
dimension_str = os.getenv("EMBEDDING_DIMENSION") or \
- get_from_yaml(["vector_store", "embedding_dimension"]) or \
- config_from_pydantic.vector_store.embedding_dimension
+ get_from_yaml(["vector_store", "embedding_dimension"]) or \
+ config_from_pydantic.vector_store.embedding_dimension
self.EMBEDDING_DIMENSION: int = int(dimension_str)
# --- Embedding Provider Settings ---
@@ -149,15 +148,15 @@
embedding_provider_env = embedding_provider_env.lower()
self.EMBEDDING_PROVIDER: EmbeddingProvider = EmbeddingProvider(embedding_provider_env or \
- get_from_yaml(["embedding_provider", "provider"]) or \
- config_from_pydantic.embedding_provider.provider)
+ get_from_yaml(["embedding_provider", "provider"]) or \
+ config_from_pydantic.embedding_provider.provider)
self.EMBEDDING_MODEL_NAME: str = os.getenv("EMBEDDING_MODEL_NAME") or \
- get_from_yaml(["embedding_provider", "model_name"]) or \
- config_from_pydantic.embedding_provider.model_name
+ get_from_yaml(["embedding_provider", "model_name"]) or \
+ config_from_pydantic.embedding_provider.model_name
self.EMBEDDING_API_KEY: Optional[str] = os.getenv("EMBEDDING_API_KEY") or \
- get_from_yaml(["embedding_provider", "api_key"]) or \
- self.GEMINI_API_KEY
+ get_from_yaml(["embedding_provider", "api_key"]) or \
+ self.GEMINI_API_KEY
# --- TTS Provider Settings ---
tts_provider_env = os.getenv("TTS_PROVIDER")
@@ -168,17 +167,23 @@
get_from_yaml(["tts_provider", "provider"]) or \
config_from_pydantic.tts_provider.provider)
self.TTS_VOICE_NAME: str = os.getenv("TTS_VOICE_NAME") or \
- get_from_yaml(["tts_provider", "voice_name"]) or \
- config_from_pydantic.tts_provider.voice_name
+ get_from_yaml(["tts_provider", "voice_name"]) or \
+ config_from_pydantic.tts_provider.voice_name
+
self.TTS_MODEL_NAME: str = os.getenv("TTS_MODEL_NAME") or \
- get_from_yaml(["tts_provider", "model_name"]) or \
- config_from_pydantic.tts_provider.model_name
+ get_from_yaml(["tts_provider", "model_name"]) or \
+ config_from_pydantic.tts_provider.model_name
+
+ # API Key logic for TTS
+ tts_api_key_env = os.getenv("TTS_API_KEY") or get_from_yaml(["tts_provider", "api_key"])
- self.TTS_API_KEY: Optional[str] = os.getenv("TTS_API_KEY") or \
- get_from_yaml(["tts_provider", "api_key"]) or \
- self.GEMINI_API_KEY
+ if tts_api_key_env:
+ self.TTS_API_KEY: Optional[str] = tts_api_key_env
+ else:
+ # If no specific TTS key is set, use the Gemini key as a fallback
+ self.TTS_API_KEY: Optional[str] = self.GEMINI_API_KEY
- # --- NEW STT Provider Settings ---
+ # --- STT Provider Settings ---
stt_provider_env = os.getenv("STT_PROVIDER")
if stt_provider_env:
stt_provider_env = stt_provider_env.lower()
@@ -187,11 +192,10 @@
get_from_yaml(["stt_provider", "provider"]) or \
config_from_pydantic.stt_provider.provider)
self.STT_MODEL_NAME: str = os.getenv("STT_MODEL_NAME") or \
- get_from_yaml(["stt_provider", "model_name"]) or \
- config_from_pydantic.stt_provider.model_name
+ get_from_yaml(["stt_provider", "model_name"]) or \
+ config_from_pydantic.stt_provider.model_name
# Logic for STT_API_KEY: Prioritize a dedicated STT_API_KEY.
- # Fallback to OPENAI_API_KEY if the provider is OpenAI, otherwise use GEMINI_API_KEY.
explicit_stt_api_key = os.getenv("STT_API_KEY") or get_from_yaml(["stt_provider", "api_key"])
if explicit_stt_api_key:
diff --git a/ai-hub/app/config.yaml b/ai-hub/app/config.yaml
index 99ec019..0f9028c 100644
--- a/ai-hub/app/config.yaml
+++ b/ai-hub/app/config.yaml
@@ -34,11 +34,12 @@
tts_provider:
# The provider for the TTS service.
- provider: "google_gemini"
+ # Check more at https://cloud.google.com/text-to-speech
+ provider: "gcloud_tts"
# The name of the voice to use for TTS.
- voice_name: "Zephyr"
+ voice_name: "en-US-Chirp3-HD-Achernar"
# The model name for the TTS service.
- model_name: "gemini-2.5-flash-preview-tts"
+ model_name: "gemini-2.5-pro-preview-tts"
# The provider for the Speech-to-Text (STT) service.
stt_provider:
diff --git a/ai-hub/app/core/providers/factory.py b/ai-hub/app/core/providers/factory.py
index 6cfbed0..725f871 100644
--- a/ai-hub/app/core/providers/factory.py
+++ b/ai-hub/app/core/providers/factory.py
@@ -3,6 +3,7 @@
from .llm.deepseek import DeepSeekProvider
from .llm.gemini import GeminiProvider
from .tts.gemini import GeminiTTSProvider
+from .tts.gcloud_tts import GCloudTTSProvider
from .stt.gemini import GoogleSTTProvider
from openai import AsyncOpenAI
@@ -24,10 +25,12 @@
raise ValueError(f"Unsupported model provider: '{model_name}'. Supported providers are: {list(_llm_providers.keys())}")
return provider
-def get_tts_provider(provider_name: str, api_key: str, voice_name: str) -> TTSProvider:
+def get_tts_provider(provider_name: str, api_key: str, model_name: str, voice_name: str) -> TTSProvider:
if provider_name == "google_gemini":
- return GeminiTTSProvider(api_key=api_key, voice_name = voice_name)
- raise ValueError(f"Unsupported TTS provider: '{provider_name}'. Supported providers are: ['google_gemini']")
+ return GeminiTTSProvider(api_key=api_key, model_name=model_name, voice_name=voice_name)
+ elif provider_name == "gcloud_tts":
+ return GCloudTTSProvider(api_key=api_key, voice_name=voice_name)
+ raise ValueError(f"Unsupported TTS provider: '{provider_name}'. Supported providers are: ['google_gemini', 'gcloud_tts']")
def get_stt_provider(provider_name: str, api_key: str, model_name: str) -> STTProvider:
if provider_name == "google_gemini":
diff --git a/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_bearer.sh b/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_bearer.sh
new file mode 100644
index 0000000..d753b84
--- /dev/null
+++ b/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_bearer.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Bearer token (best to set this as an environment variable)
+BEARER_TOKEN=""
+
+# Google TTS API endpoint
+TTS_API_URL="https://texttospeech.googleapis.com/v1/text:synthesize"
+
+# Request payload (example values; adjust text, voice, and encoding as needed)
+read -r -d '' PAYLOAD <<'EOF'
+{"input": {"text": "Hello from the Cloud TTS API."},
+ "voice": {"languageCode": "en-US", "name": "en-US-Wavenet-D"},
+ "audioConfig": {"audioEncoding": "MP3"}}
+EOF
+
+# Synthesize and decode the base64 audioContent to an MP3 file
+curl -s -X POST "$TTS_API_URL" \
+  -H "Authorization: Bearer $BEARER_TOKEN" \
+  -H "Content-Type: application/json" \
+  -d "$PAYLOAD" | jq -r '.audioContent' | base64 --decode > output.mp3
+
+echo "✅ MP3 file saved to: output.mp3"
diff --git a/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_key.sh b/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_key.sh
new file mode 100644
index 0000000..b62fc65
--- /dev/null
+++ b/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_key.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# Your Google Cloud API key
+API_KEY=""
+
+# Google TTS API endpoint with API key as query param
+TTS_API_URL="https://texttospeech.googleapis.com/v1/text:synthesize?key=$API_KEY"
+
+# Request payload (example values; adjust text, voice, and encoding as needed)
+read -r -d '' PAYLOAD <<'EOF'
+{"input": {"text": "Hello from the Cloud TTS API."},
+ "voice": {"languageCode": "en-US", "name": "en-US-Wavenet-D"},
+ "audioConfig": {"audioEncoding": "MP3"}}
+EOF
+
+# Synthesize and decode the base64 audioContent to an MP3 file
+curl -s -X POST "$TTS_API_URL" \
+  -H "Content-Type: application/json" \
+  -d "$PAYLOAD" | jq -r '.audioContent' | base64 --decode > output.mp3
+
+echo "✅ MP3 file saved to: output.mp3"
diff --git a/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_key_pcm.sh b/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_key_pcm.sh
new file mode 100644
index 0000000..5149058
--- /dev/null
+++ b/ai-hub/app/core/providers/tts/_debug/test_google_tts_script_key_pcm.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# Your Google Cloud API key
+API_KEY=""
+
+# Google TTS API endpoint with API key as query param
+TTS_API_URL="https://texttospeech.googleapis.com/v1/text:synthesize?key=$API_KEY"
+
+# Request payload (set audioEncoding to LINEAR16 for PCM)
+read -r -d '' PAYLOAD <<'EOF'
+{"input": {"text": "Hello from the Cloud TTS API."},
+ "voice": {"languageCode": "en-US", "name": "en-US-Wavenet-D"},
+ "audioConfig": {"audioEncoding": "LINEAR16"}}
+EOF
+
+# Synthesize and decode the base64 audioContent to a PCM file
+curl -s -X POST "$TTS_API_URL" \
+  -H "Content-Type: application/json" \
+  -d "$PAYLOAD" | jq -r '.audioContent' | base64 --decode > output.pcm
+
+echo "✅ PCM audio saved to: output.pcm"
diff --git a/ai-hub/app/core/providers/tts/gcloud_tts.py b/ai-hub/app/core/providers/tts/gcloud_tts.py
new file mode 100644
index 0000000..bfdb1b1
--- /dev/null
+++ b/ai-hub/app/core/providers/tts/gcloud_tts.py
@@ -0,0 +1,86 @@
+import os
+import aiohttp
+import asyncio
+import base64
+import logging
+from typing import AsyncGenerator
+from app.core.providers.base import TTSProvider
+from aiohttp import ClientResponseError
+from fastapi import HTTPException
+
+# Configure logging
+logger = logging.getLogger(__name__)
+
+# New concrete class for the Google Cloud Text-to-Speech API
+class GCloudTTSProvider(TTSProvider):
+ # This provider uses Google's dedicated TTS API. The voices are different from Gemini.
+ # Here is a small, representative list of available WaveNet voices.
+ # The full list is much larger and can be found in the official documentation.
+ AVAILABLE_VOICES = [
+ "en-US-Wavenet-A", "en-US-Wavenet-B", "en-US-Wavenet-C", "en-US-Wavenet-D",
+ "en-US-Wavenet-E", "en-US-Wavenet-F", "en-US-Wavenet-G", "en-US-Wavenet-H"
+ ]
+
+ def __init__(self, api_key: str, voice_name: str = "en-US-Wavenet-D"):
+ if voice_name not in self.AVAILABLE_VOICES:
+ raise ValueError(f"Invalid voice name: {voice_name}. Choose from {self.AVAILABLE_VOICES}")
+
+ self.api_key = api_key
+ # The new API URL for the Cloud Text-to-Speech service
+ self.api_url = f"https://texttospeech.googleapis.com/v1/text:synthesize?key={self.api_key}"
+ self.voice_name = voice_name
+ logger.debug(f"Initialized GCloudTTSProvider with voice: {self.voice_name}")
+
+ async def generate_speech(self, text: str) -> bytes:
+ logger.debug(f"Starting speech generation for text: '{text[:50]}...'")
+
+ headers = {
+ "Content-Type": "application/json"
+ }
+ json_data = {
+ "input": {
+ "text": text
+ },
+ "voice": {
+ "languageCode": "en-US",
+ "name": self.voice_name
+ },
+ "audioConfig": {
+ "audioEncoding": "LINEAR16"
+ }
+ }
+
+ logger.debug(f"API Request URL: {self.api_url}")
+ logger.debug(f"Request Payload: {json_data}")
+
+ try:
+ async with aiohttp.ClientSession() as session:
+ async with session.post(self.api_url, headers=headers, json=json_data) as response:
+ logger.debug(f"Received API response with status code: {response.status}")
+ response.raise_for_status()
+
+ response_json = await response.json()
+ logger.debug("Successfully parsed API response JSON.")
+
+ # The audio data is now under the 'audioContent' key
+ audio_base64 = response_json.get('audioContent')
+ if not audio_base64:
+ raise KeyError("audioContent key not found in the response.")
+
+ audio_bytes = base64.b64decode(audio_base64)
+ logger.debug(f"Decoded audio data, size: {len(audio_bytes)} bytes.")
+
+ return audio_bytes
+ except ClientResponseError as e:
+ if e.status == 429:
+ logger.error("Rate limit exceeded on Cloud TTS API.")
+ raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.")
+ else:
+ logger.error(f"Aiohttp client error occurred: {e}")
+ raise HTTPException(status_code=500, detail=f"API request failed: {e}")
+ except KeyError as e:
+ logger.error(f"Key error in API response: {e}. Full response: {await response.json()}")
+ raise HTTPException(status_code=500, detail="Malformed API response from Cloud TTS.")
+ except Exception as e:
+ logger.error(f"An unexpected error occurred during speech generation: {e}")
+ raise HTTPException(status_code=500, detail=f"Failed to generate speech: {e}")
\ No newline at end of file
diff --git a/ai-hub/app/core/providers/tts/gemini.py b/ai-hub/app/core/providers/tts/gemini.py
index 264557c..89d59e1 100644
--- a/ai-hub/app/core/providers/tts/gemini.py
+++ b/ai-hub/app/core/providers/tts/gemini.py
@@ -5,6 +5,9 @@
import logging
from typing import AsyncGenerator
from app.core.providers.base import TTSProvider
+from aiohttp import ClientResponseError
+from fastapi import HTTPException
+
# Configure logging
logger = logging.getLogger(__name__)
@@ -79,9 +82,13 @@
logger.debug(f"Decoded audio data, size: {len(audio_bytes)} bytes.")
return audio_bytes
- except aiohttp.ClientError as e:
- logger.error(f"Aiohttp client error occurred: {e}")
- raise HTTPException(status_code=500, detail=f"API request failed: {e}")
+ except ClientResponseError as e:
+ if e.status == 429:
+ logger.error("Rate limit exceeded on Gemini TTS API.")
+ raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.")
+ else:
+ logger.error(f"Aiohttp client error occurred: {e}")
+ raise HTTPException(status_code=500, detail=f"API request failed: {e}")
except KeyError as e:
logger.error(f"Key error in API response: {e}. Full response: {response_json}")
raise HTTPException(status_code=500, detail="Malformed API response from Gemini.")
diff --git a/ai-hub/app/core/services/tts.py b/ai-hub/app/core/services/tts.py
index 63298e4..d658e6c 100644
--- a/ai-hub/app/core/services/tts.py
+++ b/ai-hub/app/core/services/tts.py
@@ -10,6 +10,19 @@
# --- Configure logging ---
logger = logging.getLogger(__name__)
+# --- Helper Functions ---
+def _create_wav_file(pcm_data: bytes) -> bytes:
+ """
+ Wraps raw 16-bit PCM audio data in a WAV header.
+ """
+ with io.BytesIO() as wav_buffer:
+ with wave.open(wav_buffer, 'wb') as wav_file:
+ wav_file.setnchannels(1)
+ wav_file.setsampwidth(2)
+ wav_file.setframerate(24000)
+ wav_file.writeframes(pcm_data)
+ return wav_buffer.getvalue()
+
# --- Define TTS Service Class ---
class TTSService:
"""
@@ -18,37 +31,25 @@
audio generation, splitting text into manageable chunks.
"""
- # Use an environment variable or a default value for the max chunk size
- MAX_CHUNK_SIZE = int(os.getenv("TTS_MAX_CHUNK_SIZE", 200))
+ MAX_CHUNK_SIZE = int(os.getenv("TTS_MAX_CHUNK_SIZE", 600))
def __init__(self, tts_provider: TTSProvider):
- """
- Initializes the TTSService with a concrete TTS provider.
- """
self.tts_provider = tts_provider
async def _split_text_into_chunks(self, text: str) -> list[str]:
- """
- Splits the input text into chunks based on a maximum size and
- period delimiters, ensuring no chunk exceeds the limit.
- """
chunks = []
current_chunk = ""
- # Use a list of punctuation to split sentences more effectively
separators = ['.', '?', '!', '\n']
sentences = []
- # Split text by multiple delimiters
for separator in separators:
text = text.replace(separator, f"{separator}|")
sentences_with_empty = [s.strip() for s in text.split('|') if s.strip()]
- # Re-join sentences with their delimiters, so we don't lose them
for sentence in sentences_with_empty:
sentences.append(sentence)
for sentence in sentences:
- # Add the sentence and check if it exceeds the chunk size.
if len(current_chunk) + len(sentence) + 1 > self.MAX_CHUNK_SIZE and current_chunk:
chunks.append(current_chunk.strip())
current_chunk = sentence + " "
@@ -61,32 +62,14 @@
logger.debug(f"Split text into {len(chunks)} chunks.")
return chunks
- async def create_speech_stream(self, text: str) -> AsyncGenerator[bytes, None]:
- """
- Generates a stream of complete, playable WAV files for each text chunk.
- This provides a streaming-like experience even with a non-streaming backend
- by sending each chunk as soon as it is generated.
- """
+ async def _generate_pcm_chunks(self, text: str) -> AsyncGenerator[bytes, None]:
chunks = await self._split_text_into_chunks(text)
-
+
for i, chunk in enumerate(chunks):
- logger.info(f"Processing chunk {i+1}/{len(chunks)} for streaming...")
-
+ logger.info(f"Generating PCM for chunk {i+1}/{len(chunks)}: '{chunk[:30]}...'")
try:
- # Get the raw PCM audio data for this chunk
pcm_data = await self.tts_provider.generate_speech(chunk)
-
- # Wrap the PCM data in a WAV header to make it a playable file
- with io.BytesIO() as wav_buffer:
- with wave.open(wav_buffer, 'wb') as wav_file:
- wav_file.setnchannels(1)
- wav_file.setsampwidth(2)
- wav_file.setframerate(24000)
- wav_file.writeframes(pcm_data)
-
- # Yield a complete, playable WAV file for the chunk
- yield wav_buffer.getvalue()
-
+ yield pcm_data
except Exception as e:
logger.error(f"Error processing chunk {i+1}: {e}")
raise HTTPException(
@@ -94,22 +77,36 @@
detail=f"Error generating speech for chunk {i+1}: {e}"
) from e
+ async def create_speech_stream(self, text: str, as_wav: bool = True) -> AsyncGenerator[bytes, None]:
+ async for pcm_data in self._generate_pcm_chunks(text):
+ if as_wav:
+ yield _create_wav_file(pcm_data)
+ else:
+ yield pcm_data
+
async def create_speech_non_stream(self, text: str) -> bytes:
- """
- Generates a complete audio file from the given text, splitting it
- into chunks and concatenating the audio into a single WAV file.
- All chunks are processed concurrently for speed.
- """
chunks = await self._split_text_into_chunks(text)
-
- all_pcm_data = []
-
- # Create a list of tasks for each chunk to run them concurrently.
- tasks = [self.tts_provider.generate_speech(chunk) for chunk in chunks]
-
+ semaphore = asyncio.Semaphore(3) # Limit concurrency to 3 requests
+
+ async def generate_with_limit(chunk):
+ retries = 3
+ delay = 1
+ async with semaphore:
+ for attempt in range(retries):
+ try:
+ return await self.tts_provider.generate_speech(chunk)
+ except HTTPException as e:
+ if e.status_code == 429:
+ logger.warning(f"429 Too Many Requests for chunk, retrying in {delay}s (attempt {attempt+1}/{retries})...")
+ await asyncio.sleep(delay)
+ delay *= 2 # exponential backoff
+ else:
+ raise
+ raise HTTPException(status_code=429, detail="Too many requests after retries.")
+
+ tasks = [generate_with_limit(chunk) for chunk in chunks]
+
try:
- # Gather the results from all tasks. This will run all API calls
- # to the TTS provider concurrently.
all_pcm_data = await asyncio.gather(*tasks)
logger.info(f"Successfully gathered audio data for all {len(chunks)} chunks.")
except Exception as e:
@@ -123,17 +120,7 @@
logger.warning("No audio data was generated.")
raise HTTPException(status_code=500, detail="No audio data was generated from the TTS provider.")
- # Concatenate all the raw PCM data into a single stream
concatenated_pcm = b''.join(all_pcm_data)
logger.info(f"Concatenated {len(chunks)} chunks into a single PCM stream.")
- # Wrap the complete PCM stream in a single WAV container
- with io.BytesIO() as wav_buffer:
- with wave.open(wav_buffer, 'wb') as wav_file:
- wav_file.setnchannels(1)
- wav_file.setsampwidth(2)
- # The Gemini API returns 24kHz audio, adjust if using a different provider
- wav_file.setframerate(24000)
- wav_file.writeframes(concatenated_pcm)
-
- return wav_buffer.getvalue()
\ No newline at end of file
+ return _create_wav_file(concatenated_pcm)
diff --git a/ai-hub/run_integration_tests.sh b/ai-hub/run_integration_tests.sh
index 46489d2..e0f94d5 100644
--- a/ai-hub/run_integration_tests.sh
+++ b/ai-hub/run_integration_tests.sh
@@ -7,15 +7,15 @@
# You can define aliases for your test file paths here.
TEST_SUITES=(
"All tests"
- "integration_tests/test_sessions.py"
- "integration_tests/test_documents.py"
- "integration_tests/test_misc.py"
+ "integration_tests/test_sessions_api.py"
+ "integration_tests/test_documents_api.py"
+ "integration_tests/test_misc_api.py"
)
TEST_PATHS=(
"integration_tests/"
- "integration_tests/test_sessions.py"
- "integration_tests/test_documents.py"
- "integration_tests/test_misc.py"
+ "integration_tests/test_sessions_api.py"
+ "integration_tests/test_documents_api.py"
+ "integration_tests/test_misc_api.py"
)
export DB_MODE=sqlite
diff --git a/ai-hub/tests/api/routes/test_tts.py b/ai-hub/tests/api/routes/test_tts.py
index cd4f14e..0eb2bbb 100644
--- a/ai-hub/tests/api/routes/test_tts.py
+++ b/ai-hub/tests/api/routes/test_tts.py
@@ -19,20 +19,20 @@
mock_services.tts_service.create_speech_non_stream.assert_called_once_with(text="Hello, this is a test")
@pytest.mark.asyncio
-async def test_create_speech_stream_response(async_client):
- """Test the /speech endpoint with stream=true returns a streaming response."""
+async def test_create_speech_stream_wav_response(async_client):
+ """Test the /speech endpoint with stream=true and as_wav=true returns a streamed WAV response."""
test_client, mock_services = await anext(async_client)
mock_audio_bytes_chunks = [b"chunk1", b"chunk2", b"chunk3"]
- # This async generator mock correctly simulates the streaming service
async def mock_async_generator():
for chunk in mock_audio_bytes_chunks:
yield chunk
- # We mock `create_speech_stream` with a MagicMock returning the async generator
+ # Mock `create_speech_stream` with a MagicMock returning the async generator
mock_services.tts_service.create_speech_stream = MagicMock(return_value=mock_async_generator())
- response = await test_client.post("/speech?stream=true", json={"text": "Hello, this is a test"})
+ # Explicitly set stream=true and as_wav=true
+ response = await test_client.post("/speech?stream=true&as_wav=true", json={"text": "Hello, this is a test"})
assert response.status_code == 200
assert response.headers["content-type"] == "audio/wav"
@@ -43,4 +43,29 @@
streamed_content += chunk
assert streamed_content == b"".join(mock_audio_bytes_chunks)
- mock_services.tts_service.create_speech_stream.assert_called_once_with(text="Hello, this is a test")
\ No newline at end of file
+ mock_services.tts_service.create_speech_stream.assert_called_once_with(text="Hello, this is a test", as_wav=True)
+
+@pytest.mark.asyncio
+async def test_create_speech_stream_pcm_response(async_client):
+ """Test the /speech endpoint with stream=true and as_wav=false returns a streamed PCM response."""
+ test_client, mock_services = await anext(async_client)
+ mock_audio_bytes_chunks = [b"pcm_chunk1", b"pcm_chunk2", b"pcm_chunk3"]
+
+ async def mock_async_generator():
+ for chunk in mock_audio_bytes_chunks:
+ yield chunk
+
+ mock_services.tts_service.create_speech_stream = MagicMock(return_value=mock_async_generator())
+
+ # Set stream=true and as_wav=false
+ response = await test_client.post("/speech?stream=true&as_wav=false", json={"text": "Hello, this is a test"})
+
+ assert response.status_code == 200
+ assert response.headers["content-type"] == "audio/pcm"
+
+ streamed_content = b""
+ async for chunk in response.aiter_bytes():
+ streamed_content += chunk
+
+ assert streamed_content == b"".join(mock_audio_bytes_chunks)
+ mock_services.tts_service.create_speech_stream.assert_called_once_with(text="Hello, this is a test", as_wav=False)
\ No newline at end of file
diff --git a/ai-hub/tests/core/providers/test_factory.py b/ai-hub/tests/core/providers/test_factory.py
index 3d48c03..e00e71b 100644
--- a/ai-hub/tests/core/providers/test_factory.py
+++ b/ai-hub/tests/core/providers/test_factory.py
@@ -27,11 +27,11 @@
def test_get_tts_provider_returns_gemini_tts_provider():
"""Tests that the factory returns a GeminiTTSProvider instance for 'google_gemini'."""
- # Use a valid voice from AVAILABLE_VOICES to avoid ValueError
valid_voice = GeminiTTSProvider.AVAILABLE_VOICES[0]
provider = get_tts_provider(
"google_gemini",
api_key="dummy_key",
+ model_name="dummy-model",
voice_name=valid_voice
)
assert isinstance(provider, GeminiTTSProvider)
@@ -45,6 +45,7 @@
get_tts_provider(
"unknown",
api_key="dummy_key",
+ model_name="dummy-model",
voice_name=valid_voice
)