# cortex-hub / ai-hub / integration_tests / test_audio.py
import os
import httpx
import pytest

# Root URL of the API under test; SYNC_TEST_BASE_URL overrides the local default.
BASE_URL = os.environ.get("SYNC_TEST_BASE_URL", "http://127.0.0.1:8002/api/v1")

def _headers():
    return {
        "X-User-ID": os.environ.get("SYNC_TEST_USER_ID", "")
    }

def test_tts_voices():
    """Check that the TTS voices endpoint answers 200 with a JSON list."""
    voices_url = f"{BASE_URL}/speech/voices"
    with httpx.Client(timeout=10.0) as http:
        resp = http.get(voices_url, headers=_headers())
        assert resp.status_code == 200, f"Failed to get voices: {resp.text}"
        # Only the top-level JSON type is checked, not the individual entries.
        assert isinstance(resp.json(), list), "Voices should be a list"

def test_tts_to_stt_lifecycle():
    """Round-trip a fixed phrase through TTS and then STT.

    Posts the phrase to the speech endpoint with streaming disabled, uploads
    the returned audio bytes to the transcription endpoint, and checks that
    key words from the phrase show up in the lower-cased transcript.
    """
    # Fail early if the user ID was never exported (the message points at conftest).
    assert os.environ.get("SYNC_TEST_USER_ID", ""), "User ID not found in environment from conftest."

    phrase = "Hello from integration test audio pipeline."

    with httpx.Client(timeout=30.0) as http:
        # Step 1: text -> audio (TTS), requesting a non-streamed response.
        speech_resp = http.post(
            f"{BASE_URL}/speech",
            params={"stream": False},
            headers=_headers(),
            json={"text": phrase},
        )
        assert speech_resp.status_code == 200, f"TTS failed: {speech_resp.text}"

        # Sanity check that the body is plausibly audio rather than a tiny stub.
        audio_bytes = speech_resp.content
        assert len(audio_bytes) > 1000, "TTS audio content seems too small"

        # Step 2: audio -> text (STT), sent as a multipart file upload.
        upload = {
            "audio_file": ("test_audio_pipeline.wav", audio_bytes, "audio/wav"),
        }
        transcribe_resp = http.post(
            f"{BASE_URL}/stt/transcribe",
            headers=_headers(),
            files=upload,
        )
        assert transcribe_resp.status_code == 200, f"STT failed: {transcribe_resp.text}"

        transcript = transcribe_resp.json().get("transcript", "").lower()

        # Loose verification: STT may rephrase or drop punctuation, so only
        # distinctive words from the phrase are required, not an exact match.
        for keyword in ("hello", "integration"):
            assert keyword in transcript, f"Expected '{keyword}' in transcript: {transcript}"