diff --git a/.gitignore b/.gitignore index dface39..96a9867 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,5 @@ **/*.egg-info .pytest_cache/ **.bin -**.wav **.db ai-hub/data/* diff --git a/.gitignore b/.gitignore index dface39..96a9867 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,5 @@ **/*.egg-info .pytest_cache/ **.bin -**.wav **.db ai-hub/data/* diff --git a/ai-hub/app/core/providers/stt/transcribe_audio_with_gemini.sh b/ai-hub/app/core/providers/stt/transcribe_audio_with_gemini.sh new file mode 100644 index 0000000..eeee2b1 --- /dev/null +++ b/ai-hub/app/core/providers/stt/transcribe_audio_with_gemini.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +set -euo pipefail + +# =============================================================================== +# Script Name: transcribe_audio_with_gemini.sh +# Description: This script transcribes a local audio file using the Gemini API. +# +# IMPORTANT: +# You must have the GEMINI_API_KEY environment variable set in your environment +# to authenticate with the API. +# You also need 'jq' for JSON parsing and 'file' for MIME type detection. +# =============================================================================== + +if [ $# -lt 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +AUDIO_PATH="$1" +DISPLAY_NAME="AUDIO" + +if [ ! -f "$AUDIO_PATH" ]; then + echo "Error: File not found: $AUDIO_PATH" + exit 1 +fi + +if [ -z "${GEMINI_API_KEY:-}" ]; then + echo "Error: GEMINI_API_KEY environment variable not set." + exit 1 +fi + +MIME_TYPE=$(file -b --mime-type "${AUDIO_PATH}") +NUM_BYTES=$(wc -c < "${AUDIO_PATH}") +TMP_HEADER_FILE=$(mktemp) + +# Remove any old JSON files +rm -f file_info.json request.json + +# Step 1: Start resumable upload +curl -s "https://generativelanguage.googleapis.com/upload/v1beta/files" \ + -H "x-goog-api-key: $GEMINI_API_KEY" \ + -D "$TMP_HEADER_FILE" \ + -H "X-Goog-Upload-Protocol: resumable" \ + -H "X-Goog-Upload-Command: start" \ + -H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" \ + -H "X-Goog-Upload-Header-Content-Type: ${MIME_TYPE}" \ + -H "Content-Type: application/json" \ + -d "{\"file\": {\"display_name\": \"${DISPLAY_NAME}\"}}" >/dev/null + +UPLOAD_URL=$(grep -i "x-goog-upload-url: " "$TMP_HEADER_FILE" | cut -d" " -f2 | tr -d "\r") +rm "$TMP_HEADER_FILE" + +# Step 2: Upload the file +curl -s "${UPLOAD_URL}" \ + -H "Content-Length: ${NUM_BYTES}" \ + -H "X-Goog-Upload-Offset: 0" \ + -H "X-Goog-Upload-Command: upload, finalize" \ + --data-binary "@${AUDIO_PATH}" > data/file_info.json + +FILE_URI="https://generativelanguage.googleapis.com/v1beta/files/$(jq -r '.file.name' data/file_info.json | cut -d'/' -f2)" + +# Step 3: Prepare request +cat > data/request.json <" + exit 1 +fi + +AUDIO_PATH="$1" +DISPLAY_NAME="AUDIO" + +if [ ! -f "$AUDIO_PATH" ]; then + echo "Error: File not found: $AUDIO_PATH" + exit 1 +fi + +if [ -z "${GEMINI_API_KEY:-}" ]; then + echo "Error: GEMINI_API_KEY environment variable not set." + exit 1 +fi + +MIME_TYPE=$(file -b --mime-type "${AUDIO_PATH}") +NUM_BYTES=$(wc -c < "${AUDIO_PATH}") +TMP_HEADER_FILE=$(mktemp) + +# Remove any old JSON files +rm -f file_info.json request.json + +# Step 1: Start resumable upload +curl -s "https://generativelanguage.googleapis.com/upload/v1beta/files" \ + -H "x-goog-api-key: $GEMINI_API_KEY" \ + -D "$TMP_HEADER_FILE" \ + -H "X-Goog-Upload-Protocol: resumable" \ + -H "X-Goog-Upload-Command: start" \ + -H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" \ + -H "X-Goog-Upload-Header-Content-Type: ${MIME_TYPE}" \ + -H "Content-Type: application/json" \ + -d "{\"file\": {\"display_name\": \"${DISPLAY_NAME}\"}}" >/dev/null + +UPLOAD_URL=$(grep -i "x-goog-upload-url: " "$TMP_HEADER_FILE" | cut -d" " -f2 | tr -d "\r") +rm "$TMP_HEADER_FILE" + +# Step 2: Upload the file +curl -s "${UPLOAD_URL}" \ + -H "Content-Length: ${NUM_BYTES}" \ + -H "X-Goog-Upload-Offset: 0" \ + -H "X-Goog-Upload-Command: upload, finalize" \ + --data-binary "@${AUDIO_PATH}" > data/file_info.json + +FILE_URI="https://generativelanguage.googleapis.com/v1beta/files/$(jq -r '.file.name' data/file_info.json | cut -d'/' -f2)" + +# Step 3: Prepare request +cat > data/request.json <" + exit 1 +fi + +AUDIO_PATH="$1" +DISPLAY_NAME="AUDIO" + +if [ ! -f "$AUDIO_PATH" ]; then + echo "Error: File not found: $AUDIO_PATH" + exit 1 +fi + +if [ -z "${GEMINI_API_KEY:-}" ]; then + echo "Error: GEMINI_API_KEY environment variable not set." + exit 1 +fi + +MIME_TYPE=$(file -b --mime-type "${AUDIO_PATH}") +NUM_BYTES=$(wc -c < "${AUDIO_PATH}") +TMP_HEADER_FILE=$(mktemp) + +# Remove any old JSON files +rm -f file_info.json request.json + +# Step 1: Start resumable upload +curl -s "https://generativelanguage.googleapis.com/upload/v1beta/files" \ + -H "x-goog-api-key: $GEMINI_API_KEY" \ + -D "$TMP_HEADER_FILE" \ + -H "X-Goog-Upload-Protocol: resumable" \ + -H "X-Goog-Upload-Command: start" \ + -H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" \ + -H "X-Goog-Upload-Header-Content-Type: ${MIME_TYPE}" \ + -H "Content-Type: application/json" \ + -d "{\"file\": {\"display_name\": \"${DISPLAY_NAME}\"}}" >/dev/null + +UPLOAD_URL=$(grep -i "x-goog-upload-url: " "$TMP_HEADER_FILE" | cut -d" " -f2 | tr -d "\r") +rm "$TMP_HEADER_FILE" + +# Step 2: Upload the file +curl -s "${UPLOAD_URL}" \ + -H "Content-Length: ${NUM_BYTES}" \ + -H "X-Goog-Upload-Offset: 0" \ + -H "X-Goog-Upload-Command: upload, finalize" \ + --data-binary "@${AUDIO_PATH}" > data/file_info.json + +FILE_URI="https://generativelanguage.googleapis.com/v1beta/files/$(jq -r '.file.name' data/file_info.json | cut -d'/' -f2)" + +# Step 3: Prepare request +cat > data/request.json < 0 - print("✅ TTS stream test passed.") \ No newline at end of file + print("✅ TTS stream test passed.") + +@pytest.mark.asyncio +async def test_stt_transcribe_endpoint(http_client): + """ + Tests the /stt/transcribe endpoint by uploading a dummy audio file + and verifying the transcription response. + """ + print("\n--- Running test_stt_transcribe_endpoint ---") + url = "/stt/transcribe" + + # --- Use a real audio file from the integration test data --- + audio_file_path = "integration_tests/test_data/test-audio.wav" + + with open(audio_file_path, "rb") as audio_file: + files = {'audio_file': ('test-audio.wav', audio_file, 'audio/wav')} + + # --- Send the POST request to the endpoint --- + response = await http_client.post(url, files=files) + + # --- Assertions --- + assert response.status_code == 200, f"STT request failed with status code {response.status_code}. Response: {response.text}" + response_json = response.json() + assert "transcript" in response_json, "Response JSON is missing the 'transcript' key." + assert isinstance(response_json["transcript"], str), "Transcript value is not a string." + + # Assert that the transcript matches the expected text + expected_transcript = "This audio is for integration testing of Cortex Hub, which is a wonderful project." + assert response_json["transcript"] == expected_transcript, f"Expected: '{expected_transcript}', Got: '{response_json['transcript']}'" + + print("✅ STT transcription test passed.") + diff --git a/.gitignore b/.gitignore index dface39..96a9867 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,5 @@ **/*.egg-info .pytest_cache/ **.bin -**.wav **.db ai-hub/data/* diff --git a/ai-hub/app/core/providers/stt/transcribe_audio_with_gemini.sh b/ai-hub/app/core/providers/stt/transcribe_audio_with_gemini.sh new file mode 100644 index 0000000..eeee2b1 --- /dev/null +++ b/ai-hub/app/core/providers/stt/transcribe_audio_with_gemini.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +set -euo pipefail + +# =============================================================================== +# Script Name: transcribe_audio_with_gemini.sh +# Description: This script transcribes a local audio file using the Gemini API. +# +# IMPORTANT: +# You must have the GEMINI_API_KEY environment variable set in your environment +# to authenticate with the API. +# You also need 'jq' for JSON parsing and 'file' for MIME type detection. +# =============================================================================== + +if [ $# -lt 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +AUDIO_PATH="$1" +DISPLAY_NAME="AUDIO" + +if [ ! -f "$AUDIO_PATH" ]; then + echo "Error: File not found: $AUDIO_PATH" + exit 1 +fi + +if [ -z "${GEMINI_API_KEY:-}" ]; then + echo "Error: GEMINI_API_KEY environment variable not set." + exit 1 +fi + +MIME_TYPE=$(file -b --mime-type "${AUDIO_PATH}") +NUM_BYTES=$(wc -c < "${AUDIO_PATH}") +TMP_HEADER_FILE=$(mktemp) + +# Remove any old JSON files +rm -f file_info.json request.json + +# Step 1: Start resumable upload +curl -s "https://generativelanguage.googleapis.com/upload/v1beta/files" \ + -H "x-goog-api-key: $GEMINI_API_KEY" \ + -D "$TMP_HEADER_FILE" \ + -H "X-Goog-Upload-Protocol: resumable" \ + -H "X-Goog-Upload-Command: start" \ + -H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" \ + -H "X-Goog-Upload-Header-Content-Type: ${MIME_TYPE}" \ + -H "Content-Type: application/json" \ + -d "{\"file\": {\"display_name\": \"${DISPLAY_NAME}\"}}" >/dev/null + +UPLOAD_URL=$(grep -i "x-goog-upload-url: " "$TMP_HEADER_FILE" | cut -d" " -f2 | tr -d "\r") +rm "$TMP_HEADER_FILE" + +# Step 2: Upload the file +curl -s "${UPLOAD_URL}" \ + -H "Content-Length: ${NUM_BYTES}" \ + -H "X-Goog-Upload-Offset: 0" \ + -H "X-Goog-Upload-Command: upload, finalize" \ + --data-binary "@${AUDIO_PATH}" > data/file_info.json + +FILE_URI="https://generativelanguage.googleapis.com/v1beta/files/$(jq -r '.file.name' data/file_info.json | cut -d'/' -f2)" + +# Step 3: Prepare request +cat > data/request.json < 0 - print("✅ TTS stream test passed.") \ No newline at end of file + print("✅ TTS stream test passed.") + +@pytest.mark.asyncio +async def test_stt_transcribe_endpoint(http_client): + """ + Tests the /stt/transcribe endpoint by uploading a dummy audio file + and verifying the transcription response. + """ + print("\n--- Running test_stt_transcribe_endpoint ---") + url = "/stt/transcribe" + + # --- Use a real audio file from the integration test data --- + audio_file_path = "integration_tests/test_data/test-audio.wav" + + with open(audio_file_path, "rb") as audio_file: + files = {'audio_file': ('test-audio.wav', audio_file, 'audio/wav')} + + # --- Send the POST request to the endpoint --- + response = await http_client.post(url, files=files) + + # --- Assertions --- + assert response.status_code == 200, f"STT request failed with status code {response.status_code}. Response: {response.text}" + response_json = response.json() + assert "transcript" in response_json, "Response JSON is missing the 'transcript' key." + assert isinstance(response_json["transcript"], str), "Transcript value is not a string." + + # Assert that the transcript matches the expected text + expected_transcript = "This audio is for integration testing of Cortex Hub, which is a wonderful project." + assert response_json["transcript"] == expected_transcript, f"Expected: '{expected_transcript}', Got: '{response_json['transcript']}'" + + print("✅ STT transcription test passed.") + diff --git a/ai-hub/test_stt.sh b/ai-hub/test_stt.sh index 9612819..6701225 100644 --- a/ai-hub/test_stt.sh +++ b/ai-hub/test_stt.sh @@ -1,81 +1,80 @@ -#!/usr/bin/env bash -set -euo pipefail +#!/bin/bash -if [ $# -lt 1 ]; then - echo "Usage: $0 " - exit 1 +# --- 0. Load environment variables from .env file --- +# This ensures any necessary credentials or configurations are loaded. +if [ -f .env ]; then + echo "Loading environment variables from .env" + export $(grep -v '^#' .env | xargs) fi +export LOG_LEVEL=DEBUG -AUDIO_PATH="$1" -DISPLAY_NAME="AUDIO" +# --- 1. Define variables and configurations --- +BASE_URL="http://127.0.0.1:8000" +STT_ENDPOINT="/stt/transcribe" +API_URL="${BASE_URL}${STT_ENDPOINT}" -if [ ! -f "$AUDIO_PATH" ]; then - echo "Error: File not found: $AUDIO_PATH" - exit 1 -fi +# --- 2. Start the FastAPI Server in the Background --- +echo "--- Starting AI Hub Server ---" +uvicorn app.main:app --host 127.0.0.1 --port 8000 & +SERVER_PID=$! -if [ -z "${GEMINI_API_KEY:-}" ]; then - echo "Error: GEMINI_API_KEY environment variable not set." - exit 1 -fi - -MIME_TYPE=$(file -b --mime-type "${AUDIO_PATH}") -NUM_BYTES=$(wc -c < "${AUDIO_PATH}") -TMP_HEADER_FILE=$(mktemp) - -# Remove any old JSON files -rm -f file_info.json request.json - -# Step 1: Start resumable upload -curl -s "https://generativelanguage.googleapis.com/upload/v1beta/files" \ - -H "x-goog-api-key: $GEMINI_API_KEY" \ - -D "$TMP_HEADER_FILE" \ - -H "X-Goog-Upload-Protocol: resumable" \ - -H "X-Goog-Upload-Command: start" \ - -H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" \ - -H "X-Goog-Upload-Header-Content-Type: ${MIME_TYPE}" \ - -H "Content-Type: application/json" \ - -d "{\"file\": {\"display_name\": \"${DISPLAY_NAME}\"}}" >/dev/null - -UPLOAD_URL=$(grep -i "x-goog-upload-url: " "$TMP_HEADER_FILE" | cut -d" " -f2 | tr -d "\r") -rm "$TMP_HEADER_FILE" - -# Step 2: Upload the file -curl -s "${UPLOAD_URL}" \ - -H "Content-Length: ${NUM_BYTES}" \ - -H "X-Goog-Upload-Offset: 0" \ - -H "X-Goog-Upload-Command: upload, finalize" \ - --data-binary "@${AUDIO_PATH}" > data/file_info.json - -FILE_URI="https://generativelanguage.googleapis.com/v1beta/files/$(jq -r '.file.name' data/file_info.json | cut -d'/' -f2)" - -# Step 3: Prepare request -cat > data/request.json < /dev/null; then + echo "✅ Server is ready." + break + fi + sleep 1 + if [ "$i" -eq 20 ]; then + echo "❌ Error: Server failed to start in time. Exiting." + exit 1 + fi +done -# Step 5: Output transcript text -echo "$RESPONSE" | jq -r '.candidates[0].content.parts[0].text' +# --- 4. Prepare the audio file for testing --- +echo "" +echo "--- Preparing audio file for testing ---" +# Check if a file path was provided as an argument +if [ -z "$1" ]; then + echo "❌ Error: No audio file path provided." + echo "Usage: $0 /path/to/your/audio.wav" + exit 1 +fi + +TEMP_AUDIO_FILE="$1" +if [ ! -f "$TEMP_AUDIO_FILE" ]; then + echo "❌ Error: The specified file '$TEMP_AUDIO_FILE' does not exist." + exit 1 +fi +echo "✅ Using provided audio file: $TEMP_AUDIO_FILE" +echo "" + +# --- 5. Test a successful transcription --- +echo "--- Testing successful transcription with valid audio file ---" +# We now explicitly specify the MIME type for the file using the curl -F flag syntax. +# We also use the -f, --fail flag to ensure curl returns an error code on HTTP 4xx/5xx status. +curl -s -f -X POST "$API_URL" \ + -H "Content-Type: multipart/form-data" \ + -F "audio_file=@$TEMP_AUDIO_FILE;type=audio/wav" | jq '.' + +# Check the exit status of the curl command +# The return code is now more reliable thanks to the `-f` flag. +if [ ${PIPESTATUS[0]} -eq 0 ]; then + echo "" + echo "✅ Success! Received a valid transcription response." +else + echo "" + echo "❌ Failed to get a valid transcription response. Check the server logs for details." +fi +echo "" + +# --- Tests complete. ---