cortex-hub/ai-hub/test_stt.sh at 9edc3003421a06cf5c5707f19ac5994546e302e7

Fork: 0
yangyangxie / cortex-hub
Find file
Newer
Older
cortex-hub / ai-hub / test_stt.sh
Yangyang Xie 8 days ago 2 KB poc proven for stt
Raw Blame History
#!/usr/bin/env bash
#
# Transcribe an audio file through the Gemini REST API.
#
# Usage:
#   test_stt.sh <audio_file>
#
# Environment:
#   GEMINI_API_KEY  API key sent in the x-goog-api-key header (required).
#
# Requires: curl, jq, file.
#
# Flow:
#   1. Start a resumable upload and read the session URL from the
#      x-goog-upload-url response header.
#   2. Upload the audio bytes and finalize; the JSON response is saved to
#      data/file_info.json.
#   3. Build the generateContent request body in data/request.json.
#   4. POST the request to the model.
#   5. Print the text of the first candidate's first part to stdout.
#
# The data/ directory is relative to the current working directory and is
# created if missing. Both JSON files are left in place after the run so they
# can be inspected.
#
# Exit status: 0 when a transcript was printed, 1 on any error.
set -euo pipefail

readonly API_BASE="https://generativelanguage.googleapis.com"
readonly MODEL="gemini-2.5-flash"
readonly DISPLAY_NAME="AUDIO"
readonly DATA_DIR="data"
readonly FILE_INFO="${DATA_DIR}/file_info.json"
readonly REQUEST_FILE="${DATA_DIR}/request.json"

# Path of the temporary response-header dump; global so the EXIT trap sees it.
header_file=""

# Print an error message to stderr and abort with status 1.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

# Remove the temporary header file on every exit path.
cleanup() {
  if [[ -n "$header_file" ]]; then
    rm -f -- "$header_file"
  fi
}

main() {
  if (($# < 1)); then
    printf 'Usage: %s <audio_file>\n' "$0" >&2
    exit 1
  fi

  local audio_path=$1
  [[ -f "$audio_path" ]] || die "File not found: ${audio_path}"
  [[ -n "${GEMINI_API_KEY:-}" ]] \
    || die "GEMINI_API_KEY environment variable not set."

  local tool
  for tool in curl jq file; do
    command -v "$tool" >/dev/null || die "required tool not found: ${tool}"
  done

  local mime_type num_bytes
  mime_type=$(file -b --mime-type -- "$audio_path") \
    || die "could not detect MIME type of ${audio_path}"
  num_bytes=$(wc -c <"$audio_path")
  # Some wc implementations pad the count with blanks; strip them so the
  # value is a clean integer inside the HTTP headers.
  num_bytes=${num_bytes//[[:space:]]/}

  header_file=$(mktemp) || die "mktemp failed"
  trap cleanup EXIT

  # Write and read the JSON artifacts in the same directory, and drop stale
  # copies so a failed run can never reuse an old request or response.
  mkdir -p -- "$DATA_DIR"
  rm -f -- "$FILE_INFO" "$REQUEST_FILE"

  # Step 1: start the resumable upload.
  # NOTE: the API key is passed as a curl argument, as in the original
  # script, so it is visible in the process list while curl runs.
  local start_body
  start_body=$(jq -n --arg name "$DISPLAY_NAME" \
    '{file: {display_name: $name}}')
  curl -sS "${API_BASE}/upload/v1beta/files" \
    -H "x-goog-api-key: ${GEMINI_API_KEY}" \
    -D "$header_file" \
    -H "X-Goog-Upload-Protocol: resumable" \
    -H "X-Goog-Upload-Command: start" \
    -H "X-Goog-Upload-Header-Content-Length: ${num_bytes}" \
    -H "X-Goog-Upload-Header-Content-Type: ${mime_type}" \
    -H "Content-Type: application/json" \
    -d "$start_body" >/dev/null \
    || die "could not start the resumable upload"

  # Header names are matched case-insensitively; the trailing CR of the
  # HTTP line ending is removed from the value.
  local upload_url
  upload_url=$(awk 'tolower($1) == "x-goog-upload-url:" {
    sub(/\r$/, "", $2); print $2; exit
  }' "$header_file")
  [[ -n "$upload_url" ]] \
    || die "no x-goog-upload-url header in the upload start response"

  # Step 2: upload the file bytes and finalize the session.
  curl -sS "$upload_url" \
    -H "Content-Length: ${num_bytes}" \
    -H "X-Goog-Upload-Offset: 0" \
    -H "X-Goog-Upload-Command: upload, finalize" \
    --data-binary "@${audio_path}" >"$FILE_INFO" \
    || die "file upload failed"

  local file_name file_uri
  file_name=$(jq -r '.file.name // empty' "$FILE_INFO") \
    || die "upload response is not valid JSON (see ${FILE_INFO})"
  [[ -n "$file_name" ]] \
    || die "upload response contains no file name (see ${FILE_INFO})"
  # .file.name looks like "files/<id>"; only the id goes into the URI.
  file_uri="${API_BASE}/v1beta/files/${file_name#files/}"

  # Step 3: prepare the request; jq handles JSON escaping of the values.
  jq -n --arg mime "$mime_type" --arg uri "$file_uri" '{
    contents: [
      {
        parts: [
          {fileData: {mimeType: $mime, fileUri: $uri}},
          {text: "Transcribe this audio file."}
        ]
      }
    ]
  }' >"$REQUEST_FILE"

  # Step 4: send the transcription request.
  local response
  response=$(curl -sS "${API_BASE}/v1beta/models/${MODEL}:generateContent" \
    -H "x-goog-api-key: ${GEMINI_API_KEY}" \
    -H "Content-Type: application/json" \
    -X POST \
    --data-binary "@${REQUEST_FILE}") \
    || die "transcription request failed"

  # Step 5: output the transcript text, or report the error in the response
  # instead of printing "null" and exiting successfully.
  local transcript api_error
  transcript=$(jq -r '.candidates[0].content.parts[0].text // empty' \
    <<<"$response") || die "transcription response is not valid JSON"
  if [[ -z "$transcript" ]]; then
    api_error=$(jq -r '.error.message // empty' <<<"$response")
    die "no transcript in response${api_error:+: ${api_error}}"
  fi
  printf '%s\n' "$transcript"
}

main "$@"