// src/hooks/useVoiceChat.js

// This custom React hook contains all the stateful logic and side effects
// for the voice chat application.

import { useState, useRef, useEffect } from "react";
import {
  createSession,
  transcribeAudio,
  chatWithAI,
  streamSpeech,
} from "../services/apiService";
import {
  stopAllPlayingAudio,
  stopAllMediaStreams,
  resampleBuffer,
} from "../services/audioUtils";

// Constants for Voice Activity Detection (VAD) and timing
const VAD_THRESHOLD = 0.01; // RMS volume above which a buffer counts as voice
const VAD_SILENCE_DURATION = 2500; // ms of silence before an auto-mode recording is stopped
const MINIMUM_AUDIO_DURATION_MS = 500; // recordings shorter than this (ms) are skipped
const AUTO_MODE_COOLDOWN_MS = 3000; // minimum ms since the last request before auto-recording restarts

const useVoiceChat = ({ chatContainerRef }) => {
  const [chatHistory, setChatHistory] = useState([
    {
      text: "Hello! I'm an AI assistant. How can I help you today?",
      isUser: false,
    },
  ]);
  const [status, setStatus] = useState("Click the microphone to start recording.");
  const [isBusy, setIsBusy] = useState(false);
  const [isRecording, setIsRecording] = useState(false);
  const [showErrorModal, setShowErrorModal] = useState(false);
  const [errorMessage, setErrorMessage] = useState("");
  const [sessionId, setSessionId] = useState(null);
  const [isAutoMode, setIsAutoMode] = useState(false);
  const [isAutoListening, setIsAutoListening] = useState(false);

  // All refs must be declared here, inside the custom hook.
  const mediaRecorderRef = useRef(null);
  const audioChunksRef = useRef([]);
  const audioContextRef = useRef(null);
  const playbackTimeRef = useRef(0);
  const isRecordingRef = useRef(false);
  const playingSourcesRef = useRef([]);
  const vadStreamRef = useRef(null);
  const scriptProcessorRef = useRef(null);
  const silenceTimeoutRef = useRef(null);
  const lastRequestTimeRef = useRef(0);
  const streamRef = useRef(null);

  // --- Initial Session Creation Effect ---
  useEffect(() => {
    const startSession = async () => {
      setIsBusy(true);
      setStatus("Starting new chat session...");
      try {
        const session = await createSession();
        setSessionId(session.id);
        console.log(`Session created with ID: ${session.id}`);
        setStatus("Click the microphone to start recording.");
      } catch (err) {
        console.error("Error creating session:", err);
        setStatus(`Error: Could not start session. ${err.message}`);
        setErrorMessage(`Failed to create a chat session: ${err.message}`);
        setShowErrorModal(true);
      } finally {
        setIsBusy(false);
      }
    };
    startSession();

    return () => {
      // Clean up any active media streams, recorders, and processors on unmount
      stopAllMediaStreams(vadStreamRef, mediaRecorderRef, scriptProcessorRef, streamRef);
    };
  }, []);

  // Automatically scroll to the bottom of the chat container whenever the
  // chat history updates. `chatContainerRef` is included in the dependency array.
  useEffect(() => {
    if (chatContainerRef.current) {
      chatContainerRef.current.scrollTop = chatContainerRef.current.scrollHeight;
    }
  }, [chatHistory, chatContainerRef]);

  const addMessage = (text, isUser) => {
    setChatHistory((prevHistory) => [...prevHistory, { text, isUser }]);
  };

  /**
   * Fetches synthesized speech for the given text and plays the streamed audio
   * chunks with the Web Audio API. This is the orchestrator around the
   * stateless streamSpeech API function.
   * @param {string} text - The text to be synthesized by the TTS service.
   */
  const playStreamingAudio = async (text) => {
    setIsBusy(true);
    setStatus("Streaming audio...");
    // Stop any audio that is already playing before starting a new stream
    stopAllPlayingAudio(playingSourcesRef, audioContextRef, playbackTimeRef);

    try {
      if (!audioContextRef.current) {
        audioContextRef.current = new (window.AudioContext || window.webkitAudioContext)();
        playbackTimeRef.current = audioContextRef.current.currentTime;
      }
      
      const audioContext = audioContextRef.current;

      const onChunkReceived = (rawFloat32Data) => {
        // This is the callback that receives processed audio data from apiService.
        // It's responsible for using the Web Audio API to play the sound.
        const float32Resampled = resampleBuffer(
          rawFloat32Data,
          24000, // The model's sample rate is hardcoded to 24000
          audioContext.sampleRate
        );
        const audioBuffer = audioContext.createBuffer(
          1,
          float32Resampled.length,
          audioContext.sampleRate
        );
        audioBuffer.copyToChannel(float32Resampled, 0);

        const source = audioContext.createBufferSource();
        source.buffer = audioBuffer;
        source.connect(audioContext.destination);

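        // Schedule this chunk to start when the previous chunk ends (or right
        // away if playback has fallen behind real time), so chunks play
        // back-to-back without gaps.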
        const currentTime = audioContext.currentTime;
        const startTime = Math.max(playbackTimeRef.current, currentTime);

        source.start(startTime);
        playbackTimeRef.current = startTime + audioBuffer.duration;

        playingSourcesRef.current.push(source);
        source.onended = () => {
          playingSourcesRef.current = playingSourcesRef.current.filter(
            (s) => s !== source
          );
        };
      };

      const onStreamDone = () => {
        // This callback is triggered when the stream finishes.
        console.log("TTS Stream complete.");
      };

      // Call the stateless API function, passing the UI-related callbacks
      await streamSpeech(text, onChunkReceived, onStreamDone);

    } catch (err) {
      console.error("Failed to stream speech:", err);
      setStatus(`Error: Failed to stream speech. ${err.message}`);
      setErrorMessage(`Failed to stream speech: ${err.message}`);
      setShowErrorModal(true);
    } finally {
      setIsBusy(false);
      lastRequestTimeRef.current = Date.now();
      if (isAutoMode && isAutoListening) {
        setStatus("Listening for voice...");
      } else if (!isAutoMode) {
        setStatus("Click the microphone to start recording.");
      } else {
        setStatus("Click to start listening.");
      }
    }
  };


  const processConversation = async (audioBlob) => {
    console.log("Processing conversation...");
    try {
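      // Rough duration estimate that assumes 48 kHz, 16-bit (2-byte), mono PCM,
      // i.e. ~96,000 bytes per second; the recorded blob is typically compressed,
      // so this is only a heuristic for filtering out very short clips.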
      const audioDuration = audioBlob.size / (48000 * 2 * 1) * 1000;
      if (audioDuration < MINIMUM_AUDIO_DURATION_MS) {
        console.log(`Audio too short (${audioDuration.toFixed(2)}ms), skipping.`);
        setStatus("Audio was too short. Please speak a little longer.");
        lastRequestTimeRef.current = Date.now();
        return;
      }
      if (audioBlob.size === 0) {
        console.warn("Audio blob is empty, skipping STT API call.");
        setStatus("Recording stopped, but no audio was captured. Please try again.");
        lastRequestTimeRef.current = Date.now();
        return;
      }

      setStatus("Transcribing audio...");
      const userText = await transcribeAudio(audioBlob);
      addMessage(userText, true);

      setStatus("AI is thinking...");
      const aiText = await chatWithAI(sessionId, userText);
      addMessage(aiText, false);

      await playStreamingAudio(aiText);
    } catch (error) {
      console.error("Conversation processing failed:", error);
      setStatus(`Error: ${error.message}`);
      setErrorMessage(`An error occurred: ${error.message}`);
      setShowErrorModal(true);
    } finally {
      setIsBusy(false);
      lastRequestTimeRef.current = Date.now();
      // Only stop the media streams outside auto mode; auto mode keeps listening
      if (!isAutoMode) {
        setStatus("Click the microphone to start recording.");
        stopAllMediaStreams(vadStreamRef, mediaRecorderRef, scriptProcessorRef, streamRef);
      } else if (isAutoMode && isAutoListening) {
        setStatus("Listening for voice...");
      } else {
        setStatus("Click to start listening.");
      }
    }
  };

  const startManualRecording = async () => {
    if (isRecording) return;

    try {
      if (!sessionId) {
        setErrorMessage("Please wait for the chat session to be initialized.");
        setShowErrorModal(true);
        return;
      }
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      streamRef.current = stream;
      mediaRecorderRef.current = new MediaRecorder(stream);
      mediaRecorderRef.current.start();
      audioChunksRef.current = [];

      mediaRecorderRef.current.ondataavailable = (event) => {
        audioChunksRef.current.push(event.data);
      };

      mediaRecorderRef.current.onstop = async () => {
        // Reset the recording flag so auto mode can start cleanly later
        isRecordingRef.current = false;
        if (streamRef.current) {
          streamRef.current.getTracks().forEach(track => track.stop());
          streamRef.current = null;
        }
        mediaRecorderRef.current = null;
        const audioBlob = new Blob(audioChunksRef.current, { type: "audio/wav" });
        await processConversation(audioBlob);
      };
      setIsRecording(true);
      isRecordingRef.current = true;
      setStatus("Recording... Click to stop.");
    } catch (err) {
      console.error("Error accessing microphone:", err);
      setStatus("Error: Cannot access microphone.");
      setErrorMessage("Microphone access has been denied. Please enable it.");
      setShowErrorModal(true);
    }
  };

  const stopManualRecording = () => {
    if (mediaRecorderRef.current?.state === "recording") {
      setIsBusy(true);
      setIsRecording(false);
      mediaRecorderRef.current.stop();
    }
  };

  const startAutoListening = async () => {
    try {
      if (!sessionId) {
        setErrorMessage("Please wait for the chat session to be initialized.");
        setShowErrorModal(true);
        return;
      }
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      vadStreamRef.current = stream;
      const audioContext = new (window.AudioContext || window.webkitAudioContext)();
      const source = audioContext.createMediaStreamSource(stream);
      const bufferSize = 4096;
      const scriptProcessor = audioContext.createScriptProcessor(bufferSize, 1, 1);
      scriptProcessorRef.current = scriptProcessor;
      source.connect(scriptProcessor);
      scriptProcessor.connect(audioContext.destination);

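      // Simple energy-based VAD: compute the RMS volume of each audio buffer
      // and treat anything above VAD_THRESHOLD as voice. Detected voice starts
      // a recording (after the cooldown); sustained silence stops it via a timeout.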
      scriptProcessor.onaudioprocess = (event) => {
        const inputBuffer = event.inputBuffer.getChannelData(0);
        let sum = 0.0;
        for (let i = 0; i < inputBuffer.length; i++) {
          sum += inputBuffer[i] * inputBuffer[i];
        }
        const volume = Math.sqrt(sum / inputBuffer.length);
        const isVoiceDetected = volume > VAD_THRESHOLD;
        const timeSinceLastRequest = Date.now() - lastRequestTimeRef.current;
        const isCooldownPassed = timeSinceLastRequest > AUTO_MODE_COOLDOWN_MS;

        if (isVoiceDetected) {
          if (silenceTimeoutRef.current) {
            clearTimeout(silenceTimeoutRef.current);
            silenceTimeoutRef.current = null;
          }
          if (!isRecordingRef.current && isCooldownPassed) {
            startAutoRecording(stream);
          }
        } else if (isRecordingRef.current) {
          if (!silenceTimeoutRef.current) {
            silenceTimeoutRef.current = setTimeout(() => {
              stopAutoRecording();
            }, VAD_SILENCE_DURATION);
          }
        }
      };
      setIsAutoListening(true);
      setStatus("Listening for voice...");
    } catch (err) {
      console.error("Error accessing microphone for VAD:", err);
      setStatus("Error: Cannot access microphone.");
      setErrorMessage("Microphone access has been denied. Please enable it.");
      setShowErrorModal(true);
    }
  };

  const stopAutoListening = () => {
    setIsAutoListening(false);
    // Tear down the VAD stream, recorder, and script processor
    stopAllMediaStreams(vadStreamRef, mediaRecorderRef, scriptProcessorRef, streamRef);
    setStatus("Click to start listening.");
  };

  const startAutoRecording = (stream) => {
    if (mediaRecorderRef.current?.state === "recording") return;
    mediaRecorderRef.current = new MediaRecorder(stream);
    audioChunksRef.current = [];
    mediaRecorderRef.current.ondataavailable = (event) => {
      audioChunksRef.current.push(event.data);
    };
    mediaRecorderRef.current.onstop = async () => {
      isRecordingRef.current = false;
      setIsRecording(false);
      if (audioChunksRef.current.length > 0) {
        setIsBusy(true);
        setStatus("Transcribing audio...");
        const audioBlob = new Blob(audioChunksRef.current, { type: "audio/wav" });
        await processConversation(audioBlob);
      } else {
        setIsBusy(false);
        setStatus("Listening for voice...");
      }
    };
    mediaRecorderRef.current.start();
    isRecordingRef.current = true;
    setIsRecording(true);
    setStatus("Recording...");
  };

  const stopAutoRecording = () => {
    if (mediaRecorderRef.current?.state === "recording") {
      mediaRecorderRef.current.stop();
    }
  };

  const handleMicClick = () => {
    // Stop any audio that is currently playing before handling the click
    stopAllPlayingAudio(playingSourcesRef, audioContextRef, playbackTimeRef);
    if (isBusy) return;

    if (isAutoMode) {
      if (isAutoListening) {
        stopAutoListening();
      } else {
        startAutoListening();
      }
    } else {
      if (isRecording) {
        stopManualRecording();
      } else {
        startManualRecording();
      }
    }
  };

  return {
    chatHistory,
    status,
    isBusy,
    isRecording,
    isAutoMode,
    isAutoListening,
    sessionId,
    showErrorModal,
    errorMessage,
    setIsAutoMode,
    handleMicClick,
    setShowErrorModal,
  };
};

export default useVoiceChat;
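
/*
 * Example usage (an illustrative sketch only; the component and element names
 * below are assumptions for demonstration, not part of this repo):
 *
 *   import React, { useRef } from "react";
 *   import useVoiceChat from "./hooks/useVoiceChat";
 *
 *   const VoiceChatPanel = () => {
 *     const chatContainerRef = useRef(null);
 *     const {
 *       chatHistory,
 *       status,
 *       isRecording,
 *       isAutoListening,
 *       handleMicClick,
 *     } = useVoiceChat({ chatContainerRef });
 *
 *     return (
 *       <div>
 *         <div ref={chatContainerRef} style={{ overflowY: "auto", maxHeight: 400 }}>
 *           {chatHistory.map((msg, i) => (
 *             <p key={i}>{(msg.isUser ? "You: " : "AI: ") + msg.text}</p>
 *           ))}
 *         </div>
 *         <p>{status}</p>
 *         <button onClick={handleMicClick}>
 *           {isRecording || isAutoListening ? "Stop" : "Start"}
 *         </button>
 *       </div>
 *     );
 *   };
 */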