import React, { useState, useRef, useEffect } from "react";

// The main application component for the voice chat
const App = () => {
  // State for managing the chat history, storing both user and AI messages
  const [chatHistory, setChatHistory] = useState([
    {
      text: "Hello! I'm an AI assistant. How can I help you today?",
      isUser: false,
    },
  ]);
  // State to manage the UI status, like "Recording...", "Transcribing...", etc.
  const [status, setStatus] = useState("Click the microphone to start recording.");
  // State to track if the application is currently busy with a request
  const [isBusy, setIsBusy] = useState(false);
  // State to track the recording status for UI feedback
  const [isRecording, setIsRecording] = useState(false);
  // State to manage the visibility of the error modal
  const [showErrorModal, setShowErrorModal] = useState(false);
  // State to hold the error message to be displayed in the modal
  const [errorMessage, setErrorMessage] = useState("");
  // State for managing the chat session
  const [sessionId, setSessionId] = useState(null);
  const [userId, setUserId] = useState(null);
  // State to toggle between manual and automatic recording modes
  const [isAutoMode, setIsAutoMode] = useState(false);
  // State to indicate if the app is actively listening for voice in auto mode
  const [isAutoListening, setIsAutoListening] = useState(false);

  // Refs for managing Web Audio API and MediaRecorder instances
  const mediaRecorderRef = useRef(null);
  const audioChunksRef = useRef([]);
  const audioContextRef = useRef(null);
  const playbackTimeRef = useRef(0);
  // Ref to hold the current recording state reliably, to avoid stale closures in event handlers
  const isRecordingRef = useRef(false);
  // Keep track of currently playing audio sources so we can stop them on demand
  const playingSourcesRef = useRef([]);
  // Refs for managing the VAD (Voice Activity Detection) process
  const vadStreamRef = useRef(null);
  const scriptProcessorRef = useRef(null);
  const silenceTimeoutRef = useRef(null);
  // --- New Refs for Debouncing in Automatic Mode ---
  // Ref to track the last time a request was completed. This is used to implement a cooldown.
  const lastRequestTimeRef = useRef(0);
  // Ref to hold the stream from getUserMedia to be able to stop it later for manual mode
  const streamRef = useRef(null);
  // --- New Ref for scrolling to bottom of chat history ---
  const chatContainerRef = useRef(null);

  // --- Configuration ---
  // Please replace with your actual endpoints
  const STT_ENDPOINT = "https://192.168.68.113:8001/stt/transcribe";
  const SESSIONS_CREATE_ENDPOINT = "https://192.168.68.113:8001/sessions";
  const SESSIONS_CHAT_ENDPOINT = (id) =>
    `https://192.168.68.113:8001/sessions/${id}/chat`;
  const TTS_ENDPOINT = "https://192.168.68.113:8001/speech";

  // Configuration for Voice Activity Detection
  const VAD_THRESHOLD = 0.01; // Adjust this value to control sensitivity (0 to 1)
  const VAD_SILENCE_DURATION = 2500; // Duration of silence in ms before stopping recording

  // --- New Configuration for Audio Filtering and Debouncing ---
  const MINIMUM_AUDIO_DURATION_MS = 500; // Minimum duration for an audio recording to be processed
  const AUTO_MODE_COOLDOWN_MS = 3000; // Cooldown period in milliseconds to prevent rapid re-recordings in automatic mode

  // useEffect hook to create a new session when the component mounts
  useEffect(() => {
    const createSession = async () => {
      setIsBusy(true);
      setStatus("Starting new chat session...");
      console.log("Attempting to create a new session.");
      try {
        const generatedUserId = crypto.randomUUID();
        setUserId(generatedUserId);
        const response = await fetch(SESSIONS_CREATE_ENDPOINT, {
          method: "POST",
          headers: { "Content-Type": "application/json" },
          body: JSON.stringify({ user_id: generatedUserId }),
        });
        if (!response.ok) {
          throw new Error(`Failed to create session. Status: ${response.status}`);
        }
        const session = await response.json();
        setSessionId(session.id);
        console.log(`Session created with ID: ${session.id}`);
        setStatus("Click the microphone to start recording.");
      } catch (err) {
        console.error("Error creating session:", err);
        setStatus(`Error: Could not start session. ${err.message}`);
        setErrorMessage(`Failed to create a chat session: ${err.message}`);
        setShowErrorModal(true);
      } finally {
        setIsBusy(false);
      }
    };
    createSession();

    // Cleanup function to stop all streams when the component unmounts
    return () => {
      stopAllStreams();
    };
  }, []); // Empty dependency array ensures this runs only once on mount

  // New useEffect hook to automatically scroll to the bottom of the chat history
  useEffect(() => {
    if (chatContainerRef.current) {
      chatContainerRef.current.scrollTop = chatContainerRef.current.scrollHeight;
    }
  }, [chatHistory]);

  /**
   * Cleans up all active audio streams and timers.
   */
  const stopAllStreams = () => {
    if (vadStreamRef.current) {
      vadStreamRef.current.getTracks().forEach((track) => track.stop());
      vadStreamRef.current = null;
    }
    if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") {
      mediaRecorderRef.current.stop();
    }
    if (silenceTimeoutRef.current) {
      clearTimeout(silenceTimeoutRef.current);
      silenceTimeoutRef.current = null;
    }
    if (scriptProcessorRef.current) {
      scriptProcessorRef.current.disconnect();
      scriptProcessorRef.current = null;
    }
    if (streamRef.current) {
      streamRef.current.getTracks().forEach((track) => track.stop());
      streamRef.current = null;
    }
    isRecordingRef.current = false;
    setIsRecording(false);
    console.log("All audio streams and timers have been stopped.");
  };
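
  // Note on browser requirements: crypto.randomUUID() (used when creating a session
  // above) and navigator.mediaDevices.getUserMedia() (used for recording below) are
  // only available in secure contexts, so the app has to be served over HTTPS or
  // from localhost, consistent with the https:// endpoints configured above.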

  /**
   * Adds a new message to the chat history state. The useEffect hook will handle the scrolling.
   * @param {string} text - The text content of the message.
   * @param {boolean} isUser - True for user messages, false for AI.
   */
  const addMessage = (text, isUser) => {
    setChatHistory((prevHistory) => [...prevHistory, { text, isUser }]);
  };

  /**
   * Resamples a Float32Array buffer from a source rate to a destination rate.
   * @param {Float32Array} buffer - The input audio buffer.
   * @param {number} srcRate - The source sample rate (e.g., 24000).
   * @param {number} dstRate - The destination sample rate (AudioContext's).
   * @returns {Float32Array} The resampled audio buffer.
   */
  const resampleBuffer = (buffer, srcRate, dstRate) => {
    if (srcRate === dstRate) return buffer;
    const ratio = srcRate / dstRate;
    const newLength = Math.round(buffer.length / ratio);
    const resampled = new Float32Array(newLength);
    for (let i = 0; i < newLength; i++) {
      const srcIndex = i * ratio;
      const srcIndexFloor = Math.floor(srcIndex);
      const srcIndexCeil = Math.min(srcIndexFloor + 1, buffer.length - 1);
      const weight = srcIndex - srcIndexFloor;
      resampled[i] =
        buffer[srcIndexFloor] * (1 - weight) + buffer[srcIndexCeil] * weight;
    }
    return resampled;
  };

  /**
   * Converts raw 16-bit PCM byte data to a normalized Float32Array.
   * @param {Uint8Array} pcmBytes - The raw PCM audio data.
   * @returns {Float32Array} The converted and normalized audio data.
   */
  const convertPcmToFloat32 = (pcmBytes) => {
    const int16Array = new Int16Array(
      pcmBytes.buffer,
      pcmBytes.byteOffset,
      pcmBytes.byteLength / 2
    );
    const float32Array = new Float32Array(int16Array.length);
    for (let i = 0; i < int16Array.length; i++) {
      float32Array[i] = int16Array[i] / 32768;
    }
    return float32Array;
  };

  /**
   * Plays a stream of audio chunks using the Web Audio API.
   * @param {string} text - The text to be synthesized by the TTS service.
   */
  const playStreamingAudio = async (text) => {
    setIsBusy(true);
    setStatus("Streaming audio...");
    stopAllPlayingAudio(); // Stop any previous playback
    try {
      if (!audioContextRef.current) {
        audioContextRef.current = new (window.AudioContext ||
          window.webkitAudioContext)();
        playbackTimeRef.current = audioContextRef.current.currentTime;
      }
      const audioContext = audioContextRef.current;
      const url = `${TTS_ENDPOINT}?stream=true&as_wav=false`;
      const response = await fetch(url, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ text }),
      });
      if (!response.ok) {
        throw new Error(`HTTP error! Status: ${response.status}`);
      }
      const reader = response.body.getReader();
      let leftover = new Uint8Array(0);
      while (true) {
        const { done, value: chunk } = await reader.read();
        if (done) {
          if (leftover.length > 0) {
            console.warn("Leftover bytes discarded:", leftover.length);
          }
          console.log("TTS Stream complete.");
          break;
        }
        let combined = new Uint8Array(leftover.length + chunk.length);
        combined.set(leftover);
        combined.set(chunk, leftover.length);
        let length = combined.length;
        if (length % 2 !== 0) {
          length -= 1;
        }
        const toConvert = combined.slice(0, length);
        leftover = combined.slice(length);
        const float32Raw = convertPcmToFloat32(toConvert);
        const float32Resampled = resampleBuffer(
          float32Raw,
          24000,
          audioContext.sampleRate
        );
        const audioBuffer = audioContext.createBuffer(
          1,
          float32Resampled.length,
          audioContext.sampleRate
        );
        audioBuffer.copyToChannel(float32Resampled, 0);
        const source = audioContext.createBufferSource();
        source.buffer = audioBuffer;
        source.connect(audioContext.destination);
        const currentTime = audioContext.currentTime;
        const startTime = Math.max(playbackTimeRef.current, currentTime);
        source.start(startTime);
        playbackTimeRef.current = startTime + audioBuffer.duration;
        playingSourcesRef.current.push(source);
        source.onended = () => {
          playingSourcesRef.current = playingSourcesRef.current.filter(
            (s) => s !== source
          );
        };
      }
    } catch (err) {
      console.error("Failed to stream speech:", err);
      setStatus(`Error: Failed to stream speech. ${err.message}`);
      setErrorMessage(`Failed to stream speech: ${err.message}`);
      setShowErrorModal(true);
    } finally {
      setIsBusy(false);
      // This is the ideal place to update the last request time
      lastRequestTimeRef.current = Date.now();
      if (isAutoMode && isAutoListening) {
        setStatus("Listening for voice...");
      } else if (!isAutoMode) {
        setStatus("Click the microphone to start recording.");
      } else {
        // This case is when auto mode is on, but we stopped listening.
        setStatus("Click to start listening.");
      }
    }
  };

  /**
   * Stops all currently playing TTS audio sources immediately.
   */
  const stopAllPlayingAudio = () => {
    if (audioContextRef.current) {
      playingSourcesRef.current.forEach((source) => {
        try {
          source.stop();
        } catch (e) {
          // Ignore errors from stopping already stopped sources
        }
      });
      playingSourcesRef.current = [];
      playbackTimeRef.current = audioContextRef.current.currentTime;
      console.log("All playing audio has been stopped.");
    }
  };

  /**
   * Starts the audio recording process for manual mode.
   */
  const startManualRecording = async () => {
    if (isRecording) {
      console.log("Already recording, ignoring start request.");
      return;
    }
    try {
      if (!sessionId) {
        setErrorMessage("Please wait for the chat session to be initialized.");
        setShowErrorModal(true);
        return;
      }
      console.log("Starting manual recording...");
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      streamRef.current = stream; // Store the stream to stop it later
      mediaRecorderRef.current = new MediaRecorder(stream);
      audioChunksRef.current = [];
      mediaRecorderRef.current.ondataavailable = (event) => {
        audioChunksRef.current.push(event.data);
      };
      mediaRecorderRef.current.onstop = async () => {
        // Ensure the stream is stopped after recording has finished.
        if (streamRef.current) {
          streamRef.current.getTracks().forEach((track) => track.stop());
          streamRef.current = null;
        }
        // Clear the media recorder ref to allow a new recording to be created.
        mediaRecorderRef.current = null;
        const audioBlob = new Blob(audioChunksRef.current, {
          type: "audio/wav",
        });
        await processConversation(audioBlob);
      };
      mediaRecorderRef.current.start();
      setIsRecording(true);
      isRecordingRef.current = true;
      setStatus("Recording... Click to stop.");
    } catch (err) {
      console.error("Error accessing microphone:", err);
      setStatus("Error: Cannot access microphone.");
      setErrorMessage(
        "Microphone access has been denied. Please enable it in your browser's site settings to continue."
      );
      setShowErrorModal(true);
    }
  };

  /**
   * Stops manual recording and finalizes the audio blob.
   */
  const stopManualRecording = () => {
    if (mediaRecorderRef.current && mediaRecorderRef.current.state === "recording") {
      console.log("Stopping manual recording.");
      // Set isBusy immediately to prevent multiple stop requests.
      setIsBusy(true);
      setIsRecording(false); // Update UI immediately
      mediaRecorderRef.current.stop();
      // The recorder's onstop handler will build the audio blob and hand it to processConversation.
    }
  };

  /**
   * Starts the voice activity detection process.
   */
  const startAutoListening = async () => {
    try {
      if (!sessionId) {
        setErrorMessage("Please wait for the chat session to be initialized.");
        setShowErrorModal(true);
        return;
      }
      console.log("Starting automatic listening (VAD).");
      // Request microphone access for VAD.
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      vadStreamRef.current = stream;
      const audioContext = new (window.AudioContext || window.webkitAudioContext)();
      const source = audioContext.createMediaStreamSource(stream);
      // Use a large buffer size (4096 samples) so onaudioprocess fires less frequently
      const bufferSize = 4096;
      const scriptProcessor = audioContext.createScriptProcessor(bufferSize, 1, 1);
      scriptProcessorRef.current = scriptProcessor;
      source.connect(scriptProcessor);
      scriptProcessor.connect(audioContext.destination);

      scriptProcessor.onaudioprocess = (event) => {
        const inputBuffer = event.inputBuffer.getChannelData(0);
        let sum = 0.0;
        for (let i = 0; i < inputBuffer.length; i++) {
          sum += inputBuffer[i] * inputBuffer[i];
        }
        const volume = Math.sqrt(sum / inputBuffer.length);
        console.log(`Current audio volume: ${volume.toFixed(2)}`);

        const isVoiceDetected = volume > VAD_THRESHOLD;
        if (isVoiceDetected) {
          // Voice is detected, so clear the silence timeout
          if (silenceTimeoutRef.current) {
            clearTimeout(silenceTimeoutRef.current);
            silenceTimeoutRef.current = null;
          }
          // --- NEW LOGIC: Check for cooldown before starting recording ---
          const timeSinceLastRequest = Date.now() - lastRequestTimeRef.current;
          const isCooldownPassed = timeSinceLastRequest > AUTO_MODE_COOLDOWN_MS;
          // Start recording only if it's not already running AND the cooldown has passed
          if (!isRecordingRef.current && isCooldownPassed) {
            startAutoRecording(stream);
          }
        } else if (isRecordingRef.current) {
          // We are currently recording, but silence is detected.
          // If a silence timeout is not already running, start one.
          if (!silenceTimeoutRef.current) {
            console.log("Silence detected. Starting timeout.");
            silenceTimeoutRef.current = setTimeout(() => {
              // If the timeout expires, stop the recording and process the audio
              stopAutoRecording();
            }, VAD_SILENCE_DURATION);
          }
        }
      };

      setIsAutoListening(true);
      setStatus("Listening for voice...");
    } catch (err) {
      console.error("Error accessing microphone for VAD:", err);
      setStatus("Error: Cannot access microphone.");
      setErrorMessage(
        "Microphone access has been denied. Please enable it in your browser's site settings to continue."
      );
      setShowErrorModal(true);
    }
  };
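
  // A note on the VAD check above: onaudioprocess computes the root-mean-square
  // (RMS) amplitude of each 4096-sample buffer and compares it to VAD_THRESHOLD.
  // For a rough sense of scale, a buffer at a steady amplitude of 0.02 has an RMS
  // of sqrt((4096 * 0.02^2) / 4096) = 0.02, which clears the 0.01 threshold and
  // counts as voice, while background hiss around 0.001 stays well below it.
  // ScriptProcessorNode is deprecated in the Web Audio API; an AudioWorkletNode
  // would be the forward-compatible place to run the same per-buffer RMS check,
  // though the detection logic itself would not change.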

  /**
   * Stops the VAD process and cleans up resources.
   */
  const stopAutoListening = () => {
    setIsAutoListening(false);
    // We only stop all streams when the user explicitly stops auto mode.
    stopAllStreams();
    setStatus("Click to start listening.");
    console.log("Stopping automatic listening.");
  };

  /**
   * Starts the MediaRecorder for automatic mode.
   */
  const startAutoRecording = (stream) => {
    // Prevent starting a new recorder if one is already running
    if (mediaRecorderRef.current?.state === "recording") {
      console.log("Auto recording is already running, ignoring start request.");
      return;
    }
    console.log("Starting a new MediaRecorder for automatic mode.");
    mediaRecorderRef.current = new MediaRecorder(stream);
    audioChunksRef.current = [];
    mediaRecorderRef.current.ondataavailable = (event) => {
      audioChunksRef.current.push(event.data);
    };
    mediaRecorderRef.current.onstop = async () => {
      // This flag is essential to prevent the VAD loop from re-triggering a recording
      // before the current one is fully processed.
      isRecordingRef.current = false;
      setIsRecording(false);
      if (audioChunksRef.current.length > 0) {
        // Set isBusy to prevent the user from starting a new request while processing
        setIsBusy(true);
        setStatus("Transcribing audio...");
        console.log("Auto recording stopped. Processing conversation...");
        const audioBlob = new Blob(audioChunksRef.current, {
          type: "audio/wav",
        });
        await processConversation(audioBlob);
      } else {
        console.warn("Auto recording stopped but no audio chunks were collected.");
        setIsBusy(false);
        setStatus("Listening for voice...");
      }
    };
    mediaRecorderRef.current.start();
    isRecordingRef.current = true;
    setIsRecording(true);
    setStatus("Recording...");
    console.log("MediaRecorder started for automatic mode.");
  };

  /**
   * Stops the MediaRecorder for automatic mode.
   */
  const stopAutoRecording = () => {
    if (mediaRecorderRef.current && mediaRecorderRef.current.state === "recording") {
      console.log("Stopping MediaRecorder for automatic mode.");
      mediaRecorderRef.current.stop();
    }
  };

  /**
   * Handles the complete conversation flow: STT -> LLM -> TTS.
   * @param {Blob} audioBlob - The recorded audio from the user.
   */
  const processConversation = async (audioBlob) => {
    console.log("Processing conversation...");
    try {
      // Step 1: Filter out audio that is too short to be meaningful.
      // The blob produced by MediaRecorder is compressed (typically WebM/Opus), so its
      // exact duration cannot be derived from its size alone. As a cheap heuristic we
      // estimate the duration as if it were raw 16-bit mono PCM at 48 kHz; a more
      // robust approach would decode the audio or inspect its container header.
      const audioDuration = (audioBlob.size / (48000 * 2 * 1)) * 1000; // Assuming 48kHz, 16-bit, mono
      if (audioDuration < MINIMUM_AUDIO_DURATION_MS) {
        console.log(`Audio is too short (${audioDuration.toFixed(2)}ms), skipping.`);
        setStatus("Audio was too short. Please speak a little longer.");
        // Update the last request time here so the cooldown still applies.
        lastRequestTimeRef.current = Date.now();
        return; // Exit the function early
      }

      setStatus("Transcribing audio...");
      if (audioBlob.size === 0) {
        console.warn("Audio blob is empty, skipping STT API call.");
        setStatus("Recording stopped, but no audio was captured. Please try again.");
        // Update the last request time here so the cooldown still applies.
        lastRequestTimeRef.current = Date.now();
        return;
      }
      const formData = new FormData();
      formData.append("audio_file", audioBlob, "audio.wav");
      const sttResponse = await fetch(STT_ENDPOINT, {
        method: "POST",
        body: formData,
      });
      if (!sttResponse.ok) {
        console.error(`STT API failed with status: ${sttResponse.status}`);
        throw new Error("STT API failed");
      }
      const sttResult = await sttResponse.json();
      const userText = sttResult.transcript;
      console.log(`STT transcript received: "${userText}"`);
      addMessage(userText, true);

      // Step 2: Text-to-Text (LLM)
      setStatus("AI is thinking...");
      const llmResponse = await fetch(SESSIONS_CHAT_ENDPOINT(sessionId), {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ prompt: userText, model: "gemini" }),
      });
      if (!llmResponse.ok) {
        console.error(`LLM API failed with status: ${llmResponse.status}`);
        throw new Error("LLM API failed");
      }
      const llmResult = await llmResponse.json();
      const aiText = llmResult.answer;
      console.log(`LLM response received: "${aiText}"`);
      addMessage(aiText, false);

      // Step 3: Text-to-Speech (TTS)
      await playStreamingAudio(aiText);
    } catch (error) {
      console.error("Conversation processing failed:", error);
      setStatus(`Error: ${error.message}`);
      setErrorMessage(`An error occurred: ${error.message}`);
      setShowErrorModal(true);
    } finally {
      setIsBusy(false);
      // This is the ideal place to update the last request time
      lastRequestTimeRef.current = Date.now();
      if (!isAutoMode) {
        setStatus("Click the microphone to start recording.");
        // Ensure refs are nullified after processing for a clean state
        mediaRecorderRef.current = null;
        streamRef.current = null;
      } else if (isAutoMode && isAutoListening) {
        setStatus("Listening for voice...");
      } else {
        // This case is when auto mode is on, but we stopped listening.
        setStatus("Click to start listening.");
      }
    }
  };

  /**
   * Toggles the recording state based on the current mode.
   */
  const handleMicClick = () => {
    stopAllPlayingAudio(); // Interrupt any ongoing TTS playback immediately
    // Prevent new actions if the app is busy processing a request
    if (isBusy) {
      console.log("App is busy, ignoring mic click.");
      return;
    }
    if (isAutoMode) {
      if (isAutoListening) {
        stopAutoListening();
      } else {
        startAutoListening();
      }
    } else {
      // Manual Mode
      if (isRecording) {
        stopManualRecording();
      } else {
        startManualRecording();
      }
    }
  };

  // Determine the icon and status text based on the current state
  const microphoneButtonState = isAutoMode
    ? isAutoListening || isRecording
    : isRecording;
  const micButtonColorClass = isRecording
    ? "bg-red-600 hover:bg-red-700 active:bg-red-800"
    : "bg-indigo-600 hover:bg-indigo-700 active:bg-indigo-800";

  return (
    <div className="flex items-center justify-center min-h-screen bg-gray-100 dark:bg-gray-900 text-gray-900 dark:text-gray-100 font-inter p-4">
      <div className="w-full max-w-2xl bg-white dark:bg-gray-800 rounded-2xl shadow-xl overflow-hidden flex flex-col h-[90vh]">
        {/* Header */}
        <div className="p-6 bg-gray-50 dark:bg-gray-700 border-b border-gray-200 dark:border-gray-600">
          <h1 className="text-2xl font-bold text-center">Voice Chat Assistant</h1>
          <p className="mt-2 text-center text-sm text-gray-500 dark:text-gray-400">
            Ask me anything and I'll respond!
          </p>
        </div>

        {/* Chat History */}
        <div
          ref={chatContainerRef}
          className="flex-grow p-6 space-y-4 overflow-y-auto"
        >
          {chatHistory.map((message, index) => (
            <div
              key={index}
              className={`flex ${message.isUser ? "justify-end" : "justify-start"}`}
            >
              <div
                className={`max-w-[75%] px-4 py-2 rounded-2xl shadow-md ${
                  message.isUser
                    ? "bg-indigo-500 text-white rounded-br-none"
                    : "bg-gray-200 dark:bg-gray-700 text-gray-900 dark:text-gray-100 rounded-bl-none"
                }`}
              >
                {message.text}
              </div>
            </div>
          ))}
        </div>

        {/* Status and Controls */}
        <div className="p-6 bg-gray-50 dark:bg-gray-700 border-t border-gray-200 dark:border-gray-600">
          <div className="text-center text-sm font-medium text-gray-600 dark:text-gray-400 h-5">
            {status}
          </div>
          <div className="flex items-center justify-center mt-4">
            {/* Microphone Button */}
            <button
              onClick={handleMicClick}
              disabled={isBusy && !isRecording} // Prevent interaction when busy, unless we are already recording (for manual stop)
              className={`flex items-center justify-center w-20 h-20 rounded-full transition-all duration-300 transform ${micButtonColorClass} text-white shadow-lg focus:outline-none focus:ring-4 focus:ring-offset-2 focus:ring-indigo-500 disabled:opacity-50 disabled:cursor-not-allowed`}
            >
              {isRecording ? (
                <svg
                  xmlns="http://www.w3.org/2000/svg"
                  className="h-10 w-10 animate-pulse"
                  viewBox="0 0 20 20"
                  fill="currentColor"
                >
                  <path
                    fillRule="evenodd"
                    d="M10 18a8 8 0 100-16 8 8 0 000 16zM8 7a1 1 0 011-1h2a1 1 0 110 2H9a1 1 0 01-1-1zm1 4a1 1 0 100 2h2a1 1 0 100-2H9z"
                    clipRule="evenodd"
                  />
                </svg>
              ) : (
                <svg
                  xmlns="http://www.w3.org/2000/svg"
                  className="h-10 w-10"
                  viewBox="0 0 20 20"
                  fill="currentColor"
                >
                  <path
                    fillRule="evenodd"
                    d="M7 4a3 3 0 016 0v4a3 3 0 11-6 0V4zm4 10.93A7.001 7.001 0 0017 8a1 1 0 10-2 0A5 5 0 015 8a1 1 0 00-2 0 7.001 7.001 0 006 6.93V17H6a1 1 0 100 2h8a1 1 0 100-2h-3v-2.07z"
                    clipRule="evenodd"
                  />
                </svg>
              )}
            </button>
          </div>

          {/* Mode Toggle */}
          <div className="mt-4 flex justify-center">
            <label className="inline-flex relative items-center cursor-pointer">
              <input
                type="checkbox"
                value=""
                className="sr-only peer"
                checked={isAutoMode}
                onChange={() => {
                  stopAllStreams();
                  setIsAutoMode(!isAutoMode);
                  setStatus(
                    !isAutoMode
                      ? "Click to start listening."
                      : "Click the microphone to start recording."
                  );
                }}
                disabled={isBusy}
              />
              <div className="w-11 h-6 bg-gray-200 peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-indigo-300 dark:peer-focus:ring-indigo-800 rounded-full peer dark:bg-gray-600 peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all dark:border-gray-500 peer-checked:bg-indigo-600"></div>
              <span className="ml-3 text-sm font-medium text-gray-900 dark:text-gray-300">
                {isAutoMode ? "Auto Mode" : "Manual Mode"}
              </span>
            </label>
          </div>
        </div>
      </div>

      {/* Error Modal */}
      {showErrorModal && (
        <div className="fixed inset-0 z-50 flex items-center justify-center p-4 bg-gray-900 bg-opacity-50">
          <div className="bg-white dark:bg-gray-800 p-6 rounded-xl shadow-2xl max-w-sm w-full">
            <h3 className="text-xl font-bold text-red-600">Error</h3>
            <p className="mt-4 text-gray-700 dark:text-gray-300">{errorMessage}</p>
            <button
              onClick={() => setShowErrorModal(false)}
              className="mt-6 w-full py-3 px-4 bg-indigo-600 text-white rounded-lg font-semibold hover:bg-indigo-700 transition-colors"
            >
              Close
            </button>
          </div>
        </div>
      )}
    </div>
  );
};

export default App;