// src/hooks/useVoiceChat.js
// This file is a custom React hook that contains all the stateful logic
// and side effects for the voice chat application.
import { useState, useRef, useEffect, useCallback } from "react";
import {
createSession,
transcribeAudio,
chatWithAI,
streamSpeech,
getSessionMessages,
getSessionTokenStatus,
getUserConfig,
getSession,
uploadMessageAudio,
fetchMessageAudio
} from "../services/apiService";
import { getSessionId } from "../services/websocket";
import {
stopAllPlayingAudio,
stopAllMediaStreams,
resampleBuffer,
encodeWAV,
} from "../services/audioUtils";
// Constants for Voice Activity Detection and timing
// RMS volume threshold above which an audio frame counts as "voice" in auto mode.
const VAD_THRESHOLD = 0.01;
// How long (ms) of continuous silence ends an auto-mode recording.
const VAD_SILENCE_DURATION = 2500;
// Estimated recordings shorter than this (ms) are discarded before calling STT.
const MINIMUM_AUDIO_DURATION_MS = 500;
// Minimum gap (ms) after a request before auto mode may start a new recording.
const AUTO_MODE_COOLDOWN_MS = 3000;
const useVoiceChat = ({ chatContainerRef }) => {
// --- Conversation / UI state ---
const [chatHistory, setChatHistory] = useState([]);
const [status, setStatus] = useState("Click the microphone to start recording.");
const [isBusy, setIsBusy] = useState(false);
const [isRecording, setIsRecording] = useState(false);
// --- Error modal state ---
const [showErrorModal, setShowErrorModal] = useState(false);
const [errorMessage, setErrorMessage] = useState("");
// --- Session and mode state ---
const [sessionId, setSessionId] = useState(null);
const [isAutoMode, setIsAutoMode] = useState(false);
const [isAutoListening, setIsAutoListening] = useState(false);
const [tokenUsage, setTokenUsage] = useState({ token_count: 0, token_limit: 100000, percentage: 0 });
// --- Provider configuration state ---
const [isConfigured, setIsConfigured] = useState(true);
const [missingConfigs, setMissingConfigs] = useState([]);
const [userConfigData, setUserConfigData] = useState(null);
const [localActivePrefs, setLocalActivePrefs] = useState({ llm: '', tts: '', stt: '' });
// All refs must be declared here, inside the custom hook.
const sessionIdRef = useRef(null); // mirrors sessionId for use inside async callbacks
const mediaRecorderRef = useRef(null); // active MediaRecorder, if any
const audioChunksRef = useRef([]); // raw Blob chunks of the current recording
const audioContextRef = useRef(null); // lazily-created shared AudioContext for playback
const playbackTimeRef = useRef(0); // next free scheduling time for gapless chunk playback
const isRecordingRef = useRef(false); // mirrors isRecording for the VAD callback
const isBusyRef = useRef(false); // mirrors isBusy for the VAD callback
const playingSourcesRef = useRef([]); // currently scheduled AudioBufferSourceNodes
const vadStreamRef = useRef(null); // mic stream used by auto-mode VAD
const scriptProcessorRef = useRef(null); // ScriptProcessorNode driving the VAD loop
const silenceTimeoutRef = useRef(null); // pending "stop on silence" timer id
const lastRequestTimeRef = useRef(0); // timestamp of the last completed request (cooldown)
const streamRef = useRef(null); // mic stream used for manual recording
const [isStreamingPlaying, setIsStreamingPlaying] = useState(false);
// Writes the busy flag to both React state (for rendering) and a ref
// (for synchronous checks inside audio callbacks).
const setBusy = (value) => {
  isBusyRef.current = value;
  setIsBusy(value);
};
/**
 * Halts all scheduled/playing streamed audio and clears the busy flags.
 */
const stopStreamingPlayback = useCallback(() => {
  stopAllPlayingAudio(playingSourcesRef, audioContextRef, playbackTimeRef);
  setBusy(false);
  setIsStreamingPlaying(false);
}, []);
/**
 * Refreshes the token-usage display for the current session (best-effort;
 * a failure only logs a warning).
 */
const fetchTokenUsage = useCallback(async () => {
  const activeSessionId = sessionIdRef.current;
  if (!activeSessionId) return;
  try {
    setTokenUsage(await getSessionTokenStatus(activeSessionId));
  } catch (fetchErr) {
    console.warn("Failed to fetch voice token usage", fetchErr);
  }
}, []);
// --- Initial Session Creation Effect ---
// Runs once on mount: resolves the user's provider config, obtains (or
// restores) a voice-chat session, loads message history plus persisted audio,
// fetches token usage, and validates that LLM/TTS/STT API keys exist.
useEffect(() => {
  const startSession = async () => {
    setBusy(true);
    setStatus("Loading chat session...");
    try {
      let configDataToUse = null;
      let providerToUse = "gemini"; // fallback LLM provider when config is unavailable
      try {
        configDataToUse = await getUserConfig();
        setUserConfigData(configDataToUse);
        if (configDataToUse.effective?.llm?.active_provider) {
          providerToUse = configDataToUse.effective.llm.active_provider;
        }
      } catch (e) {
        console.warn("Could not load user config for Voice", e);
      }
      const currentSessionId = await getSessionId("voice_chat", providerToUse);
      setSessionId(currentSessionId);
      sessionIdRef.current = currentSessionId;
      // Try to load chat history (best-effort; failure is non-fatal).
      try {
        const messagesData = await getSessionMessages(currentSessionId);
        if (messagesData && messagesData.messages && messagesData.messages.length > 0) {
          // Fetch any persisted audio for each message in parallel.
          const formattedHistoryPromises = messagesData.messages.map(async (msg) => {
            let audioBlob = null;
            if (msg.has_audio) {
              try {
                audioBlob = await fetchMessageAudio(msg.id);
              } catch (e) {
                console.warn(`Failed to fetch audio for message ${msg.id}`, e);
              }
            }
            return {
              id: msg.id,
              isUser: msg.sender === "user",
              text: msg.content,
              timestamp: msg.created_at,
              isFromHistory: true,
              audioBlob: audioBlob
            };
          });
          const formattedHistory = await Promise.all(formattedHistoryPromises);
          setChatHistory(formattedHistory);
        }
      } catch (historyErr) {
        console.warn("Failed to load voice chat history", historyErr);
      }
      // Load initial token usage for this session.
      await fetchTokenUsage();
      // Check if configuration is fully populated.
      try {
        const eff = configDataToUse?.effective || {};
        // Prefer the provider stored on the session itself over the global config.
        let sessionLlmProvider = providerToUse;
        try {
          const currentSession = await getSession(currentSessionId);
          if (currentSession && currentSession.provider_name) {
            sessionLlmProvider = currentSession.provider_name;
          }
        } catch (sessionErr) {
          console.warn("Could not fetch session details", sessionErr);
        }
        setLocalActivePrefs({
          llm: sessionLlmProvider,
          tts: eff.tts?.active_provider || '',
          stt: eff.stt?.active_provider || ''
        });
        // A service is usable if at least one of its providers has a real API key.
        const missing = [];
        const llmProviders = eff.llm?.providers || {};
        const hasLLMKey = Object.values(llmProviders).some(p => p.api_key && p.api_key !== 'None');
        if (!hasLLMKey) missing.push("Language Model (LLM) API Key");
        const ttsProviders = eff.tts?.providers || {};
        const hasTTSKey = Object.values(ttsProviders).some(p => p.api_key && p.api_key !== 'None');
        if (!hasTTSKey) missing.push("Text-to-Speech (TTS) API Key");
        const sttProviders = eff.stt?.providers || {};
        const hasSTTKey = Object.values(sttProviders).some(p => p.api_key && p.api_key !== 'None');
        if (!hasSTTKey) missing.push("Speech-to-Text (STT) API Key");
        if (missing.length > 0) {
          setIsConfigured(false);
          setMissingConfigs(missing);
          setStatus(`Feature not usable. Missing: ${missing.join(', ')}`);
        } else {
          setIsConfigured(true);
          setMissingConfigs([]);
          setStatus("Click the microphone to start recording.");
        }
      } catch (configErr) {
        console.warn("Failed to load user config", configErr);
        setIsConfigured(false);
        setMissingConfigs(["Failed to load complete configuration"]);
      }
      console.log(`Voice Session loaded with ID: ${currentSessionId}`);
    } catch (err) {
      console.error("Error creating session:", err);
      setStatus(`Error: Could not start session. ${err.message}`);
      setErrorMessage(`Failed to create a chat session: ${err.message}`);
      setShowErrorModal(true);
    } finally {
      // BUG FIX: this was setIsBusy(false), which left isBusyRef.current === true
      // after the setBusy(true) above — permanently blocking auto-mode VAD
      // (which gates on !isBusyRef.current). setBusy clears state AND ref.
      setBusy(false);
    }
  };
  startSession();
  return () => {
    // Cleanup on unmount: stop mic/VAD streams and any queued playback.
    stopAllMediaStreams(vadStreamRef, mediaRecorderRef, scriptProcessorRef, streamRef);
    stopStreamingPlayback();
  };
}, [stopStreamingPlayback]);
// Keep the chat view pinned to the newest message whenever history changes.
useEffect(() => {
  const container = chatContainerRef.current;
  if (container) {
    container.scrollTop = container.scrollHeight;
  }
}, [chatHistory, chatContainerRef]);
// Appends one message entry to the chat history, timestamped at call time.
const addMessage = (text, isUser, id = null) => {
  setChatHistory((prevHistory) => [
    ...prevHistory,
    { text, isUser, id, timestamp: new Date().toISOString() },
  ]);
};
/**
 * Streams TTS audio for `text` chunk-by-chunk via the stateless streamSpeech
 * API, scheduling each decoded chunk back-to-back through the Web Audio API
 * for gapless playback, then persists the concatenated result as a WAV blob
 * on the matching chat message.
 * @param {string} text - The text to be synthesized by the TTS service.
 * @param {?number} messageId - The ID of the message to associate the audio with.
 */
const playStreamingAudio = async (text, messageId = null) => {
  setBusy(true);
  setIsStreamingPlaying(true);
  setStatus("Streaming audio...");
  // Stop any existing playback first.
  stopStreamingPlayback();
  setBusy(true); // stopStreamingPlayback sets it to false, we want it true during this process
  // Raw Float32 chunks collected for persistence once the stream ends.
  const accumulatedChunks = [];
  try {
    if (!audioContextRef.current) {
      audioContextRef.current = new (window.AudioContext || window.webkitAudioContext)();
      playbackTimeRef.current = audioContextRef.current.currentTime;
    }
    const audioContext = audioContextRef.current;
    const onChunkReceived = (rawFloat32Data, totalChunks, currentChunkIndex) => {
      // Collect for storage.
      accumulatedChunks.push(new Float32Array(rawFloat32Data));
      // Update UI progress on the newest AI message without audio yet.
      if (totalChunks > 0) {
        const progress = Math.min(Math.round((currentChunkIndex / totalChunks) * 100), 100);
        setChatHistory(prev => {
          const next = [...prev];
          for (let i = next.length - 1; i >= 0; i--) {
            if (!next[i].isUser && !next[i].audioBlob) {
              // FIX: replace the entry immutably instead of mutating the
              // object shared with the previous state snapshot.
              next[i] = { ...next[i], audioProgress: progress };
              break;
            }
          }
          return next;
        });
      }
      // TTS output arrives at 24 kHz; resample to the context's native rate.
      const float32Resampled = resampleBuffer(
        rawFloat32Data,
        24000,
        audioContext.sampleRate
      );
      const audioBuffer = audioContext.createBuffer(
        1,
        float32Resampled.length,
        audioContext.sampleRate
      );
      audioBuffer.copyToChannel(float32Resampled, 0);
      // Apply a very short fade-in and fade-out (2ms) to eliminate "clicks" at segment boundaries
      const fadeSamps = Math.floor(audioContext.sampleRate * 0.002);
      const chanData = audioBuffer.getChannelData(0);
      if (chanData.length > fadeSamps * 2) {
        for (let i = 0; i < fadeSamps; i++) {
          chanData[i] *= (i / fadeSamps);
          chanData[chanData.length - 1 - i] *= (i / fadeSamps);
        }
      }
      const source = audioContext.createBufferSource();
      source.buffer = audioBuffer;
      source.connect(audioContext.destination);
      // Schedule this chunk immediately after the previously queued one.
      const currentTime = audioContext.currentTime;
      const startTime = Math.max(playbackTimeRef.current, currentTime);
      source.start(startTime);
      playbackTimeRef.current = startTime + audioBuffer.duration;
      playingSourcesRef.current.push(source);
      source.onended = () => {
        playingSourcesRef.current = playingSourcesRef.current.filter(
          (s) => s !== source
        );
      };
    };
    const onStreamDone = async () => {
      console.log("TTS Stream complete.");
      setIsStreamingPlaying(false);
      if (accumulatedChunks.length > 0) {
        // Concatenate all chunks into one contiguous PCM buffer.
        const totalLen = accumulatedChunks.reduce((acc, c) => acc + c.length, 0);
        const result = new Float32Array(totalLen);
        let offset = 0;
        for (const c of accumulatedChunks) {
          result.set(c, offset);
          offset += c.length;
        }
        // resample to standard 44.1k for download/blob stability
        const finalPcm = resampleBuffer(result, 24000, 44100);
        const wavBlob = encodeWAV(finalPcm, 44100);
        // Upload to persistent storage if messageId is available.
        if (messageId) {
          try {
            console.log(`Uploading audio for message ${messageId}...`);
            await uploadMessageAudio(messageId, wavBlob);
          } catch (uploadErr) {
            console.warn("Failed to upload persistent audio", uploadErr);
          }
        }
        // Post-update: attach the blob to the latest AI message without audio.
        setChatHistory(prev => {
          const next = [...prev];
          for (let i = next.length - 1; i >= 0; i--) {
            if (!next[i].isUser && !next[i].audioBlob) {
              // FIX: immutable replacement rather than in-place mutation.
              const updated = { ...next[i], audioBlob: wavBlob };
              if (messageId) updated.id = messageId;
              next[i] = updated;
              break;
            }
          }
          return next;
        });
      }
    };
    await streamSpeech(text, onChunkReceived, onStreamDone, localActivePrefs.tts);
  } catch (err) {
    console.error("Failed to stream speech:", err);
    setIsStreamingPlaying(false);
    setStatus(`Error: Failed to stream speech. ${err.message}`);
    setErrorMessage(`Failed to stream speech: ${err.message}`);
    setShowErrorModal(true);
  } finally {
    setBusy(false);
    lastRequestTimeRef.current = Date.now();
    // NOTE(review): isAutoMode/isAutoListening are read from the render-time
    // closure and may be stale by the time the stream finishes — confirm.
    if (isAutoMode && isAutoListening) {
      setStatus("Listening for voice...");
    } else if (!isAutoMode) {
      setStatus("Click the microphone to start recording.");
    } else {
      setStatus("Click to start listening.");
    }
  }
};
/**
 * Manually (re-)synthesizes TTS audio for an existing message (including
 * history items), streaming playback and persisting the resulting WAV blob.
 * @param {number} index - Index of the message in chatHistory.
 * @param {string} text - The message text to synthesize.
 */
const synthesizeMessageAudio = async (index, text) => {
  if (isBusy) return;
  const accumulatedChunks = [];
  if (chatHistory[index]?.audioBlob) return; // already has audio
  // FIX: use setBusy (state + ref) instead of setIsBusy so the VAD loop's
  // isBusyRef check stays accurate during manual synthesis.
  setBusy(true);
  try {
    if (!audioContextRef.current) {
      audioContextRef.current = new (window.AudioContext || window.webkitAudioContext)();
      playbackTimeRef.current = audioContextRef.current.currentTime;
    }
    const audioContext = audioContextRef.current;
    const onData = (rawFloat32Data, total, current) => {
      accumulatedChunks.push(new Float32Array(rawFloat32Data));
      if (total > 0) {
        const progress = Math.min(Math.round((current / total) * 100), 100);
        setChatHistory(prev => {
          const next = [...prev];
          // FIX: immutable replacement instead of mutating the prev-state object.
          if (next[index]) next[index] = { ...next[index], audioProgress: progress };
          return next;
        });
      }
      // 24 kHz TTS output resampled to the context's native rate and queued
      // immediately after the previous chunk for gapless playback.
      const float32Resampled = resampleBuffer(rawFloat32Data, 24000, audioContext.sampleRate);
      const audioBuffer = audioContext.createBuffer(1, float32Resampled.length, audioContext.sampleRate);
      audioBuffer.copyToChannel(float32Resampled, 0);
      const source = audioContext.createBufferSource();
      source.buffer = audioBuffer;
      source.connect(audioContext.destination);
      const startTime = Math.max(playbackTimeRef.current, audioContext.currentTime);
      source.start(startTime);
      playbackTimeRef.current = startTime + audioBuffer.duration;
      playingSourcesRef.current.push(source);
      // CONSISTENCY: drop finished sources like playStreamingAudio does, so
      // playingSourcesRef does not grow without bound.
      source.onended = () => {
        playingSourcesRef.current = playingSourcesRef.current.filter((s) => s !== source);
      };
    };
    const onDone = async () => {
      if (accumulatedChunks.length === 0) return;
      // Concatenate all chunks into one contiguous PCM buffer.
      const totalLen = accumulatedChunks.reduce((acc, c) => acc + c.length, 0);
      const result = new Float32Array(totalLen);
      let offset = 0;
      for (const c of accumulatedChunks) {
        result.set(c, offset);
        offset += c.length;
      }
      // Resample to 44.1 kHz for a stable, downloadable WAV blob.
      const finalPcm = resampleBuffer(result, 24000, 44100);
      const wavBlob = encodeWAV(finalPcm, 44100);
      // NOTE(review): chatHistory is read from the render-time closure; the id
      // could be stale if history changed during synthesis — confirm acceptable.
      const messageId = chatHistory[index]?.id;
      if (messageId) {
        try {
          await uploadMessageAudio(messageId, wavBlob);
        } catch (e) { console.warn("Upload failed during manual synthesis", e); }
      }
      setChatHistory(prev => {
        const next = [...prev];
        // FIX: immutable replacement instead of in-place mutation.
        if (next[index]) next[index] = { ...next[index], audioBlob: wavBlob };
        return next;
      });
    };
    await streamSpeech(text, onData, onDone, localActivePrefs.tts);
  } catch (err) {
    console.error("Manual synthesis failed", err);
  } finally {
    setBusy(false);
  }
};
/**
 * Full round-trip for one utterance: validates the recorded blob, transcribes
 * it (STT), sends the text to the LLM, and streams back the spoken answer.
 * @param {Blob} audioBlob - The recorded audio captured by MediaRecorder.
 */
const processConversation = async (audioBlob) => {
  setBusy(true);
  console.log("Processing conversation...");
  try {
    // Rough duration estimate assuming 48 kHz, 16-bit, mono PCM.
    // NOTE(review): MediaRecorder blobs are usually compressed (e.g. webm/opus),
    // so this likely misestimates real duration — confirm against the
    // recorder's actual mimeType before relying on it.
    const audioDuration = audioBlob.size / (48000 * 2 * 1) * 1000;
    if (audioDuration < MINIMUM_AUDIO_DURATION_MS) {
      console.log(`Audio too short (${audioDuration.toFixed(2)}ms), skipping.`);
      setStatus("Audio was too short. Please speak a little longer.");
      lastRequestTimeRef.current = Date.now();
      return;
    }
    if (audioBlob.size === 0) {
      console.warn("Audio blob is empty, skipping STT API call.");
      setStatus("Recording stopped, but no audio was captured. Please try again.");
      lastRequestTimeRef.current = Date.now();
      return;
    }
    setStatus("Transcribing audio...");
    const userText = await transcribeAudio(audioBlob, localActivePrefs.stt);
    addMessage(userText, true);
    setStatus("AI is thinking...");
    // FIX: use the ref rather than the closure-captured sessionId state so a
    // session switch during recording cannot target a stale session id.
    const aiResponse = await chatWithAI(sessionIdRef.current, userText, localActivePrefs.llm || "gemini");
    addMessage(aiResponse.answer, false, aiResponse.message_id);
    // Fire-and-forget; token display updates when it resolves.
    fetchTokenUsage();
    await playStreamingAudio(aiResponse.answer, aiResponse.message_id);
  } catch (error) {
    console.error("Conversation processing failed:", error);
    setStatus(`Error: ${error.message}`);
    setErrorMessage(`An error occurred: ${error.message}`);
    setShowErrorModal(true);
  } finally {
    setBusy(false);
    lastRequestTimeRef.current = Date.now();
    // Only stop streams if not in auto-listening mode.
    if (!isAutoMode) {
      setStatus("Click the microphone to start recording.");
      stopAllMediaStreams(vadStreamRef, mediaRecorderRef, scriptProcessorRef, streamRef);
    } else if (isAutoMode && isAutoListening) {
      setStatus("Listening for voice...");
    } else {
      setStatus("Click to start listening.");
    }
  }
};
/**
 * Begins a manual (click-to-talk) recording on a fresh microphone stream.
 * Processing happens in the recorder's onstop handler.
 */
const startManualRecording = async () => {
  if (isRecording) return;
  if (!sessionId) {
    setErrorMessage("Please wait for the chat session to be initialized.");
    setShowErrorModal(true);
    return;
  }
  try {
    const micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
    streamRef.current = micStream;
    const recorder = new MediaRecorder(micStream);
    mediaRecorderRef.current = recorder;
    audioChunksRef.current = [];
    recorder.ondataavailable = (evt) => {
      audioChunksRef.current.push(evt.data);
    };
    recorder.onstop = async () => {
      // Release the microphone before processing the captured audio.
      if (streamRef.current) {
        streamRef.current.getTracks().forEach((track) => track.stop());
        streamRef.current = null;
      }
      mediaRecorderRef.current = null;
      const recordedBlob = new Blob(audioChunksRef.current, { type: "audio/wav" });
      await processConversation(recordedBlob);
    };
    recorder.start();
    setIsRecording(true);
    isRecordingRef.current = true;
    setStatus("Recording... Click to stop.");
  } catch (err) {
    console.error("Error accessing microphone:", err);
    setStatus("Error: Cannot access microphone.");
    setErrorMessage("Microphone access has been denied. Please enable it.");
    setShowErrorModal(true);
  }
};
/**
 * Ends a manual recording; the recorder's onstop handler takes over.
 */
const stopManualRecording = () => {
  const recorder = mediaRecorderRef.current;
  if (recorder?.state === "recording") {
    setBusy(true);
    setIsRecording(false);
    recorder.stop();
  }
};
/**
 * Enters auto (hands-free) mode: opens a microphone stream and runs a simple
 * RMS-based voice activity detector that starts and stops recordings.
 */
const startAutoListening = async () => {
try {
if (!sessionId) {
setErrorMessage("Please wait for the chat session to be initialized.");
setShowErrorModal(true);
return;
}
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
vadStreamRef.current = stream;
// NOTE(review): a new AudioContext is created on every activation and is not
// explicitly closed here; repeated toggling may leak contexts — confirm
// whether stopAllMediaStreams handles it.
const audioContext = new (window.AudioContext || window.webkitAudioContext)();
const source = audioContext.createMediaStreamSource(stream);
const bufferSize = 4096;
// ScriptProcessorNode is deprecated in favor of AudioWorklet, but is widely
// supported; each callback delivers one 4096-sample frame.
const scriptProcessor = audioContext.createScriptProcessor(bufferSize, 1, 1);
scriptProcessorRef.current = scriptProcessor;
source.connect(scriptProcessor);
scriptProcessor.connect(audioContext.destination);
scriptProcessor.onaudioprocess = (event) => {
// Compute the RMS volume of this audio frame.
const inputBuffer = event.inputBuffer.getChannelData(0);
let sum = 0.0;
for (let i = 0; i < inputBuffer.length; i++) {
sum += inputBuffer[i] * inputBuffer[i];
}
const volume = Math.sqrt(sum / inputBuffer.length);
const isVoiceDetected = volume > VAD_THRESHOLD;
// Cooldown prevents immediately re-triggering right after a request completes.
const timeSinceLastRequest = Date.now() - lastRequestTimeRef.current;
const isCooldownPassed = timeSinceLastRequest > AUTO_MODE_COOLDOWN_MS;
if (isVoiceDetected && !isBusyRef.current) {
// Voice present: cancel any pending stop-on-silence timer.
if (silenceTimeoutRef.current) {
clearTimeout(silenceTimeoutRef.current);
silenceTimeoutRef.current = null;
}
if (!isRecordingRef.current && isCooldownPassed) {
startAutoRecording(stream);
}
} else if (isRecordingRef.current) {
// Silence while recording: stop after VAD_SILENCE_DURATION ms of quiet.
if (!silenceTimeoutRef.current) {
silenceTimeoutRef.current = setTimeout(() => {
stopAutoRecording();
}, VAD_SILENCE_DURATION);
}
}
};
setIsAutoListening(true);
setStatus("Listening for voice...");
} catch (err) {
console.error("Error accessing microphone for VAD:", err);
setStatus("Error: Cannot access microphone.");
setErrorMessage("Microphone access has been denied. Please enable it.");
setShowErrorModal(true);
}
};
/**
 * Leaves auto (VAD) mode: tears down the mic/VAD streams and resets status.
 */
const stopAutoListening = () => {
  setIsAutoListening(false);
  // Pass the refs to the shared teardown utility.
  stopAllMediaStreams(vadStreamRef, mediaRecorderRef, scriptProcessorRef, streamRef);
  setStatus("Click to start listening.");
};
/**
 * Starts capturing a single auto-mode utterance on the shared VAD stream.
 * Invoked by the VAD loop once voice is detected.
 * @param {MediaStream} stream - The microphone stream opened by startAutoListening.
 */
const startAutoRecording = (stream) => {
  if (mediaRecorderRef.current?.state === "recording") return;
  mediaRecorderRef.current = new MediaRecorder(stream);
  audioChunksRef.current = [];
  mediaRecorderRef.current.ondataavailable = (event) => {
    audioChunksRef.current.push(event.data);
  };
  mediaRecorderRef.current.onstop = async () => {
    isRecordingRef.current = false;
    setIsRecording(false);
    if (audioChunksRef.current.length > 0) {
      // FIX: use setBusy (state + ref) instead of setIsBusy so the VAD loop's
      // isBusyRef check stays accurate while the utterance is processed.
      setBusy(true);
      setStatus("Transcribing audio...");
      const audioBlob = new Blob(audioChunksRef.current, { type: "audio/wav" });
      await processConversation(audioBlob);
    } else {
      setBusy(false);
      setStatus("Listening for voice...");
    }
  };
  mediaRecorderRef.current.start();
  isRecordingRef.current = true;
  setIsRecording(true);
  setStatus("Recording...");
};
/**
 * Stops the in-progress auto-mode recording; processing continues in onstop.
 */
const stopAutoRecording = () => {
  if (mediaRecorderRef.current?.state === "recording") {
    mediaRecorderRef.current.stop();
  }
};
/**
 * Single entry point for the mic button: halts playback, validates config,
 * then toggles listening/recording according to the current mode.
 */
const handleMicClick = () => {
  // Halt any audio that is mid-playback before changing modes.
  stopAllPlayingAudio(playingSourcesRef, audioContextRef, playbackTimeRef);
  if (!isConfigured) {
    setErrorMessage("Voice Chat requires valid configurations. Please visit Settings to set up your LLM, TTS, and STT providers.");
    setShowErrorModal(true);
    return;
  }
  if (isBusy) return;
  if (isAutoMode) {
    // Auto mode: toggle hands-free listening.
    if (isAutoListening) stopAutoListening();
    else startAutoListening();
  } else if (isRecording) {
    stopManualRecording();
  } else {
    startManualRecording();
  }
};
/**
 * Starts a brand-new voice session: clears local history, drops the cached
 * session id, and re-syncs token usage, keeping the user's selected providers.
 */
const handleNewSession = async () => {
  setChatHistory([]);
  localStorage.removeItem("sessionId_voice_chat");
  // FIX: use setBusy so isBusyRef stays in sync with the isBusy state
  // (setIsBusy alone would desync the ref used by the VAD loop).
  setBusy(true);
  setStatus("Starting new session...");
  try {
    const newSessionId = await getSessionId("voice_chat", localActivePrefs.llm || "gemini", {
      stt_provider_name: localActivePrefs.stt,
      tts_provider_name: localActivePrefs.tts
    });
    setSessionId(newSessionId);
    sessionIdRef.current = newSessionId;
    // Fire-and-forget; the token display updates when this resolves.
    fetchTokenUsage();
    setStatus("Click the microphone to start recording.");
  } catch (err) {
    console.error("Failed to start new voice session", err);
    setStatus("Error creating new session.");
  } finally {
    setBusy(false);
  }
};
/**
 * Switches the UI to an existing session: persists the choice, syncs the
 * session's LLM provider into local prefs, and reloads its message history
 * (including any stored audio).
 */
const handleSwitchSession = useCallback(async (targetSessionId) => {
  localStorage.setItem("sessionId_voice_chat", targetSessionId);
  setSessionId(targetSessionId);
  sessionIdRef.current = targetSessionId;
  setChatHistory([]);
  setStatus(`Loading session #${targetSessionId}...`);
  try {
    // The settings layout must follow the new session's provider_name.
    try {
      const switched = await getSession(targetSessionId);
      if (switched?.provider_name) {
        setLocalActivePrefs((current) => ({ ...current, llm: switched.provider_name }));
      }
    } catch (e) { console.warn("Could not fetch switched session provider", e); }
    const data = await getSessionMessages(targetSessionId);
    if (data?.messages) {
      // Fetch persisted audio for each message in parallel.
      const entries = await Promise.all(data.messages.map(async (message) => {
        let audioBlob = null;
        if (message.has_audio) {
          try {
            audioBlob = await fetchMessageAudio(message.id);
          } catch (e) {
            console.warn(`Failed to fetch audio for message ${message.id} during switch`, e);
          }
        }
        return {
          id: message.id,
          text: message.content,
          isUser: message.sender === 'user',
          timestamp: message.created_at,
          isFromHistory: true,
          audioBlob
        };
      }));
      setChatHistory(entries);
    }
    fetchTokenUsage();
    setStatus("Click the microphone to start recording.");
  } catch (error) {
    console.error("Failed to switch session:", error);
    setStatus("Failed to load session history.");
  }
}, [fetchTokenUsage]);
// Public API of the hook, consumed by the voice chat UI components.
return {
// Conversation state
chatHistory,
status,
isBusy,
isRecording,
isAutoMode,
isAutoListening,
sessionId,
// Error modal state
showErrorModal,
errorMessage,
// Usage and provider configuration
tokenUsage,
isConfigured,
missingConfigs,
userConfigData,
localActivePrefs,
setLocalActivePrefs,
setIsAutoMode,
// Actions
handleMicClick,
handleNewSession,
handleSwitchSession,
setShowErrorModal,
setErrorMessage,
synthesizeMessageAudio,
// Streaming playback controls
isStreamingPlaying,
stopStreamingPlayback
};
};
export default useVoiceChat;