Added live mode for speech with utterance detection

This commit is contained in:
Will
2026-03-29 19:34:00 +01:00
parent d577528e47
commit 58392ea198
2 changed files with 192 additions and 50 deletions

View File

@@ -1,18 +1,51 @@
// src/app/page.tsx
"use client";
import { useState, useRef, useEffect } from "react";
import { useState, useRef, useEffect, useCallback } from "react";
import { useChat } from "@/hooks/useChat";
import { useWhisper } from "@/hooks/useWhisper";
import { useVoiceRecorder } from "@/hooks/useVoiceRecorder";
import { useLiveVoice } from "@/hooks/useLiveVoice";
import { stopSpeaking } from "@/lib/tts";
export default function Home() {
const [textInput, setTextInput] = useState("");
const [liveMode, setLiveMode] = useState(false);
const [isSpeaking, setIsSpeaking] = useState(false);
const { messages, isLoading, sendMessage } = useChat();
const { status: whisperStatus, modelMessage, transcribe } = useWhisper();
const { status: whisperStatus, transcribe } = useWhisper();
const { isRecording, startRecording, stopRecording } = useVoiceRecorder();
const bottomRef = useRef<HTMLDivElement>(null);
// Receives each finished utterance from live mode: calls stopSpeaking() first,
// then forwards the transcript to the chat as a "voice" message.
// Memoised on sendMessage so the callback identity only changes when it does.
const handleUtterance = useCallback((transcript: string) => {
  stopSpeaking();
  sendMessage(transcript, "voice");
}, [sendMessage]);
const { isListening, isSpeaking: vadSpeaking, start: startLive, stop: stopLive } =
useLiveVoice({
onUtterance: handleUtterance,
onSpeechStart: () => setIsSpeaking(true),
});
// Sync VAD speaking state
useEffect(() => {
setIsSpeaking(vadSpeaking);
}, [vadSpeaking]);
// Flips live mode. Leaving live mode stops the recogniser and clears the
// speaking flag; entering it starts continuous listening.
const handleLiveToggle = () => {
  if (liveMode) {
    setLiveMode(false);
    stopLive();
    setIsSpeaking(false);
    return;
  }

  setLiveMode(true);
  startLive();
};
useEffect(() => {
bottomRef.current?.scrollIntoView({ behavior: "smooth" });
}, [messages]);
@@ -37,22 +70,15 @@ export default function Home() {
if (text) sendMessage(text, "voice");
};
const pttDisabled =
whisperStatus !== "ready" || isLoading;
const pttLabel = () => {
if (whisperStatus === "loading") return "⏳";
if (whisperStatus === "transcribing") return "💬";
if (isRecording) return "🔴";
return "🎙";
};
const pttDisabled = whisperStatus !== "ready" || isLoading || liveMode;
const statusLine = () => {
if (whisperStatus === "loading") return modelMessage;
if (whisperStatus === "transcribing") return "Transcribing on-device…";
if (isRecording) return "Recording… release to send";
if (whisperStatus === "ready") return "Hold to talk — Whisper ready ✓";
return "Initialising Whisper…";
if (liveMode && isSpeaking) return "🎙 Hearing you…";
if (liveMode && isLoading) return "⏳ Claw is thinking…";
if (liveMode) return "👂 Listening — just speak naturally";
if (whisperStatus === "transcribing") return "💬 Transcribing…";
if (isRecording) return "🔴 Recording… release to send";
return "Hold to talk";
};
return (
@@ -64,23 +90,48 @@ export default function Home() {
<h1 className="text-xl font-bold tracking-tight">OpenClaw Voice</h1>
<p className="text-xs text-gray-500">On-device Whisper · No API keys</p>
</div>
{/* Live Mode Toggle */}
<div className="ml-auto flex items-center gap-2">
<span className={`text-xs font-medium ${liveMode ? "text-green-400" : "text-gray-500"}`}>
Live
</span>
<button
onClick={handleLiveToggle}
className={`relative inline-flex h-6 w-11 items-center rounded-full transition-colors focus:outline-none
${liveMode ? "bg-green-500" : "bg-gray-700"}`}
>
<span
className={`ml-auto w-2 h-2 rounded-full ${
whisperStatus === "ready" ? "bg-green-400" : "bg-yellow-400 animate-pulse"
className={`inline-block h-4 w-4 transform rounded-full bg-white shadow transition-transform
${liveMode ? "translate-x-6" : "translate-x-1"}`}
/>
</button>
</div>
</header>
{/* Live mode indicator bar */}
{liveMode && (
<div
className={`flex items-center justify-center gap-2 py-1.5 text-xs font-medium transition-all
${isSpeaking ? "bg-green-600 text-white" : "bg-green-900/40 text-green-400"}`}
>
<span
className={`w-1.5 h-1.5 rounded-full ${
isSpeaking ? "bg-white animate-ping" : "bg-green-400"
}`}
/>
</header>
{isSpeaking ? "Speech detected" : "Waiting for speech…"}
</div>
)}
{/* Messages */}
<div className="flex-1 overflow-y-auto px-4 py-6 space-y-4">
{messages.length === 0 && (
<div className="text-center mt-20 space-y-2">
<p className="text-gray-500 text-sm">
{whisperStatus === "ready"
? "Whisper loaded. Hold the button to talk or type below."
: modelMessage || "Loading Whisper model…"}
<p className="text-center text-gray-600 mt-20 text-sm">
{liveMode
? "Live mode on — just start talking"
: "Hold the button to talk, or type below."}
</p>
</div>
)}
{messages.map((msg) => (
<div
@@ -96,7 +147,7 @@ export default function Home() {
>
{msg.source === "voice" && (
<span className="text-xs opacity-40 block mb-1">
{msg.role === "user" ? "🎙 transcribed" : "🔊 spoken"}
{msg.role === "user" ? "🎙 live" : "🔊 spoken"}
</span>
)}
{msg.content || <span className="opacity-40 animate-pulse"></span>}
@@ -108,7 +159,8 @@ export default function Home() {
{/* Controls */}
<div className="border-t border-gray-800 bg-gray-900 px-4 py-4 space-y-3">
{/* PTT Button */}
{!liveMode && (
<>
<div className="flex justify-center">
<button
onMouseDown={handlePTTDown}
@@ -126,9 +178,11 @@ export default function Home() {
: "bg-indigo-600 hover:bg-indigo-500 active:scale-95 cursor-pointer"
}`}
>
{pttLabel()}
{isRecording ? "🔴" : whisperStatus === "transcribing" ? "💬" : "🎙"}
</button>
</div>
</>
)}
<p className="text-center text-xs text-gray-500">{statusLine()}</p>

88
src/hooks/useLiveVoice.ts Normal file
View File

@@ -0,0 +1,88 @@
// src/hooks/useLiveVoice.ts
"use client";
import { useRef, useState, useCallback, useEffect } from "react";
/** Callbacks accepted by `useLiveVoice`. */
interface LiveVoiceOptions {
  /** Called with the trimmed transcript of each final, non-empty recognition result. */
  onUtterance: (text: string) => void;
  /** Optional. Called when the recogniser reports that speech has started. */
  onSpeechStart?: () => void;
}
/**
 * Continuous ("live") speech-to-text hook built on the browser's
 * SpeechRecognition API (falls back to the `webkit`-prefixed constructor).
 *
 * While a session is active the recogniser is restarted automatically every
 * time the browser ends it, until `stop()` is called, the component unmounts,
 * or the recogniser reports an error that a restart cannot fix.
 *
 * @param onUtterance   Called with the trimmed transcript of each final,
 *                      non-empty result. The latest callback passed to the hook
 *                      is always the one invoked, even for a session started
 *                      on an earlier render.
 * @param onSpeechStart Optional. Called when the recogniser reports that
 *                      speech has started.
 * @returns `isListening` (a session is running), `isSpeaking` (speech is
 *          currently detected), and stable `start` / `stop` controls.
 */
export function useLiveVoice({ onUtterance, onSpeechStart }: LiveVoiceOptions) {
  const [isListening, setIsListening] = useState(false);
  const [isSpeaking, setIsSpeaking] = useState(false);
  const recognitionRef = useRef<SpeechRecognition | null>(null);
  const stoppedManually = useRef(false);

  // The recogniser outlives many renders. Routing the callbacks through refs
  // stops its handlers from calling closures captured when start() ran.
  const onUtteranceRef = useRef(onUtterance);
  const onSpeechStartRef = useRef(onSpeechStart);
  useEffect(() => {
    onUtteranceRef.current = onUtterance;
    onSpeechStartRef.current = onSpeechStart;
  }, [onUtterance, onSpeechStart]);

  const start = useCallback(() => {
    // A session is already active; starting another would orphan the first
    // recogniser and leave it running with nobody able to stop it.
    if (recognitionRef.current) return;

    const SpeechRecognition =
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- prefixed constructor is not in the DOM typings
      window.SpeechRecognition || (window as any).webkitSpeechRecognition;
    if (!SpeechRecognition) {
      alert("Your browser doesn't support SpeechRecognition. Try Chrome.");
      return;
    }

    const recognition = new SpeechRecognition();
    recognition.continuous = true; // keep listening between utterances
    recognition.interimResults = false; // only fire when utterance is complete
    recognition.lang = "en-GB";

    // True only while this instance is the session the hook is tracking.
    // stop(), unmount and a fatal error all clear recognitionRef, after which
    // late events from this instance must not touch state or restart.
    const isCurrent = () => recognitionRef.current === recognition;

    recognition.onstart = () => {
      if (isCurrent()) setIsListening(true);
    };

    recognition.onspeechstart = () => {
      if (!isCurrent()) return;
      setIsSpeaking(true);
      onSpeechStartRef.current?.();
    };

    recognition.onspeechend = () => {
      if (isCurrent()) setIsSpeaking(false);
    };

    recognition.onresult = (event: SpeechRecognitionEvent) => {
      // Not gated on isCurrent(): after stop() the recogniser may still return
      // a final result for audio it already captured, and that is delivered.
      const last = event.results[event.results.length - 1];
      if (last?.isFinal) {
        const text = last[0].transcript.trim();
        if (text) onUtteranceRef.current(text);
      }
    };

    recognition.onerror = (e: SpeechRecognitionErrorEvent) => {
      // 'no-speech' is normal background silence — just ignore it
      if (e.error === "no-speech") return;
      console.error("SpeechRecognition error:", e.error);
      // These errors repeat on every attempt. Since 'end' follows 'error',
      // auto-restarting would loop forever, so end the session instead.
      if (
        e.error === "not-allowed" ||
        e.error === "service-not-allowed" ||
        e.error === "audio-capture" ||
        e.error === "language-not-supported"
      ) {
        stoppedManually.current = true;
      }
    };

    recognition.onend = () => {
      // Stopped or superseded session: stop() already reset the state, and a
      // restart here would run alongside any newer session.
      if (!isCurrent()) return;

      if (!stoppedManually.current) {
        try {
          recognition.start();
          return;
        } catch (err) {
          console.error("SpeechRecognition restart failed:", err);
        }
      }

      recognitionRef.current = null;
      setIsListening(false);
      setIsSpeaking(false);
    };

    stoppedManually.current = false;
    recognitionRef.current = recognition;
    try {
      recognition.start();
    } catch (err) {
      recognitionRef.current = null;
      console.error("SpeechRecognition start failed:", err);
    }
  }, []);

  const stop = useCallback(() => {
    const recognition = recognitionRef.current;
    stoppedManually.current = true;
    recognitionRef.current = null;
    recognition?.stop();
    setIsListening(false);
    setIsSpeaking(false);
  }, []);

  useEffect(() => {
    return () => {
      // On unmount use abort() rather than stop(), so no final result is
      // delivered to a consumer that no longer exists.
      stoppedManually.current = true;
      const recognition = recognitionRef.current;
      recognitionRef.current = null;
      recognition?.abort();
    };
  }, []);

  return { isListening, isSpeaking, start, stop };
}