Added live mode for speech with utterance detection
This commit is contained in:
110
src/app/page.tsx
110
src/app/page.tsx
@@ -1,18 +1,51 @@
|
|||||||
// src/app/page.tsx
|
// src/app/page.tsx
|
||||||
"use client";
|
"use client";
|
||||||
import { useState, useRef, useEffect } from "react";
|
import { useState, useRef, useEffect, useCallback } from "react";
|
||||||
import { useChat } from "@/hooks/useChat";
|
import { useChat } from "@/hooks/useChat";
|
||||||
import { useWhisper } from "@/hooks/useWhisper";
|
import { useWhisper } from "@/hooks/useWhisper";
|
||||||
import { useVoiceRecorder } from "@/hooks/useVoiceRecorder";
|
import { useVoiceRecorder } from "@/hooks/useVoiceRecorder";
|
||||||
|
import { useLiveVoice } from "@/hooks/useLiveVoice";
|
||||||
import { stopSpeaking } from "@/lib/tts";
|
import { stopSpeaking } from "@/lib/tts";
|
||||||
|
|
||||||
export default function Home() {
|
export default function Home() {
|
||||||
const [textInput, setTextInput] = useState("");
|
const [textInput, setTextInput] = useState("");
|
||||||
|
const [liveMode, setLiveMode] = useState(false);
|
||||||
|
const [isSpeaking, setIsSpeaking] = useState(false);
|
||||||
const { messages, isLoading, sendMessage } = useChat();
|
const { messages, isLoading, sendMessage } = useChat();
|
||||||
const { status: whisperStatus, modelMessage, transcribe } = useWhisper();
|
const { status: whisperStatus, transcribe } = useWhisper();
|
||||||
const { isRecording, startRecording, stopRecording } = useVoiceRecorder();
|
const { isRecording, startRecording, stopRecording } = useVoiceRecorder();
|
||||||
const bottomRef = useRef<HTMLDivElement>(null);
|
const bottomRef = useRef<HTMLDivElement>(null);
|
||||||
|
|
||||||
|
const handleUtterance = useCallback(
|
||||||
|
(text: string) => {
|
||||||
|
stopSpeaking();
|
||||||
|
sendMessage(text, "voice");
|
||||||
|
},
|
||||||
|
[sendMessage]
|
||||||
|
);
|
||||||
|
|
||||||
|
const { isListening, isSpeaking: vadSpeaking, start: startLive, stop: stopLive } =
|
||||||
|
useLiveVoice({
|
||||||
|
onUtterance: handleUtterance,
|
||||||
|
onSpeechStart: () => setIsSpeaking(true),
|
||||||
|
});
|
||||||
|
|
||||||
|
// Sync VAD speaking state
|
||||||
|
useEffect(() => {
|
||||||
|
setIsSpeaking(vadSpeaking);
|
||||||
|
}, [vadSpeaking]);
|
||||||
|
|
||||||
|
const handleLiveToggle = () => {
|
||||||
|
if (!liveMode) {
|
||||||
|
setLiveMode(true);
|
||||||
|
startLive();
|
||||||
|
} else {
|
||||||
|
setLiveMode(false);
|
||||||
|
stopLive();
|
||||||
|
setIsSpeaking(false);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
bottomRef.current?.scrollIntoView({ behavior: "smooth" });
|
bottomRef.current?.scrollIntoView({ behavior: "smooth" });
|
||||||
}, [messages]);
|
}, [messages]);
|
||||||
@@ -37,22 +70,15 @@ export default function Home() {
|
|||||||
if (text) sendMessage(text, "voice");
|
if (text) sendMessage(text, "voice");
|
||||||
};
|
};
|
||||||
|
|
||||||
const pttDisabled =
|
const pttDisabled = whisperStatus !== "ready" || isLoading || liveMode;
|
||||||
whisperStatus !== "ready" || isLoading;
|
|
||||||
|
|
||||||
const pttLabel = () => {
|
|
||||||
if (whisperStatus === "loading") return "⏳";
|
|
||||||
if (whisperStatus === "transcribing") return "💬";
|
|
||||||
if (isRecording) return "🔴";
|
|
||||||
return "🎙";
|
|
||||||
};
|
|
||||||
|
|
||||||
const statusLine = () => {
|
const statusLine = () => {
|
||||||
if (whisperStatus === "loading") return modelMessage;
|
if (liveMode && isSpeaking) return "🎙 Hearing you…";
|
||||||
if (whisperStatus === "transcribing") return "Transcribing on-device…";
|
if (liveMode && isLoading) return "⏳ Claw is thinking…";
|
||||||
if (isRecording) return "Recording… release to send";
|
if (liveMode) return "👂 Listening — just speak naturally";
|
||||||
if (whisperStatus === "ready") return "Hold to talk — Whisper ready ✓";
|
if (whisperStatus === "transcribing") return "💬 Transcribing…";
|
||||||
return "Initialising Whisper…";
|
if (isRecording) return "🔴 Recording… release to send";
|
||||||
|
return "Hold to talk";
|
||||||
};
|
};
|
||||||
|
|
||||||
return (
|
return (
|
||||||
@@ -64,23 +90,48 @@ export default function Home() {
|
|||||||
<h1 className="text-xl font-bold tracking-tight">OpenClaw Voice</h1>
|
<h1 className="text-xl font-bold tracking-tight">OpenClaw Voice</h1>
|
||||||
<p className="text-xs text-gray-500">On-device Whisper · No API keys</p>
|
<p className="text-xs text-gray-500">On-device Whisper · No API keys</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{/* Live Mode Toggle */}
|
||||||
|
<div className="ml-auto flex items-center gap-2">
|
||||||
|
<span className={`text-xs font-medium ${liveMode ? "text-green-400" : "text-gray-500"}`}>
|
||||||
|
Live
|
||||||
|
</span>
|
||||||
|
<button
|
||||||
|
onClick={handleLiveToggle}
|
||||||
|
className={`relative inline-flex h-6 w-11 items-center rounded-full transition-colors focus:outline-none
|
||||||
|
${liveMode ? "bg-green-500" : "bg-gray-700"}`}
|
||||||
|
>
|
||||||
<span
|
<span
|
||||||
className={`ml-auto w-2 h-2 rounded-full ${
|
className={`inline-block h-4 w-4 transform rounded-full bg-white shadow transition-transform
|
||||||
whisperStatus === "ready" ? "bg-green-400" : "bg-yellow-400 animate-pulse"
|
${liveMode ? "translate-x-6" : "translate-x-1"}`}
|
||||||
|
/>
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
{/* Live mode indicator bar */}
|
||||||
|
{liveMode && (
|
||||||
|
<div
|
||||||
|
className={`flex items-center justify-center gap-2 py-1.5 text-xs font-medium transition-all
|
||||||
|
${isSpeaking ? "bg-green-600 text-white" : "bg-green-900/40 text-green-400"}`}
|
||||||
|
>
|
||||||
|
<span
|
||||||
|
className={`w-1.5 h-1.5 rounded-full ${
|
||||||
|
isSpeaking ? "bg-white animate-ping" : "bg-green-400"
|
||||||
}`}
|
}`}
|
||||||
/>
|
/>
|
||||||
</header>
|
{isSpeaking ? "Speech detected" : "Waiting for speech…"}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
{/* Messages */}
|
{/* Messages */}
|
||||||
<div className="flex-1 overflow-y-auto px-4 py-6 space-y-4">
|
<div className="flex-1 overflow-y-auto px-4 py-6 space-y-4">
|
||||||
{messages.length === 0 && (
|
{messages.length === 0 && (
|
||||||
<div className="text-center mt-20 space-y-2">
|
<p className="text-center text-gray-600 mt-20 text-sm">
|
||||||
<p className="text-gray-500 text-sm">
|
{liveMode
|
||||||
{whisperStatus === "ready"
|
? "Live mode on — just start talking"
|
||||||
? "Whisper loaded. Hold the button to talk or type below."
|
: "Hold the button to talk, or type below."}
|
||||||
: modelMessage || "Loading Whisper model…"}
|
|
||||||
</p>
|
</p>
|
||||||
</div>
|
|
||||||
)}
|
)}
|
||||||
{messages.map((msg) => (
|
{messages.map((msg) => (
|
||||||
<div
|
<div
|
||||||
@@ -96,7 +147,7 @@ export default function Home() {
|
|||||||
>
|
>
|
||||||
{msg.source === "voice" && (
|
{msg.source === "voice" && (
|
||||||
<span className="text-xs opacity-40 block mb-1">
|
<span className="text-xs opacity-40 block mb-1">
|
||||||
{msg.role === "user" ? "🎙 transcribed" : "🔊 spoken"}
|
{msg.role === "user" ? "🎙 live" : "🔊 spoken"}
|
||||||
</span>
|
</span>
|
||||||
)}
|
)}
|
||||||
{msg.content || <span className="opacity-40 animate-pulse">▍</span>}
|
{msg.content || <span className="opacity-40 animate-pulse">▍</span>}
|
||||||
@@ -108,7 +159,8 @@ export default function Home() {
|
|||||||
|
|
||||||
{/* Controls */}
|
{/* Controls */}
|
||||||
<div className="border-t border-gray-800 bg-gray-900 px-4 py-4 space-y-3">
|
<div className="border-t border-gray-800 bg-gray-900 px-4 py-4 space-y-3">
|
||||||
{/* PTT Button */}
|
{!liveMode && (
|
||||||
|
<>
|
||||||
<div className="flex justify-center">
|
<div className="flex justify-center">
|
||||||
<button
|
<button
|
||||||
onMouseDown={handlePTTDown}
|
onMouseDown={handlePTTDown}
|
||||||
@@ -126,9 +178,11 @@ export default function Home() {
|
|||||||
: "bg-indigo-600 hover:bg-indigo-500 active:scale-95 cursor-pointer"
|
: "bg-indigo-600 hover:bg-indigo-500 active:scale-95 cursor-pointer"
|
||||||
}`}
|
}`}
|
||||||
>
|
>
|
||||||
{pttLabel()}
|
{isRecording ? "🔴" : whisperStatus === "transcribing" ? "💬" : "🎙"}
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
|
</>
|
||||||
|
)}
|
||||||
|
|
||||||
<p className="text-center text-xs text-gray-500">{statusLine()}</p>
|
<p className="text-center text-xs text-gray-500">{statusLine()}</p>
|
||||||
|
|
||||||
|
|||||||
88
src/hooks/useLiveVoice.ts
Normal file
88
src/hooks/useLiveVoice.ts
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
// src/hooks/useLiveVoice.ts
|
||||||
|
"use client";
|
||||||
|
import { useRef, useState, useCallback, useEffect } from "react";
|
||||||
|
|
||||||
|
/** Callbacks accepted by the `useLiveVoice` hook. */
type LiveVoiceOptions = {
  /** Invoked with the trimmed, non-empty transcript of each finalised utterance. */
  onUtterance: (text: string) => void;
  /** Invoked when the recogniser reports that speech has started. */
  onSpeechStart?: () => void;
};
|
||||||
|
|
||||||
|
/**
 * Error codes after which restarting recognition cannot succeed without user
 * intervention (permission denied, no microphone, unsupported language).
 * Auto-restarting after any of these would spin in a tight error/restart loop.
 */
const FATAL_RECOGNITION_ERRORS: ReadonlySet<string> = new Set([
  "not-allowed",
  "service-not-allowed",
  "audio-capture",
  "language-not-supported",
]);

/**
 * Hands-free ("live") voice input built on the browser SpeechRecognition API.
 *
 * While a session is active the recogniser is kept running continuously: each
 * finalised utterance is trimmed and forwarded to `onUtterance`, and the
 * recogniser is restarted whenever the browser ends it on its own.
 *
 * @param options.onUtterance   Called with every non-empty final transcript.
 * @param options.onSpeechStart Optional; called when speech is first detected.
 * @returns `isListening` (a session is running), `isSpeaking` (speech is
 *          currently detected), and `start` / `stop` controls. `start` is a
 *          no-op while a session is already running.
 */
export function useLiveVoice({ onUtterance, onSpeechStart }: LiveVoiceOptions) {
  const [isListening, setIsListening] = useState(false);
  const [isSpeaking, setIsSpeaking] = useState(false);

  // The recogniser that currently "owns" the session. `null` means no session
  // should be running; handlers compare against it to decide whether to restart.
  const recognitionRef = useRef<SpeechRecognition | null>(null);

  // Always call the latest callbacks, even for a recogniser created many
  // renders ago, and keep `start` referentially stable for consumers.
  const onUtteranceRef = useRef(onUtterance);
  const onSpeechStartRef = useRef(onSpeechStart);
  useEffect(() => {
    onUtteranceRef.current = onUtterance;
    onSpeechStartRef.current = onSpeechStart;
  }, [onUtterance, onSpeechStart]);

  const start = useCallback(() => {
    // A session is already active; creating a second recogniser would leak the first.
    if (recognitionRef.current) return;

    // Chrome/Safari still expose the constructor under the webkit prefix only.
    const RecognitionCtor =
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- vendor-prefixed global
      window.SpeechRecognition || (window as any).webkitSpeechRecognition;

    if (!RecognitionCtor) {
      alert("Your browser doesn't support SpeechRecognition. Try Chrome.");
      return;
    }

    const recognition: SpeechRecognition = new RecognitionCtor();
    recognition.continuous = true; // keep listening between utterances
    recognition.interimResults = false; // only fire when utterance is complete
    recognition.lang = "en-GB";

    recognition.onstart = () => {
      setIsListening(true);
    };

    recognition.onspeechstart = () => {
      setIsSpeaking(true);
      onSpeechStartRef.current?.();
    };

    recognition.onspeechend = () => {
      setIsSpeaking(false);
    };

    recognition.onresult = (event: SpeechRecognitionEvent) => {
      // `resultIndex` is the first result that changed in this event; walk
      // from there so several finals delivered together are all forwarded.
      for (let i = event.resultIndex; i < event.results.length; i += 1) {
        const result = event.results[i];
        const best = result?.[0];
        if (!result?.isFinal || !best) continue;
        const text = best.transcript.trim();
        if (text) onUtteranceRef.current(text);
      }
    };

    recognition.onerror = (e: SpeechRecognitionErrorEvent) => {
      // 'no-speech' is normal background silence — just ignore it
      if (e.error === "no-speech") return;
      console.error("SpeechRecognition error:", e.error);
      // Give up ownership on unrecoverable errors so `onend` does not restart.
      if (FATAL_RECOGNITION_ERRORS.has(e.error) && recognitionRef.current === recognition) {
        recognitionRef.current = null;
      }
    };

    recognition.onend = () => {
      // `speechend` is not guaranteed before `end`; never leave the flag stuck.
      setIsSpeaking(false);

      const owner = recognitionRef.current;
      if (owner === recognition) {
        // The browser ended the session on its own: auto-restart.
        try {
          recognition.start();
          return;
        } catch (err: unknown) {
          console.error("SpeechRecognition restart failed:", err);
          recognitionRef.current = null;
        }
      } else if (owner !== null) {
        // A newer session has taken over; its handlers own the state now.
        return;
      }
      setIsListening(false);
    };

    recognitionRef.current = recognition;
    try {
      recognition.start();
    } catch (err: unknown) {
      console.error("SpeechRecognition start failed:", err);
      recognitionRef.current = null;
    }
  }, []);

  const stop = useCallback(() => {
    const active = recognitionRef.current;
    // Clear ownership first so the resulting `end` event does not restart.
    recognitionRef.current = null;
    active?.stop();
    setIsListening(false);
    setIsSpeaking(false);
  }, []);

  // Tear the recogniser down on unmount without delivering further events.
  useEffect(() => {
    return () => {
      const active = recognitionRef.current;
      recognitionRef.current = null;
      if (!active) return;
      active.onstart = null;
      active.onspeechstart = null;
      active.onspeechend = null;
      active.onresult = null;
      active.onerror = null;
      active.onend = null;
      active.abort();
    };
  }, []);

  return { isListening, isSpeaking, start, stop };
}
|
||||||
Reference in New Issue
Block a user