added live mode for speech with utterance detection

2026-03-29 19:34:00 +01:00
parent d577528e47
commit 58392ea198
2 changed files with 192 additions and 50 deletions
--- a/src/app/page.tsx
+++ b/src/app/page.tsx
@@ -1,18 +1,51 @@
 // src/app/page.tsx
 "use client";
-import { useState, useRef, useEffect } from "react";
+import { useState, useRef, useEffect, useCallback } from "react";
 import { useChat } from "@/hooks/useChat";
 import { useWhisper } from "@/hooks/useWhisper";
 import { useVoiceRecorder } from "@/hooks/useVoiceRecorder";
 import { useLiveVoice } from "@/hooks/useLiveVoice";
 import { stopSpeaking } from "@/lib/tts";
 export default function Home() {
  const [textInput, setTextInput] = useState("");
  const [liveMode, setLiveMode] = useState(false);
  const [isSpeaking, setIsSpeaking] = useState(false);
  const { messages, isLoading, sendMessage } = useChat();
-  const { status: whisperStatus, modelMessage, transcribe } = useWhisper();
+  const { status: whisperStatus, transcribe } = useWhisper();
  const { isRecording, startRecording, stopRecording } = useVoiceRecorder();
  const bottomRef = useRef<HTMLDivElement>(null);
  const handleUtterance = useCallback(
    (text: string) => {
      stopSpeaking();
      sendMessage(text, "voice");
    },
    [sendMessage]
  );
  const { isListening, isSpeaking: vadSpeaking, start: startLive, stop: stopLive } =
    useLiveVoice({
      onUtterance: handleUtterance,
      onSpeechStart: () => setIsSpeaking(true),
    });
  // Sync VAD speaking state
  useEffect(() => {
    setIsSpeaking(vadSpeaking);
  }, [vadSpeaking]);
  const handleLiveToggle = () => {
    if (!liveMode) {
      setLiveMode(true);
      startLive();
    } else {
      setLiveMode(false);
      stopLive();
      setIsSpeaking(false);
    }
  };
  useEffect(() => {
    bottomRef.current?.scrollIntoView({ behavior: "smooth" });
  }, [messages]);
@@ -37,22 +70,15 @@ export default function Home() {
    if (text) sendMessage(text, "voice");
  };
-  const pttDisabled =
+  const pttDisabled = whisperStatus !== "ready" || isLoading || liveMode;
    whisperStatus !== "ready" || isLoading;
  const pttLabel = () => {
    if (whisperStatus === "loading") return "⏳";
    if (whisperStatus === "transcribing") return "💬";
    if (isRecording) return "🔴";
    return "🎙";
  };
  const statusLine = () => {
-    if (whisperStatus === "loading") return modelMessage;
+    if (liveMode && isSpeaking) return "🎙 Hearing you…";
-    if (whisperStatus === "transcribing") return "Transcribing on-device…";
+    if (liveMode && isLoading) return "⏳ Claw is thinking…";
-    if (isRecording) return "Recording… release to send";
+    if (liveMode) return "👂 Listening — just speak naturally";
-    if (whisperStatus === "ready") return "Hold to talk — Whisper ready ✓";
+    if (whisperStatus === "transcribing") return "💬 Transcribing…";
-    return "Initialising Whisper…";
+    if (isRecording) return "🔴 Recording… release to send";
    return "Hold to talk";
  };
  return (
@@ -64,23 +90,48 @@ export default function Home() {
          <h1 className="text-xl font-bold tracking-tight">OpenClaw Voice</h1>
          <p className="text-xs text-gray-500">On-device Whisper · No API keys</p>
        </div>
        {/* Live Mode Toggle */}
        <div className="ml-auto flex items-center gap-2">
          <span className={`text-xs font-medium ${liveMode ? "text-green-400" : "text-gray-500"}`}>
            Live
          </span>
          <button
            onClick={handleLiveToggle}
            className={`relative inline-flex h-6 w-11 items-center rounded-full transition-colors focus:outline-none
              ${liveMode ? "bg-green-500" : "bg-gray-700"}`}
          >
            <span
-          className={`ml-auto w-2 h-2 rounded-full ${
+              className={`inline-block h-4 w-4 transform rounded-full bg-white shadow transition-transform
-            whisperStatus === "ready" ? "bg-green-400" : "bg-yellow-400 animate-pulse"
+                ${liveMode ? "translate-x-6" : "translate-x-1"}`}
            />
          </button>
        </div>
      </header>
      {/* Live mode indicator bar */}
      {liveMode && (
        <div
          className={`flex items-center justify-center gap-2 py-1.5 text-xs font-medium transition-all
            ${isSpeaking ? "bg-green-600 text-white" : "bg-green-900/40 text-green-400"}`}
        >
          <span
            className={`w-1.5 h-1.5 rounded-full ${
              isSpeaking ? "bg-white animate-ping" : "bg-green-400"
            }`}
          />
-      </header>
+          {isSpeaking ? "Speech detected" : "Waiting for speech…"}
        </div>
      )}
      {/* Messages */}
      <div className="flex-1 overflow-y-auto px-4 py-6 space-y-4">
        {messages.length === 0 && (
-          <div className="text-center mt-20 space-y-2">
+          <p className="text-center text-gray-600 mt-20 text-sm">
-            <p className="text-gray-500 text-sm">
+            {liveMode
-              {whisperStatus === "ready"
+              ? "Live mode on — just start talking"
-                ? "Whisper loaded. Hold the button to talk or type below."
+              : "Hold the button to talk, or type below."}
                : modelMessage || "Loading Whisper model…"}
          </p>
          </div>
        )}
        {messages.map((msg) => (
          <div
@@ -96,7 +147,7 @@ export default function Home() {
            >
              {msg.source === "voice" && (
                <span className="text-xs opacity-40 block mb-1">
-                  {msg.role === "user" ? "🎙 transcribed" : "🔊 spoken"}
+                  {msg.role === "user" ? "🎙 live" : "🔊 spoken"}
                </span>
              )}
              {msg.content || <span className="opacity-40 animate-pulse">▍</span>}
@@ -108,7 +159,8 @@ export default function Home() {
      {/* Controls */}
      <div className="border-t border-gray-800 bg-gray-900 px-4 py-4 space-y-3">
-        {/* PTT Button */}
+        {!liveMode && (
          <>
            <div className="flex justify-center">
              <button
                onMouseDown={handlePTTDown}
@@ -126,9 +178,11 @@ export default function Home() {
                    : "bg-indigo-600 hover:bg-indigo-500 active:scale-95 cursor-pointer"
                  }`}
              >
-            {pttLabel()}
+                {isRecording ? "🔴" : whisperStatus === "transcribing" ? "💬" : "🎙"}
              </button>
            </div>
          </>
        )}
        <p className="text-center text-xs text-gray-500">{statusLine()}</p>
--- a/src/hooks/useLiveVoice.ts
+++ b/src/hooks/useLiveVoice.ts
@@ -0,0 +1,88 @@
 // src/hooks/useLiveVoice.ts
 "use client";
 import { useRef, useState, useCallback, useEffect } from "react";
 /** Callbacks accepted by the `useLiveVoice` hook. */
 interface LiveVoiceOptions {
  /** Called with the trimmed transcript of each finalised, non-empty utterance. */
  onUtterance: (text: string) => void;
  /** Optional; called when the recogniser reports that speech has started. */
  onSpeechStart?: () => void;
 }
 /**
  * React hook for hands-free, continuous speech capture built on the browser's
  * `SpeechRecognition` / `webkitSpeechRecognition` API.
  *
  * While a session is active the recogniser is restarted whenever it ends on
  * its own, so listening continues across utterances until `stop()` is called,
  * the component unmounts, or a permanent error occurs.
  *
  * @param options.onUtterance   Called with the trimmed transcript of every
  *                              finalised, non-empty utterance.
  * @param options.onSpeechStart Optional; called when speech is first detected.
  * @returns `isListening` (a session is running), `isSpeaking` (speech is
  *          currently detected), and the `start` / `stop` controls.
  */
 export function useLiveVoice({ onUtterance, onSpeechStart }: LiveVoiceOptions) {
  const [isListening, setIsListening] = useState(false);
  const [isSpeaking, setIsSpeaking] = useState(false);
  const recognitionRef = useRef<SpeechRecognition | null>(null);
  const stoppedManually = useRef(false);

  // Keep the latest callbacks in refs so a running session always invokes the
  // handlers from the most recent render instead of the ones captured when
  // start() was called. This also lets `start` keep a stable identity.
  const onUtteranceRef = useRef(onUtterance);
  const onSpeechStartRef = useRef(onSpeechStart);
  useEffect(() => {
    onUtteranceRef.current = onUtterance;
    onSpeechStartRef.current = onSpeechStart;
  }, [onUtterance, onSpeechStart]);

  const start = useCallback(() => {
    // A session is already active; creating a second recogniser would orphan
    // the first one with no way to stop it.
    if (recognitionRef.current) return;

    // Fall back to the vendor-prefixed constructor, which is not declared on
    // `window`, hence the cast.
    const RecognitionCtor =
      window.SpeechRecognition || (window as any).webkitSpeechRecognition;
    if (!RecognitionCtor) {
      alert("Your browser doesn't support SpeechRecognition. Try Chrome.");
      return;
    }

    const recognition = new RecognitionCtor();
    recognition.continuous = true;       // keep listening between utterances
    recognition.interimResults = false;  // only fire when utterance is complete
    recognition.lang = "en-GB";

    // Set when an error is reported that restarting cannot fix.
    let fatalError = false;

    recognition.onstart = () => {
      setIsListening(true);
    };

    recognition.onspeechstart = () => {
      setIsSpeaking(true);
      onSpeechStartRef.current?.();
    };

    recognition.onspeechend = () => {
      setIsSpeaking(false);
    };

    recognition.onresult = (event: SpeechRecognitionEvent) => {
      // Only the newest result is inspected; guard against an empty list.
      if (event.results.length === 0) return;
      const last = event.results[event.results.length - 1];
      if (last && last.isFinal && last.length > 0) {
        const text = last[0].transcript.trim();
        if (text) onUtteranceRef.current(text);
      }
    };

    recognition.onerror = (e: SpeechRecognitionErrorEvent) => {
      // 'no-speech' is normal background silence — just ignore it
      if (e.error === "no-speech") return;
      // These conditions do not resolve on retry (permission denied, no
      // microphone, unsupported language); restarting would loop forever.
      if (
        e.error === "not-allowed" ||
        e.error === "service-not-allowed" ||
        e.error === "audio-capture" ||
        e.error === "language-not-supported"
      ) {
        fatalError = true;
      }
      console.error("SpeechRecognition error:", e.error);
    };

    recognition.onend = () => {
      // Only the session this hook currently owns may restart itself. A
      // session replaced by stop() followed by a fresh start() must not come
      // back to life just because the shared manual-stop flag was reset.
      const isCurrent = recognitionRef.current === recognition;
      if (isCurrent && !stoppedManually.current && !fatalError) {
        try {
          recognition.start();
          return;
        } catch (err) {
          console.error("SpeechRecognition restart failed:", err);
        }
      }
      if (isCurrent) recognitionRef.current = null;
      // Do not clobber the state of a newer session that has taken over.
      if (recognitionRef.current === null) {
        setIsListening(false);
        setIsSpeaking(false);
      }
    };

    stoppedManually.current = false;
    recognitionRef.current = recognition;
    try {
      recognition.start();
    } catch (err) {
      recognitionRef.current = null;
      console.error("SpeechRecognition start failed:", err);
    }
  }, []);

  const stop = useCallback(() => {
    stoppedManually.current = true;
    recognitionRef.current?.stop();
    recognitionRef.current = null;
    setIsListening(false);
    setIsSpeaking(false);
  }, []);

  // On unmount: end any active session and detach its handlers so no callback
  // or state update runs against an unmounted component.
  useEffect(() => {
    return () => {
      stoppedManually.current = true;
      const active = recognitionRef.current;
      recognitionRef.current = null;
      if (active) {
        active.onstart = null;
        active.onspeechstart = null;
        active.onspeechend = null;
        active.onresult = null;
        active.onerror = null;
        active.onend = null;
        active.stop();
      }
    };
  }, []);

  return { isListening, isSpeaking, start, stop };
 }