Added live mode for speech with utterance detection

2026-03-29 19:34:00 +01:00
parent d577528e47
commit 58392ea198
2 changed files with 192 additions and 50 deletions
--- a/src/app/page.tsx
+++ b/src/app/page.tsx
@@ -1,18 +1,51 @@
 // src/app/page.tsx
 "use client";
-import { useState, useRef, useEffect } from "react";
+import { useState, useRef, useEffect, useCallback } from "react";
 import { useChat } from "@/hooks/useChat";
 import { useWhisper } from "@/hooks/useWhisper";
 import { useVoiceRecorder } from "@/hooks/useVoiceRecorder";
+import { useLiveVoice } from "@/hooks/useLiveVoice";
 import { stopSpeaking } from "@/lib/tts";

 export default function Home() {
  const [textInput, setTextInput] = useState("");
+  const [liveMode, setLiveMode] = useState(false);
+  const [isSpeaking, setIsSpeaking] = useState(false);
  const { messages, isLoading, sendMessage } = useChat();
-  const { status: whisperStatus, modelMessage, transcribe } = useWhisper();
+  const { status: whisperStatus, transcribe } = useWhisper();
  const { isRecording, startRecording, stopRecording } = useVoiceRecorder();
  const bottomRef = useRef<HTMLDivElement>(null);

+  const handleUtterance = useCallback(
+    (text: string) => {
+      stopSpeaking();
+      sendMessage(text, "voice");
+    },
+    [sendMessage]
+  );
+
+  const { isListening, isSpeaking: vadSpeaking, start: startLive, stop: stopLive } =
+    useLiveVoice({
+      onUtterance: handleUtterance,
+      onSpeechStart: () => setIsSpeaking(true),
+    });
+
+  // Sync VAD speaking state
+  useEffect(() => {
+    setIsSpeaking(vadSpeaking);
+  }, [vadSpeaking]);
+
+  const handleLiveToggle = () => {
+    if (!liveMode) {
+      setLiveMode(true);
+      startLive();
+    } else {
+      setLiveMode(false);
+      stopLive();
+      setIsSpeaking(false);
+    }
+  };
+
  useEffect(() => {
    bottomRef.current?.scrollIntoView({ behavior: "smooth" });
  }, [messages]);
@@ -37,22 +70,15 @@ export default function Home() {
    if (text) sendMessage(text, "voice");
  };

-  const pttDisabled =
-    whisperStatus !== "ready" || isLoading;
-
-  const pttLabel = () => {
-    if (whisperStatus === "loading") return "⏳";
-    if (whisperStatus === "transcribing") return "💬";
-    if (isRecording) return "🔴";
-    return "🎙";
-  };
+  const pttDisabled = whisperStatus !== "ready" || isLoading || liveMode;

  const statusLine = () => {
-    if (whisperStatus === "loading") return modelMessage;
-    if (whisperStatus === "transcribing") return "Transcribing on-device…";
-    if (isRecording) return "Recording… release to send";
-    if (whisperStatus === "ready") return "Hold to talk — Whisper ready ✓";
-    return "Initialising Whisper…";
+    if (liveMode && isSpeaking) return "🎙 Hearing you…";
+    if (liveMode && isLoading) return "⏳ Claw is thinking…";
+    if (liveMode) return "👂 Listening — just speak naturally";
+    if (whisperStatus === "transcribing") return "💬 Transcribing…";
+    if (isRecording) return "🔴 Recording… release to send";
+    return "Hold to talk";
  };

  return (
@@ -64,23 +90,48 @@ export default function Home() {
          <h1 className="text-xl font-bold tracking-tight">OpenClaw Voice</h1>
          <p className="text-xs text-gray-500">On-device Whisper · No API keys</p>
        </div>
+
+        {/* Live Mode Toggle */}
+        <div className="ml-auto flex items-center gap-2">
+          <span className={`text-xs font-medium ${liveMode ? "text-green-400" : "text-gray-500"}`}>
+            Live
+          </span>
+          <button
+            onClick={handleLiveToggle}
+            className={`relative inline-flex h-6 w-11 items-center rounded-full transition-colors focus:outline-none
+              ${liveMode ? "bg-green-500" : "bg-gray-700"}`}
+          >
            <span
-          className={`ml-auto w-2 h-2 rounded-full ${
-            whisperStatus === "ready" ? "bg-green-400" : "bg-yellow-400 animate-pulse"
+              className={`inline-block h-4 w-4 transform rounded-full bg-white shadow transition-transform
+                ${liveMode ? "translate-x-6" : "translate-x-1"}`}
+            />
+          </button>
+        </div>
+      </header>
+
+      {/* Live mode indicator bar */}
+      {liveMode && (
+        <div
+          className={`flex items-center justify-center gap-2 py-1.5 text-xs font-medium transition-all
+            ${isSpeaking ? "bg-green-600 text-white" : "bg-green-900/40 text-green-400"}`}
+        >
+          <span
+            className={`w-1.5 h-1.5 rounded-full ${
+              isSpeaking ? "bg-white animate-ping" : "bg-green-400"
            }`}
          />
-      </header>
+          {isSpeaking ? "Speech detected" : "Waiting for speech…"}
+        </div>
+      )}

      {/* Messages */}
      <div className="flex-1 overflow-y-auto px-4 py-6 space-y-4">
        {messages.length === 0 && (
-          <div className="text-center mt-20 space-y-2">
-            <p className="text-gray-500 text-sm">
-              {whisperStatus === "ready"
-                ? "Whisper loaded. Hold the button to talk or type below."
-                : modelMessage || "Loading Whisper model…"}
+          <p className="text-center text-gray-600 mt-20 text-sm">
+            {liveMode
+              ? "Live mode on — just start talking"
+              : "Hold the button to talk, or type below."}
          </p>
-          </div>
        )}
        {messages.map((msg) => (
          <div
@@ -96,7 +147,7 @@ export default function Home() {
            >
              {msg.source === "voice" && (
                <span className="text-xs opacity-40 block mb-1">
-                  {msg.role === "user" ? "🎙 transcribed" : "🔊 spoken"}
+                  {msg.role === "user" ? "🎙 live" : "🔊 spoken"}
                </span>
              )}
              {msg.content || <span className="opacity-40 animate-pulse">▍</span>}
@@ -108,7 +159,8 @@ export default function Home() {

      {/* Controls */}
      <div className="border-t border-gray-800 bg-gray-900 px-4 py-4 space-y-3">
-        {/* PTT Button */}
+        {!liveMode && (
+          <>
            <div className="flex justify-center">
              <button
                onMouseDown={handlePTTDown}
@@ -126,9 +178,11 @@ export default function Home() {
                    : "bg-indigo-600 hover:bg-indigo-500 active:scale-95 cursor-pointer"
                  }`}
              >
-            {pttLabel()}
+                {isRecording ? "🔴" : whisperStatus === "transcribing" ? "💬" : "🎙"}
              </button>
            </div>
+          </>
+        )}

        <p className="text-center text-xs text-gray-500">{statusLine()}</p>

--- a/src/hooks/useLiveVoice.ts
+++ b/src/hooks/useLiveVoice.ts
@@ -0,0 +1,88 @@
+// src/hooks/useLiveVoice.ts
+"use client";
+import { useRef, useState, useCallback, useEffect } from "react";
+
+type LiveVoiceOptions = { // options accepted by useLiveVoice
+  onUtterance: (text: string) => void; // receives the trimmed transcript of each final result
+  onSpeechStart?: () => void; // optional; invoked from the recogniser's onspeechstart handler
+};
+
+const FATAL_ERRORS = ["not-allowed", "service-not-allowed", "audio-capture"]; // a retry cannot succeed
+/**
+ * Continuous "live" speech-to-text on the browser SpeechRecognition API; restarts itself until `stop()`.
+ * @param onUtterance   Called with the trimmed transcript of every final result.
+ * @param onSpeechStart Optional; called when the engine reports that speech began.
+ * @returns `isListening` / `isSpeaking` flags plus stable `start` / `stop` controls.
+ */
+export function useLiveVoice({ onUtterance, onSpeechStart }: LiveVoiceOptions) {
+  const [isListening, setIsListening] = useState(false);
+  const [isSpeaking, setIsSpeaking] = useState(false);
+  // The active recogniser; `null` means "not running / stop requested".
+  const recognitionRef = useRef<SpeechRecognition | null>(null);
+  // Latest callbacks, so a long-lived recogniser never invokes stale closures.
+  const callbacksRef = useRef({ onUtterance, onSpeechStart });
+  useEffect(() => { callbacksRef.current = { onUtterance, onSpeechStart }; });
+
+  const start = useCallback(() => {
+    if (recognitionRef.current) return; // already running — avoid a second session
+    const SpeechRecognition = window.SpeechRecognition || (window as any).webkitSpeechRecognition;
+    if (!SpeechRecognition) {
+      alert("Your browser doesn't support SpeechRecognition. Try Chrome.");
+      return;
+    }
+    const recognition = new SpeechRecognition();
+    recognition.continuous = true;       // keep listening between utterances
+    recognition.interimResults = false;  // only fire when utterance is complete
+    recognition.lang = "en-GB";
+    let fatal = false; // set by onerror, read by onend
+    recognition.onstart = () => setIsListening(true);
+    recognition.onspeechstart = () => {
+      setIsSpeaking(true);
+      callbacksRef.current.onSpeechStart?.();
+    };
+    recognition.onspeechend = () => setIsSpeaking(false);
+    recognition.onresult = (event: SpeechRecognitionEvent) => {
+      // One event can deliver several new results; resultIndex is the first of them.
+      for (let i = event.resultIndex; i < event.results.length; i++) {
+        const result = event.results[i];
+        if (!result.isFinal) continue;
+        const text = result[0].transcript.trim();
+        if (text) callbacksRef.current.onUtterance(text);
+      }
+    };
+    recognition.onerror = (e: SpeechRecognitionErrorEvent) => {
+      // 'no-speech' is normal background silence — just ignore it
+      if (e.error === "no-speech") return;
+      if (FATAL_ERRORS.includes(e.error)) fatal = true;
+      console.error("SpeechRecognition error:", e.error);
+    };
+    recognition.onend = () => {
+      // Ignore sessions that were stopped or replaced by a newer one.
+      if (recognitionRef.current !== recognition) return;
+      if (!fatal) {
+        try {
+          recognition.start(); // engine ended on its own — keep listening
+          return;
+        } catch (err) {
+          console.error("SpeechRecognition restart failed:", err);
+        }
+      }
+      recognitionRef.current = null;
+      setIsListening(false);
+      setIsSpeaking(false);
+    };
+    recognition.start();
+    recognitionRef.current = recognition;
+  }, []);
+
+  const stop = useCallback(() => {
+    const recognition = recognitionRef.current;
+    recognitionRef.current = null; // cleared first so onend will not restart
+    recognition?.stop();
+    setIsListening(false);
+    setIsSpeaking(false);
+  }, []);
+
+  useEffect(() => stop, [stop]); // stop any active session on unmount
+  return { isListening, isSpeaking, start, stop };
+}