import VoiceIcon from "@/app/icons/voice.svg";
import VoiceOffIcon from "@/app/icons/voice-off.svg";
import PowerIcon from "@/app/icons/power.svg";

import styles from "./realtime-chat.module.scss";
import clsx from "clsx";

import { useState, useRef, useEffect } from "react";

import { useChatStore, createMessage, useAppConfig } from "@/app/store";

import { IconButton } from "@/app/components/button";

import {
  Modality,
  RTClient,
  RTInputAudioItem,
  RTResponse,
  TurnDetection,
} from "rt-client";
import { AudioHandler } from "@/app/lib/audio";
import { uploadImage } from "@/app/utils/chat";
import { VoicePrint } from "@/app/components/voice-print";

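// Props for the realtime voice-chat overlay; all callbacks are optional.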
interface RealtimeChatProps {
  onClose?: () => void;
  onStartVoice?: () => void;
  onPausedVoice?: () => void;
}

export function RealtimeChat({
  onClose,
  onStartVoice,
  onPausedVoice,
}: RealtimeChatProps) {
  const chatStore = useChatStore();
  const session = chatStore.currentSession();
  const config = useAppConfig();
  const [status, setStatus] = useState("");
  const [isRecording, setIsRecording] = useState(false);
  const [isConnected, setIsConnected] = useState(false);
  const [isConnecting, setIsConnecting] = useState(false);
  const [modality, setModality] = useState("audio");
  const [useVAD, setUseVAD] = useState(true);
  const [frequencies, setFrequencies] = useState<Uint8Array | undefined>();

  const clientRef = useRef<RTClient | null>(null);
  const audioHandlerRef = useRef<AudioHandler | null>(null);
  const initRef = useRef(false);

  const temperature = config.realtimeConfig.temperature;
  const apiKey = config.realtimeConfig.apiKey;
  const model = config.realtimeConfig.model;
  const azure = config.realtimeConfig.provider === "Azure";
  const azureEndpoint = config.realtimeConfig.azure.endpoint;
  const azureDeployment = config.realtimeConfig.azure.deployment;
  const voice = config.realtimeConfig.voice;

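  // Open a realtime session (Azure or OpenAI), configure it, and start the
  // server-event listener; if we are already connected, disconnect instead.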
  const handleConnect = async () => {
    if (isConnecting) return;
    if (!isConnected) {
      try {
        setIsConnecting(true);
        clientRef.current = azure
          ? new RTClient(
              new URL(azureEndpoint),
              { key: apiKey },
              { deployment: azureDeployment },
            )
          : new RTClient({ key: apiKey }, { model });
        const modalities: Modality[] =
          modality === "audio" ? ["text", "audio"] : ["text"];
        const turnDetection: TurnDetection = useVAD
          ? { type: "server_vad" }
          : null;
        await clientRef.current.configure({
          instructions: "",
          voice,
          input_audio_transcription: { model: "whisper-1" },
          turn_detection: turnDetection,
          tools: [],
          temperature,
          modalities,
        });
        startResponseListener();

        setIsConnected(true);
        // TODO
        // try {
        //   const recentMessages = chatStore.getMessagesWithMemory();
        //   for (const message of recentMessages) {
        //     const { role, content } = message;
        //     if (typeof content === "string") {
        //       await clientRef.current.sendItem({
        //         type: "message",
        //         role: role as any,
        //         content: [
        //           {
        //             type: (role === "assistant" ? "text" : "input_text") as any,
        //             text: content as string,
        //           },
        //         ],
        //       });
        //     }
        //   }
        //   // await clientRef.current.generateResponse();
        // } catch (error) {
        //   console.error("Set message failed:", error);
        // }
      } catch (error) {
        console.error("Connection failed:", error);
        setStatus("Connection failed");
      } finally {
        setIsConnecting(false);
      }
    } else {
      await disconnect();
    }
  };

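  // Close the realtime client and reset connection state.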
  const disconnect = async () => {
    if (clientRef.current) {
      try {
        await clientRef.current.close();
        clientRef.current = null;
        setIsConnected(false);
      } catch (error) {
        console.error("Disconnect failed:", error);
      }
    }
  };

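  // Drain server events from the client and dispatch them: "response" events
  // carry assistant output, "input_audio" events carry the user's turn.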
  const startResponseListener = async () => {
    if (!clientRef.current) return;

    try {
      for await (const serverEvent of clientRef.current.events()) {
        if (serverEvent.type === "response") {
          await handleResponse(serverEvent);
        } else if (serverEvent.type === "input_audio") {
          await handleInputAudio(serverEvent);
        }
      }
    } catch (error) {
      if (clientRef.current) {
        console.error("Response iteration error:", error);
      }
    }
  };

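  // Stream an assistant response into the chat: text chunks append to the
  // message content; audio chunks play immediately and are saved afterwards.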
  const handleResponse = async (response: RTResponse) => {
    for await (const item of response) {
      if (item.type === "message" && item.role === "assistant") {
        const botMessage = createMessage({
          role: item.role,
          content: "",
        });
        // add the bot message to the session first, then stream content into it
        chatStore.updateTargetSession(session, (session) => {
          session.messages = session.messages.concat([botMessage]);
        });
        let hasAudio = false;
        for await (const content of item) {
          if (content.type === "text") {
            for await (const text of content.textChunks()) {
              botMessage.content += text;
            }
          } else if (content.type === "audio") {
            const textTask = async () => {
              for await (const text of content.transcriptChunks()) {
                botMessage.content += text;
              }
            };
            const audioTask = async () => {
              audioHandlerRef.current?.startStreamingPlayback();
              for await (const audio of content.audioChunks()) {
                hasAudio = true;
                audioHandlerRef.current?.playChunk(audio);
              }
            };
            await Promise.all([textTask(), audioTask()]);
          }
          // concat() returns a fresh array reference, forcing React to
          // re-render with the updated message content
          chatStore.updateTargetSession(session, (session) => {
            session.messages = session.messages.concat();
          });
        }
        if (hasAudio) {
          // upload the played audio and attach the resulting audio_url
          const blob = audioHandlerRef.current?.savePlayFile();
          uploadImage(blob!).then((audio_url) => {
            botMessage.audio_url = audio_url;
            // publish the final text and audio_url
            chatStore.updateTargetSession(session, (session) => {
              session.messages = session.messages.concat();
            });
          });
        }
      }
    }
  };

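  // Handle a completed user audio turn: record the transcription as a user
  // message and attach the captured audio once it has been uploaded.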
  const handleInputAudio = async (item: RTInputAudioItem) => {
    await item.waitForCompletion();
    if (item.transcription) {
      const userMessage = createMessage({
        role: "user",
        content: item.transcription,
      });
      chatStore.updateTargetSession(session, (session) => {
        session.messages = session.messages.concat([userMessage]);
      });
      // save the input audio_url and update the session
      const { audioStartMillis, audioEndMillis } = item;
      // upload the recorded segment and attach the resulting audio_url
      const blob = audioHandlerRef.current?.saveRecordFile(
        audioStartMillis,
        audioEndMillis,
      );
      uploadImage(blob!).then((audio_url) => {
        userMessage.audio_url = audio_url;
        chatStore.updateTargetSession(session, (session) => {
          session.messages = session.messages.concat();
        });
      });
    }
    // stop streaming playback once the user's input audio has been received
    audioHandlerRef.current?.stopStreamingPlayback();
  };

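  // Start or stop microphone capture. Without server VAD, stopping commits
  // the buffered audio and explicitly asks the model for a response.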
  const toggleRecording = async () => {
    if (!isRecording && clientRef.current) {
      try {
        if (!audioHandlerRef.current) {
          audioHandlerRef.current = new AudioHandler();
          await audioHandlerRef.current.initialize();
        }
        await audioHandlerRef.current.startRecording(async (chunk) => {
          await clientRef.current?.sendAudio(chunk);
        });
        setIsRecording(true);
      } catch (error) {
        console.error("Failed to start recording:", error);
      }
    } else if (audioHandlerRef.current) {
      try {
        audioHandlerRef.current.stopRecording();
        if (!useVAD) {
          const inputAudio = await clientRef.current?.commitAudio();
          await handleInputAudio(inputAudio!);
          await clientRef.current?.generateResponse();
        }
        setIsRecording(false);
      } catch (error) {
        console.error("Failed to stop recording:", error);
      }
    }
  };

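  // One-time setup on mount: create the audio handler, connect, and start
  // recording; the cleanup stops recording, closes audio, and disconnects.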
  useEffect(() => {
    // prevent duplicate initialization (effects run twice in React StrictMode)
    if (initRef.current) return;
    initRef.current = true;

    const initAudioHandler = async () => {
      const handler = new AudioHandler();
      await handler.initialize();
      audioHandlerRef.current = handler;
      await handleConnect();
      await toggleRecording();
    };

    initAudioHandler().catch((error) => {
      setStatus(String(error));
      console.error(error);
    });

    return () => {
      if (isRecording) {
        toggleRecording();
      }
      audioHandlerRef.current?.close().catch(console.error);
      disconnect();
    };
  }, []);

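  // While recording, sample the microphone's frequency data on every
  // animation frame to drive the VoicePrint visualization.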
  useEffect(() => {
    let animationFrameId: number;

    if (isConnected && isRecording) {
      const animationFrame = () => {
        if (audioHandlerRef.current) {
          const freqData = audioHandlerRef.current.getByteFrequencyData();
          setFrequencies(freqData);
        }
        animationFrameId = requestAnimationFrame(animationFrame);
      };

      animationFrameId = requestAnimationFrame(animationFrame);
    } else {
      setFrequencies(undefined);
    }

    return () => {
      if (animationFrameId) {
        cancelAnimationFrame(animationFrameId);
      }
    };
  }, [isConnected, isRecording]);

  // update session params
  useEffect(() => {
    clientRef.current?.configure({ voice });
  }, [voice]);
  useEffect(() => {
    clientRef.current?.configure({ temperature });
  }, [temperature]);

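  // Notify the parent, stop recording if needed, and close the session.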
  const handleClose = async () => {
    onClose?.();
    if (isRecording) {
      await toggleRecording();
    }
    disconnect().catch(console.error);
  };

  return (
    <div className={styles["realtime-chat"]}>
      <div
        className={clsx(styles["circle-mic"], {
          [styles["pulse"]]: isRecording,
        })}
      >
        <VoicePrint frequencies={frequencies} isActive={isRecording} />
      </div>

      <div className={styles["bottom-icons"]}>
        <div>
          <IconButton
            icon={isRecording ? <VoiceIcon /> : <VoiceOffIcon />}
            onClick={toggleRecording}
            disabled={!isConnected}
            shadow
            bordered
          />
        </div>
        <div className={styles["icon-center"]}>{status}</div>
        <div>
          <IconButton
            icon={<PowerIcon />}
            onClick={handleClose}
            shadow
            bordered
          />
        </div>
      </div>
    </div>
  );
}
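
// Example usage (hypothetical parent component; `showRealtime` is an assumed
// piece of local state, not part of this file):
//   {showRealtime && <RealtimeChat onClose={() => setShowRealtime(false)} />}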