You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
	
	
		
			360 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			TypeScript
		
	
		
		
			
		
	
	
			360 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			TypeScript
		
	
| 
											9 months ago
										 | import VoiceIcon from "@/app/icons/voice.svg"; | ||
|  | import VoiceOffIcon from "@/app/icons/voice-off.svg"; | ||
|  | import PowerIcon from "@/app/icons/power.svg"; | ||
|  | 
 | ||
|  | import styles from "./realtime-chat.module.scss"; | ||
|  | import clsx from "clsx"; | ||
|  | 
 | ||
|  | import { useState, useRef, useEffect } from "react"; | ||
|  | 
 | ||
|  | import { useChatStore, createMessage, useAppConfig } from "@/app/store"; | ||
|  | 
 | ||
|  | import { IconButton } from "@/app/components/button"; | ||
|  | 
 | ||
|  | import { | ||
|  |   Modality, | ||
|  |   RTClient, | ||
|  |   RTInputAudioItem, | ||
|  |   RTResponse, | ||
|  |   TurnDetection, | ||
|  | } from "rt-client"; | ||
|  | import { AudioHandler } from "@/app/lib/audio"; | ||
|  | import { uploadImage } from "@/app/utils/chat"; | ||
|  | import { VoicePrint } from "@/app/components/voice-print"; | ||
|  | 
 | ||
/** Props for the realtime voice-chat overlay component. */
interface RealtimeChatProps {
  /** Called when the user closes the realtime chat view (power button). */
  onClose?: () => void;
  /** NOTE(review): never invoked within this component as written — confirm whether it should fire when recording starts. */
  onStartVoice?: () => void;
  /** NOTE(review): never invoked within this component as written — confirm whether it should fire when recording pauses. */
  onPausedVoice?: () => void;
}
|  | 
 | ||
/**
 * Full-screen realtime voice chat UI.
 *
 * Connects to an OpenAI- or Azure-hosted realtime endpoint via `RTClient`,
 * streams microphone audio up and plays streamed audio responses back,
 * while mirroring both sides of the conversation into the chat store as
 * ordinary messages (with uploaded audio attachments).
 */
export function RealtimeChat({
  onClose,
  onStartVoice,
  onPausedVoice,
}: RealtimeChatProps) {
  const chatStore = useChatStore();
  const session = chatStore.currentSession();
  const config = useAppConfig();
  const [status, setStatus] = useState("");
  const [isRecording, setIsRecording] = useState(false);
  const [isConnected, setIsConnected] = useState(false);
  const [isConnecting, setIsConnecting] = useState(false);
  // "audio" requests both text+audio modalities; any other value requests text only.
  const [modality, setModality] = useState("audio");
  // Server-side voice activity detection; when false, turns are committed manually.
  const [useVAD, setUseVAD] = useState(true);
  // Frequency-domain snapshot of the mic signal, fed to the VoicePrint visualizer.
  const [frequencies, setFrequencies] = useState<Uint8Array | undefined>();

  const clientRef = useRef<RTClient | null>(null);
  const audioHandlerRef = useRef<AudioHandler | null>(null);
  // Guards the mount effect against running twice.
  const initRef = useRef(false);

  const temperature = config.realtimeConfig.temperature;
  const apiKey = config.realtimeConfig.apiKey;
  const model = config.realtimeConfig.model;
  const azure = config.realtimeConfig.provider === "Azure";
  const azureEndpoint = config.realtimeConfig.azure.endpoint;
  const azureDeployment = config.realtimeConfig.azure.deployment;
  const voice = config.realtimeConfig.voice;

  // Toggle connection: establish + configure a session when disconnected,
  // tear it down when connected. No-op while a connection attempt is in flight.
  const handleConnect = async () => {
    if (isConnecting) return;
    if (!isConnected) {
      try {
        setIsConnecting(true);
        clientRef.current = azure
          ? new RTClient(
              new URL(azureEndpoint),
              { key: apiKey },
              { deployment: azureDeployment },
            )
          : new RTClient({ key: apiKey }, { model });
        const modalities: Modality[] =
          modality === "audio" ? ["text", "audio"] : ["text"];
        const turnDetection: TurnDetection = useVAD
          ? { type: "server_vad" }
          : null;
        await clientRef.current.configure({
          instructions: "",
          voice,
          input_audio_transcription: { model: "whisper-1" },
          turn_detection: turnDetection,
          tools: [],
          temperature,
          modalities,
        });
        // Fire-and-forget: the listener loops over server events until close.
        startResponseListener();

        setIsConnected(true);
        // TODO
        // try {
        //   const recentMessages = chatStore.getMessagesWithMemory();
        //   for (const message of recentMessages) {
        //     const { role, content } = message;
        //     if (typeof content === "string") {
        //       await clientRef.current.sendItem({
        //         type: "message",
        //         role: role as any,
        //         content: [
        //           {
        //             type: (role === "assistant" ? "text" : "input_text") as any,
        //             text: content as string,
        //           },
        //         ],
        //       });
        //     }
        //   }
        //   // await clientRef.current.generateResponse();
        // } catch (error) {
        //   console.error("Set message failed:", error);
        // }
      } catch (error) {
        console.error("Connection failed:", error);
        setStatus("Connection failed");
      } finally {
        setIsConnecting(false);
      }
    } else {
      await disconnect();
    }
  };

  // Close the realtime session and drop the client reference.
  const disconnect = async () => {
    if (clientRef.current) {
      try {
        await clientRef.current.close();
        clientRef.current = null;
        setIsConnected(false);
      } catch (error) {
        console.error("Disconnect failed:", error);
      }
    }
  };

  // Long-lived loop over server events; dispatches assistant responses and
  // transcribed user input. Errors are ignored once clientRef has been
  // nulled by disconnect() (the iterator aborting is then expected).
  const startResponseListener = async () => {
    if (!clientRef.current) return;

    try {
      for await (const serverEvent of clientRef.current.events()) {
        if (serverEvent.type === "response") {
          await handleResponse(serverEvent);
        } else if (serverEvent.type === "input_audio") {
          await handleInputAudio(serverEvent);
        }
      }
    } catch (error) {
      if (clientRef.current) {
        console.error("Response iteration error:", error);
      }
    }
  };

  // Stream one assistant response into a freshly created bot message,
  // playing audio chunks as they arrive and accumulating transcript text.
  const handleResponse = async (response: RTResponse) => {
    for await (const item of response) {
      if (item.type === "message" && item.role === "assistant") {
        const botMessage = createMessage({
          role: item.role,
          content: "",
        });
        // add bot message first
        chatStore.updateTargetSession(session, (session) => {
          session.messages = session.messages.concat([botMessage]);
        });
        let hasAudio = false;
        for await (const content of item) {
          if (content.type === "text") {
            for await (const text of content.textChunks()) {
              botMessage.content += text;
            }
          } else if (content.type === "audio") {
            // Transcript text and audio playback are consumed concurrently
            // from the same content item.
            const textTask = async () => {
              for await (const text of content.transcriptChunks()) {
                botMessage.content += text;
              }
            };
            const audioTask = async () => {
              audioHandlerRef.current?.startStreamingPlayback();
              for await (const audio of content.audioChunks()) {
                hasAudio = true;
                audioHandlerRef.current?.playChunk(audio);
              }
            };
            await Promise.all([textTask(), audioTask()]);
          }
          // update message.content
          // NOTE(review): botMessage is mutated in place; concat() with no
          // args produces a new array so the store notifies subscribers.
          chatStore.updateTargetSession(session, (session) => {
            session.messages = session.messages.concat();
          });
        }
        if (hasAudio) {
          // upload audio get audio_url
          const blob = audioHandlerRef.current?.savePlayFile();
          // NOTE(review): blob! assumes the audio handler exists and produced
          // a file; uploadImage rejection is unhandled — verify.
          uploadImage(blob!).then((audio_url) => {
            botMessage.audio_url = audio_url;
            // update text and audio_url
            chatStore.updateTargetSession(session, (session) => {
              session.messages = session.messages.concat();
            });
          });
        }
      }
    }
  };

  // Record one completed user audio turn: add the transcription as a user
  // message, upload the captured audio clip, then stop any bot playback
  // (the user has started talking over it).
  const handleInputAudio = async (item: RTInputAudioItem) => {
    await item.waitForCompletion();
    if (item.transcription) {
      const userMessage = createMessage({
        role: "user",
        content: item.transcription,
      });
      chatStore.updateTargetSession(session, (session) => {
        session.messages = session.messages.concat([userMessage]);
      });
      // save input audio_url, and update session
      const { audioStartMillis, audioEndMillis } = item;
      // upload audio get audio_url
      const blob = audioHandlerRef.current?.saveRecordFile(
        audioStartMillis,
        audioEndMillis,
      );
      uploadImage(blob!).then((audio_url) => {
        userMessage.audio_url = audio_url;
        chatStore.updateTargetSession(session, (session) => {
          session.messages = session.messages.concat();
        });
      });
    }
    // stop streaming play after get input audio.
    audioHandlerRef.current?.stopStreamingPlayback();
  };

  // Start or stop microphone capture. In manual-turn mode (no VAD), stopping
  // also commits the buffered audio and requests a model response.
  const toggleRecording = async () => {
    if (!isRecording && clientRef.current) {
      try {
        if (!audioHandlerRef.current) {
          audioHandlerRef.current = new AudioHandler();
          await audioHandlerRef.current.initialize();
        }
        await audioHandlerRef.current.startRecording(async (chunk) => {
          await clientRef.current?.sendAudio(chunk);
        });
        setIsRecording(true);
      } catch (error) {
        console.error("Failed to start recording:", error);
      }
    } else if (audioHandlerRef.current) {
      try {
        audioHandlerRef.current.stopRecording();
        if (!useVAD) {
          const inputAudio = await clientRef.current?.commitAudio();
          await handleInputAudio(inputAudio!);
          await clientRef.current?.generateResponse();
        }
        setIsRecording(false);
      } catch (error) {
        console.error("Failed to stop recording:", error);
      }
    }
  };

  // Mount: initialize audio, connect, and start recording immediately.
  useEffect(() => {
    // Prevent double initialization.
    if (initRef.current) return;
    initRef.current = true;

    const initAudioHandler = async () => {
      const handler = new AudioHandler();
      await handler.initialize();
      audioHandlerRef.current = handler;
      await handleConnect();
      await toggleRecording();
    };

    initAudioHandler().catch((error) => {
      // NOTE(review): setStatus expects a string but receives the raw error
      // value here — confirm this renders acceptably.
      setStatus(error);
      console.error(error);
    });

    return () => {
      // NOTE(review): this cleanup closes over the initial isRecording
      // (false) because the dependency array is empty, so toggleRecording()
      // is likely never called on unmount — verify.
      if (isRecording) {
        toggleRecording();
      }
      audioHandlerRef.current?.close().catch(console.error);
      disconnect();
    };
  }, []);

  // Drive the VoicePrint visualizer: sample mic frequency data once per
  // animation frame while connected and recording.
  useEffect(() => {
    let animationFrameId: number;

    if (isConnected && isRecording) {
      const animationFrame = () => {
        if (audioHandlerRef.current) {
          const freqData = audioHandlerRef.current.getByteFrequencyData();
          setFrequencies(freqData);
        }
        animationFrameId = requestAnimationFrame(animationFrame);
      };

      animationFrameId = requestAnimationFrame(animationFrame);
    } else {
      setFrequencies(undefined);
    }

    return () => {
      if (animationFrameId) {
        cancelAnimationFrame(animationFrameId);
      }
    };
  }, [isConnected, isRecording]);

  // update session params
  // Push live config changes to the active session, if any.
  useEffect(() => {
    clientRef.current?.configure({ voice });
  }, [voice]);
  useEffect(() => {
    clientRef.current?.configure({ temperature });
  }, [temperature]);

  // NOTE(review): onClose fires before recording stops / disconnect — if the
  // parent unmounts this component on onClose, teardown may race; verify.
  const handleClose = async () => {
    onClose?.();
    if (isRecording) {
      await toggleRecording();
    }
    disconnect().catch(console.error);
  };

  return (
    <div className={styles["realtime-chat"]}>
      <div
        className={clsx(styles["circle-mic"], {
          [styles["pulse"]]: isRecording,
        })}
      >
        <VoicePrint frequencies={frequencies} isActive={isRecording} />
      </div>

      <div className={styles["bottom-icons"]}>
        <div>
          <IconButton
            icon={isRecording ? <VoiceIcon /> : <VoiceOffIcon />}
            onClick={toggleRecording}
            disabled={!isConnected}
            shadow
            bordered
          />
        </div>
        <div className={styles["icon-center"]}>{status}</div>
        <div>
          <IconButton
            icon={<PowerIcon />}
            onClick={handleClose}
            shadow
            bordered
          />
        </div>
      </div>
    </div>
  );
}