feat: 新增 Computer Use Agent 初步支持

This commit is contained in:
mofeng-git
2026-06-15 22:24:40 +08:00
parent 5c98aea7e3
commit 4b7be20fe0
20 changed files with 2518 additions and 2 deletions

View File

@@ -10,8 +10,10 @@ import { useConsoleEvents } from '@/composables/useConsoleEvents'
import { useHidWebSocket } from '@/composables/useHidWebSocket'
import { useWebRTC } from '@/composables/useWebRTC'
import { useVideoSession } from '@/composables/useVideoSession'
import { useComputerUseSocket, type ComputerUseServerMessage } from '@/composables/useComputerUseSocket'
import { getUnifiedAudio } from '@/composables/useUnifiedAudio'
import { streamApi, hidApi, atxApi, atxConfigApi, authApi } from '@/api'
import { streamApi, hidApi, atxApi, atxConfigApi, authApi, computerUseApi } from '@/api'
import type { ComputerUseScreenshot, ComputerUseSession } from '@/api'
import { CanonicalKey, HidBackend } from '@/types/generated'
import type { HidKeyboardEvent, HidMouseEvent } from '@/types/hid'
import { keyboardEventToCanonicalKey, updateModifierMaskForKey } from '@/lib/keyboardMappings'
@@ -29,6 +31,8 @@ import ActionBar from '@/components/ActionBar.vue'
import InfoBar from '@/components/InfoBar.vue'
import VirtualKeyboard from '@/components/VirtualKeyboard.vue'
import StatsSheet from '@/components/StatsSheet.vue'
import ComputerUseSheet from '@/components/ComputerUseSheet.vue'
import type { ComputerUseTimelineItem, NewComputerUseTimelineItem } from '@/types/computerUseTimeline'
import LanguageToggleButton from '@/components/LanguageToggleButton.vue'
import BrandMark from '@/components/BrandMark.vue'
import { Button } from '@/components/ui/button'
@@ -88,6 +92,11 @@ const consoleEvents = useConsoleEvents({
})
const videoMode = ref<VideoMode>('mjpeg')
// Whether the Computer Use sheet is open (bound to ComputerUseSheet through v-model:open).
const computerUseOpen = ref(false)
// Most recent session object received from computerUseApi or from a socket `session_updated` message.
const computerUseSession = ref<ComputerUseSession | null>(null)
// Ordered entries handed to ComputerUseSheet as its `timeline` prop.
const computerUseTimeline = ref<ComputerUseTimelineItem[]>([])
// Set after a successful start; sent as `continue_conversation` on the next start and reset when the timeline is cleared.
const computerUseConversationStarted = ref(false)
// Counter appended to timeline item ids so that items created in the same millisecond stay distinct.
let computerUseTimelineSeq = 0
const videoRef = ref<HTMLImageElement | null>(null)
const webrtcVideoRef = ref<HTMLVideoElement | null>(null)
@@ -118,6 +127,11 @@ const clientsStats = ref<Record<string, ClientStat>>({})
const myClientId = generateUUID()
// Computer Use socket: incoming server messages are routed to handleComputerUseMessage,
// and screenshot requests are answered with the frame produced by captureComputerUseFrame.
// NOTE(review): how the composable delivers the captured frame is not visible here — confirm in useComputerUseSocket.
const computerUseSocket = useComputerUseSocket({
onMessage: handleComputerUseMessage,
onScreenshotRequested: captureComputerUseFrame,
})
const mouseMode = ref<'absolute' | 'relative'>('absolute')
const pressedKeys = ref<CanonicalKey[]>([])
const keyboardLed = computed(() => ({
@@ -617,6 +631,8 @@ const videoContainerStyle = computed(() => {
}
})
// True only while the Computer Use sheet is open and the view is not in fullscreen.
const computerUsePanelVisible = computed(() => computerUseOpen.value && !isFullscreen.value)
const showMsdStatusCard = computed(() => {
return !!(systemStore.msd?.available && systemStore.hid?.backend !== 'ch9329')
})
@@ -677,6 +693,114 @@ async function captureFrameOverlay() {
}
}
/**
 * Grabs the frame currently held by the active video element and encodes it as a JPEG data URL.
 *
 * The MJPEG `<img>` is used when `videoMode` is `'mjpeg'`; otherwise the WebRTC `<video>` is used.
 * Sources wider than 1920 px are scaled down proportionally; narrower sources keep their size.
 *
 * @returns The encoded frame with its pixel dimensions, or `null` when no 2D context is
 *          available, the source element is missing or reports a zero dimension, or
 *          drawing/encoding throws (the error is logged).
 */
async function captureComputerUseFrame(): Promise<ComputerUseScreenshot | null> {
  const MAX_WIDTH = 1920
  try {
    const canvas = document.createElement('canvas')
    const ctx = canvas.getContext('2d')
    if (!ctx) return null

    // Resolve the drawable element and its intrinsic size for the current video mode.
    let source: CanvasImageSource
    let sourceWidth: number
    let sourceHeight: number
    if (videoMode.value === 'mjpeg') {
      const img = videoRef.value
      if (!img) return null
      source = img
      sourceWidth = img.naturalWidth
      sourceHeight = img.naturalHeight
    } else {
      const video = webrtcVideoRef.value
      if (!video) return null
      source = video
      sourceWidth = video.videoWidth
      sourceHeight = video.videoHeight
    }
    // A zero dimension means the element has nothing decoded to draw yet.
    if (!sourceWidth || !sourceHeight) return null

    // Never upscale: the factor is capped at 1.
    const scale = Math.min(1, MAX_WIDTH / sourceWidth)
    canvas.width = Math.max(1, Math.round(sourceWidth * scale))
    canvas.height = Math.max(1, Math.round(sourceHeight * scale))
    ctx.drawImage(source, 0, 0, canvas.width, canvas.height)

    return {
      data_url: canvas.toDataURL('image/jpeg', 0.82),
      width: canvas.width,
      height: canvas.height,
    }
  } catch (err) {
    console.error('[ComputerUse] Failed to capture frame:', err)
    return null
  }
}
/**
 * Applies one message from the Computer Use socket to local state.
 *
 * - `session_updated`: stores the session and mirrors a non-empty `last_error` /
 *   `final_message` into the timeline (error first, then assistant).
 * - `screenshot_captured` / `actions_executed`: appended to the timeline.
 * - `error`: appended to the timeline and shown as an error toast.
 *
 * Any other message type is ignored.
 */
function handleComputerUseMessage(message: ComputerUseServerMessage) {
  if (message.type === 'session_updated') {
    const { session } = message
    computerUseSession.value = session
    if (session.last_error) {
      pushComputerUseTimeline({ type: 'error', text: session.last_error })
    }
    if (session.final_message) {
      pushComputerUseTimeline({ type: 'assistant', text: session.final_message })
    }
    return
  }

  if (message.type === 'screenshot_captured') {
    pushComputerUseTimeline({ type: 'screenshot', screenshot: message.screenshot })
    return
  }

  if (message.type === 'actions_executed') {
    pushComputerUseTimeline({ type: 'actions_executed', actions: message.actions })
    return
  }

  if (message.type === 'error') {
    const description = message.message
    pushComputerUseTimeline({ type: 'error', text: description })
    toast.error('Computer Use failed', { description })
  }
}
/**
 * Appends an entry to the Computer Use timeline, skipping immediate repeats.
 *
 * An entry is dropped when the current last entry has the same `type` and either the same
 * `text`, or (for `actions_executed`) an identical JSON serialisation of `actions`.
 * Only the last entry is compared; earlier entries are never inspected.
 *
 * @param item Timeline entry without an id; the id is built from the current time plus a counter.
 */
function pushComputerUseTimeline(item: NewComputerUseTimelineItem) {
  const timeline = computerUseTimeline.value
  const previous = timeline[timeline.length - 1]

  if (previous?.type === item.type) {
    const sameText = 'text' in previous && 'text' in item && previous.text === item.text
    if (sameText) return

    const sameActions =
      previous.type === 'actions_executed' &&
      item.type === 'actions_executed' &&
      JSON.stringify(previous.actions) === JSON.stringify(item.actions)
    if (sameActions) return
  }

  const id = `${Date.now()}-${computerUseTimelineSeq++}`
  timeline.push({ id, ...item } as ComputerUseTimelineItem)
}
/**
 * Empties the Computer Use timeline and marks the conversation as not started,
 * so the next start request is sent with `continue_conversation: false`.
 */
function clearComputerUseTimeline() {
computerUseTimeline.value = []
computerUseConversationStarted.value = false
}
/**
 * Opens the Computer Use sheet, then makes a best-effort attempt to connect the socket
 * and refresh the session from the API.
 *
 * Neither failure is reported here: a failed connect is ignored, and a failed session
 * fetch leaves the current session value untouched.
 */
async function openComputerUse() {
  computerUseOpen.value = true

  try {
    await computerUseSocket.connect()
  } catch {
    // Best-effort: a connect failure must not prevent the sheet from opening.
  }

  try {
    computerUseSession.value = await computerUseApi.session()
  } catch {
    // Keep whatever session is already held.
  }
}
/**
 * Submits a prompt to the Computer Use API.
 *
 * Connects the socket first, records the prompt in the timeline, then calls
 * `computerUseApi.start`. The request continues the existing conversation when an earlier
 * start has succeeded since the timeline was last cleared.
 *
 * Failures are not rethrown: they are added to the timeline and shown as an error toast.
 *
 * @param prompt User instruction forwarded to the API.
 */
async function startComputerUse(prompt: string) {
  try {
    await computerUseSocket.connect()
    pushComputerUseTimeline({ type: 'user', text: prompt })
    computerUseSession.value = await computerUseApi.start({
      prompt,
      continue_conversation: computerUseConversationStarted.value,
      client_id: computerUseSocket.clientId,
    })
    computerUseConversationStarted.value = true
  } catch (err: unknown) {
    // Narrow instead of using `any`: accept Error instances and plain objects carrying a string `message`.
    const detail =
      typeof err === 'object' && err !== null && 'message' in err && typeof err.message === 'string'
        ? err.message
        : undefined
    pushComputerUseTimeline({ type: 'error', text: detail ?? 'Computer Use start failed' })
    toast.error('Computer Use start failed', { description: detail })
  }
}
/**
 * Asks the Computer Use API to stop and stores the session it returns.
 *
 * Failures are not rethrown; they are shown as an error toast.
 */
async function stopComputerUse() {
  try {
    computerUseSession.value = await computerUseApi.stop()
  } catch (err: unknown) {
    // Narrow instead of using `any`: accept Error instances and plain objects carrying a string `message`.
    const detail =
      typeof err === 'object' && err !== null && 'message' in err && typeof err.message === 'string'
        ? err.message
        : undefined
    toast.error('Computer Use stop failed', { description: detail })
  }
}
function waitForVideoFirstFrame(el: HTMLVideoElement, timeoutMs = 2000): Promise<boolean> {
return new Promise((resolve) => {
let done = false
@@ -2706,6 +2830,7 @@ onUnmounted(() => {
@reset="handleReset"
@wol="handleWol"
@open-terminal="openTerminal"
@open-computer-use="openComputerUse"
/>
<div class="flex-1 overflow-hidden relative">
<div
@@ -2715,7 +2840,11 @@ onUnmounted(() => {
background-size: 20px 20px;
"
/>
<div class="relative h-full w-full flex items-center justify-center p-1 sm:p-4">
<div class="relative flex h-full w-full min-w-0 items-stretch gap-3 p-1 sm:p-4">
<div
class="flex min-w-0 flex-1 items-center justify-center transition-all duration-300"
:class="{ 'md:pr-1': computerUsePanelVisible }"
>
<div
ref="videoContainerRef"
class="relative bg-black overflow-hidden flex items-center justify-center"
@@ -2906,6 +3035,17 @@ onUnmounted(() => {
</div>
</Transition>
</div>
</div>
<ComputerUseSheet
v-model:open="computerUseOpen"
:connected="computerUseSocket.connected.value"
:ws-error="computerUseSocket.error.value"
:session="computerUseSession"
:timeline="computerUseTimeline"
@start="startComputerUse"
@stop="stopComputerUse"
@clear="clearComputerUseTimeline"
/>
</div>
</div>
<Teleport :to="virtualKeyboardAttached ? '#keyboard-anchor' : 'body'" :disabled="virtualKeyboardAttached">