Using the Agent SDK
Add AI automation with vision-language models
Now that you've verified your sandbox works with the Computer SDK, use an Agent to automate complex tasks. The agent interacts with the computer environment using a vision-language model to understand the UI and execute actions.
Cua Agent Framework provides:
- 100+ VLM options through Cua VLM Router and direct provider access
- Built-in optimizations for computer-use tasks
- Structured agent loops for consistent behavior
Installation
Using uv (recommended):
uv pip install "cua-agent[all]"

Or with pip:
pip install "cua-agent[all]"

For TypeScript:

npm install @trycua/computer ai

Choose Your Model Provider
Use Cua's inference API to access multiple model providers with a single API key (same key used for sandbox access). Cua VLM Router provides intelligent routing and cost optimization.
import os
import asyncio

from computer import Computer
from agent import ComputerAgent

# One Cua key covers both the sandbox and the routed model access.
os.environ["CUA_API_KEY"] = "sk_cua-api01_..."

# Handle to the cloud sandbox; the name comes from the CLI or website.
computer = Computer(
    os_type="linux",  # or "windows" or "macos"
    provider_type="cloud",
    name="your-sandbox-name",
)


async def main():
    """Connect to the sandbox, run a single agent task, then disconnect."""
    await computer.run()  # Connect to the sandbox
    try:
        agent = ComputerAgent(
            model="cua/anthropic/claude-sonnet-4.5",  # Cua-routed model
            tools=[computer],
            max_trajectory_budget=5.0,
        )
        task = {"role": "user", "content": "Take a screenshot and tell me what you see"}
        async for result in agent.run([task]):
            for item in result["output"]:
                if item["type"] != "message":
                    continue
                print(item["content"][0]["text"])
    finally:
        # Always release the sandbox, even if the agent run fails.
        await computer.disconnect()
asyncio.run(main())

Available Cua models:
- cua/anthropic/claude-sonnet-4.5 — Claude Sonnet 4.5 (recommended)
- cua/anthropic/claude-opus-4.5 — Claude Opus 4.5 (enhanced agentic capabilities)
- cua/anthropic/claude-haiku-4.5 — Claude Haiku 4.5 (faster, cost-effective)
- cua/google/gemini-3-pro-preview — Gemini 3 Pro Preview (most powerful multimodal)
- cua/google/gemini-3-flash-preview — Gemini 3 Flash Preview (fastest and cheapest)
Available composed models:
- huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929 — GTA1 grounding + Claude Sonnet 4.5 planning
- huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5 — GTA1 grounding + GPT-5 planning
- huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B+openai/gpt-4o — UI-TARS grounding + GPT-4o planning
- moondream3+openai/gpt-4o — Moondream3 grounding + GPT-4o planning
Benefits:
- Single API key for multiple providers
- Cost tracking and optimization
- No need to manage multiple provider keys
Use your own API keys from model providers like Anthropic, OpenAI, or others.
import os
import asyncio

from computer import Computer
from agent import ComputerAgent

# Set your provider API key
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..."  # For Anthropic
# OR
os.environ["OPENAI_API_KEY"] = "sk-..."  # For OpenAI

# Cloud sandbox created earlier via the CLI or website.
computer = Computer(
    os_type="linux",  # or "windows" or "macos"
    provider_type="cloud",
    name="your-sandbox-name",
)


async def main():
    """Launch the sandbox, drive one agent run against it, and clean up."""
    await computer.run()  # Launch & connect to the sandbox
    try:
        # Direct provider model — billed through your own provider key.
        agent = ComputerAgent(
            model="anthropic/claude-sonnet-4-5-20250929",
            tools=[computer],
            max_trajectory_budget=5.0,
        )
        messages = [
            {"role": "user", "content": "Take a screenshot and tell me what you see"}
        ]
        async for step in agent.run(messages):
            for entry in step["output"]:
                if entry["type"] == "message":
                    print(entry["content"][0]["text"])
    finally:
        await computer.disconnect()
asyncio.run(main())

Supported providers:
- anthropic/claude-* — Anthropic Claude models
- openai/gpt-* — OpenAI GPT models
- openai/o1-* — OpenAI o1 models
- huggingface-local/* — Local HuggingFace models
- And many more via LiteLLM
See Supported Models for the complete list.
For TypeScript, you can build agent loops using the Vercel AI SDK with the Cua Computer Framework TypeScript library.
import Anthropic from "@anthropic-ai/sdk";
import { Computer, OSType } from "@trycua/computer";
// The Anthropic client reads ANTHROPIC_API_KEY from the environment by default.
const client = new Anthropic();
// Assigned inside runAgentLoop once the sandbox is created.
let computer: Computer;
// Custom tool definition for the Anthropic Messages API.
// NOTE: the API expects the snake_case key `input_schema`; the original
// camelCase `inputSchema` (and the invalid `type: "tool"` tag) would be
// rejected by the request validator.
const computerTool = {
  name: "computer",
  description:
    "Control the computer with actions like screenshot, click, type, etc.",
  input_schema: {
    type: "object" as const,
    properties: {
      action: {
        type: "string" as const,
        description:
          "Action to perform (screenshot, click, type, key_press, etc.)",
      },
      coordinate: {
        type: "array" as const,
        items: { type: "number" as const },
        description: "x, y coordinates for click actions",
      },
      text: {
        type: "string" as const,
        description: "Text to type",
      },
    },
    required: ["action"],
  },
};
/**
 * Drive a minimal agent loop: ask Claude for the next action, execute it
 * against the Cua sandbox, and feed the tool results back — up to 10 turns.
 *
 * Fixes over the original:
 *  - prints the model's text output (the original broke on `end_turn`
 *    without ever printing anything, so the demo produced no output);
 *  - disconnects the sandbox in a `finally` so a thrown turn can't leak
 *    the connection;
 *  - narrows the untyped `block.input` payload once instead of accessing
 *    fields on an unknown value.
 */
async function runAgentLoop(goal: string) {
  // Initialize computer
  computer = new Computer({
    osType: OSType.LINUX,
    // NOTE(review): snake_case here looks inconsistent with osType/apiKey —
    // confirm the expected key name against @trycua/computer's typings.
    provider_type: "cloud",
    name: "your-sandbox-name",
    apiKey: process.env.CUA_API_KEY!,
  });
  await computer.run();

  const messages: any[] = [{ role: "user", content: goal }];

  try {
    // Bounded loop so a confused model cannot spin forever.
    for (let turn = 0; turn < 10; turn++) {
      const response = await client.messages.create({
        model: "claude-opus-4-1-20250805",
        max_tokens: 4096,
        tools: [computerTool],
        messages: messages,
      });

      // Surface any text the model produced (including the final answer).
      for (const block of response.content) {
        if (block.type === "text") {
          console.log(block.text);
        }
      }

      if (response.stop_reason === "end_turn") {
        console.log("Task completed!");
        break;
      }

      messages.push({ role: "assistant", content: response.content });

      // Execute each requested tool call against the sandbox.
      const toolResults: Array<{
        type: "tool_result";
        tool_use_id: string;
        content: string;
      }> = [];
      for (const block of response.content) {
        if (block.type !== "tool_use") continue;
        // block.input is untyped JSON from the model; narrow it once here.
        const input = block.input as {
          action: string;
          coordinate?: [number, number];
          text?: string;
        };
        let result: unknown;
        switch (input.action) {
          case "screenshot":
            result = await computer.interface.screenshot();
            break;
          case "click":
            result = await computer.interface.click(
              input.coordinate![0],
              input.coordinate![1]
            );
            break;
          case "type":
            result = await computer.interface.type(input.text!);
            break;
        }
        toolResults.push({
          type: "tool_result",
          tool_use_id: block.id,
          content: JSON.stringify(result),
        });
      }

      // Tool results go back to the model as a user turn.
      if (toolResults.length > 0) {
        messages.push({ role: "user", content: toolResults });
      }
    }
  } finally {
    // Always release the sandbox connection, even if a turn throws.
    await computer.disconnect();
  }
}
runAgentLoop("Take a screenshot and tell me what you see");

For more details, see the Vercel AI SDK Computer Use Cookbook.
Next Steps
- Explore Agent Loops for advanced agent configuration
- Check out Custom Tools to extend your agents
- Review Supported Model Providers for more LLM options
- Try the Form Filling example use case
- Join our Discord community for help and discussion
Was this page helpful?