Build a Custom Agent
Create your own agent to solve computer-use tasks
In this tutorial, you'll build a custom agent that can interact with desktop environments. You'll start with a simple rule-based agent and progressively add intelligence.
Time: ~20 minutes. Prerequisites: cua-bench installed, basic Python knowledge.
What You'll Build
A custom agent that can:
- Take screenshots of the desktop
- Analyze what it sees
- Decide on actions (click, type, etc.)
- Execute those actions
Agent Architecture
Every agent in cua-bench extends the BaseAgent class and implements the perform_task() method:
from cua_bench.agents import BaseAgent, AgentResult
class MyAgent(BaseAgent):
@staticmethod
def name() -> str:
return "my-agent"
async def perform_task(
self,
task_description: str,
session: DesktopSession,
logging_dir: Path | None = None,
) -> AgentResult:
# Your agent logic here
pass
Step 1: Scaffold the Agent
Use the CLI to create the agent structure:
cb agent init my-agent --output-dir ./agents
This creates:
agents/
├── __init__.py
├── agent.py
└── requirements.txt
Step 2: Implement a Basic Agent
Edit agents/agent.py:
import cua_bench as cb
from cua_bench.agents import BaseAgent, AgentResult, FailureMode
from pathlib import Path
class MyAgent(BaseAgent):
"""A simple rule-based agent."""
@staticmethod
def name() -> str:
return "my-agent"
async def perform_task(
self,
task_description: str,
session: cb.DesktopSession,
logging_dir: Path | None = None,
) -> AgentResult:
"""Perform the task by taking screenshots and clicking."""
# Step 1: Take a screenshot
screenshot = await session.screenshot()
# Step 2: Simple strategy - click the center of the screen
# (We'll make this smarter later)
center_x = 256
center_y = 256
await session.execute_action(cb.ClickAction(x=center_x, y=center_y))
# Return result with token counts (for cost tracking)
return AgentResult(
total_input_tokens=0,
total_output_tokens=0,
failure_mode=FailureMode.NONE
)
This basic agent just clicks the center of the screen—not very smart, but it's a starting point!
Step 3: Register the Agent
Create or edit .cua/agents.yaml in your project root:
agents:
- name: my-agent
import_path: agents.agent:MyAgent
defaults:
max_steps: 20
Step 4: Test the Agent
Run the agent against a simple task:
# First, create a simple button task (from previous example)
# Then run your agent against it
cb run task tasks/my-button-task --agent my-agent
The agent will click the center—which probably won't hit the button. Let's make it smarter.
Step 5: Add Screenshot Analysis
Now let's add actual intelligence. We'll use a simple approach: look for elements and click them.
import cua_bench as cb
from cua_bench.agents import BaseAgent, AgentResult, FailureMode
from pathlib import Path
import base64
class MyAgent(BaseAgent):
"""An agent that analyzes screenshots to find targets."""
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.max_steps = kwargs.get("max_steps", 20)
@staticmethod
def name() -> str:
return "my-agent"
async def perform_task(
self,
task_description: str,
session: cb.DesktopSession,
logging_dir: Path | None = None,
) -> AgentResult:
"""Perform task with screenshot analysis."""
total_input_tokens = 0
total_output_tokens = 0
for step in range(self.max_steps):
# Take screenshot
screenshot_bytes = await session.screenshot()
# Save screenshot for debugging
if logging_dir:
screenshot_path = logging_dir / f"step_{step}.png"
screenshot_path.write_bytes(screenshot_bytes)
# Analyze and decide action
action = await self._decide_action(
task_description,
screenshot_bytes,
step
)
if action is None:
# Agent thinks task is complete
break
# Execute the action
await session.execute_action(action)
return AgentResult(
total_input_tokens=total_input_tokens,
total_output_tokens=total_output_tokens,
failure_mode=FailureMode.NONE
)
async def _decide_action(
self,
task_description: str,
screenshot: bytes,
step: int
) -> cb.Action | None:
"""Decide what action to take based on the screenshot."""
# For now, use a simple heuristic:
# Click slightly below center where buttons often are
if step == 0:
return cb.ClickAction(x=256, y=300)
# After first click, assume done
return None
Step 6: Integrate with an LLM
For real intelligence, integrate with a language model. Here's an example using Anthropic's Claude:
import anthropic
import cua_bench as cb
from cua_bench.agents import BaseAgent, AgentResult, FailureMode
from pathlib import Path
import base64
class MyAgent(BaseAgent):
"""An LLM-powered agent."""
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.model = kwargs.get("model", "claude-sonnet-4-20250514")
self.max_steps = kwargs.get("max_steps", 20)
self.client = anthropic.Anthropic()
@staticmethod
def name() -> str:
return "my-agent"
async def perform_task(
self,
task_description: str,
session: cb.DesktopSession,
logging_dir: Path | None = None,
) -> AgentResult:
"""Perform task using LLM for decisions."""
total_input_tokens = 0
total_output_tokens = 0
for step in range(self.max_steps):
# Take screenshot
screenshot_bytes = await session.screenshot()
# Ask LLM what to do
action, input_tokens, output_tokens = await self._ask_llm(
task_description,
screenshot_bytes
)
total_input_tokens += input_tokens
total_output_tokens += output_tokens
if action is None:
break
await session.execute_action(action)
return AgentResult(
total_input_tokens=total_input_tokens,
total_output_tokens=total_output_tokens,
failure_mode=FailureMode.NONE
)
async def _ask_llm(
self,
task_description: str,
screenshot: bytes
) -> tuple[cb.Action | None, int, int]:
"""Ask the LLM what action to take."""
# Encode screenshot as base64
screenshot_b64 = base64.standard_b64encode(screenshot).decode()
response = self.client.messages.create(
model=self.model,
max_tokens=1024,
messages=[{
"role": "user",
"content": [
{
"type": "image",
"source": {
"type": "base64",
"media_type": "image/png",
"data": screenshot_b64,
}
},
{
"type": "text",
"text": f"""Task: {task_description}
Look at this screenshot and decide what action to take.
Respond with ONE of:
- CLICK x y (to click at coordinates)
- TYPE text (to type text)
- DONE (if task is complete)
Example: CLICK 150 200"""
}
]
}]
)
# Parse response
response_text = response.content[0].text.strip()
action = self._parse_action(response_text)
return (
action,
response.usage.input_tokens,
response.usage.output_tokens
)
def _parse_action(self, response: str) -> cb.Action | None:
"""Parse LLM response into an action."""
if response.startswith("DONE"):
return None
if response.startswith("CLICK"):
parts = response.split()
if len(parts) >= 3:
x, y = int(parts[1]), int(parts[2])
return cb.ClickAction(x=x, y=y)
if response.startswith("TYPE"):
text = response[5:].strip()
return cb.TypeAction(text=text)
# Default: no action
return None
Step 7: Run with Custom Model
Update your agent registration to accept model configuration:
agents:
- name: my-agent
import_path: agents.agent:MyAgent
defaults:
model: claude-sonnet-4-20250514
max_steps: 20
Then run:
export ANTHROPIC_API_KEY=sk-...
cb run task tasks/my-button-task --agent my-agent
Advanced: Build a Docker Image
For production use, package your agent as a Docker image:
Create Dockerfile
FROM python:3.11-slim
WORKDIR /app
# Install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy agent code
COPY agents/ ./agents/
# Set entrypoint
ENTRYPOINT ["python", "-m", "agents.agent"]
Build and Push
# Build
docker build -t myregistry/my-agent:latest .
# Push to registry
cb agent push myregistry/my-agent:latest
Register Docker Agent
agents:
- name: my-agent-docker
image: myregistry/my-agent:latest
defaults:
model: claude-sonnet-4-20250514
max_steps: 50
Available Actions
Your agent can execute these actions via session.execute_action():
| Action | Description |
|---|---|
| ClickAction(x, y) | Single click at coordinates |
| RightClickAction(x, y) | Right-click |
| DoubleClickAction(x, y) | Double-click |
| DragAction(from_x, from_y, to_x, to_y) | Drag from one point to another |
| TypeAction(text) | Type text |
| KeyAction(key) | Press a single key (e.g., "return", "escape") |
| HotkeyAction(keys) | Press key combination (e.g., ["ctrl", "c"]) |
| ScrollAction(direction, amount) | Scroll up/down |
| WaitAction(seconds) | Wait for a duration |
Complete Code
agents/agent.py (LLM-powered)
import anthropic
import cua_bench as cb
from cua_bench.agents import BaseAgent, AgentResult, FailureMode
from pathlib import Path
import base64
class MyAgent(BaseAgent):
"""An LLM-powered computer-use agent."""
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.model = kwargs.get("model", "claude-sonnet-4-20250514")
self.max_steps = kwargs.get("max_steps", 20)
self.client = anthropic.Anthropic()
@staticmethod
def name() -> str:
return "my-agent"
async def perform_task(
self,
task_description: str,
session: cb.DesktopSession,
logging_dir: Path | None = None,
) -> AgentResult:
"""Perform task using LLM for decisions."""
total_input_tokens = 0
total_output_tokens = 0
for step in range(self.max_steps):
screenshot_bytes = await session.screenshot()
if logging_dir:
(logging_dir / f"step_{step}.png").write_bytes(screenshot_bytes)
action, in_tokens, out_tokens = await self._ask_llm(
task_description, screenshot_bytes
)
total_input_tokens += in_tokens
total_output_tokens += out_tokens
if action is None:
break
await session.execute_action(action)
return AgentResult(
total_input_tokens=total_input_tokens,
total_output_tokens=total_output_tokens,
failure_mode=FailureMode.NONE
)
async def _ask_llm(
self, task_description: str, screenshot: bytes
) -> tuple[cb.Action | None, int, int]:
"""Ask the LLM what action to take."""
screenshot_b64 = base64.standard_b64encode(screenshot).decode()
response = self.client.messages.create(
model=self.model,
max_tokens=1024,
messages=[{
"role": "user",
"content": [
{
"type": "image",
"source": {
"type": "base64",
"media_type": "image/png",
"data": screenshot_b64,
}
},
{
"type": "text",
"text": f"""Task: {task_description}
Look at this screenshot and decide what action to take.
Respond with ONE of:
- CLICK x y (to click at coordinates)
- TYPE text (to type text)
- KEY key (to press a key like "return")
- DONE (if task is complete)
Example: CLICK 150 200"""
}
]
}]
)
response_text = response.content[0].text.strip()
action = self._parse_action(response_text)
return (action, response.usage.input_tokens, response.usage.output_tokens)
def _parse_action(self, response: str) -> cb.Action | None:
"""Parse LLM response into an action."""
# Uppercase only a copy used for command matching; keep the original
# casing so TYPE payload text is not altered.
response = response.strip()
command = response.upper()
if command.startswith("DONE"):
return None
if command.startswith("CLICK"):
parts = response.split()
if len(parts) >= 3:
return cb.ClickAction(x=int(parts[1]), y=int(parts[2]))
if command.startswith("TYPE"):
text = response[5:].strip()
return cb.TypeAction(text=text)
if command.startswith("KEY"):
key = response[4:].strip().lower()
return cb.KeyAction(key=key)
return None
.cua/agents.yaml
agents:
- name: my-agent
import_path: agents.agent:MyAgent
defaults:
model: claude-sonnet-4-20250514
max_steps: 20
agents/requirements.txt
cua-bench
anthropic>=0.20.0
Next Steps
- Build a Universal Task with custom GUIs
- See the Custom Agents Guide for more details
- Explore the Adapters for benchmark integrations
Was this page helpful?