Overview

OpenAI’s Computer Use lets AI agents control computer interfaces through the Responses API. This guide shows how to use it with Orgo’s virtual desktops.

Quick Start

1

Install packages

pip install orgo openai python-dotenv
2

Set up API keys

export ORGO_API_KEY=your_orgo_api_key
export OPENAI_API_KEY=your_openai_api_key
3

Run your first task

import time
from openai import OpenAI
from orgo import Computer

# Initialize
client = OpenAI()
computer = Computer()

try:
    # Create request with task: ask the model for the first action
    response = client.responses.create(
        model="computer-use-preview",
        tools=[{
            "type": "computer_use_preview",
            "display_width": 1024,
            "display_height": 768,
            "environment": "linux"
        }],
        input=[{
            "role": "user",
            "content": [{
                "type": "input_text",
                "text": "Open Firefox and search for OpenAI"
            }]
        }],
        truncation="auto"
    )

    # Execute the suggested action. This quick start handles only the first
    # computer_call and only the "click" and "type" action types.
    actions = [item for item in response.output if item.type == "computer_call"]
    if actions:
        action = actions[0].action
        if action.type == "click":
            computer.left_click(action.x, action.y)
        elif action.type == "type":
            computer.type(action.text)
finally:
    # Clean up even when the request or the action raises, so the computer
    # created above is always destroyed
    computer.destroy()

Complete Example

Here’s a full working example that handles the complete agent loop:
import time
import base64
from openai import OpenAI
from orgo import Computer
from dotenv import load_dotenv

load_dotenv()

def run_computer_task(task, project_id=None):
    """Execute a task using OpenAI Computer Use with Orgo.

    Sends the task to the ``computer-use-preview`` model, then loops:
    execute the first ``computer_call`` in the response, wait briefly,
    send a screenshot back as the call's output, and read the next
    response. The loop ends when a response contains no ``computer_call``.

    Args:
        task: Natural-language description of what the agent should do.
        project_id: Optional Orgo project ID, forwarded to ``Computer``.

    Returns:
        The ``Computer`` used for the task. The caller is responsible for
        calling ``destroy()`` on it.

    Raises:
        Any exception raised by the OpenAI or Orgo calls. When this
        function created the computer itself (``project_id`` is None) the
        computer is destroyed before the exception propagates, because the
        caller never receives a handle to clean it up.
    """

    # Initialize OpenAI client and Orgo computer
    client = OpenAI()
    computer = Computer(project_id=project_id)
    print(f"🖥️  Computer ID: {computer.project_id}")

    # Request settings shared by the initial call and every follow-up call
    tools = [{
        "type": "computer_use_preview",
        "display_width": 1024,
        "display_height": 768,
        "environment": "linux"  # Orgo provides Linux desktops
    }]
    reasoning = {"summary": "concise"}  # Show reasoning steps

    # Instruction block followed by the task; written with explicit newlines
    # so the exact text (including the space after "desktop.") is visible
    prompt = (
        "IMPORTANT: You are controlling a Linux desktop. \n"
        "- Always double-click desktop icons to open applications\n"
        "- Use keyboard shortcuts as single commands (e.g., 'ctrl+c' not separate keys)\n"
        f"Task: {task}"
    )

    try:
        # Create initial request with the task
        response = client.responses.create(
            model="computer-use-preview",
            tools=tools,
            input=[{
                "role": "user",
                "content": [{
                    "type": "input_text",
                    "text": prompt
                }]
            }],
            reasoning=reasoning,
            truncation="auto"  # Required for computer use
        )

        # Main agent loop
        while True:
            # Display progress
            for item in response.output:
                if item.type == "reasoning" and hasattr(item, "summary"):
                    for summary in item.summary:
                        if hasattr(summary, "text"):
                            print(f"💭 {summary.text}")
                elif item.type == "message":
                    # Assistant text arrives as "message" items whose
                    # content parts of type "output_text" carry the text
                    for part in getattr(item, "content", None) or []:
                        if getattr(part, "type", None) == "output_text":
                            print(f"💬 {part.text}")
                elif item.type == "text" and hasattr(item, "text"):
                    # Retained from the original example for items that
                    # expose their text directly
                    print(f"💬 {item.text}")

            # Get computer actions from response
            actions = [item for item in response.output if item.type == "computer_call"]

            # If no actions, task is complete
            if not actions:
                print("✓ Task completed")
                break

            # Execute the action (only the first computer_call is handled)
            action = actions[0]
            print(f"→ {action.action.type}")

            # Surface any safety checks attached to this call.
            # NOTE(review): OpenAI's guide describes echoing these back as
            # `acknowledged_safety_checks` in the computer_call_output to
            # proceed; that decision is deliberately left to the integrator.
            for check in getattr(action, "pending_safety_checks", None) or []:
                print(f"⚠️  Safety check: {check.message}")

            execute_action(computer, action.action)
            time.sleep(1)  # Allow UI to update

            # Capture screenshot and continue
            screenshot = computer.screenshot_base64()

            response = client.responses.create(
                model="computer-use-preview",
                previous_response_id=response.id,  # Link to previous response
                tools=tools,
                input=[{
                    "call_id": action.call_id,
                    "type": "computer_call_output",
                    "output": {
                        "type": "input_image",
                        "image_url": f"data:image/png;base64,{screenshot}"
                    }
                }],
                reasoning=reasoning,
                truncation="auto"
            )
    except BaseException:
        # Includes KeyboardInterrupt: a computer created here would otherwise
        # be unreachable by the caller. A computer the caller asked to reuse
        # via project_id is left alone.
        if project_id is None:
            computer.destroy()
        raise

    return computer


def execute_action(computer, action):
    """Execute computer actions using Orgo."""
    
    match action.type:
        case "click":
            # Handle left/right clicks
            if getattr(action, 'button', 'left') == "right":
                computer.right_click(action.x, action.y)
            else:
                computer.left_click(action.x, action.y)
                
        case "double_click":
            computer.double_click(action.x, action.y)
            
        case "type":
            computer.type(action.text)
            
        case "key" | "keypress":
            # Handle single keys or key combinations
            keys = getattr(action, 'keys', [getattr(action, 'key', [])])
            if len(keys) > 1:
                # Multiple keys = keyboard shortcut
                computer.key('+'.join(keys).lower())
            else:
                # Single key press
                for key in keys:
                    computer.key(key)
                    
        case "scroll":
            # Convert scroll amount to direction
            scroll_y = getattr(action, 'scroll_y', 0)
            direction = "down" if scroll_y > 0 else "up"
            computer.scroll(direction, abs(scroll_y) // 100)
            
        case "wait":
            computer.wait(getattr(action, 'seconds', 2))
            
        case "screenshot":
            # Screenshot is taken automatically in the loop
            pass


if __name__ == "__main__":
    # Example usage: run a single task. run_computer_task returns the
    # Computer it used so it can be released afterwards.
    computer = run_computer_task("Open a terminal and list files")
    
    # Always clean up: destroy the computer once the task has finished
    computer.destroy()

Usage Examples

Basic Tasks

# Each call below passes no project_id, so each one gets its own computer.
# Destroy every computer as soon as its task is done; destroying only the
# last one would leave the earlier ones behind.

# Open a browser
computer = run_computer_task("Open Firefox")
computer.destroy()

# Navigate to a website
computer = run_computer_task("Go to github.com and search for orgo")
computer.destroy()

# Fill out a form
computer = run_computer_task("Fill out the contact form with test data")

# Always clean up
computer.destroy()

Complex Workflows

# Multi-step task: the numbered steps are passed as one task string, which
# run_computer_task embeds in the initial prompt sent to the model
task = """
1. Open a text editor
2. Write a Python hello world program
3. Save it as hello.py
4. Open a terminal
5. Run the program
"""
computer = run_computer_task(task)
# Release the computer once the workflow has finished
computer.destroy()

Reusing Sessions

# First task: no project_id is given; keep the project_id of the returned
# computer so the next call can refer to it
computer = run_computer_task("Open VS Code")
project_id = computer.project_id

# Continue in same session: run_computer_task forwards project_id to
# Computer(project_id=...)
computer = run_computer_task(
    "Create a new Python file", 
    project_id=project_id
)

# Clean up when done
computer.destroy()

Key Concepts

The Agent Loop

OpenAI Computer Use works in a continuous loop:
  1. Request → Send task to the model
  2. Action → Model suggests an action (click, type, etc.)
  3. Execute → Your code executes the action
  4. Screenshot → Capture the result
  5. Repeat → Continue until task is complete

Action Types

| Action | Description | Example |
| --- | --- | --- |
| click | Click at coordinates | Click button at (100, 200) |
| double_click | Double-click | Open desktop icon |
| type | Type text | Enter username |
| key / keypress | Press key(s) | Press Enter, Ctrl+C |
| scroll | Scroll page | Scroll down 3 units |
| wait | Pause execution | Wait 2 seconds |
| screenshot | Take screenshot | Capture current state |

Safety Features

OpenAI includes safety checks to prevent misuse:
# Handle safety checks if they occur
# (presumably `action` is a computer_call item taken from response.output)
if hasattr(action, 'pending_safety_checks'):
    for check in action.pending_safety_checks:
        print(f"⚠️  Safety check: {check.message}")
        # Acknowledge in next request if proceeding
        # NOTE(review): OpenAI's guide describes echoing the checks back as
        # `acknowledged_safety_checks` on the next computer_call_output —
        # confirm against the current API reference

Best Practices

1. Clear Instructions

# ✅ Good - Specific and clear: names the application, the site, and the goal
task = "Open Firefox, go to github.com, and star the orgo repository"

# ❌ Avoid - Too vague: names no application, site, or goal to act on
task = "Do some web stuff"

2. Error Handling

def safe_run_task(task):
    """Run a task, reporting any failure before re-raising it.

    Args:
        task: Task description passed to ``run_computer_task``.

    Returns:
        The ``Computer`` returned by ``run_computer_task``. The caller is
        responsible for calling ``destroy()`` on it.

    Raises:
        Exception: Whatever ``run_computer_task`` raised, after printing it.

    Note:
        ``run_computer_task`` hands back the computer only on success, so
        when it raises there is no handle available here to destroy.
    """
    try:
        return run_computer_task(task)
    except Exception as e:
        print(f"❌ Error: {e}")
        raise

3. Session Management

# Use context manager pattern
class ComputerSession:
    """Context manager that runs a task on entry and destroys its computer on exit."""

    def __init__(self, task):
        # Nothing runs yet; the task is executed when the `with` block is entered
        self.computer = None
        self.task = task

    def __enter__(self):
        # Run the task and hand the resulting computer to the `with` body
        session_computer = run_computer_task(self.task)
        self.computer = session_computer
        return session_computer

    def __exit__(self, *exc_info):
        # Nothing to release if the task never produced a computer
        if not self.computer:
            return
        self.computer.destroy()

# Usage: the task runs when the block is entered, and the computer is
# destroyed when the block exits
with ComputerSession("Open calculator") as computer:
    print(f"Session ID: {computer.project_id}")

4. Timing Considerations

# Add delays for UI updates (values are in seconds and illustrative; the
# complete example sleeps 1 second after every action)
time.sleep(1)  # After clicks
time.sleep(2)  # After opening applications
time.sleep(0.5)  # After typing

Comparison with Claude

| Feature | OpenAI Computer Use | Claude Computer Use |
| --- | --- | --- |
| API | Responses API | Messages API |
| Model | computer-use-preview | claude-4-sonnet |
| Beta Tag | Built-in | computer-use-2025-01-24 |
| Reasoning | Optional summaries | Thinking blocks |
| Environment | Multiple (browser, OS) | Single tool definition |

Limitations

  • Beta Status: Computer Use is in beta and may have unexpected behaviors
  • Rate Limits: The model has constrained rate limits
  • Accuracy: ~38% success rate on complex OS tasks
  • Environment: Best suited for browser-based tasks

Next Steps