Here’s a full working example that handles the complete agent loop:
example.py
"""Run a Gemini Computer Use agent loop against an Orgo virtual computer.

The script sends a task plus a screenshot to Gemini, executes the UI actions
Gemini requests on the Orgo computer, returns a fresh screenshot as the
function response, and repeats until Gemini stops requesting actions or the
turn limit is reached.
"""

import base64
import io
import os
import shlex
import time

from dotenv import load_dotenv
from google import genai
from google.genai import types
from orgo import Computer
from PIL import Image

# Load environment variables
load_dotenv()

# Initialize Gemini client
client = genai.Client(api_key=os.environ.get('GEMINI_API_KEY'))

# Connect to your Orgo computer
# Get your computer_id from https://orgo.ai/workspaces
computer = Computer(computer_id="your-computer-id")

# Screen resolution
SCREEN_WIDTH = 1024
SCREEN_HEIGHT = 768

# Model used for every turn and the maximum number of agent turns
MODEL_NAME = 'gemini-2.5-computer-use-preview-10-2025'
MAX_TURNS = 20

# System prompt with Ubuntu-specific instructions
SYSTEM_PROMPT = f"""You are controlling an Ubuntu Linux virtual machine with a display resolution of {SCREEN_WIDTH}x{SCREEN_HEIGHT}.

<SYSTEM_CAPABILITY>
* You have access to a virtual Ubuntu desktop environment with standard applications
* You can see the current state through screenshots and control the computer through actions
* The environment has Firefox browser and standard Ubuntu applications pre-installed
</SYSTEM_CAPABILITY>

<UBUNTU_DESKTOP_GUIDELINES>
* CRITICAL: When opening applications or files on the Ubuntu desktop, you MUST USE DOUBLE-CLICK, not single-click
* Single-click only selects desktop icons but DOES NOT open them
* Desktop interactions:
  - Desktop icons (apps/folders): DOUBLE-CLICK to open
  - Menu items: SINGLE-CLICK to select
  - Taskbar/launcher icons: SINGLE-CLICK to open
  - Window buttons (close/minimize/maximize): SINGLE-CLICK
  - File browser items: DOUBLE-CLICK to open
* Always start by taking a screenshot to see the current state
* When you need to submit or confirm, use the 'Enter' key
</UBUNTU_DESKTOP_GUIDELINES>

<IMPORTANT_NOTES>
* Be efficient with screenshots - only take them when you need to see the current state
* Wait for pages/applications to load before taking another screenshot
* Batch multiple actions together when possible before checking the result
</IMPORTANT_NOTES>"""


def denormalize_x(x: int) -> int:
    """Convert normalized x coordinate (0-999) to actual pixel."""
    return int(x / 1000 * SCREEN_WIDTH)


def denormalize_y(y: int) -> int:
    """Convert normalized y coordinate (0-999) to actual pixel."""
    return int(y / 1000 * SCREEN_HEIGHT)


def get_screenshot_png() -> bytes:
    """Get screenshot as PNG bytes (Gemini requires PNG format).

    The base64 screenshot returned by the computer is decoded, opened with
    Pillow, and re-encoded as PNG.
    """
    # NOTE(review): presumably the computer returns JPEG data - confirm.
    raw_data = base64.b64decode(computer.screenshot_base64())
    image = Image.open(io.BytesIO(raw_data))
    png_buffer = io.BytesIO()
    image.save(png_buffer, format='PNG')
    return png_buffer.getvalue()


def get_current_url() -> str:
    """Return a best-effort value for the browser's current URL.

    This actually reports the active window's title (via ``xdotool``), which
    is used as a stand-in for the URL. Falls back to ``"about:blank"`` when
    the command fails or returns nothing.
    """
    try:
        result = computer.bash("xdotool getactivewindow getwindowname")
    except Exception:
        # Deliberately best-effort: a missing URL must not abort the loop.
        return "about:blank"
    return result if result else "about:blank"


def execute_function_calls(candidate):
    """Execute function calls from Gemini's response.

    Args:
        candidate: A response candidate whose ``content.parts`` may contain
            function calls.

    Returns:
        A list of ``(function_name, action_result)`` tuples, one per function
        call. ``action_result`` holds an ``"error"`` key when the action
        failed or is not implemented, and a ``"url"`` key after ``navigate``.
    """
    results = []
    function_calls = [
        part.function_call
        for part in candidate.content.parts
        if part.function_call
    ]

    for function_call in function_calls:
        fname = function_call.name
        args = function_call.args
        action_result = {}
        print(f" → {fname}")

        try:
            if fname == "open_web_browser":
                pass  # Browser already open
            elif fname == "click_at":
                computer.left_click(denormalize_x(args["x"]), denormalize_y(args["y"]))
            elif fname == "type_text_at":
                # Focus the target field first, then type into it.
                computer.left_click(denormalize_x(args["x"]), denormalize_y(args["y"]))
                computer.type(args["text"])
                if args.get("press_enter", False):
                    computer.key("Return")
            elif fname == "scroll_document":
                computer.scroll(args["direction"], 3)
            elif fname == "key_combination":
                computer.key(args["keys"])
            elif fname == "go_back":
                computer.key("alt+Left")
            elif fname == "navigate":
                url = args["url"]
                # The URL comes from the model; quote it so it cannot inject
                # extra shell commands.
                computer.bash(f"firefox {shlex.quote(url)} &")
                action_result["url"] = url
            elif fname == "wait_5_seconds":
                computer.wait(5)
            else:
                print(f" Warning: Unimplemented function {fname}")
                # Tell the model the action did not happen instead of
                # reporting it as completed.
                action_result = {"error": f"Unimplemented function {fname}"}

            time.sleep(1)  # Wait for UI to update
        except Exception as e:
            print(f" Error: {e}")
            action_result = {"error": str(e)}

        results.append((fname, action_result))

    return results


def get_function_responses(results):
    """Create function responses with screenshot and URL.

    Args:
        results: ``(function_name, action_result)`` tuples as returned by
            ``execute_function_calls``.

    Returns:
        A list of ``types.FunctionResponse`` objects. All of them share one
        screenshot taken after the whole batch of actions has run.
    """
    screenshot_png = get_screenshot_png()
    current_url = get_current_url()

    function_responses = []
    for name, result in results:
        response_data = {
            "status": "error" if "error" in result else "completed",
            "url": current_url,
        }
        # Action-specific values (e.g. the navigated URL) take precedence.
        response_data.update(result)

        function_responses.append(
            types.FunctionResponse(
                name=name,
                response=response_data,
                parts=[
                    types.FunctionResponsePart(
                        inline_data=types.FunctionResponseBlob(
                            mime_type="image/png",
                            data=screenshot_png
                        )
                    )
                ]
            )
        )

    return function_responses


try:
    # Configure Computer Use tool with system instruction
    config = types.GenerateContentConfig(
        system_instruction=SYSTEM_PROMPT,
        tools=[
            types.Tool(
                computer_use=types.ComputerUse(
                    environment=types.Environment.ENVIRONMENT_BROWSER
                )
            )
        ]
    )

    # Define task
    task = "Open Chrome and search for 'gemini ai'"
    print(f"Task: {task}\n")

    # Get initial screenshot
    initial_screenshot = get_screenshot_png()

    # Create initial request
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part(text=task),
                types.Part.from_bytes(
                    data=initial_screenshot,
                    mime_type='image/png'
                )
            ]
        )
    ]

    # Agent loop
    for iteration in range(MAX_TURNS):
        print(f"\n--- Turn {iteration + 1} ---")

        # Get response from Gemini
        response = client.models.generate_content(
            model=MODEL_NAME,
            contents=contents,
            config=config
        )

        # A response can come back without candidates or content;
        # stop cleanly instead of raising IndexError/AttributeError.
        if not response.candidates:
            print("\n⚠ Gemini returned no candidates; stopping")
            break

        candidate = response.candidates[0]
        if candidate.content is None or not candidate.content.parts:
            print("\n⚠ Gemini returned an empty response; stopping")
            break

        contents.append(candidate.content)

        # Display progress
        for part in candidate.content.parts:
            if part.text:
                print(f"💬 {part.text}")

        # Check for function calls
        has_function_calls = any(
            part.function_call for part in candidate.content.parts
        )

        if not has_function_calls:
            print("\n✓ Task completed")
            break

        # Execute actions
        print("→ Executing actions...")
        results = execute_function_calls(candidate)

        # Get responses with screenshot and URL
        function_responses = get_function_responses(results)

        # Continue conversation
        contents.append(
            types.Content(
                role="user",
                parts=[
                    types.Part(function_response=fr)
                    for fr in function_responses
                ]
            )
        )
    else:
        # Runs only when the loop was never broken out of.
        print(f"\n⚠ Stopped after {MAX_TURNS} turns without completing the task")

except Exception as e:
    print(f"\n❌ Error: {e}")

finally:
    print("\nDone!")
    # Note: computer.destroy() not called to keep computer running
    # Call computer.destroy() if you want to clean up
# Change the task variable to control what Gemini does.
# The assignments below are alternatives: keep the one you want. If all of
# them are left in place, the last assignment wins.
task = "Open Chrome and search for 'gemini ai'"

# Navigate to a website
task = "Go to github.com and search for 'orgo'"

# Fill a form
task = "Fill out the contact form with test data"
The system prompt provides crucial context to Gemini about the Ubuntu environment:
# Abbreviated excerpt of the system prompt; "..." marks text elided from the
# full definition in the example script.
SYSTEM_PROMPT = f"""You are controlling an Ubuntu Linux virtual machine...

<UBUNTU_DESKTOP_GUIDELINES>
* CRITICAL: When opening applications or files on the Ubuntu desktop, you MUST USE DOUBLE-CLICK, not single-click
* Single-click only selects desktop icons but DOES NOT open them
* Desktop icons (apps/folders): DOUBLE-CLICK to open
* Menu items: SINGLE-CLICK to select
</UBUNTU_DESKTOP_GUIDELINES>"""