yaze 0.3.2
Link to the Past ROM Editor
 
Loading...
Searching...
No Matches
ai_gui_controller.cc
Go to the documentation of this file.
2
3#include <chrono>
4#include <thread>
5
6#include "absl/strings/str_cat.h"
7#include "absl/strings/str_format.h"
8#include "absl/time/clock.h"
9#include "absl/time/time.h"
11
12#ifdef YAZE_WITH_GRPC
15#endif
16
17namespace yaze {
18namespace cli {
19namespace ai {
20
// NOTE(review): the line that opens this constructor's signature is missing
// from this listing; only the second parameter onward is visible.
//
// Stores both service pointers and builds the vision refiner from
// gemini_service, then rejects null dependencies.
// Throws std::invalid_argument when either pointer is null.
22 GuiAutomationClient* gui_client)
23 : gemini_service_(gemini_service),
24 gui_client_(gui_client),
// The refiner is constructed from gemini_service before the null checks in
// the body run, because member initializers execute first.
25 vision_refiner_(std::make_unique<VisionActionRefiner>(gemini_service)) {
26
// Both collaborators are mandatory.
27 if (!gemini_service_) {
28 throw std::invalid_argument("Gemini service cannot be null");
29 }
30
31 if (!gui_client_) {
32 throw std::invalid_argument("GUI client cannot be null");
33 }
34}
35
39
41
42 return absl::OkStatus();
43}
44
45absl::StatusOr<ControlResult> AIGUIController::ExecuteCommand(
46 const std::string& command) {
47
48 // Parse natural language command into actions
49 auto actions_result = AIActionParser::ParseCommand(command);
50 if (!actions_result.ok()) {
51 return actions_result.status();
52 }
53
54 return ExecuteActions(*actions_result);
55}
56
// Runs the given actions in order, retrying each one while
// retry_count < config_.max_retries_per_action.
//
// Every exit path visible here returns a ControlResult rather than an error
// status: failures are reported through result.success == false and
// result.error_message. result.success becomes true only after every action
// has succeeded, at which point a "final_state" screenshot is captured and,
// if possible, analyzed.
//
// NOTE(review): three lines of this function are missing from this listing
// (flagged inline below); the comments describe only what is visible.
57absl::StatusOr<ControlResult> AIGUIController::ExecuteActions(
58 const std::vector<AIAction>& actions) {
59
60 ControlResult result;
61 result.success = false;
62
63 for (const auto& action : actions) {
64 int retry_count = 0;
65 bool action_succeeded = false;
// Work on a copy so refinements can adjust parameters without touching the
// caller's action.
66 AIAction current_action = action;
67
68 while (retry_count < config_.max_retries_per_action && !action_succeeded) {
// This counter lives on `result`, so it accumulates across all actions and is
// not reset per action.
69 result.iterations_performed++;
70
// NOTE(review): the condition guarding this early exit is missing from the
// listing; presumably it compares iterations_performed to a configured
// maximum. Confirm against the real source.
72 result.error_message = "Max iterations reached";
73 return result;
74 }
75
76 // Execute the action with vision verification
77 auto execute_result = ExecuteSingleAction(
78 current_action,
// NOTE(review): the second argument to ExecuteSingleAction is missing from
// the listing.
80 );
81
82 if (!execute_result.ok()) {
83 result.error_message = std::string(execute_result.status().message());
84 return result;
85 }
86
// Each attempt is recorded, including attempts that are later retried.
87 result.vision_analyses.push_back(*execute_result);
88 result.actions_executed.push_back(current_action);
89
90 if (execute_result->action_successful) {
91 action_succeeded = true;
92 }
// NOTE(review): the line that opens the refinement branch is missing from
// the listing. The trailing `else` below suggests a conditional guards it.
94 // Refine action and retry
95 auto refinement = vision_refiner_->RefineAction(
96 current_action,
97 *execute_result
98 );
99
100 if (!refinement.ok()) {
101 result.error_message =
102 absl::StrCat("Failed to refine action: ",
103 refinement.status().message());
104 return result;
105 }
106
107 if (refinement->needs_different_approach) {
108 result.error_message =
109 absl::StrCat("Action requires different approach: ",
110 refinement->reasoning);
111 return result;
112 }
113
// When needs_retry is false the parameters stay unchanged, but the loop still
// makes another attempt because retry_count is incremented either way.
114 if (refinement->needs_retry) {
115 // Update action parameters
116 for (const auto& [key, value] : refinement->adjusted_parameters) {
117 current_action.parameters[key] = value;
118 }
119 }
120
121 retry_count++;
122 }
123 else {
124 // No refinement, just fail
125 result.error_message = execute_result->error_message;
126 return result;
127 }
128 }
129
// The retry budget ran out without a successful attempt.
130 if (!action_succeeded) {
131 result.error_message =
132 absl::StrFormat("Action failed after %d retries", retry_count);
133 return result;
134 }
135 }
136
137 result.success = true;
138
// The final capture and analysis are best-effort: a failure in either leaves
// result.success true and skips the corresponding field.
139 // Capture final state
140 auto final_screenshot = CaptureCurrentState("final_state");
141 if (final_screenshot.ok()) {
142 result.screenshots_taken.push_back(*final_screenshot);
143
144 // Analyze final state
145 auto final_analysis = vision_refiner_->AnalyzeScreenshot(
146 *final_screenshot,
147 "Verify all actions completed successfully"
148 );
149
150 if (final_analysis.ok()) {
151 result.final_state_description = final_analysis->description;
152 }
153 }
154
155 return result;
156}
157
// Executes one action through ExecuteGRPCAction(), optionally capturing
// screenshots before and after so the outcome can be verified with vision.
//
// Returns an error status only when a screenshot capture fails. A failure of
// the action itself is returned as an OK StatusOr whose result has
// action_successful == false and error_message set.
//
// NOTE(review): two lines of this function are missing from this listing
// (flagged inline below).
158absl::StatusOr<VisionAnalysisResult> AIGUIController::ExecuteSingleAction(
159 const AIAction& action,
160 bool verify_with_vision) {
161
// NOTE(review): `result` is used further down, but its declaration is not
// visible in this listing; presumably it is declared on the missing line here.
163
164 // Capture before screenshot
165 std::filesystem::path before_screenshot;
166 if (verify_with_vision) {
167 auto before_result = CaptureCurrentState("before_action");
168 if (!before_result.ok()) {
169 return before_result.status();
170 }
171 before_screenshot = *before_result;
172 }
173
// NOTE(review): the condition that opens this block is missing from the
// listing, so it is unclear when this first delay is applied.
174 // Wait for UI to settle
176 std::this_thread::sleep_for(
177 std::chrono::milliseconds(config_.screenshot_delay_ms));
178 }
179
180 // Execute the action via gRPC
181 auto execute_status = ExecuteGRPCAction(action);
182 if (!execute_status.ok()) {
// The failure is reported inside the result rather than as an error status.
183 result.action_successful = false;
184 result.error_message = std::string(execute_status.message());
185 return result;
186 }
187
// This delay runs unconditionally, even when no screenshot will be taken.
188 // Wait for action to complete
189 std::this_thread::sleep_for(
190 std::chrono::milliseconds(config_.screenshot_delay_ms));
191
192 if (verify_with_vision) {
193 // Capture after screenshot
194 auto after_result = CaptureCurrentState("after_action");
195 if (!after_result.ok()) {
196 return after_result.status();
197 }
198
199 // Verify with vision
200 return VerifyActionSuccess(action, before_screenshot, *after_result);
201 }
202 else {
203 // Assume success without verification
204 result.action_successful = true;
205 result.description = "Action executed (no vision verification)";
206 return result;
207 }
208}
209
210absl::StatusOr<VisionAnalysisResult> AIGUIController::AnalyzeCurrentGUIState(
211 const std::string& context) {
212
213 auto screenshot = CaptureCurrentState("analysis");
214 if (!screenshot.ok()) {
215 return screenshot.status();
216 }
217
218 return vision_refiner_->AnalyzeScreenshot(*screenshot, context);
219}
220
221// Private helper methods
222
223absl::StatusOr<std::filesystem::path> AIGUIController::CaptureCurrentState(
224 const std::string& description) {
225
226#ifdef YAZE_WITH_GRPC
227 std::filesystem::path path = GenerateScreenshotPath(description);
228
229 auto result = yaze::test::CaptureHarnessScreenshot(path.string());
230 if (!result.ok()) {
231 return result.status();
232 }
233
234 return std::filesystem::path(result->file_path);
235#else
236 return absl::UnimplementedError("Screenshot capture requires gRPC support");
237#endif
238}
239
240absl::Status AIGUIController::ExecuteGRPCAction(const AIAction& action) {
241 // Convert AI action to gRPC test commands
242 auto test_script_result = action_generator_.GenerateTestScript({action});
243
244 if (!test_script_result.ok()) {
245 return test_script_result.status();
246 }
247
248#ifdef YAZE_WITH_GRPC
249 if (!gui_client_) {
250 return absl::FailedPreconditionError("GUI automation client not initialized");
251 }
252
253 // Execute the action based on its type
254 if (action.type == AIActionType::kClickButton) {
255 // Extract target from parameters
256 std::string target = "button:Unknown";
257 if (action.parameters.count("target") > 0) {
258 target = action.parameters.at("target");
259 }
260
261 // Determine click type
262 ClickType click_type = ClickType::kLeft;
263 if (action.parameters.count("click_type") > 0) {
264 const std::string& type = action.parameters.at("click_type");
265 if (type == "right") {
266 click_type = ClickType::kRight;
267 } else if (type == "middle") {
268 click_type = ClickType::kMiddle;
269 } else if (type == "double") {
270 click_type = ClickType::kDouble;
271 }
272 }
273
274 auto result = gui_client_->Click(target, click_type);
275 if (!result.ok()) {
276 return result.status();
277 }
278
279 if (!result->success) {
280 return absl::InternalError(
281 absl::StrCat("Click action failed: ", result->message));
282 }
283
284 return absl::OkStatus();
285 }
286 else if (action.type == AIActionType::kSelectTile) {
287 // Extract target and text from parameters (treating select as a type-like action)
288 std::string target = "input:Unknown";
289 std::string text = "";
290 bool clear_first = true;
291
292 if (action.parameters.count("target") > 0) {
293 target = action.parameters.at("target");
294 }
295 if (action.parameters.count("text") > 0) {
296 text = action.parameters.at("text");
297 }
298 if (action.parameters.count("clear_first") > 0) {
299 clear_first = (action.parameters.at("clear_first") == "true");
300 }
301
302 auto result = gui_client_->Type(target, text, clear_first);
303 if (!result.ok()) {
304 return result.status();
305 }
306
307 if (!result->success) {
308 return absl::InternalError(
309 absl::StrCat("Type action failed: ", result->message));
310 }
311
312 return absl::OkStatus();
313 }
314 else if (action.type == AIActionType::kWait) {
315 // Extract condition and timeout from parameters
316 std::string condition = "visible";
317 int timeout_ms = 5000;
318 int poll_interval_ms = 100;
319
320 if (action.parameters.count("condition") > 0) {
321 condition = action.parameters.at("condition");
322 }
323 if (action.parameters.count("timeout_ms") > 0) {
324 timeout_ms = std::stoi(action.parameters.at("timeout_ms"));
325 }
326 if (action.parameters.count("poll_interval_ms") > 0) {
327 poll_interval_ms = std::stoi(action.parameters.at("poll_interval_ms"));
328 }
329
330 auto result = gui_client_->Wait(condition, timeout_ms, poll_interval_ms);
331 if (!result.ok()) {
332 return result.status();
333 }
334
335 if (!result->success) {
336 return absl::InternalError(
337 absl::StrCat("Wait action failed: ", result->message));
338 }
339
340 return absl::OkStatus();
341 }
342 else if (action.type == AIActionType::kVerifyTile) {
343 // Extract condition from parameters (treating verify as assert)
344 std::string condition = "";
345 if (action.parameters.count("condition") > 0) {
346 condition = action.parameters.at("condition");
347 }
348
349 auto result = gui_client_->Assert(condition);
350 if (!result.ok()) {
351 return result.status();
352 }
353
354 if (!result->success) {
355 return absl::InternalError(
356 absl::StrCat("Assert action failed: ", result->message,
357 " (expected: ", result->expected_value,
358 ", actual: ", result->actual_value, ")"));
359 }
360
361 return absl::OkStatus();
362 }
363 else if (action.type == AIActionType::kPlaceTile) {
364 // This is a special action for setting overworld tiles
365 // Extract map_id, x, y, tile from parameters
366 if (action.parameters.count("map_id") == 0 ||
367 action.parameters.count("x") == 0 ||
368 action.parameters.count("y") == 0 ||
369 action.parameters.count("tile") == 0) {
370 return absl::InvalidArgumentError(
371 "set_tile action requires map_id, x, y, and tile parameters");
372 }
373
374 int map_id = std::stoi(action.parameters.at("map_id"));
375 int x = std::stoi(action.parameters.at("x"));
376 int y = std::stoi(action.parameters.at("y"));
377 std::string tile_str = action.parameters.at("tile");
378
379 // Navigate to overworld editor
380 auto click_result = gui_client_->Click("menu:Overworld", ClickType::kLeft);
381 if (!click_result.ok() || !click_result->success) {
382 return absl::InternalError("Failed to open Overworld editor");
383 }
384
385 // Wait for overworld editor to be visible
386 auto wait_result = gui_client_->Wait("window:Overworld Editor", 2000, 100);
387 if (!wait_result.ok() || !wait_result->success) {
388 return absl::InternalError("Overworld editor did not appear");
389 }
390
391 // Set the map ID
392 auto type_result = gui_client_->Type("input:Map ID", std::to_string(map_id), true);
393 if (!type_result.ok() || !type_result->success) {
394 return absl::InternalError("Failed to set map ID");
395 }
396
397 // Click on the tile position (approximate based on editor layout)
398 // This is a simplified implementation
399 std::string target = absl::StrCat("canvas:overworld@", x * 16, ",", y * 16);
400 click_result = gui_client_->Click(target, ClickType::kLeft);
401 if (!click_result.ok() || !click_result->success) {
402 return absl::InternalError("Failed to click tile position");
403 }
404
405 return absl::OkStatus();
406 }
407 else {
408 return absl::UnimplementedError(
409 absl::StrCat("Action type not implemented: ",
410 static_cast<int>(action.type)));
411 }
412#else
413 return absl::UnimplementedError(
414 "gRPC GUI automation requires building with -DYAZE_WITH_GRPC=ON");
415#endif
416}
417
418
419absl::StatusOr<VisionAnalysisResult> AIGUIController::VerifyActionSuccess(
420 const AIAction& action,
421 const std::filesystem::path& before_screenshot,
422 const std::filesystem::path& after_screenshot) {
423
424 return vision_refiner_->VerifyAction(action, before_screenshot, after_screenshot);
425}
426
// NOTE(review): the line that opens this function's signature is missing from
// this listing; only the parameter list onward is visible.
//
// Asks the vision refiner for adjustments to original_action based on
// analysis, and returns a copy of the action with those adjustments applied.
// A refinement failure is returned unchanged. The refinement's needs_retry
// and needs_different_approach flags are not consulted here.
428 const AIAction& original_action,
429 const VisionAnalysisResult& analysis) {
430
431 auto refinement = vision_refiner_->RefineAction(original_action, analysis);
432 if (!refinement.ok()) {
433 return refinement.status();
434 }
435
// Copy first so the caller's action is left untouched.
436 AIAction refined_action = original_action;
437
// Adjusted values overwrite existing keys and add any new ones.
438 // Apply adjusted parameters
439 for (const auto& [key, value] : refinement->adjusted_parameters) {
440 refined_action.parameters[key] = value;
441 }
442
443 return refined_action;
444}
445
// NOTE(review): the signature line of this function is missing from this
// listing, so its name and return type cannot be confirmed from here.
//
// Creates screenshots_dir_, including missing parent directories. The
// std::error_code overload is used, so a failure is reported through `ec`
// rather than thrown. The visible code only logs a warning to std::cerr on
// failure and then continues.
447 std::error_code ec;
448 std::filesystem::create_directories(screenshots_dir_, ec);
449
450 if (ec) {
451 std::cerr << "Warning: Failed to create screenshots directory: "
452 << ec.message() << std::endl;
453 }
454}
455
// NOTE(review): the line that opens this function's signature is missing from
// this listing; only the parameter onward is visible.
//
// Builds a screenshot path of the form
// "<screenshots_dir_>/ai_gui_<suffix>_<unix-millis>.png".
// `suffix` is inserted verbatim, with no escaping or sanitizing.
457 const std::string& suffix) {
458
// Current time in milliseconds since the Unix epoch.
459 int64_t timestamp = absl::ToUnixMillis(absl::Now());
460
461 std::string filename = absl::StrFormat(
462 "ai_gui_%s_%lld.png",
463 suffix,
// The cast matches the %lld conversion regardless of how int64_t is defined.
464 static_cast<long long>(timestamp)
465 );
466
467 return screenshots_dir_ / filename;
468}
469
470} // namespace ai
471} // namespace cli
472} // namespace yaze
Client for automating YAZE GUI through gRPC.
absl::StatusOr< AutomationResult > Type(const std::string &target, const std::string &text, bool clear_first=false)
Type text into an input field.
absl::StatusOr< AutomationResult > Wait(const std::string &condition, int timeout_ms=5000, int poll_interval_ms=100)
Wait for a condition to be met.
absl::StatusOr< AutomationResult > Assert(const std::string &condition)
Assert a GUI state condition.
absl::StatusOr< AutomationResult > Click(const std::string &target, ClickType type=ClickType::kLeft)
Click a GUI element.
static absl::StatusOr< std::vector< AIAction > > ParseCommand(const std::string &command)
std::unique_ptr< VisionActionRefiner > vision_refiner_
absl::StatusOr< AIAction > RefineActionWithVision(const AIAction &original_action, const VisionAnalysisResult &analysis)
absl::Status ExecuteGRPCAction(const AIAction &action)
absl::Status Initialize(const ControlLoopConfig &config)
Initialize the controller with configuration.
std::filesystem::path GenerateScreenshotPath(const std::string &suffix)
std::filesystem::path screenshots_dir_
absl::StatusOr< std::filesystem::path > CaptureCurrentState(const std::string &description)
AIGUIController(GeminiAIService *gemini_service, GuiAutomationClient *gui_client)
Construct controller with required services.
absl::StatusOr< ControlResult > ExecuteActions(const std::vector< ai::AIAction > &actions)
Execute a sequence of pre-parsed actions.
absl::StatusOr< ControlResult > ExecuteCommand(const std::string &command)
Execute a natural language command with AI vision guidance.
gui::GuiActionGenerator action_generator_
absl::StatusOr< VisionAnalysisResult > ExecuteSingleAction(const AIAction &action, bool verify_with_vision=true)
Execute a single action with optional vision verification.
GuiAutomationClient * gui_client_
const ControlLoopConfig & config() const
Get the current configuration.
absl::StatusOr< VisionAnalysisResult > AnalyzeCurrentGUIState(const std::string &context="")
Analyze the current GUI state without executing actions.
absl::StatusOr< VisionAnalysisResult > VerifyActionSuccess(const AIAction &action, const std::filesystem::path &before_screenshot, const std::filesystem::path &after_screenshot)
Uses Gemini Vision to analyze GUI screenshots and refine AI actions.
absl::StatusOr< std::string > GenerateTestScript(const std::vector< ai::AIAction > &actions)
ClickType
Type of click action to perform.
Main namespace for the application.
Definition controller.cc:20
Represents a single action to be performed in the GUI.
std::map< std::string, std::string > parameters
Configuration for the AI GUI control loop.
Result of AI-controlled GUI automation.
std::vector< std::filesystem::path > screenshots_taken
std::vector< ai::AIAction > actions_executed
std::vector< VisionAnalysisResult > vision_analyses
Result of analyzing a screenshot with Gemini Vision.