yaze 0.3.2
Link to the Past ROM Editor
 
Loading...
Searching...
No Matches
ai_gui_controller.cc
Go to the documentation of this file.
2
3#include <chrono>
4#include <thread>
5
6#include "absl/strings/str_cat.h"
7#include "absl/strings/str_format.h"
8#include "absl/time/clock.h"
9#include "absl/time/time.h"
11
12#ifdef YAZE_WITH_GRPC
15#endif
16
17namespace yaze {
18namespace cli {
19namespace ai {
20
22 GuiAutomationClient* gui_client)
23 : gemini_service_(gemini_service),
24 gui_client_(gui_client),
25 vision_refiner_(std::make_unique<VisionActionRefiner>(gemini_service)) {
26 if (!gemini_service_) {
27 throw std::invalid_argument("Gemini service cannot be null");
28 }
29
30 if (!gui_client_) {
31 throw std::invalid_argument("GUI client cannot be null");
32 }
33}
34
38
40
41 return absl::OkStatus();
42}
43
44absl::StatusOr<ControlResult> AIGUIController::ExecuteCommand(
45 const std::string& command) {
46 // Parse natural language command into actions
47 auto actions_result = AIActionParser::ParseCommand(command);
48 if (!actions_result.ok()) {
49 return actions_result.status();
50 }
51
52 return ExecuteActions(*actions_result);
53}
54
55absl::StatusOr<ControlResult> AIGUIController::ExecuteActions(
56 const std::vector<AIAction>& actions) {
57 ControlResult result;
58 result.success = false;
59
60 for (const auto& action : actions) {
61 int retry_count = 0;
62 bool action_succeeded = false;
63 AIAction current_action = action;
64
65 while (retry_count < config_.max_retries_per_action && !action_succeeded) {
66 result.iterations_performed++;
67
69 result.error_message = "Max iterations reached";
70 return result;
71 }
72
73 // Execute the action with vision verification
74 auto execute_result = ExecuteSingleAction(
75 current_action, config_.enable_vision_verification);
76
77 if (!execute_result.ok()) {
78 result.error_message = std::string(execute_result.status().message());
79 return result;
80 }
81
82 result.vision_analyses.push_back(*execute_result);
83 result.actions_executed.push_back(current_action);
84
85 if (execute_result->action_successful) {
86 action_succeeded = true;
88 // Refine action and retry
89 auto refinement =
90 vision_refiner_->RefineAction(current_action, *execute_result);
91
92 if (!refinement.ok()) {
93 result.error_message = absl::StrCat("Failed to refine action: ",
94 refinement.status().message());
95 return result;
96 }
97
98 if (refinement->needs_different_approach) {
99 result.error_message = absl::StrCat(
100 "Action requires different approach: ", refinement->reasoning);
101 return result;
102 }
103
104 if (refinement->needs_retry) {
105 // Update action parameters
106 for (const auto& [key, value] : refinement->adjusted_parameters) {
107 current_action.parameters[key] = value;
108 }
109 }
110
111 retry_count++;
112 } else {
113 // No refinement, just fail
114 result.error_message = execute_result->error_message;
115 return result;
116 }
117 }
118
119 if (!action_succeeded) {
120 result.error_message =
121 absl::StrFormat("Action failed after %d retries", retry_count);
122 return result;
123 }
124 }
125
126 result.success = true;
127
128 // Capture final state
129 auto final_screenshot = CaptureCurrentState("final_state");
130 if (final_screenshot.ok()) {
131 result.screenshots_taken.push_back(*final_screenshot);
132
133 // Analyze final state
134 auto final_analysis = vision_refiner_->AnalyzeScreenshot(
135 *final_screenshot, "Verify all actions completed successfully");
136
137 if (final_analysis.ok()) {
138 result.final_state_description = final_analysis->description;
139 }
140 }
141
142 return result;
143}
144
145absl::StatusOr<VisionAnalysisResult> AIGUIController::ExecuteSingleAction(
146 const AIAction& action, bool verify_with_vision) {
148
149 // Capture before screenshot
150 std::filesystem::path before_screenshot;
151 if (verify_with_vision) {
152 auto before_result = CaptureCurrentState("before_action");
153 if (!before_result.ok()) {
154 return before_result.status();
155 }
156 before_screenshot = *before_result;
157 }
158
159 // Wait for UI to settle
161 std::this_thread::sleep_for(
162 std::chrono::milliseconds(config_.screenshot_delay_ms));
163 }
164
165 // Execute the action via gRPC
166 auto execute_status = ExecuteGRPCAction(action);
167 if (!execute_status.ok()) {
168 result.action_successful = false;
169 result.error_message = std::string(execute_status.message());
170 return result;
171 }
172
173 // Wait for action to complete
174 std::this_thread::sleep_for(
175 std::chrono::milliseconds(config_.screenshot_delay_ms));
176
177 if (verify_with_vision) {
178 // Capture after screenshot
179 auto after_result = CaptureCurrentState("after_action");
180 if (!after_result.ok()) {
181 return after_result.status();
182 }
183
184 // Verify with vision
185 return VerifyActionSuccess(action, before_screenshot, *after_result);
186 } else {
187 // Assume success without verification
188 result.action_successful = true;
189 result.description = "Action executed (no vision verification)";
190 return result;
191 }
192}
193
194absl::StatusOr<VisionAnalysisResult> AIGUIController::AnalyzeCurrentGUIState(
195 const std::string& context) {
196 auto screenshot = CaptureCurrentState("analysis");
197 if (!screenshot.ok()) {
198 return screenshot.status();
199 }
200
201 return vision_refiner_->AnalyzeScreenshot(*screenshot, context);
202}
203
204// Private helper methods
205
206absl::StatusOr<std::filesystem::path> AIGUIController::CaptureCurrentState(
207 const std::string& description) {
208#ifdef YAZE_WITH_GRPC
209 std::filesystem::path path = GenerateScreenshotPath(description);
210
211 auto result = yaze::test::CaptureHarnessScreenshot(path.string());
212 if (!result.ok()) {
213 return result.status();
214 }
215
216 return std::filesystem::path(result->file_path);
217#else
218 return absl::UnimplementedError("Screenshot capture requires gRPC support");
219#endif
220}
221
222absl::Status AIGUIController::ExecuteGRPCAction(const AIAction& action) {
223 // Convert AI action to gRPC test commands
224 auto test_script_result = action_generator_.GenerateTestScript({action});
225
226 if (!test_script_result.ok()) {
227 return test_script_result.status();
228 }
229
230#ifdef YAZE_WITH_GRPC
231 if (!gui_client_) {
232 return absl::FailedPreconditionError(
233 "GUI automation client not initialized");
234 }
235
236 // Execute the action based on its type
237 if (action.type == AIActionType::kClickButton) {
238 // Extract target from parameters
239 std::string target = "button:Unknown";
240 if (action.parameters.count("target") > 0) {
241 target = action.parameters.at("target");
242 }
243
244 // Determine click type
245 ClickType click_type = ClickType::kLeft;
246 if (action.parameters.count("click_type") > 0) {
247 const std::string& type = action.parameters.at("click_type");
248 if (type == "right") {
249 click_type = ClickType::kRight;
250 } else if (type == "middle") {
251 click_type = ClickType::kMiddle;
252 } else if (type == "double") {
253 click_type = ClickType::kDouble;
254 }
255 }
256
257 auto result = gui_client_->Click(target, click_type);
258 if (!result.ok()) {
259 return result.status();
260 }
261
262 if (!result->success) {
263 return absl::InternalError(
264 absl::StrCat("Click action failed: ", result->message));
265 }
266
267 return absl::OkStatus();
268 } else if (action.type == AIActionType::kSelectTile) {
269 // Extract target and text from parameters (treating select as a type-like
270 // action)
271 std::string target = "input:Unknown";
272 std::string text = "";
273 bool clear_first = true;
274
275 if (action.parameters.count("target") > 0) {
276 target = action.parameters.at("target");
277 }
278 if (action.parameters.count("text") > 0) {
279 text = action.parameters.at("text");
280 }
281 if (action.parameters.count("clear_first") > 0) {
282 clear_first = (action.parameters.at("clear_first") == "true");
283 }
284
285 auto result = gui_client_->Type(target, text, clear_first);
286 if (!result.ok()) {
287 return result.status();
288 }
289
290 if (!result->success) {
291 return absl::InternalError(
292 absl::StrCat("Type action failed: ", result->message));
293 }
294
295 return absl::OkStatus();
296 } else if (action.type == AIActionType::kWait) {
297 // Extract condition and timeout from parameters
298 std::string condition = "visible";
299 int timeout_ms = 5000;
300 int poll_interval_ms = 100;
301
302 if (action.parameters.count("condition") > 0) {
303 condition = action.parameters.at("condition");
304 }
305 if (action.parameters.count("timeout_ms") > 0) {
306 timeout_ms = std::stoi(action.parameters.at("timeout_ms"));
307 }
308 if (action.parameters.count("poll_interval_ms") > 0) {
309 poll_interval_ms = std::stoi(action.parameters.at("poll_interval_ms"));
310 }
311
312 auto result = gui_client_->Wait(condition, timeout_ms, poll_interval_ms);
313 if (!result.ok()) {
314 return result.status();
315 }
316
317 if (!result->success) {
318 return absl::InternalError(
319 absl::StrCat("Wait action failed: ", result->message));
320 }
321
322 return absl::OkStatus();
323 } else if (action.type == AIActionType::kVerifyTile) {
324 // Extract condition from parameters (treating verify as assert)
325 std::string condition = "";
326 if (action.parameters.count("condition") > 0) {
327 condition = action.parameters.at("condition");
328 }
329
330 auto result = gui_client_->Assert(condition);
331 if (!result.ok()) {
332 return result.status();
333 }
334
335 if (!result->success) {
336 return absl::InternalError(absl::StrCat(
337 "Assert action failed: ", result->message, " (expected: ",
338 result->expected_value, ", actual: ", result->actual_value, ")"));
339 }
340
341 return absl::OkStatus();
342 } else if (action.type == AIActionType::kPlaceTile) {
343 // This is a special action for setting overworld tiles
344 // Extract map_id, x, y, tile from parameters
345 if (action.parameters.count("map_id") == 0 ||
346 action.parameters.count("x") == 0 ||
347 action.parameters.count("y") == 0 ||
348 action.parameters.count("tile") == 0) {
349 return absl::InvalidArgumentError(
350 "set_tile action requires map_id, x, y, and tile parameters");
351 }
352
353 int map_id = std::stoi(action.parameters.at("map_id"));
354 int x = std::stoi(action.parameters.at("x"));
355 int y = std::stoi(action.parameters.at("y"));
356 std::string tile_str = action.parameters.at("tile");
357
358 // Navigate to overworld editor
359 auto click_result = gui_client_->Click("menu:Overworld", ClickType::kLeft);
360 if (!click_result.ok() || !click_result->success) {
361 return absl::InternalError("Failed to open Overworld editor");
362 }
363
364 // Wait for overworld editor to be visible
365 auto wait_result = gui_client_->Wait("window:Overworld Editor", 2000, 100);
366 if (!wait_result.ok() || !wait_result->success) {
367 return absl::InternalError("Overworld editor did not appear");
368 }
369
370 // Set the map ID
371 auto type_result =
372 gui_client_->Type("input:Map ID", std::to_string(map_id), true);
373 if (!type_result.ok() || !type_result->success) {
374 return absl::InternalError("Failed to set map ID");
375 }
376
377 // Click on the tile position (approximate based on editor layout)
378 // This is a simplified implementation
379 std::string target = absl::StrCat("canvas:overworld@", x * 16, ",", y * 16);
380 click_result = gui_client_->Click(target, ClickType::kLeft);
381 if (!click_result.ok() || !click_result->success) {
382 return absl::InternalError("Failed to click tile position");
383 }
384
385 return absl::OkStatus();
386 } else {
387 return absl::UnimplementedError(absl::StrCat(
388 "Action type not implemented: ", static_cast<int>(action.type)));
389 }
390#else
391 return absl::UnimplementedError(
392 "gRPC GUI automation requires building with -DYAZE_WITH_GRPC=ON");
393#endif
394}
395
396absl::StatusOr<VisionAnalysisResult> AIGUIController::VerifyActionSuccess(
397 const AIAction& action, const std::filesystem::path& before_screenshot,
398 const std::filesystem::path& after_screenshot) {
399 return vision_refiner_->VerifyAction(action, before_screenshot,
400 after_screenshot);
401}
402
404 const AIAction& original_action, const VisionAnalysisResult& analysis) {
405 auto refinement = vision_refiner_->RefineAction(original_action, analysis);
406 if (!refinement.ok()) {
407 return refinement.status();
408 }
409
410 AIAction refined_action = original_action;
411
412 // Apply adjusted parameters
413 for (const auto& [key, value] : refinement->adjusted_parameters) {
414 refined_action.parameters[key] = value;
415 }
416
417 return refined_action;
418}
419
421 std::error_code ec;
422 std::filesystem::create_directories(screenshots_dir_, ec);
423
424 if (ec) {
425 std::cerr << "Warning: Failed to create screenshots directory: "
426 << ec.message() << std::endl;
427 }
428}
429
431 const std::string& suffix) {
432 int64_t timestamp = absl::ToUnixMillis(absl::Now());
433
434 std::string filename = absl::StrFormat("ai_gui_%s_%lld.png", suffix,
435 static_cast<long long>(timestamp));
436
437 return screenshots_dir_ / filename;
438}
439
440} // namespace ai
441} // namespace cli
442} // namespace yaze
Client for automating YAZE GUI through gRPC.
absl::StatusOr< AutomationResult > Type(const std::string &target, const std::string &text, bool clear_first=false)
Type text into an input field.
absl::StatusOr< AutomationResult > Wait(const std::string &condition, int timeout_ms=5000, int poll_interval_ms=100)
Wait for a condition to be met.
absl::StatusOr< AutomationResult > Assert(const std::string &condition)
Assert a GUI state condition.
absl::StatusOr< AutomationResult > Click(const std::string &target, ClickType type=ClickType::kLeft)
Click a GUI element.
static absl::StatusOr< std::vector< AIAction > > ParseCommand(const std::string &command)
std::unique_ptr< VisionActionRefiner > vision_refiner_
absl::StatusOr< AIAction > RefineActionWithVision(const AIAction &original_action, const VisionAnalysisResult &analysis)
absl::Status ExecuteGRPCAction(const AIAction &action)
absl::Status Initialize(const ControlLoopConfig &config)
Initialize the controller with configuration.
std::filesystem::path GenerateScreenshotPath(const std::string &suffix)
std::filesystem::path screenshots_dir_
absl::StatusOr< std::filesystem::path > CaptureCurrentState(const std::string &description)
AIGUIController(GeminiAIService *gemini_service, GuiAutomationClient *gui_client)
Construct controller with required services.
absl::StatusOr< ControlResult > ExecuteActions(const std::vector< ai::AIAction > &actions)
Execute a sequence of pre-parsed actions.
absl::StatusOr< ControlResult > ExecuteCommand(const std::string &command)
Execute a natural language command with AI vision guidance.
gui::GuiActionGenerator action_generator_
absl::StatusOr< VisionAnalysisResult > ExecuteSingleAction(const AIAction &action, bool verify_with_vision=true)
Execute a single action with optional vision verification.
GuiAutomationClient * gui_client_
const ControlLoopConfig & config() const
Get the current configuration.
absl::StatusOr< VisionAnalysisResult > AnalyzeCurrentGUIState(const std::string &context="")
Analyze the current GUI state without executing actions.
absl::StatusOr< VisionAnalysisResult > VerifyActionSuccess(const AIAction &action, const std::filesystem::path &before_screenshot, const std::filesystem::path &after_screenshot)
Uses Gemini Vision to analyze GUI screenshots and refine AI actions.
absl::StatusOr< std::string > GenerateTestScript(const std::vector< ai::AIAction > &actions)
ClickType
Type of click action to perform.
Represents a single action to be performed in the GUI.
std::map< std::string, std::string > parameters
Configuration for the AI GUI control loop.
Result of AI-controlled GUI automation.
std::vector< std::filesystem::path > screenshots_taken
std::vector< ai::AIAction > actions_executed
std::vector< VisionAnalysisResult > vision_analyses
Result of analyzing a screenshot with Gemini Vision.