6#include "absl/strings/str_cat.h"
7#include "absl/strings/str_format.h"
8#include "absl/time/clock.h"
9#include "absl/time/time.h"
23 : gemini_service_(gemini_service),
24 gui_client_(gui_client),
28 throw std::invalid_argument(
"Gemini service cannot be null");
32 throw std::invalid_argument(
"GUI client cannot be null");
42 return absl::OkStatus();
46 const std::string& command) {
50 if (!actions_result.ok()) {
51 return actions_result.status();
58 const std::vector<AIAction>& actions) {
63 for (
const auto& action : actions) {
65 bool action_succeeded =
false;
82 if (!execute_result.ok()) {
83 result.
error_message = std::string(execute_result.status().message());
90 if (execute_result->action_successful) {
91 action_succeeded =
true;
100 if (!refinement.ok()) {
102 absl::StrCat(
"Failed to refine action: ",
103 refinement.status().message());
107 if (refinement->needs_different_approach) {
109 absl::StrCat(
"Action requires different approach: ",
110 refinement->reasoning);
114 if (refinement->needs_retry) {
116 for (
const auto& [key, value] : refinement->adjusted_parameters) {
130 if (!action_succeeded) {
132 absl::StrFormat(
"Action failed after %d retries", retry_count);
141 if (final_screenshot.ok()) {
147 "Verify all actions completed successfully"
150 if (final_analysis.ok()) {
160 bool verify_with_vision) {
165 std::filesystem::path before_screenshot;
166 if (verify_with_vision) {
168 if (!before_result.ok()) {
169 return before_result.status();
171 before_screenshot = *before_result;
176 std::this_thread::sleep_for(
182 if (!execute_status.ok()) {
184 result.
error_message = std::string(execute_status.message());
189 std::this_thread::sleep_for(
192 if (verify_with_vision) {
195 if (!after_result.ok()) {
196 return after_result.status();
205 result.
description =
"Action executed (no vision verification)";
211 const std::string& context) {
214 if (!screenshot.ok()) {
215 return screenshot.status();
224 const std::string& description) {
229 auto result = yaze::test::CaptureHarnessScreenshot(path.string());
231 return result.status();
234 return std::filesystem::path(result->file_path);
236 return absl::UnimplementedError(
"Screenshot capture requires gRPC support");
244 if (!test_script_result.ok()) {
245 return test_script_result.status();
250 return absl::FailedPreconditionError(
"GUI automation client not initialized");
256 std::string target =
"button:Unknown";
263 if (action.
parameters.count(
"click_type") > 0) {
264 const std::string& type = action.
parameters.at(
"click_type");
265 if (type ==
"right") {
267 }
else if (type ==
"middle") {
269 }
else if (type ==
"double") {
276 return result.status();
279 if (!result->success) {
280 return absl::InternalError(
281 absl::StrCat(
"Click action failed: ", result->message));
284 return absl::OkStatus();
288 std::string target =
"input:Unknown";
289 std::string text =
"";
290 bool clear_first =
true;
298 if (action.
parameters.count(
"clear_first") > 0) {
299 clear_first = (action.
parameters.at(
"clear_first") ==
"true");
304 return result.status();
307 if (!result->success) {
308 return absl::InternalError(
309 absl::StrCat(
"Type action failed: ", result->message));
312 return absl::OkStatus();
316 std::string condition =
"visible";
317 int timeout_ms = 5000;
318 int poll_interval_ms = 100;
320 if (action.
parameters.count(
"condition") > 0) {
321 condition = action.
parameters.at(
"condition");
323 if (action.
parameters.count(
"timeout_ms") > 0) {
324 timeout_ms = std::stoi(action.
parameters.at(
"timeout_ms"));
326 if (action.
parameters.count(
"poll_interval_ms") > 0) {
327 poll_interval_ms = std::stoi(action.
parameters.at(
"poll_interval_ms"));
330 auto result =
gui_client_->
Wait(condition, timeout_ms, poll_interval_ms);
332 return result.status();
335 if (!result->success) {
336 return absl::InternalError(
337 absl::StrCat(
"Wait action failed: ", result->message));
340 return absl::OkStatus();
344 std::string condition =
"";
345 if (action.
parameters.count(
"condition") > 0) {
346 condition = action.
parameters.at(
"condition");
351 return result.status();
354 if (!result->success) {
355 return absl::InternalError(
356 absl::StrCat(
"Assert action failed: ", result->message,
357 " (expected: ", result->expected_value,
358 ", actual: ", result->actual_value,
")"));
361 return absl::OkStatus();
370 return absl::InvalidArgumentError(
371 "set_tile action requires map_id, x, y, and tile parameters");
374 int map_id = std::stoi(action.
parameters.at(
"map_id"));
377 std::string tile_str = action.
parameters.at(
"tile");
381 if (!click_result.ok() || !click_result->success) {
382 return absl::InternalError(
"Failed to open Overworld editor");
386 auto wait_result =
gui_client_->
Wait(
"window:Overworld Editor", 2000, 100);
387 if (!wait_result.ok() || !wait_result->success) {
388 return absl::InternalError(
"Overworld editor did not appear");
392 auto type_result =
gui_client_->
Type(
"input:Map ID", std::to_string(map_id),
true);
393 if (!type_result.ok() || !type_result->success) {
394 return absl::InternalError(
"Failed to set map ID");
399 std::string target = absl::StrCat(
"canvas:overworld@", x * 16,
",", y * 16);
401 if (!click_result.ok() || !click_result->success) {
402 return absl::InternalError(
"Failed to click tile position");
405 return absl::OkStatus();
408 return absl::UnimplementedError(
409 absl::StrCat(
"Action type not implemented: ",
410 static_cast<int>(action.
type)));
413 return absl::UnimplementedError(
414 "gRPC GUI automation requires building with -DYAZE_WITH_GRPC=ON");
421 const std::filesystem::path& before_screenshot,
422 const std::filesystem::path& after_screenshot) {
424 return vision_refiner_->VerifyAction(action, before_screenshot, after_screenshot);
431 auto refinement =
vision_refiner_->RefineAction(original_action, analysis);
432 if (!refinement.ok()) {
433 return refinement.status();
436 AIAction refined_action = original_action;
439 for (
const auto& [key, value] : refinement->adjusted_parameters) {
443 return refined_action;
451 std::cerr <<
"Warning: Failed to create screenshots directory: "
452 << ec.message() << std::endl;
457 const std::string& suffix) {
459 int64_t timestamp = absl::ToUnixMillis(absl::Now());
461 std::string filename = absl::StrFormat(
462 "ai_gui_%s_%lld.png",
464 static_cast<long long>(timestamp)
Client for automating YAZE GUI through gRPC.
absl::StatusOr< AutomationResult > Type(const std::string &target, const std::string &text, bool clear_first=false)
Type text into an input field.
absl::StatusOr< AutomationResult > Wait(const std::string &condition, int timeout_ms=5000, int poll_interval_ms=100)
Wait for a condition to be met.
absl::StatusOr< AutomationResult > Assert(const std::string &condition)
Assert a GUI state condition.
absl::StatusOr< AutomationResult > Click(const std::string &target, ClickType type=ClickType::kLeft)
Click a GUI element.
static absl::StatusOr< std::vector< AIAction > > ParseCommand(const std::string &command)
std::unique_ptr< VisionActionRefiner > vision_refiner_
absl::StatusOr< AIAction > RefineActionWithVision(const AIAction &original_action, const VisionAnalysisResult &analysis)
absl::Status ExecuteGRPCAction(const AIAction &action)
GeminiAIService * gemini_service_
absl::Status Initialize(const ControlLoopConfig &config)
Initialize the controller with configuration.
std::filesystem::path GenerateScreenshotPath(const std::string &suffix)
std::filesystem::path screenshots_dir_
absl::StatusOr< std::filesystem::path > CaptureCurrentState(const std::string &description)
AIGUIController(GeminiAIService *gemini_service, GuiAutomationClient *gui_client)
Construct controller with required services.
absl::StatusOr< ControlResult > ExecuteActions(const std::vector< ai::AIAction > &actions)
Execute a sequence of pre-parsed actions.
absl::StatusOr< ControlResult > ExecuteCommand(const std::string &command)
Execute a natural language command with AI vision guidance.
gui::GuiActionGenerator action_generator_
absl::StatusOr< VisionAnalysisResult > ExecuteSingleAction(const AIAction &action, bool verify_with_vision=true)
Execute a single action with optional vision verification.
GuiAutomationClient * gui_client_
const ControlLoopConfig & config() const
Get the current configuration.
ControlLoopConfig config_
absl::StatusOr< VisionAnalysisResult > AnalyzeCurrentGUIState(const std::string &context="")
Analyze the current GUI state without executing actions.
void EnsureScreenshotsDirectory()
absl::StatusOr< VisionAnalysisResult > VerifyActionSuccess(const AIAction &action, const std::filesystem::path &before_screenshot, const std::filesystem::path &after_screenshot)
Uses Gemini Vision to analyze GUI screenshots and refine AI actions.
absl::StatusOr< std::string > GenerateTestScript(const std::vector< ai::AIAction > &actions)
ClickType
Type of click action to perform.
Main namespace for the application.
Represents a single action to be performed in the GUI.
std::map< std::string, std::string > parameters
Configuration for the AI GUI control loop.
std::string screenshots_dir
bool enable_vision_verification
bool enable_iterative_refinement
int max_retries_per_action
Result of AI-controlled GUI automation.
std::vector< std::filesystem::path > screenshots_taken
std::vector< ai::AIAction > actions_executed
std::vector< VisionAnalysisResult > vision_analyses
std::string error_message
std::string final_state_description
Result of analyzing a screenshot with Gemini Vision.
std::string error_message