6#include "absl/strings/str_cat.h"
7#include "absl/strings/str_split.h"
8#include "absl/strings/string_view.h"
16 : gemini_service_(gemini_service) {
18 throw std::invalid_argument(
"Gemini service cannot be null");
23 const std::filesystem::path& screenshot_path,
const std::string& context) {
24 if (!std::filesystem::exists(screenshot_path)) {
25 return absl::NotFoundError(
26 absl::StrCat(
"Screenshot not found: ", screenshot_path.string()));
32 screenshot_path.string(), prompt);
35 return response.status();
42 const AIAction& action,
const std::filesystem::path& before_screenshot,
43 const std::filesystem::path& after_screenshot) {
44 if (!std::filesystem::exists(before_screenshot)) {
45 return absl::NotFoundError(
"Before screenshot not found");
48 if (!std::filesystem::exists(after_screenshot)) {
49 return absl::NotFoundError(
"After screenshot not found");
56 after_screenshot.string(), verification_prompt);
58 if (!after_response.ok()) {
59 return after_response.status();
76 std::transform(error_lower.begin(), error_lower.end(), error_lower.begin(),
79 if (error_lower.find(
"not found") != std::string::npos ||
80 error_lower.find(
"missing") != std::string::npos) {
83 "UI element not found, may need to open different editor";
84 }
else if (error_lower.find(
"wrong") != std::string::npos ||
85 error_lower.find(
"incorrect") != std::string::npos) {
88 "Action executed on wrong element, adjusting parameters";
91 for (
const auto& suggestion : analysis.
suggestions) {
94 if (suggestion.find(
"position") != std::string::npos) {
96 size_t pos = suggestion.find(
'(');
97 if (pos != std::string::npos) {
98 size_t end = suggestion.find(
')', pos);
99 if (end != std::string::npos) {
100 std::string coords = suggestion.substr(pos + 1, end - pos - 1);
101 std::vector<std::string> parts = absl::StrSplit(coords,
',');
102 if (parts.size() == 2) {
104 std::string(absl::StripAsciiWhitespace(parts[0]));
106 std::string(absl::StripAsciiWhitespace(parts[1]));
114 refinement.
reasoning =
"Generic failure, will retry with same parameters";
120absl::StatusOr<std::map<std::string, std::string>>
122 const std::filesystem::path& screenshot_path,
123 const std::string& element_name) {
127 screenshot_path.string(), prompt);
129 if (!response.ok()) {
130 return response.status();
133 std::map<std::string, std::string> location;
138 std::string text = response->text_response;
139 std::transform(text.begin(), text.end(), text.begin(), ::tolower);
141 if (text.find(
"not found") != std::string::npos ||
142 text.find(
"not visible") != std::string::npos) {
143 location[
"found"] =
"false";
144 location[
"description"] = response->text_response;
146 location[
"found"] =
"true";
147 location[
"description"] = response->text_response;
150 size_t pos = text.find(
'(');
151 if (pos != std::string::npos) {
152 size_t end = text.find(
')', pos);
153 if (end != std::string::npos) {
154 std::string coords = text.substr(pos + 1, end - pos - 1);
155 std::vector<std::string> parts = absl::StrSplit(coords,
',');
156 if (parts.size() == 2) {
157 location[
"x"] = std::string(absl::StripAsciiWhitespace(parts[0]));
158 location[
"y"] = std::string(absl::StripAsciiWhitespace(parts[1]));
167absl::StatusOr<std::vector<std::string>>
169 const std::filesystem::path& screenshot_path) {
173 screenshot_path.string(), prompt);
175 if (!response.ok()) {
176 return response.status();
180 std::vector<std::string> widgets;
181 std::stringstream ss(response->text_response);
184 while (std::getline(ss, line)) {
187 line.find_first_not_of(
" \t\n\r") == std::string::npos) {
193 if (line[0] ==
'-' || line[0] ==
'*') {
195 }
else if (std::isdigit(line[0])) {
196 start = line.find(
'.');
197 if (start != std::string::npos) {
204 absl::string_view widget_view =
205 absl::StripAsciiWhitespace(absl::string_view(line).substr(start));
207 if (!widget_view.empty()) {
208 widgets.push_back(std::string(widget_view));
218 const std::string& context) {
219 std::string base_prompt =
220 "Analyze this screenshot of the YAZE ROM editor GUI. "
221 "Identify all visible UI elements, windows, and widgets. "
222 "List them in order of importance.";
224 if (!context.empty()) {
225 return absl::StrCat(base_prompt,
"\n\nContext: ", context);
236 "This screenshot was taken after attempting to perform the following "
239 "\n\nDid the action succeed? Look for visual evidence that the action "
242 "SUCCESS: <description of what changed>\n"
244 "FAILURE: <description of what went wrong>");
248 const std::string& element_name) {
249 return absl::StrCat(
"Locate the '", element_name,
250 "' UI element in this screenshot. "
251 "If found, describe its position (coordinates if "
252 "possible, or relative position). "
253 "If not found, state 'NOT FOUND'.");
257 return "List all visible UI widgets, buttons, windows, and interactive "
259 "in this screenshot. Format as a bulleted list, one element per line.";
263 const std::string& response) {
269 std::stringstream ss(response);
272 while (std::getline(ss, line)) {
274 std::string lower = line;
275 std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
277 if (lower.find(
"button") != std::string::npos ||
278 lower.find(
"window") != std::string::npos ||
279 lower.find(
"panel") != std::string::npos ||
280 lower.find(
"selector") != std::string::npos ||
281 lower.find(
"editor") != std::string::npos) {
282 result.
widgets.push_back(std::string(absl::StripAsciiWhitespace(line)));
286 if (lower.find(
"suggest") != std::string::npos ||
287 lower.find(
"try") != std::string::npos ||
288 lower.find(
"could") != std::string::npos) {
290 std::string(absl::StripAsciiWhitespace(line)));
298 const std::string& response,
const AIAction& action) {
302 std::string response_upper = response;
303 std::transform(response_upper.begin(), response_upper.end(),
304 response_upper.begin(), ::toupper);
306 if (response_upper.find(
"SUCCESS") != std::string::npos) {
310 size_t pos = response_upper.find(
"SUCCESS:");
311 if (pos != std::string::npos) {
312 std::string desc = response.substr(pos + 8);
313 result.
description = std::string(absl::StripAsciiWhitespace(desc));
315 }
else if (response_upper.find(
"FAILURE") != std::string::npos) {
319 size_t pos = response_upper.find(
"FAILURE:");
320 if (pos != std::string::npos) {
321 std::string desc = response.substr(pos + 8);
324 result.
error_message =
"Action failed (details in description)";
330 "Could not determine action success from vision analysis";
absl::StatusOr< AgentResponse > GenerateMultimodalResponse(const std::string &, const std::string &)
static std::string ActionToString(const AIAction &action)
std::string BuildWidgetExtractionPrompt()
VisionActionRefiner(GeminiAIService *gemini_service)
Construct refiner with Gemini service.
VisionAnalysisResult ParseAnalysisResponse(const std::string &response)
absl::StatusOr< VisionAnalysisResult > VerifyAction(const AIAction &action, const std::filesystem::path &before_screenshot, const std::filesystem::path &after_screenshot)
Verify an action was successful by comparing before/after screenshots.
GeminiAIService * gemini_service_
std::string BuildVerificationPrompt(const AIAction &action)
absl::StatusOr< std::vector< std::string > > ExtractVisibleWidgets(const std::filesystem::path &screenshot_path)
Extract all visible widgets from a screenshot.
absl::StatusOr< ActionRefinement > RefineAction(const AIAction &original_action, const VisionAnalysisResult &analysis)
Refine an action based on vision analysis feedback.
absl::StatusOr< std::map< std::string, std::string > > LocateUIElement(const std::filesystem::path &screenshot_path, const std::string &element_name)
Find a specific UI element in a screenshot.
std::string BuildAnalysisPrompt(const std::string &context)
std::string BuildElementLocationPrompt(const std::string &element_name)
absl::StatusOr< VisionAnalysisResult > AnalyzeScreenshot(const std::filesystem::path &screenshot_path, const std::string &context="")
Analyze the current GUI state from a screenshot.
VisionAnalysisResult ParseVerificationResponse(const std::string &response, const AIAction &action)
Represents a single action to be performed in the GUI.
Refined action parameters based on vision analysis.
std::map< std::string, std::string > adjusted_parameters
bool needs_different_approach
Result of analyzing a screenshot with Gemini Vision.
std::vector< std::string > widgets
std::vector< std::string > suggestions
std::string error_message