6#include "absl/strings/str_cat.h"
7#include "absl/strings/str_split.h"
8#include "absl/strings/string_view.h"
16 : gemini_service_(gemini_service) {
18 throw std::invalid_argument(
"Gemini service cannot be null");
23 const std::filesystem::path& screenshot_path,
24 const std::string& context) {
26 if (!std::filesystem::exists(screenshot_path)) {
27 return absl::NotFoundError(
28 absl::StrCat(
"Screenshot not found: ", screenshot_path.string()));
34 screenshot_path.string(),
39 return response.status();
47 const std::filesystem::path& before_screenshot,
48 const std::filesystem::path& after_screenshot) {
50 if (!std::filesystem::exists(before_screenshot)) {
51 return absl::NotFoundError(
"Before screenshot not found");
54 if (!std::filesystem::exists(after_screenshot)) {
55 return absl::NotFoundError(
"After screenshot not found");
62 after_screenshot.string(),
66 if (!after_response.ok()) {
67 return after_response.status();
86 std::transform(error_lower.begin(), error_lower.end(),
87 error_lower.begin(), ::tolower);
89 if (error_lower.find(
"not found") != std::string::npos ||
90 error_lower.find(
"missing") != std::string::npos) {
92 refinement.
reasoning =
"UI element not found, may need to open different editor";
94 else if (error_lower.find(
"wrong") != std::string::npos ||
95 error_lower.find(
"incorrect") != std::string::npos) {
97 refinement.
reasoning =
"Action executed on wrong element, adjusting parameters";
100 for (
const auto& suggestion : analysis.
suggestions) {
103 if (suggestion.find(
"position") != std::string::npos) {
105 size_t pos = suggestion.find(
'(');
106 if (pos != std::string::npos) {
107 size_t end = suggestion.find(
')', pos);
108 if (end != std::string::npos) {
109 std::string coords = suggestion.substr(pos + 1, end - pos - 1);
110 std::vector<std::string> parts = absl::StrSplit(coords,
',');
111 if (parts.size() == 2) {
113 std::string(absl::StripAsciiWhitespace(parts[0]));
115 std::string(absl::StripAsciiWhitespace(parts[1]));
124 refinement.
reasoning =
"Generic failure, will retry with same parameters";
130absl::StatusOr<std::map<std::string, std::string>>
132 const std::filesystem::path& screenshot_path,
133 const std::string& element_name) {
138 screenshot_path.string(),
142 if (!response.ok()) {
143 return response.status();
146 std::map<std::string, std::string> location;
151 std::string text = response->text_response;
152 std::transform(text.begin(), text.end(), text.begin(), ::tolower);
154 if (text.find(
"not found") != std::string::npos ||
155 text.find(
"not visible") != std::string::npos) {
156 location[
"found"] =
"false";
157 location[
"description"] = response->text_response;
159 location[
"found"] =
"true";
160 location[
"description"] = response->text_response;
163 size_t pos = text.find(
'(');
164 if (pos != std::string::npos) {
165 size_t end = text.find(
')', pos);
166 if (end != std::string::npos) {
167 std::string coords = text.substr(pos + 1, end - pos - 1);
168 std::vector<std::string> parts = absl::StrSplit(coords,
',');
169 if (parts.size() == 2) {
170 location[
"x"] = std::string(absl::StripAsciiWhitespace(parts[0]));
171 location[
"y"] = std::string(absl::StripAsciiWhitespace(parts[1]));
180absl::StatusOr<std::vector<std::string>>
182 const std::filesystem::path& screenshot_path) {
187 screenshot_path.string(),
191 if (!response.ok()) {
192 return response.status();
196 std::vector<std::string> widgets;
197 std::stringstream ss(response->text_response);
200 while (std::getline(ss, line)) {
202 if (line.empty() || line.find_first_not_of(
" \t\n\r") == std::string::npos) {
208 if (line[0] ==
'-' || line[0] ==
'*') {
210 }
else if (std::isdigit(line[0])) {
211 start = line.find(
'.');
212 if (start != std::string::npos) {
219 absl::string_view widget_view = absl::StripAsciiWhitespace(
220 absl::string_view(line).substr(start));
222 if (!widget_view.empty()) {
223 widgets.push_back(std::string(widget_view));
233 std::string base_prompt =
234 "Analyze this screenshot of the YAZE ROM editor GUI. "
235 "Identify all visible UI elements, windows, and widgets. "
236 "List them in order of importance.";
238 if (!context.empty()) {
239 return absl::StrCat(base_prompt,
"\n\nContext: ", context);
249 "This screenshot was taken after attempting to perform the following action: ",
251 "\n\nDid the action succeed? Look for visual evidence that the action completed. "
253 "SUCCESS: <description of what changed>\n"
255 "FAILURE: <description of what went wrong>"
260 const std::string& element_name) {
262 "Locate the '", element_name,
"' UI element in this screenshot. "
263 "If found, describe its position (coordinates if possible, or relative position). "
264 "If not found, state 'NOT FOUND'."
270 "List all visible UI widgets, buttons, windows, and interactive elements "
271 "in this screenshot. Format as a bulleted list, one element per line.";
275 const std::string& response) {
282 std::stringstream ss(response);
285 while (std::getline(ss, line)) {
287 std::string lower = line;
288 std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
290 if (lower.find(
"button") != std::string::npos ||
291 lower.find(
"window") != std::string::npos ||
292 lower.find(
"panel") != std::string::npos ||
293 lower.find(
"selector") != std::string::npos ||
294 lower.find(
"editor") != std::string::npos) {
295 result.
widgets.push_back(std::string(absl::StripAsciiWhitespace(line)));
299 if (lower.find(
"suggest") != std::string::npos ||
300 lower.find(
"try") != std::string::npos ||
301 lower.find(
"could") != std::string::npos) {
302 result.
suggestions.push_back(std::string(absl::StripAsciiWhitespace(line)));
310 const std::string& response,
316 std::string response_upper = response;
317 std::transform(response_upper.begin(), response_upper.end(),
318 response_upper.begin(), ::toupper);
320 if (response_upper.find(
"SUCCESS") != std::string::npos) {
324 size_t pos = response_upper.find(
"SUCCESS:");
325 if (pos != std::string::npos) {
326 std::string desc = response.substr(pos + 8);
327 result.
description = std::string(absl::StripAsciiWhitespace(desc));
330 else if (response_upper.find(
"FAILURE") != std::string::npos) {
334 size_t pos = response_upper.find(
"FAILURE:");
335 if (pos != std::string::npos) {
336 std::string desc = response.substr(pos + 8);
339 result.
error_message =
"Action failed (details in description)";
345 result.
error_message =
"Could not determine action success from vision analysis";
absl::StatusOr< AgentResponse > GenerateMultimodalResponse(const std::string &image_path, const std::string &prompt)
static std::string ActionToString(const AIAction &action)
std::string BuildWidgetExtractionPrompt()
VisionActionRefiner(GeminiAIService *gemini_service)
Construct refiner with Gemini service.
VisionAnalysisResult ParseAnalysisResponse(const std::string &response)
absl::StatusOr< VisionAnalysisResult > VerifyAction(const AIAction &action, const std::filesystem::path &before_screenshot, const std::filesystem::path &after_screenshot)
Verify an action was successful by comparing before/after screenshots.
GeminiAIService * gemini_service_
std::string BuildVerificationPrompt(const AIAction &action)
absl::StatusOr< std::vector< std::string > > ExtractVisibleWidgets(const std::filesystem::path &screenshot_path)
Extract all visible widgets from a screenshot.
absl::StatusOr< ActionRefinement > RefineAction(const AIAction &original_action, const VisionAnalysisResult &analysis)
Refine an action based on vision analysis feedback.
absl::StatusOr< std::map< std::string, std::string > > LocateUIElement(const std::filesystem::path &screenshot_path, const std::string &element_name)
Find a specific UI element in a screenshot.
std::string BuildAnalysisPrompt(const std::string &context)
std::string BuildElementLocationPrompt(const std::string &element_name)
absl::StatusOr< VisionAnalysisResult > AnalyzeScreenshot(const std::filesystem::path &screenshot_path, const std::string &context="")
Analyze the current GUI state from a screenshot.
VisionAnalysisResult ParseVerificationResponse(const std::string &response, const AIAction &action)
Main namespace for the application.
Represents a single action to be performed in the GUI.
Refined action parameters based on vision analysis.
std::map< std::string, std::string > adjusted_parameters
bool needs_different_approach
Result of analyzing a screenshot with Gemini Vision.
std::vector< std::string > widgets
std::vector< std::string > suggestions
std::string error_message