yaze 0.3.2
Link to the Past ROM Editor
 
Loading...
Searching...
No Matches
vision_action_refiner.cc
Go to the documentation of this file.
2
3#include <algorithm>
4#include <sstream>
5
6#include "absl/strings/str_cat.h"
7#include "absl/strings/str_split.h"
8#include "absl/strings/string_view.h"
10
11namespace yaze {
12namespace cli {
13namespace ai {
14
16 : gemini_service_(gemini_service) {
17 if (!gemini_service_) {
18 throw std::invalid_argument("Gemini service cannot be null");
19 }
20}
21
22absl::StatusOr<VisionAnalysisResult> VisionActionRefiner::AnalyzeScreenshot(
23 const std::filesystem::path& screenshot_path, const std::string& context) {
24 if (!std::filesystem::exists(screenshot_path)) {
25 return absl::NotFoundError(
26 absl::StrCat("Screenshot not found: ", screenshot_path.string()));
27 }
28
29 std::string prompt = BuildAnalysisPrompt(context);
30
32 screenshot_path.string(), prompt);
33
34 if (!response.ok()) {
35 return response.status();
36 }
37
38 return ParseAnalysisResponse(response->text_response);
39}
40
41absl::StatusOr<VisionAnalysisResult> VisionActionRefiner::VerifyAction(
42 const AIAction& action, const std::filesystem::path& before_screenshot,
43 const std::filesystem::path& after_screenshot) {
44 if (!std::filesystem::exists(before_screenshot)) {
45 return absl::NotFoundError("Before screenshot not found");
46 }
47
48 if (!std::filesystem::exists(after_screenshot)) {
49 return absl::NotFoundError("After screenshot not found");
50 }
51
52 // First, analyze the after screenshot
53 std::string verification_prompt = BuildVerificationPrompt(action);
54
55 auto after_response = gemini_service_->GenerateMultimodalResponse(
56 after_screenshot.string(), verification_prompt);
57
58 if (!after_response.ok()) {
59 return after_response.status();
60 }
61
62 return ParseVerificationResponse(after_response->text_response, action);
63}
64
65absl::StatusOr<ActionRefinement> VisionActionRefiner::RefineAction(
66 const AIAction& original_action, const VisionAnalysisResult& analysis) {
67 ActionRefinement refinement;
68
69 // If action was successful, no refinement needed
70 if (analysis.action_successful) {
71 return refinement;
72 }
73
74 // Determine refinement strategy based on error
75 std::string error_lower = analysis.error_message;
76 std::transform(error_lower.begin(), error_lower.end(), error_lower.begin(),
77 ::tolower);
78
79 if (error_lower.find("not found") != std::string::npos ||
80 error_lower.find("missing") != std::string::npos) {
81 refinement.needs_different_approach = true;
82 refinement.reasoning =
83 "UI element not found, may need to open different editor";
84 } else if (error_lower.find("wrong") != std::string::npos ||
85 error_lower.find("incorrect") != std::string::npos) {
86 refinement.needs_retry = true;
87 refinement.reasoning =
88 "Action executed on wrong element, adjusting parameters";
89
90 // Try to extract corrected parameters from suggestions
91 for (const auto& suggestion : analysis.suggestions) {
92 // Parse suggestions for parameter corrections
93 // e.g., "Try position (6, 8) instead"
94 if (suggestion.find("position") != std::string::npos) {
95 // Extract coordinates
96 size_t pos = suggestion.find('(');
97 if (pos != std::string::npos) {
98 size_t end = suggestion.find(')', pos);
99 if (end != std::string::npos) {
100 std::string coords = suggestion.substr(pos + 1, end - pos - 1);
101 std::vector<std::string> parts = absl::StrSplit(coords, ',');
102 if (parts.size() == 2) {
103 refinement.adjusted_parameters["x"] =
104 std::string(absl::StripAsciiWhitespace(parts[0]));
105 refinement.adjusted_parameters["y"] =
106 std::string(absl::StripAsciiWhitespace(parts[1]));
107 }
108 }
109 }
110 }
111 }
112 } else {
113 refinement.needs_retry = true;
114 refinement.reasoning = "Generic failure, will retry with same parameters";
115 }
116
117 return refinement;
118}
119
120absl::StatusOr<std::map<std::string, std::string>>
122 const std::filesystem::path& screenshot_path,
123 const std::string& element_name) {
124 std::string prompt = BuildElementLocationPrompt(element_name);
125
127 screenshot_path.string(), prompt);
128
129 if (!response.ok()) {
130 return response.status();
131 }
132
133 std::map<std::string, std::string> location;
134
135 // Parse location from response
136 // Expected format: "The element is located at position (X, Y)"
137 // or "The element is in the top-right corner"
138 std::string text = response->text_response;
139 std::transform(text.begin(), text.end(), text.begin(), ::tolower);
140
141 if (text.find("not found") != std::string::npos ||
142 text.find("not visible") != std::string::npos) {
143 location["found"] = "false";
144 location["description"] = response->text_response;
145 } else {
146 location["found"] = "true";
147 location["description"] = response->text_response;
148
149 // Try to extract coordinates
150 size_t pos = text.find('(');
151 if (pos != std::string::npos) {
152 size_t end = text.find(')', pos);
153 if (end != std::string::npos) {
154 std::string coords = text.substr(pos + 1, end - pos - 1);
155 std::vector<std::string> parts = absl::StrSplit(coords, ',');
156 if (parts.size() == 2) {
157 location["x"] = std::string(absl::StripAsciiWhitespace(parts[0]));
158 location["y"] = std::string(absl::StripAsciiWhitespace(parts[1]));
159 }
160 }
161 }
162 }
163
164 return location;
165}
166
167absl::StatusOr<std::vector<std::string>>
169 const std::filesystem::path& screenshot_path) {
170 std::string prompt = BuildWidgetExtractionPrompt();
171
173 screenshot_path.string(), prompt);
174
175 if (!response.ok()) {
176 return response.status();
177 }
178
179 // Parse widget list from response
180 std::vector<std::string> widgets;
181 std::stringstream ss(response->text_response);
182 std::string line;
183
184 while (std::getline(ss, line)) {
185 // Skip empty lines
186 if (line.empty() ||
187 line.find_first_not_of(" \t\n\r") == std::string::npos) {
188 continue;
189 }
190
191 // Remove list markers (-, *, 1., etc.)
192 size_t start = 0;
193 if (line[0] == '-' || line[0] == '*') {
194 start = 1;
195 } else if (std::isdigit(line[0])) {
196 start = line.find('.');
197 if (start != std::string::npos) {
198 start++;
199 } else {
200 start = 0;
201 }
202 }
203
204 absl::string_view widget_view =
205 absl::StripAsciiWhitespace(absl::string_view(line).substr(start));
206
207 if (!widget_view.empty()) {
208 widgets.push_back(std::string(widget_view));
209 }
210 }
211
212 return widgets;
213}
214
215// Private helper methods
216
218 const std::string& context) {
219 std::string base_prompt =
220 "Analyze this screenshot of the YAZE ROM editor GUI. "
221 "Identify all visible UI elements, windows, and widgets. "
222 "List them in order of importance.";
223
224 if (!context.empty()) {
225 return absl::StrCat(base_prompt, "\n\nContext: ", context);
226 }
227
228 return base_prompt;
229}
230
232 const AIAction& action) {
233 std::string action_desc = AIActionParser::ActionToString(action);
234
235 return absl::StrCat(
236 "This screenshot was taken after attempting to perform the following "
237 "action: ",
238 action_desc,
239 "\n\nDid the action succeed? Look for visual evidence that the action "
240 "completed. "
241 "Respond with:\n"
242 "SUCCESS: <description of what changed>\n"
243 "or\n"
244 "FAILURE: <description of what went wrong>");
245}
246
248 const std::string& element_name) {
249 return absl::StrCat("Locate the '", element_name,
250 "' UI element in this screenshot. "
251 "If found, describe its position (coordinates if "
252 "possible, or relative position). "
253 "If not found, state 'NOT FOUND'.");
254}
255
257 return "List all visible UI widgets, buttons, windows, and interactive "
258 "elements "
259 "in this screenshot. Format as a bulleted list, one element per line.";
260}
261
263 const std::string& response) {
265 result.description = response;
266
267 // Extract widgets from description
268 // Look for common patterns like "- Button", "1. Window", etc.
269 std::stringstream ss(response);
270 std::string line;
271
272 while (std::getline(ss, line)) {
273 // Check if line contains a widget mention
274 std::string lower = line;
275 std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
276
277 if (lower.find("button") != std::string::npos ||
278 lower.find("window") != std::string::npos ||
279 lower.find("panel") != std::string::npos ||
280 lower.find("selector") != std::string::npos ||
281 lower.find("editor") != std::string::npos) {
282 result.widgets.push_back(std::string(absl::StripAsciiWhitespace(line)));
283 }
284
285 // Extract suggestions
286 if (lower.find("suggest") != std::string::npos ||
287 lower.find("try") != std::string::npos ||
288 lower.find("could") != std::string::npos) {
289 result.suggestions.push_back(
290 std::string(absl::StripAsciiWhitespace(line)));
291 }
292 }
293
294 return result;
295}
296
298 const std::string& response, const AIAction& action) {
300 result.description = response;
301
302 std::string response_upper = response;
303 std::transform(response_upper.begin(), response_upper.end(),
304 response_upper.begin(), ::toupper);
305
306 if (response_upper.find("SUCCESS") != std::string::npos) {
307 result.action_successful = true;
308
309 // Extract success description
310 size_t pos = response_upper.find("SUCCESS:");
311 if (pos != std::string::npos) {
312 std::string desc = response.substr(pos + 8);
313 result.description = std::string(absl::StripAsciiWhitespace(desc));
314 }
315 } else if (response_upper.find("FAILURE") != std::string::npos) {
316 result.action_successful = false;
317
318 // Extract failure description
319 size_t pos = response_upper.find("FAILURE:");
320 if (pos != std::string::npos) {
321 std::string desc = response.substr(pos + 8);
322 result.error_message = absl::StripAsciiWhitespace(desc);
323 } else {
324 result.error_message = "Action failed (details in description)";
325 }
326 } else {
327 // Ambiguous response, assume failure
328 result.action_successful = false;
329 result.error_message =
330 "Could not determine action success from vision analysis";
331 }
332
333 return result;
334}
335
336} // namespace ai
337} // namespace cli
338} // namespace yaze
absl::StatusOr< AgentResponse > GenerateMultimodalResponse(const std::string &, const std::string &)
static std::string ActionToString(const AIAction &action)
VisionActionRefiner(GeminiAIService *gemini_service)
Construct refiner with Gemini service.
VisionAnalysisResult ParseAnalysisResponse(const std::string &response)
absl::StatusOr< VisionAnalysisResult > VerifyAction(const AIAction &action, const std::filesystem::path &before_screenshot, const std::filesystem::path &after_screenshot)
Verify an action was successful by comparing before/after screenshots.
std::string BuildVerificationPrompt(const AIAction &action)
absl::StatusOr< std::vector< std::string > > ExtractVisibleWidgets(const std::filesystem::path &screenshot_path)
Extract all visible widgets from a screenshot.
absl::StatusOr< ActionRefinement > RefineAction(const AIAction &original_action, const VisionAnalysisResult &analysis)
Refine an action based on vision analysis feedback.
absl::StatusOr< std::map< std::string, std::string > > LocateUIElement(const std::filesystem::path &screenshot_path, const std::string &element_name)
Find a specific UI element in a screenshot.
std::string BuildAnalysisPrompt(const std::string &context)
std::string BuildElementLocationPrompt(const std::string &element_name)
absl::StatusOr< VisionAnalysisResult > AnalyzeScreenshot(const std::filesystem::path &screenshot_path, const std::string &context="")
Analyze the current GUI state from a screenshot.
VisionAnalysisResult ParseVerificationResponse(const std::string &response, const AIAction &action)
Represents a single action to be performed in the GUI.
Refined action parameters based on vision analysis.
std::map< std::string, std::string > adjusted_parameters
Result of analyzing a screenshot with Gemini Vision.