yaze 0.3.2
Link to the Past ROM Editor
 
Loading...
Searching...
No Matches
vision_action_refiner.cc
Go to the documentation of this file.
2
3#include <algorithm>
4#include <sstream>
5
6#include "absl/strings/str_cat.h"
7#include "absl/strings/str_split.h"
8#include "absl/strings/string_view.h"
10
11namespace yaze {
12namespace cli {
13namespace ai {
14
16 : gemini_service_(gemini_service) {
17 if (!gemini_service_) {
18 throw std::invalid_argument("Gemini service cannot be null");
19 }
20}
21
22absl::StatusOr<VisionAnalysisResult> VisionActionRefiner::AnalyzeScreenshot(
23 const std::filesystem::path& screenshot_path,
24 const std::string& context) {
25
26 if (!std::filesystem::exists(screenshot_path)) {
27 return absl::NotFoundError(
28 absl::StrCat("Screenshot not found: ", screenshot_path.string()));
29 }
30
31 std::string prompt = BuildAnalysisPrompt(context);
32
34 screenshot_path.string(),
35 prompt
36 );
37
38 if (!response.ok()) {
39 return response.status();
40 }
41
42 return ParseAnalysisResponse(response->text_response);
43}
44
45absl::StatusOr<VisionAnalysisResult> VisionActionRefiner::VerifyAction(
46 const AIAction& action,
47 const std::filesystem::path& before_screenshot,
48 const std::filesystem::path& after_screenshot) {
49
50 if (!std::filesystem::exists(before_screenshot)) {
51 return absl::NotFoundError("Before screenshot not found");
52 }
53
54 if (!std::filesystem::exists(after_screenshot)) {
55 return absl::NotFoundError("After screenshot not found");
56 }
57
58 // First, analyze the after screenshot
59 std::string verification_prompt = BuildVerificationPrompt(action);
60
61 auto after_response = gemini_service_->GenerateMultimodalResponse(
62 after_screenshot.string(),
63 verification_prompt
64 );
65
66 if (!after_response.ok()) {
67 return after_response.status();
68 }
69
70 return ParseVerificationResponse(after_response->text_response, action);
71}
72
73absl::StatusOr<ActionRefinement> VisionActionRefiner::RefineAction(
74 const AIAction& original_action,
75 const VisionAnalysisResult& analysis) {
76
77 ActionRefinement refinement;
78
79 // If action was successful, no refinement needed
80 if (analysis.action_successful) {
81 return refinement;
82 }
83
84 // Determine refinement strategy based on error
85 std::string error_lower = analysis.error_message;
86 std::transform(error_lower.begin(), error_lower.end(),
87 error_lower.begin(), ::tolower);
88
89 if (error_lower.find("not found") != std::string::npos ||
90 error_lower.find("missing") != std::string::npos) {
91 refinement.needs_different_approach = true;
92 refinement.reasoning = "UI element not found, may need to open different editor";
93 }
94 else if (error_lower.find("wrong") != std::string::npos ||
95 error_lower.find("incorrect") != std::string::npos) {
96 refinement.needs_retry = true;
97 refinement.reasoning = "Action executed on wrong element, adjusting parameters";
98
99 // Try to extract corrected parameters from suggestions
100 for (const auto& suggestion : analysis.suggestions) {
101 // Parse suggestions for parameter corrections
102 // e.g., "Try position (6, 8) instead"
103 if (suggestion.find("position") != std::string::npos) {
104 // Extract coordinates
105 size_t pos = suggestion.find('(');
106 if (pos != std::string::npos) {
107 size_t end = suggestion.find(')', pos);
108 if (end != std::string::npos) {
109 std::string coords = suggestion.substr(pos + 1, end - pos - 1);
110 std::vector<std::string> parts = absl::StrSplit(coords, ',');
111 if (parts.size() == 2) {
112 refinement.adjusted_parameters["x"] =
113 std::string(absl::StripAsciiWhitespace(parts[0]));
114 refinement.adjusted_parameters["y"] =
115 std::string(absl::StripAsciiWhitespace(parts[1]));
116 }
117 }
118 }
119 }
120 }
121 }
122 else {
123 refinement.needs_retry = true;
124 refinement.reasoning = "Generic failure, will retry with same parameters";
125 }
126
127 return refinement;
128}
129
130absl::StatusOr<std::map<std::string, std::string>>
132 const std::filesystem::path& screenshot_path,
133 const std::string& element_name) {
134
135 std::string prompt = BuildElementLocationPrompt(element_name);
136
138 screenshot_path.string(),
139 prompt
140 );
141
142 if (!response.ok()) {
143 return response.status();
144 }
145
146 std::map<std::string, std::string> location;
147
148 // Parse location from response
149 // Expected format: "The element is located at position (X, Y)"
150 // or "The element is in the top-right corner"
151 std::string text = response->text_response;
152 std::transform(text.begin(), text.end(), text.begin(), ::tolower);
153
154 if (text.find("not found") != std::string::npos ||
155 text.find("not visible") != std::string::npos) {
156 location["found"] = "false";
157 location["description"] = response->text_response;
158 } else {
159 location["found"] = "true";
160 location["description"] = response->text_response;
161
162 // Try to extract coordinates
163 size_t pos = text.find('(');
164 if (pos != std::string::npos) {
165 size_t end = text.find(')', pos);
166 if (end != std::string::npos) {
167 std::string coords = text.substr(pos + 1, end - pos - 1);
168 std::vector<std::string> parts = absl::StrSplit(coords, ',');
169 if (parts.size() == 2) {
170 location["x"] = std::string(absl::StripAsciiWhitespace(parts[0]));
171 location["y"] = std::string(absl::StripAsciiWhitespace(parts[1]));
172 }
173 }
174 }
175 }
176
177 return location;
178}
179
180absl::StatusOr<std::vector<std::string>>
182 const std::filesystem::path& screenshot_path) {
183
184 std::string prompt = BuildWidgetExtractionPrompt();
185
187 screenshot_path.string(),
188 prompt
189 );
190
191 if (!response.ok()) {
192 return response.status();
193 }
194
195 // Parse widget list from response
196 std::vector<std::string> widgets;
197 std::stringstream ss(response->text_response);
198 std::string line;
199
200 while (std::getline(ss, line)) {
201 // Skip empty lines
202 if (line.empty() || line.find_first_not_of(" \t\n\r") == std::string::npos) {
203 continue;
204 }
205
206 // Remove list markers (-, *, 1., etc.)
207 size_t start = 0;
208 if (line[0] == '-' || line[0] == '*') {
209 start = 1;
210 } else if (std::isdigit(line[0])) {
211 start = line.find('.');
212 if (start != std::string::npos) {
213 start++;
214 } else {
215 start = 0;
216 }
217 }
218
219 absl::string_view widget_view = absl::StripAsciiWhitespace(
220 absl::string_view(line).substr(start));
221
222 if (!widget_view.empty()) {
223 widgets.push_back(std::string(widget_view));
224 }
225 }
226
227 return widgets;
228}
229
230// Private helper methods
231
232std::string VisionActionRefiner::BuildAnalysisPrompt(const std::string& context) {
233 std::string base_prompt =
234 "Analyze this screenshot of the YAZE ROM editor GUI. "
235 "Identify all visible UI elements, windows, and widgets. "
236 "List them in order of importance.";
237
238 if (!context.empty()) {
239 return absl::StrCat(base_prompt, "\n\nContext: ", context);
240 }
241
242 return base_prompt;
243}
244
246 std::string action_desc = AIActionParser::ActionToString(action);
247
248 return absl::StrCat(
249 "This screenshot was taken after attempting to perform the following action: ",
250 action_desc,
251 "\n\nDid the action succeed? Look for visual evidence that the action completed. "
252 "Respond with:\n"
253 "SUCCESS: <description of what changed>\n"
254 "or\n"
255 "FAILURE: <description of what went wrong>"
256 );
257}
258
260 const std::string& element_name) {
261 return absl::StrCat(
262 "Locate the '", element_name, "' UI element in this screenshot. "
263 "If found, describe its position (coordinates if possible, or relative position). "
264 "If not found, state 'NOT FOUND'."
265 );
266}
267
269 return
270 "List all visible UI widgets, buttons, windows, and interactive elements "
271 "in this screenshot. Format as a bulleted list, one element per line.";
272}
273
275 const std::string& response) {
276
278 result.description = response;
279
280 // Extract widgets from description
281 // Look for common patterns like "- Button", "1. Window", etc.
282 std::stringstream ss(response);
283 std::string line;
284
285 while (std::getline(ss, line)) {
286 // Check if line contains a widget mention
287 std::string lower = line;
288 std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
289
290 if (lower.find("button") != std::string::npos ||
291 lower.find("window") != std::string::npos ||
292 lower.find("panel") != std::string::npos ||
293 lower.find("selector") != std::string::npos ||
294 lower.find("editor") != std::string::npos) {
295 result.widgets.push_back(std::string(absl::StripAsciiWhitespace(line)));
296 }
297
298 // Extract suggestions
299 if (lower.find("suggest") != std::string::npos ||
300 lower.find("try") != std::string::npos ||
301 lower.find("could") != std::string::npos) {
302 result.suggestions.push_back(std::string(absl::StripAsciiWhitespace(line)));
303 }
304 }
305
306 return result;
307}
308
310 const std::string& response,
311 const AIAction& action) {
312
314 result.description = response;
315
316 std::string response_upper = response;
317 std::transform(response_upper.begin(), response_upper.end(),
318 response_upper.begin(), ::toupper);
319
320 if (response_upper.find("SUCCESS") != std::string::npos) {
321 result.action_successful = true;
322
323 // Extract success description
324 size_t pos = response_upper.find("SUCCESS:");
325 if (pos != std::string::npos) {
326 std::string desc = response.substr(pos + 8);
327 result.description = std::string(absl::StripAsciiWhitespace(desc));
328 }
329 }
330 else if (response_upper.find("FAILURE") != std::string::npos) {
331 result.action_successful = false;
332
333 // Extract failure description
334 size_t pos = response_upper.find("FAILURE:");
335 if (pos != std::string::npos) {
336 std::string desc = response.substr(pos + 8);
337 result.error_message = absl::StripAsciiWhitespace(desc);
338 } else {
339 result.error_message = "Action failed (details in description)";
340 }
341 }
342 else {
343 // Ambiguous response, assume failure
344 result.action_successful = false;
345 result.error_message = "Could not determine action success from vision analysis";
346 }
347
348 return result;
349}
350
351} // namespace ai
352} // namespace cli
353} // namespace yaze
absl::StatusOr< AgentResponse > GenerateMultimodalResponse(const std::string &image_path, const std::string &prompt)
static std::string ActionToString(const AIAction &action)
VisionActionRefiner(GeminiAIService *gemini_service)
Construct refiner with Gemini service.
VisionAnalysisResult ParseAnalysisResponse(const std::string &response)
absl::StatusOr< VisionAnalysisResult > VerifyAction(const AIAction &action, const std::filesystem::path &before_screenshot, const std::filesystem::path &after_screenshot)
Verify an action was successful by comparing before/after screenshots.
std::string BuildVerificationPrompt(const AIAction &action)
absl::StatusOr< std::vector< std::string > > ExtractVisibleWidgets(const std::filesystem::path &screenshot_path)
Extract all visible widgets from a screenshot.
absl::StatusOr< ActionRefinement > RefineAction(const AIAction &original_action, const VisionAnalysisResult &analysis)
Refine an action based on vision analysis feedback.
absl::StatusOr< std::map< std::string, std::string > > LocateUIElement(const std::filesystem::path &screenshot_path, const std::string &element_name)
Find a specific UI element in a screenshot.
std::string BuildAnalysisPrompt(const std::string &context)
std::string BuildElementLocationPrompt(const std::string &element_name)
absl::StatusOr< VisionAnalysisResult > AnalyzeScreenshot(const std::filesystem::path &screenshot_path, const std::string &context="")
Analyze the current GUI state from a screenshot.
VisionAnalysisResult ParseVerificationResponse(const std::string &response, const AIAction &action)
Main namespace for the application.
Definition controller.cc:20
Represents a single action to be performed in the GUI.
Refined action parameters based on vision analysis.
std::map< std::string, std::string > adjusted_parameters
Result of analyzing a screenshot with Gemini Vision.