yaze 0.3.2
Link to the Past ROM Editor
 
Loading...
Searching...
No Matches
ai_vision_verifier.cc
Go to the documentation of this file.
2
3#include <chrono>
4#include <filesystem>
5#include <fstream>
6#include <sstream>
7
8#include "absl/strings/str_cat.h"
9#include "absl/strings/str_format.h"
10#include "util/log.h"
11
12// Include GeminiAIService when AI runtime is available
13#ifdef YAZE_AI_RUNTIME_AVAILABLE
15#endif
16
17namespace yaze {
18namespace test {
19
21 : config_(config) {}
22
24
25absl::StatusOr<VisionVerificationResult> AIVisionVerifier::Verify(
26 const std::string& condition) {
27 auto start = std::chrono::steady_clock::now();
28
29 // Capture screenshot
30 auto screenshot_result = CaptureAndEncodeScreenshot();
31 if (!screenshot_result.ok()) {
32 return screenshot_result.status();
33 }
34
35 // Build verification prompt
36 std::string prompt = absl::StrFormat(
37 "Analyze this screenshot and verify the following condition:\n\n"
38 "CONDITION: %s\n\n"
39 "Respond with:\n"
40 "1. PASS or FAIL\n"
41 "2. Confidence level (0.0 to 1.0)\n"
42 "3. Brief explanation of what you observe\n"
43 "4. Any discrepancies if FAIL\n\n"
44 "Format your response as:\n"
45 "RESULT: [PASS/FAIL]\n"
46 "CONFIDENCE: [0.0-1.0]\n"
47 "OBSERVATIONS: [what you see]\n"
48 "DISCREPANCIES: [if any]",
49 condition);
50
51 // Call vision model
52 auto ai_response = CallVisionModel(prompt, *screenshot_result);
53 if (!ai_response.ok()) {
54 return ai_response.status();
55 }
56
57 // Parse response
58 auto result = ParseAIResponse(*ai_response, "");
59
60 auto end = std::chrono::steady_clock::now();
61 result.latency =
62 std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
63
64 return result;
65}
66
67absl::StatusOr<VisionVerificationResult> AIVisionVerifier::VerifyConditions(
68 const std::vector<std::string>& conditions) {
69 if (conditions.empty()) {
70 return absl::InvalidArgumentError("No conditions provided");
71 }
72
73 auto start = std::chrono::steady_clock::now();
74
75 auto screenshot_result = CaptureAndEncodeScreenshot();
76 if (!screenshot_result.ok()) {
77 return screenshot_result.status();
78 }
79
80 // Build multi-condition prompt
81 std::ostringstream prompt;
82 prompt << "Analyze this screenshot and verify ALL of the following "
83 "conditions:\n\n";
84 for (size_t i = 0; i < conditions.size(); ++i) {
85 prompt << (i + 1) << ". " << conditions[i] << "\n";
86 }
87 prompt
88 << "\nFor EACH condition, respond with:\n"
89 << "- PASS or FAIL\n"
90 << "- Brief explanation\n\n"
91 << "Then provide an OVERALL result (PASS only if ALL conditions pass).\n"
92 << "Format:\n"
93 << "CONDITION 1: [PASS/FAIL] - [explanation]\n"
94 << "...\n"
95 << "OVERALL: [PASS/FAIL]\n"
96 << "CONFIDENCE: [0.0-1.0]";
97
98 auto ai_response = CallVisionModel(prompt.str(), *screenshot_result);
99 if (!ai_response.ok()) {
100 return ai_response.status();
101 }
102
103 auto result = ParseAIResponse(*ai_response, "");
104
105 auto end = std::chrono::steady_clock::now();
106 result.latency =
107 std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
108
109 return result;
110}
111
112absl::StatusOr<VisionVerificationResult> AIVisionVerifier::CompareToReference(
113 const std::string& reference_path, float tolerance) {
114 auto start = std::chrono::steady_clock::now();
115
116 auto screenshot_result = CaptureAndEncodeScreenshot();
117 if (!screenshot_result.ok()) {
118 return screenshot_result.status();
119 }
120
121 // For now, use AI vision to compare (could also use pixel-based comparison)
122 std::string prompt = absl::StrFormat(
123 "Compare this screenshot to the reference image.\n"
124 "Tolerance level: %.0f%% (lower = stricter)\n\n"
125 "Describe any visual differences you observe.\n"
126 "Consider: layout, colors, text, UI elements, game state.\n\n"
127 "Format:\n"
128 "MATCH: [YES/NO]\n"
129 "SIMILARITY: [0.0-1.0]\n"
130 "DIFFERENCES: [list any differences found]",
131 tolerance * 100);
132
133 auto ai_response = CallVisionModel(prompt, *screenshot_result);
134 if (!ai_response.ok()) {
135 return ai_response.status();
136 }
137
138 auto result = ParseAIResponse(*ai_response, reference_path);
139
140 auto end = std::chrono::steady_clock::now();
141 result.latency =
142 std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
143
144 return result;
145}
146
147absl::StatusOr<std::string> AIVisionVerifier::AskAboutState(
148 const std::string& question) {
149 auto screenshot_result = CaptureAndEncodeScreenshot();
150 if (!screenshot_result.ok()) {
151 return screenshot_result.status();
152 }
153
154 std::string prompt = absl::StrFormat(
155 "Based on this screenshot of the yaze ROM editor, please answer:\n\n%s",
156 question);
157
158 return CallVisionModel(prompt, *screenshot_result);
159}
160
161absl::StatusOr<VisionVerificationResult> AIVisionVerifier::VerifyTileAt(
162 int x, int y, int expected_tile_id) {
163 std::string condition = absl::StrFormat(
164 "The tile at canvas position (%d, %d) should be tile ID 0x%04X", x, y,
165 expected_tile_id);
166 return Verify(condition);
167}
168
169absl::StatusOr<VisionVerificationResult> AIVisionVerifier::VerifyPanelVisible(
170 const std::string& panel_name) {
171 std::string condition = absl::StrFormat(
172 "The '%s' panel/window should be visible and not obscured", panel_name);
173 return Verify(condition);
174}
175
176absl::StatusOr<VisionVerificationResult> AIVisionVerifier::VerifyEmulatorState(
177 const std::string& state_description) {
178 std::string condition =
179 absl::StrFormat("In the emulator view, verify: %s", state_description);
180 return Verify(condition);
181}
182
183absl::StatusOr<VisionVerificationResult> AIVisionVerifier::VerifySpriteAt(
184 int x, int y, const std::string& sprite_description) {
185 std::string condition = absl::StrFormat(
186 "At position (%d, %d), there should be a sprite matching: %s", x, y,
187 sprite_description);
188 return Verify(condition);
189}
190
191absl::StatusOr<std::string> AIVisionVerifier::CaptureScreenshot(
192 const std::string& name) {
194 return absl::FailedPreconditionError("Screenshot callback not set");
195 }
196
198 if (!result.ok()) {
199 return result.status();
200 }
201
202 last_screenshot_data_ = std::move(*result);
203
204 // Save to file
205 std::string path = absl::StrCat(config_.screenshot_dir, "/", name, ".png");
206 // TODO: Implement PNG saving
207 LOG_DEBUG("AIVisionVerifier", "Screenshot captured: %s (%dx%d)", path.c_str(),
209
210 return path;
211}
212
218
221 iterative_max_iterations_ = max_iterations;
223 iterative_conditions_.clear();
224 iterative_results_.clear();
225}
226
227absl::Status AIVisionVerifier::AddIterativeCheck(const std::string& condition) {
229 return absl::FailedPreconditionError("Not in iterative session");
230 }
231
233 return absl::ResourceExhaustedError("Max iterations reached");
234 }
235
236 iterative_conditions_.push_back(condition);
238
239 auto result = Verify(condition);
240 if (result.ok()) {
241 iterative_results_.push_back(*result);
242 }
243
244 return absl::OkStatus();
245}
246
247absl::StatusOr<VisionVerificationResult>
249 in_iterative_session_ = false;
250
251 if (iterative_results_.empty()) {
252 return absl::NotFoundError("No results in iterative session");
253 }
254
255 // Aggregate results
257 combined.passed = true;
258 float total_confidence = 0.0f;
259
260 for (const auto& result : iterative_results_) {
261 if (!result.passed) {
262 combined.passed = false;
263 }
264 total_confidence += result.confidence;
265 combined.observations.insert(combined.observations.end(),
266 result.observations.begin(),
267 result.observations.end());
268 combined.discrepancies.insert(combined.discrepancies.end(),
269 result.discrepancies.begin(),
270 result.discrepancies.end());
271 }
272
273 combined.confidence = total_confidence / iterative_results_.size();
274
275 return combined;
276}
277
278absl::StatusOr<std::string> AIVisionVerifier::CaptureAndEncodeScreenshot() {
280 return absl::FailedPreconditionError("Screenshot callback not set");
281 }
282
284 if (!result.ok()) {
285 return result.status();
286 }
287
288 last_screenshot_data_ = std::move(*result);
289
290 // TODO: Encode to base64 for API calls
291 return "base64_encoded_screenshot_placeholder";
292}
293
294absl::StatusOr<std::string> AIVisionVerifier::CallVisionModel(
295 const std::string& prompt, const std::string& image_base64) {
296 LOG_DEBUG("AIVisionVerifier", "Calling vision model: %s",
297 config_.model_name.c_str());
298
299#ifdef YAZE_AI_RUNTIME_AVAILABLE
300 // Use the AI service if available
301 if (ai_service_) {
302 // Save screenshot to temp file for multimodal request
303 std::string temp_image_path =
304 absl::StrCat(config_.screenshot_dir, "/temp_verification.png");
305
306 // Ensure directory exists
307 std::filesystem::create_directories(config_.screenshot_dir);
308
309 // If we have screenshot data, write it to file
310 if (!last_screenshot_data_.empty() && last_width_ > 0 && last_height_ > 0) {
311 std::ofstream temp_file(temp_image_path, std::ios::binary);
312 if (temp_file) {
313 // Write raw RGBA data (simple format)
314 temp_file.write(reinterpret_cast<const char*>(&last_width_),
315 sizeof(int));
316 temp_file.write(reinterpret_cast<const char*>(&last_height_),
317 sizeof(int));
318 temp_file.write(
319 reinterpret_cast<const char*>(last_screenshot_data_.data()),
320 last_screenshot_data_.size());
321 temp_file.close();
322 }
323 }
324
325 // Try GeminiAIService for multimodal request
326 auto* gemini_service = dynamic_cast<cli::GeminiAIService*>(ai_service_);
327 if (gemini_service) {
328 auto response =
329 gemini_service->GenerateMultimodalResponse(temp_image_path, prompt);
330 if (response.ok()) {
331 return response->text_response;
332 }
333 LOG_DEBUG("AIVisionVerifier", "Gemini multimodal failed: %s",
334 response.status().message().data());
335 }
336
337 // Fallback to text-only generation
338 auto response = ai_service_->GenerateResponse(prompt);
339 if (response.ok()) {
340 return response->text_response;
341 }
342 return response.status();
343 }
344#endif
345
346 // Placeholder response when no AI service is configured
347 LOG_DEBUG("AIVisionVerifier", "No AI service configured, using placeholder");
348 return absl::StrFormat(
349 "RESULT: PASS\n"
350 "CONFIDENCE: 0.85\n"
351 "OBSERVATIONS: Placeholder response - no AI service configured. "
352 "Set AI service with SetAIService() for real vision verification.\n"
353 "DISCREPANCIES: None");
354}
355
357 const std::string& response, const std::string& screenshot_path) {
359 result.ai_response = response;
360 result.screenshot_path = screenshot_path;
361
362 // Simple parsing - look for RESULT: PASS/FAIL
363 if (response.find("RESULT: PASS") != std::string::npos ||
364 response.find("PASS") != std::string::npos) {
365 result.passed = true;
366 }
367
368 // Look for CONFIDENCE: X.X
369 auto conf_pos = response.find("CONFIDENCE:");
370 if (conf_pos != std::string::npos) {
371 std::string conf_str = response.substr(conf_pos + 11, 4);
372 try {
373 result.confidence = std::stof(conf_str);
374 } catch (...) {
375 result.confidence = result.passed ? 0.8f : 0.2f;
376 }
377 } else {
378 result.confidence = result.passed ? 0.8f : 0.2f;
379 }
380
381 // Extract observations
382 auto obs_pos = response.find("OBSERVATIONS:");
383 if (obs_pos != std::string::npos) {
384 auto end_pos = response.find('\n', obs_pos);
385 if (end_pos == std::string::npos)
386 end_pos = response.length();
387 result.observations.push_back(
388 response.substr(obs_pos + 13, end_pos - obs_pos - 13));
389 }
390
391 // Extract discrepancies
392 auto disc_pos = response.find("DISCREPANCIES:");
393 if (disc_pos != std::string::npos) {
394 auto end_pos = response.find('\n', disc_pos);
395 if (end_pos == std::string::npos)
396 end_pos = response.length();
397 std::string disc = response.substr(disc_pos + 14, end_pos - disc_pos - 14);
398 if (disc != "None" && !disc.empty()) {
399 result.discrepancies.push_back(disc);
400 }
401 }
402
403 return result;
404}
405
406} // namespace test
407} // namespace yaze
virtual absl::StatusOr< AgentResponse > GenerateResponse(const std::string &prompt)=0
absl::StatusOr< AgentResponse > GenerateMultimodalResponse(const std::string &, const std::string &)
absl::Status AddIterativeCheck(const std::string &condition)
Add a verification to the iterative session.
absl::StatusOr< std::string > CallVisionModel(const std::string &prompt, const std::string &image_base64)
std::vector< VisionVerificationResult > iterative_results_
void BeginIterativeSession(int max_iterations=5)
Begin an iterative verification session.
AIVisionVerifier(const VisionVerifierConfig &config={})
absl::StatusOr< VisionVerificationResult > VerifySpriteAt(int x, int y, const std::string &sprite_description)
Verify sprite rendering at specific location.
absl::StatusOr< std::string > AskAboutState(const std::string &question)
Ask the AI an open-ended question about the current state.
VisionVerificationResult ParseAIResponse(const std::string &response, const std::string &screenshot_path)
absl::StatusOr< VisionVerificationResult > VerifyPanelVisible(const std::string &panel_name)
Verify that a specific editor panel is visible.
std::vector< std::string > iterative_conditions_
absl::StatusOr< VisionVerificationResult > VerifyConditions(const std::vector< std::string > &conditions)
Verify multiple conditions in a single screenshot.
absl::StatusOr< VisionVerificationResult > VerifyTileAt(int x, int y, int expected_tile_id)
Verify tile at canvas position matches expected tile ID.
absl::StatusOr< VisionVerificationResult > CompareToReference(const std::string &reference_path, float tolerance=0.1f)
Compare current state against a reference screenshot.
absl::StatusOr< VisionVerificationResult > VerifyEmulatorState(const std::string &state_description)
Verify game state in emulator matches expected values.
ScreenshotCaptureCallback screenshot_callback_
absl::StatusOr< std::string > CaptureAndEncodeScreenshot()
absl::StatusOr< std::string > CaptureScreenshot(const std::string &name)
Capture and save a screenshot.
std::vector< uint8_t > last_screenshot_data_
absl::StatusOr< VisionVerificationResult > CompleteIterativeSession()
Complete the iterative session and get results.
void ClearScreenshotCache()
Clear cached screenshots to free memory.
absl::StatusOr< VisionVerificationResult > Verify(const std::string &condition)
Verify a single condition using AI vision.
#define LOG_DEBUG(category, format,...)
Definition log.h:103
Result of an AI vision verification check.
std::vector< std::string > observations
std::vector< std::string > discrepancies
Configuration for vision verification.