yaze 0.3.2
Link to the Past ROM Editor
 
Loading...
Searching...
No Matches
ai_vision_verifier.cc
Go to the documentation of this file.
2
#include <chrono>
#include <filesystem>
#include <fstream>
#include <sstream>
#include <string>
#include <system_error>
7
8#include "absl/strings/str_cat.h"
9#include "absl/strings/str_format.h"
10#include "util/log.h"
11
12// Include GeminiAIService when AI runtime is available
13#ifdef YAZE_AI_RUNTIME_AVAILABLE
15#endif
16
17namespace yaze {
18namespace test {
19
// Member initializer list of a constructor: initializes config_ from the
// `config` argument; the constructor body is empty.
// NOTE(review): presumably belongs to AIVisionVerifier's constructor, whose
// signature line is not visible here - confirm.
21 : config_(config) {}
22
24
25absl::StatusOr<VisionVerificationResult> AIVisionVerifier::Verify(
26 const std::string& condition) {
27 auto start = std::chrono::steady_clock::now();
28
29 // Capture screenshot
30 auto screenshot_result = CaptureAndEncodeScreenshot();
31 if (!screenshot_result.ok()) {
32 return screenshot_result.status();
33 }
34
35 // Build verification prompt
36 std::string prompt = absl::StrFormat(
37 "Analyze this screenshot and verify the following condition:\n\n"
38 "CONDITION: %s\n\n"
39 "Respond with:\n"
40 "1. PASS or FAIL\n"
41 "2. Confidence level (0.0 to 1.0)\n"
42 "3. Brief explanation of what you observe\n"
43 "4. Any discrepancies if FAIL\n\n"
44 "Format your response as:\n"
45 "RESULT: [PASS/FAIL]\n"
46 "CONFIDENCE: [0.0-1.0]\n"
47 "OBSERVATIONS: [what you see]\n"
48 "DISCREPANCIES: [if any]",
49 condition);
50
51 // Call vision model
52 auto ai_response = CallVisionModel(prompt, *screenshot_result);
53 if (!ai_response.ok()) {
54 return ai_response.status();
55 }
56
57 // Parse response
58 auto result = ParseAIResponse(*ai_response, "");
59
60 auto end = std::chrono::steady_clock::now();
61 result.latency = std::chrono::duration_cast<std::chrono::milliseconds>(
62 end - start);
63
64 return result;
65}
66
67absl::StatusOr<VisionVerificationResult> AIVisionVerifier::VerifyConditions(
68 const std::vector<std::string>& conditions) {
69 if (conditions.empty()) {
70 return absl::InvalidArgumentError("No conditions provided");
71 }
72
73 auto start = std::chrono::steady_clock::now();
74
75 auto screenshot_result = CaptureAndEncodeScreenshot();
76 if (!screenshot_result.ok()) {
77 return screenshot_result.status();
78 }
79
80 // Build multi-condition prompt
81 std::ostringstream prompt;
82 prompt << "Analyze this screenshot and verify ALL of the following conditions:\n\n";
83 for (size_t i = 0; i < conditions.size(); ++i) {
84 prompt << (i + 1) << ". " << conditions[i] << "\n";
85 }
86 prompt << "\nFor EACH condition, respond with:\n"
87 << "- PASS or FAIL\n"
88 << "- Brief explanation\n\n"
89 << "Then provide an OVERALL result (PASS only if ALL conditions pass).\n"
90 << "Format:\n"
91 << "CONDITION 1: [PASS/FAIL] - [explanation]\n"
92 << "...\n"
93 << "OVERALL: [PASS/FAIL]\n"
94 << "CONFIDENCE: [0.0-1.0]";
95
96 auto ai_response = CallVisionModel(prompt.str(), *screenshot_result);
97 if (!ai_response.ok()) {
98 return ai_response.status();
99 }
100
101 auto result = ParseAIResponse(*ai_response, "");
102
103 auto end = std::chrono::steady_clock::now();
104 result.latency = std::chrono::duration_cast<std::chrono::milliseconds>(
105 end - start);
106
107 return result;
108}
109
110absl::StatusOr<VisionVerificationResult> AIVisionVerifier::CompareToReference(
111 const std::string& reference_path, float tolerance) {
112 auto start = std::chrono::steady_clock::now();
113
114 auto screenshot_result = CaptureAndEncodeScreenshot();
115 if (!screenshot_result.ok()) {
116 return screenshot_result.status();
117 }
118
119 // For now, use AI vision to compare (could also use pixel-based comparison)
120 std::string prompt = absl::StrFormat(
121 "Compare this screenshot to the reference image.\n"
122 "Tolerance level: %.0f%% (lower = stricter)\n\n"
123 "Describe any visual differences you observe.\n"
124 "Consider: layout, colors, text, UI elements, game state.\n\n"
125 "Format:\n"
126 "MATCH: [YES/NO]\n"
127 "SIMILARITY: [0.0-1.0]\n"
128 "DIFFERENCES: [list any differences found]",
129 tolerance * 100);
130
131 auto ai_response = CallVisionModel(prompt, *screenshot_result);
132 if (!ai_response.ok()) {
133 return ai_response.status();
134 }
135
136 auto result = ParseAIResponse(*ai_response, reference_path);
137
138 auto end = std::chrono::steady_clock::now();
139 result.latency = std::chrono::duration_cast<std::chrono::milliseconds>(
140 end - start);
141
142 return result;
143}
144
145absl::StatusOr<std::string> AIVisionVerifier::AskAboutState(
146 const std::string& question) {
147 auto screenshot_result = CaptureAndEncodeScreenshot();
148 if (!screenshot_result.ok()) {
149 return screenshot_result.status();
150 }
151
152 std::string prompt = absl::StrFormat(
153 "Based on this screenshot of the yaze ROM editor, please answer:\n\n%s",
154 question);
155
156 return CallVisionModel(prompt, *screenshot_result);
157}
158
159absl::StatusOr<VisionVerificationResult> AIVisionVerifier::VerifyTileAt(
160 int x, int y, int expected_tile_id) {
161 std::string condition = absl::StrFormat(
162 "The tile at canvas position (%d, %d) should be tile ID 0x%04X",
163 x, y, expected_tile_id);
164 return Verify(condition);
165}
166
167absl::StatusOr<VisionVerificationResult> AIVisionVerifier::VerifyPanelVisible(
168 const std::string& panel_name) {
169 std::string condition = absl::StrFormat(
170 "The '%s' panel/window should be visible and not obscured",
171 panel_name);
172 return Verify(condition);
173}
174
175absl::StatusOr<VisionVerificationResult> AIVisionVerifier::VerifyEmulatorState(
176 const std::string& state_description) {
177 std::string condition = absl::StrFormat(
178 "In the emulator view, verify: %s", state_description);
179 return Verify(condition);
180}
181
182absl::StatusOr<VisionVerificationResult> AIVisionVerifier::VerifySpriteAt(
183 int x, int y, const std::string& sprite_description) {
184 std::string condition = absl::StrFormat(
185 "At position (%d, %d), there should be a sprite matching: %s",
186 x, y, sprite_description);
187 return Verify(condition);
188}
189
// Caches the latest screenshot data in last_screenshot_data_ and returns the
// path "<config_.screenshot_dir>/<name>.png" the image is meant to live at.
//
// Returns FailedPrecondition ("Screenshot callback not set") or the failed
// capture status when the screenshot cannot be obtained.
// NOTE(review): the guard condition and the capture call themselves are not
// visible in this listing - presumably they test and invoke the screenshot
// callback; confirm.
// NOTE(review): no file is written here yet (see the TODO), so the returned
// path may not exist on disk.
190absl::StatusOr<std::string> AIVisionVerifier::CaptureScreenshot(
191 const std::string& name) {
193 return absl::FailedPreconditionError("Screenshot callback not set");
194 }
195
// Propagate a failed capture unchanged.
197 if (!result.ok()) {
198 return result.status();
199 }
200
// Take ownership of the captured data instead of copying it.
201 last_screenshot_data_ = std::move(*result);
202
203 // Build the destination path; the file itself is not written yet.
204 std::string path = absl::StrCat(config_.screenshot_dir, "/", name, ".png");
205 // TODO: Implement PNG saving
206 LOG_DEBUG("AIVisionVerifier", "Screenshot captured: %s (%dx%d)",
207 path.c_str(), last_width_, last_height_);
208
209 return path;
210}
211
217
// Stores the iteration budget and discards any conditions and results that
// are still held from an earlier iterative session.
// NOTE(review): presumably the tail of BeginIterativeSession(int
// max_iterations); the signature and the remaining statements are not
// visible in this listing - confirm.
220 iterative_max_iterations_ = max_iterations;
222 iterative_conditions_.clear();
223 iterative_results_.clear();
224}
225
// Runs Verify(condition) as part of the current iterative session and stores
// the outcome for CompleteIterativeSession().
//
// Returns FailedPrecondition ("Not in iterative session") or
// ResourceExhausted ("Max iterations reached") when the respective guard
// trips; otherwise returns OkStatus.
// NOTE(review): the two guard conditions are not visible in this listing -
// presumably they test the session flag and the iteration budget; confirm.
226absl::Status AIVisionVerifier::AddIterativeCheck(const std::string& condition) {
228 return absl::FailedPreconditionError("Not in iterative session");
229 }
230
232 return absl::ResourceExhaustedError("Max iterations reached");
233 }
234
// The condition is recorded even if its verification later fails.
235 iterative_conditions_.push_back(condition);
237
// NOTE(review): when Verify() returns an error, nothing is appended to
// iterative_results_ and OkStatus is still returned, so the caller cannot
// tell that the check did not run and iterative_conditions_ can grow longer
// than iterative_results_. Confirm this best-effort behavior is intended.
238 auto result = Verify(condition);
239 if (result.ok()) {
240 iterative_results_.push_back(*result);
241 }
242
243 return absl::OkStatus();
244}
245
// Ends the iterative session and folds all stored results into one.
//
// The combined result passes only if every stored result passed, its
// confidence is the arithmetic mean of the stored confidences, and its
// observations/discrepancies are the concatenation of the stored ones in
// order. Returns NotFound when no results were stored.
// NOTE(review): the declaration of `combined` is not visible in this listing;
// only the fields assigned below are known to be populated.
246absl::StatusOr<VisionVerificationResult> AIVisionVerifier::CompleteIterativeSession() {
// The session is closed before the emptiness check, i.e. even on the error
// path below.
247 in_iterative_session_ = false;
248
249 if (iterative_results_.empty()) {
250 return absl::NotFoundError("No results in iterative session");
251 }
252
253 // Aggregate results
// Start from "passed" and let any failing result flip it.
255 combined.passed = true;
256 float total_confidence = 0.0f;
257
258 for (const auto& result : iterative_results_) {
259 if (!result.passed) {
260 combined.passed = false;
261 }
262 total_confidence += result.confidence;
263 combined.observations.insert(combined.observations.end(),
264 result.observations.begin(),
265 result.observations.end());
266 combined.discrepancies.insert(combined.discrepancies.end(),
267 result.discrepancies.begin(),
268 result.discrepancies.end());
269 }
270
// Division is safe: the empty case returned above.
271 combined.confidence = total_confidence / iterative_results_.size();
272
273 return combined;
274}
275
// Caches the latest screenshot data in last_screenshot_data_ and returns the
// string that stands in for its base64 encoding.
//
// Returns FailedPrecondition ("Screenshot callback not set") or the failed
// capture status when the screenshot cannot be obtained.
// NOTE(review): the guard condition and the capture call themselves are not
// visible in this listing - presumably they test and invoke the screenshot
// callback; confirm.
276absl::StatusOr<std::string> AIVisionVerifier::CaptureAndEncodeScreenshot() {
278 return absl::FailedPreconditionError("Screenshot callback not set");
279 }
280
// Propagate a failed capture unchanged.
282 if (!result.ok()) {
283 return result.status();
284 }
285
// Take ownership of the captured data instead of copying it.
286 last_screenshot_data_ = std::move(*result);
287
// Encoding is not implemented: every successful call returns the same fixed
// placeholder string, regardless of the captured data.
288 // TODO: Encode to base64 for API calls
289 return "base64_encoded_screenshot_placeholder";
290}
291
292absl::StatusOr<std::string> AIVisionVerifier::CallVisionModel(
293 const std::string& prompt, const std::string& image_base64) {
294 LOG_DEBUG("AIVisionVerifier", "Calling vision model: %s",
295 config_.model_name.c_str());
296
297#ifdef YAZE_AI_RUNTIME_AVAILABLE
298 // Use the AI service if available
299 if (ai_service_) {
300 // Save screenshot to temp file for multimodal request
301 std::string temp_image_path =
302 absl::StrCat(config_.screenshot_dir, "/temp_verification.png");
303
304 // Ensure directory exists
305 std::filesystem::create_directories(config_.screenshot_dir);
306
307 // If we have screenshot data, write it to file
308 if (!last_screenshot_data_.empty() && last_width_ > 0 && last_height_ > 0) {
309 std::ofstream temp_file(temp_image_path, std::ios::binary);
310 if (temp_file) {
311 // Write raw RGBA data (simple format)
312 temp_file.write(reinterpret_cast<const char*>(&last_width_),
313 sizeof(int));
314 temp_file.write(reinterpret_cast<const char*>(&last_height_),
315 sizeof(int));
316 temp_file.write(
317 reinterpret_cast<const char*>(last_screenshot_data_.data()),
318 last_screenshot_data_.size());
319 temp_file.close();
320 }
321 }
322
323 // Try GeminiAIService for multimodal request
324 auto* gemini_service =
325 dynamic_cast<cli::GeminiAIService*>(ai_service_);
326 if (gemini_service) {
327 auto response =
328 gemini_service->GenerateMultimodalResponse(temp_image_path, prompt);
329 if (response.ok()) {
330 return response->text_response;
331 }
332 LOG_DEBUG("AIVisionVerifier", "Gemini multimodal failed: %s",
333 response.status().message().data());
334 }
335
336 // Fallback to text-only generation
337 auto response = ai_service_->GenerateResponse(prompt);
338 if (response.ok()) {
339 return response->text_response;
340 }
341 return response.status();
342 }
343#endif
344
345 // Placeholder response when no AI service is configured
346 LOG_DEBUG("AIVisionVerifier", "No AI service configured, using placeholder");
347 return absl::StrFormat(
348 "RESULT: PASS\n"
349 "CONFIDENCE: 0.85\n"
350 "OBSERVATIONS: Placeholder response - no AI service configured. "
351 "Set AI service with SetAIService() for real vision verification.\n"
352 "DISCREPANCIES: None");
353}
354
356 const std::string& response, const std::string& screenshot_path) {
358 result.ai_response = response;
359 result.screenshot_path = screenshot_path;
360
361 // Simple parsing - look for RESULT: PASS/FAIL
362 if (response.find("RESULT: PASS") != std::string::npos ||
363 response.find("PASS") != std::string::npos) {
364 result.passed = true;
365 }
366
367 // Look for CONFIDENCE: X.X
368 auto conf_pos = response.find("CONFIDENCE:");
369 if (conf_pos != std::string::npos) {
370 std::string conf_str = response.substr(conf_pos + 11, 4);
371 try {
372 result.confidence = std::stof(conf_str);
373 } catch (...) {
374 result.confidence = result.passed ? 0.8f : 0.2f;
375 }
376 } else {
377 result.confidence = result.passed ? 0.8f : 0.2f;
378 }
379
380 // Extract observations
381 auto obs_pos = response.find("OBSERVATIONS:");
382 if (obs_pos != std::string::npos) {
383 auto end_pos = response.find('\n', obs_pos);
384 if (end_pos == std::string::npos) end_pos = response.length();
385 result.observations.push_back(
386 response.substr(obs_pos + 13, end_pos - obs_pos - 13));
387 }
388
389 // Extract discrepancies
390 auto disc_pos = response.find("DISCREPANCIES:");
391 if (disc_pos != std::string::npos) {
392 auto end_pos = response.find('\n', disc_pos);
393 if (end_pos == std::string::npos) end_pos = response.length();
394 std::string disc = response.substr(disc_pos + 14, end_pos - disc_pos - 14);
395 if (disc != "None" && !disc.empty()) {
396 result.discrepancies.push_back(disc);
397 }
398 }
399
400 return result;
401}
402
403} // namespace test
404} // namespace yaze
virtual absl::StatusOr< AgentResponse > GenerateResponse(const std::string &prompt)=0
absl::StatusOr< AgentResponse > GenerateMultimodalResponse(const std::string &, const std::string &)
absl::Status AddIterativeCheck(const std::string &condition)
Add a verification to the iterative session.
absl::StatusOr< std::string > CallVisionModel(const std::string &prompt, const std::string &image_base64)
std::vector< VisionVerificationResult > iterative_results_
void BeginIterativeSession(int max_iterations=5)
Begin an iterative verification session.
AIVisionVerifier(const VisionVerifierConfig &config={})
absl::StatusOr< VisionVerificationResult > VerifySpriteAt(int x, int y, const std::string &sprite_description)
Verify sprite rendering at specific location.
absl::StatusOr< std::string > AskAboutState(const std::string &question)
Ask the AI an open-ended question about the current state.
VisionVerificationResult ParseAIResponse(const std::string &response, const std::string &screenshot_path)
absl::StatusOr< VisionVerificationResult > VerifyPanelVisible(const std::string &panel_name)
Verify that a specific editor panel is visible.
std::vector< std::string > iterative_conditions_
absl::StatusOr< VisionVerificationResult > VerifyConditions(const std::vector< std::string > &conditions)
Verify multiple conditions in a single screenshot.
absl::StatusOr< VisionVerificationResult > VerifyTileAt(int x, int y, int expected_tile_id)
Verify tile at canvas position matches expected tile ID.
absl::StatusOr< VisionVerificationResult > CompareToReference(const std::string &reference_path, float tolerance=0.1f)
Compare current state against a reference screenshot.
absl::StatusOr< VisionVerificationResult > VerifyEmulatorState(const std::string &state_description)
Verify game state in emulator matches expected values.
ScreenshotCaptureCallback screenshot_callback_
absl::StatusOr< std::string > CaptureAndEncodeScreenshot()
absl::StatusOr< std::string > CaptureScreenshot(const std::string &name)
Capture and save a screenshot.
std::vector< uint8_t > last_screenshot_data_
absl::StatusOr< VisionVerificationResult > CompleteIterativeSession()
Complete the iterative session and get results.
void ClearScreenshotCache()
Clear cached screenshots to free memory.
absl::StatusOr< VisionVerificationResult > Verify(const std::string &condition)
Verify a single condition using AI vision.
#define LOG_DEBUG(category, format,...)
Definition log.h:103
Result of an AI vision verification check.
std::vector< std::string > observations
std::vector< std::string > discrepancies
Configuration for vision verification.