yaze 0.3.2
Link to the Past ROM Editor
 
Loading...
Searching...
No Matches
vision_action_refiner.h
Go to the documentation of this file.
1#ifndef YAZE_CLI_SERVICE_AI_VISION_ACTION_REFINER_H_
2#define YAZE_CLI_SERVICE_AI_VISION_ACTION_REFINER_H_
3
4#include <filesystem>
5#include <string>
6#include <vector>
7
8#include "absl/status/statusor.h"
10
11namespace yaze {
12namespace cli {
13
14// Forward declare
15class GeminiAIService;
16
17namespace ai {
18
24 std::string description; // What Gemini sees in the image
25 std::vector<std::string> widgets; // Detected UI widgets
26 std::vector<std::string> suggestions; // Action suggestions
27 bool action_successful = false; // Whether the last action succeeded
28 std::string error_message; // Error description if action failed
29};
30
36 bool needs_retry = false;
38 std::map<std::string, std::string> adjusted_parameters;
39 std::string reasoning;
40};
41
79 public:
// Construct refiner with Gemini service.
// `explicit` prevents implicit conversion from a bare GeminiAIService pointer.
// NOTE(review): the service is passed as a raw pointer; ownership and
// lifetime are not visible in this header - presumably non-owning and must
// outlive the refiner. Confirm against the implementation.
84 explicit VisionActionRefiner(GeminiAIService* gemini_service);
85
// Analyze the current GUI state from a screenshot.
//
// screenshot_path: filesystem path of the screenshot to analyze.
// context:         optional free-form string, defaults to "".
//                  NOTE(review): presumably forwarded to BuildAnalysisPrompt()
//                  - confirm against the implementation.
// Returns a VisionAnalysisResult on success, or a non-OK status otherwise.
92 absl::StatusOr<VisionAnalysisResult> AnalyzeScreenshot(
93 const std::filesystem::path& screenshot_path,
94 const std::string& context = "");
95
// Verify an action was successful by comparing before/after screenshots.
//
// action:            the action whose effect is being checked.
// before_screenshot: path of the screenshot taken before the action.
// after_screenshot:  path of the screenshot taken after the action.
// Returns a VisionAnalysisResult on success, or a non-OK status otherwise.
// NOTE(review): presumably the verdict is reported through the result's
// action_successful / error_message fields - confirm against the
// implementation.
104 absl::StatusOr<VisionAnalysisResult> VerifyAction(
105 const AIAction& action, const std::filesystem::path& before_screenshot,
106 const std::filesystem::path& after_screenshot);
107
// Refine an action based on vision analysis feedback.
//
// original_action: the action to be refined.
// analysis:        a previously obtained vision analysis result.
// Returns an ActionRefinement on success, or a non-OK status otherwise.
114 absl::StatusOr<ActionRefinement> RefineAction(
115 const AIAction& original_action, const VisionAnalysisResult& analysis);
116
// Find a specific UI element in a screenshot.
//
// screenshot_path: filesystem path of the screenshot to search.
// element_name:    name of the UI element to look for.
// Returns a string-to-string map on success, or a non-OK status otherwise.
// NOTE(review): the key/value schema of the returned map is not visible in
// this header - document it here once confirmed against the implementation.
123 absl::StatusOr<std::map<std::string, std::string>> LocateUIElement(
124 const std::filesystem::path& screenshot_path,
125 const std::string& element_name);
126
// Extract all visible widgets from a screenshot.
//
// screenshot_path: filesystem path of the screenshot to inspect.
// Returns a list of strings (one per widget) on success, or a non-OK status
// otherwise.
// NOTE(review): the format of each string is not visible in this header.
132 absl::StatusOr<std::vector<std::string>> ExtractVisibleWidgets(
133 const std::filesystem::path& screenshot_path);
134
135 private:
137
138 // Build prompts for different vision analysis tasks
// Each helper returns the prompt text as a std::string.
// NOTE(review): the pairing below is inferred from the names only -
// presumably Analysis -> AnalyzeScreenshot, Verification -> VerifyAction,
// ElementLocation -> LocateUIElement, WidgetExtraction ->
// ExtractVisibleWidgets. Confirm against the implementation.
139 std::string BuildAnalysisPrompt(const std::string& context);
140 std::string BuildVerificationPrompt(const AIAction& action);
141 std::string BuildElementLocationPrompt(const std::string& element_name);
142 std::string BuildWidgetExtractionPrompt();
143
144 // Parse Gemini vision responses
// Both helpers take the response as a string and return a
// VisionAnalysisResult by value (not a StatusOr), so parse failures cannot be
// reported as a status from here.
// NOTE(review): how malformed responses are surfaced is not visible in this
// header - presumably via the result's error_message field; confirm.
145 VisionAnalysisResult ParseAnalysisResponse(const std::string& response);
146 VisionAnalysisResult ParseVerificationResponse(const std::string& response,
147 const AIAction& action);
148};
149
150} // namespace ai
151} // namespace cli
152} // namespace yaze
153
154#endif // YAZE_CLI_SERVICE_AI_VISION_ACTION_REFINER_H_
Uses Gemini Vision to analyze GUI screenshots and refine AI actions.
VisionActionRefiner(GeminiAIService *gemini_service)
Construct refiner with Gemini service.
VisionAnalysisResult ParseAnalysisResponse(const std::string &response)
absl::StatusOr< VisionAnalysisResult > VerifyAction(const AIAction &action, const std::filesystem::path &before_screenshot, const std::filesystem::path &after_screenshot)
Verify an action was successful by comparing before/after screenshots.
std::string BuildVerificationPrompt(const AIAction &action)
absl::StatusOr< std::vector< std::string > > ExtractVisibleWidgets(const std::filesystem::path &screenshot_path)
Extract all visible widgets from a screenshot.
absl::StatusOr< ActionRefinement > RefineAction(const AIAction &original_action, const VisionAnalysisResult &analysis)
Refine an action based on vision analysis feedback.
absl::StatusOr< std::map< std::string, std::string > > LocateUIElement(const std::filesystem::path &screenshot_path, const std::string &element_name)
Find a specific UI element in a screenshot.
std::string BuildAnalysisPrompt(const std::string &context)
std::string BuildElementLocationPrompt(const std::string &element_name)
absl::StatusOr< VisionAnalysisResult > AnalyzeScreenshot(const std::filesystem::path &screenshot_path, const std::string &context="")
Analyze the current GUI state from a screenshot.
VisionAnalysisResult ParseVerificationResponse(const std::string &response, const AIAction &action)
Represents a single action to be performed in the GUI.
Refined action parameters based on vision analysis.
std::map< std::string, std::string > adjusted_parameters
Result of analyzing a screenshot with Gemini Vision.