yaze 0.3.2
Link to the Past ROM Editor
 
Loading...
Searching...
No Matches
vision_action_refiner.h
Go to the documentation of this file.
1#ifndef YAZE_CLI_SERVICE_AI_VISION_ACTION_REFINER_H_
2#define YAZE_CLI_SERVICE_AI_VISION_ACTION_REFINER_H_
3
4#include <filesystem>
5#include <string>
6#include <vector>
7
8#include "absl/status/statusor.h"
10
11namespace yaze {
12namespace cli {
13
14// Forward declaration of GeminiAIService; this header refers to it only by pointer.
15class GeminiAIService;
16
17namespace ai {
18
24 std::string description; // What Gemini sees in the image
25 std::vector<std::string> widgets; // Detected UI widgets
26 std::vector<std::string> suggestions; // Action suggestions
27 bool action_successful = false; // Whether the last action succeeded
28 std::string error_message; // Error description if action failed
29};
30
36 bool needs_retry = false;
38 std::map<std::string, std::string> adjusted_parameters;
39 std::string reasoning;
40};
41
79 public:
84 explicit VisionActionRefiner(GeminiAIService* gemini_service);
85
92 absl::StatusOr<VisionAnalysisResult> AnalyzeScreenshot(
93 const std::filesystem::path& screenshot_path,
94 const std::string& context = "");
95
103 absl::StatusOr<VisionAnalysisResult> VerifyAction(
104 const AIAction& action,
105 const std::filesystem::path& before_screenshot,
106 const std::filesystem::path& after_screenshot);
107
114 absl::StatusOr<ActionRefinement> RefineAction(
115 const AIAction& original_action,
116 const VisionAnalysisResult& analysis);
117
124 absl::StatusOr<std::map<std::string, std::string>> LocateUIElement(
125 const std::filesystem::path& screenshot_path,
126 const std::string& element_name);
127
133 absl::StatusOr<std::vector<std::string>> ExtractVisibleWidgets(
134 const std::filesystem::path& screenshot_path);
135
136 private:
138
139 // Build prompts for different vision analysis tasks
140 std::string BuildAnalysisPrompt(const std::string& context);
141 std::string BuildVerificationPrompt(const AIAction& action);
142 std::string BuildElementLocationPrompt(const std::string& element_name);
143 std::string BuildWidgetExtractionPrompt();
144
145 // Parse Gemini Vision responses
146 VisionAnalysisResult ParseAnalysisResponse(const std::string& response);
148 const std::string& response, const AIAction& action);
149};
150
151} // namespace ai
152} // namespace cli
153} // namespace yaze
154
155#endif // YAZE_CLI_SERVICE_AI_VISION_ACTION_REFINER_H_
Uses Gemini Vision to analyze GUI screenshots and refine AI actions.
VisionAnalysisResult ParseAnalysisResponse(const std::string &response)
absl::StatusOr< VisionAnalysisResult > VerifyAction(const AIAction &action, const std::filesystem::path &before_screenshot, const std::filesystem::path &after_screenshot)
Verify an action was successful by comparing before/after screenshots.
std::string BuildVerificationPrompt(const AIAction &action)
absl::StatusOr< std::vector< std::string > > ExtractVisibleWidgets(const std::filesystem::path &screenshot_path)
Extract all visible widgets from a screenshot.
absl::StatusOr< ActionRefinement > RefineAction(const AIAction &original_action, const VisionAnalysisResult &analysis)
Refine an action based on vision analysis feedback.
absl::StatusOr< std::map< std::string, std::string > > LocateUIElement(const std::filesystem::path &screenshot_path, const std::string &element_name)
Find a specific UI element in a screenshot.
std::string BuildAnalysisPrompt(const std::string &context)
std::string BuildElementLocationPrompt(const std::string &element_name)
absl::StatusOr< VisionAnalysisResult > AnalyzeScreenshot(const std::filesystem::path &screenshot_path, const std::string &context="")
Analyze the current GUI state from a screenshot.
VisionAnalysisResult ParseVerificationResponse(const std::string &response, const AIAction &action)
Main namespace for the application.
Definition controller.cc:20
Represents a single action to be performed in the GUI.
Refined action parameters based on vision analysis.
std::map< std::string, std::string > adjusted_parameters
Result of analyzing a screenshot with Gemini Vision.