yaze 0.3.2
Link to the Past ROM Editor

gemini_ai_service.cc

#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <mutex>
#include <sstream>
#include <string>
#include <vector>

#include "absl/strings/ascii.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_split.h"
#include "absl/strings/strip.h"
#include "absl/time/clock.h"
#include "util/platform_paths.h"

#ifdef YAZE_WITH_JSON
#include <filesystem>

#include "httplib.h"
#include "nlohmann/json.hpp"

// OpenSSL initialization for HTTPS support
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
#include <openssl/ssl.h>
#include <openssl/err.h>
#include <openssl/crypto.h>

// Global flag to track OpenSSL initialization
static std::atomic<bool> g_openssl_initialized{false};
static std::mutex g_openssl_init_mutex;

static void InitializeOpenSSL() {
  std::lock_guard<std::mutex> lock(g_openssl_init_mutex);
  if (!g_openssl_initialized.exchange(true)) {
    OPENSSL_init_ssl(OPENSSL_INIT_LOAD_SSL_STRINGS | OPENSSL_INIT_LOAD_CRYPTO_STRINGS,
                     nullptr);
    std::cerr << "✓ OpenSSL initialized for HTTPS support" << std::endl;
  }
}
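
// Note: the mutex plus atomic exchange above is a belt-and-braces once-guard.
// A sketch of the same intent with std::call_once (hypothetical alternative,
// not what this file uses):
//
//   static std::once_flag g_openssl_once;
//   std::call_once(g_openssl_once, [] {
//     OPENSSL_init_ssl(OPENSSL_INIT_LOAD_SSL_STRINGS |
//                      OPENSSL_INIT_LOAD_CRYPTO_STRINGS, nullptr);
//   });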
#endif  // CPPHTTPLIB_OPENSSL_SUPPORT
#endif  // YAZE_WITH_JSON

namespace yaze {
namespace cli {

GeminiAIService::GeminiAIService(const GeminiConfig& config)
    : function_calling_enabled_(config.use_function_calling), config_(config) {
  if (config_.verbose) {
    std::cerr << "[DEBUG] Initializing Gemini service..." << std::endl;
    std::cerr << "[DEBUG] Function calling: "
              << (function_calling_enabled_ ? "enabled" : "disabled") << std::endl;
    std::cerr << "[DEBUG] Prompt version: " << config_.prompt_version << std::endl;
  }

#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
  // Initialize OpenSSL for HTTPS support
  InitializeOpenSSL();
  if (config_.verbose) {
    std::cerr << "[DEBUG] OpenSSL initialized for HTTPS" << std::endl;
  }
#endif

  // Load command documentation into prompt builder with specified version
  std::string catalogue_path = config_.prompt_version == "v2"
                                   ? "assets/agent/prompt_catalogue_v2.yaml"
                                   : "assets/agent/prompt_catalogue.yaml";
  if (auto status = prompt_builder_.LoadResourceCatalogue(catalogue_path); !status.ok()) {
    std::cerr << "⚠️ Failed to load agent prompt catalogue: "
              << status.message() << std::endl;
  }

  if (config_.verbose) {
    std::cerr << "[DEBUG] Loaded prompt catalogue" << std::endl;
  }

  if (config_.system_instruction.empty()) {
    if (config_.verbose) {
      std::cerr << "[DEBUG] Building system instruction..." << std::endl;
    }

    // Try to load version-specific system prompt file using FindAsset
    std::string prompt_file;
    if (config_.prompt_version == "v3") {
      prompt_file = "agent/system_prompt_v3.txt";
    } else if (config_.prompt_version == "v2") {
      prompt_file = "agent/system_prompt_v2.txt";
    } else {
      prompt_file = "agent/system_prompt.txt";
    }

    auto prompt_path = util::PlatformPaths::FindAsset(prompt_file);
    bool loaded = false;

    if (prompt_path.ok()) {
      std::ifstream file(prompt_path->string());
      if (file.good()) {
        std::stringstream buffer;
        buffer << file.rdbuf();
        config_.system_instruction = buffer.str();
        if (config_.verbose) {
          std::cerr << "[DEBUG] Loaded prompt: " << prompt_path->string() << std::endl;
        }
        loaded = true;
      }
    }

    if (!loaded) {
      // Fallback to builder
      config_.system_instruction = BuildSystemInstructionWithExamples();
      if (config_.verbose) {
        std::cerr << "[DEBUG] Using builder-generated system instruction" << std::endl;
      }
    }
  }

  if (config_.verbose) {
    std::cerr << "[DEBUG] Gemini service initialized" << std::endl;
  }
}
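
// Typical construction (sketch; the config fields are those referenced in
// this file, values illustrative):
//
//   GeminiConfig config;
//   config.api_key = std::getenv("GEMINI_API_KEY") ? std::getenv("GEMINI_API_KEY") : "";
//   config.model = "gemini-2.5-flash";
//   config.prompt_version = "v3";
//   config.use_function_calling = true;
//   GeminiAIService service(config);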

void GeminiAIService::EnableFunctionCalling(bool enable) {
  function_calling_enabled_ = enable;
}

std::vector<std::string> GeminiAIService::GetAvailableTools() const {
  return {
      "resource-list",
      "resource-search",
      "dungeon-list-sprites",
      "dungeon-describe-room",
      "overworld-find-tile",
      "overworld-describe-map",
      "overworld-list-warps"
  };
}

std::string GeminiAIService::BuildFunctionCallSchemas() const {
#ifndef YAZE_WITH_JSON
  return "{}";  // Empty object if JSON not available
#else
  // Use the prompt builder's schema generation which reads from prompt_catalogue.yaml
  std::string schemas = prompt_builder_.BuildFunctionCallSchemas();
  if (!schemas.empty() && schemas != "[]") {
    return schemas;
  }

  // Fallback: Search for function_schemas.json using FindAsset
  auto schema_path_or = util::PlatformPaths::FindAsset("agent/function_schemas.json");

  if (!schema_path_or.ok()) {
    if (config_.verbose) {
      std::cerr << "⚠️ Function schemas file not found: "
                << schema_path_or.status().message() << std::endl;
    }
    return "[]";  // Return empty array as fallback
  }

  // Load and parse the JSON file
  std::ifstream file(schema_path_or->string());
  if (!file.is_open()) {
    std::cerr << "⚠️ Failed to open function schemas file: "
              << schema_path_or->string() << std::endl;
    return "[]";
  }

  try {
    nlohmann::json schemas_json;
    file >> schemas_json;
    return schemas_json.dump();
  } catch (const nlohmann::json::exception& e) {
    std::cerr << "⚠️ Failed to parse function schemas JSON: "
              << e.what() << std::endl;
    return "[]";
  }
#endif
}
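
// The schema string produced above ends up in the request's "tools" field.
// A minimal function_declarations entry looks like this (sketch; the tool
// names come from GetAvailableTools, the parameter details are illustrative):
//
//   [{"name": "resource-search",
//     "description": "Search ROM resources",
//     "parameters": {"type": "object",
//                    "properties": {"query": {"type": "string"}}}}]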

std::string GeminiAIService::BuildSystemInstructionWithExamples() {
  // Fallback prompt if enhanced prompting is disabled
  // Use PromptBuilder's basic system instruction
  return prompt_builder_.BuildSystemInstruction();
}

void GeminiAIService::SetRomContext(Rom* rom) {
  rom_ = rom;
}

absl::Status GeminiAIService::CheckAvailability() {
#ifndef YAZE_WITH_JSON
  return absl::UnimplementedError(
      "Gemini AI service requires JSON support. Build with -DYAZE_WITH_JSON=ON");
#else
  try {
    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: start" << std::endl;
    }

    if (config_.api_key.empty()) {
      return absl::FailedPreconditionError(
          "❌ Gemini API key not configured\n"
          "   Set GEMINI_API_KEY environment variable\n"
          "   Get your API key at: https://makersuite.google.com/app/apikey");
    }

    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: creating HTTPS client" << std::endl;
    }
    // Test API connectivity with a simple request
    httplib::Client cli("https://generativelanguage.googleapis.com");
    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: client created" << std::endl;
    }

    cli.set_connection_timeout(5, 0);  // 5-second connection timeout

    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: building endpoint" << std::endl;
    }
    std::string test_endpoint = "/v1beta/models/" + config_.model;
    httplib::Headers headers = {
        {"x-goog-api-key", config_.api_key},
    };

    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: making request to " << test_endpoint << std::endl;
    }
    auto res = cli.Get(test_endpoint.c_str(), headers);

    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: got response" << std::endl;
    }

    if (!res) {
      return absl::UnavailableError(
          "❌ Cannot reach Gemini API\n"
          "   Check your internet connection");
    }

    if (res->status == 401 || res->status == 403) {
      return absl::PermissionDeniedError(
          "❌ Invalid Gemini API key\n"
          "   Verify your key at: https://makersuite.google.com/app/apikey");
    }

    if (res->status == 404) {
      return absl::NotFoundError(
          absl::StrCat("❌ Model '", config_.model, "' not found\n",
                       "   Try: gemini-2.5-flash or gemini-1.5-pro"));
    }

    if (res->status != 200) {
      return absl::InternalError(
          absl::StrCat("❌ Gemini API error: ", res->status, "\n   ", res->body));
    }

    return absl::OkStatus();
  } catch (const std::exception& e) {
    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: EXCEPTION: " << e.what() << std::endl;
    }
    return absl::InternalError(absl::StrCat("Exception during availability check: ", e.what()));
  } catch (...) {
    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: UNKNOWN EXCEPTION" << std::endl;
    }
    return absl::InternalError("Unknown exception during availability check");
  }
#endif
}

absl::StatusOr<AgentResponse> GeminiAIService::GenerateResponse(
    const std::string& prompt) {
  return GenerateResponse({{{agent::ChatMessage::Sender::kUser, prompt, absl::Now()}}});
}
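
// Convenience overload: wraps a single prompt in a one-message history.
// Usage sketch (AgentResponse fields as consumed elsewhere in this file):
//
//   auto response = service.GenerateResponse("List sprites in room 0x12");
//   if (response.ok()) {
//     // response->text_response, response->commands, response->tool_calls
//   }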

absl::StatusOr<AgentResponse> GeminiAIService::GenerateResponse(
    const std::vector<agent::ChatMessage>& history) {
#ifndef YAZE_WITH_JSON
  return absl::UnimplementedError(
      "Gemini AI service requires JSON support. Build with -DYAZE_WITH_JSON=ON");
#else
  if (history.empty()) {
    return absl::InvalidArgumentError("History cannot be empty.");
  }

  // Build a structured conversation history for better context
  // Gemini supports multi-turn conversations via the contents array
  std::string prompt = prompt_builder_.BuildPromptFromHistory(history);

  // Skip availability check - causes segfault with current SSL setup
  // TODO: Fix SSL/TLS initialization issue
  // if (auto status = CheckAvailability(); !status.ok()) {
  //   return status;
  // }

  if (config_.api_key.empty()) {
    return absl::FailedPreconditionError("Gemini API key not configured");
  }

  try {
    if (config_.verbose) {
      std::cerr << "[DEBUG] Using curl for HTTPS request" << std::endl;
      std::cerr << "[DEBUG] Processing " << history.size() << " messages in history" << std::endl;
    }

    // Build conversation history for multi-turn context
    // Gemini supports alternating user/model messages for better context
    nlohmann::json contents = nlohmann::json::array();

    // Add conversation history (up to last 10 messages for context window)
    size_t start_idx = history.size() > 10 ? history.size() - 10 : 0;
    for (size_t i = start_idx; i < history.size(); ++i) {
      const auto& msg = history[i];
      std::string role = (msg.sender == agent::ChatMessage::Sender::kUser) ? "user" : "model";

      nlohmann::json message = {
          {"role", role},
          {"parts", {{
              {"text", msg.message}
          }}}
      };
      contents.push_back(message);
    }

    // If the last message is from the model, we need to ensure the
    // conversation ends with a user message for Gemini
    if (!history.empty() &&
        history.back().sender == agent::ChatMessage::Sender::kAgent) {
      // Add a continuation prompt
      nlohmann::json user_continuation = {
          {"role", "user"},
          {"parts", {{
              {"text", "Please continue or clarify your response."}
          }}}
      };
      contents.push_back(user_continuation);
    }

    // Build request with proper Gemini API v1beta format
    nlohmann::json request_body = {
        {"system_instruction", {
            {"parts", {
                {{"text", config_.system_instruction}}
            }}
        }},
        {"contents", contents},
        {"generationConfig", {
            {"temperature", config_.temperature},
            {"maxOutputTokens", config_.max_output_tokens}
        }}
    };
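
    // For reference, request_body serializes to the v1beta generateContent
    // shape (values illustrative):
    //
    //   {"system_instruction": {"parts": [{"text": "..."}]},
    //    "contents": [{"role": "user", "parts": [{"text": "..."}]}],
    //    "generationConfig": {"temperature": 0.7, "maxOutputTokens": 2048}}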

    if (config_.verbose) {
      std::cerr << "[DEBUG] Sending " << contents.size() << " conversation turns to Gemini" << std::endl;
    }

    // Only add responseMimeType if NOT using function calling
    // (Gemini doesn't support both at the same time)
    if (!function_calling_enabled_) {
      request_body["generationConfig"]["responseMimeType"] = "application/json";
    }

    // Add function calling tools if enabled
    if (function_calling_enabled_) {
      try {
        std::string schemas_str = BuildFunctionCallSchemas();
        if (config_.verbose) {
          std::cerr << "[DEBUG] Function calling schemas: " << schemas_str.substr(0, 200) << "..." << std::endl;
        }

        nlohmann::json schemas = nlohmann::json::parse(schemas_str);

        // Build tools array - schemas might be an array of tools or a
        // function_declarations object
        if (schemas.is_array()) {
          // If it's already an array of tools, use it directly
          request_body["tools"] = {{
              {"function_declarations", schemas}
          }};
        } else if (schemas.is_object() && schemas.contains("function_declarations")) {
          // If it's a wrapper object with function_declarations
          request_body["tools"] = {{
              {"function_declarations", schemas["function_declarations"]}
          }};
        } else {
          // Treat as single tool object
          request_body["tools"] = {{
              {"function_declarations", nlohmann::json::array({schemas})}
          }};
        }
      } catch (const nlohmann::json::exception& e) {
        std::cerr << "⚠️ Failed to parse function schemas: " << e.what() << std::endl;
      }
    }

    // Write request body to a temp file (use the platform temp directory so
    // this also works on Windows, where /tmp does not exist)
    std::string temp_file =
        (std::filesystem::temp_directory_path() / "gemini_request.json").string();
    std::ofstream out(temp_file);
    out << request_body.dump();
    out.close();

    // Use curl to make the request (avoiding httplib SSL issues)
    std::string endpoint = "https://generativelanguage.googleapis.com/v1beta/models/" +
                           config_.model + ":generateContent";
    std::string curl_cmd = "curl -s -X POST '" + endpoint + "' "
                           "-H 'Content-Type: application/json' "
                           "-H 'x-goog-api-key: " + config_.api_key + "' "
                           "-d @" + temp_file + " 2>&1";

    if (config_.verbose) {
      std::cerr << "[DEBUG] Executing API request..." << std::endl;
    }

#ifdef _WIN32
    FILE* pipe = _popen(curl_cmd.c_str(), "r");
#else
    FILE* pipe = popen(curl_cmd.c_str(), "r");
#endif
    if (!pipe) {
      return absl::InternalError("Failed to execute curl command");
    }

    std::string response_str;
    char buffer[4096];
    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
      response_str += buffer;
    }

#ifdef _WIN32
    int status = _pclose(pipe);
#else
    int status = pclose(pipe);
#endif
    std::remove(temp_file.c_str());

    if (status != 0) {
      return absl::InternalError(absl::StrCat("Curl failed with status ", status));
    }

    if (response_str.empty()) {
      return absl::InternalError("Empty response from Gemini API");
    }

    // Debug: print response
    if (config_.verbose) {
      std::cout << "\n" << "\033[35m" << "🔍 Raw Gemini API Response:" << "\033[0m" << "\n"
                << "\033[2m" << response_str.substr(0, 500) << "\033[0m" << "\n\n";
    }

    if (config_.verbose) {
      std::cerr << "[DEBUG] Parsing response..." << std::endl;
    }
    return ParseGeminiResponse(response_str);

  } catch (const std::exception& e) {
    if (config_.verbose) {
      std::cerr << "[ERROR] Exception: " << e.what() << std::endl;
    }
    return absl::InternalError(absl::StrCat("Exception during generation: ", e.what()));
  } catch (...) {
    if (config_.verbose) {
      std::cerr << "[ERROR] Unknown exception" << std::endl;
    }
    return absl::InternalError("Unknown exception during generation");
  }
#endif
}

absl::StatusOr<AgentResponse> GeminiAIService::ParseGeminiResponse(
    const std::string& response_body) {
#ifndef YAZE_WITH_JSON
  return absl::UnimplementedError("JSON support required");
#else
  AgentResponse agent_response;

  auto response_json = nlohmann::json::parse(response_body, nullptr, false);
  if (response_json.is_discarded()) {
    return absl::InternalError("❌ Failed to parse Gemini response JSON");
  }

  // Navigate Gemini's response structure
  if (!response_json.contains("candidates") ||
      response_json["candidates"].empty()) {
    return absl::InternalError("❌ No candidates in Gemini response");
  }

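  // The loop below walks Gemini's response envelope, which looks like:
  //   {"candidates": [{"content": {"parts": [
  //       {"text": "..."}                                    // plain text
  //       or {"functionCall": {"name": "...", "args": {...}}} // native tool call
  //   ]}}]}
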
  for (const auto& candidate : response_json["candidates"]) {
    if (!candidate.contains("content") ||
        !candidate["content"].contains("parts")) {
      continue;
    }

    for (const auto& part : candidate["content"]["parts"]) {
      if (part.contains("text")) {
        std::string text_content = part["text"].get<std::string>();

        // Debug: Print raw LLM output when verbose mode is enabled
        if (config_.verbose) {
          std::cout << "\n" << "\033[35m" << "🔍 Raw LLM Response:" << "\033[0m" << "\n"
                    << "\033[2m" << text_content << "\033[0m" << "\n\n";
        }

        // Strip markdown code blocks if present (```json ... ```)
        text_content = std::string(absl::StripAsciiWhitespace(text_content));
        if (absl::StartsWith(text_content, "```json")) {
          text_content = text_content.substr(7);  // Remove ```json
        } else if (absl::StartsWith(text_content, "```")) {
          text_content = text_content.substr(3);  // Remove ```
        }
        if (absl::EndsWith(text_content, "```")) {
          text_content = text_content.substr(0, text_content.length() - 3);
        }
        text_content = std::string(absl::StripAsciiWhitespace(text_content));
512
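        // When the model follows the system prompt, text_content should now
        // be a bare JSON object along these lines (sketch):
        //   {"text_response": "...", "reasoning": "...",
        //    "commands": ["palette export ..."],
        //    "tool_calls": [{"tool_name": "resource-search",
        //                    "args": {"query": "..."}}]}
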
        // Try to parse as JSON object
        auto parsed_text = nlohmann::json::parse(text_content, nullptr, false);
        if (!parsed_text.is_discarded()) {
          // Extract text_response
          if (parsed_text.contains("text_response") &&
              parsed_text["text_response"].is_string()) {
            agent_response.text_response =
                parsed_text["text_response"].get<std::string>();
          }

          // Extract reasoning
          if (parsed_text.contains("reasoning") &&
              parsed_text["reasoning"].is_string()) {
            agent_response.reasoning =
                parsed_text["reasoning"].get<std::string>();
          }

          // Extract commands
          if (parsed_text.contains("commands") &&
              parsed_text["commands"].is_array()) {
            for (const auto& cmd : parsed_text["commands"]) {
              if (cmd.is_string()) {
                std::string command = cmd.get<std::string>();
                if (absl::StartsWith(command, "z3ed ")) {
                  command = command.substr(5);
                }
                agent_response.commands.push_back(command);
              }
            }
          }

          // Extract tool_calls from the parsed JSON
          if (parsed_text.contains("tool_calls") &&
              parsed_text["tool_calls"].is_array()) {
            for (const auto& call : parsed_text["tool_calls"]) {
              if (call.contains("tool_name") && call["tool_name"].is_string()) {
                ToolCall tool_call;
                tool_call.tool_name = call["tool_name"].get<std::string>();

                if (call.contains("args") && call["args"].is_object()) {
                  for (auto& [key, value] : call["args"].items()) {
                    if (value.is_string()) {
                      tool_call.args[key] = value.get<std::string>();
                    } else if (value.is_number()) {
                      tool_call.args[key] = std::to_string(value.get<double>());
                    } else if (value.is_boolean()) {
                      tool_call.args[key] = value.get<bool>() ? "true" : "false";
                    }
                  }
                }
                agent_response.tool_calls.push_back(tool_call);
              }
            }
          }
        } else {
          // If parsing the full object fails, fall back to extracting
          // commands from the plain text, line by line
          std::vector<std::string> lines = absl::StrSplit(text_content, '\n');
          for (const auto& line : lines) {
            std::string trimmed = std::string(absl::StripAsciiWhitespace(line));
            if (!trimmed.empty() &&
                (absl::StartsWith(trimmed, "z3ed ") ||
                 absl::StartsWith(trimmed, "palette ") ||
                 absl::StartsWith(trimmed, "overworld ") ||
                 absl::StartsWith(trimmed, "sprite ") ||
                 absl::StartsWith(trimmed, "dungeon "))) {
              if (absl::StartsWith(trimmed, "z3ed ")) {
                trimmed = trimmed.substr(5);
              }
              agent_response.commands.push_back(trimmed);
            }
          }
        }
      } else if (part.contains("functionCall")) {
        const auto& call = part["functionCall"];
        if (call.contains("name") && call["name"].is_string()) {
          ToolCall tool_call;
          tool_call.tool_name = call["name"].get<std::string>();
          if (call.contains("args") && call["args"].is_object()) {
            for (auto& [key, value] : call["args"].items()) {
              if (value.is_string()) {
                tool_call.args[key] = value.get<std::string>();
              } else if (value.is_number()) {
                tool_call.args[key] = std::to_string(value.get<double>());
              }
            }
          }
          agent_response.tool_calls.push_back(tool_call);
        }
      }
    }
  }

  if (agent_response.text_response.empty() &&
      agent_response.commands.empty() &&
      agent_response.tool_calls.empty()) {
    return absl::InternalError(
        "❌ No valid response extracted from Gemini\n"
        "   Expected at least one of: text_response, commands, or tool_calls\n"
        "   Raw response: " + response_body);
  }

  return agent_response;
#endif
}

absl::StatusOr<std::string> GeminiAIService::EncodeImageToBase64(
    const std::string& image_path) const {
#ifndef YAZE_WITH_JSON
  (void)image_path;  // Suppress unused parameter warning
  return absl::UnimplementedError(
      "Gemini AI service requires JSON support. Build with -DYAZE_WITH_JSON=ON");
#else
  std::ifstream file(image_path, std::ios::binary);
  if (!file.is_open()) {
    return absl::NotFoundError(
        absl::StrCat("Failed to open image file: ", image_path));
  }

  // Read file into buffer
  file.seekg(0, std::ios::end);
  size_t size = file.tellg();
  file.seekg(0, std::ios::beg);

  std::vector<unsigned char> buffer(size);
  if (!file.read(reinterpret_cast<char*>(buffer.data()), size)) {
    return absl::InternalError("Failed to read image file");
  }

  // Base64 encode
  static const char* base64_chars =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

  std::string encoded;
  encoded.reserve(((size + 2) / 3) * 4);

  int i = 0;
  int j = 0;
  unsigned char char_array_3[3];
  unsigned char char_array_4[4];

  for (size_t idx = 0; idx < size; idx++) {
    char_array_3[i++] = buffer[idx];
    if (i == 3) {
      char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
      char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
      char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);
      char_array_4[3] = char_array_3[2] & 0x3f;

      for (i = 0; i < 4; i++)
        encoded += base64_chars[char_array_4[i]];
      i = 0;
    }
  }

  if (i) {
    for (j = i; j < 3; j++)
      char_array_3[j] = '\0';

    char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
    char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
    char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);

    for (j = 0; j < i + 1; j++)
      encoded += base64_chars[char_array_4[j]];

    while (i++ < 3)
      encoded += '=';
  }

  return encoded;
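
  // Worked example of the packing above: bytes "Man" (0x4D 0x61 0x6E) repack
  // into 6-bit indices 19, 22, 5, 46 -> "TWFu"; a 2-byte tail like "Ma"
  // yields "TWE=" with one '=' pad, and a 1-byte tail yields two pads.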
#endif
}

absl::StatusOr<AgentResponse> GeminiAIService::GenerateMultimodalResponse(
    const std::string& image_path, const std::string& prompt) {
#ifndef YAZE_WITH_JSON
  (void)image_path;  // Suppress unused parameter warnings
  (void)prompt;
  return absl::UnimplementedError(
      "Gemini AI service requires JSON support. Build with -DYAZE_WITH_JSON=ON");
#else
  if (config_.api_key.empty()) {
    return absl::FailedPreconditionError("Gemini API key not configured");
  }

  // Determine MIME type from file extension
  std::string mime_type = "image/png";
  if (image_path.ends_with(".jpg") || image_path.ends_with(".jpeg")) {
    mime_type = "image/jpeg";
  } else if (image_path.ends_with(".bmp")) {
    mime_type = "image/bmp";
  } else if (image_path.ends_with(".webp")) {
    mime_type = "image/webp";
  }

  // Encode image to base64
  auto encoded_or = EncodeImageToBase64(image_path);
  if (!encoded_or.ok()) {
    return encoded_or.status();
  }
  std::string encoded_image = std::move(encoded_or.value());

  try {
    if (config_.verbose) {
      std::cerr << "[DEBUG] Preparing multimodal request with image" << std::endl;
    }

    // Build multimodal request with image and text
    nlohmann::json request_body = {
        {"contents", {{
            {"parts", {
                {
                    {"inline_data", {
                        {"mime_type", mime_type},
                        {"data", encoded_image}
                    }}
                },
                {{"text", prompt}}
            }}
        }}},
        {"generationConfig", {
            {"temperature", config_.temperature},
            {"maxOutputTokens", config_.max_output_tokens}
        }}
    };

    // Write request body to a temp file (portable temp directory, as above)
    std::string temp_file =
        (std::filesystem::temp_directory_path() / "gemini_multimodal_request.json").string();
    std::ofstream out(temp_file);
    out << request_body.dump();
    out.close();

    // Use curl to make the request
    std::string endpoint = "https://generativelanguage.googleapis.com/v1beta/models/" +
                           config_.model + ":generateContent";
    std::string curl_cmd = "curl -s -X POST '" + endpoint + "' "
                           "-H 'Content-Type: application/json' "
                           "-H 'x-goog-api-key: " + config_.api_key + "' "
                           "-d @" + temp_file + " 2>&1";

    if (config_.verbose) {
      std::cerr << "[DEBUG] Executing multimodal API request..." << std::endl;
    }

#ifdef _WIN32
    FILE* pipe = _popen(curl_cmd.c_str(), "r");
#else
    FILE* pipe = popen(curl_cmd.c_str(), "r");
#endif
    if (!pipe) {
      return absl::InternalError("Failed to execute curl command");
    }

    std::string response_str;
    char buffer[4096];
    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
      response_str += buffer;
    }

#ifdef _WIN32
    int status = _pclose(pipe);
#else
    int status = pclose(pipe);
#endif
    std::remove(temp_file.c_str());

    if (status != 0) {
      return absl::InternalError(absl::StrCat("Curl failed with status ", status));
    }

    if (response_str.empty()) {
      return absl::InternalError("Empty response from Gemini API");
    }

    if (config_.verbose) {
      std::cout << "\n" << "\033[35m" << "🔍 Raw Gemini Multimodal Response:" << "\033[0m" << "\n"
                << "\033[2m" << response_str.substr(0, 500) << "\033[0m" << "\n\n";
    }

    return ParseGeminiResponse(response_str);

  } catch (const std::exception& e) {
    if (config_.verbose) {
      std::cerr << "[ERROR] Exception: " << e.what() << std::endl;
    }
    return absl::InternalError(absl::StrCat("Exception during multimodal generation: ", e.what()));
  }
#endif
}

}  // namespace cli
}  // namespace yaze