yaze 0.3.2
Link to the Past ROM Editor
 
gemini_ai_service.cc
#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <mutex>
#include <sstream>
#include <string>
#include <vector>

#include "absl/strings/ascii.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_split.h"
#include "absl/strings/strip.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "util/platform_paths.h"

#ifdef YAZE_WITH_JSON
#include <filesystem>

#include "httplib.h"
#include "nlohmann/json.hpp"

// OpenSSL initialization for HTTPS support
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
#include <openssl/crypto.h>
#include <openssl/err.h>
#include <openssl/ssl.h>

// Global flag to track OpenSSL initialization
static std::atomic<bool> g_openssl_initialized{false};
static std::mutex g_openssl_init_mutex;

static void InitializeOpenSSL() {
  std::lock_guard<std::mutex> lock(g_openssl_init_mutex);
  if (!g_openssl_initialized.exchange(true)) {
    OPENSSL_init_ssl(
        OPENSSL_INIT_LOAD_SSL_STRINGS | OPENSSL_INIT_LOAD_CRYPTO_STRINGS,
        nullptr);
    std::cerr << "✓ OpenSSL initialized for HTTPS support" << std::endl;
  }
}
#endif
#endif
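
// The guard above pairs an atomic exchange with a mutex; an equivalent,
// arguably simpler formulation uses std::call_once. A minimal sketch
// (illustrative only, not part of this build):
//
//   static std::once_flag g_openssl_once;
//   static void InitializeOpenSSLOnce() {
//     std::call_once(g_openssl_once, [] {
//       OPENSSL_init_ssl(OPENSSL_INIT_LOAD_SSL_STRINGS |
//                            OPENSSL_INIT_LOAD_CRYPTO_STRINGS,
//                        nullptr);
//     });
//   }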

namespace yaze {
namespace cli {

GeminiAIService::GeminiAIService(const GeminiConfig& config)
    : function_calling_enabled_(config.use_function_calling), config_(config) {
  if (config_.verbose) {
    std::cerr << "[DEBUG] Initializing Gemini service..." << std::endl;
    std::cerr << "[DEBUG] Function calling: "
              << (function_calling_enabled_ ? "enabled" : "disabled")
              << std::endl;
    std::cerr << "[DEBUG] Prompt version: " << config_.prompt_version
              << std::endl;
  }

#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
  // Initialize OpenSSL for HTTPS support
  InitializeOpenSSL();
  if (config_.verbose) {
    std::cerr << "[DEBUG] OpenSSL initialized for HTTPS" << std::endl;
  }
#endif

  // Load command documentation into the prompt builder, honoring the
  // configured prompt version.
  std::string catalogue_path = config_.prompt_version == "v2"
                                   ? "assets/agent/prompt_catalogue_v2.yaml"
                                   : "assets/agent/prompt_catalogue.yaml";
  if (auto status = prompt_builder_.LoadResourceCatalogue(catalogue_path);
      !status.ok()) {
    std::cerr << "⚠️ Failed to load agent prompt catalogue: "
              << status.message() << std::endl;
  }

  if (config_.verbose) {
    std::cerr << "[DEBUG] Loaded prompt catalogue" << std::endl;
  }

  if (config_.system_instruction.empty()) {
    if (config_.verbose) {
      std::cerr << "[DEBUG] Building system instruction..." << std::endl;
    }

    // Try to load a version-specific system prompt file using FindAsset.
    std::string prompt_file;
    if (config_.prompt_version == "v3") {
      prompt_file = "agent/system_prompt_v3.txt";
    } else if (config_.prompt_version == "v2") {
      prompt_file = "agent/system_prompt_v2.txt";
    } else {
      prompt_file = "agent/system_prompt.txt";
    }

    auto prompt_path = util::PlatformPaths::FindAsset(prompt_file);
    bool loaded = false;

    if (prompt_path.ok()) {
      std::ifstream file(prompt_path->string());
      if (file.good()) {
        std::stringstream buffer;
        buffer << file.rdbuf();
        config_.system_instruction = buffer.str();
        if (config_.verbose) {
          std::cerr << "[DEBUG] Loaded prompt: " << prompt_path->string()
                    << std::endl;
        }
        loaded = true;
      }
    }

    if (!loaded) {
      // Fall back to the prompt builder.
      if (config_.use_enhanced_prompting) {
        config_.system_instruction =
            prompt_builder_.BuildSystemInstructionWithExamples();
      } else {
        config_.system_instruction = BuildSystemInstruction();
      }
    }
  }

  if (config_.verbose) {
    std::cerr << "[DEBUG] Gemini service initialized" << std::endl;
  }
}
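
// Usage sketch (illustrative only, not part of this translation unit); field
// names mirror the GeminiConfig members referenced throughout this file:
//
//   GeminiConfig config;
//   if (const char* key = std::getenv("GEMINI_API_KEY")) config.api_key = key;
//   config.model = "gemini-2.5-flash";
//   config.prompt_version = "v3";
//   config.verbose = true;
//   GeminiAIService service(config);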

void GeminiAIService::EnableFunctionCalling(bool enable) {
  function_calling_enabled_ = enable;
}

std::vector<std::string> GeminiAIService::GetAvailableTools() const {
  return {"resource-list",        "resource-search",
          "dungeon-list-sprites", "dungeon-describe-room",
          "overworld-find-tile",  "overworld-describe-map",
          "overworld-list-warps"};
}

std::string GeminiAIService::BuildFunctionCallSchemas() {
#ifndef YAZE_WITH_JSON
  return "{}";  // Empty object if JSON is not available
#else
  // Use the prompt builder's schema generation, which reads from
  // prompt_catalogue.yaml.
  std::string schemas = prompt_builder_.BuildFunctionCallSchemas();
  if (!schemas.empty() && schemas != "[]") {
    return schemas;
  }

  // Fallback: search for function_schemas.json using FindAsset.
  auto schema_path_or =
      util::PlatformPaths::FindAsset("agent/function_schemas.json");

  if (!schema_path_or.ok()) {
    if (config_.verbose) {
      std::cerr << "⚠️ Function schemas file not found: "
                << schema_path_or.status().message() << std::endl;
    }
    return "[]";  // Return an empty array as a fallback
  }

  // Load and parse the JSON file.
  std::ifstream file(schema_path_or->string());
  if (!file.is_open()) {
    std::cerr << "⚠️ Failed to open function schemas file: "
              << schema_path_or->string() << std::endl;
    return "[]";
  }

  try {
    nlohmann::json schemas_json;
    file >> schemas_json;
    return schemas_json.dump();
  } catch (const nlohmann::json::exception& e) {
    std::cerr << "⚠️ Failed to parse function schemas JSON: " << e.what()
              << std::endl;
    return "[]";
  }
#endif
}
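
// For reference, GenerateResponse() below accepts the schema string in any of
// three shapes: a bare array of tool declarations, a wrapper object with a
// "function_declarations" key, or a single tool object. A representative
// payload (field values illustrative):
//
//   [
//     {
//       "name": "dungeon-describe-room",
//       "description": "Describe a dungeon room",
//       "parameters": {
//         "type": "object",
//         "properties": {"room": {"type": "string"}}
//       }
//     }
//   ]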

std::string GeminiAIService::BuildSystemInstruction() {
  // Fallback prompt used when enhanced prompting is disabled; delegates to
  // PromptBuilder's basic system instruction.
  return prompt_builder_.BuildSystemInstruction();
}

void GeminiAIService::SetRomContext(Rom* rom) {
  prompt_builder_.SetRom(rom);
}

absl::StatusOr<std::vector<ModelInfo>> GeminiAIService::ListAvailableModels() {
#ifndef YAZE_WITH_JSON
  return absl::UnimplementedError("Gemini AI service requires JSON support");
#else
  if (config_.api_key.empty()) {
    // Return default known models if the API key is missing.
    std::vector<ModelInfo> defaults = {
        {.name = "gemini-3.0-preview",
         .display_name = "Gemini 3.0 Preview",
         .provider = "gemini",
         .description = "Cutting-edge model, currently in preview"},
        {.name = "gemini-2.5-pro",
         .display_name = "Gemini 2.5 Pro",
         .provider = "gemini",
         .description = "High intelligence for complex tasks"},
        {.name = "gemini-2.5-flash",
         .display_name = "Gemini 2.5 Flash",
         .provider = "gemini",
         .description = "Fastest multimodal model"}};
    return defaults;
  }

  try {
    // Use curl to list models from the API. Note that the API key is passed
    // as a query parameter, so it is visible in the process arguments.
    std::string endpoint =
        "https://generativelanguage.googleapis.com/v1beta/models?key=" +
        config_.api_key;
    std::string curl_cmd = "curl -s -X GET '" + endpoint + "' 2>&1";

    if (config_.verbose) {
      // Truncate the logged command before "key=" so the key is not leaked.
      std::cerr << "[DEBUG] Listing models: "
                << curl_cmd.substr(0, curl_cmd.find("key=")) << "...'"
                << std::endl;
    }

#ifdef _WIN32
    FILE* pipe = _popen(curl_cmd.c_str(), "r");
#else
    FILE* pipe = popen(curl_cmd.c_str(), "r");
#endif
    if (!pipe) {
      return absl::InternalError("Failed to execute curl command");
    }

    std::string response_str;
    char buffer[4096];
    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
      response_str += buffer;
    }

#ifdef _WIN32
    _pclose(pipe);
#else
    pclose(pipe);
#endif

    auto models_json = nlohmann::json::parse(response_str, nullptr, false);
    if (models_json.is_discarded()) {
      return absl::InternalError("Failed to parse Gemini models JSON");
    }

    if (!models_json.contains("models")) {
      // Return defaults on error.
      std::vector<ModelInfo> defaults = {{.name = "gemini-2.5-flash",
                                          .display_name = "Gemini 2.5 Flash",
                                          .provider = "gemini"},
                                         {.name = "gemini-1.5-flash",
                                          .display_name = "Gemini 1.5 Flash",
                                          .provider = "gemini"},
                                         {.name = "gemini-1.5-pro",
                                          .display_name = "Gemini 1.5 Pro",
                                          .provider = "gemini"}};
      return defaults;
    }

    std::vector<ModelInfo> models;
    for (const auto& m : models_json["models"]) {
      std::string name = m.value("name", "");
      // Names arrive as "models/gemini-pro"; strip the prefix.
      if (absl::StartsWith(name, "models/")) {
        name = name.substr(7);
      }

      // Keep only Gemini models.
      if (absl::StartsWith(name, "gemini")) {
        ModelInfo info;
        info.name = name;
        info.display_name = m.value("displayName", name);
        info.provider = "gemini";
        info.description = m.value("description", "");
        info.family = "gemini";
        info.is_local = false;
        models.push_back(std::move(info));
      }
    }
    return models;

  } catch (const std::exception& e) {
    return absl::InternalError(
        absl::StrCat("Failed to list models: ", e.what()));
  }
#endif
}
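
// Usage sketch (illustrative only):
//
//   auto models_or = service.ListAvailableModels();
//   if (models_or.ok()) {
//     for (const auto& model : *models_or) {
//       std::cerr << model.name << " : " << model.display_name << "\n";
//     }
//   }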

absl::Status GeminiAIService::CheckAvailability() {
#ifndef YAZE_WITH_JSON
  return absl::UnimplementedError(
      "Gemini AI service requires JSON support. Build with "
      "-DYAZE_WITH_JSON=ON");
#else
  try {
    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: start" << std::endl;
    }

    if (config_.api_key.empty()) {
      return absl::FailedPreconditionError(
          "❌ Gemini API key not configured\n"
          "   Set GEMINI_API_KEY environment variable\n"
          "   Get your API key at: https://makersuite.google.com/app/apikey");
    }

    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: creating HTTPS client"
                << std::endl;
    }
    // Test API connectivity with a simple request.
    httplib::Client cli("https://generativelanguage.googleapis.com");
    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: client created" << std::endl;
    }

    cli.set_connection_timeout(5, 0);  // 5-second timeout

    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: building endpoint" << std::endl;
    }
    std::string test_endpoint = "/v1beta/models/" + config_.model;
    httplib::Headers headers = {
        {"x-goog-api-key", config_.api_key},
    };

    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: making request to "
                << test_endpoint << std::endl;
    }
    auto res = cli.Get(test_endpoint.c_str(), headers);

    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: got response" << std::endl;
    }

    if (!res) {
      return absl::UnavailableError(
          "❌ Cannot reach Gemini API\n"
          "   Check your internet connection");
    }

    if (res->status == 401 || res->status == 403) {
      return absl::PermissionDeniedError(
          "❌ Invalid Gemini API key\n"
          "   Verify your key at: https://makersuite.google.com/app/apikey");
    }

    if (res->status == 404) {
      return absl::NotFoundError(
          absl::StrCat("❌ Model '", config_.model, "' not found\n",
                       "   Try: gemini-2.5-flash or gemini-1.5-pro"));
    }

    if (res->status != 200) {
      return absl::InternalError(absl::StrCat(
          "❌ Gemini API error: ", res->status, "\n   ", res->body));
    }

    return absl::OkStatus();
  } catch (const std::exception& e) {
    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: EXCEPTION: " << e.what()
                << std::endl;
    }
    return absl::InternalError(
        absl::StrCat("Exception during availability check: ", e.what()));
  } catch (...) {
    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: UNKNOWN EXCEPTION" << std::endl;
    }
    return absl::InternalError("Unknown exception during availability check");
  }
#endif
}
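
// Usage sketch (illustrative only): callers can branch on the canonical absl
// status codes returned above:
//
//   absl::Status status = service.CheckAvailability();
//   if (absl::IsFailedPrecondition(status)) {
//     // API key missing: prompt the user to set GEMINI_API_KEY.
//   } else if (absl::IsPermissionDenied(status)) {
//     // API key rejected by the server.
//   } else if (status.ok()) {
//     // Service is reachable and the model exists.
//   }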

absl::StatusOr<AgentResponse> GeminiAIService::GenerateResponse(
    const std::string& prompt) {
  return GenerateResponse(
      {{{agent::ChatMessage::Sender::kUser, prompt, absl::Now()}}});
}

absl::StatusOr<AgentResponse> GeminiAIService::GenerateResponse(
    const std::vector<agent::ChatMessage>& history) {
#ifndef YAZE_WITH_JSON
  return absl::UnimplementedError(
      "Gemini AI service requires JSON support. Build with "
      "-DYAZE_WITH_JSON=ON");
#else
  if (history.empty()) {
    return absl::InvalidArgumentError("History cannot be empty.");
  }

  // Build a structured conversation history for better context.
  // Gemini supports multi-turn conversations via the contents array.
  // NOTE: the flattened prompt below is currently unused; the contents array
  // is built directly from the history instead.
  std::string prompt = prompt_builder_.BuildPromptFromHistory(history);

  // Skip the availability check - it causes a segfault with the current SSL
  // setup.
  // TODO: Fix SSL/TLS initialization issue.
  // if (auto status = CheckAvailability(); !status.ok()) {
  //   return status;
  // }

  if (config_.api_key.empty()) {
    return absl::FailedPreconditionError("Gemini API key not configured");
  }

  absl::Time request_start = absl::Now();

  try {
    if (config_.verbose) {
      std::cerr << "[DEBUG] Using curl for HTTPS request" << std::endl;
      std::cerr << "[DEBUG] Processing " << history.size()
                << " messages in history" << std::endl;
    }

    // Build conversation history for multi-turn context.
    // Gemini supports alternating user/model messages for better context.
    nlohmann::json contents = nlohmann::json::array();

    // Add conversation history (at most the last 10 messages, to respect the
    // context window).
    size_t start_idx = history.size() > 10 ? history.size() - 10 : 0;
    for (size_t i = start_idx; i < history.size(); ++i) {
      const auto& msg = history[i];
      std::string role =
          (msg.sender == agent::ChatMessage::Sender::kUser) ? "user" : "model";

      nlohmann::json message = {{"role", role},
                                {"parts", {{{"text", msg.message}}}}};
      contents.push_back(message);
    }

    // If the last message is from the model, ensure the conversation ends
    // with a user message, as Gemini requires.
    if (!history.empty() &&
        history.back().sender == agent::ChatMessage::Sender::kAgent) {
      // Add a continuation prompt.
      nlohmann::json user_continuation = {
          {"role", "user"},
          {"parts",
           {{{"text", "Please continue or clarify your response."}}}}};
      contents.push_back(user_continuation);
    }

    // Build the request in the Gemini API v1beta format.
    nlohmann::json request_body = {
        {"system_instruction",
         {{"parts", {{"text", config_.system_instruction}}}}},
        {"contents", contents},
        {"generationConfig",
         {{"temperature", config_.temperature},
          {"maxOutputTokens", config_.max_output_tokens}}}};

    if (config_.verbose) {
      std::cerr << "[DEBUG] Sending " << contents.size()
                << " conversation turns to Gemini" << std::endl;
    }

    // Only add responseMimeType when NOT using function calling
    // (Gemini does not support both at the same time).
    if (!function_calling_enabled_) {
      request_body["generationConfig"]["responseMimeType"] =
          "application/json";
    }

    // Add function calling tools if enabled.
    if (function_calling_enabled_) {
      try {
        std::string schemas_str = BuildFunctionCallSchemas();
        if (config_.verbose) {
          std::cerr << "[DEBUG] Function calling schemas: "
                    << schemas_str.substr(0, 200) << "..." << std::endl;
        }

        nlohmann::json schemas = nlohmann::json::parse(schemas_str);

        // Build the tools array - the schemas may be an array of tools or a
        // function_declarations object.
        if (schemas.is_array()) {
          // Already an array of tools; use it directly.
          request_body["tools"] = {{{"function_declarations", schemas}}};
        } else if (schemas.is_object() &&
                   schemas.contains("function_declarations")) {
          // A wrapper object with function_declarations.
          request_body["tools"] = {
              {{"function_declarations", schemas["function_declarations"]}}};
        } else {
          // Treat as a single tool object.
          request_body["tools"] = {
              {{"function_declarations", nlohmann::json::array({schemas})}}};
        }
      } catch (const nlohmann::json::exception& e) {
        std::cerr << "⚠️ Failed to parse function schemas: " << e.what()
                  << std::endl;
      }
    }

    // Write the request body to a temp file.
    // NOTE: the hard-coded /tmp path assumes a POSIX filesystem.
    std::string temp_file = "/tmp/gemini_request.json";
    std::ofstream out(temp_file);
    out << request_body.dump();
    out.close();

    // Use curl to make the request (avoiding httplib SSL issues).
    std::string endpoint =
        "https://generativelanguage.googleapis.com/v1beta/models/" +
        config_.model + ":generateContent";
    std::string curl_cmd = "curl -s -X POST '" + endpoint +
                           "' "
                           "-H 'Content-Type: application/json' "
                           "-H 'x-goog-api-key: " +
                           config_.api_key +
                           "' "
                           "-d @" +
                           temp_file + " 2>&1";

    if (config_.verbose) {
      std::cerr << "[DEBUG] Executing API request..." << std::endl;
    }

#ifdef _WIN32
    FILE* pipe = _popen(curl_cmd.c_str(), "r");
#else
    FILE* pipe = popen(curl_cmd.c_str(), "r");
#endif
    if (!pipe) {
      return absl::InternalError("Failed to execute curl command");
    }

    std::string response_str;
    char buffer[4096];
    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
      response_str += buffer;
    }

#ifdef _WIN32
    int status = _pclose(pipe);
#else
    int status = pclose(pipe);
#endif
    std::remove(temp_file.c_str());

    if (status != 0) {
      return absl::InternalError(
          absl::StrCat("Curl failed with status ", status));
    }

    if (response_str.empty()) {
      return absl::InternalError("Empty response from Gemini API");
    }

    // Debug: print the raw response.
    if (config_.verbose) {
      std::cout << "\n"
                << "\033[35m" << "🔍 Raw Gemini API Response:" << "\033[0m"
                << "\n"
                << "\033[2m" << response_str.substr(0, 500) << "\033[0m"
                << "\n\n";
    }

    if (config_.verbose) {
      std::cerr << "[DEBUG] Parsing response..." << std::endl;
    }
    auto parsed_or = ParseGeminiResponse(response_str);
    if (!parsed_or.ok()) {
      return parsed_or.status();
    }
    AgentResponse agent_response = std::move(parsed_or.value());
    agent_response.provider = "gemini";
    agent_response.model = config_.model;
    agent_response.latency_seconds =
        absl::ToDoubleSeconds(absl::Now() - request_start);
    agent_response.parameters["prompt_version"] = config_.prompt_version;
    agent_response.parameters["temperature"] =
        absl::StrFormat("%.2f", config_.temperature);
    agent_response.parameters["max_output_tokens"] =
        absl::StrFormat("%d", config_.max_output_tokens);
    agent_response.parameters["function_calling"] =
        function_calling_enabled_ ? "true" : "false";
    return agent_response;

  } catch (const std::exception& e) {
    if (config_.verbose) {
      std::cerr << "[ERROR] Exception: " << e.what() << std::endl;
    }
    return absl::InternalError(
        absl::StrCat("Exception during generation: ", e.what()));
  } catch (...) {
    if (config_.verbose) {
      std::cerr << "[ERROR] Unknown exception" << std::endl;
    }
    return absl::InternalError("Unknown exception during generation");
  }
#endif
}
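
// Usage sketch (illustrative only): building a short multi-turn history. The
// aggregate layout of agent::ChatMessage matches the single-prompt overload
// above; the message text is hypothetical.
//
//   std::vector<agent::ChatMessage> history = {
//       {agent::ChatMessage::Sender::kUser, "List sprites in room 0x12",
//        absl::Now()},
//       {agent::ChatMessage::Sender::kAgent, "Room 0x12 contains ...",
//        absl::Now()},
//       {agent::ChatMessage::Sender::kUser, "Describe the first one",
//        absl::Now()},
//   };
//   auto response_or = service.GenerateResponse(history);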

absl::StatusOr<AgentResponse> GeminiAIService::ParseGeminiResponse(
    const std::string& response_body) {
#ifndef YAZE_WITH_JSON
  return absl::UnimplementedError("JSON support required");
#else
  AgentResponse agent_response;

  auto response_json = nlohmann::json::parse(response_body, nullptr, false);
  if (response_json.is_discarded()) {
    return absl::InternalError("❌ Failed to parse Gemini response JSON");
  }

  // Navigate Gemini's response structure.
  if (!response_json.contains("candidates") ||
      response_json["candidates"].empty()) {
    return absl::InternalError("❌ No candidates in Gemini response");
  }

  for (const auto& candidate : response_json["candidates"]) {
    if (!candidate.contains("content") ||
        !candidate["content"].contains("parts")) {
      continue;
    }

    for (const auto& part : candidate["content"]["parts"]) {
      if (part.contains("text")) {
        std::string text_content = part["text"].get<std::string>();

        // Debug: print the raw LLM output when verbose mode is enabled.
        if (config_.verbose) {
          std::cout << "\n"
                    << "\033[35m" << "🔍 Raw LLM Response:" << "\033[0m"
                    << "\n"
                    << "\033[2m" << text_content << "\033[0m" << "\n\n";
        }

        // Strip markdown code blocks if present (```json ... ```).
        text_content = std::string(absl::StripAsciiWhitespace(text_content));
        if (absl::StartsWith(text_content, "```json")) {
          text_content = text_content.substr(7);  // Remove ```json
        } else if (absl::StartsWith(text_content, "```")) {
          text_content = text_content.substr(3);  // Remove ```
        }
        if (absl::EndsWith(text_content, "```")) {
          text_content = text_content.substr(0, text_content.length() - 3);
        }
        text_content = std::string(absl::StripAsciiWhitespace(text_content));

        // Try to parse as a JSON object.
        auto parsed_text = nlohmann::json::parse(text_content, nullptr, false);
        if (!parsed_text.is_discarded()) {
          // Extract text_response.
          if (parsed_text.contains("text_response") &&
              parsed_text["text_response"].is_string()) {
            agent_response.text_response =
                parsed_text["text_response"].get<std::string>();
          }

          // Extract reasoning.
          if (parsed_text.contains("reasoning") &&
              parsed_text["reasoning"].is_string()) {
            agent_response.reasoning =
                parsed_text["reasoning"].get<std::string>();
          }

          // Extract commands.
          if (parsed_text.contains("commands") &&
              parsed_text["commands"].is_array()) {
            for (const auto& cmd : parsed_text["commands"]) {
              if (cmd.is_string()) {
                std::string command = cmd.get<std::string>();
                if (absl::StartsWith(command, "z3ed ")) {
                  command = command.substr(5);
                }
                agent_response.commands.push_back(command);
              }
            }
          }

          // Extract tool_calls from the parsed JSON.
          if (parsed_text.contains("tool_calls") &&
              parsed_text["tool_calls"].is_array()) {
            for (const auto& call : parsed_text["tool_calls"]) {
              if (call.contains("tool_name") &&
                  call["tool_name"].is_string()) {
                ToolCall tool_call;
                tool_call.tool_name = call["tool_name"].get<std::string>();

                if (call.contains("args") && call["args"].is_object()) {
                  for (auto& [key, value] : call["args"].items()) {
                    if (value.is_string()) {
                      tool_call.args[key] = value.get<std::string>();
                    } else if (value.is_number()) {
                      tool_call.args[key] =
                          std::to_string(value.get<double>());
                    } else if (value.is_boolean()) {
                      tool_call.args[key] =
                          value.get<bool>() ? "true" : "false";
                    }
                  }
                }
                agent_response.tool_calls.push_back(tool_call);
              }
            }
          }
        } else {
          // If parsing the full object fails, fall back to extracting
          // commands from the text line by line.
          std::vector<std::string> lines = absl::StrSplit(text_content, '\n');
          for (const auto& line : lines) {
            std::string trimmed =
                std::string(absl::StripAsciiWhitespace(line));
            if (!trimmed.empty() && (absl::StartsWith(trimmed, "z3ed ") ||
                                     absl::StartsWith(trimmed, "palette ") ||
                                     absl::StartsWith(trimmed, "overworld ") ||
                                     absl::StartsWith(trimmed, "sprite ") ||
                                     absl::StartsWith(trimmed, "dungeon "))) {
              if (absl::StartsWith(trimmed, "z3ed ")) {
                trimmed = trimmed.substr(5);
              }
              agent_response.commands.push_back(trimmed);
            }
          }
        }
      } else if (part.contains("functionCall")) {
        const auto& call = part["functionCall"];
        if (call.contains("name") && call["name"].is_string()) {
          ToolCall tool_call;
          tool_call.tool_name = call["name"].get<std::string>();
          if (call.contains("args") && call["args"].is_object()) {
            for (auto& [key, value] : call["args"].items()) {
              if (value.is_string()) {
                tool_call.args[key] = value.get<std::string>();
              } else if (value.is_number()) {
                tool_call.args[key] = std::to_string(value.get<double>());
              }
            }
          }
          agent_response.tool_calls.push_back(tool_call);
        }
      }
    }
  }

  if (agent_response.text_response.empty() &&
      agent_response.commands.empty() && agent_response.tool_calls.empty()) {
    return absl::InternalError(
        "❌ No valid response extracted from Gemini\n"
        "   Expected at least one of: text_response, commands, or tool_calls\n"
        "   Raw response: " +
        response_body);
  }

  return agent_response;
#endif
}
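
// For reference, the structured payload this parser expects inside the
// model's text part looks like the following (values illustrative):
//
//   {
//     "text_response": "Changed palette 3 to blue.",
//     "reasoning": "The user asked for a palette edit.",
//     "commands": ["palette set 3 --color blue"],
//     "tool_calls": [
//       {"tool_name": "dungeon-describe-room", "args": {"room": "0x12"}}
//     ]
//   }
//
// When the text is not valid JSON, the fallback path scans it line by line
// for known command prefixes instead.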

absl::StatusOr<std::string> GeminiAIService::EncodeImageToBase64(
    const std::string& image_path) const {
#ifndef YAZE_WITH_JSON
  (void)image_path;  // Suppress unused parameter warning
  return absl::UnimplementedError(
      "Gemini AI service requires JSON support. Build with "
      "-DYAZE_WITH_JSON=ON");
#else
  std::ifstream file(image_path, std::ios::binary);
  if (!file.is_open()) {
    return absl::NotFoundError(
        absl::StrCat("Failed to open image file: ", image_path));
  }

  // Read the file into a buffer.
  file.seekg(0, std::ios::end);
  size_t size = file.tellg();
  file.seekg(0, std::ios::beg);

  std::vector<unsigned char> buffer(size);
  if (!file.read(reinterpret_cast<char*>(buffer.data()), size)) {
    return absl::InternalError("Failed to read image file");
  }

  // Base64-encode: every 3 input bytes map to 4 output characters.
  static const char* base64_chars =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

  std::string encoded;
  encoded.reserve(((size + 2) / 3) * 4);

  int i = 0;
  int j = 0;
  unsigned char char_array_3[3];
  unsigned char char_array_4[4];

  for (size_t idx = 0; idx < size; idx++) {
    char_array_3[i++] = buffer[idx];
    if (i == 3) {
      char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
      char_array_4[1] =
          ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
      char_array_4[2] =
          ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);
      char_array_4[3] = char_array_3[2] & 0x3f;

      for (i = 0; i < 4; i++) encoded += base64_chars[char_array_4[i]];
      i = 0;
    }
  }

  // Handle the final 1- or 2-byte remainder and pad with '='.
  if (i) {
    for (j = i; j < 3; j++) char_array_3[j] = '\0';

    char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
    char_array_4[1] =
        ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
    char_array_4[2] =
        ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);

    for (j = 0; j < i + 1; j++) encoded += base64_chars[char_array_4[j]];

    while (i++ < 3) encoded += '=';
  }

  return encoded;
#endif
}
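
// Worked example of the 3-byte -> 4-character mapping above:
//   "Man" = 0x4D 0x61 0x6E -> 010011 010110 000101 101110 -> "TWFu"
// Inputs whose length is not a multiple of 3 are padded with '=':
//   "Ma" -> "TWE=",  "M" -> "TQ=="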

absl::StatusOr<AgentResponse> GeminiAIService::GenerateMultimodalResponse(
    const std::string& image_path, const std::string& prompt) {
#ifndef YAZE_WITH_JSON
  (void)image_path;  // Suppress unused parameter warnings
  (void)prompt;
  return absl::UnimplementedError(
      "Gemini AI service requires JSON support. Build with "
      "-DYAZE_WITH_JSON=ON");
#else
  if (config_.api_key.empty()) {
    return absl::FailedPreconditionError("Gemini API key not configured");
  }

  // Determine the MIME type from the file extension.
  std::string mime_type = "image/png";
  if (image_path.ends_with(".jpg") || image_path.ends_with(".jpeg")) {
    mime_type = "image/jpeg";
  } else if (image_path.ends_with(".bmp")) {
    mime_type = "image/bmp";
  } else if (image_path.ends_with(".webp")) {
    mime_type = "image/webp";
  }

  // Encode the image to base64.
  auto encoded_or = EncodeImageToBase64(image_path);
  if (!encoded_or.ok()) {
    return encoded_or.status();
  }
  std::string encoded_image = std::move(encoded_or.value());

  try {
    if (config_.verbose) {
      std::cerr << "[DEBUG] Preparing multimodal request with image"
                << std::endl;
    }

    // Build a multimodal request with image and text parts.
    nlohmann::json request_body = {
        {"contents",
         {{{"parts",
            {{{"inline_data",
               {{"mime_type", mime_type}, {"data", encoded_image}}}},
             {{"text", prompt}}}}}}},
        {"generationConfig",
         {{"temperature", config_.temperature},
          {"maxOutputTokens", config_.max_output_tokens}}}};

    // Write the request body to a temp file.
    // NOTE: the hard-coded /tmp path assumes a POSIX filesystem.
    std::string temp_file = "/tmp/gemini_multimodal_request.json";
    std::ofstream out(temp_file);
    out << request_body.dump();
    out.close();

    // Use curl to make the request.
    std::string endpoint =
        "https://generativelanguage.googleapis.com/v1beta/models/" +
        config_.model + ":generateContent";
    std::string curl_cmd = "curl -s -X POST '" + endpoint +
                           "' "
                           "-H 'Content-Type: application/json' "
                           "-H 'x-goog-api-key: " +
                           config_.api_key +
                           "' "
                           "-d @" +
                           temp_file + " 2>&1";

    if (config_.verbose) {
      std::cerr << "[DEBUG] Executing multimodal API request..." << std::endl;
    }

#ifdef _WIN32
    FILE* pipe = _popen(curl_cmd.c_str(), "r");
#else
    FILE* pipe = popen(curl_cmd.c_str(), "r");
#endif
    if (!pipe) {
      return absl::InternalError("Failed to execute curl command");
    }

    std::string response_str;
    char buffer[4096];
    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
      response_str += buffer;
    }

#ifdef _WIN32
    int status = _pclose(pipe);
#else
    int status = pclose(pipe);
#endif
    std::remove(temp_file.c_str());

    if (status != 0) {
      return absl::InternalError(
          absl::StrCat("Curl failed with status ", status));
    }

    if (response_str.empty()) {
      return absl::InternalError("Empty response from Gemini API");
    }

    if (config_.verbose) {
      std::cout << "\n"
                << "\033[35m"
                << "🔍 Raw Gemini Multimodal Response:" << "\033[0m" << "\n"
                << "\033[2m" << response_str.substr(0, 500) << "\033[0m"
                << "\n\n";
    }

    return ParseGeminiResponse(response_str);

  } catch (const std::exception& e) {
    if (config_.verbose) {
      std::cerr << "[ERROR] Exception: " << e.what() << std::endl;
    }
    return absl::InternalError(
        absl::StrCat("Exception during multimodal generation: ", e.what()));
  }
#endif
}
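
// Usage sketch (illustrative only; the file name and prompt are
// hypothetical):
//
//   auto response_or = service.GenerateMultimodalResponse(
//       "screenshot.png", "Which dungeon room is shown in this screenshot?");
//   if (response_or.ok()) {
//     std::cout << response_or->text_response << "\n";
//   }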

}  // namespace cli
}  // namespace yaze