// gemini_ai_service.cc

#include <algorithm>
#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <mutex>
#include <sstream>
#include <string>
#include <vector>

#include "absl/strings/ascii.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_split.h"
#include "absl/strings/strip.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"

#include "util/platform_paths.h"

#ifdef YAZE_WITH_JSON
#include <filesystem>
#include <fstream>

#include "httplib.h"
#include "nlohmann/json.hpp"

// OpenSSL initialization for HTTPS support
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
#include <openssl/crypto.h>
#include <openssl/err.h>
#include <openssl/ssl.h>

// Global flag to track OpenSSL initialization
static std::atomic<bool> g_openssl_initialized{false};
static std::mutex g_openssl_init_mutex;

static void InitializeOpenSSL() {
  // Serialize initialization and run it at most once per process.
  std::lock_guard<std::mutex> lock(g_openssl_init_mutex);
  if (!g_openssl_initialized.exchange(true)) {
    OPENSSL_init_ssl(
        OPENSSL_INIT_LOAD_SSL_STRINGS | OPENSSL_INIT_LOAD_CRYPTO_STRINGS,
        nullptr);
    std::cerr << "✓ OpenSSL initialized for HTTPS support" << std::endl;
  }
}
#endif
#endif

namespace yaze {
namespace cli {

GeminiAIService::GeminiAIService(const GeminiConfig& config)
    : function_calling_enabled_(config.use_function_calling), config_(config) {
  if (config_.verbose) {
    std::cerr << "[DEBUG] Initializing Gemini service..." << std::endl;
    std::cerr << "[DEBUG] Function calling: "
              << (function_calling_enabled_ ? "enabled" : "disabled")
              << std::endl;
    std::cerr << "[DEBUG] Prompt version: " << config_.prompt_version
              << std::endl;
  }

#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
  // Initialize OpenSSL for HTTPS support
  InitializeOpenSSL();
  if (config_.verbose) {
    std::cerr << "[DEBUG] OpenSSL initialized for HTTPS" << std::endl;
  }
#endif

  // Load command documentation into the prompt builder, using the catalogue
  // that matches the configured prompt version.
  std::string catalogue_path = config_.prompt_version == "v2"
                                   ? "assets/agent/prompt_catalogue_v2.yaml"
                                   : "assets/agent/prompt_catalogue.yaml";
  if (auto status = prompt_builder_.LoadResourceCatalogue(catalogue_path);
      !status.ok()) {
    std::cerr << "⚠️ Failed to load agent prompt catalogue: "
              << status.message() << std::endl;
  }

  if (config_.verbose) {
    std::cerr << "[DEBUG] Loaded prompt catalogue" << std::endl;
  }

  if (config_.system_instruction.empty()) {
    if (config_.verbose) {
      std::cerr << "[DEBUG] Building system instruction..." << std::endl;
    }

    // Try to load a version-specific system prompt file via FindAsset.
    std::string prompt_file;
    if (config_.prompt_version == "v3") {
      prompt_file = "agent/system_prompt_v3.txt";
    } else if (config_.prompt_version == "v2") {
      prompt_file = "agent/system_prompt_v2.txt";
    } else {
      prompt_file = "agent/system_prompt.txt";
    }

    auto prompt_path = util::PlatformPaths::FindAsset(prompt_file);
    bool loaded = false;

    if (prompt_path.ok()) {
      std::ifstream file(prompt_path->string());
      if (file.good()) {
        std::stringstream buffer;
        buffer << file.rdbuf();
        config_.system_instruction = buffer.str();
        if (config_.verbose) {
          std::cerr << "[DEBUG] Loaded prompt: " << prompt_path->string()
                    << std::endl;
        }
        loaded = true;
      }
    }

    if (!loaded) {
      // Fall back to the prompt builder.
      if (config_.use_enhanced_prompting) {
        config_.system_instruction =
            prompt_builder_.BuildSystemInstructionWithExamples();
      } else {
        config_.system_instruction = BuildSystemInstruction();
      }
    }
  }

  if (config_.verbose) {
    std::cerr << "[DEBUG] Gemini service initialized" << std::endl;
  }
}

void GeminiAIService::EnableFunctionCalling(bool enable) {
  function_calling_enabled_ = enable;
}

std::vector<std::string> GeminiAIService::GetAvailableTools() const {
  return {"resource-list",        "resource-search",
          "dungeon-list-sprites", "dungeon-describe-room",
          "overworld-find-tile",  "overworld-describe-map",
          "overworld-list-warps"};
}

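// Example (hypothetical caller, not part of this file): a minimal sketch of
// how the service is typically driven. The GeminiConfig field names are taken
// from their uses in this file; the remaining fields are assumed to have
// sensible defaults.
//
//   GeminiConfig config;
//   if (const char* key = std::getenv("GEMINI_API_KEY")) config.api_key = key;
//   config.model = "gemini-2.5-flash";
//   GeminiAIService service(config);
//   auto response = service.GenerateResponse("List the sprites in room 0x12");
//   if (response.ok()) {
//     // Inspect response->text_response, response->commands, and
//     // response->tool_calls.
//   }
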
std::string GeminiAIService::BuildFunctionCallSchemas() {
#ifndef YAZE_WITH_JSON
  return "[]";  // Empty schema list when JSON support is unavailable.
#else
  // Use the prompt builder's schema generation, which reads from
  // prompt_catalogue.yaml.
  std::string schemas = prompt_builder_.BuildFunctionCallSchemas();
  if (!schemas.empty() && schemas != "[]") {
    return schemas;
  }

  // Fallback: search for function_schemas.json using FindAsset.
  auto schema_path_or =
      util::PlatformPaths::FindAsset("agent/function_schemas.json");

  if (!schema_path_or.ok()) {
    if (config_.verbose) {
      std::cerr << "⚠️ Function schemas file not found: "
                << schema_path_or.status().message() << std::endl;
    }
    return "[]";  // Return an empty array as a fallback.
  }

  // Load and parse the JSON file.
  std::ifstream file(schema_path_or->string());
  if (!file.is_open()) {
    std::cerr << "⚠️ Failed to open function schemas file: "
              << schema_path_or->string() << std::endl;
    return "[]";
  }

  try {
    nlohmann::json schemas_json;
    file >> schemas_json;
    return schemas_json.dump();
  } catch (const nlohmann::json::exception& e) {
    std::cerr << "⚠️ Failed to parse function schemas JSON: " << e.what()
              << std::endl;
    return "[]";
  }
#endif
}

std::string GeminiAIService::BuildSystemInstruction() {
  // Fallback prompt when enhanced prompting is disabled; uses PromptBuilder's
  // basic system instruction.
  return prompt_builder_.BuildSystemInstruction();
}

void GeminiAIService::SetRomContext(Rom* rom) {
  prompt_builder_.SetRom(rom);
}

absl::StatusOr<std::vector<ModelInfo>> GeminiAIService::ListAvailableModels() {
#ifndef YAZE_WITH_JSON
  return absl::UnimplementedError("Gemini AI service requires JSON support");
#else
  if (config_.api_key.empty()) {
    // Return default known models if the API key is missing.
    std::vector<ModelInfo> defaults = {
        {.name = "gemini-3.0-preview",
         .display_name = "Gemini 3.0 Preview",
         .provider = "gemini",
         .description = "Cutting-edge model, currently in preview"},
        {.name = "gemini-3.0-flash-preview",
         .display_name = "Gemini 3.0 Flash Preview",
         .provider = "gemini",
         .description = "Fastest preview model"},
        {.name = "gemini-2.5-pro",
         .display_name = "Gemini 2.5 Pro",
         .provider = "gemini",
         .description = "High intelligence for complex tasks"},
        {.name = "gemini-2.5-flash",
         .display_name = "Gemini 2.5 Flash",
         .provider = "gemini",
         .description = "Fastest multimodal model"}};
    return defaults;
  }

  try {
    // Use curl to list models from the API. NOTE: the API key is embedded in
    // the command line, so it is briefly visible in the process list.
    std::string endpoint =
        "https://generativelanguage.googleapis.com/v1beta/models?key=" +
        config_.api_key;
    std::string curl_cmd = "curl -s -X GET '" + endpoint + "' 2>&1";

    if (config_.verbose) {
      std::cerr << "[DEBUG] Listing models: "
                << curl_cmd.substr(0, curl_cmd.find("key=")) << "...'"
                << std::endl;
    }

#ifdef _WIN32
    FILE* pipe = _popen(curl_cmd.c_str(), "r");
#else
    FILE* pipe = popen(curl_cmd.c_str(), "r");
#endif
    if (!pipe) {
      return absl::InternalError("Failed to execute curl command");
    }

    std::string response_str;
    char buffer[4096];
    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
      response_str += buffer;
    }

#ifdef _WIN32
    _pclose(pipe);
#else
    pclose(pipe);
#endif

    auto models_json = nlohmann::json::parse(response_str, nullptr, false);
    if (models_json.is_discarded()) {
      return absl::InternalError("Failed to parse Gemini models JSON");
    }

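    // Expected (abridged) shape of a successful response, for reference when
    // reading the parsing below:
    //   {"models": [{"name": "models/gemini-2.5-flash",
    //                "displayName": "Gemini 2.5 Flash",
    //                "description": "..."}, ...]}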
    if (!models_json.contains("models")) {
      // Return defaults on error.
      std::vector<ModelInfo> defaults = {{.name = "gemini-2.5-flash",
                                          .display_name = "Gemini 2.5 Flash",
                                          .provider = "gemini"},
                                         {.name = "gemini-1.5-flash",
                                          .display_name = "Gemini 1.5 Flash",
                                          .provider = "gemini"},
                                         {.name = "gemini-1.5-pro",
                                          .display_name = "Gemini 1.5 Pro",
                                          .provider = "gemini"}};
      return defaults;
    }

    std::vector<ModelInfo> models;
    for (const auto& m : models_json["models"]) {
      std::string name = m.value("name", "");
      // The name arrives as "models/gemini-pro"; strip the prefix.
      if (absl::StartsWith(name, "models/")) {
        name = name.substr(7);
      }

      // Keep only Gemini models.
      if (absl::StartsWith(name, "gemini")) {
        ModelInfo info;
        info.name = name;
        info.display_name = m.value("displayName", name);
        info.provider = "gemini";
        info.description = m.value("description", "");
        info.family = "gemini";
        info.is_local = false;
        models.push_back(std::move(info));
      }
    }
    return models;

  } catch (const std::exception& e) {
    return absl::InternalError(
        absl::StrCat("Failed to list models: ", e.what()));
  }
#endif
}

absl::Status GeminiAIService::CheckAvailability() {
#ifndef YAZE_WITH_JSON
  return absl::UnimplementedError(
      "Gemini AI service requires JSON support. Build with "
      "-DYAZE_WITH_JSON=ON");
#else
  try {
    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: start" << std::endl;
    }

    if (config_.api_key.empty()) {
      return absl::FailedPreconditionError(
          "❌ Gemini API key not configured\n"
          "   Set GEMINI_API_KEY environment variable\n"
          "   Get your API key at: https://makersuite.google.com/app/apikey");
    }

    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: creating HTTPS client"
                << std::endl;
    }
    // Test API connectivity with a simple request.
    httplib::Client cli("https://generativelanguage.googleapis.com");
    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: client created" << std::endl;
    }

    cli.set_connection_timeout(5, 0);  // 5-second timeout

    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: building endpoint" << std::endl;
    }
    std::string test_endpoint = "/v1beta/models/" + config_.model;
    httplib::Headers headers = {
        {"x-goog-api-key", config_.api_key},
    };

    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: making request to "
                << test_endpoint << std::endl;
    }
    auto res = cli.Get(test_endpoint.c_str(), headers);

    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: got response" << std::endl;
    }

    if (!res) {
      return absl::UnavailableError(
          "❌ Cannot reach Gemini API\n"
          "   Check your internet connection");
    }

    if (res->status == 401 || res->status == 403) {
      return absl::PermissionDeniedError(
          "❌ Invalid Gemini API key\n"
          "   Verify your key at: https://makersuite.google.com/app/apikey");
    }

    if (res->status == 404) {
      return absl::NotFoundError(
          absl::StrCat("❌ Model '", config_.model, "' not found\n",
                       "   Try: gemini-2.5-flash or gemini-1.5-pro"));
    }

    if (res->status != 200) {
      return absl::InternalError(absl::StrCat(
          "❌ Gemini API error: ", res->status, "\n   ", res->body));
    }

    return absl::OkStatus();
  } catch (const std::exception& e) {
    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: EXCEPTION: " << e.what()
                << std::endl;
    }
    return absl::InternalError(
        absl::StrCat("Exception during availability check: ", e.what()));
  } catch (...) {
    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: UNKNOWN EXCEPTION" << std::endl;
    }
    return absl::InternalError("Unknown exception during availability check");
  }
#endif
}

absl::StatusOr<AgentResponse> GeminiAIService::GenerateResponse(
    const std::string& prompt) {
  return GenerateResponse(
      {{{agent::ChatMessage::Sender::kUser, prompt, absl::Now()}}});
}

absl::StatusOr<AgentResponse> GeminiAIService::GenerateResponse(
    const std::vector<agent::ChatMessage>& history) {
#ifndef YAZE_WITH_JSON
  return absl::UnimplementedError(
      "Gemini AI service requires JSON support. Build with "
      "-DYAZE_WITH_JSON=ON");
#else
  if (history.empty()) {
    return absl::InvalidArgumentError("History cannot be empty.");
  }

  // Build a structured conversation history for better context; Gemini
  // supports multi-turn conversations via the contents array.
  std::string prompt = prompt_builder_.BuildPromptFromHistory(history);

  // Skip the availability check - it causes a segfault with the current SSL
  // setup. TODO: Fix SSL/TLS initialization issue.
  // if (auto status = CheckAvailability(); !status.ok()) {
  //   return status;
  // }

  if (config_.api_key.empty()) {
    return absl::FailedPreconditionError("Gemini API key not configured");
  }

  absl::Time request_start = absl::Now();

  try {
    if (config_.verbose) {
      std::cerr << "[DEBUG] Using curl for HTTPS request" << std::endl;
      std::cerr << "[DEBUG] Processing " << history.size()
                << " messages in history" << std::endl;
    }

    // Build conversation history for multi-turn context; Gemini supports
    // alternating user/model messages for better context.
    nlohmann::json contents = nlohmann::json::array();

    // Add conversation history (up to the last 10 messages, to bound the
    // context window).
    int start_idx = std::max(0, static_cast<int>(history.size()) - 10);
    for (size_t i = static_cast<size_t>(start_idx); i < history.size(); ++i) {
      const auto& msg = history[i];
      std::string role =
          (msg.sender == agent::ChatMessage::Sender::kUser) ? "user" : "model";

      nlohmann::json message = {{"role", role},
                                {"parts", {{{"text", msg.message}}}}};
      contents.push_back(message);
    }

    // If the last message is from the model, we need to ensure the
    // conversation ends with a user message for Gemini.
    if (!history.empty() &&
        history.back().sender == agent::ChatMessage::Sender::kAgent) {
      // Add a continuation prompt.
      nlohmann::json user_continuation = {
          {"role", "user"},
          {"parts",
           {{{"text", "Please continue or clarify your response."}}}}};
      contents.push_back(user_continuation);
    }

    // Build the request in the Gemini API v1beta format.
    nlohmann::json request_body = {
        {"system_instruction",
         {{"parts", {{"text", config_.system_instruction}}}}},
        {"contents", contents},
        {"generationConfig",
         {{"temperature", config_.temperature},
          {"maxOutputTokens", config_.max_output_tokens}}}};

    if (config_.verbose) {
      std::cerr << "[DEBUG] Sending " << contents.size()
                << " conversation turns to Gemini" << std::endl;
    }

    // Only add responseMimeType if NOT using function calling
    // (Gemini doesn't support both at the same time).
    if (!function_calling_enabled_) {
      request_body["generationConfig"]["responseMimeType"] =
          "application/json";
    }

    // Add function calling tools if enabled.
    if (function_calling_enabled_) {
      try {
        std::string schemas_str = BuildFunctionCallSchemas();
        if (config_.verbose) {
          std::cerr << "[DEBUG] Function calling schemas: "
                    << schemas_str.substr(0, 200) << "..." << std::endl;
        }

        nlohmann::json schemas = nlohmann::json::parse(schemas_str);

        // Build the tools array - schemas might be an array of tools or a
        // function_declarations object.
        if (schemas.is_array()) {
          // If it's already an array of tools, use it directly.
          request_body["tools"] = {{{"function_declarations", schemas}}};
        } else if (schemas.is_object() &&
                   schemas.contains("function_declarations")) {
          // If it's a wrapper object with function_declarations.
          request_body["tools"] = {
              {{"function_declarations", schemas["function_declarations"]}}};
        } else {
          // Treat it as a single tool object.
          request_body["tools"] = {
              {{"function_declarations", nlohmann::json::array({schemas})}}};
        }
      } catch (const nlohmann::json::exception& e) {
        std::cerr << "⚠️ Failed to parse function schemas: " << e.what()
                  << std::endl;
      }
    }
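
    // In every branch above, the result has the shape the Gemini API expects
    // for function calling:
    //   "tools": [{"function_declarations":
    //              [{"name": ..., "description": ..., "parameters": {...}},
    //               ...]}]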

    // Write the request body to a temp file. NOTE: this path is
    // POSIX-specific and fixed, so concurrent invocations would overwrite
    // each other's requests.
    std::string temp_file = "/tmp/gemini_request.json";
    std::ofstream out(temp_file);
    out << request_body.dump();
    out.close();

    // Use curl to make the request (avoiding httplib SSL issues). NOTE: the
    // API key is passed on the command line, so it is briefly visible in the
    // process list.
    std::string endpoint =
        "https://generativelanguage.googleapis.com/v1beta/models/" +
        config_.model + ":generateContent";
    std::string curl_cmd = "curl -s -X POST '" + endpoint +
                           "' "
                           "-H 'Content-Type: application/json' "
                           "-H 'x-goog-api-key: " +
                           config_.api_key +
                           "' "
                           "-d @" +
                           temp_file + " 2>&1";
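
    // The effective invocation (key redacted) looks like:
    //   curl -s -X POST '.../v1beta/models/<model>:generateContent' \
    //     -H 'Content-Type: application/json' -H 'x-goog-api-key: <key>' \
    //     -d @/tmp/gemini_request.json 2>&1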

    if (config_.verbose) {
      std::cerr << "[DEBUG] Executing API request..." << std::endl;
    }

#ifdef _WIN32
    FILE* pipe = _popen(curl_cmd.c_str(), "r");
#else
    FILE* pipe = popen(curl_cmd.c_str(), "r");
#endif
    if (!pipe) {
      return absl::InternalError("Failed to execute curl command");
    }

    std::string response_str;
    char buffer[4096];
    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
      response_str += buffer;
    }

#ifdef _WIN32
    int status = _pclose(pipe);
#else
    int status = pclose(pipe);
#endif
    std::remove(temp_file.c_str());

    // Note: without --fail, curl exits 0 even on HTTP-level errors, so API
    // errors surface later, when the JSON body is parsed.
    if (status != 0) {
      return absl::InternalError(
          absl::StrCat("Curl failed with status ", status));
    }

    if (response_str.empty()) {
      return absl::InternalError("Empty response from Gemini API");
    }

    // Debug: print the start of the raw response.
    if (config_.verbose) {
      std::cout << "\n"
                << "\033[35m"
                << "🔍 Raw Gemini API Response:"
                << "\033[0m"
                << "\n"
                << "\033[2m" << response_str.substr(0, 500) << "\033[0m"
                << "\n\n";
    }

    if (config_.verbose) {
      std::cerr << "[DEBUG] Parsing response..." << std::endl;
    }
    auto parsed_or = ParseGeminiResponse(response_str);
    if (!parsed_or.ok()) {
      return parsed_or.status();
    }
    AgentResponse agent_response = std::move(parsed_or.value());
    agent_response.provider = "gemini";
    agent_response.model = config_.model;
    agent_response.latency_seconds =
        absl::ToDoubleSeconds(absl::Now() - request_start);
    agent_response.parameters["prompt_version"] = config_.prompt_version;
    agent_response.parameters["temperature"] =
        absl::StrFormat("%.2f", config_.temperature);
    agent_response.parameters["max_output_tokens"] =
        absl::StrFormat("%d", config_.max_output_tokens);
    agent_response.parameters["function_calling"] =
        function_calling_enabled_ ? "true" : "false";
    return agent_response;

  } catch (const std::exception& e) {
    if (config_.verbose) {
      std::cerr << "[ERROR] Exception: " << e.what() << std::endl;
    }
    return absl::InternalError(
        absl::StrCat("Exception during generation: ", e.what()));
  } catch (...) {
    if (config_.verbose) {
      std::cerr << "[ERROR] Unknown exception" << std::endl;
    }
    return absl::InternalError("Unknown exception during generation");
  }
#endif
}

absl::StatusOr<AgentResponse> GeminiAIService::ParseGeminiResponse(
    const std::string& response_body) {
#ifndef YAZE_WITH_JSON
  return absl::UnimplementedError("JSON support required");
#else
  AgentResponse agent_response;

  auto response_json = nlohmann::json::parse(response_body, nullptr, false);
  if (response_json.is_discarded()) {
    return absl::InternalError("❌ Failed to parse Gemini response JSON");
  }

  // Navigate Gemini's response structure.
  if (!response_json.contains("candidates") ||
      response_json["candidates"].empty()) {
    return absl::InternalError("❌ No candidates in Gemini response");
  }

  for (const auto& candidate : response_json["candidates"]) {
    if (!candidate.contains("content") ||
        !candidate["content"].contains("parts")) {
      continue;
    }

    for (const auto& part : candidate["content"]["parts"]) {
      if (part.contains("text")) {
        std::string text_content = part["text"].get<std::string>();

        // Debug: print the raw LLM output when verbose mode is enabled.
        if (config_.verbose) {
          std::cout << "\n"
                    << "\033[35m"
                    << "🔍 Raw LLM Response:"
                    << "\033[0m"
                    << "\n"
                    << "\033[2m" << text_content << "\033[0m"
                    << "\n\n";
        }

        // Strip markdown code fences if present (```json ... ```).
        text_content = std::string(absl::StripAsciiWhitespace(text_content));
        if (absl::StartsWith(text_content, "```json")) {
          text_content = text_content.substr(7);  // Remove ```json
        } else if (absl::StartsWith(text_content, "```")) {
          text_content = text_content.substr(3);  // Remove ```
        }
        if (absl::EndsWith(text_content, "```")) {
          text_content = text_content.substr(0, text_content.length() - 3);
        }
        text_content = std::string(absl::StripAsciiWhitespace(text_content));

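        // The branch below expects the model to emit a JSON object of roughly
        // this shape (any field may be absent):
        //   {"text_response": "...", "reasoning": "...",
        //    "commands": ["palette export ..."],
        //    "tool_calls": [{"tool_name": "resource-list", "args": {...}}]}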
        // Try to parse the text as a structured JSON object.
        auto parsed_text = nlohmann::json::parse(text_content, nullptr, false);
        if (!parsed_text.is_discarded()) {
          // Extract text_response.
          if (parsed_text.contains("text_response") &&
              parsed_text["text_response"].is_string()) {
            agent_response.text_response =
                parsed_text["text_response"].get<std::string>();
          }

          // Extract reasoning.
          if (parsed_text.contains("reasoning") &&
              parsed_text["reasoning"].is_string()) {
            agent_response.reasoning =
                parsed_text["reasoning"].get<std::string>();
          }

          // Extract commands.
          if (parsed_text.contains("commands") &&
              parsed_text["commands"].is_array()) {
            for (const auto& cmd : parsed_text["commands"]) {
              if (cmd.is_string()) {
                std::string command = cmd.get<std::string>();
                if (absl::StartsWith(command, "z3ed ")) {
                  command = command.substr(5);
                }
                agent_response.commands.push_back(command);
              }
            }
          }

          // Extract tool_calls from the parsed JSON.
          if (parsed_text.contains("tool_calls") &&
              parsed_text["tool_calls"].is_array()) {
            for (const auto& call : parsed_text["tool_calls"]) {
              if (call.contains("tool_name") &&
                  call["tool_name"].is_string()) {
                ToolCall tool_call;
                tool_call.tool_name = call["tool_name"].get<std::string>();

                if (call.contains("args") && call["args"].is_object()) {
                  for (auto& [key, value] : call["args"].items()) {
                    if (value.is_string()) {
                      tool_call.args[key] = value.get<std::string>();
                    } else if (value.is_number_integer()) {
                      // Avoid formatting integral args as "5.000000".
                      tool_call.args[key] =
                          std::to_string(value.get<long long>());
                    } else if (value.is_number()) {
                      tool_call.args[key] =
                          std::to_string(value.get<double>());
                    } else if (value.is_boolean()) {
                      tool_call.args[key] =
                          value.get<bool>() ? "true" : "false";
                    }
                  }
                }
                agent_response.tool_calls.push_back(tool_call);
              }
            }
          }
        } else {
          // If parsing the full object fails, fall back to extracting
          // commands from the raw text.
          std::vector<std::string> lines = absl::StrSplit(text_content, '\n');
          for (const auto& line : lines) {
            std::string trimmed =
                std::string(absl::StripAsciiWhitespace(line));
            if (!trimmed.empty() && (absl::StartsWith(trimmed, "z3ed ") ||
                                     absl::StartsWith(trimmed, "palette ") ||
                                     absl::StartsWith(trimmed, "overworld ") ||
                                     absl::StartsWith(trimmed, "sprite ") ||
                                     absl::StartsWith(trimmed, "dungeon "))) {
              if (absl::StartsWith(trimmed, "z3ed ")) {
                trimmed = trimmed.substr(5);
              }
              agent_response.commands.push_back(trimmed);
            }
          }
        }
      } else if (part.contains("functionCall")) {
        // Native Gemini function-calling response part.
        const auto& call = part["functionCall"];
        if (call.contains("name") && call["name"].is_string()) {
          ToolCall tool_call;
          tool_call.tool_name = call["name"].get<std::string>();
          if (call.contains("args") && call["args"].is_object()) {
            for (auto& [key, value] : call["args"].items()) {
              if (value.is_string()) {
                tool_call.args[key] = value.get<std::string>();
              } else if (value.is_number_integer()) {
                tool_call.args[key] = std::to_string(value.get<long long>());
              } else if (value.is_number()) {
                tool_call.args[key] = std::to_string(value.get<double>());
              }
            }
          }
          agent_response.tool_calls.push_back(tool_call);
        }
      }
    }
  }

  if (agent_response.text_response.empty() &&
      agent_response.commands.empty() && agent_response.tool_calls.empty()) {
    return absl::InternalError(
        "❌ No valid response extracted from Gemini\n"
        "   Expected at least one of: text_response, commands, or tool_calls\n"
        "   Raw response: " +
        response_body);
  }

  return agent_response;
#endif
}

absl::StatusOr<std::string> GeminiAIService::EncodeImageToBase64(
    const std::string& image_path) const {
#ifndef YAZE_WITH_JSON
  (void)image_path;  // Suppress unused-parameter warning.
  return absl::UnimplementedError(
      "Gemini AI service requires JSON support. Build with "
      "-DYAZE_WITH_JSON=ON");
#else
  std::ifstream file(image_path, std::ios::binary);
  if (!file.is_open()) {
    return absl::NotFoundError(
        absl::StrCat("Failed to open image file: ", image_path));
  }

  // Read the file into a buffer.
  file.seekg(0, std::ios::end);
  size_t size = file.tellg();
  file.seekg(0, std::ios::beg);

  std::vector<unsigned char> buffer(size);
  if (!file.read(reinterpret_cast<char*>(buffer.data()), size)) {
    return absl::InternalError("Failed to read image file");
  }

  // Base64 encode.
  static const char* base64_chars =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

  std::string encoded;
  encoded.reserve(((size + 2) / 3) * 4);
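
  // Standard base64: each 3-byte input group maps to 4 output characters;
  // e.g. the bytes of "Man" (0x4D 0x61 0x6E) encode to "TWFu". A trailing
  // partial group is padded with '='.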

  int i = 0;
  int j = 0;
  unsigned char char_array_3[3];
  unsigned char char_array_4[4];

  for (size_t idx = 0; idx < size; idx++) {
    char_array_3[i++] = buffer[idx];
    if (i == 3) {
      char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
      char_array_4[1] =
          ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
      char_array_4[2] =
          ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);
      char_array_4[3] = char_array_3[2] & 0x3f;

      for (i = 0; i < 4; i++)
        encoded += base64_chars[char_array_4[i]];
      i = 0;
    }
  }

  if (i) {
    for (j = i; j < 3; j++)
      char_array_3[j] = '\0';

    char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
    char_array_4[1] =
        ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
    char_array_4[2] =
        ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);

    for (j = 0; j < i + 1; j++)
      encoded += base64_chars[char_array_4[j]];

    while (i++ < 3)
      encoded += '=';
  }

  return encoded;
#endif
}

absl::StatusOr<AgentResponse> GeminiAIService::GenerateMultimodalResponse(
    const std::string& image_path, const std::string& prompt) {
#ifndef YAZE_WITH_JSON
  (void)image_path;  // Suppress unused-parameter warnings.
  (void)prompt;
  return absl::UnimplementedError(
      "Gemini AI service requires JSON support. Build with "
      "-DYAZE_WITH_JSON=ON");
#else
  if (config_.api_key.empty()) {
    return absl::FailedPreconditionError("Gemini API key not configured");
  }

  // Determine the MIME type from the file extension.
  std::string mime_type = "image/png";
  if (image_path.ends_with(".jpg") || image_path.ends_with(".jpeg")) {
    mime_type = "image/jpeg";
  } else if (image_path.ends_with(".bmp")) {
    mime_type = "image/bmp";
  } else if (image_path.ends_with(".webp")) {
    mime_type = "image/webp";
  }

  // Encode the image to base64.
  auto encoded_or = EncodeImageToBase64(image_path);
  if (!encoded_or.ok()) {
    return encoded_or.status();
  }
  std::string encoded_image = std::move(encoded_or.value());

  try {
    if (config_.verbose) {
      std::cerr << "[DEBUG] Preparing multimodal request with image"
                << std::endl;
    }

    // Build a multimodal request with the image and text.
    nlohmann::json request_body = {
        {"contents",
         {{{"parts",
            {{{"inline_data",
               {{"mime_type", mime_type}, {"data", encoded_image}}}},
             {{"text", prompt}}}}}}},
        {"generationConfig",
         {{"temperature", config_.temperature},
          {"maxOutputTokens", config_.max_output_tokens}}}};

    // Write the request body to a temp file (POSIX path; see the note in
    // GenerateResponse about portability and concurrent use).
    std::string temp_file = "/tmp/gemini_multimodal_request.json";
    std::ofstream out(temp_file);
    out << request_body.dump();
    out.close();

    // Use curl to make the request.
    std::string endpoint =
        "https://generativelanguage.googleapis.com/v1beta/models/" +
        config_.model + ":generateContent";
    std::string curl_cmd = "curl -s -X POST '" + endpoint +
                           "' "
                           "-H 'Content-Type: application/json' "
                           "-H 'x-goog-api-key: " +
                           config_.api_key +
                           "' "
                           "-d @" +
                           temp_file + " 2>&1";

    if (config_.verbose) {
      std::cerr << "[DEBUG] Executing multimodal API request..." << std::endl;
    }

#ifdef _WIN32
    FILE* pipe = _popen(curl_cmd.c_str(), "r");
#else
    FILE* pipe = popen(curl_cmd.c_str(), "r");
#endif
    if (!pipe) {
      return absl::InternalError("Failed to execute curl command");
    }

    std::string response_str;
    char buffer[4096];
    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
      response_str += buffer;
    }

#ifdef _WIN32
    int status = _pclose(pipe);
#else
    int status = pclose(pipe);
#endif
    std::remove(temp_file.c_str());

    if (status != 0) {
      return absl::InternalError(
          absl::StrCat("Curl failed with status ", status));
    }

    if (response_str.empty()) {
      return absl::InternalError("Empty response from Gemini API");
    }

    if (config_.verbose) {
      std::cout << "\n"
                << "\033[35m"
                << "🔍 Raw Gemini Multimodal Response:"
                << "\033[0m"
                << "\n"
                << "\033[2m" << response_str.substr(0, 500) << "\033[0m"
                << "\n\n";
    }

    return ParseGeminiResponse(response_str);

  } catch (const std::exception& e) {
    if (config_.verbose) {
      std::cerr << "[ERROR] Exception: " << e.what() << std::endl;
    }
    return absl::InternalError(
        absl::StrCat("Exception during multimodal generation: ", e.what()));
  }
#endif
}

}  // namespace cli
}  // namespace yaze