// (C)artificialintelligence.dk - examples/diffusion/diffusion-http.cpp // // Minimal OpenAI-compatible HTTP server for DiffusionGemma / llama.cpp diffusion models. // // Behavior: // - GGUF model is loaded once at server startup. // - Every HTTP request creates a fresh llama_context. // - Therefore requests do not share KV/cache/history. // - This avoids model reload while keeping questions separated. // // Audit logging: // - JSONL audit records are written to stderr by default. // - Optional file logging: --audit-log C:\path\diffusion-http-audit.jsonl // - Disable audit: --no-audit // - Raw model output logging is disabled by default; enable with --log-raw. // // Request controls added in this version: // - stream=true is supported with OpenAI-compatible SSE framing. // Diffusion generation itself is still batch/canvas-based, so chunks are sent after // generation completes, split by --stream-chunk-chars / stream_chunk_chars. // - max_tokens controls returned/generated tokens. // - processing_tokens / allocated_tokens / n_ctx / n_batch / n_ubatch control // context and processing allocation per request. // // Endpoints: // GET /health // GET /v1/models // POST /v1/chat/completions // POST /completion #include "llama.h" #include "common.h" #include "chat.h" #include "diffusion.h" #include "ggml-backend.h" #include "httplib.h" #include "json.hpp" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if !defined(_WIN32) #include #endif #if defined(_WIN32) #define NOMINMAX #include #include #else #include #endif struct ServerConfig { std::string model_path; std::string host = "127.0.0.1"; int port = 8081; int n_gpu_layers = 99; int n_threads = 20; int default_max_tokens = 256; int diffusion_steps = 128; float temperature = 0.8f; float top_p = 0.95f; int top_k = 40; int seed = 1234; bool use_chat_template = true; bool expose_raw = false; bool audit_enabled = true; bool log_raw_output = false; std::string audit_log_path; // Audit verbosity: // summary = compact JSONL records suitable for normal operations. // full = previous detailed records with full nested memory snapshots. bool audit_summary = true; bool audit_memory_pressure = true; bool streaming_enabled = true; int stream_chunk_chars = 96; // Auto allocation defaults. For DiffusionGemma canvas models the historical // llama-diffusion-cli behavior is: // n_ctx = blocks * canvas_length + 2048 // The value below controls that extra 2048 headroom unless overridden. int ctx_headroom_tokens = 2048; // If non-zero, these become server-wide defaults for request allocation. // They can still be overridden per request with JSON fields. int default_processing_tokens = 0; int default_n_ctx = 0; int default_n_batch = 0; int default_n_ubatch = 0; // Concurrency + memory queue controls. // // Memory accounting is conservative/estimated: llama.cpp backends do not expose // exact per-context GPU allocations through this wrapper. The defaults are chosen // to be safe for DiffusionGemma-style large canvas requests and can be tuned: // // estimated_request_bytes = // request_base_memory_bytes + max(n_ctx, n_batch, n_ubatch) * bytes_per_token // // The scheduler compares: // model_memory_bytes + active_request_bytes + new_request_bytes + safety_margin // // against detected GPU physical/budget memory by default. --memory-limit-mb // can still override the detected budget. Dynamic GPU budget/usage checks are used when available: DXGI on Windows, // Linux DRM sysfs or nvidia-smi on Ubuntu/Linux; otherwise the code falls back to OS RAM. // If admission would exceed the limit, the request waits in a queue until // other requests complete. int max_concurrent_requests = 0; // 0 = unlimited except memory budget uint64_t memory_limit_bytes = 0; // 0 = use OS physical availability only uint64_t memory_safety_margin_bytes = 512ull * 1024ull * 1024ull; uint64_t model_memory_bytes_override = 0; uint64_t bytes_per_token = 1024ull * 1024ull; // ~1 MiB/token, conservative for DiffusionGemma SYCL uint64_t request_base_memory_bytes = 512ull * 1024ull * 1024ull; uint64_t queue_timeout_ms = 0; // 0 = wait indefinitely // Queue hardening. FIFO prevents small later requests from jumping ahead of // large earlier requests. max_queue_* protects the HTTP server from unlimited // blocked handler threads when clients flood it. 0 = unlimited for that limit. bool fifo_queue = true; int max_queue_requests = 128; uint64_t max_queue_memory_bytes = 0; // Context creation uses shared llama_model/backend state. Keep it serialized // by default while still allowing generation itself to run concurrently. bool serialize_context_creation = true; // Emergency switch for unstable backends: keep request isolation but run one // generation at a time. Default false because the memory gate is the primary // parallelism control. bool serialize_generation = false; // Optional post-context safety check. If DXGI/OS telemetry says memory became // dangerously low immediately after llama_init_from_model(), abort before // launching diffusion kernels. bool post_context_memory_check = true; // Limit httplib worker threads separately from model concurrency. 0 means use // the library default. int http_worker_threads = 0; // GPU memory auto-detection. On Windows this uses DXGI directly. On Linux/Ubuntu it tries DRM sysfs // (/sys/class/drm/card*/device/mem_info_vram_*) and then nvidia-smi fallback, // so you usually do not need --memory-limit-mb on AMD/Intel/NVIDIA systems. // largest = use the largest detected adapter, best for one server per GPU // sum = sum all detected discrete GPU adapters, only use when the model // is genuinely split across GPUs std::string gpu_memory_policy = "largest"; bool gpu_dynamic_memory_guard = true; bool memory_limit_auto_detected = false; // Optional hot context pool. This avoids paying llama_init_from_model() on // every request. Each pooled context is exclusively leased to one request at // a time and cleared before reuse. // // 0 = disabled; old fresh-context-per-request behavior. int context_pool_size = 0; // Pool dimensions. 0 means derive from server defaults/max request caps. int context_pool_n_ctx = 0; int context_pool_n_batch = 0; int context_pool_n_ubatch = 0; // Strict mode rejects requests that do not fit the pool. Non-strict mode // falls back to a temporary fresh context for oversized requests. bool context_pool_strict = false; // Clear KV/cache/state before putting a pooled context back. Keep enabled // for request isolation. bool context_pool_clear_on_release = true; }; struct PooledContextSlot { llama_context * ctx = nullptr; int32_t n_ctx = 0; int32_t n_batch = 0; int32_t n_ubatch = 0; uint64_t slot_id = 0; }; struct App { ServerConfig cfg; llama_model * model = nullptr; const llama_vocab * vocab = nullptr; // Resolved from GGUF metadata after load. Used in audit/debug output. std::string model_name = "diffusiongemma"; std::string model_architecture = "diffusion-gemma"; int32_t canvas_length = 0; std::atomic next_request_id{1}; // Memory-gated concurrency scheduler. Requests do not share llama_context, so // they are isolated. This gate only decides when a request may allocate its // fresh context and generation buffers. std::mutex memory_mutex; std::condition_variable memory_cv; uint64_t model_memory_bytes = 0; uint64_t active_request_memory_bytes = 0; uint64_t peak_request_memory_bytes = 0; uint64_t queued_request_memory_bytes = 0; uint64_t peak_queued_request_memory_bytes = 0; int active_requests = 0; int queued_requests = 0; uint64_t next_queue_ticket = 1; std::deque queue_order; // These are deliberately separate from the memory gate. They protect backend // phases that may not be fully thread-safe on every llama.cpp backend. std::mutex context_init_mutex; std::mutex generation_mutex; // Optional pool of warm llama_context objects. A slot is never shared by two // requests at once. It is returned to the pool only after generation finishes // and the KV/cache has been cleared. std::mutex context_pool_mutex; std::condition_variable context_pool_cv; std::vector> context_pool; std::deque context_pool_available; int active_pooled_contexts = 0; uint64_t context_pool_memory_bytes = 0; uint64_t context_pool_acquires = 0; uint64_t context_pool_reuses = 0; uint64_t context_pool_waits = 0; uint64_t context_pool_misses = 0; int32_t context_pool_n_ctx_actual = 0; int32_t context_pool_n_batch_actual = 0; int32_t context_pool_n_ubatch_actual = 0; // Detected GPU memory. memory_limit_bytes is set from this at startup when // the user does not provide --memory-limit-mb. bool gpu_memory_detected = false; std::string gpu_memory_source; std::string gpu_adapter_name; int gpu_adapter_count = 0; uint64_t gpu_dedicated_memory_bytes = 0; uint64_t gpu_budget_bytes = 0; uint64_t gpu_current_usage_bytes_at_start = 0; uint64_t gpu_available_budget_bytes_at_start = 0; // Serialize audit writes so JSONL records are not interleaved. std::mutex audit_mutex; }; struct GenerationResult { std::string text; std::string raw_text; int prompt_tokens = 0; int output_tokens = 0; int raw_chars = 0; int clean_chars = 0; int requested_max_tokens = 0; int processing_tokens_requested = 0; int ctx_headroom_tokens = 0; int min_required_tokens = 0; int n_ctx = 0; int n_batch = 0; int n_ubatch = 0; int blocks_requested = 0; int blocks_completed = 0; int canvas_tokens_processed = 0; double total_seconds = 0.0; double generation_seconds = 0.0; uint64_t estimated_request_memory_bytes = 0; uint64_t model_memory_bytes = 0; uint64_t active_request_memory_bytes_after_admit = 0; uint64_t active_request_memory_bytes_after_release = 0; uint64_t memory_limit_bytes = 0; uint64_t memory_safety_margin_bytes = 0; uint64_t os_total_physical_bytes = 0; uint64_t os_available_physical_bytes_at_admit = 0; uint64_t gpu_available_budget_bytes_at_admit = 0; uint64_t queue_wait_ms = 0; uint64_t queue_ticket = 0; int active_requests_after_admit = 0; int queued_requests_after_admit = 0; uint64_t queued_request_memory_bytes_after_admit = 0; double context_init_lock_seconds = 0.0; double generation_lock_wait_seconds = 0.0; double context_pool_wait_seconds = 0.0; bool context_reused = false; uint64_t context_pool_slot_id = 0; }; struct RequestOptions { int max_tokens = 0; float temperature = 0.0f; bool stream = false; // Allocation controls. 0 means auto/default. int processing_tokens = 0; // alias: allocated_tokens int n_ctx = 0; int n_batch = 0; int n_ubatch = 0; int ctx_headroom_tokens = 0; int stream_chunk_chars = 0; }; static std::string trim_ws(std::string s) { while (!s.empty() && (s.front() == '\n' || s.front() == '\r' || s.front() == ' ' || s.front() == '\t')) { s.erase(s.begin()); } while (!s.empty() && (s.back() == '\n' || s.back() == '\r' || s.back() == ' ' || s.back() == '\t')) { s.pop_back(); } return s; } static std::string timestamp_utc_ms() { using namespace std::chrono; const auto now = system_clock::now(); const auto ms = duration_cast(now.time_since_epoch()) % 1000; const std::time_t tt = system_clock::to_time_t(now); std::tm tm{}; #if defined(_WIN32) gmtime_s(&tm, &tt); #else gmtime_r(&tt, &tm); #endif std::ostringstream out; out << std::put_time(&tm, "%Y-%m-%dT%H:%M:%S") << "." << std::setw(3) << std::setfill('0') << ms.count() << "Z"; return out.str(); } static double seconds_between( const std::chrono::steady_clock::time_point & a, const std::chrono::steady_clock::time_point & b ) { return std::chrono::duration(b - a).count(); } static double tokens_per_second(int tokens, double seconds) { if (seconds <= 0.0) { return 0.0; } return (double) tokens / seconds; } static uint64_t mb_to_bytes(uint64_t mb) { return mb * 1024ull * 1024ull; } static uint64_t kb_to_bytes(uint64_t kb) { return kb * 1024ull; } static uint64_t file_size_bytes(const std::string & path) { FILE * f = nullptr; #if defined(_WIN32) fopen_s(&f, path.c_str(), "rb"); #else f = fopen(path.c_str(), "rb"); #endif if (!f) { return 0; } #if defined(_WIN32) _fseeki64(f, 0, SEEK_END); const int64_t size = _ftelli64(f); #else fseeko(f, 0, SEEK_END); const off_t size = ftello(f); #endif fclose(f); return size > 0 ? (uint64_t) size : 0; } struct GpuMemoryInfo { bool valid = false; std::string source; std::string adapter_name; int adapter_count = 0; // Physical/local VRAM reported by the adapter. uint64_t dedicated_video_memory_bytes = 0; // Runtime budget/usage reported by the OS. On Windows this is DXGI // QueryVideoMemoryInfo for the LOCAL segment. It is often the most useful // admission-control number because it accounts for memory pressure. uint64_t budget_bytes = 0; uint64_t current_usage_bytes = 0; uint64_t available_budget_bytes = 0; }; #if defined(_WIN32) static std::string wide_to_utf8(const wchar_t * ws) { if (!ws || !*ws) { return ""; } const int needed = WideCharToMultiByte(CP_UTF8, 0, ws, -1, nullptr, 0, nullptr, nullptr); if (needed <= 0) { return ""; } std::string out((size_t) needed, '\0'); WideCharToMultiByte(CP_UTF8, 0, ws, -1, &out[0], needed, nullptr, nullptr); if (!out.empty() && out.back() == '\0') { out.pop_back(); } return out; } #endif static GpuMemoryInfo detect_gpu_memory_info(const std::string & policy) { GpuMemoryInfo result; #if defined(_WIN32) HMODULE dxgi = LoadLibraryA("dxgi.dll"); if (!dxgi) { result.source = "dxgi_load_failed"; return result; } using CreateDXGIFactory1Fn = HRESULT (WINAPI *)(REFIID, void **); auto create_factory = reinterpret_cast(GetProcAddress(dxgi, "CreateDXGIFactory1")); if (!create_factory) { result.source = "CreateDXGIFactory1_missing"; FreeLibrary(dxgi); return result; } IDXGIFactory1 * factory = nullptr; HRESULT hr = create_factory(__uuidof(IDXGIFactory1), reinterpret_cast(&factory)); if (FAILED(hr) || !factory) { result.source = "CreateDXGIFactory1_failed"; FreeLibrary(dxgi); return result; } struct Candidate { std::string name; uint64_t dedicated = 0; uint64_t budget = 0; uint64_t current_usage = 0; uint64_t available_budget = 0; }; std::vector candidates; for (UINT i = 0;; ++i) { IDXGIAdapter1 * adapter = nullptr; hr = factory->EnumAdapters1(i, &adapter); if (hr == DXGI_ERROR_NOT_FOUND) { break; } if (FAILED(hr) || !adapter) { continue; } DXGI_ADAPTER_DESC1 desc{}; if (SUCCEEDED(adapter->GetDesc1(&desc))) { const bool is_software = (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) != 0; // DedicatedVideoMemory is 0 or tiny for many iGPUs. For this server we // care about the selected discrete inference GPU budget. if (!is_software && desc.DedicatedVideoMemory > 0) { Candidate c; c.name = wide_to_utf8(desc.Description); c.dedicated = (uint64_t) desc.DedicatedVideoMemory; IDXGIAdapter3 * adapter3 = nullptr; if (SUCCEEDED(adapter->QueryInterface(__uuidof(IDXGIAdapter3), reinterpret_cast(&adapter3))) && adapter3) { DXGI_QUERY_VIDEO_MEMORY_INFO info{}; if (SUCCEEDED(adapter3->QueryVideoMemoryInfo(0, DXGI_MEMORY_SEGMENT_GROUP_LOCAL, &info))) { c.budget = (uint64_t) info.Budget; c.current_usage = (uint64_t) info.CurrentUsage; c.available_budget = c.budget > c.current_usage ? c.budget - c.current_usage : 0; } adapter3->Release(); } candidates.push_back(c); } } adapter->Release(); } factory->Release(); FreeLibrary(dxgi); if (candidates.empty()) { result.source = "dxgi_no_discrete_adapter"; return result; } result.valid = true; result.source = "dxgi"; result.adapter_count = (int) candidates.size(); if (policy == "sum") { result.adapter_name = "sum_of_detected_adapters"; for (const Candidate & c : candidates) { result.dedicated_video_memory_bytes += c.dedicated; result.budget_bytes += c.budget > 0 ? c.budget : c.dedicated; result.current_usage_bytes += c.current_usage; result.available_budget_bytes += c.available_budget; } } else { auto best = std::max_element(candidates.begin(), candidates.end(), [](const Candidate & a, const Candidate & b) { return a.dedicated < b.dedicated; }); result.adapter_name = best->name; result.dedicated_video_memory_bytes = best->dedicated; result.budget_bytes = best->budget > 0 ? best->budget : best->dedicated; result.current_usage_bytes = best->current_usage; result.available_budget_bytes = best->available_budget; } #else struct Candidate { std::string name; uint64_t dedicated = 0; uint64_t current_usage = 0; uint64_t available_budget = 0; std::string source; }; std::vector candidates; auto read_u64_file = [](const std::filesystem::path & path, uint64_t & value) -> bool { std::ifstream f(path); if (!f) { return false; } uint64_t v = 0; f >> v; if (!f) { return false; } value = v; return true; }; auto read_string_file = [](const std::filesystem::path & path) -> std::string { std::ifstream f(path); if (!f) { return ""; } std::string v; std::getline(f, v); return v; }; auto trim_newline = [](std::string v) -> std::string { while (!v.empty() && (v.back() == '\n' || v.back() == '\r' || v.back() == ' ' || v.back() == '\t')) { v.pop_back(); } return v; }; // Linux generic DRM sysfs path. This works for AMDGPU officially and also // works on many modern discrete Linux GPU drivers that expose DRM memory // accounting under /sys/class/drm/card*/device. // Official AMDGPU docs define mem_info_vram_total and mem_info_vram_used as // byte counters for total and currently used VRAM. try { const std::filesystem::path drm_root("/sys/class/drm"); if (std::filesystem::exists(drm_root)) { for (const auto & entry : std::filesystem::directory_iterator(drm_root)) { const std::string card = entry.path().filename().string(); if (card.rfind("card", 0) != 0 || card.find('-') != std::string::npos) { continue; } const auto dev = entry.path() / "device"; uint64_t total = 0; uint64_t used = 0; if (!read_u64_file(dev / "mem_info_vram_total", total)) { // Some Intel/xe/i915 setups expose local memory size through // local_mem_size rather than the amdgpu-style VRAM file. read_u64_file(dev / "local_mem_size", total); } read_u64_file(dev / "mem_info_vram_used", used); if (total == 0) { continue; } Candidate c; const std::string vendor = trim_newline(read_string_file(dev / "vendor")); const std::string device = trim_newline(read_string_file(dev / "device")); c.name = card; if (!vendor.empty() || !device.empty()) { c.name += " vendor=" + vendor + " device=" + device; } c.dedicated = total; c.current_usage = used; c.available_budget = total > used ? total - used : 0; c.source = "linux_drm_sysfs"; candidates.push_back(c); } } } catch (const std::exception &) { // sysfs enumeration is best-effort; fall through to vendor-tool fallback. } // NVIDIA fallback without adding a build-time NVML dependency. This uses the // nvidia-smi command if present. It reports MiB, so convert to bytes. It is // intentionally a fallback because spawning a process per memory check is // slower than reading sysfs/DXGI. if (candidates.empty()) { FILE * pipe = popen("nvidia-smi --query-gpu=name,memory.total,memory.used --format=csv,noheader,nounits 2>/dev/null", "r"); if (pipe) { char line[1024]; while (fgets(line, sizeof(line), pipe)) { std::string row(line); row = trim_newline(row); if (row.empty()) { continue; } std::vector cols; std::stringstream ss(row); std::string col; while (std::getline(ss, col, ',')) { cols.push_back(trim_ws(col)); } if (cols.size() >= 3) { try { Candidate c; c.name = cols[0]; c.dedicated = mb_to_bytes((uint64_t) std::stoull(cols[1])); c.current_usage = mb_to_bytes((uint64_t) std::stoull(cols[2])); c.available_budget = c.dedicated > c.current_usage ? c.dedicated - c.current_usage : 0; c.source = "nvidia_smi"; if (c.dedicated > 0) { candidates.push_back(c); } } catch (...) { // Ignore malformed rows. } } } pclose(pipe); } } if (candidates.empty()) { result.source = "linux_gpu_memory_detection_unavailable"; (void) policy; return result; } result.valid = true; result.adapter_count = (int) candidates.size(); if (policy == "sum") { result.source = "linux_gpu_memory_sum"; result.adapter_name = "sum_of_detected_linux_adapters"; for (const Candidate & c : candidates) { result.dedicated_video_memory_bytes += c.dedicated; result.current_usage_bytes += c.current_usage; result.available_budget_bytes += c.available_budget; } result.budget_bytes = result.dedicated_video_memory_bytes; } else { auto best = std::max_element(candidates.begin(), candidates.end(), [](const Candidate & a, const Candidate & b) { return a.dedicated < b.dedicated; }); result.source = best->source; result.adapter_name = best->name; result.dedicated_video_memory_bytes = best->dedicated; result.budget_bytes = best->dedicated; result.current_usage_bytes = best->current_usage; result.available_budget_bytes = best->available_budget; } #endif return result; } static uint64_t os_total_physical_memory_bytes() { #if defined(_WIN32) MEMORYSTATUSEX status; status.dwLength = sizeof(status); if (GlobalMemoryStatusEx(&status)) { return (uint64_t) status.ullTotalPhys; } return 0; #else const long pages = sysconf(_SC_PHYS_PAGES); const long page_size = sysconf(_SC_PAGE_SIZE); if (pages > 0 && page_size > 0) { return (uint64_t) pages * (uint64_t) page_size; } return 0; #endif } static uint64_t os_available_physical_memory_bytes() { #if defined(_WIN32) MEMORYSTATUSEX status; status.dwLength = sizeof(status); if (GlobalMemoryStatusEx(&status)) { return (uint64_t) status.ullAvailPhys; } return std::numeric_limits::max(); #else #if defined(_SC_AVPHYS_PAGES) const long pages = sysconf(_SC_AVPHYS_PAGES); const long page_size = sysconf(_SC_PAGE_SIZE); if (pages > 0 && page_size > 0) { return (uint64_t) pages * (uint64_t) page_size; } #endif return std::numeric_limits::max(); #endif } static double bytes_to_mib(uint64_t bytes) { return (double) bytes / (1024.0 * 1024.0); } static uint64_t estimate_request_memory_bytes(const App & app, int32_t n_ctx, int32_t n_batch, int32_t n_ubatch) { const uint64_t token_slots = (uint64_t) std::max(n_ctx, std::max(n_batch, n_ubatch)); return app.cfg.request_base_memory_bytes + token_slots * app.cfg.bytes_per_token; } static bool context_pool_enabled(const App & app) { return app.cfg.context_pool_size > 0 && !app.context_pool.empty(); } static bool context_pool_can_satisfy(const App & app, int32_t n_ctx, int32_t n_batch, int32_t n_ubatch) { if (!context_pool_enabled(app)) { return false; } return n_ctx <= app.context_pool_n_ctx_actual && n_batch <= app.context_pool_n_batch_actual && n_ubatch <= app.context_pool_n_ubatch_actual; } static uint64_t estimate_active_request_memory_bytes( const App & app, int32_t n_ctx, int32_t n_batch, int32_t n_ubatch, bool will_use_context_pool ) { // If the request uses a pre-created pooled context, the large n_ctx/n_batch // GPU allocations are already resident and accounted as context_pool_memory. // Keep only a conservative per-request working estimate so the scheduler // still limits active CPU/output buffers and transient backend allocations. if (will_use_context_pool) { return app.cfg.request_base_memory_bytes; } return estimate_request_memory_bytes(app, n_ctx, n_batch, n_ubatch); } struct ContextLease { App * app = nullptr; llama_context * ctx = nullptr; PooledContextSlot * slot = nullptr; bool pooled = false; ContextLease() = default; ContextLease(App * app_, llama_context * ctx_) : app(app_), ctx(ctx_), slot(nullptr), pooled(false) {} ContextLease(App * app_, PooledContextSlot * slot_) : app(app_), ctx(slot_ ? slot_->ctx : nullptr), slot(slot_), pooled(true) {} ContextLease(const ContextLease &) = delete; ContextLease & operator=(const ContextLease &) = delete; ContextLease(ContextLease && other) noexcept { app = other.app; ctx = other.ctx; slot = other.slot; pooled = other.pooled; other.app = nullptr; other.ctx = nullptr; other.slot = nullptr; other.pooled = false; } ContextLease & operator=(ContextLease && other) noexcept { if (this != &other) { release(); app = other.app; ctx = other.ctx; slot = other.slot; pooled = other.pooled; other.app = nullptr; other.ctx = nullptr; other.slot = nullptr; other.pooled = false; } return *this; } ~ContextLease() { release(); } void release() { if (!ctx) { return; } if (pooled && app && slot) { if (app->cfg.context_pool_clear_on_release) { llama_memory_clear(llama_get_memory(ctx), true); } { std::lock_guard guard(app->context_pool_mutex); app->active_pooled_contexts = std::max(0, app->active_pooled_contexts - 1); app->context_pool_available.push_back(slot); } app->context_pool_cv.notify_one(); } else { llama_free(ctx); } app = nullptr; ctx = nullptr; slot = nullptr; pooled = false; } }; static ContextLease acquire_context_lease( App & app, int32_t n_ctx, int32_t n_batch, int32_t n_ubatch, bool & reused_out, uint64_t & pool_slot_id_out, double & pool_wait_seconds_out ) { reused_out = false; pool_slot_id_out = 0; pool_wait_seconds_out = 0.0; if (context_pool_can_satisfy(app, n_ctx, n_batch, n_ubatch)) { const auto wait_start = std::chrono::steady_clock::now(); std::unique_lock lock(app.context_pool_mutex); app.context_pool_waits++; app.context_pool_cv.wait(lock, [&]() { return !app.context_pool_available.empty(); }); PooledContextSlot * slot = app.context_pool_available.front(); app.context_pool_available.pop_front(); app.active_pooled_contexts++; app.context_pool_acquires++; app.context_pool_reuses++; lock.unlock(); // Clear again on acquire so a failed previous request cannot leak state. if (app.cfg.context_pool_clear_on_release) { llama_memory_clear(llama_get_memory(slot->ctx), true); } llama_set_n_threads(slot->ctx, app.cfg.n_threads, app.cfg.n_threads); const auto wait_end = std::chrono::steady_clock::now(); reused_out = true; pool_slot_id_out = slot->slot_id; pool_wait_seconds_out = seconds_between(wait_start, wait_end); return ContextLease(&app, slot); } if (app.cfg.context_pool_size > 0 && app.cfg.context_pool_strict) { std::ostringstream err; err << "request does not fit the configured context pool: requested n_ctx=" << n_ctx << ", n_batch=" << n_batch << ", n_ubatch=" << n_ubatch << "; pool n_ctx=" << app.context_pool_n_ctx_actual << ", n_batch=" << app.context_pool_n_batch_actual << ", n_ubatch=" << app.context_pool_n_ubatch_actual << ". Increase --context-pool-n-ctx/--context-pool-n-batch/--context-pool-n-ubatch or disable --context-pool-strict."; throw std::runtime_error(err.str()); } { std::lock_guard lock(app.context_pool_mutex); app.context_pool_misses++; } llama_context_params ctx_params = llama_context_default_params(); ctx_params.n_ctx = n_ctx; ctx_params.n_batch = n_batch; ctx_params.n_ubatch = n_ubatch; ctx_params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; std::unique_lock context_lock; if (app.cfg.serialize_context_creation) { context_lock = std::unique_lock(app.context_init_mutex); } llama_context * ctx = llama_init_from_model(app.model, ctx_params); if (!ctx) { throw std::runtime_error("failed to create llama_context"); } llama_set_n_threads(ctx, app.cfg.n_threads, app.cfg.n_threads); return ContextLease(&app, ctx); } static void init_context_pool(App & app) { if (app.cfg.context_pool_size <= 0) { return; } int32_t n_ctx = app.cfg.context_pool_n_ctx; if (n_ctx <= 0) { if (app.cfg.default_n_ctx > 0) { n_ctx = app.cfg.default_n_ctx; } else if (app.cfg.default_processing_tokens > 0) { n_ctx = app.cfg.default_processing_tokens; } else { const int32_t cl = app.canvas_length > 0 ? app.canvas_length : app.cfg.default_max_tokens; const int32_t blocks = app.canvas_length > 0 ? std::max(1, (app.cfg.default_max_tokens + cl - 1) / cl) : 1; n_ctx = std::max(app.cfg.default_max_tokens, blocks * cl + app.cfg.ctx_headroom_tokens); } } int32_t n_batch = app.cfg.context_pool_n_batch > 0 ? app.cfg.context_pool_n_batch : n_ctx; int32_t n_ubatch = app.cfg.context_pool_n_ubatch > 0 ? app.cfg.context_pool_n_ubatch : n_batch; app.context_pool_n_ctx_actual = n_ctx; app.context_pool_n_batch_actual = n_batch; app.context_pool_n_ubatch_actual = n_ubatch; std::cerr << "Creating context pool: size=" << app.cfg.context_pool_size << ", n_ctx=" << n_ctx << ", n_batch=" << n_batch << ", n_ubatch=" << n_ubatch << "\n"; uint64_t pool_bytes = 0; for (int i = 0; i < app.cfg.context_pool_size; ++i) { llama_context_params ctx_params = llama_context_default_params(); ctx_params.n_ctx = n_ctx; ctx_params.n_batch = n_batch; ctx_params.n_ubatch = n_ubatch; ctx_params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; std::unique_lock context_lock; if (app.cfg.serialize_context_creation) { context_lock = std::unique_lock(app.context_init_mutex); } llama_context * ctx = llama_init_from_model(app.model, ctx_params); if (!ctx) { throw std::runtime_error("failed to create pooled llama_context"); } llama_set_n_threads(ctx, app.cfg.n_threads, app.cfg.n_threads); llama_memory_clear(llama_get_memory(ctx), true); auto slot = std::make_unique(); slot->ctx = ctx; slot->n_ctx = n_ctx; slot->n_batch = n_batch; slot->n_ubatch = n_ubatch; slot->slot_id = (uint64_t) i + 1; app.context_pool_available.push_back(slot.get()); app.context_pool.push_back(std::move(slot)); pool_bytes += estimate_request_memory_bytes(app, n_ctx, n_batch, n_ubatch); } app.context_pool_memory_bytes = pool_bytes; // The admission gate treats context_pool_memory_bytes as a separate resident // baseline. Active pooled requests then reserve only request_base_memory_bytes. std::cerr << "Context pool ready. Estimated resident pool memory = " << bytes_to_mib(pool_bytes) << " MiB\n"; } static void free_context_pool(App & app) { std::lock_guard lock(app.context_pool_mutex); for (auto & slot : app.context_pool) { if (slot && slot->ctx) { llama_free(slot->ctx); slot->ctx = nullptr; } } app.context_pool_available.clear(); app.context_pool.clear(); app.active_pooled_contexts = 0; } static json memory_status_snapshot_locked(const App & app) { const uint64_t os_total = os_total_physical_memory_bytes(); const uint64_t os_avail = os_available_physical_memory_bytes(); const GpuMemoryInfo gpu_now = detect_gpu_memory_info(app.cfg.gpu_memory_policy); const uint64_t tracked_total = app.model_memory_bytes + app.context_pool_memory_bytes + app.active_request_memory_bytes + app.cfg.memory_safety_margin_bytes; json j; j["model_memory_bytes"] = app.model_memory_bytes; j["model_memory_mib"] = bytes_to_mib(app.model_memory_bytes); j["active_request_memory_bytes"] = app.active_request_memory_bytes; j["active_request_memory_mib"] = bytes_to_mib(app.active_request_memory_bytes); j["peak_request_memory_bytes"] = app.peak_request_memory_bytes; j["peak_request_memory_mib"] = bytes_to_mib(app.peak_request_memory_bytes); j["queued_request_memory_bytes"] = app.queued_request_memory_bytes; j["queued_request_memory_mib"] = bytes_to_mib(app.queued_request_memory_bytes); j["peak_queued_request_memory_bytes"] = app.peak_queued_request_memory_bytes; j["peak_queued_request_memory_mib"] = bytes_to_mib(app.peak_queued_request_memory_bytes); j["tracked_total_with_safety_bytes"] = tracked_total; j["tracked_total_with_safety_mib"] = bytes_to_mib(tracked_total); j["memory_limit_bytes"] = app.cfg.memory_limit_bytes; j["memory_limit_mib"] = bytes_to_mib(app.cfg.memory_limit_bytes); j["memory_safety_margin_bytes"] = app.cfg.memory_safety_margin_bytes; j["memory_safety_margin_mib"] = bytes_to_mib(app.cfg.memory_safety_margin_bytes); j["bytes_per_token"] = app.cfg.bytes_per_token; j["request_base_memory_bytes"] = app.cfg.request_base_memory_bytes; j["request_base_memory_mib"] = bytes_to_mib(app.cfg.request_base_memory_bytes); j["active_requests"] = app.active_requests; j["queued_requests"] = app.queued_requests; j["queue_depth"] = (int) app.queue_order.size(); j["next_queue_ticket"] = app.next_queue_ticket; j["max_concurrent_requests"] = app.cfg.max_concurrent_requests; j["fifo_queue"] = app.cfg.fifo_queue; j["max_queue_requests"] = app.cfg.max_queue_requests; j["max_queue_memory_bytes"] = app.cfg.max_queue_memory_bytes; j["max_queue_memory_mib"] = bytes_to_mib(app.cfg.max_queue_memory_bytes); j["serialize_context_creation"] = app.cfg.serialize_context_creation; j["serialize_generation"] = app.cfg.serialize_generation; j["post_context_memory_check"] = app.cfg.post_context_memory_check; j["http_worker_threads"] = app.cfg.http_worker_threads; j["context_pool_size"] = app.cfg.context_pool_size; j["context_pool_strict"] = app.cfg.context_pool_strict; j["context_pool_clear_on_release"] = app.cfg.context_pool_clear_on_release; j["context_pool_n_ctx"] = app.context_pool_n_ctx_actual; j["context_pool_n_batch"] = app.context_pool_n_batch_actual; j["context_pool_n_ubatch"] = app.context_pool_n_ubatch_actual; j["context_pool_memory_bytes"] = app.context_pool_memory_bytes; j["context_pool_memory_mib"] = bytes_to_mib(app.context_pool_memory_bytes); j["context_pool_available"] = (int) app.context_pool_available.size(); j["active_pooled_contexts"] = app.active_pooled_contexts; j["context_pool_reuses"] = app.context_pool_reuses; j["context_pool_misses"] = app.context_pool_misses; j["os_total_physical_bytes"] = os_total; j["os_total_physical_mib"] = bytes_to_mib(os_total); j["os_available_physical_bytes"] = os_avail == std::numeric_limits::max() ? 0 : os_avail; j["os_available_physical_mib"] = os_avail == std::numeric_limits::max() ? 0.0 : bytes_to_mib(os_avail); j["gpu_memory_detected"] = app.gpu_memory_detected; j["gpu_memory_policy"] = app.cfg.gpu_memory_policy; j["gpu_memory_source"] = app.gpu_memory_source; j["gpu_adapter_name"] = app.gpu_adapter_name; j["gpu_adapter_count"] = app.gpu_adapter_count; j["gpu_dedicated_memory_bytes"] = app.gpu_dedicated_memory_bytes; j["gpu_dedicated_memory_mib"] = bytes_to_mib(app.gpu_dedicated_memory_bytes); j["gpu_budget_bytes_at_start"] = app.gpu_budget_bytes; j["gpu_budget_mib_at_start"] = bytes_to_mib(app.gpu_budget_bytes); j["gpu_current_usage_bytes_at_start"] = app.gpu_current_usage_bytes_at_start; j["gpu_current_usage_mib_at_start"] = bytes_to_mib(app.gpu_current_usage_bytes_at_start); j["gpu_available_budget_bytes_at_start"] = app.gpu_available_budget_bytes_at_start; j["gpu_available_budget_mib_at_start"] = bytes_to_mib(app.gpu_available_budget_bytes_at_start); j["gpu_budget_bytes_now"] = gpu_now.budget_bytes; j["gpu_budget_mib_now"] = bytes_to_mib(gpu_now.budget_bytes); j["gpu_current_usage_bytes_now"] = gpu_now.current_usage_bytes; j["gpu_current_usage_mib_now"] = bytes_to_mib(gpu_now.current_usage_bytes); j["gpu_available_budget_bytes_now"] = gpu_now.available_budget_bytes; j["gpu_available_budget_mib_now"] = bytes_to_mib(gpu_now.available_budget_bytes); j["gpu_dynamic_memory_guard"] = app.cfg.gpu_dynamic_memory_guard; j["memory_limit_auto_detected"] = app.cfg.memory_limit_auto_detected; return j; } static json memory_status_snapshot(App & app) { std::lock_guard guard(app.memory_mutex); return memory_status_snapshot_locked(app); } static json physical_memory_pressure_snapshot(const App & app) { const uint64_t os_total = os_total_physical_memory_bytes(); const uint64_t os_avail_raw = os_available_physical_memory_bytes(); const bool os_avail_valid = os_avail_raw != std::numeric_limits::max(); const uint64_t os_avail = os_avail_valid ? os_avail_raw : 0; const uint64_t os_used = (os_total > 0 && os_avail_valid && os_total >= os_avail) ? (os_total - os_avail) : 0; const GpuMemoryInfo gpu_now = detect_gpu_memory_info(app.cfg.gpu_memory_policy); const bool gpu_valid = gpu_now.valid || app.gpu_memory_detected; const std::string gpu_source = gpu_now.valid ? gpu_now.source : app.gpu_memory_source; const std::string gpu_adapter = gpu_now.valid ? gpu_now.adapter_name : app.gpu_adapter_name; const uint64_t gpu_dedicated = gpu_now.valid ? gpu_now.dedicated_video_memory_bytes : app.gpu_dedicated_memory_bytes; const uint64_t gpu_budget = gpu_now.valid ? gpu_now.budget_bytes : app.gpu_budget_bytes; const uint64_t gpu_used = gpu_now.valid ? gpu_now.current_usage_bytes : app.gpu_current_usage_bytes_at_start; const uint64_t gpu_available = gpu_now.valid ? gpu_now.available_budget_bytes : app.gpu_available_budget_bytes_at_start; json j; j["system_physical"] = { {"total_bytes", os_total}, {"total_mib", bytes_to_mib(os_total)}, {"available_bytes", os_avail}, {"available_mib", bytes_to_mib(os_avail)}, {"currently_consumed_bytes", os_used}, {"currently_consumed_mib", bytes_to_mib(os_used)}, {"available_valid", os_avail_valid} }; j["gpu_physical"] = { {"detected", gpu_valid}, {"source", gpu_source}, {"policy", app.cfg.gpu_memory_policy}, {"adapter_name", gpu_adapter}, {"dedicated_total_bytes", gpu_dedicated}, {"dedicated_total_mib", bytes_to_mib(gpu_dedicated)}, {"budget_bytes_now", gpu_budget}, {"budget_mib_now", bytes_to_mib(gpu_budget)}, {"currently_consumed_bytes_now", gpu_used}, {"currently_consumed_mib_now", bytes_to_mib(gpu_used)}, {"available_budget_bytes_now", gpu_available}, {"available_budget_mib_now", bytes_to_mib(gpu_available)} }; return j; } struct MemoryReservation { App * app = nullptr; uint64_t request_id = 0; uint64_t bytes = 0; bool active = false; MemoryReservation() = default; MemoryReservation(App * app_, uint64_t request_id_, uint64_t bytes_) : app(app_), request_id(request_id_), bytes(bytes_), active(true) {} MemoryReservation(const MemoryReservation &) = delete; MemoryReservation & operator=(const MemoryReservation &) = delete; MemoryReservation(MemoryReservation && other) noexcept { app = other.app; request_id = other.request_id; bytes = other.bytes; active = other.active; other.app = nullptr; other.bytes = 0; other.active = false; } MemoryReservation & operator=(MemoryReservation && other) noexcept { if (this != &other) { release(); app = other.app; request_id = other.request_id; bytes = other.bytes; active = other.active; other.app = nullptr; other.bytes = 0; other.active = false; } return *this; } ~MemoryReservation() { release(); } void release() { if (!active || !app) { return; } { std::lock_guard guard(app->memory_mutex); if (app->active_request_memory_bytes >= bytes) { app->active_request_memory_bytes -= bytes; } else { app->active_request_memory_bytes = 0; } app->active_requests = std::max(0, app->active_requests - 1); } app->memory_cv.notify_all(); active = false; } }; static bool request_exceeds_fixed_budget_locked(const App & app, uint64_t request_bytes) { if (app.cfg.memory_limit_bytes == 0) { return false; } return app.model_memory_bytes + app.context_pool_memory_bytes + request_bytes + app.cfg.memory_safety_margin_bytes > app.cfg.memory_limit_bytes; } static bool queue_is_over_capacity_locked(const App & app, uint64_t request_bytes) { if (app.cfg.max_queue_requests > 0 && app.queued_requests >= app.cfg.max_queue_requests) { return true; } if (app.cfg.max_queue_memory_bytes > 0 && app.queued_request_memory_bytes + request_bytes > app.cfg.max_queue_memory_bytes) { return true; } return false; } static void erase_queue_ticket_locked(App & app, uint64_t ticket) { auto it = std::find(app.queue_order.begin(), app.queue_order.end(), ticket); if (it != app.queue_order.end()) { app.queue_order.erase(it); } } static bool queue_turn_ready_locked(const App & app, uint64_t ticket) { if (!app.cfg.fifo_queue) { return true; } return !app.queue_order.empty() && app.queue_order.front() == ticket; } static bool can_admit_request_locked(const App & app, uint64_t request_bytes, uint64_t ticket) { if (!queue_turn_ready_locked(app, ticket)) { return false; } if (app.cfg.max_concurrent_requests > 0 && app.active_requests >= app.cfg.max_concurrent_requests) { return false; } if (app.cfg.memory_limit_bytes > 0) { const uint64_t projected = app.model_memory_bytes + app.context_pool_memory_bytes + app.active_request_memory_bytes + request_bytes + app.cfg.memory_safety_margin_bytes; if (projected > app.cfg.memory_limit_bytes) { return false; } } // Dynamic GPU memory guard. DXGI Budget/CurrentUsage reflects the current // local video-memory pressure, including other processes. This prevents a // request from starting when the GPU budget is already too tight. if (app.cfg.gpu_dynamic_memory_guard) { const GpuMemoryInfo gpu = detect_gpu_memory_info(app.cfg.gpu_memory_policy); if (gpu.valid && gpu.budget_bytes > 0) { if (request_bytes + app.cfg.memory_safety_margin_bytes > gpu.available_budget_bytes) { return false; } } else { // Fallback OS physical memory guard when GPU memory telemetry is unavailable. const uint64_t avail_phys = os_available_physical_memory_bytes(); if (avail_phys != std::numeric_limits::max()) { if (request_bytes + app.cfg.memory_safety_margin_bytes > avail_phys) { return false; } } } } return true; } static MemoryReservation acquire_memory_reservation( App & app, uint64_t request_id, const std::string & endpoint, uint64_t request_bytes, uint64_t & queue_wait_ms_out, uint64_t & active_after_admit_out, uint64_t & os_available_at_admit_out, uint64_t & gpu_available_budget_at_admit_out, uint64_t & queue_ticket_out, int & active_requests_after_admit_out, int & queued_requests_after_admit_out, uint64_t & queued_memory_after_admit_out ) { const auto t_queue_start = std::chrono::steady_clock::now(); std::unique_lock lock(app.memory_mutex); if (request_exceeds_fixed_budget_locked(app, request_bytes)) { std::ostringstream err; err << "single request exceeds configured memory budget: model_memory_mib=" << bytes_to_mib(app.model_memory_bytes) << ", context_pool_memory_mib=" << bytes_to_mib(app.context_pool_memory_bytes) << ", request_memory_mib=" << bytes_to_mib(request_bytes) << ", safety_margin_mib=" << bytes_to_mib(app.cfg.memory_safety_margin_bytes) << ", memory_limit_mib=" << bytes_to_mib(app.cfg.memory_limit_bytes) << ". Increase --memory-limit-mb or reduce max_tokens / processing_tokens."; throw std::runtime_error(err.str()); } if (queue_is_over_capacity_locked(app, request_bytes)) { std::ostringstream err; err << "request rejected because memory queue is full: queued_requests=" << app.queued_requests << ", max_queue_requests=" << app.cfg.max_queue_requests << ", queued_memory_mib=" << bytes_to_mib(app.queued_request_memory_bytes) << ", max_queue_memory_mib=" << bytes_to_mib(app.cfg.max_queue_memory_bytes); throw std::runtime_error(err.str()); } const uint64_t ticket = app.next_queue_ticket++; queue_ticket_out = ticket; app.queued_requests++; app.queued_request_memory_bytes += request_bytes; app.peak_queued_request_memory_bytes = std::max(app.peak_queued_request_memory_bytes, app.queued_request_memory_bytes); app.queue_order.push_back(ticket); bool admitted = false; auto admitted_pred = [&]() { return can_admit_request_locked(app, request_bytes, ticket); }; try { if (app.cfg.queue_timeout_ms == 0) { app.memory_cv.wait(lock, admitted_pred); admitted = true; } else { const auto timeout = std::chrono::milliseconds(app.cfg.queue_timeout_ms); admitted = app.memory_cv.wait_for(lock, timeout, admitted_pred); } } catch (...) { erase_queue_ticket_locked(app, ticket); app.queued_requests = std::max(0, app.queued_requests - 1); if (app.queued_request_memory_bytes >= request_bytes) { app.queued_request_memory_bytes -= request_bytes; } else { app.queued_request_memory_bytes = 0; } app.memory_cv.notify_all(); throw; } if (!admitted) { erase_queue_ticket_locked(app, ticket); app.queued_requests = std::max(0, app.queued_requests - 1); if (app.queued_request_memory_bytes >= request_bytes) { app.queued_request_memory_bytes -= request_bytes; } else { app.queued_request_memory_bytes = 0; } app.memory_cv.notify_all(); std::ostringstream err; err << "request timed out waiting for memory queue after " << app.cfg.queue_timeout_ms << " ms. request_memory_mib=" << bytes_to_mib(request_bytes) << ", ticket=" << ticket << ", current_memory=" << memory_status_snapshot_locked(app).dump(); throw std::runtime_error(err.str()); } erase_queue_ticket_locked(app, ticket); app.queued_requests = std::max(0, app.queued_requests - 1); if (app.queued_request_memory_bytes >= request_bytes) { app.queued_request_memory_bytes -= request_bytes; } else { app.queued_request_memory_bytes = 0; } app.active_request_memory_bytes += request_bytes; app.peak_request_memory_bytes = std::max(app.peak_request_memory_bytes, app.active_request_memory_bytes); app.active_requests++; active_after_admit_out = app.active_request_memory_bytes; active_requests_after_admit_out = app.active_requests; queued_requests_after_admit_out = app.queued_requests; queued_memory_after_admit_out = app.queued_request_memory_bytes; os_available_at_admit_out = os_available_physical_memory_bytes(); const GpuMemoryInfo gpu_at_admit = detect_gpu_memory_info(app.cfg.gpu_memory_policy); gpu_available_budget_at_admit_out = gpu_at_admit.available_budget_bytes; const auto t_queue_end = std::chrono::steady_clock::now(); queue_wait_ms_out = (uint64_t) std::chrono::duration_cast(t_queue_end - t_queue_start).count(); (void) endpoint; return MemoryReservation(&app, request_id, request_bytes); } static std::string json_error(const std::string & message, int code = 500) { json j; j["error"] = { {"message", message}, {"type", "server_error"}, {"code", code} }; return j.dump(); } static int json_int_any(const json & body, const std::vector & keys, int def) { for (const char * key : keys) { if (body.contains(key) && body[key].is_number_integer()) { return body[key].get(); } } return def; } static bool json_bool_any(const json & body, const std::vector & keys, bool def) { for (const char * key : keys) { if (body.contains(key) && body[key].is_boolean()) { return body[key].get(); } } return def; } static RequestOptions request_options_from_body(const App & app, const json & body, bool completion_endpoint) { RequestOptions opts; opts.max_tokens = completion_endpoint ? json_int_any(body, {"n_predict", "max_tokens"}, app.cfg.default_max_tokens) : json_int_any(body, {"max_tokens", "n_predict"}, app.cfg.default_max_tokens); opts.temperature = body.value("temperature", app.cfg.temperature); opts.stream = json_bool_any(body, {"stream"}, false); opts.processing_tokens = json_int_any( body, {"processing_tokens", "allocated_tokens", "alloc_tokens"}, app.cfg.default_processing_tokens ); opts.n_ctx = json_int_any(body, {"n_ctx", "ctx_size", "context_size"}, app.cfg.default_n_ctx); opts.n_batch = json_int_any(body, {"n_batch", "batch_size"}, app.cfg.default_n_batch); opts.n_ubatch = json_int_any(body, {"n_ubatch", "ubatch_size", "micro_batch_size"}, app.cfg.default_n_ubatch); opts.ctx_headroom_tokens = json_int_any( body, {"ctx_headroom_tokens", "context_headroom", "processing_headroom"}, app.cfg.ctx_headroom_tokens ); opts.stream_chunk_chars = json_int_any( body, {"stream_chunk_chars", "stream_chunk_size"}, app.cfg.stream_chunk_chars ); if (opts.max_tokens <= 0) { opts.max_tokens = app.cfg.default_max_tokens; } if (opts.ctx_headroom_tokens < 0) { opts.ctx_headroom_tokens = 0; } if (opts.stream_chunk_chars <= 0) { opts.stream_chunk_chars = app.cfg.stream_chunk_chars > 0 ? app.cfg.stream_chunk_chars : 96; } return opts; } static const json * json_find_ptr(const json & root, const std::vector & path) { const json * cur = &root; for (const char * key : path) { if (!cur->is_object() || !cur->contains(key)) { return nullptr; } cur = &((*cur)[key]); } return cur; } static double json_number_or(const json & root, const std::vector & path, double def = 0.0) { const json * p = json_find_ptr(root, path); if (!p || !p->is_number()) { return def; } return p->get(); } static int64_t json_i64_or(const json & root, const std::vector & path, int64_t def = 0) { const json * p = json_find_ptr(root, path); if (!p || !p->is_number_integer()) { return def; } return p->get(); } static bool json_bool_or(const json & root, const std::vector & path, bool def = false) { const json * p = json_find_ptr(root, path); if (!p || !p->is_boolean()) { return def; } return p->get(); } static std::string json_string_or(const json & root, const std::vector & path, const std::string & def = "") { const json * p = json_find_ptr(root, path); if (!p || !p->is_string()) { return def; } return p->get(); } static json compact_memory_pressure_summary(const App & app) { const json pressure = physical_memory_pressure_snapshot(app); json j; j["system_available_mib"] = json_number_or(pressure, {"system_physical", "available_mib"}); j["system_consumed_mib"] = json_number_or(pressure, {"system_physical", "currently_consumed_mib"}); j["system_total_mib"] = json_number_or(pressure, {"system_physical", "total_mib"}); j["gpu_detected"] = json_bool_or(pressure, {"gpu_physical", "detected"}); j["gpu_source"] = json_string_or(pressure, {"gpu_physical", "source"}); j["gpu_adapter"] = json_string_or(pressure, {"gpu_physical", "adapter_name"}); j["gpu_available_mib"] = json_number_or(pressure, {"gpu_physical", "available_budget_mib_now"}); j["gpu_consumed_mib"] = json_number_or(pressure, {"gpu_physical", "currently_consumed_mib_now"}); j["gpu_budget_mib"] = json_number_or(pressure, {"gpu_physical", "budget_mib_now"}); j["gpu_dedicated_mib"] = json_number_or(pressure, {"gpu_physical", "dedicated_total_mib"}); return j; } static json summarize_audit_record(App & app, const json & record) { const std::string event = record.value("event", ""); json out; out["event"] = event; // Always include the resolved model identity in compact audit/debug logs. out["model"] = record.value("model", app.model_name); out["model_arch"] = record.value("model_arch", app.model_architecture); if (record.contains("timestamp")) { out["timestamp"] = record["timestamp"]; } if (record.contains("request_id")) { out["request_id"] = record["request_id"]; } if (record.contains("endpoint")) { out["endpoint"] = record["endpoint"]; } if (record.contains("success")) { out["success"] = record["success"]; } if (event == "generation_complete") { out["tokens"] = { {"prompt", json_i64_or(record, {"tokens", "prompt_tokens"})}, {"requested", json_i64_or(record, {"tokens", "requested_max_tokens"})}, {"output", json_i64_or(record, {"tokens", "output_tokens"})}, {"canvas_processed", json_i64_or(record, {"tokens", "canvas_tokens_processed"})} }; out["timing_s"] = { {"queue", json_number_or(record, {"timings_seconds", "queue_wait"})}, {"context", json_number_or(record, {"timings_seconds", "context_create"})}, {"context_pool_wait", json_number_or(record, {"timings_seconds", "context_pool_wait"})}, {"generation", json_number_or(record, {"timings_seconds", "diffusion_generate"})}, {"total", json_number_or(record, {"timings_seconds", "total_request"})} }; out["speed"] = { {"output_tps_total", json_number_or(record, {"throughput", "output_tokens_per_sec_total"})}, {"output_tps_generation", json_number_or(record, {"throughput", "output_tokens_per_sec_generation_only"})}, {"canvas_tps_generation", json_number_or(record, {"throughput", "canvas_tokens_per_sec_generation_only"})} }; out["allocation"] = { {"n_ctx", json_i64_or(record, {"diffusion", "n_ctx"})}, {"n_batch", json_i64_or(record, {"diffusion", "n_batch"})}, {"n_ubatch", json_i64_or(record, {"diffusion", "n_ubatch"})}, {"blocks", json_i64_or(record, {"diffusion", "blocks_completed"})}, {"canvas_length", json_i64_or(record, {"diffusion", "canvas_length"})}, {"request_est_mib", json_number_or(record, {"memory", "estimated_request_memory_mib"})}, {"model_mib", json_number_or(record, {"memory", "model_memory_mib"})}, {"context_reused", json_bool_or(record, {"memory", "context_reused"})}, {"context_pool_slot", json_i64_or(record, {"memory", "context_pool_slot_id"})}, {"context_pool_mib", json_number_or(record, {"memory", "context_pool_memory_mib"})} }; out["queue"] = { {"ticket", json_i64_or(record, {"memory", "queue_ticket"})}, {"active_after_admit", json_i64_or(record, {"memory", "active_requests_after_admit"})}, {"queued_after_admit", json_i64_or(record, {"memory", "queued_requests_after_admit"})}, {"active_mib_after_admit", json_number_or(record, {"memory", "active_request_memory_mib_after_admit"})}, {"active_mib_after_release", json_number_or(record, {"memory", "active_request_memory_mib_after_release"})} }; json mem = compact_memory_pressure_summary(app); mem["memory_limit_mib"] = json_number_or(record, {"memory", "memory_limit_mib"}); mem["safety_margin_mib"] = json_number_or(record, {"memory", "memory_safety_margin_mib"}); out["memory"] = mem; } else if (event == "request_memory_admitted") { out["queue"] = { {"ticket", json_i64_or(record, {"queue_ticket"})}, {"wait_ms", json_i64_or(record, {"queue_wait_ms"})}, {"active_requests", json_i64_or(record, {"active_requests_after_admit"})}, {"queued_requests", json_i64_or(record, {"queued_requests_after_admit"})} }; out["memory"] = { {"request_est_mib", json_number_or(record, {"estimated_request_memory_mib"})}, {"active_mib_after_admit", json_number_or(record, {"active_request_memory_mib_after_admit"})}, {"queued_mib_after_admit", json_number_or(record, {"queued_request_memory_mib_after_admit"})}, {"system_available_mib_at_admit", json_number_or(record, {"os_available_physical_mib_at_admit"})}, {"gpu_available_mib_at_admit", json_number_or(record, {"gpu_available_budget_mib_at_admit"})} }; } else if (event == "request_begin") { out["request"] = { {"max_tokens", record.value("max_tokens", 0)}, {"temperature", record.value("temperature", 0.0)}, {"stream", record.value("stream", false)}, {"processing_tokens", record.value("processing_tokens", 0)}, {"n_ctx", record.value("n_ctx", 0)}, {"n_batch", record.value("n_batch", 0)}, {"n_ubatch", record.value("n_ubatch", 0)}, {"ctx_headroom_tokens", record.value("ctx_headroom_tokens", 0)} }; } else if (event == "request_error") { out["error"] = record.value("error", ""); out["memory"] = compact_memory_pressure_summary(app); } else if (event == "server_start") { out["server"] = { {"host", record.value("host", "")}, {"port", record.value("port", 0)}, {"model_name", record.value("model_name", app.model_name)}, {"model_arch", record.value("model_arch", app.model_architecture)}, {"model_path", record.value("model_path", app.cfg.model_path)}, {"canvas_length", record.value("canvas_length", 0)}, {"default_max_tokens", record.value("default_max_tokens", 0)}, {"max_concurrent_requests", record.value("max_concurrent_requests", 0)}, {"http_worker_threads", record.value("http_worker_threads", 0)} }; out["memory"] = { {"model_mib", record.value("model_memory_mib", 0.0)}, {"memory_limit_mib", record.value("memory_limit_mib", 0.0)}, {"safety_margin_mib", record.value("memory_safety_margin_mib", 0.0)}, {"gpu_detected", record.value("gpu_memory_detected", false)}, {"gpu_source", record.value("gpu_memory_source", "")}, {"gpu_adapter", record.value("gpu_adapter_name", "")}, {"gpu_budget_mib", record.value("gpu_budget_mib", 0.0)}, {"gpu_available_mib_at_start", record.value("gpu_available_budget_mib_at_start", 0.0)} }; } else { out = record; out["memory"] = compact_memory_pressure_summary(app); } return out; } static void audit_log(App & app, json record) { if (!app.cfg.audit_enabled) { return; } if (!record.contains("timestamp")) { record["timestamp"] = timestamp_utc_ms(); } // Full audit/debug records also carry model identity unless explicitly set. if (!record.contains("model")) { record["model"] = app.model_name; } if (!record.contains("model_arch")) { record["model_arch"] = app.model_architecture; } json log_record = record; if (app.cfg.audit_summary) { log_record = summarize_audit_record(app, record); } else if (app.cfg.audit_memory_pressure && !log_record.contains("memory_pressure")) { log_record["memory_pressure"] = physical_memory_pressure_snapshot(app); } const std::string line = log_record.dump(); std::lock_guard guard(app.audit_mutex); std::cerr << "[audit] " << line << "\n"; if (!app.cfg.audit_log_path.empty()) { std::ofstream f(app.cfg.audit_log_path, std::ios::out | std::ios::app); if (f) { f << line << "\n"; } else { std::cerr << "[audit_error] failed to open audit log file: " << app.cfg.audit_log_path << "\n"; } } } static std::string clean_diffusion_output(std::string s) { const std::string raw = trim_ws(s); // DiffusionGemma may output: // <|channel>thought ... final answer // Keep the text after the last channel close marker, but only if non-empty. const std::string close = ""; const size_t p = s.rfind(close); if (p != std::string::npos) { std::string tail = trim_ws(s.substr(p + close.size())); if (!tail.empty()) { s = tail; } } s = trim_ws(s); // Remove leading channel markers, but never return empty just because cleanup was aggressive. const std::string open = "<|channel>"; while (s.rfind(open, 0) == 0) { s.erase(0, open.size()); s = trim_ws(s); } const char * prefixes[] = { "final", "answer", "assistant" }; for (const char * pref : prefixes) { std::string pfx(pref); if (s.rfind(pfx, 0) == 0) { s.erase(0, pfx.size()); s = trim_ws(s); break; } } const char * stops[] = { "<|end|>", "", "" }; for (const char * stop : stops) { const size_t q = s.find(stop); if (q != std::string::npos) { s.erase(q); } } s = trim_ws(s); return s.empty() ? raw : s; } static int32_t get_meta_i(llama_model * model, const char * key, int32_t def) { char buf[64] = {}; if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) >= 0) { return (int32_t) strtol(buf, nullptr, 10); } return def; } static float get_meta_f(llama_model * model, const char * key, float def) { char buf[64] = {}; if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) >= 0) { return strtof(buf, nullptr); } return def; } static bool get_meta_bool(llama_model * model, const char * key, bool def) { char buf[64] = {}; if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) >= 0) { std::string v = buf; return v == "true" || v == "1"; } return def; } static std::string get_meta_string(llama_model * model, const char * key, const std::string & def) { char buf[512] = {}; if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) >= 0) { std::string v = trim_ws(buf); if (!v.empty()) { return v; } } return def; } static std::string basename_from_path(const std::string & path) { size_t p = path.find_last_of("\\/"); if (p == std::string::npos) { return path; } return path.substr(p + 1); } static int count_gpu_devices() { int gpu_devs = 0; for (size_t i = 0; i < ggml_backend_dev_count(); i++) { const ggml_backend_dev_t dev = ggml_backend_dev_get(i); const enum ggml_backend_dev_type type = ggml_backend_dev_type(dev); if (type == GGML_BACKEND_DEVICE_TYPE_GPU || type == GGML_BACKEND_DEVICE_TYPE_IGPU) { gpu_devs++; } } return gpu_devs; } static size_t trim_canvas_tokens(const llama_vocab * vocab, const llama_token * canvas, size_t n) { size_t cut = n; for (size_t i = 0; i < n; i++) { if (llama_vocab_is_eog(vocab, canvas[i])) { cut = i; break; } } // DiffusionGemma checkpoints sometimes do not emit a clean stop token. // Cut obvious repetition loops. for (size_t i = 0; i + 1 < cut; i++) { bool loop = false; for (size_t stride = 1; stride <= 2 && !loop; stride++) { size_t reps = 0; for (size_t j = i; j + stride < cut && canvas[j] == canvas[j + stride]; j += stride) { reps++; } loop = reps >= 6; } if (loop) { cut = i; break; } } return cut; } static std::string messages_to_prompt(llama_model * model, const json & messages, bool use_chat_template) { if (!messages.is_array()) { throw std::runtime_error("messages must be an array"); } if (!use_chat_template) { std::ostringstream raw; for (const auto & m : messages) { const std::string role = m.value("role", "user"); const std::string content = m.value("content", ""); raw << role << ": " << content << "\n"; } raw << "assistant: "; return raw.str(); } auto tmpls = common_chat_templates_init(model, ""); common_chat_templates_inputs inputs; inputs.add_generation_prompt = true; for (const auto & m : messages) { common_chat_msg msg; msg.role = m.value("role", "user"); if (m.contains("content") && m["content"].is_string()) { msg.content = m["content"].get(); } else if (m.contains("content") && m["content"].is_array()) { std::ostringstream content; for (const auto & part : m["content"]) { if (part.is_object() && part.value("type", "") == "text") { content << part.value("text", ""); } } msg.content = content.str(); } else { msg.content = ""; } inputs.messages.push_back(msg); } return common_chat_templates_apply(tmpls.get(), inputs).prompt; } static std::string completion_prompt_from_body(const json & body) { if (!body.contains("prompt")) { throw std::runtime_error("missing prompt"); } if (!body["prompt"].is_string()) { throw std::runtime_error("prompt must be a string"); } return body["prompt"].get(); } static GenerationResult run_one_request( App & app, uint64_t request_id, const std::string & endpoint, const std::string & formatted_prompt, const RequestOptions & opts ) { const auto t_enter = std::chrono::steady_clock::now(); const int max_tokens = opts.max_tokens > 0 ? opts.max_tokens : app.cfg.default_max_tokens; const float temperature = opts.temperature; const auto t_tokenize_start = std::chrono::steady_clock::now(); std::vector prefix = common_tokenize( app.vocab, formatted_prompt, true, true ); const auto t_tokenize_end = std::chrono::steady_clock::now(); if (prefix.empty()) { throw std::runtime_error("tokenization produced no tokens"); } const int32_t n_input = (int32_t) prefix.size(); llama_token mask_token_id = llama_vocab_mask(app.vocab); if (mask_token_id == LLAMA_TOKEN_NULL) { throw std::runtime_error("model has no mask token"); } int32_t blocks_requested = 1; int32_t blocks_completed = 0; int32_t min_required_tokens = n_input + max_tokens; int32_t n_ctx = min_required_tokens; if (app.canvas_length > 0) { const int32_t cl = app.canvas_length; blocks_requested = (max_tokens + cl - 1) / cl; blocks_requested = std::max(1, blocks_requested); min_required_tokens = n_input + blocks_requested * cl; // Match diffusion-cli.cpp sizing style unless the user overrides it: // -n 256 -> 1 block // n_ctx/n_batch/n_ubatch = blocks * canvas_length + ctx_headroom_tokens const int32_t cli_style = blocks_requested * cl + opts.ctx_headroom_tokens; n_ctx = std::max(cli_style, min_required_tokens); } if (opts.processing_tokens > 0) { n_ctx = opts.processing_tokens; } if (opts.n_ctx > 0) { n_ctx = opts.n_ctx; } if (n_ctx < min_required_tokens) { std::ostringstream err; err << "allocated context is too small: n_ctx=" << n_ctx << " but minimum required is " << min_required_tokens << " tokens. Increase processing_tokens / allocated_tokens / n_ctx or lower max_tokens."; throw std::runtime_error(err.str()); } int32_t n_batch = opts.n_batch > 0 ? opts.n_batch : n_ctx; int32_t n_ubatch = opts.n_ubatch > 0 ? opts.n_ubatch : n_batch; if (n_batch < min_required_tokens || n_ubatch < min_required_tokens) { std::ostringstream err; err << "allocated batch is too small: n_batch=" << n_batch << ", n_ubatch=" << n_ubatch << ", minimum required=" << min_required_tokens << ". For DiffusionGemma, keep n_batch/n_ubatch >= prompt_tokens + canvas tokens."; throw std::runtime_error(err.str()); } const bool context_pool_candidate = context_pool_can_satisfy(app, n_ctx, n_batch, n_ubatch); const uint64_t estimated_request_memory_bytes = estimate_active_request_memory_bytes( app, n_ctx, n_batch, n_ubatch, context_pool_candidate ); uint64_t queue_wait_ms = 0; uint64_t active_memory_after_admit = 0; uint64_t os_available_at_admit = 0; uint64_t gpu_available_budget_at_admit = 0; uint64_t queue_ticket = 0; int active_requests_after_admit = 0; int queued_requests_after_admit = 0; uint64_t queued_memory_after_admit = 0; MemoryReservation memory_reservation = acquire_memory_reservation( app, request_id, endpoint, estimated_request_memory_bytes, queue_wait_ms, active_memory_after_admit, os_available_at_admit, gpu_available_budget_at_admit, queue_ticket, active_requests_after_admit, queued_requests_after_admit, queued_memory_after_admit ); const auto t_lock_acquired = std::chrono::steady_clock::now(); json admit_audit; admit_audit["event"] = "request_memory_admitted"; admit_audit["request_id"] = request_id; admit_audit["endpoint"] = endpoint; admit_audit["queue_ticket"] = queue_ticket; admit_audit["estimated_request_memory_bytes"] = estimated_request_memory_bytes; admit_audit["estimated_request_memory_mib"] = bytes_to_mib(estimated_request_memory_bytes); admit_audit["queue_wait_ms"] = queue_wait_ms; admit_audit["active_request_memory_bytes_after_admit"] = active_memory_after_admit; admit_audit["active_request_memory_mib_after_admit"] = bytes_to_mib(active_memory_after_admit); admit_audit["active_requests_after_admit"] = active_requests_after_admit; admit_audit["queued_requests_after_admit"] = queued_requests_after_admit; admit_audit["queued_request_memory_bytes_after_admit"] = queued_memory_after_admit; admit_audit["queued_request_memory_mib_after_admit"] = bytes_to_mib(queued_memory_after_admit); admit_audit["os_available_physical_bytes_at_admit"] = os_available_at_admit == std::numeric_limits::max() ? 0 : os_available_at_admit; admit_audit["os_available_physical_mib_at_admit"] = os_available_at_admit == std::numeric_limits::max() ? 0.0 : bytes_to_mib(os_available_at_admit); admit_audit["gpu_available_budget_bytes_at_admit"] = gpu_available_budget_at_admit; admit_audit["gpu_available_budget_mib_at_admit"] = bytes_to_mib(gpu_available_budget_at_admit); admit_audit["memory"] = memory_status_snapshot(app); audit_log(app, admit_audit); const auto t_context_lock_wait_start = std::chrono::steady_clock::now(); const auto t_context_start = std::chrono::steady_clock::now(); bool context_reused = false; uint64_t context_pool_slot_id = 0; double context_pool_wait_s = 0.0; ContextLease ctx_lease = acquire_context_lease( app, n_ctx, n_batch, n_ubatch, context_reused, context_pool_slot_id, context_pool_wait_s ); llama_context * ctx = ctx_lease.ctx; if (!ctx) { throw std::runtime_error("failed to acquire llama_context"); } const auto t_context_end = std::chrono::steady_clock::now(); const double context_lock_wait_s = context_reused ? context_pool_wait_s : seconds_between(t_context_lock_wait_start, t_context_start); if (app.cfg.post_context_memory_check && app.cfg.gpu_dynamic_memory_guard) { const GpuMemoryInfo gpu_after_context = detect_gpu_memory_info(app.cfg.gpu_memory_policy); if (gpu_after_context.valid && gpu_after_context.available_budget_bytes > 0 && gpu_after_context.available_budget_bytes < app.cfg.memory_safety_margin_bytes / 2) { ctx_lease.release(); std::ostringstream err; err << "post-context GPU memory safety check failed: available_budget_mib=" << bytes_to_mib(gpu_after_context.available_budget_bytes) << ", required_half_safety_margin_mib=" << bytes_to_mib(app.cfg.memory_safety_margin_bytes / 2); throw std::runtime_error(err.str()); } } std::vector output_tokens(n_ctx); std::vector response_tokens; diffusion_params diff_params; diff_params.mask_token_id = mask_token_id; diff_params.seed = app.cfg.seed; diff_params.temperature = temperature; diff_params.steps = app.cfg.diffusion_steps; diff_params.algorithm = DIFFUSION_ALGORITHM_CONFIDENCE_BASED; diff_params.top_p = app.cfg.top_p; diff_params.top_k = app.cfg.top_k; diff_params.visual_mode = false; diff_params.add_gumbel_noise = false; int eb_max_denoising_steps = 0; int gpu_devs = count_gpu_devices(); const auto t_generation_lock_wait_start = std::chrono::steady_clock::now(); std::unique_lock generation_lock; if (app.cfg.serialize_generation) { generation_lock = std::unique_lock(app.generation_mutex); } const auto t_diffusion_start = std::chrono::steady_clock::now(); const double generation_lock_wait_s = seconds_between(t_generation_lock_wait_start, t_diffusion_start); if (app.canvas_length > 0) { diff_params.shift_logits = get_meta_bool(app.model, "diffusion.shift_logits", false); diff_params.schedule = DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED; diff_params.eps = 1e-3f; diff_params.suppress_mask_token = true; diff_params.self_conditioning = true; } else { diff_params.shift_logits = get_meta_bool(app.model, "diffusion.shift_logits", true); diff_params.schedule = DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED; diff_params.eps = 1e-3f; diff_params.max_length = n_ctx; } if (app.canvas_length > 0) { diffusion_eb_params eb_params; eb_params.max_denoising_steps = get_meta_i(app.model, "diffusion.eb_max_steps", 48); eb_params.t_min = get_meta_f(app.model, "diffusion.eb_t_min", 0.4f); eb_params.t_max = get_meta_f(app.model, "diffusion.eb_t_max", 0.8f); eb_params.entropy_bound = get_meta_f(app.model, "diffusion.eb_entropy_bound", 0.1f); eb_params.stability_threshold = get_meta_i(app.model, "diffusion.eb_stability_threshold", 1); eb_params.confidence_threshold = get_meta_f(app.model, "diffusion.eb_confidence_threshold", 0.005f); eb_params.seed = app.cfg.seed; eb_params.visual_mode = false; eb_max_denoising_steps = eb_params.max_denoising_steps; // Same auto-policy as diffusion-cli.cpp: // single GPU -> KV cache and device sampling on where backend supports it. // multi GPU -> off because these paths are single-device. eb_params.kv_cache = gpu_devs <= 1; eb_params.gpu_sampling = gpu_devs <= 1; eb_params.gpu_sample_reduce = eb_params.gpu_sampling && gpu_devs == 1; for (int32_t b = 0; b < blocks_requested; b++) { const int32_t prefix_len = (int32_t) prefix.size(); const int32_t max_length = prefix_len + app.canvas_length; if (max_length > n_ctx) { if (b == 0) { ctx_lease.release(); throw std::runtime_error("prompt + canvas does not fit in context"); } break; } eb_params.max_length = max_length; int32_t n_generated = 0; diffusion_generate_entropy_bound( ctx, prefix.data(), output_tokens.data(), prefix_len, eb_params, n_generated ); if (n_generated <= prefix_len) { if (b == 0) { ctx_lease.release(); throw std::runtime_error("diffusion generation failed"); } break; } blocks_completed++; const llama_token * canvas = output_tokens.data() + prefix_len; const size_t cut = trim_canvas_tokens(app.vocab, canvas, (size_t) app.canvas_length); response_tokens.insert(response_tokens.end(), canvas, canvas + cut); if ((int) response_tokens.size() >= max_tokens) { response_tokens.resize(max_tokens); break; } if (cut < (size_t) app.canvas_length) { break; } // Commit this canvas block and generate next block. prefix.insert(prefix.end(), canvas, canvas + cut); } } else { int32_t n_generated = 0; diffusion_generate( ctx, prefix.data(), output_tokens.data(), n_input, diff_params, n_generated ); if (n_generated <= n_input) { ctx_lease.release(); throw std::runtime_error("diffusion generation failed"); } response_tokens.assign(output_tokens.begin() + n_input, output_tokens.begin() + n_generated); if ((int) response_tokens.size() > max_tokens) { response_tokens.resize(max_tokens); } blocks_completed = 1; } const auto t_diffusion_end = std::chrono::steady_clock::now(); if (generation_lock.owns_lock()) { generation_lock.unlock(); } ctx_lease.release(); const auto t_detokenize_start = std::chrono::steady_clock::now(); std::string raw_text = common_detokenize(app.vocab, response_tokens, false); std::string clean_text = app.cfg.expose_raw ? trim_ws(raw_text) : clean_diffusion_output(raw_text); const auto t_detokenize_end = std::chrono::steady_clock::now(); memory_reservation.release(); uint64_t active_memory_after_release = 0; { std::lock_guard mem_guard(app.memory_mutex); active_memory_after_release = app.active_request_memory_bytes; } if (app.cfg.log_raw_output) { std::cerr << "\n--- raw diffusion output start ---\n"; std::cerr << raw_text << "\n"; std::cerr << "--- raw diffusion output end ---\n"; } const double queue_wait_s = (double) queue_wait_ms / 1000.0; const double tokenize_s = seconds_between(t_tokenize_start, t_tokenize_end); const double context_s = seconds_between(t_context_start, t_context_end); const double diffusion_s = seconds_between(t_diffusion_start, t_diffusion_end); const double detokenize_s = seconds_between(t_detokenize_start, t_detokenize_end); const double total_s = seconds_between(t_enter, t_detokenize_end); const int output_token_count = (int) response_tokens.size(); const int canvas_tokens_processed = app.canvas_length > 0 ? blocks_completed * app.canvas_length : output_token_count; json audit; audit["event"] = "generation_complete"; audit["request_id"] = request_id; audit["endpoint"] = endpoint; audit["model"] = app.model_name; audit["model_arch"] = app.model_architecture; audit["success"] = true; audit["config"] = { {"temperature", temperature}, {"top_p", app.cfg.top_p}, {"top_k", app.cfg.top_k}, {"seed", app.cfg.seed}, {"n_gpu_layers", app.cfg.n_gpu_layers}, {"n_threads", app.cfg.n_threads}, {"diffusion_steps", app.cfg.diffusion_steps}, {"eb_max_denoising_steps", eb_max_denoising_steps}, {"streaming_enabled", app.cfg.streaming_enabled}, {"stream_requested", opts.stream}, {"fifo_queue", app.cfg.fifo_queue}, {"max_concurrent_requests", app.cfg.max_concurrent_requests}, {"max_queue_requests", app.cfg.max_queue_requests}, {"serialize_context_creation", app.cfg.serialize_context_creation}, {"serialize_generation", app.cfg.serialize_generation}, {"post_context_memory_check", app.cfg.post_context_memory_check}, {"context_pool_size", app.cfg.context_pool_size}, {"context_pool_strict", app.cfg.context_pool_strict}, {"context_pool_clear_on_release", app.cfg.context_pool_clear_on_release} }; audit["tokens"] = { {"prompt_tokens", n_input}, {"requested_max_tokens", max_tokens}, {"processing_tokens_requested", opts.processing_tokens}, {"output_tokens", output_token_count}, {"canvas_tokens_processed", canvas_tokens_processed} }; audit["diffusion"] = { {"canvas_mode", app.canvas_length > 0}, {"canvas_length", app.canvas_length}, {"blocks_requested", blocks_requested}, {"blocks_completed", blocks_completed}, {"min_required_tokens", min_required_tokens}, {"ctx_headroom_tokens", opts.ctx_headroom_tokens}, {"n_ctx", n_ctx}, {"n_batch", n_batch}, {"n_ubatch", n_ubatch}, {"gpu_devices_seen", gpu_devs} }; audit["sizes"] = { {"raw_chars", (int) raw_text.size()}, {"clean_chars", (int) clean_text.size()} }; audit["timings_seconds"] = { {"queue_wait", queue_wait_s}, {"tokenize", tokenize_s}, {"context_init_lock_wait", context_lock_wait_s}, {"context_pool_wait", context_pool_wait_s}, {"context_create", context_s}, {"generation_lock_wait", generation_lock_wait_s}, {"diffusion_generate", diffusion_s}, {"detokenize_and_clean", detokenize_s}, {"total_request", total_s} }; audit["throughput"] = { {"output_tokens_per_sec_total", tokens_per_second(output_token_count, total_s)}, {"output_tokens_per_sec_generation_only", tokens_per_second(output_token_count, diffusion_s)}, {"canvas_tokens_per_sec_total", tokens_per_second(canvas_tokens_processed, total_s)}, {"canvas_tokens_per_sec_generation_only", tokens_per_second(canvas_tokens_processed, diffusion_s)}, {"raw_chars_per_sec_total", tokens_per_second((int) raw_text.size(), total_s)}, {"clean_chars_per_sec_total", tokens_per_second((int) clean_text.size(), total_s)} }; const json memory_pressure_after_release = physical_memory_pressure_snapshot(app); audit["memory"] = { {"model_memory_bytes", app.model_memory_bytes}, {"model_memory_mib", bytes_to_mib(app.model_memory_bytes)}, {"estimated_request_memory_bytes", estimated_request_memory_bytes}, {"estimated_request_memory_mib", bytes_to_mib(estimated_request_memory_bytes)}, {"queue_ticket", queue_ticket}, {"active_requests_after_admit", active_requests_after_admit}, {"queued_requests_after_admit", queued_requests_after_admit}, {"queued_request_memory_bytes_after_admit", queued_memory_after_admit}, {"queued_request_memory_mib_after_admit", bytes_to_mib(queued_memory_after_admit)}, {"active_request_memory_bytes_after_admit", active_memory_after_admit}, {"active_request_memory_mib_after_admit", bytes_to_mib(active_memory_after_admit)}, {"active_request_memory_bytes_after_release", active_memory_after_release}, {"active_request_memory_mib_after_release", bytes_to_mib(active_memory_after_release)}, {"memory_limit_bytes", app.cfg.memory_limit_bytes}, {"memory_limit_mib", bytes_to_mib(app.cfg.memory_limit_bytes)}, {"memory_safety_margin_bytes", app.cfg.memory_safety_margin_bytes}, {"memory_safety_margin_mib", bytes_to_mib(app.cfg.memory_safety_margin_bytes)}, {"bytes_per_token", app.cfg.bytes_per_token}, {"request_base_memory_bytes", app.cfg.request_base_memory_bytes}, {"request_base_memory_mib", bytes_to_mib(app.cfg.request_base_memory_bytes)}, {"context_reused", context_reused}, {"context_pool_slot_id", context_pool_slot_id}, {"context_pool_size", app.cfg.context_pool_size}, {"context_pool_n_ctx", app.context_pool_n_ctx_actual}, {"context_pool_n_batch", app.context_pool_n_batch_actual}, {"context_pool_n_ubatch", app.context_pool_n_ubatch_actual}, {"context_pool_memory_bytes", app.context_pool_memory_bytes}, {"context_pool_memory_mib", bytes_to_mib(app.context_pool_memory_bytes)}, {"os_total_physical_bytes", os_total_physical_memory_bytes()}, {"os_total_physical_mib", bytes_to_mib(os_total_physical_memory_bytes())}, {"os_available_physical_bytes_at_admit", os_available_at_admit}, {"os_available_physical_mib_at_admit", bytes_to_mib(os_available_at_admit)}, {"gpu_available_budget_bytes_at_admit", gpu_available_budget_at_admit}, {"gpu_available_budget_mib_at_admit", bytes_to_mib(gpu_available_budget_at_admit)}, {"gpu_memory_detected", app.gpu_memory_detected}, {"gpu_memory_source", app.gpu_memory_source}, {"gpu_memory_policy", app.cfg.gpu_memory_policy}, {"gpu_adapter_name", app.gpu_adapter_name}, {"gpu_dedicated_memory_bytes", app.gpu_dedicated_memory_bytes}, {"gpu_dedicated_memory_mib", bytes_to_mib(app.gpu_dedicated_memory_bytes)}, {"gpu_budget_bytes_at_start", app.gpu_budget_bytes}, {"gpu_budget_mib_at_start", bytes_to_mib(app.gpu_budget_bytes)}, {"gpu_dynamic_memory_guard", app.cfg.gpu_dynamic_memory_guard}, {"memory_limit_auto_detected", app.cfg.memory_limit_auto_detected}, {"memory_pressure_after_release", memory_pressure_after_release}, {"memory_status_after_release", memory_status_snapshot(app)} }; audit_log(app, audit); GenerationResult result; result.text = clean_text; result.raw_text = raw_text; result.prompt_tokens = n_input; result.output_tokens = output_token_count; result.raw_chars = (int) raw_text.size(); result.clean_chars = (int) clean_text.size(); result.requested_max_tokens = max_tokens; result.processing_tokens_requested = opts.processing_tokens; result.ctx_headroom_tokens = opts.ctx_headroom_tokens; result.min_required_tokens = min_required_tokens; result.n_ctx = n_ctx; result.n_batch = n_batch; result.n_ubatch = n_ubatch; result.blocks_requested = blocks_requested; result.blocks_completed = blocks_completed; result.canvas_tokens_processed = canvas_tokens_processed; result.total_seconds = total_s; result.generation_seconds = diffusion_s; result.estimated_request_memory_bytes = estimated_request_memory_bytes; result.model_memory_bytes = app.model_memory_bytes; result.active_request_memory_bytes_after_admit = active_memory_after_admit; result.active_request_memory_bytes_after_release = active_memory_after_release; result.memory_limit_bytes = app.cfg.memory_limit_bytes; result.memory_safety_margin_bytes = app.cfg.memory_safety_margin_bytes; result.os_total_physical_bytes = os_total_physical_memory_bytes(); result.os_available_physical_bytes_at_admit = os_available_at_admit; result.gpu_available_budget_bytes_at_admit = gpu_available_budget_at_admit; result.queue_wait_ms = queue_wait_ms; result.queue_ticket = queue_ticket; result.active_requests_after_admit = active_requests_after_admit; result.queued_requests_after_admit = queued_requests_after_admit; result.queued_request_memory_bytes_after_admit = queued_memory_after_admit; result.context_init_lock_seconds = context_lock_wait_s; result.generation_lock_wait_seconds = generation_lock_wait_s; result.context_pool_wait_seconds = context_pool_wait_s; result.context_reused = context_reused; result.context_pool_slot_id = context_pool_slot_id; return result; } static std::vector split_text_chunks(const std::string & text, int chunk_chars) { std::vector chunks; if (chunk_chars <= 0) { chunk_chars = 96; } for (size_t i = 0; i < text.size(); i += (size_t) chunk_chars) { chunks.push_back(text.substr(i, (size_t) chunk_chars)); } if (chunks.empty()) { chunks.push_back(""); } return chunks; } static std::string make_openai_chat_stream_response( const std::string & model_name, uint64_t request_id, const GenerationResult & result, int chunk_chars ) { const int64_t created = (int64_t) std::time(nullptr); const std::string id = "chatcmpl-diffusion-local-" + std::to_string(request_id); std::ostringstream out; json first; first["id"] = id; first["object"] = "chat.completion.chunk"; first["created"] = created; first["model"] = model_name; first["choices"] = json::array({{ {"index", 0}, {"delta", {{"role", "assistant"}}}, {"finish_reason", nullptr} }}); out << "data: " << first.dump() << "\n\n"; for (const std::string & chunk : split_text_chunks(result.text, chunk_chars)) { json j; j["id"] = id; j["object"] = "chat.completion.chunk"; j["created"] = created; j["model"] = model_name; j["choices"] = json::array({{ {"index", 0}, {"delta", {{"content", chunk}}}, {"finish_reason", nullptr} }}); out << "data: " << j.dump() << "\n\n"; } json last; last["id"] = id; last["object"] = "chat.completion.chunk"; last["created"] = created; last["model"] = model_name; last["choices"] = json::array({{ {"index", 0}, {"delta", json::object()}, {"finish_reason", "stop"} }}); last["usage"] = { {"prompt_tokens", result.prompt_tokens}, {"completion_tokens", result.output_tokens}, {"total_tokens", result.prompt_tokens + result.output_tokens} }; out << "data: " << last.dump() << "\n\n"; out << "data: [DONE]\n\n"; return out.str(); } static std::string make_completion_stream_response( const std::string & model_name, uint64_t request_id, const GenerationResult & result, int chunk_chars ) { const int64_t created = (int64_t) std::time(nullptr); const std::string id = "cmpl-diffusion-local-" + std::to_string(request_id); std::ostringstream out; for (const std::string & chunk : split_text_chunks(result.text, chunk_chars)) { json j; j["id"] = id; j["object"] = "text_completion.chunk"; j["created"] = created; j["model"] = model_name; j["choices"] = json::array({{ {"text", chunk}, {"index", 0}, {"finish_reason", nullptr} }}); out << "data: " << j.dump() << "\n\n"; } json last; last["id"] = id; last["object"] = "text_completion.chunk"; last["created"] = created; last["model"] = model_name; last["choices"] = json::array({{ {"text", ""}, {"index", 0}, {"finish_reason", "stop"} }}); last["usage"] = { {"prompt_tokens", result.prompt_tokens}, {"completion_tokens", result.output_tokens}, {"total_tokens", result.prompt_tokens + result.output_tokens} }; out << "data: " << last.dump() << "\n\n"; out << "data: [DONE]\n\n"; return out.str(); } static json make_openai_chat_response( const std::string & model_name, uint64_t request_id, const GenerationResult & result ) { json j; j["id"] = "chatcmpl-diffusion-local-" + std::to_string(request_id); j["object"] = "chat.completion"; j["created"] = (int64_t) std::time(nullptr); j["model"] = model_name; j["choices"] = json::array({ { {"index", 0}, {"message", { {"role", "assistant"}, {"content", result.text} }}, {"finish_reason", "stop"} } }); j["usage"] = { {"prompt_tokens", result.prompt_tokens}, {"completion_tokens", result.output_tokens}, {"total_tokens", result.prompt_tokens + result.output_tokens} }; return j; } static json make_completion_response( const std::string & model_name, uint64_t request_id, const GenerationResult & result ) { json j; j["id"] = "cmpl-diffusion-local-" + std::to_string(request_id); j["object"] = "text_completion"; j["created"] = (int64_t) std::time(nullptr); j["model"] = model_name; j["choices"] = json::array({ { {"text", result.text}, {"index", 0}, {"finish_reason", "stop"} } }); j["usage"] = { {"prompt_tokens", result.prompt_tokens}, {"completion_tokens", result.output_tokens}, {"total_tokens", result.prompt_tokens + result.output_tokens} }; return j; } static void usage() { std::cerr << "Usage:\n" << " llama-diffusion-http -m MODEL.gguf [--host 127.0.0.1] [--port 8081]\n" << " [-ngl 99] [-t 20] [-n 256]\n" << " [--raw] [--log-raw]\n" << " [--audit-log PATH] [--no-audit]\n" << " [--audit-summary] [--audit-full] [--no-audit-memory-pressure]\n" << " [--no-streaming] [--stream-chunk-chars N]\n" << " [--processing-tokens N] [--ctx-headroom-tokens N]\n" << " [--n-ctx N] [--n-batch N] [--n-ubatch N]\n" << " [--context-pool-size N] [--context-pool-n-ctx N]\n" << " [--context-pool-n-batch N] [--context-pool-n-ubatch N]\n" << " [--context-pool-strict] [--no-context-pool-clear]\n" << " [--max-concurrent N]\n" << " [--memory-limit-mb N] [--memory-safety-margin-mb N]\n" << " [--model-memory-mb N]\n" << " [--bytes-per-token N] [--token-memory-kb N]\n" << " [--request-base-memory-mb N] [--queue-timeout-ms N]\n" << " [--max-queue-requests N] [--max-queue-memory-mb N]\n" << " [--fifo-queue|--no-fifo-queue]\n" << " [--serialize-context-creation|--no-serialize-context-creation]\n" << " [--serialize-generation|--parallel-generation]\n" << " [--post-context-memory-check|--no-post-context-memory-check]\n" << " [--gpu-memory-policy largest|sum] [--no-gpu-memory-guard]\n" << " [--http-worker-threads N]\n"; } static bool parse_args(int argc, char ** argv, ServerConfig & cfg) { for (int i = 1; i < argc; ++i) { std::string a = argv[i]; auto need_value = [&](const std::string & name) -> std::string { if (i + 1 >= argc) { throw std::runtime_error("missing value for " + name); } return argv[++i]; }; if (a == "-m" || a == "--model") { cfg.model_path = need_value(a); } else if (a == "--host") { cfg.host = need_value(a); } else if (a == "--port") { cfg.port = std::stoi(need_value(a)); } else if (a == "-ngl" || a == "--n-gpu-layers") { cfg.n_gpu_layers = std::stoi(need_value(a)); } else if (a == "-t" || a == "--threads") { cfg.n_threads = std::stoi(need_value(a)); } else if (a == "-n" || a == "--max-tokens") { cfg.default_max_tokens = std::stoi(need_value(a)); } else if (a == "--temp" || a == "--temperature") { cfg.temperature = std::stof(need_value(a)); } else if (a == "--top-p") { cfg.top_p = std::stof(need_value(a)); } else if (a == "--top-k") { cfg.top_k = std::stoi(need_value(a)); } else if (a == "--seed") { cfg.seed = std::stoi(need_value(a)); } else if (a == "--diffusion-steps") { cfg.diffusion_steps = std::stoi(need_value(a)); } else if (a == "--no-chat-template") { cfg.use_chat_template = false; } else if (a == "--raw") { cfg.expose_raw = true; } else if (a == "--log-raw") { cfg.log_raw_output = true; } else if (a == "--audit-log") { cfg.audit_log_path = need_value(a); } else if (a == "--no-audit") { cfg.audit_enabled = false; } else if (a == "--audit-summary" || a == "--audit-compact") { cfg.audit_summary = true; } else if (a == "--audit-full" || a == "--audit-verbose") { cfg.audit_summary = false; } else if (a == "--no-audit-memory-pressure") { cfg.audit_memory_pressure = false; } else if (a == "--audit-memory-pressure") { cfg.audit_memory_pressure = true; } else if (a == "--no-streaming") { cfg.streaming_enabled = false; } else if (a == "--streaming") { cfg.streaming_enabled = true; } else if (a == "--stream-chunk-chars") { cfg.stream_chunk_chars = std::stoi(need_value(a)); } else if (a == "--processing-tokens" || a == "--allocated-tokens" || a == "--alloc-tokens") { cfg.default_processing_tokens = std::stoi(need_value(a)); } else if (a == "--ctx-headroom-tokens" || a == "--ctx-headroom") { cfg.ctx_headroom_tokens = std::stoi(need_value(a)); } else if (a == "--n-ctx") { cfg.default_n_ctx = std::stoi(need_value(a)); } else if (a == "--n-batch") { cfg.default_n_batch = std::stoi(need_value(a)); } else if (a == "--n-ubatch") { cfg.default_n_ubatch = std::stoi(need_value(a)); } else if (a == "--context-pool-size" || a == "--ctx-pool-size") { cfg.context_pool_size = std::stoi(need_value(a)); } else if (a == "--context-pool-n-ctx" || a == "--ctx-pool-n-ctx") { cfg.context_pool_n_ctx = std::stoi(need_value(a)); } else if (a == "--context-pool-n-batch" || a == "--ctx-pool-n-batch") { cfg.context_pool_n_batch = std::stoi(need_value(a)); } else if (a == "--context-pool-n-ubatch" || a == "--ctx-pool-n-ubatch") { cfg.context_pool_n_ubatch = std::stoi(need_value(a)); } else if (a == "--context-pool-strict") { cfg.context_pool_strict = true; } else if (a == "--no-context-pool-strict") { cfg.context_pool_strict = false; } else if (a == "--no-context-pool-clear") { cfg.context_pool_clear_on_release = false; } else if (a == "--context-pool-clear") { cfg.context_pool_clear_on_release = true; } else if (a == "--max-concurrent" || a == "--max-concurrent-requests") { cfg.max_concurrent_requests = std::stoi(need_value(a)); } else if (a == "--memory-limit-mb" || a == "--mem-limit-mb") { cfg.memory_limit_bytes = mb_to_bytes((uint64_t) std::stoull(need_value(a))); cfg.memory_limit_auto_detected = false; } else if (a == "--gpu-memory-policy") { cfg.gpu_memory_policy = need_value(a); if (cfg.gpu_memory_policy != "largest" && cfg.gpu_memory_policy != "sum") { throw std::runtime_error("--gpu-memory-policy must be 'largest' or 'sum'"); } } else if (a == "--no-gpu-memory-guard") { cfg.gpu_dynamic_memory_guard = false; } else if (a == "--gpu-memory-guard") { cfg.gpu_dynamic_memory_guard = true; } else if (a == "--memory-safety-margin-mb" || a == "--mem-safety-margin-mb") { cfg.memory_safety_margin_bytes = mb_to_bytes((uint64_t) std::stoull(need_value(a))); } else if (a == "--model-memory-mb") { cfg.model_memory_bytes_override = mb_to_bytes((uint64_t) std::stoull(need_value(a))); } else if (a == "--bytes-per-token") { cfg.bytes_per_token = (uint64_t) std::stoull(need_value(a)); } else if (a == "--token-memory-kb") { cfg.bytes_per_token = kb_to_bytes((uint64_t) std::stoull(need_value(a))); } else if (a == "--request-base-memory-mb") { cfg.request_base_memory_bytes = mb_to_bytes((uint64_t) std::stoull(need_value(a))); } else if (a == "--queue-timeout-ms") { cfg.queue_timeout_ms = (uint64_t) std::stoull(need_value(a)); } else if (a == "--max-queue-requests") { cfg.max_queue_requests = std::stoi(need_value(a)); } else if (a == "--max-queue-memory-mb") { cfg.max_queue_memory_bytes = mb_to_bytes((uint64_t) std::stoull(need_value(a))); } else if (a == "--fifo-queue") { cfg.fifo_queue = true; } else if (a == "--no-fifo-queue") { cfg.fifo_queue = false; } else if (a == "--serialize-context-creation") { cfg.serialize_context_creation = true; } else if (a == "--no-serialize-context-creation") { cfg.serialize_context_creation = false; } else if (a == "--serialize-generation") { cfg.serialize_generation = true; } else if (a == "--parallel-generation") { cfg.serialize_generation = false; } else if (a == "--post-context-memory-check") { cfg.post_context_memory_check = true; } else if (a == "--no-post-context-memory-check") { cfg.post_context_memory_check = false; } else if (a == "--http-worker-threads") { cfg.http_worker_threads = std::stoi(need_value(a)); } else if (a == "-h" || a == "--help") { usage(); return false; } else { throw std::runtime_error("unknown argument: " + a); } } if (cfg.model_path.empty()) { usage(); throw std::runtime_error("missing -m MODEL.gguf"); } return true; } int main(int argc, char ** argv) { App app; try { if (!parse_args(argc, argv, app.cfg)) { return 0; } std::setlocale(LC_NUMERIC, "C"); ggml_time_init(); common_init(); llama_backend_init(); llama_model_params model_params = llama_model_default_params(); model_params.n_gpu_layers = app.cfg.n_gpu_layers; std::cerr << "Loading model once: " << app.cfg.model_path << "\n"; app.model = llama_model_load_from_file(app.cfg.model_path.c_str(), model_params); if (!app.model) { throw std::runtime_error("failed to load model"); } app.model_memory_bytes = app.cfg.model_memory_bytes_override > 0 ? app.cfg.model_memory_bytes_override : file_size_bytes(app.cfg.model_path); const GpuMemoryInfo startup_gpu = detect_gpu_memory_info(app.cfg.gpu_memory_policy); app.gpu_memory_detected = startup_gpu.valid; app.gpu_memory_source = startup_gpu.source; app.gpu_adapter_name = startup_gpu.adapter_name; app.gpu_adapter_count = startup_gpu.adapter_count; app.gpu_dedicated_memory_bytes = startup_gpu.dedicated_video_memory_bytes; app.gpu_budget_bytes = startup_gpu.budget_bytes; app.gpu_current_usage_bytes_at_start = startup_gpu.current_usage_bytes; app.gpu_available_budget_bytes_at_start = startup_gpu.available_budget_bytes; if (app.cfg.memory_limit_bytes == 0 && startup_gpu.valid) { app.cfg.memory_limit_bytes = startup_gpu.budget_bytes > 0 ? startup_gpu.budget_bytes : startup_gpu.dedicated_video_memory_bytes; app.cfg.memory_limit_auto_detected = true; } if (!llama_model_is_diffusion(app.model)) { throw std::runtime_error("model is not a llama.cpp diffusion model"); } app.model_architecture = get_meta_string(app.model, "general.architecture", app.model_architecture); app.model_name = get_meta_string(app.model, "general.name", ""); if (app.model_name.empty()) { app.model_name = basename_from_path(app.cfg.model_path); } std::cerr << "Model name = " << app.model_name << "\n"; std::cerr << "Model architecture = " << app.model_architecture << "\n"; app.vocab = llama_model_get_vocab(app.model); char canvas_str[32] = {}; if (llama_model_meta_val_str(app.model, "diffusion.canvas_length", canvas_str, sizeof(canvas_str)) >= 0) { app.canvas_length = (int32_t) strtol(canvas_str, nullptr, 10); } if (app.canvas_length > 0) { llama_diffusion_set_sc(app.model, nullptr, 0.0f, 1.0f, true); } std::cerr << "Diffusion canvas_length = " << app.canvas_length << "\n"; init_context_pool(app); if (app.cfg.context_pool_size > 0) { std::cerr << "Model stays loaded; requests lease isolated warm contexts from the pool when they fit.\n"; } else { std::cerr << "Model stays loaded; every request gets a fresh llama_context.\n"; } json startup_audit; startup_audit["event"] = "server_start"; startup_audit["model"] = app.model_name; startup_audit["model_name"] = app.model_name; startup_audit["model_arch"] = app.model_architecture; startup_audit["model_path"] = app.cfg.model_path; startup_audit["host"] = app.cfg.host; startup_audit["port"] = app.cfg.port; startup_audit["canvas_length"] = app.canvas_length; startup_audit["n_gpu_layers"] = app.cfg.n_gpu_layers; startup_audit["n_threads"] = app.cfg.n_threads; startup_audit["default_max_tokens"] = app.cfg.default_max_tokens; startup_audit["diffusion_steps"] = app.cfg.diffusion_steps; startup_audit["streaming_enabled"] = app.cfg.streaming_enabled; startup_audit["stream_chunk_chars"] = app.cfg.stream_chunk_chars; startup_audit["default_processing_tokens"] = app.cfg.default_processing_tokens; startup_audit["ctx_headroom_tokens"] = app.cfg.ctx_headroom_tokens; startup_audit["default_n_ctx"] = app.cfg.default_n_ctx; startup_audit["default_n_batch"] = app.cfg.default_n_batch; startup_audit["default_n_ubatch"] = app.cfg.default_n_ubatch; startup_audit["context_pool_size"] = app.cfg.context_pool_size; startup_audit["context_pool_strict"] = app.cfg.context_pool_strict; startup_audit["context_pool_clear_on_release"] = app.cfg.context_pool_clear_on_release; startup_audit["context_pool_n_ctx"] = app.context_pool_n_ctx_actual; startup_audit["context_pool_n_batch"] = app.context_pool_n_batch_actual; startup_audit["context_pool_n_ubatch"] = app.context_pool_n_ubatch_actual; startup_audit["context_pool_memory_bytes"] = app.context_pool_memory_bytes; startup_audit["context_pool_memory_mib"] = bytes_to_mib(app.context_pool_memory_bytes); startup_audit["max_concurrent_requests"] = app.cfg.max_concurrent_requests; startup_audit["model_memory_bytes"] = app.model_memory_bytes; startup_audit["model_memory_mib"] = bytes_to_mib(app.model_memory_bytes); startup_audit["memory_limit_bytes"] = app.cfg.memory_limit_bytes; startup_audit["memory_limit_mib"] = bytes_to_mib(app.cfg.memory_limit_bytes); startup_audit["memory_safety_margin_bytes"] = app.cfg.memory_safety_margin_bytes; startup_audit["memory_safety_margin_mib"] = bytes_to_mib(app.cfg.memory_safety_margin_bytes); startup_audit["bytes_per_token"] = app.cfg.bytes_per_token; startup_audit["request_base_memory_bytes"] = app.cfg.request_base_memory_bytes; startup_audit["request_base_memory_mib"] = bytes_to_mib(app.cfg.request_base_memory_bytes); startup_audit["queue_timeout_ms"] = app.cfg.queue_timeout_ms; startup_audit["fifo_queue"] = app.cfg.fifo_queue; startup_audit["max_queue_requests"] = app.cfg.max_queue_requests; startup_audit["max_queue_memory_bytes"] = app.cfg.max_queue_memory_bytes; startup_audit["max_queue_memory_mib"] = bytes_to_mib(app.cfg.max_queue_memory_bytes); startup_audit["serialize_context_creation"] = app.cfg.serialize_context_creation; startup_audit["serialize_generation"] = app.cfg.serialize_generation; startup_audit["post_context_memory_check"] = app.cfg.post_context_memory_check; startup_audit["http_worker_threads"] = app.cfg.http_worker_threads; startup_audit["gpu_memory_detected"] = app.gpu_memory_detected; startup_audit["gpu_memory_source"] = app.gpu_memory_source; startup_audit["gpu_memory_policy"] = app.cfg.gpu_memory_policy; startup_audit["gpu_adapter_name"] = app.gpu_adapter_name; startup_audit["gpu_adapter_count"] = app.gpu_adapter_count; startup_audit["gpu_dedicated_memory_bytes"] = app.gpu_dedicated_memory_bytes; startup_audit["gpu_dedicated_memory_mib"] = bytes_to_mib(app.gpu_dedicated_memory_bytes); startup_audit["gpu_budget_bytes"] = app.gpu_budget_bytes; startup_audit["gpu_budget_mib"] = bytes_to_mib(app.gpu_budget_bytes); startup_audit["gpu_current_usage_bytes_at_start"] = app.gpu_current_usage_bytes_at_start; startup_audit["gpu_current_usage_mib_at_start"] = bytes_to_mib(app.gpu_current_usage_bytes_at_start); startup_audit["gpu_available_budget_bytes_at_start"] = app.gpu_available_budget_bytes_at_start; startup_audit["gpu_available_budget_mib_at_start"] = bytes_to_mib(app.gpu_available_budget_bytes_at_start); startup_audit["gpu_dynamic_memory_guard"] = app.cfg.gpu_dynamic_memory_guard; startup_audit["memory_limit_auto_detected"] = app.cfg.memory_limit_auto_detected; startup_audit["audit_summary"] = app.cfg.audit_summary; startup_audit["audit_memory_pressure"] = app.cfg.audit_memory_pressure; startup_audit["os_total_physical_bytes"] = os_total_physical_memory_bytes(); startup_audit["os_total_physical_mib"] = bytes_to_mib(os_total_physical_memory_bytes()); startup_audit["os_available_physical_bytes"] = os_available_physical_memory_bytes(); startup_audit["os_available_physical_mib"] = bytes_to_mib(os_available_physical_memory_bytes()); audit_log(app, startup_audit); httplib::Server server; if (app.cfg.http_worker_threads > 0) { server.new_task_queue = [&app] { return new httplib::ThreadPool((size_t) app.cfg.http_worker_threads); }; } server.Get("/health", [](const httplib::Request &, httplib::Response & res) { res.set_content("{\"status\":\"ok\"}", "application/json"); }); server.Get("/memory", [&](const httplib::Request &, httplib::Response & res) { json j = memory_status_snapshot(app); j["status"] = "ok"; res.set_content(j.dump(), "application/json"); }); server.Get("/v1/models", [&](const httplib::Request &, httplib::Response & res) { json j; j["object"] = "list"; j["data"] = json::array({ { {"id", app.model_name}, {"object", "model"}, {"owned_by", "local"}, {"architecture", app.model_architecture}, {"path", app.cfg.model_path} } }); res.set_content(j.dump(), "application/json"); }); server.Post("/v1/chat/completions", [&](const httplib::Request & req, httplib::Response & res) { const uint64_t request_id = app.next_request_id.fetch_add(1); try { json body = json::parse(req.body); RequestOptions opts = request_options_from_body(app, body, false); json begin; begin["event"] = "request_begin"; begin["request_id"] = request_id; begin["endpoint"] = "/v1/chat/completions"; begin["max_tokens"] = opts.max_tokens; begin["temperature"] = opts.temperature; begin["stream"] = opts.stream; begin["processing_tokens"] = opts.processing_tokens; begin["n_ctx"] = opts.n_ctx; begin["n_batch"] = opts.n_batch; begin["n_ubatch"] = opts.n_ubatch; begin["ctx_headroom_tokens"] = opts.ctx_headroom_tokens; begin["scheduler"] = memory_status_snapshot(app); audit_log(app, begin); if (opts.stream && !app.cfg.streaming_enabled) { res.status = 400; res.set_content(json_error("stream=true requested, but streaming is disabled by --no-streaming", 400), "application/json"); return; } std::string prompt = messages_to_prompt(app.model, body.at("messages"), app.cfg.use_chat_template); GenerationResult result = run_one_request( app, request_id, "/v1/chat/completions", prompt, opts ); if (opts.stream) { res.set_header("Cache-Control", "no-cache"); res.set_header("Connection", "keep-alive"); res.set_content(make_openai_chat_stream_response(app.model_name, request_id, result, opts.stream_chunk_chars), "text/event-stream"); } else { json out = make_openai_chat_response(app.model_name, request_id, result); res.set_content(out.dump(), "application/json"); } } catch (const std::exception & e) { json err; err["event"] = "request_error"; err["request_id"] = request_id; err["endpoint"] = "/v1/chat/completions"; err["error"] = e.what(); audit_log(app, err); res.status = 500; res.set_content(json_error(e.what()), "application/json"); } }); server.Post("/completion", [&](const httplib::Request & req, httplib::Response & res) { const uint64_t request_id = app.next_request_id.fetch_add(1); try { json body = json::parse(req.body); RequestOptions opts = request_options_from_body(app, body, true); json begin; begin["event"] = "request_begin"; begin["request_id"] = request_id; begin["endpoint"] = "/completion"; begin["max_tokens"] = opts.max_tokens; begin["temperature"] = opts.temperature; begin["stream"] = opts.stream; begin["processing_tokens"] = opts.processing_tokens; begin["n_ctx"] = opts.n_ctx; begin["n_batch"] = opts.n_batch; begin["n_ubatch"] = opts.n_ubatch; begin["ctx_headroom_tokens"] = opts.ctx_headroom_tokens; begin["scheduler"] = memory_status_snapshot(app); audit_log(app, begin); if (opts.stream && !app.cfg.streaming_enabled) { res.status = 400; res.set_content(json_error("stream=true requested, but streaming is disabled by --no-streaming", 400), "application/json"); return; } std::string prompt = completion_prompt_from_body(body); GenerationResult result = run_one_request( app, request_id, "/completion", prompt, opts ); if (opts.stream) { res.set_header("Cache-Control", "no-cache"); res.set_header("Connection", "keep-alive"); res.set_content(make_completion_stream_response(app.model_name, request_id, result, opts.stream_chunk_chars), "text/event-stream"); } else { json out = make_completion_response(app.model_name, request_id, result); res.set_content(out.dump(), "application/json"); } } catch (const std::exception & e) { json err; err["event"] = "request_error"; err["request_id"] = request_id; err["endpoint"] = "/completion"; err["error"] = e.what(); audit_log(app, err); res.status = 500; res.set_content(json_error(e.what()), "application/json"); } }); std::cerr << "Listening on http://" << app.cfg.host << ":" << app.cfg.port << "\n"; if (!server.listen(app.cfg.host, app.cfg.port)) { throw std::runtime_error("failed to bind server"); } free_context_pool(app); llama_model_free(app.model); llama_backend_free(); return 0; } catch (const std::exception & e) { std::cerr << "fatal: " << e.what() << "\n"; free_context_pool(app); if (app.model) { llama_model_free(app.model); } llama_backend_free(); return 1; } }