// (C)artificialintelligence.dk - examples/diffusion/diffusion-http.cpp
//
// Minimal OpenAI-compatible HTTP server for DiffusionGemma / llama.cpp diffusion models.
//
// Behavior:
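//   - GGUF model is loaded once at server startup.
//   - By default every HTTP request creates a fresh llama_context.
//   - Therefore requests do not share KV/cache/history.
//   - An optional context pool can reuse warm contexts instead; each pooled context
//     is leased to one request at a time and cleared before reuse.
//   - This avoids model reload while keeping questions separated.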
//
// Audit logging:
//   - JSONL audit records are written to stderr by default.
//   - Optional file logging: --audit-log C:\path\diffusion-http-audit.jsonl
//   - Disable audit: --no-audit
//   - Raw model output logging is disabled by default; enable with --log-raw.
//
// Request controls added in this version:
//   - stream=true is supported with OpenAI-compatible SSE framing.
//     Diffusion generation itself is still batch/canvas-based, so chunks are sent after
//     generation completes, split by --stream-chunk-chars / stream_chunk_chars.
//   - max_tokens controls returned/generated tokens.
//   - processing_tokens / allocated_tokens / n_ctx / n_batch / n_ubatch control
//     context and processing allocation per request.
//
// Endpoints:
//   GET  /health
//   GET  /v1/models
//   POST /v1/chat/completions
//   POST /completion

#include "llama.h"
#include "common.h"
#include "chat.h"
#include "diffusion.h"
#include "ggml-backend.h"

#include "httplib.h"
#include "json.hpp"

#include <algorithm>
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <deque>
#include <clocale>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iomanip>
#include <memory>
#include <limits>
#include <iostream>
#include <mutex>
#include <sstream>
#include <cstdio>
#include <cstring>
#include <cerrno>
#include <stdexcept>
#include <string>
#include <vector>

#if !defined(_WIN32)
#include <filesystem>
#endif

#if defined(_WIN32)
#define NOMINMAX
#include <windows.h>
#include <dxgi1_4.h>
#else
#include <unistd.h>
#endif

// Server-wide settings. Each member carries its built-in default; several of
// them act as fallbacks that an individual request may override.
struct ServerConfig {
    // Model file and listening address.
    std::string model_path;
    std::string host{"127.0.0.1"};
    int port{8081};

    // Backend execution.
    int n_gpu_layers{99};
    int n_threads{20};

    // Generation defaults.
    int default_max_tokens{256};
    int diffusion_steps{128};

    // Sampling defaults.
    float temperature{0.8f};
    float top_p{0.95f};
    int top_k{40};
    int seed{1234};

    bool use_chat_template{true};
    bool expose_raw{false};

    // Audit logging switches and optional log file path.
    bool audit_enabled{true};
    bool log_raw_output{false};
    std::string audit_log_path;

    // Audit verbosity:
    //   summary = compact JSONL records suitable for normal operations.
    //   full    = previous detailed records with full nested memory snapshots.
    bool audit_summary{true};
    bool audit_memory_pressure{true};

    // Streaming of responses and the chunk size used for it.
    bool streaming_enabled{true};
    int stream_chunk_chars{96};

    // Automatic allocation. For DiffusionGemma canvas models the historical
    // llama-diffusion-cli rule is
    //   n_ctx = blocks * canvas_length + 2048
    // and this member is that extra 2048-token headroom unless overridden.
    int ctx_headroom_tokens{2048};

    // Non-zero values become server-wide defaults for request allocation.
    // JSON fields on a request can still override them.
    int default_processing_tokens{0};
    int default_n_ctx{0};
    int default_n_batch{0};
    int default_n_ubatch{0};

    // Concurrency and memory queue.
    //
    // The accounting is a conservative estimate because llama.cpp backends do
    // not expose exact per-context GPU allocations through this wrapper. The
    // defaults aim to be safe for DiffusionGemma-style large canvas requests:
    //
    //   estimated_request_bytes =
    //       request_base_memory_bytes + max(n_ctx, n_batch, n_ubatch) * bytes_per_token
    //
    // The scheduler compares
    //   model_memory_bytes + active_request_bytes + new_request_bytes + safety_margin
    // with the detected GPU physical/budget memory by default; --memory-limit-mb
    // can override the detected budget. Dynamic GPU budget/usage checks are used
    // when available (DXGI on Windows, Linux DRM sysfs or nvidia-smi on
    // Ubuntu/Linux); otherwise the code falls back to OS RAM. A request whose
    // admission would exceed the limit waits in a queue until others complete.
    int max_concurrent_requests{0}; // 0 = unlimited except memory budget
    uint64_t memory_limit_bytes{0}; // 0 = use OS physical availability only
    uint64_t memory_safety_margin_bytes{512ull * 1024ull * 1024ull};
    uint64_t model_memory_bytes_override{0};
    uint64_t bytes_per_token{1024ull * 1024ull}; // ~1 MiB/token, conservative for DiffusionGemma SYCL
    uint64_t request_base_memory_bytes{512ull * 1024ull * 1024ull};
    uint64_t queue_timeout_ms{0}; // 0 = wait indefinitely

    // Queue hardening. FIFO stops small later requests from overtaking large
    // earlier ones. The max_queue_* limits protect the HTTP server from an
    // unbounded number of blocked handler threads under flooding; 0 means no
    // limit for that particular bound.
    bool fifo_queue{true};
    int max_queue_requests{128};
    uint64_t max_queue_memory_bytes{0};

    // Context creation touches shared llama_model/backend state, so it is
    // serialized by default while generation may still run concurrently.
    bool serialize_context_creation{true};

    // Emergency switch for unstable backends: requests stay isolated but only
    // one generation runs at a time. Off by default because the memory gate is
    // the primary parallelism control.
    bool serialize_generation{false};

    // Optional safety check after context creation. If DXGI/OS telemetry shows
    // memory became dangerously low right after llama_init_from_model(), abort
    // before launching diffusion kernels.
    bool post_context_memory_check{true};

    // Number of httplib worker threads, independent of model concurrency.
    // 0 means use the library default.
    int http_worker_threads{0};

    // GPU memory auto-detection. Windows uses DXGI directly. Linux/Ubuntu tries
    // DRM sysfs (/sys/class/drm/card*/device/mem_info_vram_*) and then falls
    // back to nvidia-smi, so --memory-limit-mb is usually unnecessary on
    // AMD/Intel/NVIDIA systems.
    //   largest = use the largest detected adapter, best for one server per GPU
    //   sum     = sum all detected discrete GPU adapters, only use when the model
    //             is genuinely split across GPUs
    std::string gpu_memory_policy{"largest"};
    bool gpu_dynamic_memory_guard{true};
    bool memory_limit_auto_detected{false};

    // Optional hot context pool, which avoids paying llama_init_from_model()
    // on every request. A pooled context is leased exclusively to one request
    // at a time and cleared before reuse.
    //
    // 0 = disabled; old fresh-context-per-request behavior.
    int context_pool_size{0};

    // Pool dimensions. 0 means derive from server defaults/max request caps.
    int context_pool_n_ctx{0};
    int context_pool_n_batch{0};
    int context_pool_n_ubatch{0};

    // Strict mode rejects requests that do not fit the pool. Otherwise an
    // oversized request falls back to a temporary fresh context.
    bool context_pool_strict{false};

    // Clear KV/cache/state before a pooled context goes back into the pool.
    // Leave enabled to keep requests isolated.
    bool context_pool_clear_on_release{true};
};

// One entry of the context pool: a llama_context pointer together with the
// dimensions recorded for it and a numeric slot identifier.
struct PooledContextSlot {
    llama_context * ctx{nullptr};
    int32_t n_ctx{0};
    int32_t n_batch{0};
    int32_t n_ubatch{0};
    uint64_t slot_id{0};
};

// Process-wide server state: configuration, the loaded model, scheduler
// bookkeeping, the optional context pool, detected GPU memory, and the locks
// that guard them.
struct App {
    ServerConfig cfg;

    llama_model * model{nullptr};
    const llama_vocab * vocab{nullptr};

    // Filled in from GGUF metadata once the model is loaded; shown in
    // audit/debug output.
    std::string model_name{"diffusiongemma"};
    std::string model_architecture{"diffusion-gemma"};

    int32_t canvas_length{0};

    std::atomic<uint64_t> next_request_id{1};

    // Memory-gated concurrency scheduler. Requests never share a
    // llama_context, so they are isolated; this gate only decides when a
    // request may allocate its fresh context and generation buffers.
    std::mutex memory_mutex;
    std::condition_variable memory_cv;
    uint64_t model_memory_bytes{0};
    uint64_t active_request_memory_bytes{0};
    uint64_t peak_request_memory_bytes{0};
    uint64_t queued_request_memory_bytes{0};
    uint64_t peak_queued_request_memory_bytes{0};
    int active_requests{0};
    int queued_requests{0};

    uint64_t next_queue_ticket{1};
    std::deque<uint64_t> queue_order;

    // Intentionally independent of the memory gate: these guard backend
    // phases that may not be fully thread-safe on every llama.cpp backend.
    std::mutex context_init_mutex;
    std::mutex generation_mutex;

    // Optional pool of warm llama_context objects. No slot is ever used by
    // two requests at once; it goes back to the pool only after generation
    // has finished and the KV/cache has been cleared.
    std::mutex context_pool_mutex;
    std::condition_variable context_pool_cv;
    std::vector<std::unique_ptr<PooledContextSlot>> context_pool;
    std::deque<PooledContextSlot *> context_pool_available;
    int active_pooled_contexts{0};
    uint64_t context_pool_memory_bytes{0};
    uint64_t context_pool_acquires{0};
    uint64_t context_pool_reuses{0};
    uint64_t context_pool_waits{0};
    uint64_t context_pool_misses{0};
    int32_t context_pool_n_ctx_actual{0};
    int32_t context_pool_n_batch_actual{0};
    int32_t context_pool_n_ubatch_actual{0};

    // Detected GPU memory. At startup memory_limit_bytes is taken from this
    // when the user did not pass --memory-limit-mb.
    bool gpu_memory_detected{false};
    std::string gpu_memory_source;
    std::string gpu_adapter_name;
    int gpu_adapter_count{0};
    uint64_t gpu_dedicated_memory_bytes{0};
    uint64_t gpu_budget_bytes{0};
    uint64_t gpu_current_usage_bytes_at_start{0};
    uint64_t gpu_available_budget_bytes_at_start{0};

    // Held while writing audit output so JSONL records do not interleave.
    std::mutex audit_mutex;
};

// Everything recorded about one generation: the produced text plus token
// counts, allocation sizes, timings, and scheduler/memory snapshots.
struct GenerationResult {
    // Generated text and the raw text it was derived from.
    std::string text;
    std::string raw_text;

    // Token and character counts.
    int prompt_tokens{0};
    int output_tokens{0};
    int raw_chars{0};
    int clean_chars{0};

    // Requested sizes and the allocation that was used.
    int requested_max_tokens{0};
    int processing_tokens_requested{0};
    int ctx_headroom_tokens{0};
    int min_required_tokens{0};
    int n_ctx{0};
    int n_batch{0};
    int n_ubatch{0};
    int blocks_requested{0};
    int blocks_completed{0};
    int canvas_tokens_processed{0};

    // Wall-clock timings in seconds.
    double total_seconds{0.0};
    double generation_seconds{0.0};

    // Memory accounting and queue state captured for this request.
    uint64_t estimated_request_memory_bytes{0};
    uint64_t model_memory_bytes{0};
    uint64_t active_request_memory_bytes_after_admit{0};
    uint64_t active_request_memory_bytes_after_release{0};
    uint64_t memory_limit_bytes{0};
    uint64_t memory_safety_margin_bytes{0};
    uint64_t os_total_physical_bytes{0};
    uint64_t os_available_physical_bytes_at_admit{0};
    uint64_t gpu_available_budget_bytes_at_admit{0};
    uint64_t queue_wait_ms{0};
    uint64_t queue_ticket{0};
    int active_requests_after_admit{0};
    int queued_requests_after_admit{0};
    uint64_t queued_request_memory_bytes_after_admit{0};

    // Lock and pool wait times, and which context served the request.
    double context_init_lock_seconds{0.0};
    double generation_lock_wait_seconds{0.0};
    double context_pool_wait_seconds{0.0};
    bool context_reused{false};
    uint64_t context_pool_slot_id{0};
};

// Per-request settings. All members start at zero/false.
struct RequestOptions {
    int max_tokens{0};
    float temperature{0.0f};
    bool stream{false};

    // Allocation controls. 0 means auto/default.
    int processing_tokens{0};  // alias: allocated_tokens
    int n_ctx{0};
    int n_batch{0};
    int n_ubatch{0};
    int ctx_headroom_tokens{0};

    int stream_chunk_chars{0};
};

// Returns `s` with leading and trailing spaces, tabs, carriage returns and
// newlines removed. Interior whitespace is left untouched; a string made up
// only of those characters becomes empty.
//
// The bounds are located first and the string is cut once at each end, so the
// cost is linear in the length of the input (erasing the front one character
// at a time would be quadratic for long runs of leading whitespace).
static std::string trim_ws(std::string s) {
    static const char * const k_whitespace = " \t\r\n";

    const std::string::size_type first_kept = s.find_first_not_of(k_whitespace);
    if (first_kept == std::string::npos) {
        // Empty, or nothing but whitespace.
        s.clear();
        return s;
    }

    const std::string::size_type last_kept = s.find_last_not_of(k_whitespace);

    // Cut the tail first so `first_kept` stays valid for the second erase.
    s.erase(last_kept + 1);
    s.erase(0, first_kept);
    return s;
}

// Formats the current system-clock time in UTC as
// "YYYY-MM-DDTHH:MM:SS.mmmZ", where mmm is the millisecond part padded to
// three digits.
static std::string timestamp_utc_ms() {
    namespace chr = std::chrono;

    const chr::system_clock::time_point wall_now = chr::system_clock::now();
    const std::time_t epoch_seconds = chr::system_clock::to_time_t(wall_now);

    // Millisecond remainder within the current second.
    const auto millis = chr::duration_cast<chr::milliseconds>(wall_now.time_since_epoch()) % 1000;

    // Break the time down in UTC using the platform's re-entrant variant.
    std::tm utc_parts{};
#if defined(_WIN32)
    gmtime_s(&utc_parts, &epoch_seconds);
#else
    gmtime_r(&epoch_seconds, &utc_parts);
#endif

    std::ostringstream formatted;
    formatted << std::put_time(&utc_parts, "%Y-%m-%dT%H:%M:%S");
    formatted << ".";
    formatted << std::setw(3) << std::setfill('0') << millis.count();
    formatted << "Z";
    return formatted.str();
}

// Elapsed time from `a` to `b` in seconds, as a double. The result is negative
// when `b` is earlier than `a`.
static double seconds_between(
    const std::chrono::steady_clock::time_point & a,
    const std::chrono::steady_clock::time_point & b
) {
    using fractional_seconds = std::chrono::duration<double>;

    const fractional_seconds elapsed = b - a;
    return elapsed.count();
}

// Throughput as tokens divided by seconds. A zero or negative duration gives
// 0.0 instead of dividing.
static double tokens_per_second(int tokens, double seconds) {
    return seconds <= 0.0 ? 0.0 : static_cast<double>(tokens) / seconds;
}


// Converts a count of mebibytes (1024 * 1024 bytes each) to bytes.
static uint64_t mb_to_bytes(uint64_t mb) {
    const uint64_t bytes_per_mib = 1024ull * 1024ull;
    return mb * bytes_per_mib;
}

// Converts a count of kibibytes (1024 bytes each) to bytes.
static uint64_t kb_to_bytes(uint64_t kb) {
    const uint64_t bytes_per_kib = 1024ull;
    return kb * bytes_per_kib;
}

// Size of the file at `path` in bytes, found by opening it in binary mode,
// seeking to the end and reading the offset. Returns 0 when the file cannot
// be opened or the reported offset is not positive.
static uint64_t file_size_bytes(const std::string & path) {
#if defined(_WIN32)
    FILE * handle = nullptr;
    fopen_s(&handle, path.c_str(), "rb");
#else
    FILE * handle = fopen(path.c_str(), "rb");
#endif

    if (handle == nullptr) {
        return 0;
    }

    // 64-bit seek/tell variants so files larger than 2 GiB are handled.
#if defined(_WIN32)
    _fseeki64(handle, 0, SEEK_END);
    const int64_t end_offset = _ftelli64(handle);
#else
    fseeko(handle, 0, SEEK_END);
    const off_t end_offset = ftello(handle);
#endif

    fclose(handle);

    if (end_offset <= 0) {
        return 0;
    }
    return static_cast<uint64_t>(end_offset);
}


// Result of a GPU memory probe. `valid` stays false when nothing usable was
// detected; `source` names the detection path or the reason it failed.
struct GpuMemoryInfo {
    bool valid{false};
    std::string source;
    std::string adapter_name;
    int adapter_count{0};

    // Physical/local VRAM reported by the adapter.
    uint64_t dedicated_video_memory_bytes{0};

    // Runtime budget and usage reported by the OS. On Windows these come from
    // DXGI QueryVideoMemoryInfo for the LOCAL segment, which is often the most
    // useful number for admission control because it reflects memory pressure.
    uint64_t budget_bytes{0};
    uint64_t current_usage_bytes{0};
    uint64_t available_budget_bytes{0};
};

#if defined(_WIN32)
// Converts a NUL-terminated wide string to UTF-8 via WideCharToMultiByte.
// A null pointer, an empty string, or a failed size query gives "".
static std::string wide_to_utf8(const wchar_t * ws) {
    std::string utf8;

    if (ws == nullptr || *ws == L'\0') {
        return utf8;
    }

    // First call only measures; with length -1 the count includes the
    // terminating NUL.
    const int byte_count = WideCharToMultiByte(CP_UTF8, 0, ws, -1, nullptr, 0, nullptr, nullptr);
    if (byte_count <= 0) {
        return utf8;
    }

    utf8.assign(static_cast<size_t>(byte_count), '\0');
    WideCharToMultiByte(CP_UTF8, 0, ws, -1, &utf8[0], byte_count, nullptr, nullptr);

    // Drop the terminator that was written into the buffer.
    if (!utf8.empty() && utf8.back() == '\0') {
        utf8.pop_back();
    }

    return utf8;
}
#endif

// Probes GPU memory and returns one GpuMemoryInfo summarising it.
//
// `policy` selects how several adapters are combined: "sum" adds the figures
// of every detected adapter; any other value picks the adapter with the most
// dedicated memory.
//
// Windows: enumerates DXGI adapters, skipping software adapters and those
// reporting no dedicated video memory, and reads budget/usage through
// IDXGIAdapter3::QueryVideoMemoryInfo when that interface is available.
// Other platforms: reads DRM sysfs files under /sys/class/drm/card*/device,
// and only if that yields nothing runs nvidia-smi.
//
// When nothing is detected the result has valid == false and `source` holds a
// short reason string.
static GpuMemoryInfo detect_gpu_memory_info(const std::string & policy) {
    GpuMemoryInfo result;

#if defined(_WIN32)
    HMODULE dxgi = LoadLibraryA("dxgi.dll");
    if (!dxgi) {
        result.source = "dxgi_load_failed";
        return result;
    }

    using CreateDXGIFactory1Fn = HRESULT (WINAPI *)(REFIID, void **);
    auto create_factory = reinterpret_cast<CreateDXGIFactory1Fn>(GetProcAddress(dxgi, "CreateDXGIFactory1"));
    if (!create_factory) {
        result.source = "CreateDXGIFactory1_missing";
        FreeLibrary(dxgi);
        return result;
    }

    IDXGIFactory1 * factory = nullptr;
    HRESULT hr = create_factory(__uuidof(IDXGIFactory1), reinterpret_cast<void **>(&factory));
    if (FAILED(hr) || !factory) {
        result.source = "CreateDXGIFactory1_failed";
        FreeLibrary(dxgi);
        return result;
    }

    struct Candidate {
        std::string name;
        uint64_t dedicated = 0;
        uint64_t budget = 0;
        uint64_t current_usage = 0;
        uint64_t available_budget = 0;
    };

    std::vector<Candidate> candidates;

    for (UINT i = 0;; ++i) {
        IDXGIAdapter1 * adapter = nullptr;
        hr = factory->EnumAdapters1(i, &adapter);
        if (hr == DXGI_ERROR_NOT_FOUND) {
            // Normal end of the adapter list.
            break;
        }
        if (FAILED(hr) || !adapter) {
            // Stop on any other failure as well. The loop has no upper bound,
            // so skipping to the next index on a persistent error would keep
            // iterating until the index wraps around.
            break;
        }

        DXGI_ADAPTER_DESC1 desc{};
        if (SUCCEEDED(adapter->GetDesc1(&desc))) {
            const bool is_software = (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) != 0;

            // DedicatedVideoMemory is 0 or tiny for many iGPUs. For this server we
            // care about the selected discrete inference GPU budget.
            if (!is_software && desc.DedicatedVideoMemory > 0) {
                Candidate c;
                c.name = wide_to_utf8(desc.Description);
                c.dedicated = (uint64_t) desc.DedicatedVideoMemory;

                // Budget/usage needs IDXGIAdapter3; without it the candidate
                // keeps zeros and the dedicated size is used as budget below.
                IDXGIAdapter3 * adapter3 = nullptr;
                if (SUCCEEDED(adapter->QueryInterface(__uuidof(IDXGIAdapter3), reinterpret_cast<void **>(&adapter3))) && adapter3) {
                    DXGI_QUERY_VIDEO_MEMORY_INFO info{};
                    if (SUCCEEDED(adapter3->QueryVideoMemoryInfo(0, DXGI_MEMORY_SEGMENT_GROUP_LOCAL, &info))) {
                        c.budget = (uint64_t) info.Budget;
                        c.current_usage = (uint64_t) info.CurrentUsage;
                        c.available_budget = c.budget > c.current_usage ? c.budget - c.current_usage : 0;
                    }
                    adapter3->Release();
                }

                candidates.push_back(c);
            }
        }

        adapter->Release();
    }

    factory->Release();
    FreeLibrary(dxgi);

    if (candidates.empty()) {
        result.source = "dxgi_no_discrete_adapter";
        return result;
    }

    result.valid = true;
    result.source = "dxgi";
    result.adapter_count = (int) candidates.size();

    if (policy == "sum") {
        result.adapter_name = "sum_of_detected_adapters";
        for (const Candidate & c : candidates) {
            result.dedicated_video_memory_bytes += c.dedicated;
            result.budget_bytes += c.budget > 0 ? c.budget : c.dedicated;
            result.current_usage_bytes += c.current_usage;
            result.available_budget_bytes += c.available_budget;
        }
    } else {
        auto best = std::max_element(candidates.begin(), candidates.end(), [](const Candidate & a, const Candidate & b) {
            return a.dedicated < b.dedicated;
        });

        result.adapter_name = best->name;
        result.dedicated_video_memory_bytes = best->dedicated;
        result.budget_bytes = best->budget > 0 ? best->budget : best->dedicated;
        result.current_usage_bytes = best->current_usage;
        result.available_budget_bytes = best->available_budget;
    }
#else
    struct Candidate {
        std::string name;
        uint64_t dedicated = 0;
        uint64_t current_usage = 0;
        uint64_t available_budget = 0;
        std::string source;
    };

    std::vector<Candidate> candidates;

    // Reads one unsigned integer from a file; `value` is only written on success.
    auto read_u64_file = [](const std::filesystem::path & path, uint64_t & value) -> bool {
        std::ifstream f(path);
        if (!f) {
            return false;
        }

        uint64_t v = 0;
        f >> v;
        if (!f) {
            return false;
        }

        value = v;
        return true;
    };

    // Reads the first line of a file, or "" when it cannot be opened.
    auto read_string_file = [](const std::filesystem::path & path) -> std::string {
        std::ifstream f(path);
        if (!f) {
            return "";
        }

        std::string v;
        std::getline(f, v);
        return v;
    };

    // Strips trailing newline, carriage return, space and tab characters.
    auto trim_newline = [](std::string v) -> std::string {
        while (!v.empty() && (v.back() == '\n' || v.back() == '\r' || v.back() == ' ' || v.back() == '\t')) {
            v.pop_back();
        }
        return v;
    };

    // Linux generic DRM sysfs path. This works for AMDGPU officially and also
    // works on many modern discrete Linux GPU drivers that expose DRM memory
    // accounting under /sys/class/drm/card*/device.
    // Official AMDGPU docs define mem_info_vram_total and mem_info_vram_used as
    // byte counters for total and currently used VRAM.
    try {
        const std::filesystem::path drm_root("/sys/class/drm");
        if (std::filesystem::exists(drm_root)) {
            for (const auto & entry : std::filesystem::directory_iterator(drm_root)) {
                // Keep only entries named "card..." without a '-' in the name.
                const std::string card = entry.path().filename().string();
                if (card.rfind("card", 0) != 0 || card.find('-') != std::string::npos) {
                    continue;
                }

                const auto dev = entry.path() / "device";
                uint64_t total = 0;
                uint64_t used = 0;

                if (!read_u64_file(dev / "mem_info_vram_total", total)) {
                    // Some Intel/xe/i915 setups expose local memory size through
                    // local_mem_size rather than the amdgpu-style VRAM file.
                    read_u64_file(dev / "local_mem_size", total);
                }

                // Usage is optional; `used` stays 0 when the file is missing.
                read_u64_file(dev / "mem_info_vram_used", used);

                if (total == 0) {
                    continue;
                }

                Candidate c;
                const std::string vendor = trim_newline(read_string_file(dev / "vendor"));
                const std::string device = trim_newline(read_string_file(dev / "device"));

                c.name = card;
                if (!vendor.empty() || !device.empty()) {
                    c.name += " vendor=" + vendor + " device=" + device;
                }
                c.dedicated = total;
                c.current_usage = used;
                c.available_budget = total > used ? total - used : 0;
                c.source = "linux_drm_sysfs";

                candidates.push_back(c);
            }
        }
    } catch (const std::exception &) {
        // sysfs enumeration is best-effort; fall through to vendor-tool fallback.
    }

    // NVIDIA fallback without adding a build-time NVML dependency. This uses the
    // nvidia-smi command if present. It reports MiB, so convert to bytes. It is
    // intentionally a fallback because spawning a process per memory check is
    // slower than reading sysfs/DXGI.
    if (candidates.empty()) {
        FILE * pipe = popen("nvidia-smi --query-gpu=name,memory.total,memory.used --format=csv,noheader,nounits 2>/dev/null", "r");
        if (pipe) {
            char line[1024];
            while (fgets(line, sizeof(line), pipe)) {
                std::string row(line);
                row = trim_newline(row);
                if (row.empty()) {
                    continue;
                }

                // Split the CSV row into trimmed columns: name, total, used.
                std::vector<std::string> cols;
                std::stringstream ss(row);
                std::string col;
                while (std::getline(ss, col, ',')) {
                    cols.push_back(trim_ws(col));
                }

                if (cols.size() >= 3) {
                    try {
                        Candidate c;
                        c.name = cols[0];
                        c.dedicated = mb_to_bytes((uint64_t) std::stoull(cols[1]));
                        c.current_usage = mb_to_bytes((uint64_t) std::stoull(cols[2]));
                        c.available_budget = c.dedicated > c.current_usage ? c.dedicated - c.current_usage : 0;
                        c.source = "nvidia_smi";
                        if (c.dedicated > 0) {
                            candidates.push_back(c);
                        }
                    } catch (...) {
                        // Ignore malformed rows.
                    }
                }
            }
            pclose(pipe);
        }
    }

    if (candidates.empty()) {
        result.source = "linux_gpu_memory_detection_unavailable";
        return result;
    }

    result.valid = true;
    result.adapter_count = (int) candidates.size();

    if (policy == "sum") {
        result.source = "linux_gpu_memory_sum";
        result.adapter_name = "sum_of_detected_linux_adapters";
        for (const Candidate & c : candidates) {
            result.dedicated_video_memory_bytes += c.dedicated;
            result.current_usage_bytes += c.current_usage;
            result.available_budget_bytes += c.available_budget;
        }
        // No separate OS budget on this path; the total is used as budget.
        result.budget_bytes = result.dedicated_video_memory_bytes;
    } else {
        auto best = std::max_element(candidates.begin(), candidates.end(), [](const Candidate & a, const Candidate & b) {
            return a.dedicated < b.dedicated;
        });

        result.source = best->source;
        result.adapter_name = best->name;
        result.dedicated_video_memory_bytes = best->dedicated;
        result.budget_bytes = best->dedicated;
        result.current_usage_bytes = best->current_usage;
        result.available_budget_bytes = best->available_budget;
    }
#endif

    return result;
}

// Total physical RAM in bytes as reported by the OS, or 0 when the query
// fails. Uses GlobalMemoryStatusEx on Windows and sysconf page counts elsewhere.
static uint64_t os_total_physical_memory_bytes() {
#if defined(_WIN32)
    MEMORYSTATUSEX mem_status;
    mem_status.dwLength = sizeof(mem_status);
    if (!GlobalMemoryStatusEx(&mem_status)) {
        return 0;
    }
    return static_cast<uint64_t>(mem_status.ullTotalPhys);
#else
    const long page_count = sysconf(_SC_PHYS_PAGES);
    const long bytes_per_page = sysconf(_SC_PAGE_SIZE);
    if (page_count <= 0 || bytes_per_page <= 0) {
        return 0;
    }
    return static_cast<uint64_t>(page_count) * static_cast<uint64_t>(bytes_per_page);
#endif
}

// Currently available physical RAM in bytes. When the OS query fails or is
// not supported, returns the maximum uint64_t value rather than 0.
static uint64_t os_available_physical_memory_bytes() {
    const uint64_t unknown = std::numeric_limits<uint64_t>::max();

#if defined(_WIN32)
    MEMORYSTATUSEX mem_status;
    mem_status.dwLength = sizeof(mem_status);
    if (!GlobalMemoryStatusEx(&mem_status)) {
        return unknown;
    }
    return static_cast<uint64_t>(mem_status.ullAvailPhys);
#else
#if defined(_SC_AVPHYS_PAGES)
    const long free_pages = sysconf(_SC_AVPHYS_PAGES);
    const long bytes_per_page = sysconf(_SC_PAGE_SIZE);
    if (free_pages > 0 && bytes_per_page > 0) {
        return static_cast<uint64_t>(free_pages) * static_cast<uint64_t>(bytes_per_page);
    }
#endif
    return unknown;
#endif
}

// Converts a byte count to mebibytes (bytes / 1048576) as a double.
static double bytes_to_mib(uint64_t bytes) {
    const double bytes_per_mib = 1024.0 * 1024.0;
    return static_cast<double>(bytes) / bytes_per_mib;
}

// Estimated memory for one request: the configured base amount plus
// bytes_per_token for each slot of the largest of n_ctx, n_batch and n_ubatch.
static uint64_t estimate_request_memory_bytes(const App & app, int32_t n_ctx, int32_t n_batch, int32_t n_ubatch) {
    int32_t widest = n_ctx;
    if (n_batch > widest) {
        widest = n_batch;
    }
    if (n_ubatch > widest) {
        widest = n_ubatch;
    }

    const uint64_t per_token_total = static_cast<uint64_t>(widest) * app.cfg.bytes_per_token;
    return app.cfg.request_base_memory_bytes + per_token_total;
}

// True when a pool size is configured and at least one pooled context exists.
static bool context_pool_enabled(const App & app) {
    if (app.cfg.context_pool_size <= 0) {
        return false;
    }
    return !app.context_pool.empty();
}

// True when the pool is enabled and each requested dimension is no larger
// than the corresponding dimension the pool was created with.
static bool context_pool_can_satisfy(const App & app, int32_t n_ctx, int32_t n_batch, int32_t n_ubatch) {
    if (!context_pool_enabled(app)) {
        return false;
    }
    if (n_ctx > app.context_pool_n_ctx_actual) {
        return false;
    }
    if (n_batch > app.context_pool_n_batch_actual) {
        return false;
    }
    return n_ubatch <= app.context_pool_n_ubatch_actual;
}

// Memory to charge to the scheduler for an active request.
//
// A request served from a pre-created pooled context is charged only the base
// amount: the large n_ctx/n_batch allocations are already resident and counted
// as context pool memory, so only a conservative working estimate for CPU and
// output buffers and transient backend allocations remains. Any other request
// is charged the full per-request estimate.
static uint64_t estimate_active_request_memory_bytes(
    const App & app,
    int32_t n_ctx,
    int32_t n_batch,
    int32_t n_ubatch,
    bool will_use_context_pool
) {
    return will_use_context_pool
        ? app.cfg.request_base_memory_bytes
        : estimate_request_memory_bytes(app, n_ctx, n_batch, n_ubatch);
}

struct ContextLease {
    App * app = nullptr;
    llama_context * ctx = nullptr;
    PooledContextSlot * slot = nullptr;
    bool pooled = false;

    ContextLease() = default;

    ContextLease(App * app_, llama_context * ctx_)
        : app(app_), ctx(ctx_), slot(nullptr), pooled(false) {}

    ContextLease(App * app_, PooledContextSlot * slot_)
        : app(app_), ctx(slot_ ? slot_->ctx : nullptr), slot(slot_), pooled(true) {}

    ContextLease(const ContextLease &) = delete;
    ContextLease & operator=(const ContextLease &) = delete;

    ContextLease(ContextLease && other) noexcept {
        app = other.app;
        ctx = other.ctx;
        slot = other.slot;
        pooled = other.pooled;
        other.app = nullptr;
        other.ctx = nullptr;
        other.slot = nullptr;
        other.pooled = false;
    }

    ContextLease & operator=(ContextLease && other) noexcept {
        if (this != &other) {
            release();
            app = other.app;
            ctx = other.ctx;
            slot = other.slot;
            pooled = other.pooled;
            other.app = nullptr;
            other.ctx = nullptr;
            other.slot = nullptr;
            other.pooled = false;
        }
        return *this;
    }

    ~ContextLease() {
        release();
    }

    void release() {
        if (!ctx) {
            return;
        }

        if (pooled && app && slot) {
            if (app->cfg.context_pool_clear_on_release) {
                llama_memory_clear(llama_get_memory(ctx), true);
            }

            {
                std::lock_guard<std::mutex> guard(app->context_pool_mutex);
                app->active_pooled_contexts = std::max(0, app->active_pooled_contexts - 1);
                app->context_pool_available.push_back(slot);
            }

            app->context_pool_cv.notify_one();
        } else {
            llama_free(ctx);
        }

        app = nullptr;
        ctx = nullptr;
        slot = nullptr;
        pooled = false;
    }
};

// Obtains a llama_context for one request and returns it wrapped in a lease.
//
// If the context pool can satisfy the requested dimensions, blocks until a
// slot is available, takes it, and reports the slot through the out
// parameters. Otherwise creates a fresh context with the requested
// dimensions, unless the pool is configured and strict, in which case it
// throws.
//
// Parameters:
//   n_ctx, n_batch, n_ubatch  requested context dimensions.
//   reused_out                set to true only when a pooled slot is returned.
//   pool_slot_id_out          slot id of the pooled slot, otherwise 0.
//   pool_wait_seconds_out     time from entering the pooled path until the
//                             slot is ready (includes the clear), otherwise 0.
//
// Throws std::runtime_error when strict pooling rejects the request or when
// llama_init_from_model() returns null.
//
// context_pool_waits counts only acquisitions that had to block because no
// slot was available; context_pool_acquires counts every pooled acquisition.
static ContextLease acquire_context_lease(
    App & app,
    int32_t n_ctx,
    int32_t n_batch,
    int32_t n_ubatch,
    bool & reused_out,
    uint64_t & pool_slot_id_out,
    double & pool_wait_seconds_out
) {
    reused_out = false;
    pool_slot_id_out = 0;
    pool_wait_seconds_out = 0.0;

    if (context_pool_can_satisfy(app, n_ctx, n_batch, n_ubatch)) {
        const auto wait_start = std::chrono::steady_clock::now();

        std::unique_lock<std::mutex> lock(app.context_pool_mutex);
        if (app.context_pool_available.empty()) {
            // Only a request that really has to block is recorded as a wait.
            app.context_pool_waits++;
            app.context_pool_cv.wait(lock, [&]() {
                return !app.context_pool_available.empty();
            });
        }

        PooledContextSlot * slot = app.context_pool_available.front();
        app.context_pool_available.pop_front();
        app.active_pooled_contexts++;
        app.context_pool_acquires++;
        app.context_pool_reuses++;

        lock.unlock();

        // Clear again on acquire so a failed previous request cannot leak state.
        if (app.cfg.context_pool_clear_on_release) {
            llama_memory_clear(llama_get_memory(slot->ctx), true);
        }
        llama_set_n_threads(slot->ctx, app.cfg.n_threads, app.cfg.n_threads);

        const auto wait_end = std::chrono::steady_clock::now();
        reused_out = true;
        pool_slot_id_out = slot->slot_id;
        pool_wait_seconds_out = seconds_between(wait_start, wait_end);
        return ContextLease(&app, slot);
    }

    // The pool cannot serve this request. In strict mode that is an error.
    if (app.cfg.context_pool_size > 0 && app.cfg.context_pool_strict) {
        std::ostringstream err;
        err << "request does not fit the configured context pool: requested n_ctx=" << n_ctx
            << ", n_batch=" << n_batch
            << ", n_ubatch=" << n_ubatch
            << "; pool n_ctx=" << app.context_pool_n_ctx_actual
            << ", n_batch=" << app.context_pool_n_batch_actual
            << ", n_ubatch=" << app.context_pool_n_ubatch_actual
            << ". Increase --context-pool-n-ctx/--context-pool-n-batch/--context-pool-n-ubatch or disable --context-pool-strict.";
        throw std::runtime_error(err.str());
    }

    {
        std::lock_guard<std::mutex> lock(app.context_pool_mutex);
        app.context_pool_misses++;
    }

    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_ctx = n_ctx;
    ctx_params.n_batch = n_batch;
    ctx_params.n_ubatch = n_ubatch;
    ctx_params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;

    // Hold the init mutex across context creation when serialization is on;
    // otherwise the lock object stays empty and owns nothing.
    std::unique_lock<std::mutex> context_lock;
    if (app.cfg.serialize_context_creation) {
        context_lock = std::unique_lock<std::mutex>(app.context_init_mutex);
    }

    llama_context * ctx = llama_init_from_model(app.model, ctx_params);
    if (!ctx) {
        throw std::runtime_error("failed to create llama_context");
    }

    llama_set_n_threads(ctx, app.cfg.n_threads, app.cfg.n_threads);
    return ContextLease(&app, ctx);
}

// Pre-creates app.cfg.context_pool_size llama contexts that all share one geometry
// and registers each of them as available.
//
// Geometry:
//   n_ctx    - cfg.context_pool_n_ctx when positive; otherwise cfg.default_n_ctx,
//              then cfg.default_processing_tokens, then a value derived from
//              cfg.default_max_tokens rounded up to whole canvases plus
//              cfg.ctx_headroom_tokens.
//   n_batch  - cfg.context_pool_n_batch when positive, otherwise n_ctx.
//   n_ubatch - cfg.context_pool_n_ubatch when positive, otherwise n_batch.
//
// The resolved geometry is stored in app.context_pool_*_actual and the summed
// per-context estimate in app.context_pool_memory_bytes.
//
// Throws std::runtime_error when a context cannot be created.
//
// NOTE(review): the pool containers are filled without taking
// app.context_pool_mutex; presumably this runs before any request thread
// exists - confirm against the caller.
static void init_context_pool(App & app) {
    if (app.cfg.context_pool_size <= 0) {
        return;
    }

    // Picks the pool context length following the fallback chain described above.
    const auto resolve_n_ctx = [&app]() -> int32_t {
        const int32_t configured = app.cfg.context_pool_n_ctx;
        if (configured > 0) {
            return configured;
        }
        if (app.cfg.default_n_ctx > 0) {
            return app.cfg.default_n_ctx;
        }
        if (app.cfg.default_processing_tokens > 0) {
            return app.cfg.default_processing_tokens;
        }

        // No explicit size anywhere: cover default_max_tokens with whole canvases
        // (a single block when the canvas length is unknown) and add the headroom.
        const int32_t cl = app.canvas_length > 0 ? app.canvas_length : app.cfg.default_max_tokens;
        const int32_t blocks = app.canvas_length > 0
            ? std::max(1, (app.cfg.default_max_tokens + cl - 1) / cl)
            : 1;
        return std::max(app.cfg.default_max_tokens, blocks * cl + app.cfg.ctx_headroom_tokens);
    };

    int32_t n_ctx = resolve_n_ctx();
    int32_t n_batch = n_ctx;
    if (app.cfg.context_pool_n_batch > 0) {
        n_batch = app.cfg.context_pool_n_batch;
    }
    int32_t n_ubatch = n_batch;
    if (app.cfg.context_pool_n_ubatch > 0) {
        n_ubatch = app.cfg.context_pool_n_ubatch;
    }

    app.context_pool_n_ctx_actual = n_ctx;
    app.context_pool_n_batch_actual = n_batch;
    app.context_pool_n_ubatch_actual = n_ubatch;

    std::cerr << "Creating context pool: size=" << app.cfg.context_pool_size
              << ", n_ctx=" << n_ctx
              << ", n_batch=" << n_batch
              << ", n_ubatch=" << n_ubatch << "\n";

    uint64_t total_estimated_bytes = 0;

    for (int index = 0; index < app.cfg.context_pool_size; ++index) {
        llama_context_params params = llama_context_default_params();
        params.n_ctx = n_ctx;
        params.n_batch = n_batch;
        params.n_ubatch = n_ubatch;
        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;

        // Held (when enabled) until the end of this iteration, exactly like the
        // per-request creation path that uses the same mutex.
        std::unique_lock<std::mutex> init_guard(app.context_init_mutex, std::defer_lock);
        if (app.cfg.serialize_context_creation) {
            init_guard.lock();
        }

        llama_context * created = llama_init_from_model(app.model, params);
        if (created == nullptr) {
            throw std::runtime_error("failed to create pooled llama_context");
        }

        llama_set_n_threads(created, app.cfg.n_threads, app.cfg.n_threads);
        llama_memory_clear(llama_get_memory(created), true);

        auto entry = std::make_unique<PooledContextSlot>();
        entry->ctx = created;
        entry->n_ctx = n_ctx;
        entry->n_batch = n_batch;
        entry->n_ubatch = n_ubatch;
        entry->slot_id = static_cast<uint64_t>(index) + 1;  // slot ids are 1-based

        // The raw pointer goes on the free list; ownership moves into the pool.
        app.context_pool_available.push_back(entry.get());
        app.context_pool.push_back(std::move(entry));

        total_estimated_bytes += estimate_request_memory_bytes(app, n_ctx, n_batch, n_ubatch);
    }

    app.context_pool_memory_bytes = total_estimated_bytes;

    // The admission gate treats context_pool_memory_bytes as a separate resident
    // baseline. Active pooled requests then reserve only request_base_memory_bytes.

    std::cerr << "Context pool ready. Estimated resident pool memory = "
              << bytes_to_mib(total_estimated_bytes) << " MiB\n";
}

// Frees every pooled llama context and empties the pool bookkeeping
// (free list, slot storage, active counter) while holding context_pool_mutex.
static void free_context_pool(App & app) {
    std::lock_guard<std::mutex> pool_guard(app.context_pool_mutex);

    for (auto & entry : app.context_pool) {
        if (!entry || !entry->ctx) {
            continue;  // empty slot or context already released
        }
        llama_free(entry->ctx);
        entry->ctx = nullptr;
    }

    // Drop the non-owning free list before the owning storage.
    app.context_pool_available.clear();
    app.context_pool.clear();
    app.active_pooled_contexts = 0;
}

// Builds a flat JSON object describing the memory accounting, queue state,
// context-pool state and OS/GPU memory figures.
//
// Callers in this file invoke it with app.memory_mutex already held
// (see memory_status_snapshot and the queue-timeout error path).
//
// NOTE(review): the context_pool_* counters read here are updated elsewhere
// under app.context_pool_mutex, which this function does not take - confirm
// whether that is acceptable for diagnostics.
static json memory_status_snapshot_locked(const App & app) {
    const uint64_t os_total = os_total_physical_memory_bytes();
    const uint64_t os_avail = os_available_physical_memory_bytes();
    const GpuMemoryInfo gpu_now = detect_gpu_memory_info(app.cfg.gpu_memory_policy);

    // The OS probe reports "unknown" as the maximum uint64_t value.
    const bool os_avail_known = os_avail != std::numeric_limits<uint64_t>::max();

    const uint64_t tracked_total =
        app.model_memory_bytes +
        app.context_pool_memory_bytes +
        app.active_request_memory_bytes +
        app.cfg.memory_safety_margin_bytes;

    json j;

    // Emits one quantity twice: raw bytes under bytes_key, MiB under mib_key.
    const auto put_sized = [&j](const char * bytes_key, const char * mib_key, const auto & amount) {
        j[bytes_key] = amount;
        j[mib_key] = bytes_to_mib(amount);
    };

    // --- request / budget accounting ---
    put_sized("model_memory_bytes", "model_memory_mib", app.model_memory_bytes);
    put_sized("active_request_memory_bytes", "active_request_memory_mib", app.active_request_memory_bytes);
    put_sized("peak_request_memory_bytes", "peak_request_memory_mib", app.peak_request_memory_bytes);
    put_sized("queued_request_memory_bytes", "queued_request_memory_mib", app.queued_request_memory_bytes);
    put_sized("peak_queued_request_memory_bytes", "peak_queued_request_memory_mib", app.peak_queued_request_memory_bytes);
    put_sized("tracked_total_with_safety_bytes", "tracked_total_with_safety_mib", tracked_total);
    put_sized("memory_limit_bytes", "memory_limit_mib", app.cfg.memory_limit_bytes);
    put_sized("memory_safety_margin_bytes", "memory_safety_margin_mib", app.cfg.memory_safety_margin_bytes);
    j["bytes_per_token"] = app.cfg.bytes_per_token;
    put_sized("request_base_memory_bytes", "request_base_memory_mib", app.cfg.request_base_memory_bytes);

    // --- queue and concurrency ---
    j["active_requests"] = app.active_requests;
    j["queued_requests"] = app.queued_requests;
    j["queue_depth"] = static_cast<int>(app.queue_order.size());
    j["next_queue_ticket"] = app.next_queue_ticket;
    j["max_concurrent_requests"] = app.cfg.max_concurrent_requests;
    j["fifo_queue"] = app.cfg.fifo_queue;
    j["max_queue_requests"] = app.cfg.max_queue_requests;
    put_sized("max_queue_memory_bytes", "max_queue_memory_mib", app.cfg.max_queue_memory_bytes);
    j["serialize_context_creation"] = app.cfg.serialize_context_creation;
    j["serialize_generation"] = app.cfg.serialize_generation;
    j["post_context_memory_check"] = app.cfg.post_context_memory_check;
    j["http_worker_threads"] = app.cfg.http_worker_threads;

    // --- context pool ---
    j["context_pool_size"] = app.cfg.context_pool_size;
    j["context_pool_strict"] = app.cfg.context_pool_strict;
    j["context_pool_clear_on_release"] = app.cfg.context_pool_clear_on_release;
    j["context_pool_n_ctx"] = app.context_pool_n_ctx_actual;
    j["context_pool_n_batch"] = app.context_pool_n_batch_actual;
    j["context_pool_n_ubatch"] = app.context_pool_n_ubatch_actual;
    put_sized("context_pool_memory_bytes", "context_pool_memory_mib", app.context_pool_memory_bytes);
    j["context_pool_available"] = static_cast<int>(app.context_pool_available.size());
    j["active_pooled_contexts"] = app.active_pooled_contexts;
    j["context_pool_reuses"] = app.context_pool_reuses;
    j["context_pool_misses"] = app.context_pool_misses;

    // --- OS physical memory (an unknown availability is reported as zero) ---
    put_sized("os_total_physical_bytes", "os_total_physical_mib", os_total);
    j["os_available_physical_bytes"] = os_avail_known ? os_avail : 0;
    j["os_available_physical_mib"] = os_avail_known ? bytes_to_mib(os_avail) : 0.0;

    // --- GPU: values stored on app at startup ---
    j["gpu_memory_detected"] = app.gpu_memory_detected;
    j["gpu_memory_policy"] = app.cfg.gpu_memory_policy;
    j["gpu_memory_source"] = app.gpu_memory_source;
    j["gpu_adapter_name"] = app.gpu_adapter_name;
    j["gpu_adapter_count"] = app.gpu_adapter_count;
    put_sized("gpu_dedicated_memory_bytes", "gpu_dedicated_memory_mib", app.gpu_dedicated_memory_bytes);
    put_sized("gpu_budget_bytes_at_start", "gpu_budget_mib_at_start", app.gpu_budget_bytes);
    put_sized("gpu_current_usage_bytes_at_start", "gpu_current_usage_mib_at_start", app.gpu_current_usage_bytes_at_start);
    put_sized("gpu_available_budget_bytes_at_start", "gpu_available_budget_mib_at_start", app.gpu_available_budget_bytes_at_start);

    // --- GPU: values probed just now ---
    put_sized("gpu_budget_bytes_now", "gpu_budget_mib_now", gpu_now.budget_bytes);
    put_sized("gpu_current_usage_bytes_now", "gpu_current_usage_mib_now", gpu_now.current_usage_bytes);
    put_sized("gpu_available_budget_bytes_now", "gpu_available_budget_mib_now", gpu_now.available_budget_bytes);
    j["gpu_dynamic_memory_guard"] = app.cfg.gpu_dynamic_memory_guard;
    j["memory_limit_auto_detected"] = app.cfg.memory_limit_auto_detected;

    return j;
}

// Takes app.memory_mutex for the duration of the call and returns the
// snapshot produced by memory_status_snapshot_locked().
static json memory_status_snapshot(App & app) {
    std::lock_guard<std::mutex> memory_guard(app.memory_mutex);
    json snapshot = memory_status_snapshot_locked(app);
    return snapshot;
}

// Returns a two-section JSON object ("system_physical", "gpu_physical") with
// the current OS and GPU memory figures.
//
// When the fresh GPU probe is not valid, the values stored on app at startup
// are reported instead. An unknown OS availability (probe returns the maximum
// uint64_t) is reported as 0 with "available_valid" set to false.
static json physical_memory_pressure_snapshot(const App & app) {
    const uint64_t os_total = os_total_physical_memory_bytes();
    const uint64_t os_avail_raw = os_available_physical_memory_bytes();
    const GpuMemoryInfo gpu_now = detect_gpu_memory_info(app.cfg.gpu_memory_policy);

    const bool os_avail_valid = os_avail_raw != std::numeric_limits<uint64_t>::max();

    uint64_t os_avail = 0;
    if (os_avail_valid) {
        os_avail = os_avail_raw;
    }

    // Consumed memory is only derived when both figures are usable and consistent.
    uint64_t os_used = 0;
    if (os_total > 0 && os_avail_valid && os_total >= os_avail) {
        os_used = os_total - os_avail;
    }

    const bool live = gpu_now.valid;
    const bool gpu_valid = live || app.gpu_memory_detected;
    const std::string gpu_source = live ? gpu_now.source : app.gpu_memory_source;
    const std::string gpu_adapter = live ? gpu_now.adapter_name : app.gpu_adapter_name;
    const uint64_t gpu_dedicated = live ? gpu_now.dedicated_video_memory_bytes : app.gpu_dedicated_memory_bytes;
    const uint64_t gpu_budget = live ? gpu_now.budget_bytes : app.gpu_budget_bytes;
    const uint64_t gpu_used = live ? gpu_now.current_usage_bytes : app.gpu_current_usage_bytes_at_start;
    const uint64_t gpu_available = live ? gpu_now.available_budget_bytes : app.gpu_available_budget_bytes_at_start;

    json system_section;
    system_section["total_bytes"] = os_total;
    system_section["total_mib"] = bytes_to_mib(os_total);
    system_section["available_bytes"] = os_avail;
    system_section["available_mib"] = bytes_to_mib(os_avail);
    system_section["currently_consumed_bytes"] = os_used;
    system_section["currently_consumed_mib"] = bytes_to_mib(os_used);
    system_section["available_valid"] = os_avail_valid;

    json gpu_section;
    gpu_section["detected"] = gpu_valid;
    gpu_section["source"] = gpu_source;
    gpu_section["policy"] = app.cfg.gpu_memory_policy;
    gpu_section["adapter_name"] = gpu_adapter;
    gpu_section["dedicated_total_bytes"] = gpu_dedicated;
    gpu_section["dedicated_total_mib"] = bytes_to_mib(gpu_dedicated);
    gpu_section["budget_bytes_now"] = gpu_budget;
    gpu_section["budget_mib_now"] = bytes_to_mib(gpu_budget);
    gpu_section["currently_consumed_bytes_now"] = gpu_used;
    gpu_section["currently_consumed_mib_now"] = bytes_to_mib(gpu_used);
    gpu_section["available_budget_bytes_now"] = gpu_available;
    gpu_section["available_budget_mib_now"] = bytes_to_mib(gpu_available);

    json j;
    j["system_physical"] = system_section;
    j["gpu_physical"] = gpu_section;
    return j;
}

struct MemoryReservation {
    App * app = nullptr;
    uint64_t request_id = 0;
    uint64_t bytes = 0;
    bool active = false;

    MemoryReservation() = default;

    MemoryReservation(App * app_, uint64_t request_id_, uint64_t bytes_)
        : app(app_), request_id(request_id_), bytes(bytes_), active(true) {}

    MemoryReservation(const MemoryReservation &) = delete;
    MemoryReservation & operator=(const MemoryReservation &) = delete;

    MemoryReservation(MemoryReservation && other) noexcept {
        app = other.app;
        request_id = other.request_id;
        bytes = other.bytes;
        active = other.active;

        other.app = nullptr;
        other.bytes = 0;
        other.active = false;
    }

    MemoryReservation & operator=(MemoryReservation && other) noexcept {
        if (this != &other) {
            release();

            app = other.app;
            request_id = other.request_id;
            bytes = other.bytes;
            active = other.active;

            other.app = nullptr;
            other.bytes = 0;
            other.active = false;
        }

        return *this;
    }

    ~MemoryReservation() {
        release();
    }

    void release() {
        if (!active || !app) {
            return;
        }

        {
            std::lock_guard<std::mutex> guard(app->memory_mutex);

            if (app->active_request_memory_bytes >= bytes) {
                app->active_request_memory_bytes -= bytes;
            } else {
                app->active_request_memory_bytes = 0;
            }

            app->active_requests = std::max(0, app->active_requests - 1);
        }

        app->memory_cv.notify_all();

        active = false;
    }
};

// True when this request could never fit under cfg.memory_limit_bytes even
// with no other request active: model + context pool + request + safety margin
// is larger than the limit. A limit of 0 disables the check.
static bool request_exceeds_fixed_budget_locked(const App & app, uint64_t request_bytes) {
    const bool limit_enabled = app.cfg.memory_limit_bytes != 0;
    if (!limit_enabled) {
        return false;
    }

    const auto standalone_footprint =
        app.model_memory_bytes +
        app.context_pool_memory_bytes +
        request_bytes +
        app.cfg.memory_safety_margin_bytes;

    return standalone_footprint > app.cfg.memory_limit_bytes;
}

// True when enqueueing a request of request_bytes would break either queue
// cap: the waiting-request count (cfg.max_queue_requests) or the summed
// waiting memory (cfg.max_queue_memory_bytes). A cap of 0 is not enforced.
static bool queue_is_over_capacity_locked(const App & app, uint64_t request_bytes) {
    const bool count_cap_hit =
        app.cfg.max_queue_requests > 0 &&
        app.queued_requests >= app.cfg.max_queue_requests;

    const bool memory_cap_hit =
        app.cfg.max_queue_memory_bytes > 0 &&
        app.queued_request_memory_bytes + request_bytes > app.cfg.max_queue_memory_bytes;

    return count_cap_hit || memory_cap_hit;
}

// Removes the first occurrence of `ticket` from app.queue_order; a ticket
// that is not present is ignored.
static void erase_queue_ticket_locked(App & app, uint64_t ticket) {
    auto & order = app.queue_order;

    const auto position = std::find(order.begin(), order.end(), ticket);
    if (position == order.end()) {
        return;
    }

    order.erase(position);
}

// With cfg.fifo_queue enabled, only the ticket at the front of
// app.queue_order may proceed; with it disabled every ticket may.
static bool queue_turn_ready_locked(const App & app, uint64_t ticket) {
    if (app.cfg.fifo_queue) {
        const bool has_waiters = !app.queue_order.empty();
        return has_waiters && app.queue_order.front() == ticket;
    }

    return true;
}

// Admission predicate evaluated by a queued request.
//
// The request may start only when all of these hold:
//   1. it is this ticket's turn (see queue_turn_ready_locked);
//   2. the concurrent-request cap, if set, is not reached;
//   3. model + pool + active + this request + safety margin fits under
//      cfg.memory_limit_bytes, if a limit is set;
//   4. with cfg.gpu_dynamic_memory_guard enabled, request + safety margin fits
//      in the currently available GPU budget, or - when GPU telemetry is not
//      usable - in the currently available OS physical memory (skipped when
//      that is unknown too).
static bool can_admit_request_locked(const App & app, uint64_t request_bytes, uint64_t ticket) {
    if (!queue_turn_ready_locked(app, ticket)) {
        return false;
    }

    const bool concurrency_capped = app.cfg.max_concurrent_requests > 0;
    if (concurrency_capped && app.active_requests >= app.cfg.max_concurrent_requests) {
        return false;
    }

    if (app.cfg.memory_limit_bytes > 0) {
        const uint64_t projected =
            app.model_memory_bytes +
            app.context_pool_memory_bytes +
            app.active_request_memory_bytes +
            request_bytes +
            app.cfg.memory_safety_margin_bytes;

        if (projected > app.cfg.memory_limit_bytes) {
            return false;
        }
    }

    if (!app.cfg.gpu_dynamic_memory_guard) {
        return true;
    }

    // Dynamic guard. The original author notes that the GPU figures are DXGI
    // Budget/CurrentUsage values reflecting current local video-memory
    // pressure, including other processes, so a request is held back while the
    // GPU budget is already too tight.
    const auto required_now = request_bytes + app.cfg.memory_safety_margin_bytes;

    const GpuMemoryInfo gpu = detect_gpu_memory_info(app.cfg.gpu_memory_policy);
    if (gpu.valid && gpu.budget_bytes > 0) {
        return required_now <= gpu.available_budget_bytes;
    }

    // Fallback OS physical memory guard when GPU memory telemetry is unavailable.
    const uint64_t avail_phys = os_available_physical_memory_bytes();
    if (avail_phys == std::numeric_limits<uint64_t>::max()) {
        return true;  // availability unknown: nothing to compare against
    }

    return required_now <= avail_phys;
}

// Queues the calling request until it may run and returns an RAII reservation
// for its estimated memory.
//
// Flow (all under app.memory_mutex):
//   1. reject immediately if the request alone exceeds the fixed budget or the
//      queue caps would be broken;
//   2. take a ticket, account the request as queued and wait on app.memory_cv
//      until can_admit_request_locked() holds (indefinitely when
//      cfg.queue_timeout_ms == 0, otherwise for at most that many ms);
//   3. on admission move the request from the queued to the active counters.
//
// Parameters:
//   app            - server state holding the counters, queue and mutex.
//   request_id     - stored in the returned reservation.
//   endpoint       - currently unused.
//   request_bytes  - estimated memory of the request.
//   *_out          - filled only on successful admission (queue_ticket_out is
//                    set as soon as the ticket is issued): queue wait in ms,
//                    active/queued counters right after admission, and the OS /
//                    GPU availability sampled at admission.
//
// Throws std::runtime_error when the request cannot fit, the queue is full, or
// the wait times out. Exceptions thrown while waiting are rethrown after the
// queue bookkeeping has been undone.
static MemoryReservation acquire_memory_reservation(
    App & app,
    uint64_t request_id,
    const std::string & endpoint,
    uint64_t request_bytes,
    uint64_t & queue_wait_ms_out,
    uint64_t & active_after_admit_out,
    uint64_t & os_available_at_admit_out,
    uint64_t & gpu_available_budget_at_admit_out,
    uint64_t & queue_ticket_out,
    int & active_requests_after_admit_out,
    int & queued_requests_after_admit_out,
    uint64_t & queued_memory_after_admit_out
) {
    const auto t_queue_start = std::chrono::steady_clock::now();

    std::unique_lock<std::mutex> lock(app.memory_mutex);

    if (request_exceeds_fixed_budget_locked(app, request_bytes)) {
        std::ostringstream err;
        err << "single request exceeds configured memory budget: model_memory_mib="
            << bytes_to_mib(app.model_memory_bytes)
            << ", context_pool_memory_mib=" << bytes_to_mib(app.context_pool_memory_bytes)
            << ", request_memory_mib=" << bytes_to_mib(request_bytes)
            << ", safety_margin_mib=" << bytes_to_mib(app.cfg.memory_safety_margin_bytes)
            << ", memory_limit_mib=" << bytes_to_mib(app.cfg.memory_limit_bytes)
            << ". Increase --memory-limit-mb or reduce max_tokens / processing_tokens.";
        throw std::runtime_error(err.str());
    }

    if (queue_is_over_capacity_locked(app, request_bytes)) {
        std::ostringstream err;
        err << "request rejected because memory queue is full: queued_requests="
            << app.queued_requests
            << ", max_queue_requests=" << app.cfg.max_queue_requests
            << ", queued_memory_mib=" << bytes_to_mib(app.queued_request_memory_bytes)
            << ", max_queue_memory_mib=" << bytes_to_mib(app.cfg.max_queue_memory_bytes);
        throw std::runtime_error(err.str());
    }

    const uint64_t ticket = app.next_queue_ticket++;
    queue_ticket_out = ticket;

    app.queued_requests++;
    app.queued_request_memory_bytes += request_bytes;
    app.peak_queued_request_memory_bytes = std::max(app.peak_queued_request_memory_bytes, app.queued_request_memory_bytes);
    app.queue_order.push_back(ticket);

    // Undoes the "queued" accounting for this ticket. Must run with the lock
    // held; used on every way out of the wait (error, timeout, admission).
    const auto leave_queue_locked = [&app, ticket, request_bytes]() {
        erase_queue_ticket_locked(app, ticket);
        app.queued_requests = std::max(0, app.queued_requests - 1);
        if (app.queued_request_memory_bytes >= request_bytes) {
            app.queued_request_memory_bytes -= request_bytes;
        } else {
            app.queued_request_memory_bytes = 0;
        }
    };

    bool admitted = false;

    auto admitted_pred = [&]() {
        return can_admit_request_locked(app, request_bytes, ticket);
    };

    try {
        if (app.cfg.queue_timeout_ms == 0) {
            app.memory_cv.wait(lock, admitted_pred);
            admitted = true;
        } else {
            const auto timeout = std::chrono::milliseconds(app.cfg.queue_timeout_ms);
            admitted = app.memory_cv.wait_for(lock, timeout, admitted_pred);
        }
    } catch (...) {
        leave_queue_locked();
        // The head of the queue may have changed: let the other waiters re-check.
        app.memory_cv.notify_all();
        throw;
    }

    if (!admitted) {
        leave_queue_locked();
        app.memory_cv.notify_all();

        std::ostringstream err;
        err << "request timed out waiting for memory queue after "
            << app.cfg.queue_timeout_ms
            << " ms. request_memory_mib=" << bytes_to_mib(request_bytes)
            << ", ticket=" << ticket
            << ", current_memory=" << memory_status_snapshot_locked(app).dump();
        throw std::runtime_error(err.str());
    }

    leave_queue_locked();

    app.active_request_memory_bytes += request_bytes;
    app.peak_request_memory_bytes = std::max(app.peak_request_memory_bytes, app.active_request_memory_bytes);
    app.active_requests++;

    // In FIFO mode the waiters behind this ticket were blocked purely because
    // it was not their turn. Removing the ticket makes the next one the head,
    // but nothing else would wake it until some request is released or its own
    // timeout fires, even if capacity for it is free right now. Wake the
    // waiters so the new head re-evaluates immediately.
    if (app.cfg.fifo_queue && !app.queue_order.empty()) {
        app.memory_cv.notify_all();
    }

    active_after_admit_out = app.active_request_memory_bytes;
    active_requests_after_admit_out = app.active_requests;
    queued_requests_after_admit_out = app.queued_requests;
    queued_memory_after_admit_out = app.queued_request_memory_bytes;

    os_available_at_admit_out = os_available_physical_memory_bytes();
    const GpuMemoryInfo gpu_at_admit = detect_gpu_memory_info(app.cfg.gpu_memory_policy);
    gpu_available_budget_at_admit_out = gpu_at_admit.available_budget_bytes;

    const auto t_queue_end = std::chrono::steady_clock::now();
    queue_wait_ms_out = (uint64_t) std::chrono::duration_cast<std::chrono::milliseconds>(t_queue_end - t_queue_start).count();

    (void) endpoint;

    return MemoryReservation(&app, request_id, request_bytes);
}

static std::string json_error(const std::string & message, int code = 500) {
    json j;
    j["error"] = {
        {"message", message},
        {"type", "server_error"},
        {"code", code}
    };
    return j.dump();
}

static int json_int_any(const json & body, const std::vector<const char *> & keys, int def) {
    for (const char * key : keys) {
        if (body.contains(key) && body[key].is_number_integer()) {
            return body[key].get<int>();
        }
    }
    return def;
}

static bool json_bool_any(const json & body, const std::vector<const char *> & keys, bool def) {
    for (const char * key : keys) {
        if (body.contains(key) && body[key].is_boolean()) {
            return body[key].get<bool>();
        }
    }
    return def;
}

// Extracts the per-request options from a parsed request body, falling back
// to the server configuration for anything missing or of the wrong JSON type.
//
// Parameters:
//   app                 - source of the configured defaults (app.cfg).
//   body                - parsed JSON request body.
//   completion_endpoint - when true, "n_predict" takes precedence over
//                         "max_tokens"; otherwise the order is reversed.
//
// Post-processing: a non-positive max_tokens is replaced by the configured
// default, a negative ctx_headroom_tokens becomes 0, and a non-positive
// stream_chunk_chars becomes the configured value (or 96 if that is not
// positive either).
static RequestOptions request_options_from_body(const App & app, const json & body, bool completion_endpoint) {
    RequestOptions opts;

    opts.max_tokens = completion_endpoint
        ? json_int_any(body, {"n_predict", "max_tokens"}, app.cfg.default_max_tokens)
        : json_int_any(body, {"max_tokens", "n_predict"}, app.cfg.default_max_tokens);

    // Read temperature tolerantly, like every other option here. json::value()
    // throws when the key exists with a non-numeric value (e.g. an explicit
    // `"temperature": null`) or when the body is not an object, which turned an
    // otherwise valid request into an error instead of using the default.
    auto temperature = app.cfg.temperature;
    if (body.contains("temperature") && body["temperature"].is_number()) {
        temperature = body["temperature"].get<decltype(temperature)>();
    }
    opts.temperature = temperature;

    opts.stream = json_bool_any(body, {"stream"}, false);

    opts.processing_tokens = json_int_any(
        body,
        {"processing_tokens", "allocated_tokens", "alloc_tokens"},
        app.cfg.default_processing_tokens
    );

    opts.n_ctx = json_int_any(body, {"n_ctx", "ctx_size", "context_size"}, app.cfg.default_n_ctx);
    opts.n_batch = json_int_any(body, {"n_batch", "batch_size"}, app.cfg.default_n_batch);
    opts.n_ubatch = json_int_any(body, {"n_ubatch", "ubatch_size", "micro_batch_size"}, app.cfg.default_n_ubatch);

    opts.ctx_headroom_tokens = json_int_any(
        body,
        {"ctx_headroom_tokens", "context_headroom", "processing_headroom"},
        app.cfg.ctx_headroom_tokens
    );

    opts.stream_chunk_chars = json_int_any(
        body,
        {"stream_chunk_chars", "stream_chunk_size"},
        app.cfg.stream_chunk_chars
    );

    // Normalize values that would be meaningless downstream.
    if (opts.max_tokens <= 0) {
        opts.max_tokens = app.cfg.default_max_tokens;
    }
    if (opts.ctx_headroom_tokens < 0) {
        opts.ctx_headroom_tokens = 0;
    }
    if (opts.stream_chunk_chars <= 0) {
        opts.stream_chunk_chars = app.cfg.stream_chunk_chars > 0 ? app.cfg.stream_chunk_chars : 96;
    }

    return opts;
}


// Walks `path` key by key starting at `root` and returns a pointer to the
// node reached, or nullptr as soon as a step lands on a non-object or on an
// object without the key. An empty path yields &root.
static const json * json_find_ptr(const json & root, const std::vector<const char *> & path) {
    const json * node = &root;

    for (auto step = path.begin(); step != path.end(); ++step) {
        const bool can_descend = node->is_object() && node->contains(*step);
        if (!can_descend) {
            return nullptr;
        }
        node = &((*node)[*step]);
    }

    return node;
}

static double json_number_or(const json & root, const std::vector<const char *> & path, double def = 0.0) {
    const json * p = json_find_ptr(root, path);
    if (!p || !p->is_number()) {
        return def;
    }
    return p->get<double>();
}

static int64_t json_i64_or(const json & root, const std::vector<const char *> & path, int64_t def = 0) {
    const json * p = json_find_ptr(root, path);
    if (!p || !p->is_number_integer()) {
        return def;
    }
    return p->get<int64_t>();
}

static bool json_bool_or(const json & root, const std::vector<const char *> & path, bool def = false) {
    const json * p = json_find_ptr(root, path);
    if (!p || !p->is_boolean()) {
        return def;
    }
    return p->get<bool>();
}

static std::string json_string_or(const json & root, const std::vector<const char *> & path, const std::string & def = "") {
    const json * p = json_find_ptr(root, path);
    if (!p || !p->is_string()) {
        return def;
    }
    return p->get<std::string>();
}

// Flattens physical_memory_pressure_snapshot() into a small object containing
// only the MiB figures plus the GPU detection flag, source and adapter name.
// Fields missing from the snapshot come out as 0 / false / "".
static json compact_memory_pressure_summary(const App & app) {
    const json pressure = physical_memory_pressure_snapshot(app);

    // Readers bound to one section of the snapshot.
    const auto system_mib = [&pressure](const char * field) {
        return json_number_or(pressure, {"system_physical", field});
    };
    const auto gpu_mib = [&pressure](const char * field) {
        return json_number_or(pressure, {"gpu_physical", field});
    };
    const auto gpu_text = [&pressure](const char * field) {
        return json_string_or(pressure, {"gpu_physical", field});
    };

    json summary;
    summary["system_available_mib"] = system_mib("available_mib");
    summary["system_consumed_mib"] = system_mib("currently_consumed_mib");
    summary["system_total_mib"] = system_mib("total_mib");
    summary["gpu_detected"] = json_bool_or(pressure, {"gpu_physical", "detected"});
    summary["gpu_source"] = gpu_text("source");
    summary["gpu_adapter"] = gpu_text("adapter_name");
    summary["gpu_available_mib"] = gpu_mib("available_budget_mib_now");
    summary["gpu_consumed_mib"] = gpu_mib("currently_consumed_mib_now");
    summary["gpu_budget_mib"] = gpu_mib("budget_mib_now");
    summary["gpu_dedicated_mib"] = gpu_mib("dedicated_total_mib");

    return summary;
}

// Produces the compact form of an audit record used when cfg.audit_summary is on.
//
// Every summary starts with event, model and model_arch (taken from the record,
// falling back to app.model_name / app.model_architecture) and copies
// timestamp, request_id, endpoint and success when the record has them.
// The rest depends on the event:
//   generation_complete     - tokens, timing_s, speed, allocation, queue, memory
//   request_memory_admitted - queue, memory
//   request_begin           - request
//   request_error           - error, memory
//   server_start            - server, memory
//   anything else           - the full record with "memory" set to the current
//                             compact memory-pressure summary
static json summarize_audit_record(App & app, const json & record) {
    const std::string kind = record.value("event", "");

    // Path readers bound to this record; missing fields read as 0 / 0.0 / false.
    const auto int_at = [&record](const std::vector<const char *> & path) {
        return json_i64_or(record, path);
    };
    const auto num_at = [&record](const std::vector<const char *> & path) {
        return json_number_or(record, path);
    };
    const auto flag_at = [&record](const std::vector<const char *> & path) {
        return json_bool_or(record, path);
    };

    json summary;
    summary["event"] = kind;

    // Always include the resolved model identity in compact audit/debug logs.
    summary["model"] = record.value("model", app.model_name);
    summary["model_arch"] = record.value("model_arch", app.model_architecture);

    // Pass-through fields, copied only when present.
    for (const char * passthrough : {"timestamp", "request_id", "endpoint", "success"}) {
        if (record.contains(passthrough)) {
            summary[passthrough] = record[passthrough];
        }
    }

    if (kind == "generation_complete") {
        summary["tokens"] = {
            {"prompt", int_at({"tokens", "prompt_tokens"})},
            {"requested", int_at({"tokens", "requested_max_tokens"})},
            {"output", int_at({"tokens", "output_tokens"})},
            {"canvas_processed", int_at({"tokens", "canvas_tokens_processed"})}
        };

        summary["timing_s"] = {
            {"queue", num_at({"timings_seconds", "queue_wait"})},
            {"context", num_at({"timings_seconds", "context_create"})},
            {"context_pool_wait", num_at({"timings_seconds", "context_pool_wait"})},
            {"generation", num_at({"timings_seconds", "diffusion_generate"})},
            {"total", num_at({"timings_seconds", "total_request"})}
        };

        summary["speed"] = {
            {"output_tps_total", num_at({"throughput", "output_tokens_per_sec_total"})},
            {"output_tps_generation", num_at({"throughput", "output_tokens_per_sec_generation_only"})},
            {"canvas_tps_generation", num_at({"throughput", "canvas_tokens_per_sec_generation_only"})}
        };

        summary["allocation"] = {
            {"n_ctx", int_at({"diffusion", "n_ctx"})},
            {"n_batch", int_at({"diffusion", "n_batch"})},
            {"n_ubatch", int_at({"diffusion", "n_ubatch"})},
            {"blocks", int_at({"diffusion", "blocks_completed"})},
            {"canvas_length", int_at({"diffusion", "canvas_length"})},
            {"request_est_mib", num_at({"memory", "estimated_request_memory_mib"})},
            {"model_mib", num_at({"memory", "model_memory_mib"})},
            {"context_reused", flag_at({"memory", "context_reused"})},
            {"context_pool_slot", int_at({"memory", "context_pool_slot_id"})},
            {"context_pool_mib", num_at({"memory", "context_pool_memory_mib"})}
        };

        summary["queue"] = {
            {"ticket", int_at({"memory", "queue_ticket"})},
            {"active_after_admit", int_at({"memory", "active_requests_after_admit"})},
            {"queued_after_admit", int_at({"memory", "queued_requests_after_admit"})},
            {"active_mib_after_admit", num_at({"memory", "active_request_memory_mib_after_admit"})},
            {"active_mib_after_release", num_at({"memory", "active_request_memory_mib_after_release"})}
        };

        // Current pressure figures plus the limits recorded with the request.
        json pressure = compact_memory_pressure_summary(app);
        pressure["memory_limit_mib"] = num_at({"memory", "memory_limit_mib"});
        pressure["safety_margin_mib"] = num_at({"memory", "memory_safety_margin_mib"});
        summary["memory"] = pressure;
        return summary;
    }

    if (kind == "request_memory_admitted") {
        summary["queue"] = {
            {"ticket", int_at({"queue_ticket"})},
            {"wait_ms", int_at({"queue_wait_ms"})},
            {"active_requests", int_at({"active_requests_after_admit"})},
            {"queued_requests", int_at({"queued_requests_after_admit"})}
        };

        summary["memory"] = {
            {"request_est_mib", num_at({"estimated_request_memory_mib"})},
            {"active_mib_after_admit", num_at({"active_request_memory_mib_after_admit"})},
            {"queued_mib_after_admit", num_at({"queued_request_memory_mib_after_admit"})},
            {"system_available_mib_at_admit", num_at({"os_available_physical_mib_at_admit"})},
            {"gpu_available_mib_at_admit", num_at({"gpu_available_budget_mib_at_admit"})}
        };
        return summary;
    }

    if (kind == "request_begin") {
        summary["request"] = {
            {"max_tokens", record.value("max_tokens", 0)},
            {"temperature", record.value("temperature", 0.0)},
            {"stream", record.value("stream", false)},
            {"processing_tokens", record.value("processing_tokens", 0)},
            {"n_ctx", record.value("n_ctx", 0)},
            {"n_batch", record.value("n_batch", 0)},
            {"n_ubatch", record.value("n_ubatch", 0)},
            {"ctx_headroom_tokens", record.value("ctx_headroom_tokens", 0)}
        };
        return summary;
    }

    if (kind == "request_error") {
        summary["error"] = record.value("error", "");
        summary["memory"] = compact_memory_pressure_summary(app);
        return summary;
    }

    if (kind == "server_start") {
        summary["server"] = {
            {"host", record.value("host", "")},
            {"port", record.value("port", 0)},
            {"model_name", record.value("model_name", app.model_name)},
            {"model_arch", record.value("model_arch", app.model_architecture)},
            {"model_path", record.value("model_path", app.cfg.model_path)},
            {"canvas_length", record.value("canvas_length", 0)},
            {"default_max_tokens", record.value("default_max_tokens", 0)},
            {"max_concurrent_requests", record.value("max_concurrent_requests", 0)},
            {"http_worker_threads", record.value("http_worker_threads", 0)}
        };
        summary["memory"] = {
            {"model_mib", record.value("model_memory_mib", 0.0)},
            {"memory_limit_mib", record.value("memory_limit_mib", 0.0)},
            {"safety_margin_mib", record.value("memory_safety_margin_mib", 0.0)},
            {"gpu_detected", record.value("gpu_memory_detected", false)},
            {"gpu_source", record.value("gpu_memory_source", "")},
            {"gpu_adapter", record.value("gpu_adapter_name", "")},
            {"gpu_budget_mib", record.value("gpu_budget_mib", 0.0)},
            {"gpu_available_mib_at_start", record.value("gpu_available_budget_mib_at_start", 0.0)}
        };
        return summary;
    }

    // Unrecognized event: keep the record as-is and attach the current pressure.
    summary = record;
    summary["memory"] = compact_memory_pressure_summary(app);
    return summary;
}

// Writes one audit record as a single JSON line.
//
// The record is taken by value so defaults (timestamp, model, model_arch) can be
// filled in without touching the caller's object. Depending on configuration the
// record is either condensed via summarize_audit_record() or, in full mode,
// optionally extended with a physical memory pressure snapshot.
//
// The line always goes to stderr; when an audit log path is configured it is also
// appended to that file. Both writes happen under app.audit_mutex.
static void audit_log(App & app, json record) {
    if (!app.cfg.audit_enabled) {
        return;
    }

    // Fill in defaults only for keys the caller did not set explicitly.
    if (!record.contains("timestamp")) {
        record["timestamp"] = timestamp_utc_ms();
    }
    if (!record.contains("model")) {
        record["model"] = app.model_name;
    }
    if (!record.contains("model_arch")) {
        record["model_arch"] = app.model_architecture;
    }

    json emitted;
    if (app.cfg.audit_summary) {
        emitted = summarize_audit_record(app, record);
    } else {
        emitted = record;
        const bool wants_pressure = app.cfg.audit_memory_pressure;
        if (wants_pressure && !emitted.contains("memory_pressure")) {
            emitted["memory_pressure"] = physical_memory_pressure_snapshot(app);
        }
    }

    // Serialize before taking the lock so the critical section only covers I/O.
    const std::string serialized = emitted.dump();

    std::lock_guard<std::mutex> io_guard(app.audit_mutex);

    std::cerr << "[audit] " << serialized << "\n";

    const std::string & log_path = app.cfg.audit_log_path;
    if (log_path.empty()) {
        return;
    }

    std::ofstream sink(log_path, std::ios::out | std::ios::app);
    if (!sink) {
        std::cerr << "[audit_error] failed to open audit log file: " << log_path << "\n";
        return;
    }

    sink << serialized << "\n";
}

// Strips channel markup, channel labels and stop markers from detokenized model output.
//
// Steps, in order:
//   1. If a "<channel|>" close marker is present, keep only the text after the last one
//      (unless that tail is empty).
//   2. Remove any leading "<|channel>" open markers.
//   3. Remove one leading channel label ("final", "answer" or "assistant"), but only when
//      it stands alone as a word. Without the word-boundary check an answer that merely
//      starts with those letters ("finally, ...", "answers vary ...") lost its first
//      characters.
//   4. Truncate at the first occurrence of each known stop marker.
//
// Returns the cleaned text, or the whitespace-trimmed input when cleanup would leave
// nothing.
static std::string clean_diffusion_output(std::string s) {
    const std::string raw = trim_ws(s);

    // DiffusionGemma may output:
    //   <|channel>thought ... <channel|>final answer
    // Keep the text after the last channel close marker, but only if non-empty.
    const std::string close = "<channel|>";
    const size_t p = s.rfind(close);
    if (p != std::string::npos) {
        std::string tail = trim_ws(s.substr(p + close.size()));
        if (!tail.empty()) {
            s = tail;
        }
    }

    s = trim_ws(s);

    // Remove leading channel markers, but never return empty just because cleanup was aggressive.
    const std::string open = "<|channel>";
    while (s.rfind(open, 0) == 0) {
        s.erase(0, open.size());
        s = trim_ws(s);
    }

    const char * prefixes[] = {
        "final",
        "answer",
        "assistant"
    };

    // ASCII-only identifier test; avoids locale-dependent <cctype> behavior.
    const auto is_word_char = [](char c) {
        return (c >= 'a' && c <= 'z') ||
               (c >= 'A' && c <= 'Z') ||
               (c >= '0' && c <= '9') ||
               c == '_';
    };

    for (const char * pref : prefixes) {
        const std::string pfx(pref);
        if (s.rfind(pfx, 0) != 0) {
            continue;
        }

        // The label must end at a word boundary; otherwise it is part of the answer text.
        if (pfx.size() < s.size() && is_word_char(s[pfx.size()])) {
            continue;
        }

        s.erase(0, pfx.size());
        s = trim_ws(s);
        break;
    }

    const char * stops[] = {
        "<|end|>",
        "<end_of_turn>",
        "</s>"
    };

    for (const char * stop : stops) {
        const size_t q = s.find(stop);
        if (q != std::string::npos) {
            s.erase(q);
        }
    }

    s = trim_ws(s);
    return s.empty() ? raw : s;
}

// Reads a model metadata value and parses it as a base-10 int32.
//
// Returns `def` when the key cannot be read, when the value does not start with a
// number, or when the parsed number does not fit in int32_t. Previously an
// unparsable value silently became 0 and an out-of-range one was truncated.
// Trailing characters after the number are still ignored.
static int32_t get_meta_i(llama_model * model, const char * key, int32_t def) {
    char buf[64] = {};
    if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) < 0) {
        return def;
    }

    errno = 0;
    char * end = nullptr;
    const long parsed = std::strtol(buf, &end, 10);

    if (end == buf || errno == ERANGE) {
        return def;
    }
    if (parsed < (long) std::numeric_limits<int32_t>::min() ||
        parsed > (long) std::numeric_limits<int32_t>::max()) {
        return def;
    }

    return (int32_t) parsed;
}

// Reads a model metadata value and parses it as a float.
//
// Returns `def` when the key cannot be read or when the value does not start with a
// number. Previously a non-numeric value silently became 0.0f.
// Trailing characters after the number are still ignored.
static float get_meta_f(llama_model * model, const char * key, float def) {
    char buf[64] = {};
    if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) < 0) {
        return def;
    }

    char * end = nullptr;
    const float parsed = std::strtof(buf, &end);

    // No characters consumed means there was nothing numeric to parse.
    if (end == buf) {
        return def;
    }

    return parsed;
}

// Reads a model metadata value as a boolean.
// Only the exact strings "true" and "1" count as true; any other stored value is false.
// Returns `def` when the key cannot be read.
static bool get_meta_bool(llama_model * model, const char * key, bool def) {
    char value[64] = {};

    const bool found = llama_model_meta_val_str(model, key, value, sizeof(value)) >= 0;
    if (!found) {
        return def;
    }

    if (std::strcmp(value, "true") == 0) {
        return true;
    }
    return std::strcmp(value, "1") == 0;
}

// Reads a model metadata value as a whitespace-trimmed string.
// Returns `def` when the key cannot be read or the trimmed value is empty.
static std::string get_meta_string(llama_model * model, const char * key, const std::string & def) {
    char value[512] = {};

    if (llama_model_meta_val_str(model, key, value, sizeof(value)) < 0) {
        return def;
    }

    std::string trimmed = trim_ws(value);
    if (trimmed.empty()) {
        return def;
    }

    return trimmed;
}

// Returns the part of `path` after the last '/' or '\\' separator.
// A path without separators is returned unchanged; a path ending in a separator
// yields an empty string.
static std::string basename_from_path(const std::string & path) {
    const std::string::size_type last_sep = path.find_last_of("\\/");
    return last_sep == std::string::npos ? path : path.substr(last_sep + 1);
}

static int count_gpu_devices() {
    int gpu_devs = 0;

    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        const ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        const enum ggml_backend_dev_type type = ggml_backend_dev_type(dev);

        if (type == GGML_BACKEND_DEVICE_TYPE_GPU || type == GGML_BACKEND_DEVICE_TYPE_IGPU) {
            gpu_devs++;
        }
    }

    return gpu_devs;
}

// Returns how many leading tokens of `canvas` (length `n`) should be kept.
//
// The canvas is cut at the first end-of-generation token, and additionally at the
// start of the first obvious repetition loop found before that point.
//
// A repetition loop starting at index i with period P (1 or 2) means that the window
// canvas[i .. i + 6*P] is fully P-periodic, i.e. every token in it equals the token P
// positions later. The earlier check for P == 2 compared only every second token,
// so ordinary text such as "1, 2, 3, 4, 5, 6, 7" (a comma at every other position)
// was mistaken for a loop and truncated. Period 1 detection is unchanged.
static size_t trim_canvas_tokens(const llama_vocab * vocab, const llama_token * canvas, size_t n) {
    size_t cut = n;

    for (size_t i = 0; i < n; i++) {
        if (llama_vocab_is_eog(vocab, canvas[i])) {
            cut = i;
            break;
        }
    }

    // DiffusionGemma checkpoints sometimes do not emit a clean stop token.
    // Cut obvious repetition loops.
    const size_t min_repeats = 6;
    const size_t max_period = 2;

    for (size_t i = 0; i + 1 < cut; i++) {
        for (size_t period = 1; period <= max_period; period++) {
            // Number of consecutive "token == token one period later" matches needed so
            // that the periodic window spans min_repeats repetitions from index i.
            const size_t needed = period * (min_repeats - 1) + 1;

            size_t matched = 0;
            for (size_t j = i; matched < needed && j + period < cut && canvas[j] == canvas[j + period]; j++) {
                matched++;
            }

            if (matched >= needed) {
                // Drop the whole repeated run, including its first occurrence.
                return i;
            }
        }
    }

    return cut;
}

static std::string messages_to_prompt(llama_model * model, const json & messages, bool use_chat_template) {
    if (!messages.is_array()) {
        throw std::runtime_error("messages must be an array");
    }

    if (!use_chat_template) {
        std::ostringstream raw;

        for (const auto & m : messages) {
            const std::string role = m.value("role", "user");
            const std::string content = m.value("content", "");
            raw << role << ": " << content << "\n";
        }

        raw << "assistant: ";
        return raw.str();
    }

    auto tmpls = common_chat_templates_init(model, "");

    common_chat_templates_inputs inputs;
    inputs.add_generation_prompt = true;

    for (const auto & m : messages) {
        common_chat_msg msg;
        msg.role = m.value("role", "user");

        if (m.contains("content") && m["content"].is_string()) {
            msg.content = m["content"].get<std::string>();
        } else if (m.contains("content") && m["content"].is_array()) {
            std::ostringstream content;

            for (const auto & part : m["content"]) {
                if (part.is_object() && part.value("type", "") == "text") {
                    content << part.value("text", "");
                }
            }

            msg.content = content.str();
        } else {
            msg.content = "";
        }

        inputs.messages.push_back(msg);
    }

    return common_chat_templates_apply(tmpls.get(), inputs).prompt;
}

static std::string completion_prompt_from_body(const json & body) {
    if (!body.contains("prompt")) {
        throw std::runtime_error("missing prompt");
    }

    if (!body["prompt"].is_string()) {
        throw std::runtime_error("prompt must be a string");
    }

    return body["prompt"].get<std::string>();
}

// Runs one generation request end to end and returns the text plus bookkeeping.
//
// Phases, in order:
//   1. Tokenize the already-formatted prompt.
//   2. Size n_ctx / n_batch / n_ubatch from max_tokens, the canvas length and any
//      per-request overrides, and reject allocations that are too small.
//   3. Acquire a memory reservation (may queue) and log a "request_memory_admitted" record.
//   4. Acquire a llama_context lease and optionally re-check GPU memory.
//   5. Generate: block-wise entropy-bound diffusion when app.canvas_length > 0,
//      otherwise a single diffusion_generate() call.
//   6. Release the context, detokenize and clean the text, release the reservation.
//   7. Log a "generation_complete" record and fill the GenerationResult.
//
// Parameters:
//   app              - server state (model, vocab, configuration, shared mutexes).
//   request_id       - identifier echoed into audit records.
//   endpoint         - endpoint label echoed into audit records.
//   formatted_prompt - prompt text to tokenize.
//   opts             - per-request options (max_tokens, temperature, sizing overrides, stream flag).
//
// Throws std::runtime_error when tokenization yields nothing, the model has no mask
// token, the context or batch allocation is too small, no context can be acquired,
// the post-context GPU check fails, the prompt plus canvas does not fit, or the first
// generation step produces no tokens.
static GenerationResult run_one_request(
    App & app,
    uint64_t request_id,
    const std::string & endpoint,
    const std::string & formatted_prompt,
    const RequestOptions & opts
) {
    const auto t_enter = std::chrono::steady_clock::now();

    // A non-positive request value falls back to the server default.
    const int max_tokens = opts.max_tokens > 0 ? opts.max_tokens : app.cfg.default_max_tokens;
    const float temperature = opts.temperature;

    const auto t_tokenize_start = std::chrono::steady_clock::now();

    std::vector<llama_token> prefix = common_tokenize(
        app.vocab,
        formatted_prompt,
        true,
        true
    );

    const auto t_tokenize_end = std::chrono::steady_clock::now();

    if (prefix.empty()) {
        throw std::runtime_error("tokenization produced no tokens");
    }

    // Prompt length before any generated canvas blocks are appended to `prefix`.
    const int32_t n_input = (int32_t) prefix.size();

    llama_token mask_token_id = llama_vocab_mask(app.vocab);
    if (mask_token_id == LLAMA_TOKEN_NULL) {
        throw std::runtime_error("model has no mask token");
    }

    int32_t blocks_requested = 1;
    int32_t blocks_completed = 0;
    int32_t min_required_tokens = n_input + max_tokens;
    int32_t n_ctx = min_required_tokens;

    if (app.canvas_length > 0) {
        const int32_t cl = app.canvas_length;

        // Round max_tokens up to whole canvas blocks, with at least one block.
        blocks_requested = (max_tokens + cl - 1) / cl;
        blocks_requested = std::max(1, blocks_requested);

        min_required_tokens = n_input + blocks_requested * cl;

        // Match diffusion-cli.cpp sizing style unless the user overrides it:
        //   -n 256 -> 1 block
        //   n_ctx/n_batch/n_ubatch = blocks * canvas_length + ctx_headroom_tokens
        const int32_t cli_style = blocks_requested * cl + opts.ctx_headroom_tokens;
        n_ctx = std::max(cli_style, min_required_tokens);
    }

    // Explicit overrides; n_ctx takes precedence over processing_tokens when both are set.
    if (opts.processing_tokens > 0) {
        n_ctx = opts.processing_tokens;
    }
    if (opts.n_ctx > 0) {
        n_ctx = opts.n_ctx;
    }

    if (n_ctx < min_required_tokens) {
        std::ostringstream err;
        err << "allocated context is too small: n_ctx=" << n_ctx
            << " but minimum required is " << min_required_tokens
            << " tokens. Increase processing_tokens / allocated_tokens / n_ctx or lower max_tokens.";
        throw std::runtime_error(err.str());
    }

    int32_t n_batch = opts.n_batch > 0 ? opts.n_batch : n_ctx;
    int32_t n_ubatch = opts.n_ubatch > 0 ? opts.n_ubatch : n_batch;

    if (n_batch < min_required_tokens || n_ubatch < min_required_tokens) {
        std::ostringstream err;
        err << "allocated batch is too small: n_batch=" << n_batch
            << ", n_ubatch=" << n_ubatch
            << ", minimum required=" << min_required_tokens
            << ". For DiffusionGemma, keep n_batch/n_ubatch >= prompt_tokens + canvas tokens.";
        throw std::runtime_error(err.str());
    }

    const bool context_pool_candidate = context_pool_can_satisfy(app, n_ctx, n_batch, n_ubatch);
    const uint64_t estimated_request_memory_bytes = estimate_active_request_memory_bytes(
        app, n_ctx, n_batch, n_ubatch, context_pool_candidate
    );

    // Out-parameters filled by acquire_memory_reservation().
    uint64_t queue_wait_ms = 0;
    uint64_t active_memory_after_admit = 0;
    uint64_t os_available_at_admit = 0;
    uint64_t gpu_available_budget_at_admit = 0;
    uint64_t queue_ticket = 0;
    int active_requests_after_admit = 0;
    int queued_requests_after_admit = 0;
    uint64_t queued_memory_after_admit = 0;

    MemoryReservation memory_reservation = acquire_memory_reservation(
        app,
        request_id,
        endpoint,
        estimated_request_memory_bytes,
        queue_wait_ms,
        active_memory_after_admit,
        os_available_at_admit,
        gpu_available_budget_at_admit,
        queue_ticket,
        active_requests_after_admit,
        queued_requests_after_admit,
        queued_memory_after_admit
    );

    const auto t_lock_acquired = std::chrono::steady_clock::now();

    // uint64 max is treated as a sentinel for the OS-available figure (presumably
    // "unknown" - confirm against acquire_memory_reservation) and reported as 0.
    // Normalize once so the admission record and the completion record agree;
    // previously only the admission record applied this mapping.
    const bool os_available_known = os_available_at_admit != std::numeric_limits<uint64_t>::max();
    const uint64_t os_available_at_admit_reported = os_available_known ? os_available_at_admit : 0;
    const auto os_available_mib_at_admit_reported = os_available_known ? bytes_to_mib(os_available_at_admit) : 0.0;

    json admit_audit;
    admit_audit["event"] = "request_memory_admitted";
    admit_audit["request_id"] = request_id;
    admit_audit["endpoint"] = endpoint;
    admit_audit["queue_ticket"] = queue_ticket;
    admit_audit["estimated_request_memory_bytes"] = estimated_request_memory_bytes;
    admit_audit["estimated_request_memory_mib"] = bytes_to_mib(estimated_request_memory_bytes);
    admit_audit["queue_wait_ms"] = queue_wait_ms;
    admit_audit["active_request_memory_bytes_after_admit"] = active_memory_after_admit;
    admit_audit["active_request_memory_mib_after_admit"] = bytes_to_mib(active_memory_after_admit);
    admit_audit["active_requests_after_admit"] = active_requests_after_admit;
    admit_audit["queued_requests_after_admit"] = queued_requests_after_admit;
    admit_audit["queued_request_memory_bytes_after_admit"] = queued_memory_after_admit;
    admit_audit["queued_request_memory_mib_after_admit"] = bytes_to_mib(queued_memory_after_admit);
    admit_audit["os_available_physical_bytes_at_admit"] = os_available_at_admit_reported;
    admit_audit["os_available_physical_mib_at_admit"] = os_available_mib_at_admit_reported;
    admit_audit["gpu_available_budget_bytes_at_admit"] = gpu_available_budget_at_admit;
    admit_audit["gpu_available_budget_mib_at_admit"] = bytes_to_mib(gpu_available_budget_at_admit);
    admit_audit["memory"] = memory_status_snapshot(app);
    audit_log(app, admit_audit);

    // NOTE(review): these two timestamps are taken back to back, so the non-reused
    // "context_init_lock_wait" derived from them below is effectively zero.
    const auto t_context_lock_wait_start = std::chrono::steady_clock::now();
    const auto t_context_start = std::chrono::steady_clock::now();

    bool context_reused = false;
    uint64_t context_pool_slot_id = 0;
    double context_pool_wait_s = 0.0;

    ContextLease ctx_lease = acquire_context_lease(
        app,
        n_ctx,
        n_batch,
        n_ubatch,
        context_reused,
        context_pool_slot_id,
        context_pool_wait_s
    );

    llama_context * ctx = ctx_lease.ctx;
    if (!ctx) {
        throw std::runtime_error("failed to acquire llama_context");
    }

    const auto t_context_end = std::chrono::steady_clock::now();
    const double context_lock_wait_s = context_reused ? context_pool_wait_s : seconds_between(t_context_lock_wait_start, t_context_start);

    // Optional guard: fail the request when the GPU budget left after context
    // acquisition is below half of the configured safety margin.
    if (app.cfg.post_context_memory_check && app.cfg.gpu_dynamic_memory_guard) {
        const GpuMemoryInfo gpu_after_context = detect_gpu_memory_info(app.cfg.gpu_memory_policy);
        if (gpu_after_context.valid && gpu_after_context.available_budget_bytes > 0 &&
            gpu_after_context.available_budget_bytes < app.cfg.memory_safety_margin_bytes / 2) {
            ctx_lease.release();
            std::ostringstream err;
            err << "post-context GPU memory safety check failed: available_budget_mib="
                << bytes_to_mib(gpu_after_context.available_budget_bytes)
                << ", required_half_safety_margin_mib="
                << bytes_to_mib(app.cfg.memory_safety_margin_bytes / 2);
            throw std::runtime_error(err.str());
        }
    }

    // Scratch buffer the generators write prompt + generated tokens into.
    std::vector<llama_token> output_tokens(n_ctx);
    std::vector<llama_token> response_tokens;

    // NOTE(review): in canvas mode only eb_params is passed to the generator below;
    // diff_params is populated in both modes but consumed only by diffusion_generate().
    diffusion_params diff_params;
    diff_params.mask_token_id = mask_token_id;
    diff_params.seed = app.cfg.seed;
    diff_params.temperature = temperature;
    diff_params.steps = app.cfg.diffusion_steps;
    diff_params.algorithm = DIFFUSION_ALGORITHM_CONFIDENCE_BASED;
    diff_params.top_p = app.cfg.top_p;
    diff_params.top_k = app.cfg.top_k;
    diff_params.visual_mode = false;
    diff_params.add_gumbel_noise = false;

    int eb_max_denoising_steps = 0;
    int gpu_devs = count_gpu_devices();

    // Optionally serialize generation across requests; the wait is measured for the audit.
    const auto t_generation_lock_wait_start = std::chrono::steady_clock::now();
    std::unique_lock<std::mutex> generation_lock;
    if (app.cfg.serialize_generation) {
        generation_lock = std::unique_lock<std::mutex>(app.generation_mutex);
    }
    const auto t_diffusion_start = std::chrono::steady_clock::now();
    const double generation_lock_wait_s = seconds_between(t_generation_lock_wait_start, t_diffusion_start);

    if (app.canvas_length > 0) {
        diff_params.shift_logits = get_meta_bool(app.model, "diffusion.shift_logits", false);
        diff_params.schedule = DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED;
        diff_params.eps = 1e-3f;
        diff_params.suppress_mask_token = true;
        diff_params.self_conditioning = true;
    } else {
        diff_params.shift_logits = get_meta_bool(app.model, "diffusion.shift_logits", true);
        diff_params.schedule = DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED;
        diff_params.eps = 1e-3f;
        diff_params.max_length = n_ctx;
    }

    if (app.canvas_length > 0) {
        // Entropy-bound parameters come from model metadata with built-in fallbacks.
        diffusion_eb_params eb_params;
        eb_params.max_denoising_steps = get_meta_i(app.model, "diffusion.eb_max_steps", 48);
        eb_params.t_min = get_meta_f(app.model, "diffusion.eb_t_min", 0.4f);
        eb_params.t_max = get_meta_f(app.model, "diffusion.eb_t_max", 0.8f);
        eb_params.entropy_bound = get_meta_f(app.model, "diffusion.eb_entropy_bound", 0.1f);
        eb_params.stability_threshold = get_meta_i(app.model, "diffusion.eb_stability_threshold", 1);
        eb_params.confidence_threshold = get_meta_f(app.model, "diffusion.eb_confidence_threshold", 0.005f);
        eb_params.seed = app.cfg.seed;
        eb_params.visual_mode = false;

        eb_max_denoising_steps = eb_params.max_denoising_steps;

        // Same auto-policy as diffusion-cli.cpp:
        // single GPU -> KV cache and device sampling on where backend supports it.
        // multi GPU  -> off because these paths are single-device.
        eb_params.kv_cache = gpu_devs <= 1;
        eb_params.gpu_sampling = gpu_devs <= 1;
        eb_params.gpu_sample_reduce = eb_params.gpu_sampling && gpu_devs == 1;

        for (int32_t b = 0; b < blocks_requested; b++) {
            const int32_t prefix_len = (int32_t) prefix.size();
            const int32_t max_length = prefix_len + app.canvas_length;

            // Failing on the first block is an error; on later blocks keep what we have.
            if (max_length > n_ctx) {
                if (b == 0) {
                    ctx_lease.release();
                    throw std::runtime_error("prompt + canvas does not fit in context");
                }
                break;
            }

            eb_params.max_length = max_length;

            int32_t n_generated = 0;

            diffusion_generate_entropy_bound(
                ctx,
                prefix.data(),
                output_tokens.data(),
                prefix_len,
                eb_params,
                n_generated
            );

            if (n_generated <= prefix_len) {
                if (b == 0) {
                    ctx_lease.release();
                    throw std::runtime_error("diffusion generation failed");
                }
                break;
            }

            blocks_completed++;

            // The canvas for this block starts right after the current prefix.
            const llama_token * canvas = output_tokens.data() + prefix_len;
            const size_t cut = trim_canvas_tokens(app.vocab, canvas, (size_t) app.canvas_length);

            response_tokens.insert(response_tokens.end(), canvas, canvas + cut);

            if ((int) response_tokens.size() >= max_tokens) {
                response_tokens.resize(max_tokens);
                break;
            }

            // A trimmed canvas means the model stopped (or looped); do not continue.
            if (cut < (size_t) app.canvas_length) {
                break;
            }

            // Commit this canvas block and generate next block.
            prefix.insert(prefix.end(), canvas, canvas + cut);
        }
    } else {
        int32_t n_generated = 0;

        diffusion_generate(
            ctx,
            prefix.data(),
            output_tokens.data(),
            n_input,
            diff_params,
            n_generated
        );

        if (n_generated <= n_input) {
            ctx_lease.release();
            throw std::runtime_error("diffusion generation failed");
        }

        response_tokens.assign(output_tokens.begin() + n_input, output_tokens.begin() + n_generated);

        if ((int) response_tokens.size() > max_tokens) {
            response_tokens.resize(max_tokens);
        }

        blocks_completed = 1;
    }

    const auto t_diffusion_end = std::chrono::steady_clock::now();

    if (generation_lock.owns_lock()) {
        generation_lock.unlock();
    }

    // The context is no longer needed once the tokens are copied out.
    ctx_lease.release();

    const auto t_detokenize_start = std::chrono::steady_clock::now();

    std::string raw_text = common_detokenize(app.vocab, response_tokens, false);
    std::string clean_text = app.cfg.expose_raw ? trim_ws(raw_text) : clean_diffusion_output(raw_text);

    const auto t_detokenize_end = std::chrono::steady_clock::now();

    memory_reservation.release();

    uint64_t active_memory_after_release = 0;
    {
        std::lock_guard<std::mutex> mem_guard(app.memory_mutex);
        active_memory_after_release = app.active_request_memory_bytes;
    }

    if (app.cfg.log_raw_output) {
        std::cerr << "\n--- raw diffusion output start ---\n";
        std::cerr << raw_text << "\n";
        std::cerr << "--- raw diffusion output end ---\n";
    }

    const double queue_wait_s = (double) queue_wait_ms / 1000.0;
    const double tokenize_s = seconds_between(t_tokenize_start, t_tokenize_end);
    const double context_s = seconds_between(t_context_start, t_context_end);
    const double diffusion_s = seconds_between(t_diffusion_start, t_diffusion_end);
    const double detokenize_s = seconds_between(t_detokenize_start, t_detokenize_end);
    const double total_s = seconds_between(t_enter, t_detokenize_end);

    const int output_token_count = (int) response_tokens.size();
    // In canvas mode every completed block processes a full canvas, regardless of trimming.
    const int canvas_tokens_processed = app.canvas_length > 0
        ? blocks_completed * app.canvas_length
        : output_token_count;

    json audit;
    audit["event"] = "generation_complete";
    audit["request_id"] = request_id;
    audit["endpoint"] = endpoint;
    audit["model"] = app.model_name;
    audit["model_arch"] = app.model_architecture;
    audit["success"] = true;

    audit["config"] = {
        {"temperature", temperature},
        {"top_p", app.cfg.top_p},
        {"top_k", app.cfg.top_k},
        {"seed", app.cfg.seed},
        {"n_gpu_layers", app.cfg.n_gpu_layers},
        {"n_threads", app.cfg.n_threads},
        {"diffusion_steps", app.cfg.diffusion_steps},
        {"eb_max_denoising_steps", eb_max_denoising_steps},
        {"streaming_enabled", app.cfg.streaming_enabled},
        {"stream_requested", opts.stream},
        {"fifo_queue", app.cfg.fifo_queue},
        {"max_concurrent_requests", app.cfg.max_concurrent_requests},
        {"max_queue_requests", app.cfg.max_queue_requests},
        {"serialize_context_creation", app.cfg.serialize_context_creation},
        {"serialize_generation", app.cfg.serialize_generation},
        {"post_context_memory_check", app.cfg.post_context_memory_check},
        {"context_pool_size", app.cfg.context_pool_size},
        {"context_pool_strict", app.cfg.context_pool_strict},
        {"context_pool_clear_on_release", app.cfg.context_pool_clear_on_release}
    };

    audit["tokens"] = {
        {"prompt_tokens", n_input},
        {"requested_max_tokens", max_tokens},
        {"processing_tokens_requested", opts.processing_tokens},
        {"output_tokens", output_token_count},
        {"canvas_tokens_processed", canvas_tokens_processed}
    };

    audit["diffusion"] = {
        {"canvas_mode", app.canvas_length > 0},
        {"canvas_length", app.canvas_length},
        {"blocks_requested", blocks_requested},
        {"blocks_completed", blocks_completed},
        {"min_required_tokens", min_required_tokens},
        {"ctx_headroom_tokens", opts.ctx_headroom_tokens},
        {"n_ctx", n_ctx},
        {"n_batch", n_batch},
        {"n_ubatch", n_ubatch},
        {"gpu_devices_seen", gpu_devs}
    };

    audit["sizes"] = {
        {"raw_chars", (int) raw_text.size()},
        {"clean_chars", (int) clean_text.size()}
    };

    audit["timings_seconds"] = {
        {"queue_wait", queue_wait_s},
        {"tokenize", tokenize_s},
        {"context_init_lock_wait", context_lock_wait_s},
        {"context_pool_wait", context_pool_wait_s},
        {"context_create", context_s},
        {"generation_lock_wait", generation_lock_wait_s},
        {"diffusion_generate", diffusion_s},
        {"detokenize_and_clean", detokenize_s},
        {"total_request", total_s}
    };

    audit["throughput"] = {
        {"output_tokens_per_sec_total", tokens_per_second(output_token_count, total_s)},
        {"output_tokens_per_sec_generation_only", tokens_per_second(output_token_count, diffusion_s)},
        {"canvas_tokens_per_sec_total", tokens_per_second(canvas_tokens_processed, total_s)},
        {"canvas_tokens_per_sec_generation_only", tokens_per_second(canvas_tokens_processed, diffusion_s)},
        {"raw_chars_per_sec_total", tokens_per_second((int) raw_text.size(), total_s)},
        {"clean_chars_per_sec_total", tokens_per_second((int) clean_text.size(), total_s)}
    };

    const json memory_pressure_after_release = physical_memory_pressure_snapshot(app);

    audit["memory"] = {
        {"model_memory_bytes", app.model_memory_bytes},
        {"model_memory_mib", bytes_to_mib(app.model_memory_bytes)},
        {"estimated_request_memory_bytes", estimated_request_memory_bytes},
        {"estimated_request_memory_mib", bytes_to_mib(estimated_request_memory_bytes)},
        {"queue_ticket", queue_ticket},
        {"active_requests_after_admit", active_requests_after_admit},
        {"queued_requests_after_admit", queued_requests_after_admit},
        {"queued_request_memory_bytes_after_admit", queued_memory_after_admit},
        {"queued_request_memory_mib_after_admit", bytes_to_mib(queued_memory_after_admit)},
        {"active_request_memory_bytes_after_admit", active_memory_after_admit},
        {"active_request_memory_mib_after_admit", bytes_to_mib(active_memory_after_admit)},
        {"active_request_memory_bytes_after_release", active_memory_after_release},
        {"active_request_memory_mib_after_release", bytes_to_mib(active_memory_after_release)},
        {"memory_limit_bytes", app.cfg.memory_limit_bytes},
        {"memory_limit_mib", bytes_to_mib(app.cfg.memory_limit_bytes)},
        {"memory_safety_margin_bytes", app.cfg.memory_safety_margin_bytes},
        {"memory_safety_margin_mib", bytes_to_mib(app.cfg.memory_safety_margin_bytes)},
        {"bytes_per_token", app.cfg.bytes_per_token},
        {"request_base_memory_bytes", app.cfg.request_base_memory_bytes},
        {"request_base_memory_mib", bytes_to_mib(app.cfg.request_base_memory_bytes)},
        {"context_reused", context_reused},
        {"context_pool_slot_id", context_pool_slot_id},
        {"context_pool_size", app.cfg.context_pool_size},
        {"context_pool_n_ctx", app.context_pool_n_ctx_actual},
        {"context_pool_n_batch", app.context_pool_n_batch_actual},
        {"context_pool_n_ubatch", app.context_pool_n_ubatch_actual},
        {"context_pool_memory_bytes", app.context_pool_memory_bytes},
        {"context_pool_memory_mib", bytes_to_mib(app.context_pool_memory_bytes)},
        {"os_total_physical_bytes", os_total_physical_memory_bytes()},
        {"os_total_physical_mib", bytes_to_mib(os_total_physical_memory_bytes())},
        // Same sentinel normalization as the admission record above.
        {"os_available_physical_bytes_at_admit", os_available_at_admit_reported},
        {"os_available_physical_mib_at_admit", os_available_mib_at_admit_reported},
        {"gpu_available_budget_bytes_at_admit", gpu_available_budget_at_admit},
        {"gpu_available_budget_mib_at_admit", bytes_to_mib(gpu_available_budget_at_admit)},
        {"gpu_memory_detected", app.gpu_memory_detected},
        {"gpu_memory_source", app.gpu_memory_source},
        {"gpu_memory_policy", app.cfg.gpu_memory_policy},
        {"gpu_adapter_name", app.gpu_adapter_name},
        {"gpu_dedicated_memory_bytes", app.gpu_dedicated_memory_bytes},
        {"gpu_dedicated_memory_mib", bytes_to_mib(app.gpu_dedicated_memory_bytes)},
        {"gpu_budget_bytes_at_start", app.gpu_budget_bytes},
        {"gpu_budget_mib_at_start", bytes_to_mib(app.gpu_budget_bytes)},
        {"gpu_dynamic_memory_guard", app.cfg.gpu_dynamic_memory_guard},
        {"memory_limit_auto_detected", app.cfg.memory_limit_auto_detected},
        {"memory_pressure_after_release", memory_pressure_after_release},
        {"memory_status_after_release", memory_status_snapshot(app)}
    };

    audit_log(app, audit);

    GenerationResult result;
    result.text = clean_text;
    result.raw_text = raw_text;
    result.prompt_tokens = n_input;
    result.output_tokens = output_token_count;
    result.raw_chars = (int) raw_text.size();
    result.clean_chars = (int) clean_text.size();
    result.requested_max_tokens = max_tokens;
    result.processing_tokens_requested = opts.processing_tokens;
    result.ctx_headroom_tokens = opts.ctx_headroom_tokens;
    result.min_required_tokens = min_required_tokens;
    result.n_ctx = n_ctx;
    result.n_batch = n_batch;
    result.n_ubatch = n_ubatch;
    result.blocks_requested = blocks_requested;
    result.blocks_completed = blocks_completed;
    result.canvas_tokens_processed = canvas_tokens_processed;
    result.total_seconds = total_s;
    result.generation_seconds = diffusion_s;
    result.estimated_request_memory_bytes = estimated_request_memory_bytes;
    result.model_memory_bytes = app.model_memory_bytes;
    result.active_request_memory_bytes_after_admit = active_memory_after_admit;
    result.active_request_memory_bytes_after_release = active_memory_after_release;
    result.memory_limit_bytes = app.cfg.memory_limit_bytes;
    result.memory_safety_margin_bytes = app.cfg.memory_safety_margin_bytes;
    result.os_total_physical_bytes = os_total_physical_memory_bytes();
    // The result keeps the raw value (including the sentinel) so callers see what
    // the reservation step reported.
    result.os_available_physical_bytes_at_admit = os_available_at_admit;
    result.gpu_available_budget_bytes_at_admit = gpu_available_budget_at_admit;
    result.queue_wait_ms = queue_wait_ms;
    result.queue_ticket = queue_ticket;
    result.active_requests_after_admit = active_requests_after_admit;
    result.queued_requests_after_admit = queued_requests_after_admit;
    result.queued_request_memory_bytes_after_admit = queued_memory_after_admit;
    result.context_init_lock_seconds = context_lock_wait_s;
    result.generation_lock_wait_seconds = generation_lock_wait_s;
    result.context_pool_wait_seconds = context_pool_wait_s;
    result.context_reused = context_reused;
    result.context_pool_slot_id = context_pool_slot_id;

    return result;
}


// Splits `text` into consecutive chunks of about `chunk_chars` bytes for SSE streaming.
//
// A chunk never ends in the middle of a UTF-8 multi-byte character: when the nominal
// cut point lands on continuation bytes (10xxxxxx), the chunk is extended by up to
// three bytes to include them. Cutting by raw byte offset produced chunks that were
// not valid UTF-8 on their own, which JSON serialization of a chunk rejects. For
// ASCII text the chunks are exactly `chunk_chars` bytes, as before.
//
// A non-positive `chunk_chars` falls back to 96. Empty input yields a single empty
// chunk so callers always emit at least one content event. Concatenating the chunks
// reproduces `text`.
static std::vector<std::string> split_text_chunks(const std::string & text, int chunk_chars) {
    std::vector<std::string> chunks;

    if (chunk_chars <= 0) {
        chunk_chars = 96;
    }

    const size_t step = (size_t) chunk_chars;
    const size_t total = text.size();

    const auto is_continuation_byte = [](char c) {
        return (static_cast<unsigned char>(c) & 0xC0) == 0x80;
    };

    size_t begin = 0;
    while (begin < total) {
        size_t end = (total - begin > step) ? begin + step : total;

        // A UTF-8 character has at most three continuation bytes; the cap keeps
        // malformed input from swallowing the rest of the text into one chunk.
        size_t extended = 0;
        while (end < total && extended < 3 && is_continuation_byte(text[end])) {
            end++;
            extended++;
        }

        chunks.push_back(text.substr(begin, end - begin));
        begin = end;
    }

    if (chunks.empty()) {
        chunks.push_back("");
    }

    return chunks;
}

static std::string make_openai_chat_stream_response(
    const std::string & model_name,
    uint64_t request_id,
    const GenerationResult & result,
    int chunk_chars
) {
    const int64_t created = (int64_t) std::time(nullptr);
    const std::string id = "chatcmpl-diffusion-local-" + std::to_string(request_id);
    std::ostringstream out;

    json first;
    first["id"] = id;
    first["object"] = "chat.completion.chunk";
    first["created"] = created;
    first["model"] = model_name;
    first["choices"] = json::array({{
        {"index", 0},
        {"delta", {{"role", "assistant"}}},
        {"finish_reason", nullptr}
    }});
    out << "data: " << first.dump() << "\n\n";

    for (const std::string & chunk : split_text_chunks(result.text, chunk_chars)) {
        json j;
        j["id"] = id;
        j["object"] = "chat.completion.chunk";
        j["created"] = created;
        j["model"] = model_name;
        j["choices"] = json::array({{
            {"index", 0},
            {"delta", {{"content", chunk}}},
            {"finish_reason", nullptr}
        }});
        out << "data: " << j.dump() << "\n\n";
    }

    json last;
    last["id"] = id;
    last["object"] = "chat.completion.chunk";
    last["created"] = created;
    last["model"] = model_name;
    last["choices"] = json::array({{
        {"index", 0},
        {"delta", json::object()},
        {"finish_reason", "stop"}
    }});
    last["usage"] = {
        {"prompt_tokens", result.prompt_tokens},
        {"completion_tokens", result.output_tokens},
        {"total_tokens", result.prompt_tokens + result.output_tokens}
    };
    out << "data: " << last.dump() << "\n\n";
    out << "data: [DONE]\n\n";

    return out.str();
}

static std::string make_completion_stream_response(
    const std::string & model_name,
    uint64_t request_id,
    const GenerationResult & result,
    int chunk_chars
) {
    const int64_t created = (int64_t) std::time(nullptr);
    const std::string id = "cmpl-diffusion-local-" + std::to_string(request_id);
    std::ostringstream out;

    for (const std::string & chunk : split_text_chunks(result.text, chunk_chars)) {
        json j;
        j["id"] = id;
        j["object"] = "text_completion.chunk";
        j["created"] = created;
        j["model"] = model_name;
        j["choices"] = json::array({{
            {"text", chunk},
            {"index", 0},
            {"finish_reason", nullptr}
        }});
        out << "data: " << j.dump() << "\n\n";
    }

    json last;
    last["id"] = id;
    last["object"] = "text_completion.chunk";
    last["created"] = created;
    last["model"] = model_name;
    last["choices"] = json::array({{
        {"text", ""},
        {"index", 0},
        {"finish_reason", "stop"}
    }});
    last["usage"] = {
        {"prompt_tokens", result.prompt_tokens},
        {"completion_tokens", result.output_tokens},
        {"total_tokens", result.prompt_tokens + result.output_tokens}
    };
    out << "data: " << last.dump() << "\n\n";
    out << "data: [DONE]\n\n";

    return out.str();
}

// Assembles the non-streaming "chat.completion" reply for one finished request.
//
// The reply carries a single choice (index 0) holding the assistant message with the
// generated text and finish_reason "stop", followed by the token usage counters.
//
// Parameters:
//   model_name  value written to the "model" field
//   request_id  appended to "chatcmpl-diffusion-local-" to form the response id
//   result      generated text plus prompt/output token counts
static json make_openai_chat_response(
    const std::string & model_name,
    uint64_t request_id,
    const GenerationResult & result
) {
    json reply;
    reply["id"] = "chatcmpl-diffusion-local-" + std::to_string(request_id);
    reply["object"] = "chat.completion";
    reply["created"] = (int64_t) std::time(nullptr);
    reply["model"] = model_name;

    // The assistant message wrapped by the one and only choice.
    json message;
    message["role"] = "assistant";
    message["content"] = result.text;

    json choice;
    choice["index"] = 0;
    choice["message"] = message;
    choice["finish_reason"] = "stop";

    reply["choices"] = json::array({choice});

    json token_usage;
    token_usage["prompt_tokens"] = result.prompt_tokens;
    token_usage["completion_tokens"] = result.output_tokens;
    token_usage["total_tokens"] = result.prompt_tokens + result.output_tokens;
    reply["usage"] = token_usage;

    return reply;
}

// Assembles the non-streaming "text_completion" reply for one finished request.
//
// The reply carries a single choice (index 0) with the generated text and
// finish_reason "stop", followed by the token usage counters.
//
// Parameters:
//   model_name  value written to the "model" field
//   request_id  appended to "cmpl-diffusion-local-" to form the response id
//   result      generated text plus prompt/output token counts
static json make_completion_response(
    const std::string & model_name,
    uint64_t request_id,
    const GenerationResult & result
) {
    json reply;
    reply["id"] = "cmpl-diffusion-local-" + std::to_string(request_id);
    reply["object"] = "text_completion";
    reply["created"] = (int64_t) std::time(nullptr);
    reply["model"] = model_name;

    json choice;
    choice["text"] = result.text;
    choice["index"] = 0;
    choice["finish_reason"] = "stop";

    reply["choices"] = json::array({choice});

    json token_usage;
    token_usage["prompt_tokens"] = result.prompt_tokens;
    token_usage["completion_tokens"] = result.output_tokens;
    token_usage["total_tokens"] = result.prompt_tokens + result.output_tokens;
    reply["usage"] = token_usage;

    return reply;
}

// Prints the command-line synopsis to stderr.
//
// Keep this list in sync with parse_args(): every option accepted there should be
// discoverable here. Aliases and the redundant positive form of some on/off toggles
// are intentionally left out to keep the synopsis readable.
static void usage() {
    std::cerr
        << "Usage:\n"
        << "  llama-diffusion-http -m MODEL.gguf [--host 127.0.0.1] [--port 8081]\n"
        << "                       [-ngl 99] [-t 20] [-n 256]\n"
        << "                       [--temp F] [--top-p F] [--top-k N] [--seed N]\n"
        << "                       [--diffusion-steps N] [--no-chat-template]\n"
        << "                       [--raw] [--log-raw]\n"
        << "                       [--audit-log PATH] [--no-audit]\n"
        << "                       [--audit-summary] [--audit-full] [--no-audit-memory-pressure]\n"
        << "                       [--no-streaming] [--stream-chunk-chars N]\n"
        << "                       [--processing-tokens N] [--ctx-headroom-tokens N]\n"
        << "                       [--n-ctx N] [--n-batch N] [--n-ubatch N]\n"
        << "                       [--context-pool-size N] [--context-pool-n-ctx N]\n"
        << "                       [--context-pool-n-batch N] [--context-pool-n-ubatch N]\n"
        << "                       [--context-pool-strict] [--no-context-pool-clear]\n"
        << "                       [--max-concurrent N]\n"
        << "                       [--memory-limit-mb N] [--memory-safety-margin-mb N]\n"
        << "                       [--model-memory-mb N]\n"
        << "                       [--bytes-per-token N] [--token-memory-kb N]\n"
        << "                       [--request-base-memory-mb N] [--queue-timeout-ms N]\n"
        << "                       [--max-queue-requests N] [--max-queue-memory-mb N]\n"
        << "                       [--fifo-queue|--no-fifo-queue]\n"
        << "                       [--serialize-context-creation|--no-serialize-context-creation]\n"
        << "                       [--serialize-generation|--parallel-generation]\n"
        << "                       [--post-context-memory-check|--no-post-context-memory-check]\n"
        << "                       [--gpu-memory-policy largest|sum] [--no-gpu-memory-guard]\n"
        << "                       [--http-worker-threads N]\n"
        << "                       [-h|--help]\n";
}

static bool parse_args(int argc, char ** argv, ServerConfig & cfg) {
    for (int i = 1; i < argc; ++i) {
        std::string a = argv[i];

        auto need_value = [&](const std::string & name) -> std::string {
            if (i + 1 >= argc) {
                throw std::runtime_error("missing value for " + name);
            }
            return argv[++i];
        };

        if (a == "-m" || a == "--model") {
            cfg.model_path = need_value(a);
        } else if (a == "--host") {
            cfg.host = need_value(a);
        } else if (a == "--port") {
            cfg.port = std::stoi(need_value(a));
        } else if (a == "-ngl" || a == "--n-gpu-layers") {
            cfg.n_gpu_layers = std::stoi(need_value(a));
        } else if (a == "-t" || a == "--threads") {
            cfg.n_threads = std::stoi(need_value(a));
        } else if (a == "-n" || a == "--max-tokens") {
            cfg.default_max_tokens = std::stoi(need_value(a));
        } else if (a == "--temp" || a == "--temperature") {
            cfg.temperature = std::stof(need_value(a));
        } else if (a == "--top-p") {
            cfg.top_p = std::stof(need_value(a));
        } else if (a == "--top-k") {
            cfg.top_k = std::stoi(need_value(a));
        } else if (a == "--seed") {
            cfg.seed = std::stoi(need_value(a));
        } else if (a == "--diffusion-steps") {
            cfg.diffusion_steps = std::stoi(need_value(a));
        } else if (a == "--no-chat-template") {
            cfg.use_chat_template = false;
        } else if (a == "--raw") {
            cfg.expose_raw = true;
        } else if (a == "--log-raw") {
            cfg.log_raw_output = true;
        } else if (a == "--audit-log") {
            cfg.audit_log_path = need_value(a);
        } else if (a == "--no-audit") {
            cfg.audit_enabled = false;
        } else if (a == "--audit-summary" || a == "--audit-compact") {
            cfg.audit_summary = true;
        } else if (a == "--audit-full" || a == "--audit-verbose") {
            cfg.audit_summary = false;
        } else if (a == "--no-audit-memory-pressure") {
            cfg.audit_memory_pressure = false;
        } else if (a == "--audit-memory-pressure") {
            cfg.audit_memory_pressure = true;
        } else if (a == "--no-streaming") {
            cfg.streaming_enabled = false;
        } else if (a == "--streaming") {
            cfg.streaming_enabled = true;
        } else if (a == "--stream-chunk-chars") {
            cfg.stream_chunk_chars = std::stoi(need_value(a));
        } else if (a == "--processing-tokens" || a == "--allocated-tokens" || a == "--alloc-tokens") {
            cfg.default_processing_tokens = std::stoi(need_value(a));
        } else if (a == "--ctx-headroom-tokens" || a == "--ctx-headroom") {
            cfg.ctx_headroom_tokens = std::stoi(need_value(a));
        } else if (a == "--n-ctx") {
            cfg.default_n_ctx = std::stoi(need_value(a));
        } else if (a == "--n-batch") {
            cfg.default_n_batch = std::stoi(need_value(a));
        } else if (a == "--n-ubatch") {
            cfg.default_n_ubatch = std::stoi(need_value(a));
        } else if (a == "--context-pool-size" || a == "--ctx-pool-size") {
            cfg.context_pool_size = std::stoi(need_value(a));
        } else if (a == "--context-pool-n-ctx" || a == "--ctx-pool-n-ctx") {
            cfg.context_pool_n_ctx = std::stoi(need_value(a));
        } else if (a == "--context-pool-n-batch" || a == "--ctx-pool-n-batch") {
            cfg.context_pool_n_batch = std::stoi(need_value(a));
        } else if (a == "--context-pool-n-ubatch" || a == "--ctx-pool-n-ubatch") {
            cfg.context_pool_n_ubatch = std::stoi(need_value(a));
        } else if (a == "--context-pool-strict") {
            cfg.context_pool_strict = true;
        } else if (a == "--no-context-pool-strict") {
            cfg.context_pool_strict = false;
        } else if (a == "--no-context-pool-clear") {
            cfg.context_pool_clear_on_release = false;
        } else if (a == "--context-pool-clear") {
            cfg.context_pool_clear_on_release = true;
        } else if (a == "--max-concurrent" || a == "--max-concurrent-requests") {
            cfg.max_concurrent_requests = std::stoi(need_value(a));
        } else if (a == "--memory-limit-mb" || a == "--mem-limit-mb") {
            cfg.memory_limit_bytes = mb_to_bytes((uint64_t) std::stoull(need_value(a)));
            cfg.memory_limit_auto_detected = false;
        } else if (a == "--gpu-memory-policy") {
            cfg.gpu_memory_policy = need_value(a);
            if (cfg.gpu_memory_policy != "largest" && cfg.gpu_memory_policy != "sum") {
                throw std::runtime_error("--gpu-memory-policy must be 'largest' or 'sum'");
            }
        } else if (a == "--no-gpu-memory-guard") {
            cfg.gpu_dynamic_memory_guard = false;
        } else if (a == "--gpu-memory-guard") {
            cfg.gpu_dynamic_memory_guard = true;
        } else if (a == "--memory-safety-margin-mb" || a == "--mem-safety-margin-mb") {
            cfg.memory_safety_margin_bytes = mb_to_bytes((uint64_t) std::stoull(need_value(a)));
        } else if (a == "--model-memory-mb") {
            cfg.model_memory_bytes_override = mb_to_bytes((uint64_t) std::stoull(need_value(a)));
        } else if (a == "--bytes-per-token") {
            cfg.bytes_per_token = (uint64_t) std::stoull(need_value(a));
        } else if (a == "--token-memory-kb") {
            cfg.bytes_per_token = kb_to_bytes((uint64_t) std::stoull(need_value(a)));
        } else if (a == "--request-base-memory-mb") {
            cfg.request_base_memory_bytes = mb_to_bytes((uint64_t) std::stoull(need_value(a)));
        } else if (a == "--queue-timeout-ms") {
            cfg.queue_timeout_ms = (uint64_t) std::stoull(need_value(a));
        } else if (a == "--max-queue-requests") {
            cfg.max_queue_requests = std::stoi(need_value(a));
        } else if (a == "--max-queue-memory-mb") {
            cfg.max_queue_memory_bytes = mb_to_bytes((uint64_t) std::stoull(need_value(a)));
        } else if (a == "--fifo-queue") {
            cfg.fifo_queue = true;
        } else if (a == "--no-fifo-queue") {
            cfg.fifo_queue = false;
        } else if (a == "--serialize-context-creation") {
            cfg.serialize_context_creation = true;
        } else if (a == "--no-serialize-context-creation") {
            cfg.serialize_context_creation = false;
        } else if (a == "--serialize-generation") {
            cfg.serialize_generation = true;
        } else if (a == "--parallel-generation") {
            cfg.serialize_generation = false;
        } else if (a == "--post-context-memory-check") {
            cfg.post_context_memory_check = true;
        } else if (a == "--no-post-context-memory-check") {
            cfg.post_context_memory_check = false;
        } else if (a == "--http-worker-threads") {
            cfg.http_worker_threads = std::stoi(need_value(a));
        } else if (a == "-h" || a == "--help") {
            usage();
            return false;
        } else {
            throw std::runtime_error("unknown argument: " + a);
        }
    }

    if (cfg.model_path.empty()) {
        usage();
        throw std::runtime_error("missing -m MODEL.gguf");
    }

    return true;
}

int main(int argc, char ** argv) {
    App app;

    try {
        if (!parse_args(argc, argv, app.cfg)) {
            return 0;
        }

        std::setlocale(LC_NUMERIC, "C");
        ggml_time_init();
        common_init();
        llama_backend_init();

        llama_model_params model_params = llama_model_default_params();
        model_params.n_gpu_layers = app.cfg.n_gpu_layers;

        std::cerr << "Loading model once: " << app.cfg.model_path << "\n";

        app.model = llama_model_load_from_file(app.cfg.model_path.c_str(), model_params);
        if (!app.model) {
            throw std::runtime_error("failed to load model");
        }

        app.model_memory_bytes = app.cfg.model_memory_bytes_override > 0
            ? app.cfg.model_memory_bytes_override
            : file_size_bytes(app.cfg.model_path);

        const GpuMemoryInfo startup_gpu = detect_gpu_memory_info(app.cfg.gpu_memory_policy);
        app.gpu_memory_detected = startup_gpu.valid;
        app.gpu_memory_source = startup_gpu.source;
        app.gpu_adapter_name = startup_gpu.adapter_name;
        app.gpu_adapter_count = startup_gpu.adapter_count;
        app.gpu_dedicated_memory_bytes = startup_gpu.dedicated_video_memory_bytes;
        app.gpu_budget_bytes = startup_gpu.budget_bytes;
        app.gpu_current_usage_bytes_at_start = startup_gpu.current_usage_bytes;
        app.gpu_available_budget_bytes_at_start = startup_gpu.available_budget_bytes;

        if (app.cfg.memory_limit_bytes == 0 && startup_gpu.valid) {
            app.cfg.memory_limit_bytes = startup_gpu.budget_bytes > 0
                ? startup_gpu.budget_bytes
                : startup_gpu.dedicated_video_memory_bytes;
            app.cfg.memory_limit_auto_detected = true;
        }

        if (!llama_model_is_diffusion(app.model)) {
            throw std::runtime_error("model is not a llama.cpp diffusion model");
        }

        app.model_architecture = get_meta_string(app.model, "general.architecture", app.model_architecture);
        app.model_name = get_meta_string(app.model, "general.name", "");
        if (app.model_name.empty()) {
            app.model_name = basename_from_path(app.cfg.model_path);
        }

        std::cerr << "Model name = " << app.model_name << "\n";
        std::cerr << "Model architecture = " << app.model_architecture << "\n";

        app.vocab = llama_model_get_vocab(app.model);

        char canvas_str[32] = {};
        if (llama_model_meta_val_str(app.model, "diffusion.canvas_length", canvas_str, sizeof(canvas_str)) >= 0) {
            app.canvas_length = (int32_t) strtol(canvas_str, nullptr, 10);
        }

        if (app.canvas_length > 0) {
            llama_diffusion_set_sc(app.model, nullptr, 0.0f, 1.0f, true);
        }

        std::cerr << "Diffusion canvas_length = " << app.canvas_length << "\n";

        init_context_pool(app);

        if (app.cfg.context_pool_size > 0) {
            std::cerr << "Model stays loaded; requests lease isolated warm contexts from the pool when they fit.\n";
        } else {
            std::cerr << "Model stays loaded; every request gets a fresh llama_context.\n";
        }

        json startup_audit;
        startup_audit["event"] = "server_start";
        startup_audit["model"] = app.model_name;
        startup_audit["model_name"] = app.model_name;
        startup_audit["model_arch"] = app.model_architecture;
        startup_audit["model_path"] = app.cfg.model_path;
        startup_audit["host"] = app.cfg.host;
        startup_audit["port"] = app.cfg.port;
        startup_audit["canvas_length"] = app.canvas_length;
        startup_audit["n_gpu_layers"] = app.cfg.n_gpu_layers;
        startup_audit["n_threads"] = app.cfg.n_threads;
        startup_audit["default_max_tokens"] = app.cfg.default_max_tokens;
        startup_audit["diffusion_steps"] = app.cfg.diffusion_steps;
        startup_audit["streaming_enabled"] = app.cfg.streaming_enabled;
        startup_audit["stream_chunk_chars"] = app.cfg.stream_chunk_chars;
        startup_audit["default_processing_tokens"] = app.cfg.default_processing_tokens;
        startup_audit["ctx_headroom_tokens"] = app.cfg.ctx_headroom_tokens;
        startup_audit["default_n_ctx"] = app.cfg.default_n_ctx;
        startup_audit["default_n_batch"] = app.cfg.default_n_batch;
        startup_audit["default_n_ubatch"] = app.cfg.default_n_ubatch;
        startup_audit["context_pool_size"] = app.cfg.context_pool_size;
        startup_audit["context_pool_strict"] = app.cfg.context_pool_strict;
        startup_audit["context_pool_clear_on_release"] = app.cfg.context_pool_clear_on_release;
        startup_audit["context_pool_n_ctx"] = app.context_pool_n_ctx_actual;
        startup_audit["context_pool_n_batch"] = app.context_pool_n_batch_actual;
        startup_audit["context_pool_n_ubatch"] = app.context_pool_n_ubatch_actual;
        startup_audit["context_pool_memory_bytes"] = app.context_pool_memory_bytes;
        startup_audit["context_pool_memory_mib"] = bytes_to_mib(app.context_pool_memory_bytes);
        startup_audit["max_concurrent_requests"] = app.cfg.max_concurrent_requests;
        startup_audit["model_memory_bytes"] = app.model_memory_bytes;
        startup_audit["model_memory_mib"] = bytes_to_mib(app.model_memory_bytes);
        startup_audit["memory_limit_bytes"] = app.cfg.memory_limit_bytes;
        startup_audit["memory_limit_mib"] = bytes_to_mib(app.cfg.memory_limit_bytes);
        startup_audit["memory_safety_margin_bytes"] = app.cfg.memory_safety_margin_bytes;
        startup_audit["memory_safety_margin_mib"] = bytes_to_mib(app.cfg.memory_safety_margin_bytes);
        startup_audit["bytes_per_token"] = app.cfg.bytes_per_token;
        startup_audit["request_base_memory_bytes"] = app.cfg.request_base_memory_bytes;
        startup_audit["request_base_memory_mib"] = bytes_to_mib(app.cfg.request_base_memory_bytes);
        startup_audit["queue_timeout_ms"] = app.cfg.queue_timeout_ms;
        startup_audit["fifo_queue"] = app.cfg.fifo_queue;
        startup_audit["max_queue_requests"] = app.cfg.max_queue_requests;
        startup_audit["max_queue_memory_bytes"] = app.cfg.max_queue_memory_bytes;
        startup_audit["max_queue_memory_mib"] = bytes_to_mib(app.cfg.max_queue_memory_bytes);
        startup_audit["serialize_context_creation"] = app.cfg.serialize_context_creation;
        startup_audit["serialize_generation"] = app.cfg.serialize_generation;
        startup_audit["post_context_memory_check"] = app.cfg.post_context_memory_check;
        startup_audit["http_worker_threads"] = app.cfg.http_worker_threads;
        startup_audit["gpu_memory_detected"] = app.gpu_memory_detected;
        startup_audit["gpu_memory_source"] = app.gpu_memory_source;
        startup_audit["gpu_memory_policy"] = app.cfg.gpu_memory_policy;
        startup_audit["gpu_adapter_name"] = app.gpu_adapter_name;
        startup_audit["gpu_adapter_count"] = app.gpu_adapter_count;
        startup_audit["gpu_dedicated_memory_bytes"] = app.gpu_dedicated_memory_bytes;
        startup_audit["gpu_dedicated_memory_mib"] = bytes_to_mib(app.gpu_dedicated_memory_bytes);
        startup_audit["gpu_budget_bytes"] = app.gpu_budget_bytes;
        startup_audit["gpu_budget_mib"] = bytes_to_mib(app.gpu_budget_bytes);
        startup_audit["gpu_current_usage_bytes_at_start"] = app.gpu_current_usage_bytes_at_start;
        startup_audit["gpu_current_usage_mib_at_start"] = bytes_to_mib(app.gpu_current_usage_bytes_at_start);
        startup_audit["gpu_available_budget_bytes_at_start"] = app.gpu_available_budget_bytes_at_start;
        startup_audit["gpu_available_budget_mib_at_start"] = bytes_to_mib(app.gpu_available_budget_bytes_at_start);
        startup_audit["gpu_dynamic_memory_guard"] = app.cfg.gpu_dynamic_memory_guard;
        startup_audit["memory_limit_auto_detected"] = app.cfg.memory_limit_auto_detected;
        startup_audit["audit_summary"] = app.cfg.audit_summary;
        startup_audit["audit_memory_pressure"] = app.cfg.audit_memory_pressure;
        startup_audit["os_total_physical_bytes"] = os_total_physical_memory_bytes();
        startup_audit["os_total_physical_mib"] = bytes_to_mib(os_total_physical_memory_bytes());
        startup_audit["os_available_physical_bytes"] = os_available_physical_memory_bytes();
        startup_audit["os_available_physical_mib"] = bytes_to_mib(os_available_physical_memory_bytes());
        audit_log(app, startup_audit);

        httplib::Server server;

        if (app.cfg.http_worker_threads > 0) {
            server.new_task_queue = [&app] {
                return new httplib::ThreadPool((size_t) app.cfg.http_worker_threads);
            };
        }

        server.Get("/health", [](const httplib::Request &, httplib::Response & res) {
            res.set_content("{\"status\":\"ok\"}", "application/json");
        });

        server.Get("/memory", [&](const httplib::Request &, httplib::Response & res) {
            json j = memory_status_snapshot(app);
            j["status"] = "ok";
            res.set_content(j.dump(), "application/json");
        });

        server.Get("/v1/models", [&](const httplib::Request &, httplib::Response & res) {
            json j;
            j["object"] = "list";
            j["data"] = json::array({
                {
                    {"id", app.model_name},
                    {"object", "model"},
                    {"owned_by", "local"},
                    {"architecture", app.model_architecture},
                    {"path", app.cfg.model_path}
                }
            });

            res.set_content(j.dump(), "application/json");
        });

        server.Post("/v1/chat/completions", [&](const httplib::Request & req, httplib::Response & res) {
            const uint64_t request_id = app.next_request_id.fetch_add(1);

            try {
                json body = json::parse(req.body);

                RequestOptions opts = request_options_from_body(app, body, false);

                json begin;
                begin["event"] = "request_begin";
                begin["request_id"] = request_id;
                begin["endpoint"] = "/v1/chat/completions";
                begin["max_tokens"] = opts.max_tokens;
                begin["temperature"] = opts.temperature;
                begin["stream"] = opts.stream;
                begin["processing_tokens"] = opts.processing_tokens;
                begin["n_ctx"] = opts.n_ctx;
                begin["n_batch"] = opts.n_batch;
                begin["n_ubatch"] = opts.n_ubatch;
                begin["ctx_headroom_tokens"] = opts.ctx_headroom_tokens;
                begin["scheduler"] = memory_status_snapshot(app);
                audit_log(app, begin);

                if (opts.stream && !app.cfg.streaming_enabled) {
                    res.status = 400;
                    res.set_content(json_error("stream=true requested, but streaming is disabled by --no-streaming", 400), "application/json");
                    return;
                }

                std::string prompt = messages_to_prompt(app.model, body.at("messages"), app.cfg.use_chat_template);

                GenerationResult result = run_one_request(
                    app,
                    request_id,
                    "/v1/chat/completions",
                    prompt,
                    opts
                );

                if (opts.stream) {
                    res.set_header("Cache-Control", "no-cache");
                    res.set_header("Connection", "keep-alive");
                    res.set_content(make_openai_chat_stream_response(app.model_name, request_id, result, opts.stream_chunk_chars), "text/event-stream");
                } else {
                    json out = make_openai_chat_response(app.model_name, request_id, result);
                    res.set_content(out.dump(), "application/json");
                }
            } catch (const std::exception & e) {
                json err;
                err["event"] = "request_error";
                err["request_id"] = request_id;
                err["endpoint"] = "/v1/chat/completions";
                err["error"] = e.what();
                audit_log(app, err);

                res.status = 500;
                res.set_content(json_error(e.what()), "application/json");
            }
        });

        server.Post("/completion", [&](const httplib::Request & req, httplib::Response & res) {
            const uint64_t request_id = app.next_request_id.fetch_add(1);

            try {
                json body = json::parse(req.body);

                RequestOptions opts = request_options_from_body(app, body, true);

                json begin;
                begin["event"] = "request_begin";
                begin["request_id"] = request_id;
                begin["endpoint"] = "/completion";
                begin["max_tokens"] = opts.max_tokens;
                begin["temperature"] = opts.temperature;
                begin["stream"] = opts.stream;
                begin["processing_tokens"] = opts.processing_tokens;
                begin["n_ctx"] = opts.n_ctx;
                begin["n_batch"] = opts.n_batch;
                begin["n_ubatch"] = opts.n_ubatch;
                begin["ctx_headroom_tokens"] = opts.ctx_headroom_tokens;
                begin["scheduler"] = memory_status_snapshot(app);
                audit_log(app, begin);

                if (opts.stream && !app.cfg.streaming_enabled) {
                    res.status = 400;
                    res.set_content(json_error("stream=true requested, but streaming is disabled by --no-streaming", 400), "application/json");
                    return;
                }

                std::string prompt = completion_prompt_from_body(body);

                GenerationResult result = run_one_request(
                    app,
                    request_id,
                    "/completion",
                    prompt,
                    opts
                );

                if (opts.stream) {
                    res.set_header("Cache-Control", "no-cache");
                    res.set_header("Connection", "keep-alive");
                    res.set_content(make_completion_stream_response(app.model_name, request_id, result, opts.stream_chunk_chars), "text/event-stream");
                } else {
                    json out = make_completion_response(app.model_name, request_id, result);
                    res.set_content(out.dump(), "application/json");
                }
            } catch (const std::exception & e) {
                json err;
                err["event"] = "request_error";
                err["request_id"] = request_id;
                err["endpoint"] = "/completion";
                err["error"] = e.what();
                audit_log(app, err);

                res.status = 500;
                res.set_content(json_error(e.what()), "application/json");
            }
        });

        std::cerr << "Listening on http://" << app.cfg.host << ":" << app.cfg.port << "\n";

        if (!server.listen(app.cfg.host, app.cfg.port)) {
            throw std::runtime_error("failed to bind server");
        }

        free_context_pool(app);
        llama_model_free(app.model);
        llama_backend_free();

        return 0;
    } catch (const std::exception & e) {
        std::cerr << "fatal: " << e.what() << "\n";

        free_context_pool(app);

        if (app.model) {
            llama_model_free(app.model);
        }

        llama_backend_free();
        return 1;
    }
}
