From 3ab67e80e16996faf41579a4b16d10477ae11133 Mon Sep 17 00:00:00 2001 From: Mike Mueller Date: Fri, 22 May 2026 21:41:54 +0200 Subject: [PATCH] Implement performance monitor --- docs/AGENT-API-AUTH.md | 11 +- src/api/admin/i18n.rs | 131 ++++++++++ src/api/admin/pages/devices.rs | 430 ++++++++++++++++++++++++++++++++- src/api/metrics.rs | 183 ++++++++++++++ src/api/mod.rs | 4 + src/api/perf_events.rs | 141 +++++++++++ src/database.rs | 253 +++++++++++++++++++ 7 files changed, 1149 insertions(+), 4 deletions(-) create mode 100644 src/api/metrics.rs create mode 100644 src/api/perf_events.rs diff --git a/docs/AGENT-API-AUTH.md b/docs/AGENT-API-AUTH.md index 57b4788..d912536 100644 --- a/docs/AGENT-API-AUTH.md +++ b/docs/AGENT-API-AUTH.md @@ -1,7 +1,7 @@ # Agent API authentication Reference for the per-device signature gate on the agent-facing HTTP -API. Five endpoints are gated: +API. Seven endpoints are gated: - `POST /api/heartbeat` - `POST /api/sysinfo` @@ -11,6 +11,15 @@ API. Five endpoints are gated: by the agent. Same TOFU lifecycle as heartbeat / sysinfo: stock RustDesk doesn't post here at all, so in practice every caller is a managed agent; the legacy/unsigned path is kept for symmetry. +- `POST /api/agent/metrics` — continuous CPU / memory / top-process + samples (≈1 / minute). Surfaced on the admin Devices detail page as + a 24 h sparkline + live snapshot card. +- `POST /api/agent/perf-events` — sparse Windows-event-log entries + flagged by `Microsoft-Windows-Diagnostics-Performance/Operational`, + `Microsoft-Windows-Resource-Exhaustion-Detector/Operational`, and + hand-picked `System` IDs (41 / 6008 / 1001 — unexpected reboot / + dirty shutdown / BSOD). Server dedups via UNIQUE (peer_id, provider, + record_id). For the operator workflow — turning it on, the dashboard toggle, what happens when a managed agent is uninstalled — see the matching section diff --git a/src/api/admin/i18n.rs b/src/api/admin/i18n.rs index ee89296..f812131 100644 --- a/src/api/admin/i18n.rs +++ b/src/api/admin/i18n.rs @@ -1224,6 +1224,137 @@ pub fn t(lang: Lang, key: &str) -> &'static str { "Istoric autentificări", "Historial de inicio de sesión", ), + "devices.performance" => ( + "Performance", + "Leistung", + "Performances", + "Performanță", + "Rendimiento", + ), + "devices.perf_none" => ( + "No performance data reported yet. The agent collects CPU / memory samples once per minute and Windows-reported performance events as they happen.", + "Noch keine Leistungsdaten gemeldet. Der Agent sammelt CPU-/Speicher-Stichproben einmal pro Minute und die von Windows gemeldeten Leistungsereignisse, sobald sie auftreten.", + "Aucune donnée de performance signalée pour l'instant. L'agent collecte des échantillons CPU / mémoire une fois par minute et les événements de performance signalés par Windows au fur et à mesure.", + "Niciun fel de date de performanță raportate încă. Agentul colectează eșantioane CPU / memorie o dată pe minut și evenimentele de performanță raportate de Windows pe măsură ce se întâmplă.", + "Aún no se han reportado datos de rendimiento. El agente recopila muestras de CPU / memoria una vez por minuto y los eventos de rendimiento reportados por Windows a medida que ocurren.", + ), + "devices.perf_no_live" => ( + "No live snapshot yet — waiting for the agent's first sample.", + "Noch keine Live-Stichprobe – warte auf die erste Probe des Agenten.", + "Pas encore d'instantané en direct — en attente du premier échantillon de l'agent.", + "Niciun instantaneu live încă — se așteaptă prima probă a agentului.", + "Aún no hay instantánea en vivo — esperando la primera muestra del agente.", + ), + "devices.perf_now" => ("Live", "Live", "En direct", "Live", "En vivo"), + "devices.perf_sampled_ago" => ( + "Sampled {0} ago", + "Vor {0} aufgenommen", + "Échantillonné il y a {0}", + "Eșantionat acum {0}", + "Muestreado hace {0}", + ), + "devices.perf_cpu" => ("CPU", "CPU", "Processeur", "CPU", "CPU"), + "devices.perf_mem" => ( + "Memory", + "Speicher", + "Mémoire", + "Memorie", + "Memoria", + ), + "devices.perf_top_cpu" => ( + "Top CPU", + "Top-CPU", + "Plus gros CPU", + "Top CPU", + "Mayor CPU", + ), + "devices.perf_top_mem" => ( + "Top memory", + "Top-Speicher", + "Plus grosse mémoire", + "Top memorie", + "Mayor memoria", + ), + "devices.perf_uptime" => ( + "Uptime", + "Laufzeit", + "Disponibilité", + "Timp activ", + "Tiempo activo", + ), + "devices.perf_proc_count" => ( + "Processes", + "Prozesse", + "Processus", + "Procese", + "Procesos", + ), + "devices.perf_no_chart" => ( + "No samples in the last 24 h", + "Keine Daten in den letzten 24 Std", + "Aucun échantillon ces 24 dernières h", + "Niciun eșantion în ultimele 24 h", + "Sin muestras en las últimas 24 h", + ), + "devices.perf_peak" => ("peak", "Spitze", "max.", "vârf", "máx."), + "devices.perf_latest" => ("last", "letzter", "dernier", "ultim", "último"), + "devices.perf_now_short" => ("now", "jetzt", "maint.", "acum", "ahora"), + "devices.perf_events_heading" => ( + "Recent performance events", + "Aktuelle Leistungsereignisse", + "Événements de performance récents", + "Evenimente recente de performanță", + "Eventos de rendimiento recientes", + ), + "devices.perf_events_none" => ( + "No performance events reported. Windows flags boot / shutdown / sleep slow paths, memory exhaustion, unexpected reboots and BSODs here.", + "Keine Leistungsereignisse gemeldet. Windows markiert hier verlangsamte Start-/Herunterfahren-/Standby-Vorgänge, Speichermangel, unerwartete Neustarts und Bluescreens.", + "Aucun événement de performance signalé. Windows signale ici les démarrages / arrêts / veilles lents, l'épuisement de la mémoire, les redémarrages inattendus et les BSOD.", + "Niciun eveniment de performanță raportat. Windows marchează aici pornirile / opririle / repausurile lente, epuizarea memoriei, repornirile neașteptate și BSOD-urile.", + "No se han reportado eventos de rendimiento. Windows registra aquí arranques / apagados / suspensiones lentos, agotamiento de memoria, reinicios inesperados y BSOD.", + ), + "devices.perf_events_col_when" => ( + "When (UTC)", + "Wann (UTC)", + "Quand (UTC)", + "Când (UTC)", + "Cuándo (UTC)", + ), + "devices.perf_events_col_source" => ( + "Source", + "Quelle", + "Source", + "Sursă", + "Origen", + ), + "devices.perf_events_col_summary" => ( + "Summary", + "Zusammenfassung", + "Résumé", + "Rezumat", + "Resumen", + ), + "devices.perf_src_diag_perf" => ( + "Diag-Perf", + "Diag-Perf", + "Diag-Perf", + "Diag-Perf", + "Diag-Perf", + ), + "devices.perf_src_res_exh" => ( + "Res-Exh", + "Res-Exh", + "Res-Exh", + "Res-Exh", + "Res-Exh", + ), + "devices.perf_src_system" => ( + "System", + "System", + "Système", + "Sistem", + "Sistema", + ), "devices.login_none" => ( "No login events recorded yet. The agent reports logons and logoffs as it observes them.", "Noch keine Anmeldeereignisse aufgezeichnet. Der Agent meldet An- und Abmeldungen, sobald er sie beobachtet.", diff --git a/src/api/admin/pages/devices.rs b/src/api/admin/pages/devices.rs index 5440f35..c476fa9 100644 --- a/src/api/admin/pages/devices.rs +++ b/src/api/admin/pages/devices.rs @@ -6,7 +6,7 @@ use crate::api::admin::i18n::{t, tf1, tf2, tf3, Lang}; use crate::api::error::ApiError; use crate::api::middleware::AuthedUser; use crate::api::state::AppState; -use crate::database::{DashboardDeviceRow, LoginEventRow}; +use crate::database::{DashboardDeviceRow, LoginEventRow, MetricsSampleRow, PerfEventRow}; use axum::extract::{Extension, Form, Path, Query}; use axum::response::Html; use serde::Deserialize; @@ -470,7 +470,36 @@ pub async fn detail( .login_events_for_peer(&d.id, 50) .await .unwrap_or_default(); - render_detail(lang, &d, &events) + // Performance: pull the most recent metrics sample for the + // "right now" card, plus 24 h of samples for the sparkline, + // plus the most recent perf events (boot/shutdown/memory- + // exhaustion etc.) for the "recent slow events" table. + // All three are best-effort — none of them is required for + // the detail page to render meaningfully. + let metrics_latest = state + .db + .metrics_latest(&d.id) + .await + .unwrap_or_default(); + let since_24h = chrono::Utc::now().timestamp() - 24 * 3600; + let metrics_24h = state + .db + .metrics_samples_since(&d.id, since_24h) + .await + .unwrap_or_default(); + let perf_events = state + .db + .perf_events_for_peer(&d.id, 20) + .await + .unwrap_or_default(); + render_detail( + lang, + &d, + &events, + metrics_latest.as_ref(), + &metrics_24h, + &perf_events, + ) } None => format!( r##"
@@ -1142,7 +1171,14 @@ fn fmt_inv_value(v: Option<&serde_json::Value>) -> String { } } -fn render_detail(lang: Lang, d: &DashboardDeviceRow, login_events: &[LoginEventRow]) -> String { +fn render_detail( + lang: Lang, + d: &DashboardDeviceRow, + login_events: &[LoginEventRow], + metrics_latest: Option<&MetricsSampleRow>, + metrics_24h: &[MetricsSampleRow], + perf_events: &[PerfEventRow], +) -> String { let parsed: serde_json::Value = serde_json::from_str(&d.sysinfo_payload).unwrap_or(serde_json::Value::Null); let pick = |k: &str| -> String { @@ -1237,6 +1273,7 @@ fn render_detail(lang: Lang, d: &DashboardDeviceRow, login_events: &[LoginEventR }; let login_section = render_login_events(lang, login_events); + let perf_section = render_performance(lang, metrics_latest, metrics_24h, perf_events); format!( r##"
@@ -1245,6 +1282,8 @@ fn render_detail(lang: Lang, d: &DashboardDeviceRow, login_events: &[LoginEventR
{detail_view}
{header} +

{performance}

+ {perf}

{inventory}

{inv}

{login_history}

@@ -1252,6 +1291,8 @@ fn render_detail(lang: Lang, d: &DashboardDeviceRow, login_events: &[LoginEventR
"##, back = back_button(lang), detail_view = t(lang, "devices.detail_view"), + performance = t(lang, "devices.performance"), + perf = perf_section, inventory = t(lang, "devices.inventory"), header = header, inv = inventory_section, @@ -1260,6 +1301,389 @@ fn render_detail(lang: Lang, d: &DashboardDeviceRow, login_events: &[LoginEventR ) } +/// Top-level Performance section: snapshot card, two sparklines (CPU / +/// memory), and a recent-events table. The whole thing is omitted in +/// favour of a "no data yet" panel when the agent hasn't reported. +fn render_performance( + lang: Lang, + latest: Option<&MetricsSampleRow>, + series: &[MetricsSampleRow], + events: &[PerfEventRow], +) -> String { + if latest.is_none() && series.is_empty() && events.is_empty() { + return format!( + r##"
+ {msg} +
"##, + msg = t(lang, "devices.perf_none"), + ); + } + + let snapshot = render_perf_snapshot(lang, latest); + let cpu_chart = render_sparkline( + lang, + series.iter().map(|s| (s.at, s.cpu_pct)).collect(), + 100.0, + true, + t(lang, "devices.perf_cpu"), + ); + let mem_chart = { + // Mem is reported as MB used / MB total; chart uses % so the + // y-axis stays comparable to the CPU panel. + let series_pct: Vec<(i64, f64)> = series + .iter() + .filter(|s| s.mem_total_mb > 0) + .map(|s| { + let pct = 100.0 * (s.mem_used_mb as f64) / (s.mem_total_mb as f64); + (s.at, pct) + }) + .collect(); + render_sparkline(lang, series_pct, 100.0, true, t(lang, "devices.perf_mem")) + }; + let events_section = render_perf_events_table(lang, events); + + format!( + r##"
+ {snapshot} +
+ {cpu} + {mem} +
+ {events} +
"##, + snapshot = snapshot, + cpu = cpu_chart, + mem = mem_chart, + events = events_section, + ) +} + +/// "Right now" card — the most recent metrics sample. Drawn as a 4-up +/// stat tile so the supporter can glance at CPU / memory / top +/// processes without reading a chart. Falls back to a thin "no live +/// data" pill when the agent has never reported. +fn render_perf_snapshot(lang: Lang, latest: Option<&MetricsSampleRow>) -> String { + let Some(s) = latest else { + return format!( + r##"
+ {msg} +
"##, + msg = t(lang, "devices.perf_no_live"), + ); + }; + let now = chrono::Utc::now().timestamp(); + let age = (now - s.at).max(0); + let age_str = fmt_age(age); + let cpu_color = pct_color(s.cpu_pct); + let mem_pct = if s.mem_total_mb > 0 { + 100.0 * (s.mem_used_mb as f64) / (s.mem_total_mb as f64) + } else { + 0.0 + }; + let mem_color = pct_color(mem_pct); + let mem_used_gb = (s.mem_used_mb as f64) / 1024.0; + let mem_total_gb = (s.mem_total_mb as f64) / 1024.0; + let top_cpu = if s.top_cpu_name.is_empty() { + "—".to_string() + } else { + format!( + "{name} {pct:.0}%", + name = html_escape(&s.top_cpu_name), + pct = s.top_cpu_pct, + ) + }; + let top_mem = if s.top_mem_name.is_empty() { + "—".to_string() + } else { + let mb = s.top_mem_mb; + let mem_disp = if mb >= 1024 { + format!("{:.1} GB", (mb as f64) / 1024.0) + } else { + format!("{} MB", mb) + }; + format!( + "{name} {disp}", + name = html_escape(&s.top_mem_name), + disp = html_escape(&mem_disp), + ) + }; + let uptime_str = if s.uptime_secs > 0 { + fmt_age(s.uptime_secs) + } else { + "—".to_string() + }; + + format!( + r##"
+
+

{l_now}

+ {l_age} +
+
+
+
{l_cpu}
+
{cpu:.0}%
+
+
+
{l_mem}
+
{mem_pct:.0}%
+
{used:.1} / {total:.1} GB
+
+
+
{l_top_cpu}
+
{top_cpu}
+
+
+
{l_top_mem}
+
{top_mem}
+
+
+
{l_uptime}
+
{uptime}
+
+
+
{l_procs}
+
{procs}
+
+
+
"##, + l_now = t(lang, "devices.perf_now"), + l_age = tf1(lang, "devices.perf_sampled_ago", &age_str), + at_full = html_escape(&fmt_unix_utc(s.at)), + l_cpu = t(lang, "devices.perf_cpu"), + cpu_cls = cpu_color, + cpu = s.cpu_pct, + l_mem = t(lang, "devices.perf_mem"), + mem_cls = mem_color, + mem_pct = mem_pct, + used = mem_used_gb, + total = mem_total_gb, + l_top_cpu = t(lang, "devices.perf_top_cpu"), + top_cpu_raw = html_escape(&s.top_cpu_name), + top_cpu = top_cpu, + l_top_mem = t(lang, "devices.perf_top_mem"), + top_mem_raw = html_escape(&s.top_mem_name), + top_mem = top_mem, + l_uptime = t(lang, "devices.perf_uptime"), + uptime = html_escape(&uptime_str), + l_procs = t(lang, "devices.perf_proc_count"), + procs = s.proc_count, + ) +} + +/// Color-code a percentage value (0–100) — green up to 60, amber up to +/// 85, red above. Used for the snapshot stat tiles so the supporter +/// can spot a wedged-laptop at a glance. +fn pct_color(pct: f64) -> &'static str { + if pct >= 85.0 { + "text-rose-400" + } else if pct >= 60.0 { + "text-amber-300" + } else { + "text-emerald-300" + } +} + +/// Render an inline-SVG sparkline. `series` is a (unix-seconds, value) +/// vector; `max_y` clamps the y-axis (so two side-by-side charts share +/// a scale); `bucketed = true` downsamples by averaging into 96 buckets +/// so the polyline string stays short for a wide time window. +fn render_sparkline( + lang: Lang, + series: Vec<(i64, f64)>, + max_y: f64, + bucketed: bool, + title: &str, +) -> String { + const WIDTH: f64 = 600.0; + const HEIGHT: f64 = 80.0; + const PAD: f64 = 4.0; + + if series.is_empty() { + return format!( + r##"
+

{title}

+
{msg}
+
"##, + title = html_escape(title), + msg = t(lang, "devices.perf_no_chart"), + ); + } + + let points = if bucketed && series.len() > 96 { + downsample_avg(&series, 96) + } else { + series.clone() + }; + + let min_x = points.first().map(|p| p.0).unwrap_or(0); + let max_x = points.last().map(|p| p.0).unwrap_or(0); + let span_x = (max_x - min_x).max(1) as f64; + + let plot_w = WIDTH - 2.0 * PAD; + let plot_h = HEIGHT - 2.0 * PAD; + + let mut path = String::new(); + let mut area = String::new(); + let mut peak: f64 = 0.0; + let mut last: f64 = 0.0; + for (i, (t, v)) in points.iter().enumerate() { + let x = PAD + plot_w * ((t - min_x) as f64) / span_x; + let y_norm = (v / max_y).clamp(0.0, 1.0); + let y = PAD + plot_h * (1.0 - y_norm); + if i == 0 { + path.push_str(&format!("M{:.1},{:.1}", x, y)); + area.push_str(&format!("M{:.1},{:.1}", x, PAD + plot_h)); + area.push_str(&format!(" L{:.1},{:.1}", x, y)); + } else { + path.push_str(&format!(" L{:.1},{:.1}", x, y)); + area.push_str(&format!(" L{:.1},{:.1}", x, y)); + } + peak = peak.max(*v); + last = *v; + } + let last_x = PAD + plot_w; + area.push_str(&format!(" L{:.1},{:.1} Z", last_x, PAD + plot_h)); + + // Hours-from-now labels: oldest point's age, "now" on the right. + let span_secs = (max_x - min_x).max(0); + let span_label = if span_secs >= 3600 { + format!("-{}h", span_secs / 3600) + } else if span_secs >= 60 { + format!("-{}m", span_secs / 60) + } else { + format!("-{}s", span_secs) + }; + + format!( + r##"
+
+

{title}

+ {l_peak} {peak:.0}%   {l_now} {last:.0}% +
+ + + + + +
+ {older} + {l_now_short} +
+
"##, + title = html_escape(title), + l_peak = t(lang, "devices.perf_peak"), + peak = peak, + l_now = t(lang, "devices.perf_latest"), + last = last, + w = WIDTH, + h = HEIGHT, + pad = PAD, + ymid = PAD + plot_h * 0.5, + xend = WIDTH - PAD, + area = area, + path = path, + older = html_escape(&span_label), + l_now_short = t(lang, "devices.perf_now_short"), + ) +} + +/// Mean-pool a (timestamp, value) series down to `target` buckets, +/// keeping the bucket-mean timestamp as the bucket's x. Empty buckets +/// are dropped so the resulting polyline doesn't draw zero-lines for +/// stretches where the agent was offline. +fn downsample_avg(series: &[(i64, f64)], target: usize) -> Vec<(i64, f64)> { + if series.len() <= target { + return series.to_vec(); + } + let min_x = series.first().map(|p| p.0).unwrap_or(0); + let max_x = series.last().map(|p| p.0).unwrap_or(0); + let span = (max_x - min_x).max(1); + let bucket_secs = (span as usize) / target.max(1); + let bucket_secs = bucket_secs.max(1) as i64; + + let mut buckets: Vec<(i64, f64, usize)> = Vec::with_capacity(target); + let mut current_bucket: i64 = -1; + for (t, v) in series { + let b = (t - min_x) / bucket_secs; + if b != current_bucket { + buckets.push((*t, *v, 1)); + current_bucket = b; + } else if let Some(last) = buckets.last_mut() { + last.1 += *v; + last.2 += 1; + } + } + buckets + .into_iter() + .map(|(t, sum, n)| (t, sum / (n as f64))) + .collect() +} + +/// Recent perf-events table — boot/shutdown/sleep degradation, memory +/// exhaustion, BSODs, unexpected reboots. Empty list → a neutral +/// "nothing flagged yet" panel so the heading still has a body. +fn render_perf_events_table(lang: Lang, events: &[PerfEventRow]) -> String { + if events.is_empty() { + return format!( + r##"
+ {msg} +
"##, + msg = t(lang, "devices.perf_events_none"), + ); + } + let mut s = format!( + r##"
+

{l_events}

+
+ + + + + + + + + "##, + l_events = t(lang, "devices.perf_events_heading"), + c_when = t(lang, "devices.perf_events_col_when"), + c_source = t(lang, "devices.perf_events_col_source"), + c_summary = t(lang, "devices.perf_events_col_summary"), + ); + for ev in events { + let when = fmt_unix_utc(ev.at); + let (level_cls, _level_label) = match ev.level { + 1 => ("bg-rose-900/40 text-rose-300 border-rose-800", "critical"), + 2 => ("bg-rose-900/30 text-rose-300 border-rose-900", "error"), + 3 => ("bg-amber-900/40 text-amber-300 border-amber-800", "warning"), + _ => ("bg-slate-800 text-slate-300 border-slate-700", "info"), + }; + let source_label = match ev.provider.as_str() { + "diag-perf" => t(lang, "devices.perf_src_diag_perf"), + "res-exh" => t(lang, "devices.perf_src_res_exh"), + "system" => t(lang, "devices.perf_src_system"), + other => other, + }; + let _ = write!( + s, + r##" + + + +"##, + when = html_escape(&when), + lvl_cls = level_cls, + src = html_escape(source_label), + eid = ev.event_id, + summary = html_escape(&ev.summary), + ); + } + s.push_str("
{c_when}{c_source}{c_summary}
{when} + {src} · {eid} + {summary}
"); + s +} + /// Render the per-device login history table. Empty input → a neutral /// "no events yet" panel so the heading still has something under it. /// We render the agent-reported `at` in the standard SQLite UTC format diff --git a/src/api/metrics.rs b/src/api/metrics.rs new file mode 100644 index 0000000..594a09a --- /dev/null +++ b/src/api/metrics.rs @@ -0,0 +1,183 @@ +//! `POST /api/agent/metrics` — continuous performance time-series the +//! agent samples at ~1/min. The admin Devices detail page renders this +//! as a CPU / memory sparkline plus a "current snapshot" card. +//! +//! Auth: same per-peer signed-API gate as the other agent endpoints — +//! see [`crate::api::device_auth`]. Body shape (batched so an agent +//! that's catching up after a transport outage can land everything in +//! one POST): +//! +//! ```json +//! { +//! "id": "", +//! "uuid": "", +//! "samples": [ +//! { +//! "at": 1717920000, +//! "cpu_pct": 42.5, +//! "mem_used_mb": 7820, +//! "mem_total_mb": 16384, +//! "proc_count": 341, +//! "uptime_secs": 173000, +//! "top_cpu_name": "chrome.exe", +//! "top_cpu_pct": 18.3, +//! "top_mem_name": "chrome.exe", +//! "top_mem_mb": 1240 +//! } +//! ] +//! } +//! ``` + +use crate::api::device_auth::{self, AuthOutcome}; +use crate::api::error::ApiError; +use crate::api::state::AppState; +use crate::database::MetricsSampleRow; +use axum::body::Bytes; +use axum::extract::Extension; +use axum::http::HeaderMap; +use serde::Deserialize; +use std::sync::Arc; + +#[derive(Debug, Deserialize)] +pub struct MetricsSampleIn { + pub at: i64, + #[serde(default)] + pub cpu_pct: f64, + #[serde(default)] + pub mem_used_mb: i64, + #[serde(default)] + pub mem_total_mb: i64, + #[serde(default)] + pub proc_count: i64, + #[serde(default)] + pub uptime_secs: i64, + #[serde(default)] + pub top_cpu_name: String, + #[serde(default)] + pub top_cpu_pct: f64, + #[serde(default)] + pub top_mem_name: String, + #[serde(default)] + pub top_mem_mb: i64, +} + +#[derive(Debug, Deserialize)] +pub struct MetricsBody { + pub id: String, + pub uuid: String, + pub samples: Vec, +} + +/// Cap per request. At 60s sampling cadence + the agent's 30-minute +/// retry-and-drain budget, even a long outage should fit well under this. +const MAX_SAMPLES_PER_POST: usize = 512; + +/// Defensive bound on string fields the agent puts in `top_*_name` — a +/// runaway process name doesn't get to balloon the DB row. +const MAX_PROC_NAME_LEN: usize = 128; + +pub async fn metrics( + Extension(state): Extension>, + headers: HeaderMap, + body: Bytes, +) -> Result { + let outcome = + device_auth::verify(&state, "POST", "/api/agent/metrics", &headers, &body).await?; + + let payload: MetricsBody = serde_json::from_slice(&body) + .map_err(|_| ApiError::BadRequest("invalid json".into()))?; + + if payload.id.is_empty() || payload.uuid.is_empty() { + return Err(ApiError::BadRequest("id and uuid are required".into())); + } + if payload.samples.is_empty() { + return Ok("OK".to_string()); + } + if payload.samples.len() > MAX_SAMPLES_PER_POST { + return Err(ApiError::BadRequest(format!( + "too many samples in one POST (max {MAX_SAMPLES_PER_POST})" + ))); + } + + let id = match outcome { + AuthOutcome::Verified { id: signed_id } => { + if payload.id != signed_id { + return Err(ApiError::Unauthorized); + } + signed_id + } + AuthOutcome::LegacyUnsigned => { + device_auth::enforce_managed_for_id(&state, &payload.id).await?; + payload.id.clone() + } + }; + + let peer = state + .db + .get_peer(&id) + .await + .map_err(|e| ApiError::Internal(e.to_string()))?; + if peer.is_none() { + return Ok("ID_NOT_FOUND".to_string()); + } + + let mut accepted = 0usize; + for s in &payload.samples { + // Sanity-clamp the floats and string lengths. The agent should + // produce well-formed values, but the public-API shape means + // garbage-in shouldn't propagate to garbage-on-screen. + let cpu_pct = clamp_pct(s.cpu_pct); + let top_cpu_pct = clamp_pct(s.top_cpu_pct); + let row = MetricsSampleRow { + at: s.at, + cpu_pct, + mem_used_mb: s.mem_used_mb.max(0), + mem_total_mb: s.mem_total_mb.max(0), + proc_count: s.proc_count.max(0), + uptime_secs: s.uptime_secs.max(0), + top_cpu_name: truncate(&s.top_cpu_name, MAX_PROC_NAME_LEN), + top_cpu_pct, + top_mem_name: truncate(&s.top_mem_name, MAX_PROC_NAME_LEN), + top_mem_mb: s.top_mem_mb.max(0), + }; + if let Err(e) = state + .db + .metrics_sample_insert(&id, &payload.uuid, &row) + .await + { + hbb_common::log::warn!( + "metrics_sample_insert for peer {} failed: {}", + id, + e + ); + continue; + } + accepted += 1; + } + + hbb_common::log::debug!( + "metrics: peer={} accepted={}/{}", + id, + accepted, + payload.samples.len() + ); + Ok("OK".to_string()) +} + +fn clamp_pct(v: f64) -> f64 { + if v.is_nan() { + 0.0 + } else { + v.clamp(0.0, 100.0) + } +} + +/// Char-aware truncate (so we don't slice mid-multibyte). The cap is +/// generous so process names that include arguments or Unicode survive. +fn truncate(s: &str, max_chars: usize) -> String { + if s.chars().count() <= max_chars { + s.to_string() + } else { + s.chars().take(max_chars).collect() + } +} diff --git a/src/api/mod.rs b/src/api/mod.rs index 47ab53d..ce8763c 100644 --- a/src/api/mod.rs +++ b/src/api/mod.rs @@ -16,6 +16,8 @@ pub mod groups; pub mod heartbeat; pub mod http_proxy; pub mod login_event; +pub mod metrics; +pub mod perf_events; pub mod middleware; pub mod oidc; pub mod pagination; @@ -53,6 +55,8 @@ pub fn router(state: Arc) -> Router { .route("/api/sysinfo", post(sysinfo::sysinfo)) .route("/api/agent/exec-result", post(agent_exec::exec_result)) .route("/api/agent/login-event", post(login_event::login_event)) + .route("/api/agent/metrics", post(metrics::metrics)) + .route("/api/agent/perf-events", post(perf_events::perf_events)) .route( "/api/unattended-password", post(unattended::unattended_password), diff --git a/src/api/perf_events.rs b/src/api/perf_events.rs new file mode 100644 index 0000000..a2f4d6b --- /dev/null +++ b/src/api/perf_events.rs @@ -0,0 +1,141 @@ +//! `POST /api/agent/perf-events` — performance-related Windows event log +//! entries the agent surfaced from `Microsoft-Windows-Diagnostics- +//! Performance/Operational`, `Microsoft-Windows-Resource-Exhaustion- +//! Detector/Operational`, and a few hand-picked IDs from `System` +//! (unexpected reboots, BSODs, dirty shutdowns). The admin UI shows +//! the recent ones in the device's Performance section. +//! +//! Auth: same per-peer signed-API gate as the other agent endpoints. +//! Server-side dedup is via the UNIQUE (peer_id, provider, record_id) +//! index — the agent persists a per-channel cursor to disk, but a +//! restart that loses the cursor can safely re-emit overlapping ranges. + +use crate::api::device_auth::{self, AuthOutcome}; +use crate::api::error::ApiError; +use crate::api::state::AppState; +use crate::database::PerfEventRow; +use axum::body::Bytes; +use axum::extract::Extension; +use axum::http::HeaderMap; +use serde::Deserialize; +use std::sync::Arc; + +#[derive(Debug, Deserialize)] +pub struct PerfEventIn { + pub at: i64, + pub provider: String, + pub event_id: i64, + #[serde(default = "default_level")] + pub level: i64, + #[serde(default)] + pub record_id: i64, + #[serde(default)] + pub summary: String, + #[serde(default)] + pub detail_json: String, +} + +fn default_level() -> i64 { + 4 // WEL "Information" +} + +#[derive(Debug, Deserialize)] +pub struct PerfEventsBody { + pub id: String, + pub uuid: String, + pub events: Vec, +} + +const MAX_EVENTS_PER_POST: usize = 128; +const MAX_PROVIDER_LEN: usize = 64; +const MAX_SUMMARY_LEN: usize = 512; +const MAX_DETAIL_LEN: usize = 8 * 1024; + +pub async fn perf_events( + Extension(state): Extension>, + headers: HeaderMap, + body: Bytes, +) -> Result { + let outcome = + device_auth::verify(&state, "POST", "/api/agent/perf-events", &headers, &body).await?; + + let payload: PerfEventsBody = serde_json::from_slice(&body) + .map_err(|_| ApiError::BadRequest("invalid json".into()))?; + + if payload.id.is_empty() || payload.uuid.is_empty() { + return Err(ApiError::BadRequest("id and uuid are required".into())); + } + if payload.events.is_empty() { + return Ok("OK".to_string()); + } + if payload.events.len() > MAX_EVENTS_PER_POST { + return Err(ApiError::BadRequest(format!( + "too many events in one POST (max {MAX_EVENTS_PER_POST})" + ))); + } + + let id = match outcome { + AuthOutcome::Verified { id: signed_id } => { + if payload.id != signed_id { + return Err(ApiError::Unauthorized); + } + signed_id + } + AuthOutcome::LegacyUnsigned => { + device_auth::enforce_managed_for_id(&state, &payload.id).await?; + payload.id.clone() + } + }; + + let peer = state + .db + .get_peer(&id) + .await + .map_err(|e| ApiError::Internal(e.to_string()))?; + if peer.is_none() { + return Ok("ID_NOT_FOUND".to_string()); + } + + let mut accepted = 0usize; + for e in &payload.events { + let provider = e.provider.trim(); + if provider.is_empty() { + continue; + } + let row = PerfEventRow { + at: e.at, + provider: truncate(provider, MAX_PROVIDER_LEN), + event_id: e.event_id, + level: e.level, + record_id: e.record_id, + summary: truncate(&e.summary, MAX_SUMMARY_LEN), + detail_json: truncate(&e.detail_json, MAX_DETAIL_LEN), + received_at: 0, // server fills via DEFAULT on INSERT + }; + if let Err(err) = state.db.perf_event_insert(&id, &payload.uuid, &row).await { + hbb_common::log::warn!( + "perf_event_insert for peer {} failed: {}", + id, + err + ); + continue; + } + accepted += 1; + } + + hbb_common::log::debug!( + "perf-events: peer={} accepted={}/{}", + id, + accepted, + payload.events.len() + ); + Ok("OK".to_string()) +} + +fn truncate(s: &str, max_chars: usize) -> String { + if s.chars().count() <= max_chars { + s.to_string() + } else { + s.chars().take(max_chars).collect() + } +} diff --git a/src/database.rs b/src/database.rs index 2119640..5f0b2a2 100644 --- a/src/database.rs +++ b/src/database.rs @@ -260,6 +260,22 @@ pub struct ExecQueued { pub script: String, } +fn metrics_sample_row_from(r: sqlx::sqlite::SqliteRow) -> MetricsSampleRow { + use sqlx::Row; + MetricsSampleRow { + at: r.try_get("at").unwrap_or(0), + cpu_pct: r.try_get("cpu_pct").unwrap_or(0.0), + mem_used_mb: r.try_get("mem_used_mb").unwrap_or(0), + mem_total_mb: r.try_get("mem_total_mb").unwrap_or(0), + proc_count: r.try_get("proc_count").unwrap_or(0), + uptime_secs: r.try_get("uptime_secs").unwrap_or(0), + top_cpu_name: r.try_get("top_cpu_name").unwrap_or_default(), + top_cpu_pct: r.try_get("top_cpu_pct").unwrap_or(0.0), + top_mem_name: r.try_get("top_mem_name").unwrap_or_default(), + top_mem_mb: r.try_get("top_mem_mb").unwrap_or(0), + } +} + fn exec_row_from(r: sqlx::sqlite::SqliteRow) -> ExecRow { use sqlx::Row; ExecRow { @@ -284,6 +300,43 @@ fn exec_row_from(r: sqlx::sqlite::SqliteRow) -> ExecRow { /// `received_at` is when the row landed in the DB. `kind` is currently /// `"logon"` or `"logoff"` — anything else is treated as an unknown kind /// by the renderer and shown verbatim. +/// One agent-reported continuous-metrics sample (1/min cadence). +/// `cpu_pct` is the overall system CPU% averaged over the sample window; +/// `top_cpu_*` / `top_mem_*` are the heaviest single process at the +/// moment of capture — enough to answer "what was eating the CPU at +/// 2 AM" without storing a full process tree on every row. +#[derive(Debug, Clone, Default)] +pub struct MetricsSampleRow { + pub at: i64, + pub cpu_pct: f64, + pub mem_used_mb: i64, + pub mem_total_mb: i64, + pub proc_count: i64, + pub uptime_secs: i64, + pub top_cpu_name: String, + pub top_cpu_pct: f64, + pub top_mem_name: String, + pub top_mem_mb: i64, +} + +/// One row from `device_perf_events`. `provider` is the event-log +/// channel short-name (`diag-perf`, `res-exh`, `system`); `event_id` is +/// the numeric Windows event id. `summary` is a UI-ready short string +/// the agent prepared (we don't re-localize Windows event messages +/// server-side); `detail_json` carries the structured payload for the +/// detail-row expansion. +#[derive(Debug, Clone, Default)] +pub struct PerfEventRow { + pub at: i64, + pub provider: String, + pub event_id: i64, + pub level: i64, + pub record_id: i64, + pub summary: String, + pub detail_json: String, + pub received_at: i64, +} + #[derive(Debug, Clone, Default)] pub struct LoginEventRow { pub at: i64, @@ -472,6 +525,12 @@ impl Database { .execute(self.pool.get().await?.deref_mut()) .await?; } + // M8 schema: agent-reported performance time-series + perf events. + for stmt in M8_SCHEMA { + sqlx::query(stmt) + .execute(self.pool.get().await?.deref_mut()) + .await?; + } // Soft-ALTERs run after schema creation. SQLite < 3.35 lacks // `ADD COLUMN IF NOT EXISTS`; swallow the duplicate-column error // so re-runs are idempotent. Newly-added soft alters get appended @@ -3517,6 +3576,141 @@ impl Database { Ok(()) } + // ────────────────────── device_metrics_samples (M8) ──────────────────── + // + // Time-series performance samples reported by the agent. INSERT OR + // IGNORE keeps the on-the-wire retry path idempotent. + + pub async fn metrics_sample_insert( + &self, + peer_id: &str, + peer_uuid: &str, + s: &MetricsSampleRow, + ) -> ResultType<()> { + sqlx::query( + "insert or ignore into device_metrics_samples \ + (peer_id, peer_uuid, at, cpu_pct, mem_used_mb, mem_total_mb, \ + proc_count, uptime_secs, top_cpu_name, top_cpu_pct, \ + top_mem_name, top_mem_mb) \ + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + ) + .bind(peer_id) + .bind(peer_uuid) + .bind(s.at) + .bind(s.cpu_pct) + .bind(s.mem_used_mb) + .bind(s.mem_total_mb) + .bind(s.proc_count) + .bind(s.uptime_secs) + .bind(&s.top_cpu_name) + .bind(s.top_cpu_pct) + .bind(&s.top_mem_name) + .bind(s.top_mem_mb) + .execute(self.pool.get().await?.deref_mut()) + .await?; + Ok(()) + } + + /// Range query — used by the sparkline renderer. `since_at` is a + /// unix epoch second; rows are returned oldest-first because the + /// chart polyline draws left-to-right. + pub async fn metrics_samples_since( + &self, + peer_id: &str, + since_at: i64, + ) -> ResultType> { + let rows = sqlx::query( + "select at, cpu_pct, mem_used_mb, mem_total_mb, proc_count, \ + uptime_secs, top_cpu_name, top_cpu_pct, top_mem_name, top_mem_mb \ + from device_metrics_samples \ + where peer_id = ? and at >= ? \ + order by at asc", + ) + .bind(peer_id) + .bind(since_at) + .fetch_all(self.pool.get().await?.deref_mut()) + .await?; + Ok(rows.into_iter().map(metrics_sample_row_from).collect()) + } + + /// "What's happening right now" — the most recent sample, for the + /// snapshot card on the device detail page. None if the agent has + /// never reported. + pub async fn metrics_latest( + &self, + peer_id: &str, + ) -> ResultType> { + let row = sqlx::query( + "select at, cpu_pct, mem_used_mb, mem_total_mb, proc_count, \ + uptime_secs, top_cpu_name, top_cpu_pct, top_mem_name, top_mem_mb \ + from device_metrics_samples \ + where peer_id = ? \ + order by at desc limit 1", + ) + .bind(peer_id) + .fetch_optional(self.pool.get().await?.deref_mut()) + .await?; + Ok(row.map(metrics_sample_row_from)) + } + + // ─────────────────────── device_perf_events (M8) ──────────────────────── + + pub async fn perf_event_insert( + &self, + peer_id: &str, + peer_uuid: &str, + e: &PerfEventRow, + ) -> ResultType<()> { + sqlx::query( + "insert or ignore into device_perf_events \ + (peer_id, peer_uuid, at, provider, event_id, level, record_id, \ + summary, detail_json) \ + values (?, ?, ?, ?, ?, ?, ?, ?, ?)", + ) + .bind(peer_id) + .bind(peer_uuid) + .bind(e.at) + .bind(&e.provider) + .bind(e.event_id) + .bind(e.level) + .bind(e.record_id) + .bind(&e.summary) + .bind(&e.detail_json) + .execute(self.pool.get().await?.deref_mut()) + .await?; + Ok(()) + } + + pub async fn perf_events_for_peer( + &self, + peer_id: &str, + limit: i64, + ) -> ResultType> { + let rows = sqlx::query( + "select at, provider, event_id, level, record_id, summary, detail_json, received_at \ + from device_perf_events \ + where peer_id = ? \ + order by at desc, id desc limit ?", + ) + .bind(peer_id) + .bind(limit) + .fetch_all(self.pool.get().await?.deref_mut()) + .await?; + Ok(rows + .into_iter() + .map(|r| PerfEventRow { + at: r.try_get("at").unwrap_or(0), + provider: r.try_get("provider").unwrap_or_default(), + event_id: r.try_get("event_id").unwrap_or(0), + level: r.try_get("level").unwrap_or(4), + record_id: r.try_get("record_id").unwrap_or(0), + summary: r.try_get("summary").unwrap_or_default(), + detail_json: r.try_get("detail_json").unwrap_or_default(), + received_at: r.try_get("received_at").unwrap_or(0), + }) + .collect()) + } + /// Most-recent-first list for the device detail page. pub async fn login_events_for_peer( &self, @@ -3989,6 +4183,65 @@ const M7_SCHEMA: &[&str] = &[ ON device_login_events(peer_id, kind, session_id, at, username)", ]; +/// M8: agent-reported performance data — both continuous time-series +/// samples (1/min) and episodic events the OS itself flagged in the +/// `Microsoft-Windows-Diagnostics-Performance` / `Resource-Exhaustion- +/// Detector` / `System` event logs. Sized for "supporter confirms a +/// user's slow-laptop complaint": +/// +/// * `device_metrics_samples` — overall CPU%, memory used/total, top CPU +/// / memory process. 60-second cadence, ~1440 rows/device/day, +/// ~10K/week. The UI plots a 24 h sparkline off this. +/// * `device_perf_events` — boot / shutdown / sleep degradation, memory +/// exhaustion, unexpected reboots, BSODs. Sparse — one device might +/// produce a handful per week. UI shows the most recent on the detail +/// page. +/// +/// Both tables carry a UNIQUE INDEX that pairs with `INSERT OR IGNORE`, +/// so an agent that re-POSTs after a transient transport failure (or +/// across a restart that loses its in-memory queue) doesn't pile up +/// duplicates. Retention isn't enforced server-side yet — rows +/// accumulate, same model as `audit_conn` / `exec_history`. +const M8_SCHEMA: &[&str] = &[ + "CREATE TABLE IF NOT EXISTS device_metrics_samples ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + peer_id TEXT NOT NULL, + peer_uuid TEXT NOT NULL, + at INTEGER NOT NULL, + cpu_pct REAL NOT NULL DEFAULT 0, + mem_used_mb INTEGER NOT NULL DEFAULT 0, + mem_total_mb INTEGER NOT NULL DEFAULT 0, + proc_count INTEGER NOT NULL DEFAULT 0, + uptime_secs INTEGER NOT NULL DEFAULT 0, + top_cpu_name TEXT NOT NULL DEFAULT '', + top_cpu_pct REAL NOT NULL DEFAULT 0, + top_mem_name TEXT NOT NULL DEFAULT '', + top_mem_mb INTEGER NOT NULL DEFAULT 0, + received_at INTEGER NOT NULL DEFAULT (strftime('%s','now')) + )", + "CREATE INDEX IF NOT EXISTS idx_dms_peer_at \ + ON device_metrics_samples(peer_id, at DESC)", + "CREATE UNIQUE INDEX IF NOT EXISTS uq_dms \ + ON device_metrics_samples(peer_id, at)", + "CREATE TABLE IF NOT EXISTS device_perf_events ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + peer_id TEXT NOT NULL, + peer_uuid TEXT NOT NULL, + at INTEGER NOT NULL, + provider TEXT NOT NULL, + event_id INTEGER NOT NULL, + level INTEGER NOT NULL DEFAULT 4, + record_id INTEGER NOT NULL DEFAULT 0, + summary TEXT NOT NULL DEFAULT '', + detail_json TEXT NOT NULL DEFAULT '', + received_at INTEGER NOT NULL DEFAULT (strftime('%s','now')) + )", + "CREATE INDEX IF NOT EXISTS idx_dpe_peer_at \ + ON device_perf_events(peer_id, at DESC)", + "CREATE UNIQUE INDEX IF NOT EXISTS uq_dpe \ + ON device_perf_events(peer_id, provider, record_id)", +]; + #[cfg(test)] mod tests { use hbb_common::tokio;