Implement performance monitor

2026-05-22 21:41:54 +02:00
parent 62a8870ea2
commit 3ab67e80e1
7 changed files with 1149 additions and 4 deletions
@@ -1,7 +1,7 @@
 # Agent API authentication

 Reference for the per-device signature gate on the agent-facing HTTP
-API. Five endpoints are gated:
+API. Seven endpoints are gated:

 - `POST /api/heartbeat`
 - `POST /api/sysinfo`
@@ -11,6 +11,15 @@ API. Five endpoints are gated:
  by the agent. Same TOFU lifecycle as heartbeat / sysinfo: stock
  RustDesk doesn't post here at all, so in practice every caller is a
  managed agent; the legacy/unsigned path is kept for symmetry.
+- `POST /api/agent/metrics` — continuous CPU / memory / top-process
+  samples (≈1 / minute). Surfaced on the admin Devices detail page as
+  a 24 h sparkline + live snapshot card.
+- `POST /api/agent/perf-events` — sparse Windows-event-log entries
+  flagged by `Microsoft-Windows-Diagnostics-Performance/Operational`,
+  `Microsoft-Windows-Resource-Exhaustion-Detector/Operational`, and
+  hand-picked `System` IDs (41 / 6008 / 1001 — unexpected reboot /
+  dirty shutdown / BSOD). Server dedups via UNIQUE (peer_id, provider,
+  record_id).

 For the operator workflow — turning it on, the dashboard toggle, what
 happens when a managed agent is uninstalled — see the matching section
@@ -1224,6 +1224,137 @@ pub fn t(lang: Lang, key: &str) -> &'static str {
            "Istoric autentificări",
            "Historial de inicio de sesión",
        ),
+        "devices.performance" => (
+            "Performance",
+            "Leistung",
+            "Performances",
+            "Performanță",
+            "Rendimiento",
+        ),
+        "devices.perf_none" => (
+            "No performance data reported yet. The agent collects CPU / memory samples once per minute and Windows-reported performance events as they happen.",
+            "Noch keine Leistungsdaten gemeldet. Der Agent sammelt CPU-/Speicher-Stichproben einmal pro Minute und die von Windows gemeldeten Leistungsereignisse, sobald sie auftreten.",
+            "Aucune donnée de performance signalée pour l'instant. L'agent collecte des échantillons CPU / mémoire une fois par minute et les événements de performance signalés par Windows au fur et à mesure.",
+            "Niciun fel de date de performanță raportate încă. Agentul colectează eșantioane CPU / memorie o dată pe minut și evenimentele de performanță raportate de Windows pe măsură ce se întâmplă.",
+            "Aún no se han reportado datos de rendimiento. El agente recopila muestras de CPU / memoria una vez por minuto y los eventos de rendimiento reportados por Windows a medida que ocurren.",
+        ),
+        "devices.perf_no_live" => (
+            "No live snapshot yet — waiting for the agent's first sample.",
+            "Noch keine Live-Stichprobe – warte auf die erste Probe des Agenten.",
+            "Pas encore d'instantané en direct — en attente du premier échantillon de l'agent.",
+            "Niciun instantaneu live încă — se așteaptă prima probă a agentului.",
+            "Aún no hay instantánea en vivo — esperando la primera muestra del agente.",
+        ),
+        "devices.perf_now" => ("Live", "Live", "En direct", "Live", "En vivo"),
+        "devices.perf_sampled_ago" => (
+            "Sampled {0} ago",
+            "Vor {0} aufgenommen",
+            "Échantillonné il y a {0}",
+            "Eșantionat acum {0}",
+            "Muestreado hace {0}",
+        ),
+        "devices.perf_cpu" => ("CPU", "CPU", "Processeur", "CPU", "CPU"),
+        "devices.perf_mem" => (
+            "Memory",
+            "Speicher",
+            "Mémoire",
+            "Memorie",
+            "Memoria",
+        ),
+        "devices.perf_top_cpu" => (
+            "Top CPU",
+            "Top-CPU",
+            "Plus gros CPU",
+            "Top CPU",
+            "Mayor CPU",
+        ),
+        "devices.perf_top_mem" => (
+            "Top memory",
+            "Top-Speicher",
+            "Plus grosse mémoire",
+            "Top memorie",
+            "Mayor memoria",
+        ),
+        "devices.perf_uptime" => (
+            "Uptime",
+            "Laufzeit",
+            "Disponibilité",
+            "Timp activ",
+            "Tiempo activo",
+        ),
+        "devices.perf_proc_count" => (
+            "Processes",
+            "Prozesse",
+            "Processus",
+            "Procese",
+            "Procesos",
+        ),
+        "devices.perf_no_chart" => (
+            "No samples in the last 24 h",
+            "Keine Daten in den letzten 24 Std",
+            "Aucun échantillon ces 24 dernières h",
+            "Niciun eșantion în ultimele 24 h",
+            "Sin muestras en las últimas 24 h",
+        ),
+        "devices.perf_peak" => ("peak", "Spitze", "max.", "vârf", "máx."),
+        "devices.perf_latest" => ("last", "letzter", "dernier", "ultim", "último"),
+        "devices.perf_now_short" => ("now", "jetzt", "maint.", "acum", "ahora"),
+        "devices.perf_events_heading" => (
+            "Recent performance events",
+            "Aktuelle Leistungsereignisse",
+            "Événements de performance récents",
+            "Evenimente recente de performanță",
+            "Eventos de rendimiento recientes",
+        ),
+        "devices.perf_events_none" => (
+            "No performance events reported. Windows flags boot / shutdown / sleep slow paths, memory exhaustion, unexpected reboots and BSODs here.",
+            "Keine Leistungsereignisse gemeldet. Windows markiert hier verlangsamte Start-/Herunterfahren-/Standby-Vorgänge, Speichermangel, unerwartete Neustarts und Bluescreens.",
+            "Aucun événement de performance signalé. Windows signale ici les démarrages / arrêts / veilles lents, l'épuisement de la mémoire, les redémarrages inattendus et les BSOD.",
+            "Niciun eveniment de performanță raportat. Windows marchează aici pornirile / opririle / repausurile lente, epuizarea memoriei, repornirile neașteptate și BSOD-urile.",
+            "No se han reportado eventos de rendimiento. Windows registra aquí arranques / apagados / suspensiones lentos, agotamiento de memoria, reinicios inesperados y BSOD.",
+        ),
+        "devices.perf_events_col_when" => (
+            "When (UTC)",
+            "Wann (UTC)",
+            "Quand (UTC)",
+            "Când (UTC)",
+            "Cuándo (UTC)",
+        ),
+        "devices.perf_events_col_source" => (
+            "Source",
+            "Quelle",
+            "Source",
+            "Sursă",
+            "Origen",
+        ),
+        "devices.perf_events_col_summary" => (
+            "Summary",
+            "Zusammenfassung",
+            "Résumé",
+            "Rezumat",
+            "Resumen",
+        ),
+        "devices.perf_src_diag_perf" => (
+            "Diag-Perf",
+            "Diag-Perf",
+            "Diag-Perf",
+            "Diag-Perf",
+            "Diag-Perf",
+        ),
+        "devices.perf_src_res_exh" => (
+            "Res-Exh",
+            "Res-Exh",
+            "Res-Exh",
+            "Res-Exh",
+            "Res-Exh",
+        ),
+        "devices.perf_src_system" => (
+            "System",
+            "System",
+            "Système",
+            "Sistem",
+            "Sistema",
+        ),
        "devices.login_none" => (
            "No login events recorded yet. The agent reports logons and logoffs as it observes them.",
            "Noch keine Anmeldeereignisse aufgezeichnet. Der Agent meldet An- und Abmeldungen, sobald er sie beobachtet.",
@@ -6,7 +6,7 @@ use crate::api::admin::i18n::{t, tf1, tf2, tf3, Lang};
 use crate::api::error::ApiError;
 use crate::api::middleware::AuthedUser;
 use crate::api::state::AppState;
-use crate::database::{DashboardDeviceRow, LoginEventRow};
+use crate::database::{DashboardDeviceRow, LoginEventRow, MetricsSampleRow, PerfEventRow};
 use axum::extract::{Extension, Form, Path, Query};
 use axum::response::Html;
 use serde::Deserialize;
@@ -470,7 +470,36 @@ pub async fn detail(
                .login_events_for_peer(&d.id, 50)
                .await
                .unwrap_or_default();
-            render_detail(lang, &d, &events)
+            // Performance: pull the most recent metrics sample for the
+            // "right now" card, plus 24 h of samples for the sparkline,
+            // plus the most recent perf events (boot/shutdown/memory-
+            // exhaustion etc.) for the "recent slow events" table.
+            // All three are best-effort — none of them is required for
+            // the detail page to render meaningfully.
+            let metrics_latest = state
+                .db
+                .metrics_latest(&d.id)
+                .await
+                .unwrap_or_default();
+            let since_24h = chrono::Utc::now().timestamp() - 24 * 3600;
+            let metrics_24h = state
+                .db
+                .metrics_samples_since(&d.id, since_24h)
+                .await
+                .unwrap_or_default();
+            let perf_events = state
+                .db
+                .perf_events_for_peer(&d.id, 20)
+                .await
+                .unwrap_or_default();
+            render_detail(
+                lang,
+                &d,
+                &events,
+                metrics_latest.as_ref(),
+                &metrics_24h,
+                &perf_events,
+            )
        }
        None => format!(
            r##"<div class="space-y-4">
@@ -1142,7 +1171,14 @@ fn fmt_inv_value(v: Option<&serde_json::Value>) -> String {
    }
 }

-fn render_detail(lang: Lang, d: &DashboardDeviceRow, login_events: &[LoginEventRow]) -> String {
+fn render_detail(
+    lang: Lang,
+    d: &DashboardDeviceRow,
+    login_events: &[LoginEventRow],
+    metrics_latest: Option<&MetricsSampleRow>,
+    metrics_24h: &[MetricsSampleRow],
+    perf_events: &[PerfEventRow],
+) -> String {
    let parsed: serde_json::Value =
        serde_json::from_str(&d.sysinfo_payload).unwrap_or(serde_json::Value::Null);
    let pick = |k: &str| -> String {
@@ -1237,6 +1273,7 @@ fn render_detail(lang: Lang, d: &DashboardDeviceRow, login_events: &[LoginEventR
    };

    let login_section = render_login_events(lang, login_events);
+    let perf_section = render_performance(lang, metrics_latest, metrics_24h, perf_events);

    format!(
        r##"<div class="space-y-4">
@@ -1245,6 +1282,8 @@ fn render_detail(lang: Lang, d: &DashboardDeviceRow, login_events: &[LoginEventR
    <div class="text-xs text-slate-500">{detail_view}</div>
  </div>
  {header}
+  <h3 class="text-sm font-semibold text-slate-300 mt-4">{performance}</h3>
+  {perf}
  <h3 class="text-sm font-semibold text-slate-300 mt-4">{inventory}</h3>
  {inv}
  <h3 class="text-sm font-semibold text-slate-300 mt-4">{login_history}</h3>
@@ -1252,6 +1291,8 @@ fn render_detail(lang: Lang, d: &DashboardDeviceRow, login_events: &[LoginEventR
 </div>"##,
        back = back_button(lang),
        detail_view = t(lang, "devices.detail_view"),
+        performance = t(lang, "devices.performance"),
+        perf = perf_section,
        inventory = t(lang, "devices.inventory"),
        header = header,
        inv = inventory_section,
@@ -1260,6 +1301,389 @@ fn render_detail(lang: Lang, d: &DashboardDeviceRow, login_events: &[LoginEventR
    )
 }

+/// Top-level Performance section: snapshot card, two sparklines (CPU /
+/// memory), and a recent-events table. The whole thing is omitted in
+/// favour of a "no data yet" panel when the agent hasn't reported.
+fn render_performance(
+    lang: Lang,
+    latest: Option<&MetricsSampleRow>,
+    series: &[MetricsSampleRow],
+    events: &[PerfEventRow],
+) -> String {
+    if latest.is_none() && series.is_empty() && events.is_empty() {
+        return format!(
+            r##"<div class="rounded-md border border-slate-700 bg-slate-900 p-3 text-sm text-slate-400">
+  {msg}
+</div>"##,
+            msg = t(lang, "devices.perf_none"),
+        );
+    }
+
+    let snapshot = render_perf_snapshot(lang, latest);
+    let cpu_chart = render_sparkline(
+        lang,
+        series.iter().map(|s| (s.at, s.cpu_pct)).collect(),
+        100.0,
+        true,
+        t(lang, "devices.perf_cpu"),
+    );
+    let mem_chart = {
+        // Mem is reported as MB used / MB total; chart uses % so the
+        // y-axis stays comparable to the CPU panel.
+        let series_pct: Vec<(i64, f64)> = series
+            .iter()
+            .filter(|s| s.mem_total_mb > 0)
+            .map(|s| {
+                let pct = 100.0 * (s.mem_used_mb as f64) / (s.mem_total_mb as f64);
+                (s.at, pct)
+            })
+            .collect();
+        render_sparkline(lang, series_pct, 100.0, true, t(lang, "devices.perf_mem"))
+    };
+    let events_section = render_perf_events_table(lang, events);
+
+    format!(
+        r##"<div class="space-y-4">
+  {snapshot}
+  <div class="grid grid-cols-1 gap-4 md:grid-cols-2">
+    {cpu}
+    {mem}
+  </div>
+  {events}
+</div>"##,
+        snapshot = snapshot,
+        cpu = cpu_chart,
+        mem = mem_chart,
+        events = events_section,
+    )
+}
+
+/// "Right now" card — the most recent metrics sample. Drawn as a 4-up
+/// stat tile so the supporter can glance at CPU / memory / top
+/// processes without reading a chart. Falls back to a thin "no live
+/// data" pill when the agent has never reported.
+fn render_perf_snapshot(lang: Lang, latest: Option<&MetricsSampleRow>) -> String {
+    let Some(s) = latest else {
+        return format!(
+            r##"<div class="rounded-md border border-slate-800 bg-slate-900 p-3 text-xs text-slate-500">
+  {msg}
+</div>"##,
+            msg = t(lang, "devices.perf_no_live"),
+        );
+    };
+    let now = chrono::Utc::now().timestamp();
+    let age = (now - s.at).max(0);
+    let age_str = fmt_age(age);
+    let cpu_color = pct_color(s.cpu_pct);
+    let mem_pct = if s.mem_total_mb > 0 {
+        100.0 * (s.mem_used_mb as f64) / (s.mem_total_mb as f64)
+    } else {
+        0.0
+    };
+    let mem_color = pct_color(mem_pct);
+    let mem_used_gb = (s.mem_used_mb as f64) / 1024.0;
+    let mem_total_gb = (s.mem_total_mb as f64) / 1024.0;
+    let top_cpu = if s.top_cpu_name.is_empty() {
+        "—".to_string()
+    } else {
+        format!(
+            "{name} <span class=\"text-xs text-slate-400\">{pct:.0}%</span>",
+            name = html_escape(&s.top_cpu_name),
+            pct = s.top_cpu_pct,
+        )
+    };
+    let top_mem = if s.top_mem_name.is_empty() {
+        "—".to_string()
+    } else {
+        let mb = s.top_mem_mb;
+        let mem_disp = if mb >= 1024 {
+            format!("{:.1} GB", (mb as f64) / 1024.0)
+        } else {
+            format!("{} MB", mb)
+        };
+        format!(
+            "{name} <span class=\"text-xs text-slate-400\">{disp}</span>",
+            name = html_escape(&s.top_mem_name),
+            disp = html_escape(&mem_disp),
+        )
+    };
+    let uptime_str = if s.uptime_secs > 0 {
+        fmt_age(s.uptime_secs)
+    } else {
+        "—".to_string()
+    };
+
+    format!(
+        r##"<div class="rounded-md border border-slate-800 bg-slate-900 p-4">
+  <div class="flex items-baseline justify-between mb-3">
+    <h4 class="text-sm font-semibold text-slate-200">{l_now}</h4>
+    <span class="text-xs text-slate-500" title="{at_full} UTC">{l_age}</span>
+  </div>
+  <dl class="grid grid-cols-2 gap-x-6 gap-y-3 text-sm md:grid-cols-4">
+    <div>
+      <dt class="text-xs text-slate-500">{l_cpu}</dt>
+      <dd class="text-lg font-semibold {cpu_cls} tabular-nums">{cpu:.0}%</dd>
+    </div>
+    <div>
+      <dt class="text-xs text-slate-500">{l_mem}</dt>
+      <dd class="text-lg font-semibold {mem_cls} tabular-nums">{mem_pct:.0}%</dd>
+      <dd class="text-xs text-slate-500 tabular-nums">{used:.1} / {total:.1} GB</dd>
+    </div>
+    <div>
+      <dt class="text-xs text-slate-500">{l_top_cpu}</dt>
+      <dd class="text-slate-200 font-mono text-xs truncate" title="{top_cpu_raw}">{top_cpu}</dd>
+    </div>
+    <div>
+      <dt class="text-xs text-slate-500">{l_top_mem}</dt>
+      <dd class="text-slate-200 font-mono text-xs truncate" title="{top_mem_raw}">{top_mem}</dd>
+    </div>
+    <div>
+      <dt class="text-xs text-slate-500">{l_uptime}</dt>
+      <dd class="text-slate-300 tabular-nums">{uptime}</dd>
+    </div>
+    <div>
+      <dt class="text-xs text-slate-500">{l_procs}</dt>
+      <dd class="text-slate-300 tabular-nums">{procs}</dd>
+    </div>
+  </dl>
+</div>"##,
+        l_now = t(lang, "devices.perf_now"),
+        l_age = tf1(lang, "devices.perf_sampled_ago", &age_str),
+        at_full = html_escape(&fmt_unix_utc(s.at)),
+        l_cpu = t(lang, "devices.perf_cpu"),
+        cpu_cls = cpu_color,
+        cpu = s.cpu_pct,
+        l_mem = t(lang, "devices.perf_mem"),
+        mem_cls = mem_color,
+        mem_pct = mem_pct,
+        used = mem_used_gb,
+        total = mem_total_gb,
+        l_top_cpu = t(lang, "devices.perf_top_cpu"),
+        top_cpu_raw = html_escape(&s.top_cpu_name),
+        top_cpu = top_cpu,
+        l_top_mem = t(lang, "devices.perf_top_mem"),
+        top_mem_raw = html_escape(&s.top_mem_name),
+        top_mem = top_mem,
+        l_uptime = t(lang, "devices.perf_uptime"),
+        uptime = html_escape(&uptime_str),
+        l_procs = t(lang, "devices.perf_proc_count"),
+        procs = s.proc_count,
+    )
+}
+
+/// Color-code a percentage value (0–100) — green up to 60, amber up to
+/// 85, red above. Used for the snapshot stat tiles so the supporter
+/// can spot a wedged-laptop at a glance.
+fn pct_color(pct: f64) -> &'static str {
+    if pct >= 85.0 {
+        "text-rose-400"
+    } else if pct >= 60.0 {
+        "text-amber-300"
+    } else {
+        "text-emerald-300"
+    }
+}
+
+/// Render an inline-SVG sparkline. `series` is a (unix-seconds, value)
+/// vector; `max_y` clamps the y-axis (so two side-by-side charts share
+/// a scale); `bucketed = true` downsamples by averaging into 96 buckets
+/// so the polyline string stays short for a wide time window.
+fn render_sparkline(
+    lang: Lang,
+    series: Vec<(i64, f64)>,
+    max_y: f64,
+    bucketed: bool,
+    title: &str,
+) -> String {
+    const WIDTH: f64 = 600.0;
+    const HEIGHT: f64 = 80.0;
+    const PAD: f64 = 4.0;
+
+    if series.is_empty() {
+        return format!(
+            r##"<div class="rounded-md border border-slate-800 bg-slate-900 p-3">
+  <h4 class="text-xs uppercase text-slate-500 mb-1">{title}</h4>
+  <div class="text-xs text-slate-500">{msg}</div>
+</div>"##,
+            title = html_escape(title),
+            msg = t(lang, "devices.perf_no_chart"),
+        );
+    }
+
+    let points = if bucketed && series.len() > 96 {
+        downsample_avg(&series, 96)
+    } else {
+        series.clone()
+    };
+
+    let min_x = points.first().map(|p| p.0).unwrap_or(0);
+    let max_x = points.last().map(|p| p.0).unwrap_or(0);
+    let span_x = (max_x - min_x).max(1) as f64;
+
+    let plot_w = WIDTH - 2.0 * PAD;
+    let plot_h = HEIGHT - 2.0 * PAD;
+
+    let mut path = String::new();
+    let mut area = String::new();
+    let mut peak: f64 = 0.0;
+    let mut last: f64 = 0.0;
+    for (i, (t, v)) in points.iter().enumerate() {
+        let x = PAD + plot_w * ((t - min_x) as f64) / span_x;
+        let y_norm = (v / max_y).clamp(0.0, 1.0);
+        let y = PAD + plot_h * (1.0 - y_norm);
+        if i == 0 {
+            path.push_str(&format!("M{:.1},{:.1}", x, y));
+            area.push_str(&format!("M{:.1},{:.1}", x, PAD + plot_h));
+            area.push_str(&format!(" L{:.1},{:.1}", x, y));
+        } else {
+            path.push_str(&format!(" L{:.1},{:.1}", x, y));
+            area.push_str(&format!(" L{:.1},{:.1}", x, y));
+        }
+        peak = peak.max(*v);
+        last = *v;
+    }
+    let last_x = PAD + plot_w;
+    area.push_str(&format!(" L{:.1},{:.1} Z", last_x, PAD + plot_h));
+
+    // Hours-from-now labels: oldest point's age, "now" on the right.
+    let span_secs = (max_x - min_x).max(0);
+    let span_label = if span_secs >= 3600 {
+        format!("-{}h", span_secs / 3600)
+    } else if span_secs >= 60 {
+        format!("-{}m", span_secs / 60)
+    } else {
+        format!("-{}s", span_secs)
+    };
+
+    format!(
+        r##"<div class="rounded-md border border-slate-800 bg-slate-900 p-3">
+  <div class="flex items-baseline justify-between mb-1">
+    <h4 class="text-xs uppercase text-slate-500">{title}</h4>
+    <span class="text-[11px] text-slate-500 tabular-nums">{l_peak} <span class="text-slate-300">{peak:.0}%</span> &nbsp; {l_now} <span class="text-slate-300">{last:.0}%</span></span>
+  </div>
+  <svg viewBox="0 0 {w} {h}" preserveAspectRatio="none" class="w-full h-20">
+    <line x1="{pad}" y1="{ymid:.1}" x2="{xend:.1}" y2="{ymid:.1}" stroke="#1f2937" stroke-width="1" stroke-dasharray="2,3"/>
+    <path d="{area}" fill="#0ea5e9" fill-opacity="0.10" stroke="none"/>
+    <path d="{path}" fill="none" stroke="#38bdf8" stroke-width="1.5" stroke-linejoin="round"/>
+  </svg>
+  <div class="flex justify-between text-[10px] text-slate-500 mt-1 tabular-nums">
+    <span>{older}</span>
+    <span>{l_now_short}</span>
+  </div>
+</div>"##,
+        title = html_escape(title),
+        l_peak = t(lang, "devices.perf_peak"),
+        peak = peak,
+        l_now = t(lang, "devices.perf_latest"),
+        last = last,
+        w = WIDTH,
+        h = HEIGHT,
+        pad = PAD,
+        ymid = PAD + plot_h * 0.5,
+        xend = WIDTH - PAD,
+        area = area,
+        path = path,
+        older = html_escape(&span_label),
+        l_now_short = t(lang, "devices.perf_now_short"),
+    )
+}
+
+/// Mean-pool a (timestamp, value) series down to `target` buckets,
+/// keeping the bucket-mean timestamp as the bucket's x. Empty buckets
+/// are dropped so the resulting polyline doesn't draw zero-lines for
+/// stretches where the agent was offline.
+fn downsample_avg(series: &[(i64, f64)], target: usize) -> Vec<(i64, f64)> {
+    if series.len() <= target {
+        return series.to_vec();
+    }
+    let min_x = series.first().map(|p| p.0).unwrap_or(0);
+    let max_x = series.last().map(|p| p.0).unwrap_or(0);
+    let span = (max_x - min_x).max(1);
+    let bucket_secs = (span as usize) / target.max(1);
+    let bucket_secs = bucket_secs.max(1) as i64;
+
+    let mut buckets: Vec<(i64, f64, usize)> = Vec::with_capacity(target);
+    let mut current_bucket: i64 = -1;
+    for (t, v) in series {
+        let b = (t - min_x) / bucket_secs;
+        if b != current_bucket {
+            buckets.push((*t, *v, 1));
+            current_bucket = b;
+        } else if let Some(last) = buckets.last_mut() {
+            last.1 += *v;
+            last.2 += 1;
+        }
+    }
+    buckets
+        .into_iter()
+        .map(|(t, sum, n)| (t, sum / (n as f64)))
+        .collect()
+}
+
+/// Recent perf-events table — boot/shutdown/sleep degradation, memory
+/// exhaustion, BSODs, unexpected reboots. Empty list → a neutral
+/// "nothing flagged yet" panel so the heading still has a body.
+fn render_perf_events_table(lang: Lang, events: &[PerfEventRow]) -> String {
+    if events.is_empty() {
+        return format!(
+            r##"<div class="rounded-md border border-slate-800 bg-slate-900 p-3 text-xs text-slate-500">
+  {msg}
+</div>"##,
+            msg = t(lang, "devices.perf_events_none"),
+        );
+    }
+    let mut s = format!(
+        r##"<div>
+<h4 class="text-xs uppercase text-slate-500 mb-1">{l_events}</h4>
+<div class="rounded-md border border-slate-800 bg-slate-900 overflow-hidden">
+<table class="w-full text-sm">
+  <thead class="text-xs uppercase text-slate-500 bg-slate-950">
+    <tr>
+      <th class="text-left font-medium px-3 py-2">{c_when}</th>
+      <th class="text-left font-medium px-3 py-2">{c_source}</th>
+      <th class="text-left font-medium px-3 py-2">{c_summary}</th>
+    </tr>
+  </thead>
+  <tbody class="divide-y divide-slate-800">"##,
+        l_events = t(lang, "devices.perf_events_heading"),
+        c_when = t(lang, "devices.perf_events_col_when"),
+        c_source = t(lang, "devices.perf_events_col_source"),
+        c_summary = t(lang, "devices.perf_events_col_summary"),
+    );
+    for ev in events {
+        let when = fmt_unix_utc(ev.at);
+        let (level_cls, _level_label) = match ev.level {
+            1 => ("bg-rose-900/40 text-rose-300 border-rose-800", "critical"),
+            2 => ("bg-rose-900/30 text-rose-300 border-rose-900", "error"),
+            3 => ("bg-amber-900/40 text-amber-300 border-amber-800", "warning"),
+            _ => ("bg-slate-800 text-slate-300 border-slate-700", "info"),
+        };
+        let source_label = match ev.provider.as_str() {
+            "diag-perf" => t(lang, "devices.perf_src_diag_perf"),
+            "res-exh" => t(lang, "devices.perf_src_res_exh"),
+            "system" => t(lang, "devices.perf_src_system"),
+            other => other,
+        };
+        let _ = write!(
+            s,
+            r##"<tr class="hover:bg-slate-800/40 align-top">
+  <td class="px-3 py-2 font-mono text-xs text-slate-300 whitespace-nowrap">{when}</td>
+  <td class="px-3 py-2">
+    <span class="inline-block text-[11px] px-1.5 py-0.5 rounded border {lvl_cls}">{src} · {eid}</span>
+  </td>
+  <td class="px-3 py-2 text-slate-200 text-xs">{summary}</td>
+</tr>"##,
+            when = html_escape(&when),
+            lvl_cls = level_cls,
+            src = html_escape(source_label),
+            eid = ev.event_id,
+            summary = html_escape(&ev.summary),
+        );
+    }
+    s.push_str("</tbody></table></div></div>");
+    s
+}
+
 /// Render the per-device login history table. Empty input → a neutral
 /// "no events yet" panel so the heading still has something under it.
 /// We render the agent-reported `at` in the standard SQLite UTC format
@@ -0,0 +1,183 @@
+//! `POST /api/agent/metrics` — continuous performance time-series the
+//! agent samples at ~1/min. The admin Devices detail page renders this
+//! as a CPU / memory sparkline plus a "current snapshot" card.
+//!
+//! Auth: same per-peer signed-API gate as the other agent endpoints —
+//! see [`crate::api::device_auth`]. Body shape (batched so an agent
+//! that's catching up after a transport outage can land everything in
+//! one POST):
+//!
+//! ```json
+//! {
+//!   "id":   "<peer id>",
+//!   "uuid": "<peer uuid>",
+//!   "samples": [
+//!     {
+//!       "at":           1717920000,
+//!       "cpu_pct":      42.5,
+//!       "mem_used_mb":  7820,
+//!       "mem_total_mb": 16384,
+//!       "proc_count":   341,
+//!       "uptime_secs":  173000,
+//!       "top_cpu_name": "chrome.exe",
+//!       "top_cpu_pct":  18.3,
+//!       "top_mem_name": "chrome.exe",
+//!       "top_mem_mb":   1240
+//!     }
+//!   ]
+//! }
+//! ```
+
+use crate::api::device_auth::{self, AuthOutcome};
+use crate::api::error::ApiError;
+use crate::api::state::AppState;
+use crate::database::MetricsSampleRow;
+use axum::body::Bytes;
+use axum::extract::Extension;
+use axum::http::HeaderMap;
+use serde::Deserialize;
+use std::sync::Arc;
+
+#[derive(Debug, Deserialize)]
+pub struct MetricsSampleIn {
+    pub at: i64,
+    #[serde(default)]
+    pub cpu_pct: f64,
+    #[serde(default)]
+    pub mem_used_mb: i64,
+    #[serde(default)]
+    pub mem_total_mb: i64,
+    #[serde(default)]
+    pub proc_count: i64,
+    #[serde(default)]
+    pub uptime_secs: i64,
+    #[serde(default)]
+    pub top_cpu_name: String,
+    #[serde(default)]
+    pub top_cpu_pct: f64,
+    #[serde(default)]
+    pub top_mem_name: String,
+    #[serde(default)]
+    pub top_mem_mb: i64,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct MetricsBody {
+    pub id: String,
+    pub uuid: String,
+    pub samples: Vec<MetricsSampleIn>,
+}
+
+/// Cap per request. At 60s sampling cadence + the agent's 30-minute
+/// retry-and-drain budget, even a long outage should fit well under this.
+const MAX_SAMPLES_PER_POST: usize = 512;
+
+/// Defensive bound on string fields the agent puts in `top_*_name` — a
+/// runaway process name doesn't get to balloon the DB row.
+const MAX_PROC_NAME_LEN: usize = 128;
+
+pub async fn metrics(
+    Extension(state): Extension<Arc<AppState>>,
+    headers: HeaderMap,
+    body: Bytes,
+) -> Result<String, ApiError> {
+    let outcome =
+        device_auth::verify(&state, "POST", "/api/agent/metrics", &headers, &body).await?;
+
+    let payload: MetricsBody = serde_json::from_slice(&body)
+        .map_err(|_| ApiError::BadRequest("invalid json".into()))?;
+
+    if payload.id.is_empty() || payload.uuid.is_empty() {
+        return Err(ApiError::BadRequest("id and uuid are required".into()));
+    }
+    if payload.samples.is_empty() {
+        return Ok("OK".to_string());
+    }
+    if payload.samples.len() > MAX_SAMPLES_PER_POST {
+        return Err(ApiError::BadRequest(format!(
+            "too many samples in one POST (max {MAX_SAMPLES_PER_POST})"
+        )));
+    }
+
+    let id = match outcome {
+        AuthOutcome::Verified { id: signed_id } => {
+            if payload.id != signed_id {
+                return Err(ApiError::Unauthorized);
+            }
+            signed_id
+        }
+        AuthOutcome::LegacyUnsigned => {
+            device_auth::enforce_managed_for_id(&state, &payload.id).await?;
+            payload.id.clone()
+        }
+    };
+
+    let peer = state
+        .db
+        .get_peer(&id)
+        .await
+        .map_err(|e| ApiError::Internal(e.to_string()))?;
+    if peer.is_none() {
+        return Ok("ID_NOT_FOUND".to_string());
+    }
+
+    let mut accepted = 0usize;
+    for s in &payload.samples {
+        // Sanity-clamp the floats and string lengths. The agent should
+        // produce well-formed values, but the public-API shape means
+        // garbage-in shouldn't propagate to garbage-on-screen.
+        let cpu_pct = clamp_pct(s.cpu_pct);
+        let top_cpu_pct = clamp_pct(s.top_cpu_pct);
+        let row = MetricsSampleRow {
+            at: s.at,
+            cpu_pct,
+            mem_used_mb: s.mem_used_mb.max(0),
+            mem_total_mb: s.mem_total_mb.max(0),
+            proc_count: s.proc_count.max(0),
+            uptime_secs: s.uptime_secs.max(0),
+            top_cpu_name: truncate(&s.top_cpu_name, MAX_PROC_NAME_LEN),
+            top_cpu_pct,
+            top_mem_name: truncate(&s.top_mem_name, MAX_PROC_NAME_LEN),
+            top_mem_mb: s.top_mem_mb.max(0),
+        };
+        if let Err(e) = state
+            .db
+            .metrics_sample_insert(&id, &payload.uuid, &row)
+            .await
+        {
+            hbb_common::log::warn!(
+                "metrics_sample_insert for peer {} failed: {}",
+                id,
+                e
+            );
+            continue;
+        }
+        accepted += 1;
+    }
+
+    hbb_common::log::debug!(
+        "metrics: peer={} accepted={}/{}",
+        id,
+        accepted,
+        payload.samples.len()
+    );
+    Ok("OK".to_string())
+}
+
+fn clamp_pct(v: f64) -> f64 {
+    if v.is_nan() {
+        0.0
+    } else {
+        v.clamp(0.0, 100.0)
+    }
+}
+
+/// Char-aware truncate (so we don't slice mid-multibyte). The cap is
+/// generous so process names that include arguments or Unicode survive.
+fn truncate(s: &str, max_chars: usize) -> String {
+    if s.chars().count() <= max_chars {
+        s.to_string()
+    } else {
+        s.chars().take(max_chars).collect()
+    }
+}
@@ -16,6 +16,8 @@ pub mod groups;
 pub mod heartbeat;
 pub mod http_proxy;
 pub mod login_event;
+pub mod metrics;
+pub mod perf_events;
 pub mod middleware;
 pub mod oidc;
 pub mod pagination;
@@ -53,6 +55,8 @@ pub fn router(state: Arc<AppState>) -> Router {
        .route("/api/sysinfo", post(sysinfo::sysinfo))
        .route("/api/agent/exec-result", post(agent_exec::exec_result))
        .route("/api/agent/login-event", post(login_event::login_event))
+        .route("/api/agent/metrics", post(metrics::metrics))
+        .route("/api/agent/perf-events", post(perf_events::perf_events))
        .route(
            "/api/unattended-password",
            post(unattended::unattended_password),
@@ -0,0 +1,141 @@
+//! `POST /api/agent/perf-events` — performance-related Windows event log
+//! entries the agent surfaced from `Microsoft-Windows-Diagnostics-
+//! Performance/Operational`, `Microsoft-Windows-Resource-Exhaustion-
+//! Detector/Operational`, and a few hand-picked IDs from `System`
+//! (unexpected reboots, BSODs, dirty shutdowns). The admin UI shows
+//! the recent ones in the device's Performance section.
+//!
+//! Auth: same per-peer signed-API gate as the other agent endpoints.
+//! Server-side dedup is via the UNIQUE (peer_id, provider, record_id)
+//! index — the agent persists a per-channel cursor to disk, but a
+//! restart that loses the cursor can safely re-emit overlapping ranges.
+
+use crate::api::device_auth::{self, AuthOutcome};
+use crate::api::error::ApiError;
+use crate::api::state::AppState;
+use crate::database::PerfEventRow;
+use axum::body::Bytes;
+use axum::extract::Extension;
+use axum::http::HeaderMap;
+use serde::Deserialize;
+use std::sync::Arc;
+
+#[derive(Debug, Deserialize)]
+pub struct PerfEventIn {
+    pub at: i64,
+    pub provider: String,
+    pub event_id: i64,
+    #[serde(default = "default_level")]
+    pub level: i64,
+    #[serde(default)]
+    pub record_id: i64,
+    #[serde(default)]
+    pub summary: String,
+    #[serde(default)]
+    pub detail_json: String,
+}
+
+fn default_level() -> i64 {
+    4 // WEL "Information"
+}
+
+#[derive(Debug, Deserialize)]
+pub struct PerfEventsBody {
+    pub id: String,
+    pub uuid: String,
+    pub events: Vec<PerfEventIn>,
+}
+
+const MAX_EVENTS_PER_POST: usize = 128;
+const MAX_PROVIDER_LEN: usize = 64;
+const MAX_SUMMARY_LEN: usize = 512;
+const MAX_DETAIL_LEN: usize = 8 * 1024;
+
+pub async fn perf_events(
+    Extension(state): Extension<Arc<AppState>>,
+    headers: HeaderMap,
+    body: Bytes,
+) -> Result<String, ApiError> {
+    let outcome =
+        device_auth::verify(&state, "POST", "/api/agent/perf-events", &headers, &body).await?;
+
+    let payload: PerfEventsBody = serde_json::from_slice(&body)
+        .map_err(|_| ApiError::BadRequest("invalid json".into()))?;
+
+    if payload.id.is_empty() || payload.uuid.is_empty() {
+        return Err(ApiError::BadRequest("id and uuid are required".into()));
+    }
+    if payload.events.is_empty() {
+        return Ok("OK".to_string());
+    }
+    if payload.events.len() > MAX_EVENTS_PER_POST {
+        return Err(ApiError::BadRequest(format!(
+            "too many events in one POST (max {MAX_EVENTS_PER_POST})"
+        )));
+    }
+
+    let id = match outcome {
+        AuthOutcome::Verified { id: signed_id } => {
+            if payload.id != signed_id {
+                return Err(ApiError::Unauthorized);
+            }
+            signed_id
+        }
+        AuthOutcome::LegacyUnsigned => {
+            device_auth::enforce_managed_for_id(&state, &payload.id).await?;
+            payload.id.clone()
+        }
+    };
+
+    let peer = state
+        .db
+        .get_peer(&id)
+        .await
+        .map_err(|e| ApiError::Internal(e.to_string()))?;
+    if peer.is_none() {
+        return Ok("ID_NOT_FOUND".to_string());
+    }
+
+    let mut accepted = 0usize;
+    for e in &payload.events {
+        let provider = e.provider.trim();
+        if provider.is_empty() {
+            continue;
+        }
+        let row = PerfEventRow {
+            at: e.at,
+            provider: truncate(provider, MAX_PROVIDER_LEN),
+            event_id: e.event_id,
+            level: e.level,
+            record_id: e.record_id,
+            summary: truncate(&e.summary, MAX_SUMMARY_LEN),
+            detail_json: truncate(&e.detail_json, MAX_DETAIL_LEN),
+            received_at: 0, // server fills via DEFAULT on INSERT
+        };
+        if let Err(err) = state.db.perf_event_insert(&id, &payload.uuid, &row).await {
+            hbb_common::log::warn!(
+                "perf_event_insert for peer {} failed: {}",
+                id,
+                err
+            );
+            continue;
+        }
+        accepted += 1;
+    }
+
+    hbb_common::log::debug!(
+        "perf-events: peer={} accepted={}/{}",
+        id,
+        accepted,
+        payload.events.len()
+    );
+    Ok("OK".to_string())
+}
+
+fn truncate(s: &str, max_chars: usize) -> String {
+    if s.chars().count() <= max_chars {
+        s.to_string()
+    } else {
+        s.chars().take(max_chars).collect()
+    }
+}
@@ -260,6 +260,22 @@ pub struct ExecQueued {
    pub script: String,
 }

+fn metrics_sample_row_from(r: sqlx::sqlite::SqliteRow) -> MetricsSampleRow {
+    use sqlx::Row;
+    MetricsSampleRow {
+        at: r.try_get("at").unwrap_or(0),
+        cpu_pct: r.try_get("cpu_pct").unwrap_or(0.0),
+        mem_used_mb: r.try_get("mem_used_mb").unwrap_or(0),
+        mem_total_mb: r.try_get("mem_total_mb").unwrap_or(0),
+        proc_count: r.try_get("proc_count").unwrap_or(0),
+        uptime_secs: r.try_get("uptime_secs").unwrap_or(0),
+        top_cpu_name: r.try_get("top_cpu_name").unwrap_or_default(),
+        top_cpu_pct: r.try_get("top_cpu_pct").unwrap_or(0.0),
+        top_mem_name: r.try_get("top_mem_name").unwrap_or_default(),
+        top_mem_mb: r.try_get("top_mem_mb").unwrap_or(0),
+    }
+}
+
 fn exec_row_from(r: sqlx::sqlite::SqliteRow) -> ExecRow {
    use sqlx::Row;
    ExecRow {
@@ -284,6 +300,43 @@ fn exec_row_from(r: sqlx::sqlite::SqliteRow) -> ExecRow {
 /// `received_at` is when the row landed in the DB. `kind` is currently
 /// `"logon"` or `"logoff"` — anything else is treated as an unknown kind
 /// by the renderer and shown verbatim.
+/// One agent-reported continuous-metrics sample (1/min cadence).
+/// `cpu_pct` is the overall system CPU% averaged over the sample window;
+/// `top_cpu_*` / `top_mem_*` are the heaviest single process at the
+/// moment of capture — enough to answer "what was eating the CPU at
+/// 2 AM" without storing a full process tree on every row.
+#[derive(Debug, Clone, Default)]
+pub struct MetricsSampleRow {
+    pub at: i64,
+    pub cpu_pct: f64,
+    pub mem_used_mb: i64,
+    pub mem_total_mb: i64,
+    pub proc_count: i64,
+    pub uptime_secs: i64,
+    pub top_cpu_name: String,
+    pub top_cpu_pct: f64,
+    pub top_mem_name: String,
+    pub top_mem_mb: i64,
+}
+
+/// One row from `device_perf_events`. `provider` is the event-log
+/// channel short-name (`diag-perf`, `res-exh`, `system`); `event_id` is
+/// the numeric Windows event id. `summary` is a UI-ready short string
+/// the agent prepared (we don't re-localize Windows event messages
+/// server-side); `detail_json` carries the structured payload for the
+/// detail-row expansion.
+#[derive(Debug, Clone, Default)]
+pub struct PerfEventRow {
+    pub at: i64,
+    pub provider: String,
+    pub event_id: i64,
+    pub level: i64,
+    pub record_id: i64,
+    pub summary: String,
+    pub detail_json: String,
+    pub received_at: i64,
+}
+
 #[derive(Debug, Clone, Default)]
 pub struct LoginEventRow {
    pub at: i64,
@@ -472,6 +525,12 @@ impl Database {
                .execute(self.pool.get().await?.deref_mut())
                .await?;
        }
+        // M8 schema: agent-reported performance time-series + perf events.
+        for stmt in M8_SCHEMA {
+            sqlx::query(stmt)
+                .execute(self.pool.get().await?.deref_mut())
+                .await?;
+        }
        // Soft-ALTERs run after schema creation. SQLite < 3.35 lacks
        // `ADD COLUMN IF NOT EXISTS`; swallow the duplicate-column error
        // so re-runs are idempotent. Newly-added soft alters get appended
@@ -3517,6 +3576,141 @@ impl Database {
        Ok(())
    }

+    // ────────────────────── device_metrics_samples (M8) ────────────────────
+    //
+    // Time-series performance samples reported by the agent. INSERT OR
+    // IGNORE keeps the on-the-wire retry path idempotent.
+
+    pub async fn metrics_sample_insert(
+        &self,
+        peer_id: &str,
+        peer_uuid: &str,
+        s: &MetricsSampleRow,
+    ) -> ResultType<()> {
+        sqlx::query(
+            "insert or ignore into device_metrics_samples \
+                (peer_id, peer_uuid, at, cpu_pct, mem_used_mb, mem_total_mb, \
+                 proc_count, uptime_secs, top_cpu_name, top_cpu_pct, \
+                 top_mem_name, top_mem_mb) \
+             values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+        )
+        .bind(peer_id)
+        .bind(peer_uuid)
+        .bind(s.at)
+        .bind(s.cpu_pct)
+        .bind(s.mem_used_mb)
+        .bind(s.mem_total_mb)
+        .bind(s.proc_count)
+        .bind(s.uptime_secs)
+        .bind(&s.top_cpu_name)
+        .bind(s.top_cpu_pct)
+        .bind(&s.top_mem_name)
+        .bind(s.top_mem_mb)
+        .execute(self.pool.get().await?.deref_mut())
+        .await?;
+        Ok(())
+    }
+
+    /// Range query — used by the sparkline renderer. `since_at` is a
+    /// unix epoch second; rows are returned oldest-first because the
+    /// chart polyline draws left-to-right.
+    pub async fn metrics_samples_since(
+        &self,
+        peer_id: &str,
+        since_at: i64,
+    ) -> ResultType<Vec<MetricsSampleRow>> {
+        let rows = sqlx::query(
+            "select at, cpu_pct, mem_used_mb, mem_total_mb, proc_count, \
+                    uptime_secs, top_cpu_name, top_cpu_pct, top_mem_name, top_mem_mb \
+             from device_metrics_samples \
+             where peer_id = ? and at >= ? \
+             order by at asc",
+        )
+        .bind(peer_id)
+        .bind(since_at)
+        .fetch_all(self.pool.get().await?.deref_mut())
+        .await?;
+        Ok(rows.into_iter().map(metrics_sample_row_from).collect())
+    }
+
+    /// "What's happening right now" — the most recent sample, for the
+    /// snapshot card on the device detail page. None if the agent has
+    /// never reported.
+    pub async fn metrics_latest(
+        &self,
+        peer_id: &str,
+    ) -> ResultType<Option<MetricsSampleRow>> {
+        let row = sqlx::query(
+            "select at, cpu_pct, mem_used_mb, mem_total_mb, proc_count, \
+                    uptime_secs, top_cpu_name, top_cpu_pct, top_mem_name, top_mem_mb \
+             from device_metrics_samples \
+             where peer_id = ? \
+             order by at desc limit 1",
+        )
+        .bind(peer_id)
+        .fetch_optional(self.pool.get().await?.deref_mut())
+        .await?;
+        Ok(row.map(metrics_sample_row_from))
+    }
+
+    // ─────────────────────── device_perf_events (M8) ────────────────────────
+
+    pub async fn perf_event_insert(
+        &self,
+        peer_id: &str,
+        peer_uuid: &str,
+        e: &PerfEventRow,
+    ) -> ResultType<()> {
+        sqlx::query(
+            "insert or ignore into device_perf_events \
+                (peer_id, peer_uuid, at, provider, event_id, level, record_id, \
+                 summary, detail_json) \
+             values (?, ?, ?, ?, ?, ?, ?, ?, ?)",
+        )
+        .bind(peer_id)
+        .bind(peer_uuid)
+        .bind(e.at)
+        .bind(&e.provider)
+        .bind(e.event_id)
+        .bind(e.level)
+        .bind(e.record_id)
+        .bind(&e.summary)
+        .bind(&e.detail_json)
+        .execute(self.pool.get().await?.deref_mut())
+        .await?;
+        Ok(())
+    }
+
+    pub async fn perf_events_for_peer(
+        &self,
+        peer_id: &str,
+        limit: i64,
+    ) -> ResultType<Vec<PerfEventRow>> {
+        let rows = sqlx::query(
+            "select at, provider, event_id, level, record_id, summary, detail_json, received_at \
+             from device_perf_events \
+             where peer_id = ? \
+             order by at desc, id desc limit ?",
+        )
+        .bind(peer_id)
+        .bind(limit)
+        .fetch_all(self.pool.get().await?.deref_mut())
+        .await?;
+        Ok(rows
+            .into_iter()
+            .map(|r| PerfEventRow {
+                at: r.try_get("at").unwrap_or(0),
+                provider: r.try_get("provider").unwrap_or_default(),
+                event_id: r.try_get("event_id").unwrap_or(0),
+                level: r.try_get("level").unwrap_or(4),
+                record_id: r.try_get("record_id").unwrap_or(0),
+                summary: r.try_get("summary").unwrap_or_default(),
+                detail_json: r.try_get("detail_json").unwrap_or_default(),
+                received_at: r.try_get("received_at").unwrap_or(0),
+            })
+            .collect())
+    }
+
    /// Most-recent-first list for the device detail page.
    pub async fn login_events_for_peer(
        &self,
@@ -3989,6 +4183,65 @@ const M7_SCHEMA: &[&str] = &[
        ON device_login_events(peer_id, kind, session_id, at, username)",
 ];

+/// M8: agent-reported performance data — both continuous time-series
+/// samples (1/min) and episodic events the OS itself flagged in the
+/// `Microsoft-Windows-Diagnostics-Performance` / `Resource-Exhaustion-
+/// Detector` / `System` event logs. Sized for "supporter confirms a
+/// user's slow-laptop complaint":
+///
+/// * `device_metrics_samples` — overall CPU%, memory used/total, top CPU
+///   / memory process. 60-second cadence, ~1440 rows/device/day,
+///   ~10K/week. The UI plots a 24 h sparkline off this.
+/// * `device_perf_events` — boot / shutdown / sleep degradation, memory
+///   exhaustion, unexpected reboots, BSODs. Sparse — one device might
+///   produce a handful per week. UI shows the most recent on the detail
+///   page.
+///
+/// Both tables carry a UNIQUE INDEX that pairs with `INSERT OR IGNORE`,
+/// so an agent that re-POSTs after a transient transport failure (or
+/// across a restart that loses its in-memory queue) doesn't pile up
+/// duplicates. Retention isn't enforced server-side yet — rows
+/// accumulate, same model as `audit_conn` / `exec_history`.
+const M8_SCHEMA: &[&str] = &[
+    "CREATE TABLE IF NOT EXISTS device_metrics_samples (
+        id              INTEGER PRIMARY KEY AUTOINCREMENT,
+        peer_id         TEXT    NOT NULL,
+        peer_uuid       TEXT    NOT NULL,
+        at              INTEGER NOT NULL,
+        cpu_pct         REAL    NOT NULL DEFAULT 0,
+        mem_used_mb     INTEGER NOT NULL DEFAULT 0,
+        mem_total_mb    INTEGER NOT NULL DEFAULT 0,
+        proc_count      INTEGER NOT NULL DEFAULT 0,
+        uptime_secs     INTEGER NOT NULL DEFAULT 0,
+        top_cpu_name    TEXT    NOT NULL DEFAULT '',
+        top_cpu_pct     REAL    NOT NULL DEFAULT 0,
+        top_mem_name    TEXT    NOT NULL DEFAULT '',
+        top_mem_mb      INTEGER NOT NULL DEFAULT 0,
+        received_at     INTEGER NOT NULL DEFAULT (strftime('%s','now'))
+    )",
+    "CREATE INDEX IF NOT EXISTS idx_dms_peer_at \
+        ON device_metrics_samples(peer_id, at DESC)",
+    "CREATE UNIQUE INDEX IF NOT EXISTS uq_dms \
+        ON device_metrics_samples(peer_id, at)",
+    "CREATE TABLE IF NOT EXISTS device_perf_events (
+        id              INTEGER PRIMARY KEY AUTOINCREMENT,
+        peer_id         TEXT    NOT NULL,
+        peer_uuid       TEXT    NOT NULL,
+        at              INTEGER NOT NULL,
+        provider        TEXT    NOT NULL,
+        event_id        INTEGER NOT NULL,
+        level           INTEGER NOT NULL DEFAULT 4,
+        record_id       INTEGER NOT NULL DEFAULT 0,
+        summary         TEXT    NOT NULL DEFAULT '',
+        detail_json     TEXT    NOT NULL DEFAULT '',
+        received_at     INTEGER NOT NULL DEFAULT (strftime('%s','now'))
+    )",
+    "CREATE INDEX IF NOT EXISTS idx_dpe_peer_at \
+        ON device_perf_events(peer_id, at DESC)",
+    "CREATE UNIQUE INDEX IF NOT EXISTS uq_dpe \
+        ON device_perf_events(peer_id, provider, record_id)",
+];
+
 #[cfg(test)]
 mod tests {
    use hbb_common::tokio;