//! `POST /api/agent/metrics` — continuous performance time-series the
//! agent samples at ~1/min. The admin Devices detail page renders this
//! as a CPU / memory sparkline plus a "current snapshot" card.
//!
//! Auth: same per-peer signed-API gate as the other agent endpoints —
//! see [`crate::api::device_auth`]. Body shape (batched so an agent
//! that's catching up after a transport outage can land everything in
//! one POST):
//!
//! ```json
//! {
//!   "id": "<peer id>",
//!   "uuid": "<peer uuid>",
//!   "samples": [
//!     {
//!       "at": 1717920000,
//!       "cpu_pct": 42.5,
//!       "mem_used_mb": 7820,
//!       "mem_total_mb": 16384,
//!       "proc_count": 341,
//!       "uptime_secs": 173000,
//!       "top_cpu_name": "chrome.exe",
//!       "top_cpu_pct": 18.3,
//!       "top_mem_name": "chrome.exe",
//!       "top_mem_mb": 1240
//!     }
//!   ]
//! }
//! ```

use crate::api::device_auth::{self, AuthOutcome};
|
|
use crate::api::error::ApiError;
|
|
use crate::api::state::AppState;
|
|
use crate::database::MetricsSampleRow;
|
|
use axum::body::Bytes;
|
|
use axum::extract::Extension;
|
|
use axum::http::HeaderMap;
|
|
use serde::Deserialize;
|
|
use std::sync::Arc;
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
pub struct MetricsSampleIn {
|
|
pub at: i64,
|
|
#[serde(default)]
|
|
pub cpu_pct: f64,
|
|
#[serde(default)]
|
|
pub mem_used_mb: i64,
|
|
#[serde(default)]
|
|
pub mem_total_mb: i64,
|
|
#[serde(default)]
|
|
pub proc_count: i64,
|
|
#[serde(default)]
|
|
pub uptime_secs: i64,
|
|
#[serde(default)]
|
|
pub top_cpu_name: String,
|
|
#[serde(default)]
|
|
pub top_cpu_pct: f64,
|
|
#[serde(default)]
|
|
pub top_mem_name: String,
|
|
#[serde(default)]
|
|
pub top_mem_mb: i64,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
pub struct MetricsBody {
|
|
pub id: String,
|
|
pub uuid: String,
|
|
pub samples: Vec<MetricsSampleIn>,
|
|
}
|
|
|
|
/// Maximum number of samples accepted in a single POST; larger batches are
/// rejected with a bad-request error.
///
/// At 60s sampling cadence + the agent's 30-minute retry-and-drain budget,
/// even a long outage should fit well under this (512 samples at one per
/// minute is roughly eight and a half hours of data).
const MAX_SAMPLES_PER_POST: usize = 512;
|
|
|
|
/// Defensive bound on string fields the agent puts in `top_*_name` — a
/// runaway process name doesn't get to balloon the DB row.
///
/// The limit is counted in `char`s (not bytes), matching how `truncate`
/// applies it.
const MAX_PROC_NAME_LEN: usize = 128;
|
|
|
|
pub async fn metrics(
|
|
Extension(state): Extension<Arc<AppState>>,
|
|
headers: HeaderMap,
|
|
body: Bytes,
|
|
) -> Result<String, ApiError> {
|
|
let outcome =
|
|
device_auth::verify(&state, "POST", "/api/agent/metrics", &headers, &body).await?;
|
|
|
|
let payload: MetricsBody = serde_json::from_slice(&body)
|
|
.map_err(|_| ApiError::BadRequest("invalid json".into()))?;
|
|
|
|
if payload.id.is_empty() || payload.uuid.is_empty() {
|
|
return Err(ApiError::BadRequest("id and uuid are required".into()));
|
|
}
|
|
if payload.samples.is_empty() {
|
|
return Ok("OK".to_string());
|
|
}
|
|
if payload.samples.len() > MAX_SAMPLES_PER_POST {
|
|
return Err(ApiError::BadRequest(format!(
|
|
"too many samples in one POST (max {MAX_SAMPLES_PER_POST})"
|
|
)));
|
|
}
|
|
|
|
let id = match outcome {
|
|
AuthOutcome::Verified { id: signed_id } => {
|
|
if payload.id != signed_id {
|
|
return Err(ApiError::Unauthorized);
|
|
}
|
|
signed_id
|
|
}
|
|
AuthOutcome::LegacyUnsigned => {
|
|
device_auth::enforce_managed_for_id(&state, &payload.id).await?;
|
|
payload.id.clone()
|
|
}
|
|
};
|
|
|
|
let peer = state
|
|
.db
|
|
.get_peer(&id)
|
|
.await
|
|
.map_err(|e| ApiError::Internal(e.to_string()))?;
|
|
if peer.is_none() {
|
|
return Ok("ID_NOT_FOUND".to_string());
|
|
}
|
|
|
|
let mut accepted = 0usize;
|
|
for s in &payload.samples {
|
|
// Sanity-clamp the floats and string lengths. The agent should
|
|
// produce well-formed values, but the public-API shape means
|
|
// garbage-in shouldn't propagate to garbage-on-screen.
|
|
let cpu_pct = clamp_pct(s.cpu_pct);
|
|
let top_cpu_pct = clamp_pct(s.top_cpu_pct);
|
|
let row = MetricsSampleRow {
|
|
at: s.at,
|
|
cpu_pct,
|
|
mem_used_mb: s.mem_used_mb.max(0),
|
|
mem_total_mb: s.mem_total_mb.max(0),
|
|
proc_count: s.proc_count.max(0),
|
|
uptime_secs: s.uptime_secs.max(0),
|
|
top_cpu_name: truncate(&s.top_cpu_name, MAX_PROC_NAME_LEN),
|
|
top_cpu_pct,
|
|
top_mem_name: truncate(&s.top_mem_name, MAX_PROC_NAME_LEN),
|
|
top_mem_mb: s.top_mem_mb.max(0),
|
|
};
|
|
if let Err(e) = state
|
|
.db
|
|
.metrics_sample_insert(&id, &payload.uuid, &row)
|
|
.await
|
|
{
|
|
hbb_common::log::warn!(
|
|
"metrics_sample_insert for peer {} failed: {}",
|
|
id,
|
|
e
|
|
);
|
|
continue;
|
|
}
|
|
accepted += 1;
|
|
}
|
|
|
|
hbb_common::log::debug!(
|
|
"metrics: peer={} accepted={}/{}",
|
|
id,
|
|
accepted,
|
|
payload.samples.len()
|
|
);
|
|
Ok("OK".to_string())
|
|
}
|
|
|
|
/// Normalises a percentage into the closed range `0.0..=100.0`.
///
/// `NaN` becomes `0.0`; values below `0.0` (including negative infinity)
/// become `0.0`; values above `100.0` (including positive infinity) become
/// `100.0`. In-range values are returned unchanged.
fn clamp_pct(v: f64) -> f64 {
    if v.is_nan() {
        // `f64::clamp` returns NaN for a NaN input, so handle it up front.
        0.0
    } else {
        v.clamp(0.0, 100.0)
    }
}
|
|
|
|
/// Returns `s` limited to at most `max_chars` characters.
///
/// The limit counts `char`s rather than bytes, so the cut never lands in the
/// middle of a multi-byte UTF-8 sequence. Strings that already fit are
/// returned unchanged (as an owned copy).
fn truncate(s: &str, max_chars: usize) -> String {
    // The byte offset of the character at index `max_chars` — if there is
    // one — is exactly where the kept prefix ends. A single pass finds it,
    // instead of counting every character and then iterating again.
    match s.char_indices().nth(max_chars) {
        Some((cut, _)) => s[..cut].to_string(),
        None => s.to_string(),
    }
}
|