Implement auto-update routine

2026-05-21 13:34:02 +02:00
parent d10e547b70
commit e45abbe64d
10 changed files with 654 additions and 201 deletions
@@ -3,7 +3,7 @@
 // Three responsibilities:
 //
 // 1. `install()` — copy the binary to %ProgramFiles%\hello-agent, mirror the
-//    calling user's `HelloAgent.toml` into the LocalService-effective
+//    calling user's `hello-agent.toml` into the LocalService-effective
 //    config dir so the SYSTEM service inherits the --config blob, register
 //    the service with the SCM pointing at the installed exe, and start it.
 //    Idempotent.
@@ -29,14 +29,21 @@ use std::sync::Arc;
 use std::time::{Duration, Instant};

 use windows_service::service::{
-    ServiceAccess, ServiceControl, ServiceControlAccept, ServiceErrorControl, ServiceExitCode,
+    ServiceAccess, ServiceAction, ServiceActionType, ServiceControl, ServiceControlAccept,
+    ServiceErrorControl, ServiceExitCode, ServiceFailureActions, ServiceFailureResetPeriod,
    ServiceInfo, ServiceStartType, ServiceState, ServiceStatus, ServiceType,
 };
 use windows_service::service_control_handler::{self, ServiceControlHandlerResult};
 use windows_service::service_dispatcher;
 use windows_service::service_manager::{ServiceManager, ServiceManagerAccess};

-const SERVICE_NAME: &str = "HelloAgent";
+/// Internal service name registered with the SCM. Must equal `crate::APP_NAME`
+/// because upstream `librustdesk::platform::is_self_service_running` queries
+/// `is_service_running(&crate::get_app_name())` — i.e. it looks up the
+/// service whose name *is* the app name. If these diverge, the `--update`
+/// path's `sc stop` / `sc start` use the wrong name and the service is
+/// left in a Stopped state after a self-update.
+const SERVICE_NAME: &str = crate::APP_NAME;
 const DISPLAY_NAME: &str = "HelloAgent Remote Support";
 const SERVICE_DESCRIPTION: &str =
    "HelloAgent — headless remote-support agent (RustDesk-protocol-compatible). \
@@ -47,6 +54,11 @@ const SERVICE_TYPE: ServiceType = ServiceType::OWN_PROCESS;
 const INSTALL_SUBDIR: &str = "hello-agent";
 const INSTALLED_EXE_NAME: &str = "hello-agent.exe";

+/// Display name used for the Windows Firewall rule. Stable across versions
+/// so `--uninstall` (or a re-install that clears it before re-adding) can
+/// find and delete the prior entry by name.
+const FIREWALL_RULE_NAME: &str = "HelloAgent";
+
 // ----------------------------- paths ---------------------------------------

 /// `%ProgramFiles%\hello-agent`. Falls back to `C:\Program Files\hello-agent`
@@ -68,9 +80,9 @@ fn install_dir() -> PathBuf {
 /// Note the trailing `config` segment: `directories_next::ProjectDirs`,
 /// which hbb_common uses on Windows, appends a literal `\config` to the
 /// app's roaming dir (so the user-side path is
-/// `%APPDATA%\HelloAgent\config\HelloAgent.toml`, not
-/// `…\HelloAgent\…`). The SYSTEM-side path follows the same convention.
-/// The `HelloAgent` segment is sourced from `crate::APP_NAME` so it stays
+/// `%APPDATA%\hello-agent\config\hello-agent.toml`, not
+/// `…\hello-agent\…`). The SYSTEM-side path follows the same convention.
+/// The `hello-agent` segment is sourced from `crate::APP_NAME` so it stays
 /// in lockstep with the `APP_NAME` we install into hbb_common at startup.
 fn service_config_dir() -> PathBuf {
    let system_root = std::env::var_os("SystemRoot")
@@ -88,11 +100,15 @@ fn service_config_dir() -> PathBuf {
 // ----------------------------- install --------------------------------------

 pub fn install() -> Result<()> {
+    // Probe-open the SCM with CREATE_SERVICE rights up front; if the caller
+    // isn't elevated this fails with ERROR_ACCESS_DENIED (raw_os_error == 5)
+    // and we surface a single human-readable message instead of bubbling
+    // up a Win32 errno string. Anything else propagates as-is.
    let scm = ServiceManager::local_computer(
        None::<&str>,
        ServiceManagerAccess::CONNECT | ServiceManagerAccess::CREATE_SERVICE,
    )
-    .context("open SCM")?;
+    .map_err(map_scm_open_error)?;

    // 1. If a previous install left a running service, stop it before we
    //    overwrite its binary. Otherwise the file copy in step 2 fails
@@ -106,8 +122,8 @@ pub fn install() -> Result<()> {
    //     idempotent / usable as an in-place update — without it, the
    //     `stage_binary` file copy below fails with "access denied"
    //     whenever a `--cm` child is still holding the old exe open.
-    //     `kill_orphan_processes` uses taskkill with `/FI "PID ne <ours>"`
-    //     so it never kills the running installer.
+    //     `kill_orphan_processes` walks the process table via sysinfo and
+    //     filters out our own pid so the installer doesn't suicide.
    kill_orphan_processes();

    // 2. Pin the binary to %ProgramFiles%\hello-agent. The user might be
@@ -120,17 +136,17 @@ pub fn install() -> Result<()> {
    //    first, fall back to popup). Older hello-agent installs wrote
    //    "click" here, which disabled the password path; clearing it
    //    every install makes upgrades idempotent. These write into the
-    //    *calling user's* %APPDATA%\HelloAgent\ — we mirror the result
+    //    *calling user's* %APPDATA%\hello-agent\ — we mirror the result
    //    into the service's effective dir in step 4.
    hbb_common::config::Config::set_option("stop-service".into(), "".into());
    hbb_common::config::Config::set_option("approve-mode".into(), "".into());

-    // 4. Mirror the calling user's `HelloAgent.toml` / `HelloAgent2.toml`
+    // 4. Mirror the calling user's `hello-agent.toml` / `hello-agent2.toml`
    //    into the LocalService-effective config root that the SYSTEM
    //    service will actually read. Without this, --config writes to e.g.
-    //    C:\Users\Admin\AppData\Roaming\HelloAgent\, but the service runs
+    //    C:\Users\Admin\AppData\Roaming\hello-agent\, but the service runs
    //    as LocalSystem and (via hbb_common's `patch()`) reads from
-    //    C:\Windows\ServiceProfiles\LocalService\AppData\Roaming\HelloAgent\.
+    //    C:\Windows\ServiceProfiles\LocalService\AppData\Roaming\hello-agent\.
    if let Err(e) = mirror_config_to_service_dir() {
        log::warn!(
            "could not mirror config to service dir ({e:#}); the service may not see --config until first heartbeat"
@@ -183,6 +199,60 @@ pub fn install() -> Result<()> {

    let _ = svc.set_description(SERVICE_DESCRIPTION);

+    // 5b. Configure SCM auto-restart on unexpected exit. Without this,
+    //     a panic in the `--service` supervisor leaves the agent permanently
+    //     Stopped until the host reboots. The schedule restarts after
+    //     5s, then 30s, then 60s. The SCM repeats the last configured
+    //     action for every further failure, so later failures keep
+    //     restarting at the 60s delay rather than giving up. The
+    //     failure-count reset window is one day, so transient hiccups
+    //     don't accumulate and stable hosts converge back to "running"
+    //     within a minute.
+    //
+    //     `set_failure_actions_on_non_crash_failures(true)` is what makes
+    //     these actions also fire when the service reports SERVICE_STOPPED
+    //     with a non-zero exit code, not just when the process dies without
+    //     reporting a stop (an outright crash). Both calls are best-effort:
+    //     if the underlying ChangeServiceConfig2 call fails, we log a
+    //     warning and continue with the install.
+    let failure_actions = ServiceFailureActions {
+        reset_period: ServiceFailureResetPeriod::After(Duration::from_secs(60 * 60 * 24)),
+        reboot_msg: None,
+        command: None,
+        actions: Some(vec![
+            ServiceAction {
+                action_type: ServiceActionType::Restart,
+                delay: Duration::from_secs(5),
+            },
+            ServiceAction {
+                action_type: ServiceActionType::Restart,
+                delay: Duration::from_secs(30),
+            },
+            ServiceAction {
+                action_type: ServiceActionType::Restart,
+                delay: Duration::from_secs(60),
+            },
+        ]),
+    };
+    if let Err(e) = svc.update_failure_actions(failure_actions) {
+        log::warn!("could not set SCM failure actions ({e}); auto-restart-on-crash disabled");
+    }
+    if let Err(e) = svc.set_failure_actions_on_non_crash_failures(true) {
+        log::warn!(
+            "could not enable failure actions for clean-exit-with-error ({e}); only hard crashes will trigger restart"
+        );
+    }
+
+    // 5c. Allow inbound TCP/UDP to hello-agent.exe at the Windows Firewall.
+    //     A vanilla deploy doesn't actually need it (the rendezvous/relay
+    //     connections are outbound), but operators who enable `direct-server`
+    //     (TCP 21118) or `enable-lan-discovery` (UDP 21119) via the --config
+    //     blob need this rule or those features silently fail. Cheaper to
+    //     add it always than to discover at support-call time that the
+    //     deploy never matched a firewall rule. Best-effort: if netsh
+    //     isn't present (extremely stripped-down server SKUs) we log and
+    //     continue.
+    if let Err(e) = install_firewall_rule(&target_exe) {
+        log::warn!("could not install firewall rule ({e:#}); inbound connections may be blocked");
+    }
+
    // 6. Start the service. (Step 1 already stopped any prior instance.)
    svc.start::<&str>(&[]).context("start service")?;
    log::info!(
@@ -250,7 +320,7 @@ fn stage_binary() -> Result<PathBuf> {
    Ok(dest)
 }

-/// Copy the calling user's `HelloAgent.toml` + `HelloAgent2.toml` into
+/// Copy the calling user's `hello-agent.toml` + `hello-agent2.toml` into
 /// the LocalService-effective config dir so the SYSTEM service sees them.
 fn mirror_config_to_service_dir() -> Result<()> {
    let dest_dir = service_config_dir();
@@ -272,7 +342,7 @@ fn mirror_config_to_service_dir() -> Result<()> {
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                // Calling user never had this file (e.g. --install without
                // --config, or first ever run on this machine, or the user
-                // wiped %APPDATA%\HelloAgent\ between tests). Logged at
+                // wiped %APPDATA%\hello-agent\ between tests). Logged at
                // info so the post-install log shows clearly which toml
                // files were available and which weren't.
                log::info!(
@@ -298,6 +368,16 @@ fn mirror_config_to_service_dir() -> Result<()> {
 // ----------------------------- uninstall ------------------------------------

 pub fn uninstall() -> Result<()> {
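+    // Open the SCM up front with the only manager-level right we need
+    // (CONNECT; DELETE is requested on the per-service open below) and run
+    // the same error mapping as install(), so an access-denied failure here
+    // surfaces as a single clear message.
+    // NOTE(review): CONNECT is normally granted to non-elevated local users,
+    // so this open is unlikely to catch a missing elevated prompt; that
+    // failure would instead come from `open_service` below — confirm.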
+    let scm = ServiceManager::local_computer(
+        None::<&str>,
+        ServiceManagerAccess::CONNECT,
+    )
+    .map_err(map_scm_open_error)?;
+
    // Kill every hello-agent.exe process except ourselves *first*. We can't
    // rely on the SCM Stop control alone because the `--cm` child spawned
    // via `run_as_user` runs under the logged-in user's token, not SYSTEM,
@@ -305,15 +385,9 @@ pub fn uninstall() -> Result<()> {
    // Doing this up front means the SCM stop below is usually a no-op
    // (service process already gone) and the rmdir at the end no longer
    // races a lingering child holding hello-agent.exe open. Our own PID
-    // is excluded via taskkill's `/FI` so the uninstaller doesn't suicide.
+    // is excluded via the sysinfo filter so the uninstaller doesn't suicide.
    kill_orphan_processes();

-    let scm = ServiceManager::local_computer(
-        None::<&str>,
-        ServiceManagerAccess::CONNECT,
-    )
-    .context("open SCM")?;
-
    match scm.open_service(
        SERVICE_NAME,
        ServiceAccess::QUERY_STATUS | ServiceAccess::STOP | ServiceAccess::DELETE,
@@ -343,9 +417,17 @@ pub fn uninstall() -> Result<()> {
        Err(e) => return Err(anyhow!("open_service: {e}")),
    }

+    // Remove the firewall rule we installed (best-effort). netsh delete is
+    // idempotent — if the rule was never there (or someone manually removed
+    // it) netsh returns 1 with "No rules match the specified criteria",
+    // which we treat as success.
+    if let Err(e) = delete_firewall_rule() {
+        log::warn!("could not delete firewall rule ({e:#}); remove it manually if needed");
+    }
+
    cleanup_install_dir();
    // We deliberately do NOT delete the LocalService config dir here.
-    // `HelloAgent.toml` in that directory holds the agent's id + keypair,
+    // `hello-agent.toml` in that directory holds the agent's id + keypair,
    // which the rustdesk-server / rendezvous server has registered against
    // the agent's id. Wiping it forces the next --install to generate
    // fresh keys, which the rendezvous server's cached entry (and any
@@ -354,7 +436,7 @@ pub fn uninstall() -> Result<()> {
    // the connection sits idle until the peer times out.
    //
    // Operators who want a true hard wipe can run:
-    //     rmdir /s /q "%SystemRoot%\ServiceProfiles\LocalService\AppData\Roaming\HelloAgent"
+    //     rmdir /s /q "%SystemRoot%\ServiceProfiles\LocalService\AppData\Roaming\hello-agent"
    // and then delete the device record from the rustdesk-server admin UI.
    log::info!("preserved LocalService config dir to keep agent keys/id stable across reinstalls");
    Ok(())
@@ -365,58 +447,175 @@ pub fn uninstall() -> Result<()> {
 /// old `--cm` child holding the exe open) and `--uninstall` (so the
 /// rmdir at the end isn't racing a lingering child).
 ///
-/// Shells out to the built-in `taskkill` rather than re-implementing the
-/// Toolhelp32 enumeration in winapi: taskkill ships in every Windows
-/// install since XP, runs in milliseconds, and the `/FI "PID ne <ours>"`
-/// filter handles the "don't suicide ourselves" requirement declaratively.
-///
-/// Exit code 128 from taskkill means "no matching processes" — common
-/// case when there's no orphan to clean up — and we treat it the same
-/// as success. Anything else gets logged but does not fail the caller.
+/// Walks the process table via `hbb_common::sysinfo` (the same enumerator
+/// the vendored rustdesk uses internally) and calls `Process::kill` —
+/// equivalent to `TerminateProcess` under the hood. After issuing the
+/// kills we poll the process table for actual exit rather than guessing
+/// at a 500 ms sleep: `TerminateProcess` marks the process as exited but
+/// the kernel takes a variable amount of time to release the image-file
+/// handle, and we only want to return once those handles are gone (so
+/// the install-time file copy and uninstall-time rmdir don't race a
+/// half-finalized victim).
 fn kill_orphan_processes() {
+    // hbb_common pulls the rustdesk-org sysinfo 0.29 fork, which exposes
+    // System/Process/Pid with inherent methods (no SystemExt/ProcessExt
+    // trait imports needed — that style was removed when this fork
+    // diverged from upstream 0.30).
+    use hbb_common::sysinfo::{Pid, System};
+
    let our_pid = std::process::id();
-    let pid_filter = format!("PID ne {our_pid}");
-    let output = std::process::Command::new("taskkill")
-        .args([
-            "/F",
-            "/IM",
-            INSTALLED_EXE_NAME,
-            "/FI",
-            &pid_filter,
-        ])
-        .output();
-    match output {
-        Ok(out) => {
-            let code = out.status.code();
-            let stdout = String::from_utf8_lossy(&out.stdout);
-            let stderr = String::from_utf8_lossy(&out.stderr);
-            if out.status.success() {
-                log::info!(
-                    "taskkill killed orphan {INSTALLED_EXE_NAME} processes (excluding pid {our_pid}): {}",
-                    stdout.trim()
-                );
-                // TerminateProcess is synchronous w.r.t. the kernel marking
-                // the process as exited, but kernel-mode finalization
-                // (releasing file handles, paging out the image section)
-                // can lag by up to a few hundred ms. The rmdir that follows
-                // races against this: without the pause, an immediate
-                // remove_dir_all can still see "file in use" on the just-
-                // killed process's exe.
-                std::thread::sleep(Duration::from_millis(500));
-            } else if code == Some(128) {
-                log::info!("no orphan {INSTALLED_EXE_NAME} processes to kill");
+    let target = INSTALLED_EXE_NAME;
+
+    let mut system = System::new();
+    system.refresh_processes();
+    let victims: Vec<Pid> = system
+        .processes()
+        .iter()
+        .filter(|(pid, p)| {
+            pid.as_u32() != our_pid && p.name().eq_ignore_ascii_case(target)
+        })
+        .map(|(pid, _)| *pid)
+        .collect();
+
+    if victims.is_empty() {
+        log::info!("no orphan {target} processes to kill");
+        return;
+    }
+
+    let killed: Vec<u32> = victims
+        .iter()
+        .filter_map(|pid| {
+            let process = system.process(*pid)?;
+            if process.kill() {
+                Some(pid.as_u32())
            } else {
-                log::warn!(
-                    "taskkill returned {code:?}: stdout={} stderr={}",
-                    stdout.trim(),
-                    stderr.trim(),
-                );
+                log::warn!("Process::kill failed for pid {}", pid.as_u32());
+                None
            }
+        })
+        .collect();
+    log::info!("issued kill on {} {target} process(es): {killed:?}", killed.len());
+
+    // Poll until every victim pid — including any whose kill call failed
+    // above — is gone from a freshly refreshed process table. The 5 s
+    // ceiling is generous (TerminateProcess usually finalizes within tens of ms) but cheap: we only
+    // burn it when exit really is slow, the case the old `sleep(500ms)` heuristic couldn't handle.
+    let deadline = Instant::now() + Duration::from_secs(5);
+    while Instant::now() < deadline {
+        system.refresh_processes();
+        let still_alive = victims.iter().any(|pid| system.process(*pid).is_some());
+        if !still_alive {
+            return;
        }
-        Err(e) => {
-            log::warn!("could not invoke taskkill: {e}");
+        std::thread::sleep(Duration::from_millis(50));
+    }
+    log::warn!(
+        "some {target} processes were still alive after 5 s; subsequent file ops may fail with sharing violation"
+    );
+}
+
+/// Translate a `windows_service::Error` from `ServiceManager::local_computer`
+/// into a friendlier user-facing message.
+///
+/// ERROR_ACCESS_DENIED (Win32 err 5) is the overwhelmingly common case —
+/// operator forgot to elevate — and is replaced by a single clear line
+/// rather than the raw Win32 errno string.
+///
+/// Every other error is returned wrapped in an "open SCM" context. The
+/// original error is kept as the source of the returned `anyhow::Error`
+/// (so `{:#}` / `{:?}` still print the underlying OS error) instead of
+/// being flattened into a string built from its `Display` text alone.
+fn map_scm_open_error(e: windows_service::Error) -> anyhow::Error {
+    // Win32 ERROR_ACCESS_DENIED as reported by `io::Error::raw_os_error`.
+    const ERROR_ACCESS_DENIED: i32 = 5;
+
+    if let windows_service::Error::Winapi(ref ioe) = e {
+        if ioe.raw_os_error() == Some(ERROR_ACCESS_DENIED) {
+            return anyhow!(
+                "requires an elevated (Administrator) prompt — re-run from \"Run as administrator\""
+            );
        }
    }
+    anyhow::Error::new(e).context("open SCM")
+}
+
+/// Add a Windows Firewall rule allowing inbound TCP/UDP to the installed
+/// hello-agent.exe. Idempotent: we delete any prior rule by the same name
+/// first, so re-running --install (or upgrading in place) doesn't pile up
+/// duplicate entries in the firewall's per-name list.
+///
+/// We use the program-scoped form (`program=<path>`) rather than port-scoped
+/// rules because hello-agent's optional listeners (direct-server TCP 21118,
+/// LAN-discovery UDP 21119) are gated on operator-controlled config flags;
+/// rule-by-program covers whatever ports the agent actually decides to bind.
+///
+/// `exe_path` is the installed executable the rule is scoped to. Returns an
+/// error when the path is not valid UTF-8 (it can't be handed to netsh as
+/// an argument string), when netsh can't be invoked, or when the `add rule`
+/// command exits non-zero.
+fn install_firewall_rule(exe_path: &std::path::Path) -> Result<()> {
+    // Validate the path *before* touching the firewall: if it can't be
+    // passed to netsh we must fail without having deleted the rule a
+    // previous install left in place.
+    let exe_str = exe_path.to_str().ok_or_else(|| {
+        anyhow!(
+            "non-UTF-8 install path can't be passed to netsh: {}",
+            exe_path.display()
+        )
+    })?;
+    let program_arg = format!("program={exe_str}");
+    let name_arg = format!("name={FIREWALL_RULE_NAME}");
+
+    // Drop any pre-existing rule first; netsh quietly succeeds-with-exit-1
+    // when nothing matches, so we ignore the result.
+    let _ = run_netsh(&["advfirewall", "firewall", "delete", "rule", &name_arg]);
+
+    let added = run_netsh(&[
+        "advfirewall",
+        "firewall",
+        "add",
+        "rule",
+        &name_arg,
+        "dir=in",
+        "action=allow",
+        "enable=yes",
+        "profile=any",
+        &program_arg,
+    ])?;
+    if !added {
+        return Err(anyhow!("netsh add rule failed"));
+    }
+    log::info!(
+        "added firewall rule '{FIREWALL_RULE_NAME}' for {}",
+        exe_path.display()
+    );
+    Ok(())
+}
+
+/// Delete the hello-agent firewall rule, matched by its name.
+///
+/// Only a failure to launch netsh is reported as an error. netsh's own exit
+/// status is deliberately discarded: it exits non-zero when no rule matches,
+/// and "no rule by that name" is exactly the post-condition we want.
+fn delete_firewall_rule() -> Result<()> {
+    let name_arg = format!("name={FIREWALL_RULE_NAME}");
+    run_netsh(&["advfirewall", "firewall", "delete", "rule", &name_arg])?;
+    log::info!("removed firewall rule '{FIREWALL_RULE_NAME}' (or none was present)");
+    Ok(())
+}
+
+/// Shell out to netsh.exe with the given args.
+///
+/// Returns `Ok(true)` on exit-0, `Ok(false)` on a non-zero exit that
+/// *netsh itself* produced (e.g. "no rules match"), and `Err` only when
+/// the binary couldn't be invoked at all (PATH stripped, etc.).
+///
+/// On a non-zero exit both captured streams are logged at debug level.
+/// netsh typically prints its diagnostics on stdout rather than stderr,
+/// so logging stderr alone would drop the reason for the failure.
+fn run_netsh(args: &[&str]) -> Result<bool> {
+    let out = std::process::Command::new("netsh")
+        .args(args)
+        .output()
+        .context("invoke netsh")?;
+    let succeeded = out.status.success();
+    if !succeeded {
+        log::debug!(
+            "netsh {args:?} exited {:?}: stdout={} stderr={}",
+            out.status.code(),
+            String::from_utf8_lossy(&out.stdout).trim(),
+            String::from_utf8_lossy(&out.stderr).trim()
+        );
+    }
+    Ok(succeeded)
 }

 /// Remove %ProgramFiles%\hello-agent. Best-effort: if the user ran