Implement auto-update routine

2026-05-21 13:34:02 +02:00
parent d10e547b70
commit 648b69aacc
8 changed files with 652 additions and 199 deletions
@@ -31,6 +31,13 @@ pub enum Action {
    /// — every `Data::FS(...)` frame the server sends is executed here, in
    /// the user's security context.
    Cm,
+    /// `--update`. Self-replacement entry point launched as an elevated child
+    /// by the running service's updater (see `librustdesk::updater`) after it
+    /// has downloaded and SHA256-verified a new hello-agent.exe from the
+    /// Gitea releases page. `current_exe()` here points at the staged new
+    /// binary in `%TEMP%`; it copies itself over the installed location and
+    /// restarts the service via `librustdesk::platform::update_me`.
+    Update,
 }

 #[derive(Debug)]
@@ -47,6 +54,7 @@ impl ParsedArgs {
        let mut service = false;
        let mut server = false;
        let mut cm = false;
+        let mut update = false;
        let mut config_blob: Option<String> = None;

        let mut i = 0;
@@ -56,6 +64,7 @@ impl ParsedArgs {
                "--uninstall" => uninstall = true,
                "--service" => service = true,
                "--server" => server = true,
+                "--update" => update = true,
                // Connection-manager popup mode. Treat `--cm-no-ui` (the
                // Linux-headless variant librustdesk also tries) as a
                // synonym; either way we run cm_popup.
@@ -81,14 +90,21 @@ impl ParsedArgs {
        }

        // Mutual-exclusion rules. --install + --config is the MDM one-liner;
-        // everything else is one-action-at-a-time.
-        let exclusive = [uninstall, service, server, cm].iter().filter(|x| **x).count();
+        // everything else is one-action-at-a-time. --update is launched by
+        // the updater as a standalone elevated child, never combined.
+        let exclusive = [uninstall, service, server, cm, update]
+            .iter()
+            .filter(|x| **x)
+            .count();
        if exclusive > 1 {
-            bail!("--uninstall, --service, --server, --cm are mutually exclusive");
+            bail!("--uninstall, --service, --server, --cm, --update are mutually exclusive");
        }
        if uninstall && (install || config_blob.is_some()) {
            bail!("--uninstall cannot be combined with other flags");
        }
+        if update && (install || config_blob.is_some()) {
+            bail!("--update cannot be combined with other flags");
+        }

        let action = if uninstall {
            Action::Uninstall
@@ -100,6 +116,8 @@ impl ParsedArgs {
            Action::Server
        } else if cm {
            Action::Cm
+        } else if update {
+            Action::Update
        } else if config_blob.is_some() {
            Action::ConfigOnly
        } else {
@@ -131,6 +149,10 @@ OPTIONS:
    --service              SCM entry point. Do not invoke manually.
    --server               Worker mode (launched by the service shell into
                           the active console session).
+    --update               Self-replacement entry point. Launched by the
+                           running service's updater after downloading and
+                           SHA256-verifying a new release from Gitea. Do
+                           not invoke manually.
    -h, --help             Show this help.
    -V, --version          Show version.

@@ -191,4 +213,24 @@ mod tests {
    fn unknown_arg() {
        assert!(parse(&["--no-such-flag"]).is_err());
    }
+
+    // A lone `--update` selects the self-replacement action.
+    #[test]
+    fn update_alone() {
+        assert_eq!(parse(&["--update"]).unwrap().action, Action::Update);
+    }
+
+    // `--update` may not be combined with `--install`.
+    #[test]
+    fn update_install_conflict() {
+        assert!(parse(&["--update", "--install"]).is_err());
+    }
+
+    // `--update` counts toward the one-action-at-a-time rule alongside `--service`.
+    #[test]
+    fn update_service_conflict() {
+        assert!(parse(&["--update", "--service"]).is_err());
+    }
+
+    // `--update` may not carry a `--config` blob.
+    #[test]
+    fn update_config_conflict() {
+        assert!(parse(&["--update", "--config", "BLOB"]).is_err());
+    }
 }
@@ -15,7 +15,7 @@
 // lazily from a `RwLock<String>` whenever any path is computed (config dir,
 // log dir, named-pipe namespace, …), so setting it before any of those
 // initializers fire is enough to redirect all hbb_common state under
-// `%APPDATA%\HelloAgent\` and the matching LocalService path. Identical
+// `%APPDATA%\hello-agent\` and the matching LocalService path. Identical
 // to the `read_custom_client` write path the upstream Flutter build uses
 // for OEM rebrands.

@@ -39,17 +39,28 @@ use cli::{Action, ParsedArgs};
 /// Product name used to namespace all on-disk state and the IPC pipe path.
 /// Written into `hbb_common::config::APP_NAME` at the top of `main` so
 /// every subsequent path computation (config dir, log dir, named pipe)
-/// targets `%APPDATA%\HelloAgent\` rather than the upstream default of
+/// targets `%APPDATA%\hello-agent\` rather than the upstream default of
 /// `%APPDATA%\RustDesk\`. Must be set before any code touches a path —
 /// `hbb_common` initializes path globals lazily on first read.
-pub const APP_NAME: &str = "HelloAgent";
+///
+/// Important: this value also drives upstream's installer lookup paths.
+/// `librustdesk::platform::get_install_info` computes the expected install
+/// dir as `%ProgramFiles%\<APP_NAME>` and the expected exe filename as
+/// `<APP_NAME>.exe`. Keeping `APP_NAME` aligned with the lowercase-hyphenated
+/// install path (`%ProgramFiles%\hello-agent\hello-agent.exe`) is what
+/// makes `--update` (which delegates to `librustdesk::platform::update_me`)
+/// find the binary it needs to replace, kill the right process by image
+/// name, and rename the staged exe to `hello-agent.exe` after the copy.
+/// Renaming this constant without renaming the install dir / exe will
+/// silently break self-update.
+pub const APP_NAME: &str = "hello-agent";

 /// Set up logging. We delegate to `hbb_common::init_log`, which:
 ///   * In **debug** builds: installs `env_logger` writing to stderr.
 ///   * In **release** builds: installs `flexi_logger` writing to a rolling
 ///     file under `<config_dir>/log/<mode>/` — the SYSTEM service log ends
-///     up at `%SystemRoot%\ServiceProfiles\LocalService\AppData\Roaming\HelloAgent\log\<mode>\`
-///     and the user-mode log at `%APPDATA%\HelloAgent\log\<mode>\`.
+///     up at `%SystemRoot%\ServiceProfiles\LocalService\AppData\Roaming\hello-agent\log\<mode>\`
+///     and the user-mode log at `%APPDATA%\hello-agent\log\<mode>\`.
 ///
 /// The `mode` label segregates per-run-mode log files so service worker
 /// chatter doesn't tangle with --install diagnostics. `init_log` is
@@ -65,7 +76,7 @@ fn main() {
    // we'd never recover.
    *hbb_common::config::APP_NAME.write().unwrap() = APP_NAME.to_owned();
    // Identify ourselves to the rustdesk-server's /api/sysinfo endpoint
-    // so the admin Devices page can show "HelloAgent 0.1.0" instead of
+    // so the admin Devices page can show "hello-agent 0.1.0" instead of
    // the embedded rustdesk core version. These RwLocks are read once
    // per sysinfo upload by hbbs_http::sync; setting them here (before
    // start_server) ensures the very first upload carries the identity.
@@ -90,10 +101,40 @@ fn main() {
        Action::Service => "service",
        Action::Server => "server",
        Action::Cm => "cm",
+        Action::Update => "update",
        Action::ConfigOnly | Action::None => "hello-agent",
    };
    init_logging(mode);

+    // --update is the self-replacement re-entry: the running service's
+    // updater downloads a new hello-agent.exe to %TEMP%, verifies its
+    // SHA256, then launches `<temp>\hello-agent.exe --update` as an
+    // elevated child. We are that child — `current_exe()` is the staged
+    // new binary, and our only job is to copy ourselves over the
+    // installed location and restart the service. Do it before the
+    // config-import dance below so a corrupt-on-disk config can't block
+    // an update from going through.
+    if parsed.action == Action::Update {
+        #[cfg(target_os = "windows")]
+        {
+            match librustdesk::platform::update_me(false) {
+                Ok(()) => {
+                    log::info!("hello-agent: --update completed");
+                }
+                Err(e) => {
+                    log::error!("hello-agent: --update failed: {e:#}");
+                    std::process::exit(1);
+                }
+            }
+        }
+        #[cfg(not(target_os = "windows"))]
+        {
+            eprintln!("hello-agent: --update is Windows-only.");
+            std::process::exit(1);
+        }
+        return;
+    }
+
    // --config is allowed to combine with --install (one-line MDM deploy)
    // but on its own is a separate operation. Apply it first so --install
    // sees the populated config.
@@ -108,7 +149,14 @@ fn main() {
    // (or a prior install) already set custom-rendezvous-server, this is a
    // no-op. Without this, a bare `hello-agent.exe --install` would land
    // at an unconfigured agent that can't reach any server.
-    config_import::apply_defaults_if_empty();
+    //
+    // Skipped for `--uninstall`: an uninstall flow has no business mutating
+    // the calling user's config, and otherwise we'd write defaults into
+    // %APPDATA% right before tearing the agent down. (`--update` is
+    // dispatched in the early-return block above and never reaches here.)
+    if parsed.action != Action::Uninstall {
+        config_import::apply_defaults_if_empty();
+    }

    match parsed.action {
        Action::Install => {
@@ -172,6 +220,11 @@ fn main() {
            // can watch logs. Production deployments use --install + --service.
            run_server();
        }
+        Action::Update => {
+            // Handled in the early-return block above (before config-import).
+            // The match has to cover this variant for exhaustiveness.
+            unreachable!("Action::Update is dispatched before this match");
+        }
    }
 }

@@ -3,7 +3,7 @@
 // Three responsibilities:
 //
 // 1. `install()` — copy the binary to %ProgramFiles%\hello-agent, mirror the
-//    calling user's `HelloAgent.toml` into the LocalService-effective
+//    calling user's `hello-agent.toml` into the LocalService-effective
 //    config dir so the SYSTEM service inherits the --config blob, register
 //    the service with the SCM pointing at the installed exe, and start it.
 //    Idempotent.
@@ -29,14 +29,21 @@ use std::sync::Arc;
 use std::time::{Duration, Instant};

 use windows_service::service::{
-    ServiceAccess, ServiceControl, ServiceControlAccept, ServiceErrorControl, ServiceExitCode,
+    ServiceAccess, ServiceAction, ServiceActionType, ServiceControl, ServiceControlAccept,
+    ServiceErrorControl, ServiceExitCode, ServiceFailureActions, ServiceFailureResetPeriod,
    ServiceInfo, ServiceStartType, ServiceState, ServiceStatus, ServiceType,
 };
 use windows_service::service_control_handler::{self, ServiceControlHandlerResult};
 use windows_service::service_dispatcher;
 use windows_service::service_manager::{ServiceManager, ServiceManagerAccess};

-const SERVICE_NAME: &str = "HelloAgent";
+/// Internal service name registered with the SCM. Must equal `crate::APP_NAME`
+/// because upstream `librustdesk::platform::is_self_service_running` queries
+/// `is_service_running(&crate::get_app_name())` — i.e. it looks up the
+/// service whose name *is* the app name. If these diverge, the `--update`
+/// path's `sc stop` / `sc start` use the wrong name and the service is
+/// left in a Stopped state after a self-update.
+const SERVICE_NAME: &str = crate::APP_NAME;
 const DISPLAY_NAME: &str = "HelloAgent Remote Support";
 const SERVICE_DESCRIPTION: &str =
    "HelloAgent — headless remote-support agent (RustDesk-protocol-compatible). \
@@ -47,6 +54,11 @@ const SERVICE_TYPE: ServiceType = ServiceType::OWN_PROCESS;
 const INSTALL_SUBDIR: &str = "hello-agent";
 const INSTALLED_EXE_NAME: &str = "hello-agent.exe";

+/// Display name used for the Windows Firewall rule. Stable across versions
+/// so `--uninstall` (or a re-install that clears it before re-adding) can
+/// find and delete the prior entry by name.
+const FIREWALL_RULE_NAME: &str = "HelloAgent";
+
 // ----------------------------- paths ---------------------------------------

 /// `%ProgramFiles%\hello-agent`. Falls back to `C:\Program Files\hello-agent`
@@ -68,9 +80,9 @@ fn install_dir() -> PathBuf {
 /// Note the trailing `config` segment: `directories_next::ProjectDirs`,
 /// which hbb_common uses on Windows, appends a literal `\config` to the
 /// app's roaming dir (so the user-side path is
-/// `%APPDATA%\HelloAgent\config\HelloAgent.toml`, not
-/// `…\HelloAgent\…`). The SYSTEM-side path follows the same convention.
-/// The `HelloAgent` segment is sourced from `crate::APP_NAME` so it stays
+/// `%APPDATA%\hello-agent\config\hello-agent.toml`, not
+/// `%APPDATA%\hello-agent\hello-agent.toml`). The SYSTEM-side path follows the same convention.
+/// The `hello-agent` segment is sourced from `crate::APP_NAME` so it stays
 /// in lockstep with the `APP_NAME` we install into hbb_common at startup.
 fn service_config_dir() -> PathBuf {
    let system_root = std::env::var_os("SystemRoot")
@@ -88,11 +100,15 @@ fn service_config_dir() -> PathBuf {
 // ----------------------------- install --------------------------------------

 pub fn install() -> Result<()> {
+    // Probe-open the SCM with CREATE_SERVICE rights up front; if the caller
+    // isn't elevated this fails with ERROR_ACCESS_DENIED (raw_os_error == 5)
+    // and we surface a single human-readable message instead of bubbling
+    // up a Win32 errno string. Anything else propagates as-is.
    let scm = ServiceManager::local_computer(
        None::<&str>,
        ServiceManagerAccess::CONNECT | ServiceManagerAccess::CREATE_SERVICE,
    )
-    .context("open SCM")?;
+    .map_err(map_scm_open_error)?;

    // 1. If a previous install left a running service, stop it before we
    //    overwrite its binary. Otherwise the file copy in step 2 fails
@@ -106,8 +122,8 @@ pub fn install() -> Result<()> {
    //     idempotent / usable as an in-place update — without it, the
    //     `stage_binary` file copy below fails with "access denied"
    //     whenever a `--cm` child is still holding the old exe open.
-    //     `kill_orphan_processes` uses taskkill with `/FI "PID ne <ours>"`
-    //     so it never kills the running installer.
+    //     `kill_orphan_processes` walks the process table via sysinfo and
+    //     filters out our own pid so the installer doesn't suicide.
    kill_orphan_processes();

    // 2. Pin the binary to %ProgramFiles%\hello-agent. The user might be
@@ -120,17 +136,17 @@ pub fn install() -> Result<()> {
    //    first, fall back to popup). Older hello-agent installs wrote
    //    "click" here, which disabled the password path; clearing it
    //    every install makes upgrades idempotent. These write into the
-    //    *calling user's* %APPDATA%\HelloAgent\ — we mirror the result
+    //    *calling user's* %APPDATA%\hello-agent\ — we mirror the result
    //    into the service's effective dir in step 4.
    hbb_common::config::Config::set_option("stop-service".into(), "".into());
    hbb_common::config::Config::set_option("approve-mode".into(), "".into());

-    // 4. Mirror the calling user's `HelloAgent.toml` / `HelloAgent2.toml`
+    // 4. Mirror the calling user's `hello-agent.toml` / `hello-agent2.toml`
    //    into the LocalService-effective config root that the SYSTEM
    //    service will actually read. Without this, --config writes to e.g.
-    //    C:\Users\Admin\AppData\Roaming\HelloAgent\, but the service runs
+    //    C:\Users\Admin\AppData\Roaming\hello-agent\, but the service runs
    //    as LocalSystem and (via hbb_common's `patch()`) reads from
-    //    C:\Windows\ServiceProfiles\LocalService\AppData\Roaming\HelloAgent\.
+    //    C:\Windows\ServiceProfiles\LocalService\AppData\Roaming\hello-agent\.
    if let Err(e) = mirror_config_to_service_dir() {
        log::warn!(
            "could not mirror config to service dir ({e:#}); the service may not see --config until first heartbeat"
@@ -183,6 +199,60 @@ pub fn install() -> Result<()> {

    let _ = svc.set_description(SERVICE_DESCRIPTION);

+    // 5b. Configure SCM auto-restart on unexpected exit. Without this,
+    //     a panic in the `--service` supervisor leaves the agent permanently
+    //     Stopped until the host reboots. The schedule restarts after
+    //     5s, 30s, then 60s; per the SERVICE_FAILURE_ACTIONS docs the last
+    //     action repeats for any further failures. The failure-count reset
+    //     window is one day, so transient hiccups don't accumulate and
+    //     stable hosts converge back to "running" within a minute.
+    //
+    //     `set_failure_actions_on_non_crash_failures(true)` is what makes
+    //     these actions also fire when the service reports Stopped with a
+    //     non-zero exit code, not just when the process dies without
+    //     reporting a stop. Both calls are best-effort: if either one
+    //     returns an error we log a warning and continue with the install.
+    let failure_actions = ServiceFailureActions {
+        reset_period: ServiceFailureResetPeriod::After(Duration::from_secs(60 * 60 * 24)),
+        reboot_msg: None,
+        command: None,
+        actions: Some(vec![
+            ServiceAction {
+                action_type: ServiceActionType::Restart,
+                delay: Duration::from_secs(5),
+            },
+            ServiceAction {
+                action_type: ServiceActionType::Restart,
+                delay: Duration::from_secs(30),
+            },
+            ServiceAction {
+                action_type: ServiceActionType::Restart,
+                delay: Duration::from_secs(60),
+            },
+        ]),
+    };
+    if let Err(e) = svc.update_failure_actions(failure_actions) {
+        log::warn!("could not set SCM failure actions ({e}); auto-restart-on-crash disabled");
+    }
+    if let Err(e) = svc.set_failure_actions_on_non_crash_failures(true) {
+        log::warn!(
+            "could not enable failure actions for clean-exit-with-error ({e}); only hard crashes will trigger restart"
+        );
+    }
+
+    // 5c. Allow inbound TCP/UDP to hello-agent.exe at the Windows Firewall.
+    //     A vanilla deploy doesn't actually need it (the rendezvous/relay
+    //     connections are outbound), but operators who enable `direct-server`
+    //     (TCP 21118) or `enable-lan-discovery` (UDP 21119) via the --config
+    //     blob need this rule or those features silently fail. Cheaper to
+    //     add it always than to discover at support-call time that the
+    //     deploy never matched a firewall rule. Best-effort: if netsh
+    //     isn't present (extremely stripped-down server SKUs) we log and
+    //     continue.
+    if let Err(e) = install_firewall_rule(&target_exe) {
+        log::warn!("could not install firewall rule ({e:#}); inbound connections may be blocked");
+    }
+
    // 6. Start the service. (Step 1 already stopped any prior instance.)
    svc.start::<&str>(&[]).context("start service")?;
    log::info!(
@@ -250,7 +320,7 @@ fn stage_binary() -> Result<PathBuf> {
    Ok(dest)
 }

-/// Copy the calling user's `HelloAgent.toml` + `HelloAgent2.toml` into
+/// Copy the calling user's `hello-agent.toml` + `hello-agent2.toml` into
 /// the LocalService-effective config dir so the SYSTEM service sees them.
 fn mirror_config_to_service_dir() -> Result<()> {
    let dest_dir = service_config_dir();
@@ -272,7 +342,7 @@ fn mirror_config_to_service_dir() -> Result<()> {
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                // Calling user never had this file (e.g. --install without
                // --config, or first ever run on this machine, or the user
-                // wiped %APPDATA%\HelloAgent\ between tests). Logged at
+                // wiped %APPDATA%\hello-agent\ between tests). Logged at
                // info so the post-install log shows clearly which toml
                // files were available and which weren't.
                log::info!(
@@ -298,6 +368,16 @@ fn mirror_config_to_service_dir() -> Result<()> {
 // ----------------------------- uninstall ------------------------------------

 pub fn uninstall() -> Result<()> {
+    // Open the SCM up front with CONNECT (the per-service open below asks
+    // for QUERY_STATUS/STOP/DELETE). Same error mapping as install().
+    // NOTE(review): CONNECT alone is normally granted to non-admin users,
+    // so a missing elevation likely surfaces at `open_service` — confirm.
+    let scm = ServiceManager::local_computer(
+        None::<&str>,
+        ServiceManagerAccess::CONNECT,
+    )
+    .map_err(map_scm_open_error)?;
+
    // Kill every hello-agent.exe process except ourselves *first*. We can't
    // rely on the SCM Stop control alone because the `--cm` child spawned
    // via `run_as_user` runs under the logged-in user's token, not SYSTEM,
@@ -305,15 +385,9 @@ pub fn uninstall() -> Result<()> {
    // Doing this up front means the SCM stop below is usually a no-op
    // (service process already gone) and the rmdir at the end no longer
    // races a lingering child holding hello-agent.exe open. Our own PID
-    // is excluded via taskkill's `/FI` so the uninstaller doesn't suicide.
+    // is excluded via the sysinfo filter so the uninstaller doesn't suicide.
    kill_orphan_processes();

-    let scm = ServiceManager::local_computer(
-        None::<&str>,
-        ServiceManagerAccess::CONNECT,
-    )
-    .context("open SCM")?;
-
    match scm.open_service(
        SERVICE_NAME,
        ServiceAccess::QUERY_STATUS | ServiceAccess::STOP | ServiceAccess::DELETE,
@@ -343,9 +417,17 @@ pub fn uninstall() -> Result<()> {
        Err(e) => return Err(anyhow!("open_service: {e}")),
    }

+    // Remove the firewall rule we installed (best-effort). netsh delete is
+    // idempotent — if the rule was never there (or someone manually removed
+    // it) netsh returns 1 with "No rules match the specified criteria",
+    // which we treat as success.
+    if let Err(e) = delete_firewall_rule() {
+        log::warn!("could not delete firewall rule ({e:#}); remove it manually if needed");
+    }
+
    cleanup_install_dir();
    // We deliberately do NOT delete the LocalService config dir here.
-    // `HelloAgent.toml` in that directory holds the agent's id + keypair,
+    // `hello-agent.toml` in that directory holds the agent's id + keypair,
    // which the rustdesk-server / rendezvous server has registered against
    // the agent's id. Wiping it forces the next --install to generate
    // fresh keys, which the rendezvous server's cached entry (and any
@@ -354,7 +436,7 @@ pub fn uninstall() -> Result<()> {
    // the connection sits idle until the peer times out.
    //
    // Operators who want a true hard wipe can run:
-    //     rmdir /s /q "%SystemRoot%\ServiceProfiles\LocalService\AppData\Roaming\HelloAgent"
+    //     rmdir /s /q "%SystemRoot%\ServiceProfiles\LocalService\AppData\Roaming\hello-agent"
    // and then delete the device record from the rustdesk-server admin UI.
    log::info!("preserved LocalService config dir to keep agent keys/id stable across reinstalls");
    Ok(())
@@ -365,58 +447,175 @@ pub fn uninstall() -> Result<()> {
 /// old `--cm` child holding the exe open) and `--uninstall` (so the
 /// rmdir at the end isn't racing a lingering child).
 ///
-/// Shells out to the built-in `taskkill` rather than re-implementing the
-/// Toolhelp32 enumeration in winapi: taskkill ships in every Windows
-/// install since XP, runs in milliseconds, and the `/FI "PID ne <ours>"`
-/// filter handles the "don't suicide ourselves" requirement declaratively.
-///
-/// Exit code 128 from taskkill means "no matching processes" — common
-/// case when there's no orphan to clean up — and we treat it the same
-/// as success. Anything else gets logged but does not fail the caller.
+/// Walks the process table via `hbb_common::sysinfo` (the same enumerator
+/// the vendored rustdesk uses internally) and calls `Process::kill` —
+/// equivalent to `TerminateProcess` under the hood. After issuing the
+/// kills we poll the process table for actual exit rather than guessing
+/// at a 500 ms sleep: `TerminateProcess` marks the process as exited but
+/// the kernel takes a variable amount of time to release the image-file
+/// handle, and we only want to return once those handles are gone (so
+/// the install-time file copy and uninstall-time rmdir don't race a
+/// half-finalized victim).
 fn kill_orphan_processes() {
+    // hbb_common pulls the rustdesk-org sysinfo 0.29 fork, which exposes
+    // System/Process/Pid with inherent methods (no SystemExt/ProcessExt
+    // trait imports needed — that style was removed when this fork
+    // diverged from upstream 0.30).
+    use hbb_common::sysinfo::{Pid, System};
+
    let our_pid = std::process::id();
-    let pid_filter = format!("PID ne {our_pid}");
-    let output = std::process::Command::new("taskkill")
-        .args([
-            "/F",
-            "/IM",
-            INSTALLED_EXE_NAME,
-            "/FI",
-            &pid_filter,
-        ])
-        .output();
-    match output {
-        Ok(out) => {
-            let code = out.status.code();
-            let stdout = String::from_utf8_lossy(&out.stdout);
-            let stderr = String::from_utf8_lossy(&out.stderr);
-            if out.status.success() {
-                log::info!(
-                    "taskkill killed orphan {INSTALLED_EXE_NAME} processes (excluding pid {our_pid}): {}",
-                    stdout.trim()
-                );
-                // TerminateProcess is synchronous w.r.t. the kernel marking
-                // the process as exited, but kernel-mode finalization
-                // (releasing file handles, paging out the image section)
-                // can lag by up to a few hundred ms. The rmdir that follows
-                // races against this: without the pause, an immediate
-                // remove_dir_all can still see "file in use" on the just-
-                // killed process's exe.
-                std::thread::sleep(Duration::from_millis(500));
-            } else if code == Some(128) {
-                log::info!("no orphan {INSTALLED_EXE_NAME} processes to kill");
+    let target = INSTALLED_EXE_NAME;
+
+    let mut system = System::new();
+    system.refresh_processes();
+    let victims: Vec<Pid> = system
+        .processes()
+        .iter()
+        .filter(|(pid, p)| {
+            pid.as_u32() != our_pid && p.name().eq_ignore_ascii_case(target)
+        })
+        .map(|(pid, _)| *pid)
+        .collect();
+
+    if victims.is_empty() {
+        log::info!("no orphan {target} processes to kill");
+        return;
+    }
+
+    let killed: Vec<u32> = victims
+        .iter()
+        .filter_map(|pid| {
+            let process = system.process(*pid)?;
+            if process.kill() {
+                Some(pid.as_u32())
            } else {
-                log::warn!(
-                    "taskkill returned {code:?}: stdout={} stderr={}",
-                    stdout.trim(),
-                    stderr.trim(),
-                );
+                log::warn!("Process::kill failed for pid {}", pid.as_u32());
+                None
            }
+        })
+        .collect();
+    log::info!("issued kill on {} {target} process(es): {killed:?}", killed.len());
+
+    // Poll for actual exit. 5 s ceiling is generous (TerminateProcess
+    // usually finalizes within tens of ms) but cheap — we only burn it
+    // when the kernel really is dragging its feet, which is the exact
+    // case the old `sleep(500ms)` heuristic couldn't handle.
+    let deadline = Instant::now() + Duration::from_secs(5);
+    while Instant::now() < deadline {
+        system.refresh_processes();
+        let still_alive = victims.iter().any(|pid| system.process(*pid).is_some());
+        if !still_alive {
+            return;
        }
-        Err(e) => {
-            log::warn!("could not invoke taskkill: {e}");
+        std::thread::sleep(Duration::from_millis(50));
+    }
+    log::warn!(
+        "some {target} processes were still alive after 5 s; subsequent file ops may fail with sharing violation"
+    );
+}
+
+/// Translate a `windows_service::Error` from `ServiceManager::local_computer`
+/// into a friendlier user-facing message. ERROR_ACCESS_DENIED (Win32 err 5)
+/// is the overwhelmingly common case — operator forgot to elevate — and
+/// deserves a single clear line rather than the raw Win32 errno string.
+/// Every other error is returned with its source chain intact under an
+/// "open SCM" context.
+fn map_scm_open_error(e: windows_service::Error) -> anyhow::Error {
+    if let windows_service::Error::Winapi(ref ioe) = e {
+        if ioe.raw_os_error() == Some(5) {
+            return anyhow!(
+                "requires an elevated (Administrator) prompt — re-run from \"Run as administrator\""
+            );
         }
     }
+    // Wrap instead of formatting with `{e}`: wrapping keeps the original
+    // error as the source, so `{:#}` reporting still shows the OS-level cause.
+    anyhow::Error::new(e).context("open SCM")
+}
+
+/// Add a Windows Firewall rule allowing inbound TCP/UDP to the installed
+/// hello-agent.exe. Idempotent: we delete any prior rule by the same name
+/// first, so re-running --install (or upgrading in place) doesn't pile up
+/// duplicate entries in the firewall's per-name list.
+///
+/// We use the program-scoped form (`program=<path>`) rather than port-scoped
+/// rules because hello-agent's optional listeners (direct-server TCP 21118,
+/// LAN-discovery UDP 21119) are gated on operator-controlled config flags;
+/// rule-by-program covers whatever ports the agent actually decides to bind.
+///
+/// `exe_path` is the executable the rule is scoped to. Returns an error when
+/// the path is not valid UTF-8, when netsh cannot be launched for the `add`
+/// step, or when that step exits non-zero.
+fn install_firewall_rule(exe_path: &std::path::Path) -> Result<()> {
+    let name_arg = format!("name={FIREWALL_RULE_NAME}");
+
+    // Drop any pre-existing rule first; netsh quietly succeeds-with-exit-1
+    // when nothing matches, so we ignore the result.
+    let _ = run_netsh(&["advfirewall", "firewall", "delete", "rule", &name_arg]);
+
+    let exe_str = exe_path.to_str().ok_or_else(|| {
+        anyhow!(
+            "non-UTF-8 install path can't be passed to netsh: {}",
+            exe_path.display()
+        )
+    })?;
+    let program_arg = format!("program={exe_str}");
+
+    let added = run_netsh(&[
+        "advfirewall",
+        "firewall",
+        "add",
+        "rule",
+        &name_arg,
+        "dir=in",
+        "action=allow",
+        "enable=yes",
+        "profile=any",
+        &program_arg,
+    ])?;
+    if !added {
+        return Err(anyhow!("netsh add rule failed"));
+    }
+    log::info!(
+        "added firewall rule '{FIREWALL_RULE_NAME}' for {}",
+        exe_path.display()
+    );
+    Ok(())
+}
+
+/// Delete the hello-agent firewall rule, addressed by its fixed name.
+/// A non-zero netsh exit (for instance because no rule matched) still
+/// counts as success, since the desired end state is "no rule by that
+/// name"; the only error returned is a failure to launch netsh at all.
+fn delete_firewall_rule() -> Result<()> {
+    let rule_selector = format!("name={FIREWALL_RULE_NAME}");
+    // `?` forwards only the could-not-invoke case; the exit status is
+    // ignored on purpose.
+    run_netsh(&["advfirewall", "firewall", "delete", "rule", &rule_selector])?;
+    log::info!("removed firewall rule '{FIREWALL_RULE_NAME}' (or none was present)");
+    Ok(())
+}
+
+/// Shell out to netsh.exe with the given args. Returns Ok(true) on exit-0,
+/// Ok(false) on a non-zero exit that *netsh itself* produced (e.g. "no rules
+/// match"), and Err only when the binary couldn't be invoked at all. netsh
+/// typically reports errors on stdout, so failures log both output streams.
+fn run_netsh(args: &[&str]) -> Result<bool> {
+    let out = std::process::Command::new("netsh")
+        .args(args)
+        .output()
+        .context("invoke netsh")?;
+    if !out.status.success() {
+        log::debug!(
+            "netsh {args:?} exited {:?}: stdout={} stderr={}",
+            out.status.code(),
+            String::from_utf8_lossy(&out.stdout).trim(),
+            String::from_utf8_lossy(&out.stderr).trim()
+        );
+    }
+    Ok(out.status.success())
 }

 /// Remove %ProgramFiles%\hello-agent. Best-effort: if the user ran