// hello-agent/src/service.rs

// Windows service shell.
//
// Three responsibilities:
//
// 1. `install()` — copy the binary to %ProgramFiles%\hello-agent, mirror the
//    calling user's `hello-agent.toml` into the LocalService-effective
//    config dir so the SYSTEM service inherits the --config blob, register
//    the service with the SCM pointing at the installed exe, and start it.
//    Idempotent.
//
// 2. `uninstall()` — stop the service, delete it, remove the firewall rule,
//    and remove the install dir (best effort: removal fails when uninstall
//    is run from the install dir itself). The LocalService config copy is
//    deliberately preserved so the agent's id and keys survive a reinstall.
//
// 3. `run_as_service()` — the SCM dispatcher entry. Watches for active
//    console session changes and (re)launches `hello-agent.exe --server`
//    into that session via `librustdesk::platform::launch_privileged_process`,
//    so the worker inherits the SYSTEM token in the user's session. (We
//    intentionally do NOT use `run_as_user` here — that drops to the
//    logged-in user's token, and the worker would then read config from
//    the user's %APPDATA% instead of the LocalService path the install
//    flow mirrors to.)

use anyhow::{anyhow, Context, Result};
use std::ffi::OsString;
use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};

use windows_service::service::{
    ServiceAccess, ServiceAction, ServiceActionType, ServiceControl, ServiceControlAccept,
    ServiceErrorControl, ServiceExitCode, ServiceFailureActions, ServiceFailureResetPeriod,
    ServiceInfo, ServiceStartType, ServiceState, ServiceStatus, ServiceType,
};
use windows_service::service_control_handler::{self, ServiceControlHandlerResult};
use windows_service::service_dispatcher;
use windows_service::service_manager::{ServiceManager, ServiceManagerAccess};

/// Internal service name registered with the SCM. Must equal `crate::APP_NAME`
/// because upstream `librustdesk::platform::is_self_service_running` queries
/// `is_service_running(&crate::get_app_name())` — i.e. it looks up the
/// service whose name *is* the app name. If these diverge, the `--update`
/// path's `sc stop` / `sc start` use the wrong name and the service is
/// left in a Stopped state after a self-update.
const SERVICE_NAME: &str = crate::APP_NAME;
/// Human-readable name passed to the SCM as the service's display name
/// (see the `ServiceInfo` built in `install()`).
const DISPLAY_NAME: &str = "HelloAgent Remote Support";
/// Long description attached to the service via `set_description` in
/// `install()`.
const SERVICE_DESCRIPTION: &str =
    "HelloAgent — headless remote-support agent (RustDesk-protocol-compatible). \
     Lets a remote supporter connect, subject to local user approval.";

/// Service type used both when registering the service and in every
/// status report sent by `set_status`, so the two always agree.
const SERVICE_TYPE: ServiceType = ServiceType::OWN_PROCESS;

/// Directory name created under `%ProgramFiles%` by `install_dir()`.
const INSTALL_SUBDIR: &str = "hello-agent";
/// File name of the installed binary. Also the process name that
/// `kill_orphan_processes()` matches (case-insensitively) when sweeping.
const INSTALLED_EXE_NAME: &str = "hello-agent.exe";

/// Display name used for the Windows Firewall rule. Stable across versions
/// so `--uninstall` (or a re-install that clears it before re-adding) can
/// find and delete the prior entry by name.
const FIREWALL_RULE_NAME: &str = "HelloAgent";

// ----------------------------- paths ---------------------------------------

/// Resolves the install directory, `%ProgramFiles%\hello-agent`.
///
/// When `ProgramFiles` is absent from the environment (not expected on a
/// real Windows install) the literal `C:\Program Files` is used as the base,
/// so the installer still has a destination instead of crashing.
fn install_dir() -> PathBuf {
    let program_files = match std::env::var_os("ProgramFiles") {
        Some(value) => PathBuf::from(value),
        None => PathBuf::from(r"C:\Program Files"),
    };
    program_files.join(INSTALL_SUBDIR)
}

/// hbb_common's `patch()` rewrites `system32\config\systemprofile` →
/// `ServiceProfiles\LocalService` on Windows so that LocalSystem and
/// LocalService share a config root. The SYSTEM service therefore reads
/// from this path; we mirror the calling user's config files here so the
/// --config blob makes it across.
///
/// Note the trailing `config` segment: `directories_next::ProjectDirs`,
/// which hbb_common uses on Windows, appends a literal `\config` to the
/// app's roaming dir (so the user-side path is
/// `%APPDATA%\hello-agent\config\hello-agent.toml`, not
/// `…\hello-agent\…`). The SYSTEM-side path follows the same convention.
/// The `hello-agent` segment is sourced from `crate::APP_NAME` so it stays
/// in lockstep with the `APP_NAME` we install into hbb_common at startup.
fn service_config_dir() -> PathBuf {
    // Windows root, with the conventional default when `SystemRoot` is unset.
    let mut dir = match std::env::var_os("SystemRoot") {
        Some(root) => PathBuf::from(root),
        None => PathBuf::from(r"C:\Windows"),
    };
    // LocalService's roaming profile, then `<APP_NAME>\config`.
    for segment in [
        "ServiceProfiles",
        "LocalService",
        "AppData",
        "Roaming",
        crate::APP_NAME,
        "config",
    ] {
        dir.push(segment);
    }
    dir
}

// ----------------------------- install --------------------------------------

pub fn install() -> Result<()> {
    // Probe-open the SCM with CREATE_SERVICE rights up front; if the caller
    // isn't elevated this fails with ERROR_ACCESS_DENIED (raw_os_error == 5)
    // and we surface a single human-readable message instead of bubbling
    // up a Win32 errno string. Anything else propagates as-is.
    let scm = ServiceManager::local_computer(
        None::<&str>,
        ServiceManagerAccess::CONNECT | ServiceManagerAccess::CREATE_SERVICE,
    )
    .map_err(map_scm_open_error)?;

    // 1. If a previous install left a running service, stop it before we
    //    overwrite its binary. Otherwise the file copy in step 2 fails
    //    with "access denied" because the SCM holds an exclusive handle on
    //    the running exe.
    stop_existing_service(&scm);

    // 1b. Kill any lingering hello-agent.exe (notably the `--cm` user-token
    //     child, which lives outside the service's process tree and is
    //     therefore not stopped by SCM Stop). This makes `--install`
    //     idempotent / usable as an in-place update — without it, the
    //     `stage_binary` file copy below fails with "access denied"
    //     whenever a `--cm` child is still holding the old exe open.
    //     `kill_orphan_processes` walks the process table via sysinfo and
    //     filters out our own pid so the installer doesn't suicide.
    kill_orphan_processes();

    // 2. Pin the binary to %ProgramFiles%\hello-agent. The user might be
    //    running --install from C:\Users\…\Downloads\, a USB stick, etc.;
    //    we don't want the SCM pointing back at any of those.
    let target_exe = stage_binary().context("stage_binary")?;

    // 3. Clear stop-service and reset approve-mode to "both" (empty
    //    string → librustdesk treats as ApproveMode::Both: try password
    //    first, fall back to popup). Older hello-agent installs wrote
    //    "click" here, which disabled the password path; clearing it
    //    every install makes upgrades idempotent. These write into the
    //    *calling user's* %APPDATA%\hello-agent\ — we mirror the result
    //    into the service's effective dir in step 4.
    hbb_common::config::Config::set_option("stop-service".into(), "".into());
    hbb_common::config::Config::set_option("approve-mode".into(), "".into());

    // 4. Mirror the calling user's `hello-agent.toml` / `hello-agent2.toml`
    //    into the LocalService-effective config root that the SYSTEM
    //    service will actually read. Without this, --config writes to e.g.
    //    C:\Users\Admin\AppData\Roaming\hello-agent\, but the service runs
    //    as LocalSystem and (via hbb_common's `patch()`) reads from
    //    C:\Windows\ServiceProfiles\LocalService\AppData\Roaming\hello-agent\.
    if let Err(e) = mirror_config_to_service_dir() {
        log::warn!(
            "could not mirror config to service dir ({e:#}); the service may not see --config until first heartbeat"
        );
    }

    // 5. Register / reconfigure the SCM entry. Idempotent: if the service
    //    already exists we reuse the handle and change_config it to the
    //    new exe path + args.

    let info = ServiceInfo {
        name: OsString::from(SERVICE_NAME),
        display_name: OsString::from(DISPLAY_NAME),
        service_type: SERVICE_TYPE,
        start_type: ServiceStartType::AutoStart,
        error_control: ServiceErrorControl::Normal,
        executable_path: target_exe.clone(),
        launch_arguments: vec![OsString::from("--service")],
        dependencies: vec![],
        account_name: None, // LocalSystem
        account_password: None,
    };

    let svc = match scm.create_service(
        &info,
        ServiceAccess::CHANGE_CONFIG
            | ServiceAccess::START
            | ServiceAccess::STOP
            | ServiceAccess::QUERY_STATUS,
    ) {
        Ok(s) => s,
        Err(windows_service::Error::Winapi(e))
            if e.raw_os_error() == Some(winapi::shared::winerror::ERROR_SERVICE_EXISTS as i32) =>
        {
            log::info!("service exists; reusing");
            let svc = scm
                .open_service(
                    SERVICE_NAME,
                    ServiceAccess::CHANGE_CONFIG
                        | ServiceAccess::START
                        | ServiceAccess::STOP
                        | ServiceAccess::QUERY_STATUS,
                )
                .context("open existing service")?;
            svc.change_config(&info).context("change_config")?;
            svc
        }
        Err(e) => return Err(anyhow!("create_service: {e}")),
    };

    let _ = svc.set_description(SERVICE_DESCRIPTION);

    // 5b. Configure SCM auto-restart on unexpected exit. Without this,
    //     a panic in the `--service` supervisor leaves the agent permanently
    //     Stopped until the host reboots. The schedule restarts after
    //     5s, 30s, 60s and gives up after that; the failure-count reset
    //     window is one day, so transient hiccups don't accumulate and
    //     stable hosts converge back to "running" within a minute.
    //
    //     `set_failure_actions_on_non_crash_failures(true)` is what makes
    //     these actions fire when the service exits cleanly with a non-zero
    //     code (panic via abort, for instance), not just on outright
    //     crashes detected by the SCM. Both are best-effort; the SCM
    //     accepts the call but doesn't error if the underlying ChangeServiceConfig2
    //     fails for some reason — we log and continue.
    let failure_actions = ServiceFailureActions {
        reset_period: ServiceFailureResetPeriod::After(Duration::from_secs(60 * 60 * 24)),
        reboot_msg: None,
        command: None,
        actions: Some(vec![
            ServiceAction {
                action_type: ServiceActionType::Restart,
                delay: Duration::from_secs(5),
            },
            ServiceAction {
                action_type: ServiceActionType::Restart,
                delay: Duration::from_secs(30),
            },
            ServiceAction {
                action_type: ServiceActionType::Restart,
                delay: Duration::from_secs(60),
            },
        ]),
    };
    if let Err(e) = svc.update_failure_actions(failure_actions) {
        log::warn!("could not set SCM failure actions ({e}); auto-restart-on-crash disabled");
    }
    if let Err(e) = svc.set_failure_actions_on_non_crash_failures(true) {
        log::warn!(
            "could not enable failure actions for clean-exit-with-error ({e}); only hard crashes will trigger restart"
        );
    }

    // 5c. Allow inbound TCP/UDP to hello-agent.exe at the Windows Firewall.
    //     A vanilla deploy doesn't actually need it (the rendezvous/relay
    //     connections are outbound), but operators who enable `direct-server`
    //     (TCP 21118) or `enable-lan-discovery` (UDP 21119) via the --config
    //     blob need this rule or those features silently fail. Cheaper to
    //     add it always than to discover at support-call time that the
    //     deploy never matched a firewall rule. Best-effort: if netsh
    //     isn't present (extremely stripped-down server SKUs) we log and
    //     continue.
    if let Err(e) = install_firewall_rule(&target_exe) {
        log::warn!("could not install firewall rule ({e:#}); inbound connections may be blocked");
    }

    // 6. Start the service. (Step 1 already stopped any prior instance.)
    svc.start::<&str>(&[]).context("start service")?;
    log::info!(
        "service '{}' installed at {} and started",
        SERVICE_NAME,
        target_exe.display()
    );
    Ok(())
}

/// Best-effort stop + wait of an existing HelloAgent service. No-op if the
/// service doesn't exist or is already stopped. We use a short connection
/// here (STOP|QUERY_STATUS only) so the install path can call this without
/// holding the broader CHANGE_CONFIG handle from later steps.
fn stop_existing_service(scm: &ServiceManager) {
    let svc = match scm.open_service(
        SERVICE_NAME,
        ServiceAccess::STOP | ServiceAccess::QUERY_STATUS,
    ) {
        Ok(s) => s,
        Err(_) => return, // doesn't exist; nothing to stop
    };

    if let Ok(status) = svc.query_status() {
        if status.current_state == ServiceState::Stopped {
            return;
        }
    }
    let _ = svc.stop();
    wait_for_state(&svc, ServiceState::Stopped, Duration::from_secs(20));
}

/// Copy the running exe to %ProgramFiles%\hello-agent\hello-agent.exe and
/// return the destination path. If the running exe is already the installed
/// path (e.g., the user ran `hello-agent.exe --install` from the install
/// directory after a manual update), we skip the copy.
fn stage_binary() -> Result<PathBuf> {
    let src = std::env::current_exe().context("current_exe")?;
    let src = src.canonicalize().unwrap_or(src);
    let dest_dir = install_dir();
    let dest = dest_dir.join(INSTALLED_EXE_NAME);

    let dest_canon = dest.canonicalize().ok();
    if dest_canon.as_ref() == Some(&src) {
        log::info!("running exe is already installed at {}", dest.display());
        return Ok(dest);
    }

    std::fs::create_dir_all(&dest_dir)
        .with_context(|| format!("create_dir_all {}", dest_dir.display()))?;

    // If something is already there (an old install), Windows allows
    // overwriting if no process holds the file open. The service was either
    // never installed or we'll restart it after this; either way, the
    // running --install process is the only handle we worry about, and that
    // handle is on `src`, not `dest`.
    std::fs::copy(&src, &dest).with_context(|| {
        format!("copy {} -> {}", src.display(), dest.display())
    })?;
    log::info!(
        "installed binary: {} -> {}",
        src.display(),
        dest.display()
    );
    Ok(dest)
}

/// Copy the calling user's `hello-agent.toml` + `hello-agent2.toml` into
/// the LocalService-effective config dir so the SYSTEM service sees them.
fn mirror_config_to_service_dir() -> Result<()> {
    let dest_dir = service_config_dir();
    std::fs::create_dir_all(&dest_dir)
        .with_context(|| format!("create_dir_all {}", dest_dir.display()))?;

    // Main config first, then the auxiliary one.
    // NOTE(review): each copy overwrites whatever the service already has
    // at the destination. `uninstall()` keeps the service-side
    // `hello-agent.toml` specifically to preserve the agent's id/keypair —
    // confirm that replacing it with the calling user's file on every
    // install is intended.
    let sources = [
        hbb_common::config::Config::file(),
        hbb_common::config::Config2::file(),
    ];

    let mut copied = 0usize;
    for src in sources {
        let name = match src.file_name() {
            Some(name) => name,
            None => continue,
        };
        let dest = dest_dir.join(name);

        if let Err(e) = std::fs::copy(&src, &dest) {
            if e.kind() == std::io::ErrorKind::NotFound {
                // The calling user never had this file (e.g. --install
                // without --config, or a first run on this machine). Info
                // level, so the post-install log shows which toml files
                // were available and which weren't.
                log::info!(
                    "no source file at {} (skipped — service worker will generate it)",
                    src.display()
                );
            } else {
                // Any other copy failure is reported but doesn't abort the
                // remaining file(s).
                log::warn!("mirror {} -> {}: {e}", src.display(), dest.display());
            }
            continue;
        }

        copied += 1;
        log::info!("mirrored {} -> {}", src.display(), dest.display());
    }

    if copied == 0 {
        log::info!(
            "no user-side config files to mirror to {}",
            dest_dir.display()
        );
    }
    Ok(())
}

// ----------------------------- uninstall ------------------------------------

/// Removes the HelloAgent service, its firewall rule and its install dir.
///
/// The LocalService config directory is intentionally left in place (see
/// the note at the end of the body).
///
/// # Errors
/// Fails when the SCM can't be opened, when the service can't be opened for
/// a reason other than "does not exist", or when deleting it fails.
pub fn uninstall() -> Result<()> {
    // CONNECT is all we need on the SCM handle itself; DELETE is requested
    // on the per-service open below. Same elevation-error mapping as
    // install(): one clear message when the prompt isn't elevated.
    let scm = ServiceManager::local_computer(None::<&str>, ServiceManagerAccess::CONNECT)
        .map_err(map_scm_open_error)?;

    // Kill every hello-agent.exe except ourselves *first*. SCM Stop alone
    // isn't enough: the `--cm` child spawned via `run_as_user` runs under
    // the logged-in user's token, outside the service's process tree, so
    // the SCM never reaches it. Doing this up front means the SCM stop
    // below is usually a no-op and the rmdir at the end no longer races a
    // lingering child that holds hello-agent.exe open.
    kill_orphan_processes();

    let wanted = ServiceAccess::QUERY_STATUS | ServiceAccess::STOP | ServiceAccess::DELETE;
    let existing = match scm.open_service(SERVICE_NAME, wanted) {
        Ok(svc) => Some(svc),
        Err(windows_service::Error::Winapi(e))
            if e.raw_os_error()
                == Some(winapi::shared::winerror::ERROR_SERVICE_DOES_NOT_EXIST as i32) =>
        {
            log::info!("service '{}' not present (no-op)", SERVICE_NAME);
            None
        }
        Err(e) => return Err(anyhow!("open_service: {e}")),
    };

    if let Some(svc) = existing {
        // Stop and wait are best effort — we want --uninstall to leave
        // nothing behind even if the service is in a weird state. After
        // the kill above the service process is typically already gone, so
        // the 20 s wait is only a safety net for the case where the
        // supervisor could not be killed.
        let not_stopped = matches!(
            svc.query_status(),
            Ok(status) if status.current_state != ServiceState::Stopped
        );
        if not_stopped {
            let _ = svc.stop();
            wait_for_state(&svc, ServiceState::Stopped, Duration::from_secs(20));
        }
        svc.delete().context("delete service")?;
        log::info!("service '{}' deleted", SERVICE_NAME);
    }

    // Best-effort removal of the firewall rule we installed; a rule that
    // was never there (or was removed by hand) also counts as success.
    if let Err(e) = delete_firewall_rule() {
        log::warn!("could not delete firewall rule ({e:#}); remove it manually if needed");
    }

    cleanup_install_dir();

    // The LocalService config dir is deliberately NOT deleted.
    // `hello-agent.toml` in that directory holds the agent's id + keypair,
    // which the rustdesk-server / rendezvous server has registered against
    // the agent's id. Wiping it would force the next --install to generate
    // fresh keys that mismatch the server's cached entry (and any supporter
    // that resolved the agent recently) — the encrypted handshake then
    // silently fails on the supporter side until the peer times out.
    //
    // Operators who want a true hard wipe can run:
    //     rmdir /s /q "%SystemRoot%\ServiceProfiles\LocalService\AppData\Roaming\hello-agent"
    // and then delete the device record from the rustdesk-server admin UI.
    log::info!("preserved LocalService config dir to keep agent keys/id stable across reinstalls");
    Ok(())
}

/// Best-effort sweep of every hello-agent.exe process other than ourselves.
/// Used by both `--install` (so an in-place update isn't blocked by an
/// old `--cm` child holding the exe open) and `--uninstall` (so the
/// rmdir at the end isn't racing a lingering child).
///
/// Walks the process table via `hbb_common::sysinfo` (the same enumerator
/// the vendored rustdesk uses internally) and calls `Process::kill` —
/// equivalent to `TerminateProcess` under the hood. After issuing the
/// kills we poll the process table for actual exit rather than guessing
/// at a 500 ms sleep: `TerminateProcess` marks the process as exited but
/// the kernel takes a variable amount of time to release the image-file
/// handle, and we only want to return once those handles are gone (so
/// the install-time file copy and uninstall-time rmdir don't race a
/// half-finalized victim).
fn kill_orphan_processes() {
    // hbb_common re-exports a sysinfo fork whose System/Process/Pid expose
    // inherent methods, so no extension-trait imports are needed here.
    use hbb_common::sysinfo::{Pid, System};

    let target = INSTALLED_EXE_NAME;
    let our_pid = std::process::id();

    let mut system = System::new();
    system.refresh_processes();

    // Everything named like the installed exe, except this very process.
    let mut victims: Vec<Pid> = Vec::new();
    for (pid, process) in system.processes() {
        if pid.as_u32() == our_pid {
            continue;
        }
        if process.name().eq_ignore_ascii_case(target) {
            victims.push(*pid);
        }
    }

    if victims.is_empty() {
        log::info!("no orphan {target} processes to kill");
        return;
    }

    // Issue the kills, remembering which ones were accepted.
    let mut killed: Vec<u32> = Vec::new();
    for pid in &victims {
        let Some(process) = system.process(*pid) else {
            continue;
        };
        if process.kill() {
            killed.push(pid.as_u32());
        } else {
            log::warn!("Process::kill failed for pid {}", pid.as_u32());
        }
    }
    log::info!("issued kill on {} {target} process(es): {killed:?}", killed.len());

    // Poll the process table until every victim has disappeared, instead
    // of sleeping for a fixed guess. The 5 s ceiling is only spent when the
    // system is slow to finalize a terminated process.
    let deadline = Instant::now() + Duration::from_secs(5);
    loop {
        if Instant::now() >= deadline {
            break;
        }
        system.refresh_processes();
        let all_gone = victims.iter().all(|pid| system.process(*pid).is_none());
        if all_gone {
            return;
        }
        std::thread::sleep(Duration::from_millis(50));
    }
    log::warn!(
        "some {target} processes were still alive after 5 s; subsequent file ops may fail with sharing violation"
    );
}

/// Translate a `windows_service::Error` from `ServiceManager::local_computer`
/// into a friendlier user-facing message. ERROR_ACCESS_DENIED (Win32 err 5)
/// is the overwhelmingly common case — operator forgot to elevate — and
/// deserves a single clear line rather than the raw Win32 errno string.
fn map_scm_open_error(e: windows_service::Error) -> anyhow::Error {
    if let windows_service::Error::Winapi(ref ioe) = e {
        if ioe.raw_os_error() == Some(5) {
            return anyhow!(
                "requires an elevated (Administrator) prompt — re-run from \"Run as administrator\""
            );
        }
    }
    anyhow!("open SCM: {e}")
}

/// Add a Windows Firewall rule allowing inbound TCP/UDP to the installed
/// hello-agent.exe. Idempotent: we delete any prior rule by the same name
/// first, so re-running --install (or upgrading in place) doesn't pile up
/// duplicate entries in the firewall's per-name list.
///
/// We use the program-scoped form (`program=<path>`) rather than port-scoped
/// rules because hello-agent's optional listeners (direct-server TCP 21118,
/// LAN-discovery UDP 21119) are gated on operator-controlled config flags;
/// rule-by-program covers whatever ports the agent actually decides to bind.
fn install_firewall_rule(exe_path: &std::path::Path) -> Result<()> {
    // Takes `&Path` rather than `&PathBuf`: callers holding a `PathBuf`
    // still pass `&path_buf` unchanged (deref coercion), and borrowed paths
    // are accepted too.
    let name_arg = format!("name={FIREWALL_RULE_NAME}");

    // Drop any pre-existing rule first so re-installs don't accumulate
    // duplicates. netsh exits non-zero when nothing matches, and a failure
    // to launch netsh will surface on the `add` below, so the result is
    // deliberately ignored here.
    let _ = run_netsh(&["advfirewall", "firewall", "delete", "rule", &name_arg]);

    let exe_str = exe_path.to_str().ok_or_else(|| {
        anyhow!(
            "non-UTF-8 install path can't be passed to netsh: {}",
            exe_path.display()
        )
    })?;
    let program_arg = format!("program={exe_str}");

    // Program-scoped inbound allow rule, all profiles.
    let added = run_netsh(&[
        "advfirewall",
        "firewall",
        "add",
        "rule",
        &name_arg,
        "dir=in",
        "action=allow",
        "enable=yes",
        "profile=any",
        &program_arg,
    ])?;
    if !added {
        return Err(anyhow!("netsh add rule failed"));
    }

    log::info!(
        "added firewall rule '{FIREWALL_RULE_NAME}' for {}",
        exe_path.display()
    );
    Ok(())
}

/// Remove the hello-agent firewall rule by name. netsh exits non-zero when
/// no rule matches; we translate that into success since the post-condition
/// (no rule by that name) is what we want anyway.
fn delete_firewall_rule() -> Result<()> {
    let name_arg = format!("name={FIREWALL_RULE_NAME}");

    // Only a failure to run netsh at all is an error; its exit status
    // (rule deleted vs. nothing matched) is intentionally not inspected.
    run_netsh(&["advfirewall", "firewall", "delete", "rule", &name_arg])?;

    log::info!("removed firewall rule '{FIREWALL_RULE_NAME}' (or none was present)");
    Ok(())
}

/// Shell out to netsh.exe with the given args.
///
/// Returns `Ok(true)` on exit code 0, `Ok(false)` on a non-zero exit that
/// *netsh itself* produced (e.g. "no rules match"), and `Err` only when the
/// binary couldn't be invoked at all (PATH stripped, etc.).
///
/// On a non-zero exit both captured output streams are logged at debug
/// level: netsh prints most of its diagnostics on stdout, so logging stderr
/// alone would usually record an empty message.
fn run_netsh(args: &[&str]) -> Result<bool> {
    // NOTE(review): "netsh" is resolved through the normal program search;
    // consider an absolute `%SystemRoot%\System32\netsh.exe` path since
    // this runs elevated — confirm against the deployment requirements.
    let out = std::process::Command::new("netsh")
        .args(args)
        .output()
        .context("invoke netsh")?;

    let succeeded = out.status.success();
    if !succeeded {
        let stdout = String::from_utf8_lossy(&out.stdout);
        let stderr = String::from_utf8_lossy(&out.stderr);
        log::debug!(
            "netsh {args:?} exited {:?}: stdout={:?} stderr={:?}",
            out.status.code(),
            stdout.trim(),
            stderr.trim()
        );
    }
    Ok(succeeded)
}

/// Remove %ProgramFiles%\hello-agent. Best-effort: if the user ran
/// --uninstall from inside the install dir, the running exe is locked
/// open by the OS and the rmdir will fail. We log and move on; the
/// remaining files are harmless and can be deleted manually after exit.
fn cleanup_install_dir() {
    let dir = install_dir();

    // Nothing to do when the directory is already gone.
    if dir.exists() {
        if let Err(e) = std::fs::remove_dir_all(&dir) {
            // Expected when the running exe lives inside `dir`.
            log::warn!(
                "could not remove {} ({}); delete manually if needed",
                dir.display(),
                e
            );
        } else {
            log::info!("removed install dir {}", dir.display());
        }
    }
}

/// Polls `svc` every 250 ms until it reports `target` or `timeout` elapses.
///
/// Returns `true` as soon as the target state is observed and `false` on
/// timeout. A failed status query is treated like "not there yet".
fn wait_for_state(
    svc: &windows_service::service::Service,
    target: ServiceState,
    timeout: Duration,
) -> bool {
    let started = Instant::now();
    loop {
        if started.elapsed() >= timeout {
            return false;
        }
        let reached = matches!(svc.query_status(), Ok(status) if status.current_state == target);
        if reached {
            return true;
        }
        std::thread::sleep(Duration::from_millis(250));
    }
}

// ----------------------------- service runtime ------------------------------

// Generates the low-level `ffi_service_main` entry point that
// `run_as_service` hands to `service_dispatcher::start`; it calls
// `service_main` with the service's start arguments.
windows_service::define_windows_service!(ffi_service_main, service_main);

/// SCM dispatcher entry: registers `ffi_service_main` for `SERVICE_NAME`
/// with the service dispatcher.
///
/// # Errors
/// Returns the dispatcher's error, prefixed with `service_dispatcher::start`.
pub fn run_as_service() -> Result<()> {
    if let Err(e) = service_dispatcher::start(SERVICE_NAME, ffi_service_main) {
        return Err(anyhow!("service_dispatcher::start: {e}"));
    }
    Ok(())
}

fn service_main(_args: Vec<OsString>) {
    if let Err(e) = service_main_inner() {
        log::error!("service_main: {e:#}");
    }
}

fn service_main_inner() -> Result<()> {
    let stop_flag = Arc::new(AtomicBool::new(false));
    let stop_flag_handler = stop_flag.clone();

    // We poll WTSGetActiveConsoleSessionId every iteration of the main loop,
    // so we don't need session-change events from the SCM. Keeping the
    // handler set narrow (Stop/Shutdown/Interrogate) means SCM won't deliver
    // events we'd just throw away.
    let event_handler = move |control_event| -> ServiceControlHandlerResult {
        match control_event {
            ServiceControl::Stop | ServiceControl::Shutdown => {
                stop_flag_handler.store(true, Ordering::SeqCst);
                ServiceControlHandlerResult::NoError
            }
            ServiceControl::Interrogate => ServiceControlHandlerResult::NoError,
            _ => ServiceControlHandlerResult::NotImplemented,
        }
    };

    let status_handle = service_control_handler::register(SERVICE_NAME, event_handler)
        .map_err(|e| anyhow!("register handler: {e}"))?;

    set_status(
        &status_handle,
        ServiceState::Running,
        ServiceControlAccept::STOP | ServiceControlAccept::SHUTDOWN,
    )?;

    log::info!("hello-agent service started");

    // Generate a fresh per-boot unattended-access password and report it
    // to the rustdesk-server admin API. Runs in a background thread with
    // its own Tokio runtime so it doesn't block the supervisor poll loop;
    // retries internally until the server acknowledges (early attempts
    // can race the rendezvous registration done by `--server`).
    crate::unattended_password::rotate_and_report();

    // Worker process handle. Killed on Stop, replaced on session change.
    // `last_state` carries (session_id, had_user). The `had_user` bit is
    // what forces a respawn when a user logs in to a session we're
    // *already* running in (login-screen console → same session, but now
    // with a user) — the new `--server` needs to pre-spawn its `--cm`
    // child against the freshly-available user token, which the prior
    // `--server` couldn't do.
    let mut worker: Option<Worker> = None;
    let mut last_state: Option<(u32, bool)> = None;

    while !stop_flag.load(Ordering::SeqCst) {
        // Pick a target session in this priority order:
        //
        //   1. Active *user* session (RDP-connected user, or physical
        //      console with a logged-in user) — the normal case, full
        //      screen capture / input / popup.
        //   2. Physical console session at the login or lock screen
        //      (no user, but `winlogon.exe` is running so
        //      `launch_privileged_process` works and DXGI desktop
        //      duplication can capture the login screen). This is what
        //      enables unattended access via the per-boot password — the
        //      supporter sees the actual login screen, not a black
        //      "No displays" panel.
        //   3. Session 0 (where this supervisor itself lives as
        //      LocalSystem). Last-ditch fallback, no display, no input —
        //      rendezvous + heartbeat keep flowing but capture is
        //      empty. We avoid it whenever (2) is reachable.
        let active = find_active_user_session();
        let target = active
            .or_else(active_console_session_for_capture)
            .unwrap_or(0);
        let target_has_user = active.is_some();
        let target_state = (target, target_has_user);
        let worker_dead = worker.as_ref().map(|w| !w.is_alive()).unwrap_or(false);

        let needs_respawn = match (worker.is_some(), last_state) {
            (false, _) => true,
            (_, Some(prev)) if prev != target_state => true,
            _ if worker_dead => true,
            _ => false,
        };

        if needs_respawn {
            if let Some(prev) = worker.take() {
                prev.kill_and_wait(Duration::from_secs(5));
            }
            let spawn_result = if target == 0 {
                Worker::spawn_in_service_session()
            } else {
                Worker::spawn(target)
            };
            match spawn_result {
                Ok(w) => {
                    if target == 0 {
                        log::info!(
                            "no console or user session reachable; spawned --server \
                             in Session 0 (registration only — screen capture \
                             unavailable until a session is available)"
                        );
                    } else if active.is_some() {
                        log::info!(
                            "spawned --server worker into user session {target}"
                        );
                    } else {
                        log::info!(
                            "no user logged in; spawned --server into console \
                             session {target} (login screen capture)"
                        );
                    }
                    worker = Some(w);
                    last_state = Some(target_state);
                }
                Err(e) => {
                    log::warn!("spawn worker failed: {e:#}");
                    std::thread::sleep(Duration::from_secs(5));
                }
            }
        }

        std::thread::sleep(Duration::from_millis(750));
    }

    // Shutdown.
    if let Some(prev) = worker.take() {
        prev.kill_and_wait(Duration::from_secs(5));
    }

    set_status(
        &status_handle,
        ServiceState::Stopped,
        ServiceControlAccept::empty(),
    )?;
    log::info!("hello-agent service stopped");
    Ok(())
}

/// Report `state` to the SCM through `handle`.
///
/// `accept` is the set of control codes the service accepts while in
/// `state`. Every report carries exit code 0, checkpoint 0 and a
/// five-second wait hint.
///
/// # Errors
///
/// Any failure from `set_service_status` is returned as an `anyhow` error
/// prefixed with `set_service_status:`.
fn set_status(
    handle: &service_control_handler::ServiceStatusHandle,
    state: ServiceState,
    accept: ServiceControlAccept,
) -> Result<()> {
    let report = ServiceStatus {
        service_type: SERVICE_TYPE,
        current_state: state,
        controls_accepted: accept,
        exit_code: ServiceExitCode::Win32(0),
        checkpoint: 0,
        wait_hint: Duration::from_secs(5),
        process_id: None,
    };

    match handle.set_service_status(report) {
        Ok(()) => Ok(()),
        Err(e) => Err(anyhow!("set_service_status: {e}")),
    }
}

/// Worker process handle. We use `librustdesk::platform::launch_privileged_process`
/// (the same path stock rustdesk's `--service` uses) which calls
/// `LaunchProcessWin(..., as_user=FALSE, ...)` — the new process runs as
/// SYSTEM in the active console session. SYSTEM-in-user-session can both
/// (a) read config from the LocalService-effective path our install flow
/// mirrors to, and (b) draw UI / capture screen / send input on the user's
/// desktop (it's the standard service-side-of-remote-control pattern).
///
/// The Session 0 fallback (`Worker::spawn_in_service_session`) obtains its
/// handle from a `std::process::Child` instead, by taking ownership of the
/// child's raw process handle.
///
/// We get back a Win32 HANDLE rather than a `std::process::Child`; this
/// thin wrapper exposes the few operations the supervisor loop needs and
/// closes the handle on drop.
struct Worker {
    /// Raw Win32 process handle of the spawned `--server` child. Owned by
    /// this struct: it is closed exactly once, in `Drop`.
    handle: winapi::shared::ntdef::HANDLE,
}

// SAFETY: `Worker` only holds a raw process HANDLE, which is `*mut c_void`
// and therefore not `Send` by default. We never dereference that pointer;
// it is an opaque value that is only handed back to Win32 calls
// (`WaitForSingleObject`, `TerminateProcess`, `CloseHandle`), so moving the
// wrapper to another thread is safe.
unsafe impl Send for Worker {}

impl Worker {
    /// Launch `<current exe> --server` into `session_id` via
    /// `librustdesk::platform::launch_privileged_process`.
    ///
    /// # Errors
    ///
    /// Fails if the current executable path cannot be resolved or is not
    /// valid UTF-8, if the launch helper reports an error, or if it returns
    /// a NULL handle.
    fn spawn(session_id: u32) -> Result<Self> {
        let exe = std::env::current_exe().context("current_exe")?;
        let exe_str = exe
            .to_str()
            .ok_or_else(|| anyhow!("non-UTF-8 exe path: {}", exe.display()))?;
        // Quote the path so directories containing spaces survive
        // command-line parsing.
        let cmd = format!("\"{exe_str}\" --server");
        let handle = librustdesk::platform::launch_privileged_process(session_id, &cmd)
            .map_err(|e| anyhow!("launch_privileged_process: {e}"))?;
        if handle.is_null() {
            return Err(anyhow!(
                "launch_privileged_process returned NULL handle (session {session_id} not ready?)"
            ));
        }
        Ok(Self { handle })
    }

    /// Spawn `--server` in our own session (Session 0, LocalSystem). Used
    /// when no user is logged in: we can't `launch_privileged_process` for
    /// session 0 because that helper resolves the target token via
    /// `winlogon.exe`/`explorer.exe`, neither of which run in Session 0.
    /// The supervisor itself is LocalSystem-in-Session-0, so a plain
    /// `Command::spawn` puts the child in the same place with the same
    /// token — exactly what we want for the no-user-logged-in fallback.
    ///
    /// # Errors
    ///
    /// Fails if the current executable path cannot be resolved or the
    /// child process cannot be spawned.
    fn spawn_in_service_session() -> Result<Self> {
        use std::os::windows::io::IntoRawHandle;

        let exe = std::env::current_exe().context("current_exe")?;
        let child = std::process::Command::new(&exe)
            .arg("--server")
            .spawn()
            .with_context(|| format!("spawn {} --server", exe.display()))?;
        // Take ownership of the child's process HANDLE; this suppresses
        // `Child::Drop`'s close so kill_and_wait / Drop on Worker manage
        // the lifetime cleanly via TerminateProcess + CloseHandle.
        let handle = child.into_raw_handle() as winapi::shared::ntdef::HANDLE;
        Ok(Self { handle })
    }

    /// Non-blocking liveness probe: `true` while the process is still
    /// running.
    fn is_alive(&self) -> bool {
        // WAIT_TIMEOUT (0x102) means the wait expired without the handle
        // being signaled — i.e., the process is still running. Anything
        // else (WAIT_OBJECT_0 = exited, WAIT_FAILED = error) we treat as
        // dead so the supervisor will respawn.
        const WAIT_TIMEOUT: u32 = 0x0000_0102;
        // SAFETY: `self.handle` is a process handle owned by this `Worker`
        // and stays open until `Drop` runs.
        let r = unsafe { winapi::um::synchapi::WaitForSingleObject(self.handle, 0) };
        r == WAIT_TIMEOUT
    }

    /// Terminate the worker (exit code 1) and wait up to `timeout` for it
    /// to exit. Best effort: failures of either call are ignored, e.g.
    /// when the process has already exited on its own.
    ///
    /// Consumes `self`, so the handle is closed by `Drop` on return.
    fn kill_and_wait(self, timeout: Duration) {
        // 0xFFFF_FFFF is Win32 `INFINITE`. Clamp strictly below it so an
        // oversized `timeout` stays a bounded wait and cannot hang the
        // supervisor forever if TerminateProcess did not take effect.
        const MAX_FINITE_WAIT_MS: u32 = u32::MAX - 1;
        let wait_ms = timeout.as_millis().min(u128::from(MAX_FINITE_WAIT_MS)) as u32;

        // SAFETY: `self.handle` is a process handle owned by this `Worker`;
        // it is only closed afterwards, when `self` is dropped.
        unsafe {
            winapi::um::processthreadsapi::TerminateProcess(self.handle, 1);
            let _ = winapi::um::synchapi::WaitForSingleObject(self.handle, wait_ms);
        }
        // Drop closes the handle.
    }
}

/// Releases the worker's process handle. This does not terminate the
/// process; it only gives up our reference to it.
impl Drop for Worker {
    fn drop(&mut self) {
        // SAFETY: `self.handle` is owned by this `Worker` and `drop` runs
        // at most once, so the handle is closed exactly once.
        // The result is deliberately ignored: there is nothing useful to
        // do about a failed close while dropping.
        let _closed = unsafe { winapi::um::handleapi::CloseHandle(self.handle) };
    }
}

/// Find the session hosting the user's *active* interactive desktop,
/// whether that is the physical console or an RDP connection. Returns
/// `None` when no user is actively logged in anywhere, or when session
/// enumeration fails.
///
/// `WTSGetActiveConsoleSessionId()` is not suitable here: it only reports
/// the session attached to the **physical** console. For an RDP-only user
/// the console session is empty (or at the lock screen), so that call
/// points at the wrong target and the popup is rendered on the invisible
/// console desktop while the RDP user sees nothing.
///
/// Instead we enumerate all sessions and take the first one that is in
/// the `WTSActive` state *and* yields a user token. `WTSActive` means "the
/// user is at the keyboard of this session right now": the RDP session
/// while they are on RDP, the console session while they are at the
/// machine. A user who disconnected from RDP without logging out is
/// reported as `WTSDisconnected` and is skipped.
pub(crate) fn find_active_user_session() -> Option<u32> {
    use winapi::shared::ntdef::HANDLE;
    use winapi::um::handleapi::CloseHandle;
    use winapi::um::wtsapi32::WTSQueryUserToken;

    /// Field-for-field mirror of Win32 `WTS_SESSION_INFOW`.
    #[repr(C)]
    struct WtsSessionInfoW {
        session_id: u32,
        win_station_name: *mut u16,
        state: i32, // WTS_CONNECTSTATE_CLASS
    }

    // `WTSActive` in WTS_CONNECTSTATE_CLASS.
    const WTS_ACTIVE: i32 = 0;
    extern "system" {
        fn WTSEnumerateSessionsW(
            h_server: HANDLE,
            reserved: u32,
            version: u32,
            pp_session_info: *mut *mut WtsSessionInfoW,
            p_count: *mut u32,
        ) -> i32;
        fn WTSFreeMemory(p_memory: *mut std::ffi::c_void);
    }

    let mut list: *mut WtsSessionInfoW = std::ptr::null_mut();
    let mut len: u32 = 0;
    // SAFETY: both out-pointers refer to live locals; a NULL server handle
    // is WTS_CURRENT_SERVER_HANDLE, reserved is 0 and version is 1.
    let enumerated =
        unsafe { WTSEnumerateSessionsW(std::ptr::null_mut(), 0, 1, &mut list, &mut len) };
    if enumerated == 0 || list.is_null() {
        return None;
    }

    // SAFETY: on success the API hands back `len` contiguous entries
    // starting at `list`; they stay valid until WTSFreeMemory below.
    let entries = unsafe { std::slice::from_raw_parts(list, len as usize) };

    let found = entries
        .iter()
        .filter(|entry| entry.state == WTS_ACTIVE)
        .map(|entry| entry.session_id)
        .find(|&id| {
            // The login-screen session has no logged-in user and therefore
            // no token; only a session with a token qualifies.
            let mut token: HANDLE = std::ptr::null_mut();
            // SAFETY: `token` is a live local out-parameter.
            let got_token = unsafe { WTSQueryUserToken(id, &mut token) } != 0;
            if got_token && !token.is_null() {
                // We only needed to know the token exists.
                // SAFETY: `token` was just returned to us and is closed once.
                unsafe { CloseHandle(token) };
                true
            } else {
                false
            }
        });

    // SAFETY: `list` came from WTSEnumerateSessionsW and is freed exactly
    // once; `entries` is not used past this point.
    unsafe { WTSFreeMemory(list as *mut _) };
    found
}

/// ID of the session attached to the physical console, used as the
/// fallback spawn target when no user is logged in.
///
/// At the login or lock screen `winlogon.exe` runs in this session, which
/// is enough for `launch_privileged_process` to find a SYSTEM token there
/// and spawn `--server` into a session that has an actual display
/// (Session 0 doesn't).
///
/// Returns `None` when Windows reports no console attached (boot,
/// fast-user-switching mid-detach) and also when the console session is
/// session 0, since that is our own session and offers nothing over the
/// Session 0 fallback that comes after.
pub(crate) fn active_console_session_for_capture() -> Option<u32> {
    use winapi::um::winbase::WTSGetActiveConsoleSessionId;

    /// Sentinel returned when no console session is attached.
    const NO_CONSOLE_ATTACHED: u32 = 0xFFFF_FFFF;
    /// The service's own session.
    const SERVICE_SESSION: u32 = 0;

    // SAFETY: the call takes no arguments and has no preconditions.
    match unsafe { WTSGetActiveConsoleSessionId() } {
        NO_CONSOLE_ATTACHED | SERVICE_SESSION => None,
        attached => Some(attached),
    }
}

/// Session ID of the calling process, or `None` if
/// `ProcessIdToSessionId` fails.
///
/// `--server` uses this to learn which session it was launched into, so
/// that its `--cm` child lands in the *same* session and therefore on the
/// interactive desktop the user is actually using.
fn current_process_session() -> Option<u32> {
    use winapi::um::processthreadsapi::{GetCurrentProcessId, ProcessIdToSessionId};

    let mut session: u32 = 0;
    // SAFETY: `session` is a live local out-parameter and the PID is our own.
    let resolved = unsafe { ProcessIdToSessionId(GetCurrentProcessId(), &mut session) } != 0;
    if resolved {
        Some(session)
    } else {
        None
    }
}

/// Spawn `hello-agent.exe --cm` into the active console session as the
/// logged-in user, **on the user's interactive desktop**.
///
/// Why we don't just call `librustdesk::platform::run_as_user(["--cm"])`:
/// the C-side `LaunchProcessWin` only sets `STARTUPINFO.lpDesktop =
/// L"winsta0\\default"` when its `show` parameter is `TRUE`. `run_as_user`
/// hardcodes `show=false`, leaving `lpDesktop = NULL`. With NULL, the new
/// process inherits the *parent's* desktop. Our parent chain (`--service`
/// in Session 0 → `--server` in user session as SYSTEM token) is rooted
/// in Session 0's `Service-0x...\Default` desktop, so any UI rendered by
/// the resulting `--cm` child draws there — invisible to the logged-in
/// user. This helper sets `lpDesktop` explicitly so the popup actually
/// reaches the user's screen.
/// Convenience wrapper used by `run_server`: spawn `--cm` into the same
/// session the calling process itself is running in. Falls back to
/// `find_active_user_session` if `ProcessIdToSessionId` fails for some
/// reason.
pub(crate) fn spawn_cm_in_my_session() -> Result<u32> {
    let session_id = current_process_session()
        .or_else(find_active_user_session)
        .ok_or_else(|| anyhow!("no active user session to spawn --cm into"))?;
    spawn_cm_into_user_desktop(session_id)
}

/// Spawn `<current exe> --cm` into `session_id` under that session's
/// logged-in user token, explicitly placed on the `winsta0\default`
/// desktop, and return the new process ID.
///
/// `STARTUPINFOW.lpDesktop` is set explicitly because leaving it NULL makes
/// the child inherit the parent's desktop; our parent chain is rooted in
/// the Session 0 service desktop, where a popup would be invisible to the
/// user.
///
/// The child is not tracked: both of its handles are closed before
/// returning, and it runs until the user session ends or it exits on its
/// own.
///
/// # Errors
///
/// Fails if the current executable path cannot be resolved, if
/// `WTSQueryUserToken` fails (e.g. nobody is logged in to `session_id`),
/// or if `CreateProcessAsUserW` fails.
pub(crate) fn spawn_cm_into_user_desktop(session_id: u32) -> Result<u32> {
    use std::os::windows::ffi::OsStrExt;
    use winapi::shared::ntdef::HANDLE;
    use winapi::um::handleapi::CloseHandle;
    use winapi::um::processthreadsapi::{CreateProcessAsUserW, PROCESS_INFORMATION, STARTUPINFOW};
    use winapi::um::winbase::DETACHED_PROCESS;
    use winapi::um::wtsapi32::WTSQueryUserToken;

    // 1. Build the command line *before* acquiring any handle, so a failing
    //    `current_exe` cannot leak the user token. The path is encoded
    //    straight from its OS representation (no lossy UTF-8 round trip).
    //    CreateProcessAsUserW may patch the lpCommandLine buffer in place,
    //    so it has to be a mutable Vec.
    let exe = std::env::current_exe().context("current_exe")?;
    let mut cmd_w: Vec<u16> = std::iter::once(u16::from(b'"'))
        .chain(exe.as_os_str().encode_wide())
        .chain("\" --cm".encode_utf16())
        .chain(std::iter::once(0))
        .collect();

    // 2. The desktop string is referenced by si.lpDesktop and must stay
    //    alive until CreateProcessAsUserW returns.
    let mut desktop_w: Vec<u16> = std::ffi::OsStr::new("winsta0\\default")
        .encode_wide()
        .chain(Some(0))
        .collect();

    // SAFETY: STARTUPINFOW and PROCESS_INFORMATION are plain C structs for
    // which all-zero bytes are a valid (empty) value.
    let mut si: STARTUPINFOW = unsafe { std::mem::zeroed() };
    si.cb = std::mem::size_of::<STARTUPINFOW>() as u32;
    si.lpDesktop = desktop_w.as_mut_ptr();

    let mut pi: PROCESS_INFORMATION = unsafe { std::mem::zeroed() };

    // 3. Grab the user's primary access token for this session. Requires
    //    SE_TCB_NAME; SYSTEM has it by default. From here on there is no
    //    early return until the token has been closed.
    let mut user_token: HANDLE = std::ptr::null_mut();
    // SAFETY: `user_token` is a live local out-parameter.
    let ok = unsafe { WTSQueryUserToken(session_id, &mut user_token) };
    if ok == 0 {
        let err = std::io::Error::last_os_error();
        return Err(anyhow!(
            "WTSQueryUserToken(session={}): {} (no user logged in?)",
            session_id,
            err
        ));
    }

    // 4. Spawn. DETACHED_PROCESS so the child has no console attached and
    //    isn't tied to ours. We do not pass an environment block — NULL
    //    means "inherit ours", which is fine for cm_popup.
    // SAFETY: every pointer argument is either NULL or points at a buffer /
    // struct that outlives the call (`cmd_w`, `desktop_w`, `si`, `pi`).
    let created = unsafe {
        CreateProcessAsUserW(
            user_token,
            std::ptr::null(),
            cmd_w.as_mut_ptr(),
            std::ptr::null_mut(),
            std::ptr::null_mut(),
            0,
            DETACHED_PROCESS,
            std::ptr::null_mut(),
            std::ptr::null(),
            &mut si,
            &mut pi,
        )
    };
    // Capture the failure reason before CloseHandle can overwrite the
    // thread's last-error value.
    let outcome = if created == 0 {
        Err(std::io::Error::last_os_error())
    } else {
        Ok(())
    };

    // SAFETY: `user_token` was returned by WTSQueryUserToken above and is
    // closed exactly once, on both the success and the failure path.
    unsafe { CloseHandle(user_token) };

    if let Err(err) = outcome {
        return Err(anyhow!("CreateProcessAsUserW: {}", err));
    }

    let pid = pi.dwProcessId;
    // We don't track the child's lifetime here. It will outlive the
    // calling --server until either the user session ends (Windows reaps
    // it) or it exits voluntarily on cm_popup error.
    // SAFETY: both handles were just returned by CreateProcessAsUserW and
    // are closed exactly once.
    unsafe {
        CloseHandle(pi.hProcess);
        CloseHandle(pi.hThread);
    }
    Ok(pid)
}