From b4878bef4e45d52819a8be45bbd803b256103609 Mon Sep 17 00:00:00 2001 From: Gal Zaidman Date: Wed, 24 Jun 2026 21:19:36 +0300 Subject: [PATCH] fix(server): prevent exec relays from hanging on idle connections Add HTTP/2 keepalive on supervisor multiplex connections so half-dead sessions cannot leave in-flight exec relays parked indefinitely. Configure SSH keepalive on exec relay clients so long silent commands are not timed out on stdout idle alone; wedged or orphaned relays fail after missed keepalives instead. After a command reports exit status, bound how long the gateway waits for the trailing channel close. Return UNAVAILABLE when a relay closes before reporting exit status rather than defaulting to exit code 1. Signed-off-by: Gal Zaidman --- architecture/gateway.md | 9 +++ crates/openshell-server/src/grpc/sandbox.rs | 66 +++++++++++++++++++-- crates/openshell-server/src/multiplex.rs | 13 +++- 3 files changed, 80 insertions(+), 8 deletions(-) diff --git a/architecture/gateway.md b/architecture/gateway.md index 979422d7e..84693ad5d 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -365,6 +365,15 @@ The same relay pattern backs interactive SSH, command execution, file sync, and local service forwarding. The gateway tracks live sessions in memory and persists session records so tokens can expire or be revoked. +Relay liveness has two backstops so a reset supervisor session cannot leave a +request parked forever. The gateway runs server-side HTTP/2 keepalive on +supervisor connections, and each exec relay's SSH client uses SSH keepalive: an +exec channel may be legitimately silent for a long time (e.g. an agent whose +stdout is redirected to a file), so the exec is never ended on output-idle +alone — instead an unanswered keepalive on a wedged or orphaned relay closes the +channel and returns the exec with an error. Once a command reports its exit +status, the gateway also bounds how long it waits for the trailing channel close. + `ForwardTcp` is the client-facing byte stream for SSH and service forwarding. The first frame is a `TcpForwardInit` that carries the sandbox ID, an authorization token from `CreateSshSession`, and an explicit target: diff --git a/crates/openshell-server/src/grpc/sandbox.rs b/crates/openshell-server/src/grpc/sandbox.rs index 28377394f..850493a41 100644 --- a/crates/openshell-server/src/grpc/sandbox.rs +++ b/crates/openshell-server/src/grpc/sandbox.rs @@ -1484,6 +1484,36 @@ fn shell_escape(value: &str) -> Result { /// Maximum total length of the assembled shell command string. const MAX_COMMAND_STRING_LEN: usize = 256 * 1024; // 256 KiB +/// SSH keepalive for silent exec relays; stdout idle is not a timeout signal. +const EXEC_KEEPALIVE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(15); + +/// Allow this many missed keepalive responses before russh fails the relay. +const EXEC_KEEPALIVE_MAX: usize = 4; + +/// Max wait for a trailing `Close` after `ExitStatus`. +const EXEC_POST_EXIT_CLOSE_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(500); + +/// russh client config for exec relays. +fn exec_ssh_client_config() -> russh::client::Config { + russh::client::Config { + keepalive_interval: Some(EXEC_KEEPALIVE_INTERVAL), + keepalive_max: EXEC_KEEPALIVE_MAX, + ..Default::default() + } +} + +/// Treat channel EOF before an exit status as relay failure, not exit code 1. +fn exec_loop_result(exit_code: Option) -> Result { + exit_code.map_or_else( + || { + Err(Status::unavailable( + "exec relay closed before the command reported an exit status", + )) + }, + Ok, + ) +} + fn build_remote_exec_command(req: &ExecSandboxRequest) -> Result { let mut parts = Vec::new(); let mut env_entries = req.environment.iter().collect::>(); @@ -1690,7 +1720,7 @@ async fn run_interactive_exec_with_russh( .await .map_err(|e| Status::internal(format!("failed to connect to ssh proxy: {e}")))?; - let config = Arc::new(russh::client::Config::default()); + let config = Arc::new(exec_ssh_client_config()); let mut client = russh::client::connect_stream(config, stream, SandboxSshClientHandler) .await .map_err(|e| Status::internal(format!("failed to establish ssh transport: {e}")))?; @@ -1746,7 +1776,19 @@ async fn run_interactive_exec_with_russh( }); let mut exit_code: Option = None; - while let Some(msg) = read_half.wait().await { + loop { + // Bound the post-ExitStatus wait against a lost Close. + let msg = if exit_code.is_some() { + match tokio::time::timeout(EXEC_POST_EXIT_CLOSE_TIMEOUT, read_half.wait()).await { + Ok(Some(msg)) => msg, + Ok(None) | Err(_) => break, + } + } else { + match read_half.wait().await { + Some(msg) => msg, + None => break, + } + }; match msg { ChannelMsg::Data { data } => { let event = Ok(ExecSandboxEvent { @@ -1787,7 +1829,7 @@ async fn run_interactive_exec_with_russh( .disconnect(russh::Disconnect::ByApplication, "exec complete", "en") .await; - Ok(exit_code.unwrap_or(1)) + exec_loop_result(exit_code) } /// Create a localhost SSH proxy that bridges to a relay `DuplexStream`. @@ -1849,7 +1891,7 @@ async fn run_exec_with_russh( .await .map_err(|e| Status::internal(format!("failed to connect to ssh proxy: {e}")))?; - let config = Arc::new(russh::client::Config::default()); + let config = Arc::new(exec_ssh_client_config()); let mut client = russh::client::connect_stream(config, stream, SandboxSshClientHandler) .await .map_err(|e| Status::internal(format!("failed to establish ssh transport: {e}")))?; @@ -1897,7 +1939,19 @@ async fn run_exec_with_russh( .map_err(|e| Status::internal(format!("failed to close ssh stdin: {e}")))?; let mut exit_code: Option = None; - while let Some(msg) = channel.wait().await { + loop { + // Bound the post-ExitStatus wait against a lost Close. + let msg = if exit_code.is_some() { + match tokio::time::timeout(EXEC_POST_EXIT_CLOSE_TIMEOUT, channel.wait()).await { + Ok(Some(msg)) => msg, + Ok(None) | Err(_) => break, + } + } else { + match channel.wait().await { + Some(msg) => msg, + None => break, + } + }; match msg { ChannelMsg::Data { data } => { let _ = tx @@ -1935,7 +1989,7 @@ async fn run_exec_with_russh( .disconnect(russh::Disconnect::ByApplication, "exec complete", "en") .await; - Ok(exit_code.unwrap_or(1)) + exec_loop_result(exit_code) } // --------------------------------------------------------------------------- diff --git a/crates/openshell-server/src/multiplex.rs b/crates/openshell-server/src/multiplex.rs index f4faa0867..9e70c6472 100644 --- a/crates/openshell-server/src/multiplex.rs +++ b/crates/openshell-server/src/multiplex.rs @@ -12,7 +12,7 @@ use http_body::Body; use http_body_util::BodyExt; use hyper::body::Incoming; use hyper_util::{ - rt::{TokioExecutor, TokioIo}, + rt::{TokioExecutor, TokioIo, TokioTimer}, server::conn::auto::Builder, service::TowerToHyperService, }; @@ -185,7 +185,16 @@ impl MultiplexService { let service = MultiplexedService::new(grpc_service, http_service); let mut builder = Builder::new(TokioExecutor::new()); - builder.http2().adaptive_window(true); + // Server-side HTTP/2 keepalive: supervisors hold long-lived sessions, and without + // it the gateway never PINGs them, so idle/half-dead connections linger and orphan + // in-flight relay execs. The timer is required — hyper panics on the keepalive + // interval without one. + builder + .http2() + .timer(TokioTimer::new()) + .adaptive_window(true) + .keep_alive_interval(Some(Duration::from_secs(20))) + .keep_alive_timeout(Duration::from_secs(10)); builder .serve_connection_with_upgrades(TokioIo::new(stream), service)