diff --git a/architecture/gateway.md b/architecture/gateway.md index 979422d7e..84693ad5d 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -365,6 +365,15 @@ The same relay pattern backs interactive SSH, command execution, file sync, and local service forwarding. The gateway tracks live sessions in memory and persists session records so tokens can expire or be revoked. +Relay liveness has two backstops so a reset supervisor session cannot leave a +request parked forever. The gateway runs server-side HTTP/2 keepalive on +supervisor connections, and each exec relay's SSH client uses SSH keepalive: an +exec channel may be legitimately silent for a long time (e.g. an agent whose +stdout is redirected to a file), so the exec is never ended on output-idle +alone — instead an unanswered keepalive on a wedged or orphaned relay closes the +channel and returns the exec with an error. Once a command reports its exit +status, the gateway also bounds how long it waits for the trailing channel close. + `ForwardTcp` is the client-facing byte stream for SSH and service forwarding. The first frame is a `TcpForwardInit` that carries the sandbox ID, an authorization token from `CreateSshSession`, and an explicit target: diff --git a/crates/openshell-server/src/grpc/sandbox.rs b/crates/openshell-server/src/grpc/sandbox.rs index 28377394f..850493a41 100644 --- a/crates/openshell-server/src/grpc/sandbox.rs +++ b/crates/openshell-server/src/grpc/sandbox.rs @@ -1484,6 +1484,36 @@ fn shell_escape(value: &str) -> Result { /// Maximum total length of the assembled shell command string. const MAX_COMMAND_STRING_LEN: usize = 256 * 1024; // 256 KiB +/// SSH keepalive for silent exec relays; stdout idle is not a timeout signal. +const EXEC_KEEPALIVE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(15); + +/// Allow this many missed keepalive responses before russh fails the relay. +const EXEC_KEEPALIVE_MAX: usize = 4; + +/// Max wait for a trailing `Close` after `ExitStatus`. +const EXEC_POST_EXIT_CLOSE_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(500); + +/// russh client config for exec relays. +fn exec_ssh_client_config() -> russh::client::Config { + russh::client::Config { + keepalive_interval: Some(EXEC_KEEPALIVE_INTERVAL), + keepalive_max: EXEC_KEEPALIVE_MAX, + ..Default::default() + } +} + +/// Treat channel EOF before an exit status as relay failure, not exit code 1. +fn exec_loop_result(exit_code: Option) -> Result { + exit_code.map_or_else( + || { + Err(Status::unavailable( + "exec relay closed before the command reported an exit status", + )) + }, + Ok, + ) +} + fn build_remote_exec_command(req: &ExecSandboxRequest) -> Result { let mut parts = Vec::new(); let mut env_entries = req.environment.iter().collect::>(); @@ -1690,7 +1720,7 @@ async fn run_interactive_exec_with_russh( .await .map_err(|e| Status::internal(format!("failed to connect to ssh proxy: {e}")))?; - let config = Arc::new(russh::client::Config::default()); + let config = Arc::new(exec_ssh_client_config()); let mut client = russh::client::connect_stream(config, stream, SandboxSshClientHandler) .await .map_err(|e| Status::internal(format!("failed to establish ssh transport: {e}")))?; @@ -1746,7 +1776,19 @@ async fn run_interactive_exec_with_russh( }); let mut exit_code: Option = None; - while let Some(msg) = read_half.wait().await { + loop { + // Bound the post-ExitStatus wait against a lost Close. + let msg = if exit_code.is_some() { + match tokio::time::timeout(EXEC_POST_EXIT_CLOSE_TIMEOUT, read_half.wait()).await { + Ok(Some(msg)) => msg, + Ok(None) | Err(_) => break, + } + } else { + match read_half.wait().await { + Some(msg) => msg, + None => break, + } + }; match msg { ChannelMsg::Data { data } => { let event = Ok(ExecSandboxEvent { @@ -1787,7 +1829,7 @@ async fn run_interactive_exec_with_russh( .disconnect(russh::Disconnect::ByApplication, "exec complete", "en") .await; - Ok(exit_code.unwrap_or(1)) + exec_loop_result(exit_code) } /// Create a localhost SSH proxy that bridges to a relay `DuplexStream`. @@ -1849,7 +1891,7 @@ async fn run_exec_with_russh( .await .map_err(|e| Status::internal(format!("failed to connect to ssh proxy: {e}")))?; - let config = Arc::new(russh::client::Config::default()); + let config = Arc::new(exec_ssh_client_config()); let mut client = russh::client::connect_stream(config, stream, SandboxSshClientHandler) .await .map_err(|e| Status::internal(format!("failed to establish ssh transport: {e}")))?; @@ -1897,7 +1939,19 @@ async fn run_exec_with_russh( .map_err(|e| Status::internal(format!("failed to close ssh stdin: {e}")))?; let mut exit_code: Option = None; - while let Some(msg) = channel.wait().await { + loop { + // Bound the post-ExitStatus wait against a lost Close. + let msg = if exit_code.is_some() { + match tokio::time::timeout(EXEC_POST_EXIT_CLOSE_TIMEOUT, channel.wait()).await { + Ok(Some(msg)) => msg, + Ok(None) | Err(_) => break, + } + } else { + match channel.wait().await { + Some(msg) => msg, + None => break, + } + }; match msg { ChannelMsg::Data { data } => { let _ = tx @@ -1935,7 +1989,7 @@ async fn run_exec_with_russh( .disconnect(russh::Disconnect::ByApplication, "exec complete", "en") .await; - Ok(exit_code.unwrap_or(1)) + exec_loop_result(exit_code) } // --------------------------------------------------------------------------- diff --git a/crates/openshell-server/src/multiplex.rs b/crates/openshell-server/src/multiplex.rs index f4faa0867..9e70c6472 100644 --- a/crates/openshell-server/src/multiplex.rs +++ b/crates/openshell-server/src/multiplex.rs @@ -12,7 +12,7 @@ use http_body::Body; use http_body_util::BodyExt; use hyper::body::Incoming; use hyper_util::{ - rt::{TokioExecutor, TokioIo}, + rt::{TokioExecutor, TokioIo, TokioTimer}, server::conn::auto::Builder, service::TowerToHyperService, }; @@ -185,7 +185,16 @@ impl MultiplexService { let service = MultiplexedService::new(grpc_service, http_service); let mut builder = Builder::new(TokioExecutor::new()); - builder.http2().adaptive_window(true); + // Server-side HTTP/2 keepalive: supervisors hold long-lived sessions, and without + // it the gateway never PINGs them, so idle/half-dead connections linger and orphan + // in-flight relay execs. The timer is required — hyper panics on the keepalive + // interval without one. + builder + .http2() + .timer(TokioTimer::new()) + .adaptive_window(true) + .keep_alive_interval(Some(Duration::from_secs(20))) + .keep_alive_timeout(Duration::from_secs(10)); builder .serve_connection_with_upgrades(TokioIo::new(stream), service)