From d29aa814d9a32ba523d5f383081d2f9f1168890a Mon Sep 17 00:00:00 2001
From: jolah1 <jokunlade@gmail.com>
Date: Thu, 4 Jun 2026 00:20:46 +0100
Subject: [PATCH] connection: per-peer exponential reconnection backoff
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, the background reconnection task retried every persisted peer
on a fixed 60s interval with no backoff, so an unreachable peer was retried
indefinitely at the same cadence — log spam and wasted work. This became
more visible after #895 retained peers across force-closes so that
channel_reestablish recovery can run.

Track per-peer reconnect state in ConnectionManager: on failure, double the
retry interval up to PEER_RECONNECTION_MAX_INTERVAL (30 min); on success
(including user-initiated connects), clear the state so a subsequent drop
retries promptly. The 60s tokio::time::interval is kept as the wakeup,
gated per-peer by next_retry_at, since #588's inline-sleep form does not
generalize to N peers. Backoff state is in-memory and resets on restart —
a fresh post-restart attempt is the correct behavior. State is also
cleared when a peer is removed from the persisted store.

Closes #918.
---
 src/config.rs     |   3 +
 src/connection.rs | 138 +++++++++++++++++++++++++++++++++++++++++++++-
 src/lib.rs        |  17 +++++-
 3 files changed, 153 insertions(+), 5 deletions(-)
diff --git a/src/config.rs b/src/config.rs
index 558a4d0618..d8eae5d900 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -70,6 +70,9 @@ pub(crate) const LDK_PAYMENT_RETRY_TIMEOUT: Duration = Duration::from_secs(10);
 // The time in-between peer reconnection attempts.
 pub(crate) const PEER_RECONNECTION_INTERVAL: Duration = Duration::from_secs(60);
 
+// The upper bound on the per-peer exponential backoff applied to failed reconnection attempts.
+pub(crate) const PEER_RECONNECTION_MAX_INTERVAL: Duration = Duration::from_secs(60 * 30);
+
 // The time in-between RGS sync attempts.
 pub(crate) const RGS_SYNC_INTERVAL: Duration = Duration::from_secs(60 * 60);
 
diff --git a/src/connection.rs b/src/connection.rs
index b8946ffe3a..91b7f76d97 100644
--- a/src/connection.rs
+++ b/src/connection.rs
@@ -8,22 +8,51 @@
 use std::collections::hash_map::{self, HashMap};
 use std::ops::Deref;
 use std::sync::{Arc, Mutex};
-use std::time::Duration;
+use std::time::{Duration, Instant};
 
 use bitcoin::secp256k1::PublicKey;
 use lightning::ln::msgs::SocketAddress;
 
-use crate::config::TorConfig;
+use crate::config::{TorConfig, PEER_RECONNECTION_INTERVAL, PEER_RECONNECTION_MAX_INTERVAL};
 use crate::logger::{log_debug, log_error, log_info, LdkLogger};
 use crate::types::{KeysManager, PeerManager};
 use crate::Error;
 
+struct PeerReconnectState {
+	consecutive_failures: u32,
+	next_retry_at: Instant,
+	next_backoff: Duration,
+}
+
+impl PeerReconnectState {
+	fn new(now: Instant) -> Self {
+		Self {
+			consecutive_failures: 0,
+			next_retry_at: now,
+			next_backoff: PEER_RECONNECTION_INTERVAL,
+		}
+	}
+
+	/// Bumps the failure count, schedules `next_retry_at` to `now + current backoff`,
+	/// and doubles the backoff for the following failure (capped at
+	/// [`PEER_RECONNECTION_MAX_INTERVAL`]). Returns the backoff that was scheduled.
+	fn record_failure(&mut self, now: Instant) -> Duration {
+		self.consecutive_failures = self.consecutive_failures.saturating_add(1);
+		let scheduled_backoff = self.next_backoff;
+		self.next_retry_at = now + scheduled_backoff;
+		self.next_backoff =
+			std::cmp::min(scheduled_backoff.saturating_mul(2), PEER_RECONNECTION_MAX_INTERVAL);
+		scheduled_backoff
+	}
+}
+
 pub(crate) struct ConnectionManager<L: Deref + Clone + Sync + Send>
 where
 	L::Target: LdkLogger,
 {
 	pending_connections:
 		Mutex<HashMap<PublicKey, Vec<tokio::sync::oneshot::Sender<Result<(), Error>>>>>,
+	reconnect_state: Mutex<HashMap<PublicKey, PeerReconnectState>>,
 	peer_manager: Arc<PeerManager>,
 	tor_proxy_config: Option<TorConfig>,
 	keys_manager: Arc<KeysManager>,
@@ -39,8 +68,60 @@ where
 		keys_manager: Arc<KeysManager>, logger: L,
 	) -> Self {
 		let pending_connections = Mutex::new(HashMap::new());
+		let reconnect_state = Mutex::new(HashMap::new());
+
+		Self {
+			pending_connections,
+			reconnect_state,
+			peer_manager,
+			tor_proxy_config,
+			keys_manager,
+			logger,
+		}
+	}
+
+	/// Returns whether the background reconnection task should attempt to reconnect
+	/// to `node_id` now, based on per-peer exponential backoff state.
+	pub(crate) fn is_reconnect_due(&self, node_id: &PublicKey) -> bool {
+		self.reconnect_state
+			.lock()
+			.expect("lock")
+			.get(node_id)
+			.map_or(true, |state| Instant::now() >= state.next_retry_at)
+	}
+
+	/// Records the outcome of a reconnection attempt and updates per-peer backoff.
+	///
+	/// On success, any existing backoff state is cleared. On failure, the per-peer
+	/// retry interval is doubled (up to [`PEER_RECONNECTION_MAX_INTERVAL`]) and the
+	/// next retry is scheduled accordingly.
+	pub(crate) fn record_reconnect_attempt(&self, node_id: &PublicKey, result: &Result<(), Error>) {
+		let mut state_lock = self.reconnect_state.lock().expect("lock");
+		if result.is_ok() {
+			if state_lock.remove(node_id).is_some() {
+				log_debug!(self.logger, "Cleared reconnection backoff for peer {}", node_id);
+			}
+			return;
+		}
 
-		Self { pending_connections, peer_manager, tor_proxy_config, keys_manager, logger }
+		let now = Instant::now();
+		let state = state_lock.entry(*node_id).or_insert_with(|| PeerReconnectState::new(now));
+		let scheduled_backoff = state.record_failure(now);
+
+		log_debug!(
+			self.logger,
+			"Reconnection to peer {} failed ({} consecutive failures); next retry in {}s",
+			node_id,
+			state.consecutive_failures,
+			scheduled_backoff.as_secs(),
+		);
+	}
+
+	/// Removes any per-peer backoff state for `node_id`, so a subsequent attempt
+	/// is treated as a fresh first try. Called when a peer is removed from the
+	/// persisted peer store.
+	pub(crate) fn clear_reconnect_state(&self, node_id: &PublicKey) {
+		self.reconnect_state.lock().expect("lock").remove(node_id);
 	}
 
 	pub(crate) async fn connect_peer_if_necessary(
@@ -57,6 +138,11 @@ where
 		&self, node_id: PublicKey, addr: SocketAddress,
 	) -> Result<(), Error> {
 		let res = self.do_connect_peer_internal(node_id, addr).await;
+		if res.is_ok() {
+			// Any successful connect (including user-initiated ones) resets backoff so the
+			// background reconnection loop retries promptly if the peer drops again.
+			self.clear_reconnect_state(&node_id);
+		}
 		self.propagate_result_to_subscribers(&node_id, res);
 		res
 	}
@@ -273,3 +359,49 @@ where
 		}
 	}
 }
+
+#[cfg(test)]
+mod tests {
+	use super::*;
+
+	#[test]
+	fn reconnect_state_doubles_until_capped() {
+		let start = Instant::now();
+		let mut state = PeerReconnectState::new(start);
+
+		let scheduled = state.record_failure(start);
+		assert_eq!(scheduled, PEER_RECONNECTION_INTERVAL);
+		assert_eq!(state.consecutive_failures, 1);
+		assert_eq!(state.next_retry_at, start + PEER_RECONNECTION_INTERVAL);
+
+		let mut expected = PEER_RECONNECTION_INTERVAL;
+		for failure_count in 2..32 {
+			expected = std::cmp::min(expected.saturating_mul(2), PEER_RECONNECTION_MAX_INTERVAL);
+			let scheduled = state.record_failure(start);
+			assert_eq!(scheduled, expected);
+			assert_eq!(state.consecutive_failures, failure_count);
+			assert_eq!(state.next_retry_at, start + expected);
+			assert!(state.next_backoff <= PEER_RECONNECTION_MAX_INTERVAL);
+		}
+
+		// Once capped, further failures stay at the cap.
+		assert_eq!(state.next_backoff, PEER_RECONNECTION_MAX_INTERVAL);
+		let scheduled = state.record_failure(start);
+		assert_eq!(scheduled, PEER_RECONNECTION_MAX_INTERVAL);
+		assert_eq!(state.next_backoff, PEER_RECONNECTION_MAX_INTERVAL);
+	}
+
+	#[test]
+	fn reconnect_state_schedules_relative_to_failure_time() {
+		let t0 = Instant::now();
+		let mut state = PeerReconnectState::new(t0);
+
+		let _ = state.record_failure(t0);
+		assert_eq!(state.next_retry_at, t0 + PEER_RECONNECTION_INTERVAL);
+
+		let t1 = t0 + Duration::from_secs(5);
+		let scheduled = state.record_failure(t1);
+		assert_eq!(scheduled, PEER_RECONNECTION_INTERVAL * 2);
+		assert_eq!(state.next_retry_at, t1 + PEER_RECONNECTION_INTERVAL * 2);
+	}
+}
diff --git a/src/lib.rs b/src/lib.rs
index 614be098b0..c926e1050d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -473,11 +473,22 @@ impl Node {
 								.map(|peer| peer.counterparty_node_id)
 								.collect::<Vec<_>>();
 
-							for peer_info in connect_peer_store.list_peers().iter().filter(|info| !pm_peers.contains(&info.node_id)) {
-								let _ = connect_cm.do_connect_peer(
+							for peer_info in connect_peer_store.list_peers().iter() {
+								if pm_peers.contains(&peer_info.node_id) {
+									// A connected peer (e.g., via an inbound connection) is
+									// proven reachable: reset any backoff so a future
+									// disconnect is retried promptly again.
+									connect_cm.clear_reconnect_state(&peer_info.node_id);
+									continue;
+								}
+								if !connect_cm.is_reconnect_due(&peer_info.node_id) {
+									continue;
+								}
+								let res = connect_cm.do_connect_peer(
 									peer_info.node_id,
 									peer_info.address.clone(),
 									).await;
+								connect_cm.record_reconnect_attempt(&peer_info.node_id, &res);
 							}
 						}
 				}
@@ -1144,6 +1155,7 @@ impl Node {
 				log_error!(self.logger, "Failed to remove peer {}: {}", counterparty_node_id, e)
 			},
 		}
+		self.connection_manager.clear_reconnect_state(&counterparty_node_id);
 
 		self.peer_manager.disconnect_by_node_id(counterparty_node_id);
 		Ok(())
@@ -1862,6 +1874,7 @@ impl Node {
 			// Check if this was the last open channel, if so, forget the peer.
 			if open_channels.len() == 1 {
 				self.peer_store.remove_peer(&counterparty_node_id)?;
+				self.connection_manager.clear_reconnect_state(&counterparty_node_id);
 			}
 		}