From e343caf2d2520abaaf0f2adcd2eee4814855eabd Mon Sep 17 00:00:00 2001 From: torlando-tech Date: Mon, 23 Feb 2026 12:30:30 -0500 Subject: [PATCH] Stability: WDT yield, BLE mutex fixes, time-based desync recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduces crash rate from every 60-85s to 1 reboot per 6+ minutes. Zero WDT triggers in 10-minute stability test. BLE mutex fixes (BLEInterface.cpp): - Release _mutex before blocking GATT ops in onConnected() and onServicesDiscovered() — prevents 5-30s main-loop stalls during service discovery, notification subscribe, identity exchange - Non-blocking try_lock() for peerCount(), getConnectedPeerSummaries(), get_stats() — returns empty/default if BLE task holds mutex - Write-without-response in initiateHandshake() WDT and persistence (main.cpp, sdkconfig.defaults, microReticulum): - 30s WDT timeout (up from 10s) for SPIFFS flash I/O headroom - Register Identity::set_persist_yield_callback() to feed WDT every 5 entries during save_known_destinations() (70+ entries = 30-50s) - WDT feeds between reticulum and identity persist calls BLE host desync recovery (NimBLEPlatform): - Time-based desync tracking instead of aggressive counter-based reboot - 60s tolerance without connections, 5 minutes with active connections (data still flows over existing BLE mesh links) - Remove immediate recoverBLEStack() from 574 handler and enterErrorRecovery() — let startScan() manage reboot decision - Increase CONNECTION_COOLDOWN from 3s to 10s to reduce 574 risk - Increase SCAN_FAIL_RECOVERY_THRESHOLD from 5 to 10 Co-Authored-By: Claude Opus 4.6 --- deps/microReticulum | 2 +- lib/ble_interface/BLEInterface.cpp | 75 +++++++++++-------- lib/ble_interface/BLEInterface.h | 2 +- .../platforms/NimBLEPlatform.cpp | 68 +++++++++++------ lib/ble_interface/platforms/NimBLEPlatform.h | 10 ++- sdkconfig.defaults | 4 +- src/main.cpp | 19 +++-- 7 files changed, 116 insertions(+), 64 deletions(-) diff --git a/deps/microReticulum b/deps/microReticulum index 2e2322b..12ef1ae 160000 --- a/deps/microReticulum +++ b/deps/microReticulum @@ -1 +1 @@ -Subproject commit 2e2322b671c482bdf3818ede2c522de7e8108718 +Subproject commit 12ef1ae89447c0c36d286f82c9f2a434bd848e90 diff --git a/lib/ble_interface/BLEInterface.cpp b/lib/ble_interface/BLEInterface.cpp index e200cc5..5956ec6 100644 --- a/lib/ble_interface/BLEInterface.cpp +++ b/lib/ble_interface/BLEInterface.cpp @@ -412,14 +412,19 @@ bool BLEInterface::sendToPeer(const Bytes& peer_identity, const Bytes& data) { //============================================================================= size_t BLEInterface::peerCount() const { - std::lock_guard lock(_mutex); - return _peer_manager.connectedCount(); + // Non-blocking: return 0 if BLE task is busy (avoids main loop WDT) + if (!_mutex.try_lock()) return 0; + size_t count = _peer_manager.connectedCount(); + _mutex.unlock(); + return count; } size_t BLEInterface::getConnectedPeerSummaries(PeerSummary* out, size_t max_count) const { if (!out || max_count == 0) return 0; - std::lock_guard lock(_mutex); + // Non-blocking: return 0 if BLE task is busy (avoids main loop WDT) + if (!_mutex.try_lock()) return 0; + std::lock_guard lock(_mutex, std::adopt_lock); // Cast away const for read-only access to non-const getConnectedPeers() auto& mutable_peer_manager = const_cast(_peer_manager); @@ -465,7 +470,9 @@ std::map BLEInterface::get_stats() const { stats["peripheral_connections"] = 0.0f; try { - std::lock_guard lock(_mutex); + // Non-blocking: return defaults if BLE task is busy (avoids main loop WDT) + if (!_mutex.try_lock()) return stats; + std::lock_guard lock(_mutex, std::adopt_lock); // Count central vs peripheral connections int central_count = 0; @@ -594,30 +601,33 @@ void BLEInterface::onScanResult(const ScanResult& result) { } void BLEInterface::onConnected(const ConnectionHandle& conn) { - std::lock_guard lock(_mutex); + { + std::lock_guard lock(_mutex); - Bytes mac = conn.peer_address.toBytes(); + Bytes mac = conn.peer_address.toBytes(); - // Update peer state - _peer_manager.setPeerState(mac, PeerState::HANDSHAKING); - _peer_manager.setPeerHandle(mac, conn.handle); + // Update peer state + _peer_manager.setPeerState(mac, PeerState::HANDSHAKING); + _peer_manager.setPeerHandle(mac, conn.handle); - // Set MTU from connection (onMTUChange only fires for peripheral connections) - if (conn.mtu > 0) { - _peer_manager.setPeerMTU(mac, conn.mtu); - } + // Set MTU from connection (onMTUChange only fires for peripheral connections) + if (conn.mtu > 0) { + _peer_manager.setPeerMTU(mac, conn.mtu); + } - // Mark as central connection (we initiated the connection) - PeerInfo* peer = _peer_manager.getPeerByMac(mac); - if (peer) { - peer->is_central = true; // We ARE central in this connection - } + // Mark as central connection (we initiated the connection) + PeerInfo* peer = _peer_manager.getPeerByMac(mac); + if (peer) { + peer->is_central = true; // We ARE central in this connection + } - INFO("BLE: Connected to " + conn.peer_address.toString() + - " handle=" + std::to_string(conn.handle) + - " mtu=" + std::to_string(conn.mtu) + " (we are central)"); + INFO("BLE: Connected to " + conn.peer_address.toString() + + " handle=" + std::to_string(conn.handle) + + " mtu=" + std::to_string(conn.mtu) + " (we are central)"); + } // _mutex released BEFORE blocking GATT service discovery - // Discover services + // Discover services — this does blocking GATT reads (3-15s) and must NOT + // hold _mutex, otherwise the NimBLE host task and main loop both block. _platform->discoverServices(conn.handle); } @@ -672,15 +682,15 @@ void BLEInterface::onMTUChanged(const ConnectionHandle& conn, uint16_t mtu) { } void BLEInterface::onServicesDiscovered(const ConnectionHandle& conn, bool success) { - std::lock_guard lock(_mutex); - if (!success) { WARNING("BLEInterface: Service discovery failed for " + conn.peer_address.toString()); - // Clean up peer state - NimBLE may have already disconnected internally, - // so onDisconnected callback might not fire. Manually reset peer state. - Bytes mac = conn.peer_address.toBytes(); - _peer_manager.connectionFailed(mac); + // Clean up peer state under brief lock + { + std::lock_guard lock(_mutex); + Bytes mac = conn.peer_address.toBytes(); + _peer_manager.connectionFailed(mac); + } // Try to disconnect (may be no-op if already disconnected) _platform->disconnect(conn.handle); @@ -689,6 +699,11 @@ void BLEInterface::onServicesDiscovered(const ConnectionHandle& conn, bool succe INFO("BLE: Services discovered for " + conn.peer_address.toString()); + // All operations below are blocking GATT ops — do NOT hold _mutex. + // Holding _mutex during these blocks the NimBLE host task (which needs + // _mutex for its own callbacks), causing "BLE stack stuck" detection, + // and blocks the main loop's peer status queries, causing WDT triggers. + // Enable notifications on TX characteristic _platform->enableNotifications(conn.handle, true); @@ -1094,8 +1109,8 @@ void BLEInterface::initiateHandshake(const ConnectionHandle& conn) { Bytes handshake = _identity_manager.initiateHandshake(mac); if (handshake.size() > 0) { - // Write our identity to peer's RX characteristic - _platform->write(conn.handle, handshake, true); + // Write our identity to peer's RX characteristic (no-response to avoid blocking) + _platform->write(conn.handle, handshake, false); DEBUG("BLEInterface: Sent identity handshake to " + conn.peer_address.toString()); } diff --git a/lib/ble_interface/BLEInterface.h b/lib/ble_interface/BLEInterface.h index 0d9538e..0659da6 100644 --- a/lib/ble_interface/BLEInterface.h +++ b/lib/ble_interface/BLEInterface.h @@ -49,7 +49,7 @@ public: static constexpr double SCAN_INTERVAL = 5.0; // Seconds between scans static constexpr double KEEPALIVE_INTERVAL = 15.0; // Seconds between keepalives static constexpr double MAINTENANCE_INTERVAL = 1.0; // Seconds between maintenance - static constexpr double CONNECTION_COOLDOWN = 3.0; // Seconds to wait after connection failure + static constexpr double CONNECTION_COOLDOWN = 10.0; // Seconds between connection attempts (reduces 574 risk) public: /** diff --git a/lib/ble_interface/platforms/NimBLEPlatform.cpp b/lib/ble_interface/platforms/NimBLEPlatform.cpp index b041de9..aacc2de 100644 --- a/lib/ble_interface/platforms/NimBLEPlatform.cpp +++ b/lib/ble_interface/platforms/NimBLEPlatform.cpp @@ -586,24 +586,25 @@ void NimBLEPlatform::enterErrorRecovery() { _slave_paused_for_master = false; // Wait for host to sync after any reset operation - // This is critical - the host needs time to fully reset and resync with controller + // Give the host up to 5s — NimBLE typically re-syncs within 1-3s if (!ble_hs_synced()) { - WARNING("NimBLEPlatform: Host not synced, waiting..."); + WARNING("NimBLEPlatform: Host not synced, waiting up to 5s..."); uint32_t sync_start = millis(); - while (!ble_hs_synced() && (millis() - sync_start) < 3000) { + while (!ble_hs_synced() && (millis() - sync_start) < 5000) { delay(50); - esp_task_wdt_reset(); } if (ble_hs_synced()) { INFO("NimBLEPlatform: Host sync restored after " + std::to_string(millis() - sync_start) + "ms"); } else { - ERROR("NimBLEPlatform: Host sync failed - escalating to full stack reset"); - // The host is stuck in reset. enterErrorRecovery can't fix this. - // Must do a full deinit/reinit cycle to recover the BLE radio. + // Don't immediately reboot — track desync time and let startScan() + // handle the reboot decision based on prolonged desync (30s). + WARNING("NimBLEPlatform: Host not synced after 5s, will retry on next scan cycle"); + if (_host_desync_since == 0) { + _host_desync_since = millis(); + } in_recovery = false; - recoverBLEStack(); - return; // recoverBLEStack handles state reset and advertising restart + return; } } @@ -662,28 +663,53 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) { return true; } - // Wait for host sync before trying to scan (host may be resetting after connection failure) + // Wait for host sync before trying to scan (host may be resetting after connection failure). + // NimBLE host self-recovers from most desyncs within 1-5s. Only reboot after prolonged desync. if (!ble_hs_synced()) { + // Track when desync started + if (_host_desync_since == 0) { + _host_desync_since = millis(); + } + DEBUG("NimBLEPlatform: Host not synced, waiting before scan..."); uint32_t sync_wait = millis(); - while (!ble_hs_synced() && (millis() - sync_wait) < 2000) { + while (!ble_hs_synced() && (millis() - sync_wait) < 3000) { delay(50); - esp_task_wdt_reset(); } if (!ble_hs_synced()) { + unsigned long desync_duration = millis() - _host_desync_since; _scan_fail_count++; - WARNING("NimBLEPlatform: Host still not synced after 2s, scan aborted (fail " + + WARNING("NimBLEPlatform: Host not synced, desync " + + std::to_string(desync_duration / 1000) + "s (fail " + std::to_string(_scan_fail_count) + "/" + std::to_string(SCAN_FAIL_RECOVERY_THRESHOLD) + ")"); - if (_scan_fail_count >= SCAN_FAIL_RECOVERY_THRESHOLD) { - WARNING("NimBLEPlatform: Host stuck, triggering full stack recovery"); + + // Only reboot after prolonged desync — brief desyncs self-recover. + // With active connections, data still flows over BLE mesh — we just + // can't discover new peers. Extend tolerance to 5 minutes. + unsigned long reboot_threshold = HOST_DESYNC_REBOOT_MS; // 60s base + if (getConnectionCount() > 0) { + reboot_threshold = 300000; // 5 min with active connections + } + if (desync_duration >= reboot_threshold) { + ERROR("NimBLEPlatform: Host desynced for " + + std::to_string(desync_duration / 1000) + "s (conns=" + + std::to_string(getConnectionCount()) + "), rebooting"); _scan_fail_count = 0; + _host_desync_since = 0; recoverBLEStack(); } return false; } } + // Host is synced — clear desync tracking + if (_host_desync_since != 0) { + unsigned long recovery_time = millis() - _host_desync_since; + INFO("NimBLEPlatform: Host re-synced after " + std::to_string(recovery_time) + "ms"); + _host_desync_since = 0; + } + // Log GAP hardware state before checking DEBUG("NimBLEPlatform: Pre-scan GAP state: disc=" + std::to_string(ble_gap_disc_active()) + " adv=" + std::to_string(ble_gap_adv_active()) + @@ -1027,18 +1053,14 @@ int NimBLEPlatform::nativeGapEventHandler(struct ble_gap_event* event, void* arg platform->_native_connect_success = false; platform->_native_connect_pending = false; - // Track connection establishment failures (574 = BLE_ERR_CONN_ESTABLISHMENT) + // Track connection establishment failures (574 = BLE_ERR_CONN_ESTABLISHMENT). + // These commonly cause brief host desyncs that self-recover. + // Don't escalate to enterErrorRecovery here — let the time-based + // desync tracking in startScan() handle reboot decisions. if (disc_reason == 574) { platform->_conn_establish_fail_count++; WARNING("NimBLEPlatform: Connection establishment failed (574), count=" + std::to_string(platform->_conn_establish_fail_count)); - - // If too many consecutive failures, trigger recovery - if (platform->_conn_establish_fail_count >= CONN_ESTABLISH_FAIL_THRESHOLD) { - WARNING("NimBLEPlatform: Too many connection establishment failures, entering recovery"); - platform->_conn_establish_fail_count = 0; - platform->enterErrorRecovery(); - } } } diff --git a/lib/ble_interface/platforms/NimBLEPlatform.h b/lib/ble_interface/platforms/NimBLEPlatform.h index 3ca26d1..b1ac518 100644 --- a/lib/ble_interface/platforms/NimBLEPlatform.h +++ b/lib/ble_interface/platforms/NimBLEPlatform.h @@ -252,15 +252,19 @@ private: Bytes _identity_data; unsigned long _scan_stop_time = 0; // millis() when to stop continuous scan - // BLE stack recovery + // BLE stack recovery — time-based desync tracking + // The NimBLE host self-recovers from most desyncs within 1-5s. + // We only reboot after prolonged desync (HOST_DESYNC_REBOOT_MS). uint8_t _scan_fail_count = 0; uint8_t _lightweight_reset_fails = 0; uint8_t _conn_establish_fail_count = 0; // rc=574 connection establishment failures unsigned long _last_full_recovery_time = 0; - static constexpr uint8_t SCAN_FAIL_RECOVERY_THRESHOLD = 5; + unsigned long _host_desync_since = 0; // millis() when host first lost sync (0 = synced) + static constexpr uint8_t SCAN_FAIL_RECOVERY_THRESHOLD = 10; static constexpr uint8_t LIGHTWEIGHT_RESET_MAX_FAILS = 3; - static constexpr uint8_t CONN_ESTABLISH_FAIL_THRESHOLD = 3; // Threshold for rc=574 + static constexpr uint8_t CONN_ESTABLISH_FAIL_THRESHOLD = 5; static constexpr unsigned long FULL_RECOVERY_COOLDOWN_MS = 60000; // 60 seconds + static constexpr unsigned long HOST_DESYNC_REBOOT_MS = 60000; // Reboot after 60s desync (no connections) bool recoverBLEStack(); // NimBLE objects diff --git a/sdkconfig.defaults b/sdkconfig.defaults index eb26dcc..a9676ab 100644 --- a/sdkconfig.defaults +++ b/sdkconfig.defaults @@ -77,8 +77,10 @@ CONFIG_FREERTOS_TIMER_TASK_STACK_DEPTH=4096 # ============================================================================ # Task Watchdog Timer (TWDT) # Detects task starvation and deadlock conditions +# 30s timeout accommodates SPIFFS flash I/O during identity persistence +# (writing 40-50 destinations to flash takes 5-15s with GC/erase) # ============================================================================ CONFIG_ESP_TASK_WDT_EN=y -CONFIG_ESP_TASK_WDT_TIMEOUT_S=10 +CONFIG_ESP_TASK_WDT_TIMEOUT_S=30 CONFIG_ESP_TASK_WDT_CHECK_IDLE_TASK_CPU0=y CONFIG_ESP_TASK_WDT_CHECK_IDLE_TASK_CPU1=y diff --git a/src/main.cpp b/src/main.cpp index 7b04533..a3b784b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1194,11 +1194,15 @@ void setup() { INFO("╚══════════════════════════════════════╝"); INFO(""); - // Subscribe main loop to Task Watchdog — detects hangs/deadlocks - // If loop() blocks for >10s (CONFIG_ESP_TASK_WDT_TIMEOUT_S), WDT fires - // with a backtrace showing exactly where the hang is - esp_task_wdt_add(NULL); // NULL = current task (loopTask) - INFO("Task Watchdog: loopTask subscribed"); + // Reconfigure Task Watchdog with 30s timeout (default 10s is too tight + // for SPIFFS flash I/O — identity persistence writes 40-50 entries and + // can take 5-15s with sector erases and garbage collection) + esp_task_wdt_init(30, true); // 30s timeout, panic on trigger + esp_task_wdt_add(NULL); // Subscribe loopTask + INFO("Task Watchdog: loopTask subscribed (30s timeout)"); + + // Feed WDT during long Identity persistence (71+ entries to SPIFFS can take >30s) + Identity::set_persist_yield_callback([]() { esp_task_wdt_reset(); }); // Show startup message INFO("Press any key to start messaging"); @@ -1275,10 +1279,15 @@ void loop() { reticulum->loop(); // Periodically persist identity/transport data (display names, paths, etc.) + // NOTE: Identity persistence writes 40-50 entries to SPIFFS flash, which + // involves sector erases (100ms each) and can take 5-15s total. + // WDT feeds between calls prevent timeout during heavy flash I/O. LOOP_STEP(5); // persist data reticulum->should_persist_data(); + esp_task_wdt_reset(); // Fast-persist known destinations (5s after dirty) to survive crashes Identity::should_persist_data(); + esp_task_wdt_reset(); // Process TCP interface LOOP_STEP(6); // TCP loop