mirror of
https://github.com/torlando-tech/pyxis.git
synced 2026-05-14 19:35:05 +00:00
Stability: WDT yield, BLE mutex fixes, time-based desync recovery
Reduces crash rate from every 60-85s to 1 reboot per 6+ minutes. Zero WDT triggers in 10-minute stability test. BLE mutex fixes (BLEInterface.cpp): - Release _mutex before blocking GATT ops in onConnected() and onServicesDiscovered() — prevents 5-30s main-loop stalls during service discovery, notification subscribe, identity exchange - Non-blocking try_lock() for peerCount(), getConnectedPeerSummaries(), get_stats() — returns empty/default if BLE task holds mutex - Write-without-response in initiateHandshake() WDT and persistence (main.cpp, sdkconfig.defaults, microReticulum): - 30s WDT timeout (up from 10s) for SPIFFS flash I/O headroom - Register Identity::set_persist_yield_callback() to feed WDT every 5 entries during save_known_destinations() (70+ entries = 30-50s) - WDT feeds between reticulum and identity persist calls BLE host desync recovery (NimBLEPlatform): - Time-based desync tracking instead of aggressive counter-based reboot - 60s tolerance without connections, 5 minutes with active connections (data still flows over existing BLE mesh links) - Remove immediate recoverBLEStack() from 574 handler and enterErrorRecovery() — let startScan() manage reboot decision - Increase CONNECTION_COOLDOWN from 3s to 10s to reduce 574 risk - Increase SCAN_FAIL_RECOVERY_THRESHOLD from 5 to 10 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Vendored
+1
-1
Submodule deps/microReticulum updated: 2e2322b671...12ef1ae894
@@ -412,14 +412,19 @@ bool BLEInterface::sendToPeer(const Bytes& peer_identity, const Bytes& data) {
|
||||
//=============================================================================
|
||||
|
||||
size_t BLEInterface::peerCount() const {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
return _peer_manager.connectedCount();
|
||||
// Non-blocking: return 0 if BLE task is busy (avoids main loop WDT)
|
||||
if (!_mutex.try_lock()) return 0;
|
||||
size_t count = _peer_manager.connectedCount();
|
||||
_mutex.unlock();
|
||||
return count;
|
||||
}
|
||||
|
||||
size_t BLEInterface::getConnectedPeerSummaries(PeerSummary* out, size_t max_count) const {
|
||||
if (!out || max_count == 0) return 0;
|
||||
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
// Non-blocking: return 0 if BLE task is busy (avoids main loop WDT)
|
||||
if (!_mutex.try_lock()) return 0;
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex, std::adopt_lock);
|
||||
|
||||
// Cast away const for read-only access to non-const getConnectedPeers()
|
||||
auto& mutable_peer_manager = const_cast<BLE::BLEPeerManager&>(_peer_manager);
|
||||
@@ -465,7 +470,9 @@ std::map<std::string, float> BLEInterface::get_stats() const {
|
||||
stats["peripheral_connections"] = 0.0f;
|
||||
|
||||
try {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
// Non-blocking: return defaults if BLE task is busy (avoids main loop WDT)
|
||||
if (!_mutex.try_lock()) return stats;
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex, std::adopt_lock);
|
||||
|
||||
// Count central vs peripheral connections
|
||||
int central_count = 0;
|
||||
@@ -594,30 +601,33 @@ void BLEInterface::onScanResult(const ScanResult& result) {
|
||||
}
|
||||
|
||||
void BLEInterface::onConnected(const ConnectionHandle& conn) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
|
||||
Bytes mac = conn.peer_address.toBytes();
|
||||
Bytes mac = conn.peer_address.toBytes();
|
||||
|
||||
// Update peer state
|
||||
_peer_manager.setPeerState(mac, PeerState::HANDSHAKING);
|
||||
_peer_manager.setPeerHandle(mac, conn.handle);
|
||||
// Update peer state
|
||||
_peer_manager.setPeerState(mac, PeerState::HANDSHAKING);
|
||||
_peer_manager.setPeerHandle(mac, conn.handle);
|
||||
|
||||
// Set MTU from connection (onMTUChange only fires for peripheral connections)
|
||||
if (conn.mtu > 0) {
|
||||
_peer_manager.setPeerMTU(mac, conn.mtu);
|
||||
}
|
||||
// Set MTU from connection (onMTUChange only fires for peripheral connections)
|
||||
if (conn.mtu > 0) {
|
||||
_peer_manager.setPeerMTU(mac, conn.mtu);
|
||||
}
|
||||
|
||||
// Mark as central connection (we initiated the connection)
|
||||
PeerInfo* peer = _peer_manager.getPeerByMac(mac);
|
||||
if (peer) {
|
||||
peer->is_central = true; // We ARE central in this connection
|
||||
}
|
||||
// Mark as central connection (we initiated the connection)
|
||||
PeerInfo* peer = _peer_manager.getPeerByMac(mac);
|
||||
if (peer) {
|
||||
peer->is_central = true; // We ARE central in this connection
|
||||
}
|
||||
|
||||
INFO("BLE: Connected to " + conn.peer_address.toString() +
|
||||
" handle=" + std::to_string(conn.handle) +
|
||||
" mtu=" + std::to_string(conn.mtu) + " (we are central)");
|
||||
INFO("BLE: Connected to " + conn.peer_address.toString() +
|
||||
" handle=" + std::to_string(conn.handle) +
|
||||
" mtu=" + std::to_string(conn.mtu) + " (we are central)");
|
||||
} // _mutex released BEFORE blocking GATT service discovery
|
||||
|
||||
// Discover services
|
||||
// Discover services — this does blocking GATT reads (3-15s) and must NOT
|
||||
// hold _mutex, otherwise the NimBLE host task and main loop both block.
|
||||
_platform->discoverServices(conn.handle);
|
||||
}
|
||||
|
||||
@@ -672,15 +682,15 @@ void BLEInterface::onMTUChanged(const ConnectionHandle& conn, uint16_t mtu) {
|
||||
}
|
||||
|
||||
void BLEInterface::onServicesDiscovered(const ConnectionHandle& conn, bool success) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
|
||||
if (!success) {
|
||||
WARNING("BLEInterface: Service discovery failed for " + conn.peer_address.toString());
|
||||
|
||||
// Clean up peer state - NimBLE may have already disconnected internally,
|
||||
// so onDisconnected callback might not fire. Manually reset peer state.
|
||||
Bytes mac = conn.peer_address.toBytes();
|
||||
_peer_manager.connectionFailed(mac);
|
||||
// Clean up peer state under brief lock
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
Bytes mac = conn.peer_address.toBytes();
|
||||
_peer_manager.connectionFailed(mac);
|
||||
}
|
||||
|
||||
// Try to disconnect (may be no-op if already disconnected)
|
||||
_platform->disconnect(conn.handle);
|
||||
@@ -689,6 +699,11 @@ void BLEInterface::onServicesDiscovered(const ConnectionHandle& conn, bool succe
|
||||
|
||||
INFO("BLE: Services discovered for " + conn.peer_address.toString());
|
||||
|
||||
// All operations below are blocking GATT ops — do NOT hold _mutex.
|
||||
// Holding _mutex during these blocks the NimBLE host task (which needs
|
||||
// _mutex for its own callbacks), causing "BLE stack stuck" detection,
|
||||
// and blocks the main loop's peer status queries, causing WDT triggers.
|
||||
|
||||
// Enable notifications on TX characteristic
|
||||
_platform->enableNotifications(conn.handle, true);
|
||||
|
||||
@@ -1094,8 +1109,8 @@ void BLEInterface::initiateHandshake(const ConnectionHandle& conn) {
|
||||
Bytes handshake = _identity_manager.initiateHandshake(mac);
|
||||
|
||||
if (handshake.size() > 0) {
|
||||
// Write our identity to peer's RX characteristic
|
||||
_platform->write(conn.handle, handshake, true);
|
||||
// Write our identity to peer's RX characteristic (no-response to avoid blocking)
|
||||
_platform->write(conn.handle, handshake, false);
|
||||
|
||||
DEBUG("BLEInterface: Sent identity handshake to " + conn.peer_address.toString());
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ public:
|
||||
static constexpr double SCAN_INTERVAL = 5.0; // Seconds between scans
|
||||
static constexpr double KEEPALIVE_INTERVAL = 15.0; // Seconds between keepalives
|
||||
static constexpr double MAINTENANCE_INTERVAL = 1.0; // Seconds between maintenance
|
||||
static constexpr double CONNECTION_COOLDOWN = 3.0; // Seconds to wait after connection failure
|
||||
static constexpr double CONNECTION_COOLDOWN = 10.0; // Seconds between connection attempts (reduces 574 risk)
|
||||
|
||||
public:
|
||||
/**
|
||||
|
||||
@@ -586,24 +586,25 @@ void NimBLEPlatform::enterErrorRecovery() {
|
||||
_slave_paused_for_master = false;
|
||||
|
||||
// Wait for host to sync after any reset operation
|
||||
// This is critical - the host needs time to fully reset and resync with controller
|
||||
// Give the host up to 5s — NimBLE typically re-syncs within 1-3s
|
||||
if (!ble_hs_synced()) {
|
||||
WARNING("NimBLEPlatform: Host not synced, waiting...");
|
||||
WARNING("NimBLEPlatform: Host not synced, waiting up to 5s...");
|
||||
uint32_t sync_start = millis();
|
||||
while (!ble_hs_synced() && (millis() - sync_start) < 3000) {
|
||||
while (!ble_hs_synced() && (millis() - sync_start) < 5000) {
|
||||
delay(50);
|
||||
esp_task_wdt_reset();
|
||||
}
|
||||
if (ble_hs_synced()) {
|
||||
INFO("NimBLEPlatform: Host sync restored after " +
|
||||
std::to_string(millis() - sync_start) + "ms");
|
||||
} else {
|
||||
ERROR("NimBLEPlatform: Host sync failed - escalating to full stack reset");
|
||||
// The host is stuck in reset. enterErrorRecovery can't fix this.
|
||||
// Must do a full deinit/reinit cycle to recover the BLE radio.
|
||||
// Don't immediately reboot — track desync time and let startScan()
|
||||
// handle the reboot decision based on prolonged desync (30s).
|
||||
WARNING("NimBLEPlatform: Host not synced after 5s, will retry on next scan cycle");
|
||||
if (_host_desync_since == 0) {
|
||||
_host_desync_since = millis();
|
||||
}
|
||||
in_recovery = false;
|
||||
recoverBLEStack();
|
||||
return; // recoverBLEStack handles state reset and advertising restart
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -662,28 +663,53 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Wait for host sync before trying to scan (host may be resetting after connection failure)
|
||||
// Wait for host sync before trying to scan (host may be resetting after connection failure).
|
||||
// NimBLE host self-recovers from most desyncs within 1-5s. Only reboot after prolonged desync.
|
||||
if (!ble_hs_synced()) {
|
||||
// Track when desync started
|
||||
if (_host_desync_since == 0) {
|
||||
_host_desync_since = millis();
|
||||
}
|
||||
|
||||
DEBUG("NimBLEPlatform: Host not synced, waiting before scan...");
|
||||
uint32_t sync_wait = millis();
|
||||
while (!ble_hs_synced() && (millis() - sync_wait) < 2000) {
|
||||
while (!ble_hs_synced() && (millis() - sync_wait) < 3000) {
|
||||
delay(50);
|
||||
esp_task_wdt_reset();
|
||||
}
|
||||
if (!ble_hs_synced()) {
|
||||
unsigned long desync_duration = millis() - _host_desync_since;
|
||||
_scan_fail_count++;
|
||||
WARNING("NimBLEPlatform: Host still not synced after 2s, scan aborted (fail " +
|
||||
WARNING("NimBLEPlatform: Host not synced, desync " +
|
||||
std::to_string(desync_duration / 1000) + "s (fail " +
|
||||
std::to_string(_scan_fail_count) + "/" +
|
||||
std::to_string(SCAN_FAIL_RECOVERY_THRESHOLD) + ")");
|
||||
if (_scan_fail_count >= SCAN_FAIL_RECOVERY_THRESHOLD) {
|
||||
WARNING("NimBLEPlatform: Host stuck, triggering full stack recovery");
|
||||
|
||||
// Only reboot after prolonged desync — brief desyncs self-recover.
|
||||
// With active connections, data still flows over BLE mesh — we just
|
||||
// can't discover new peers. Extend tolerance to 5 minutes.
|
||||
unsigned long reboot_threshold = HOST_DESYNC_REBOOT_MS; // 60s base
|
||||
if (getConnectionCount() > 0) {
|
||||
reboot_threshold = 300000; // 5 min with active connections
|
||||
}
|
||||
if (desync_duration >= reboot_threshold) {
|
||||
ERROR("NimBLEPlatform: Host desynced for " +
|
||||
std::to_string(desync_duration / 1000) + "s (conns=" +
|
||||
std::to_string(getConnectionCount()) + "), rebooting");
|
||||
_scan_fail_count = 0;
|
||||
_host_desync_since = 0;
|
||||
recoverBLEStack();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Host is synced — clear desync tracking
|
||||
if (_host_desync_since != 0) {
|
||||
unsigned long recovery_time = millis() - _host_desync_since;
|
||||
INFO("NimBLEPlatform: Host re-synced after " + std::to_string(recovery_time) + "ms");
|
||||
_host_desync_since = 0;
|
||||
}
|
||||
|
||||
// Log GAP hardware state before checking
|
||||
DEBUG("NimBLEPlatform: Pre-scan GAP state: disc=" + std::to_string(ble_gap_disc_active()) +
|
||||
" adv=" + std::to_string(ble_gap_adv_active()) +
|
||||
@@ -1027,18 +1053,14 @@ int NimBLEPlatform::nativeGapEventHandler(struct ble_gap_event* event, void* arg
|
||||
platform->_native_connect_success = false;
|
||||
platform->_native_connect_pending = false;
|
||||
|
||||
// Track connection establishment failures (574 = BLE_ERR_CONN_ESTABLISHMENT)
|
||||
// Track connection establishment failures (574 = BLE_ERR_CONN_ESTABLISHMENT).
|
||||
// These commonly cause brief host desyncs that self-recover.
|
||||
// Don't escalate to enterErrorRecovery here — let the time-based
|
||||
// desync tracking in startScan() handle reboot decisions.
|
||||
if (disc_reason == 574) {
|
||||
platform->_conn_establish_fail_count++;
|
||||
WARNING("NimBLEPlatform: Connection establishment failed (574), count=" +
|
||||
std::to_string(platform->_conn_establish_fail_count));
|
||||
|
||||
// If too many consecutive failures, trigger recovery
|
||||
if (platform->_conn_establish_fail_count >= CONN_ESTABLISH_FAIL_THRESHOLD) {
|
||||
WARNING("NimBLEPlatform: Too many connection establishment failures, entering recovery");
|
||||
platform->_conn_establish_fail_count = 0;
|
||||
platform->enterErrorRecovery();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -252,15 +252,19 @@ private:
|
||||
Bytes _identity_data;
|
||||
unsigned long _scan_stop_time = 0; // millis() when to stop continuous scan
|
||||
|
||||
// BLE stack recovery
|
||||
// BLE stack recovery — time-based desync tracking
|
||||
// The NimBLE host self-recovers from most desyncs within 1-5s.
|
||||
// We only reboot after prolonged desync (HOST_DESYNC_REBOOT_MS).
|
||||
uint8_t _scan_fail_count = 0;
|
||||
uint8_t _lightweight_reset_fails = 0;
|
||||
uint8_t _conn_establish_fail_count = 0; // rc=574 connection establishment failures
|
||||
unsigned long _last_full_recovery_time = 0;
|
||||
static constexpr uint8_t SCAN_FAIL_RECOVERY_THRESHOLD = 5;
|
||||
unsigned long _host_desync_since = 0; // millis() when host first lost sync (0 = synced)
|
||||
static constexpr uint8_t SCAN_FAIL_RECOVERY_THRESHOLD = 10;
|
||||
static constexpr uint8_t LIGHTWEIGHT_RESET_MAX_FAILS = 3;
|
||||
static constexpr uint8_t CONN_ESTABLISH_FAIL_THRESHOLD = 3; // Threshold for rc=574
|
||||
static constexpr uint8_t CONN_ESTABLISH_FAIL_THRESHOLD = 5;
|
||||
static constexpr unsigned long FULL_RECOVERY_COOLDOWN_MS = 60000; // 60 seconds
|
||||
static constexpr unsigned long HOST_DESYNC_REBOOT_MS = 60000; // Reboot after 60s desync (no connections)
|
||||
bool recoverBLEStack();
|
||||
|
||||
// NimBLE objects
|
||||
|
||||
+3
-1
@@ -77,8 +77,10 @@ CONFIG_FREERTOS_TIMER_TASK_STACK_DEPTH=4096
|
||||
# ============================================================================
|
||||
# Task Watchdog Timer (TWDT)
|
||||
# Detects task starvation and deadlock conditions
|
||||
# 30s timeout accommodates SPIFFS flash I/O during identity persistence
|
||||
# (writing 40-50 destinations to flash takes 5-15s with GC/erase)
|
||||
# ============================================================================
|
||||
CONFIG_ESP_TASK_WDT_EN=y
|
||||
CONFIG_ESP_TASK_WDT_TIMEOUT_S=10
|
||||
CONFIG_ESP_TASK_WDT_TIMEOUT_S=30
|
||||
CONFIG_ESP_TASK_WDT_CHECK_IDLE_TASK_CPU0=y
|
||||
CONFIG_ESP_TASK_WDT_CHECK_IDLE_TASK_CPU1=y
|
||||
|
||||
+14
-5
@@ -1194,11 +1194,15 @@ void setup() {
|
||||
INFO("╚══════════════════════════════════════╝");
|
||||
INFO("");
|
||||
|
||||
// Subscribe main loop to Task Watchdog — detects hangs/deadlocks
|
||||
// If loop() blocks for >10s (CONFIG_ESP_TASK_WDT_TIMEOUT_S), WDT fires
|
||||
// with a backtrace showing exactly where the hang is
|
||||
esp_task_wdt_add(NULL); // NULL = current task (loopTask)
|
||||
INFO("Task Watchdog: loopTask subscribed");
|
||||
// Reconfigure Task Watchdog with 30s timeout (default 10s is too tight
|
||||
// for SPIFFS flash I/O — identity persistence writes 40-50 entries and
|
||||
// can take 5-15s with sector erases and garbage collection)
|
||||
esp_task_wdt_init(30, true); // 30s timeout, panic on trigger
|
||||
esp_task_wdt_add(NULL); // Subscribe loopTask
|
||||
INFO("Task Watchdog: loopTask subscribed (30s timeout)");
|
||||
|
||||
// Feed WDT during long Identity persistence (71+ entries to SPIFFS can take >30s)
|
||||
Identity::set_persist_yield_callback([]() { esp_task_wdt_reset(); });
|
||||
|
||||
// Show startup message
|
||||
INFO("Press any key to start messaging");
|
||||
@@ -1275,10 +1279,15 @@ void loop() {
|
||||
reticulum->loop();
|
||||
|
||||
// Periodically persist identity/transport data (display names, paths, etc.)
|
||||
// NOTE: Identity persistence writes 40-50 entries to SPIFFS flash, which
|
||||
// involves sector erases (100ms each) and can take 5-15s total.
|
||||
// WDT feeds between calls prevent timeout during heavy flash I/O.
|
||||
LOOP_STEP(5); // persist data
|
||||
reticulum->should_persist_data();
|
||||
esp_task_wdt_reset();
|
||||
// Fast-persist known destinations (5s after dirty) to survive crashes
|
||||
Identity::should_persist_data();
|
||||
esp_task_wdt_reset();
|
||||
|
||||
// Process TCP interface
|
||||
LOOP_STEP(6); // TCP loop
|
||||
|
||||
Reference in New Issue
Block a user