mirror of
https://github.com/torlando-tech/pyxis.git
synced 2026-03-31 14:15:38 +00:00
Stability: WDT yield, BLE mutex fixes, time-based desync recovery
Reduces crash rate from every 60-85s to 1 reboot per 6+ minutes. Zero WDT triggers in 10-minute stability test.

BLE mutex fixes (BLEInterface.cpp):
- Release _mutex before blocking GATT ops in onConnected() and onServicesDiscovered() — prevents 5-30s main-loop stalls during service discovery, notification subscribe, identity exchange
- Non-blocking try_lock() for peerCount(), getConnectedPeerSummaries(), get_stats() — returns empty/default if BLE task holds mutex
- Write-without-response in initiateHandshake()

WDT and persistence (main.cpp, sdkconfig.defaults, microReticulum):
- 30s WDT timeout (up from 10s) for SPIFFS flash I/O headroom
- Register Identity::set_persist_yield_callback() to feed WDT every 5 entries during save_known_destinations() (70+ entries = 30-50s)
- WDT feeds between reticulum and identity persist calls

BLE host desync recovery (NimBLEPlatform):
- Time-based desync tracking instead of aggressive counter-based reboot
- 60s tolerance without connections, 5 minutes with active connections (data still flows over existing BLE mesh links)
- Remove immediate recoverBLEStack() from 574 handler and enterErrorRecovery() — let startScan() manage reboot decision
- Increase CONNECTION_COOLDOWN from 3s to 10s to reduce 574 risk
- Increase SCAN_FAIL_RECOVERY_THRESHOLD from 5 to 10

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -412,14 +412,19 @@ bool BLEInterface::sendToPeer(const Bytes& peer_identity, const Bytes& data) {
|
||||
//=============================================================================
|
||||
|
||||
size_t BLEInterface::peerCount() const {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
return _peer_manager.connectedCount();
|
||||
// Non-blocking: return 0 if BLE task is busy (avoids main loop WDT)
|
||||
if (!_mutex.try_lock()) return 0;
|
||||
size_t count = _peer_manager.connectedCount();
|
||||
_mutex.unlock();
|
||||
return count;
|
||||
}
|
||||
|
||||
size_t BLEInterface::getConnectedPeerSummaries(PeerSummary* out, size_t max_count) const {
|
||||
if (!out || max_count == 0) return 0;
|
||||
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
// Non-blocking: return 0 if BLE task is busy (avoids main loop WDT)
|
||||
if (!_mutex.try_lock()) return 0;
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex, std::adopt_lock);
|
||||
|
||||
// Cast away const for read-only access to non-const getConnectedPeers()
|
||||
auto& mutable_peer_manager = const_cast<BLE::BLEPeerManager&>(_peer_manager);
|
||||
@@ -465,7 +470,9 @@ std::map<std::string, float> BLEInterface::get_stats() const {
|
||||
stats["peripheral_connections"] = 0.0f;
|
||||
|
||||
try {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
// Non-blocking: return defaults if BLE task is busy (avoids main loop WDT)
|
||||
if (!_mutex.try_lock()) return stats;
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex, std::adopt_lock);
|
||||
|
||||
// Count central vs peripheral connections
|
||||
int central_count = 0;
|
||||
@@ -594,30 +601,33 @@ void BLEInterface::onScanResult(const ScanResult& result) {
|
||||
}
|
||||
|
||||
void BLEInterface::onConnected(const ConnectionHandle& conn) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
|
||||
Bytes mac = conn.peer_address.toBytes();
|
||||
Bytes mac = conn.peer_address.toBytes();
|
||||
|
||||
// Update peer state
|
||||
_peer_manager.setPeerState(mac, PeerState::HANDSHAKING);
|
||||
_peer_manager.setPeerHandle(mac, conn.handle);
|
||||
// Update peer state
|
||||
_peer_manager.setPeerState(mac, PeerState::HANDSHAKING);
|
||||
_peer_manager.setPeerHandle(mac, conn.handle);
|
||||
|
||||
// Set MTU from connection (onMTUChange only fires for peripheral connections)
|
||||
if (conn.mtu > 0) {
|
||||
_peer_manager.setPeerMTU(mac, conn.mtu);
|
||||
}
|
||||
// Set MTU from connection (onMTUChange only fires for peripheral connections)
|
||||
if (conn.mtu > 0) {
|
||||
_peer_manager.setPeerMTU(mac, conn.mtu);
|
||||
}
|
||||
|
||||
// Mark as central connection (we initiated the connection)
|
||||
PeerInfo* peer = _peer_manager.getPeerByMac(mac);
|
||||
if (peer) {
|
||||
peer->is_central = true; // We ARE central in this connection
|
||||
}
|
||||
// Mark as central connection (we initiated the connection)
|
||||
PeerInfo* peer = _peer_manager.getPeerByMac(mac);
|
||||
if (peer) {
|
||||
peer->is_central = true; // We ARE central in this connection
|
||||
}
|
||||
|
||||
INFO("BLE: Connected to " + conn.peer_address.toString() +
|
||||
" handle=" + std::to_string(conn.handle) +
|
||||
" mtu=" + std::to_string(conn.mtu) + " (we are central)");
|
||||
INFO("BLE: Connected to " + conn.peer_address.toString() +
|
||||
" handle=" + std::to_string(conn.handle) +
|
||||
" mtu=" + std::to_string(conn.mtu) + " (we are central)");
|
||||
} // _mutex released BEFORE blocking GATT service discovery
|
||||
|
||||
// Discover services
|
||||
// Discover services — this does blocking GATT reads (3-15s) and must NOT
|
||||
// hold _mutex, otherwise the NimBLE host task and main loop both block.
|
||||
_platform->discoverServices(conn.handle);
|
||||
}
|
||||
|
||||
@@ -672,15 +682,15 @@ void BLEInterface::onMTUChanged(const ConnectionHandle& conn, uint16_t mtu) {
|
||||
}
|
||||
|
||||
void BLEInterface::onServicesDiscovered(const ConnectionHandle& conn, bool success) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
|
||||
if (!success) {
|
||||
WARNING("BLEInterface: Service discovery failed for " + conn.peer_address.toString());
|
||||
|
||||
// Clean up peer state - NimBLE may have already disconnected internally,
|
||||
// so onDisconnected callback might not fire. Manually reset peer state.
|
||||
Bytes mac = conn.peer_address.toBytes();
|
||||
_peer_manager.connectionFailed(mac);
|
||||
// Clean up peer state under brief lock
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
Bytes mac = conn.peer_address.toBytes();
|
||||
_peer_manager.connectionFailed(mac);
|
||||
}
|
||||
|
||||
// Try to disconnect (may be no-op if already disconnected)
|
||||
_platform->disconnect(conn.handle);
|
||||
@@ -689,6 +699,11 @@ void BLEInterface::onServicesDiscovered(const ConnectionHandle& conn, bool succe
|
||||
|
||||
INFO("BLE: Services discovered for " + conn.peer_address.toString());
|
||||
|
||||
// All operations below are blocking GATT ops — do NOT hold _mutex.
|
||||
// Holding _mutex during these blocks the NimBLE host task (which needs
|
||||
// _mutex for its own callbacks), causing "BLE stack stuck" detection,
|
||||
// and blocks the main loop's peer status queries, causing WDT triggers.
|
||||
|
||||
// Enable notifications on TX characteristic
|
||||
_platform->enableNotifications(conn.handle, true);
|
||||
|
||||
@@ -1094,8 +1109,8 @@ void BLEInterface::initiateHandshake(const ConnectionHandle& conn) {
|
||||
Bytes handshake = _identity_manager.initiateHandshake(mac);
|
||||
|
||||
if (handshake.size() > 0) {
|
||||
// Write our identity to peer's RX characteristic
|
||||
_platform->write(conn.handle, handshake, true);
|
||||
// Write our identity to peer's RX characteristic (no-response to avoid blocking)
|
||||
_platform->write(conn.handle, handshake, false);
|
||||
|
||||
DEBUG("BLEInterface: Sent identity handshake to " + conn.peer_address.toString());
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user