Merge pull request #7 from torlando-tech/ble-stability-audit

BLE stability: fix desync crash loops and scan recovery
This commit is contained in:
Torlando
2026-03-03 23:39:38 -05:00
committed by GitHub
9 changed files with 410 additions and 105 deletions
+15 -5
View File
@@ -220,6 +220,14 @@ void BLEInterface::loop() {
if (resolved.size() == Limits::IDENTITY_SIZE) {
stored_id = resolved;
} else {
// Expire entries that have waited longer than HANDSHAKE_TIMEOUT.
// If a peer sends data but never completes handshake (e.g., disconnect
// during handshake), these entries would stay indefinitely.
if (now - _pending_data_pool[i].queued_at > Timing::HANDSHAKE_TIMEOUT) {
DEBUG("BLEInterface: Expiring stale pending data (no identity after " +
std::to_string((int)(now - _pending_data_pool[i].queued_at)) + "s)");
continue; // Drop this entry
}
// Still no identity — keep for next loop iteration
if (requeue_count != i) {
_pending_data_pool[requeue_count] = _pending_data_pool[i];
@@ -759,8 +767,8 @@ void BLEInterface::onCentralConnected(const ConnectionHandle& conn) {
Bytes mac = conn.peer_address.toBytes();
// Update peer manager
_peer_manager.addDiscoveredPeer(mac, 0);
// Update peer manager with connection RSSI
_peer_manager.addDiscoveredPeer(mac, conn.rssi);
_peer_manager.setPeerState(mac, PeerState::HANDSHAKING);
_peer_manager.setPeerHandle(mac, conn.handle);
@@ -923,10 +931,10 @@ void BLEInterface::processDiscoveredPeers() {
last_peer_log = now;
}
if (candidate && candidate->mac_address.size() >= Limits::MAC_SIZE) {
if (candidate && candidate->mac_address.size() >= Limits::MAC_SIZE &&
_peer_manager.canAcceptConnection()) {
INFO("BLE: Connection candidate: " + BLEAddress(candidate->mac_address.data()).toString() +
" type=" + std::to_string(candidate->address_type) +
" canAccept=" + std::string(_peer_manager.canAcceptConnection() ? "yes" : "no"));
" type=" + std::to_string(candidate->address_type));
}
if (candidate && _peer_manager.canAcceptConnection()) {
@@ -1086,6 +1094,7 @@ void BLEInterface::handleIncomingData(const ConnectionHandle& conn, const Bytes&
PendingData& pending = _pending_data_pool[_pending_data_count];
pending.identity = mac; // Use MAC as temporary key
pending.data = data;
pending.queued_at = Utilities::OS::time();
_pending_data_count++;
}
return;
@@ -1099,6 +1108,7 @@ void BLEInterface::handleIncomingData(const ConnectionHandle& conn, const Bytes&
PendingData& pending = _pending_data_pool[_pending_data_count];
pending.identity = identity;
pending.data = data;
pending.queued_at = Utilities::OS::time();
_pending_data_count++;
}
+1
View File
@@ -270,6 +270,7 @@ private:
struct PendingData {
RNS::Bytes identity;
RNS::Bytes data;
double queued_at = 0; // Timestamp for expiry of unresolvable entries
};
PendingData _pending_data_pool[MAX_PENDING_DATA];
size_t _pending_data_count = 0;
+2
View File
@@ -325,6 +325,7 @@ struct ConnectionHandle {
Role local_role = Role::NONE; // Our role in this connection
ConnectionState state = ConnectionState::DISCONNECTED;
uint16_t mtu = MTU::MINIMUM; // Negotiated MTU
int8_t rssi = 0; // Connection RSSI at connect time
// Characteristic handles (discovered after connection)
uint16_t rx_char_handle = 0; // Handle for RX characteristic
@@ -340,6 +341,7 @@ struct ConnectionHandle {
local_role = Role::NONE;
state = ConnectionState::DISCONNECTED;
mtu = MTU::MINIMUM;
rssi = 0;
rx_char_handle = 0;
tx_char_handle = 0;
tx_cccd_handle = 0;
+240 -99
View File
@@ -36,6 +36,10 @@ extern "C" {
void ble_hs_sched_reset(int reason);
}
// Defined in patched NimBLEDevice.cpp — set in onReset callback with the reason code.
// Poll from BLE loop to log via UDP (NimBLE's own logging only reaches serial UART).
extern volatile int nimble_host_reset_reason;
namespace RNS { namespace BLE {
//=============================================================================
@@ -204,6 +208,16 @@ void NimBLEPlatform::loop() {
return;
}
// Process deferred disconnects from NimBLE host task callbacks.
// Must run before other loop logic to ensure stale connections are cleaned up.
processPendingDisconnects();
// Process deferred error recovery (requested from callback context)
if (_error_recovery_requested) {
_error_recovery_requested = false;
enterErrorRecovery();
}
// Check if continuous scan should stop
portENTER_CRITICAL(&_state_mux);
MasterState ms = _master_state;
@@ -221,6 +235,7 @@ void NimBLEPlatform::loop() {
// Stuck-state safety net: if GAP hardware is idle but our state machine
// thinks we're busy, reset state machine. This recovers from missed callbacks
// (e.g., service discovery disconnect not properly cleaning up state).
// Skip during CONNECTING — connectNative() can legitimately take up to 10s.
static uint32_t last_stuck_check = 0;
uint32_t now_ms = millis();
if (now_ms - last_stuck_check >= 5000) { // Check every 5 seconds
@@ -232,22 +247,27 @@ void NimBLEPlatform::loop() {
SlaveState ss = _slave_state;
portEXIT_CRITICAL(&_state_mux);
bool gap_idle = !ble_gap_disc_active() && !ble_gap_adv_active() && !ble_gap_conn_active();
// Don't fire stuck detector while a connection attempt is in progress
if (ms2 == MasterState::CONNECTING || ms2 == MasterState::CONN_STARTING) {
// Expected — connect can take several seconds
} else {
bool gap_idle = !ble_gap_disc_active() && !ble_gap_adv_active() && !ble_gap_conn_active();
if (gap_idle && (gs != GAPState::READY || ms2 != MasterState::IDLE || ss != SlaveState::IDLE)) {
WARNING(std::string("NimBLEPlatform: Stuck state detected - GAP idle but state=") +
gapStateName(gs) + " master=" + masterStateName(ms2) +
" slave=" + slaveStateName(ss) + ". Resetting.");
portENTER_CRITICAL(&_state_mux);
_gap_state = GAPState::READY;
_master_state = MasterState::IDLE;
_slave_state = SlaveState::IDLE;
_slave_paused_for_master = false;
portEXIT_CRITICAL(&_state_mux);
if (gap_idle && (gs != GAPState::READY || ms2 != MasterState::IDLE || ss != SlaveState::IDLE)) {
WARNING(std::string("NimBLEPlatform: Stuck state detected - GAP idle but state=") +
gapStateName(gs) + " master=" + masterStateName(ms2) +
" slave=" + slaveStateName(ss) + ". Resetting.");
portENTER_CRITICAL(&_state_mux);
_gap_state = GAPState::READY;
_master_state = MasterState::IDLE;
_slave_state = SlaveState::IDLE;
_slave_paused_for_master = false;
portEXIT_CRITICAL(&_state_mux);
// Restart advertising in dual/peripheral mode
if (_config.role == Role::PERIPHERAL || _config.role == Role::DUAL) {
startAdvertising();
// Restart advertising in dual/peripheral mode
if (_config.role == Role::PERIPHERAL || _config.role == Role::DUAL) {
startAdvertising();
}
}
}
}
@@ -309,6 +329,9 @@ void NimBLEPlatform::shutdown() {
// Notify higher layers about all disconnections BEFORE deinit,
// so the peer manager can reset peer states properly.
// Do NOT delete clients individually — deinit(true) handles all client cleanup.
// Process any remaining deferred disconnects before shutdown cleanup
processPendingDisconnects();
if (xSemaphoreTake(_conn_mutex, pdMS_TO_TICKS(1000))) {
if (_on_disconnected) {
for (auto& kv : _connections) {
@@ -317,6 +340,7 @@ void NimBLEPlatform::shutdown() {
}
_clients.clear();
_connections.clear();
_cached_rx_chars.clear();
_discovered_devices.clear();
_discovered_order.clear();
xSemaphoreGive(_conn_mutex);
@@ -329,6 +353,7 @@ void NimBLEPlatform::shutdown() {
}
_clients.clear();
_connections.clear();
_cached_rx_chars.clear();
_discovered_devices.clear();
_discovered_order.clear();
}
@@ -598,6 +623,10 @@ void NimBLEPlatform::enterErrorRecovery() {
if (ble_gap_disc_active()) {
ble_gap_disc_cancel();
}
if (ble_gap_conn_active()) {
WARNING("NimBLEPlatform: Cancelling stuck GAP connection in error recovery");
ble_gap_conn_cancel();
}
if (ble_gap_adv_active()) {
ble_gap_adv_stop();
}
@@ -620,6 +649,7 @@ void NimBLEPlatform::enterErrorRecovery() {
uint32_t sync_start = millis();
while (!ble_hs_synced() && (millis() - sync_start) < 5000) {
delay(50);
esp_task_wdt_reset();
}
if (ble_hs_synced()) {
INFO("NimBLEPlatform: Host sync restored after " +
@@ -636,8 +666,28 @@ void NimBLEPlatform::enterErrorRecovery() {
}
}
// Force host-controller resync to clear stale HCI state (fixes rc=530 / Invalid HCI params)
// After a 574 desync, the controller's scan state can become corrupted even after host re-syncs.
INFO("NimBLEPlatform: Scheduling host reset for controller resync");
ble_hs_sched_reset(BLE_HS_ECONTROLLER);
// Wait for host to re-sync after reset
{
uint32_t reset_start = millis();
while (!ble_hs_synced() && (millis() - reset_start) < 5000) {
delay(50);
esp_task_wdt_reset();
}
if (ble_hs_synced()) {
INFO("NimBLEPlatform: Host-controller resync after " +
std::to_string(millis() - reset_start) + "ms");
} else {
WARNING("NimBLEPlatform: Host-controller resync failed after 5s");
}
}
// DELAY RATIONALE: Connect attempt recovery - ESP32-S3 settling time after host sync
delay(50);
delay(100);
// Re-acquire scan object to reset NimBLE internal state
// This is necessary because NimBLE scan object can get into stuck state
@@ -671,6 +721,71 @@ void NimBLEPlatform::enterErrorRecovery() {
in_recovery = false;
}
//=============================================================================
// Deferred Disconnect Processing
//=============================================================================
void NimBLEPlatform::queueDisconnect(uint16_t conn_handle, int reason, bool is_peripheral) {
uint8_t next = (_pending_disc_write + 1) % PENDING_DISC_QUEUE_SIZE;
if (next == _pending_disc_read) {
// Queue full — more simultaneous disconnects than queue size
WARNING("NimBLEPlatform: Pending disconnect queue full, dropping handle=" +
std::to_string(conn_handle));
return;
}
_pending_disc_queue[_pending_disc_write] = {conn_handle, reason, is_peripheral};
_pending_disc_write = next;
}
void NimBLEPlatform::processPendingDisconnects() {
while (_pending_disc_read != _pending_disc_write) {
PendingDisconnect& pd = _pending_disc_queue[_pending_disc_read];
auto conn_it = _connections.find(pd.conn_handle);
if (conn_it != _connections.end()) {
ConnectionHandle conn = conn_it->second;
_connections.erase(conn_it);
INFO("NimBLEPlatform: Processing deferred disconnect for " +
conn.peer_address.toString() + " reason=" + std::to_string(pd.reason));
if (!pd.is_peripheral) {
// Central mode: clean up client object and cached char pointer
auto client_it = _clients.find(pd.conn_handle);
if (client_it != _clients.end()) {
if (client_it->second) {
NimBLEDevice::deleteClient(client_it->second);
}
_clients.erase(client_it);
}
_cached_rx_chars.erase(pd.conn_handle);
}
// Clear operation queue for this connection
clearForConnection(pd.conn_handle);
// Notify higher layers
if (pd.is_peripheral) {
if (_on_central_disconnected) {
_on_central_disconnected(conn);
}
} else {
if (_on_disconnected) {
_on_disconnected(conn, static_cast<uint8_t>(pd.reason));
}
}
// Restart advertising if in peripheral/dual mode
if ((_config.role == Role::PERIPHERAL || _config.role == Role::DUAL) &&
!isAdvertising()) {
startAdvertising();
}
}
_pending_disc_read = (_pending_disc_read + 1) % PENDING_DISC_QUEUE_SIZE;
}
}
//=============================================================================
// Central Mode - Scanning
//=============================================================================
@@ -707,11 +822,18 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) {
if (!ble_hs_synced()) {
unsigned long desync_duration = millis() - _host_desync_since;
_scan_fail_count++;
// Capture NimBLE's internal reset reason (set in patched onReset callback)
int reset_reason = nimble_host_reset_reason;
if (reset_reason != 0) {
nimble_host_reset_reason = 0;
}
WARNING("NimBLEPlatform: Host not synced, desync " +
std::to_string(desync_duration / 1000) + "s (fail " +
std::to_string(_scan_fail_count) + "/" +
std::to_string(SCAN_FAIL_RECOVERY_THRESHOLD) +
", resets=" + std::to_string(_host_reset_attempts) + ")");
", resets=" + std::to_string(_host_reset_attempts) +
(reset_reason != 0 ? ", nimble_reason=" + std::to_string(reset_reason) : "") +
")");
// Tiered recovery:
// 0-10s: Wait for natural self-recovery
@@ -752,19 +874,33 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) {
// Host is synced — clear desync tracking
if (_host_desync_since != 0) {
unsigned long recovery_time = millis() - _host_desync_since;
INFO("NimBLEPlatform: Host re-synced after " + std::to_string(recovery_time) + "ms");
// Capture any remaining reset reason from NimBLE
int reset_reason = nimble_host_reset_reason;
if (reset_reason != 0) {
nimble_host_reset_reason = 0;
}
INFO("NimBLEPlatform: Host re-synced after " + std::to_string(recovery_time) + "ms" +
(reset_reason != 0 ? " (nimble_reason=" + std::to_string(reset_reason) + ")" : ""));
_host_desync_since = 0;
_host_reset_attempts = 0;
_last_desync_recovery = millis(); // Start cooldown before allowing connections
}
// Log GAP hardware state before checking
DEBUG("NimBLEPlatform: Pre-scan GAP state: disc=" + std::to_string(ble_gap_disc_active()) +
" adv=" + std::to_string(ble_gap_adv_active()) +
" conn=" + std::to_string(ble_gap_conn_active()));
// Log GAP hardware state before checking (INFO for UDP visibility during soak test)
INFO("NimBLEPlatform: Pre-scan GAP: disc=" + std::to_string(ble_gap_disc_active()) +
" adv=" + std::to_string(ble_gap_adv_active()) +
" conn=" + std::to_string(ble_gap_conn_active()));
// If a stale GAP connection is blocking scan, cancel it proactively
if (ble_gap_conn_active() && _master_state == MasterState::IDLE) {
WARNING("NimBLEPlatform: Stale GAP conn blocking scan - cancelling");
ble_gap_conn_cancel();
delay(50); // Let GAP process the cancel
}
// Verify we can start scan
if (!canStartScan()) {
DEBUG("NimBLEPlatform: Cannot start scan - state check failed" +
WARNING("NimBLEPlatform: Cannot start scan - state check failed" +
std::string(" master=") + masterStateName(current_master) +
" gap_disc=" + std::to_string(ble_gap_disc_active()) +
" gap_conn=" + std::to_string(ble_gap_conn_active()));
@@ -823,8 +959,11 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) {
return true;
}
// Scan failed
ERROR("NimBLEPlatform: Failed to start scan");
// Scan failed — log GAP state for diagnosis
ERROR("NimBLEPlatform: Failed to start scan - GAP: disc=" + std::to_string(ble_gap_disc_active()) +
" conn=" + std::to_string(ble_gap_conn_active()) +
" adv=" + std::to_string(ble_gap_adv_active()) +
" master=" + masterStateName(_master_state));
// Reset state
portENTER_CRITICAL(&_state_mux);
@@ -834,9 +973,20 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) {
_scan_fail_count++;
if (_scan_fail_count >= SCAN_FAIL_RECOVERY_THRESHOLD) {
WARNING("NimBLEPlatform: Too many scan failures, entering error recovery");
_scan_fail_count = 0; // Reset so we don't immediately re-enter after recovery
enterErrorRecovery();
_lightweight_reset_fails++;
if (_lightweight_reset_fails >= LIGHTWEIGHT_RESET_MAX_FAILS) {
WARNING("NimBLEPlatform: " + std::to_string(_lightweight_reset_fails) +
" error recoveries failed to restore scan, escalating to full stack recovery");
_lightweight_reset_fails = 0;
recoverBLEStack();
} else {
WARNING("NimBLEPlatform: Too many scan failures, entering error recovery (" +
std::to_string(_lightweight_reset_fails) + "/" +
std::to_string(LIGHTWEIGHT_RESET_MAX_FAILS) + ")");
enterErrorRecovery();
}
}
resumeSlave();
@@ -899,6 +1049,14 @@ bool NimBLEPlatform::isScanning() const {
bool NimBLEPlatform::connect(const BLEAddress& address, uint16_t timeout_ms) {
NimBLEAddress nimAddr = toNimBLE(address);
// Skip connections during desync cooldown — connecting while the NimBLE
// stack is recovering from a desync can hang client->connect() (the host
// task can't process the completion event), leading to WDT crashes.
if (_host_desync_since != 0 || (_last_desync_recovery > 0 && millis() - _last_desync_recovery < DESYNC_CONNECT_COOLDOWN_MS)) {
DEBUG("NimBLEPlatform: Skipping connect during desync cooldown");
return false;
}
// Rate limit connections to avoid overwhelming the BLE stack
// Non-blocking: return false if too soon, caller can retry later
static unsigned long last_connect_time = 0;
@@ -1116,38 +1274,10 @@ int NimBLEPlatform::nativeGapEventHandler(struct ble_gap_event* event, void* arg
break;
}
// Clean up established connections (handles MAC rotation, out of range, etc.)
auto conn_it = platform->_connections.find(disc_handle);
if (conn_it != platform->_connections.end()) {
ConnectionHandle conn = conn_it->second;
platform->_connections.erase(conn_it);
INFO("NimBLEPlatform: Native connection lost to " + conn.peer_address.toString() +
" reason=" + std::to_string(disc_reason));
// Clean up client object
auto client_it = platform->_clients.find(disc_handle);
if (client_it != platform->_clients.end()) {
if (client_it->second) {
NimBLEDevice::deleteClient(client_it->second);
}
platform->_clients.erase(client_it);
}
// Clear operation queue for this connection
platform->clearForConnection(disc_handle);
// Notify higher layers
if (platform->_on_disconnected) {
platform->_on_disconnected(conn, static_cast<uint8_t>(disc_reason));
}
// Restart advertising if in peripheral/dual mode and not currently advertising
if ((platform->_config.role == Role::PERIPHERAL || platform->_config.role == Role::DUAL) &&
!platform->isAdvertising()) {
platform->startAdvertising();
}
}
// Defer map cleanup to BLE loop task to avoid data race.
// This callback runs in the NimBLE host task while the BLE loop task
// may be iterating _connections/_clients concurrently.
platform->queueDisconnect(disc_handle, disc_reason, false);
break;
}
@@ -1527,13 +1657,22 @@ bool NimBLEPlatform::write(uint16_t conn_handle, const Bytes& data, bool respons
return false;
}
NimBLERemoteService* service = client->getService(UUID::SERVICE);
if (!service) {
WARNING("NimBLEPlatform::write: service not found for handle " + std::to_string(conn_handle));
return false;
// Use cached RX characteristic pointer to avoid repeated service/char lookups
NimBLERemoteCharacteristic* rxChar = nullptr;
auto cached_it = _cached_rx_chars.find(conn_handle);
if (cached_it != _cached_rx_chars.end()) {
rxChar = cached_it->second;
} else {
NimBLERemoteService* service = client->getService(UUID::SERVICE);
if (!service) {
WARNING("NimBLEPlatform::write: service not found for handle " + std::to_string(conn_handle));
return false;
}
rxChar = service->getCharacteristic(UUID::RX_CHAR);
if (rxChar) {
_cached_rx_chars[conn_handle] = rxChar;
}
}
NimBLERemoteCharacteristic* rxChar = service->getCharacteristic(UUID::RX_CHAR);
if (!rxChar) {
WARNING("NimBLEPlatform::write: RX char not found for handle " + std::to_string(conn_handle));
return false;
@@ -1591,6 +1730,11 @@ bool NimBLEPlatform::read(uint16_t conn_handle, uint16_t char_handle,
}
NimBLEClient* client = client_it->second;
if (!client->isConnected()) {
if (callback) callback(OperationResult::DISCONNECTED, Bytes());
return false;
}
NimBLERemoteService* service = client->getService(UUID::SERVICE);
if (!service) {
if (callback) callback(OperationResult::NOT_FOUND, Bytes());
@@ -1624,6 +1768,8 @@ bool NimBLEPlatform::enableNotifications(uint16_t conn_handle, bool enable) {
}
NimBLEClient* client = client_it->second;
if (!client->isConnected()) return false;
NimBLERemoteService* service = client->getService(UUID::SERVICE);
if (!service) return false;
@@ -1631,11 +1777,17 @@ bool NimBLEPlatform::enableNotifications(uint16_t conn_handle, bool enable) {
if (!txChar) return false;
if (enable) {
// Subscribe to notifications
auto notifyCb = [this, conn_handle](NimBLERemoteCharacteristic* pChar,
// Subscribe to notifications.
// Capture peer_address to guard against conn_handle reuse: if peer A disconnects
// (handle=1) and peer B connects (handle=1), we must not deliver B's data as A's.
BLEAddress expected_peer = getConnection(conn_handle).peer_address;
auto notifyCb = [this, conn_handle, expected_peer](NimBLERemoteCharacteristic* pChar,
uint8_t* pData, size_t length, bool isNotify) {
if (_on_data_received) {
ConnectionHandle conn = getConnection(conn_handle);
if (!conn.isValid() || conn.peer_address != expected_peer) {
return; // Stale handle — peer changed
}
Bytes data(pData, length);
_on_data_received(conn, data);
}
@@ -1810,9 +1962,16 @@ void NimBLEPlatform::onConnect(NimBLEServer* pServer, NimBLEConnInfo& connInfo)
conn.state = ConnectionState::CONNECTED;
conn.mtu = MTU::MINIMUM - MTU::ATT_OVERHEAD;
// Read connection RSSI
int8_t rssi_val = 0;
if (ble_gap_conn_rssi(conn_handle, &rssi_val) == 0) {
conn.rssi = rssi_val;
}
_connections[conn_handle] = conn;
DEBUG("NimBLEPlatform: Central connected: " + conn.peer_address.toString());
DEBUG("NimBLEPlatform: Central connected: " + conn.peer_address.toString() +
" rssi=" + std::to_string(conn.rssi));
if (_on_central_connected) {
_on_central_connected(conn);
@@ -1829,21 +1988,13 @@ void NimBLEPlatform::onDisconnect(NimBLEServer* pServer, NimBLEConnInfo& connInf
uint16_t conn_handle = connInfo.getConnHandle();
auto it = _connections.find(conn_handle);
if (it != _connections.end()) {
ConnectionHandle conn = it->second;
_connections.erase(it);
DEBUG("NimBLEPlatform: Central disconnect event for handle=" + std::to_string(conn_handle) +
" reason=" + std::to_string(reason));
DEBUG("NimBLEPlatform: Central disconnected: " + conn.peer_address.toString() +
" reason: " + std::to_string(reason));
if (_on_central_disconnected) {
_on_central_disconnected(conn);
}
}
// Clear operation queue for this connection
BLEOperationQueue::clearForConnection(conn_handle);
// Defer map cleanup to BLE loop task to avoid data race.
// This callback runs in the NimBLE host task while the BLE loop task
// may be iterating _connections concurrently.
queueDisconnect(conn_handle, reason, true);
}
void NimBLEPlatform::onMTUChange(uint16_t MTU, NimBLEConnInfo& connInfo) {
@@ -1959,25 +2110,15 @@ void NimBLEPlatform::onDisconnect(NimBLEClient* pClient, int reason) {
return;
}
auto it = _connections.find(conn_handle);
if (it != _connections.end()) {
ConnectionHandle conn = it->second;
_connections.erase(it);
DEBUG("NimBLEPlatform: Client disconnect event for handle=" + std::to_string(conn_handle) +
" reason=" + std::to_string(reason));
DEBUG("NimBLEPlatform: Disconnected from peripheral: " + conn.peer_address.toString() +
" reason: " + std::to_string(reason));
if (_on_disconnected) {
_on_disconnected(conn, static_cast<uint8_t>(reason));
}
}
// Remove client
_clients.erase(conn_handle);
NimBLEDevice::deleteClient(pClient);
// Clear operation queue
BLEOperationQueue::clearForConnection(conn_handle);
// Defer map cleanup to BLE loop task to avoid data race.
// This callback runs in the NimBLE host task while the BLE loop task
// may be iterating _connections/_clients concurrently.
// Note: NimBLEDevice::deleteClient() for this client will be called
// in processPendingDisconnects() from the loop task context.
queueDisconnect(conn_handle, reason, false);
}
//=============================================================================
+25 -1
View File
@@ -239,6 +239,25 @@ private:
void resumeSlave();
void enterErrorRecovery();
// Deferred disconnect queue (SPSC: NimBLE host task produces, BLE loop task consumes)
// Disconnect events arrive from the host task and must not modify _connections/_clients
// directly, as the BLE loop task may be iterating them concurrently.
static constexpr size_t PENDING_DISC_QUEUE_SIZE = 8;
struct PendingDisconnect {
uint16_t conn_handle;
int reason;
bool is_peripheral; // true = server disconnect, false = native GAP handler
};
PendingDisconnect _pending_disc_queue[PENDING_DISC_QUEUE_SIZE];
volatile uint8_t _pending_disc_write = 0; // Next write slot (host task only)
volatile uint8_t _pending_disc_read = 0; // Next read slot (loop task only)
void queueDisconnect(uint16_t conn_handle, int reason, bool is_peripheral);
void processPendingDisconnects();
// Deferred error recovery (set from any context, processed in loop task)
volatile bool _error_recovery_requested = false;
// Track if slave was paused for a master operation
bool _slave_paused_for_master = false;
@@ -260,11 +279,13 @@ private:
uint8_t _conn_establish_fail_count = 0; // rc=574 connection establishment failures
unsigned long _last_full_recovery_time = 0;
unsigned long _host_desync_since = 0; // millis() when host first lost sync (0 = synced)
unsigned long _last_desync_recovery = 0; // millis() when last desync recovered (for connect cooldown)
uint8_t _host_reset_attempts = 0; // ble_hs_sched_reset attempts since last sync
static constexpr uint8_t SCAN_FAIL_RECOVERY_THRESHOLD = 10;
static constexpr uint8_t SCAN_FAIL_RECOVERY_THRESHOLD = 5;
static constexpr uint8_t LIGHTWEIGHT_RESET_MAX_FAILS = 3;
static constexpr uint8_t CONN_ESTABLISH_FAIL_THRESHOLD = 5;
static constexpr unsigned long FULL_RECOVERY_COOLDOWN_MS = 60000; // 60 seconds
static constexpr unsigned long DESYNC_CONNECT_COOLDOWN_MS = 30000; // Don't connect for 30s after desync recovery
static constexpr unsigned long HOST_DESYNC_REBOOT_MS = 60000; // Reboot after 60s desync (no connections)
bool recoverBLEStack();
bool attemptHostReset();
@@ -281,6 +302,9 @@ private:
// Client connections (as central)
std::map<uint16_t, NimBLEClient*> _clients;
// Cached RX characteristic pointers (avoids repeated service/char lookups in write())
std::map<uint16_t, NimBLERemoteCharacteristic*> _cached_rx_chars;
// Connection tracking
std::map<uint16_t, ConnectionHandle> _connections;
@@ -195,6 +195,7 @@ void ConversationListScreen::refresh() {
_conversations.clear();
_conversation_containers.clear();
_peer_hash_pool.clear();
_has_unresolved_names = false;
// Load conversations from store
std::vector<Bytes> peer_hashes = _message_store->get_conversations();
@@ -226,17 +227,22 @@ void ConversationListScreen::refresh() {
item.peer_hash = peer_hash;
// Try to get display name from app_data, fall back to hash
bool resolved = false;
Bytes app_data = Identity::recall_app_data(peer_hash);
if (app_data && app_data.size() > 0) {
String display_name = parse_display_name(app_data);
if (display_name.length() > 0) {
item.peer_name = display_name;
resolved = true;
} else {
item.peer_name = truncate_hash(peer_hash);
}
} else {
item.peer_name = truncate_hash(peer_hash);
}
if (!resolved) {
_has_unresolved_names = true;
}
// Get message content for preview
String content((const char*)last_msg.content().data(), last_msg.content().size());
@@ -544,6 +550,23 @@ void ConversationListScreen::update_status() {
lv_obj_set_style_text_color(_label_battery_icon, battery_color, 0);
lv_obj_set_style_text_color(_label_battery_pct, battery_color, 0);
}
// Check if any unresolved display names can now be resolved
// (announces may have arrived since the list was last refreshed)
if (_has_unresolved_names && _message_store) {
for (const auto& item : _conversations) {
Bytes app_data = Identity::recall_app_data(item.peer_hash);
if (app_data && app_data.size() > 0) {
String name = parse_display_name(app_data);
if (name.length() > 0 && item.peer_name != name) {
// A name has become available — refresh the whole list
DEBUG("Display name resolved, refreshing conversation list");
refresh();
return;
}
}
}
}
}
void ConversationListScreen::on_conversation_clicked(lv_event_t* event) {
@@ -197,6 +197,7 @@ private:
std::vector<lv_obj_t*> _conversation_containers; // For focus group management
std::vector<RNS::Bytes> _peer_hash_pool; // Object pool to avoid per-item allocations
RNS::Bytes _pending_delete_hash; // Hash of conversation pending deletion
bool _has_unresolved_names = false; // True if any conversation shows hash instead of name
ConversationSelectedCallback _conversation_selected_callback;
ComposeCallback _compose_callback;
+78
View File
@@ -7,6 +7,16 @@ Patch 1 — ble_hs.c: Remove assert(0) in BLE_HS_SYNC_STATE_BRINGUP timer handle
Patch 2 NimBLEClient.cpp: Add null checks in PHY update event handler.
If a client is deleted while events are queued, the callback arg becomes a dangling
pointer. Guard against null pClient and null m_pClientCallbacks.
Patch 3 ble_gap.c: Handle BLE_ERR_CONN_ESTABLISHMENT (574) in conn complete handler.
NimBLE only handles 574 when BLE_PERIODIC_ADV_WITH_RESPONSES is enabled. Without this
patch, 574 falls through to assert(0) and the master GAP state is never cleaned up,
leaving scan/advertise permanently broken. The ESP32-S3 controller returns 574 when
connection establishment fails (peer disappeared, RF interference, etc.).
Patch 4 NimBLEDevice.cpp: Expose host reset reason via global volatile variable.
NimBLE's onReset callback logs reason to ESP_LOG (serial only). This patch adds a
global volatile int that application code can poll to capture the reason in UDP logs.
"""
Import("env")
import os
@@ -76,3 +86,71 @@ apply_patch(
} // BLE_GAP_EVENT_PHY_UPDATE_COMPLETE""",
"NimBLEClient.cpp: added null guard in PHY update handler"
)
# Patch 3: ble_gap.c — handle BLE_ERR_CONN_ESTABLISHMENT (574) without PAwR
# The ESP32-S3 controller returns error 574 when connection establishment fails.
# NimBLE only handles this in the BLE_PERIODIC_ADV_WITH_RESPONSES path. Without it,
# 574 hits the default case (assert(0) + no cleanup), leaving master GAP state stuck
# in BLE_GAP_OP_M_CONN — permanently blocking scan and advertising.
apply_patch(
os.path.join(NIMBLE_BASE, "nimble", "nimble", "host", "src", "ble_gap.c"),
"""#if MYNEWT_VAL(BLE_PERIODIC_ADV_WITH_RESPONSES)
case BLE_ERR_CONN_ESTABLISHMENT:
if (!v1_evt) {
ble_gap_rx_conn_comp_failed(evt);
}
break;
#endif // MYNEWT_VAL(BLE_PERIODIC_ADV_WITH_RESPONSES)
default:
/* this should never happen, unless controller is broken */
BLE_HS_LOG(INFO, "controller reported invalid error code in conn"
"complete event: %u", evt->status);
assert(0);
break;""",
""" case BLE_ERR_CONN_ESTABLISHMENT:
/* Connection establishment failed (e.g. peer disappeared).
* Clean up master GAP state so scan/advertise can resume.
* Without this, the master state stays stuck in BLE_GAP_OP_M_CONN. */
if (ble_gap_master_in_progress()) {
ble_gap_master_failed(BLE_HS_ECONTROLLER);
}
break;
default:
/* this should never happen, unless controller is broken */
BLE_HS_LOG(INFO, "controller reported invalid error code in conn"
"complete event: %u", evt->status);
if (ble_gap_master_in_progress()) {
ble_gap_master_failed(BLE_HS_ECONTROLLER);
}
break;""",
"ble_gap.c: handle BLE_ERR_CONN_ESTABLISHMENT (574) to prevent stuck GAP state"
)
# Patch 4: NimBLEDevice.cpp — expose host reset reason for application-level logging
# NimBLE's onReset callback logs via NIMBLE_LOGE which only goes to serial UART.
# This patch adds a global volatile int that our BLE loop can poll and log via UDP.
apply_patch(
os.path.join(NIMBLE_BASE, "NimBLEDevice.cpp"),
"""void NimBLEDevice::onReset(int reason) {
if (!m_synced) {
return;
}
m_synced = false;
NIMBLE_LOGE(LOG_TAG, "Host reset; reason=%d, %s", reason, NimBLEUtils::returnCodeToString(reason));
} // onReset""",
"""volatile int nimble_host_reset_reason = 0;
void NimBLEDevice::onReset(int reason) {
if (!m_synced) {
return;
}
m_synced = false;
nimble_host_reset_reason = reason;
NIMBLE_LOGE(LOG_TAG, "Host reset; reason=%d, %s", reason, NimBLEUtils::returnCodeToString(reason));
} // onReset""",
"NimBLEDevice.cpp: expose host reset reason for application-level logging"
)
+25
View File
@@ -1196,6 +1196,9 @@ void setup() {
INFO("╚══════════════════════════════════════╝");
INFO("");
// Capture ESP reset reason early (before WiFi) — logged after WiFi init for UDP visibility
esp_reset_reason_t _boot_reset_reason = esp_reset_reason();
// Check for LXST crash breadcrumb from previous boot
{
Preferences _dbg;
@@ -1251,6 +1254,28 @@ void setup() {
setup_wifi();
BOOT_PROFILE_END("wifi");
// Log ESP reset reason after WiFi so it reaches UDP logs
{
const char* reason_str = "UNKNOWN";
switch (_boot_reset_reason) {
case ESP_RST_POWERON: reason_str = "POWERON"; break;
case ESP_RST_SW: reason_str = "SOFTWARE"; break;
case ESP_RST_PANIC: reason_str = "PANIC"; break;
case ESP_RST_INT_WDT: reason_str = "INT_WDT"; break;
case ESP_RST_TASK_WDT: reason_str = "TASK_WDT"; break;
case ESP_RST_WDT: reason_str = "WDT"; break;
case ESP_RST_DEEPSLEEP: reason_str = "DEEPSLEEP"; break;
case ESP_RST_BROWNOUT: reason_str = "BROWNOUT"; break;
case ESP_RST_SDIO: reason_str = "SDIO"; break;
default: break;
}
if (_boot_reset_reason != ESP_RST_POWERON) {
WARNING("Reset reason: " + std::string(reason_str) + " (" + std::to_string((int)_boot_reset_reason) + ")");
} else {
INFO("Reset reason: " + std::string(reason_str));
}
}
// Initialize LVGL and hardware drivers
BOOT_PROFILE_START("lvgl");
setup_lvgl_and_ui();