mirror of
https://github.com/torlando-tech/pyxis.git
synced 2026-05-25 18:04:05 +00:00
Merge pull request #7 from torlando-tech/ble-stability-audit
BLE stability: fix desync crash loops and scan recovery
This commit is contained in:
@@ -220,6 +220,14 @@ void BLEInterface::loop() {
|
||||
if (resolved.size() == Limits::IDENTITY_SIZE) {
|
||||
stored_id = resolved;
|
||||
} else {
|
||||
// Expire entries that have waited longer than HANDSHAKE_TIMEOUT.
|
||||
// If a peer sends data but never completes handshake (e.g., disconnect
|
||||
// during handshake), these entries would stay indefinitely.
|
||||
if (now - _pending_data_pool[i].queued_at > Timing::HANDSHAKE_TIMEOUT) {
|
||||
DEBUG("BLEInterface: Expiring stale pending data (no identity after " +
|
||||
std::to_string((int)(now - _pending_data_pool[i].queued_at)) + "s)");
|
||||
continue; // Drop this entry
|
||||
}
|
||||
// Still no identity — keep for next loop iteration
|
||||
if (requeue_count != i) {
|
||||
_pending_data_pool[requeue_count] = _pending_data_pool[i];
|
||||
@@ -759,8 +767,8 @@ void BLEInterface::onCentralConnected(const ConnectionHandle& conn) {
|
||||
|
||||
Bytes mac = conn.peer_address.toBytes();
|
||||
|
||||
// Update peer manager
|
||||
_peer_manager.addDiscoveredPeer(mac, 0);
|
||||
// Update peer manager with connection RSSI
|
||||
_peer_manager.addDiscoveredPeer(mac, conn.rssi);
|
||||
_peer_manager.setPeerState(mac, PeerState::HANDSHAKING);
|
||||
_peer_manager.setPeerHandle(mac, conn.handle);
|
||||
|
||||
@@ -923,10 +931,10 @@ void BLEInterface::processDiscoveredPeers() {
|
||||
last_peer_log = now;
|
||||
}
|
||||
|
||||
if (candidate && candidate->mac_address.size() >= Limits::MAC_SIZE) {
|
||||
if (candidate && candidate->mac_address.size() >= Limits::MAC_SIZE &&
|
||||
_peer_manager.canAcceptConnection()) {
|
||||
INFO("BLE: Connection candidate: " + BLEAddress(candidate->mac_address.data()).toString() +
|
||||
" type=" + std::to_string(candidate->address_type) +
|
||||
" canAccept=" + std::string(_peer_manager.canAcceptConnection() ? "yes" : "no"));
|
||||
" type=" + std::to_string(candidate->address_type));
|
||||
}
|
||||
|
||||
if (candidate && _peer_manager.canAcceptConnection()) {
|
||||
@@ -1086,6 +1094,7 @@ void BLEInterface::handleIncomingData(const ConnectionHandle& conn, const Bytes&
|
||||
PendingData& pending = _pending_data_pool[_pending_data_count];
|
||||
pending.identity = mac; // Use MAC as temporary key
|
||||
pending.data = data;
|
||||
pending.queued_at = Utilities::OS::time();
|
||||
_pending_data_count++;
|
||||
}
|
||||
return;
|
||||
@@ -1099,6 +1108,7 @@ void BLEInterface::handleIncomingData(const ConnectionHandle& conn, const Bytes&
|
||||
PendingData& pending = _pending_data_pool[_pending_data_count];
|
||||
pending.identity = identity;
|
||||
pending.data = data;
|
||||
pending.queued_at = Utilities::OS::time();
|
||||
_pending_data_count++;
|
||||
}
|
||||
|
||||
|
||||
@@ -270,6 +270,7 @@ private:
|
||||
struct PendingData {
|
||||
RNS::Bytes identity;
|
||||
RNS::Bytes data;
|
||||
double queued_at = 0; // Timestamp for expiry of unresolvable entries
|
||||
};
|
||||
PendingData _pending_data_pool[MAX_PENDING_DATA];
|
||||
size_t _pending_data_count = 0;
|
||||
|
||||
@@ -325,6 +325,7 @@ struct ConnectionHandle {
|
||||
Role local_role = Role::NONE; // Our role in this connection
|
||||
ConnectionState state = ConnectionState::DISCONNECTED;
|
||||
uint16_t mtu = MTU::MINIMUM; // Negotiated MTU
|
||||
int8_t rssi = 0; // Connection RSSI at connect time
|
||||
|
||||
// Characteristic handles (discovered after connection)
|
||||
uint16_t rx_char_handle = 0; // Handle for RX characteristic
|
||||
@@ -340,6 +341,7 @@ struct ConnectionHandle {
|
||||
local_role = Role::NONE;
|
||||
state = ConnectionState::DISCONNECTED;
|
||||
mtu = MTU::MINIMUM;
|
||||
rssi = 0;
|
||||
rx_char_handle = 0;
|
||||
tx_char_handle = 0;
|
||||
tx_cccd_handle = 0;
|
||||
|
||||
@@ -36,6 +36,10 @@ extern "C" {
|
||||
void ble_hs_sched_reset(int reason);
|
||||
}
|
||||
|
||||
// Defined in patched NimBLEDevice.cpp — set in onReset callback with the reason code.
|
||||
// Poll from BLE loop to log via UDP (NimBLE's own logging only reaches serial UART).
|
||||
extern volatile int nimble_host_reset_reason;
|
||||
|
||||
namespace RNS { namespace BLE {
|
||||
|
||||
//=============================================================================
|
||||
@@ -204,6 +208,16 @@ void NimBLEPlatform::loop() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Process deferred disconnects from NimBLE host task callbacks.
|
||||
// Must run before other loop logic to ensure stale connections are cleaned up.
|
||||
processPendingDisconnects();
|
||||
|
||||
// Process deferred error recovery (requested from callback context)
|
||||
if (_error_recovery_requested) {
|
||||
_error_recovery_requested = false;
|
||||
enterErrorRecovery();
|
||||
}
|
||||
|
||||
// Check if continuous scan should stop
|
||||
portENTER_CRITICAL(&_state_mux);
|
||||
MasterState ms = _master_state;
|
||||
@@ -221,6 +235,7 @@ void NimBLEPlatform::loop() {
|
||||
// Stuck-state safety net: if GAP hardware is idle but our state machine
|
||||
// thinks we're busy, reset state machine. This recovers from missed callbacks
|
||||
// (e.g., service discovery disconnect not properly cleaning up state).
|
||||
// Skip during CONNECTING — connectNative() can legitimately take up to 10s.
|
||||
static uint32_t last_stuck_check = 0;
|
||||
uint32_t now_ms = millis();
|
||||
if (now_ms - last_stuck_check >= 5000) { // Check every 5 seconds
|
||||
@@ -232,22 +247,27 @@ void NimBLEPlatform::loop() {
|
||||
SlaveState ss = _slave_state;
|
||||
portEXIT_CRITICAL(&_state_mux);
|
||||
|
||||
bool gap_idle = !ble_gap_disc_active() && !ble_gap_adv_active() && !ble_gap_conn_active();
|
||||
// Don't fire stuck detector while a connection attempt is in progress
|
||||
if (ms2 == MasterState::CONNECTING || ms2 == MasterState::CONN_STARTING) {
|
||||
// Expected — connect can take several seconds
|
||||
} else {
|
||||
bool gap_idle = !ble_gap_disc_active() && !ble_gap_adv_active() && !ble_gap_conn_active();
|
||||
|
||||
if (gap_idle && (gs != GAPState::READY || ms2 != MasterState::IDLE || ss != SlaveState::IDLE)) {
|
||||
WARNING(std::string("NimBLEPlatform: Stuck state detected - GAP idle but state=") +
|
||||
gapStateName(gs) + " master=" + masterStateName(ms2) +
|
||||
" slave=" + slaveStateName(ss) + ". Resetting.");
|
||||
portENTER_CRITICAL(&_state_mux);
|
||||
_gap_state = GAPState::READY;
|
||||
_master_state = MasterState::IDLE;
|
||||
_slave_state = SlaveState::IDLE;
|
||||
_slave_paused_for_master = false;
|
||||
portEXIT_CRITICAL(&_state_mux);
|
||||
if (gap_idle && (gs != GAPState::READY || ms2 != MasterState::IDLE || ss != SlaveState::IDLE)) {
|
||||
WARNING(std::string("NimBLEPlatform: Stuck state detected - GAP idle but state=") +
|
||||
gapStateName(gs) + " master=" + masterStateName(ms2) +
|
||||
" slave=" + slaveStateName(ss) + ". Resetting.");
|
||||
portENTER_CRITICAL(&_state_mux);
|
||||
_gap_state = GAPState::READY;
|
||||
_master_state = MasterState::IDLE;
|
||||
_slave_state = SlaveState::IDLE;
|
||||
_slave_paused_for_master = false;
|
||||
portEXIT_CRITICAL(&_state_mux);
|
||||
|
||||
// Restart advertising in dual/peripheral mode
|
||||
if (_config.role == Role::PERIPHERAL || _config.role == Role::DUAL) {
|
||||
startAdvertising();
|
||||
// Restart advertising in dual/peripheral mode
|
||||
if (_config.role == Role::PERIPHERAL || _config.role == Role::DUAL) {
|
||||
startAdvertising();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -309,6 +329,9 @@ void NimBLEPlatform::shutdown() {
|
||||
// Notify higher layers about all disconnections BEFORE deinit,
|
||||
// so the peer manager can reset peer states properly.
|
||||
// Do NOT delete clients individually — deinit(true) handles all client cleanup.
|
||||
// Process any remaining deferred disconnects before shutdown cleanup
|
||||
processPendingDisconnects();
|
||||
|
||||
if (xSemaphoreTake(_conn_mutex, pdMS_TO_TICKS(1000))) {
|
||||
if (_on_disconnected) {
|
||||
for (auto& kv : _connections) {
|
||||
@@ -317,6 +340,7 @@ void NimBLEPlatform::shutdown() {
|
||||
}
|
||||
_clients.clear();
|
||||
_connections.clear();
|
||||
_cached_rx_chars.clear();
|
||||
_discovered_devices.clear();
|
||||
_discovered_order.clear();
|
||||
xSemaphoreGive(_conn_mutex);
|
||||
@@ -329,6 +353,7 @@ void NimBLEPlatform::shutdown() {
|
||||
}
|
||||
_clients.clear();
|
||||
_connections.clear();
|
||||
_cached_rx_chars.clear();
|
||||
_discovered_devices.clear();
|
||||
_discovered_order.clear();
|
||||
}
|
||||
@@ -598,6 +623,10 @@ void NimBLEPlatform::enterErrorRecovery() {
|
||||
if (ble_gap_disc_active()) {
|
||||
ble_gap_disc_cancel();
|
||||
}
|
||||
if (ble_gap_conn_active()) {
|
||||
WARNING("NimBLEPlatform: Cancelling stuck GAP connection in error recovery");
|
||||
ble_gap_conn_cancel();
|
||||
}
|
||||
if (ble_gap_adv_active()) {
|
||||
ble_gap_adv_stop();
|
||||
}
|
||||
@@ -620,6 +649,7 @@ void NimBLEPlatform::enterErrorRecovery() {
|
||||
uint32_t sync_start = millis();
|
||||
while (!ble_hs_synced() && (millis() - sync_start) < 5000) {
|
||||
delay(50);
|
||||
esp_task_wdt_reset();
|
||||
}
|
||||
if (ble_hs_synced()) {
|
||||
INFO("NimBLEPlatform: Host sync restored after " +
|
||||
@@ -636,8 +666,28 @@ void NimBLEPlatform::enterErrorRecovery() {
|
||||
}
|
||||
}
|
||||
|
||||
// Force host-controller resync to clear stale HCI state (fixes rc=530 / Invalid HCI params)
|
||||
// After a 574 desync, the controller's scan state can become corrupted even after host re-syncs.
|
||||
INFO("NimBLEPlatform: Scheduling host reset for controller resync");
|
||||
ble_hs_sched_reset(BLE_HS_ECONTROLLER);
|
||||
|
||||
// Wait for host to re-sync after reset
|
||||
{
|
||||
uint32_t reset_start = millis();
|
||||
while (!ble_hs_synced() && (millis() - reset_start) < 5000) {
|
||||
delay(50);
|
||||
esp_task_wdt_reset();
|
||||
}
|
||||
if (ble_hs_synced()) {
|
||||
INFO("NimBLEPlatform: Host-controller resync after " +
|
||||
std::to_string(millis() - reset_start) + "ms");
|
||||
} else {
|
||||
WARNING("NimBLEPlatform: Host-controller resync failed after 5s");
|
||||
}
|
||||
}
|
||||
|
||||
// DELAY RATIONALE: Connect attempt recovery - ESP32-S3 settling time after host sync
|
||||
delay(50);
|
||||
delay(100);
|
||||
|
||||
// Re-acquire scan object to reset NimBLE internal state
|
||||
// This is necessary because NimBLE scan object can get into stuck state
|
||||
@@ -671,6 +721,71 @@ void NimBLEPlatform::enterErrorRecovery() {
|
||||
in_recovery = false;
|
||||
}
|
||||
|
||||
//=============================================================================
|
||||
// Deferred Disconnect Processing
|
||||
//=============================================================================
|
||||
|
||||
void NimBLEPlatform::queueDisconnect(uint16_t conn_handle, int reason, bool is_peripheral) {
|
||||
uint8_t next = (_pending_disc_write + 1) % PENDING_DISC_QUEUE_SIZE;
|
||||
if (next == _pending_disc_read) {
|
||||
// Queue full — more simultaneous disconnects than queue size
|
||||
WARNING("NimBLEPlatform: Pending disconnect queue full, dropping handle=" +
|
||||
std::to_string(conn_handle));
|
||||
return;
|
||||
}
|
||||
_pending_disc_queue[_pending_disc_write] = {conn_handle, reason, is_peripheral};
|
||||
_pending_disc_write = next;
|
||||
}
|
||||
|
||||
void NimBLEPlatform::processPendingDisconnects() {
|
||||
while (_pending_disc_read != _pending_disc_write) {
|
||||
PendingDisconnect& pd = _pending_disc_queue[_pending_disc_read];
|
||||
|
||||
auto conn_it = _connections.find(pd.conn_handle);
|
||||
if (conn_it != _connections.end()) {
|
||||
ConnectionHandle conn = conn_it->second;
|
||||
_connections.erase(conn_it);
|
||||
|
||||
INFO("NimBLEPlatform: Processing deferred disconnect for " +
|
||||
conn.peer_address.toString() + " reason=" + std::to_string(pd.reason));
|
||||
|
||||
if (!pd.is_peripheral) {
|
||||
// Central mode: clean up client object and cached char pointer
|
||||
auto client_it = _clients.find(pd.conn_handle);
|
||||
if (client_it != _clients.end()) {
|
||||
if (client_it->second) {
|
||||
NimBLEDevice::deleteClient(client_it->second);
|
||||
}
|
||||
_clients.erase(client_it);
|
||||
}
|
||||
_cached_rx_chars.erase(pd.conn_handle);
|
||||
}
|
||||
|
||||
// Clear operation queue for this connection
|
||||
clearForConnection(pd.conn_handle);
|
||||
|
||||
// Notify higher layers
|
||||
if (pd.is_peripheral) {
|
||||
if (_on_central_disconnected) {
|
||||
_on_central_disconnected(conn);
|
||||
}
|
||||
} else {
|
||||
if (_on_disconnected) {
|
||||
_on_disconnected(conn, static_cast<uint8_t>(pd.reason));
|
||||
}
|
||||
}
|
||||
|
||||
// Restart advertising if in peripheral/dual mode
|
||||
if ((_config.role == Role::PERIPHERAL || _config.role == Role::DUAL) &&
|
||||
!isAdvertising()) {
|
||||
startAdvertising();
|
||||
}
|
||||
}
|
||||
|
||||
_pending_disc_read = (_pending_disc_read + 1) % PENDING_DISC_QUEUE_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
//=============================================================================
|
||||
// Central Mode - Scanning
|
||||
//=============================================================================
|
||||
@@ -707,11 +822,18 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) {
|
||||
if (!ble_hs_synced()) {
|
||||
unsigned long desync_duration = millis() - _host_desync_since;
|
||||
_scan_fail_count++;
|
||||
// Capture NimBLE's internal reset reason (set in patched onReset callback)
|
||||
int reset_reason = nimble_host_reset_reason;
|
||||
if (reset_reason != 0) {
|
||||
nimble_host_reset_reason = 0;
|
||||
}
|
||||
WARNING("NimBLEPlatform: Host not synced, desync " +
|
||||
std::to_string(desync_duration / 1000) + "s (fail " +
|
||||
std::to_string(_scan_fail_count) + "/" +
|
||||
std::to_string(SCAN_FAIL_RECOVERY_THRESHOLD) +
|
||||
", resets=" + std::to_string(_host_reset_attempts) + ")");
|
||||
", resets=" + std::to_string(_host_reset_attempts) +
|
||||
(reset_reason != 0 ? ", nimble_reason=" + std::to_string(reset_reason) : "") +
|
||||
")");
|
||||
|
||||
// Tiered recovery:
|
||||
// 0-10s: Wait for natural self-recovery
|
||||
@@ -752,19 +874,33 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) {
|
||||
// Host is synced — clear desync tracking
|
||||
if (_host_desync_since != 0) {
|
||||
unsigned long recovery_time = millis() - _host_desync_since;
|
||||
INFO("NimBLEPlatform: Host re-synced after " + std::to_string(recovery_time) + "ms");
|
||||
// Capture any remaining reset reason from NimBLE
|
||||
int reset_reason = nimble_host_reset_reason;
|
||||
if (reset_reason != 0) {
|
||||
nimble_host_reset_reason = 0;
|
||||
}
|
||||
INFO("NimBLEPlatform: Host re-synced after " + std::to_string(recovery_time) + "ms" +
|
||||
(reset_reason != 0 ? " (nimble_reason=" + std::to_string(reset_reason) + ")" : ""));
|
||||
_host_desync_since = 0;
|
||||
_host_reset_attempts = 0;
|
||||
_last_desync_recovery = millis(); // Start cooldown before allowing connections
|
||||
}
|
||||
|
||||
// Log GAP hardware state before checking
|
||||
DEBUG("NimBLEPlatform: Pre-scan GAP state: disc=" + std::to_string(ble_gap_disc_active()) +
|
||||
" adv=" + std::to_string(ble_gap_adv_active()) +
|
||||
" conn=" + std::to_string(ble_gap_conn_active()));
|
||||
// Log GAP hardware state before checking (INFO for UDP visibility during soak test)
|
||||
INFO("NimBLEPlatform: Pre-scan GAP: disc=" + std::to_string(ble_gap_disc_active()) +
|
||||
" adv=" + std::to_string(ble_gap_adv_active()) +
|
||||
" conn=" + std::to_string(ble_gap_conn_active()));
|
||||
|
||||
// If a stale GAP connection is blocking scan, cancel it proactively
|
||||
if (ble_gap_conn_active() && _master_state == MasterState::IDLE) {
|
||||
WARNING("NimBLEPlatform: Stale GAP conn blocking scan - cancelling");
|
||||
ble_gap_conn_cancel();
|
||||
delay(50); // Let GAP process the cancel
|
||||
}
|
||||
|
||||
// Verify we can start scan
|
||||
if (!canStartScan()) {
|
||||
DEBUG("NimBLEPlatform: Cannot start scan - state check failed" +
|
||||
WARNING("NimBLEPlatform: Cannot start scan - state check failed" +
|
||||
std::string(" master=") + masterStateName(current_master) +
|
||||
" gap_disc=" + std::to_string(ble_gap_disc_active()) +
|
||||
" gap_conn=" + std::to_string(ble_gap_conn_active()));
|
||||
@@ -823,8 +959,11 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Scan failed
|
||||
ERROR("NimBLEPlatform: Failed to start scan");
|
||||
// Scan failed — log GAP state for diagnosis
|
||||
ERROR("NimBLEPlatform: Failed to start scan - GAP: disc=" + std::to_string(ble_gap_disc_active()) +
|
||||
" conn=" + std::to_string(ble_gap_conn_active()) +
|
||||
" adv=" + std::to_string(ble_gap_adv_active()) +
|
||||
" master=" + masterStateName(_master_state));
|
||||
|
||||
// Reset state
|
||||
portENTER_CRITICAL(&_state_mux);
|
||||
@@ -834,9 +973,20 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) {
|
||||
|
||||
_scan_fail_count++;
|
||||
if (_scan_fail_count >= SCAN_FAIL_RECOVERY_THRESHOLD) {
|
||||
WARNING("NimBLEPlatform: Too many scan failures, entering error recovery");
|
||||
_scan_fail_count = 0; // Reset so we don't immediately re-enter after recovery
|
||||
enterErrorRecovery();
|
||||
_lightweight_reset_fails++;
|
||||
|
||||
if (_lightweight_reset_fails >= LIGHTWEIGHT_RESET_MAX_FAILS) {
|
||||
WARNING("NimBLEPlatform: " + std::to_string(_lightweight_reset_fails) +
|
||||
" error recoveries failed to restore scan, escalating to full stack recovery");
|
||||
_lightweight_reset_fails = 0;
|
||||
recoverBLEStack();
|
||||
} else {
|
||||
WARNING("NimBLEPlatform: Too many scan failures, entering error recovery (" +
|
||||
std::to_string(_lightweight_reset_fails) + "/" +
|
||||
std::to_string(LIGHTWEIGHT_RESET_MAX_FAILS) + ")");
|
||||
enterErrorRecovery();
|
||||
}
|
||||
}
|
||||
|
||||
resumeSlave();
|
||||
@@ -899,6 +1049,14 @@ bool NimBLEPlatform::isScanning() const {
|
||||
bool NimBLEPlatform::connect(const BLEAddress& address, uint16_t timeout_ms) {
|
||||
NimBLEAddress nimAddr = toNimBLE(address);
|
||||
|
||||
// Skip connections during desync cooldown — connecting while the NimBLE
|
||||
// stack is recovering from a desync can hang client->connect() (the host
|
||||
// task can't process the completion event), leading to WDT crashes.
|
||||
if (_host_desync_since != 0 || (_last_desync_recovery > 0 && millis() - _last_desync_recovery < DESYNC_CONNECT_COOLDOWN_MS)) {
|
||||
DEBUG("NimBLEPlatform: Skipping connect during desync cooldown");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Rate limit connections to avoid overwhelming the BLE stack
|
||||
// Non-blocking: return false if too soon, caller can retry later
|
||||
static unsigned long last_connect_time = 0;
|
||||
@@ -1116,38 +1274,10 @@ int NimBLEPlatform::nativeGapEventHandler(struct ble_gap_event* event, void* arg
|
||||
break;
|
||||
}
|
||||
|
||||
// Clean up established connections (handles MAC rotation, out of range, etc.)
|
||||
auto conn_it = platform->_connections.find(disc_handle);
|
||||
if (conn_it != platform->_connections.end()) {
|
||||
ConnectionHandle conn = conn_it->second;
|
||||
platform->_connections.erase(conn_it);
|
||||
|
||||
INFO("NimBLEPlatform: Native connection lost to " + conn.peer_address.toString() +
|
||||
" reason=" + std::to_string(disc_reason));
|
||||
|
||||
// Clean up client object
|
||||
auto client_it = platform->_clients.find(disc_handle);
|
||||
if (client_it != platform->_clients.end()) {
|
||||
if (client_it->second) {
|
||||
NimBLEDevice::deleteClient(client_it->second);
|
||||
}
|
||||
platform->_clients.erase(client_it);
|
||||
}
|
||||
|
||||
// Clear operation queue for this connection
|
||||
platform->clearForConnection(disc_handle);
|
||||
|
||||
// Notify higher layers
|
||||
if (platform->_on_disconnected) {
|
||||
platform->_on_disconnected(conn, static_cast<uint8_t>(disc_reason));
|
||||
}
|
||||
|
||||
// Restart advertising if in peripheral/dual mode and not currently advertising
|
||||
if ((platform->_config.role == Role::PERIPHERAL || platform->_config.role == Role::DUAL) &&
|
||||
!platform->isAdvertising()) {
|
||||
platform->startAdvertising();
|
||||
}
|
||||
}
|
||||
// Defer map cleanup to BLE loop task to avoid data race.
|
||||
// This callback runs in the NimBLE host task while the BLE loop task
|
||||
// may be iterating _connections/_clients concurrently.
|
||||
platform->queueDisconnect(disc_handle, disc_reason, false);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -1527,13 +1657,22 @@ bool NimBLEPlatform::write(uint16_t conn_handle, const Bytes& data, bool respons
|
||||
return false;
|
||||
}
|
||||
|
||||
NimBLERemoteService* service = client->getService(UUID::SERVICE);
|
||||
if (!service) {
|
||||
WARNING("NimBLEPlatform::write: service not found for handle " + std::to_string(conn_handle));
|
||||
return false;
|
||||
// Use cached RX characteristic pointer to avoid repeated service/char lookups
|
||||
NimBLERemoteCharacteristic* rxChar = nullptr;
|
||||
auto cached_it = _cached_rx_chars.find(conn_handle);
|
||||
if (cached_it != _cached_rx_chars.end()) {
|
||||
rxChar = cached_it->second;
|
||||
} else {
|
||||
NimBLERemoteService* service = client->getService(UUID::SERVICE);
|
||||
if (!service) {
|
||||
WARNING("NimBLEPlatform::write: service not found for handle " + std::to_string(conn_handle));
|
||||
return false;
|
||||
}
|
||||
rxChar = service->getCharacteristic(UUID::RX_CHAR);
|
||||
if (rxChar) {
|
||||
_cached_rx_chars[conn_handle] = rxChar;
|
||||
}
|
||||
}
|
||||
|
||||
NimBLERemoteCharacteristic* rxChar = service->getCharacteristic(UUID::RX_CHAR);
|
||||
if (!rxChar) {
|
||||
WARNING("NimBLEPlatform::write: RX char not found for handle " + std::to_string(conn_handle));
|
||||
return false;
|
||||
@@ -1591,6 +1730,11 @@ bool NimBLEPlatform::read(uint16_t conn_handle, uint16_t char_handle,
|
||||
}
|
||||
|
||||
NimBLEClient* client = client_it->second;
|
||||
if (!client->isConnected()) {
|
||||
if (callback) callback(OperationResult::DISCONNECTED, Bytes());
|
||||
return false;
|
||||
}
|
||||
|
||||
NimBLERemoteService* service = client->getService(UUID::SERVICE);
|
||||
if (!service) {
|
||||
if (callback) callback(OperationResult::NOT_FOUND, Bytes());
|
||||
@@ -1624,6 +1768,8 @@ bool NimBLEPlatform::enableNotifications(uint16_t conn_handle, bool enable) {
|
||||
}
|
||||
|
||||
NimBLEClient* client = client_it->second;
|
||||
if (!client->isConnected()) return false;
|
||||
|
||||
NimBLERemoteService* service = client->getService(UUID::SERVICE);
|
||||
if (!service) return false;
|
||||
|
||||
@@ -1631,11 +1777,17 @@ bool NimBLEPlatform::enableNotifications(uint16_t conn_handle, bool enable) {
|
||||
if (!txChar) return false;
|
||||
|
||||
if (enable) {
|
||||
// Subscribe to notifications
|
||||
auto notifyCb = [this, conn_handle](NimBLERemoteCharacteristic* pChar,
|
||||
// Subscribe to notifications.
|
||||
// Capture peer_address to guard against conn_handle reuse: if peer A disconnects
|
||||
// (handle=1) and peer B connects (handle=1), we must not deliver B's data as A's.
|
||||
BLEAddress expected_peer = getConnection(conn_handle).peer_address;
|
||||
auto notifyCb = [this, conn_handle, expected_peer](NimBLERemoteCharacteristic* pChar,
|
||||
uint8_t* pData, size_t length, bool isNotify) {
|
||||
if (_on_data_received) {
|
||||
ConnectionHandle conn = getConnection(conn_handle);
|
||||
if (!conn.isValid() || conn.peer_address != expected_peer) {
|
||||
return; // Stale handle — peer changed
|
||||
}
|
||||
Bytes data(pData, length);
|
||||
_on_data_received(conn, data);
|
||||
}
|
||||
@@ -1810,9 +1962,16 @@ void NimBLEPlatform::onConnect(NimBLEServer* pServer, NimBLEConnInfo& connInfo)
|
||||
conn.state = ConnectionState::CONNECTED;
|
||||
conn.mtu = MTU::MINIMUM - MTU::ATT_OVERHEAD;
|
||||
|
||||
// Read connection RSSI
|
||||
int8_t rssi_val = 0;
|
||||
if (ble_gap_conn_rssi(conn_handle, &rssi_val) == 0) {
|
||||
conn.rssi = rssi_val;
|
||||
}
|
||||
|
||||
_connections[conn_handle] = conn;
|
||||
|
||||
DEBUG("NimBLEPlatform: Central connected: " + conn.peer_address.toString());
|
||||
DEBUG("NimBLEPlatform: Central connected: " + conn.peer_address.toString() +
|
||||
" rssi=" + std::to_string(conn.rssi));
|
||||
|
||||
if (_on_central_connected) {
|
||||
_on_central_connected(conn);
|
||||
@@ -1829,21 +1988,13 @@ void NimBLEPlatform::onDisconnect(NimBLEServer* pServer, NimBLEConnInfo& connInf
|
||||
|
||||
uint16_t conn_handle = connInfo.getConnHandle();
|
||||
|
||||
auto it = _connections.find(conn_handle);
|
||||
if (it != _connections.end()) {
|
||||
ConnectionHandle conn = it->second;
|
||||
_connections.erase(it);
|
||||
DEBUG("NimBLEPlatform: Central disconnect event for handle=" + std::to_string(conn_handle) +
|
||||
" reason=" + std::to_string(reason));
|
||||
|
||||
DEBUG("NimBLEPlatform: Central disconnected: " + conn.peer_address.toString() +
|
||||
" reason: " + std::to_string(reason));
|
||||
|
||||
if (_on_central_disconnected) {
|
||||
_on_central_disconnected(conn);
|
||||
}
|
||||
}
|
||||
|
||||
// Clear operation queue for this connection
|
||||
BLEOperationQueue::clearForConnection(conn_handle);
|
||||
// Defer map cleanup to BLE loop task to avoid data race.
|
||||
// This callback runs in the NimBLE host task while the BLE loop task
|
||||
// may be iterating _connections concurrently.
|
||||
queueDisconnect(conn_handle, reason, true);
|
||||
}
|
||||
|
||||
void NimBLEPlatform::onMTUChange(uint16_t MTU, NimBLEConnInfo& connInfo) {
|
||||
@@ -1959,25 +2110,15 @@ void NimBLEPlatform::onDisconnect(NimBLEClient* pClient, int reason) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto it = _connections.find(conn_handle);
|
||||
if (it != _connections.end()) {
|
||||
ConnectionHandle conn = it->second;
|
||||
_connections.erase(it);
|
||||
DEBUG("NimBLEPlatform: Client disconnect event for handle=" + std::to_string(conn_handle) +
|
||||
" reason=" + std::to_string(reason));
|
||||
|
||||
DEBUG("NimBLEPlatform: Disconnected from peripheral: " + conn.peer_address.toString() +
|
||||
" reason: " + std::to_string(reason));
|
||||
|
||||
if (_on_disconnected) {
|
||||
_on_disconnected(conn, static_cast<uint8_t>(reason));
|
||||
}
|
||||
}
|
||||
|
||||
// Remove client
|
||||
_clients.erase(conn_handle);
|
||||
NimBLEDevice::deleteClient(pClient);
|
||||
|
||||
// Clear operation queue
|
||||
BLEOperationQueue::clearForConnection(conn_handle);
|
||||
// Defer map cleanup to BLE loop task to avoid data race.
|
||||
// This callback runs in the NimBLE host task while the BLE loop task
|
||||
// may be iterating _connections/_clients concurrently.
|
||||
// Note: NimBLEDevice::deleteClient() for this client will be called
|
||||
// in processPendingDisconnects() from the loop task context.
|
||||
queueDisconnect(conn_handle, reason, false);
|
||||
}
|
||||
|
||||
//=============================================================================
|
||||
|
||||
@@ -239,6 +239,25 @@ private:
|
||||
void resumeSlave();
|
||||
void enterErrorRecovery();
|
||||
|
||||
// Deferred disconnect queue (SPSC: NimBLE host task produces, BLE loop task consumes)
|
||||
// Disconnect events arrive from the host task and must not modify _connections/_clients
|
||||
// directly, as the BLE loop task may be iterating them concurrently.
|
||||
static constexpr size_t PENDING_DISC_QUEUE_SIZE = 8;
|
||||
struct PendingDisconnect {
|
||||
uint16_t conn_handle;
|
||||
int reason;
|
||||
bool is_peripheral; // true = server disconnect, false = native GAP handler
|
||||
};
|
||||
PendingDisconnect _pending_disc_queue[PENDING_DISC_QUEUE_SIZE];
|
||||
volatile uint8_t _pending_disc_write = 0; // Next write slot (host task only)
|
||||
volatile uint8_t _pending_disc_read = 0; // Next read slot (loop task only)
|
||||
|
||||
void queueDisconnect(uint16_t conn_handle, int reason, bool is_peripheral);
|
||||
void processPendingDisconnects();
|
||||
|
||||
// Deferred error recovery (set from any context, processed in loop task)
|
||||
volatile bool _error_recovery_requested = false;
|
||||
|
||||
// Track if slave was paused for a master operation
|
||||
bool _slave_paused_for_master = false;
|
||||
|
||||
@@ -260,11 +279,13 @@ private:
|
||||
uint8_t _conn_establish_fail_count = 0; // rc=574 connection establishment failures
|
||||
unsigned long _last_full_recovery_time = 0;
|
||||
unsigned long _host_desync_since = 0; // millis() when host first lost sync (0 = synced)
|
||||
unsigned long _last_desync_recovery = 0; // millis() when last desync recovered (for connect cooldown)
|
||||
uint8_t _host_reset_attempts = 0; // ble_hs_sched_reset attempts since last sync
|
||||
static constexpr uint8_t SCAN_FAIL_RECOVERY_THRESHOLD = 10;
|
||||
static constexpr uint8_t SCAN_FAIL_RECOVERY_THRESHOLD = 5;
|
||||
static constexpr uint8_t LIGHTWEIGHT_RESET_MAX_FAILS = 3;
|
||||
static constexpr uint8_t CONN_ESTABLISH_FAIL_THRESHOLD = 5;
|
||||
static constexpr unsigned long FULL_RECOVERY_COOLDOWN_MS = 60000; // 60 seconds
|
||||
static constexpr unsigned long DESYNC_CONNECT_COOLDOWN_MS = 30000; // Don't connect for 30s after desync recovery
|
||||
static constexpr unsigned long HOST_DESYNC_REBOOT_MS = 60000; // Reboot after 60s desync (no connections)
|
||||
bool recoverBLEStack();
|
||||
bool attemptHostReset();
|
||||
@@ -281,6 +302,9 @@ private:
|
||||
// Client connections (as central)
|
||||
std::map<uint16_t, NimBLEClient*> _clients;
|
||||
|
||||
// Cached RX characteristic pointers (avoids repeated service/char lookups in write())
|
||||
std::map<uint16_t, NimBLERemoteCharacteristic*> _cached_rx_chars;
|
||||
|
||||
// Connection tracking
|
||||
std::map<uint16_t, ConnectionHandle> _connections;
|
||||
|
||||
|
||||
@@ -195,6 +195,7 @@ void ConversationListScreen::refresh() {
|
||||
_conversations.clear();
|
||||
_conversation_containers.clear();
|
||||
_peer_hash_pool.clear();
|
||||
_has_unresolved_names = false;
|
||||
|
||||
// Load conversations from store
|
||||
std::vector<Bytes> peer_hashes = _message_store->get_conversations();
|
||||
@@ -226,17 +227,22 @@ void ConversationListScreen::refresh() {
|
||||
item.peer_hash = peer_hash;
|
||||
|
||||
// Try to get display name from app_data, fall back to hash
|
||||
bool resolved = false;
|
||||
Bytes app_data = Identity::recall_app_data(peer_hash);
|
||||
if (app_data && app_data.size() > 0) {
|
||||
String display_name = parse_display_name(app_data);
|
||||
if (display_name.length() > 0) {
|
||||
item.peer_name = display_name;
|
||||
resolved = true;
|
||||
} else {
|
||||
item.peer_name = truncate_hash(peer_hash);
|
||||
}
|
||||
} else {
|
||||
item.peer_name = truncate_hash(peer_hash);
|
||||
}
|
||||
if (!resolved) {
|
||||
_has_unresolved_names = true;
|
||||
}
|
||||
|
||||
// Get message content for preview
|
||||
String content((const char*)last_msg.content().data(), last_msg.content().size());
|
||||
@@ -544,6 +550,23 @@ void ConversationListScreen::update_status() {
|
||||
lv_obj_set_style_text_color(_label_battery_icon, battery_color, 0);
|
||||
lv_obj_set_style_text_color(_label_battery_pct, battery_color, 0);
|
||||
}
|
||||
|
||||
// Check if any unresolved display names can now be resolved
|
||||
// (announces may have arrived since the list was last refreshed)
|
||||
if (_has_unresolved_names && _message_store) {
|
||||
for (const auto& item : _conversations) {
|
||||
Bytes app_data = Identity::recall_app_data(item.peer_hash);
|
||||
if (app_data && app_data.size() > 0) {
|
||||
String name = parse_display_name(app_data);
|
||||
if (name.length() > 0 && item.peer_name != name) {
|
||||
// A name has become available — refresh the whole list
|
||||
DEBUG("Display name resolved, refreshing conversation list");
|
||||
refresh();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ConversationListScreen::on_conversation_clicked(lv_event_t* event) {
|
||||
|
||||
@@ -197,6 +197,7 @@ private:
|
||||
std::vector<lv_obj_t*> _conversation_containers; // For focus group management
|
||||
std::vector<RNS::Bytes> _peer_hash_pool; // Object pool to avoid per-item allocations
|
||||
RNS::Bytes _pending_delete_hash; // Hash of conversation pending deletion
|
||||
bool _has_unresolved_names = false; // True if any conversation shows hash instead of name
|
||||
|
||||
ConversationSelectedCallback _conversation_selected_callback;
|
||||
ComposeCallback _compose_callback;
|
||||
|
||||
@@ -7,6 +7,16 @@ Patch 1 — ble_hs.c: Remove assert(0) in BLE_HS_SYNC_STATE_BRINGUP timer handle
|
||||
Patch 2 — NimBLEClient.cpp: Add null checks in PHY update event handler.
|
||||
If a client is deleted while events are queued, the callback arg becomes a dangling
|
||||
pointer. Guard against null pClient and null m_pClientCallbacks.
|
||||
|
||||
Patch 3 — ble_gap.c: Handle BLE_ERR_CONN_ESTABLISHMENT (574) in conn complete handler.
|
||||
NimBLE only handles 574 when BLE_PERIODIC_ADV_WITH_RESPONSES is enabled. Without this
|
||||
patch, 574 falls through to assert(0) and the master GAP state is never cleaned up,
|
||||
leaving scan/advertise permanently broken. The ESP32-S3 controller returns 574 when
|
||||
connection establishment fails (peer disappeared, RF interference, etc.).
|
||||
|
||||
Patch 4 — NimBLEDevice.cpp: Expose host reset reason via global volatile variable.
|
||||
NimBLE's onReset callback logs reason to ESP_LOG (serial only). This patch adds a
|
||||
global volatile int that application code can poll to capture the reason in UDP logs.
|
||||
"""
|
||||
Import("env")
|
||||
import os
|
||||
@@ -76,3 +86,71 @@ apply_patch(
|
||||
} // BLE_GAP_EVENT_PHY_UPDATE_COMPLETE""",
|
||||
"NimBLEClient.cpp: added null guard in PHY update handler"
|
||||
)
|
||||
|
||||
# Patch 3: ble_gap.c — handle BLE_ERR_CONN_ESTABLISHMENT (574) without PAwR
|
||||
# The ESP32-S3 controller returns error 574 when connection establishment fails.
|
||||
# NimBLE only handles this in the BLE_PERIODIC_ADV_WITH_RESPONSES path. Without it,
|
||||
# 574 hits the default case (assert(0) + no cleanup), leaving master GAP state stuck
|
||||
# in BLE_GAP_OP_M_CONN — permanently blocking scan and advertising.
|
||||
apply_patch(
|
||||
os.path.join(NIMBLE_BASE, "nimble", "nimble", "host", "src", "ble_gap.c"),
|
||||
"""#if MYNEWT_VAL(BLE_PERIODIC_ADV_WITH_RESPONSES)
|
||||
case BLE_ERR_CONN_ESTABLISHMENT:
|
||||
if (!v1_evt) {
|
||||
ble_gap_rx_conn_comp_failed(evt);
|
||||
}
|
||||
break;
|
||||
#endif // MYNEWT_VAL(BLE_PERIODIC_ADV_WITH_RESPONSES)
|
||||
default:
|
||||
/* this should never happen, unless controller is broken */
|
||||
BLE_HS_LOG(INFO, "controller reported invalid error code in conn"
|
||||
"complete event: %u", evt->status);
|
||||
assert(0);
|
||||
break;""",
|
||||
""" case BLE_ERR_CONN_ESTABLISHMENT:
|
||||
/* Connection establishment failed (e.g. peer disappeared).
|
||||
* Clean up master GAP state so scan/advertise can resume.
|
||||
* Without this, the master state stays stuck in BLE_GAP_OP_M_CONN. */
|
||||
if (ble_gap_master_in_progress()) {
|
||||
ble_gap_master_failed(BLE_HS_ECONTROLLER);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
/* this should never happen, unless controller is broken */
|
||||
BLE_HS_LOG(INFO, "controller reported invalid error code in conn"
|
||||
"complete event: %u", evt->status);
|
||||
if (ble_gap_master_in_progress()) {
|
||||
ble_gap_master_failed(BLE_HS_ECONTROLLER);
|
||||
}
|
||||
break;""",
|
||||
"ble_gap.c: handle BLE_ERR_CONN_ESTABLISHMENT (574) to prevent stuck GAP state"
|
||||
)
|
||||
|
||||
# Patch 4: NimBLEDevice.cpp — expose host reset reason for application-level logging
|
||||
# NimBLE's onReset callback logs via NIMBLE_LOGE which only goes to serial UART.
|
||||
# This patch adds a global volatile int that our BLE loop can poll and log via UDP.
|
||||
apply_patch(
|
||||
os.path.join(NIMBLE_BASE, "NimBLEDevice.cpp"),
|
||||
"""void NimBLEDevice::onReset(int reason) {
|
||||
if (!m_synced) {
|
||||
return;
|
||||
}
|
||||
|
||||
m_synced = false;
|
||||
|
||||
NIMBLE_LOGE(LOG_TAG, "Host reset; reason=%d, %s", reason, NimBLEUtils::returnCodeToString(reason));
|
||||
} // onReset""",
|
||||
"""volatile int nimble_host_reset_reason = 0;
|
||||
|
||||
void NimBLEDevice::onReset(int reason) {
|
||||
if (!m_synced) {
|
||||
return;
|
||||
}
|
||||
|
||||
m_synced = false;
|
||||
nimble_host_reset_reason = reason;
|
||||
|
||||
NIMBLE_LOGE(LOG_TAG, "Host reset; reason=%d, %s", reason, NimBLEUtils::returnCodeToString(reason));
|
||||
} // onReset""",
|
||||
"NimBLEDevice.cpp: expose host reset reason for application-level logging"
|
||||
)
|
||||
|
||||
@@ -1196,6 +1196,9 @@ void setup() {
|
||||
INFO("╚══════════════════════════════════════╝");
|
||||
INFO("");
|
||||
|
||||
// Capture ESP reset reason early (before WiFi) — logged after WiFi init for UDP visibility
|
||||
esp_reset_reason_t _boot_reset_reason = esp_reset_reason();
|
||||
|
||||
// Check for LXST crash breadcrumb from previous boot
|
||||
{
|
||||
Preferences _dbg;
|
||||
@@ -1251,6 +1254,28 @@ void setup() {
|
||||
setup_wifi();
|
||||
BOOT_PROFILE_END("wifi");
|
||||
|
||||
// Log ESP reset reason after WiFi so it reaches UDP logs
|
||||
{
|
||||
const char* reason_str = "UNKNOWN";
|
||||
switch (_boot_reset_reason) {
|
||||
case ESP_RST_POWERON: reason_str = "POWERON"; break;
|
||||
case ESP_RST_SW: reason_str = "SOFTWARE"; break;
|
||||
case ESP_RST_PANIC: reason_str = "PANIC"; break;
|
||||
case ESP_RST_INT_WDT: reason_str = "INT_WDT"; break;
|
||||
case ESP_RST_TASK_WDT: reason_str = "TASK_WDT"; break;
|
||||
case ESP_RST_WDT: reason_str = "WDT"; break;
|
||||
case ESP_RST_DEEPSLEEP: reason_str = "DEEPSLEEP"; break;
|
||||
case ESP_RST_BROWNOUT: reason_str = "BROWNOUT"; break;
|
||||
case ESP_RST_SDIO: reason_str = "SDIO"; break;
|
||||
default: break;
|
||||
}
|
||||
if (_boot_reset_reason != ESP_RST_POWERON) {
|
||||
WARNING("Reset reason: " + std::string(reason_str) + " (" + std::to_string((int)_boot_reset_reason) + ")");
|
||||
} else {
|
||||
INFO("Reset reason: " + std::string(reason_str));
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize LVGL and hardware drivers
|
||||
BOOT_PROFILE_START("lvgl");
|
||||
setup_lvgl_and_ui();
|
||||
|
||||
Reference in New Issue
Block a user