diff --git a/lib/ble_interface/BLEInterface.cpp b/lib/ble_interface/BLEInterface.cpp index 5956ec60..34d92d2e 100644 --- a/lib/ble_interface/BLEInterface.cpp +++ b/lib/ble_interface/BLEInterface.cpp @@ -220,6 +220,14 @@ void BLEInterface::loop() { if (resolved.size() == Limits::IDENTITY_SIZE) { stored_id = resolved; } else { + // Expire entries that have waited longer than HANDSHAKE_TIMEOUT. + // If a peer sends data but never completes handshake (e.g., disconnect + // during handshake), these entries would stay indefinitely. + if (now - _pending_data_pool[i].queued_at > Timing::HANDSHAKE_TIMEOUT) { + DEBUG("BLEInterface: Expiring stale pending data (no identity after " + + std::to_string((int)(now - _pending_data_pool[i].queued_at)) + "s)"); + continue; // Drop this entry + } // Still no identity — keep for next loop iteration if (requeue_count != i) { _pending_data_pool[requeue_count] = _pending_data_pool[i]; @@ -759,8 +767,8 @@ void BLEInterface::onCentralConnected(const ConnectionHandle& conn) { Bytes mac = conn.peer_address.toBytes(); - // Update peer manager - _peer_manager.addDiscoveredPeer(mac, 0); + // Update peer manager with connection RSSI + _peer_manager.addDiscoveredPeer(mac, conn.rssi); _peer_manager.setPeerState(mac, PeerState::HANDSHAKING); _peer_manager.setPeerHandle(mac, conn.handle); @@ -923,10 +931,10 @@ void BLEInterface::processDiscoveredPeers() { last_peer_log = now; } - if (candidate && candidate->mac_address.size() >= Limits::MAC_SIZE) { + if (candidate && candidate->mac_address.size() >= Limits::MAC_SIZE && + _peer_manager.canAcceptConnection()) { INFO("BLE: Connection candidate: " + BLEAddress(candidate->mac_address.data()).toString() + - " type=" + std::to_string(candidate->address_type) + - " canAccept=" + std::string(_peer_manager.canAcceptConnection() ? "yes" : "no")); + " type=" + std::to_string(candidate->address_type)); } if (candidate && _peer_manager.canAcceptConnection()) { @@ -1086,6 +1094,7 @@ void BLEInterface::handleIncomingData(const ConnectionHandle& conn, const Bytes& PendingData& pending = _pending_data_pool[_pending_data_count]; pending.identity = mac; // Use MAC as temporary key pending.data = data; + pending.queued_at = Utilities::OS::time(); _pending_data_count++; } return; @@ -1099,6 +1108,7 @@ void BLEInterface::handleIncomingData(const ConnectionHandle& conn, const Bytes& PendingData& pending = _pending_data_pool[_pending_data_count]; pending.identity = identity; pending.data = data; + pending.queued_at = Utilities::OS::time(); _pending_data_count++; } diff --git a/lib/ble_interface/BLEInterface.h b/lib/ble_interface/BLEInterface.h index 0659da60..335c85ee 100644 --- a/lib/ble_interface/BLEInterface.h +++ b/lib/ble_interface/BLEInterface.h @@ -270,6 +270,7 @@ private: struct PendingData { RNS::Bytes identity; RNS::Bytes data; + double queued_at = 0; // Timestamp for expiry of unresolvable entries }; PendingData _pending_data_pool[MAX_PENDING_DATA]; size_t _pending_data_count = 0; diff --git a/lib/ble_interface/BLETypes.h b/lib/ble_interface/BLETypes.h index 6cd915ca..8f23ad62 100644 --- a/lib/ble_interface/BLETypes.h +++ b/lib/ble_interface/BLETypes.h @@ -325,6 +325,7 @@ struct ConnectionHandle { Role local_role = Role::NONE; // Our role in this connection ConnectionState state = ConnectionState::DISCONNECTED; uint16_t mtu = MTU::MINIMUM; // Negotiated MTU + int8_t rssi = 0; // Connection RSSI at connect time // Characteristic handles (discovered after connection) uint16_t rx_char_handle = 0; // Handle for RX characteristic @@ -340,6 +341,7 @@ struct ConnectionHandle { local_role = Role::NONE; state = ConnectionState::DISCONNECTED; mtu = MTU::MINIMUM; + rssi = 0; rx_char_handle = 0; tx_char_handle = 0; tx_cccd_handle = 0; diff --git a/lib/ble_interface/platforms/NimBLEPlatform.cpp b/lib/ble_interface/platforms/NimBLEPlatform.cpp index a3d2ff64..5b6cc79d 100644 --- a/lib/ble_interface/platforms/NimBLEPlatform.cpp +++ b/lib/ble_interface/platforms/NimBLEPlatform.cpp @@ -36,6 +36,10 @@ extern "C" { void ble_hs_sched_reset(int reason); } +// Defined in patched NimBLEDevice.cpp — set in onReset callback with the reason code. +// Poll from BLE loop to log via UDP (NimBLE's own logging only reaches serial UART). +extern volatile int nimble_host_reset_reason; + namespace RNS { namespace BLE { //============================================================================= @@ -204,6 +208,16 @@ void NimBLEPlatform::loop() { return; } + // Process deferred disconnects from NimBLE host task callbacks. + // Must run before other loop logic to ensure stale connections are cleaned up. + processPendingDisconnects(); + + // Process deferred error recovery (requested from callback context) + if (_error_recovery_requested) { + _error_recovery_requested = false; + enterErrorRecovery(); + } + // Check if continuous scan should stop portENTER_CRITICAL(&_state_mux); MasterState ms = _master_state; @@ -221,6 +235,7 @@ void NimBLEPlatform::loop() { // Stuck-state safety net: if GAP hardware is idle but our state machine // thinks we're busy, reset state machine. This recovers from missed callbacks // (e.g., service discovery disconnect not properly cleaning up state). + // Skip during CONNECTING — connectNative() can legitimately take up to 10s. static uint32_t last_stuck_check = 0; uint32_t now_ms = millis(); if (now_ms - last_stuck_check >= 5000) { // Check every 5 seconds @@ -232,22 +247,27 @@ void NimBLEPlatform::loop() { SlaveState ss = _slave_state; portEXIT_CRITICAL(&_state_mux); - bool gap_idle = !ble_gap_disc_active() && !ble_gap_adv_active() && !ble_gap_conn_active(); + // Don't fire stuck detector while a connection attempt is in progress + if (ms2 == MasterState::CONNECTING || ms2 == MasterState::CONN_STARTING) { + // Expected — connect can take several seconds + } else { + bool gap_idle = !ble_gap_disc_active() && !ble_gap_adv_active() && !ble_gap_conn_active(); - if (gap_idle && (gs != GAPState::READY || ms2 != MasterState::IDLE || ss != SlaveState::IDLE)) { - WARNING(std::string("NimBLEPlatform: Stuck state detected - GAP idle but state=") + - gapStateName(gs) + " master=" + masterStateName(ms2) + - " slave=" + slaveStateName(ss) + ". Resetting."); - portENTER_CRITICAL(&_state_mux); - _gap_state = GAPState::READY; - _master_state = MasterState::IDLE; - _slave_state = SlaveState::IDLE; - _slave_paused_for_master = false; - portEXIT_CRITICAL(&_state_mux); + if (gap_idle && (gs != GAPState::READY || ms2 != MasterState::IDLE || ss != SlaveState::IDLE)) { + WARNING(std::string("NimBLEPlatform: Stuck state detected - GAP idle but state=") + + gapStateName(gs) + " master=" + masterStateName(ms2) + + " slave=" + slaveStateName(ss) + ". Resetting."); + portENTER_CRITICAL(&_state_mux); + _gap_state = GAPState::READY; + _master_state = MasterState::IDLE; + _slave_state = SlaveState::IDLE; + _slave_paused_for_master = false; + portEXIT_CRITICAL(&_state_mux); - // Restart advertising in dual/peripheral mode - if (_config.role == Role::PERIPHERAL || _config.role == Role::DUAL) { - startAdvertising(); + // Restart advertising in dual/peripheral mode + if (_config.role == Role::PERIPHERAL || _config.role == Role::DUAL) { + startAdvertising(); + } } } } @@ -309,6 +329,9 @@ void NimBLEPlatform::shutdown() { // Notify higher layers about all disconnections BEFORE deinit, // so the peer manager can reset peer states properly. // Do NOT delete clients individually — deinit(true) handles all client cleanup. + // Process any remaining deferred disconnects before shutdown cleanup + processPendingDisconnects(); + if (xSemaphoreTake(_conn_mutex, pdMS_TO_TICKS(1000))) { if (_on_disconnected) { for (auto& kv : _connections) { @@ -317,6 +340,7 @@ void NimBLEPlatform::shutdown() { } _clients.clear(); _connections.clear(); + _cached_rx_chars.clear(); _discovered_devices.clear(); _discovered_order.clear(); xSemaphoreGive(_conn_mutex); @@ -329,6 +353,7 @@ void NimBLEPlatform::shutdown() { } _clients.clear(); _connections.clear(); + _cached_rx_chars.clear(); _discovered_devices.clear(); _discovered_order.clear(); } @@ -598,6 +623,10 @@ void NimBLEPlatform::enterErrorRecovery() { if (ble_gap_disc_active()) { ble_gap_disc_cancel(); } + if (ble_gap_conn_active()) { + WARNING("NimBLEPlatform: Cancelling stuck GAP connection in error recovery"); + ble_gap_conn_cancel(); + } if (ble_gap_adv_active()) { ble_gap_adv_stop(); } @@ -620,6 +649,7 @@ void NimBLEPlatform::enterErrorRecovery() { uint32_t sync_start = millis(); while (!ble_hs_synced() && (millis() - sync_start) < 5000) { delay(50); + esp_task_wdt_reset(); } if (ble_hs_synced()) { INFO("NimBLEPlatform: Host sync restored after " + @@ -636,8 +666,28 @@ void NimBLEPlatform::enterErrorRecovery() { } } + // Force host-controller resync to clear stale HCI state (fixes rc=530 / Invalid HCI params) + // After a 574 desync, the controller's scan state can become corrupted even after host re-syncs. + INFO("NimBLEPlatform: Scheduling host reset for controller resync"); + ble_hs_sched_reset(BLE_HS_ECONTROLLER); + + // Wait for host to re-sync after reset + { + uint32_t reset_start = millis(); + while (!ble_hs_synced() && (millis() - reset_start) < 5000) { + delay(50); + esp_task_wdt_reset(); + } + if (ble_hs_synced()) { + INFO("NimBLEPlatform: Host-controller resync after " + + std::to_string(millis() - reset_start) + "ms"); + } else { + WARNING("NimBLEPlatform: Host-controller resync failed after 5s"); + } + } + // DELAY RATIONALE: Connect attempt recovery - ESP32-S3 settling time after host sync - delay(50); + delay(100); // Re-acquire scan object to reset NimBLE internal state // This is necessary because NimBLE scan object can get into stuck state @@ -671,6 +721,71 @@ void NimBLEPlatform::enterErrorRecovery() { in_recovery = false; } +//============================================================================= +// Deferred Disconnect Processing +//============================================================================= + +void NimBLEPlatform::queueDisconnect(uint16_t conn_handle, int reason, bool is_peripheral) { + uint8_t next = (_pending_disc_write + 1) % PENDING_DISC_QUEUE_SIZE; + if (next == _pending_disc_read) { + // Queue full — more simultaneous disconnects than queue size + WARNING("NimBLEPlatform: Pending disconnect queue full, dropping handle=" + + std::to_string(conn_handle)); + return; + } + _pending_disc_queue[_pending_disc_write] = {conn_handle, reason, is_peripheral}; + _pending_disc_write = next; +} + +void NimBLEPlatform::processPendingDisconnects() { + while (_pending_disc_read != _pending_disc_write) { + PendingDisconnect& pd = _pending_disc_queue[_pending_disc_read]; + + auto conn_it = _connections.find(pd.conn_handle); + if (conn_it != _connections.end()) { + ConnectionHandle conn = conn_it->second; + _connections.erase(conn_it); + + INFO("NimBLEPlatform: Processing deferred disconnect for " + + conn.peer_address.toString() + " reason=" + std::to_string(pd.reason)); + + if (!pd.is_peripheral) { + // Central mode: clean up client object and cached char pointer + auto client_it = _clients.find(pd.conn_handle); + if (client_it != _clients.end()) { + if (client_it->second) { + NimBLEDevice::deleteClient(client_it->second); + } + _clients.erase(client_it); + } + _cached_rx_chars.erase(pd.conn_handle); + } + + // Clear operation queue for this connection + clearForConnection(pd.conn_handle); + + // Notify higher layers + if (pd.is_peripheral) { + if (_on_central_disconnected) { + _on_central_disconnected(conn); + } + } else { + if (_on_disconnected) { + _on_disconnected(conn, static_cast(pd.reason)); + } + } + + // Restart advertising if in peripheral/dual mode + if ((_config.role == Role::PERIPHERAL || _config.role == Role::DUAL) && + !isAdvertising()) { + startAdvertising(); + } + } + + _pending_disc_read = (_pending_disc_read + 1) % PENDING_DISC_QUEUE_SIZE; + } +} + //============================================================================= // Central Mode - Scanning //============================================================================= @@ -707,11 +822,18 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) { if (!ble_hs_synced()) { unsigned long desync_duration = millis() - _host_desync_since; _scan_fail_count++; + // Capture NimBLE's internal reset reason (set in patched onReset callback) + int reset_reason = nimble_host_reset_reason; + if (reset_reason != 0) { + nimble_host_reset_reason = 0; + } WARNING("NimBLEPlatform: Host not synced, desync " + std::to_string(desync_duration / 1000) + "s (fail " + std::to_string(_scan_fail_count) + "/" + std::to_string(SCAN_FAIL_RECOVERY_THRESHOLD) + - ", resets=" + std::to_string(_host_reset_attempts) + ")"); + ", resets=" + std::to_string(_host_reset_attempts) + + (reset_reason != 0 ? ", nimble_reason=" + std::to_string(reset_reason) : "") + + ")"); // Tiered recovery: // 0-10s: Wait for natural self-recovery @@ -752,19 +874,33 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) { // Host is synced — clear desync tracking if (_host_desync_since != 0) { unsigned long recovery_time = millis() - _host_desync_since; - INFO("NimBLEPlatform: Host re-synced after " + std::to_string(recovery_time) + "ms"); + // Capture any remaining reset reason from NimBLE + int reset_reason = nimble_host_reset_reason; + if (reset_reason != 0) { + nimble_host_reset_reason = 0; + } + INFO("NimBLEPlatform: Host re-synced after " + std::to_string(recovery_time) + "ms" + + (reset_reason != 0 ? " (nimble_reason=" + std::to_string(reset_reason) + ")" : "")); _host_desync_since = 0; _host_reset_attempts = 0; + _last_desync_recovery = millis(); // Start cooldown before allowing connections } - // Log GAP hardware state before checking - DEBUG("NimBLEPlatform: Pre-scan GAP state: disc=" + std::to_string(ble_gap_disc_active()) + - " adv=" + std::to_string(ble_gap_adv_active()) + - " conn=" + std::to_string(ble_gap_conn_active())); + // Log GAP hardware state before checking (INFO for UDP visibility during soak test) + INFO("NimBLEPlatform: Pre-scan GAP: disc=" + std::to_string(ble_gap_disc_active()) + + " adv=" + std::to_string(ble_gap_adv_active()) + + " conn=" + std::to_string(ble_gap_conn_active())); + + // If a stale GAP connection is blocking scan, cancel it proactively + if (ble_gap_conn_active() && _master_state == MasterState::IDLE) { + WARNING("NimBLEPlatform: Stale GAP conn blocking scan - cancelling"); + ble_gap_conn_cancel(); + delay(50); // Let GAP process the cancel + } // Verify we can start scan if (!canStartScan()) { - DEBUG("NimBLEPlatform: Cannot start scan - state check failed" + + WARNING("NimBLEPlatform: Cannot start scan - state check failed" + std::string(" master=") + masterStateName(current_master) + " gap_disc=" + std::to_string(ble_gap_disc_active()) + " gap_conn=" + std::to_string(ble_gap_conn_active())); @@ -823,8 +959,11 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) { return true; } - // Scan failed - ERROR("NimBLEPlatform: Failed to start scan"); + // Scan failed — log GAP state for diagnosis + ERROR("NimBLEPlatform: Failed to start scan - GAP: disc=" + std::to_string(ble_gap_disc_active()) + + " conn=" + std::to_string(ble_gap_conn_active()) + + " adv=" + std::to_string(ble_gap_adv_active()) + + " master=" + masterStateName(_master_state)); // Reset state portENTER_CRITICAL(&_state_mux); @@ -834,9 +973,20 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) { _scan_fail_count++; if (_scan_fail_count >= SCAN_FAIL_RECOVERY_THRESHOLD) { - WARNING("NimBLEPlatform: Too many scan failures, entering error recovery"); _scan_fail_count = 0; // Reset so we don't immediately re-enter after recovery - enterErrorRecovery(); + _lightweight_reset_fails++; + + if (_lightweight_reset_fails >= LIGHTWEIGHT_RESET_MAX_FAILS) { + WARNING("NimBLEPlatform: " + std::to_string(_lightweight_reset_fails) + + " error recoveries failed to restore scan, escalating to full stack recovery"); + _lightweight_reset_fails = 0; + recoverBLEStack(); + } else { + WARNING("NimBLEPlatform: Too many scan failures, entering error recovery (" + + std::to_string(_lightweight_reset_fails) + "/" + + std::to_string(LIGHTWEIGHT_RESET_MAX_FAILS) + ")"); + enterErrorRecovery(); + } } resumeSlave(); @@ -899,6 +1049,14 @@ bool NimBLEPlatform::isScanning() const { bool NimBLEPlatform::connect(const BLEAddress& address, uint16_t timeout_ms) { NimBLEAddress nimAddr = toNimBLE(address); + // Skip connections during desync cooldown — connecting while the NimBLE + // stack is recovering from a desync can hang client->connect() (the host + // task can't process the completion event), leading to WDT crashes. + if (_host_desync_since != 0 || (_last_desync_recovery > 0 && millis() - _last_desync_recovery < DESYNC_CONNECT_COOLDOWN_MS)) { + DEBUG("NimBLEPlatform: Skipping connect during desync cooldown"); + return false; + } + // Rate limit connections to avoid overwhelming the BLE stack // Non-blocking: return false if too soon, caller can retry later static unsigned long last_connect_time = 0; @@ -1116,38 +1274,10 @@ int NimBLEPlatform::nativeGapEventHandler(struct ble_gap_event* event, void* arg break; } - // Clean up established connections (handles MAC rotation, out of range, etc.) - auto conn_it = platform->_connections.find(disc_handle); - if (conn_it != platform->_connections.end()) { - ConnectionHandle conn = conn_it->second; - platform->_connections.erase(conn_it); - - INFO("NimBLEPlatform: Native connection lost to " + conn.peer_address.toString() + - " reason=" + std::to_string(disc_reason)); - - // Clean up client object - auto client_it = platform->_clients.find(disc_handle); - if (client_it != platform->_clients.end()) { - if (client_it->second) { - NimBLEDevice::deleteClient(client_it->second); - } - platform->_clients.erase(client_it); - } - - // Clear operation queue for this connection - platform->clearForConnection(disc_handle); - - // Notify higher layers - if (platform->_on_disconnected) { - platform->_on_disconnected(conn, static_cast(disc_reason)); - } - - // Restart advertising if in peripheral/dual mode and not currently advertising - if ((platform->_config.role == Role::PERIPHERAL || platform->_config.role == Role::DUAL) && - !platform->isAdvertising()) { - platform->startAdvertising(); - } - } + // Defer map cleanup to BLE loop task to avoid data race. + // This callback runs in the NimBLE host task while the BLE loop task + // may be iterating _connections/_clients concurrently. + platform->queueDisconnect(disc_handle, disc_reason, false); break; } @@ -1527,13 +1657,22 @@ bool NimBLEPlatform::write(uint16_t conn_handle, const Bytes& data, bool respons return false; } - NimBLERemoteService* service = client->getService(UUID::SERVICE); - if (!service) { - WARNING("NimBLEPlatform::write: service not found for handle " + std::to_string(conn_handle)); - return false; + // Use cached RX characteristic pointer to avoid repeated service/char lookups + NimBLERemoteCharacteristic* rxChar = nullptr; + auto cached_it = _cached_rx_chars.find(conn_handle); + if (cached_it != _cached_rx_chars.end()) { + rxChar = cached_it->second; + } else { + NimBLERemoteService* service = client->getService(UUID::SERVICE); + if (!service) { + WARNING("NimBLEPlatform::write: service not found for handle " + std::to_string(conn_handle)); + return false; + } + rxChar = service->getCharacteristic(UUID::RX_CHAR); + if (rxChar) { + _cached_rx_chars[conn_handle] = rxChar; + } } - - NimBLERemoteCharacteristic* rxChar = service->getCharacteristic(UUID::RX_CHAR); if (!rxChar) { WARNING("NimBLEPlatform::write: RX char not found for handle " + std::to_string(conn_handle)); return false; @@ -1591,6 +1730,11 @@ bool NimBLEPlatform::read(uint16_t conn_handle, uint16_t char_handle, } NimBLEClient* client = client_it->second; + if (!client->isConnected()) { + if (callback) callback(OperationResult::DISCONNECTED, Bytes()); + return false; + } + NimBLERemoteService* service = client->getService(UUID::SERVICE); if (!service) { if (callback) callback(OperationResult::NOT_FOUND, Bytes()); @@ -1624,6 +1768,8 @@ bool NimBLEPlatform::enableNotifications(uint16_t conn_handle, bool enable) { } NimBLEClient* client = client_it->second; + if (!client->isConnected()) return false; + NimBLERemoteService* service = client->getService(UUID::SERVICE); if (!service) return false; @@ -1631,11 +1777,17 @@ bool NimBLEPlatform::enableNotifications(uint16_t conn_handle, bool enable) { if (!txChar) return false; if (enable) { - // Subscribe to notifications - auto notifyCb = [this, conn_handle](NimBLERemoteCharacteristic* pChar, + // Subscribe to notifications. + // Capture peer_address to guard against conn_handle reuse: if peer A disconnects + // (handle=1) and peer B connects (handle=1), we must not deliver B's data as A's. + BLEAddress expected_peer = getConnection(conn_handle).peer_address; + auto notifyCb = [this, conn_handle, expected_peer](NimBLERemoteCharacteristic* pChar, uint8_t* pData, size_t length, bool isNotify) { if (_on_data_received) { ConnectionHandle conn = getConnection(conn_handle); + if (!conn.isValid() || conn.peer_address != expected_peer) { + return; // Stale handle — peer changed + } Bytes data(pData, length); _on_data_received(conn, data); } @@ -1810,9 +1962,16 @@ void NimBLEPlatform::onConnect(NimBLEServer* pServer, NimBLEConnInfo& connInfo) conn.state = ConnectionState::CONNECTED; conn.mtu = MTU::MINIMUM - MTU::ATT_OVERHEAD; + // Read connection RSSI + int8_t rssi_val = 0; + if (ble_gap_conn_rssi(conn_handle, &rssi_val) == 0) { + conn.rssi = rssi_val; + } + _connections[conn_handle] = conn; - DEBUG("NimBLEPlatform: Central connected: " + conn.peer_address.toString()); + DEBUG("NimBLEPlatform: Central connected: " + conn.peer_address.toString() + + " rssi=" + std::to_string(conn.rssi)); if (_on_central_connected) { _on_central_connected(conn); @@ -1829,21 +1988,13 @@ void NimBLEPlatform::onDisconnect(NimBLEServer* pServer, NimBLEConnInfo& connInf uint16_t conn_handle = connInfo.getConnHandle(); - auto it = _connections.find(conn_handle); - if (it != _connections.end()) { - ConnectionHandle conn = it->second; - _connections.erase(it); + DEBUG("NimBLEPlatform: Central disconnect event for handle=" + std::to_string(conn_handle) + + " reason=" + std::to_string(reason)); - DEBUG("NimBLEPlatform: Central disconnected: " + conn.peer_address.toString() + - " reason: " + std::to_string(reason)); - - if (_on_central_disconnected) { - _on_central_disconnected(conn); - } - } - - // Clear operation queue for this connection - BLEOperationQueue::clearForConnection(conn_handle); + // Defer map cleanup to BLE loop task to avoid data race. + // This callback runs in the NimBLE host task while the BLE loop task + // may be iterating _connections concurrently. + queueDisconnect(conn_handle, reason, true); } void NimBLEPlatform::onMTUChange(uint16_t MTU, NimBLEConnInfo& connInfo) { @@ -1959,25 +2110,15 @@ void NimBLEPlatform::onDisconnect(NimBLEClient* pClient, int reason) { return; } - auto it = _connections.find(conn_handle); - if (it != _connections.end()) { - ConnectionHandle conn = it->second; - _connections.erase(it); + DEBUG("NimBLEPlatform: Client disconnect event for handle=" + std::to_string(conn_handle) + + " reason=" + std::to_string(reason)); - DEBUG("NimBLEPlatform: Disconnected from peripheral: " + conn.peer_address.toString() + - " reason: " + std::to_string(reason)); - - if (_on_disconnected) { - _on_disconnected(conn, static_cast(reason)); - } - } - - // Remove client - _clients.erase(conn_handle); - NimBLEDevice::deleteClient(pClient); - - // Clear operation queue - BLEOperationQueue::clearForConnection(conn_handle); + // Defer map cleanup to BLE loop task to avoid data race. + // This callback runs in the NimBLE host task while the BLE loop task + // may be iterating _connections/_clients concurrently. + // Note: NimBLEDevice::deleteClient() for this client will be called + // in processPendingDisconnects() from the loop task context. + queueDisconnect(conn_handle, reason, false); } //============================================================================= diff --git a/lib/ble_interface/platforms/NimBLEPlatform.h b/lib/ble_interface/platforms/NimBLEPlatform.h index 7982f082..19819df9 100644 --- a/lib/ble_interface/platforms/NimBLEPlatform.h +++ b/lib/ble_interface/platforms/NimBLEPlatform.h @@ -239,6 +239,25 @@ private: void resumeSlave(); void enterErrorRecovery(); + // Deferred disconnect queue (SPSC: NimBLE host task produces, BLE loop task consumes) + // Disconnect events arrive from the host task and must not modify _connections/_clients + // directly, as the BLE loop task may be iterating them concurrently. + static constexpr size_t PENDING_DISC_QUEUE_SIZE = 8; + struct PendingDisconnect { + uint16_t conn_handle; + int reason; + bool is_peripheral; // true = server disconnect, false = native GAP handler + }; + PendingDisconnect _pending_disc_queue[PENDING_DISC_QUEUE_SIZE]; + volatile uint8_t _pending_disc_write = 0; // Next write slot (host task only) + volatile uint8_t _pending_disc_read = 0; // Next read slot (loop task only) + + void queueDisconnect(uint16_t conn_handle, int reason, bool is_peripheral); + void processPendingDisconnects(); + + // Deferred error recovery (set from any context, processed in loop task) + volatile bool _error_recovery_requested = false; + // Track if slave was paused for a master operation bool _slave_paused_for_master = false; @@ -260,11 +279,13 @@ private: uint8_t _conn_establish_fail_count = 0; // rc=574 connection establishment failures unsigned long _last_full_recovery_time = 0; unsigned long _host_desync_since = 0; // millis() when host first lost sync (0 = synced) + unsigned long _last_desync_recovery = 0; // millis() when last desync recovered (for connect cooldown) uint8_t _host_reset_attempts = 0; // ble_hs_sched_reset attempts since last sync - static constexpr uint8_t SCAN_FAIL_RECOVERY_THRESHOLD = 10; + static constexpr uint8_t SCAN_FAIL_RECOVERY_THRESHOLD = 5; static constexpr uint8_t LIGHTWEIGHT_RESET_MAX_FAILS = 3; static constexpr uint8_t CONN_ESTABLISH_FAIL_THRESHOLD = 5; static constexpr unsigned long FULL_RECOVERY_COOLDOWN_MS = 60000; // 60 seconds + static constexpr unsigned long DESYNC_CONNECT_COOLDOWN_MS = 30000; // Don't connect for 30s after desync recovery static constexpr unsigned long HOST_DESYNC_REBOOT_MS = 60000; // Reboot after 60s desync (no connections) bool recoverBLEStack(); bool attemptHostReset(); @@ -281,6 +302,9 @@ private: // Client connections (as central) std::map _clients; + // Cached RX characteristic pointers (avoids repeated service/char lookups in write()) + std::map _cached_rx_chars; + // Connection tracking std::map _connections; diff --git a/lib/tdeck_ui/UI/LXMF/ConversationListScreen.cpp b/lib/tdeck_ui/UI/LXMF/ConversationListScreen.cpp index cd33326e..27ef06d4 100644 --- a/lib/tdeck_ui/UI/LXMF/ConversationListScreen.cpp +++ b/lib/tdeck_ui/UI/LXMF/ConversationListScreen.cpp @@ -195,6 +195,7 @@ void ConversationListScreen::refresh() { _conversations.clear(); _conversation_containers.clear(); _peer_hash_pool.clear(); + _has_unresolved_names = false; // Load conversations from store std::vector peer_hashes = _message_store->get_conversations(); @@ -226,17 +227,22 @@ void ConversationListScreen::refresh() { item.peer_hash = peer_hash; // Try to get display name from app_data, fall back to hash + bool resolved = false; Bytes app_data = Identity::recall_app_data(peer_hash); if (app_data && app_data.size() > 0) { String display_name = parse_display_name(app_data); if (display_name.length() > 0) { item.peer_name = display_name; + resolved = true; } else { item.peer_name = truncate_hash(peer_hash); } } else { item.peer_name = truncate_hash(peer_hash); } + if (!resolved) { + _has_unresolved_names = true; + } // Get message content for preview String content((const char*)last_msg.content().data(), last_msg.content().size()); @@ -544,6 +550,23 @@ void ConversationListScreen::update_status() { lv_obj_set_style_text_color(_label_battery_icon, battery_color, 0); lv_obj_set_style_text_color(_label_battery_pct, battery_color, 0); } + + // Check if any unresolved display names can now be resolved + // (announces may have arrived since the list was last refreshed) + if (_has_unresolved_names && _message_store) { + for (const auto& item : _conversations) { + Bytes app_data = Identity::recall_app_data(item.peer_hash); + if (app_data && app_data.size() > 0) { + String name = parse_display_name(app_data); + if (name.length() > 0 && item.peer_name != name) { + // A name has become available — refresh the whole list + DEBUG("Display name resolved, refreshing conversation list"); + refresh(); + return; + } + } + } + } } void ConversationListScreen::on_conversation_clicked(lv_event_t* event) { diff --git a/lib/tdeck_ui/UI/LXMF/ConversationListScreen.h b/lib/tdeck_ui/UI/LXMF/ConversationListScreen.h index 0984fb69..2606bce3 100644 --- a/lib/tdeck_ui/UI/LXMF/ConversationListScreen.h +++ b/lib/tdeck_ui/UI/LXMF/ConversationListScreen.h @@ -197,6 +197,7 @@ private: std::vector _conversation_containers; // For focus group management std::vector _peer_hash_pool; // Object pool to avoid per-item allocations RNS::Bytes _pending_delete_hash; // Hash of conversation pending deletion + bool _has_unresolved_names = false; // True if any conversation shows hash instead of name ConversationSelectedCallback _conversation_selected_callback; ComposeCallback _compose_callback; diff --git a/patch_nimble.py b/patch_nimble.py index 6d572f39..f62a47be 100644 --- a/patch_nimble.py +++ b/patch_nimble.py @@ -7,6 +7,16 @@ Patch 1 — ble_hs.c: Remove assert(0) in BLE_HS_SYNC_STATE_BRINGUP timer handle Patch 2 — NimBLEClient.cpp: Add null checks in PHY update event handler. If a client is deleted while events are queued, the callback arg becomes a dangling pointer. Guard against null pClient and null m_pClientCallbacks. + +Patch 3 — ble_gap.c: Handle BLE_ERR_CONN_ESTABLISHMENT (574) in conn complete handler. + NimBLE only handles 574 when BLE_PERIODIC_ADV_WITH_RESPONSES is enabled. Without this + patch, 574 falls through to assert(0) and the master GAP state is never cleaned up, + leaving scan/advertise permanently broken. The ESP32-S3 controller returns 574 when + connection establishment fails (peer disappeared, RF interference, etc.). + +Patch 4 — NimBLEDevice.cpp: Expose host reset reason via global volatile variable. + NimBLE's onReset callback logs reason to ESP_LOG (serial only). This patch adds a + global volatile int that application code can poll to capture the reason in UDP logs. """ Import("env") import os @@ -76,3 +86,71 @@ apply_patch( } // BLE_GAP_EVENT_PHY_UPDATE_COMPLETE""", "NimBLEClient.cpp: added null guard in PHY update handler" ) + +# Patch 3: ble_gap.c — handle BLE_ERR_CONN_ESTABLISHMENT (574) without PAwR +# The ESP32-S3 controller returns error 574 when connection establishment fails. +# NimBLE only handles this in the BLE_PERIODIC_ADV_WITH_RESPONSES path. Without it, +# 574 hits the default case (assert(0) + no cleanup), leaving master GAP state stuck +# in BLE_GAP_OP_M_CONN — permanently blocking scan and advertising. +apply_patch( + os.path.join(NIMBLE_BASE, "nimble", "nimble", "host", "src", "ble_gap.c"), + """#if MYNEWT_VAL(BLE_PERIODIC_ADV_WITH_RESPONSES) + case BLE_ERR_CONN_ESTABLISHMENT: + if (!v1_evt) { + ble_gap_rx_conn_comp_failed(evt); + } + break; +#endif // MYNEWT_VAL(BLE_PERIODIC_ADV_WITH_RESPONSES) + default: + /* this should never happen, unless controller is broken */ + BLE_HS_LOG(INFO, "controller reported invalid error code in conn" + "complete event: %u", evt->status); + assert(0); + break;""", + """ case BLE_ERR_CONN_ESTABLISHMENT: + /* Connection establishment failed (e.g. peer disappeared). + * Clean up master GAP state so scan/advertise can resume. + * Without this, the master state stays stuck in BLE_GAP_OP_M_CONN. */ + if (ble_gap_master_in_progress()) { + ble_gap_master_failed(BLE_HS_ECONTROLLER); + } + break; + default: + /* this should never happen, unless controller is broken */ + BLE_HS_LOG(INFO, "controller reported invalid error code in conn" + "complete event: %u", evt->status); + if (ble_gap_master_in_progress()) { + ble_gap_master_failed(BLE_HS_ECONTROLLER); + } + break;""", + "ble_gap.c: handle BLE_ERR_CONN_ESTABLISHMENT (574) to prevent stuck GAP state" +) + +# Patch 4: NimBLEDevice.cpp — expose host reset reason for application-level logging +# NimBLE's onReset callback logs via NIMBLE_LOGE which only goes to serial UART. +# This patch adds a global volatile int that our BLE loop can poll and log via UDP. +apply_patch( + os.path.join(NIMBLE_BASE, "NimBLEDevice.cpp"), + """void NimBLEDevice::onReset(int reason) { + if (!m_synced) { + return; + } + + m_synced = false; + + NIMBLE_LOGE(LOG_TAG, "Host reset; reason=%d, %s", reason, NimBLEUtils::returnCodeToString(reason)); +} // onReset""", + """volatile int nimble_host_reset_reason = 0; + +void NimBLEDevice::onReset(int reason) { + if (!m_synced) { + return; + } + + m_synced = false; + nimble_host_reset_reason = reason; + + NIMBLE_LOGE(LOG_TAG, "Host reset; reason=%d, %s", reason, NimBLEUtils::returnCodeToString(reason)); +} // onReset""", + "NimBLEDevice.cpp: expose host reset reason for application-level logging" +) diff --git a/src/main.cpp b/src/main.cpp index 7efe5001..a8e108e1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1196,6 +1196,9 @@ void setup() { INFO("╚══════════════════════════════════════╝"); INFO(""); + // Capture ESP reset reason early (before WiFi) — logged after WiFi init for UDP visibility + esp_reset_reason_t _boot_reset_reason = esp_reset_reason(); + // Check for LXST crash breadcrumb from previous boot { Preferences _dbg; @@ -1251,6 +1254,28 @@ void setup() { setup_wifi(); BOOT_PROFILE_END("wifi"); + // Log ESP reset reason after WiFi so it reaches UDP logs + { + const char* reason_str = "UNKNOWN"; + switch (_boot_reset_reason) { + case ESP_RST_POWERON: reason_str = "POWERON"; break; + case ESP_RST_SW: reason_str = "SOFTWARE"; break; + case ESP_RST_PANIC: reason_str = "PANIC"; break; + case ESP_RST_INT_WDT: reason_str = "INT_WDT"; break; + case ESP_RST_TASK_WDT: reason_str = "TASK_WDT"; break; + case ESP_RST_WDT: reason_str = "WDT"; break; + case ESP_RST_DEEPSLEEP: reason_str = "DEEPSLEEP"; break; + case ESP_RST_BROWNOUT: reason_str = "BROWNOUT"; break; + case ESP_RST_SDIO: reason_str = "SDIO"; break; + default: break; + } + if (_boot_reset_reason != ESP_RST_POWERON) { + WARNING("Reset reason: " + std::string(reason_str) + " (" + std::to_string((int)_boot_reset_reason) + ")"); + } else { + INFO("Reset reason: " + std::string(reason_str)); + } + } + // Initialize LVGL and hardware drivers BOOT_PROFILE_START("lvgl"); setup_lvgl_and_ui();