NimBLE patches: fix 574 stuck GAP state, add desync diagnostics

Patch 3 (ble_gap.c): Handle BLE_ERR_CONN_ESTABLISHMENT (574) unconditionally.
NimBLE only handled 574 under BLE_PERIODIC_ADV_WITH_RESPONSES (disabled on
ESP32), causing ble_gap_master_failed() to never be called. This left the
master GAP state stuck in BLE_GAP_OP_M_CONN, permanently blocking scan and
advertising. Also clean up master state in the default case instead of
assert(0).

Patch 4 (NimBLEDevice.cpp): Expose host reset reason via global volatile int.
NimBLE's onReset callback logs the reason code through ESP_LOG (serial UART
only). This patch adds nimble_host_reset_reason that the BLE loop polls to
capture the reason in UDP log output for remote soak test monitoring.

NimBLEPlatform.cpp: Escalate persistent scan failures to full stack recovery.
Each time the scan-failure threshold is reached, a lightweight-recovery counter
is incremented. The first two times enterErrorRecovery() runs; on the 3rd
consecutive time (30 total scan failures) the code escalates to
recoverBLEStack() (clean reboot) instead of looping indefinitely in a broken
state.

Validated with 17+ hour soak test: device recovers from desyncs and maintains
3 active BLE connections with stable heap (~43K).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
torlando-tech
2026-03-03 12:49:41 -05:00
parent 8d23c03e3b
commit 74d832fb63
2 changed files with 110 additions and 4 deletions

View File

@@ -36,6 +36,10 @@ extern "C" {
void ble_hs_sched_reset(int reason);
}
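// Defined in patched NimBLEDevice.cpp — set in the onReset callback with the reason code.
// Poll from the BLE loop to log via UDP (NimBLE's own logging only reaches serial UART).
// NOTE(review): `volatile` does not make the read-then-clear in startScan() atomic; if
// onReset runs on another task, a reason stored between the read and the clear is lost.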
extern volatile int nimble_host_reset_reason;
namespace RNS { namespace BLE {
//=============================================================================
@@ -794,11 +798,18 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) {
if (!ble_hs_synced()) {
unsigned long desync_duration = millis() - _host_desync_since;
_scan_fail_count++;
// Capture NimBLE's internal reset reason (set in patched onReset callback)
int reset_reason = nimble_host_reset_reason;
if (reset_reason != 0) {
nimble_host_reset_reason = 0;
}
WARNING("NimBLEPlatform: Host not synced, desync " +
std::to_string(desync_duration / 1000) + "s (fail " +
std::to_string(_scan_fail_count) + "/" +
std::to_string(SCAN_FAIL_RECOVERY_THRESHOLD) +
", resets=" + std::to_string(_host_reset_attempts) + ")");
", resets=" + std::to_string(_host_reset_attempts) +
(reset_reason != 0 ? ", nimble_reason=" + std::to_string(reset_reason) : "") +
")");
// Tiered recovery:
// 0-10s: Wait for natural self-recovery
@@ -839,7 +850,13 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) {
// Host is synced — clear desync tracking
if (_host_desync_since != 0) {
unsigned long recovery_time = millis() - _host_desync_since;
INFO("NimBLEPlatform: Host re-synced after " + std::to_string(recovery_time) + "ms");
// Capture any remaining reset reason from NimBLE
int reset_reason = nimble_host_reset_reason;
if (reset_reason != 0) {
nimble_host_reset_reason = 0;
}
INFO("NimBLEPlatform: Host re-synced after " + std::to_string(recovery_time) + "ms" +
(reset_reason != 0 ? " (nimble_reason=" + std::to_string(reset_reason) + ")" : ""));
_host_desync_since = 0;
_host_reset_attempts = 0;
}
@@ -921,10 +938,21 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) {
_scan_fail_count++;
if (_scan_fail_count >= SCAN_FAIL_RECOVERY_THRESHOLD) {
WARNING("NimBLEPlatform: Too many scan failures, entering error recovery");
_scan_fail_count = 0; // Reset so we don't immediately re-enter after recovery
_lightweight_reset_fails++;
if (_lightweight_reset_fails >= LIGHTWEIGHT_RESET_MAX_FAILS) {
WARNING("NimBLEPlatform: " + std::to_string(_lightweight_reset_fails) +
" error recoveries failed to restore scan, escalating to full stack recovery");
_lightweight_reset_fails = 0;
recoverBLEStack();
} else {
WARNING("NimBLEPlatform: Too many scan failures, entering error recovery (" +
std::to_string(_lightweight_reset_fails) + "/" +
std::to_string(LIGHTWEIGHT_RESET_MAX_FAILS) + ")");
enterErrorRecovery();
}
}
resumeSlave();
return false;

View File

@@ -7,6 +7,16 @@ Patch 1 — ble_hs.c: Remove assert(0) in BLE_HS_SYNC_STATE_BRINGUP timer handle
Patch 2 — NimBLEClient.cpp: Add null checks in PHY update event handler.
If a client is deleted while events are queued, the callback arg becomes a dangling
pointer. Guard against null pClient and null m_pClientCallbacks.
Patch 3 — ble_gap.c: Handle BLE_ERR_CONN_ESTABLISHMENT (574) in conn complete handler.
NimBLE only handles 574 when BLE_PERIODIC_ADV_WITH_RESPONSES is enabled. Without this
patch, 574 falls through to assert(0) and the master GAP state is never cleaned up,
leaving scan/advertise permanently broken. The ESP32-S3 controller returns 574 when
connection establishment fails (peer disappeared, RF interference, etc.).
Patch 4 — NimBLEDevice.cpp: Expose host reset reason via global volatile variable.
NimBLE's onReset callback logs reason to ESP_LOG (serial only). This patch adds a
global volatile int that application code can poll to capture the reason in UDP logs.
"""
Import("env")
import os
@@ -76,3 +86,71 @@ apply_patch(
} // BLE_GAP_EVENT_PHY_UPDATE_COMPLETE""",
"NimBLEClient.cpp: added null guard in PHY update handler"
)
# Patch 3: ble_gap.c — handle BLE_ERR_CONN_ESTABLISHMENT (574) without PAwR
# The ESP32-S3 controller returns error 574 when connection establishment fails.
# NimBLE only handles this in the BLE_PERIODIC_ADV_WITH_RESPONSES (PAwR) path. Without it,
# 574 hits the default case (assert(0) + no cleanup), leaving master GAP state stuck
# in BLE_GAP_OP_M_CONN — permanently blocking scan and advertising.
#
# What the replacement text does, compared with the text it replaces:
#   - drops the #if/#endif guard so the 574 case is always compiled in;
#   - calls ble_gap_master_failed(BLE_HS_ECONTROLLER) when a master operation is in
#     progress, in place of the PAwR-only ble_gap_rx_conn_comp_failed(evt) call;
#   - swaps assert(0) in the default case for the same master-state cleanup.
# NOTE(review): the PAwR-specific ble_gap_rx_conn_comp_failed(evt) handling is removed,
# not preserved — confirm that is acceptable if PAwR is ever enabled for this build.
# NOTE(review): the first string is presumably matched verbatim against the upstream
# source by apply_patch (defined outside this view) — keep it in sync with ble_gap.c.
apply_patch(
os.path.join(NIMBLE_BASE, "nimble", "nimble", "host", "src", "ble_gap.c"),
"""#if MYNEWT_VAL(BLE_PERIODIC_ADV_WITH_RESPONSES)
case BLE_ERR_CONN_ESTABLISHMENT:
if (!v1_evt) {
ble_gap_rx_conn_comp_failed(evt);
}
break;
#endif // MYNEWT_VAL(BLE_PERIODIC_ADV_WITH_RESPONSES)
default:
/* this should never happen, unless controller is broken */
BLE_HS_LOG(INFO, "controller reported invalid error code in conn"
"complete event: %u", evt->status);
assert(0);
break;""",
""" case BLE_ERR_CONN_ESTABLISHMENT:
/* Connection establishment failed (e.g. peer disappeared).
* Clean up master GAP state so scan/advertise can resume.
* Without this, the master state stays stuck in BLE_GAP_OP_M_CONN. */
if (ble_gap_master_in_progress()) {
ble_gap_master_failed(BLE_HS_ECONTROLLER);
}
break;
default:
/* this should never happen, unless controller is broken */
BLE_HS_LOG(INFO, "controller reported invalid error code in conn"
"complete event: %u", evt->status);
if (ble_gap_master_in_progress()) {
ble_gap_master_failed(BLE_HS_ECONTROLLER);
}
break;""",
"ble_gap.c: handle BLE_ERR_CONN_ESTABLISHMENT (574) to prevent stuck GAP state"
)
# Patch 4: NimBLEDevice.cpp — expose host reset reason for application-level logging
# NimBLE's onReset callback logs via NIMBLE_LOGE which only goes to serial UART.
# This patch adds a global volatile int that our BLE loop can poll and log via UDP.
#
# The replacement text defines nimble_host_reset_reason (initialised to 0) directly
# before NimBLEDevice::onReset and stores `reason` into it after the m_synced guard.
# Because onReset returns early when m_synced is already false, only the first reset
# after a sync records its reason; later resets while unsynced leave the value as is.
# The reader in NimBLEPlatform::startScan() treats 0 as "nothing pending" and writes
# 0 back after it has read a non-zero value.
# NOTE(review): the first string is presumably matched verbatim against the upstream
# source by apply_patch (defined outside this view) — keep it in sync with NimBLEDevice.cpp.
apply_patch(
os.path.join(NIMBLE_BASE, "NimBLEDevice.cpp"),
"""void NimBLEDevice::onReset(int reason) {
if (!m_synced) {
return;
}
m_synced = false;
NIMBLE_LOGE(LOG_TAG, "Host reset; reason=%d, %s", reason, NimBLEUtils::returnCodeToString(reason));
} // onReset""",
"""volatile int nimble_host_reset_reason = 0;
void NimBLEDevice::onReset(int reason) {
if (!m_synced) {
return;
}
m_synced = false;
nimble_host_reset_reason = reason;
NIMBLE_LOGE(LOG_TAG, "Host reset; reason=%d, %s", reason, NimBLEUtils::returnCodeToString(reason));
} // onReset""",
"NimBLEDevice.cpp: expose host reset reason for application-level logging"
)