mirror of
https://github.com/torlando-tech/pyxis.git
synced 2026-03-29 05:19:50 +00:00
NimBLE patches: fix 574 stuck GAP state, add desync diagnostics
Patch 3 (ble_gap.c): Handle BLE_ERR_CONN_ESTABLISHMENT (574) unconditionally. NimBLE only handled 574 under BLE_PERIODIC_ADV_WITH_RESPONSES (disabled on ESP32), causing ble_gap_master_failed() to never be called. This left the master GAP state stuck in BLE_GAP_OP_M_CONN, permanently blocking scan and advertising. Also clean up master state in the default case instead of assert(0).

Patch 4 (NimBLEDevice.cpp): Expose host reset reason via global volatile int. NimBLE's onReset callback logs the reason code through ESP_LOG (serial UART only). This patch adds nimble_host_reset_reason that the BLE loop polls to capture the reason in UDP log output for remote soak test monitoring.

NimBLEPlatform.cpp: Escalate persistent scan failures to full stack recovery. After 3 consecutive enterErrorRecovery() rounds fail to restore scanning (30 total scan failures), escalate to recoverBLEStack() (clean reboot) instead of looping indefinitely in a broken state.

Validated with 17+ hour soak test: device recovers from desyncs and maintains 3 active BLE connections with stable heap (~43K).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -36,6 +36,10 @@ extern "C" {
|
||||
void ble_hs_sched_reset(int reason);
|
||||
}
|
||||
|
||||
// Defined in patched NimBLEDevice.cpp — set in onReset callback with the reason code.
|
||||
// Poll from BLE loop to log via UDP (NimBLE's own logging only reaches serial UART).
|
||||
extern volatile int nimble_host_reset_reason;
|
||||
|
||||
namespace RNS { namespace BLE {
|
||||
|
||||
//=============================================================================
|
||||
@@ -794,11 +798,18 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) {
|
||||
if (!ble_hs_synced()) {
|
||||
unsigned long desync_duration = millis() - _host_desync_since;
|
||||
_scan_fail_count++;
|
||||
// Capture NimBLE's internal reset reason (set in patched onReset callback)
|
||||
int reset_reason = nimble_host_reset_reason;
|
||||
if (reset_reason != 0) {
|
||||
nimble_host_reset_reason = 0;
|
||||
}
|
||||
WARNING("NimBLEPlatform: Host not synced, desync " +
|
||||
std::to_string(desync_duration / 1000) + "s (fail " +
|
||||
std::to_string(_scan_fail_count) + "/" +
|
||||
std::to_string(SCAN_FAIL_RECOVERY_THRESHOLD) +
|
||||
", resets=" + std::to_string(_host_reset_attempts) + ")");
|
||||
", resets=" + std::to_string(_host_reset_attempts) +
|
||||
(reset_reason != 0 ? ", nimble_reason=" + std::to_string(reset_reason) : "") +
|
||||
")");
|
||||
|
||||
// Tiered recovery:
|
||||
// 0-10s: Wait for natural self-recovery
|
||||
@@ -839,7 +850,13 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) {
|
||||
// Host is synced — clear desync tracking
|
||||
if (_host_desync_since != 0) {
|
||||
unsigned long recovery_time = millis() - _host_desync_since;
|
||||
INFO("NimBLEPlatform: Host re-synced after " + std::to_string(recovery_time) + "ms");
|
||||
// Capture any remaining reset reason from NimBLE
|
||||
int reset_reason = nimble_host_reset_reason;
|
||||
if (reset_reason != 0) {
|
||||
nimble_host_reset_reason = 0;
|
||||
}
|
||||
INFO("NimBLEPlatform: Host re-synced after " + std::to_string(recovery_time) + "ms" +
|
||||
(reset_reason != 0 ? " (nimble_reason=" + std::to_string(reset_reason) + ")" : ""));
|
||||
_host_desync_since = 0;
|
||||
_host_reset_attempts = 0;
|
||||
}
|
||||
@@ -921,10 +938,21 @@ bool NimBLEPlatform::startScan(uint16_t duration_ms) {
|
||||
|
||||
_scan_fail_count++;
|
||||
if (_scan_fail_count >= SCAN_FAIL_RECOVERY_THRESHOLD) {
|
||||
WARNING("NimBLEPlatform: Too many scan failures, entering error recovery");
|
||||
_scan_fail_count = 0; // Reset so we don't immediately re-enter after recovery
|
||||
_lightweight_reset_fails++;
|
||||
|
||||
if (_lightweight_reset_fails >= LIGHTWEIGHT_RESET_MAX_FAILS) {
|
||||
WARNING("NimBLEPlatform: " + std::to_string(_lightweight_reset_fails) +
|
||||
" error recoveries failed to restore scan, escalating to full stack recovery");
|
||||
_lightweight_reset_fails = 0;
|
||||
recoverBLEStack();
|
||||
} else {
|
||||
WARNING("NimBLEPlatform: Too many scan failures, entering error recovery (" +
|
||||
std::to_string(_lightweight_reset_fails) + "/" +
|
||||
std::to_string(LIGHTWEIGHT_RESET_MAX_FAILS) + ")");
|
||||
enterErrorRecovery();
|
||||
}
|
||||
}
|
||||
|
||||
resumeSlave();
|
||||
return false;
|
||||
|
||||
@@ -7,6 +7,16 @@ Patch 1 — ble_hs.c: Remove assert(0) in BLE_HS_SYNC_STATE_BRINGUP timer handle
|
||||
Patch 2 — NimBLEClient.cpp: Add null checks in PHY update event handler.
|
||||
If a client is deleted while events are queued, the callback arg becomes a dangling
|
||||
pointer. Guard against null pClient and null m_pClientCallbacks.
|
||||
|
||||
Patch 3 — ble_gap.c: Handle BLE_ERR_CONN_ESTABLISHMENT (574) in conn complete handler.
|
||||
NimBLE only handles 574 when BLE_PERIODIC_ADV_WITH_RESPONSES is enabled. Without this
|
||||
patch, 574 falls through to assert(0) and the master GAP state is never cleaned up,
|
||||
leaving scan/advertise permanently broken. The ESP32-S3 controller returns 574 when
|
||||
connection establishment fails (peer disappeared, RF interference, etc.).
|
||||
|
||||
Patch 4 — NimBLEDevice.cpp: Expose host reset reason via global volatile variable.
|
||||
NimBLE's onReset callback logs the reason via NIMBLE_LOGE (serial UART only). This patch adds a
|
||||
global volatile int that application code can poll to capture the reason in UDP logs.
|
||||
"""
|
||||
Import("env")
|
||||
import os
|
||||
@@ -76,3 +86,71 @@ apply_patch(
|
||||
} // BLE_GAP_EVENT_PHY_UPDATE_COMPLETE""",
|
||||
"NimBLEClient.cpp: added null guard in PHY update handler"
|
||||
)
|
||||
|
||||
# Patch 3: ble_gap.c — handle BLE_ERR_CONN_ESTABLISHMENT (574) without PAwR
|
||||
# The ESP32-S3 controller reports HCI status 0x3E (BLE_ERR_CONN_ESTABLISHMENT) when
# connection establishment fails; the host surfaces it as error 574 (0x200 + 0x3E).
|
||||
# NimBLE only handles this in the BLE_PERIODIC_ADV_WITH_RESPONSES path. Without it,
|
||||
# 574 hits the default case (assert(0) + no cleanup), leaving master GAP state stuck
|
||||
# in BLE_GAP_OP_M_CONN — permanently blocking scan and advertising.
|
||||
apply_patch(
|
||||
os.path.join(NIMBLE_BASE, "nimble", "nimble", "host", "src", "ble_gap.c"),
|
||||
"""#if MYNEWT_VAL(BLE_PERIODIC_ADV_WITH_RESPONSES)
|
||||
case BLE_ERR_CONN_ESTABLISHMENT:
|
||||
if (!v1_evt) {
|
||||
ble_gap_rx_conn_comp_failed(evt);
|
||||
}
|
||||
break;
|
||||
#endif // MYNEWT_VAL(BLE_PERIODIC_ADV_WITH_RESPONSES)
|
||||
default:
|
||||
/* this should never happen, unless controller is broken */
|
||||
BLE_HS_LOG(INFO, "controller reported invalid error code in conn"
|
||||
"complete event: %u", evt->status);
|
||||
assert(0);
|
||||
break;""",
|
||||
""" case BLE_ERR_CONN_ESTABLISHMENT:
|
||||
/* Connection establishment failed (e.g. peer disappeared).
|
||||
* Clean up master GAP state so scan/advertise can resume.
|
||||
* Without this, the master state stays stuck in BLE_GAP_OP_M_CONN. */
|
||||
if (ble_gap_master_in_progress()) {
|
||||
ble_gap_master_failed(BLE_HS_ECONTROLLER);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
/* this should never happen, unless controller is broken */
|
||||
BLE_HS_LOG(INFO, "controller reported invalid error code in conn"
|
||||
"complete event: %u", evt->status);
|
||||
if (ble_gap_master_in_progress()) {
|
||||
ble_gap_master_failed(BLE_HS_ECONTROLLER);
|
||||
}
|
||||
break;""",
|
||||
"ble_gap.c: handle BLE_ERR_CONN_ESTABLISHMENT (574) to prevent stuck GAP state"
|
||||
)
|
||||
|
||||
# Patch 4: NimBLEDevice.cpp — expose host reset reason for application-level logging
|
||||
# NimBLE's onReset callback logs via NIMBLE_LOGE which only goes to serial UART.
|
||||
# This patch adds a global volatile int that our BLE loop can poll and log via UDP.
|
||||
apply_patch(
|
||||
os.path.join(NIMBLE_BASE, "NimBLEDevice.cpp"),
|
||||
"""void NimBLEDevice::onReset(int reason) {
|
||||
if (!m_synced) {
|
||||
return;
|
||||
}
|
||||
|
||||
m_synced = false;
|
||||
|
||||
NIMBLE_LOGE(LOG_TAG, "Host reset; reason=%d, %s", reason, NimBLEUtils::returnCodeToString(reason));
|
||||
} // onReset""",
|
||||
"""volatile int nimble_host_reset_reason = 0;
|
||||
|
||||
void NimBLEDevice::onReset(int reason) {
|
||||
if (!m_synced) {
|
||||
return;
|
||||
}
|
||||
|
||||
m_synced = false;
|
||||
nimble_host_reset_reason = reason;
|
||||
|
||||
NIMBLE_LOGE(LOG_TAG, "Host reset; reason=%d, %s", reason, NimBLEUtils::returnCodeToString(reason));
|
||||
} // onReset""",
|
||||
"NimBLEDevice.cpp: expose host reset reason for application-level logging"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user