Add system health monitoring and reporting functionality

- Implemented periodic updates of system health status to the database every 30 seconds.
- Added a new method to aggregate and return comprehensive health status of all components, including core connection, database, services, and web viewer.
- Introduced an API endpoint at /api/system-health to retrieve system health data, providing a structured response for health status and component details.
- Enhanced error handling for health data storage and retrieval processes, ensuring robust logging and feedback in case of issues.
This commit is contained in:
agessaman
2026-01-01 16:37:03 -08:00
parent be14617d34
commit 73492c8ff1
3 changed files with 154 additions and 0 deletions
+98
View File
@@ -887,6 +887,16 @@ use_zulu_time = false
except (AttributeError, TypeError) as e:
print(f"Web viewer health check failed: {e}")
# Periodically update system health in database (every 30 seconds)
if not hasattr(self, '_last_health_update'):
self._last_health_update = 0
if time.time() - self._last_health_update >= 30:
try:
await self.get_system_health() # This stores it in the database
self._last_health_update = time.time()
except Exception as e:
self.logger.debug(f"Error updating system health: {e}")
await asyncio.sleep(5) # Check every 5 seconds
except KeyboardInterrupt:
self.logger.info("Received interrupt signal")
@@ -931,6 +941,94 @@ use_zulu_time = false
except (AttributeError, TypeError):
print("Bot stopped")
async def get_system_health(self) -> Dict[str, Any]:
    """Aggregate health status from all components.

    Checks the core connection, the database, every entry in
    ``self.services`` and the web viewer integration (when those attributes
    exist and are truthy), derives an overall status from the results and
    hands the report to ``self.db_manager.set_system_health``.

    Returns:
        Dictionary with ``status`` ('healthy', 'degraded' or 'unhealthy'),
        ``timestamp``, ``uptime_seconds`` and a ``components`` mapping that
        holds the per-component details.
    """
    # Sample the clock once so timestamp and uptime describe the same instant.
    now = time.time()
    components: Dict[str, Dict[str, Any]] = {}
    health: Dict[str, Any] = {
        'status': 'healthy',
        'timestamp': now,
        'uptime_seconds': now - self.start_time,
        'components': components,
    }

    # Check core connection. bool() keeps the flag a real boolean even when
    # self.connected is a falsy non-bool such as None.
    core_connected = bool(self.connected) and self.meshcore is not None
    components['meshcore'] = {
        'healthy': core_connected,
        'message': 'Connected' if core_connected else 'Disconnected',
    }

    # Check database
    try:
        stats = self.db_manager.get_database_stats()
        components['database'] = {
            'healthy': True,
            'entries': (
                stats.get('geocoding_cache_entries', 0)
                + stats.get('generic_cache_entries', 0)
            ),
            'message': 'Operational',
        }
    except Exception as e:
        components['database'] = {
            'healthy': False,
            'error': str(e),
            'message': f'Error: {e}',
        }

    # Check services
    services = getattr(self, 'services', None)
    if services:
        for name, service in services.items():
            try:
                # Services have is_running() method, not health_check().
                # A service without it is reported as stopped.
                is_running = (
                    bool(service.is_running())
                    if hasattr(service, 'is_running') else False
                )
                components[f'service_{name}'] = {
                    'healthy': is_running,
                    'message': 'Running' if is_running else 'Stopped',
                    'enabled': getattr(service, 'enabled', True),
                }
            except Exception as e:
                components[f'service_{name}'] = {
                    'healthy': False,
                    'error': str(e),
                    'message': f'Error: {e}',
                }

    # Check web viewer if available
    viewer = getattr(self, 'web_viewer_integration', None)
    if viewer:
        try:
            # An integration without is_viewer_healthy() is assumed healthy.
            is_healthy = (
                bool(viewer.is_viewer_healthy())
                if hasattr(viewer, 'is_viewer_healthy') else True
            )
            components['web_viewer'] = {
                'healthy': is_healthy,
                'message': 'Operational' if is_healthy else 'Unhealthy',
            }
        except Exception as e:
            components['web_viewer'] = {
                'healthy': False,
                'error': str(e),
                'message': f'Error: {e}',
            }

    # Determine overall status: some components failing -> degraded,
    # every component failing -> unhealthy.
    unhealthy = [
        name for name, details in components.items()
        if not details.get('healthy', True)
    ]
    if unhealthy:
        if len(unhealthy) < len(components):
            health['status'] = 'degraded'
        else:
            health['status'] = 'unhealthy'

    # Store health data in database for web viewer access. This is
    # best-effort: a storage failure must not prevent returning the report.
    try:
        self.db_manager.set_system_health(health)
    except Exception as e:
        self.logger.debug(f"Could not store system health in database: {e}")

    return health
def _cleanup_web_viewer(self):
"""Cleanup web viewer on exit"""
try:
+21
View File
@@ -499,3 +499,24 @@ class DBManager:
def set_bot_start_time(self, start_time: float):
    """Persist the bot start time under the 'start_time' metadata key.

    The value is written as its ``str()`` representation via
    ``self.set_metadata``.
    """
    serialized = str(start_time)
    self.set_metadata('start_time', serialized)
def set_system_health(self, health_data: Dict[str, Any]):
    """Serialize the health report to JSON and save it as metadata.

    The JSON text is stored under the 'system_health' metadata key. Any
    failure while serializing or storing is logged at error level and
    swallowed, so this method never raises.
    """
    try:
        import json
        self.set_metadata('system_health', json.dumps(health_data))
    except Exception as exc:
        self.logger.error(f"Error storing system health: {exc}")
def get_system_health(self) -> Optional[Dict[str, Any]]:
    """Load the stored health report from the 'system_health' metadata key.

    Returns:
        The decoded JSON value, or None when nothing is stored (missing or
        empty value) or when reading/decoding fails. Failures are logged at
        error level rather than raised.
    """
    try:
        import json
        stored = self.get_metadata('system_health')
        return json.loads(stored) if stored else None
    except Exception as exc:
        self.logger.error(f"Error getting system health: {exc}")
        return None
+35
View File
@@ -291,6 +291,41 @@ class BotDataViewer:
'version': 'modern_2.0'
})
@self.app.route('/api/system-health')
def api_system_health():
    """Get comprehensive system health status from database.

    Returns the stored health report as JSON. ``timestamp`` is set to the
    current time and ``uptime_seconds`` is recomputed when a start time is
    available. Because the stored snapshot may be old, the time it was
    recorded is kept in ``last_updated`` and its age in
    ``data_age_seconds`` so clients can detect stale data.

    When no report is stored, a minimal payload with status 'unknown' is
    returned. Any failure yields a JSON error payload with HTTP status 500.
    """
    try:
        # Sample the clock once so every derived field is consistent.
        now = time.time()

        # Read health data from database (consistent with how other data is accessed)
        health_data = self.db_manager.get_system_health()
        if not health_data:
            # If no health data in database, return minimal status
            return jsonify({
                'status': 'unknown',
                'timestamp': now,
                'message': 'Health data not available yet',
                'components': {}
            })

        # Keep the time the snapshot was recorded before the timestamp is
        # replaced below; otherwise an old snapshot would look current.
        recorded_at = health_data.get('timestamp')
        if isinstance(recorded_at, (int, float)):
            health_data['last_updated'] = recorded_at
            health_data['data_age_seconds'] = max(0.0, now - recorded_at)

        # Update timestamp to reflect current time (data may be slightly stale)
        health_data['timestamp'] = now

        # Recalculate uptime if start_time is available
        start_time = self.db_manager.get_bot_start_time()
        if start_time:
            health_data['uptime_seconds'] = now - start_time

        return jsonify(health_data)
    except Exception as e:
        self.logger.error(f"Error getting system health: {e}")
        import traceback
        self.logger.debug(traceback.format_exc())
        return jsonify({
            'error': str(e),
            'status': 'error'
        }), 500
@self.app.route('/api/stats')
def api_stats():
"""Get comprehensive database statistics for dashboard"""