mirror of
https://github.com/agessaman/meshcore-bot.git
synced 2026-05-18 21:35:29 +00:00
Add system health monitoring and reporting functionality
- Implemented periodic updates of system health status to the database every 30 seconds.
- Added a new method to aggregate and return comprehensive health status of all components, including core connection, database, services, and web viewer.
- Introduced an API endpoint at /api/system-health to retrieve system health data, providing a structured response for health status and component details.
- Enhanced error handling for health data storage and retrieval processes, ensuring robust logging and feedback in case of issues.
This commit is contained in:
@@ -887,6 +887,16 @@ use_zulu_time = false
|
||||
except (AttributeError, TypeError) as e:
|
||||
print(f"Web viewer health check failed: {e}")
|
||||
|
||||
# Periodically update system health in database (every 30 seconds)
|
||||
if not hasattr(self, '_last_health_update'):
|
||||
self._last_health_update = 0
|
||||
if time.time() - self._last_health_update >= 30:
|
||||
try:
|
||||
await self.get_system_health() # This stores it in the database
|
||||
self._last_health_update = time.time()
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Error updating system health: {e}")
|
||||
|
||||
await asyncio.sleep(5) # Check every 5 seconds
|
||||
except KeyboardInterrupt:
|
||||
self.logger.info("Received interrupt signal")
|
||||
@@ -931,6 +941,94 @@ use_zulu_time = false
|
||||
except (AttributeError, TypeError):
|
||||
print("Bot stopped")
|
||||
|
||||
async def get_system_health(self) -> Dict[str, Any]:
    """Aggregate health status from all components.

    Checks the core connection, the database, every entry in
    ``self.services`` and (when present) the web viewer integration,
    derives an overall status, and hands the snapshot to
    ``self.db_manager.set_system_health`` so the web viewer can read it.

    Returns:
        Dictionary with the keys ``status`` (``'healthy'``, ``'degraded'``
        when only some components are unhealthy, ``'unhealthy'`` when all
        of them are), ``timestamp``, ``uptime_seconds`` and ``components``.
        Every component entry carries a boolean ``healthy`` flag and a
        ``message``; failed checks additionally carry ``error``.
    """

    def failed(exc: Exception) -> Dict[str, Any]:
        # Uniform entry for a component whose check raised.
        return {
            'healthy': False,
            'error': str(exc),
            'message': f'Error: {exc}',
        }

    # Read the clock once so timestamp and uptime describe the same instant.
    now = time.time()
    components: Dict[str, Any] = {}
    health: Dict[str, Any] = {
        'status': 'healthy',
        'timestamp': now,
        'uptime_seconds': now - self.start_time,
        'components': components,
    }

    # Check core connection. bool() keeps the flag a real boolean even when
    # self.connected is None or another non-bool value.
    link_up = bool(self.connected and self.meshcore is not None)
    components['meshcore'] = {
        'healthy': link_up,
        'message': 'Connected' if link_up else 'Disconnected',
    }

    # Check database
    try:
        stats = self.db_manager.get_database_stats()
        components['database'] = {
            'healthy': True,
            'entries': (
                stats.get('geocoding_cache_entries', 0)
                + stats.get('generic_cache_entries', 0)
            ),
            'message': 'Operational',
        }
    except Exception as e:
        components['database'] = failed(e)

    # Check services
    for name, service in (getattr(self, 'services', None) or {}).items():
        try:
            # Services have is_running() method, not health_check(); a
            # service without it is reported as stopped.
            is_running = (
                bool(service.is_running())
                if hasattr(service, 'is_running')
                else False
            )
            components[f'service_{name}'] = {
                'healthy': is_running,
                'message': 'Running' if is_running else 'Stopped',
                'enabled': getattr(service, 'enabled', True),
            }
        except Exception as e:
            components[f'service_{name}'] = failed(e)

    # Check web viewer if available
    viewer = getattr(self, 'web_viewer_integration', None)
    if viewer:
        try:
            # An integration without is_viewer_healthy() counts as healthy.
            is_healthy = (
                bool(viewer.is_viewer_healthy())
                if hasattr(viewer, 'is_viewer_healthy')
                else True
            )
            components['web_viewer'] = {
                'healthy': is_healthy,
                'message': 'Operational' if is_healthy else 'Unhealthy',
            }
        except Exception as e:
            components['web_viewer'] = failed(e)

    # Determine overall status
    unhealthy = [
        name for name, info in components.items()
        if not info.get('healthy', True)
    ]
    if unhealthy:
        health['status'] = (
            'degraded' if len(unhealthy) < len(components) else 'unhealthy'
        )

    # Store health data in database for web viewer access. Best effort: a
    # storage failure must not prevent returning the snapshot.
    try:
        self.db_manager.set_system_health(health)
    except Exception as e:
        self.logger.debug(f"Could not store system health in database: {e}")

    return health
|
||||
|
||||
def _cleanup_web_viewer(self):
|
||||
"""Cleanup web viewer on exit"""
|
||||
try:
|
||||
|
||||
@@ -499,3 +499,24 @@ class DBManager:
|
||||
def set_bot_start_time(self, start_time: float):
    """Record the bot start time, as a string, under the 'start_time' metadata key."""
    start_text = str(start_time)
    self.set_metadata('start_time', start_text)
|
||||
|
||||
def set_system_health(self, health_data: Dict[str, Any]):
    """Persist a health snapshot under the 'system_health' metadata key.

    The snapshot is serialized to JSON before it is handed to
    ``set_metadata``. Any failure while serializing or storing is logged at
    error level and not re-raised.
    """
    try:
        import json

        self.set_metadata('system_health', json.dumps(health_data))
    except Exception as exc:
        self.logger.error(f"Error storing system health: {exc}")
|
||||
|
||||
def get_system_health(self) -> Optional[Dict[str, Any]]:
    """Get system health data from metadata.

    Reads the 'system_health' metadata value and decodes it from JSON.

    Returns:
        The decoded health dictionary, or None when nothing is stored, when
        the stored value is not a JSON object, or when reading/decoding
        fails (failures are logged at error level).
    """
    try:
        import json

        health_json = self.get_metadata('system_health')
        if not health_json:
            return None
        health = json.loads(health_json)
    except Exception as e:
        self.logger.error(f"Error getting system health: {e}")
        return None

    # json.loads also accepts lists, strings and numbers; callers index the
    # result by key, so anything but an object is treated as unusable.
    if not isinstance(health, dict):
        self.logger.error(
            "Error getting system health: expected a JSON object, "
            f"got {type(health).__name__}"
        )
        return None
    return health
|
||||
@@ -291,6 +291,41 @@ class BotDataViewer:
|
||||
'version': 'modern_2.0'
|
||||
})
|
||||
|
||||
@self.app.route('/api/system-health')
def api_system_health():
    """Get comprehensive system health status from database.

    Returns the health snapshot read through
    ``self.db_manager.get_system_health()`` as JSON. ``timestamp`` is set to
    the current time and ``uptime_seconds`` is recomputed when a start time
    is available. The time stored with the snapshot is kept as
    ``last_updated`` and its age as ``age_seconds``, so clients can detect
    stale data. When no snapshot exists a minimal ``'unknown'`` status is
    returned; on any exception the response is a JSON error with HTTP 500.
    """
    try:
        # Read health data from database (consistent with how other data is accessed)
        health_data = self.db_manager.get_system_health()
        now = time.time()

        if not health_data:
            # If no health data in database, return minimal status
            return jsonify({
                'status': 'unknown',
                'timestamp': now,
                'message': 'Health data not available yet',
                'components': {}
            })

        # 'timestamp' is overwritten with the current time below, so keep
        # the stored value separately; otherwise a client cannot tell a
        # fresh snapshot from one left behind by a writer that stopped.
        recorded_at = health_data.get('timestamp')
        if isinstance(recorded_at, (int, float)) and not isinstance(recorded_at, bool):
            health_data['last_updated'] = recorded_at
            health_data['age_seconds'] = max(0.0, now - recorded_at)

        # Update timestamp to reflect current time (data may be slightly stale)
        health_data['timestamp'] = now

        # Recalculate uptime if start_time is available
        start_time = self.db_manager.get_bot_start_time()
        if start_time:
            health_data['uptime_seconds'] = now - start_time

        return jsonify(health_data)

    except Exception as e:
        self.logger.error(f"Error getting system health: {e}")
        import traceback
        self.logger.debug(traceback.format_exc())
        return jsonify({
            'error': str(e),
            'status': 'error'
        }), 500
|
||||
|
||||
@self.app.route('/api/stats')
|
||||
def api_stats():
|
||||
"""Get comprehensive database statistics for dashboard"""
|
||||
|
||||
Reference in New Issue
Block a user