From 73492c8ff106af6b45896ce3e56ea2ffabc19f87 Mon Sep 17 00:00:00 2001
From: agessaman
Date: Thu, 1 Jan 2026 16:37:03 -0800
Subject: [PATCH] Add system health monitoring and reporting functionality

- Implemented periodic updates of system health status to the database every 30 seconds.
- Added a new method to aggregate and return comprehensive health status of all components, including core connection, database, services, and web viewer.
- Introduced an API endpoint at /api/system-health to retrieve system health data, providing a structured response for health status and component details.
- Enhanced error handling for health data storage and retrieval processes, ensuring robust logging and feedback in case of issues.
- The API keeps the measurement timestamp of the stored snapshot and reports its age, so stale data from a stopped bot is served as 'unknown' instead of 'healthy'.
---
 modules/core.py           | 116 ++++++++++++++++++++++++++++++++++++++
 modules/db_manager.py     |  21 +++++++
 modules/web_viewer/app.py |  55 ++++++++++++++++++
 3 files changed, 192 insertions(+)

diff --git a/modules/core.py b/modules/core.py
index f4a5e42..34c0863 100644
--- a/modules/core.py
+++ b/modules/core.py
@@ -887,6 +887,16 @@ use_zulu_time = false
                     except (AttributeError, TypeError) as e:
                         print(f"Web viewer health check failed: {e}")
 
+                # Periodically update system health in database (every 30 seconds).
+                # time.monotonic() keeps the interval immune to wall-clock jumps.
+                last_update = getattr(self, '_last_health_update', None)
+                if last_update is None or time.monotonic() - last_update >= 30:
+                    try:
+                        await self.get_system_health()  # This stores it in the database
+                        self._last_health_update = time.monotonic()
+                    except Exception as e:
+                        self.logger.debug(f"Error updating system health: {e}")
+
                 await asyncio.sleep(5)  # Check every 5 seconds
         except KeyboardInterrupt:
             self.logger.info("Received interrupt signal")
@@ -931,6 +941,112 @@ use_zulu_time = false
         except (AttributeError, TypeError):
             print("Bot stopped")
 
+    async def get_system_health(self) -> Dict[str, Any]:
+        """Aggregate health status from all components.
+
+        Each component entry carries a boolean ``healthy`` flag and a
+        ``message``. The overall ``status`` is ``healthy`` when no component
+        is unhealthy, ``degraded`` when some are and ``unhealthy`` when all
+        are. The result is also stored in the database (best effort) so the
+        web viewer can read it.
+
+        Returns:
+            Dictionary containing overall health status and component details
+        """
+        now = time.time()
+        health = {
+            'status': 'healthy',
+            'timestamp': now,
+            'uptime_seconds': now - self.start_time,
+            'components': {}
+        }
+
+        # Check core connection; bool() keeps the flag a real JSON boolean
+        # even if self.connected holds a non-bool falsy value such as None
+        core_connected = bool(self.connected and self.meshcore is not None)
+        health['components']['meshcore'] = {
+            'healthy': core_connected,
+            'message': 'Connected' if core_connected else 'Disconnected'
+        }
+
+        # Check database
+        try:
+            stats = self.db_manager.get_database_stats()
+            health['components']['database'] = {
+                'healthy': True,
+                'entries': stats.get('geocoding_cache_entries', 0) + stats.get('generic_cache_entries', 0),
+                'message': 'Operational'
+            }
+        except Exception as e:
+            health['components']['database'] = {
+                'healthy': False,
+                'error': str(e),
+                'message': f'Error: {str(e)}'
+            }
+
+        # Check services
+        if hasattr(self, 'services') and self.services:
+            for name, service in self.services.items():
+                try:
+                    # Services have is_running() method, not health_check()
+                    is_running = service.is_running() if hasattr(service, 'is_running') else False
+                    enabled = getattr(service, 'enabled', True)
+                    if is_running:
+                        message = 'Running'
+                    elif enabled:
+                        message = 'Stopped'
+                    else:
+                        message = 'Disabled'
+                    health['components'][f'service_{name}'] = {
+                        # A disabled service is expected to be stopped, so
+                        # it must not drag the overall status down
+                        'healthy': bool(is_running) or not enabled,
+                        'message': message,
+                        'enabled': enabled
+                    }
+                except Exception as e:
+                    health['components'][f'service_{name}'] = {
+                        'healthy': False,
+                        'error': str(e),
+                        'message': f'Error: {str(e)}'
+                    }
+
+        # Check web viewer if available
+        if hasattr(self, 'web_viewer_integration') and self.web_viewer_integration:
+            try:
+                is_healthy = self.web_viewer_integration.is_viewer_healthy() if hasattr(
+                    self.web_viewer_integration, 'is_viewer_healthy'
+                ) else True
+                health['components']['web_viewer'] = {
+                    'healthy': is_healthy,
+                    'message': 'Operational' if is_healthy else 'Unhealthy'
+                }
+            except Exception as e:
+                health['components']['web_viewer'] = {
+                    'healthy': False,
+                    'error': str(e),
+                    'message': f'Error: {str(e)}'
+                }
+
+        # Determine overall status
+        unhealthy = [
+            k for k, v in health['components'].items()
+            if not v.get('healthy', True)
+        ]
+        if unhealthy:
+            if len(unhealthy) < len(health['components']):
+                health['status'] = 'degraded'
+            else:
+                health['status'] = 'unhealthy'
+
+        # Store health data in database for web viewer access
+        try:
+            self.db_manager.set_system_health(health)
+        except Exception as e:
+            self.logger.debug(f"Could not store system health in database: {e}")
+
+        return health
+
     def _cleanup_web_viewer(self):
         """Cleanup web viewer on exit"""
         try:
diff --git a/modules/db_manager.py b/modules/db_manager.py
index 09f2092..d381063 100644
--- a/modules/db_manager.py
+++ b/modules/db_manager.py
@@ -499,3 +499,24 @@ class DBManager:
     def set_bot_start_time(self, start_time: float):
         """Set bot start time in metadata"""
         self.set_metadata('start_time', str(start_time))
+
+    def set_system_health(self, health_data: Dict[str, Any]):
+        """Store system health data in metadata as JSON (errors are logged, not raised)"""
+        try:
+            import json
+            health_json = json.dumps(health_data)
+            self.set_metadata('system_health', health_json)
+        except Exception as e:
+            self.logger.error(f"Error storing system health: {e}")
+
+    def get_system_health(self) -> Optional[Dict[str, Any]]:
+        """Get system health data from metadata, or None if absent or unreadable"""
+        try:
+            import json
+            health_json = self.get_metadata('system_health')
+            if health_json:
+                return json.loads(health_json)
+            return None
+        except Exception as e:
+            self.logger.error(f"Error getting system health: {e}")
+            return None
diff --git a/modules/web_viewer/app.py b/modules/web_viewer/app.py
index bcb8b7e..5ad4cb7 100644
--- a/modules/web_viewer/app.py
+++ b/modules/web_viewer/app.py
@@ -291,6 +291,55 @@ class BotDataViewer:
                 'version': 'modern_2.0'
             })
 
+        @self.app.route('/api/system-health')
+        def api_system_health():
+            """Get comprehensive system health status from database.
+
+            The stored snapshot keeps its own ``timestamp`` (when the bot
+            measured it); ``age_seconds`` and ``stale`` report how old it is.
+            A snapshot older than 90 seconds (three 30-second update
+            intervals) is served with status ``unknown`` so that a stopped
+            bot is never shown as healthy.
+            """
+            try:
+                # Read health data from database (consistent with how other data is accessed)
+                health_data = self.db_manager.get_system_health()
+                now = time.time()
+
+                if not health_data:
+                    # If no health data in database, return minimal status
+                    return jsonify({
+                        'status': 'unknown',
+                        'timestamp': now,
+                        'message': 'Health data not available yet',
+                        'components': {}
+                    })
+
+                # Keep the measurement timestamp and expose the snapshot's
+                # age; overwriting it with "now" would hide a dead bot
+                measured_at = health_data.get('timestamp')
+                if isinstance(measured_at, (int, float)):
+                    age_seconds = max(0.0, now - measured_at)
+                else:
+                    age_seconds = None
+                stale = age_seconds is None or age_seconds > 90
+                health_data['age_seconds'] = age_seconds
+                health_data['stale'] = stale
+
+                if stale:
+                    health_data['status'] = 'unknown'
+                    health_data['message'] = 'Health data is stale; bot may not be running'
+                else:
+                    # Recalculate uptime if start_time is available
+                    start_time = self.db_manager.get_bot_start_time()
+                    if start_time:
+                        health_data['uptime_seconds'] = now - start_time
+
+                return jsonify(health_data)
+
+            except Exception as e:
+                self.logger.error(f"Error getting system health: {e}")
+                import traceback
+                self.logger.debug(traceback.format_exc())
+                return jsonify({
+                    'error': str(e),
+                    'status': 'error'
+                }), 500
+
         @self.app.route('/api/stats')
         def api_stats():
             """Get comprehensive database statistics for dashboard"""