Add crash detection

This commit is contained in:
Andrew 2022-01-27 20:43:23 -05:00
parent 4a2a1ab451
commit 005ac1216e
5 changed files with 55 additions and 15 deletions

View File

@ -146,6 +146,10 @@ class Servers_Controller:
return True
return False
@staticmethod
def is_crashed(server_id):
    """Report whether the server identified by ``server_id`` is flagged as crashed.

    Thin pass-through: the answer comes from ``servers_helper.is_crashed``
    and is returned unchanged.
    """
    crashed = servers_helper.is_crashed(server_id)
    return crashed
@staticmethod
def server_id_authorized_api_key(server_id: str, api_key: ApiKeys) -> bool:
# TODO

View File

@ -73,6 +73,7 @@ class Server_Stats(Model):
updating = BooleanField(default=False)
waiting_start = BooleanField(default=False)
first_run = BooleanField(default=True)
crashed = BooleanField(default=False)
class Meta:
@ -178,6 +179,25 @@ class helper_servers:
return False
return True
@staticmethod
def sever_crashed(server_id):
    """Set ``crashed`` to True on the ``Server_Stats`` row(s) matching ``server_id``.

    NOTE: the method name is spelled ``sever_crashed`` (sic); it is kept
    as-is because it is part of the helper's public interface.
    """
    # Build the UPDATE first; it only touches the database on execute().
    mark_crashed = Server_Stats.update(crashed=True).where(Server_Stats.server_id == server_id)
    with database.atomic():
        mark_crashed.execute()
@staticmethod
def server_crash_reset(server_id):
    """Set ``crashed`` back to False on the ``Server_Stats`` row(s) matching ``server_id``."""
    # Build the UPDATE first; it only touches the database on execute().
    clear_crashed = Server_Stats.update(crashed=False).where(Server_Stats.server_id == server_id)
    with database.atomic():
        clear_crashed.execute()
@staticmethod
def is_crashed(server_id):
    """Return True when the ``Server_Stats`` row for ``server_id`` has ``crashed`` set.

    The row is fetched with ``.get()``; a missing row is not handled here
    (presumably peewee raises ``DoesNotExist`` - verify against callers).
    """
    svr = Server_Stats.select().where(Server_Stats.server_id == server_id).get()
    # Truthiness of the stored flag replaces the `== True` comparison and the
    # if/else that returned literal booleans, so no pylint suppression is needed.
    return bool(svr.crashed)
@staticmethod
def set_update(server_id, value):
try:
@ -190,8 +210,8 @@ class helper_servers:
@staticmethod
def get_update_status(server_id):
    """Return the ``updating`` flag from the ``Server_Stats`` row for ``server_id``.

    The row is fetched with ``.get()``; a missing row is not handled here.
    """
    # Read `updating`, not `waiting_start`: this accessor reports update status.
    update_status = Server_Stats.select().where(Server_Stats.server_id == server_id).get()
    return update_status.updating
@staticmethod
def set_first_run(server_id):

View File

@ -115,8 +115,12 @@ class Server:
self.stats = stats
tz = get_localzone()
self.server_scheduler = BackgroundScheduler(timezone=str(tz))
self.server_scheduler.start()
self.backup_thread = threading.Thread(target=self.a_backup_server, daemon=True, name=f"backup_{self.name}")
self.is_backingup = False
#Reset crash and update at initialization
servers_helper.server_crash_reset(self.server_id)
servers_helper.set_update(self.server_id, False)
def reload_server_settings(self):
server_data = servers_helper.get_server_data_by_id(self.server_id)
@ -141,7 +145,6 @@ class Server:
console.info(f"Scheduling server {self.name} to start in {delay} seconds")
self.server_scheduler.add_job(self.run_scheduled_server, 'interval', seconds=delay, id=str(self.server_id))
self.server_scheduler.start()
def run_scheduled_server(self):
console.info(f"Starting server ID: {self.server_id} - {self.name}")
@ -262,6 +265,7 @@ class Server:
threading.Thread(target=out_buf.check, daemon=True, name=f'{self.server_id}_virtual_terminal').start()
self.is_crashed = False
servers_helper.server_crash_reset(self.server_id)
self.start_time = str(datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'))
@ -269,6 +273,7 @@ class Server:
logger.info(f"Server {self.name} running with PID {self.process.pid}")
console.info(f"Server {self.name} running with PID {self.process.pid}")
self.is_crashed = False
servers_helper.server_crash_reset(self.server_id)
self.stats.record_stats()
check_internet_thread = threading.Thread(
target=self.check_internet_thread, daemon=True, args=(user_id, user_lang, ), name=f"{self.name}_Internet")
@ -299,7 +304,7 @@ class Server:
logger.info(f"Server {self.name} has crash detection enabled - starting watcher task")
console.info(f"Server {self.name} has crash detection enabled - starting watcher task")
self.server_scheduler.add_job(self.detect_crash, 'interval', seconds=30, id="c_{self.server_id}")
self.server_scheduler.add_job(self.detect_crash, 'interval', seconds=30, id=f"c_{self.server_id}")
def check_internet_thread(self, user_id, user_lang):
if user_id:
@ -317,6 +322,9 @@ class Server:
def stop_server(self):
if self.settings['stop_command']:
self.send_command(self.settings['stop_command'])
#remove crash detection watcher
logger.info(f"Removing crash watcher for server {self.name}")
self.server_scheduler.remove_job('c_' + str(self.server_id))
else:
#windows will need to be handled separately for Ctrl+C
self.process.terminate()
@ -398,8 +406,9 @@ class Server:
def crash_detected(self, name):
print("crash detected")
# clear the old scheduled watcher task
self.remove_watcher_thread()
self.server_scheduler.remove_job("c_"+str(self.server_id))
# the server crashed, or isn't found - so let's reset things.
logger.warning(f"The server {name} seems to have vanished unexpectedly, did it crash?")
@ -449,6 +458,7 @@ class Server:
if running:
return
servers_helper.sever_crashed(self.server_id)
# if we haven't tried to restart more than 3 times
if self.restart_count <= 3:
@ -464,12 +474,12 @@ class Server:
logger.critical(f"Server {self.name} has been restarted {self.restart_count} times. It has crashed, not restarting.")
console.critical(f"Server {self.name} has been restarted {self.restart_count} times. It has crashed, not restarting.")
# set to 99 restart attempts so this elif is skipped next time. (no double logging)
self.restart_count = 99
self.restart_count = 0
self.is_crashed = True
servers_helper.sever_crashed(self.server_id)
# cancel the watcher task
self.remove_watcher_thread()
self.server_scheduler.remove_job("c_"+str(self.server_id))
def remove_watcher_thread(self):
logger.info("Removing old crash detection watcher thread")

View File

@ -16,6 +16,7 @@ from app.classes.minecraft.serverjars import server_jar_obj
from app.classes.models.management import management_helper
from app.classes.controllers.users_controller import Users_Controller
from app.classes.controllers.servers_controller import Servers_Controller
from app.classes.models.servers import servers_helper
logger = logging.getLogger('apscheduler')
@ -454,6 +455,7 @@ class TasksManager:
'players': srv['raw_ping_result'].get('players'),
'desc': srv['raw_ping_result'].get('desc'),
'version': srv['raw_ping_result'].get('version'),
'crashed': servers_helper.is_crashed(server_id),
})
if len(websocket_helper.clients) > 0:
websocket_helper.broadcast_user_page_params(
@ -478,6 +480,7 @@ class TasksManager:
'players': srv['raw_ping_result'].get('players'),
'desc': srv['raw_ping_result'].get('desc'),
'version': srv['raw_ping_result'].get('version'),
'crashed': servers_helper.is_crashed(server_id),
}
)
total_players += int(srv['raw_ping_result'].get('online'))

View File

@ -295,13 +295,15 @@ class PanelHandler(BaseHandler):
list(filter(lambda x: x['stats']['running'], page_data['servers'])))
page_data['server_stats']['stopped'] = len(page_data['servers']) - page_data['server_stats']['running']
for data in page_data['servers']:
try:
data['stats']['waiting_start'] = self.controller.servers.get_waiting_start(
str(data['stats']['server_id']['server_id']))
except Exception as e:
logger.error(f"Failed to get server waiting to start: {e}")
data['stats']['waiting_start'] = False
for data in page_data['servers']:
data['stats']['crashed'] = self.controller.servers.is_crashed(
str(data['stats']['server_id']['server_id']))
try:
data['stats']['waiting_start'] = self.controller.servers.get_waiting_start(
str(data['stats']['server_id']['server_id']))
except Exception as e:
logger.error(f"Failed to get server waiting to start: {e}")
data['stats']['waiting_start'] = False
try:
self.fetch_server_data(page_data)
@ -348,6 +350,7 @@ class PanelHandler(BaseHandler):
'Players': Enum_Permissions_Server.Players,
}
page_data['user_permissions'] = self.controller.server_perms.get_user_id_permissions_list(exec_user["user_id"], server_id)
page_data['server_stats']['crashed'] = self.controller.servers.is_crashed(server_id)
if subpage == 'term':
if not page_data['permissions']['Terminal'] in page_data['user_permissions']: