diff --git a/coordinator/coordinator.lua b/coordinator/coordinator.lua index 26009ff..c12f9fb 100644 --- a/coordinator/coordinator.lua +++ b/coordinator/coordinator.lua @@ -22,6 +22,8 @@ local SCADA_CRDN_TYPE = comms.SCADA_CRDN_TYPE local UNIT_COMMAND = comms.UNIT_COMMAND local FAC_COMMAND = comms.FAC_COMMAND +local LINK_TIMEOUT = 60.0 + local coordinator = {} -- request the user to select a monitor @@ -227,9 +229,12 @@ function coordinator.comms(version, nic, crd_channel, svr_channel, pkt_channel, sv_seq_num = 0, sv_r_seq_num = nil, sv_config_err = false, - connected = false, last_est_ack = ESTABLISH_ACK.ALLOW, - last_api_est_acks = {} + last_api_est_acks = {}, + est_start = 0, + est_last = 0, + est_tick_waiting = nil, + est_task_done = nil } comms.set_trusted_range(range) @@ -295,6 +300,63 @@ function coordinator.comms(version, nic, crd_channel, svr_channel, pkt_channel, ---@class coord_comms local public = {} + -- try to connect to the supervisor if not already linked + ---@param abort boolean? true to print out cancel info if not linked (use on program terminate) + ---@return boolean ok, boolean start_ui + function public.try_connect(abort) + local ok = true + local start_ui = false + + if not self.sv_linked then + if self.est_tick_waiting == nil then + self.est_start = util.time_s() + self.est_last = self.est_start + + self.est_tick_waiting, self.est_task_done = + coordinator.log_comms_connecting("attempting to connect to configured supervisor on channel " .. 
svr_channel) + + _send_establish() + else + self.est_tick_waiting(math.max(0, LINK_TIMEOUT - (util.time_s() - self.est_start))) + end + + if abort or (util.time_s() - self.est_start) >= LINK_TIMEOUT then + self.est_task_done(false) + + if abort then + coordinator.log_comms("supervisor connection attempt cancelled by user") + elseif self.sv_config_err then + coordinator.log_comms("supervisor cooling configuration invalid, check supervisor config file") + elseif not self.sv_linked then + if self.last_est_ack == ESTABLISH_ACK.DENY then + coordinator.log_comms("supervisor connection attempt denied") + elseif self.last_est_ack == ESTABLISH_ACK.COLLISION then + coordinator.log_comms("supervisor connection failed due to collision") + elseif self.last_est_ack == ESTABLISH_ACK.BAD_VERSION then + coordinator.log_comms("supervisor connection failed due to version mismatch") + else + coordinator.log_comms("supervisor connection failed with no valid response") + end + end + + ok = false + elseif self.sv_config_err then + coordinator.log_comms("supervisor cooling configuration invalid, check supervisor config file") + ok = false + elseif (util.time_s() - self.est_last) > 1.0 then + _send_establish() + self.est_last = util.time_s() + end + elseif self.est_tick_waiting ~= nil then + self.est_task_done(true) + self.est_tick_waiting = nil + self.est_task_done = nil + start_ui = true + end + + return ok, start_ui + end + -- close the connection to the server function public.close() sv_watchdog.cancel() @@ -305,64 +367,6 @@ function coordinator.comms(version, nic, crd_channel, svr_channel, pkt_channel, _send_sv(PROTOCOL.SCADA_MGMT, SCADA_MGMT_TYPE.CLOSE, {}) end - -- attempt to connect to the subervisor - ---@nodiscard - ---@param timeout_s number timeout in seconds - ---@param tick_dmesg_waiting function callback to tick dmesg waiting - ---@param task_done function callback to show done on dmesg - ---@return boolean sv_linked true if connected, false otherwise - --- EVENT_CONSUMER: 
this function consumes events - function public.sv_connect(timeout_s, tick_dmesg_waiting, task_done) - local clock = util.new_clock(1) - local start = util.time_s() - local terminated = false - - _send_establish() - - clock.start() - - while (util.time_s() - start) < timeout_s and (not self.sv_linked) and (not self.sv_config_err) do - local event, p1, p2, p3, p4, p5 = util.pull_event() - - if event == "timer" and clock.is_clock(p1) then - -- timed out attempt, try again - tick_dmesg_waiting(math.max(0, timeout_s - (util.time_s() - start))) - _send_establish() - clock.start() - elseif event == "timer" then - -- keep checking watchdog timers - apisessions.check_all_watchdogs(p1) - elseif event == "modem_message" then - -- handle message - local packet = public.parse_packet(p1, p2, p3, p4, p5) - public.handle_packet(packet) - elseif event == "terminate" then - terminated = true - break - end - end - - task_done(self.sv_linked) - - if terminated then - coordinator.log_comms("supervisor connection attempt cancelled by user") - elseif self.sv_config_err then - coordinator.log_comms("supervisor cooling configuration invalid, check supervisor config file") - elseif not self.sv_linked then - if self.last_est_ack == ESTABLISH_ACK.DENY then - coordinator.log_comms("supervisor connection attempt denied") - elseif self.last_est_ack == ESTABLISH_ACK.COLLISION then - coordinator.log_comms("supervisor connection failed due to collision") - elseif self.last_est_ack == ESTABLISH_ACK.BAD_VERSION then - coordinator.log_comms("supervisor connection failed due to version mismatch") - else - coordinator.log_comms("supervisor connection failed with no valid response") - end - end - - return self.sv_linked - end - -- send a facility command ---@param cmd FAC_COMMAND command ---@param option any? 
optional option options for the optional options (like waste mode) @@ -426,7 +430,10 @@ function coordinator.comms(version, nic, crd_channel, svr_channel, pkt_channel, -- handle a packet ---@param packet mgmt_frame|crdn_frame|capi_frame|nil + ---@return boolean close_ui function public.handle_packet(packet) + local was_linked = self.sv_linked + if packet ~= nil then local l_chan = packet.scada_frame.local_channel() local r_chan = packet.scada_frame.remote_channel() @@ -436,7 +443,9 @@ function coordinator.comms(version, nic, crd_channel, svr_channel, pkt_channel, if l_chan ~= crd_channel then log.debug("received packet on unconfigured channel " .. l_chan, true) elseif r_chan == pkt_channel then - if protocol == PROTOCOL.COORD_API then + if not self.sv_linked then + log.debug("discarding pocket API packet before linked to supervisor") + elseif protocol == PROTOCOL.COORD_API then ---@cast packet capi_frame -- look for an associated session local session = apisessions.find_session(src_addr) @@ -497,12 +506,12 @@ function coordinator.comms(version, nic, crd_channel, svr_channel, pkt_channel, -- check sequence number if self.sv_r_seq_num == nil then self.sv_r_seq_num = packet.scada_frame.seq_num() - elseif self.connected and ((self.sv_r_seq_num + 1) ~= packet.scada_frame.seq_num()) then + elseif self.sv_linked and ((self.sv_r_seq_num + 1) ~= packet.scada_frame.seq_num()) then log.warning("sequence out-of-order: last = " .. self.sv_r_seq_num .. ", new = " .. packet.scada_frame.seq_num()) - return + return false elseif self.sv_linked and src_addr ~= self.sv_addr then log.debug("received packet from unknown computer " .. src_addr .. 
" while linked; channel in use by another system?") - return + return false else self.sv_r_seq_num = packet.scada_frame.seq_num() end @@ -632,7 +641,37 @@ function coordinator.comms(version, nic, crd_channel, svr_channel, pkt_channel, end elseif protocol == PROTOCOL.SCADA_MGMT then ---@cast packet mgmt_frame - if packet.type == SCADA_MGMT_TYPE.ESTABLISH then + if self.sv_linked then + if packet.type == SCADA_MGMT_TYPE.KEEP_ALIVE then + -- keep alive request received, echo back + if packet.length == 1 then + local timestamp = packet.data[1] + local trip_time = util.time() - timestamp + + if trip_time > 750 then + log.warning("coordinator KEEP_ALIVE trip time > 750ms (" .. trip_time .. "ms)") + end + + -- log.debug("coordinator RTT = " .. trip_time .. "ms") + + iocontrol.get_db().facility.ps.publish("sv_ping", trip_time) + + _send_keep_alive_ack(timestamp) + else + log.debug("SCADA keep alive packet length mismatch") + end + elseif packet.type == SCADA_MGMT_TYPE.CLOSE then + -- handle session close + sv_watchdog.cancel() + self.sv_addr = comms.BROADCAST + self.sv_linked = false + self.sv_r_seq_num = nil + iocontrol.fp_link_state(types.PANEL_LINK_STATE.DISCONNECTED) + log.info("server connection closed by remote host") + else + log.debug("received unknown SCADA_MGMT packet type " .. 
packet.type) + end + elseif packet.type == SCADA_MGMT_TYPE.ESTABLISH then -- connection with supervisor established if packet.length == 2 then local est_ack = packet.data[1] @@ -662,6 +701,7 @@ function coordinator.comms(version, nic, crd_channel, svr_channel, pkt_channel, self.sv_addr = src_addr self.sv_linked = true + self.sv_r_seq_num = nil self.sv_config_err = false iocontrol.fp_link_state(types.PANEL_LINK_STATE.LINKED) @@ -703,36 +743,6 @@ function coordinator.comms(version, nic, crd_channel, svr_channel, pkt_channel, else log.debug("SCADA_MGMT establish packet length mismatch") end - elseif self.sv_linked then - if packet.type == SCADA_MGMT_TYPE.KEEP_ALIVE then - -- keep alive request received, echo back - if packet.length == 1 then - local timestamp = packet.data[1] - local trip_time = util.time() - timestamp - - if trip_time > 750 then - log.warning("coordinator KEEP_ALIVE trip time > 750ms (" .. trip_time .. "ms)") - end - - -- log.debug("coordinator RTT = " .. trip_time .. "ms") - - iocontrol.get_db().facility.ps.publish("sv_ping", trip_time) - - _send_keep_alive_ack(timestamp) - else - log.debug("SCADA keep alive packet length mismatch") - end - elseif packet.type == SCADA_MGMT_TYPE.CLOSE then - -- handle session close - sv_watchdog.cancel() - self.sv_addr = comms.BROADCAST - self.sv_linked = false - self.sv_r_seq_num = nil - iocontrol.fp_link_state(types.PANEL_LINK_STATE.DISCONNECTED) - log.info("server connection closed by remote host") - else - log.debug("received unknown SCADA_MGMT packet type " .. packet.type) - end else log.debug("discarding non-link SCADA_MGMT packet before linked") end @@ -743,6 +753,8 @@ function coordinator.comms(version, nic, crd_channel, svr_channel, pkt_channel, log.debug("received packet for unknown channel " .. 
r_chan, true) end end + + return was_linked and not self.sv_linked end -- check if the coordinator is still linked to the supervisor diff --git a/coordinator/startup.lua b/coordinator/startup.lua index 6a97896..1045549 100644 --- a/coordinator/startup.lua +++ b/coordinator/startup.lua @@ -22,7 +22,7 @@ local sounder = require("coordinator.sounder") local apisessions = require("coordinator.session.apisessions") -local COORDINATOR_VERSION = "v0.19.0" +local COORDINATOR_VERSION = "v0.19.1" local println = util.println local println_ts = util.println_ts @@ -31,7 +31,6 @@ local log_graphics = coordinator.log_graphics local log_sys = coordinator.log_sys local log_boot = coordinator.log_boot local log_comms = coordinator.log_comms -local log_comms_connecting = coordinator.log_comms_connecting local log_crypto = coordinator.log_crypto ---------------------------------------- @@ -173,7 +172,7 @@ local function main() local loop_clock = util.new_clock(MAIN_CLOCK) ---------------------------------------- - -- start front panel + -- start front panel & UI start function ---------------------------------------- log_graphics("starting front panel UI...") @@ -187,39 +186,9 @@ local function main() return else log_graphics("front panel ready") end - ---------------------------------------- - -- connect to the supervisor - ---------------------------------------- - - -- attempt to connect to the supervisor or exit - local function init_connect_sv() - local tick_waiting, task_done = log_comms_connecting("attempting to connect to configured supervisor on channel " .. 
config.SVR_CHANNEL) - - -- attempt to establish a connection with the supervisory computer - if not coord_comms.sv_connect(60, tick_waiting, task_done) then - log_sys("supervisor connection failed, shutting down...") - log.fatal("failed to connect to supervisor") - return false - end - - return true - end - - if not init_connect_sv() then - println("startup> failed to connect to supervisor") - log_sys("system shutdown") - return - else - log_sys("supervisor connected, proceeding to UI start") - end - - ---------------------------------------- - -- start the UI - ---------------------------------------- - - -- start up the UI + -- start up the main UI ---@return boolean ui_ok started ok - local function init_start_ui() + local function start_main_ui() log_graphics("starting main UI...") local draw_start = util.time_ms() @@ -228,36 +197,29 @@ local function main() if not ui_ok then renderer.close_ui() log_graphics(util.c("main UI error: ", ui_message)) - println_ts("main UI creation failed") log.fatal(util.c("main GUI render failed with error ", ui_message)) else - log_graphics("first main UI draw took " .. (util.time_ms() - draw_start) .. "ms") - - -- start clock - loop_clock.start() + log_graphics("main UI draw took " .. (util.time_ms() - draw_start) .. 
"ms") end return ui_ok end - local ui_ok = init_start_ui() - ---------------------------------------- -- main event loop ---------------------------------------- + local link_failed = false + local ui_ok = true local date_format = util.trinary(config.TIME_24_HOUR, "%X \x04 %A, %B %d %Y", "%r \x04 %A, %B %d %Y") - if ui_ok then - -- start connection watchdog - conn_watchdog.feed() - log.debug("startup> conn watchdog started") + -- start clock + loop_clock.start() - log_sys("system started successfully") - end + log_sys("system started successfully") -- main event loop - while ui_ok do + while true do local event, param1, param2, param3, param4, param5 = util.pull_event() -- handle event @@ -271,7 +233,6 @@ local function main() if nic.is_modem(device) then nic.disconnect() log_sys("comms modem disconnected") - println_ts("wireless modem disconnected!") -- close out UI renderer.close_ui() @@ -287,18 +248,13 @@ local function main() if renderer.is_monitor_used(device) then ---@todo will be handled properly in #249 -- "halt and catch fire" style handling - local msg = "lost a configured monitor, system will now exit" - println_ts(msg) - log_sys(msg) + log_sys("lost a configured monitor, system will now exit") break else log_sys("lost unused monitor, ignoring") end elseif type == "speaker" then - local msg = "lost alarm sounder speaker" - println_ts(msg) - log_sys(msg) - + log_sys("lost alarm sounder speaker") iocontrol.fp_has_speaker(false) end end @@ -309,15 +265,8 @@ local function main() if type == "modem" then if device.isWireless() then -- reconnected modem - nic.connect(device) - log_sys("comms modem reconnected") - println_ts("wireless modem reconnected.") - - -- re-init system - if not init_connect_sv() then break end - ui_ok = init_start_ui() - + nic.connect(device) iocontrol.fp_has_modem(true) else log_sys("wired modem reconnected") @@ -326,10 +275,7 @@ local function main() ---@todo will be handled properly in #249 -- not supported, system will exit on loss 
of in-use monitors elseif type == "speaker" then - local msg = "alarm sounder speaker reconnected" - println_ts(msg) - log_sys(msg) - + log_sys("alarm sounder speaker reconnected") sounder.reconnect(device) iocontrol.fp_has_speaker(true) end @@ -337,8 +283,25 @@ local function main() elseif event == "timer" then if loop_clock.is_clock(param1) then -- main loop tick + + -- toggle heartbeat iocontrol.heartbeat() + -- maintain connection + if nic.connected() then + local ok, start_ui = coord_comms.try_connect() + if not ok then + link_failed = true + log_sys("supervisor connection failed, shutting down...") + log.fatal("failed to connect to supervisor") + break + elseif start_ui then + log_sys("supervisor connected, proceeding to main UI start") + ui_ok = start_main_ui() + if not ui_ok then break end + end + end + -- iterate sessions apisessions.iterate_all() @@ -346,25 +309,19 @@ local function main() apisessions.free_all_closed() -- update date and time string for main display - iocontrol.get_db().facility.ps.publish("date_time", os.date(date_format)) + if coord_comms.is_linked() then + iocontrol.get_db().facility.ps.publish("date_time", os.date(date_format)) + end loop_clock.start() elseif conn_watchdog.is_timer(param1) then -- supervisor watchdog timeout - local msg = "supervisor server timeout" - log_comms(msg) - println_ts(msg) + log_comms("supervisor server timeout") - -- close connection, UI, and stop sounder + -- close connection, main UI, and stop sounder coord_comms.close() renderer.close_ui() sounder.stop() - - if nic.connected() then - -- try to re-connect to the supervisor - if not init_connect_sv() then break end - ui_ok = init_start_ui() - end else -- a non-clock/main watchdog timer event @@ -377,22 +334,15 @@ local function main() elseif event == "modem_message" then -- got a packet local packet = coord_comms.parse_packet(param1, param2, param3, param4, param5) - coord_comms.handle_packet(packet) - -- check if it was a disconnect - if not 
coord_comms.is_linked() then + -- handle then check if it was a disconnect + if coord_comms.handle_packet(packet) then log_comms("supervisor closed connection") -- close connection, UI, and stop sounder coord_comms.close() renderer.close_ui() sounder.stop() - - if nic.connected() then - -- try to re-connect to the supervisor - if not init_connect_sv() then break end - ui_ok = init_start_ui() - end end elseif event == "monitor_touch" or event == "mouse_click" or event == "mouse_up" or event == "mouse_drag" or event == "mouse_scroll" then @@ -405,10 +355,17 @@ local function main() -- check for termination request if event == "terminate" or ppm.should_terminate() then - println_ts("terminate requested, closing connections...") - log_comms("terminate requested, closing supervisor connection...") + -- handle supervisor connection; try_connect returns ok (false when the link attempt failed or was cancelled), so invert it + link_failed = not coord_comms.try_connect(true) + + if coord_comms.is_linked() then + log_comms("terminate requested, closing supervisor connection...") + else link_failed = true end + coord_comms.close() log_comms("supervisor connection closed") + + -- handle API sessions log_comms("closing api sessions...") apisessions.close_all() log_comms("api sessions closed") @@ -421,6 +378,9 @@ local function main() sounder.stop() log_sys("system shutdown") + if link_failed then println_ts("failed to connect to supervisor") end + if not ui_ok then println_ts("main UI creation failed") end + println_ts("exited") log.info("exited") end