From 88b191dbbb6a98964458d1cfac22a764e405329d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20M=C3=A4rtens?= Date: Fri, 26 Mar 2021 12:41:22 +0100 Subject: [PATCH 1/3] track state tick in a historgram too --- server/src/lib.rs | 13 +++++-------- server/src/metrics.rs | 29 +++++++++++++++++++++++++++-- server/src/sys/metrics.rs | 6 ++++++ 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/server/src/lib.rs b/server/src/lib.rs index e4ca587d56..c9fc9484a4 100644 --- a/server/src/lib.rs +++ b/server/src/lib.rs @@ -704,14 +704,11 @@ impl Server { .set((before_persistence_updates - before_entity_cleanup).as_nanos() as i64); tt.with_label_values(&["persistence_updates"]) .set((end_of_server_tick - before_persistence_updates).as_nanos() as i64); - } - - #[cfg(feature = "tracy")] - { - use common_base::tracy_client::{create_plot, Plot}; - let entity_count = self.state.ecs().entities().join().count(); - static ENTITY_COUNT: Plot = create_plot!("entity count"); - ENTITY_COUNT.point(entity_count as f64); + tick_metrics.tick_time_hist.observe( + end_of_server_tick + .duration_since(before_state_tick) + .as_secs_f64(), + ); } // 9) Finish the tick, pass control back to the frontend. diff --git a/server/src/metrics.rs b/server/src/metrics.rs index 8e9a0ffe0f..f8034d78d4 100644 --- a/server/src/metrics.rs +++ b/server/src/metrics.rs @@ -1,6 +1,6 @@ use prometheus::{ - Gauge, GaugeVec, HistogramOpts, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, - Opts, Registry, + Gauge, GaugeVec, Histogram, HistogramOpts, HistogramVec, IntCounter, IntCounterVec, IntGauge, + IntGaugeVec, Opts, Registry, }; use std::{ convert::TryInto, @@ -48,6 +48,7 @@ pub struct TickMetrics { pub chunk_groups_count: IntGauge, pub entity_count: IntGauge, pub tick_time: IntGaugeVec, + pub tick_time_hist: Histogram, pub build_info: IntGauge, pub start_time: IntGauge, pub time_of_day: Gauge, @@ -262,6 +263,28 @@ impl TickMetrics { Opts::new("tick_time", "time in ns required for a tick of the server"), &["period"], )?; + // 33.33ms is the ideal tick time. So we have hight detail around it. + // 300/700 are to detect high I/O blocks + let bucket = vec![ + Duration::from_millis(8).as_secs_f64(), + Duration::from_millis(16).as_secs_f64(), + Duration::from_millis(24).as_secs_f64(), + Duration::from_millis(30).as_secs_f64(), + Duration::from_millis(33).as_secs_f64(), + Duration::from_millis(37).as_secs_f64(), + Duration::from_millis(45).as_secs_f64(), + Duration::from_millis(60).as_secs_f64(), + Duration::from_millis(100).as_secs_f64(), + Duration::from_millis(300).as_secs_f64(), + Duration::from_millis(700).as_secs_f64(), + ]; + let tick_time_hist = Histogram::with_opts( + HistogramOpts::new( + "tick_time_hist", + "shows the detailed time in ns spend for the whole tick as histogram", + ) + .buckets(bucket), + )?; let since_the_epoch = SystemTime::now() .duration_since(UNIX_EPOCH) @@ -277,6 +300,7 @@ impl TickMetrics { registry.register(Box::new(time_of_day.clone()))?; registry.register(Box::new(light_count.clone()))?; registry.register(Box::new(tick_time.clone()))?; + registry.register(Box::new(tick_time_hist.clone()))?; Ok(Self { chonks_count, @@ -284,6 +308,7 @@ impl TickMetrics { chunk_groups_count, entity_count, tick_time, + tick_time_hist, build_info, start_time, time_of_day, diff --git a/server/src/sys/metrics.rs b/server/src/sys/metrics.rs index e8c21c680b..7b348b88ee 100644 --- a/server/src/sys/metrics.rs +++ b/server/src/sys/metrics.rs @@ -102,6 +102,12 @@ impl<'a> System<'a> for Sys { if let Some(entities) = entities { let entity_count = entities.join().count(); export_tick.entity_count.set(entity_count as i64); + #[cfg(feature = "tracy")] + { + use common_base::tracy_client::{create_plot, Plot}; + static ENTITY_COUNT: Plot = create_plot!("entity count"); + ENTITY_COUNT.point(entity_count as f64); + } } } From 6a95fb6b74b34ade1f9ca97d35e3da90643648ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20M=C3=A4rtens?= Date: Sat, 27 Mar 2021 16:27:59 +0100 Subject: [PATCH 2/3] extend network of incomming TCP metrics and failed handshake metric --- network/src/metrics.rs | 22 ++++++++++++++++++++++ network/src/scheduler.rs | 14 +++++++++++--- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/network/src/metrics.rs b/network/src/metrics.rs index 9733b477a1..c46fe16bda 100644 --- a/network/src/metrics.rs +++ b/network/src/metrics.rs @@ -9,6 +9,8 @@ use std::error::Error; pub struct NetworkMetrics { pub listen_requests_total: IntCounterVec, pub connect_requests_total: IntCounterVec, + pub incoming_connections_total: IntCounterVec, + pub failed_handshakes_total: IntCounter, pub participants_connected_total: IntCounter, pub participants_disconnected_total: IntCounter, // channel id's, seperated by PARTICIPANT, max 5 @@ -44,6 +46,17 @@ impl NetworkMetrics { ), &["protocol"], )?; + let incoming_connections_total = IntCounterVec::new( + Opts::new( + "incoming_connections_total", + "Shows the number of external requests to the scheduler", + ), + &["protocol"], + )?; + let failed_handshakes_total = IntCounter::with_opts(Opts::new( + "failed_handshakes_total", + "Shows the number of failed handshakes", + ))?; let participants_connected_total = IntCounter::with_opts(Opts::new( "participants_connected_total", "Shows the number of participants connected to the network", @@ -110,6 +123,8 @@ impl NetworkMetrics { Ok(Self { listen_requests_total, connect_requests_total, + incoming_connections_total, + failed_handshakes_total, participants_connected_total, participants_disconnected_total, participants_channel_ids, @@ -125,6 +140,8 @@ impl NetworkMetrics { pub fn register(&self, registry: &Registry) -> Result<(), Box> { registry.register(Box::new(self.listen_requests_total.clone()))?; registry.register(Box::new(self.connect_requests_total.clone()))?; + registry.register(Box::new(self.incoming_connections_total.clone()))?; + registry.register(Box::new(self.failed_handshakes_total.clone()))?; registry.register(Box::new(self.participants_connected_total.clone()))?; registry.register(Box::new(self.participants_disconnected_total.clone()))?; registry.register(Box::new(self.participants_channel_ids.clone()))?; @@ -137,6 +154,11 @@ impl NetworkMetrics { Ok(()) } + pub(crate) fn connect_requests_cache(&self, protocol: &ProtocolAddr) -> prometheus::IntCounter { + self.incoming_connections_total + .with_label_values(&[protocol_name(protocol)]) + } + pub(crate) fn channels_connected(&self, remote_p: &str, no: usize, cid: Cid) { self.channels_connected_total .with_label_values(&[remote_p]) diff --git a/network/src/scheduler.rs b/network/src/scheduler.rs index abb54b6578..5d02c24876 100644 --- a/network/src/scheduler.rs +++ b/network/src/scheduler.rs @@ -384,6 +384,8 @@ impl Scheduler { s2a_listen_result_s: oneshot::Sender>, ) { trace!(?addr, "Start up channel creator"); + #[cfg(feature = "metrics")] + let mcache = self.metrics.connect_requests_cache(&addr); match addr { ProtocolAddr::Tcp(addr) => { let listener = match net::TcpListener::bind(addr).await { @@ -395,7 +397,7 @@ impl Scheduler { info!( ?addr, ?e, - "Tcp bind error durin listener startup" + "Tcp bind error during listener startup" ); s2a_listen_result_s.send(Err(e)).unwrap(); return; @@ -414,8 +416,10 @@ impl Scheduler { continue; }, }; - info!("Accepting Tcp from: {}", remote_addr); + #[cfg(feature = "metrics")] + mcache.inc(); let cid = self.channel_ids.fetch_add(1, Ordering::Relaxed); + info!(?remote_addr, ?cid, "Accepting Tcp from"); self.init_protocol(Protocols::new_tcp(stream, cid, Arc::clone(&self.protocol_metrics)), cid, None, true) .await; } @@ -433,8 +437,10 @@ impl Scheduler { } { let (remote_to_local_s, remote_to_local_r) = mpsc::channel(Self::MPSC_CHANNEL_BOUND); local_remote_to_local_s.send(remote_to_local_s).unwrap(); - info!(?addr, "Accepting Mpsc from"); + #[cfg(feature = "metrics")] + mcache.inc(); let cid = self.channel_ids.fetch_add(1, Ordering::Relaxed); + info!(?addr, ?cid, "Accepting Mpsc from"); self.init_protocol(Protocols::new_mpsc(local_to_remote_s, remote_to_local_r, cid, Arc::clone(&self.protocol_metrics)), cid, None, true) .await; } @@ -650,6 +656,8 @@ impl Scheduler { }, Err(e) => { debug!(?cid, ?e, "Handshake from a new connection failed"); + #[cfg(feature = "metrics")] + metrics.failed_handshakes_total.inc(); if let Some(pid_oneshot) = s2a_return_pid_s { // someone is waiting with `connect`, so give them their Error trace!(?cid, "returning the Err to api who requested the connect"); From c77446a014e5480afd9acf9287e5fec738508ab6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20M=C3=A4rtens?= Date: Sat, 27 Mar 2021 16:41:27 +0100 Subject: [PATCH 3/3] fix some tracy only and no default features --- network/src/message.rs | 2 +- server/src/sys/metrics.rs | 7 +------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/network/src/message.rs b/network/src/message.rs index cbee704a90..bc81e25802 100644 --- a/network/src/message.rs +++ b/network/src/message.rs @@ -129,7 +129,7 @@ impl Message { #[cfg(debug_assertions)] pub(crate) fn verify(&self, params: StreamParams) { #[cfg(not(feature = "compression"))] - let _stream = stream; + let _params = params; #[cfg(feature = "compression")] if self.compressed != params.promises.contains(Promises::COMPRESSED) { warn!( diff --git a/server/src/sys/metrics.rs b/server/src/sys/metrics.rs index 7b348b88ee..d122adf8bf 100644 --- a/server/src/sys/metrics.rs +++ b/server/src/sys/metrics.rs @@ -102,12 +102,7 @@ impl<'a> System<'a> for Sys { if let Some(entities) = entities { let entity_count = entities.join().count(); export_tick.entity_count.set(entity_count as i64); - #[cfg(feature = "tracy")] - { - use common_base::tracy_client::{create_plot, Plot}; - static ENTITY_COUNT: Plot = create_plot!("entity count"); - ENTITY_COUNT.point(entity_count as f64); - } + common_base::plot!("entity count", entity_count as f64); } }