Merge branch 'xMAC94x/server-stability' into 'master'

track state tick in a historgram too

See merge request veloren/veloren!1998
This commit is contained in:
Marcel 2021-03-27 21:05:02 +00:00
commit 4279720947
6 changed files with 67 additions and 14 deletions

View File

@ -129,7 +129,7 @@ impl Message {
#[cfg(debug_assertions)]
pub(crate) fn verify(&self, params: StreamParams) {
#[cfg(not(feature = "compression"))]
let _stream = stream;
let _params = params;
#[cfg(feature = "compression")]
if self.compressed != params.promises.contains(Promises::COMPRESSED) {
warn!(

View File

@ -9,6 +9,8 @@ use std::error::Error;
pub struct NetworkMetrics {
pub listen_requests_total: IntCounterVec,
pub connect_requests_total: IntCounterVec,
pub incoming_connections_total: IntCounterVec,
pub failed_handshakes_total: IntCounter,
pub participants_connected_total: IntCounter,
pub participants_disconnected_total: IntCounter,
// channel id's, seperated by PARTICIPANT, max 5
@ -44,6 +46,17 @@ impl NetworkMetrics {
),
&["protocol"],
)?;
let incoming_connections_total = IntCounterVec::new(
Opts::new(
"incoming_connections_total",
"Shows the number of external requests to the scheduler",
),
&["protocol"],
)?;
let failed_handshakes_total = IntCounter::with_opts(Opts::new(
"failed_handshakes_total",
"Shows the number of failed handshakes",
))?;
let participants_connected_total = IntCounter::with_opts(Opts::new(
"participants_connected_total",
"Shows the number of participants connected to the network",
@ -110,6 +123,8 @@ impl NetworkMetrics {
Ok(Self {
listen_requests_total,
connect_requests_total,
incoming_connections_total,
failed_handshakes_total,
participants_connected_total,
participants_disconnected_total,
participants_channel_ids,
@ -125,6 +140,8 @@ impl NetworkMetrics {
pub fn register(&self, registry: &Registry) -> Result<(), Box<dyn Error>> {
registry.register(Box::new(self.listen_requests_total.clone()))?;
registry.register(Box::new(self.connect_requests_total.clone()))?;
registry.register(Box::new(self.incoming_connections_total.clone()))?;
registry.register(Box::new(self.failed_handshakes_total.clone()))?;
registry.register(Box::new(self.participants_connected_total.clone()))?;
registry.register(Box::new(self.participants_disconnected_total.clone()))?;
registry.register(Box::new(self.participants_channel_ids.clone()))?;
@ -137,6 +154,11 @@ impl NetworkMetrics {
Ok(())
}
pub(crate) fn connect_requests_cache(&self, protocol: &ProtocolAddr) -> prometheus::IntCounter {
self.incoming_connections_total
.with_label_values(&[protocol_name(protocol)])
}
pub(crate) fn channels_connected(&self, remote_p: &str, no: usize, cid: Cid) {
self.channels_connected_total
.with_label_values(&[remote_p])

View File

@ -384,6 +384,8 @@ impl Scheduler {
s2a_listen_result_s: oneshot::Sender<io::Result<()>>,
) {
trace!(?addr, "Start up channel creator");
#[cfg(feature = "metrics")]
let mcache = self.metrics.connect_requests_cache(&addr);
match addr {
ProtocolAddr::Tcp(addr) => {
let listener = match net::TcpListener::bind(addr).await {
@ -395,7 +397,7 @@ impl Scheduler {
info!(
?addr,
?e,
"Tcp bind error durin listener startup"
"Tcp bind error during listener startup"
);
s2a_listen_result_s.send(Err(e)).unwrap();
return;
@ -414,8 +416,10 @@ impl Scheduler {
continue;
},
};
info!("Accepting Tcp from: {}", remote_addr);
#[cfg(feature = "metrics")]
mcache.inc();
let cid = self.channel_ids.fetch_add(1, Ordering::Relaxed);
info!(?remote_addr, ?cid, "Accepting Tcp from");
self.init_protocol(Protocols::new_tcp(stream, cid, Arc::clone(&self.protocol_metrics)), cid, None, true)
.await;
}
@ -433,8 +437,10 @@ impl Scheduler {
} {
let (remote_to_local_s, remote_to_local_r) = mpsc::channel(Self::MPSC_CHANNEL_BOUND);
local_remote_to_local_s.send(remote_to_local_s).unwrap();
info!(?addr, "Accepting Mpsc from");
#[cfg(feature = "metrics")]
mcache.inc();
let cid = self.channel_ids.fetch_add(1, Ordering::Relaxed);
info!(?addr, ?cid, "Accepting Mpsc from");
self.init_protocol(Protocols::new_mpsc(local_to_remote_s, remote_to_local_r, cid, Arc::clone(&self.protocol_metrics)), cid, None, true)
.await;
}
@ -650,6 +656,8 @@ impl Scheduler {
},
Err(e) => {
debug!(?cid, ?e, "Handshake from a new connection failed");
#[cfg(feature = "metrics")]
metrics.failed_handshakes_total.inc();
if let Some(pid_oneshot) = s2a_return_pid_s {
// someone is waiting with `connect`, so give them their Error
trace!(?cid, "returning the Err to api who requested the connect");

View File

@ -704,14 +704,11 @@ impl Server {
.set((before_persistence_updates - before_entity_cleanup).as_nanos() as i64);
tt.with_label_values(&["persistence_updates"])
.set((end_of_server_tick - before_persistence_updates).as_nanos() as i64);
}
#[cfg(feature = "tracy")]
{
use common_base::tracy_client::{create_plot, Plot};
let entity_count = self.state.ecs().entities().join().count();
static ENTITY_COUNT: Plot = create_plot!("entity count");
ENTITY_COUNT.point(entity_count as f64);
tick_metrics.tick_time_hist.observe(
end_of_server_tick
.duration_since(before_state_tick)
.as_secs_f64(),
);
}
// 9) Finish the tick, pass control back to the frontend.

View File

@ -1,6 +1,6 @@
use prometheus::{
Gauge, GaugeVec, HistogramOpts, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
Opts, Registry,
Gauge, GaugeVec, Histogram, HistogramOpts, HistogramVec, IntCounter, IntCounterVec, IntGauge,
IntGaugeVec, Opts, Registry,
};
use std::{
convert::TryInto,
@ -48,6 +48,7 @@ pub struct TickMetrics {
pub chunk_groups_count: IntGauge,
pub entity_count: IntGauge,
pub tick_time: IntGaugeVec,
pub tick_time_hist: Histogram,
pub build_info: IntGauge,
pub start_time: IntGauge,
pub time_of_day: Gauge,
@ -262,6 +263,28 @@ impl TickMetrics {
Opts::new("tick_time", "time in ns required for a tick of the server"),
&["period"],
)?;
// 33.33ms is the ideal tick time. So we have hight detail around it.
// 300/700 are to detect high I/O blocks
let bucket = vec![
Duration::from_millis(8).as_secs_f64(),
Duration::from_millis(16).as_secs_f64(),
Duration::from_millis(24).as_secs_f64(),
Duration::from_millis(30).as_secs_f64(),
Duration::from_millis(33).as_secs_f64(),
Duration::from_millis(37).as_secs_f64(),
Duration::from_millis(45).as_secs_f64(),
Duration::from_millis(60).as_secs_f64(),
Duration::from_millis(100).as_secs_f64(),
Duration::from_millis(300).as_secs_f64(),
Duration::from_millis(700).as_secs_f64(),
];
let tick_time_hist = Histogram::with_opts(
HistogramOpts::new(
"tick_time_hist",
"shows the detailed time in ns spend for the whole tick as histogram",
)
.buckets(bucket),
)?;
let since_the_epoch = SystemTime::now()
.duration_since(UNIX_EPOCH)
@ -277,6 +300,7 @@ impl TickMetrics {
registry.register(Box::new(time_of_day.clone()))?;
registry.register(Box::new(light_count.clone()))?;
registry.register(Box::new(tick_time.clone()))?;
registry.register(Box::new(tick_time_hist.clone()))?;
Ok(Self {
chonks_count,
@ -284,6 +308,7 @@ impl TickMetrics {
chunk_groups_count,
entity_count,
tick_time,
tick_time_hist,
build_info,
start_time,
time_of_day,

View File

@ -102,6 +102,7 @@ impl<'a> System<'a> for Sys {
if let Some(entities) = entities {
let entity_count = entities.join().count();
export_tick.entity_count.set(entity_count as i64);
common_base::plot!("entity count", entity_count as f64);
}
}