diff --git a/server/src/chunk_generator.rs b/server/src/chunk_generator.rs index b69d597c12..83de4c879e 100644 --- a/server/src/chunk_generator.rs +++ b/server/src/chunk_generator.rs @@ -1,3 +1,4 @@ +use crate::metrics::ChunkGenMetrics; #[cfg(not(feature = "worldgen"))] use crate::test_world::{IndexOwned, World}; use common::{generation::ChunkSupplement, terrain::TerrainChunk}; @@ -21,15 +22,17 @@ pub struct ChunkGenerator { chunk_tx: channel::Sender, chunk_rx: channel::Receiver, pending_chunks: HashMap, Arc>, + metrics: Arc, } impl ChunkGenerator { #[allow(clippy::new_without_default)] // TODO: Pending review in #587 - pub fn new() -> Self { + pub fn new(metrics: ChunkGenMetrics) -> Self { let (chunk_tx, chunk_rx) = channel::unbounded(); Self { chunk_tx, chunk_rx, pending_chunks: HashMap::new(), + metrics: Arc::new(metrics), } } @@ -49,6 +52,7 @@ impl ChunkGenerator { let cancel = Arc::new(AtomicBool::new(false)); v.insert(Arc::clone(&cancel)); let chunk_tx = self.chunk_tx.clone(); + self.metrics.chunks_requested.inc(); thread_pool.execute(move || { let index = index.as_index_ref(); let payload = world @@ -61,6 +65,7 @@ impl ChunkGenerator { pub fn recv_new_chunk(&mut self) -> Option { if let Ok((key, res)) = self.chunk_rx.try_recv() { self.pending_chunks.remove(&key); + self.metrics.chunks_served.inc(); // TODO: do anything else if res is an Err? Some((key, res)) } else { @@ -75,12 +80,15 @@ impl ChunkGenerator { pub fn cancel_if_pending(&mut self, key: Vec2) { if let Some(cancel) = self.pending_chunks.remove(&key) { cancel.store(true, Ordering::Relaxed); + self.metrics.chunks_canceled.inc(); } } pub fn cancel_all(&mut self) { + let metrics = self.metrics.clone(); self.pending_chunks.drain().for_each(|(_, cancel)| { cancel.store(true, Ordering::Relaxed); + metrics.chunks_canceled.inc(); }); } } diff --git a/server/src/lib.rs b/server/src/lib.rs index 9602957082..864e923ec8 100644 --- a/server/src/lib.rs +++ b/server/src/lib.rs @@ -97,6 +97,11 @@ impl Server { #[allow(clippy::expect_fun_call)] // TODO: Pending review in #587 #[allow(clippy::needless_update)] // TODO: Pending review in #587 pub fn new(settings: ServerSettings) -> Result { + let (chunk_gen_metrics, registry_chunk) = metrics::ChunkGenMetrics::new().unwrap(); + let (network_request_metrics, registry_network) = + metrics::NetworkRequestMetrics::new().unwrap(); + let (player_metrics, registry_player) = metrics::PlayerMetrics::new().unwrap(); + let mut state = State::default(); state.ecs_mut().insert(settings.clone()); state.ecs_mut().insert(EventBus::::default()); @@ -104,7 +109,11 @@ impl Server { .ecs_mut() .insert(LoginProvider::new(settings.auth_server_address.clone())); state.ecs_mut().insert(Tick(0)); - state.ecs_mut().insert(ChunkGenerator::new()); + state.ecs_mut().insert(network_request_metrics); + state.ecs_mut().insert(player_metrics); + state + .ecs_mut() + .insert(ChunkGenerator::new(chunk_gen_metrics)); state .ecs_mut() .insert(CharacterUpdater::new(settings.persistence_db_dir.clone())); @@ -275,11 +284,12 @@ impl Server { let mut metrics = ServerMetrics::new(); // register all metrics submodules here - let tick_metrics = TickMetrics::new(metrics.tick_clone()) + let (tick_metrics, registry_tick) = TickMetrics::new(metrics.tick_clone()) .expect("Failed to initialize server tick metrics submodule."); - tick_metrics - .register(&metrics.registry()) - .expect("failed to register tick metrics"); + registry_chunk(&metrics.registry()).expect("failed to register chunk gen metrics"); + registry_network(&metrics.registry()).expect("failed to register network request metrics"); + registry_player(&metrics.registry()).expect("failed to register player metrics"); + registry_tick(&metrics.registry()).expect("failed to register tick metrics"); let thread_pool = ThreadPoolBuilder::new() .name("veloren-worker".to_string()) @@ -646,9 +656,6 @@ impl Server { .set(stats_persistence_nanos); // Report other info - self.tick_metrics - .player_online - .set(self.state.ecs().read_storage::().join().count() as i64); self.tick_metrics .time_of_day .set(self.state.ecs().read_resource::().0); @@ -813,7 +820,9 @@ impl Server { .is_some() } - pub fn number_of_players(&self) -> i64 { self.tick_metrics.player_online.get() } + pub fn number_of_players(&self) -> i64 { + self.state.ecs().read_storage::().join().count() as i64 + } } impl Drop for Server { diff --git a/server/src/metrics.rs b/server/src/metrics.rs index 8c94041a2a..09bbfe152d 100644 --- a/server/src/metrics.rs +++ b/server/src/metrics.rs @@ -1,4 +1,6 @@ -use prometheus::{Encoder, Gauge, IntGauge, IntGaugeVec, Opts, Registry, TextEncoder}; +use prometheus::{ + Encoder, Gauge, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, Opts, Registry, TextEncoder, +}; use std::{ convert::TryInto, error::Error, @@ -12,10 +14,28 @@ use std::{ }; use tracing::{debug, error}; +type RegistryFn = Box Result<(), prometheus::Error>>; + +pub struct PlayerMetrics { + pub players_connected: IntCounter, + pub players_disconnected: IntCounterVec, // timeout, network_error, gracefully +} + +pub struct NetworkRequestMetrics { + pub chunks_request_dropped: IntCounter, + pub chunks_served_from_cache: IntCounter, + pub chunks_generation_triggered: IntCounter, +} + +pub struct ChunkGenMetrics { + pub chunks_requested: IntCounter, + pub chunks_served: IntCounter, + pub chunks_canceled: IntCounter, +} + pub struct TickMetrics { pub chonks_count: IntGauge, pub chunks_count: IntGauge, - pub player_online: IntGauge, pub entity_count: IntGauge, pub tick_time: IntGaugeVec, pub build_info: IntGauge, @@ -32,11 +52,122 @@ pub struct ServerMetrics { tick: Arc, } +impl PlayerMetrics { + pub fn new() -> Result<(Self, RegistryFn), prometheus::Error> { + let players_connected = IntCounter::with_opts(Opts::new( + "players_connected", + "shows the number of clients joined to the server", + ))?; + let players_disconnected = IntCounterVec::new( + Opts::new( + "players_disconnected", + "shows the number of clients disconnected from the server and the reason", + ), + &["reason"], + )?; + + let players_connected_clone = players_connected.clone(); + let players_disconnected_clone = players_disconnected.clone(); + + let f = |registry: &Registry| { + registry.register(Box::new(players_connected_clone))?; + registry.register(Box::new(players_disconnected_clone))?; + Ok(()) + }; + + Ok(( + Self { + players_connected, + players_disconnected, + }, + Box::new(f), + )) + } +} + +impl NetworkRequestMetrics { + pub fn new() -> Result<(Self, RegistryFn), prometheus::Error> { + let chunks_request_dropped = IntCounter::with_opts(Opts::new( + "chunks_request_dropped", + "number of all chunk request dropped, e.g because the player was to far away", + ))?; + let chunks_served_from_cache = IntCounter::with_opts(Opts::new( + "chunks_served_from_cache", + "number of all requested chunks already generated and could be served out of cache", + ))?; + let chunks_generation_triggered = IntCounter::with_opts(Opts::new( + "chunks_generation_triggered", + "number of all chunks that were requested and needs to be generated", + ))?; + + let chunks_request_dropped_clone = chunks_request_dropped.clone(); + let chunks_served_from_cache_clone = chunks_served_from_cache.clone(); + let chunks_generation_triggered_clone = chunks_generation_triggered.clone(); + + let f = |registry: &Registry| { + registry.register(Box::new(chunks_request_dropped_clone))?; + registry.register(Box::new(chunks_served_from_cache_clone))?; + registry.register(Box::new(chunks_generation_triggered_clone))?; + Ok(()) + }; + + Ok(( + Self { + chunks_request_dropped, + chunks_served_from_cache, + chunks_generation_triggered, + }, + Box::new(f), + )) + } +} + +impl ChunkGenMetrics { + pub fn new() -> Result<(Self, RegistryFn), prometheus::Error> { + let chunks_requested = IntCounter::with_opts(Opts::new( + "chunks_requested", + "number of all chunks requested on the server", + ))?; + let chunks_served = IntCounter::with_opts(Opts::new( + "chunks_served", + "number of all requested chunks already served on the server", + ))?; + let chunks_canceled = IntCounter::with_opts(Opts::new( + "chunks_canceled", + "number of all canceled chunks on the server", + ))?; + + let chunks_requested_clone = chunks_requested.clone(); + let chunks_served_clone = chunks_served.clone(); + let chunks_canceled_clone = chunks_canceled.clone(); + + let f = |registry: &Registry| { + registry.register(Box::new(chunks_requested_clone))?; + registry.register(Box::new(chunks_served_clone))?; + registry.register(Box::new(chunks_canceled_clone))?; + Ok(()) + }; + + Ok(( + Self { + chunks_requested, + chunks_served, + chunks_canceled, + }, + Box::new(f), + )) + } +} + impl TickMetrics { - pub fn new(tick: Arc) -> Result> { - let player_online = IntGauge::with_opts(Opts::new( - "player_online", - "shows the number of clients connected to the server", + pub fn new(tick: Arc) -> Result<(Self, RegistryFn), Box> { + let chonks_count = IntGauge::with_opts(Opts::new( + "chonks_count", + "number of all chonks currently active on the server", + ))?; + let chunks_count = IntGauge::with_opts(Opts::new( + "chunks_count", + "number of all chunks currently active on the server", ))?; let entity_count = IntGauge::with_opts(Opts::new( "entity_count", @@ -56,14 +187,6 @@ impl TickMetrics { "light_count", "number of all lights currently active on the server", ))?; - let chonks_count = IntGauge::with_opts(Opts::new( - "chonks_count", - "number of all chonks currently active on the server", - ))?; - let chunks_count = IntGauge::with_opts(Opts::new( - "chunks_count", - "number of all chunks currently active on the server", - ))?; let tick_time = IntGaugeVec::new( Opts::new("tick_time", "time in ns required for a tick of the server"), &["period"], @@ -74,30 +197,41 @@ impl TickMetrics { .expect("Time went backwards"); start_time.set(since_the_epoch.as_secs().try_into()?); - Ok(Self { - chonks_count, - chunks_count, - player_online, - entity_count, - tick_time, - build_info, - start_time, - time_of_day, - light_count, - tick, - }) - } + let chonks_count_clone = chonks_count.clone(); + let chunks_count_clone = chunks_count.clone(); + let entity_count_clone = entity_count.clone(); + let build_info_clone = build_info.clone(); + let start_time_clone = start_time.clone(); + let time_of_day_clone = time_of_day.clone(); + let light_count_clone = light_count.clone(); + let tick_time_clone = tick_time.clone(); - pub fn register(&self, registry: &Registry) -> Result<(), Box> { - registry.register(Box::new(self.player_online.clone()))?; - registry.register(Box::new(self.entity_count.clone()))?; - registry.register(Box::new(self.build_info.clone()))?; - registry.register(Box::new(self.start_time.clone()))?; - registry.register(Box::new(self.time_of_day.clone()))?; - registry.register(Box::new(self.chonks_count.clone()))?; - registry.register(Box::new(self.chunks_count.clone()))?; - registry.register(Box::new(self.tick_time.clone()))?; - Ok(()) + let f = |registry: &Registry| { + registry.register(Box::new(chonks_count_clone))?; + registry.register(Box::new(chunks_count_clone))?; + registry.register(Box::new(entity_count_clone))?; + registry.register(Box::new(build_info_clone))?; + registry.register(Box::new(start_time_clone))?; + registry.register(Box::new(time_of_day_clone))?; + registry.register(Box::new(light_count_clone))?; + registry.register(Box::new(tick_time_clone))?; + Ok(()) + }; + + Ok(( + Self { + chonks_count, + chunks_count, + entity_count, + tick_time, + build_info, + start_time, + time_of_day, + light_count, + tick, + }, + Box::new(f), + )) } pub fn is_100th_tick(&self) -> bool { self.tick.load(Ordering::Relaxed).rem_euclid(100) == 0 } diff --git a/server/src/sys/message.rs b/server/src/sys/message.rs index de747c301a..0022cf1d35 100644 --- a/server/src/sys/message.rs +++ b/server/src/sys/message.rs @@ -1,7 +1,11 @@ use super::SysTimer; use crate::{ - alias_validator::AliasValidator, client::Client, login_provider::LoginProvider, - persistence::character::CharacterLoader, ServerSettings, + alias_validator::AliasValidator, + client::Client, + login_provider::LoginProvider, + metrics::{NetworkRequestMetrics, PlayerMetrics}, + persistence::character::CharacterLoader, + ServerSettings, }; use common::{ comp::{ @@ -43,6 +47,8 @@ impl Sys { cnt: &mut u64, character_loader: &ReadExpect<'_, CharacterLoader>, terrain: &ReadExpect<'_, TerrainGrid>, + network_metrics: &ReadExpect<'_, NetworkRequestMetrics>, + player_metrics: &ReadExpect<'_, PlayerMetrics>, uids: &ReadStorage<'_, Uid>, can_build: &ReadStorage<'_, CanBuild>, force_updates: &ReadStorage<'_, ForceUpdate>, @@ -133,6 +139,7 @@ impl Sys { // Add to list to notify all clients of the new player new_players.push(entity); + player_metrics.players_connected.inc(); }, // Use RequestState instead (No need to send `player` again). _ => client.error_state(RequestStateError::Impossible), @@ -312,6 +319,7 @@ impl Sys { }, ClientMsg::TerrainChunkRequest { key } => match client.client_state { ClientState::Connected | ClientState::Registered => { + network_metrics.chunks_request_dropped.inc(); client.error_state(RequestStateError::Impossible); }, ClientState::Spectator | ClientState::Character => { @@ -329,12 +337,20 @@ impl Sys { }; if in_vd { match terrain.get_key(key) { - Some(chunk) => client.notify(ServerMsg::TerrainChunkUpdate { - key, - chunk: Ok(Box::new(chunk.clone())), - }), - None => server_emitter.emit(ServerEvent::ChunkRequest(entity, key)), + Some(chunk) => { + network_metrics.chunks_served_from_cache.inc(); + client.notify(ServerMsg::TerrainChunkUpdate { + key, + chunk: Ok(Box::new(chunk.clone())), + }) + }, + None => { + network_metrics.chunks_generation_triggered.inc(); + server_emitter.emit(ServerEvent::ChunkRequest(entity, key)) + }, } + } else { + network_metrics.chunks_request_dropped.inc(); } }, ClientState::Pending => {}, @@ -347,6 +363,10 @@ impl Sys { }, ClientMsg::Terminate => { debug!(?entity, "Client send message to termitate session"); + player_metrics + .players_disconnected + .with_label_values(&["gracefully"]) + .inc(); server_emitter.emit(ServerEvent::ClientDisconnect(entity)); break Ok(()); }, @@ -408,6 +428,8 @@ impl<'a> System<'a> for Sys { Read<'a, Time>, ReadExpect<'a, CharacterLoader>, ReadExpect<'a, TerrainGrid>, + ReadExpect<'a, NetworkRequestMetrics>, + ReadExpect<'a, PlayerMetrics>, Write<'a, SysTimer>, ReadStorage<'a, Uid>, ReadStorage<'a, CanBuild>, @@ -439,6 +461,8 @@ impl<'a> System<'a> for Sys { time, character_loader, terrain, + network_metrics, + player_metrics, mut timer, uids, can_build, @@ -489,9 +513,7 @@ impl<'a> System<'a> for Sys { let network_err: Result<(), crate::error::Error> = block_on(async { //TIMEOUT 0.02 ms for msg handling - select!( - _ = Delay::new(std::time::Duration::from_micros(20)).fuse() => Ok(()), - err = Self::handle_client_msg( + let work_future = Self::handle_client_msg( &mut server_emitter, &mut new_chat_msgs, &player_list, @@ -499,9 +521,10 @@ impl<'a> System<'a> for Sys { entity, client, &mut cnt, - &character_loader, &terrain, + &network_metrics, + &player_metrics, &uids, &can_build, &force_updates, @@ -518,7 +541,10 @@ impl<'a> System<'a> for Sys { &mut controllers, &settings, &alias_validator, - ).fuse() => err, + ); + select!( + _ = Delay::new(std::time::Duration::from_micros(20)).fuse() => Ok(()), + err = work_future.fuse() => err, ) }); @@ -529,11 +555,19 @@ impl<'a> System<'a> for Sys { // Timeout { info!(?entity, "timeout error with client, disconnecting"); + player_metrics + .players_disconnected + .with_label_values(&["timeout"]) + .inc(); server_emitter.emit(ServerEvent::ClientDisconnect(entity)); } else if network_err.is_err() // Postbox error { debug!(?entity, "postbox error with client, disconnecting"); + player_metrics + .players_disconnected + .with_label_values(&["network_error"]) + .inc(); server_emitter.emit(ServerEvent::ClientDisconnect(entity)); } else if time.0 - client.last_ping > settings.client_timeout.as_secs() as f64 * 0.5 { // Try pinging the client if the timeout is nearing.