mess (server startup time related experimentation, in particular with

pathfinding between sites)
2024-08-30 18:12:32 +00:00 · 2023-04-16 17:01:00 -04:00
parent ed4643e80b
commit 1f5ebbd100
6 changed files with 594 additions and 24 deletions
--- a/common/src/astar2.rs
+++ b/common/src/astar2.rs
@ -0,0 +1,399 @@
+#![allow(dead_code, unused_mut, unused_variables)]
+use crate::path::Path;
+use core::{
+    cmp::Ordering::{self, Equal},
+    fmt,
+    hash::{BuildHasher, Hash},
+};
+use hashbrown::HashMap;
+use std::collections::BinaryHeap;
+
+#[derive(Copy, Clone, Debug)]
+pub struct PathEntry<S> {
+    // Queue priority: cost so far + heuristic (the `Ord` impl pops the lowest value first)
+    priority: f32,
+    node: S, // the node this queue entry refers to
+    //cost: f32,
+}
+
+// Two queue entries are considered equal when they refer to the same node;
+// `priority` plays no part in equality.
+impl<S: Eq> PartialEq for PathEntry<S> {
+    fn eq(&self, rhs: &Self) -> bool { self.node == rhs.node }
+}
+
+// `S: Eq` makes the node comparison above a full equivalence relation.
+impl<S: Eq> Eq for PathEntry<S> {}
+
+impl<S: Eq> Ord for PathEntry<S> {
+    // Compares `priority` with the operands swapped, so the entry with the
+    // lowest priority value is the greatest one and a max-heap yields it
+    // first. Incomparable priorities (NaN) are reported as `Equal`.
+    fn cmp(&self, rhs: &Self) -> Ordering {
+        match rhs.priority.partial_cmp(&self.priority) {
+            Some(ordering) => ordering,
+            None => Equal,
+        }
+    }
+}
+
+impl<S: Eq> PartialOrd for PathEntry<S> {
+    // Always `Some`: defers to the reversed ordering provided by `Ord`.
+    fn partial_cmp(&self, rhs: &Self) -> Option<Ordering> { Some(Ord::cmp(self, rhs)) }
+
+    // Written out by hand because this comparison is particularly hot in
+    // `BinaryHeap::pop`.
+    //
+    // NOTE: With a `NaN` priority this returns `false` while `cmp` reports
+    // `Equal`, so the two are not consistent in that edge case; this is
+    // probably not something we need to care about here(?)
+    //
+    // Reversed like `cmp`: `self <= rhs` holds when `self` carries the larger
+    // (or an equal) priority value.
+    fn le(&self, rhs: &Self) -> bool { self.priority >= rhs.priority }
+}
+
+/// Outcome of a call to `Astar::poll`.
+pub enum PathResult<T> {
+    /// The queue of candidate nodes ran empty before a satisfying node was
+    /// found. Carries the path to the recorded best-effort node, or a default
+    /// path if none was recorded.
+    None(Path<T>),
+    /// The overall iteration cap was reached. Carries the path to the recorded
+    /// best-effort node, or a default path if none was recorded.
+    Exhausted(Path<T>),
+    /// A satisfying node was reached; carries the path leading to it.
+    Path(Path<T>),
+    /// The iteration budget of this call was used up but the overall cap was
+    /// not; no path is available yet.
+    Pending,
+}
+
+impl<T> PathResult<T> {
+    /// Returns the path of a `PathResult::Path`; every other variant yields
+    /// `None`, dropping any partial path it carried.
+    pub fn into_path(self) -> Option<Path<T>> {
+        if let PathResult::Path(found) = self {
+            Some(found)
+        } else {
+            None
+        }
+    }
+
+    /// Applies `f` to the carried path (if there is one) while keeping the
+    /// variant unchanged.
+    pub fn map<U>(self, f: impl FnOnce(Path<T>) -> Path<U>) -> PathResult<U> {
+        match self {
+            Self::Pending => PathResult::Pending,
+            Self::Path(found) => PathResult::Path(f(found)),
+            Self::Exhausted(partial) => PathResult::Exhausted(f(partial)),
+            Self::None(partial) => PathResult::None(f(partial)),
+        }
+    }
+}
+
+// Per-node search record; if a node entry exists, that node was visited!
+#[derive(Clone, Debug)]
+struct NodeEntry<S> {
+    // Predecessor of this node; if came_from == self this is the start node!
+    came_from: S,
+    cheapest_score: f32, // `Astar::new` stores 0.0 here for the start node
+}
+
+#[derive(Clone, Debug)]
+struct Cluster<S> {
+    // TODO: we could use `(S, u8)` here?
+    // idea: if we bake in the gridness we could just store a direction
+    came_from: [Option<S>; 256], // predecessor per slot; `None` = slot not reached yet
+    cheapest_score: [f32; 256], // cost per slot; only read where `came_from` is `Some`
+}
+
+// ideas:
+// * merge hashmaps
+// * "chunked" exploration
+// * things we put on priority queue don't need to point into a hashmap (i.e. we
+//   only need a hashmap to map from new/unknown nodes to whatever
+//   datastructure)
+#[derive(Clone)]
+pub struct Astar<S, Hasher> {
+    iter: usize, // iterations of the `poll` loop completed so far
+    max_iters: usize, // cap on `iter`; once reached, `poll` returns `Exhausted`
+    potential_nodes: BinaryHeap<PathEntry<S>>, // cost, node pairs
+    // converting to single hash structure: 11349 ms -> 10462 ms / 10612 ms
+    // with two hash structures (came_from and cheapest_scores): 10861 ms
+    visited_nodes: HashMap<S, NodeEntry<S>, Hasher>, // seeded by `new`; `poll` uses `clusters`
+    // -> 25055 ms -> 15771 ms with Box -> fixed bugs 10731 ms, hmmm
+    clusters: HashMap<S, Box<Cluster<S>>, Hasher>, // TODO: Box cluster?
+    //came_from: HashMap<S, S, Hasher>,
+    //cheapest_scores: HashMap<S, f32, Hasher>,
+    //final_scores: HashMap<S, f32, Hasher>,
+    //visited: HashSet<S, Hasher>,
+    start_node: S,
+    cheapest_node: Option<S>, // node being expanded when `cheapest_cost` was last lowered
+    cheapest_cost: Option<f32>, // smallest heuristic value seen so far (not a path cost)
+}
+
+/// NOTE: Implemented by hand because `#[derive(Debug)]` would additionally
+/// require the hasher type to implement `Debug`.
+///
+/// Prints the scalar search state and the queue of candidate nodes. The two
+/// hash maps are summarised by their entry counts only, since every cluster
+/// alone holds 256 slots and dumping them would drown the output.
+impl<S: Clone + Eq + Hash + fmt::Debug, H: BuildHasher> fmt::Debug for Astar<S, H> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Astar")
+            .field("iter", &self.iter)
+            .field("max_iters", &self.max_iters)
+            .field("potential_nodes", &self.potential_nodes)
+            .field("visited_nodes", &self.visited_nodes.len())
+            .field("clusters", &self.clusters.len())
+            .field("start_node", &self.start_node)
+            .field("cheapest_node", &self.cheapest_node)
+            .field("cheapest_cost", &self.cheapest_cost)
+            .finish()
+    }
+}
+
+impl<S: Clone + Eq + Hash, H: BuildHasher + Clone> Astar<S, H> {
+    /// Creates a search rooted at `start` that may run for at most `max_iters`
+    /// iterations in total.
+    ///
+    /// The queue of candidate nodes is seeded with `start` at priority `0.0`,
+    /// and `start` is recorded in `visited_nodes` as its own predecessor with
+    /// a score of `0.0`. `clusters` starts out empty.
+    pub fn new(max_iters: usize, start: S, hasher: H) -> Self {
+        let mut potential_nodes = BinaryHeap::new();
+        potential_nodes.push(PathEntry {
+            priority: 0.0,
+            node: start.clone(),
+        });
+
+        let mut visited_nodes = HashMap::with_capacity_and_hasher(1, hasher.clone());
+        visited_nodes.insert(start.clone(), NodeEntry {
+            came_from: start.clone(),
+            cheapest_score: 0.0,
+        });
+
+        Self {
+            iter: 0,
+            max_iters,
+            potential_nodes,
+            visited_nodes,
+            clusters: HashMap::with_hasher(hasher),
+            start_node: start,
+            cheapest_node: None,
+            cheapest_cost: None,
+        }
+    }
+
+    pub fn poll<I>(
+        &mut self,
+        iters: usize,
+        // estimated remaining cost to the target; called as `heuristic(&neighbor, &node)`,
+        // i.e. (node being scored, node it was reached from)
+        mut heuristic: impl FnMut(&S, &S) -> f32,
+        // get neighboring nodes
+        mut neighbors: impl FnMut(&S) -> I,
+        // cost of edge between these two nodes, presumably (source, destination)
+        // NOTE: never called here; the edge cost comes from the `neighbors` iterator instead
+        mut transition: impl FnMut(&S, &S) -> f32,
+        // have we reached a/the target?
+        mut satisfied: impl FnMut(&S) -> bool,
+        // this function clusters nodes together for cache locality purposes
+        // output (cluster base, offset in cluster)
+        cluster: impl Fn(&S) -> (S, u8),
+    ) -> PathResult<S>
+    where
+        // Combining transition into this: 9913 ms -> 8204 ms (~1.7 out of ~6.5 seconds)
+        I: Iterator<Item = (S, f32)>,
+    {
+        /*
+         */
+        if self.clusters.is_empty() {
+            let (key, index) = cluster(&self.start_node);
+            let mut came_from = std::array::from_fn(|_| None);
+            came_from[usize::from(index)] = Some(self.start_node.clone());
+            self.clusters.insert(
+                key,
+                Box::new(Cluster {
+                    came_from,
+                    cheapest_score: [0.0; 256],
+                }),
+            );
+        }
+        let iter_limit = self.max_iters.min(self.iter + iters);
+        while self.iter < iter_limit {
+            // pop highest priority node
+            if let Some(PathEntry { node, .. }) = self.potential_nodes.pop() {
+                // if this is the destination, we return
+                if satisfied(&node) {
+                    return PathResult::Path(self.reconstruct_path_to(node, cluster));
+                } else {
+                    let (cluster_key, index) = cluster(&node);
+                    let (node_cheapest, came_from) = self
+                        .clusters
+                        .get(&cluster_key)
+                        .map(|c| {
+                            (
+                                c.cheapest_score[usize::from(index)],
+                                c.came_from[usize::from(index)].clone().unwrap(),
+                            )
+                        })
+                        .unwrap();
+                    // regression
+                    //if node_cheapest < cost {
+                    // we already processed it
+                    //    continue;
+                    //}
+                    // 10700 ms -> 10477 ms (moving this out of the loop)
+                    // we have to fetch this even though it was put into the priority queue
+                    /*
+                    let node_cheapest = self
+                        .visited_nodes
+                        .get(&node)
+                        .map_or(f32::MAX, |n| n.cheapest_score);
+                    */
+                    // otherwise we iterate neighbors
+                    // TODO: try for_each here
+                    // 6879 ms -> 6989 ms (regression using for_each)
+                    //neighbors(&node).for_each(|(neighbor, transition)| {
+                    for (neighbor, transition) in neighbors(&node) {
+                        // skipping here: 10694 ms -> 9913 ms (almost whole second out of 7 taken
+                        // for this, this is because the `transition` call is fairly expensive)
+                        if neighbor == came_from {
+                            continue;
+                            //return;
+                        }
+                        let (cluster_key, index) = cluster(&neighbor);
+                        let mut previously_visited = false;
+                        let neighbor_cheapest = self
+                            .clusters
+                            .get(&cluster_key)
+                            .and_then(|c| {
+                                previously_visited = c.came_from[usize::from(index)].is_some();
+
+                                previously_visited.then(|| c.cheapest_score[usize::from(index)])
+                            })
+                            .unwrap_or(f32::MAX);
+                        /*
+                        let neighbor_cheapest = self
+                            .visited_nodes
+                            .get(&neighbor)
+                            .map_or(f32::MAX, |n| n.cheapest_score);
+                         */
+                        // 10573 ms -> 11546 ms (with entry api appears to be regression)
+                        /*
+                        let mut previously_visited = true;
+                        let neighbor_entry = self
+                            .visited_nodes
+                            .entry(neighbor.clone())
+                            .or_insert_with(|| {
+                                previously_visited = false;
+                                NodeEntry {
+                                    came_from: node.clone(),
+                                    cheapest_score: f32::MAX,
+                                }
+                            });
+                        let neighbor_cheapest = neighbor_entry.cheapest_score;
+                         */
+                        /*
+                        let node_cheapest = *self.cheapest_scores.get(&node).unwrap_or(&f32::MAX);
+                        let neighbor_cheapest =
+                            *self.cheapest_scores.get(&neighbor).unwrap_or(&f32::MAX);
+                        */
+
+                        // TODO: have caller provide transition cost with neighbors iterator (so
+                        // that duplicate costs in `transition` can be avoided?)
+                        // compute cost to traverse to each neighbor
+                        let cost = node_cheapest + transition; //transition(&node, &neighbor);
+                        // if this is cheaper than existing cost for that neighbor (or neighbor
+                        // hasn't been visited)
+                        // can we convince ourselves that this is always true if node was not
+                        // visited?
+                        if cost < neighbor_cheapest {
+                            //neighbor_entry.cheapest_score = cost;
+                            /*
+                            // note: unconditional insert, same cost as overwriting if it already
+                            // exists
+                            let previously_visited = self
+                                .came_from
+                                .insert(neighbor.clone(), node.clone())
+                                .is_some();
+                            self.cheapest_scores.insert(neighbor.clone(), cost);
+                            */
+                            /*
+                            let previously_visited = self
+                                .visited_nodes
+                                .insert(neighbor.clone(), NodeEntry {
+                                    came_from: node.clone(),
+                                    cheapest_score: cost,
+                                })
+                                .is_some();
+                             */
+                            let cluster_mut =
+                                self.clusters.entry(cluster_key).or_insert_with(|| {
+                                    Box::new(Cluster {
+                                        came_from: std::array::from_fn(|_| None),
+                                        cheapest_score: [0.0; 256],
+                                    })
+                                });
+                            cluster_mut.came_from[usize::from(index)] = Some(node.clone());
+                            cluster_mut.cheapest_score[usize::from(index)] = cost;
+
+                            let h = heuristic(&neighbor, &node);
+                            // note that cheapest_scores does not include the heuristic
+                            // this is what final_scores does, priority queue does include
+                            // heuristic
+                            let priority = cost + h;
+                            // note this is literally unused, removing saves ~350 ms out of 11349
+                            // (note this is all of startup time)
+                            //self.final_scores.insert(neighbor.clone(), neighbor_cost);
+
+                            if self.cheapest_cost.map(|cc| h < cc).unwrap_or(true) {
+                                self.cheapest_node = Some(node.clone());
+                                self.cheapest_cost = Some(h);
+                            };
+
+                            // commenting out if here: 11349 ms -> 12498 ms (but may give better
+                            // paths?) (about 1 extra second or +10% time)
+                            // with single hashmap change this has much more impact:
+                            // 3473 ms -> 11981 ms
+
+                            // if we hadn't already visited, add this to potential nodes, what about
+                            // its neighbors, wouldn't they need to be revisited???
+                            if !previously_visited {
+                                self.potential_nodes.push(PathEntry {
+                                    priority,
+                                    //cost,
+                                    node: neighbor,
+                                });
+                            }
+                        }
+                    }
+                    //});
+                }
+            } else {
+                return PathResult::None(
+                    self.cheapest_node
+                        .clone()
+                        .map(|lc| self.reconstruct_path_to(lc, cluster))
+                        .unwrap_or_default(),
+                );
+            }
+
+            self.iter += 1
+        }
+
+        if self.iter >= self.max_iters {
+            PathResult::Exhausted(
+                self.cheapest_node
+                    .clone()
+                    .map(|lc| self.reconstruct_path_to(lc, cluster))
+                    .unwrap_or_default(),
+            )
+        } else {
+            PathResult::Pending
+        }
+    }
+
+    /// Returns the value currently stored in `cheapest_cost`: the smallest
+    /// heuristic value `poll` has recorded so far, or `None` if it has not
+    /// recorded any yet.
+    pub fn get_cheapest_cost(&self) -> Option<f32> {
+        let Self { cheapest_cost, .. } = self;
+        *cheapest_cost
+    }
+
+    // Walks the `came_from` links stored in `clusters` backwards from `end`
+    // and returns the visited nodes in forward order. The walk stops at a node
+    // that has no recorded predecessor or that is its own predecessor (the
+    // start node).
+    //
+    // At least in world site pathfinding this is super cheap compared to
+    // actually finding the path!
+    fn reconstruct_path_to(&mut self, end: S, cluster: impl Fn(&S) -> (S, u8)) -> Path<S> {
+        let mut reversed = vec![end.clone()];
+        let mut current = &end;
+        loop {
+            let (key, slot) = cluster(current);
+            let predecessor = self
+                .clusters
+                .get(&key)
+                .and_then(|c| c.came_from[usize::from(slot)].as_ref());
+            match predecessor {
+                Some(prev) if prev != current => {
+                    reversed.push(prev.clone());
+                    current = prev;
+                },
+                _ => break,
+            }
+        }
+        reversed.into_iter().rev().collect()
+    }
+}
--- a/common/src/lib.rs
+++ b/common/src/lib.rs
@ -38,6 +38,7 @@ pub mod uid;
 // NOTE: Comment out macro to get rustfmt to re-order these as needed.
 cfg_if! { if #[cfg(not(target_arch = "wasm32"))] {
    pub mod astar;
+    pub mod astar2;
    pub mod calendar;
    pub mod character;
    pub mod clock;
--- a/world/Cargo.toml
+++ b/world/Cargo.toml
@ -72,6 +72,9 @@ name = "tree"
 name = "chunk_compression_benchmarks"
 required-features = ["bin_compression"]

+[[example]]
+name = "world_generate_time"
+
 [[example]]
 name = "world_block_statistics"
 required-features = ["bin_compression"]
--- a/world/examples/world_generate_time.rs
+++ b/world/examples/world_generate_time.rs
@ -0,0 +1,23 @@
+use std::time::Instant;
+use veloren_world::{
+    sim::{FileOpts, WorldOpts, DEFAULT_WORLD_MAP},
+    World,
+};
+
+/// Generates the default world once and prints how long that took, in whole
+/// milliseconds.
+fn main() {
+    let threadpool = rayon::ThreadPoolBuilder::new().build().unwrap();
+
+    let timer = Instant::now();
+    let opts = WorldOpts {
+        seed_elements: true,
+        // Load default map from assets.
+        world_file: FileOpts::LoadAsset(DEFAULT_WORLD_MAP.into()),
+        calendar: None,
+    };
+    let generated = World::generate(0, opts, &threadpool);
+    // Route the result through `black_box` so it counts as used.
+    core::hint::black_box(generated);
+    let elapsed_ms = timer.elapsed().as_millis();
+    println!("{} ms", elapsed_ms);
+}
--- a/world/src/civ/mod.rs
+++ b/world/src/civ/mod.rs
@ -21,6 +21,7 @@ use common::{
    },
    vol::RectVolSize,
 };
+use common_base::prof_span;
 use core::{fmt, hash::BuildHasherDefault, ops::Range};
 use fxhash::FxHasher64;
 use rand::prelude::*;
@ -54,7 +55,17 @@ pub struct Civs {
    /// (3) we have 8-byte keys (for which FxHash is fastest).
    pub track_map: DHashMap<Id<Site>, DHashMap<Id<Site>, Id<Track>>>,

-    pub bridges: DHashMap<Vec2<i32>, (Vec2<i32>, Id<Site>)>,
+    // 8249 ms -> 7680 ms (change when switching to ahash)
+    // 7495 ms -> 8057 ms -> 7481 ms (ahash -> sip13 -> fxhasher)
+    // TODO: deterministic(?), this is certainly faster, presumably due to less collisions
+    pub bridges: hashbrown::HashMap<
+        Vec2<i32>,
+        (Vec2<i32>, Id<Site>),
+        //std::hash::BuildHasherDefault<siphasher::sip::SipHasher13>,
+        std::hash::BuildHasherDefault<fxhash::FxHasher64>,
+        //std::hash::BuildHasherDefault<fxhash::FxHasher>,
+        //std::hash::BuildHasherDefault<fxhash::FxHasher32>, // too many collisions!
+    >,

    pub sites: Store<Site>,
    pub caves: Store<CaveInfo>,
@ -160,7 +171,7 @@ impl<'a, R: Rng> GenCtx<'a, R> {

 impl Civs {
    pub fn generate(seed: u32, sim: &mut WorldSim, index: &mut Index) -> Self {
-        common_base::prof_span!("Civs::generate");
+        prof_span!("Civs::generate");
        let mut this = Self::default();
        let rng = ChaChaRng::from_seed(seed_expan::rng_state(seed));
        let name_rng = rng.clone();
@ -181,14 +192,18 @@ impl Civs {
        // this.generate_caves(&mut ctx);

        info!("starting civilisation creation");
+        prof_span!(guard, "create civs");
        for _ in 0..initial_civ_count {
+            prof_span!("create civ");
            debug!("Creating civilisation...");
            if this.birth_civ(&mut ctx.reseed()).is_none() {
                warn!("Failed to find starting site for civilisation.");
            }
        }
+        drop(guard);
        info!(?initial_civ_count, "all civilisations created");

+        prof_span!(guard, "find locations and establish sites");
        for _ in 0..initial_civ_count * 3 {
            attempt(5, || {
                let (loc, kind) = match ctx.rng.gen_range(0..64) {
@ -260,6 +275,7 @@ impl Civs {
                }))
            });
        }
+        drop(guard);

        // Tick
        //=== old economy is gone
@ -511,6 +527,8 @@ impl Civs {
                }
            }
        }
+
+        dbg!(CC.load(Ordering::Relaxed));
    }

    // TODO: Move this
@ -730,7 +748,7 @@ impl Civs {

    /// Adds lake POIs and names them
    fn name_biomes(&mut self, ctx: &mut GenCtx<impl Rng>) {
-        common_base::prof_span!("name_biomes");
+        prof_span!("name_biomes");
        let map_size_lg = ctx.sim.map_size_lg();
        let world_size = map_size_lg.chunks();
        let mut biomes: Vec<(common::terrain::BiomeKind, Vec<usize>)> = Vec::new();
@ -769,7 +787,7 @@ impl Civs {
            biomes.push((biome, filled));
        }

-        common_base::prof_span!("after flood fill");
+        prof_span!("after flood fill");
        let mut biome_count = 0;
        for biome in biomes {
            let name = match biome.0 {
@ -1013,7 +1031,7 @@ impl Civs {

    /// Adds mountain POIs and name them
    fn name_peaks(&mut self, ctx: &mut GenCtx<impl Rng>) {
-        common_base::prof_span!("name_peaks");
+        prof_span!("name_peaks");
        let map_size_lg = ctx.sim.map_size_lg();
        const MIN_MOUNTAIN_ALT: f32 = 600.0;
        const MIN_MOUNTAIN_CHAOS: f32 = 0.35;
@ -1093,6 +1111,7 @@ impl Civs {
        loc: Vec2<i32>,
        site_fn: impl FnOnce(Id<Place>) -> Site,
    ) -> Id<Site> {
+        prof_span!("establish_site");
        const SITE_AREA: Range<usize> = 1..4; //64..256;

        fn establish_site(
@ -1101,6 +1120,7 @@ impl Civs {
            loc: Vec2<i32>,
            site_fn: impl FnOnce(Id<Place>) -> Site,
        ) -> Id<Site> {
+            prof_span!("establish site inner");
            let place = match ctx.sim.get(loc).and_then(|site| site.place) {
                Some(place) => place,
                None => civs.establish_place(ctx, loc, SITE_AREA),
@ -1112,6 +1132,7 @@ impl Civs {
        let site = establish_site(self, ctx, loc, site_fn);

        // Find neighbors
+        prof_span!(guard, "find neighbors");
        const MAX_NEIGHBOR_DISTANCE: f32 = 2000.0;
        let mut nearby = self
            .sites
@ -1131,6 +1152,7 @@ impl Civs {
            .filter(|(_, dist)| *dist < MAX_NEIGHBOR_DISTANCE)
            .collect::<Vec<_>>();
        nearby.sort_by_key(|(_, dist)| *dist as i32);
+        drop(guard);

        if let SiteKind::Refactor
        | SiteKind::Settlement
@ -1140,13 +1162,24 @@ impl Civs {
        | SiteKind::Castle = self.sites[site].kind
        {
            for (nearby, _) in nearby.into_iter().take(5) {
+                prof_span!("for nearby");
                // Find a novel path
-                if let Some((path, cost)) = find_path(
-                    ctx,
-                    |start| self.bridges.get(&start).map(|(end, _)| *end),
-                    loc,
-                    self.sites.get(nearby).center,
-                ) {
+                let maybe_path = {
+                    prof_span!("find path");
+                    find_path(
+                        ctx,
+                        |start| self.bridges.get(&start).map(|(end, _)| *end),
+                        loc,
+                        self.sites.get(nearby).center,
+                    )
+                };
+                if maybe_path.is_some() {
+                    info!("Succeed");
+                } else {
+                    info!("Fail");
+                }
+                if let Some((path, cost)) = maybe_path {
+                    prof_span!("with path");
                    // Find a path using existing paths
                    if self
                        .route_between(site, nearby)
@ -1180,6 +1213,7 @@ impl Civs {
                                    1 << (i as u8);
                                randomize_offset = true;
                            } else if !self.bridges.contains_key(&locs[1]) {
+                                //dbg!("here"); called 18 times
                                let center = (locs[1] + locs[2]) / 2;
                                let id =
                                    establish_site(self, &mut ctx.reseed(), center, move |place| {
@ -1305,45 +1339,144 @@ fn find_path(
 ) -> Option<(Path<Vec2<i32>>, f32)> {
    const MAX_PATH_ITERS: usize = 100_000;
    let sim = &ctx.sim;
+    // NOTE: If heuristic overestimates the actual cost, then A* is not guaranteed
+    // to produce the least-cost path (since it will explore partially based on
+    // the heuristic). TODO: heuristic can be larger than actual cost, since
+    // diagonals can only cost `1.0` if a path exists and since bridges have
+    // zero cost (and cover multiple tiles).
    let heuristic = move |l: &Vec2<i32>, _: &Vec2<i32>| (l.distance_squared(b) as f32).sqrt();
-    let get_bridge = &get_bridge;
    let neighbors = |l: &Vec2<i32>| {
        let l = *l;
+        let bridge = get_bridge(l);
+        /*
        NEIGHBORS
            .iter()
-            .filter_map(move |dir| walk_in_dir(sim, get_bridge, l, *dir))
-            .map(move |(p, _)| p)
+            .filter_map(move |dir| walk_in_dir(sim, bridge, l, *dir))
+         */
+        /*
+         */
+        // Using walk_in_all_dirs saves ~500 ms
+        let potential = walk_in_all_dirs(sim, bridge, l);
+        potential.into_iter().filter_map(|p| p)
    };
+    // transition cost?
    let transition = |a: &Vec2<i32>, b: &Vec2<i32>| {
-        1.0 + walk_in_dir(sim, get_bridge, *a, (*b - *a).map(|e| e.signum()))
+        // factoring this out: 7463 ms -> 7356 ms
+        let bridge = get_bridge(*a);
+        1.0 + walk_in_dir(sim, bridge, *a, (*b - *a).map(|e| e.signum()))
            .map_or(10000.0, |(_, cost)| cost)
    };
    let satisfied = |l: &Vec2<i32>| *l == b;
+    // Maps a position to (base of its 16x16 cluster, offset inside that cluster).
+    // `div_euclid` and `rem_euclid` are used as a pair so that the offset stays
+    // within 0..256 for negative coordinates as well; `%` would yield a negative
+    // remainder there, which wraps around in the cast to `u8`.
+    let cluster = |l: &Vec2<i32>| {
+        let bx = l.x.div_euclid(16);
+        let by = l.y.div_euclid(16);
+        let x = l.x.rem_euclid(16);
+        let y = l.y.rem_euclid(16);
+        (Vec2::new(bx, by), (x + y * 16) as u8)
+    };
    // We use this hasher (FxHasher64) because
    // (1) we don't care about DDOS attacks (ruling out SipHash);
    // (2) we care about determinism across computers (ruling out AAHash);
    // (3) we have 8-byte keys (for which FxHash is fastest).
-    let mut astar = Astar::new(
+    let mut astar = common::astar2::Astar::new(
        MAX_PATH_ITERS,
        a,
        BuildHasherDefault::<FxHasher64>::default(),
    );
    astar
-        .poll(MAX_PATH_ITERS, heuristic, neighbors, transition, satisfied)
+        .poll(
+            MAX_PATH_ITERS,
+            heuristic,
+            neighbors,
+            transition,
+            satisfied,
+            cluster,
+        )
        .into_path()
        .and_then(|path| astar.get_cheapest_cost().map(|cost| (path, cost)))
 }

+use core::sync::atomic::{AtomicUsize, Ordering};
+static CC: AtomicUsize = AtomicUsize::new(0);
+
+/// Batched form of `walk_in_dir`: evaluates every direction in `NEIGHBORS`
+/// around `a` in one call.
+///
+/// Entry `i` of the result belongs to `NEIGHBORS[i]` and is
+/// `Some((destination, cost))` when travel in that direction is permitted,
+/// `None` otherwise. `bridge` is the far end of a bridge starting at `a`, if
+/// there is one.
+///
+/// Every entry is `None` when `sim.get(a)` yields nothing.
+fn walk_in_all_dirs(
+    sim: &WorldSim,
+    bridge: Option<Vec2<i32>>,
+    a: Vec2<i32>,
+) -> [Option<(Vec2<i32>, f32)>; 8] {
+    let mut potential = [None; 8];
+
+    let Some(a_chunk) = sim.get(a) else { return potential };
+
+    for (slot, dir) in potential.iter_mut().zip(NEIGHBORS.iter().copied()) {
+        let target = a + dir;
+
+        // Plain step onto the adjacent chunk, if it is walkable.
+        let walk = loc_suitable_for_walking(sim, target)
+            .then(|| sim.get(target))
+            .flatten()
+            .map(|b_chunk| {
+                let hill_cost = ((b_chunk.alt - a_chunk.alt).abs() / 5.0).powi(2);
+                // Try not to path swamps / tidal areas
+                let water_cost =
+                    (b_chunk.water_alt - b_chunk.alt + 8.0).clamped(0.0, 8.0) * 3.0;
+                let wild_cost = if b_chunk.path.0.is_way() {
+                    0.0 // Traversing existing paths has no additional cost!
+                } else {
+                    3.0 // + (1.0 - b_chunk.tree_density) * 20.0 // Prefer going through forests, for aesthetics
+                };
+
+                (target, 1.0 + hill_cost + water_cost + wild_cost)
+            });
+
+        // Otherwise look for a potential bridge spot, but only in the cardinal
+        // directions. The direction itself is tested (the same condition
+        // `walk_in_dir` uses) instead of relying on where the cardinal
+        // directions happen to sit inside `NEIGHBORS`.
+        *slot = walk.or_else(|| {
+            (dir.x == 0 || dir.y == 0)
+                .then(|| {
+                    // if we can skip over unsuitable area with a bridge
+                    (4..=5).find_map(|dist| {
+                        let far = a + dir * dist;
+                        loc_suitable_for_walking(sim, far)
+                            .then(|| (far, 120.0 + (dist - 4) as f32 * 10.0))
+                    })
+                })
+                .flatten()
+        });
+    }
+
+    // If current position is a bridge, skip to its destination: this replaces
+    // whatever was computed for the direction the bridge points in.
+    if let Some(p) = bridge {
+        let dir = (p - a).map(|e| e.signum());
+        if let Some(dir_index) = NEIGHBORS.iter().position(|n_dir| *n_dir == dir) {
+            potential[dir_index] = Some((p, 0.0));
+        }
+    }
+
+    potential
+}
+
 /// Return Some if travel between a location and a chunk next to it is permitted
 /// If permitted, the approximate relative const of traversal is given
 // (TODO: by whom?)
+//
+// Return tuple: (final location, cost)
 fn walk_in_dir(
    sim: &WorldSim,
-    get_bridge: impl Fn(Vec2<i32>) -> Option<Vec2<i32>>,
+    // Is there a bridge at `a`?
+    bridge: Option<Vec2<i32>>,
    a: Vec2<i32>,
    dir: Vec2<i32>,
 ) -> Option<(Vec2<i32>, f32)> {
-    if let Some(p) = get_bridge(a).filter(|p| (p - a).map(|e| e.signum()) == dir) {
+    //CC.fetch_add(1, Ordering::Relaxed);
+    if let Some(p) = bridge.filter(|p| (p - a).map(|e| e.signum()) == dir) {
        // Traversing an existing bridge has no cost.
        Some((p, 0.0))
    } else if loc_suitable_for_walking(sim, a + dir) {
@ -1360,6 +1493,7 @@ fn walk_in_dir(
        };
        Some((a + dir, 1.0 + hill_cost + water_cost + wild_cost))
    } else if dir.x == 0 || dir.y == 0 {
+        // if we can skip over unsuitable area with a bridge
        (4..=5).find_map(|i| {
            loc_suitable_for_walking(sim, a + dir * i)
                .then(|| (a + dir * i, 120.0 + (i - 4) as f32 * 10.0))
@ -1372,10 +1506,14 @@ fn walk_in_dir(
 /// Return true if a position is suitable for walking on
 fn loc_suitable_for_walking(sim: &WorldSim, loc: Vec2<i32>) -> bool {
    if sim.get(loc).is_some() {
-        !NEIGHBORS.iter().any(|n| {
-            sim.get(loc + *n)
-                .map_or(false, |chunk| chunk.river.near_water())
-        })
+        // 7181 ms -> 6868 ms (300 ms! almost 10% of pathfinding time)
+        // NOTE: that timing was taken with `&` in the fold, which made it always `false`
+        // (so every in-bounds location counted as walkable); needs re-measuring.
+        // Branch-free OR over all neighbours: same result as the previous `any`, minus the
+        // short-circuiting.
+        !NEIGHBORS
+            .iter()
+            .map(|n| {
+                sim.get(loc + *n)
+                    .map_or(false, |chunk| chunk.river.near_water())
+            })
+            .fold(false, |acc, near_water| acc | near_water)
    } else {
        false
    }
@ -1411,6 +1549,7 @@ fn find_site_loc(
    proximity_reqs: &ProximityRequirements,
    site_kind: SiteKind,
 ) -> Option<Vec2<i32>> {
+    prof_span!("find_site_loc");
    const MAX_ATTEMPTS: usize = 10000;
    let mut loc = None;
    for _ in 0..MAX_ATTEMPTS {
--- a/world/src/sim/erosion.rs
+++ b/world/src/sim/erosion.rs
@ -124,6 +124,7 @@ pub enum RiverKind {
 impl RiverKind {
    pub fn is_ocean(&self) -> bool { matches!(*self, RiverKind::Ocean) }

+    #[inline(always)] // saves ~100 ms on current `world_generate_time`
    pub fn is_river(&self) -> bool { matches!(*self, RiverKind::River { .. }) }

    pub fn is_lake(&self) -> bool { matches!(*self, RiverKind::Lake { .. }) }
@ -212,7 +213,11 @@ impl RiverData {

    pub fn near_river(&self) -> bool { self.is_river() || !self.neighbor_rivers.is_empty() }

-    pub fn near_water(&self) -> bool { self.near_river() || self.is_lake() || self.is_ocean() }
+    pub fn near_water(&self) -> bool {
+        // Any river kind, or any neighbouring river, counts. 7408 ms -> 7270 ms (only 50 ms difference now)
+        self.river_kind.is_some() || !self.neighbor_rivers.is_empty()
+        // was: self.near_river() || self.is_lake() || self.is_ocean() -- NOTE(review): same result only if RiverKind has just those three kinds; confirm
+    }
 }

 /// Draw rivers and assign them heights, widths, and velocities.  Take some