brunocodutra
diff --git a/‎Cargo.lock
Lines changed: 11 additions & 0 deletions b/‎Cargo.lock
Lines changed: 11 additions & 0 deletions
diff --git a/‎Cargo.toml
Lines changed: 1 addition & 1 deletion b/‎Cargo.toml
Lines changed: 1 addition & 1 deletion
diff --git a/‎benches/search.rs
Lines changed: 46 additions & 67 deletions b/‎benches/search.rs
Lines changed: 46 additions & 67 deletions
diff --git a/‎lib/search.rs
Lines changed: 2 additions & 0 deletions b/‎lib/search.rs
Lines changed: 2 additions & 0 deletions
diff --git a/‎lib/search/driver.rs
Lines changed: 134 additions & 0 deletions b/‎lib/search/driver.rs
Lines changed: 134 additions & 0 deletions
@@ -38,6 +38,7 @@ ruzstd = { version = "0.7.0", default-features = false, features = ["std"] }
 
 [dev-dependencies]
 criterion = { version = "0.5.1", default-features = false, features = ["rayon"] }
+criterion-macro = { version = "0.4.0", default-features = false }
 proptest = { version = "1.4.0", default-features = false, features = ["std"] }
 test-strategy = { version = "0.3.1", default-features = false }
 
@@ -66,4 +67,3 @@ bench = false
 
 [[bench]]
 name = "search"
-harness = false
@@ -1,81 +1,60 @@
-use criterion::{criterion_group, criterion_main, Criterion, SamplingMode, Throughput};
-use lib::search::{Depth, Engine, Limits, Options, ThreadCount};
+#![feature(custom_test_frameworks)]
+#![test_runner(criterion::runner)]
+
+use criterion::{Criterion, SamplingMode, Throughput};
+use criterion_macro::criterion;
+use lib::search::{Depth, Engine, Limits, Options};
 use lib::{nnue::Evaluator, util::Integer};
-use std::thread::available_parallelism;
 use std::time::{Duration, Instant};
+use std::{str::FromStr, thread::available_parallelism};
+
+// The single position every benchmark in this file searches from, parsed from its FEN string.
+// NOTE(review): `#[ctor::ctor]` presumably initializes the static before the benchmarks
+// start running — confirm against the `ctor` crate documentation.
+#[ctor::ctor]
+static POSITION: Evaluator =
+    Evaluator::from_str("6br/1KNp1n1r/2p2p2/P1ppRP2/1kP3pP/3PBB2/PN1P4/8 w - - 0 1").unwrap();
 
-fn bencher(reps: u64, positions: &[Evaluator], options: Options, limits: Limits) -> Duration {
+/// Runs `reps` searches of [`POSITION`] under `limits` and returns the total search time.
+///
+/// Every repetition builds a fresh [`Engine`] from `options`; the timer is started only
+/// after the engine exists, so engine construction is excluded from the measurement.
+fn bench(reps: u64, options: Options, limits: Limits) -> Duration {
     let mut time = Duration::ZERO;
 
-    for pos in positions {
-        for _ in 0..reps {
-            let mut e = Engine::with_options(options);
-            let timer = Instant::now();
-            e.search(pos, limits);
-            time += timer.elapsed();
-        }
+    for _ in 0..reps {
+        let mut e = Engine::with_options(options);
+        let timer = Instant::now();
+        e.search(&POSITION, limits);
+        time += timer.elapsed();
     }
 
     time
 }
 
-fn bench(c: &mut Criterion) {
-    let positions: Vec<Evaluator> = FENS.iter().map(|p| p.parse().unwrap()).collect();
-    let options = match available_parallelism() {
-        Err(_) => Options::default(),
-        Ok(cores) => match cores.get() / 2 {
-            0 => Options::default(),
-            threads => Options {
-                threads: ThreadCount::new(threads),
-                ..Options::default()
-            },
-        },
+// Criterion entry point: registers one benchmark per thread count in the "ttd" group
+// (search limited by a fixed depth) and in the "nps" group (search limited by a fixed
+// node budget, reported as element throughput).
+#[criterion]
+fn crit(c: &mut Criterion) {
+    let thread_limit = match available_parallelism() {
+        Ok(cores) => cores.get().div_ceil(2),
+        Err(_) => 1,
     };
 
-    let depth = Depth::new(7);
-    c.benchmark_group("search")
-        .sampling_mode(SamplingMode::Flat)
-        .bench_function("ttd", |b| {
-            b.iter_custom(|i| bencher(i, &positions, options, depth.into()))
-        });
+    // One `Options` per power of two (1, 2, 4, ...) that does not exceed `thread_limit`,
+    // i.e. half of the available parallelism rounded up, or 1 when it cannot be queried.
+    let options = Vec::from_iter((0..=thread_limit.ilog2()).map(|threads| Options {
+        threads: 2usize.pow(threads).saturate(),
+        ..Options::default()
+    }));
+
+    // Depth-limited searches; the sample size grows linearly with the thread count.
+    for &o in &options {
+        let depth = Depth::new(14);
+        c.benchmark_group("ttd")
+            .sampling_mode(SamplingMode::Flat)
+            .sample_size(10 * o.threads.get())
+            .bench_function(o.threads.to_string(), |b| {
+                b.iter_custom(|i| bench(i, o, depth.into()))
+            });
+    }
 
-    let nodes = 10000;
-    c.benchmark_group("search")
-        .sampling_mode(SamplingMode::Flat)
-        .throughput(Throughput::Elements(nodes * positions.len() as u64))
-        .bench_function("nps", |b| {
-            b.iter_custom(|i| bencher(i, &positions, options, nodes.into()))
-        });
+    // Node-limited searches; the node budget doubles as the throughput element count.
+    for &o in &options {
+        let nodes = 500_000;
+        c.benchmark_group("nps")
+            .sampling_mode(SamplingMode::Flat)
+            .sample_size(10 * o.threads.get())
+            .throughput(Throughput::Elements(nodes))
+            .bench_function(o.threads.to_string(), |b| {
+                b.iter_custom(|i| bench(i, o, nodes.into()))
+            });
+    }
 }
-
-criterion_group!(benches, bench);
-criterion_main!(benches);
-
-// https://www.chessprogramming.org/CCR_One_Hour_Test
-const FENS: &[&str] = &[
-    "rn1qkb1r/pp2pppp/5n2/3p1b2/3P4/2N1P3/PP3PPP/R1BQKBNR w KQkq - 0 1",
-    "rn1qkb1r/pp2pppp/5n2/3p1b2/3P4/1QN1P3/PP3PPP/R1B1KBNR b KQkq - 1 1",
-    "r1bqk2r/ppp2ppp/2n5/4P3/2Bp2n1/5N1P/PP1N1PP1/R2Q1RK1 b kq - 1 10",
-    "r1bqrnk1/pp2bp1p/2p2np1/3p2B1/3P4/2NBPN2/PPQ2PPP/1R3RK1 w - - 1 12",
-    "rnbqkb1r/ppp1pppp/5n2/8/3PP3/2N5/PP3PPP/R1BQKBNR b KQkq - 3 5",
-    "rnbq1rk1/pppp1ppp/4pn2/8/1bPP4/P1N5/1PQ1PPPP/R1B1KBNR b KQ - 1 5",
-    "r4rk1/3nppbp/bq1p1np1/2pP4/8/2N2NPP/PP2PPB1/R1BQR1K1 b - - 1 12",
-    "rn1qkb1r/pb1p1ppp/1p2pn2/2p5/2PP4/5NP1/PP2PPBP/RNBQK2R w KQkq c6 1 6",
-    "r1bq1rk1/1pp2pbp/p1np1np1/3Pp3/2P1P3/2N1BP2/PP4PP/R1NQKB1R b KQ - 1 9",
-    "rnbqr1k1/1p3pbp/p2p1np1/2pP4/4P3/2N5/PP1NBPPP/R1BQ1RK1 w - - 1 11",
-    "rnbqkb1r/pppp1ppp/5n2/4p3/4PP2/2N5/PPPP2PP/R1BQKBNR b KQkq f3 1 3",
-    "r1bqk1nr/pppnbppp/3p4/8/2BNP3/8/PPP2PPP/RNBQK2R w KQkq - 2 6",
-    "rnbq1b1r/ppp2kpp/3p1n2/8/3PP3/8/PPP2PPP/RNBQKB1R b KQ d3 1 5",
-    "rnbqkb1r/pppp1ppp/3n4/8/2BQ4/5N2/PPP2PPP/RNB2RK1 b kq - 1 6",
-    "r2q1rk1/2p1bppp/p2p1n2/1p2P3/4P1b1/1nP1BN2/PP3PPP/RN1QR1K1 w - - 1 12",
-    "r1bqkb1r/2pp1ppp/p1n5/1p2p3/3Pn3/1B3N2/PPP2PPP/RNBQ1RK1 b kq - 2 7",
-    "r2qkbnr/2p2pp1/p1pp4/4p2p/4P1b1/5N1P/PPPP1PP1/RNBQ1RK1 w kq - 1 8",
-    "r1bqkb1r/pp3ppp/2np1n2/4p1B1/3NP3/2N5/PPP2PPP/R2QKB1R w KQkq e6 1 7",
-    "rn1qk2r/1b2bppp/p2ppn2/1p6/3NP3/1BN5/PPP2PPP/R1BQR1K1 w kq - 5 10",
-    "r1b1kb1r/1pqpnppp/p1n1p3/8/3NP3/2N1B3/PPP1BPPP/R2QK2R w KQkq - 3 8",
-    "r1bqnr2/pp1ppkbp/4N1p1/n3P3/8/2N1B3/PPP2PPP/R2QK2R b KQ - 2 11",
-    "r3kb1r/pp1n1ppp/1q2p3/n2p4/3P1Bb1/2PB1N2/PPQ2PPP/RN2K2R w KQkq - 3 11",
-    "r1bq1rk1/pppnnppp/4p3/3pP3/1b1P4/2NB3N/PPP2PPP/R1BQK2R w KQ - 3 7",
-    "r2qkbnr/ppp1pp1p/3p2p1/3Pn3/4P1b1/2N2N2/PPP2PPP/R1BQKB1R w KQkq - 2 6",
-    "rn2kb1r/pp2pppp/1qP2n2/8/6b1/1Q6/PP1PPPBP/RNB1K1NR b KQkq - 1 6",
-];
@@ -1,4 +1,5 @@
 mod depth;
+mod driver;
 mod engine;
 mod killers;
 mod limits;
@@ -9,6 +10,7 @@ mod score;
 mod transposition;
 
 pub use depth::*;
+pub use driver::*;
 pub use engine::*;
 pub use killers::*;
 pub use limits::*;
 
@@ -0,0 +1,134 @@
+use crate::search::{Pv, ThreadCount};
+use crate::util::{Binary, Bits, Integer};
+use derive_more::{Deref, Display, Error, From};
+use rayon::{prelude::*, ThreadPool, ThreadPoolBuilder};
+use std::sync::atomic::{AtomicU64, Ordering};
+
+/// Indicates the search was interrupted upon reaching the configured [`crate::search::Limits`].
+///
+/// A unit error type: it carries no payload and displays as "the search was interrupted".
+#[derive(Debug, Display, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Error)]
+#[display("the search was interrupted")]
+pub struct Interrupted;
+
+/// Whether the search should be [`Interrupted`] or exited early.
+///
+/// This is the error type of the closure passed to [`Driver::drive`].
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, From)]
+pub enum ControlFlow {
+    /// Abort the search altogether; [`Driver::drive`] then fails with [`Interrupted`].
+    Interrupt(Interrupted),
+    /// Stop visiting further moves; [`Driver::drive`] still returns the best [`Pv`] found.
+    Break,
+}
+
+/// A parallel search driver.
+///
+/// Built by [`Driver::new`], which picks the variant from the requested thread count.
+#[derive(Debug)]
+pub enum Driver {
+    /// Distributes the work over the contained rayon [`ThreadPool`].
+    Parallel(ThreadPool),
+    /// Processes the moves one after another in a plain loop, without a thread pool.
+    Sequential,
+}
+
+impl Driver {
+    /// Builds a search driver for the requested [`ThreadCount`].
+    ///
+    /// A count of one yields [`Driver::Sequential`]; any other count yields
+    /// [`Driver::Parallel`] backed by a newly built thread pool with that many threads.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the thread pool cannot be built.
+    pub fn new(threads: ThreadCount) -> Self {
+        let count = threads.get();
+        if count == 1 {
+            return Self::Sequential;
+        }
+
+        let pool = ThreadPoolBuilder::new().num_threads(count).build();
+        Self::Parallel(pool.unwrap())
+    }
+
+    /// Evaluates `f` on the elements of `moves` and returns the best [`Pv`] found,
+    /// starting from `best`.
+    ///
+    /// `f` is handed the best [`Pv`] known when it is called, along with the move to search.
+    /// Returning [`ControlFlow::Break`] ends the iteration early while keeping the best
+    /// [`Pv`] found so far; returning [`ControlFlow::Interrupt`] fails the whole call.
+    ///
+    /// The order in which elements are processed and on which thread is unspecified.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`Interrupted`] if `f` asks for the search to be interrupted.
+    #[inline(always)]
+    pub fn drive<M, F>(&self, mut best: Pv, moves: &[M], f: F) -> Result<Pv, Interrupted>
+    where
+        M: Sync,
+        F: Fn(&Pv, &M) -> Result<Pv, ControlFlow> + Sync,
+    {
+        match self {
+            Self::Sequential => {
+                // Walk the moves back to front, keeping the larger of the running best
+                // and each freshly computed pv.
+                for m in moves.iter().rev() {
+                    match f(&best, m) {
+                        Ok(pv) => best = pv.max(best),
+                        Err(ControlFlow::Break) => break,
+                        Err(ControlFlow::Interrupt(e)) => return Err(e),
+                    }
+                }
+
+                Ok(best)
+            }
+
+            Self::Parallel(pool) => pool.install(|| {
+                // The running best is shared between workers as a single packed `u64`;
+                // the initial pv is tagged with the largest possible index.
+                let shared = AtomicU64::new(IndexedPv(best, u32::MAX).encode().get());
+
+                let outcome = moves.par_iter().enumerate().rev().try_for_each(|(idx, m)| {
+                    let snapshot = IndexedPv::decode(Bits::new(shared.load(Ordering::Relaxed)));
+                    let pv = f(&snapshot, m)?;
+                    let candidate = IndexedPv(pv, idx.saturate()).encode().get();
+                    shared.fetch_max(candidate, Ordering::Relaxed);
+                    Ok(())
+                });
+
+                match outcome {
+                    Ok(()) | Err(ControlFlow::Break) => {
+                        Ok(*IndexedPv::decode(Bits::new(shared.into_inner())))
+                    }
+                    Err(ControlFlow::Interrupt(_)) => Err(Interrupted),
+                }
+            }),
+        }
+    }
+}
+
+/// A [`Pv`] tagged with a `u32` index: the position of the move that produced it, or
+/// `u32::MAX` for the initial pv handed to the driver.
+///
+/// The derived ordering compares the [`Pv`] first and the index second.
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Deref)]
+#[cfg_attr(test, derive(test_strategy::Arbitrary))]
+struct IndexedPv(#[deref] Pv, u32);
+
+impl Binary for IndexedPv {
+    type Bits = Bits<u64, 64>;
+
+    /// Packs the score, the index and the best move, in that order, into 64 bits.
+    // NOTE(review): the parallel driver compares encoded values with `fetch_max`, so the
+    // field pushed first presumably ends up in the most significant bits — confirm
+    // against `Bits::push`.
+    #[inline(always)]
+    fn encode(&self) -> Self::Bits {
+        let mut bits = Bits::default();
+        bits.push(self.score().encode());
+        bits.push(Bits::<u32, 32>::new(self.1));
+        bits.push(self.best().encode());
+        bits
+    }
+
+    /// Unpacks the fields in the reverse of the order in which `encode` pushed them.
+    #[inline(always)]
+    fn decode(mut bits: Self::Bits) -> Self {
+        let best = Binary::decode(bits.pop());
+        let idx = bits.pop::<u32, 32>().get();
+        let score = Binary::decode(bits.pop());
+        Self(Pv::new(score, best), idx)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::{chess::Move, nnue::Value};
+    use std::cmp::max;
+    use test_strategy::proptest;
+
+    /// Encoding followed by decoding gives back the original value.
+    #[proptest]
+    fn decoding_encoded_indexed_pv_is_an_identity(pv: IndexedPv) {
+        let roundtrip = IndexedPv::decode(pv.encode());
+        assert_eq!(roundtrip, pv);
+    }
+
+    /// With equal indices, the ordering follows the ordering of the wrapped pvs.
+    #[proptest]
+    fn indexed_pv_with_higher_score_is_larger(a: Pv, b: Pv, i: u32) {
+        let (lhs, rhs) = (IndexedPv(a, i), IndexedPv(b, i));
+        assert_eq!(a < b, lhs < rhs);
+    }
+
+    /// With equal pvs, the ordering follows the ordering of the indices.
+    #[proptest]
+    fn indexed_pv_with_same_score_but_higher_index_is_larger(pv: Pv, a: u32, b: u32) {
+        let (lhs, rhs) = (IndexedPv(pv, a), IndexedPv(pv, b));
+        assert_eq!(a < b, lhs < rhs);
+    }
+
+    /// The driver returns the pv of the largest indexed pv, whatever the thread count.
+    #[proptest]
+    fn driver_finds_max_indexed_pv(c: ThreadCount, pv: Pv, ms: Vec<(Move, Value)>) {
+        let driver = Driver::new(c);
+        let actual = driver.drive(pv, &ms, |_, &(m, v)| Ok(Pv::new(v.saturate(), Some(m))));
+
+        // Reference result: the maximum over the initial pv and every indexed candidate.
+        let mut expected = IndexedPv(pv, u32::MAX);
+        for (i, (m, v)) in ms.into_iter().enumerate() {
+            expected = max(expected, IndexedPv(Pv::new(v.saturate(), Some(m)), i as _));
+        }
+
+        assert_eq!(actual, Ok(*expected));
+    }
+}