blob: c54de55a211e21a5d7368b2ba787be7500e66393 [file] [log] [blame]
//! Kernel density estimation
pub mod kernel;
use self::kernel::Kernel;
use crate::stats::float::Float;
use crate::stats::univariate::Sample;
#[cfg(feature = "rayon")]
use rayon::prelude::*;
/// Univariate kernel density estimator
pub struct Kde<'a, A, K>
where
A: Float,
K: Kernel<A>,
{
bandwidth: A,
kernel: K,
sample: &'a Sample<A>,
}
impl<'a, A, K> Kde<'a, A, K>
where
A: 'a + Float,
K: Kernel<A>,
{
/// Creates a new kernel density estimator from the `sample`, using a kernel and estimating
/// the bandwidth using the method `bw`
pub fn new(sample: &'a Sample<A>, kernel: K, bw: Bandwidth) -> Kde<'a, A, K> {
Kde {
bandwidth: bw.estimate(sample),
kernel,
sample,
}
}
/// Returns the bandwidth used by the estimator
pub fn bandwidth(&self) -> A {
self.bandwidth
}
/// Maps the KDE over `xs`
///
/// - Multihreaded
pub fn map(&self, xs: &[A]) -> Box<[A]> {
#[cfg(feature = "rayon")]
let iter = xs.par_iter();
#[cfg(not(feature = "rayon"))]
let iter = xs.iter();
iter.map(|&x| self.estimate(x))
.collect::<Vec<_>>()
.into_boxed_slice()
}
/// Estimates the probability density of `x`
pub fn estimate(&self, x: A) -> A {
let _0 = A::cast(0);
let slice = self.sample;
let h = self.bandwidth;
let n = A::cast(slice.len());
let sum = slice
.iter()
.fold(_0, |acc, &x_i| acc + self.kernel.evaluate((x - x_i) / h));
sum / (h * n)
}
}
/// Method to estimate the bandwidth
pub enum Bandwidth {
/// Use Silverman's rule of thumb to estimate the bandwidth from the sample
Silverman,
}
impl Bandwidth {
fn estimate<A: Float>(self, sample: &Sample<A>) -> A {
match self {
Bandwidth::Silverman => {
let factor = A::cast(4. / 3.);
let exponent = A::cast(1. / 5.);
let n = A::cast(sample.len());
let sigma = sample.std_dev(None);
sigma * (factor / n).powf(exponent)
}
}
}
}
#[cfg(test)]
macro_rules! test {
($ty:ident) => {
mod $ty {
use approx::relative_eq;
use quickcheck::quickcheck;
use quickcheck::TestResult;
use crate::stats::univariate::kde::kernel::Gaussian;
use crate::stats::univariate::kde::{Bandwidth, Kde};
use crate::stats::univariate::Sample;
// The [-inf inf] integral of the estimated PDF should be one
quickcheck! {
fn integral(size: u8, start: u8) -> TestResult {
let size = size as usize;
let start = start as usize;
const DX: $ty = 1e-3;
if let Some(v) = crate::stats::test::vec::<$ty>(size, start) {
let slice = &v[start..];
let data = Sample::new(slice);
let kde = Kde::new(data, Gaussian, Bandwidth::Silverman);
let h = kde.bandwidth();
// NB Obviously a [-inf inf] integral is not feasible, but this range works
// quite well
let (a, b) = (data.min() - 5. * h, data.max() + 5. * h);
let mut acc = 0.;
let mut x = a;
let mut y = kde.estimate(a);
while x < b {
acc += DX * y / 2.;
x += DX;
y = kde.estimate(x);
acc += DX * y / 2.;
}
TestResult::from_bool(relative_eq!(acc, 1., epsilon = 2e-5))
} else {
TestResult::discard()
}
}
}
}
};
}
#[cfg(test)]
mod test {
test!(f32);
test!(f64);
}