Compare commits

...

2 commits
v0.1.0 ... main

Author SHA1 Message Date
Pascal Engélibert 7c73d32ffa v0.3.0 2024-03-15 20:15:21 +01:00
Pascal Engélibert 7db8e9c3ab
feat: generic collection 2023-04-14 12:39:11 +02:00
5 changed files with 151 additions and 56 deletions

View file

@ -1,21 +1,34 @@
[package]
name = "median-accumulator"
version = "0.1.0"
version = "0.4.0"
edition = "2021"
authors = ["tuxmain <tuxmain@zettascript.org>"]
license = "AGPL-3.0-only"
repository = "https://git.txmn.tk/tuxmain/median-accumulator"
documentation = "https://docs.rs/median-accumulator/"
description = "Simple, fast, space-efficient accumulator for computing median"
categories = ["algorithms"]
description = "Simple, fast, space-efficient, generic accumulator for computing median"
categories = ["algorithms", "data-structures", "no-std"]
keywords = ["median"]
[dependencies]
# `default-features` (hyphen) is Cargo's canonical key; the underscore spelling
# `default_features` is deprecated and rejected by newer editions.
cc-traits = { version = "2.0.0", default-features = false }
# Optional: only pulled in when the `smallvec` feature is enabled.
smallvec = { version = "^1.6", optional = true }
[features]
std = ["cc-traits/alloc", "cc-traits/std"]
smallvec = ["dep:smallvec", "cc-traits/smallvec"]
default = ["std"]
[dev-dependencies]
criterion = { version = "0.4.0", features = ["html_reports"] }
medianheap = "0.3.0"
criterion = { version = "0.5.1", features = ["html_reports"] }
medianheap = "0.4.1"
rand = "0.8.5"
smallvec = "^1.6"
[[bench]]
name = "comparison"
harness = false
[package.metadata.docs.rs]
features = ["std"]

View file

@ -7,17 +7,17 @@ Simple, space-efficient algorithm to compute the median of an accumulation of el
* **Space-efficient**: `O(D)` space, D being the number of _different_ samples, not the _total_ number of samples
* **Time-efficient**: push is `O(log(N))`
* **Generic**: `T: Clone + Ord`
* **Tested**
* **No unsafe**, no deps except `std`
* **No unsafe**
* **no_std** (optional): supports generic collections
Faster than other implementations if there are samples having the same value. If this is not your case, you should use another implementation.
Faster than other implementations if lots of samples have the same value. If this is not the case for you, you should use another implementation.
## Use
```rust
use median_accumulator::*;
let mut acc = MedianAcc::new();
let mut acc = vec::MedianAcc::new();
assert_eq!(acc.get_median(), None);
acc.push(7);
@ -30,9 +30,21 @@ assert_eq!(acc.get_median(), Some(MedianResult::One(7)));
If you ever encounter an `unreachable` panic, please file an issue or send me an e-mail.
## no_std
Example with [smallvec](https://crates.io/crates/smallvec): (`smallvec` feature required)
```rust
use median_accumulator::*;
let mut acc = MedianAcc::<i32, smallvec::SmallVec<[(i32, u32); 64]>>::new();
```
For collections other than `Vec` or `SmallVec`, you must implement the required [cc-traits](https://crates.io/crates/cc-traits) traits and `InsertIndex` for your collection type.
## License
CopyLeft 2022 Pascal Engélibert
CopyLeft 2022-2024 Pascal Engélibert [(why copyleft?)](https://txmn.tk/blog/why-copyleft/)
This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, version 3 of the License.

View file

@ -1,54 +1,34 @@
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use rand::Rng;
static ITERS: u32 = 10_000;
fn compare_crates(c: &mut Criterion) {
let mut rng = rand::thread_rng();
let mut group = c.benchmark_group("Comparison");
for len in [10, 50, 100, 500, 1000] {
let samples: Vec<u32> = (0..len).map(|_| rng.gen_range(0..len / 5)).collect();
for redundancy in [1, 5, 10, 20, 40] {
let samples: Vec<u32> = (0..ITERS)
.map(|_| rng.gen_range(0..ITERS / redundancy))
.collect();
group.bench_with_input(
BenchmarkId::new("median_accumulator 1:5", len),
BenchmarkId::new("median_accumulator", redundancy),
&samples,
|b, _i| {
b.iter(|| {
let mut median = median_accumulator::MedianAcc::new();
let mut median = median_accumulator::vec::MedianAcc::new();
samples.iter().for_each(|s| median.push(*s));
median.get_median()
black_box(median.get_median());
})
},
);
group.bench_with_input(
BenchmarkId::new("medianheap 1:5", len),
BenchmarkId::new("medianheap", redundancy),
&samples,
|b, _i| {
b.iter(|| {
let mut median = medianheap::MedianHeap::new();
samples.iter().for_each(|s| median.push(*s));
median.median()
})
},
);
let samples: Vec<u32> = (0..len).map(|_| rng.gen_range(0..len)).collect();
group.bench_with_input(
BenchmarkId::new("median_accumulator 1:1", len),
&samples,
|b, _i| {
b.iter(|| {
let mut median = median_accumulator::MedianAcc::new();
samples.iter().for_each(|s| median.push(*s));
median.get_median()
})
},
);
group.bench_with_input(
BenchmarkId::new("medianheap 1:1", len),
&samples,
|b, _i| {
b.iter(|| {
let mut median = medianheap::MedianHeap::new();
samples.iter().for_each(|s| median.push(*s));
median.median()
black_box(median.median());
})
},
);

View file

@ -1,7 +1,7 @@
//! ```rust
//! use median_accumulator::*;
//!
//! let mut acc = MedianAcc::new();
//! let mut acc = vec::MedianAcc::new();
//!
//! assert_eq!(acc.get_median(), None);
//! acc.push(7);
@ -14,14 +14,29 @@
//!
//! In doc comments, _N_ represents the number of samples, _D_ represents the number of different values taken by the samples.
use std::cmp::Ordering;
#![cfg_attr(not(feature = "std"), no_std)]
mod traits;
pub use traits::*;
use core::{cmp::Ordering, ops::DerefMut};
/// Accumulator for computing median
#[derive(Clone, Debug, Default)]
pub struct MedianAcc<T: Clone + Ord> {
samples: Vec<(T, u32)>,
pub struct MedianAcc<
T: Clone + Ord,
V: DerefMut<Target = [(T, u32)]> + cc_traits::VecMut<(T, u32)> + InsertIndex,
> {
samples: V,
median_index: Option<usize>,
median_subindex: u32,
_t: core::marker::PhantomData<T>,
}
/// Convenience aliases backed by the standard `Vec`.
///
/// Only compiled when the `std` feature is enabled.
#[cfg(feature = "std")]
pub mod vec {
/// [`MedianAcc`](crate::MedianAcc) storing its `(sample, count)` pairs in a `Vec<(T, u32)>`.
pub type MedianAcc<T> = crate::MedianAcc<T, Vec<(T, u32)>>;
}
/// Computed median
@ -34,17 +49,41 @@ pub enum MedianResult<T: Clone + Ord> {
Two(T, T),
}
impl<T: Clone + Ord> MedianAcc<T> {
impl<
T: Clone + Ord,
V: DerefMut<Target = [(T, u32)]> + cc_traits::VecMut<(T, u32)> + InsertIndex,
> MedianAcc<T, V>
{
/// Create an empty accumulator
///
/// _O(1)_
///
/// Does not allocate until the first push.
pub fn new() -> Self {
/// If using `std::vec::Vec`, does not allocate until the first push.
pub fn new() -> Self
where
V: Default,
{
Self {
samples: Vec::new(),
samples: Default::default(),
median_index: None,
median_subindex: 0,
_t: Default::default(),
}
}
/// Create an empty accumulator from an existing (empty) collection
///
/// _O(1)_
///
/// Useful when using fixed-length collections or to avoid allocations.
///
/// # Panics
/// Panics with the message "the collection must be empty" if `collection`
/// already contains elements.
pub fn new_from(collection: V) -> Self {
// The median bookkeeping below is initialised for an empty accumulator,
// so a pre-filled collection is rejected up front.
assert!(collection.is_empty(), "the collection must be empty");
Self {
samples: collection,
median_index: None,
median_subindex: 0,
_t: Default::default(),
}
}
@ -85,7 +124,7 @@ impl<T: Clone + Ord> MedianAcc<T> {
}
}
Err(sample_index) => {
self.samples.insert(sample_index, (sample, 1));
self.samples.insert_index(sample_index, (sample, 1));
if *median_index >= sample_index {
if self.median_subindex == 0 {
self.median_subindex =
@ -105,7 +144,7 @@ impl<T: Clone + Ord> MedianAcc<T> {
}
}
} else {
self.samples.push((sample, 1));
self.samples.push_back((sample, 1));
self.median_index = Some(0);
}
}
@ -148,19 +187,22 @@ impl<T: Clone + Ord> MedianAcc<T> {
}
/// Clear the data
pub fn clear(&mut self) {
pub fn clear(&mut self)
where
V: cc_traits::Clear,
{
self.samples.clear();
self.median_index = None;
self.median_subindex = 0;
}
/// Access the underlying vec
/// Access the underlying collection
///
/// Just in case you need finer allocation management.
///
/// # Safety
/// Leaving the collection in an invalid state may cause invalid results or panics (but no UB).
pub unsafe fn get_samples_mut(&mut self) -> &mut Vec<(T, u32)> {
pub unsafe fn get_samples_mut(&mut self) -> &mut V {
&mut self.samples
}
}
@ -171,6 +213,7 @@ mod tests {
use rand::Rng;
#[cfg(feature = "std")]
fn naive_median<T: Clone + Ord>(samples: &mut [T]) -> Option<MedianResult<T>> {
if samples.is_empty() {
None
@ -190,6 +233,7 @@ mod tests {
}
}
#[cfg(feature = "std")]
#[test]
fn correctness() {
let mut rng = rand::thread_rng();
@ -198,7 +242,25 @@ mod tests {
let len: usize = rng.gen_range(0..100);
let mut samples: Vec<i32> = (0..len).map(|_| rng.gen_range(-100..100)).collect();
let mut median = MedianAcc::new();
let mut median = vec::MedianAcc::new();
for sample in samples.iter() {
median.push(*sample);
}
assert_eq!(median.get_median(), naive_median(&mut samples));
}
}
#[cfg(feature = "smallvec")]
#[test]
fn correctness_smallvec() {
let mut rng = rand::thread_rng();
for _ in 0..100_000 {
let len: usize = rng.gen_range(0..64);
let mut samples: Vec<i32> = (0..len).map(|_| rng.gen_range(-100..100)).collect();
let mut median = MedianAcc::<i32, smallvec::SmallVec<[(i32, u32); 64]>>::new();
for sample in samples.iter() {
median.push(*sample);
}

28
src/traits.rs Normal file
View file

@ -0,0 +1,28 @@
/// Collection where an item can be inserted at a given index.
pub trait InsertIndex: cc_traits::Collection {
type Output;
fn insert_index(
&mut self,
index: usize,
element: <Self as cc_traits::Collection>::Item,
) -> Self::Output;
}
/// Positional insertion for the standard `Vec` (requires the `std` feature).
#[cfg(feature = "std")]
impl<T> InsertIndex for Vec<T> {
    type Output = ();

    /// Forwards to the inherent [`Vec::insert`].
    fn insert_index(&mut self, index: usize, element: T) -> Self::Output {
        Vec::insert(self, index, element);
    }
}
/// Positional insertion for `smallvec::SmallVec` (requires the `smallvec` feature).
#[cfg(feature = "smallvec")]
impl<T, A> InsertIndex for smallvec::SmallVec<A>
where
    A: smallvec::Array<Item = T>,
{
    type Output = ();

    /// Forwards to the inherent `SmallVec::insert`.
    fn insert_index(&mut self, index: usize, element: T) -> Self::Output {
        smallvec::SmallVec::insert(self, index, element);
    }
}