blob: f3e2f17e24c4e04dc590c6b264f1281c84b63e30 [file] [log] [blame]
use std::cmp;
/// Length-limited Huffman Codes
///
use std::io;
use bit;
const MAX_BITWIDTH: u8 = 15;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Code {
pub width: u8,
pub bits: u16,
}
impl Code {
pub fn new(width: u8, bits: u16) -> Self {
debug_assert!(width <= MAX_BITWIDTH);
Code { width, bits }
}
fn inverse_endian(&self) -> Self {
let mut f = self.bits;
let mut t = 0;
for _ in 0..self.width {
t <<= 1;
t |= f & 1;
f >>= 1;
}
Code::new(self.width, t)
}
}
pub trait Builder: Sized {
type Instance;
fn set_mapping(&mut self, symbol: u16, code: Code) -> io::Result<()>;
fn finish(self) -> Self::Instance;
fn restore_canonical_huffman_codes(mut self, bitwidthes: &[u8]) -> io::Result<Self::Instance> {
debug_assert!(!bitwidthes.is_empty());
let mut symbols = bitwidthes
.iter()
.enumerate()
.filter(|&(_, &code_bitwidth)| code_bitwidth > 0)
.map(|(symbol, &code_bitwidth)| (symbol as u16, code_bitwidth))
.collect::<Vec<_>>();
symbols.sort_by_key(|x| x.1);
let mut code = 0;
let mut prev_width = 0;
for (symbol, bitwidth) in symbols {
code <<= bitwidth - prev_width;
self.set_mapping(symbol, Code::new(bitwidth, code))?;
code += 1;
prev_width = bitwidth;
}
Ok(self.finish())
}
}
pub struct DecoderBuilder {
table: Vec<u16>,
eob_symbol: Option<u16>,
eob_bitwidth: u8,
max_bitwidth: u8,
}
impl DecoderBuilder {
pub fn new(max_bitwidth: u8, eob_symbol: Option<u16>) -> Self {
debug_assert!(max_bitwidth <= MAX_BITWIDTH);
DecoderBuilder {
table: vec![u16::from(MAX_BITWIDTH) + 1; 1 << max_bitwidth],
eob_symbol,
eob_bitwidth: max_bitwidth,
max_bitwidth,
}
}
pub fn from_bitwidthes(bitwidthes: &[u8], eob_symbol: Option<u16>) -> io::Result<Decoder> {
let builder = Self::new(bitwidthes.iter().cloned().max().unwrap_or(0), eob_symbol);
builder.restore_canonical_huffman_codes(bitwidthes)
}
}
impl Builder for DecoderBuilder {
type Instance = Decoder;
fn set_mapping(&mut self, symbol: u16, code: Code) -> io::Result<()> {
debug_assert!(code.width <= self.max_bitwidth);
if Some(symbol) == self.eob_symbol {
self.eob_bitwidth = code.width;
}
// `bitwidth` encoded `to` value
let value = (symbol << 5) | u16::from(code.width);
// Sets the mapping to all possible indices
let code_be = code.inverse_endian();
for padding in 0..(1 << (self.max_bitwidth - code.width)) {
let i = ((padding << code.width) | code_be.bits) as usize;
if self.table[i] != u16::from(MAX_BITWIDTH) + 1 {
let message = format!(
"Bit region conflict: i={}, old_value={}, new_value={}, symbol={}, code={:?}",
i, self.table[i], value, symbol, code
);
return Err(io::Error::new(io::ErrorKind::InvalidData, message));
}
self.table[i] = value;
}
Ok(())
}
fn finish(self) -> Self::Instance {
Decoder {
table: self.table,
eob_bitwidth: self.eob_bitwidth,
max_bitwidth: self.max_bitwidth,
}
}
}
#[derive(Debug)]
pub struct Decoder {
table: Vec<u16>,
eob_bitwidth: u8,
max_bitwidth: u8,
}
impl Decoder {
#[inline(always)]
pub fn decode<R>(&self, reader: &mut bit::BitReader<R>) -> io::Result<u16>
where
R: io::Read,
{
let v = self.decode_unchecked(reader);
reader.check_last_error()?;
Ok(v)
}
#[inline(always)]
pub fn decode_unchecked<R>(&self, reader: &mut bit::BitReader<R>) -> u16
where
R: io::Read,
{
let code = reader.peek_bits_unchecked(self.eob_bitwidth);
let mut value = self.table[code as usize];
let mut bitwidth = (value & 0b1_1111) as u8;
if bitwidth > self.eob_bitwidth {
let code = reader.peek_bits_unchecked(self.max_bitwidth);
value = self.table[code as usize];
bitwidth = (value & 0b1_1111) as u8;
if bitwidth > self.max_bitwidth {
reader.set_last_error(invalid_data_error!("Invalid huffman coded stream"));
}
}
reader.skip_bits(bitwidth as u8);
value >> 5
}
}
#[derive(Debug)]
pub struct EncoderBuilder {
table: Vec<Code>,
}
impl EncoderBuilder {
pub fn new(symbol_count: usize) -> Self {
EncoderBuilder {
table: vec![Code::new(0, 0); symbol_count],
}
}
pub fn from_bitwidthes(bitwidthes: &[u8]) -> io::Result<Encoder> {
let symbol_count = bitwidthes
.iter()
.enumerate()
.filter(|e| *e.1 > 0)
.last()
.map_or(0, |e| e.0)
+ 1;
let builder = Self::new(symbol_count);
builder.restore_canonical_huffman_codes(bitwidthes)
}
pub fn from_frequencies(symbol_frequencies: &[usize], max_bitwidth: u8) -> io::Result<Encoder> {
let max_bitwidth = cmp::min(
max_bitwidth,
ordinary_huffman_codes::calc_optimal_max_bitwidth(symbol_frequencies),
);
let code_bitwidthes = length_limited_huffman_codes::calc(max_bitwidth, symbol_frequencies);
Self::from_bitwidthes(&code_bitwidthes)
}
}
impl Builder for EncoderBuilder {
type Instance = Encoder;
fn set_mapping(&mut self, symbol: u16, code: Code) -> io::Result<()> {
debug_assert_eq!(self.table[symbol as usize], Code::new(0, 0));
self.table[symbol as usize] = code.inverse_endian();
Ok(())
}
fn finish(self) -> Self::Instance {
Encoder { table: self.table }
}
}
#[derive(Debug, Clone)]
pub struct Encoder {
table: Vec<Code>,
}
impl Encoder {
#[inline(always)]
pub fn encode<W>(&self, writer: &mut bit::BitWriter<W>, symbol: u16) -> io::Result<()>
where
W: io::Write,
{
let code = self.lookup(symbol);
debug_assert_ne!(code, Code::new(0, 0));
writer.write_bits(code.width, code.bits)
}
#[inline(always)]
pub fn lookup(&self, symbol: u16) -> Code {
debug_assert!(
symbol < self.table.len() as u16,
"symbol:{}, table:{}",
symbol,
self.table.len()
);
self.table[symbol as usize].clone()
}
pub fn used_max_symbol(&self) -> Option<u16> {
self.table
.iter()
.rev()
.position(|x| x.width > 0)
.map(|trailing_zeros| (self.table.len() - 1 - trailing_zeros) as u16)
}
}
#[allow(dead_code)]
mod ordinary_huffman_codes {
use std::cmp;
use std::collections::BinaryHeap;
pub fn calc_optimal_max_bitwidth(frequencies: &[usize]) -> u8 {
let mut heap = BinaryHeap::new();
for &freq in frequencies.iter().filter(|&&f| f > 0) {
let weight = -(freq as isize);
heap.push((weight, 0 as u8));
}
while heap.len() > 1 {
let (weight1, width1) = heap.pop().unwrap();
let (weight2, width2) = heap.pop().unwrap();
heap.push((weight1 + weight2, 1 + cmp::max(width1, width2)));
}
let max_bitwidth = heap.pop().map_or(0, |x| x.1);
cmp::max(1, max_bitwidth)
}
}
mod length_limited_huffman_codes {
use std::mem;
#[derive(Debug, Clone)]
struct Node {
symbols: Vec<u16>,
weight: usize,
}
impl Node {
pub fn empty() -> Self {
Node {
symbols: vec![],
weight: 0,
}
}
pub fn single(symbol: u16, weight: usize) -> Self {
Node {
symbols: vec![symbol],
weight,
}
}
pub fn merge(&mut self, other: Self) {
self.weight += other.weight;
self.symbols.extend(other.symbols);
}
}
/// Reference: [A Fast Algorithm for Optimal Length-Limited Huffman Codes][LenLimHuff.pdf]
///
/// [LenLimHuff.pdf]: https://www.ics.uci.edu/~dan/pubs/LenLimHuff.pdf
pub fn calc(max_bitwidth: u8, frequencies: &[usize]) -> Vec<u8> {
// NOTE: unoptimized implementation
let mut source = frequencies
.iter()
.enumerate()
.filter(|&(_, &f)| f > 0)
.map(|(symbol, &weight)| Node::single(symbol as u16, weight))
.collect::<Vec<_>>();
source.sort_by_key(|o| o.weight);
let weighted =
(0..max_bitwidth - 1).fold(source.clone(), |w, _| merge(package(w), source.clone()));
let mut code_bitwidthes = vec![0; frequencies.len()];
for symbol in package(weighted)
.into_iter()
.flat_map(|n| n.symbols.into_iter())
{
code_bitwidthes[symbol as usize] += 1;
}
code_bitwidthes
}
fn merge(x: Vec<Node>, y: Vec<Node>) -> Vec<Node> {
let mut z = Vec::with_capacity(x.len() + y.len());
let mut x = x.into_iter().peekable();
let mut y = y.into_iter().peekable();
loop {
let x_weight = x.peek().map(|s| s.weight);
let y_weight = y.peek().map(|s| s.weight);
if x_weight.is_none() {
z.extend(y);
break;
} else if y_weight.is_none() {
z.extend(x);
break;
} else if x_weight < y_weight {
z.push(x.next().unwrap());
} else {
z.push(y.next().unwrap());
}
}
z
}
fn package(mut nodes: Vec<Node>) -> Vec<Node> {
if nodes.len() >= 2 {
let new_len = nodes.len() / 2;
for i in 0..new_len {
nodes[i] = mem::replace(&mut nodes[i * 2], Node::empty());
let other = mem::replace(&mut nodes[i * 2 + 1], Node::empty());
nodes[i].merge(other);
}
nodes.truncate(new_len);
}
nodes
}
}
#[cfg(test)]
mod test {
#[test]
fn it_works() {}
}