This commit is contained in:
numzero 2026-02-17 01:43:37 +03:00
parent 3c3e6ca897
commit 22b399034f
2 changed files with 19 additions and 0 deletions

View File

@ -74,10 +74,12 @@ impl F8 {
Self::merge(m as u8 & M_STORAGE_MAX, e as u8) Self::merge(m as u8 & M_STORAGE_MAX, e as u8)
} }
/// Reinterpret `bits` as an [`F8`]. Equivalent to [`std::mem::transmute`] but safe.
pub const fn from_bits(bits: u8) -> Self { pub const fn from_bits(bits: u8) -> Self {
Self(bits) Self(bits)
} }
/// Reinterpret `self` as a [`u8`]. Equivalent to [`std::mem::transmute`] but safe.
pub const fn to_bits(self) -> u8 { pub const fn to_bits(self) -> u8 {
self.0 self.0
} }
@ -89,6 +91,12 @@ impl F8 {
/// Split `self` into `(base, scale)` such that `self = base * 2.pow(scale)`. /// Split `self` into `(base, scale)` such that `self = base * 2.pow(scale)`.
/// ///
/// Guarantees:
/// * `ldexp(val.frexp()) == val`.
/// * Only the [`M_BITS`]+1 low-order bits of `base` may be set.
/// * Only the [`E_BITS`]+1 low-order bits of `scale.abs()` may be set.
///
/// # Example
/// ``` /// ```
/// # use f8::F8; /// # use f8::F8;
/// # let val = F8::from_bits(42); /// # let val = F8::from_bits(42);

View File

@ -1,14 +1,24 @@
//! Software 8-bit floating-point math library. Not for production use.
mod conv; mod conv;
mod fmt; mod fmt;
mod ops; mod ops;
/// Mantissa width of [`F8`].
pub const M_BITS: u8 = 5; pub const M_BITS: u8 = 5;
/// Exponent width of [`F8`].
pub const E_BITS: u8 = 3; pub const E_BITS: u8 = 3;
/// The value such that `2.pow(E_CAP)` is just over the [`F8`] limit.
const E_CAP: u8 = 4; const E_CAP: u8 = 4;
/// Largest exponent value of [`F8`].
pub const E_MAX: u8 = E_CAP - 1; pub const E_MAX: u8 = E_CAP - 1;
static_assertions::const_assert_eq!(M_BITS + E_BITS, 8); static_assertions::const_assert_eq!(M_BITS + E_BITS, 8);
/// The largest integer `n` for which every integer up to and including `n` is representable exactly.
pub const EXACT_INT_MAX: u8 = if E_MAX > M_BITS { pub const EXACT_INT_MAX: u8 = if E_MAX > M_BITS {
2 << M_BITS 2 << M_BITS
} else { } else {
@ -21,6 +31,7 @@ const E_BIAS: u8 = E_STORAGE_MAX - E_MAX;
const M_MASK: u8 = M_STORAGE_MAX; const M_MASK: u8 = M_STORAGE_MAX;
const E_MASK: u8 = E_STORAGE_MAX << M_BITS; const E_MASK: u8 = E_STORAGE_MAX << M_BITS;
/// 8-bit unsigned binary floating-point type, with [`M_BITS`] mantissa bits and [`E_BITS`] exponent bits.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
#[repr(transparent)] #[repr(transparent)]
pub struct F8(u8); pub struct F8(u8);