Compare commits

...

4 Commits

Author SHA1 Message Date
5bc55414a3 fixup! add readme 2026-02-17 05:32:35 +03:00
493ef7e293 document type properties 2026-02-17 05:26:36 +03:00
f84c434a1c add basic constants 2026-02-17 03:58:22 +03:00
fe44566139 add readme 2026-02-17 03:57:51 +03:00
2 changed files with 26 additions and 2 deletions

9
README.md Normal file
View File

@ -0,0 +1,9 @@
# F8: 8-bit floats
> :warning: Not for production use!
F8 is a toy software floating-point math library.
It provides an 8-bit floating-point type `F8`, with 5 mantissa bits, 3 exponent bits, and no sign bit.
The format used resembles the [IEEE 754] binary formats but is stripped down to the bare necessities: the only special value supported is zero.
[IEEE 754]: https://en.wikipedia.org/wiki/IEEE_754

View File

@ -31,7 +31,22 @@ const E_BIAS: u8 = E_STORAGE_MAX - E_MAX;
const M_MASK: u8 = M_STORAGE_MAX; const M_MASK: u8 = M_STORAGE_MAX;
const E_MASK: u8 = E_STORAGE_MAX << M_BITS; const E_MASK: u8 = E_STORAGE_MAX << M_BITS;
/// 8-bit unsigned binary floating-point type, with [`M_BITS`] mantissa bits and [`E_BITS`] exponent bits. /// 8-bit unsigned binary floating-point type.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] ///
/// # Properties
///
/// * Mantissa width: 5 bits ([`M_BITS`])
/// * Exponent width: 3 bits ([`E_BITS`])
/// * Negative values: not supported
/// * Zero: special-cased
/// * Subnormals: not supported
/// * Infinity: not supported
/// * NaN: not supported
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Default)]
#[repr(transparent)] #[repr(transparent)]
pub struct F8(u8); pub struct F8(u8);
/// Named constant values of [`F8`].
impl F8 {
/// The value zero, stored as the all-zero byte.
pub const ZERO: Self = Self(0);
/// The value one, built by `merge(0, E_BIAS)`.
///
/// NOTE(review): `merge` is defined outside this view; presumably it packs a
/// mantissa field of `0` with a stored exponent of [`E_BIAS`] (i.e. an
/// unbiased exponent of zero) — confirm the argument order against `merge`.
pub const ONE: Self = Self::merge(0, E_BIAS);
}