From 3056efc1a23224d2bc6cfd34ddd6a813d293f896 Mon Sep 17 00:00:00 2001 From: numzero Date: Sun, 2 Nov 2025 15:08:43 +0300 Subject: [PATCH 1/8] add public API --- src/conv.rs | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/conv.rs b/src/conv.rs index 8472df9..4bd0660 100644 --- a/src/conv.rs +++ b/src/conv.rs @@ -73,6 +73,33 @@ impl F8 { } Self::merge(m as u8 & M_STORAGE_MAX, e as u8) } + + pub fn from_bits(bits: u8) -> Self { + Self(bits) + } + + pub fn to_bits(self) -> u8 { + self.0 + } + + /// Calculate `base * 2.pow(scale)`, preserving as much precision as possible. + pub fn ldexp(base: u32, scale: i32) -> Self { + Self::merge_unbias(base, scale) + } + + /// Split `self` into `(base, scale)` such that `self = base * 2.pow(scale)`. + /// + /// ``` + /// # use f8::F8; + /// # let val = F8::from_bits(42); + /// let (base, scale) = val.frexp(); + /// let val2 = F8::ldexp(base, scale); + /// assert_eq!(val, val2); + /// ``` + pub fn frexp(self) -> (u32, i32) { + let (base, scale) = self.split_unbias(); + (base.into(), scale.into()) + } } #[cfg(test)] -- 2.47.3 From ea84e653ac5cfafd0329d1a013e05613009ff274 Mon Sep 17 00:00:00 2001 From: numzero Date: Mon, 16 Feb 2026 15:55:30 +0300 Subject: [PATCH 2/8] move formatting into a mod --- src/fmt.rs | 41 +++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 41 +---------------------------------------- 2 files changed, 42 insertions(+), 40 deletions(-) create mode 100644 src/fmt.rs diff --git a/src/fmt.rs b/src/fmt.rs new file mode 100644 index 0000000..292b438 --- /dev/null +++ b/src/fmt.rs @@ -0,0 +1,41 @@ +use crate::{E_BIAS, F8}; + +impl std::fmt::Binary for F8 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if f.alternate() { + f.write_str("0b")?; + } + if self.0 == 0 { + f.write_str("0.00000p0")?; + return Ok(()); + } + let (m, e) = self.split(); + write!(f, "1.{m:05b}p{e}", e = e as i8 - E_BIAS as i8)?; + Ok(()) + } +} + +impl std::fmt::Debug for F8 { + fn 
fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{self:#b}f8") + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_display() { + fn fmt_split(m: u8, e: u8) -> String { + let v = F8::merge(m, e); + format!("{v:b}") + } + assert_eq!("0.00000p0", fmt_split(0, 0)); + assert_eq!("1.00000p0", fmt_split(0, E_BIAS)); + assert_eq!("1.00000p1", fmt_split(0, E_BIAS + 1)); + assert_eq!("1.00000p-1", fmt_split(0, E_BIAS - 1)); + assert_eq!("1.00001p0", fmt_split(1, E_BIAS)); + assert_eq!("1.11111p0", fmt_split(0b11111, E_BIAS)); + } +} diff --git a/src/lib.rs b/src/lib.rs index fb3d5b6..823639c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ mod conv; +mod fmt; mod ops; pub const M_BITS: u8 = 5; @@ -23,43 +24,3 @@ const E_MASK: u8 = E_STORAGE_MAX << M_BITS; #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] #[repr(transparent)] pub struct F8(u8); - -impl std::fmt::Binary for F8 { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if f.alternate() { - f.write_str("0b")?; - } - if self.0 == 0 { - f.write_str("0.00000p0")?; - return Ok(()); - } - let (m, e) = self.split(); - write!(f, "1.{m:05b}p{e}", e = e as i8 - E_BIAS as i8)?; - Ok(()) - } -} - -impl std::fmt::Debug for F8 { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{self:#b}f8") - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_display() { - fn fmt_split(m: u8, e: u8) -> String { - let v = F8::merge(m, e); - format!("{v:b}") - } - assert_eq!("0.00000p0", fmt_split(0, 0)); - assert_eq!("1.00000p0", fmt_split(0, E_BIAS)); - assert_eq!("1.00000p1", fmt_split(0, E_BIAS + 1)); - assert_eq!("1.00000p-1", fmt_split(0, E_BIAS - 1)); - assert_eq!("1.00001p0", fmt_split(1, E_BIAS)); - assert_eq!("1.11111p0", fmt_split(0b11111, E_BIAS)); - } -} -- 2.47.3 From b48b9bef92f44fa7a6e0f0f0800a46866f008bbb Mon Sep 17 00:00:00 2001 From: numzero Date: Tue, 17 Feb 2026 00:49:15 +0300 Subject: 
[PATCH 3/8] sprinkle const around --- src/conv.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/conv.rs b/src/conv.rs index 4bd0660..2dcad3a 100644 --- a/src/conv.rs +++ b/src/conv.rs @@ -41,23 +41,23 @@ impl From for f32 { impl F8 { /// Split self into the mantissa and exponent, as stored. - pub(crate) fn split(self) -> (u8, u8) { + pub(crate) const fn split(self) -> (u8, u8) { (self.0 & M_MASK, self.0 >> M_BITS) } /// Split self into integers (m, e) such that `self == m * 2.pow(e)`. - pub(crate) fn split_unbias(self) -> (u8, i8) { + pub(crate) const fn split_unbias(self) -> (u8, i8) { let (m, e) = self.split(); (m | M_BIAS, e as i8 - (E_BIAS + M_BITS) as i8) } - pub(crate) fn merge(m: u8, e: u8) -> Self { + pub(crate) const fn merge(m: u8, e: u8) -> Self { assert!(m <= M_STORAGE_MAX); assert!(e <= E_STORAGE_MAX); Self((e << M_BITS) | m) } - pub(crate) fn merge_unbias(in_m: u32, in_e: i32) -> Self { + pub(crate) const fn merge_unbias(in_m: u32, in_e: i32) -> Self { if in_m == 0 { return Self(0); } @@ -74,11 +74,11 @@ impl F8 { Self::merge(m as u8 & M_STORAGE_MAX, e as u8) } - pub fn from_bits(bits: u8) -> Self { + pub const fn from_bits(bits: u8) -> Self { Self(bits) } - pub fn to_bits(self) -> u8 { + pub const fn to_bits(self) -> u8 { self.0 } -- 2.47.3 From 3c3e6ca897fdbe88a547c7817a61e9500c90f088 Mon Sep 17 00:00:00 2001 From: numzero Date: Tue, 17 Feb 2026 01:43:23 +0300 Subject: [PATCH 4/8] make E_MAX public --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 823639c..6c9d645 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,10 +5,10 @@ mod ops; pub const M_BITS: u8 = 5; pub const E_BITS: u8 = 3; const E_CAP: u8 = 4; +pub const E_MAX: u8 = E_CAP - 1; static_assertions::const_assert_eq!(M_BITS + E_BITS, 8); -const E_MAX: u8 = E_CAP - 1; pub const EXACT_INT_MAX: u8 = if E_MAX > M_BITS { 2 << M_BITS } else { -- 2.47.3 From 22b399034f928c08e56765c6bdfb450ce4720a77 Mon Sep 
17 00:00:00 2001 From: numzero Date: Tue, 17 Feb 2026 01:43:37 +0300 Subject: [PATCH 5/8] add docs --- src/conv.rs | 8 ++++++++ src/lib.rs | 11 +++++++++++ 2 files changed, 19 insertions(+) diff --git a/src/conv.rs b/src/conv.rs index 2dcad3a..1cf1f0e 100644 --- a/src/conv.rs +++ b/src/conv.rs @@ -74,10 +74,12 @@ impl F8 { Self::merge(m as u8 & M_STORAGE_MAX, e as u8) } + /// Reinterpret `bits` as an [`F8`]. Equivalent to [`std::mem::transmute`] but safe. pub const fn from_bits(bits: u8) -> Self { Self(bits) } + /// Reinterpret `self` as a [`u8`]. Equivalent to [`std::mem::transmute`] but safe. pub const fn to_bits(self) -> u8 { self.0 } @@ -89,6 +91,12 @@ impl F8 { /// Split `self` into `(base, scale)` such that `self = base * 2.pow(scale)`. /// + /// Guarantees: + /// * `ldexp(val.frexp()) == val`. + /// * `base` will have at most [M_BITS]+1 low-order bits set. + /// * `scale.abs()` will have at most [E_BITS]+1 low-order bits set. + /// + /// # Example /// ``` /// # use f8::F8; /// # let val = F8::from_bits(42); diff --git a/src/lib.rs b/src/lib.rs index 6c9d645..c082250 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,14 +1,24 @@ +//! Software 8-bit floating-point math library. Not for production use. + mod conv; mod fmt; mod ops; +/// Mantissa width of [`F8`]. pub const M_BITS: u8 = 5; + +/// Exponent width of [`F8`]. pub const E_BITS: u8 = 3; + +/// The value such that `2.pow(E_CAP)` is just over the [`F8`] limit. const E_CAP: u8 = 4; + +/// Largest exponent value of [`F8`]. pub const E_MAX: u8 = E_CAP - 1; static_assertions::const_assert_eq!(M_BITS + E_BITS, 8); +/// The largest integer up to and including which all integers are representable exactly.
pub const EXACT_INT_MAX: u8 = if E_MAX > M_BITS { 2 << M_BITS } else { @@ -21,6 +31,7 @@ const E_BIAS: u8 = E_STORAGE_MAX - E_MAX; const M_MASK: u8 = M_STORAGE_MAX; const E_MASK: u8 = E_STORAGE_MAX << M_BITS; +/// 8-bit unsigned binary floating-point type, with [`M_BITS`] mantissa bits and [`E_BITS`] exponent bits. #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] #[repr(transparent)] pub struct F8(u8); -- 2.47.3 From f0468502d8892e4c990b69041e4f5344d5bad751 Mon Sep 17 00:00:00 2001 From: numzero Date: Tue, 17 Feb 2026 03:57:51 +0300 Subject: [PATCH 6/8] add readme --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..6c856f1 --- /dev/null +++ b/README.md @@ -0,0 +1,9 @@ +# F8: 8-bit floats + +> ⚠️ Not for production use! + +F8 is a toy software floating-point math library. +It provides an 8-bit floating-point type `F8`, with 5 mantissa bits, 3 exponent bits, and no sign bit. +The format used resembles [IEEE 754] binary formats but is stripped down to the bare necessities: the only special value supported is zero. + +[IEEE 754]: https://en.wikipedia.org/wiki/IEEE_754 -- 2.47.3 From 40ba11f33c658adf539bdec24ed81a17af6c9ffd Mon Sep 17 00:00:00 2001 From: numzero Date: Tue, 17 Feb 2026 03:58:22 +0300 Subject: [PATCH 7/8] add basic constants --- src/lib.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index c082250..ba65b21 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -32,6 +32,11 @@ const M_MASK: u8 = M_STORAGE_MAX; const E_MASK: u8 = E_STORAGE_MAX << M_BITS; /// 8-bit unsigned binary floating-point type, with [`M_BITS`] mantissa bits and [`E_BITS`] exponent bits.
-#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Default)] #[repr(transparent)] pub struct F8(u8); + +impl F8 { + pub const ZERO: Self = Self(0); + pub const ONE: Self = Self::merge(0, E_BIAS); +} -- 2.47.3 From 3b126e7fa6c61c9e56d9f445081dfcb02530ecb1 Mon Sep 17 00:00:00 2001 From: numzero Date: Tue, 17 Feb 2026 05:26:36 +0300 Subject: [PATCH 8/8] document type properties --- src/lib.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index ba65b21..7b04c13 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -31,7 +31,17 @@ const E_BIAS: u8 = E_STORAGE_MAX - E_MAX; const M_MASK: u8 = M_STORAGE_MAX; const E_MASK: u8 = E_STORAGE_MAX << M_BITS; -/// 8-bit unsigned binary floating-point type, with [`M_BITS`] mantissa bits and [`E_BITS`] exponent bits. +/// 8-bit unsigned binary floating-point type. +/// +/// # Properties +/// +/// * Mantissa width: 5 bits ([`M_BITS`]) +/// * Exponent width: 3 bits ([`E_BITS`]) +/// * Negative values: not supported +/// * Zero: special-cased +/// * Subnormals: not supported +/// * Infinity: not supported +/// * NaN: not supported #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Default)] #[repr(transparent)] pub struct F8(u8); -- 2.47.3