| // This file is part of ICU4X. For terms of use, please see the file |
| // called LICENSE at the top level of the ICU4X source tree |
| // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
| |
| #![allow(clippy::upper_case_acronyms)] |
| //! ULE implementation for the `char` type. |
| |
| use super::*; |
| use crate::impl_ule_from_array; |
| use core::cmp::Ordering; |
| use core::convert::TryFrom; |
| |
/// A three-byte, little-endian encoding of a Unicode scalar value.
///
/// The bytes of a `CharULE` are guaranteed to hold the three low-order bytes of a
/// little-endian-encoded u32 that is a valid `char`, so the value can be converted back to a
/// `char` without validation.
///
/// # Examples
///
/// Convert a `char` to a `CharULE` and back again:
///
/// ```
/// use zerovec::ule::{AsULE, CharULE, ULE};
///
/// let c1 = '𑄃';
/// let ule = c1.to_unaligned();
/// assert_eq!(CharULE::as_byte_slice(&[ule]), &[0x03, 0x11, 0x01]);
/// let c2 = char::from_unaligned(ule);
/// assert_eq!(c1, c2);
/// ```
///
/// Attempt to parse invalid bytes to a `CharULE`:
///
/// ```
/// use zerovec::ule::{CharULE, ULE};
///
/// // Correct length (a multiple of 3), but 0xFFFFFF is not a valid `char`.
/// let bytes: &[u8] = &[0xFF, 0xFF, 0xFF];
/// CharULE::parse_byte_slice(bytes).expect_err("Invalid bytes");
///
/// // A length that is not a multiple of 3 is rejected as well.
/// let bytes: &[u8] = &[0xFF, 0xFF, 0xFF, 0xFF];
/// CharULE::parse_byte_slice(bytes).expect_err("Invalid length");
/// ```
#[repr(transparent)]
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
pub struct CharULE([u8; 3]);
| |
| impl CharULE { |
| /// Converts a [`char`] to a [`CharULE`]. This is equivalent to calling |
| /// [`AsULE::to_unaligned()`] |
| /// |
| /// See the type-level documentation for [`CharULE`] for more information. |
| #[inline] |
| pub const fn from_aligned(c: char) -> Self { |
| let [u0, u1, u2, _u3] = (c as u32).to_le_bytes(); |
| Self([u0, u1, u2]) |
| } |
| |
| impl_ule_from_array!(char, CharULE, Self([0; 3])); |
| } |
| |
| // Safety (based on the safety checklist on the ULE trait): |
| // 1. CharULE does not include any uninitialized or padding bytes. |
| // (achieved by `#[repr(transparent)]` on a type that satisfies this invariant) |
| // 2. CharULE is aligned to 1 byte. |
| // (achieved by `#[repr(transparent)]` on a type that satisfies this invariant) |
| // 3. The impl of validate_byte_slice() returns an error if any byte is not valid. |
| // 4. The impl of validate_byte_slice() returns an error if there are extra bytes. |
| // 5. The other ULE methods use the default impl. |
| // 6. CharULE byte equality is semantic equality |
| unsafe impl ULE for CharULE { |
| #[inline] |
| fn validate_byte_slice(bytes: &[u8]) -> Result<(), ZeroVecError> { |
| if bytes.len() % 3 != 0 { |
| return Err(ZeroVecError::length::<Self>(bytes.len())); |
| } |
| // Validate the bytes |
| for chunk in bytes.chunks_exact(3) { |
| // TODO: Use slice::as_chunks() when stabilized |
| #[allow(clippy::indexing_slicing)] |
| // Won't panic because the chunks are always 3 bytes long |
| let u = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]); |
| char::try_from(u).map_err(|_| ZeroVecError::parse::<Self>())?; |
| } |
| Ok(()) |
| } |
| } |
| |
| impl AsULE for char { |
| type ULE = CharULE; |
| |
| #[inline] |
| fn to_unaligned(self) -> Self::ULE { |
| CharULE::from_aligned(self) |
| } |
| |
| #[inline] |
| fn from_unaligned(unaligned: Self::ULE) -> Self { |
| // Safe because the bytes of CharULE are defined to represent a valid Unicode scalar value. |
| unsafe { |
| Self::from_u32_unchecked(u32::from_le_bytes([ |
| unaligned.0[0], |
| unaligned.0[1], |
| unaligned.0[2], |
| 0, |
| ])) |
| } |
| } |
| } |
| |
| impl PartialOrd for CharULE { |
| fn partial_cmp(&self, other: &Self) -> Option<Ordering> { |
| char::from_unaligned(*self).partial_cmp(&char::from_unaligned(*other)) |
| } |
| } |
| |
| impl Ord for CharULE { |
| fn cmp(&self, other: &Self) -> Ordering { |
| char::from_unaligned(*self).cmp(&char::from_unaligned(*other)) |
| } |
| } |
| |
#[cfg(test)]
mod test {
    use super::*;

    /// `from_array` works in const context and produces 3 bytes per `char`.
    #[test]
    fn test_from_array() {
        const CHARS: [char; 2] = ['a', '🙃'];
        const CHARS_ULE: [CharULE; 2] = CharULE::from_array(CHARS);
        assert_eq!(
            CharULE::as_byte_slice(&CHARS_ULE),
            &[0x61, 0x00, 0x00, 0x43, 0xF6, 0x01]
        );
    }

    /// `from_array` accepts an empty array and yields no bytes.
    #[test]
    fn test_from_array_zst() {
        const CHARS: [char; 0] = [];
        const CHARS_ULE: [CharULE; 0] = CharULE::from_array(CHARS);
        let bytes = CharULE::as_byte_slice(&CHARS_ULE);
        let empty: &[u8] = &[];
        assert_eq!(bytes, empty);
    }

    /// Round-trips chars through bytes and checks the bytes against golden data.
    #[test]
    fn test_parse() {
        // 1-byte, 2-byte, 3-byte, and two 4-byte character in UTF-8 (not as relevant in UTF-32)
        let chars = ['w', 'ω', '文', '𑄃', '🙃'];
        let char_ules: Vec<CharULE> = chars.iter().copied().map(char::to_unaligned).collect();
        let char_bytes: &[u8] = CharULE::as_byte_slice(&char_ules);

        // Check parsing
        let parsed_ules: &[CharULE] = CharULE::parse_byte_slice(char_bytes).unwrap();
        assert_eq!(char_ules, parsed_ules);
        let parsed_chars: Vec<char> = parsed_ules
            .iter()
            .copied()
            .map(char::from_unaligned)
            .collect();
        assert_eq!(&chars, parsed_chars.as_slice());

        // Compare to golden expected data
        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            char_bytes
        );
    }

    /// Each invalid input is encoded in the 3-byte layout so that it is rejected for the
    /// stated reason rather than for an accidental length or alignment mismatch.
    #[test]
    fn test_failures() {
        // 119 and 120 are valid, but not 0xD800 (high surrogate)
        let surrogate_bytes: &[u8] = &[119, 0, 0, 0x00, 0xD8, 0x00, 120, 0, 0];
        assert!(CharULE::parse_byte_slice(surrogate_bytes).is_err());
        // The two neighbours parse fine on their own, so the surrogate is what gets rejected
        let neighbour_bytes: &[u8] = &[119, 0, 0, 120, 0, 0];
        assert!(CharULE::parse_byte_slice(neighbour_bytes).is_ok());

        // 0x20FFFF is out of range for a char
        let out_of_range_bytes: &[u8] = &[0xFF, 0xFF, 0x20];
        assert!(CharULE::parse_byte_slice(out_of_range_bytes).is_err());

        // A length that is not a multiple of 3 is rejected even if the leading bytes are valid
        let bad_length_bytes: &[u8] = &[119, 0, 0, 0];
        assert!(CharULE::parse_byte_slice(bad_length_bytes).is_err());
    }
}