| // Copyright Mozilla Foundation. See the COPYRIGHT |
| // file at the top-level directory of this distribution. |
| // |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| |
| //! Functions for converting between different in-RAM representations of text |
| //! and for quickly checking if the Unicode Bidirectional Algorithm can be |
| //! avoided. |
| //! |
| //! By using slices for output, the functions here seek to enable by-register |
| //! (ALU register or SIMD register as available) operations in order to |
| //! outperform iterator-based conversions available in the Rust standard |
| //! library. |
| //! |
| //! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to |
| //! U+00FF, inclusive, and does not refer to the windows-1252 range. This |
| //! in-memory encoding is sometimes used as a storage optimization of text |
| //! when UTF-16 indexing and length semantics are exposed. |
| //! |
| //! The FFI bindings for this module are in the |
| //! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem). |
| |
| #[cfg(feature = "alloc")] |
| use alloc::borrow::Cow; |
| #[cfg(feature = "alloc")] |
| use alloc::string::String; |
| #[cfg(feature = "alloc")] |
| use alloc::vec::Vec; |
| |
| use super::in_inclusive_range16; |
| use super::in_inclusive_range32; |
| use super::in_inclusive_range8; |
| use super::in_range16; |
| use super::in_range32; |
| use super::DecoderResult; |
| use crate::ascii::*; |
| use crate::utf_8::*; |
| |
// Forwards its arguments to `debug_assert!`, except when the crate is built
// with `--cfg fuzzing`, in which case the assertion is not evaluated.
macro_rules! non_fuzz_debug_assert {
    ($($tokens:tt)*) => {
        if !cfg!(fuzzing) {
            debug_assert!($($tokens)*);
        }
    };
}
| |
| cfg_if! { |
| if #[cfg(feature = "simd-accel")] { |
| use ::core::intrinsics::likely; |
| use ::core::intrinsics::unlikely; |
| } else { |
| #[inline(always)] |
| fn likely(b: bool) -> bool { |
| b |
| } |
| #[inline(always)] |
| fn unlikely(b: bool) -> bool { |
| b |
| } |
| } |
| } |
| |
/// Result of classifying a piece of text along two axes at once: whether it
/// fits in Latin1 (every code point below U+0100) and, if not, whether it
/// contains any right-to-left characters.
///
/// The discriminant values are spelled out and the layout is `repr(C)`.
#[must_use]
#[derive(Debug, Eq, PartialEq)]
#[repr(C)]
pub enum Latin1Bidi {
    /// All characters are below U+0100.
    Latin1 = 0,
    /// At least one character is U+0100 or above, and none of the characters
    /// is right-to-left.
    LeftToRight = 1,
    /// At least one character is right-to-left.
    Bidi = 2,
}
| |
// Mask selecting the high byte of every 16-bit lane packed into a `usize`.
// `!0usize / 0xFFFF` is a word with 0x0001 in each 16-bit lane (for both
// 32-bit and 64-bit `usize`), so scaling it by 0xFF00 yields 0xFF00 per lane.
#[allow(dead_code)]
const LATIN1_MASK: usize = (!0usize / 0xFFFF) * 0xFF00;
| |
// Generates `fn $name(buffer: &[$unit]) -> bool`, which returns `true` when
// no code unit of `buffer` has any of the bits in `$mask` set (`$mask` is a
// `usize`-wide pattern covering every lane; `$bound` is the matching per-unit
// limit). The bulk of the buffer is examined with aligned `usize` reads.
//
// The generated function relies on `ALU_ALIGNMENT` and `ALU_ALIGNMENT_MASK`
// being in scope at the expansion site; presumably `ALU_ALIGNMENT` equals
// `size_of::<usize>()` — confirm against its definition.
#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            let len = buffer.len();
            let unit_size = ::core::mem::size_of::<$unit>();
            // How many code units one `usize`-wide read covers.
            let units_per_word = ALU_ALIGNMENT / unit_size;
            let mut pos = 0usize;
            let mut seen_bits = 0usize;
            if len >= units_per_word {
                // A failing first unit is the most frequent reason for a
                // `false` result, so test it before anything else.
                if buffer[0] >= $bound {
                    return false;
                }
                let base = buffer.as_ptr();
                let misalignment = (base as usize) & ALU_ALIGNMENT_MASK;
                // Units to step over before `base.add(pos)` is word-aligned.
                let head_len =
                    ((ALU_ALIGNMENT - misalignment) & ALU_ALIGNMENT_MASK) / unit_size;
                if head_len + units_per_word <= len {
                    if head_len != 0 {
                        for &unit in &buffer[..head_len] {
                            seen_bits |= unit as usize;
                        }
                        pos = head_len;
                        if seen_bits >= $bound {
                            return false;
                        }
                    }
                    let last_word_start = len - units_per_word;
                    if pos + (4 * units_per_word) <= len {
                        let last_unroll_start = len - (4 * units_per_word);
                        loop {
                            // SAFETY: `pos + 4 * units_per_word <= len` holds on
                            // entry to every iteration (guard above on the first,
                            // exit test below afterwards), so the four reads stay
                            // inside `buffer`; `pos` is at a word-aligned address.
                            let word0 = unsafe { *(base.add(pos) as *const usize) };
                            let word1 =
                                unsafe { *(base.add(pos + units_per_word) as *const usize) };
                            let word2 = unsafe {
                                *(base.add(pos + (2 * units_per_word)) as *const usize)
                            };
                            let word3 = unsafe {
                                *(base.add(pos + (3 * units_per_word)) as *const usize)
                            };
                            if (word0 | word1 | word2 | word3) & $mask != 0 {
                                return false;
                            }
                            pos += 4 * units_per_word;
                            if pos > last_unroll_start {
                                break;
                            }
                        }
                    }
                    while pos <= last_word_start {
                        // SAFETY: `pos <= len - units_per_word`, so one more
                        // word-sized read fits in `buffer`.
                        seen_bits |= unsafe { *(base.add(pos) as *const usize) };
                        pos += units_per_word;
                    }
                }
            }
            // Whatever is left (or the whole buffer when it was too short for
            // word reads) is folded in unit by unit.
            for &unit in &buffer[pos..] {
                seen_bits |= unit as usize;
            }
            seen_bits & $mask == 0
        }
    };
}
| |
// Generates `fn $name(buffer: &[$unit]) -> bool`, which returns `true` when
// every code unit of `buffer` is below `$bound`. The bulk of the buffer is
// examined with aligned loads of `$simd_ty`; `$func` is called on the OR of
// the loaded vectors and must return `true` when that vector passes the
// check. `$splat` is the all-zero starting value for the vector accumulator.
//
// The generated function relies on `SIMD_STRIDE_SIZE`, `SIMD_ALIGNMENT` and
// `SIMD_ALIGNMENT_MASK` being in scope at the expansion site.
#[allow(unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            let len = buffer.len();
            let unit_size = ::core::mem::size_of::<$unit>();
            // How many code units one vector load covers.
            let units_per_stride = SIMD_STRIDE_SIZE / unit_size;
            let mut pos = 0usize;
            let mut seen_bits = 0usize;
            if len >= units_per_stride {
                // A failing first unit is the most frequent reason for a
                // `false` result, so test it before anything else.
                if buffer[0] >= $bound {
                    return false;
                }
                let base = buffer.as_ptr();
                let misalignment = (base as usize) & SIMD_ALIGNMENT_MASK;
                // Units to step over before `base.add(pos)` is vector-aligned.
                let head_len =
                    ((SIMD_ALIGNMENT - misalignment) & SIMD_ALIGNMENT_MASK) / unit_size;
                if head_len + units_per_stride <= len {
                    if head_len != 0 {
                        for &unit in &buffer[..head_len] {
                            seen_bits |= unit as usize;
                        }
                        pos = head_len;
                        if seen_bits >= $bound {
                            return false;
                        }
                    }
                    let last_stride_start = len - units_per_stride;
                    if pos + (4 * units_per_stride) <= len {
                        let last_unroll_start = len - (4 * units_per_stride);
                        loop {
                            // SAFETY: `pos + 4 * units_per_stride <= len` holds on
                            // entry to every iteration (guard above on the first,
                            // exit test below afterwards), so the four loads stay
                            // inside `buffer`; `pos` is at an aligned address.
                            let vec0 = unsafe { *(base.add(pos) as *const $simd_ty) };
                            let vec1 = unsafe {
                                *(base.add(pos + units_per_stride) as *const $simd_ty)
                            };
                            let vec2 = unsafe {
                                *(base.add(pos + (2 * units_per_stride)) as *const $simd_ty)
                            };
                            let vec3 = unsafe {
                                *(base.add(pos + (3 * units_per_stride)) as *const $simd_ty)
                            };
                            if !$func(vec0 | vec1 | vec2 | vec3) {
                                return false;
                            }
                            pos += 4 * units_per_stride;
                            if pos > last_unroll_start {
                                break;
                            }
                        }
                    }
                    let mut simd_seen = $splat;
                    while pos <= last_stride_start {
                        // SAFETY: `pos <= len - units_per_stride`, so one more
                        // vector-sized load fits in `buffer`.
                        simd_seen = simd_seen | unsafe { *(base.add(pos) as *const $simd_ty) };
                        pos += units_per_stride;
                    }
                    if !$func(simd_seen) {
                        return false;
                    }
                }
            }
            // Whatever is left (or the whole buffer when it was too short for
            // vector loads) is folded in unit by unit.
            for &unit in &buffer[pos..] {
                seen_bits |= unit as usize;
            }
            seen_bits < $bound
        }
    };
}
| |
| cfg_if! { |
| if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] { |
| use crate::simd_funcs::*; |
| use packed_simd::u8x16; |
| use packed_simd::u16x8; |
| |
| const SIMD_ALIGNMENT: usize = 16; |
| |
| const SIMD_ALIGNMENT_MASK: usize = 15; |
| |
| by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii); |
| by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin); |
| by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1); |
| |
| #[inline(always)] |
| fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize { |
| // This function is a mess, because it simultaneously tries to do |
| // only aligned SIMD (perhaps misguidedly) and needs to deal with |
| // the last code unit in a SIMD stride being part of a valid |
| // surrogate pair. |
| let unit_size = ::core::mem::size_of::<u16>(); |
| let src = buffer.as_ptr(); |
| let len = buffer.len(); |
| let mut offset = 0usize; |
| 'outer: loop { |
| let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) & |
| SIMD_ALIGNMENT_MASK) / unit_size; |
| if until_alignment == 0 { |
| if offset + SIMD_STRIDE_SIZE / unit_size > len { |
| break; |
| } |
| } else { |
| let offset_plus_until_alignment = offset + until_alignment; |
| let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1; |
| if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len { |
| break; |
| } |
| let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]); |
| if up_to < until_alignment { |
| return offset + up_to; |
| } |
| if last_valid_low { |
| offset = offset_plus_until_alignment_plus_one; |
| continue; |
| } |
| offset = offset_plus_until_alignment; |
| } |
| let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size; |
| loop { |
| let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size; |
| if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) { |
| if offset_plus_stride == len { |
| break 'outer; |
| } |
| let offset_plus_stride_plus_one = offset_plus_stride + 1; |
| let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]); |
| if up_to < SIMD_STRIDE_SIZE / unit_size { |
| return offset + up_to; |
| } |
| if last_valid_low { |
| offset = offset_plus_stride_plus_one; |
| continue 'outer; |
| } |
| } |
| offset = offset_plus_stride; |
| if offset > len_minus_stride { |
| break 'outer; |
| } |
| } |
| } |
| let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]); |
| offset + up_to |
| } |
| } else { |
| by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK); |
| by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK); |
| by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK); |
| |
| #[inline(always)] |
| fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize { |
| let (up_to, _) = utf16_valid_up_to_alu(buffer); |
| up_to |
| } |
| } |
| } |
| |
/// Scans `buffer` for the first unpaired surrogate.
///
/// The first element of the result is the index of that surrogate, or
/// `buffer.len()` when there is none. The second element is `true` iff the
/// scan reached the end of the slice and the final code unit was a low
/// surrogate completing a valid pair.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let len = buffer.len();
    let mut pos = 0usize;
    while pos < len {
        match buffer[pos] {
            0xD800..=0xDBFF => {
                // High surrogate: acceptable only when a low surrogate
                // follows immediately.
                let low_pos = pos + 1;
                if low_pos == len {
                    return (pos, false);
                }
                match buffer[low_pos] {
                    0xDC00..=0xDFFF => {
                        pos = low_pos + 1;
                        if pos == len {
                            return (pos, true);
                        }
                    }
                    // Followed by something else: the high surrogate is
                    // unpaired and the position does not advance.
                    _ => return (pos, false),
                }
            }
            // Low surrogate with no preceding high surrogate.
            0xDC00..=0xDFFF => return (pos, false),
            // Not a surrogate.
            _ => pos += 1,
        }
    }
    (pos, false)
}
| |
| cfg_if! { |
| if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] { |
| #[inline(always)] |
| fn is_str_latin1_impl(buffer: &str) -> Option<usize> { |
| let mut offset = 0usize; |
| let bytes = buffer.as_bytes(); |
| let len = bytes.len(); |
| if len >= SIMD_STRIDE_SIZE { |
| let src = bytes.as_ptr(); |
| let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & |
| SIMD_ALIGNMENT_MASK; |
| if until_alignment + SIMD_STRIDE_SIZE <= len { |
| while until_alignment != 0 { |
| if bytes[offset] > 0xC3 { |
| return Some(offset); |
| } |
| offset += 1; |
| until_alignment -= 1; |
| } |
| let len_minus_stride = len - SIMD_STRIDE_SIZE; |
| loop { |
| if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) { |
| // TODO: Ensure this compiles away when inlined into `is_str_latin1()`. |
| while bytes[offset] & 0xC0 == 0x80 { |
| offset += 1; |
| } |
| return Some(offset); |
| } |
| offset += SIMD_STRIDE_SIZE; |
| if offset > len_minus_stride { |
| break; |
| } |
| } |
| } |
| } |
| for i in offset..len { |
| if bytes[i] > 0xC3 { |
| return Some(i); |
| } |
| } |
| None |
| } |
| } else { |
| #[inline(always)] |
| fn is_str_latin1_impl(buffer: &str) -> Option<usize> { |
| let mut bytes = buffer.as_bytes(); |
| let mut total = 0; |
| loop { |
| if let Some((byte, offset)) = validate_ascii(bytes) { |
| total += offset; |
| if byte > 0xC3 { |
| return Some(total); |
| } |
| bytes = &bytes[offset + 2..]; |
| total += 2; |
| } else { |
| return None; |
| } |
| } |
| } |
| } |
| } |
| |
| #[inline(always)] |
| fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> { |
| let mut bytes = buffer; |
| let mut total = 0; |
| loop { |
| if let Some((byte, offset)) = validate_ascii(bytes) { |
| total += offset; |
| if in_inclusive_range8(byte, 0xC2, 0xC3) { |
| let next = offset + 1; |
| if next == bytes.len() { |
| return Some(total); |
| } |
| if bytes[next] & 0xC0 != 0x80 { |
| return Some(total); |
| } |
| bytes = &bytes[offset + 2..]; |
| total += 2; |
| } else { |
| return Some(total); |
| } |
| } else { |
| return None; |
| } |
| } |
| } |
| |
| cfg_if! { |
| if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] { |
| #[inline(always)] |
| fn is_utf16_bidi_impl(buffer: &[u16]) -> bool { |
| let mut offset = 0usize; |
| let len = buffer.len(); |
| if len >= SIMD_STRIDE_SIZE / 2 { |
| let src = buffer.as_ptr(); |
| let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & |
| SIMD_ALIGNMENT_MASK) / 2; |
| if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len { |
| while until_alignment != 0 { |
| if is_utf16_code_unit_bidi(buffer[offset]) { |
| return true; |
| } |
| offset += 1; |
| until_alignment -= 1; |
| } |
| let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2); |
| loop { |
| if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) { |
| return true; |
| } |
| offset += SIMD_STRIDE_SIZE / 2; |
| if offset > len_minus_stride { |
| break; |
| } |
| } |
| } |
| } |
| for &u in &buffer[offset..] { |
| if is_utf16_code_unit_bidi(u) { |
| return true; |
| } |
| } |
| false |
| } |
| } else { |
| #[inline(always)] |
| fn is_utf16_bidi_impl(buffer: &[u16]) -> bool { |
| for &u in buffer { |
| if is_utf16_code_unit_bidi(u) { |
| return true; |
| } |
| } |
| false |
| } |
| } |
| } |
| |
| cfg_if! { |
| if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] { |
| #[inline(always)] |
| fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi { |
| let mut offset = 0usize; |
| let len = buffer.len(); |
| if len >= SIMD_STRIDE_SIZE / 2 { |
| let src = buffer.as_ptr(); |
| let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & |
| SIMD_ALIGNMENT_MASK) / 2; |
| if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len { |
| while until_alignment != 0 { |
| if buffer[offset] > 0xFF { |
| // This transition isn't optimal, since the aligment is recomputing |
| // but not tweaking further today. |
| if is_utf16_bidi_impl(&buffer[offset..]) { |
| return Latin1Bidi::Bidi; |
| } |
| return Latin1Bidi::LeftToRight; |
| } |
| offset += 1; |
| until_alignment -= 1; |
| } |
| let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2); |
| loop { |
| let mut s = unsafe { *(src.add(offset) as *const u16x8) }; |
| if !simd_is_latin1(s) { |
| loop { |
| if is_u16x8_bidi(s) { |
| return Latin1Bidi::Bidi; |
| } |
| offset += SIMD_STRIDE_SIZE / 2; |
| if offset > len_minus_stride { |
| for &u in &buffer[offset..] { |
| if is_utf16_code_unit_bidi(u) { |
| return Latin1Bidi::Bidi; |
| } |
| } |
| return Latin1Bidi::LeftToRight; |
| } |
| s = unsafe { *(src.add(offset) as *const u16x8) }; |
| } |
| } |
| offset += SIMD_STRIDE_SIZE / 2; |
| if offset > len_minus_stride { |
| break; |
| } |
| } |
| } |
| } |
| let mut iter = (&buffer[offset..]).iter(); |
| loop { |
| if let Some(&u) = iter.next() { |
| if u > 0xFF { |
| let mut inner_u = u; |
| loop { |
| if is_utf16_code_unit_bidi(inner_u) { |
| return Latin1Bidi::Bidi; |
| } |
| if let Some(&code_unit) = iter.next() { |
| inner_u = code_unit; |
| } else { |
| return Latin1Bidi::LeftToRight; |
| } |
| } |
| } |
| } else { |
| return Latin1Bidi::Latin1; |
| } |
| } |
| } |
| } else { |
| #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))] |
| #[inline(always)] |
| fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi { |
| let mut offset = 0usize; |
| let len = buffer.len(); |
| if len >= ALU_ALIGNMENT / 2 { |
| let src = buffer.as_ptr(); |
| let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) & |
| ALU_ALIGNMENT_MASK) / 2; |
| if until_alignment + ALU_ALIGNMENT / 2 <= len { |
| while until_alignment != 0 { |
| if buffer[offset] > 0xFF { |
| if is_utf16_bidi_impl(&buffer[offset..]) { |
| return Latin1Bidi::Bidi; |
| } |
| return Latin1Bidi::LeftToRight; |
| } |
| offset += 1; |
| until_alignment -= 1; |
| } |
| let len_minus_stride = len - ALU_ALIGNMENT / 2; |
| loop { |
| if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 { |
| if is_utf16_bidi_impl(&buffer[offset..]) { |
| return Latin1Bidi::Bidi; |
| } |
| return Latin1Bidi::LeftToRight; |
| } |
| offset += ALU_ALIGNMENT / 2; |
| if offset > len_minus_stride { |
| break; |
| } |
| } |
| } |
| } |
| let mut iter = (&buffer[offset..]).iter(); |
| loop { |
| if let Some(&u) = iter.next() { |
| if u > 0xFF { |
| let mut inner_u = u; |
| loop { |
| if is_utf16_code_unit_bidi(inner_u) { |
| return Latin1Bidi::Bidi; |
| } |
| if let Some(&code_unit) = iter.next() { |
| inner_u = code_unit; |
| } else { |
| return Latin1Bidi::LeftToRight; |
| } |
| } |
| } |
| } else { |
| return Latin1Bidi::Latin1; |
| } |
| } |
| } |
| } |
| } |
| |
| /// Checks whether the buffer is all-ASCII. |
| /// |
| /// May read the entire buffer even if it isn't all-ASCII. (I.e. the function |
| /// is not guaranteed to fail fast.) |
| pub fn is_ascii(buffer: &[u8]) -> bool { |
| is_ascii_impl(buffer) |
| } |
| |
| /// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing |
| /// only ASCII characters). |
| /// |
| /// May read the entire buffer even if it isn't all-ASCII. (I.e. the function |
| /// is not guaranteed to fail fast.) |
| pub fn is_basic_latin(buffer: &[u16]) -> bool { |
| is_basic_latin_impl(buffer) |
| } |
| |
| /// Checks whether the buffer is valid UTF-8 representing only code points |
| /// less than or equal to U+00FF. |
| /// |
| /// Fails fast. (I.e. returns before having read the whole buffer if UTF-8 |
| /// invalidity or code points above U+00FF are discovered. |
| pub fn is_utf8_latin1(buffer: &[u8]) -> bool { |
| is_utf8_latin1_impl(buffer).is_none() |
| } |
| |
| /// Checks whether the buffer represents only code points less than or equal |
| /// to U+00FF. |
| /// |
| /// Fails fast. (I.e. returns before having read the whole buffer if code |
| /// points above U+00FF are discovered. |
| pub fn is_str_latin1(buffer: &str) -> bool { |
| is_str_latin1_impl(buffer).is_none() |
| } |
| |
| /// Checks whether the buffer represents only code point less than or equal |
| /// to U+00FF. |
| /// |
| /// May read the entire buffer even if it isn't all-Latin1. (I.e. the function |
| /// is not guaranteed to fail fast.) |
| pub fn is_utf16_latin1(buffer: &[u16]) -> bool { |
| is_utf16_latin1_impl(buffer) |
| } |
| |
| /// Checks whether a potentially-invalid UTF-8 buffer contains code points |
| /// that trigger right-to-left processing. |
| /// |
| /// The check is done on a Unicode block basis without regard to assigned |
| /// vs. unassigned code points in the block. Hebrew presentation forms in |
| /// the Alphabetic Presentation Forms block are treated as if they formed |
| /// a block on their own (i.e. it treated as right-to-left). Additionally, |
| /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked |
| /// for. Control characters that are technically bidi controls but do not |
| /// cause right-to-left behavior without the presence of right-to-left |
| /// characters or right-to-left controls are not checked for. As a special |
| /// case, U+FEFF is excluded from Arabic Presentation Forms-B. |
| /// |
| /// Returns `true` if the input is invalid UTF-8 or the input contains an |
| /// RTL character. Returns `false` if the input is valid UTF-8 and contains |
| /// no RTL characters. |
| #[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))] |
| #[inline] |
| pub fn is_utf8_bidi(buffer: &[u8]) -> bool { |
| // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster |
| // than UTF-8 validation followed by `is_str_bidi()` for German, |
| // Russian and Japanese. However, this is considerably slower for Thai. |
| // Chances are that the compiler makes some branch predictions that are |
| // unfortunate for Thai. Not spending the time to manually optimize |
| // further at this time, since it's unclear if this variant even has |
| // use cases. However, this is worth revisiting once Rust gets the |
| // ability to annotate relative priorities of match arms. |
| |
| // U+058F: D6 8F |
| // U+0590: D6 90 |
| // U+08FF: E0 A3 BF |
| // U+0900: E0 A4 80 |
| // |
| // U+200F: E2 80 8F |
| // U+202B: E2 80 AB |
| // U+202E: E2 80 AE |
| // U+2067: E2 81 A7 |
| // |
| // U+FB1C: EF AC 9C |
| // U+FB1D: EF AC 9D |
| // U+FDFF: EF B7 BF |
| // U+FE00: EF B8 80 |
| // |
| // U+FE6F: EF B9 AF |
| // U+FE70: EF B9 B0 |
| // U+FEFE: EF BB BE |
| // U+FEFF: EF BB BF |
| // |
| // U+107FF: F0 90 9F BF |
| // U+10800: F0 90 A0 80 |
| // U+10FFF: F0 90 BF BF |
| // U+11000: F0 91 80 80 |
| // |
| // U+1E7FF: F0 9E 9F BF |
| // U+1E800: F0 9E A0 80 |
| // U+1EFFF: F0 9E BF BF |
| // U+1F000: F0 9F 80 80 |
| let mut src = buffer; |
| 'outer: loop { |
| if let Some((mut byte, mut read)) = validate_ascii(src) { |
| // Check for the longest sequence to avoid checking twice for the |
| // multi-byte sequences. |
| if read + 4 <= src.len() { |
| 'inner: loop { |
| // At this point, `byte` is not included in `read`. |
| match byte { |
| 0..=0x7F => { |
| // ASCII: go back to SIMD. |
| read += 1; |
| src = &src[read..]; |
| continue 'outer; |
| } |
| 0xC2..=0xD5 => { |
| // Two-byte |
| let second = unsafe { *(src.get_unchecked(read + 1)) }; |
| if !in_inclusive_range8(second, 0x80, 0xBF) { |
| return true; |
| } |
| read += 2; |
| } |
| 0xD6 => { |
| // Two-byte |
| let second = unsafe { *(src.get_unchecked(read + 1)) }; |
| if !in_inclusive_range8(second, 0x80, 0xBF) { |
| return true; |
| } |
| // XXX consider folding the above and below checks |
| if second > 0x8F { |
| return true; |
| } |
| read += 2; |
| } |
| // two-byte starting with 0xD7 and above is bidi |
| 0xE1 | 0xE3..=0xEC | 0xEE => { |
| // Three-byte normal |
| let second = unsafe { *(src.get_unchecked(read + 1)) }; |
| let third = unsafe { *(src.get_unchecked(read + 2)) }; |
| if ((UTF8_DATA.table[usize::from(second)] |
| & unsafe { |
| *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) |
| }) |
| | (third >> 6)) |
| != 2 |
| { |
| return true; |
| } |
| read += 3; |
| } |
| 0xE2 => { |
| // Three-byte normal, potentially bidi |
| let second = unsafe { *(src.get_unchecked(read + 1)) }; |
| let third = unsafe { *(src.get_unchecked(read + 2)) }; |
| if ((UTF8_DATA.table[usize::from(second)] |
| & unsafe { |
| *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) |
| }) |
| | (third >> 6)) |
| != 2 |
| { |
| return true; |
| } |
| if second == 0x80 { |
| if third == 0x8F || third == 0xAB || third == 0xAE { |
| return true; |
| } |
| } else if second == 0x81 { |
| if third == 0xA7 { |
| return true; |
| } |
| } |
| read += 3; |
| } |
| 0xEF => { |
| // Three-byte normal, potentially bidi |
| let second = unsafe { *(src.get_unchecked(read + 1)) }; |
| let third = unsafe { *(src.get_unchecked(read + 2)) }; |
| if ((UTF8_DATA.table[usize::from(second)] |
| & unsafe { |
| *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) |
| }) |
| | (third >> 6)) |
| != 2 |
| { |
| return true; |
| } |
| if in_inclusive_range8(second, 0xAC, 0xB7) { |
| if second == 0xAC { |
| if third > 0x9C { |
| return true; |
| } |
| } else { |
| return true; |
| } |
| } else if in_inclusive_range8(second, 0xB9, 0xBB) { |
| if second == 0xB9 { |
| if third > 0xAF { |
| return true; |
| } |
| } else if second == 0xBB { |
| if third != 0xBF { |
| return true; |
| } |
| } else { |
| return true; |
| } |
| } |
| read += 3; |
| } |
| 0xE0 => { |
| // Three-byte special lower bound, potentially bidi |
| let second = unsafe { *(src.get_unchecked(read + 1)) }; |
| let third = unsafe { *(src.get_unchecked(read + 2)) }; |
| if ((UTF8_DATA.table[usize::from(second)] |
| & unsafe { |
| *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) |
| }) |
| | (third >> 6)) |
| != 2 |
| { |
| return true; |
| } |
| // XXX can this be folded into the above validity check |
| if second < 0xA4 { |
| return true; |
| } |
| read += 3; |
| } |
| 0xED => { |
| // Three-byte special upper bound |
| let second = unsafe { *(src.get_unchecked(read + 1)) }; |
| let third = unsafe { *(src.get_unchecked(read + 2)) }; |
| if ((UTF8_DATA.table[usize::from(second)] |
| & unsafe { |
| *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) |
| }) |
| | (third >> 6)) |
| != 2 |
| { |
| return true; |
| } |
| read += 3; |
| } |
| 0xF1..=0xF4 => { |
| // Four-byte normal |
| let second = unsafe { *(src.get_unchecked(read + 1)) }; |
| let third = unsafe { *(src.get_unchecked(read + 2)) }; |
| let fourth = unsafe { *(src.get_unchecked(read + 3)) }; |
| if (u16::from( |
| UTF8_DATA.table[usize::from(second)] |
| & unsafe { |
| *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) |
| }, |
| ) | u16::from(third >> 6) |
| | (u16::from(fourth & 0xC0) << 2)) |
| != 0x202 |
| { |
| return true; |
| } |
| read += 4; |
| } |
| 0xF0 => { |
| // Four-byte special lower bound, potentially bidi |
| let second = unsafe { *(src.get_unchecked(read + 1)) }; |
| let third = unsafe { *(src.get_unchecked(read + 2)) }; |
| let fourth = unsafe { *(src.get_unchecked(read + 3)) }; |
| if (u16::from( |
| UTF8_DATA.table[usize::from(second)] |
| & unsafe { |
| *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) |
| }, |
| ) | u16::from(third >> 6) |
| | (u16::from(fourth & 0xC0) << 2)) |
| != 0x202 |
| { |
| return true; |
| } |
| if unlikely(second == 0x90 || second == 0x9E) { |
| let third = src[read + 2]; |
| if third >= 0xA0 { |
| return true; |
| } |
| } |
| read += 4; |
| } |
| _ => { |
| // Invalid lead or bidi-only lead |
| return true; |
| } |
| } |
| if read + 4 > src.len() { |
| if read == src.len() { |
| return false; |
| } |
| byte = src[read]; |
| break 'inner; |
| } |
| byte = src[read]; |
| continue 'inner; |
| } |
| } |
| // We can't have a complete 4-byte sequence, but we could still have |
| // a complete shorter sequence. |
| |
| // At this point, `byte` is not included in `read`. |
| match byte { |
| 0..=0x7F => { |
| // ASCII: go back to SIMD. |
| read += 1; |
| src = &src[read..]; |
| continue 'outer; |
| } |
| 0xC2..=0xD5 => { |
| // Two-byte |
| let new_read = read + 2; |
| if new_read > src.len() { |
| return true; |
| } |
| let second = unsafe { *(src.get_unchecked(read + 1)) }; |
| if !in_inclusive_range8(second, 0x80, 0xBF) { |
| return true; |
| } |
| read = new_read; |
| // We need to deal with the case where we came here with 3 bytes |
| // left, so we need to take a look at the last one. |
| src = &src[read..]; |
| continue 'outer; |
| } |
| 0xD6 => { |
| // Two-byte, potentially bidi |
| let new_read = read + 2; |
| if new_read > src.len() { |
| return true; |
| } |
| let second = unsafe { *(src.get_unchecked(read + 1)) }; |
| if !in_inclusive_range8(second, 0x80, 0xBF) { |
| return true; |
| } |
| // XXX consider folding the above and below checks |
| if second > 0x8F { |
| return true; |
| } |
| read = new_read; |
| // We need to deal with the case where we came here with 3 bytes |
| // left, so we need to take a look at the last one. |
| src = &src[read..]; |
| continue 'outer; |
| } |
| // two-byte starting with 0xD7 and above is bidi |
| 0xE1 | 0xE3..=0xEC | 0xEE => { |
| // Three-byte normal |
| let new_read = read + 3; |
| if new_read > src.len() { |
| return true; |
| } |
| let second = unsafe { *(src.get_unchecked(read + 1)) }; |
| let third = unsafe { *(src.get_unchecked(read + 2)) }; |
| if ((UTF8_DATA.table[usize::from(second)] |
| & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) |
| | (third >> 6)) |
| != 2 |
| { |
| return true; |
| } |
| } |
| 0xE2 => { |
| // Three-byte normal, potentially bidi |
| let new_read = read + 3; |
| if new_read > src.len() { |
| return true; |
| } |
| let second = unsafe { *(src.get_unchecked(read + 1)) }; |
| let third = unsafe { *(src.get_unchecked(read + 2)) }; |
| if ((UTF8_DATA.table[usize::from(second)] |
| & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) |
| | (third >> 6)) |
| != 2 |
| { |
| return true; |
| } |
| if second == 0x80 { |
| if third == 0x8F || third == 0xAB || third == 0xAE { |
| return true; |
| } |
| } else if second == 0x81 { |
| if third == 0xA7 { |
| return true; |
| } |
| } |
| } |
| 0xEF => { |
| // Three-byte normal, potentially bidi |
| let new_read = read + 3; |
| if new_read > src.len() { |
| return true; |
| } |
| let second = unsafe { *(src.get_unchecked(read + 1)) }; |
| let third = unsafe { *(src.get_unchecked(read + 2)) }; |
| if ((UTF8_DATA.table[usize::from(second)] |
| & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) |
| | (third >> 6)) |
| != 2 |
| { |
| return true; |
| } |
| if in_inclusive_range8(second, 0xAC, 0xB7) { |
| if second == 0xAC { |
| if third > 0x9C { |
| return true; |
| } |
| } else { |
| return true; |
| } |
| } else if in_inclusive_range8(second, 0xB9, 0xBB) { |
| if second == 0xB9 { |
| if third > 0xAF { |
| return true; |
| } |
| } else if second == 0xBB { |
| if third != 0xBF { |
| return true; |
| } |
| } else { |
| return true; |
| } |
| } |
| } |
| 0xE0 => { |
| // Three-byte special lower bound, potentially bidi |
| let new_read = read + 3; |
| if new_read > src.len() { |
| return true; |
| } |
| let second = unsafe { *(src.get_unchecked(read + 1)) }; |
| let third = unsafe { *(src.get_unchecked(read + 2)) }; |
| if ((UTF8_DATA.table[usize::from(second)] |
| & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) |
| | (third >> 6)) |
| != 2 |
| { |
| return true; |
| } |
| // XXX can this be folded into the above validity check |
| if second < 0xA4 { |
| return true; |
| } |
| } |
| 0xED => { |
| // Three-byte special upper bound |
| let new_read = read + 3; |
| if new_read > src.len() { |
| return true; |
| } |
| let second = unsafe { *(src.get_unchecked(read + 1)) }; |
| let third = unsafe { *(src.get_unchecked(read + 2)) }; |
| if ((UTF8_DATA.table[usize::from(second)] |
| & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) |
| | (third >> 6)) |
| != 2 |
| { |
| return true; |
| } |
| } |
| _ => { |
| // Invalid lead, 4-byte lead or 2-byte bidi-only lead |
| return true; |
| } |
| } |
| return false; |
| } else { |
| return false; |
| } |
| } |
| } |
| |
| /// Checks whether a valid UTF-8 buffer contains code points that trigger |
| /// right-to-left processing. |
| /// |
| /// The check is done on a Unicode block basis without regard to assigned |
| /// vs. unassigned code points in the block. Hebrew presentation forms in |
| /// the Alphabetic Presentation Forms block are treated as if they formed |
| /// a block on their own (i.e. it treated as right-to-left). Additionally, |
| /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked |
| /// for. Control characters that are technically bidi controls but do not |
| /// cause right-to-left behavior without the presence of right-to-left |
| /// characters or right-to-left controls are not checked for. As a special |
| /// case, U+FEFF is excluded from Arabic Presentation Forms-B. |
| #[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))] |
| #[inline] |
| pub fn is_str_bidi(buffer: &str) -> bool { |
| // U+058F: D6 8F |
| // U+0590: D6 90 |
| // U+08FF: E0 A3 BF |
| // U+0900: E0 A4 80 |
| // |
| // U+200F: E2 80 8F |
| // U+202B: E2 80 AB |
| // U+202E: E2 80 AE |
| // U+2067: E2 81 A7 |
| // |
| // U+FB1C: EF AC 9C |
| // U+FB1D: EF AC 9D |
| // U+FDFF: EF B7 BF |
| // U+FE00: EF B8 80 |
| // |
| // U+FE6F: EF B9 AF |
| // U+FE70: EF B9 B0 |
| // U+FEFE: EF BB BE |
| // U+FEFF: EF BB BF |
| // |
| // U+107FF: F0 90 9F BF |
| // U+10800: F0 90 A0 80 |
| // U+10FFF: F0 90 BF BF |
| // U+11000: F0 91 80 80 |
| // |
| // U+1E7FF: F0 9E 9F BF |
| // U+1E800: F0 9E A0 80 |
| // U+1EFFF: F0 9E BF BF |
| // U+1F000: F0 9F 80 80 |
| let mut bytes = buffer.as_bytes(); |
| 'outer: loop { |
| // TODO: Instead of just validating ASCII using SIMD, use SIMD |
| // to check for non-ASCII lead bytes, too, to quickly conclude |
| // that the vector consist entirely of CJK and below-Hebrew |
| // code points. |
| // Unfortunately, scripts above Arabic but below CJK share |
| // lead bytes with RTL. |
| if let Some((mut byte, mut read)) = validate_ascii(bytes) { |
| 'inner: loop { |
| // At this point, `byte` is not included in `read`. |
| if byte < 0xE0 { |
| if byte >= 0x80 { |
| // Two-byte |
| // Adding `unlikely` here improved throughput on |
| // Russian plain text by 33%! |
| if unlikely(byte >= 0xD6) { |
| if byte == 0xD6 { |
| let second = bytes[read + 1]; |
| if second > 0x8F { |
| return true; |
| } |
| } else { |
| return true; |
| } |
| } |
| read += 2; |
| } else { |
| // ASCII: write and go back to SIMD. |
| read += 1; |
| // Intuitively, we should go back to the outer loop only |
| // if byte is 0x30 or above, so as to avoid trashing on |
| // ASCII space, comma and period in non-Latin context. |
| // However, the extra branch seems to cost more than it's |
| // worth. |
| bytes = &bytes[read..]; |
| continue 'outer; |
| } |
| } else if byte < 0xF0 { |
| // Three-byte |
| if unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) { |
| let second = bytes[read + 1]; |
| if byte == 0xE0 { |
| if second < 0xA4 { |
| return true; |
| } |
| } else if byte == 0xE2 { |
| let third = bytes[read + 2]; |
| if second == 0x80 { |
| if third == 0x8F || third == 0xAB || third == 0xAE { |
| return true; |
| } |
| } else if second == 0x81 { |
| if third == 0xA7 { |
| return true; |
| } |
| } |
| } else { |
| debug_assert_eq!(byte, 0xEF); |
| if in_inclusive_range8(second, 0xAC, 0xB7) { |
| if second == 0xAC { |
| let third = bytes[read + 2]; |
| if third > 0x9C { |
| return true; |
| } |
| } else { |
| return true; |
| } |
| } else if in_inclusive_range8(second, 0xB9, 0xBB) { |
| if second == 0xB9 { |
| let third = bytes[read + 2]; |
| if third > 0xAF { |
| return true; |
| } |
| } else if second == 0xBB { |
| let third = bytes[read + 2]; |
| if third != 0xBF { |
| return true; |
| } |
| } else { |
| return true; |
| } |
| } |
| } |
| } |
| read += 3; |
| } else { |
| // Four-byte |
| let second = bytes[read + 1]; |
| if unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) { |
| let third = bytes[read + 2]; |
| if third >= 0xA0 { |
| return true; |
| } |
| } |
| read += 4; |
| } |
| // The comparison is always < or == and never >, but including |
| // > here to let the compiler assume that < is true if this |
| // comparison is false. |
| if read >= bytes.len() { |
| return false; |
| } |
| byte = bytes[read]; |
| continue 'inner; |
| } |
| } else { |
| return false; |
| } |
| } |
| } |
| |
/// Checks whether a UTF-16 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. they are treated as right-to-left).
/// Additionally, the four RIGHT-TO-LEFT FOO controls in General Punctuation
/// are checked for. Control characters that are technically bidi controls
/// but do not cause right-to-left behavior without the presence of
/// right-to-left characters or right-to-left controls are not checked for.
/// As a special case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input contains an RTL character or an unpaired
/// high surrogate that could be the high half of an RTL character.
/// Returns `false` if the input contains neither RTL characters nor
/// unpaired high surrogates that could be higher halves of RTL characters.
pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
    // This public entry point only forwards to `is_utf16_bidi_impl`.
    is_utf16_bidi_impl(buffer)
}
| |
| /// Checks whether a scalar value triggers right-to-left processing. |
| /// |
| /// The check is done on a Unicode block basis without regard to assigned |
| /// vs. unassigned code points in the block. Hebrew presentation forms in |
| /// the Alphabetic Presentation Forms block are treated as if they formed |
| /// a block on their own (i.e. it treated as right-to-left). Additionally, |
| /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked |
| /// for. Control characters that are technically bidi controls but do not |
| /// cause right-to-left behavior without the presence of right-to-left |
| /// characters or right-to-left controls are not checked for. As a special |
| /// case, U+FEFF is excluded from Arabic Presentation Forms-B. |
| #[inline(always)] |
| pub fn is_char_bidi(c: char) -> bool { |
| // Controls: |
| // Every control with RIGHT-TO-LEFT in its name in |
| // https://www.unicode.org/charts/PDF/U2000.pdf |
| // U+200F RLM |
| // U+202B RLE |
| // U+202E RLO |
| // U+2067 RLI |
| // |
| // BMP RTL: |
| // https://www.unicode.org/roadmaps/bmp/ |
| // U+0590...U+08FF |
| // U+FB1D...U+FDFF Hebrew presentation forms and |
| // Arabic Presentation Forms A |
| // U+FE70...U+FEFE Arabic Presentation Forms B (excl. BOM) |
| // |
| // Supplementary RTL: |
| // https://www.unicode.org/roadmaps/smp/ |
| // U+10800...U+10FFF (Lead surrogate U+D802 or U+D803) |
| // U+1E800...U+1EFFF (Lead surrogate U+D83A or U+D83B) |
| let code_point = u32::from(c); |
| if code_point < 0x0590 { |
| // Below Hebrew |
| return false; |
| } |
| if in_range32(code_point, 0x0900, 0xFB1D) { |
| // Above Arabic Extended-A and below Hebrew presentation forms |
| if in_inclusive_range32(code_point, 0x200F, 0x2067) { |
| // In the range that contains the RTL controls |
| return code_point == 0x200F |
| || code_point == 0x202B |
| || code_point == 0x202E |
| || code_point == 0x2067; |
| } |
| return false; |
| } |
| if code_point > 0x1EFFF { |
| // Above second astral RTL. (Emoji is here.) |
| return false; |
| } |
| if in_range32(code_point, 0x11000, 0x1E800) { |
| // Between astral RTL blocks |
| return false; |
| } |
| if in_range32(code_point, 0xFEFF, 0x10800) { |
| // Above Arabic Presentations Forms B (excl. BOM) and below first |
| // astral RTL |
| return false; |
| } |
| if in_range32(code_point, 0xFE00, 0xFE70) { |
| // Between Arabic Presentations Forms |
| return false; |
| } |
| true |
| } |
| |
| /// Checks whether a UTF-16 code unit triggers right-to-left processing. |
| /// |
| /// The check is done on a Unicode block basis without regard to assigned |
| /// vs. unassigned code points in the block. Hebrew presentation forms in |
| /// the Alphabetic Presentation Forms block are treated as if they formed |
| /// a block on their own (i.e. it treated as right-to-left). Additionally, |
| /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked |
| /// for. Control characters that are technically bidi controls but do not |
| /// cause right-to-left behavior without the presence of right-to-left |
| /// characters or right-to-left controls are not checked for. As a special |
| /// case, U+FEFF is excluded from Arabic Presentation Forms-B. |
| /// |
| /// Since supplementary-plane right-to-left blocks are identifiable from the |
| /// high surrogate without examining the low surrogate, this function returns |
| /// `true` for such high surrogates making the function suitable for handling |
| /// supplementary-plane text without decoding surrogate pairs to scalar |
| /// values. Obviously, such high surrogates are then reported as right-to-left |
| /// even if actually unpaired. |
| #[inline(always)] |
| pub fn is_utf16_code_unit_bidi(u: u16) -> bool { |
| if u < 0x0590 { |
| // Below Hebrew |
| return false; |
| } |
| if in_range16(u, 0x0900, 0xD802) { |
| // Above Arabic Extended-A and below first RTL surrogate |
| if in_inclusive_range16(u, 0x200F, 0x2067) { |
| // In the range that contains the RTL controls |
| return u == 0x200F || u == 0x202B || u == 0x202E || u == 0x2067; |
| } |
| return false; |
| } |
| if in_range16(u, 0xD83C, 0xFB1D) { |
| // Between astral RTL high surrogates and Hebrew presentation forms |
| // (Emoji is here) |
| return false; |
| } |
| if in_range16(u, 0xD804, 0xD83A) { |
| // Between RTL high surragates |
| return false; |
| } |
| if u > 0xFEFE { |
| // Above Arabic Presentation Forms (excl. BOM) |
| return false; |
| } |
| if in_range16(u, 0xFE00, 0xFE70) { |
| // Between Arabic Presentations Forms |
| return false; |
| } |
| true |
| } |
| |
| /// Checks whether a potentially invalid UTF-8 buffer contains code points |
| /// that trigger right-to-left processing or is all-Latin1. |
| /// |
| /// Possibly more efficient than performing the checks separately. |
| /// |
| /// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`. |
| /// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return |
| /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. |
| pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi { |
| if let Some(offset) = is_utf8_latin1_impl(buffer) { |
| if is_utf8_bidi(&buffer[offset..]) { |
| Latin1Bidi::Bidi |
| } else { |
| Latin1Bidi::LeftToRight |
| } |
| } else { |
| Latin1Bidi::Latin1 |
| } |
| } |
| |
| /// Checks whether a valid UTF-8 buffer contains code points |
| /// that trigger right-to-left processing or is all-Latin1. |
| /// |
| /// Possibly more efficient than performing the checks separately. |
| /// |
| /// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`. |
| /// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return |
| /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. |
| pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi { |
| // The transition from the latin1 check to the bidi check isn't |
| // optimal but not tweaking it to perfection today. |
| if let Some(offset) = is_str_latin1_impl(buffer) { |
| if is_str_bidi(&buffer[offset..]) { |
| Latin1Bidi::Bidi |
| } else { |
| Latin1Bidi::LeftToRight |
| } |
| } else { |
| Latin1Bidi::Latin1 |
| } |
| } |
| |
/// Checks whether a potentially invalid UTF-16 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
    // This public entry point only forwards to
    // `check_utf16_for_latin1_and_bidi_impl`.
    check_utf16_for_latin1_and_bidi_impl(buffer)
}
| |
| /// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced |
| /// with the REPLACEMENT CHARACTER. |
| /// |
| /// The length of the destination buffer must be at least the length of the |
| /// source buffer _plus one_. |
| /// |
| /// Returns the number of `u16`s written. |
| /// |
| /// # Panics |
| /// |
| /// Panics if the destination buffer is shorter than stated above. |
| pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize { |
| // TODO: Can the requirement for dst to be at least one unit longer |
| // be eliminated? |
| assert!(dst.len() > src.len()); |
| let mut decoder = Utf8Decoder::new_inner(); |
| let mut total_read = 0usize; |
| let mut total_written = 0usize; |
| loop { |
| let (result, read, written) = |
| decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true); |
| total_read += read; |
| total_written += written; |
| match result { |
| DecoderResult::InputEmpty => { |
| return total_written; |
| } |
| DecoderResult::OutputFull => { |
| unreachable!("The assert at the top of the function should have caught this."); |
| } |
| DecoderResult::Malformed(_, _) => { |
| // There should always be space for the U+FFFD, because |
| // otherwise we'd have gotten OutputFull already. |
| dst[total_written] = 0xFFFD; |
| total_written += 1; |
| } |
| } |
| } |
| } |
| |
| /// Converts valid UTF-8 to valid UTF-16. |
| /// |
| /// The length of the destination buffer must be at least the length of the |
| /// source buffer. |
| /// |
| /// Returns the number of `u16`s written. |
| /// |
| /// # Panics |
| /// |
| /// Panics if the destination buffer is shorter than stated above. |
| pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize { |
| assert!( |
| dst.len() >= src.len(), |
| "Destination must not be shorter than the source." |
| ); |
| let bytes = src.as_bytes(); |
| let mut read = 0; |
| let mut written = 0; |
| 'outer: loop { |
| let mut byte = { |
| let src_remaining = &bytes[read..]; |
| let dst_remaining = &mut dst[written..]; |
| let length = src_remaining.len(); |
| match unsafe { |
| ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) |
| } { |
| None => { |
| written += length; |
| return written; |
| } |
| Some((non_ascii, consumed)) => { |
| read += consumed; |
| written += consumed; |
| non_ascii |
| } |
| } |
| }; |
| 'inner: loop { |
| // At this point, `byte` is not included in `read`. |
| if byte < 0xE0 { |
| if byte >= 0x80 { |
| // Two-byte |
| let second = unsafe { *(bytes.get_unchecked(read + 1)) }; |
| let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F); |
| unsafe { *(dst.get_unchecked_mut(written)) = point }; |
| read += 2; |
| written += 1; |
| } else { |
| // ASCII: write and go back to SIMD. |
| unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) }; |
| read += 1; |
| written += 1; |
| // Intuitively, we should go back to the outer loop only |
| // if byte is 0x30 or above, so as to avoid trashing on |
| // ASCII space, comma and period in non-Latin context. |
| // However, the extra branch seems to cost more than it's |
| // worth. |
| continue 'outer; |
| } |
| } else if byte < 0xF0 { |
| // Three-byte |
| let second = unsafe { *(bytes.get_unchecked(read + 1)) }; |
| let third = unsafe { *(bytes.get_unchecked(read + 2)) }; |
| let point = ((u16::from(byte) & 0xF) << 12) |
| | ((u16::from(second) & 0x3F) << 6) |
| | (u16::from(third) & 0x3F); |
| unsafe { *(dst.get_unchecked_mut(written)) = point }; |
| read += 3; |
| written += 1; |
| } else { |
| // Four-byte |
| let second = unsafe { *(bytes.get_unchecked(read + 1)) }; |
| let third = unsafe { *(bytes.get_unchecked(read + 2)) }; |
| let fourth = unsafe { *(bytes.get_unchecked(read + 3)) }; |
| let point = ((u32::from(byte) & 0x7) << 18) |
| | ((u32::from(second) & 0x3F) << 12) |
| | ((u32::from(third) & 0x3F) << 6) |
| | (u32::from(fourth) & 0x3F); |
| unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 }; |
| unsafe { |
| *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16 |
| }; |
| read += 4; |
| written += 2; |
| } |
| // The comparison is always < or == and never >, but including |
| // > here to let the compiler assume that < is true if this |
| // comparison is false. |
| if read >= src.len() { |
| return written; |
| } |
| byte = bytes[read]; |
| continue 'inner; |
| } |
| } |
| } |
| |
| /// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error. |
| /// |
| /// The length of the destination buffer must be at least the length of the |
| /// source buffer. |
| /// |
| /// Returns the number of `u16`s written or `None` if the input was invalid. |
| /// |
| /// When the input was invalid, some output may have been written. |
| /// |
| /// # Panics |
| /// |
| /// Panics if the destination buffer is shorter than stated above. |
| pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> { |
| assert!( |
| dst.len() >= src.len(), |
| "Destination must not be shorter than the source." |
| ); |
| let (read, written) = convert_utf8_to_utf16_up_to_invalid(src, dst); |
| if read == src.len() { |
| return Some(written); |
| } |
| None |
| } |
| |
| /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced |
| /// with the REPLACEMENT CHARACTER with potentially insufficient output |
| /// space. |
| /// |
| /// Returns the number of code units read and the number of bytes written. |
| /// |
| /// Guarantees that the bytes in the destination beyond the number of |
| /// bytes claimed as written by the second item of the return tuple |
| /// are left unmodified. |
| /// |
| /// Not all code units are read if there isn't enough output space. |
| /// |
| /// Note that this method isn't designed for general streamability but for |
| /// not allocating memory for the worst case up front. Specifically, |
| /// if the input starts with or ends with an unpaired surrogate, those are |
| /// replaced with the REPLACEMENT CHARACTER. |
| /// |
| /// Matches the semantics of `TextEncoder.encodeInto()` from the |
| /// Encoding Standard. |
| /// |
| /// # Safety |
| /// |
| /// If you want to convert into a `&mut str`, use |
| /// `convert_utf16_to_str_partial()` instead of using this function |
| /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`. |
| #[inline(always)] |
| pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) { |
| // The two functions called below are marked `inline(never)` to make |
| // transitions from the hot part (first function) into the cold part |
| // (second function) go through a return and another call to discouge |
| // the CPU from speculating from the hot code into the cold code. |
| // Letting the transitions be mere intra-function jumps, even to |
| // basic blocks out-of-lined to the end of the function would wipe |
| // away a quarter of Arabic encode performance on Haswell! |
| let (read, written) = convert_utf16_to_utf8_partial_inner(src, dst); |
| if likely(read == src.len()) { |
| return (read, written); |
| } |
| let (tail_read, tail_written) = |
| convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]); |
| (read + tail_read, written + tail_written) |
| } |
| |
| /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced |
| /// with the REPLACEMENT CHARACTER. |
| /// |
| /// The length of the destination buffer must be at least the length of the |
| /// source buffer times three. |
| /// |
| /// Returns the number of bytes written. |
| /// |
| /// # Panics |
| /// |
| /// Panics if the destination buffer is shorter than stated above. |
| /// |
| /// # Safety |
| /// |
| /// If you want to convert into a `&mut str`, use `convert_utf16_to_str()` |
| /// instead of using this function together with the `unsafe` method |
| /// `as_bytes_mut()` on `&mut str`. |
| #[inline(always)] |
| pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize { |
| assert!(dst.len() >= src.len() * 3); |
| let (read, written) = convert_utf16_to_utf8_partial(src, dst); |
| debug_assert_eq!(read, src.len()); |
| written |
| } |
| |
| /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced |
| /// with the REPLACEMENT CHARACTER such that the validity of the output is |
| /// signaled using the Rust type system with potentially insufficient output |
| /// space. |
| /// |
| /// Returns the number of code units read and the number of bytes written. |
| /// |
| /// Not all code units are read if there isn't enough output space. |
| /// |
| /// Note that this method isn't designed for general streamability but for |
| /// not allocating memory for the worst case up front. Specifically, |
| /// if the input starts with or ends with an unpaired surrogate, those are |
| /// replaced with the REPLACEMENT CHARACTER. |
| pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) { |
| let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() }; |
| let (read, written) = convert_utf16_to_utf8_partial(src, bytes); |
| let len = bytes.len(); |
| let mut trail = written; |
| while trail < len && ((bytes[trail] & 0xC0) == 0x80) { |
| bytes[trail] = 0; |
| trail += 1; |
| } |
| (read, written) |
| } |
| |
| /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced |
| /// with the REPLACEMENT CHARACTER such that the validity of the output is |
| /// signaled using the Rust type system. |
| /// |
| /// The length of the destination buffer must be at least the length of the |
| /// source buffer times three. |
| /// |
| /// Returns the number of bytes written. |
| /// |
| /// # Panics |
| /// |
| /// Panics if the destination buffer is shorter than stated above. |
| #[inline(always)] |
| pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize { |
| assert!(dst.len() >= src.len() * 3); |
| let (read, written) = convert_utf16_to_str_partial(src, dst); |
| debug_assert_eq!(read, src.len()); |
| written |
| } |
| |
| /// Converts bytes whose unsigned value is interpreted as Unicode code point |
| /// (i.e. U+0000 to U+00FF, inclusive) to UTF-16. |
| /// |
| /// The length of the destination buffer must be at least the length of the |
| /// source buffer. |
| /// |
| /// The number of `u16`s written equals the length of the source buffer. |
| /// |
| /// # Panics |
| /// |
| /// Panics if the destination buffer is shorter than stated above. |
| pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) { |
| assert!( |
| dst.len() >= src.len(), |
| "Destination must not be shorter than the source." |
| ); |
| // TODO: On aarch64, the safe version autovectorizes to the same unpacking |
| // instructions and this code, but, yet, the autovectorized version is |
| // faster. |
| unsafe { |
| unpack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len()); |
| } |
| } |
| |
| /// Converts bytes whose unsigned value is interpreted as Unicode code point |
| /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient |
| /// output space. |
| /// |
| /// Returns the number of bytes read and the number of bytes written. |
| /// |
| /// If the output isn't large enough, not all input is consumed. |
| /// |
| /// # Safety |
| /// |
| /// If you want to convert into a `&mut str`, use |
| /// `convert_utf16_to_str_partial()` instead of using this function |
| /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`. |
| pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) { |
| let src_len = src.len(); |
| let src_ptr = src.as_ptr(); |
| let dst_ptr = dst.as_mut_ptr(); |
| let dst_len = dst.len(); |
| let mut total_read = 0usize; |
| let mut total_written = 0usize; |
| loop { |
| // src can't advance more than dst |
| let src_left = src_len - total_read; |
| let dst_left = dst_len - total_written; |
| let min_left = ::core::cmp::min(src_left, dst_left); |
| if let Some((non_ascii, consumed)) = unsafe { |
| ascii_to_ascii( |
| src_ptr.add(total_read), |
| dst_ptr.add(total_written), |
| min_left, |
| ) |
| } { |
| total_read += consumed; |
| total_written += consumed; |
| if total_written.checked_add(2).unwrap() > dst_len { |
| return (total_read, total_written); |
| } |
| |
| total_read += 1; // consume `non_ascii` |
| |
| dst[total_written] = (non_ascii >> 6) | 0xC0; |
| total_written += 1; |
| dst[total_written] = (non_ascii & 0x3F) | 0x80; |
| total_written += 1; |
| continue; |
| } |
| return (total_read + min_left, total_written + min_left); |
| } |
| } |
| |
| /// Converts bytes whose unsigned value is interpreted as Unicode code point |
| /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8. |
| /// |
| /// The length of the destination buffer must be at least the length of the |
| /// source buffer times two. |
| /// |
| /// Returns the number of bytes written. |
| /// |
| /// # Panics |
| /// |
| /// Panics if the destination buffer is shorter than stated above. |
| /// |
| /// # Safety |
| /// |
| /// Note that this function may write garbage beyond the number of bytes |
| /// indicated by the return value, so using a `&mut str` interpreted as |
| /// `&mut [u8]` as the destination is not safe. If you want to convert into |
| /// a `&mut str`, use `convert_utf16_to_str()` instead of this function. |
| #[inline] |
| pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize { |
| assert!( |
| dst.len() >= src.len() * 2, |
| "Destination must not be shorter than the source times two." |
| ); |
| let (read, written) = convert_latin1_to_utf8_partial(src, dst); |
| debug_assert_eq!(read, src.len()); |
| written |
| } |
| |
| /// Converts bytes whose unsigned value is interpreted as Unicode code point |
| /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the |
| /// output is signaled using the Rust type system with potentially insufficient |
| /// output space. |
| /// |
| /// Returns the number of bytes read and the number of bytes written. |
| /// |
| /// If the output isn't large enough, not all input is consumed. |
| #[inline] |
| pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) { |
| let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() }; |
| let (read, written) = convert_latin1_to_utf8_partial(src, bytes); |
| let len = bytes.len(); |
| let mut trail = written; |
| let max = ::core::cmp::min(len, trail + MAX_STRIDE_SIZE); |
| while trail < max { |
| bytes[trail] = 0; |
| trail += 1; |
| } |
| while trail < len && ((bytes[trail] & 0xC0) == 0x80) { |
| bytes[trail] = 0; |
| trail += 1; |
| } |
| (read, written) |
| } |
| |
| /// Converts bytes whose unsigned value is interpreted as Unicode code point |
| /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the |
| /// output is signaled using the Rust type system. |
| /// |
| /// The length of the destination buffer must be at least the length of the |
| /// source buffer times two. |
| /// |
| /// Returns the number of bytes written. |
| /// |
| /// # Panics |
| /// |
| /// Panics if the destination buffer is shorter than stated above. |
| #[inline] |
| pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize { |
| assert!( |
| dst.len() >= src.len() * 2, |
| "Destination must not be shorter than the source times two." |
| ); |
| let (read, written) = convert_latin1_to_str_partial(src, dst); |
| debug_assert_eq!(read, src.len()); |
| written |
| } |
| |
| /// If the input is valid UTF-8 representing only Unicode code points from |
| /// U+0000 to U+00FF, inclusive, converts the input into output that |
| /// represents the value of each code point as the unsigned byte value of |
| /// each output byte. |
| /// |
| /// If the input does not fulfill the condition stated above, this function |
| /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise |
| /// does something that is memory-safe without any promises about any |
| /// properties of the output. In particular, callers shouldn't assume the |
| /// output to be the same across crate versions or CPU architectures and |
| /// should not assume that non-ASCII input can't map to ASCII output. |
| /// |
| /// The length of the destination buffer must be at least the length of the |
| /// source buffer. |
| /// |
| /// Returns the number of bytes written. |
| /// |
| /// # Panics |
| /// |
| /// Panics if the destination buffer is shorter than stated above. |
| /// |
| /// If debug assertions are enabled (and not fuzzing) and the input is |
| /// not in the range U+0000 to U+00FF, inclusive. |
| pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize { |
| assert!( |
| dst.len() >= src.len(), |
| "Destination must not be shorter than the source." |
| ); |
| non_fuzz_debug_assert!(is_utf8_latin1(src)); |
| let src_len = src.len(); |
| let src_ptr = src.as_ptr(); |
| let dst_ptr = dst.as_mut_ptr(); |
| let mut total_read = 0usize; |
| let mut total_written = 0usize; |
| loop { |
| // dst can't advance more than src |
| let src_left = src_len - total_read; |
| if let Some((non_ascii, consumed)) = unsafe { |
| ascii_to_ascii( |
| src_ptr.add(total_read), |
| dst_ptr.add(total_written), |
| src_left, |
| ) |
| } { |
| total_read += consumed + 1; |
| total_written += consumed; |
| |
| if total_read == src_len { |
| return total_written; |
| } |
| |
| let trail = src[total_read]; |
| total_read += 1; |
| |
| dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F); |
| total_written += 1; |
| continue; |
| } |
| return total_written + src_left; |
| } |
| } |
| |
| /// If the input is valid UTF-16 representing only Unicode code points from |
| /// U+0000 to U+00FF, inclusive, converts the input into output that |
| /// represents the value of each code point as the unsigned byte value of |
| /// each output byte. |
| /// |
| /// If the input does not fulfill the condition stated above, does something |
| /// that is memory-safe without any promises about any properties of the |
| /// output and will probably assert in debug builds in future versions. |
| /// In particular, callers shouldn't assume the output to be the same across |
| /// crate versions or CPU architectures and should not assume that non-ASCII |
| /// input can't map to ASCII output. |
| /// |
| /// The length of the destination buffer must be at least the length of the |
| /// source buffer. |
| /// |
| /// The number of bytes written equals the length of the source buffer. |
| /// |
| /// # Panics |
| /// |
| /// Panics if the destination buffer is shorter than stated above. |
| /// |
| /// (Probably in future versions if debug assertions are enabled (and not |
| /// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.) |
| pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) { |
| assert!( |
| dst.len() >= src.len(), |
| "Destination must not be shorter than the source." |
| ); |
| // non_fuzz_debug_assert!(is_utf16_latin1(src)); |
| unsafe { |
| pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len()); |
| } |
| } |
| |
/// Converts bytes whose unsigned value is interpreted as a Unicode code
/// point (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        // The whole input was reported as ASCII, which is valid UTF-8 as-is.
        let s: &str = unsafe { ::core::str::from_utf8_unchecked(bytes) };
        return Cow::Borrowed(s);
    }
    let (head, tail) = bytes.split_at(up_to);
    // Worst case: every byte of the non-ASCII tail becomes two UTF-8 bytes.
    let capacity = head.len() + tail.len() * 2;
    let mut vec = Vec::with_capacity(capacity);
    // Copy the ASCII prefix verbatim, then zero-fill the rest up to the
    // worst-case length. `resize` stays within the reserved capacity, so
    // this is still a single allocation, and unlike `set_len` it never
    // exposes uninitialized memory through a `&mut [u8]`.
    vec.extend_from_slice(head);
    vec.resize(capacity, 0u8);
    let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
    // Drop the unused part of the worst-case reservation.
    vec.truncate(up_to + written);
    // The prefix is ASCII and the remainder is the output of the Latin1 to
    // UTF-8 conversion, cut at the length it reported.
    Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
}
| |
/// If the input is valid UTF-8 representing only Unicode code points from
/// U+0000 to U+00FF, inclusive, converts the input into output that
/// represents the value of each code point as the unsigned byte value of
/// each output byte.
///
/// If the input does not fulfill the condition stated above, this function
/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
/// does something that is memory-safe without any promises about any
/// properties of the output. In particular, callers shouldn't assume the
/// output to be the same across crate versions or CPU architectures and
/// should not assume that non-ASCII input can't map to ASCII output.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// The owned result consists of the ASCII prefix copied verbatim followed by
/// the conversion of the remainder; the buffer is sized to the byte length
/// of the input and trimmed to the number of bytes actually written.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
    let bytes = string.as_bytes();
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        return Cow::Borrowed(bytes);
    }
    let (head, tail) = bytes.split_at(up_to);
    let capacity = bytes.len();
    // Allocate once, then initialize every byte before a slice is formed
    // over it: `Vec::set_len` requires the exposed elements to be
    // initialized, so the buffer is filled with the ASCII prefix and
    // zero-padded instead of being left uninitialized. Neither call
    // reallocates because the final length equals the reserved capacity.
    let mut vec: Vec<u8> = Vec::with_capacity(capacity);
    vec.extend_from_slice(head);
    vec.resize(capacity, 0);
    let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
    // Drop the zero padding that the conversion did not overwrite.
    vec.truncate(up_to + written);
    Cow::Owned(vec)
}
| |
/// Returns the index of the first unpaired surrogate or, if the input is
/// valid UTF-16 in its entirety, the length of the input.
///
/// This is the public entry point; the scan itself is delegated to
/// `utf16_valid_up_to_impl`.
pub fn utf16_valid_up_to(buffer: &[u16]) -> usize {
    utf16_valid_up_to_impl(buffer)
}
| |
| /// Returns the index of first byte that starts an invalid byte |
| /// sequence or a non-Latin1 byte sequence, or the length of the |
| /// string if there are neither. |
| pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize { |
| is_utf8_latin1_impl(buffer).unwrap_or(buffer.len()) |
| } |
| |
| /// Returns the index of first byte that starts a non-Latin1 byte |
| /// sequence, or the length of the string if there are none. |
| pub fn str_latin1_up_to(buffer: &str) -> usize { |
| is_str_latin1_impl(buffer).unwrap_or_else(|| buffer.len()) |
| } |
| |
| /// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER. |
| #[inline] |
| pub fn ensure_utf16_validity(buffer: &mut [u16]) { |
| let mut offset = 0; |
| loop { |
| offset += utf16_valid_up_to(&buffer[offset..]); |
| if offset == buffer.len() { |
| return; |
| } |
| buffer[offset] = 0xFFFD; |
| offset += 1; |
| } |
| } |
| |
| /// Copies ASCII from source to destination up to the first non-ASCII byte |
| /// (or the end of the input if it is ASCII in its entirety). |
| /// |
| /// The length of the destination buffer must be at least the length of the |
| /// source buffer. |
| /// |
| /// Returns the number of bytes written. |
| /// |
| /// # Panics |
| /// |
| /// Panics if the destination buffer is shorter than stated above. |
| pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize { |
| assert!( |
| dst.len() >= src.len(), |
| "Destination must not be shorter than the source." |
| ); |
| if let Some((_, consumed)) = |
| unsafe { ascii_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) } |
| { |
| consumed |
| } else { |
| src.len() |
| } |
| } |
| |
| /// Copies ASCII from source to destination zero-extending it to UTF-16 up to |
| /// the first non-ASCII byte (or the end of the input if it is ASCII in its |
| /// entirety). |
| /// |
| /// The length of the destination buffer must be at least the length of the |
| /// source buffer. |
| /// |
| /// Returns the number of `u16`s written. |
| /// |
| /// # Panics |
| /// |
| /// Panics if the destination buffer is shorter than stated above. |
| pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize { |
| assert!( |
| dst.len() >= src.len(), |
| "Destination must not be shorter than the source." |
| ); |
| if let Some((_, consumed)) = |
| unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) } |
| { |
| consumed |
| } else { |
| src.len() |
| } |
| } |
| |
| /// Copies Basic Latin from source to destination narrowing it to ASCII up to |
| /// the first non-Basic Latin code unit (or the end of the input if it is |
| /// Basic Latin in its entirety). |
| /// |
| /// The length of the destination buffer must be at least the length of the |
| /// source buffer. |
| /// |
| /// Returns the number of bytes written. |
| /// |
| /// # Panics |
| /// |
| /// Panics if the destination buffer is shorter than stated above. |
| pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize { |
| assert!( |
| dst.len() >= src.len(), |
| "Destination must not be shorter than the source." |
| ); |
| if let Some((_, consumed)) = |
| unsafe { basic_latin_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) } |
| { |
| consumed |
| } else { |
| src.len() |
| } |
| } |
| |
| // Any copyright to the test code below this comment is dedicated to the |
| // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
| |
| #[cfg(all(test, feature = "alloc"))] |
| mod tests { |
| use super::*; |
| |
#[test]
fn test_is_ascii_success() {
    // Every ASCII byte value in ascending order; each suffix must be ASCII.
    let ascii: Vec<u8> = (0u8..128).collect();
    for start in 0..ascii.len() {
        assert!(is_ascii(&ascii[start..]));
    }
}
| |
#[test]
fn test_is_ascii_fail() {
    let mut bytes: Vec<u8> = (0u8..128).collect();
    for start in 0..bytes.len() {
        let suffix = &mut bytes[start..];
        // Poison one more position on each step; the suffix must be
        // rejected every time.
        for poisoned in 0..suffix.len() {
            suffix[poisoned] = 0xA0;
            assert!(!is_ascii(suffix));
        }
    }
}
| |
#[test]
fn test_is_basic_latin_success() {
    // Every Basic Latin code unit in ascending order; each suffix must pass.
    let units: Vec<u16> = (0u16..128).collect();
    for start in 0..units.len() {
        assert!(is_basic_latin(&units[start..]));
    }
}
| |
#[test]
fn test_is_basic_latin_fail() {
    let mut units: Vec<u16> = (0u16..128).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        // Poison one more position on each step; the suffix must be
        // rejected every time.
        for poisoned in 0..suffix.len() {
            suffix[poisoned] = 0xA0;
            assert!(!is_basic_latin(suffix));
        }
    }
}
| |
#[test]
fn test_is_utf16_latin1_success() {
    // Every Latin1 code unit in ascending order; each suffix must pass both
    // the boolean check and the combined Latin1/bidi check.
    let units: Vec<u16> = (0u16..256).collect();
    for start in 0..units.len() {
        let suffix = &units[start..];
        assert!(is_utf16_latin1(suffix));
        assert_eq!(check_utf16_for_latin1_and_bidi(suffix), Latin1Bidi::Latin1);
    }
}
| |
#[test]
fn test_is_utf16_latin1_fail() {
    // Miri is too slow for the full range.
    let unit_count: u16 = if cfg!(miri) { 64 } else { 256 };
    let mut units: Vec<u16> = (0..unit_count).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        // Push one more position out of the Latin1 range on each step.
        for poisoned in 0..suffix.len() {
            suffix[poisoned] = 0x100 + poisoned as u16;
            assert!(!is_utf16_latin1(suffix));
            assert_ne!(check_utf16_for_latin1_and_bidi(suffix), Latin1Bidi::Latin1);
        }
    }
}
| |
#[test]
fn test_is_str_latin1_success() {
    // Miri is too slow for the full range.
    let unit_count: u16 = if cfg!(miri) { 64 } else { 256 };
    let units: Vec<u16> = (0..unit_count).collect();
    for start in 0..units.len() {
        let text = String::from_utf16(&units[start..]).unwrap();
        assert!(is_str_latin1(text.as_str()));
        assert_eq!(check_str_for_latin1_and_bidi(text.as_str()), Latin1Bidi::Latin1);
    }
}
| |
#[test]
fn test_is_str_latin1_fail() {
    // Miri is too slow for the full range.
    let unit_count: u16 = if cfg!(miri) { 32 } else { 256 };
    let mut units: Vec<u16> = (0..unit_count).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        // Push one more position out of the Latin1 range on each step.
        for poisoned in 0..suffix.len() {
            suffix[poisoned] = 0x100 + poisoned as u16;
            let text = String::from_utf16(suffix).unwrap();
            assert!(!is_str_latin1(text.as_str()));
            assert_ne!(check_str_for_latin1_and_bidi(text.as_str()), Latin1Bidi::Latin1);
        }
    }
}
| |
#[test]
fn test_is_utf8_latin1_success() {
    // Miri is too slow for the full range.
    let unit_count: u16 = if cfg!(miri) { 64 } else { 256 };
    let units: Vec<u16> = (0..unit_count).collect();
    for start in 0..units.len() {
        let text = String::from_utf16(&units[start..]).unwrap();
        let utf8 = text.as_bytes();
        assert!(is_utf8_latin1(utf8));
        assert_eq!(check_utf8_for_latin1_and_bidi(utf8), Latin1Bidi::Latin1);
    }
}
| |
#[test]
fn test_is_utf8_latin1_fail() {
    // Miri is too slow for the full range.
    let unit_count: u16 = if cfg!(miri) { 32 } else { 256 };
    let mut units: Vec<u16> = (0..unit_count).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        // Push one more position out of the Latin1 range on each step.
        for poisoned in 0..suffix.len() {
            suffix[poisoned] = 0x100 + poisoned as u16;
            let text = String::from_utf16(suffix).unwrap();
            let utf8 = text.as_bytes();
            assert!(!is_utf8_latin1(utf8));
            assert_ne!(check_utf8_for_latin1_and_bidi(utf8), Latin1Bidi::Latin1);
        }
    }
}
| |
#[test]
fn test_is_utf8_latin1_invalid() {
    // Byte sequences that are not valid UTF-8, each also tried with a
    // leading ASCII byte.
    let invalid_inputs: [&[u8]; 6] = [
        b"\xC3",
        b"a\xC3",
        b"\xFF",
        b"a\xFF",
        b"\xC3\xFF",
        b"a\xC3\xFF",
    ];
    for input in invalid_inputs.iter() {
        assert!(!is_utf8_latin1(input), "input: {:?}", input);
    }
}
| |
#[test]
fn test_convert_utf8_to_utf16() {
    let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let expected: Vec<u16> = src.encode_utf16().collect();
    // One code unit more than the UTF-8 byte length of the input.
    let mut dst = vec![0u16; src.len() + 1];
    let written = convert_utf8_to_utf16(src.as_bytes(), &mut dst[..]);
    assert_eq!(&dst[..written], &expected[..]);
}
| |
#[test]
fn test_convert_str_to_utf16() {
    let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let expected: Vec<u16> = src.encode_utf16().collect();
    // As many code units as the input has UTF-8 bytes.
    let mut dst = vec![0u16; src.len()];
    let written = convert_str_to_utf16(src, &mut dst[..]);
    assert_eq!(&dst[..written], &expected[..]);
}
| |
#[test]
fn test_convert_utf16_to_utf8_partial() {
    let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let src: Vec<u16> = reference.encode_utf16().collect();
    let mut dst = vec![0u8; src.len() * 3 + 1];
    // First pass: offer only the first 24 bytes of the output buffer.
    let (read, written) = convert_utf16_to_utf8_partial(&src[..], &mut dst[..24]);
    // Second pass: convert whatever the first pass left unread.
    let total = written + convert_utf16_to_utf8(&src[read..], &mut dst[written..]);
    assert_eq!(&dst[..total], reference.as_bytes());
}
| |
#[test]
fn test_convert_utf16_to_utf8() {
    let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let src: Vec<u16> = reference.encode_utf16().collect();
    let mut dst = vec![0u8; src.len() * 3 + 1];
    let written = convert_utf16_to_utf8(&src[..], &mut dst[..]);
    assert_eq!(&dst[..written], reference.as_bytes());
}
| |
#[test]
fn test_convert_latin1_to_utf16() {
    // Every Latin1 byte and the code unit it must widen to.
    let src: Vec<u8> = (0..=255u8).collect();
    let reference: Vec<u16> = (0u16..256).collect();
    let mut dst = vec![0u16; src.len()];
    convert_latin1_to_utf16(&src[..], &mut dst[..]);
    assert_eq!(dst, reference);
}
| |
#[test]
fn test_convert_latin1_to_utf8_partial() {
    // Two bytes of zeroed output space. (The previous `[0u8, 2]` was a
    // comma-for-semicolon typo producing the two elements 0 and 2.) The
    // test expects the conversion to stop after the ASCII byte: one byte
    // read and one byte written.
    let mut dst = [0u8; 2];
    let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
    assert_eq!(read, 1);
    assert_eq!(written, 1);
}
| |
#[test]
fn test_convert_latin1_to_utf8() {
    // Every Latin1 byte, and the same code points as UTF-16 so that the
    // standard library can produce the expected UTF-8.
    let src: Vec<u8> = (0..=255u8).collect();
    let code_units: Vec<u16> = (0u16..256).collect();
    let expected = String::from_utf16(&code_units[..]).unwrap();
    let mut dst = vec![0u8; src.len() * 2];
    let written = convert_latin1_to_utf8(&src[..], &mut dst[..]);
    assert_eq!(&dst[..written], expected.as_bytes());
}
| |
#[test]
fn test_convert_utf8_to_latin1_lossy() {
    // Every Latin1 code point as UTF-8 input, and the bytes it must map to.
    let reference: Vec<u8> = (0..=255u8).collect();
    let code_units: Vec<u16> = (0u16..256).collect();
    let src = String::from_utf16(&code_units[..]).unwrap();
    let mut dst = vec![0u8; src.len()];
    let written = convert_utf8_to_latin1_lossy(src.as_bytes(), &mut dst[..]);
    dst.truncate(written);
    assert_eq!(dst, reference);
}
| |
#[cfg(all(debug_assertions, not(fuzzing)))]
#[test]
#[should_panic]
fn test_convert_utf8_to_latin1_lossy_panics() {
    // U+0100 is the first code point outside the Latin1 range.
    let non_latin1 = "\u{100}";
    let mut sink = [0u8; 16];
    let _ = convert_utf8_to_latin1_lossy(non_latin1.as_bytes(), &mut sink[..]);
}
| |
#[test]
fn test_convert_utf16_to_latin1_lossy() {
    // Every Latin1 code unit and the byte it must narrow to.
    let src: Vec<u16> = (0u16..256).collect();
    let reference: Vec<u8> = (0..=255u8).collect();
    let mut dst = vec![0u8; src.len()];
    convert_utf16_to_latin1_lossy(&src[..], &mut dst[..]);
    assert_eq!(dst, reference);
}
| |
#[test]
// #[should_panic]
fn test_convert_utf16_to_latin1_lossy_panics() {
    // The expected-panic attribute above is disabled, so for now this only
    // checks that non-Latin1 input (U+0100) is handled without panicking.
    let non_latin1 = [0x0100u16];
    let mut sink = [0u8; 16];
    let _ = convert_utf16_to_latin1_lossy(&non_latin1, &mut sink[..]);
}
| |
#[test]
fn test_utf16_valid_up_to() {
    // Builds `zeros` U+0000 code units followed by `tail`.
    fn zero_padded(zeros: usize, tail: &[u16]) -> Vec<u16> {
        let mut units = vec![0u16; zeros];
        units.extend_from_slice(tail);
        units
    }

    // A properly paired surrogate: the whole buffer is valid.
    let valid = zero_padded(12, &[0x2603, 0xD83D, 0xDCA9, 0x00B6]);
    assert_eq!(utf16_valid_up_to(&valid[..]), 16);

    // A high surrogate followed by a non-surrogate.
    let lone_high = zero_padded(13, &[0x2603, 0xD83D, 0x00B6]);
    assert_eq!(utf16_valid_up_to(&lone_high[..]), 14);

    // A low surrogate without a preceding high surrogate.
    let lone_low = zero_padded(13, &[0x2603, 0xDCA9, 0x00B6]);
    assert_eq!(utf16_valid_up_to(&lone_low[..]), 14);

    // A high surrogate as the very last code unit.
    let lone_high_at_end = zero_padded(13, &[0x2603, 0x00B6, 0xD83D]);
    assert_eq!(utf16_valid_up_to(&lone_high_at_end[..]), 15);
}
| |
#[test]
fn test_ensure_utf16_validity() {
    // 31 code units, all U+0000 except for the surrogates placed below.
    let mut src = vec![0u16; 31];
    src[1] = 0xD83D; // lone high surrogate
    src[5] = 0xD83D; // properly paired high surrogate...
    src[6] = 0xDCA9; // ...and its low surrogate
    src[13] = 0xDCA9; // lone low surrogate

    // Only the two lone surrogates are replaced; the pair is kept.
    let mut reference = src.clone();
    reference[1] = 0xFFFD;
    reference[13] = 0xFFFD;

    ensure_utf16_validity(&mut src[..]);
    assert_eq!(src, reference);
}
| |
#[test]
fn test_is_char_bidi() {
    // Characters that must not be reported as right-to-left.
    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    for &c in not_bidi.iter() {
        assert!(!is_char_bidi(c), "U+{:04X}", c as u32);
    }

    // Characters that must be reported as right-to-left.
    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in bidi.iter() {
        assert!(is_char_bidi(c), "U+{:04X}", c as u32);
    }
}
| |
#[test]
fn test_is_utf16_code_unit_bidi() {
    // Code units that must not be reported as right-to-left.
    let not_bidi = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &unit in not_bidi.iter() {
        assert!(!is_utf16_code_unit_bidi(unit), "0x{:04X}", unit);
    }

    // Code units that must be reported as right-to-left.
    let bidi = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &unit in bidi.iter() {
        assert!(is_utf16_code_unit_bidi(unit), "0x{:04X}", unit);
    }
}
| |
#[test]
fn test_is_str_bidi() {
    // Strings that must not be reported as containing right-to-left text.
    let not_bidi = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    for &text in not_bidi.iter() {
        assert!(!is_str_bidi(text), "input: {:?}", text);
    }

    // Strings that must be reported as containing right-to-left text.
    let bidi = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in bidi.iter() {
        assert!(is_str_bidi(text), "input: {:?}", text);
    }
}
| |
#[test]
fn test_is_utf8_bidi() {
    // Inputs that must not be reported as containing right-to-left text.
    let not_bidi = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    for &text in not_bidi.iter() {
        assert!(!is_utf8_bidi(text.as_bytes()), "input: {:?}", text);
    }

    // Inputs that must be reported as containing right-to-left text.
    let bidi = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in bidi.iter() {
        assert!(is_utf8_bidi(text.as_bytes()), "input: {:?}", text);
    }
}
| |
#[test]
fn test_is_utf16_bidi() {
    // Eight Basic Latin code units placed before and after the code unit
    // under test.
    const PAD: [u16; 8] = [0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69];

    // Builds the 17-unit input `PAD ++ [middle] ++ PAD`.
    fn surround(middle: u16) -> [u16; 17] {
        let mut units = [0u16; 17];
        units[..8].copy_from_slice(&PAD);
        units[8] = middle;
        units[9..].copy_from_slice(&PAD);
        units
    }

    // Middles that must not make the buffer right-to-left.
    let not_bidi = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &middle in not_bidi.iter() {
        let units = surround(middle);
        assert!(!is_utf16_bidi(&units[..]), "middle: 0x{:04X}", middle);
    }

    // Middles that must make the buffer right-to-left.
    let bidi = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &middle in bidi.iter() {
        let units = surround(middle);
        assert!(is_utf16_bidi(&units[..]), "middle: 0x{:04X}", middle);
    }

    // A right-to-left code unit immediately followed by a non-ASCII,
    // non-bidi one.
    assert!(is_utf16_bidi(&[
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
        0x66, 0x67, 0x68, 0x69,
    ]));
}
| |
#[test]
fn test_check_str_for_latin1_and_bidi() {
    // Inputs for which the result must be anything but `Bidi`.
    let not_bidi = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    for &text in not_bidi.iter() {
        assert_ne!(
            check_str_for_latin1_and_bidi(text),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }

    // Inputs for which the result must be exactly `Bidi`.
    let bidi = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in bidi.iter() {
        assert_eq!(
            check_str_for_latin1_and_bidi(text),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }
}
| |
#[test]
fn test_check_utf8_for_latin1_and_bidi() {
    // Inputs for which the result must be anything but `Bidi`.
    let not_bidi = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    for &text in not_bidi.iter() {
        assert_ne!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }

    // Inputs for which the result must be exactly `Bidi`.
    let bidi = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in bidi.iter() {
        assert_eq!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }
}
| |
/// Exercises `check_utf16_for_latin1_and_bidi` with one probe code unit
/// placed between two runs of ASCII padding.
#[test]
fn test_check_utf16_for_latin1_and_bidi() {
    // ASCII "bcdefghi"; the same eight code units sit on each side of the probe.
    const PADDING: [u16; 8] = [0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69];

    // Probes for which the result must differ from `Latin1Bidi::Bidi`.
    const NOT_BIDI_PROBES: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];

    // Probes for which the result must be exactly `Latin1Bidi::Bidi`.
    const BIDI_PROBES: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];

    // Layout: 8 padding units, the probe at index 8, 8 padding units.
    let mut units = [0u16; 17];
    units[..8].copy_from_slice(&PADDING);
    units[9..].copy_from_slice(&PADDING);

    for &probe in NOT_BIDI_PROBES.iter() {
        units[8] = probe;
        assert_ne!(
            check_utf16_for_latin1_and_bidi(&units[..]),
            Latin1Bidi::Bidi,
            "probe {:#06X}",
            probe
        );
    }

    for &probe in BIDI_PROBES.iter() {
        units[8] = probe;
        assert_eq!(
            check_utf16_for_latin1_and_bidi(&units[..]),
            Latin1Bidi::Bidi,
            "probe {:#06X}",
            probe
        );
    }

    // A bidi probe (0x0590) directly followed by 0x3041, which on its own is
    // in the not-bidi list above; the overall answer must still be `Bidi`.
    let mut pair = [0u16; 18];
    pair[..8].copy_from_slice(&PADDING);
    pair[8] = 0x0590;
    pair[9] = 0x3041;
    pair[10..].copy_from_slice(&PADDING);
    assert_eq!(
        check_utf16_for_latin1_and_bidi(&pair[..]),
        Latin1Bidi::Bidi
    );
}
| |
/// Deliberately simple reference predicate used by the exhaustive tests as
/// the expected answer for the `char`-, `str`- and UTF-8-based bidi checks.
///
/// Returns `true` when `c` lies in one of the listed inclusive code point
/// ranges or equals one of the listed individual code points.
#[inline(always)]
pub fn reference_is_char_bidi(c: char) -> bool {
    // Inclusive `(first, last)` code point ranges.
    const RANGES: [(u32, u32); 5] = [
        (0x0590, 0x08FF),
        (0xFB1D, 0xFDFF),
        (0xFE70, 0xFEFE),
        (0x10800, 0x10FFF),
        (0x1E800, 0x1EFFF),
    ];
    // Code points matched one by one.
    const SINGLES: [u32; 4] = [0x200F, 0x202B, 0x202E, 0x2067];

    let code_point = u32::from(c);
    let in_range = RANGES
        .iter()
        .any(|&(first, last)| first <= code_point && code_point <= last);
    in_range || SINGLES.contains(&code_point)
}
| |
/// Deliberately simple reference predicate used by the exhaustive tests as
/// the expected answer for the UTF-16 code-unit-based bidi checks.
///
/// Returns `true` when `u` lies in one of three inclusive BMP ranges, is one
/// of four listed high surrogates, or is one of four listed individual
/// code units.
#[inline(always)]
pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
    let in_bmp_range = (0x0590..=0x08FF).contains(&u)
        || (0xFB1D..=0xFDFF).contains(&u)
        || (0xFE70..=0xFEFE).contains(&u);

    // High surrogates that lead the pairs for U+10800..=U+10FFF
    // (0xD802, 0xD803) and U+1E800..=U+1EFFF (0xD83A, 0xD83B).
    let is_listed_high_surrogate = [0xD802, 0xD803, 0xD83A, 0xD83B].contains(&u);

    // Code units matched one by one.
    let is_listed_single = [0x200F, 0x202B, 0x202E, 0x2067].contains(&u);

    in_bmp_range || is_listed_high_surrogate || is_listed_single
}
| |
/// Compares `is_char_bidi` against `reference_is_char_bidi` for every value
/// in `0..0xD800` and `0xE000..0x110000`.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_char_bidi_thoroughly() {
    // 0xD800..0xE000 is left out: those values are surrogates and cannot be
    // turned into a `char`, so `from_u32(..).unwrap()` would panic on them.
    let below_surrogates = 0..0xD800u32;
    let above_surrogates = 0xE000..0x110000u32;
    for scalar in below_surrogates.chain(above_surrogates) {
        let ch: char = ::core::char::from_u32(scalar).unwrap();
        assert_eq!(is_char_bidi(ch), reference_is_char_bidi(ch));
    }
}
| |
/// Compares `is_utf16_code_unit_bidi` against
/// `reference_is_utf16_code_unit_bidi` for all 65536 possible `u16` values.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_code_unit_bidi_thoroughly() {
    for unit in 0..=0xFFFFu16 {
        let actual = is_utf16_code_unit_bidi(unit);
        let expected = reference_is_utf16_code_unit_bidi(unit);
        assert_eq!(actual, expected);
    }
}
| |
/// Compares `is_str_bidi` on the UTF-8 encoding of a single character against
/// `reference_is_char_bidi`, for every value in `0..0xD800` and
/// `0xE000..0x110000`.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_str_bidi_thoroughly() {
    // Four bytes is the longest UTF-8 encoding of a single `char`.
    let mut scratch = [0u8; 4];
    // The surrogate values between the two ranges are not valid `char`s.
    for scalar in (0..0xD800u32).chain(0xE000..0x110000u32) {
        let ch: char = ::core::char::from_u32(scalar).unwrap();
        let encoded: &str = ch.encode_utf8(&mut scratch[..]);
        assert_eq!(is_str_bidi(encoded), reference_is_char_bidi(ch));
    }
}
| |
/// Compares `is_utf8_bidi` against `reference_is_char_bidi` for every value
/// in `0..0xD800` and `0xE000..0x110000`, checking each character twice:
/// once as exactly its encoded bytes, and once as the full 8-byte buffer with
/// the remainder zero-filled.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf8_bidi_thoroughly() {
    let mut buf = [0u8; 8];
    // The surrogate values between the two ranges are not valid `char`s.
    for scalar in (0..0xD800u32).chain(0xE000..0x110000u32) {
        let ch: char = ::core::char::from_u32(scalar).unwrap();
        let expected = reference_is_char_bidi(ch);

        // First check: only the bytes that encode `ch`.
        let encoded_len = {
            let encoded = ch.encode_utf8(&mut buf[..]).as_bytes();
            assert_eq!(is_utf8_bidi(encoded), expected);
            encoded.len()
        };

        // Clear whatever a previous, longer encoding left behind so the
        // second check sees `ch` followed only by zero bytes.
        for stale in buf[encoded_len..].iter_mut() {
            *stale = 0;
        }

        // Second check: the whole buffer.
        assert_eq!(is_utf8_bidi(&buf[..]), expected);
    }
}
| |
/// Compares `is_utf16_bidi` against `reference_is_utf16_code_unit_bidi` for
/// all 65536 possible `u16` values, each placed at index 15 of an otherwise
/// zero-filled 32-unit buffer.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_bidi_thoroughly() {
    let mut units = [0u16; 32];
    for unit in 0..=0xFFFFu16 {
        units[15] = unit;
        let actual = is_utf16_bidi(&units[..]);
        assert_eq!(actual, reference_is_utf16_code_unit_bidi(unit));
    }
}
| |
/// Spot-checks `is_utf8_bidi` with short inputs that end either in ASCII or
/// in a lone `0xC2` byte.
#[test]
fn test_is_utf8_bidi_edge_cases() {
    // `\xD5\xBF` and `\xD6\x80` encode U+057F and U+0580, both just below
    // 0x0590. Followed by ASCII the expected answer is `false`; with a
    // trailing lone `\xC2` byte the expected answer is `true`, even for the
    // otherwise pure-ASCII "ab" prefix.
    let cases: [(&[u8], bool); 6] = [
        (&b"\xD5\xBF\x61"[..], false),
        (&b"\xD6\x80\x61"[..], false),
        (&b"abc"[..], false),
        (&b"\xD5\xBF\xC2"[..], true),
        (&b"\xD6\x80\xC2"[..], true),
        (&b"ab\xC2"[..], true),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(is_utf8_bidi(input), expected, "input bytes {:X?}", input);
    }
}
| |
/// Checks that `decode_latin1` borrows for ASCII-only input and yields the
/// expected text when a non-ASCII byte is present.
#[test]
fn test_decode_latin1() {
    // ASCII-only input is expected to come back as a borrow, not a new
    // allocation.
    if let Cow::Borrowed(text) = decode_latin1(b"ab") {
        assert_eq!(text, "ab");
    } else {
        unreachable!("Should have borrowed");
    }

    // Byte 0xE4 must decode to U+00E4.
    let decoded = decode_latin1(b"a\xE4");
    assert_eq!(decoded, "a\u{E4}");
}
| |
/// Checks that `encode_latin1_lossy` borrows for ASCII-only input and yields
/// the expected bytes when a non-ASCII character is present.
#[test]
fn test_encode_latin1_lossy() {
    // ASCII-only input is expected to come back as a borrow, not a new
    // allocation.
    if let Cow::Borrowed(bytes) = encode_latin1_lossy("ab") {
        assert_eq!(bytes, b"ab");
    } else {
        unreachable!("Should have borrowed");
    }

    // U+00E4 must encode to the single byte 0xE4.
    let encoded = encode_latin1_lossy("a\u{E4}");
    assert_eq!(encoded, &(b"a\xE4")[..]);
}
| |
/// Runs `convert_utf8_to_utf16_without_replacement` over a sequence of inputs
/// that all write into the same five-unit buffer, checking the return value
/// and the first three code units after every call.
#[test]
fn test_convert_utf8_to_utf16_without_replacement() {
    let mut buf = [0u16; 5];

    // Each row: (input bytes, length of the destination slice handed to the
    // call, expected return value, expected `buf[..3]` afterwards).
    //
    // The rows must stay in this order: the buffer is never cleared, so some
    // expectations rely on what an earlier row left behind (e.g. row 3 writes
    // one unit and still sees the `c` from row 2 at index 1).
    let cases: [(&[u8], usize, Option<usize>, [u16; 3]); 7] = [
        (
            &b"ab"[..],
            2,
            Some(2),
            [u16::from(b'a'), u16::from(b'b'), 0],
        ),
        (&b"\xC3\xA4c"[..], 3, Some(2), [0xE4, u16::from(b'c'), 0]),
        (
            &b"\xE2\x98\x83"[..],
            3,
            Some(1),
            [0x2603, u16::from(b'c'), 0],
        ),
        (
            &b"\xE2\x98\x83d"[..],
            4,
            Some(2),
            [0x2603, u16::from(b'd'), 0],
        ),
        (&b"\xE2\x98\x83\xC3\xA4"[..], 5, Some(2), [0x2603, 0xE4, 0]),
        (&b"\xF0\x9F\x93\x8E"[..], 4, Some(2), [0xD83D, 0xDCCE, 0]),
        (
            &b"\xF0\x9F\x93\x8Ee"[..],
            5,
            Some(3),
            [0xD83D, 0xDCCE, u16::from(b'e')],
        ),
    ];

    for &(src, dst_len, expected_written, expected_head) in cases.iter() {
        assert_eq!(
            convert_utf8_to_utf16_without_replacement(src, &mut buf[..dst_len]),
            expected_written,
            "return value for input bytes {:X?}",
            src
        );
        assert_eq!(
            &buf[..3],
            &expected_head[..],
            "buffer head after input bytes {:X?}",
            src
        );
    }

    // A four-byte sequence cut off after three bytes must be rejected.
    assert_eq!(
        convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]),
        None
    );
}
| } |