// vendor/regex-lite/src/utf8.rs - toolchain/rustc - Git at Google

 /// Reports whether `b` is an ASCII word byte: `_`, a decimal digit, or an
 /// ASCII letter of either case. Every other byte, including everything above
 /// 0x7F, yields `false`.
 pub(crate) fn is_word_byte(b: u8) -> bool {
     // Builds the 256-entry membership table at compile time so that the
     // runtime check is a single array lookup.
     const fn build_table() -> [bool; 256] {
         let mut table = [false; 256];
         let mut idx = 0usize;
         while idx < 256 {
             // `idx` is below 256 here, so the cast to `u8` is lossless.
             table[idx] = matches!(
                 idx as u8,
                 b'_' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z'
             );
             idx += 1;
         }
         table
     }
     const WORD: [bool; 256] = build_table();
     WORD[usize::from(b)]
 }

 /// The accept state index. When we enter this state, we know we've found a
 /// valid Unicode scalar value.
 ///
 /// States are offsets into the flat `STATES_FORWARD` table in `decode_step`,
 /// which holds 12 entries (one per byte class) for each state. That is why
 /// this value is 12 rather than 1. It is also the state `decode` starts in.
 const ACCEPT: usize = 12;
 /// The reject state index. When we enter this state, we know that we've found
 /// invalid UTF-8.
 ///
 /// Every entry at offsets 0..12 of `STATES_FORWARD` is 0, so the state
 /// machine can never leave this state once it has been entered.
 const REJECT: usize = 0;

 /// Decodes one scalar value from the front of `slice`, substituting U+FFFD
 /// (the replacement character) whenever `decode` reports a failure.
 ///
 /// The returned length is exactly the one `decode` reported.
 pub(crate) fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
     let (decoded, consumed) = decode(slice);
     (decoded.unwrap_or('\u{FFFD}'), consumed)
 }

 /// Decodes a single UTF-8 encoded Unicode scalar value from the start of
 /// `slice`.
 ///
 /// On success, the scalar value is returned together with the length of its
 /// encoding, which is always between 1 and 4 bytes, inclusive.
 ///
 /// On failure, `None` is returned together with the number of bytes that
 /// make up a maximal prefix of a valid UTF-8 code unit sequence. That count
 /// is always between 0 and 3, inclusive, and is 0 only when `slice` is empty.
 pub(crate) fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
     let bytes = slice.as_ref();
     // Empty input and ASCII are settled without running the state machine.
     match bytes.first() {
         None => return (None, 0),
         Some(&lead) if lead.is_ascii() => return (Some(char::from(lead)), 1),
         Some(_) => {}
     }

     let mut state = ACCEPT;
     let mut scalar = 0u32;
     for (idx, &byte) in bytes.iter().enumerate() {
         decode_step(&mut state, &mut scalar, byte);
         let consumed = idx + 1;

         if state == REJECT {
             // The rejected byte itself is not counted (hence `idx` rather
             // than `consumed`), but we always advance at least one byte.
             return (None, core::cmp::max(1, idx));
         }
         if state == ACCEPT {
             // `decode_step` guarantees that the accumulated value is a valid
             // Unicode scalar value in the ACCEPT state, so this cannot fail.
             // The checked conversion is kept because perf is not the primary
             // objective in regex-lite.
             return (Some(char::from_u32(scalar).unwrap()), consumed);
         }
     }
     // Ran out of input in the middle of a sequence: everything seen so far
     // is a valid but incomplete prefix.
     (None, bytes.len())
 }

 /// Advances the UTF-8 state machine by one input byte: `state` is replaced
 /// by its successor and the payload bits of `b` are folded into `cp`.
 fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
     // Maps every possible byte to an equivalence class. Two bytes in the
     // same class can never make a difference as to whether a particular
     // sequence is valid UTF-8 or not.
     #[cfg_attr(rustfmt, rustfmt::skip)]
     const CLASSES: [u8; 256] = [
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
        8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
       10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
     ];

     // Transition table, 12 entries (one per class) for each state. Taken
     // from `bstr`, which in turn adapted it from:
     // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
     #[cfg_attr(rustfmt, rustfmt::skip)]
     const STATES_FORWARD: &'static [u8] = &[
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72,
       0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0,
       0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,
       0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
       0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
       0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     ];

     let class = CLASSES[b as usize];
     let bits = u32::from(b);
     *cp = if *state == ACCEPT {
         // Start of a new sequence: the previous value of `cp` is discarded
         // and the byte is masked with `0xFF` shifted right by its class.
         (0xFF >> class) & bits
     } else {
         // Mid-sequence: append the low six bits of this byte.
         (*cp << 6) | (bits & 0b111111)
     };
     // The current state is a row offset and the class picks the column.
     *state = STATES_FORWARD[*state + class as usize] as usize;
 }

 #[cfg(test)]
 mod tests {
     use alloc::{vec, vec::Vec};

     use super::*;

     /// Well-formed UTF-8 must decode to exactly the scalar values of the
     /// input string, in order.
     #[test]
     fn decode_valid() {
         // Decodes `text` one scalar value at a time, panicking on any
         // decode failure.
         fn decode_all(text: &str) -> Vec<char> {
             let mut out = vec![];
             let mut rest = text;
             while !rest.is_empty() {
                 let (ch, size) = decode(rest.as_bytes());
                 rest = &rest[size..];
                 out.push(ch.unwrap());
             }
             out
         }

         assert_eq!(vec!['☃'], decode_all("☃"));
         assert_eq!(vec!['☃', '☃'], decode_all("☃☃"));
         assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], decode_all("αβγδε"));
         assert_eq!(vec!['☃', '⛄', '⛇'], decode_all("☃⛄⛇"));
         assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], decode_all("𝗮𝗯𝗰𝗱𝗲"));
     }

     /// Malformed or truncated input must yield `None` plus the expected
     /// number of bytes to skip.
     #[test]
     fn decode_invalid() {
         #[track_caller]
         fn check(input: &[u8], want: usize) {
             let (ch, size) = decode(input);
             assert_eq!(None, ch);
             assert_eq!(want, size);
         }

         check(b"", 0);
         check(b"\xFF", 1);
         check(b"\xCE\xF0", 1);
         check(b"\xE2\x98\xF0", 2);
         check(b"\xF0\x9D\x9D", 3);
         check(b"\xF0\x9D\x9D\xF0", 3);
         check(b"\xF0\x82\x82\xAC", 1);
         check(b"\xED\xA0\x80", 1);
         check(b"\xCEa", 1);
         check(b"\xE2\x98a", 2);
         check(b"\xF0\x9D\x9Ca", 3);
     }

     /// The lossy variant must report U+FFFD with the same lengths that
     /// `decode` reports for invalid input.
     #[test]
     fn decode_lossily() {
         #[track_caller]
         fn check(input: &[u8], want: usize) {
             let (ch, size) = decode_lossy(input);
             assert_eq!('\u{FFFD}', ch);
             assert_eq!(want, size);
         }

         check(b"", 0);
         check(b"\xFF", 1);
         check(b"\xCE\xF0", 1);
         check(b"\xE2\x98\xF0", 2);
         check(b"\xF0\x9D\x9D\xF0", 3);
         check(b"\xF0\x82\x82\xAC", 1);
         check(b"\xED\xA0\x80", 1);
         check(b"\xCEa", 1);
         check(b"\xE2\x98a", 2);
         check(b"\xF0\x9D\x9Ca", 3);
     }
 }
	/// Reports whether `b` is an ASCII word byte: `_`, a decimal digit, or an
	/// ASCII letter of either case. Every other byte, including everything above
	/// 0x7F, yields `false`.
	pub(crate) fn is_word_byte(b: u8) -> bool {
	    // Builds the 256-entry membership table at compile time so that the
	    // runtime check is a single array lookup.
	    const fn build_table() -> [bool; 256] {
	        let mut table = [false; 256];
	        let mut idx = 0usize;
	        while idx < 256 {
	            // `idx` is below 256 here, so the cast to `u8` is lossless.
	            table[idx] = matches!(
	                idx as u8,
	                b'_' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z'
	            );
	            idx += 1;
	        }
	        table
	    }
	    const WORD: [bool; 256] = build_table();
	    WORD[usize::from(b)]
	}

	/// The accept state index. When we enter this state, we know we've found a
	/// valid Unicode scalar value.
	///
	/// States are offsets into the flat `STATES_FORWARD` table in `decode_step`,
	/// which holds 12 entries (one per byte class) for each state.
	const ACCEPT: usize = 12;
	/// The reject state index. When we enter this state, we know that we've found
	/// invalid UTF-8. Every transition out of this state leads back to it.
	const REJECT: usize = 0;

	/// Like `decode`, but automatically converts the `None` case to the
	/// replacement codepoint (U+FFFD). The returned length is the one `decode`
	/// reported.
	pub(crate) fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
	    match decode(slice) {
	        (Some(ch), size) => (ch, size),
	        (None, size) => ('\u{FFFD}', size),
	    }
	}

	/// UTF-8 decode a single Unicode scalar value from the beginning of a slice.
	///
	/// When successful, the corresponding Unicode scalar value is returned along
	/// with the number of bytes it was encoded with. The number of bytes consumed
	/// for a successful decode is always between 1 and 4, inclusive.
	///
	/// When unsuccessful, `None` is returned along with the number of bytes that
	/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
	/// the number of bytes consumed is always between 0 and 3, inclusive, where
	/// 0 is only returned when `slice` is empty.
	pub(crate) fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
	    let slice = slice.as_ref();
	    // Empty input and ASCII are settled without running the state machine.
	    match slice.first() {
	        None => return (None, 0),
	        Some(&b) if b <= 0x7F => return (Some(b as char), 1),
	        _ => {}
	    }

	    let (mut state, mut cp, mut i) = (ACCEPT, 0, 0);
	    while i < slice.len() {
	        decode_step(&mut state, &mut cp, slice[i]);
	        i += 1;

	        if state == ACCEPT {
	            // OK since `decode_step` guarantees that `cp` is a valid Unicode
	            // scalar value in an ACCEPT state.
	            //
	            // We don't have to use safe code here, but do so because perf
	            // isn't our primary objective in regex-lite.
	            let ch = char::from_u32(cp).unwrap();
	            return (Some(ch), i);
	        } else if state == REJECT {
	            // The rejected byte itself is not counted, but we always want
	            // to advance at least one byte.
	            return (None, i.saturating_sub(1).max(1));
	        }
	    }
	    // Input ended in the middle of a sequence: everything seen so far is a
	    // valid but incomplete prefix.
	    (None, i)
	}

	/// Transitions to the next state and updates `cp` while it does.
	///
	/// `state` and `cp` are updated in place through their mutable references.
	fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
	    // Splits the space of all bytes into equivalence classes, such that
	    // any byte in the same class can never discriminate between whether a
	    // particular sequence is valid UTF-8 or not.
	    #[cfg_attr(rustfmt, rustfmt::skip)]
	    const CLASSES: [u8; 256] = [
	       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	       1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
	       7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
	       8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	      10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
	    ];

	    // A state machine taken from `bstr` which was in turn adapted from:
	    // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
	    #[cfg_attr(rustfmt, rustfmt::skip)]
	    const STATES_FORWARD: &'static [u8] = &[
	      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	      12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72,
	      0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0,
	      0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0,
	      0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,
	      0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,
	      0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
	      0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
	      0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	    ];

	    let class = CLASSES[usize::from(b)];
	    if *state == ACCEPT {
	        // Start of a new sequence: the previous value of `cp` is discarded
	        // and the byte is masked with `0xFF` shifted right by its class.
	        *cp = (0xFF >> class) & (b as u32);
	    } else {
	        // Mid-sequence: append the low six bits of this byte.
	        *cp = (b as u32 & 0b111111) | (*cp << 6);
	    }
	    // The current state is a row offset and the class picks the column.
	    *state = usize::from(STATES_FORWARD[*state + usize::from(class)]);
	}

	#[cfg(test)]
	mod tests {
	    use alloc::{vec, vec::Vec};

	    use super::*;

	    /// Well-formed UTF-8 must decode to exactly the scalar values of the
	    /// input string, in order.
	    #[test]
	    fn decode_valid() {
	        // Decodes `text` one scalar value at a time, panicking on any
	        // decode failure.
	        fn decode_all(text: &str) -> Vec<char> {
	            let mut out = vec![];
	            let mut rest = text;
	            while !rest.is_empty() {
	                let (ch, size) = decode(rest.as_bytes());
	                rest = &rest[size..];
	                out.push(ch.unwrap());
	            }
	            out
	        }

	        assert_eq!(vec!['☃'], decode_all("☃"));
	        assert_eq!(vec!['☃', '☃'], decode_all("☃☃"));
	        assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], decode_all("αβγδε"));
	        assert_eq!(vec!['☃', '⛄', '⛇'], decode_all("☃⛄⛇"));
	        assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], decode_all("𝗮𝗯𝗰𝗱𝗲"));
	    }

	    /// Malformed or truncated input must yield `None` plus the expected
	    /// number of bytes to skip.
	    #[test]
	    fn decode_invalid() {
	        #[track_caller]
	        fn check(input: &[u8], want: usize) {
	            let (ch, size) = decode(input);
	            assert_eq!(None, ch);
	            assert_eq!(want, size);
	        }

	        check(b"", 0);
	        check(b"\xFF", 1);
	        check(b"\xCE\xF0", 1);
	        check(b"\xE2\x98\xF0", 2);
	        check(b"\xF0\x9D\x9D", 3);
	        check(b"\xF0\x9D\x9D\xF0", 3);
	        check(b"\xF0\x82\x82\xAC", 1);
	        check(b"\xED\xA0\x80", 1);
	        check(b"\xCEa", 1);
	        check(b"\xE2\x98a", 2);
	        check(b"\xF0\x9D\x9Ca", 3);
	    }

	    /// The lossy variant must report U+FFFD with the same lengths that
	    /// `decode` reports for invalid input.
	    #[test]
	    fn decode_lossily() {
	        #[track_caller]
	        fn check(input: &[u8], want: usize) {
	            let (ch, size) = decode_lossy(input);
	            assert_eq!('\u{FFFD}', ch);
	            assert_eq!(want, size);
	        }

	        check(b"", 0);
	        check(b"\xFF", 1);
	        check(b"\xCE\xF0", 1);
	        check(b"\xE2\x98\xF0", 2);
	        check(b"\xF0\x9D\x9D\xF0", 3);
	        check(b"\xF0\x82\x82\xAC", 1);
	        check(b"\xED\xA0\x80", 1);
	        check(b"\xCEa", 1);
	        check(b"\xE2\x98a", 2);
	        check(b"\xF0\x9D\x9Ca", 3);
	    }
	}