blob: 90b2f11606b6de35ac5ccfc58fdad42dc39747b7 [file] [log] [blame]
use std::mem;
use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
use memchr::{memchr, memchr2, memchr3, memmem};
use regex_syntax::hir::literal::{Literal, Literals};
/// A prefix extracted from a compiled regular expression.
///
/// A regex prefix is a set of literal strings that *must* be matched at the
/// beginning of a regex in order for the entire regex to match. Similarly
/// for a regex suffix.
#[derive(Clone, Debug)]
pub struct LiteralSearcher {
complete: bool,
lcp: Memmem,
lcs: Memmem,
matcher: Matcher,
}
#[derive(Clone, Debug)]
enum Matcher {
/// No literals. (Never advances through the input.)
Empty,
/// A set of four or more single byte literals.
Bytes(SingleByteSet),
/// A single substring, using vector accelerated routines when available.
Memmem(Memmem),
/// An Aho-Corasick automaton.
AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
/// A packed multiple substring searcher, using SIMD.
///
/// Note that Aho-Corasick will actually use this packed searcher
/// internally automatically, however, there is some overhead associated
/// with going through the Aho-Corasick machinery. So using the packed
/// searcher directly results in some gains.
Packed { s: packed::Searcher, lits: Vec<Literal> },
}
impl LiteralSearcher {
/// Returns a matcher that never matches and never advances the input.
pub fn empty() -> Self {
Self::new(Literals::empty(), Matcher::Empty)
}
/// Returns a matcher for literal prefixes from the given set.
pub fn prefixes(lits: Literals) -> Self {
let matcher = Matcher::prefixes(&lits);
Self::new(lits, matcher)
}
/// Returns a matcher for literal suffixes from the given set.
pub fn suffixes(lits: Literals) -> Self {
let matcher = Matcher::suffixes(&lits);
Self::new(lits, matcher)
}
fn new(lits: Literals, matcher: Matcher) -> Self {
let complete = lits.all_complete();
LiteralSearcher {
complete,
lcp: Memmem::new(lits.longest_common_prefix()),
lcs: Memmem::new(lits.longest_common_suffix()),
matcher,
}
}
/// Returns true if all matches comprise the entire regular expression.
///
/// This does not necessarily mean that a literal match implies a match
/// of the regular expression. For example, the regular expression `^a`
/// is comprised of a single complete literal `a`, but the regular
/// expression demands that it only match at the beginning of a string.
pub fn complete(&self) -> bool {
self.complete && !self.is_empty()
}
/// Find the position of a literal in `haystack` if it exists.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
use self::Matcher::*;
match self.matcher {
Empty => Some((0, 0)),
Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
AC { ref ac, .. } => {
ac.find(haystack).map(|m| (m.start(), m.end()))
}
Packed { ref s, .. } => {
s.find(haystack).map(|m| (m.start(), m.end()))
}
}
}
/// Like find, except matches must start at index `0`.
pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
for lit in self.iter() {
if lit.len() > haystack.len() {
continue;
}
if lit == &haystack[0..lit.len()] {
return Some((0, lit.len()));
}
}
None
}
/// Like find, except matches must end at index `haystack.len()`.
pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
for lit in self.iter() {
if lit.len() > haystack.len() {
continue;
}
if lit == &haystack[haystack.len() - lit.len()..] {
return Some((haystack.len() - lit.len(), haystack.len()));
}
}
None
}
/// Returns an iterator over all literals to be matched.
pub fn iter(&self) -> LiteralIter<'_> {
match self.matcher {
Matcher::Empty => LiteralIter::Empty,
Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()),
Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
}
}
/// Returns a matcher for the longest common prefix of this matcher.
pub fn lcp(&self) -> &Memmem {
&self.lcp
}
/// Returns a matcher for the longest common suffix of this matcher.
pub fn lcs(&self) -> &Memmem {
&self.lcs
}
/// Returns true iff this prefix is empty.
pub fn is_empty(&self) -> bool {
self.len() == 0
}
/// Returns the number of prefixes in this machine.
pub fn len(&self) -> usize {
use self::Matcher::*;
match self.matcher {
Empty => 0,
Bytes(ref sset) => sset.dense.len(),
Memmem(_) => 1,
AC { ref ac, .. } => ac.pattern_count(),
Packed { ref lits, .. } => lits.len(),
}
}
/// Return the approximate heap usage of literals in bytes.
pub fn approximate_size(&self) -> usize {
use self::Matcher::*;
match self.matcher {
Empty => 0,
Bytes(ref sset) => sset.approximate_size(),
Memmem(ref single) => single.approximate_size(),
AC { ref ac, .. } => ac.heap_bytes(),
Packed { ref s, .. } => s.heap_bytes(),
}
}
}
impl Matcher {
fn prefixes(lits: &Literals) -> Self {
let sset = SingleByteSet::prefixes(lits);
Matcher::new(lits, sset)
}
fn suffixes(lits: &Literals) -> Self {
let sset = SingleByteSet::suffixes(lits);
Matcher::new(lits, sset)
}
fn new(lits: &Literals, sset: SingleByteSet) -> Self {
if lits.literals().is_empty() {
return Matcher::Empty;
}
if sset.dense.len() >= 26 {
// Avoid trying to match a large number of single bytes.
// This is *very* sensitive to a frequency analysis comparison
// between the bytes in sset and the composition of the haystack.
// No matter the size of sset, if its members all are rare in the
// haystack, then it'd be worth using it. How to tune this... IDK.
// ---AG
return Matcher::Empty;
}
if sset.complete {
return Matcher::Bytes(sset);
}
if lits.literals().len() == 1 {
return Matcher::Memmem(Memmem::new(&lits.literals()[0]));
}
let pats = lits.literals().to_owned();
let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
if lits.literals().len() <= 100 && !is_aho_corasick_fast {
let mut builder = packed::Config::new()
.match_kind(packed::MatchKind::LeftmostFirst)
.builder();
if let Some(s) = builder.extend(&pats).build() {
return Matcher::Packed { s, lits: pats };
}
}
let ac = AhoCorasickBuilder::new()
.match_kind(aho_corasick::MatchKind::LeftmostFirst)
.dfa(true)
.build_with_size::<u32, _, _>(&pats)
.unwrap();
Matcher::AC { ac, lits: pats }
}
}
#[derive(Debug)]
pub enum LiteralIter<'a> {
Empty,
Bytes(&'a [u8]),
Single(&'a [u8]),
AC(&'a [Literal]),
Packed(&'a [Literal]),
}
impl<'a> Iterator for LiteralIter<'a> {
type Item = &'a [u8];
fn next(&mut self) -> Option<Self::Item> {
match *self {
LiteralIter::Empty => None,
LiteralIter::Bytes(ref mut many) => {
if many.is_empty() {
None
} else {
let next = &many[0..1];
*many = &many[1..];
Some(next)
}
}
LiteralIter::Single(ref mut one) => {
if one.is_empty() {
None
} else {
let next = &one[..];
*one = &[];
Some(next)
}
}
LiteralIter::AC(ref mut lits) => {
if lits.is_empty() {
None
} else {
let next = &lits[0];
*lits = &lits[1..];
Some(&**next)
}
}
LiteralIter::Packed(ref mut lits) => {
if lits.is_empty() {
None
} else {
let next = &lits[0];
*lits = &lits[1..];
Some(&**next)
}
}
}
}
}
#[derive(Clone, Debug)]
struct SingleByteSet {
sparse: Vec<bool>,
dense: Vec<u8>,
complete: bool,
all_ascii: bool,
}
impl SingleByteSet {
fn new() -> SingleByteSet {
SingleByteSet {
sparse: vec![false; 256],
dense: vec![],
complete: true,
all_ascii: true,
}
}
fn prefixes(lits: &Literals) -> SingleByteSet {
let mut sset = SingleByteSet::new();
for lit in lits.literals() {
sset.complete = sset.complete && lit.len() == 1;
if let Some(&b) = lit.get(0) {
if !sset.sparse[b as usize] {
if b > 0x7F {
sset.all_ascii = false;
}
sset.dense.push(b);
sset.sparse[b as usize] = true;
}
}
}
sset
}
fn suffixes(lits: &Literals) -> SingleByteSet {
let mut sset = SingleByteSet::new();
for lit in lits.literals() {
sset.complete = sset.complete && lit.len() == 1;
if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
if !sset.sparse[b as usize] {
if b > 0x7F {
sset.all_ascii = false;
}
sset.dense.push(b);
sset.sparse[b as usize] = true;
}
}
}
sset
}
/// Faster find that special cases certain sizes to use memchr.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find(&self, text: &[u8]) -> Option<usize> {
match self.dense.len() {
0 => None,
1 => memchr(self.dense[0], text),
2 => memchr2(self.dense[0], self.dense[1], text),
3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
_ => self._find(text),
}
}
/// Generic find that works on any sized set.
fn _find(&self, haystack: &[u8]) -> Option<usize> {
for (i, &b) in haystack.iter().enumerate() {
if self.sparse[b as usize] {
return Some(i);
}
}
None
}
fn approximate_size(&self) -> usize {
(self.dense.len() * mem::size_of::<u8>())
+ (self.sparse.len() * mem::size_of::<bool>())
}
}
/// A simple wrapper around the memchr crate's memmem implementation.
///
/// The API this exposes mirrors the API of previous substring searchers that
/// this supplanted.
#[derive(Clone, Debug)]
pub struct Memmem {
finder: memmem::Finder<'static>,
char_len: usize,
}
impl Memmem {
fn new(pat: &[u8]) -> Memmem {
Memmem {
finder: memmem::Finder::new(pat).into_owned(),
char_len: char_len_lossy(pat),
}
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn find(&self, haystack: &[u8]) -> Option<usize> {
self.finder.find(haystack)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn is_suffix(&self, text: &[u8]) -> bool {
if text.len() < self.len() {
return false;
}
&text[text.len() - self.len()..] == self.finder.needle()
}
pub fn len(&self) -> usize {
self.finder.needle().len()
}
pub fn char_len(&self) -> usize {
self.char_len
}
fn approximate_size(&self) -> usize {
self.finder.needle().len() * mem::size_of::<u8>()
}
}
fn char_len_lossy(bytes: &[u8]) -> usize {
String::from_utf8_lossy(bytes).chars().count()
}