| /////////////////////////////////////////////////////////////////////////////// |
| // parse_charset.hpp |
| // |
| // Copyright 2008 Eric Niebler. Distributed under the Boost |
| // Software License, Version 1.0. (See accompanying file |
| // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
| |
| #ifndef BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005 |
| #define BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005 |
| |
| // MS compatible compilers support #pragma once |
| #if defined(_MSC_VER) && (_MSC_VER >= 1020) |
| # pragma once |
| #endif |
| |
| #include <boost/integer.hpp> |
| #include <boost/mpl/bool.hpp> |
| #include <boost/throw_exception.hpp> |
| #include <boost/numeric/conversion/converter.hpp> |
| #include <boost/xpressive/detail/detail_fwd.hpp> |
| #include <boost/xpressive/detail/dynamic/parser_enum.hpp> |
| #include <boost/xpressive/detail/utility/literals.hpp> |
| #include <boost/xpressive/detail/utility/chset/chset.hpp> |
| #include <boost/xpressive/regex_constants.hpp> |
| |
| namespace boost { namespace xpressive { namespace detail |
| { |
| |
// Discriminator stored in escape_value::type_: tells which kind of result
// an escape sequence produced.
enum escape_type
{
    // the escape denotes a single character (parse_escape stores it in ch_)
    escape_char = 0,
    // not produced anywhere in this header; presumably a back-reference whose
    // number is carried in mark_nbr_ -- confirm against the regex parser
    escape_mark = 1,
    // the escape names a character class (parse_escape stores it in class_)
    escape_class = 2
};
| |
| /////////////////////////////////////////////////////////////////////////////// |
| // escape_value |
| // |
| template<typename Char, typename Class> |
| struct escape_value |
| { |
| Char ch_; |
| int mark_nbr_; |
| Class class_; |
| escape_type type_; |
| }; |
| |
| /////////////////////////////////////////////////////////////////////////////// |
| // char_overflow_handler |
| // |
| struct char_overflow_handler |
| { |
| void operator ()(numeric::range_check_result result) const // throw(regex_error) |
| { |
| if(numeric::cInRange != result) |
| { |
| BOOST_THROW_EXCEPTION( |
| regex_error( |
| regex_constants::error_escape |
| , "character escape too large to fit in target character type" |
| ) |
| ); |
| } |
| } |
| }; |
| |
| /////////////////////////////////////////////////////////////////////////////// |
| // parse_escape |
| // |
| template<typename FwdIter, typename CompilerTraits> |
| escape_value<typename iterator_value<FwdIter>::type, typename CompilerTraits::regex_traits::char_class_type> |
| parse_escape(FwdIter &begin, FwdIter end, CompilerTraits &traits) |
| { |
| using namespace regex_constants; |
| typedef typename iterator_value<FwdIter>::type char_type; |
| typedef typename CompilerTraits::regex_traits regex_traits; |
| typedef typename regex_traits::char_class_type char_class_type; |
| |
| // define an unsigned type the same size as char_type |
| typedef typename boost::uint_t<CHAR_BIT * sizeof(char_type)>::least uchar_t; |
| BOOST_MPL_ASSERT_RELATION(sizeof(uchar_t), ==, sizeof(char_type)); |
| typedef numeric::conversion_traits<uchar_t, int> converstion_traits; |
| |
| BOOST_XPR_ENSURE_(begin != end, error_escape, "unexpected end of pattern found"); |
| numeric::converter<int, uchar_t, converstion_traits, char_overflow_handler> converter; |
| escape_value<char_type,char_class_type> esc = { 0, 0, 0, escape_char }; |
| bool const icase = (0 != (regex_constants::icase_ & traits.flags())); |
| regex_traits const &rxtraits = traits.traits(); |
| FwdIter tmp; |
| |
| esc.class_ = rxtraits.lookup_classname(begin, begin + 1, icase); |
| if(0 != esc.class_) |
| { |
| esc.type_ = escape_class; |
| return esc; |
| } |
| |
| if(-1 != rxtraits.value(*begin, 8)) |
| { |
| esc.ch_ = converter(toi(begin, end, rxtraits, 8, 0777)); |
| return esc; |
| } |
| |
| switch(*begin) |
| { |
| // bell character |
| case BOOST_XPR_CHAR_(char_type, 'a'): |
| esc.ch_ = BOOST_XPR_CHAR_(char_type, '\a'); |
| ++begin; |
| break; |
| // escape character |
| case BOOST_XPR_CHAR_(char_type, 'e'): |
| esc.ch_ = converter(27); |
| ++begin; |
| break; |
| // control character |
| case BOOST_XPR_CHAR_(char_type, 'c'): |
| BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found"); |
| BOOST_XPR_ENSURE_ |
| ( |
| rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'a'), BOOST_XPR_CHAR_(char_type, 'z'), *begin) |
| || rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'A'), BOOST_XPR_CHAR_(char_type, 'Z'), *begin) |
| , error_escape |
| , "invalid escape control letter; must be one of a-z or A-Z" |
| ); |
| // Convert to character according to ECMA-262, section 15.10.2.10: |
| esc.ch_ = converter(*begin % 32); |
| ++begin; |
| break; |
| // formfeed character |
| case BOOST_XPR_CHAR_(char_type, 'f'): |
| esc.ch_ = BOOST_XPR_CHAR_(char_type, '\f'); |
| ++begin; |
| break; |
| // newline |
| case BOOST_XPR_CHAR_(char_type, 'n'): |
| esc.ch_ = BOOST_XPR_CHAR_(char_type, '\n'); |
| ++begin; |
| break; |
| // return |
| case BOOST_XPR_CHAR_(char_type, 'r'): |
| esc.ch_ = BOOST_XPR_CHAR_(char_type, '\r'); |
| ++begin; |
| break; |
| // horizontal tab |
| case BOOST_XPR_CHAR_(char_type, 't'): |
| esc.ch_ = BOOST_XPR_CHAR_(char_type, '\t'); |
| ++begin; |
| break; |
| // vertical tab |
| case BOOST_XPR_CHAR_(char_type, 'v'): |
| esc.ch_ = BOOST_XPR_CHAR_(char_type, '\v'); |
| ++begin; |
| break; |
| // hex escape sequence |
| case BOOST_XPR_CHAR_(char_type, 'x'): |
| BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found"); |
| tmp = begin; |
| esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xff)); |
| BOOST_XPR_ENSURE_(2 == std::distance(tmp, begin), error_escape, "invalid hex escape : " |
| "must be \\x HexDigit HexDigit"); |
| break; |
| // Unicode escape sequence |
| case BOOST_XPR_CHAR_(char_type, 'u'): |
| BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found"); |
| tmp = begin; |
| esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xffff)); |
| BOOST_XPR_ENSURE_(4 == std::distance(tmp, begin), error_escape, "invalid Unicode escape : " |
| "must be \\u HexDigit HexDigit HexDigit HexDigit"); |
| break; |
| // backslash |
| case BOOST_XPR_CHAR_(char_type, '\\'): |
| //esc.ch_ = BOOST_XPR_CHAR_(char_type, '\\'); |
| //++begin; |
| //break; |
| // all other escaped characters represent themselves |
| default: |
| esc.ch_ = *begin; |
| ++begin; |
| break; |
| } |
| |
| return esc; |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| // parse_charset |
| // |
| template<typename FwdIter, typename RegexTraits, typename CompilerTraits> |
| inline void parse_charset |
| ( |
| FwdIter &begin |
| , FwdIter end |
| , compound_charset<RegexTraits> &chset |
| , CompilerTraits &traits |
| ) |
| { |
| using namespace regex_constants; |
| typedef typename RegexTraits::char_type char_type; |
| typedef typename RegexTraits::char_class_type char_class_type; |
| BOOST_ASSERT(begin != end); |
| RegexTraits const &rxtraits = traits.traits(); |
| bool const icase = (0 != (regex_constants::icase_ & traits.flags())); |
| FwdIter iprev = FwdIter(); |
| escape_value<char_type, char_class_type> esc = {0, 0, 0, escape_char}; |
| bool invert = false; |
| |
| // check to see if we have an inverse charset |
| if(begin != end && token_charset_invert == traits.get_charset_token(iprev = begin, end)) |
| { |
| begin = iprev; |
| invert = true; |
| } |
| |
| // skip the end token if-and-only-if it is the first token in the charset |
| if(begin != end && token_charset_end == traits.get_charset_token(iprev = begin, end)) |
| { |
| for(; begin != iprev; ++begin) |
| { |
| chset.set_char(*begin, rxtraits, icase); |
| } |
| } |
| |
| compiler_token_type tok; |
| char_type ch_prev = char_type(), ch_next = char_type(); |
| bool have_prev = false; |
| |
| BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found"); |
| |
| // remember the current position and grab the next token |
| iprev = begin; |
| tok = traits.get_charset_token(begin, end); |
| do |
| { |
| BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found"); |
| |
| if(token_charset_hyphen == tok && have_prev) |
| { |
| // remember the current position |
| FwdIter iprev2 = begin; |
| have_prev = false; |
| |
| // ch_prev is lower bound of a range |
| switch(traits.get_charset_token(begin, end)) |
| { |
| case token_charset_hyphen: |
| case token_charset_invert: |
| begin = iprev2; // un-get these tokens and fall through |
| case token_literal: |
| ch_next = *begin++; |
| BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range"); |
| chset.set_range(ch_prev, ch_next, rxtraits, icase); |
| continue; |
| case token_charset_backspace: |
| ch_next = char_type(8); // backspace |
| BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range"); |
| chset.set_range(ch_prev, ch_next, rxtraits, icase); |
| continue; |
| case token_escape: |
| esc = parse_escape(begin, end, traits); |
| if(escape_char == esc.type_) |
| { |
| BOOST_XPR_ENSURE_(ch_prev <= esc.ch_, error_range, "invalid charset range"); |
| chset.set_range(ch_prev, esc.ch_, rxtraits, icase); |
| continue; |
| } |
| case token_charset_end: // fall through |
| default: // not a range. |
| begin = iprev; // backup to hyphen token |
| chset.set_char(ch_prev, rxtraits, icase); |
| chset.set_char(*begin++, rxtraits, icase); |
| continue; |
| } |
| } |
| |
| if(have_prev) |
| { |
| chset.set_char(ch_prev, rxtraits, icase); |
| have_prev = false; |
| } |
| |
| switch(tok) |
| { |
| case token_charset_hyphen: |
| case token_charset_invert: |
| case token_charset_end: |
| case token_posix_charset_end: |
| begin = iprev; // un-get these tokens |
| ch_prev = *begin++; |
| have_prev = true; |
| continue; |
| |
| case token_charset_backspace: |
| ch_prev = char_type(8); // backspace |
| have_prev = true; |
| continue; |
| |
| case token_posix_charset_begin: |
| { |
| FwdIter tmp = begin, start = begin; |
| bool invert = (token_charset_invert == traits.get_charset_token(tmp, end)); |
| if(invert) |
| { |
| begin = start = tmp; |
| } |
| while(token_literal == (tok = traits.get_charset_token(begin, end))) |
| { |
| tmp = ++begin; |
| BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found"); |
| } |
| if(token_posix_charset_end == tok) |
| { |
| char_class_type chclass = rxtraits.lookup_classname(start, tmp, icase); |
| BOOST_XPR_ENSURE_(0 != chclass, error_ctype, "unknown class name"); |
| chset.set_class(chclass, invert); |
| continue; |
| } |
| begin = iprev; // un-get this token |
| ch_prev = *begin++; |
| have_prev = true; |
| } |
| continue; |
| |
| case token_escape: |
| esc = parse_escape(begin, end, traits); |
| if(escape_char == esc.type_) |
| { |
| ch_prev = esc.ch_; |
| have_prev = true; |
| } |
| else if(escape_class == esc.type_) |
| { |
| char_class_type upper_ = lookup_classname(rxtraits, "upper"); |
| BOOST_ASSERT(0 != upper_); |
| chset.set_class(esc.class_, rxtraits.isctype(*begin++, upper_)); |
| } |
| else |
| { |
| BOOST_ASSERT(false); |
| } |
| continue; |
| |
| default: |
| ch_prev = *begin++; |
| have_prev = true; |
| continue; |
| } |
| } |
| while(BOOST_XPR_ENSURE_((iprev = begin) != end, error_brack, "unexpected end of pattern found"), |
| token_charset_end != (tok = traits.get_charset_token(begin, end))); |
| |
| if(have_prev) |
| { |
| chset.set_char(ch_prev, rxtraits, icase); |
| } |
| |
| if(invert) |
| { |
| chset.inverse(); |
| } |
| } |
| |
| }}} // namespace boost::xpressive::detail |
| |
| #endif |