vendor/unicode-properties/scripts/unicode.py - toolchain/rustc - Git at Google

 #!/usr/bin/env python3
 #
 # Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
 # file at the top-level directory of this distribution and at
 # http://rust-lang.org/COPYRIGHT.
 #
 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 # option. This file may not be copied, modified, or distributed
 # except according to those terms.

 # This script uses the following Unicode UCD data:
 # - UnicodeData.txt
 # - emoji/emoji-data.txt
 #
 # Since this should not require frequent updates, we just store this
 # out-of-line and check the tables.rs file into git.

 import fileinput, re, os, sys, operator

 # Text written verbatim at the top of the generated tables.rs: the licence
 # banner, a "generated file" notice and a crate-level `allow` attribute.
 preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.

 // NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

 #![allow(missing_docs, non_upper_case_globals, non_snake_case)]
 '''

 # Unicode release the tables are generated for.
 UNICODE_VERSION = (15, 0, 0)

 # Dotted spelling of UNICODE_VERSION, used in the download URL.
 UNICODE_VERSION_NUMBER = ".".join(str(part) for part in UNICODE_VERSION)

 # Download a UCD table file
 def fetch_unidata(f):
     """Make sure the UCD data file `f` is present in the current directory.

     `f` is a path below the `ucd/` directory of the Unicode release selected
     by UNICODE_VERSION_NUMBER (for example "emoji/emoji-data.txt").  The file
     is stored under its base name; if a file with that name already exists
     nothing is downloaded.

     Exits the process with status 1 when the file is still missing after the
     download attempt.
     """
     # Local import: keeps this block independent of the file-level imports.
     import subprocess

     local_name = os.path.basename(f)
     if not os.path.exists(local_name):
         url = ("https://www.unicode.org/Public/%s/ucd/%s"
                % (UNICODE_VERSION_NUMBER, f))
         try:
             # An argument list avoids shell interpretation of the URL.
             # -f makes curl fail on HTTP errors instead of saving the
             # server's error page under the data file's name, which the
             # existence check below would accept as a successful download.
             subprocess.run(["curl", "-f", "-O", url], check=False)
         except OSError as err:
             # curl itself could not be started; fall through to the check.
             sys.stderr.write("cannot run curl: %s\n" % err)

     if not os.path.exists(local_name):
         sys.stderr.write("cannot load %s\n" % f)
         sys.exit(1)

 # Loads code point data from emoji-data.txt
 # Implementation from unicode-segmentation
 def load_emoji_properties(f):
     """Parse an emoji-data.txt style file into per-property code point ranges.

     `f` is fetched with fetch_unidata() and then read from the current
     directory under its base name.  Two line shapes are recognised: a single
     code point ("XXXX ; Prop") and a range ("XXXX..YYYY ; Prop #"); every
     other line is skipped.

     Returns a dict mapping each property name to a list of inclusive
     (first, last) integer pairs, in the order they appear in the file.
     """
     fetch_unidata(f)
     single_re = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
     range_re = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+) *#")
     kinds = {}

     for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
         single = single_re.match(line)
         if single:
             first, prop = single.group(1, 2)
             last = first
         else:
             ranged = range_re.match(line)
             if not ranged:
                 continue
             first, last, prop = ranged.group(1, 2, 3)
         # Code points are written in hexadecimal.
         kinds.setdefault(prop.strip(), []).append((int(first, 16), int(last, 16)))

     return kinds


 def load_general_category_properties(f):
     """Read general-category ranges from a UnicodeData.txt style file.

     `f` is fetched with fetch_unidata() and then read from the current
     directory under its base name.  Lines that do not look like
     "XXXX;name;Gc;..." are skipped.

     Returns a list of (first, last, category) tuples in file order, with
     inclusive integer bounds.  An ordinary entry, or one whose name is a
     single "<...>" label, yields a one-code-point range.  A
     "<label, First>" / "<label, Last>" pair yields one range spanning both.

     Raises ValueError for a name that starts with '<' but matches none of
     the bracketed forms.
     """
     fetch_unidata(f)
     entry_re = re.compile(r"^([0-9A-F]+);([^;]+);([A-Za-z]+);.*$")
     first_re = re.compile(r"^<(.*), First>$")
     last_re = re.compile(r"^<(.*), Last>$")
     bracketed_re = re.compile(r"^<(.*)>$")

     entries = []
     # Remembered from a "<..., First>" line until its "<..., Last>" line.
     pending_lo = 0
     pending_label = ''
     pending_gc = ''
     for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
         entry = entry_re.match(line)
         if not entry:
             continue

         code_point = int(entry.group(1), 16)
         name = entry.group(2).strip()
         gc = entry.group(3).strip()

         if not name.startswith('<'):
             entries.append((code_point, code_point, gc))
             continue

         opening = first_re.match(name)
         if opening:
             pending_lo, pending_label, pending_gc = code_point, opening.group(1), gc
             continue

         closing = last_re.match(name)
         if closing:
             # The closing line must belong to the range opened last.
             assert(pending_label == closing.group(1))
             assert(pending_gc == gc)
             entries.append((pending_lo, code_point, gc))
             continue

         if bracketed_re.match(name):
             entries.append((code_point, code_point, gc))
             continue

         raise ValueError("unreachable")
     return entries

 def format_table_content(f, content, indent):
     """Write comma-separated `content` to `f`, wrapped into indented lines.

     `content` is split at every comma.  Pieces are joined with ", " on the
     current line while the line plus the next piece stays below 98
     characters; otherwise the line is written with a trailing "," and the
     piece starts a new line indented by `indent` spaces.  The final line is
     written without a trailing newline.
     """
     margin = " " * indent
     current = margin
     at_start = True
     for piece in content.split(","):
         if len(current) + len(piece) >= 98:
             # Line is full: flush it and start the next one with this piece.
             f.write(current + ",\n")
             current = margin + piece
             continue
         current += piece if at_start else ", " + piece
         at_start = False
     f.write(current)

 def escape_char(c):
     """Return `c` spelled as a Rust `char` literal of the form '\\u{hex}'.

     The marker string 'multi' is rendered as a quoted placeholder instead.
     """
     return '"<multiple code points>"' if c == 'multi' else "'\\u{%x}'" % c

 def escape_char_list(l):
     """Return the items of `l`, each passed through escape_char(), as "[a, b, ...]"."""
     return "[" + ", ".join(escape_char(c) for c in l) + "]"

 def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
         pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
     """Write a Rust slice item called `name` holding `t_data` to `f`.

     f        -- writable text stream
     name     -- identifier of the generated item
     t_data   -- iterable of entries; each is rendered with `pfun`
     t_type   -- Rust type annotation of the item
     is_pub   -- prefix the declaration with `pub`
     pfun     -- callable turning one entry into its Rust source text
     is_const -- declare with `const` when true, `let` otherwise

     The rendered entries are joined with commas and laid out by
     format_table_content() with an indent of 8.
     """
     keyword = "const" if is_const else "let"
     if is_pub:
         keyword = "pub " + keyword
     f.write("    %s %s: %s = &[\n" % (keyword, name, t_type))
     format_table_content(f, ",".join(pfun(dat) for dat in t_data), 8)
     f.write("\n    ];\n\n")

 def emit_general_category_module(f):
     """Write the Rust `general_category` module to `f`.

     The module text holds the `GeneralCategory` and `GeneralCategoryGroup`
     enums, their helper functions, and a `GENERAL_CATEGORY` range table
     built from UnicodeData.txt.  Consecutive ranges that touch and share a
     category are merged into one table row; `Cs` (surrogate) entries are
     left out of the table.
     """
     f.write("""#[cfg(feature = \"general-category\")]
 pub mod general_category {""")
     f.write("""

     #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
     /// The most general classification of a character.
     pub enum GeneralCategory {
         /// `Lu`, an uppercase letter
         UppercaseLetter,
         /// `Ll`, a lowercase letter
         LowercaseLetter,
         /// `Lt`, a digraphic character, with first part uppercase
         TitlecaseLetter,
         /// `Lm`, a modifier letter
         ModifierLetter,
         /// `Lo`, other letters, including syllables and ideographs
         OtherLetter,
         /// `Mn`, a nonspacing combining mark (zero advance width)
         NonspacingMark,
         /// `Mc`, a spacing combining mark (positive advance width)
         SpacingMark,
         /// `Me`, an enclosing combining mark
         EnclosingMark,
         /// `Nd`, a decimal digit
         DecimalNumber,
         /// `Nl`, a letterlike numeric character
         LetterNumber,
         /// `No`, a numeric character of other type
         OtherNumber,
         /// `Pc`, a connecting punctuation mark, like a tie
         ConnectorPunctuation,
         /// `Pd`, a dash or hyphen punctuation mark
         DashPunctuation,
         /// `Ps`, an opening punctuation mark (of a pair)
         OpenPunctuation,
         /// `Pe`, a closing punctuation mark (of a pair)
         ClosePunctuation,
         /// `Pi`, an initial quotation mark
         InitialPunctuation,
         /// `Pf`, a final quotation mark
         FinalPunctuation,
         /// `Po`, a punctuation mark of other type
         OtherPunctuation,
         /// `Sm`, a symbol of mathematical use
         MathSymbol,
         /// `Sc`, a currency sign
         CurrencySymbol,
         /// `Sk`, a non-letterlike modifier symbol
         ModifierSymbol,
         /// `So`, a symbol of other type
         OtherSymbol,
         /// `Zs`, a space character (of various non-zero widths)
         SpaceSeparator,
         /// `Zl`, U+2028 LINE SEPARATOR only
         LineSeparator,
         /// `Zp`, U+2029 PARAGRAPH SEPARATOR only
         ParagraphSeparator,
         /// `Cc`, a C0 or C1 control code
         Control,
         /// `Cf`, a format control character
         Format,
         /// `Cs`, a surrogate code point
         Surrogate,
         /// `Co`, a private-use character
         PrivateUse,
         /// `Cn`, a reserved unassigned code point or a noncharacter
         Unassigned,
     }

     #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
     /// Groupings of the most general classification of a character.
     pub enum GeneralCategoryGroup {
         /// Lu | Ll | Lt | Lm | Lo
         Letter,
         /// Mn | Mc | Me
         Mark,
         /// Nd | Nl | No
         Number,
         /// Pc | Pd | Ps | Pe | Pi | Pf | Po
         Punctuation,
         /// Sm | Sc | Sk | So
         Symbol,
         /// Zs | Zl | Zp
         Separator,
         /// Cc | Cf | Cs | Co | Cn
         Other,
     }

     #[inline]
     pub(crate) fn general_category_of_char(c: char) -> GeneralCategory {
         match c as usize {
             _ => super::util::bsearch_range_value_table(c, GENERAL_CATEGORY).unwrap_or(GeneralCategory::Unassigned)
         }
     }

     #[inline]
     pub(crate) fn general_category_is_letter_cased(gc: GeneralCategory) -> bool {
         matches!(gc, GeneralCategory::UppercaseLetter | GeneralCategory::LowercaseLetter | GeneralCategory::TitlecaseLetter)
     }

     #[inline]
     pub(crate) fn general_category_group(gc: GeneralCategory) -> GeneralCategoryGroup {
         match gc {
             GeneralCategory::UppercaseLetter |
             GeneralCategory::LowercaseLetter |
             GeneralCategory::TitlecaseLetter |
             GeneralCategory::ModifierLetter |
             GeneralCategory::OtherLetter => GeneralCategoryGroup::Letter,
             GeneralCategory::NonspacingMark |
             GeneralCategory::SpacingMark |
             GeneralCategory::EnclosingMark => GeneralCategoryGroup::Mark,
             GeneralCategory::DecimalNumber |
             GeneralCategory::LetterNumber |
             GeneralCategory::OtherNumber => GeneralCategoryGroup::Number,
             GeneralCategory::ConnectorPunctuation |
             GeneralCategory::DashPunctuation |
             GeneralCategory::OpenPunctuation |
             GeneralCategory::ClosePunctuation |
             GeneralCategory::InitialPunctuation |
             GeneralCategory::FinalPunctuation |
             GeneralCategory::OtherPunctuation => GeneralCategoryGroup::Punctuation,
             GeneralCategory::MathSymbol |
             GeneralCategory::CurrencySymbol |
             GeneralCategory::ModifierSymbol |
             GeneralCategory::OtherSymbol => GeneralCategoryGroup::Symbol,
             GeneralCategory::SpaceSeparator |
             GeneralCategory::LineSeparator |
             GeneralCategory::ParagraphSeparator => GeneralCategoryGroup::Separator,
             GeneralCategory::Control |
             GeneralCategory::Format |
             GeneralCategory::Surrogate |
             GeneralCategory::PrivateUse |
             GeneralCategory::Unassigned => GeneralCategoryGroup::Other,
         }
     }
 """)
     # Two-letter category abbreviation -> Rust enum variant path.
     gc_variants = {
         "Lu": "GeneralCategory::UppercaseLetter",
         "Ll": "GeneralCategory::LowercaseLetter",
         "Lt": "GeneralCategory::TitlecaseLetter",
         "Lm": "GeneralCategory::ModifierLetter",
         "Lo": "GeneralCategory::OtherLetter",
         "Mn": "GeneralCategory::NonspacingMark",
         "Mc": "GeneralCategory::SpacingMark",
         "Me": "GeneralCategory::EnclosingMark",
         "Nd": "GeneralCategory::DecimalNumber",
         "Nl": "GeneralCategory::LetterNumber",
         "No": "GeneralCategory::OtherNumber",
         "Pc": "GeneralCategory::ConnectorPunctuation",
         "Pd": "GeneralCategory::DashPunctuation",
         "Ps": "GeneralCategory::OpenPunctuation",
         "Pe": "GeneralCategory::ClosePunctuation",
         "Pi": "GeneralCategory::InitialPunctuation",
         "Pf": "GeneralCategory::FinalPunctuation",
         "Po": "GeneralCategory::OtherPunctuation",
         "Sm": "GeneralCategory::MathSymbol",
         "Sc": "GeneralCategory::CurrencySymbol",
         "Sk": "GeneralCategory::ModifierSymbol",
         "So": "GeneralCategory::OtherSymbol",
         "Zs": "GeneralCategory::SpaceSeparator",
         "Zl": "GeneralCategory::LineSeparator",
         "Zp": "GeneralCategory::ParagraphSeparator",
         "Cc": "GeneralCategory::Control",
         "Cf": "GeneralCategory::Format",
         "Cs": "GeneralCategory::Surrogate",
         "Co": "GeneralCategory::PrivateUse",
         "Cn": "GeneralCategory::Unassigned",
     }

     f.write("    // General category table:\n")
     merged_ranges = []
     for lo, hi, gc in load_general_category_properties("UnicodeData.txt"):
         # The table stores Rust `char` bounds, which cannot be surrogates.
         if gc == "Cs":
             continue
         if merged_ranges:
             prev_lo, prev_hi, prev_gc = merged_ranges[-1]
             if prev_hi + 1 == lo and prev_gc == gc:
                 # Touches the previous row and has the same category: extend it.
                 merged_ranges[-1] = (prev_lo, hi, prev_gc)
                 continue
         merged_ranges.append((lo, hi, gc))
     emit_table(f, "GENERAL_CATEGORY", merged_ranges, "&'static [(char, char, GeneralCategory)]", is_pub=False,
             pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), gc_variants[x[2]]))
     f.write("}\n\n")


 def emit_emoji_module(f):
     """Write the Rust `emoji` module to `f`.

     The module text holds the `EmojiStatus` enum, its helper predicates and
     an `EMOJI_STATUS` range table.  The table is built by sweeping
     0..=0x10FFFF and cutting it into maximal runs over which the set of
     emoji properties read from emoji/emoji-data.txt stays the same.  The
     surrogate block is left out of the table.
     """
     f.write("""#[cfg(feature = \"emoji\")]
 pub mod emoji {""")
     f.write("""

     #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
     #[non_exhaustive]
     /// The emoji character properties of a character.
     pub enum EmojiStatus {
         /// `Emoji=NO`, `Emoji_Component=NO`
         NonEmoji,
         /// `Emoji=NO`, `Emoji_Component=YES`
         NonEmojiButEmojiComponent,
         /// `Emoji=YES`, `Emoji_Component=NO`;`Emoji_Presentation=YES`
         EmojiPresentation,
         /// `Emoji=YES`, `Emoji_Component=NO`;`Emoji_Modifier_Base=YES`
         EmojiModifierBase,
         /// `Emoji=YES`, `Emoji_Component=NO`;`Emoji_Presentation=YES`, `Emoji_Modifier_Base=YES`
         EmojiPresentationAndModifierBase,
         /// `Emoji=YES`, `Emoji_Component=NO`
         EmojiOther,
         /// `Emoji=YES`, `Emoji_Component=YES`;`Emoji_Presentation=YES`
         EmojiPresentationAndEmojiComponent,
         /// `Emoji=YES`, `Emoji_Component=YES`;`Emoji_Presentation=YES`, `Emoji_Modifier=YES`
         EmojiPresentationAndModifierAndEmojiComponent,
         /// `Emoji=YES`, `Emoji_Component=YES`
         EmojiOtherAndEmojiComponent,
     }
     #[inline]
     pub(crate) fn emoji_status(c: char) -> EmojiStatus {
         // FIXME: do we want to special case ASCII here?
         match c as usize {
             _ => super::util::bsearch_range_value_table(c, EMOJI_STATUS).unwrap()
         }
     }
     #[inline]
     pub(crate) fn is_emoji_status_for_emoji_char_or_emoji_component(s: EmojiStatus) -> bool {
         !matches!(s, EmojiStatus::NonEmoji)
     }
     #[inline]
     pub(crate) fn is_emoji_status_for_emoji_char(s: EmojiStatus) -> bool {
         !matches!(s, EmojiStatus::NonEmoji | EmojiStatus::NonEmojiButEmojiComponent)
     }
     #[inline]
     pub(crate) fn is_emoji_status_for_emoji_component(s: EmojiStatus) -> bool {
         matches!(s, EmojiStatus::EmojiPresentationAndEmojiComponent |
             EmojiStatus::EmojiPresentationAndModifierAndEmojiComponent |
             EmojiStatus::EmojiOtherAndEmojiComponent)
     }
 """)

     f.write("    // Emoji status table:\n")
     emoji_status_table = load_emoji_properties("emoji/emoji-data.txt")

     # `Extended_Pictographic` is only there for future-proofing, so it is not
     # part of the combined status.
     emoji_prop_list = ["Emoji", "Emoji_Presentation", "Emoji_Modifier", "Emoji_Modifier_Base", "Emoji_Component"]

     # Surrogates are not representable as Rust `char`s.  Treating them as a
     # pseudo property gives them a run of their own, which is then dropped.
     emoji_status_table["Surrogate"] = [(0xD800, 0xDFFF)]
     emoji_prop_list.append("Surrogate")

     # ";"-joined property names (in emoji_prop_list order) -> table value.
     status_by_props = {
         "Surrogate": "<Surrogate>",
         "": "EmojiStatus::NonEmoji",
         "Emoji_Component": "EmojiStatus::NonEmojiButEmojiComponent",
         "Emoji;Emoji_Presentation": "EmojiStatus::EmojiPresentation",
         "Emoji;Emoji_Presentation;Emoji_Modifier_Base": "EmojiStatus::EmojiPresentationAndModifierBase",
         "Emoji;Emoji_Modifier_Base": "EmojiStatus::EmojiModifierBase",
         "Emoji": "EmojiStatus::EmojiOther",
         "Emoji;Emoji_Presentation;Emoji_Component": "EmojiStatus::EmojiPresentationAndEmojiComponent",
         "Emoji;Emoji_Presentation;Emoji_Modifier;Emoji_Component": "EmojiStatus::EmojiPresentationAndModifierAndEmojiComponent",
         "Emoji;Emoji_Component": "EmojiStatus::EmojiOtherAndEmojiComponent",
     }

     def group_text(s):
         # An unknown combination is emitted under a name the enum above does
         # not define.
         return status_by_props.get(s, "EmojiStatus::NewCombination(\"" + s + "\")")

     code_point_last = 0x10FFFF
     prop_ranges = [emoji_status_table[prop] for prop in emoji_prop_list]
     # Per property: index of the first range not yet completely passed.
     cursors = [0] * len(emoji_prop_list)
     emoji_table = []
     group_first = 0
     while group_first <= code_point_last:
         group_props = []
         group_last = code_point_last
         for prop, ranges, cursor in zip(emoji_prop_list, prop_ranges, cursors):
             if cursor >= len(ranges):
                 continue
             lo, hi = ranges[cursor]
             if lo > group_first:
                 # Property starts later: the run must stop just before it.
                 group_last = min(group_last, lo - 1)
             else:
                 # Property covers group_first: the run stops where it ends.
                 group_props.append(prop)
                 group_last = min(group_last, hi)
         status = group_text(";".join(group_props))
         if status != "<Surrogate>":
             emoji_table.append((group_first, group_last, status))
         # Step past every range that ends exactly where this run ends.
         for idx, ranges in enumerate(prop_ranges):
             cursor = cursors[idx]
             if cursor >= len(ranges):
                 continue
             lo, hi = ranges[cursor]
             if lo <= group_first and hi == group_last:
                 cursors[idx] += 1
         group_first = group_last + 1

     emit_table(f, "EMOJI_STATUS", emoji_table, "&'static [(char, char, EmojiStatus)]", is_pub=False,
             pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
     f.write("}\n\n")

 def emit_util_mod(f):
     """Write the Rust `util` module to `f`.

     The module contains `bsearch_range_value_table`, a binary search over a
     table of (low, high, value) rows.
     """
     util_module_source = """
 #[allow(dead_code)]
 pub mod util {
     use core::result::Result::{Ok, Err};

     pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
         use core::cmp::Ordering::{Equal, Less, Greater};
         match r.binary_search_by(|&(lo, hi, _)| {
             if lo <= c && c <= hi { Equal }
             else if hi < c { Less }
             else { Greater }
         }) {
             Ok(idx) => {
                 let (_, _, cat) = r[idx];
                 Some(cat)
             }
             Err(_) => None
         }
     }

 }

 """
     f.write(util_module_source)

 if __name__ == "__main__":
     # Regenerate tables.rs in the current directory from scratch.
     r = "tables.rs"
     if os.path.exists(r):
         os.remove(r)
     # Explicit encoding so the output does not depend on the platform default.
     with open(r, "w", encoding="utf-8") as rf:
         # write the file's preamble
         rf.write(preamble)

         # The doc comment names this crate (unicode-properties); the previous
         # text said "unicode-security".
         rf.write("""
 /// The version of [Unicode](http://www.unicode.org/)
 /// that this version of unicode-properties is based on.
 pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);

 """ % UNICODE_VERSION)

         emit_util_mod(rf)
         ### general category module
         emit_general_category_module(rf)
         ### emoji module
         emit_emoji_module(rf)
	#!/usr/bin/env python3
	#
	# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
	# file at the top-level directory of this distribution and at
	# http://rust-lang.org/COPYRIGHT.
	#
	# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	# option. This file may not be copied, modified, or distributed
	# except according to those terms.

	# This script uses the following Unicode UCD data:
	# - UnicodeData.txt
	# - emoji/emoji-data.txt
	#
	# Since this should not require frequent updates, we just store this
	# out-of-line and check the tables.rs file into git.

	import fileinput, re, os, sys, operator

	# Text written verbatim at the top of the generated tables.rs: the licence
	# banner, a "generated file" notice and a crate-level `allow` attribute.
	preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
	// file at the top-level directory of this distribution and at
	// http://rust-lang.org/COPYRIGHT.
	//
	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	// option. This file may not be copied, modified, or distributed
	// except according to those terms.

	// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

	#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
	'''

	# Unicode release the tables are generated for.
	UNICODE_VERSION = (15, 0, 0)

	# Dotted spelling of UNICODE_VERSION, used in the download URL.
	UNICODE_VERSION_NUMBER = ".".join(str(part) for part in UNICODE_VERSION)

	# Download a UCD table file
	def fetch_unidata(f):
	    """Make sure the UCD data file `f` is present in the current directory.

	    `f` is a path below the `ucd/` directory of the Unicode release selected
	    by UNICODE_VERSION_NUMBER (for example "emoji/emoji-data.txt").  The file
	    is stored under its base name; if a file with that name already exists
	    nothing is downloaded.

	    Exits the process with status 1 when the file is still missing after the
	    download attempt.
	    """
	    # Local import: keeps this block independent of the file-level imports.
	    import subprocess

	    local_name = os.path.basename(f)
	    if not os.path.exists(local_name):
	        url = ("https://www.unicode.org/Public/%s/ucd/%s"
	               % (UNICODE_VERSION_NUMBER, f))
	        try:
	            # An argument list avoids shell interpretation of the URL.
	            # -f makes curl fail on HTTP errors instead of saving the
	            # server's error page under the data file's name, which the
	            # existence check below would accept as a successful download.
	            subprocess.run(["curl", "-f", "-O", url], check=False)
	        except OSError as err:
	            # curl itself could not be started; fall through to the check.
	            sys.stderr.write("cannot run curl: %s\n" % err)

	    if not os.path.exists(local_name):
	        sys.stderr.write("cannot load %s\n" % f)
	        sys.exit(1)

	# Loads code point data from emoji-data.txt
	# Implementation from unicode-segmentation
	def load_emoji_properties(f):
	    """Parse an emoji-data.txt style file into per-property code point ranges.

	    `f` is fetched with fetch_unidata() and then read from the current
	    directory under its base name.  Two line shapes are recognised: a single
	    code point ("XXXX ; Prop") and a range ("XXXX..YYYY ; Prop #"); every
	    other line is skipped.

	    Returns a dict mapping each property name to a list of inclusive
	    (first, last) integer pairs, in the order they appear in the file.
	    """
	    fetch_unidata(f)
	    # The ` *` quantifiers matter: fields are padded with a variable number
	    # of spaces, and entries start at the beginning of the line.
	    single_re = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
	    range_re = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+) *#")
	    kinds = {}

	    for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
	        single = single_re.match(line)
	        if single:
	            first, prop = single.group(1, 2)
	            last = first
	        else:
	            ranged = range_re.match(line)
	            if not ranged:
	                continue
	            first, last, prop = ranged.group(1, 2, 3)
	        # Code points are written in hexadecimal.
	        kinds.setdefault(prop.strip(), []).append((int(first, 16), int(last, 16)))

	    return kinds


	def load_general_category_properties(f):
	    """Read general-category ranges from a UnicodeData.txt style file.

	    `f` is fetched with fetch_unidata() and then read from the current
	    directory under its base name.  Lines that do not look like
	    "XXXX;name;Gc;..." are skipped.

	    Returns a list of (first, last, category) tuples in file order, with
	    inclusive integer bounds.  An ordinary entry, or one whose name is a
	    single "<...>" label, yields a one-code-point range.  A
	    "<label, First>" / "<label, Last>" pair yields one range spanning both.

	    Raises ValueError for a name that starts with '<' but matches none of
	    the bracketed forms.
	    """
	    fetch_unidata(f)
	    entry_re = re.compile(r"^([0-9A-F]+);([^;]+);([A-Za-z]+);.*$")
	    first_re = re.compile(r"^<(.*), First>$")
	    last_re = re.compile(r"^<(.*), Last>$")
	    bracketed_re = re.compile(r"^<(.*)>$")

	    entries = []
	    # Remembered from a "<..., First>" line until its "<..., Last>" line.
	    pending_lo = 0
	    pending_label = ''
	    pending_gc = ''
	    for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
	        entry = entry_re.match(line)
	        if not entry:
	            continue

	        code_point = int(entry.group(1), 16)
	        name = entry.group(2).strip()
	        gc = entry.group(3).strip()

	        if not name.startswith('<'):
	            entries.append((code_point, code_point, gc))
	            continue

	        opening = first_re.match(name)
	        if opening:
	            pending_lo, pending_label, pending_gc = code_point, opening.group(1), gc
	            continue

	        closing = last_re.match(name)
	        if closing:
	            # The closing line must belong to the range opened last.
	            assert(pending_label == closing.group(1))
	            assert(pending_gc == gc)
	            entries.append((pending_lo, code_point, gc))
	            continue

	        if bracketed_re.match(name):
	            entries.append((code_point, code_point, gc))
	            continue

	        raise ValueError("unreachable")
	    return entries

	def format_table_content(f, content, indent):
	    """Write comma-separated `content` to `f`, wrapped into indented lines.

	    `content` is split at every comma.  Pieces are joined with ", " on the
	    current line while the line plus the next piece stays below 98
	    characters; otherwise the line is written with a trailing "," and the
	    piece starts a new line indented by `indent` spaces.  The final line is
	    written without a trailing newline.
	    """
	    margin = " " * indent
	    current = margin
	    at_start = True
	    for piece in content.split(","):
	        if len(current) + len(piece) >= 98:
	            # Line is full: flush it and start the next one with this piece.
	            f.write(current + ",\n")
	            current = margin + piece
	            continue
	        current += piece if at_start else ", " + piece
	        at_start = False
	    f.write(current)

	def escape_char(c):
	    """Return `c` spelled as a Rust `char` literal of the form '\\u{hex}'.

	    The marker string 'multi' is rendered as a quoted placeholder instead.
	    """
	    return '"<multiple code points>"' if c == 'multi' else "'\\u{%x}'" % c

	def escape_char_list(l):
	    """Return the items of `l`, each passed through escape_char(), as "[a, b, ...]"."""
	    return "[" + ", ".join(escape_char(c) for c in l) + "]"

	def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
	        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
	    """Write a Rust slice item called `name` holding `t_data` to `f`.

	    f        -- writable text stream
	    name     -- identifier of the generated item
	    t_data   -- iterable of entries; each is rendered with `pfun`
	    t_type   -- Rust type annotation of the item
	    is_pub   -- prefix the declaration with `pub`
	    pfun     -- callable turning one entry into its Rust source text
	    is_const -- declare with `const` when true, `let` otherwise

	    The rendered entries are joined with commas and laid out by
	    format_table_content() with an indent of 8.
	    """
	    keyword = "const" if is_const else "let"
	    if is_pub:
	        keyword = "pub " + keyword
	    # The item sits inside a module, hence the four-space indent.
	    f.write("    %s %s: %s = &[\n" % (keyword, name, t_type))
	    format_table_content(f, ",".join(pfun(dat) for dat in t_data), 8)
	    f.write("\n    ];\n\n")

	def emit_general_category_module(f):
	    """Write the Rust `general_category` module to `f`.

	    The module text holds the `GeneralCategory` and `GeneralCategoryGroup`
	    enums, their helper functions, and a `GENERAL_CATEGORY` range table
	    built from UnicodeData.txt.  Consecutive ranges that touch and share a
	    category are merged into one table row; `Cs` (surrogate) entries are
	    left out of the table.
	    """
	    f.write("""#[cfg(feature = \"general-category\")]
	pub mod general_category {""")
	    f.write("""

	    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
	    /// The most general classification of a character.
	    pub enum GeneralCategory {
	        /// `Lu`, an uppercase letter
	        UppercaseLetter,
	        /// `Ll`, a lowercase letter
	        LowercaseLetter,
	        /// `Lt`, a digraphic character, with first part uppercase
	        TitlecaseLetter,
	        /// `Lm`, a modifier letter
	        ModifierLetter,
	        /// `Lo`, other letters, including syllables and ideographs
	        OtherLetter,
	        /// `Mn`, a nonspacing combining mark (zero advance width)
	        NonspacingMark,
	        /// `Mc`, a spacing combining mark (positive advance width)
	        SpacingMark,
	        /// `Me`, an enclosing combining mark
	        EnclosingMark,
	        /// `Nd`, a decimal digit
	        DecimalNumber,
	        /// `Nl`, a letterlike numeric character
	        LetterNumber,
	        /// `No`, a numeric character of other type
	        OtherNumber,
	        /// `Pc`, a connecting punctuation mark, like a tie
	        ConnectorPunctuation,
	        /// `Pd`, a dash or hyphen punctuation mark
	        DashPunctuation,
	        /// `Ps`, an opening punctuation mark (of a pair)
	        OpenPunctuation,
	        /// `Pe`, a closing punctuation mark (of a pair)
	        ClosePunctuation,
	        /// `Pi`, an initial quotation mark
	        InitialPunctuation,
	        /// `Pf`, a final quotation mark
	        FinalPunctuation,
	        /// `Po`, a punctuation mark of other type
	        OtherPunctuation,
	        /// `Sm`, a symbol of mathematical use
	        MathSymbol,
	        /// `Sc`, a currency sign
	        CurrencySymbol,
	        /// `Sk`, a non-letterlike modifier symbol
	        ModifierSymbol,
	        /// `So`, a symbol of other type
	        OtherSymbol,
	        /// `Zs`, a space character (of various non-zero widths)
	        SpaceSeparator,
	        /// `Zl`, U+2028 LINE SEPARATOR only
	        LineSeparator,
	        /// `Zp`, U+2029 PARAGRAPH SEPARATOR only
	        ParagraphSeparator,
	        /// `Cc`, a C0 or C1 control code
	        Control,
	        /// `Cf`, a format control character
	        Format,
	        /// `Cs`, a surrogate code point
	        Surrogate,
	        /// `Co`, a private-use character
	        PrivateUse,
	        /// `Cn`, a reserved unassigned code point or a noncharacter
	        Unassigned,
	    }

	    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
	    /// Groupings of the most general classification of a character.
	    pub enum GeneralCategoryGroup {
	        /// Lu | Ll | Lt | Lm | Lo
	        Letter,
	        /// Mn | Mc | Me
	        Mark,
	        /// Nd | Nl | No
	        Number,
	        /// Pc | Pd | Ps | Pe | Pi | Pf | Po
	        Punctuation,
	        /// Sm | Sc | Sk | So
	        Symbol,
	        /// Zs | Zl | Zp
	        Separator,
	        /// Cc | Cf | Cs | Co | Cn
	        Other,
	    }

	    #[inline]
	    pub(crate) fn general_category_of_char(c: char) -> GeneralCategory {
	        match c as usize {
	            _ => super::util::bsearch_range_value_table(c, GENERAL_CATEGORY).unwrap_or(GeneralCategory::Unassigned)
	        }
	    }

	    #[inline]
	    pub(crate) fn general_category_is_letter_cased(gc: GeneralCategory) -> bool {
	        matches!(gc, GeneralCategory::UppercaseLetter | GeneralCategory::LowercaseLetter | GeneralCategory::TitlecaseLetter)
	    }

	    #[inline]
	    pub(crate) fn general_category_group(gc: GeneralCategory) -> GeneralCategoryGroup {
	        match gc {
	            GeneralCategory::UppercaseLetter |
	            GeneralCategory::LowercaseLetter |
	            GeneralCategory::TitlecaseLetter |
	            GeneralCategory::ModifierLetter |
	            GeneralCategory::OtherLetter => GeneralCategoryGroup::Letter,
	            GeneralCategory::NonspacingMark |
	            GeneralCategory::SpacingMark |
	            GeneralCategory::EnclosingMark => GeneralCategoryGroup::Mark,
	            GeneralCategory::DecimalNumber |
	            GeneralCategory::LetterNumber |
	            GeneralCategory::OtherNumber => GeneralCategoryGroup::Number,
	            GeneralCategory::ConnectorPunctuation |
	            GeneralCategory::DashPunctuation |
	            GeneralCategory::OpenPunctuation |
	            GeneralCategory::ClosePunctuation |
	            GeneralCategory::InitialPunctuation |
	            GeneralCategory::FinalPunctuation |
	            GeneralCategory::OtherPunctuation => GeneralCategoryGroup::Punctuation,
	            GeneralCategory::MathSymbol |
	            GeneralCategory::CurrencySymbol |
	            GeneralCategory::ModifierSymbol |
	            GeneralCategory::OtherSymbol => GeneralCategoryGroup::Symbol,
	            GeneralCategory::SpaceSeparator |
	            GeneralCategory::LineSeparator |
	            GeneralCategory::ParagraphSeparator => GeneralCategoryGroup::Separator,
	            GeneralCategory::Control |
	            GeneralCategory::Format |
	            GeneralCategory::Surrogate |
	            GeneralCategory::PrivateUse |
	            GeneralCategory::Unassigned => GeneralCategoryGroup::Other,
	        }
	    }
	""")
	    # Two-letter category abbreviation -> Rust enum variant path.
	    gc_variants = {
	        "Lu": "GeneralCategory::UppercaseLetter",
	        "Ll": "GeneralCategory::LowercaseLetter",
	        "Lt": "GeneralCategory::TitlecaseLetter",
	        "Lm": "GeneralCategory::ModifierLetter",
	        "Lo": "GeneralCategory::OtherLetter",
	        "Mn": "GeneralCategory::NonspacingMark",
	        "Mc": "GeneralCategory::SpacingMark",
	        "Me": "GeneralCategory::EnclosingMark",
	        "Nd": "GeneralCategory::DecimalNumber",
	        "Nl": "GeneralCategory::LetterNumber",
	        "No": "GeneralCategory::OtherNumber",
	        "Pc": "GeneralCategory::ConnectorPunctuation",
	        "Pd": "GeneralCategory::DashPunctuation",
	        "Ps": "GeneralCategory::OpenPunctuation",
	        "Pe": "GeneralCategory::ClosePunctuation",
	        "Pi": "GeneralCategory::InitialPunctuation",
	        "Pf": "GeneralCategory::FinalPunctuation",
	        "Po": "GeneralCategory::OtherPunctuation",
	        "Sm": "GeneralCategory::MathSymbol",
	        "Sc": "GeneralCategory::CurrencySymbol",
	        "Sk": "GeneralCategory::ModifierSymbol",
	        "So": "GeneralCategory::OtherSymbol",
	        "Zs": "GeneralCategory::SpaceSeparator",
	        "Zl": "GeneralCategory::LineSeparator",
	        "Zp": "GeneralCategory::ParagraphSeparator",
	        "Cc": "GeneralCategory::Control",
	        "Cf": "GeneralCategory::Format",
	        "Cs": "GeneralCategory::Surrogate",
	        "Co": "GeneralCategory::PrivateUse",
	        "Cn": "GeneralCategory::Unassigned",
	    }

	    f.write("    // General category table:\n")
	    merged_ranges = []
	    for lo, hi, gc in load_general_category_properties("UnicodeData.txt"):
	        # The table stores Rust `char` bounds, which cannot be surrogates.
	        if gc == "Cs":
	            continue
	        if merged_ranges:
	            prev_lo, prev_hi, prev_gc = merged_ranges[-1]
	            if prev_hi + 1 == lo and prev_gc == gc:
	                # Touches the previous row and has the same category: extend it.
	                merged_ranges[-1] = (prev_lo, hi, prev_gc)
	                continue
	        merged_ranges.append((lo, hi, gc))
	    emit_table(f, "GENERAL_CATEGORY", merged_ranges, "&'static [(char, char, GeneralCategory)]", is_pub=False,
	            pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), gc_variants[x[2]]))
	    f.write("}\n\n")


	def emit_emoji_module(f):
	    """Write the feature-gated Rust `emoji` module to `f`.

	    Emits, in order: the module header, the `EmojiStatus` enum together
	    with its helper predicates, and the `EMOJI_STATUS` range table built
	    from "emoji/emoji-data.txt".

	    `f` is any object with a `write(str)` method (the open output file).
	    """
	    f.write("""#[cfg(feature = \"emoji\")]
	pub mod emoji {""")
	    # The alternation separators inside `matches!` below must be bare `|`:
	    # a backslash-escaped pipe is not valid Rust (and is an invalid escape
	    # sequence in a non-raw Python string), so the generated file would not
	    # compile.
	    # NOTE(review): `is_emoji_status_for_emoji_component` does not match
	    # `NonEmojiButEmojiComponent` even though that variant is documented as
	    # `Emoji_Component=YES` -- confirm whether this omission is intentional.
	    f.write("""

	#[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
	#[non_exhaustive]
	/// The emoji character properties of a character.
	pub enum EmojiStatus {
	/// `Emoji=NO`, `Emoji_Component=NO`
	NonEmoji,
	/// `Emoji=NO`, `Emoji_Component=YES`
	NonEmojiButEmojiComponent,
	/// `Emoji=YES`, `Emoji_Component=NO`;`Emoji_Presentation=YES`
	EmojiPresentation,
	/// `Emoji=YES`, `Emoji_Component=NO`;`Emoji_Modifier_Base=YES`
	EmojiModifierBase,
	/// `Emoji=YES`, `Emoji_Component=NO`;`Emoji_Presentation=YES`, `Emoji_Modifier_Base=YES`
	EmojiPresentationAndModifierBase,
	/// `Emoji=YES`, `Emoji_Component=NO`
	EmojiOther,
	/// `Emoji=YES`, `Emoji_Component=YES`;`Emoji_Presentation=YES`
	EmojiPresentationAndEmojiComponent,
	/// `Emoji=YES`, `Emoji_Component=YES`;`Emoji_Presentation=YES`, `Emoji_Modifier=YES`
	EmojiPresentationAndModifierAndEmojiComponent,
	/// `Emoji=YES`, `Emoji_Component=YES`
	EmojiOtherAndEmojiComponent,
	}
	#[inline]
	pub(crate) fn emoji_status(c: char) -> EmojiStatus {
	// FIXME: do we want to special case ASCII here?
	match c as usize {
	_ => super::util::bsearch_range_value_table(c, EMOJI_STATUS).unwrap()
	}
	}
	#[inline]
	pub(crate) fn is_emoji_status_for_emoji_char_or_emoji_component(s: EmojiStatus) -> bool {
	!matches!(s, EmojiStatus::NonEmoji)
	}
	#[inline]
	pub(crate) fn is_emoji_status_for_emoji_char(s: EmojiStatus) -> bool {
	!matches!(s, EmojiStatus::NonEmoji | EmojiStatus::NonEmojiButEmojiComponent)
	}
	#[inline]
	pub(crate) fn is_emoji_status_for_emoji_component(s: EmojiStatus) -> bool {
	matches!(s, EmojiStatus::EmojiPresentationAndEmojiComponent |
	EmojiStatus::EmojiPresentationAndModifierAndEmojiComponent |
	EmojiStatus::EmojiOtherAndEmojiComponent)
	}
	""")

	    f.write(" // Emoji status table:\n")
	    # Indexed below as property name -> list of (first, last) code point
	    # ranges. The sweep assumes each list is sorted and non-overlapping --
	    # TODO confirm against load_emoji_properties.
	    emoji_status_table = load_emoji_properties("emoji/emoji-data.txt")

	    # `Extended_Pictographic` only exists for future-proofing, so it is
	    # deliberately left out of the combined status.
	    emoji_prop_list = ["Emoji", "Emoji_Presentation", "Emoji_Modifier", "Emoji_Modifier_Base", "Emoji_Component"]

	    # Surrogates are not representable by Rust `char`s; treat them as a
	    # pseudo-property so the sweep can drop that range from the table.
	    emoji_status_table["Surrogate"] = [(0xD800, 0xDFFF)]
	    emoji_prop_list.append("Surrogate")

	    # Maps the ";"-joined list of active properties (in `emoji_prop_list`
	    # order) to the Rust expression emitted for that group.
	    status_variants = {
	        "Surrogate": "<Surrogate>",
	        "": "EmojiStatus::NonEmoji",
	        "Emoji_Component": "EmojiStatus::NonEmojiButEmojiComponent",
	        "Emoji;Emoji_Presentation": "EmojiStatus::EmojiPresentation",
	        "Emoji;Emoji_Presentation;Emoji_Modifier_Base": "EmojiStatus::EmojiPresentationAndModifierBase",
	        "Emoji;Emoji_Modifier_Base": "EmojiStatus::EmojiModifierBase",
	        "Emoji": "EmojiStatus::EmojiOther",
	        "Emoji;Emoji_Presentation;Emoji_Component": "EmojiStatus::EmojiPresentationAndEmojiComponent",
	        "Emoji;Emoji_Presentation;Emoji_Modifier;Emoji_Component": "EmojiStatus::EmojiPresentationAndModifierAndEmojiComponent",
	        "Emoji;Emoji_Component": "EmojiStatus::EmojiOtherAndEmojiComponent",
	    }

	    def group_text(s):
	        # An unknown combination is emitted as a variant the enum does not
	        # declare, so the new combination shows up verbatim in the output.
	        if s in status_variants:
	            return status_variants[s]
	        return "EmojiStatus::NewCombination(\"" + s + "\")"

	    code_point_last = 0x10FFFF
	    # Per property: index of the first range that is not yet fully consumed.
	    next_range = {prop: 0 for prop in emoji_prop_list}
	    emoji_table = []
	    cur_group_first = 0
	    # Sweep the code space, cutting it into maximal runs over which the set
	    # of active properties is constant.
	    while cur_group_first <= code_point_last:
	        cur_group_props = []
	        cur_group_last = code_point_last
	        for prop in emoji_prop_list:
	            ranges = emoji_status_table[prop]
	            pos = next_range[prop]
	            if pos >= len(ranges):
	                continue
	            if ranges[pos][0] > cur_group_first:
	                # Property starts later: the current run must end before it.
	                cur_group_last = min(cur_group_last, ranges[pos][0] - 1)
	            else:
	                # Property is active here: the run cannot outlive its range.
	                cur_group_props.append(prop)
	                cur_group_last = min(cur_group_last, ranges[pos][1])
	        cur_group_text = group_text(";".join(cur_group_props))
	        if cur_group_text != "<Surrogate>":
	            emoji_table.append((cur_group_first, cur_group_last, cur_group_text))
	        # Only active properties can have a range ending exactly at this run.
	        for prop in cur_group_props:
	            if emoji_status_table[prop][next_range[prop]][1] == cur_group_last:
	                next_range[prop] += 1
	        cur_group_first = cur_group_last + 1

	    emit_table(f, "EMOJI_STATUS", emoji_table, "&'static [(char, char, EmojiStatus)]", is_pub=False,
	               pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
	    f.write("}\n\n")

	def emit_util_mod(f):
	    """Write the Rust `util` module to `f`.

	    The module holds `bsearch_range_value_table`, the binary search over
	    `(lo, hi, value)` range tables that the generated lookup functions call.

	    `f` is any object with a `write(str)` method (the open output file).
	    """
	    # The closure parameter list must be delimited by bare `|`: a
	    # backslash-escaped pipe is not valid Rust (and is an invalid escape
	    # sequence in a non-raw Python string).
	    f.write("""
	#[allow(dead_code)]
	pub mod util {
	use core::result::Result::{Ok, Err};

	pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
	use core::cmp::Ordering::{Equal, Less, Greater};
	match r.binary_search_by(|&(lo, hi, _)| {
	if lo <= c && c <= hi { Equal }
	else if hi < c { Less }
	else { Greater }
	}) {
	Ok(idx) => {
	let (_, _, cat) = r[idx];
	Some(cat)
	}
	Err(_) => None
	}
	}

	}

	""")

	if __name__ == "__main__":
	    # Regenerate tables.rs from scratch in the current working directory.
	    r = "tables.rs"
	    if os.path.exists(r):
	        os.remove(r)
	    with open(r, "w") as rf:
	        # write the file's preamble
	        rf.write(preamble)

	        # The doc comment names this crate (unicode-properties); it
	        # previously named a different crate, unicode-security.
	        rf.write("""
	/// The version of [Unicode](http://www.unicode.org/)
	/// that this version of unicode-properties is based on.
	pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);

	""" % UNICODE_VERSION)

	        # Shared binary-search helper used by the generated lookup functions.
	        emit_util_mod(rf)
	        ### general category module
	        emit_general_category_module(rf)
	        ### emoji module
	        emit_emoji_module(rf)