| #!/usr/bin/python |
| |
| import collections |
| import re |
| |
# Number of char literals that output_tables() places on one line of a
# generated table.
column_size = 8

# Maps the two-letter category abbreviation found in field 2 of a
# UnicodeData.txt row to its [major class, subclass] name parts.
# generate_tables() concatenates the two parts to form a table name.
#
# NOTE(review): 'Ls' does not appear to be a General_Category abbreviation
# in UAX #44 (cased letter is 'LC'); the key is kept unchanged -- confirm.
categories = {
    # Other
    'Cc': ['Other', 'Control'],
    'Cf': ['Other', 'Format'],
    'Cn': ['Other', 'NotAssigned'],
    'Co': ['Other', 'PrivateUse'],
    'Cs': ['Other', 'Surrogate'],

    # Letter
    'Ls': ['Letter', 'Cased'],
    'Ll': ['Letter', 'Lowercased'],
    'Lm': ['Letter', 'Modifier'],
    'Lo': ['Letter', 'Other'],
    'Lt': ['Letter', 'Titlecase'],
    'Lu': ['Letter', 'Uppercase'],

    # Mark
    'Mc': ['Mark', 'SpaceCombining'],
    'Me': ['Mark', 'Enclosing'],
    'Mn': ['Mark', 'Nonspacing'],

    # Number
    'Nd': ['Number', 'DecimalDigit'],
    'Nl': ['Number', 'Letter'],
    'No': ['Number', 'Other'],

    # Punctuation
    'Pc': ['Punctuation', 'Connector'],
    'Pd': ['Punctuation', 'Dash'],
    'Pe': ['Punctuation', 'Close'],
    'Pf': ['Punctuation', 'FinalQuote'],
    'Pi': ['Punctuation', 'InitialQuote'],
    'Po': ['Punctuation', 'Other'],
    'Ps': ['Punctuation', 'Open'],

    # Symbol
    'Sc': ['Symbol', 'Currency'],
    'Sk': ['Symbol', 'Modifier'],
    'Sm': ['Symbol', 'Math'],
    'So': ['Symbol', 'Other'],

    # Separator
    'Zl': ['Separator', 'Line'],
    'Zp': ['Separator', 'Paragraph'],
    'Zs': ['Separator', 'Space'],
}
| |
def generate_rows():
    """Yield ``(char, category)`` string pairs read from ``UnicodeData.txt``.

    The file is opened relative to the current working directory. Every
    non-blank line is split on ``;``; field 0 and field 2 are yielded
    as-is (both are strings).

    Blank lines are skipped, so a trailing empty line no longer raises
    ``IndexError``. A non-blank line with fewer than three fields still
    raises ``IndexError``.

    NOTE(review): UnicodeData.txt appears to describe some large ranges
    with a "First"/"Last" pair of rows; only those two endpoint rows are
    yielded here, not the code points in between -- confirm whether that
    is intended.
    """
    with open('UnicodeData.txt', 'r') as ucd:
        for line in ucd:
            record = line.strip()
            if not record:
                # Nothing to parse on an empty line.
                continue
            fields = record.split(';')
            yield (fields[0], fields[2])
| |
| |
def generate_dict(rows_gen):
    """Group characters by category.

    ``rows_gen`` is an iterable of ``(char, category)`` pairs. The result
    is a ``defaultdict(list)`` mapping each category to its characters in
    the order they were seen. Rows whose category is ``'Cs'`` are left out.
    """
    grouped = collections.defaultdict(list)
    for code_point, cat in rows_gen:
        # 'Cs' rows are dropped: per the original author's note, Rust does
        # not allow this class of characters as unicode literals.
        if cat != 'Cs':
            grouped[cat].append(code_point)
    return grouped
| |
def generate_tables(d):
    """Re-key ``d`` from category abbreviations to joined category names.

    For every key of ``d`` the two name parts stored in ``categories`` are
    concatenated (e.g. ``['Letter', 'Uppercase']`` -> ``'LetterUppercase'``)
    and used as the key in the returned ``defaultdict(list)``; the value is
    the same list object held by ``d``. A key missing from ``categories``
    raises ``KeyError``.
    """
    renamed = collections.defaultdict(list)
    for abbreviation, chars in d.items():
        renamed[''.join(categories[abbreviation])] = chars
    return renamed
| |
def print_header():
    """Print the "autogenerated" banner comment, followed by a blank line."""
    banner = "// This file is autogenerated by scripts/unicode.py.\n"
    print(banner)
| |
def main():
    """Run the whole pipeline and print the generated tables.

    Order: print the header, read the rows, group them by category,
    rename the groups, then print each table.
    """
    print_header()
    by_category = generate_dict(generate_rows())
    output_tables(generate_tables(by_category))
| |
def output_tables(d):
    """Print every table in ``d`` as a Rust ``&'static [char]`` static.

    ``d`` maps a CamelCase table name to a list of strings. Keys are
    processed in sorted order. For each key the static's name is the
    upper-cased snake_case form of the key, and every string ``x`` is
    written as the literal ``'\\u{x}'``, with ``column_size`` literals
    per line.

    Works on both Python 2 and Python 3: the literals are built as a real
    list (Python 3's ``map`` returns an iterator that cannot be sliced)
    and ``range`` replaces the Python-2-only ``xrange``.
    """
    for key in sorted(d):
        name = camel_to_snake_case(key).upper()
        escapes = [r"'\u{{{}}}'".format(char) for char in d[key]]
        # Slice the literals into rows of at most ``column_size`` entries.
        table_lines = [
            ' ' + ', '.join(escapes[start:start + column_size])
            for start in range(0, len(escapes), column_size)
        ]
        table_string = ',\n'.join(table_lines)
        print("pub static {} : &'static [char] = &[\n{}];\n".format(name, table_string))
| |
def camel_to_snake_case(name):
    """Convert a CamelCase identifier to lower-case snake_case.

    Two-pass regex recipe, thanks to
    http://stackoverflow.com/a/1176023/1030074:

    1. insert ``_`` between any character and a following capitalised
       word (an upper-case letter followed by lower-case letters);
    2. insert ``_`` between a lower-case letter or digit and a following
       upper-case letter.

    The result is then lower-cased.
    """
    result = name
    for pattern in ('(.)([A-Z][a-z]+)', '([a-z0-9])([A-Z])'):
        result = re.sub(pattern, r'\1_\2', result)
    return result.lower()
| |
# Generate the tables only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()