blob: 6a35d174262c3b854c49721aa5199d85662d1332 [file] [log] [blame]
#!/usr/bin/python
import collections
import re
# Number of character literals emitted per line of a generated table
# (used as the chunk width in output_tables).
column_size = 8

# Maps a two-letter category code (field 2 of a UnicodeData.txt record) to
# the [major class, subclass] name parts. generate_tables() concatenates the
# two parts to form the CamelCase table name.
categories = {
    # Other
    'Cc': ['Other', 'Control'],
    'Cf': ['Other', 'Format'],
    'Cn': ['Other', 'NotAssigned'],
    'Co': ['Other', 'PrivateUse'],
    'Cs': ['Other', 'Surrogate'],
    # Letter
    # NOTE(review): 'Ls' does not look like a standard general-category
    # abbreviation (cased letter is usually written 'LC') -- confirm
    # whether this entry is ever matched.
    'Ls': ['Letter', 'Cased'],
    'Ll': ['Letter', 'Lowercased'],
    'Lm': ['Letter', 'Modifier'],
    'Lo': ['Letter', 'Other'],
    'Lt': ['Letter', 'Titlecase'],
    'Lu': ['Letter', 'Uppercase'],
    # Mark
    'Mc': ['Mark', 'SpaceCombining'],
    'Me': ['Mark', 'Enclosing'],
    'Mn': ['Mark', 'Nonspacing'],
    # Number
    'Nd': ['Number', 'DecimalDigit'],
    'Nl': ['Number', 'Letter'],
    'No': ['Number', 'Other'],
    # Punctuation
    'Pc': ['Punctuation', 'Connector'],
    'Pd': ['Punctuation', 'Dash'],
    'Pe': ['Punctuation', 'Close'],
    'Pf': ['Punctuation', 'FinalQuote'],
    'Pi': ['Punctuation', 'InitialQuote'],
    'Po': ['Punctuation', 'Other'],
    'Ps': ['Punctuation', 'Open'],
    # Symbol
    'Sc': ['Symbol', 'Currency'],
    'Sk': ['Symbol', 'Modifier'],
    'Sm': ['Symbol', 'Math'],
    'So': ['Symbol', 'Other'],
    # Separator
    'Zl': ['Separator', 'Line'],
    'Zp': ['Separator', 'Paragraph'],
    'Zs': ['Separator', 'Space'],
}
def generate_rows():
    """Yield ``(char, category)`` pairs read from ``UnicodeData.txt``.

    The file is opened relative to the current working directory. Every
    non-blank line is split on ``;``; field 0 is yielded as ``char`` and
    field 2 as ``category``, both as the raw strings found in the file.

    Blank (whitespace-only) lines are skipped instead of raising
    ``IndexError``. A non-blank line with fewer than three fields still
    raises ``IndexError`` so that malformed data is not silently dropped.

    NOTE(review): only fields 0 and 2 are read, so anything encoded in the
    other fields is ignored -- presumably this includes range markers in
    the name field; confirm that is intended.
    """
    with open('UnicodeData.txt', 'r') as ucd:
        for line in ucd:
            if not line.strip():
                # Tolerate empty lines, e.g. a stray trailing newline.
                continue
            fields = line.split(';')
            yield (fields[0], fields[2])
def generate_dict(rows_gen):
    """Group characters by category.

    ``rows_gen`` is an iterable of ``(char, category)`` pairs. Returns a
    ``collections.defaultdict(list)`` mapping each category to the list of
    its characters, in the order they were encountered. Rows whose category
    is ``'Cs'`` are dropped.
    """
    grouped = collections.defaultdict(list)
    for code_point, general_category in rows_gen:
        # for whatever reason, rust doesn't allow the 'Cs' class of
        # characters as unicode literals, so those rows are left out.
        if general_category != 'Cs':
            grouped[general_category].append(code_point)
    return grouped
def generate_tables(d):
    """Re-key a category-code mapping by its full CamelCase category name.

    ``d`` maps two-letter category codes to lists of characters. Each code
    is looked up in the module-level ``categories`` table and its two name
    parts are concatenated (e.g. ``'Lu'`` -> ``'LetterUppercase'``). The
    character lists themselves are reused, not copied.

    Returns a ``collections.defaultdict(list)``. Raises ``KeyError`` for a
    code that is missing from ``categories``.
    """
    renamed = collections.defaultdict(list)
    for code, chars in d.items():
        table_name = ''.join(categories[code])
        renamed[table_name] = chars
    return renamed
def print_header():
    """Write the "autogenerated" banner comment to standard output."""
    banner = "// This file is autogenerated by scripts/unicode.py.\n"
    # print() adds its own newline, so the banner is followed by a blank line.
    print(banner)
def main():
    """Run the whole pipeline and print the generated tables.

    Prints the header, then feeds the rows read by ``generate_rows``
    through ``generate_dict`` and ``generate_tables`` and hands the result
    to ``output_tables``.
    """
    print_header()
    output_tables(generate_tables(generate_dict(generate_rows())))
def output_tables(d):
    """Print one Rust ``pub static`` char-slice definition per entry of ``d``.

    ``d`` maps CamelCase table names to lists of code-point strings. Tables
    are printed in sorted key order. Each table name is converted with
    ``camel_to_snake_case`` and upper-cased, each code point is rendered as
    a Rust ``'\\u{...}'`` char literal, and the literals are laid out
    ``column_size`` per line.

    Works on both Python 2 and Python 3: the escapes are built as a real
    list and chunked with ``range`` rather than relying on a sliceable
    ``map`` result and ``xrange``.
    """
    for key in sorted(d):
        name = camel_to_snake_case(key).upper()
        # Must be a list: it is sliced below, and map() is a lazy iterator
        # on Python 3.
        rust_unicode_escapes = [r"'\u{{{}}}'".format(x) for x in d[key]]
        table_lines = [
            ' ' + ', '.join(rust_unicode_escapes[start:start + column_size])
            for start in range(0, len(rust_unicode_escapes), column_size)
        ]
        table_string = ',\n'.join(table_lines)
        print("pub static {} : &'static [char] = &[\n{}];\n".format(name, table_string))
def camel_to_snake_case(name):
    """Convert a CamelCase identifier to lower-case snake_case.

    Two regex passes insert underscores at word boundaries; the result is
    then lower-cased.
    """
    # thanks to http://stackoverflow.com/a/1176023/1030074
    boundary_patterns = (
        '(.)([A-Z][a-z]+)',    # any char followed by a Capitalised word
        '([a-z0-9])([A-Z])',   # lower-case letter or digit followed by a capital
    )
    converted = name
    for pattern in boundary_patterns:
        converted = re.sub(pattern, r'\1_\2', converted)
    return converted.lower()
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()