| # This is a part of rust-encoding. |
| # Copyright (c) 2013-2015, Kang Seonghoon. |
| # See README.md and LICENSE.txt for details. |
| |
| import urllib |
| import sys |
| import os.path |
| |
def whatwg_index(name, comments):
    """Download one WHATWG encoding index and yield its integer pairs.

    Fetches ``index-<name>.txt`` from encoding.spec.whatwg.org and parses it
    line by line:

    * blank lines are skipped;
    * lines starting with ``#`` are appended to ``comments`` with the leading
      ``#`` replaced by ``//`` and are not yielded;
    * every other line must start with two whitespace-separated integers in
      any notation accepted by ``int(text, 0)``; anything after the second
      field is ignored.

    Args:
        name: index name substituted into the download URL.
        comments: list that receives the converted comment lines (mutated
            in place).

    Yields:
        ``(key, value)`` tuples of ints, in file order.
    """
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        urlopen = urllib.urlopen  # Python 2

    response = urlopen('http://encoding.spec.whatwg.org/index-%s.txt' % name)
    try:
        for line in response:
            if not isinstance(line, str):
                # Python 3 hands out bytes; Python 2 already yields str.
                line = line.decode('utf-8')
            line = line.strip()
            if not line:
                continue
            if line.startswith('#'):
                comments.append('//' + line[1:])
                continue
            parts = line.split(None, 2)
            key = int(parts[0], 0)
            value = int(parts[1], 0)
            yield key, value
    finally:
        # Release the connection even if the consumer stops early or a
        # line fails to parse.
        response.close()
| |
def mkdir_and_open(crate, name):
    """Open ``<script dir>/<crate>/<name>.rs`` for writing, creating the directory.

    Dashes in ``name`` are replaced by underscores to form the file name.

    Args:
        crate: directory name, relative to the directory of this script.
        name: index name used to build the ``.rs`` file name.

    Returns:
        An open, writable file object; the caller is responsible for
        closing it (it can be used in a ``with`` statement).

    Raises:
        OSError: if the directory cannot be created and does not already
            exist, or if the file cannot be opened.
    """
    dirname = os.path.join(os.path.dirname(__file__), crate)
    try:
        os.mkdir(dirname)
    except OSError:
        # An already-existing directory is the expected case; any other
        # failure (permissions, missing parent, ...) must not be hidden.
        if not os.path.isdir(dirname):
            raise
    path = os.path.join(dirname, '%s.rs' % name.replace('-', '_'))
    if sys.version_info[0] >= 3:
        # Text mode accepts str on Python 3; newline='\n' keeps the
        # untranslated LF line endings that 'wb' gives on Python 2.
        return open(path, 'w', newline='\n', encoding='utf-8')
    return open(path, 'wb')
| |
def write_header(f, name, comments):
    """Write the "autogenerated" banner followed by the index's own comments.

    Args:
        f: writable file-like object.
        name: index name, interpolated into the banner line.
        comments: iterable of comment lines, each written on its own line.
    """
    # f.write() is used instead of the Python 2 only ``print >>f`` form so
    # the function behaves the same on Python 2 and Python 3.
    f.write('// AUTOGENERATED FROM index-%s.txt, ORIGINAL COMMENT FOLLOWS:\n' % name)
    f.write('//\n')
    for line in comments:
        f.write('%s\n' % (line,))
| |
def write_comma_separated(f, prefix, l, width=80):
    """Write the items of ``l`` packed onto lines of at most ``width`` columns.

    Each item is converted with ``str()`` and appended verbatim to the
    current line (items are expected to carry their own separator, e.g.
    ``'12, '``).  A line is flushed when the next item would push
    ``len(prefix) + len(line)`` past ``width``; trailing whitespace is
    stripped from every flushed line.

    Args:
        f: writable file-like object.
        prefix: string written at the start of every output line.
        l: iterable of items to write.
        width: maximum line length, counted before trailing whitespace is
            stripped.  A single item longer than the limit is written on a
            line of its own.
    """
    buffered = ''
    for item in l:
        item = str(item)
        # Only flush when there is something buffered: an oversized item
        # arriving on an empty buffer must not produce a prefix-only line.
        if buffered and len(prefix) + len(buffered) + len(item) > width:
            f.write(prefix + buffered.rstrip() + '\n')
            buffered = item
        else:
            buffered += item
    if buffered:
        f.write(prefix + buffered.rstrip() + '\n')
| |
def make_minimal_trie(invdata, lowerlimit=0x10000):
    """Find the two-level lookup table for ``invdata`` with the fewest entries.

    For each block size ``1 << triebits`` (``triebits`` in 0..20) the key
    space ``[0, max(invdata) + 1)`` is cut into blocks.  Identical blocks
    are stored once in ``lower``, which always begins with one all-``None``
    block at offset 0; ``upper`` holds, per block, the offset of its
    contents within ``lower``.

    Args:
        invdata: dict mapping non-negative integer keys to values.
        lowerlimit: exclusive upper bound on ``len(lower)``.

    Returns:
        ``(triebits, lower, upper)`` for the candidate minimising
        ``len(lower) + len(upper)``; on ties the larger ``triebits`` wins.
        ``None`` if no candidate satisfies ``lowerlimit``.

    Raises:
        ValueError: if ``invdata`` is empty (raised by ``max``).
    """
    maxvalue = max(invdata.keys()) + 1
    best = 0xffffffff
    besttrie = None
    for triebits in range(21):
        blocksize = 1 << triebits
        if blocksize >= lowerlimit:
            # `lower` starts out with `blocksize` entries, so from here on
            # len(lower) < lowerlimit can never hold; skip the wasted work.
            break
        lower = [None] * blocksize
        upper = []
        lowermap = {tuple(lower): 0}
        for start in range(0, maxvalue, blocksize):
            blk = [invdata.get(j) for j in range(start, start + blocksize)]
            blkkey = tuple(blk)
            loweridx = lowermap.get(blkkey)
            if loweridx is None:
                # first occurrence of this block: store it at the end
                loweridx = len(lower)
                lowermap[blkkey] = loweridx
                lower += blk
            upper.append(loweridx)
        total = len(lower) + len(upper)
        if len(lower) < lowerlimit and best >= total:
            best = total
            besttrie = (triebits, lower, upper)
    return besttrie
| |
def generate_single_byte_index(crate, name):
    """Generate the Rust module for one single-byte index.

    Reads index ``name`` through ``whatwg_index``, builds a 128-entry
    forward table and a two-level backward table (``make_minimal_trie``),
    and writes both, together with ``forward``/``backward`` accessor
    functions and a ``single_byte_tests!`` invocation, to the file opened
    by ``mkdir_and_open(crate, name)``.

    Unmapped forward entries are written as 0xffff; unmapped backward
    entries as 0, mapped ones as ``pointer + 0x80``.

    Args:
        crate: directory name handed to ``mkdir_and_open``.
        name: index name; dashes become underscores in the module name.

    Returns:
        Size in bytes of the emitted tables (2 per forward and upper entry,
        1 per lower entry).
    """
    modname = name.replace('-', '_')

    data = [None] * 128
    invdata = {}
    comments = []
    for key, value in whatwg_index(name, comments):
        assert 0 <= key < 128 and 0 <= value < 0xffff and data[key] is None and value not in invdata
        data[key] = value
        invdata[value] = key

    # generate a trie with a minimal amount of data
    triebits, lower, upper = make_minimal_trie(invdata, lowerlimit=0x10000)

    with mkdir_and_open(crate, name) as f:
        def emit(text=''):
            # one output line; replaces the Python 2 only `print >>f`
            f.write(text + '\n')

        write_header(f, name, comments)
        emit()
        emit("static FORWARD_TABLE: &'static [u16] = &[")
        write_comma_separated(f, ' ',
            ['%d, ' % (0xffff if value is None else value) for value in data])
        emit('];')
        emit()
        emit('/// Returns the index code point for pointer `code` in this index.')
        emit('#[inline]')
        emit('pub fn forward(code: u8) -> u16 {')
        emit(' FORWARD_TABLE[(code - 0x80) as usize]')
        emit('}')
        emit()
        emit("static BACKWARD_TABLE_LOWER: &'static [u8] = &[")
        write_comma_separated(f, ' ', ['%d, ' % (0 if v is None else v+0x80) for v in lower])
        emit('];')
        emit()
        emit("static BACKWARD_TABLE_UPPER: &'static [u16] = &[")
        write_comma_separated(f, ' ', ['%d, ' % v for v in upper])
        emit('];')
        emit()
        emit('/// Returns the index pointer for code point `code` in this index.')
        emit('#[inline]')
        emit('pub fn backward(code: u32) -> u8 {')
        emit(' let offset = (code >> %d) as usize;' % triebits)
        emit(' let offset = if offset < %d {BACKWARD_TABLE_UPPER[offset] as usize} else {0};' % len(upper))
        emit(' BACKWARD_TABLE_LOWER[offset + ((code & %d) as usize)]' % ((1<<triebits)-1))
        emit('}')
        emit()
        emit('#[cfg(test)]')
        emit('single_byte_tests!(')
        emit(' mod = %s' % modname)
        emit(');')

    return 2 * len(data) + len(lower) + 2 * len(upper)
| |
def generate_multi_byte_index(crate, name):
    """Generate the Rust module for one multi-byte index.

    Reads index ``name`` through ``whatwg_index`` and writes, to the file
    opened by ``mkdir_and_open(crate, name)``:

    * ``FORWARD_TABLE`` with the low 16 bits of every code point for
      pointers in ``[min pointer, max pointer]`` (0xffff where unmapped);
    * ``FORWARD_TABLE_MORE``, one bit per pointer, only when some code
      point is 0x10000 or above (such code points must have
      ``value >> 16 == 2``);
    * a two-level backward table built by ``make_minimal_trie`` that keeps
      the first pointer seen for each code point; later pointers for the
      same code point are listed as ``dups`` in the test macro;
    * for ``jis0208`` only, ``BACKWARD_TABLE_REMAPPED`` and
      ``backward_remapped``.

    Args:
        crate: directory name handed to ``mkdir_and_open``.
        name: index name; dashes become underscores in the module name.

    Returns:
        Size in bytes of all emitted tables.
    """
    modname = name.replace('-', '_')

    data = {}
    invdata = {}
    dups = []
    comments = []
    morebits = False
    for key, value in whatwg_index(name, comments):
        assert 0 <= key < 0xffff and 0 <= value < 0x110000 and value != 0xffff and key not in data
        # Same threshold as the FORWARD_TABLE_MORE bit test below, so that a
        # value of exactly 0x10000 trips the assertion instead of silently
        # producing a wrong table.
        if value >= 0x10000:
            assert (value >> 16) == 2
            morebits = True
        data[key] = value
        if value not in invdata:
            invdata[value] = key
        else:
            dups.append(key)

    # Big5 has four two-letter forward mappings, we use special entries for them
    if name == 'big5':
        specialidx = [1133, 1135, 1164, 1166]
        assert all(key not in data for key in specialidx)
        assert all(value not in invdata for value in range(len(specialidx)))
        for value, key in enumerate(specialidx):
            data[key] = value
            dups.append(key) # no consistency testing for them

    # generate a trie with a minimal amount of data
    triebits, lower, upper = make_minimal_trie(invdata, lowerlimit=0x10000)

    # JIS X 0208 index has two ranges [8272,8836) and [8836,11280) to support two slightly
    # different encodings EUC-JP and Shift_JIS; the default backward function would favor
    # the former, so we need a separate mapping for the latter.
    #
    # fortunately for us, all allocated codes in [8272,8836) have counterparts in others,
    # so we only need a smaller remapping from [8272,8836) to other counterparts.
    remap = None
    if name == 'jis0208':
        REMAP_MIN = 8272
        REMAP_MAX = 8835

        invdataminusremap = {}
        for key, value in data.items():
            if value not in invdataminusremap and not REMAP_MIN <= key <= REMAP_MAX:
                invdataminusremap[value] = key

        remap = []
        for i in range(REMAP_MIN, REMAP_MAX+1):
            if i in data:
                assert data[i] in invdataminusremap
                value = invdataminusremap[data[i]]
                assert value < 0x10000
                remap.append(value)
            else:
                remap.append(0xffff)

    minkey = min(data)
    maxkey = max(data) + 1
    with mkdir_and_open(crate, name) as f:
        def emit(text=''):
            # one output line; replaces the Python 2 only `print >>f`
            f.write(text + '\n')

        write_header(f, name, comments)
        emit()
        emit("static FORWARD_TABLE: &'static [u16] = &[")
        write_comma_separated(f, ' ',
            ['%d, ' % (data.get(key, 0xffff) & 0xffff) for key in range(minkey, maxkey)])
        emit('];')
        if morebits:
            emit()
            emit("static FORWARD_TABLE_MORE: &'static [u32] = &[")
            bits = []
            for i in range(minkey, maxkey, 32):
                # pack 32 consecutive pointers into one word, lowest first
                v = 0
                for j in range(32):
                    v |= (data.get(i+j, 0) >= 0x10000) << j
                bits.append(v)
            write_comma_separated(f, ' ', ['%d, ' % v for v in bits])
            emit('];')
        emit()
        emit('/// Returns the index code point for pointer `code` in this index.')
        emit('#[inline]')
        emit('pub fn forward(code: u16) -> u32 {')
        if minkey != 0:
            emit(' let code = (code as usize).wrapping_sub(%d);' % minkey)
        else:
            emit(' let code = code as usize;')
        emit(' if code < %d {' % (maxkey - minkey))
        if morebits:
            emit(' (FORWARD_TABLE[code] as u32) | ' +
                 '(((FORWARD_TABLE_MORE[code >> 5] >> (code & 31)) & 1) << 17)')
        else:
            emit(' FORWARD_TABLE[code] as u32')
        emit(' } else {')
        emit(' 0xffff')
        emit(' }')
        emit('}')
        emit()
        emit("static BACKWARD_TABLE_LOWER: &'static [u16] = &[")
        write_comma_separated(f, ' ', ['%d, ' % (0xffff if v is None else v) for v in lower])
        emit('];')
        emit()
        emit("static BACKWARD_TABLE_UPPER: &'static [u16] = &[")
        write_comma_separated(f, ' ', ['%d, ' % v for v in upper])
        emit('];')
        if remap:
            emit()
            emit("static BACKWARD_TABLE_REMAPPED: &'static [u16] = &[")
            write_comma_separated(f, ' ', ['%d, ' % v for v in remap])
            emit('];')
        emit()
        emit('/// Returns the index pointer for code point `code` in this index.')
        emit('#[inline]')
        emit('pub fn backward(code: u32) -> u16 {')
        emit(' let offset = (code >> %d) as usize;' % triebits)
        emit(' let offset = if offset < %d {BACKWARD_TABLE_UPPER[offset] as usize} else {0};' % len(upper))
        emit(' BACKWARD_TABLE_LOWER[offset + ((code & %d) as usize)]' % ((1<<triebits)-1))
        emit('}')
        if remap:
            emit()
            assert name == 'jis0208'
            emit('/// Returns the index shift_jis pointer for code point `code`.')
            emit('#[inline]')
            emit('pub fn backward_remapped(code: u32) -> u16 {')
            emit(' let value = backward(code);')
            emit(' if %d <= value && value <= %d {' % (REMAP_MIN, REMAP_MAX))
            emit(' BACKWARD_TABLE_REMAPPED[(value - %d) as usize]' % REMAP_MIN)
            emit(' } else {')
            emit(' value')
            emit(' }')
            emit('}')
        emit()
        emit('#[cfg(test)]')
        emit('multi_byte_tests!(')
        emit(' mod = %s,' % modname)
        if remap:
            emit(' remap = [%d, %d],' % (REMAP_MIN, REMAP_MAX))
        if dups:
            emit(' dups = [')
            write_comma_separated(f, ' ', ['%d, ' % v for v in sorted(dups)])
            emit(' ]')
        else:
            emit(' dups = []')
        emit(');')

    tablesz = 2 * (maxkey - minkey) + 2 * len(lower) + 2 * len(upper)
    if morebits: tablesz += 4 * ((maxkey - minkey + 31) // 32)
    if remap: tablesz += 2 * len(remap)
    return tablesz
| |
def generate_multi_byte_range_lbound_index(crate, name):
    """Generate the Rust module for a range index given by lower bounds.

    Reads the ``(pointer, code point)`` pairs of index ``name`` through
    ``whatwg_index`` (they must be non-empty and sorted), prepends a
    ``(0, 0)`` entry when the first pair is not already ``(0, 0)``, and
    writes to the file opened by ``mkdir_and_open(crate, name)``:

    * ``FORWARD_TABLE`` (the code points) and ``BACKWARD_TABLE`` (the
      pointers), both ``u32``;
    * ``forward`` and ``backward`` functions that locate the entry with a
      fixed sequence of comparisons using halving power-of-two steps and
      then add the offset from that entry;
    * a ``multi_byte_range_tests!`` invocation with the key/value bounds.

    Args:
        crate: directory name handed to ``mkdir_and_open``.
        name: index name; dashes become underscores in the module name.

    Returns:
        Size in bytes of the two emitted tables (8 per entry).
    """
    modname = name.replace('-', '_')

    data = []
    comments = []
    for key, value in whatwg_index(name, comments):
        data.append((key, value))
    assert data and data == sorted(data)

    minkey, minvalue = data[0]
    maxkey, maxvalue = data[-1]
    if data[0] != (0, 0):
        data.insert(0, (0, 0))
    # largest power of two not exceeding len(data)
    maxlog2 = 0
    while 2**(maxlog2 + 1) <= len(data):
        maxlog2 += 1

    if name == 'gb18030-ranges':
        keyubound = 0x110000
        valueubound = 126 * 10 * 126 * 10
    else:
        keyubound = maxkey + 1
        valueubound = maxvalue + 1

    with mkdir_and_open(crate, name) as f:
        def emit(text=''):
            # one output line; replaces the Python 2 only `print >>f`
            f.write(text + '\n')

        write_header(f, name, comments)
        emit()
        emit("static FORWARD_TABLE: &'static [u32] = &[")
        write_comma_separated(f, ' ', ['%d, ' % value for key, value in data])
        emit('];')
        emit()
        emit("static BACKWARD_TABLE: &'static [u32] = &[")
        write_comma_separated(f, ' ', ['%d, ' % key for key, value in data])
        emit('];')
        emit()
        emit('/// Returns the index code point for pointer `code` in this index.')
        emit('#[inline]')
        emit('pub fn forward(code: u32) -> u32 {')
        if minkey > 0:
            emit(' if code < %d { return 0xffffffff; }' % minkey)
        if name == 'gb18030-ranges': # has "invalid" region inside
            emit(' if (code > 39419 && code < 189000) || code > 1237575 { return 0xffffffff; }')
        emit(' let mut i = if code >= BACKWARD_TABLE[%d] {%d} else {0};' %
             (2**maxlog2 - 1, len(data) - 2**maxlog2 + 1))
        for step in range(maxlog2-1, -1, -1):
            emit(' if code >= BACKWARD_TABLE[i%s] { i += %d; }' %
                 ('+%d' % (2**step-1) if step > 0 else '', 2**step))
        emit(' (code - BACKWARD_TABLE[i-1]) + FORWARD_TABLE[i-1]')
        emit('}')
        emit()
        emit('/// Returns the index pointer for code point `code` in this index.')
        emit('#[inline]')
        emit('pub fn backward(code: u32) -> u32 {')
        if minvalue > 0:
            emit(' if code < %d { return 0xffffffff; }' % minvalue)
        emit(' let mut i = if code >= FORWARD_TABLE[%d] {%d} else {0};' %
             (2**maxlog2 - 1, len(data) - 2**maxlog2 + 1))
        for step in range(maxlog2-1, -1, -1):
            emit(' if code >= FORWARD_TABLE[i%s] { i += %d; }' %
                 ('+%d' % (2**step-1) if step > 0 else '', 2**step))
        emit(' (code - FORWARD_TABLE[i-1]) + BACKWARD_TABLE[i-1]')
        emit('}')
        emit()
        emit('#[cfg(test)]')
        emit('multi_byte_range_tests!(')
        emit(' mod = %s,' % modname)
        emit(' key = [%d, %d], key < %d,' % (minkey, maxkey, keyubound))
        emit(' value = [%d, %d], value < %d' % (minvalue, maxvalue, valueubound))
        emit(');')

    return 8 * len(data)
| |
# Every index this script can generate.  Keys are '<directory>/<index name>':
# the part before the slash is the `crate` argument (output directory) and
# the part after it is the `name` argument passed to the generator function
# stored as the value.
INDICES = {
    'singlebyte/ibm866': generate_single_byte_index,
    'singlebyte/iso-8859-2': generate_single_byte_index,
    'singlebyte/iso-8859-3': generate_single_byte_index,
    'singlebyte/iso-8859-4': generate_single_byte_index,
    'singlebyte/iso-8859-5': generate_single_byte_index,
    'singlebyte/iso-8859-6': generate_single_byte_index,
    'singlebyte/iso-8859-7': generate_single_byte_index,
    'singlebyte/iso-8859-8': generate_single_byte_index,
    'singlebyte/iso-8859-10': generate_single_byte_index,
    'singlebyte/iso-8859-13': generate_single_byte_index,
    'singlebyte/iso-8859-14': generate_single_byte_index,
    'singlebyte/iso-8859-15': generate_single_byte_index,
    'singlebyte/iso-8859-16': generate_single_byte_index,
    'singlebyte/koi8-r': generate_single_byte_index,
    'singlebyte/koi8-u': generate_single_byte_index,
    'singlebyte/macintosh': generate_single_byte_index,
    'singlebyte/windows-874': generate_single_byte_index,
    'singlebyte/windows-1250': generate_single_byte_index,
    'singlebyte/windows-1251': generate_single_byte_index,
    'singlebyte/windows-1252': generate_single_byte_index,
    'singlebyte/windows-1253': generate_single_byte_index,
    'singlebyte/windows-1254': generate_single_byte_index,
    'singlebyte/windows-1255': generate_single_byte_index,
    'singlebyte/windows-1256': generate_single_byte_index,
    'singlebyte/windows-1257': generate_single_byte_index,
    'singlebyte/windows-1258': generate_single_byte_index,
    'singlebyte/x-mac-cyrillic': generate_single_byte_index,

    # two-level tables for the legacy multi-byte encodings
    'tradchinese/big5': generate_multi_byte_index,
    'korean/euc-kr': generate_multi_byte_index,
    'simpchinese/gb18030': generate_multi_byte_index,
    'japanese/jis0208': generate_multi_byte_index,
    'japanese/jis0212': generate_multi_byte_index,

    # range index stored as lower bounds only
    'simpchinese/gb18030-ranges': generate_multi_byte_range_lbound_index,
}
| |
if __name__ == '__main__':
    # Optional first argument: only indices whose name contains this
    # substring are regenerated (an empty string matches every index).
    wanted = sys.argv[1] if len(sys.argv) > 1 else ''
    # sorted() gives a deterministic processing order; plain dict iteration
    # order is arbitrary on Python 2.
    for path, generate in sorted(INDICES.items()):
        crate, _, index = path.partition('/')
        if wanted not in index:
            continue
        # Progress goes to stderr; the size is appended to the same line
        # once the generator returns.
        sys.stderr.write('generating index %s... ' % index)
        sys.stderr.flush()
        tablesz = generate(crate, index)
        sys.stderr.write('%d bytes.\n' % tablesz)
| |