# blob: 7a125a3b100db84ed17892da4e9c971bfad185db [file] [log] [blame]
# This is a part of rust-encoding.
# Copyright (c) 2013-2015, Kang Seonghoon.
# See README.md and LICENSE.txt for details.
import urllib
import sys
import os.path
def whatwg_index(name, comments):
    """Stream the (key, value) pairs of a WHATWG encoding index file.

    Downloads ``index-<name>.txt`` from encoding.spec.whatwg.org and parses
    it line by line.

    name -- index name as used in the file name (it is substituted into the
        URL as-is).
    comments -- list that is extended in place: every line starting with
        ``#`` is appended with that ``#`` replaced by ``//``.

    Yields ``(key, value)`` integer tuples built from the first two
    whitespace-separated fields of each remaining non-blank line.  Both
    fields are parsed with ``int(field, 0)``, so decimal and ``0x``-prefixed
    numbers are accepted; anything after the second field is ignored.
    """
    response = urllib.urlopen('http://encoding.spec.whatwg.org/index-%s.txt' % name)
    try:
        for line in response:
            line = line.strip()
            if not line:
                continue
            if line.startswith('#'):
                comments.append('//' + line[1:])
                continue
            fields = line.split(None, 2)
            key = int(fields[0], 0)
            value = int(fields[1], 0)
            yield key, value
    finally:
        # release the connection even when parsing fails or the consumer
        # abandons the generator before the file is exhausted
        response.close()
def mkdir_and_open(crate, name):
    """Open the output file ``<crate>/<name>.rs`` located next to this script.

    The directory ``crate`` (relative to the directory of this file) is
    created when it does not exist yet.  The file name is ``name`` with every
    ``-`` replaced by ``_`` plus the ``.rs`` extension.

    Returns the file object, opened in ``'wb'`` mode (any previous content is
    truncated).  Raises OSError when the directory cannot be created for any
    reason other than already being there.
    """
    dirname = os.path.join(os.path.dirname(__file__), crate)
    try:
        os.mkdir(dirname)
    except OSError:
        # an existing directory is the normal case when regenerating; every
        # other failure (permissions, missing parent, a file in the way, ...)
        # is a real error and must not be hidden
        if not os.path.isdir(dirname):
            raise
    filename = '%s.rs' % name.replace('-', '_')
    return open(os.path.join(dirname, filename), 'wb')
def write_header(f, name, comments):
    """Write the "autogenerated" banner of a generated file to ``f``.

    f -- writable file-like object; only its ``write`` method is used, so
        the function works the same under Python 2 and Python 3.
    name -- index name, inserted into the banner as ``index-<name>.txt``.
    comments -- iterable of already formatted comment lines (without the
        trailing newline); each one is written on its own line after the
        banner.
    """
    f.write('// AUTOGENERATED FROM index-%s.txt, ORIGINAL COMMENT FOLLOWS:\n' % name)
    f.write('//\n')
    for line in comments:
        f.write('%s\n' % (line,))
def write_comma_separated(f, prefix, l, width=80):
    """Write the items of ``l`` to ``f``, packed into lines of limited width.

    f -- writable file-like object; only its ``write`` method is used.
    prefix -- string put in front of every emitted line.
    l -- iterable of items; each is converted with ``str`` and is expected
        to carry its own separator (e.g. ``'12, '``), nothing is inserted
        between items.
    width -- an item is added to the current line only while
        ``len(prefix) + len(line so far) + len(item)`` stays within it.

    Trailing whitespace is stripped from every line.  An item that does not
    fit even on an empty line is still written, on a line of its own.
    """
    buffered = ''
    for item in l:
        item = str(item)
        if len(prefix) + len(buffered) + len(item) <= width:
            buffered += item
            continue
        # the line is full: flush it, unless nothing has been collected yet
        # (an oversized first item must not produce a prefix-only line)
        if buffered:
            f.write(prefix + buffered.rstrip() + '\n')
        buffered = item
    if buffered:
        f.write(prefix + buffered.rstrip() + '\n')
def make_minimal_trie(invdata, lowerlimit=0x10000):
    """Find the smallest two-level lookup table for the mapping ``invdata``.

    invdata -- dict from non-negative integer keys to values.
    lowerlimit -- exclusive upper bound for the length of the lower table.

    For every block size ``1 << triebits`` (``triebits`` in 0..20) the key
    space ``[0, max(invdata) + 1)`` is cut into blocks of that size.  Each
    distinct block is stored once in ``lower`` (missing keys become
    ``None``), and ``upper`` holds, per block, the offset of its content in
    ``lower``.  ``lower`` always starts with an all-``None`` block, so offset
    0 stands for "nothing mapped here".  A key ``k`` is then looked up as
    ``lower[upper[k >> triebits] + (k & ((1 << triebits) - 1))]``.

    Returns the ``(triebits, lower, upper)`` tuple minimizing
    ``len(lower) + len(upper)`` among the candidates with
    ``len(lower) < lowerlimit`` (the larger ``triebits`` wins a tie), or
    ``None`` when no candidate satisfies the limit.  Raises ValueError for
    an empty ``invdata``.
    """
    maxvalue = max(invdata) + 1
    best = 0xffffffff
    besttrie = None
    for triebits in range(21):
        blocksize = 1 << triebits
        if blocksize >= lowerlimit:
            # `lower` starts out with `blocksize` entries and only grows, so
            # neither this nor any larger block size can satisfy the limit
            break
        lower = [None] * blocksize
        upper = []
        lowermap = {tuple(lower): 0}
        for start in range(0, maxvalue, blocksize):
            blk = tuple(invdata.get(j) for j in range(start, start + blocksize))
            loweridx = lowermap.get(blk)
            if loweridx is None:
                # first occurrence of this block: append it to the lower table
                loweridx = len(lower)
                lowermap[blk] = loweridx
                lower.extend(blk)
            upper.append(loweridx)
        if len(lower) < lowerlimit and best >= len(lower) + len(upper):
            best = len(lower) + len(upper)
            besttrie = (triebits, lower, upper)
    return besttrie
def generate_single_byte_index(crate, name):
    """Generate ``<crate>/<name>.rs`` for a single-byte index.

    The index is read through ``whatwg_index``; it must map keys in
    ``[0, 128)`` to distinct values below 0xffff.  The generated Rust module
    contains a 128-entry forward table (0xffff marks a key without a value),
    a two-level backward table built by ``make_minimal_trie`` (lower entries
    are the key plus 0x80, or 0 when nothing maps there), the ``forward`` and
    ``backward`` lookup functions and a test macro invocation.

    Returns the total size of the generated tables in bytes (two bytes per
    forward and upper entry, one byte per lower entry).
    """
    modname = name.replace('-', '_')

    comments = []
    forward = [None] * 128   # key -> value
    backward = {}            # value -> key
    for pointer, codepoint in whatwg_index(name, comments):
        assert 0 <= pointer < 128
        assert 0 <= codepoint < 0xffff
        assert forward[pointer] is None
        assert codepoint not in backward
        forward[pointer] = codepoint
        backward[codepoint] = pointer

    # generate a trie with a minimal amount of data
    triebits, lower, upper = make_minimal_trie(backward, lowerlimit=0x10000)

    with mkdir_and_open(crate, name) as f:
        def put(text=''):
            # one output line; called without an argument it emits a blank line
            f.write(text + '\n')

        write_header(f, name, comments)
        put()
        put("static FORWARD_TABLE: &'static [u16] = &[")
        write_comma_separated(f, ' ',
            ['%d, ' % (0xffff if cp is None else cp) for cp in forward])
        put('];')
        put()
        put('/// Returns the index code point for pointer `code` in this index.')
        put('#[inline]')
        put('pub fn forward(code: u8) -> u16 {')
        put(' FORWARD_TABLE[(code - 0x80) as usize]')
        put('}')
        put()
        put("static BACKWARD_TABLE_LOWER: &'static [u8] = &[")
        write_comma_separated(f, ' ', ['%d, ' % (0 if v is None else v+0x80) for v in lower])
        put('];')
        put()
        put("static BACKWARD_TABLE_UPPER: &'static [u16] = &[")
        write_comma_separated(f, ' ', ['%d, ' % v for v in upper])
        put('];')
        put()
        put('/// Returns the index pointer for code point `code` in this index.')
        put('#[inline]')
        put('pub fn backward(code: u32) -> u8 {')
        put(' let offset = (code >> %d) as usize;' % triebits)
        put(' let offset = if offset < %d {BACKWARD_TABLE_UPPER[offset] as usize} else {0};' % len(upper))
        put(' BACKWARD_TABLE_LOWER[offset + ((code & %d) as usize)]' % ((1<<triebits)-1))
        put('}')
        put()
        put('#[cfg(test)]')
        put('single_byte_tests!(')
        put(' mod = %s' % modname)
        put(');')

    return 2 * len(forward) + len(lower) + 2 * len(upper)
def generate_multi_byte_index(crate, name):
    """Generate ``<crate>/<name>.rs`` for a multi-byte index.

    The index is read through ``whatwg_index``.  Keys must be distinct and
    below 0xffff; values must be below 0x110000, must not be 0xffff (used as
    the "no value" marker) and, when they do not fit in 16 bits, must lie in
    0x20000..0x2ffff.  The generated Rust module contains:

    * ``FORWARD_TABLE`` with the low 16 bits of every value for the keys
      ``min(key)..max(key)``, plus ``FORWARD_TABLE_MORE`` (one bit per key,
      set when the value is 0x10000 or above) if any such value exists;
    * the two-level backward table built by ``make_minimal_trie`` from the
      first key seen for every value;
    * for ``jis0208`` only, ``BACKWARD_TABLE_REMAPPED`` and
      ``backward_remapped`` (see the comment in the body);
    * a test macro invocation listing the keys excluded from round-trip
      checks (``dups``).

    Returns the total size of the generated tables in bytes.
    """
    modname = name.replace('-', '_')

    data = {}      # key -> value
    invdata = {}   # value -> first key seen with that value
    dups = []      # keys whose value already belongs to an earlier key
    comments = []
    morebits = False
    for key, value in whatwg_index(name, comments):
        assert 0 <= key < 0xffff and 0 <= value < 0x110000 and value != 0xffff and key not in data
        if value >= 0x10000:
            # FORWARD_TABLE keeps only the low 16 bits and FORWARD_TABLE_MORE
            # restores a single bit (bit 17), so 0x2xxxx is the only
            # representable range above 0xffff.  The threshold has to match
            # the `>= 0x10000` test used when FORWARD_TABLE_MORE is built.
            assert (value >> 16) == 2
            morebits = True
        data[key] = value
        if value not in invdata:
            invdata[value] = key
        else:
            dups.append(key)

    # Big5 has four two-letter forward mappings, we use special entries for them
    if name == 'big5':
        specialidx = [1133, 1135, 1164, 1166]
        assert all(key not in data for key in specialidx)
        assert all(value not in invdata for value in range(len(specialidx)))
        for value, key in enumerate(specialidx):
            data[key] = value
            dups.append(key) # no consistency testing for them

    # generate a trie with a minimal amount of data
    triebits, lower, upper = make_minimal_trie(invdata, lowerlimit=0x10000)

    # JIS X 0208 index has two ranges [8272,8836) and [8836,11280) to support two slightly
    # different encodings EUC-JP and Shift_JIS; the default backward function would favor
    # the former, so we need a separate mapping for the latter.
    #
    # fortunately for us, all allocated codes in [8272,8836) have counterparts in others,
    # so we only need a smaller remapping from [8272,8836) to other counterparts.
    remap = None
    if name == 'jis0208':
        REMAP_MIN = 8272
        REMAP_MAX = 8835

        # sorted() makes "lowest key outside the remapped range wins"
        # explicit instead of relying on the dict's iteration order
        invdataminusremap = {}
        for key, value in sorted(data.items()):
            if value not in invdataminusremap and not REMAP_MIN <= key <= REMAP_MAX:
                invdataminusremap[value] = key

        remap = []
        for i in range(REMAP_MIN, REMAP_MAX+1):
            if i in data:
                assert data[i] in invdataminusremap
                value = invdataminusremap[data[i]]
                assert value < 0x10000
                remap.append(value)
            else:
                remap.append(0xffff)

    minkey = min(data)
    maxkey = max(data) + 1
    with mkdir_and_open(crate, name) as f:
        def put(text=''):
            # one output line; called without an argument it emits a blank line
            f.write(text + '\n')

        write_header(f, name, comments)
        put()
        put("static FORWARD_TABLE: &'static [u16] = &[")
        write_comma_separated(f, ' ',
            ['%d, ' % (data.get(key, 0xffff) & 0xffff) for key in range(minkey, maxkey)])
        put('];')
        if morebits:
            put()
            put("static FORWARD_TABLE_MORE: &'static [u32] = &[")
            bits = []
            for i in range(minkey, maxkey, 32):
                # pack the "value is 0x10000 or above" flags of 32 keys per word
                v = 0
                for j in range(32):
                    v |= (data.get(i+j, 0) >= 0x10000) << j
                bits.append(v)
            write_comma_separated(f, ' ', ['%d, ' % v for v in bits])
            put('];')
        put()
        put('/// Returns the index code point for pointer `code` in this index.')
        put('#[inline]')
        put('pub fn forward(code: u16) -> u32 {')
        if minkey != 0:
            put(' let code = (code as usize).wrapping_sub(%d);' % minkey)
        else:
            put(' let code = code as usize;')
        put(' if code < %d {' % (maxkey - minkey))
        if morebits:
            put(' (FORWARD_TABLE[code] as u32) | ' +
                '(((FORWARD_TABLE_MORE[code >> 5] >> (code & 31)) & 1) << 17)')
        else:
            put(' FORWARD_TABLE[code] as u32')
        put(' } else {')
        put(' 0xffff')
        put(' }')
        put('}')
        put()
        put("static BACKWARD_TABLE_LOWER: &'static [u16] = &[")
        write_comma_separated(f, ' ', ['%d, ' % (0xffff if v is None else v) for v in lower])
        put('];')
        put()
        put("static BACKWARD_TABLE_UPPER: &'static [u16] = &[")
        write_comma_separated(f, ' ', ['%d, ' % v for v in upper])
        put('];')
        if remap:
            put()
            put("static BACKWARD_TABLE_REMAPPED: &'static [u16] = &[")
            write_comma_separated(f, ' ', ['%d, ' % v for v in remap])
            put('];')
        put()
        put('/// Returns the index pointer for code point `code` in this index.')
        put('#[inline]')
        put('pub fn backward(code: u32) -> u16 {')
        put(' let offset = (code >> %d) as usize;' % triebits)
        put(' let offset = if offset < %d {BACKWARD_TABLE_UPPER[offset] as usize} else {0};' % len(upper))
        put(' BACKWARD_TABLE_LOWER[offset + ((code & %d) as usize)]' % ((1<<triebits)-1))
        put('}')
        if remap:
            put()
            assert name == 'jis0208'
            put('/// Returns the index shift_jis pointer for code point `code`.')
            put('#[inline]')
            put('pub fn backward_remapped(code: u32) -> u16 {')
            put(' let value = backward(code);')
            put(' if %d <= value && value <= %d {' % (REMAP_MIN, REMAP_MAX))
            put(' BACKWARD_TABLE_REMAPPED[(value - %d) as usize]' % REMAP_MIN)
            put(' } else {')
            put(' value')
            put(' }')
            put('}')
        put()
        put('#[cfg(test)]')
        put('multi_byte_tests!(')
        put(' mod = %s,' % modname)
        if remap:
            put(' remap = [%d, %d],' % (REMAP_MIN, REMAP_MAX))
        if dups:
            put(' dups = [')
            write_comma_separated(f, ' ', ['%d, ' % v for v in sorted(dups)])
            put(' ]')
        else:
            put(' dups = []')
        put(');')

    tablesz = 2 * (maxkey - minkey) + 2 * len(lower) + 2 * len(upper)
    if morebits:
        tablesz += 4 * ((maxkey - minkey + 31) // 32)
    if remap:
        tablesz += 2 * len(remap)
    return tablesz
def generate_multi_byte_range_lbound_index(crate, name):
    """Generate ``<crate>/<name>.rs`` for an index given as range lower bounds.

    The index is read through ``whatwg_index`` and must be non-empty and
    sorted.  A ``(0, 0)`` entry is put in front unless it is already the
    first one.  The generated Rust module stores the values in
    ``FORWARD_TABLE`` and the keys in ``BACKWARD_TABLE``; its ``forward`` and
    ``backward`` functions locate an entry with an unrolled binary search
    over one table and add the distance from that entry to the matching
    entry of the other table.

    Returns the total size of the two generated tables in bytes (four bytes
    per entry in each).
    """
    modname = name.replace('-', '_')

    comments = []
    data = list(whatwg_index(name, comments))
    assert data and data == sorted(data)

    # the reported bounds describe the index as published, i.e. before the
    # (0, 0) entry is added
    minkey, minvalue = data[0]
    maxkey, maxvalue = data[-1]
    if data[0] != (0, 0):
        data.insert(0, (0, 0))

    # largest power of two not exceeding the number of entries
    maxlog2 = 0
    while (2 << maxlog2) <= len(data):
        maxlog2 += 1
    first_probe = 2**maxlog2 - 1
    first_jump = len(data) - 2**maxlog2 + 1
    # (index suffix, step) per halving step of the unrolled search, largest first
    steps = [('+%d' % (2**b-1) if b > 0 else '', 2**b) for b in reversed(range(maxlog2))]

    if name == 'gb18030-ranges':
        keyubound = 0x110000
        valueubound = 126 * 10 * 126 * 10
    else:
        keyubound = maxkey + 1
        valueubound = maxvalue + 1

    with mkdir_and_open(crate, name) as f:
        def put(text=''):
            # one output line; called without an argument it emits a blank line
            f.write(text + '\n')

        write_header(f, name, comments)
        put()
        put("static FORWARD_TABLE: &'static [u32] = &[")
        write_comma_separated(f, ' ', ['%d, ' % value for key, value in data])
        put('];')
        put()
        put("static BACKWARD_TABLE: &'static [u32] = &[")
        write_comma_separated(f, ' ', ['%d, ' % key for key, value in data])
        put('];')
        put()
        put('/// Returns the index code point for pointer `code` in this index.')
        put('#[inline]')
        put('pub fn forward(code: u32) -> u32 {')
        if minkey > 0:
            put(' if code < %d { return 0xffffffff; }' % minkey)
        if name == 'gb18030-ranges': # has "invalid" region inside
            put(' if (code > 39419 && code < 189000) || code > 1237575 { return 0xffffffff; }')
        put(' let mut i = if code >= BACKWARD_TABLE[%d] {%d} else {0};' %
            (first_probe, first_jump))
        for step in steps:
            put(' if code >= BACKWARD_TABLE[i%s] { i += %d; }' % step)
        put(' (code - BACKWARD_TABLE[i-1]) + FORWARD_TABLE[i-1]')
        put('}')
        put()
        put('/// Returns the index pointer for code point `code` in this index.')
        put('#[inline]')
        put('pub fn backward(code: u32) -> u32 {')
        if minvalue > 0:
            put(' if code < %d { return 0xffffffff; }' % minvalue)
        put(' let mut i = if code >= FORWARD_TABLE[%d] {%d} else {0};' %
            (first_probe, first_jump))
        for step in steps:
            put(' if code >= FORWARD_TABLE[i%s] { i += %d; }' % step)
        put(' (code - FORWARD_TABLE[i-1]) + BACKWARD_TABLE[i-1]')
        put('}')
        put()
        put('#[cfg(test)]')
        put('multi_byte_range_tests!(')
        put(' mod = %s,' % modname)
        put(' key = [%d, %d], key < %d,' % (minkey, maxkey, keyubound))
        put(' value = [%d, %d], value < %d' % (minvalue, maxvalue, valueubound))
        put(');')

    return 8 * len(data)
# Maps '<crate>/<index name>' to the function generating that index.  The
# part before the slash is the output directory, the part after it is the
# index name handed to the generator.
INDICES = dict.fromkeys([
    'singlebyte/ibm866',
    'singlebyte/iso-8859-2',
    'singlebyte/iso-8859-3',
    'singlebyte/iso-8859-4',
    'singlebyte/iso-8859-5',
    'singlebyte/iso-8859-6',
    'singlebyte/iso-8859-7',
    'singlebyte/iso-8859-8',
    'singlebyte/iso-8859-10',
    'singlebyte/iso-8859-13',
    'singlebyte/iso-8859-14',
    'singlebyte/iso-8859-15',
    'singlebyte/iso-8859-16',
    'singlebyte/koi8-r',
    'singlebyte/koi8-u',
    'singlebyte/macintosh',
    'singlebyte/windows-874',
    'singlebyte/windows-1250',
    'singlebyte/windows-1251',
    'singlebyte/windows-1252',
    'singlebyte/windows-1253',
    'singlebyte/windows-1254',
    'singlebyte/windows-1255',
    'singlebyte/windows-1256',
    'singlebyte/windows-1257',
    'singlebyte/windows-1258',
    'singlebyte/x-mac-cyrillic',
], generate_single_byte_index)
INDICES.update(dict.fromkeys([
    'tradchinese/big5',
    'korean/euc-kr',
    'simpchinese/gb18030',
    'japanese/jis0208',
    'japanese/jis0212',
], generate_multi_byte_index))
INDICES['simpchinese/gb18030-ranges'] = generate_multi_byte_range_lbound_index
if __name__ == '__main__':
    # Regenerate every index in INDICES, or only those whose name contains
    # the optional first command-line argument.  Progress and the resulting
    # table size of each index are reported on stderr.
    name_filter = sys.argv[1] if len(sys.argv) > 1 else ''
    for path, generate in INDICES.items():
        crate, _, index = path.partition('/')
        if name_filter not in index:
            continue
        sys.stderr.write('generating index %s...' % index)
        tablesz = generate(crate, index)
        sys.stderr.write(' %d bytes.\n' % tablesz)