mirror of
https://github.com/python/cpython.git
synced 2026-06-22 11:03:15 -04:00
ae62b53928
It was used to generate the _expat_decoding_table attribute of CodecInfo.
108 lines
3.0 KiB
Python
108 lines
3.0 KiB
Python
import sys
|
|
|
|
|
|
def info(msg, *args):
    """Write a diagnostic line to standard error.

    *msg* is a %-style format string and *args* are its arguments.
    """
    text = msg % args
    print(text, file=sys.stderr)
|
|
|
|
def fail(msg, *args):
    """Report a fatal problem through info() and exit with status 2.

    *msg* and *args* are forwarded unchanged to info().
    """
    info(msg, *args)
    raise SystemExit(2)
|
|
|
|
# ASCII characters that can appear in a well-formed XML document
|
|
# except the characters "$@\^`{}~".
|
|
# Stored as a set of integer byte values: tab, LF, CR and the printable
# ASCII range 32..126, minus the eight excluded punctuation characters.
compulsory_chars = {*b'\t\n\r', *range(32, 127)} - {*br'$@\^`{}~'}
|
|
|
|
def check_compatibility(encoding):
    """Return True if *encoding* decodes every compulsory byte as ASCII.

    Each byte value in compulsory_chars is decoded on its own with
    *encoding*.  The check stops at the first byte that either cannot be
    decoded or decodes to something other than the character with the
    same code point; the problem is reported through info() and False is
    returned.
    """
    for code in compulsory_chars:
        expected = chr(code)
        try:
            actual = bytes([code]).decode(encoding)
        except UnicodeDecodeError:
            info('Cannot decode byte %#04x (%a)', code, expected)
            return False
        if actual != expected:
            info('Incompatible encoding: %#04x (%a) -> %a',
                 code, expected, actual)
            return False
    return True
|
|
|
|
def create_table(encoding):
    """Build the 256-entry table indexed by first byte for *encoding*.

    Every code point from U+0000 to U+10FFFF is encoded with *encoding*;
    code points the codec rejects with UnicodeEncodeError are skipped.
    The first byte of each encoded form selects the table entry:

    * -1: no encodable character starts with this byte;
    * a value >= 0: the code point whose encoding is exactly this byte;
    * -n (n from 2 to 4): the byte starts an n-byte sequence.

    A byte that would be both a single-byte character and the start of a
    longer sequence, a byte that starts sequences of different lengths,
    and any sequence longer than 4 bytes are reported through fail().
    Two characters sharing the same single-byte encoding only produce an
    info() message; the one met first (the lower code point) is kept.

    Returns the table as a list of 256 integers.
    """
    mapping = [-1] * 256

    # Only used as a flag so that just the first non-BMP character that
    # the codec can encode is reported.
    non_bmp = None
    for i in range(0x110000):
        char = chr(i)
        try:
            encoded = char.encode(encoding)
        except UnicodeEncodeError:
            continue
        if i >= 0x10000 and non_bmp is None:
            non_bmp = i
            info('Non-BMP character: %r (U+%04X)', char, i)
        length = len(encoded)
        k = encoded[0]
        v = mapping[k]
        if length == 1:
            if v == -1:
                mapping[k] = i
            elif v < 0:
                # The byte was already recorded as the start of a
                # (-v)-byte sequence.  The original passed the undefined
                # name ``c`` here, which raised NameError instead of
                # printing the diagnostic.
                fail('Ambiguous mapping for %#04x: '
                     '%r (U+%04X) and %d-byte sequence',
                     k, char, i, -v)
            else:
                info('Ambiguous mapping for %#04x: '
                     '%r (U+%04X) and %r (U+%04X)',
                     k, chr(v), v, char, i)
        else:
            if length > 4:
                fail('Too long encoding for %r (U+%04X): %r',
                     char, i, encoded)
            if v == -1:
                mapping[k] = -length
            elif v != -length:
                if v < 0:
                    fail('Ambiguous mapping for %#04x: '
                         '%d-byte sequence and %d-byte sequence %r',
                         k, -v, length, encoded)
                else:
                    fail('Ambiguous mapping for %#04x: '
                         '%r (U+%04X) and %d-byte sequence %r',
                         k, chr(v), v, length, encoded)
    return mapping
|
|
|
|
def print_table(mapping):
    """Print *mapping* as an ``_expat_decoding_table=(...)`` source line.

    When the first 128 entries are exactly 0..127 they are abbreviated
    to ``*range(128),`` and only entries 128..255 are listed; otherwise
    all 256 entries are listed.  Non-negative entries are written in
    hexadecimal, negative ones in decimal.  Rows are only broken at
    indexes that are multiples of 8: a row is flushed once the pending
    text grows past 68 characters, cut at the previous such boundary.
    """
    indent = ' ' * 12
    print(' '*8 + '_expat_decoding_table=(', end='')
    ascii_identity = mapping[:128] == list(range(128))
    if ascii_identity:
        print('*range(128),')
    else:
        print()
    first = 128 if ascii_identity else 0

    pending = ''    # text accumulated but not yet printed
    boundary = 0    # length of `pending` at the last multiple-of-8 index
    for index in range(first, 257):
        if index % 8 == 0:
            if len(pending) > 68:
                print(indent + pending[:boundary].rstrip())
                pending = pending[boundary:]
            boundary = len(pending)
        if index == 256:
            # 256 is visited only to run the final boundary check above.
            break
        value = mapping[index]
        item = hex(value) if value >= 0 else str(value)
        pending += item + ', '
    print(indent + pending.rstrip().rstrip(',') + '),')
|
|
|
|
|
|
# The encoding name is taken from the first command-line argument.
encoding = sys.argv[1]

if check_compatibility(encoding):
    mapping = create_table(encoding)
    print_table(mapping)
else:
    # Incompatible encoding: emit the "no table" marker and exit with
    # status 1.
    print(' '*8 + '_expat_decoding_table=False,')
    sys.exit(1)
|