mirror of
https://github.com/python/cpython.git
synced 2026-05-06 12:49:07 -04:00
gh-80667: Fix Tangut ideographs names in unicodedata (GH-144789)
Co-authored-by: Pierre Le Marre <dev@wismill.eu>
This commit is contained in:
@@ -111,6 +111,30 @@ class UnicodeNamesTest(unittest.TestCase):
|
||||
self.checkletter("cjK UniFIeD idEogRAph-2aBcD", "\U0002abcd")
|
||||
self.checkletter("CJk uNIfiEd IDeOGraPH-2AbCd", "\U0002abcd")
|
||||
|
||||
def test_tangut_ideographs(self):
|
||||
self.checkletter("TANGUT IDEOGRAPH-17000", "\U00017000")
|
||||
self.checkletter("TANGUT IDEOGRAPH-187FF", "\U000187ff")
|
||||
self.checkletter("TANGUT IDEOGRAPH-18D00", "\U00018D00")
|
||||
self.checkletter("TANGUT IDEOGRAPH-18D1E", "\U00018d1e")
|
||||
self.checkletter("tangut ideograph-18d1e", "\U00018d1e")
|
||||
|
||||
def test_egyptian_hieroglyphs(self):
|
||||
self.checkletter("EGYPTIAN HIEROGLYPH-13460", "\U00013460")
|
||||
self.checkletter("EGYPTIAN HIEROGLYPH-143FA", "\U000143fa")
|
||||
self.checkletter("egyptian hieroglyph-143fa", "\U000143fa")
|
||||
|
||||
def test_khitan_small_script_characters(self):
|
||||
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18B00", "\U00018b00")
|
||||
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CD5", "\U00018cd5")
|
||||
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CFF", "\U00018cff")
|
||||
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CFF", "\U00018cff")
|
||||
self.checkletter("khitan small script character-18cff", "\U00018cff")
|
||||
|
||||
def test_nushu_characters(self):
|
||||
self.checkletter("NUSHU CHARACTER-1B170", "\U0001b170")
|
||||
self.checkletter("NUSHU CHARACTER-1B2FB", "\U0001b2fb")
|
||||
self.checkletter("nushu character-1b2fb", "\U0001b2fb")
|
||||
|
||||
def test_bmp_characters(self):
|
||||
for code in range(0x10000):
|
||||
char = chr(code)
|
||||
|
||||
@@ -128,6 +128,60 @@ class BaseUnicodeFunctionsTest:
|
||||
result = h.hexdigest()
|
||||
self.assertEqual(result, self.expectedchecksum)
|
||||
|
||||
def test_name(self):
|
||||
name = self.db.name
|
||||
self.assertRaises(ValueError, name, '\0')
|
||||
self.assertRaises(ValueError, name, '\n')
|
||||
self.assertRaises(ValueError, name, '\x1F')
|
||||
self.assertRaises(ValueError, name, '\x7F')
|
||||
self.assertRaises(ValueError, name, '\x9F')
|
||||
self.assertRaises(ValueError, name, '\uFFFE')
|
||||
self.assertRaises(ValueError, name, '\uFFFF')
|
||||
self.assertRaises(ValueError, name, '\U0010FFFF')
|
||||
self.assertEqual(name('\U0010FFFF', 42), 42)
|
||||
|
||||
self.assertEqual(name(' '), 'SPACE')
|
||||
self.assertEqual(name('1'), 'DIGIT ONE')
|
||||
self.assertEqual(name('A'), 'LATIN CAPITAL LETTER A')
|
||||
self.assertEqual(name('\xA0'), 'NO-BREAK SPACE')
|
||||
self.assertEqual(name('\u0221', None), None if self.old else
|
||||
'LATIN SMALL LETTER D WITH CURL')
|
||||
self.assertEqual(name('\u3400'), 'CJK UNIFIED IDEOGRAPH-3400')
|
||||
self.assertEqual(name('\u9FA5'), 'CJK UNIFIED IDEOGRAPH-9FA5')
|
||||
self.assertEqual(name('\uAC00'), 'HANGUL SYLLABLE GA')
|
||||
self.assertEqual(name('\uD7A3'), 'HANGUL SYLLABLE HIH')
|
||||
self.assertEqual(name('\uF900'), 'CJK COMPATIBILITY IDEOGRAPH-F900')
|
||||
self.assertEqual(name('\uFA6A'), 'CJK COMPATIBILITY IDEOGRAPH-FA6A')
|
||||
self.assertEqual(name('\uFBF9'),
|
||||
'ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA '
|
||||
'ABOVE WITH ALEF MAKSURA ISOLATED FORM')
|
||||
self.assertEqual(name('\U00013460', None), None if self.old else
|
||||
'EGYPTIAN HIEROGLYPH-13460')
|
||||
self.assertEqual(name('\U000143FA', None), None if self.old else
|
||||
'EGYPTIAN HIEROGLYPH-143FA')
|
||||
self.assertEqual(name('\U00017000', None), None if self.old else
|
||||
'TANGUT IDEOGRAPH-17000')
|
||||
self.assertEqual(name('\U00018B00', None), None if self.old else
|
||||
'KHITAN SMALL SCRIPT CHARACTER-18B00')
|
||||
self.assertEqual(name('\U00018CD5', None), None if self.old else
|
||||
'KHITAN SMALL SCRIPT CHARACTER-18CD5')
|
||||
self.assertEqual(name('\U00018CFF', None), None if self.old else
|
||||
'KHITAN SMALL SCRIPT CHARACTER-18CFF')
|
||||
self.assertEqual(name('\U00018D1E', None), None if self.old else
|
||||
'TANGUT IDEOGRAPH-18D1E')
|
||||
self.assertEqual(name('\U0001B170', None), None if self.old else
|
||||
'NUSHU CHARACTER-1B170')
|
||||
self.assertEqual(name('\U0001B2FB', None), None if self.old else
|
||||
'NUSHU CHARACTER-1B2FB')
|
||||
self.assertEqual(name('\U0001FBA8', None), None if self.old else
|
||||
'BOX DRAWINGS LIGHT DIAGONAL UPPER CENTRE TO '
|
||||
'MIDDLE LEFT AND MIDDLE RIGHT TO LOWER CENTRE')
|
||||
self.assertEqual(name('\U0002A6D6'), 'CJK UNIFIED IDEOGRAPH-2A6D6')
|
||||
self.assertEqual(name('\U0002FA1D'), 'CJK COMPATIBILITY IDEOGRAPH-2FA1D')
|
||||
self.assertEqual(name('\U00033479', None), None if self.old else
|
||||
'CJK UNIFIED IDEOGRAPH-33479')
|
||||
|
||||
@requires_resource('cpu')
|
||||
def test_name_inverse_lookup(self):
|
||||
for char in iterallchars():
|
||||
looked_name = self.db.name(char, None)
|
||||
@@ -151,6 +205,17 @@ class BaseUnicodeFunctionsTest:
|
||||
"HANDBUG",
|
||||
"MODIFIER LETTER CYRILLIC SMALL QUESTION MARK",
|
||||
"???",
|
||||
"CJK UNIFIED IDEOGRAPH-03400",
|
||||
"CJK UNIFIED IDEOGRAPH-020000",
|
||||
"CJK UNIFIED IDEOGRAPH-33FF",
|
||||
"CJK UNIFIED IDEOGRAPH-F900",
|
||||
"CJK UNIFIED IDEOGRAPH-13460",
|
||||
"CJK UNIFIED IDEOGRAPH-17000",
|
||||
"CJK UNIFIED IDEOGRAPH-18B00",
|
||||
"CJK UNIFIED IDEOGRAPH-1B170",
|
||||
"CJK COMPATIBILITY IDEOGRAPH-3400",
|
||||
"TANGUT IDEOGRAPH-3400",
|
||||
"HANGUL SYLLABLE AC00",
|
||||
]:
|
||||
self.assertRaises(KeyError, self.db.lookup, nonexistent)
|
||||
|
||||
@@ -613,7 +678,47 @@ class UnicodeFunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
|
||||
# (e.g. 'make distclean && make') to get the correct checksum.
|
||||
expectedchecksum = ('83cc43a2fbb779185832b4c049217d80b05bf349'
|
||||
if quicktest else
|
||||
'65670ae03a324c5f9e826a4de3e25bae4d73c9b7')
|
||||
'180bdc91143d8aa2eb9dd6726e66d37606205942')
|
||||
|
||||
@requires_resource('network')
def test_all_names(self):
    """Check character names against the DerivedName.txt test data.

    The data object is obtained from ``download_test_data_file`` and is
    used as a context manager around the call to ``run_name_tests``.
    """
    names_data = download_test_data_file("DerivedName.txt")

    with names_data:
        self.run_name_tests(names_data)
|
||||
|
||||
def run_name_tests(self, testdata):
    """Compare ``self.db.name()`` for every code point with reference data.

    *testdata* is an iterable of text lines in the DerivedName.txt layout:
    ``CODE ; NAME`` for a single code point, or ``FIRST..LAST ; PREFIX*``
    for a range.  A trailing ``*`` stands for the code point written as at
    least four upper-case hexadecimal digits.  Blank lines and lines
    starting with ``#`` are ignored.

    Every code point from 0 to ``sys.maxunicode`` is then looked up with a
    ``None`` default; code points the data does not mention are expected
    to have no name.  A malformed data line or a name mismatch fails the
    test, and a mismatch reports the offending code point.
    """
    names_ref = {}

    # Parse data
    for line in testdata:
        line = line.strip()
        if not line or line.startswith("#"):
            continue

        raw_cp, sep, name = line.partition(";")
        name = name.strip()
        if not sep or not name:
            self.fail(f"malformed data line: {line!r}")

        # "FIRST..LAST" denotes a range; a lone code point is a range of one.
        first, _, last = raw_cp.partition("..")
        cp1 = int(first, 16)
        cp2 = int(last, 16) if last else cp1

        if name.endswith("*"):
            # The '*' is a placeholder for the hexadecimal code point.
            prefix = name[:-1]
            for cp in range(cp1, cp2 + 1):
                names_ref[cp] = f"{prefix}{cp:04X}"
        elif last:
            # An explicit check rather than `assert`, which -O would strip.
            self.fail(f"range without a '*' name pattern: {line!r}")
        else:
            if "*" in name:
                self.fail(f"unexpected '*' inside a name: {line!r}")
            names_ref[cp1] = name

    lookup = self.db.name
    for cp in range(sys.maxunicode + 1):
        actual = lookup(chr(cp), None)
        expected = names_ref.get(cp)
        # Compare first so the message is only built for real mismatches.
        if actual != expected:
            self.assertEqual(actual, expected,
                             f"wrong name for U+{cp:04X}")
|
||||
|
||||
def test_isxidstart(self):
|
||||
self.assertTrue(self.db.isxidstart('S'))
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
Add support for Tangut ideograph names in :mod:`unicodedata`.
|
||||
+62
-41
@@ -1052,22 +1052,18 @@ static const char * const hangul_syllables[][3] = {
|
||||
{ 0, 0, "H" }
|
||||
};
|
||||
|
||||
/* These ranges need to match makeunicodedata.py:cjk_ranges. */
|
||||
static int
|
||||
is_unified_ideograph(Py_UCS4 code)
|
||||
find_prefix_id(Py_UCS4 code)
|
||||
{
|
||||
return
|
||||
(0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
|
||||
(0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */
|
||||
(0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
|
||||
(0x2A700 <= code && code <= 0x2B73F) || /* CJK Ideograph Extension C */
|
||||
(0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
|
||||
(0x2B820 <= code && code <= 0x2CEAD) || /* CJK Ideograph Extension E */
|
||||
(0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
|
||||
(0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */
|
||||
(0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
|
||||
(0x31350 <= code && code <= 0x323AF) || /* CJK Ideograph Extension H */
|
||||
(0x323B0 <= code && code <= 0x33479); /* CJK Ideograph Extension J */
|
||||
for (int i = 0; i < (int)Py_ARRAY_LENGTH(derived_name_ranges); i++) {
|
||||
if (code < derived_name_ranges[i].first) {
|
||||
return -1;
|
||||
}
|
||||
if (code <= derived_name_ranges[i].last) {
|
||||
return derived_name_ranges[i].prefixid;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* macros used to determine if the given code point is in the PUA range that
|
||||
@@ -1345,7 +1341,9 @@ _getucname(PyObject *self,
|
||||
}
|
||||
}
|
||||
|
||||
if (SBase <= code && code < SBase+SCount) {
|
||||
int prefixid = find_prefix_id(code);
|
||||
if (prefixid == 0) {
|
||||
assert(SBase <= code && code < SBase+SCount);
|
||||
/* Hangul syllable. */
|
||||
int SIndex = code - SBase;
|
||||
int L = SIndex / NCount;
|
||||
@@ -1367,11 +1365,11 @@ _getucname(PyObject *self,
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (is_unified_ideograph(code)) {
|
||||
if (buflen < 28)
|
||||
/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
|
||||
if (prefixid > 0) {
|
||||
const char *prefix = derived_name_prefixes[prefixid];
|
||||
if (snprintf(buffer, buflen, "%s%04X", prefix, code) >= buflen) {
|
||||
return 0;
|
||||
sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -1428,6 +1426,35 @@ _check_alias_and_seq(Py_UCS4* code, int with_named_seq)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static Py_UCS4
|
||||
parse_hex_code(const char *name, int namelen)
|
||||
{
|
||||
if (namelen < 4 || namelen > 6) {
|
||||
return (Py_UCS4)-1;
|
||||
}
|
||||
if (*name == '0') {
|
||||
return (Py_UCS4)-1;
|
||||
}
|
||||
int v = 0;
|
||||
while (namelen--) {
|
||||
v *= 16;
|
||||
Py_UCS1 c = Py_TOUPPER(*name);
|
||||
if (c >= '0' && c <= '9') {
|
||||
v += c - '0';
|
||||
}
|
||||
else if (c >= 'A' && c <= 'F') {
|
||||
v += c - 'A' + 10;
|
||||
}
|
||||
else {
|
||||
return (Py_UCS4)-1;
|
||||
}
|
||||
name++;
|
||||
}
|
||||
if (v > 0x10ffff) {
|
||||
return (Py_UCS4)-1;
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
static int
|
||||
_getcode(const char* name, int namelen, Py_UCS4* code)
|
||||
@@ -1436,8 +1463,19 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
|
||||
* Named aliases are not resolved, they are returned as a code point in the
|
||||
* PUA */
|
||||
|
||||
/* Check for hangul syllables. */
|
||||
if (PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0) {
|
||||
int i = 0;
|
||||
size_t prefixlen;
|
||||
for (; i < (int)Py_ARRAY_LENGTH(derived_name_prefixes); i++) {
|
||||
const char *prefix = derived_name_prefixes[i];
|
||||
prefixlen = strlen(derived_name_prefixes[i]);
|
||||
if (PyOS_strnicmp(name, prefix, prefixlen) == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (i == 0) {
|
||||
/* Hangul syllables. */
|
||||
assert(PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0);
|
||||
int len, L = -1, V = -1, T = -1;
|
||||
const char *pos = name + 16;
|
||||
find_syllable(pos, &len, &L, LCount, 0);
|
||||
@@ -1454,28 +1492,11 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Check for unified ideographs. */
|
||||
if (PyOS_strnicmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
|
||||
/* Four or five hexdigits must follow. */
|
||||
unsigned int v;
|
||||
v = 0;
|
||||
name += 22;
|
||||
namelen -= 22;
|
||||
if (namelen != 4 && namelen != 5)
|
||||
if (i < (int)Py_ARRAY_LENGTH(derived_name_prefixes)) {
|
||||
Py_UCS4 v = parse_hex_code(name + prefixlen, namelen - prefixlen);
|
||||
if (find_prefix_id(v) != i) {
|
||||
return 0;
|
||||
while (namelen--) {
|
||||
v *= 16;
|
||||
Py_UCS1 c = Py_TOUPPER(*name);
|
||||
if (c >= '0' && c <= '9')
|
||||
v += c - '0';
|
||||
else if (c >= 'A' && c <= 'F')
|
||||
v += c - 'A' + 10;
|
||||
else
|
||||
return 0;
|
||||
name++;
|
||||
}
|
||||
if (!is_unified_ideograph(v))
|
||||
return 0;
|
||||
*code = v;
|
||||
return 1;
|
||||
}
|
||||
|
||||
Generated
+28
@@ -19684,3 +19684,31 @@ static const named_sequence named_sequences[] = {
|
||||
{2, {0x02E5, 0x02E9}},
|
||||
{2, {0x02E9, 0x02E5}},
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
Py_UCS4 first;
|
||||
Py_UCS4 last;
|
||||
int prefixid;
|
||||
} derived_name_range;
|
||||
|
||||
static const derived_name_range derived_name_ranges[] = {
|
||||
{0x3400, 0x4DBF, 1},
|
||||
{0x4E00, 0x9FFF, 1},
|
||||
{0xAC00, 0xD7A3, 0},
|
||||
{0x17000, 0x187FF, 2},
|
||||
{0x18D00, 0x18D1E, 2},
|
||||
{0x20000, 0x2A6DF, 1},
|
||||
{0x2A700, 0x2B73F, 1},
|
||||
{0x2B740, 0x2B81D, 1},
|
||||
{0x2B820, 0x2CEAD, 1},
|
||||
{0x2CEB0, 0x2EBE0, 1},
|
||||
{0x2EBF0, 0x2EE5D, 1},
|
||||
{0x30000, 0x3134A, 1},
|
||||
{0x31350, 0x323AF, 1},
|
||||
{0x323B0, 0x33479, 1},
|
||||
};
|
||||
static const char * const derived_name_prefixes[] = {
|
||||
"HANGUL SYLLABLE ",
|
||||
"CJK UNIFIED IDEOGRAPH-",
|
||||
"TANGUT IDEOGRAPH-",
|
||||
};
|
||||
|
||||
@@ -109,19 +109,13 @@ CASE_IGNORABLE_MASK = 0x1000
|
||||
CASED_MASK = 0x2000
|
||||
EXTENDED_CASE_MASK = 0x4000
|
||||
|
||||
# these ranges need to match unicodedata.c:is_unified_ideograph
|
||||
cjk_ranges = [
|
||||
('3400', '4DBF'), # CJK Ideograph Extension A CJK
|
||||
('4E00', '9FFF'), # CJK Ideograph
|
||||
('20000', '2A6DF'), # CJK Ideograph Extension B
|
||||
('2A700', '2B73F'), # CJK Ideograph Extension C
|
||||
('2B740', '2B81D'), # CJK Ideograph Extension D
|
||||
('2B820', '2CEAD'), # CJK Ideograph Extension E
|
||||
('2CEB0', '2EBE0'), # CJK Ideograph Extension F
|
||||
('2EBF0', '2EE5D'), # CJK Ideograph Extension I
|
||||
('30000', '3134A'), # CJK Ideograph Extension G
|
||||
('31350', '323AF'), # CJK Ideograph Extension H
|
||||
('323B0', '33479'), # CJK Ideograph Extension J
|
||||
# Maps the range names in UnicodeData.txt to prefixes for
|
||||
# derived names specified by rule NR2.
|
||||
# Hangul should always be at index 0, since it uses a special format.
|
||||
derived_name_range_names = [
|
||||
("Hangul Syllable", "HANGUL SYLLABLE "),
|
||||
("CJK Ideograph", "CJK UNIFIED IDEOGRAPH-"),
|
||||
("Tangut Ideograph", "TANGUT IDEOGRAPH-"),
|
||||
]
|
||||
|
||||
|
||||
@@ -135,7 +129,7 @@ def maketables(trace=0):
|
||||
|
||||
for version in old_versions:
|
||||
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
|
||||
old_unicode = UnicodeData(version, cjk_check=False)
|
||||
old_unicode = UnicodeData(version, ideograph_check=False)
|
||||
print(len(list(filter(None, old_unicode.table))), "characters")
|
||||
merge_old_version(version, unicode, old_unicode)
|
||||
|
||||
@@ -731,6 +725,23 @@ def makeunicodename(unicode, trace):
|
||||
fprint(' {%d, {%s}},' % (len(sequence), seq_str))
|
||||
fprint('};')
|
||||
|
||||
fprint(dedent("""
|
||||
typedef struct {
|
||||
Py_UCS4 first;
|
||||
Py_UCS4 last;
|
||||
int prefixid;
|
||||
} derived_name_range;
|
||||
"""))
|
||||
|
||||
fprint('static const derived_name_range derived_name_ranges[] = {')
|
||||
for name_range in unicode.derived_name_ranges:
|
||||
fprint(' {0x%s, 0x%s, %d},' % name_range)
|
||||
fprint('};')
|
||||
|
||||
fprint('static const char * const derived_name_prefixes[] = {')
|
||||
for _, prefix in derived_name_range_names:
|
||||
fprint(' "%s",' % prefix)
|
||||
fprint('};')
|
||||
|
||||
def merge_old_version(version, new, old):
|
||||
# Changes to exclusion file not implemented yet
|
||||
@@ -946,14 +957,14 @@ def from_row(row: List[str]) -> UcdRecord:
|
||||
class UnicodeData:
|
||||
# table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned
|
||||
|
||||
def __init__(self, version, cjk_check=True):
|
||||
def __init__(self, version, ideograph_check=True):
|
||||
self.changed = []
|
||||
table = [None] * 0x110000
|
||||
for s in UcdFile(UNICODE_DATA, version):
|
||||
char = int(s[0], 16)
|
||||
table[char] = from_row(s)
|
||||
|
||||
cjk_ranges_found = []
|
||||
self.derived_name_ranges = []
|
||||
|
||||
# expand first-last ranges
|
||||
field = None
|
||||
@@ -967,15 +978,15 @@ class UnicodeData:
|
||||
s.name = ""
|
||||
field = dataclasses.astuple(s)[:15]
|
||||
elif s.name[-5:] == "Last>":
|
||||
if s.name.startswith("<CJK Ideograph"):
|
||||
cjk_ranges_found.append((field[0],
|
||||
s.codepoint))
|
||||
for j, (rangename, _) in enumerate(derived_name_range_names):
|
||||
if s.name.startswith("<" + rangename):
|
||||
self.derived_name_ranges.append(
|
||||
(field[0], s.codepoint, j))
|
||||
break
|
||||
s.name = ""
|
||||
field = None
|
||||
elif field:
|
||||
table[i] = from_row(('%X' % i,) + field[1:])
|
||||
if cjk_check and cjk_ranges != cjk_ranges_found:
|
||||
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
|
||||
|
||||
# public attributes
|
||||
self.filename = UNICODE_DATA % ''
|
||||
|
||||
Reference in New Issue
Block a user