mirror of
https://github.com/python/cpython.git
synced 2026-05-06 12:49:07 -04:00
gh-142224: unicodedata: support bidi classes for unassigned code points (GH-144815)
This commit is contained in:
@@ -319,7 +319,7 @@ class BaseUnicodeFunctionsTest:
|
||||
self.assertRaises(TypeError, self.db.category, 'xx')
|
||||
|
||||
def test_bidirectional(self):
|
||||
self.assertEqual(self.db.bidirectional('\uFFFE'), '')
|
||||
self.assertEqual(self.db.bidirectional('\uFFFE'), 'BN')
|
||||
self.assertEqual(self.db.bidirectional(' '), 'WS')
|
||||
self.assertEqual(self.db.bidirectional('A'), 'L')
|
||||
self.assertEqual(self.db.bidirectional('\U00020000'), 'L')
|
||||
@@ -347,6 +347,17 @@ class BaseUnicodeFunctionsTest:
|
||||
self.assertRaises(TypeError, self.db.bidirectional)
|
||||
self.assertRaises(TypeError, self.db.bidirectional, 'xx')
|
||||
|
||||
def test_bidirectional_unassigned(self):
|
||||
if self.old:
|
||||
return
|
||||
self.assertEqual(self.db.bidirectional('\u0378'), 'L')
|
||||
self.assertEqual(self.db.bidirectional('\u077F'), 'AL')
|
||||
self.assertEqual(self.db.bidirectional('\u20CF'), 'ET')
|
||||
self.assertEqual(self.db.bidirectional('\u0590'), 'R')
|
||||
self.assertEqual(self.db.bidirectional('\uFFFF'), 'BN')
|
||||
self.assertEqual(self.db.bidirectional('\U0001FFFE'), 'BN')
|
||||
self.assertEqual(self.db.bidirectional('\U00010D01'), 'AL')
|
||||
|
||||
def test_decomposition(self):
|
||||
self.assertEqual(self.db.decomposition('\uFFFE'),'')
|
||||
self.assertEqual(self.db.decomposition('\u00bc'), '<fraction> 0031 2044 0034')
|
||||
@@ -676,9 +687,9 @@ class UnicodeFunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
|
||||
|
||||
# Update this if the database changes. Make sure to do a full rebuild
|
||||
# (e.g. 'make distclean && make') to get the correct checksum.
|
||||
expectedchecksum = ('83cc43a2fbb779185832b4c049217d80b05bf349'
|
||||
expectedchecksum = ('668dbbea1136e69d4f00677a5988b23bc78aefc6'
|
||||
if quicktest else
|
||||
'180bdc91143d8aa2eb9dd6726e66d37606205942')
|
||||
'b869af769bd8fe352c04622ab90533dc54df5cf3')
|
||||
|
||||
@requires_resource('network')
|
||||
def test_all_names(self):
|
||||
@@ -966,9 +977,9 @@ class UnicodeFunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
|
||||
class Unicode_3_2_0_FunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
|
||||
db = unicodedata.ucd_3_2_0
|
||||
old = True
|
||||
expectedchecksum = ('4154d8d1232837e255edf3cdcbb5ab184d71f4a4'
|
||||
expectedchecksum = ('2164a66700e03cba9c9f5ed9e9a8d594d2da136a'
|
||||
if quicktest else
|
||||
'3aabaf66823b21b3d305dad804a62f6f6387c93e')
|
||||
'a8276cec9b6991779c5bdaa46c1ae7cc50bc2403')
|
||||
|
||||
|
||||
class UnicodeMiscTest(unittest.TestCase):
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
:func:`unicodedata.bidirectional` now returns the correct default bidi class
|
||||
for unassigned code points.
|
||||
Generated
+2304
-2186
File diff suppressed because it is too large
Load Diff
@@ -29,6 +29,7 @@
|
||||
import dataclasses
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import zipfile
|
||||
|
||||
from functools import partial
|
||||
@@ -49,6 +50,7 @@ UNICODE_DATA = "UnicodeData%s.txt"
|
||||
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
|
||||
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
|
||||
UNIHAN = "Unihan%s.zip"
|
||||
DERIVED_BIDI_CLASS = "extracted/DerivedBidiClass%s.txt"
|
||||
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
|
||||
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
|
||||
LINE_BREAK = "LineBreak%s.txt"
|
||||
@@ -79,6 +81,33 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
|
||||
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
|
||||
"ON", "LRI", "RLI", "FSI", "PDI" ]
|
||||
|
||||
# https://www.unicode.org/reports/tr44/#BC_Values_Table
|
||||
BIDI_LONG_NAMES = {
|
||||
'Left_To_Right': 'L',
|
||||
'Right_To_Left': 'R',
|
||||
'Arabic_Letter': 'AL',
|
||||
'European_Number': 'EN',
|
||||
'European_Separator': 'ES',
|
||||
'European_Terminator': 'ET',
|
||||
'Arabic_Number': 'AN',
|
||||
'Common_Separator': 'CS',
|
||||
'Nonspacing_Mark': 'NSM',
|
||||
'Boundary_Neutral': 'BN',
|
||||
'Paragraph_Separator': 'B',
|
||||
'Segment_Separator': 'S',
|
||||
'White_Space': 'WS',
|
||||
'Other_Neutral': 'ON',
|
||||
'Left_To_Right_Embedding': 'LRE',
|
||||
'Left_To_Right_Override': 'LRO',
|
||||
'Right_To_Left_Embedding': 'RLE',
|
||||
'Right_To_Left_Override': 'RLO',
|
||||
'Pop_Directional_Format': 'PDF',
|
||||
'Left_To_Right_Isolate': 'LRI',
|
||||
'Right_To_Left_Isolate': 'RLI',
|
||||
'First_Strong_Isolate': 'FSI',
|
||||
'Pop_Directional_Isolate': 'PDI',
|
||||
}
|
||||
|
||||
# "Other" needs to be the first entry, see the comment in makeunicodedata
|
||||
GRAPHEME_CLUSTER_NAMES = [ 'Other', 'Prepend', 'CR', 'LF', 'Control',
|
||||
'Extend', 'Regional_Indicator', 'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT',
|
||||
@@ -169,11 +198,11 @@ def makeunicodedata(unicode, trace):
|
||||
eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char] or 'N')
|
||||
graphemebreak = GRAPHEME_CLUSTER_NAMES.index(unicode.grapheme_breaks[char] or 'Other')
|
||||
extpict = unicode.ext_picts[char]
|
||||
bidirectional = BIDIRECTIONAL_NAMES.index(unicode.bidi_classes[char])
|
||||
if record:
|
||||
# extract database properties
|
||||
category = CATEGORY_NAMES.index(record.general_category)
|
||||
combining = int(record.canonical_combining_class)
|
||||
bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
|
||||
mirrored = record.bidi_mirrored == "Y"
|
||||
normalizationquickcheck = record.quick_check
|
||||
incb = INDIC_CONJUNCT_BREAK_NAMES.index(record.incb)
|
||||
@@ -181,12 +210,12 @@ def makeunicodedata(unicode, trace):
|
||||
category, combining, bidirectional, mirrored, eastasianwidth,
|
||||
normalizationquickcheck, graphemebreak, incb, extpict,
|
||||
)
|
||||
elif eastasianwidth or graphemebreak or extpict:
|
||||
# an unassigned but reserved character, with a known
|
||||
# east_asian_width or grapheme_break or ext_pict
|
||||
item = (0, 0, 0, 0, eastasianwidth, 0, graphemebreak, 0, extpict)
|
||||
else:
|
||||
continue
|
||||
if eastasianwidth or graphemebreak or extpict or bidirectional:
|
||||
item = (0, 0, bidirectional, 0, eastasianwidth,
|
||||
0, graphemebreak, 0, extpict)
|
||||
else:
|
||||
continue
|
||||
|
||||
# add entry to index and item tables
|
||||
i = cache.get(item)
|
||||
@@ -457,7 +486,7 @@ def makeunicodetype(unicode, trace):
|
||||
if record:
|
||||
# extract database properties
|
||||
category = record.general_category
|
||||
bidirectional = record.bidi_class
|
||||
bidirectional = unicode.bidi_classes[char]
|
||||
properties = record.binary_properties
|
||||
flags = 0
|
||||
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
|
||||
@@ -770,6 +799,8 @@ def merge_old_version(version, new, old):
|
||||
# category 0 is "unassigned"
|
||||
category_changes[i] = 0
|
||||
continue
|
||||
if old.bidi_classes[i] != new.bidi_classes[i]:
|
||||
bidir_changes[i] = BIDIRECTIONAL_NAMES.index(old.bidi_classes[i])
|
||||
# check characters that differ
|
||||
if old.table[i] != new.table[i]:
|
||||
for k, field in enumerate(dataclasses.fields(UcdRecord)):
|
||||
@@ -783,7 +814,8 @@ def merge_old_version(version, new, old):
|
||||
elif k == 2:
|
||||
category_changes[i] = CATEGORY_NAMES.index(value)
|
||||
elif k == 4:
|
||||
bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
|
||||
# bidi_class changes handled via bidi_classes
|
||||
pass
|
||||
elif k == 5:
|
||||
# We assume that all normalization changes are in 1:1 mappings
|
||||
assert " " not in value
|
||||
@@ -1042,6 +1074,28 @@ class UnicodeData:
|
||||
table[i].east_asian_width = widths[i]
|
||||
self.widths = widths
|
||||
|
||||
# Read DerivedBidiClass.txt for bidi classes
|
||||
# see https://www.unicode.org/reports/tr44/#Missing_Conventions
|
||||
bidi_classes = [None] * 0x110000
|
||||
for i in range(0, 0x110000):
|
||||
if table[i] is not None:
|
||||
bidi_classes[i] = table[i].bidi_class
|
||||
if version != '3.2.0':
|
||||
missing_re = re.compile(
|
||||
r'# @missing: ([\dA-F]+\.\.[\dA-F]+); (\w+)'
|
||||
)
|
||||
with open_data(DERIVED_BIDI_CLASS, version) as f:
|
||||
for l in f:
|
||||
m = missing_re.match(l)
|
||||
if not m:
|
||||
continue
|
||||
name = BIDI_LONG_NAMES[m[2]]
|
||||
for i in expand_range(m[1]):
|
||||
bidi_classes[i] = name
|
||||
for char, (bidi,) in UcdFile(DERIVED_BIDI_CLASS, version).expanded():
|
||||
bidi_classes[char] = bidi
|
||||
self.bidi_classes = bidi_classes
|
||||
|
||||
for char, (propname, *propinfo) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
|
||||
if not propinfo:
|
||||
# binary property
|
||||
|
||||
Reference in New Issue
Block a user