gh-142224: unicodedata: support bidi classes for unassigned code points (GH-144815)

This commit is contained in:
Stan Ulbrych
2026-02-18 10:54:07 +00:00
committed by GitHub
parent 7a7521bcfa
commit e49bfca87c
4 changed files with 2384 additions and 2199 deletions
+16 -5
View File
@@ -319,7 +319,7 @@ class BaseUnicodeFunctionsTest:
self.assertRaises(TypeError, self.db.category, 'xx')
def test_bidirectional(self):
self.assertEqual(self.db.bidirectional('\uFFFE'), '')
self.assertEqual(self.db.bidirectional('\uFFFE'), 'BN')
self.assertEqual(self.db.bidirectional(' '), 'WS')
self.assertEqual(self.db.bidirectional('A'), 'L')
self.assertEqual(self.db.bidirectional('\U00020000'), 'L')
@@ -347,6 +347,17 @@ class BaseUnicodeFunctionsTest:
self.assertRaises(TypeError, self.db.bidirectional)
self.assertRaises(TypeError, self.db.bidirectional, 'xx')
def test_bidirectional_unassigned(self):
# Spot-check the bidi class reported by self.db.bidirectional() for a
# handful of code points (unassigned ones, going by the test name),
# covering the 'L', 'AL', 'ET', 'R' and 'BN' results.
# Nothing is checked when self.old is true.
if self.old:
return
self.assertEqual(self.db.bidirectional('\u0378'), 'L')
self.assertEqual(self.db.bidirectional('\u077F'), 'AL')
self.assertEqual(self.db.bidirectional('\u20CF'), 'ET')
self.assertEqual(self.db.bidirectional('\u0590'), 'R')
self.assertEqual(self.db.bidirectional('\uFFFF'), 'BN')
self.assertEqual(self.db.bidirectional('\U0001FFFE'), 'BN')
self.assertEqual(self.db.bidirectional('\U00010D01'), 'AL')
def test_decomposition(self):
self.assertEqual(self.db.decomposition('\uFFFE'),'')
self.assertEqual(self.db.decomposition('\u00bc'), '<fraction> 0031 2044 0034')
@@ -676,9 +687,9 @@ class UnicodeFunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
# Update this if the database changes. Make sure to do a full rebuild
# (e.g. 'make distclean && make') to get the correct checksum.
expectedchecksum = ('83cc43a2fbb779185832b4c049217d80b05bf349'
expectedchecksum = ('668dbbea1136e69d4f00677a5988b23bc78aefc6'
if quicktest else
'180bdc91143d8aa2eb9dd6726e66d37606205942')
'b869af769bd8fe352c04622ab90533dc54df5cf3')
@requires_resource('network')
def test_all_names(self):
@@ -966,9 +977,9 @@ class UnicodeFunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
class Unicode_3_2_0_FunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
db = unicodedata.ucd_3_2_0
old = True
expectedchecksum = ('4154d8d1232837e255edf3cdcbb5ab184d71f4a4'
expectedchecksum = ('2164a66700e03cba9c9f5ed9e9a8d594d2da136a'
if quicktest else
'3aabaf66823b21b3d305dad804a62f6f6387c93e')
'a8276cec9b6991779c5bdaa46c1ae7cc50bc2403')
class UnicodeMiscTest(unittest.TestCase):
@@ -0,0 +1,2 @@
:func:`unicodedata.bidirectional` now returns the correct default bidi class
for unassigned code points.
+2304 -2186
View File
File diff suppressed because it is too large Load Diff
+62 -8
View File
@@ -29,6 +29,7 @@
import dataclasses
import os
import sys
import re
import zipfile
from functools import partial
@@ -49,6 +50,7 @@ UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_BIDI_CLASS = "extracted/DerivedBidiClass%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
@@ -79,6 +81,33 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
"ON", "LRI", "RLI", "FSI", "PDI" ]
# Maps long Bidi_Class value names to the short aliases that appear in
# BIDIRECTIONAL_NAMES.  The class names captured from the "# @missing:"
# lines of DerivedBidiClass.txt are translated through this table.
# https://www.unicode.org/reports/tr44/#BC_Values_Table
BIDI_LONG_NAMES = {
'Left_To_Right': 'L',
'Right_To_Left': 'R',
'Arabic_Letter': 'AL',
'European_Number': 'EN',
'European_Separator': 'ES',
'European_Terminator': 'ET',
'Arabic_Number': 'AN',
'Common_Separator': 'CS',
'Nonspacing_Mark': 'NSM',
'Boundary_Neutral': 'BN',
'Paragraph_Separator': 'B',
'Segment_Separator': 'S',
'White_Space': 'WS',
'Other_Neutral': 'ON',
'Left_To_Right_Embedding': 'LRE',
'Left_To_Right_Override': 'LRO',
'Right_To_Left_Embedding': 'RLE',
'Right_To_Left_Override': 'RLO',
'Pop_Directional_Format': 'PDF',
'Left_To_Right_Isolate': 'LRI',
'Right_To_Left_Isolate': 'RLI',
'First_Strong_Isolate': 'FSI',
'Pop_Directional_Isolate': 'PDI',
}
# "Other" needs to be the first entry, see the comment in makeunicodedata
GRAPHEME_CLUSTER_NAMES = [ 'Other', 'Prepend', 'CR', 'LF', 'Control',
'Extend', 'Regional_Indicator', 'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT',
@@ -169,11 +198,11 @@ def makeunicodedata(unicode, trace):
eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char] or 'N')
graphemebreak = GRAPHEME_CLUSTER_NAMES.index(unicode.grapheme_breaks[char] or 'Other')
extpict = unicode.ext_picts[char]
bidirectional = BIDIRECTIONAL_NAMES.index(unicode.bidi_classes[char])
if record:
# extract database properties
category = CATEGORY_NAMES.index(record.general_category)
combining = int(record.canonical_combining_class)
bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
mirrored = record.bidi_mirrored == "Y"
normalizationquickcheck = record.quick_check
incb = INDIC_CONJUNCT_BREAK_NAMES.index(record.incb)
@@ -181,12 +210,12 @@ def makeunicodedata(unicode, trace):
category, combining, bidirectional, mirrored, eastasianwidth,
normalizationquickcheck, graphemebreak, incb, extpict,
)
elif eastasianwidth or graphemebreak or extpict:
# an unassigned but reserved character, with a known
# east_asian_width or grapheme_break or ext_pict
item = (0, 0, 0, 0, eastasianwidth, 0, graphemebreak, 0, extpict)
else:
continue
if eastasianwidth or graphemebreak or extpict or bidirectional:
item = (0, 0, bidirectional, 0, eastasianwidth,
0, graphemebreak, 0, extpict)
else:
continue
# add entry to index and item tables
i = cache.get(item)
@@ -457,7 +486,7 @@ def makeunicodetype(unicode, trace):
if record:
# extract database properties
category = record.general_category
bidirectional = record.bidi_class
bidirectional = unicode.bidi_classes[char]
properties = record.binary_properties
flags = 0
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
@@ -770,6 +799,8 @@ def merge_old_version(version, new, old):
# category 0 is "unassigned"
category_changes[i] = 0
continue
if old.bidi_classes[i] != new.bidi_classes[i]:
bidir_changes[i] = BIDIRECTIONAL_NAMES.index(old.bidi_classes[i])
# check characters that differ
if old.table[i] != new.table[i]:
for k, field in enumerate(dataclasses.fields(UcdRecord)):
@@ -783,7 +814,8 @@ def merge_old_version(version, new, old):
elif k == 2:
category_changes[i] = CATEGORY_NAMES.index(value)
elif k == 4:
bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
# bidi_class changes handled via bidi_classes
pass
elif k == 5:
# We assume that all normalization changes are in 1:1 mappings
assert " " not in value
@@ -1042,6 +1074,28 @@ class UnicodeData:
table[i].east_asian_width = widths[i]
self.widths = widths
# Read DerivedBidiClass.txt for bidi classes
# see https://www.unicode.org/reports/tr44/#Missing_Conventions
bidi_classes = [None] * 0x110000
for i in range(0, 0x110000):
if table[i] is not None:
bidi_classes[i] = table[i].bidi_class
if version != '3.2.0':
missing_re = re.compile(
r'# @missing: ([\dA-F]+\.\.[\dA-F]+); (\w+)'
)
with open_data(DERIVED_BIDI_CLASS, version) as f:
for l in f:
m = missing_re.match(l)
if not m:
continue
name = BIDI_LONG_NAMES[m[2]]
for i in expand_range(m[1]):
bidi_classes[i] = name
for char, (bidi,) in UcdFile(DERIVED_BIDI_CLASS, version).expanded():
bidi_classes[char] = bidi
self.bidi_classes = bidi_classes
for char, (propname, *propinfo) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
if not propinfo:
# binary property