gh-142224: unicodedata: support bidi classes for unassigned code points (GH-144815)

2026-05-06 12:49:07 -04:00 · 2026-02-18 10:54:07 +00:00
parent 7a7521bcfa
commit e49bfca87c
4 changed files with 2384 additions and 2199 deletions
@@ -319,7 +319,7 @@ class BaseUnicodeFunctionsTest:
        self.assertRaises(TypeError, self.db.category, 'xx')

    def test_bidirectional(self):
-        self.assertEqual(self.db.bidirectional('\uFFFE'), '')
+        self.assertEqual(self.db.bidirectional('\uFFFE'), 'BN')
        self.assertEqual(self.db.bidirectional(' '), 'WS')
        self.assertEqual(self.db.bidirectional('A'), 'L')
        self.assertEqual(self.db.bidirectional('\U00020000'), 'L')
@@ -347,6 +347,17 @@ class BaseUnicodeFunctionsTest:
        self.assertRaises(TypeError, self.db.bidirectional)
        self.assertRaises(TypeError, self.db.bidirectional, 'xx')

+    def test_bidirectional_unassigned(self):
+        if self.old:
+            return
+        self.assertEqual(self.db.bidirectional('\u0378'), 'L')
+        self.assertEqual(self.db.bidirectional('\u077F'), 'AL')
+        self.assertEqual(self.db.bidirectional('\u20CF'), 'ET')
+        self.assertEqual(self.db.bidirectional('\u0590'), 'R')
+        self.assertEqual(self.db.bidirectional('\uFFFF'), 'BN')
+        self.assertEqual(self.db.bidirectional('\U0001FFFE'), 'BN')
+        self.assertEqual(self.db.bidirectional('\U00010D01'), 'AL')
+
    def test_decomposition(self):
        self.assertEqual(self.db.decomposition('\uFFFE'),'')
        self.assertEqual(self.db.decomposition('\u00bc'), '<fraction> 0031 2044 0034')
@@ -676,9 +687,9 @@ class UnicodeFunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):

    # Update this if the database changes. Make sure to do a full rebuild
    # (e.g. 'make distclean && make') to get the correct checksum.
-    expectedchecksum = ('83cc43a2fbb779185832b4c049217d80b05bf349'
+    expectedchecksum = ('668dbbea1136e69d4f00677a5988b23bc78aefc6'
                        if quicktest else
-                        '180bdc91143d8aa2eb9dd6726e66d37606205942')
+                        'b869af769bd8fe352c04622ab90533dc54df5cf3')

    @requires_resource('network')
    def test_all_names(self):
@@ -966,9 +977,9 @@ class UnicodeFunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
 class Unicode_3_2_0_FunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
    db = unicodedata.ucd_3_2_0
    old = True
-    expectedchecksum = ('4154d8d1232837e255edf3cdcbb5ab184d71f4a4'
+    expectedchecksum = ('2164a66700e03cba9c9f5ed9e9a8d594d2da136a'
                        if quicktest else
-                        '3aabaf66823b21b3d305dad804a62f6f6387c93e')
+                        'a8276cec9b6991779c5bdaa46c1ae7cc50bc2403')


 class UnicodeMiscTest(unittest.TestCase):
@@ -0,0 +1,2 @@
+:func:`unicodedata.bidirectional` now returns the correct default bidi class
+for unassigned code points.
@@ -29,6 +29,7 @@
 import dataclasses
 import os
 import sys
+import re
 import zipfile

 from functools import partial
@@ -49,6 +50,7 @@ UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
 UNIHAN = "Unihan%s.zip"
+DERIVED_BIDI_CLASS = "extracted/DerivedBidiClass%s.txt"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 LINE_BREAK = "LineBreak%s.txt"
@@ -79,6 +81,33 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

+# https://www.unicode.org/reports/tr44/#BC_Values_Table
+BIDI_LONG_NAMES = {
+    'Left_To_Right': 'L',
+    'Right_To_Left': 'R',
+    'Arabic_Letter': 'AL',
+    'European_Number': 'EN',
+    'European_Separator': 'ES',
+    'European_Terminator': 'ET',
+    'Arabic_Number': 'AN',
+    'Common_Separator': 'CS',
+    'Nonspacing_Mark': 'NSM',
+    'Boundary_Neutral': 'BN',
+    'Paragraph_Separator': 'B',
+    'Segment_Separator': 'S',
+    'White_Space': 'WS',
+    'Other_Neutral': 'ON',
+    'Left_To_Right_Embedding': 'LRE',
+    'Left_To_Right_Override': 'LRO',
+    'Right_To_Left_Embedding': 'RLE',
+    'Right_To_Left_Override': 'RLO',
+    'Pop_Directional_Format': 'PDF',
+    'Left_To_Right_Isolate': 'LRI',
+    'Right_To_Left_Isolate': 'RLI',
+    'First_Strong_Isolate': 'FSI',
+    'Pop_Directional_Isolate': 'PDI',
+}
+
 # "Other" needs to be the first entry, see the comment in makeunicodedata
 GRAPHEME_CLUSTER_NAMES = [ 'Other', 'Prepend', 'CR', 'LF', 'Control',
    'Extend', 'Regional_Indicator', 'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT',
@@ -169,11 +198,11 @@ def makeunicodedata(unicode, trace):
        eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char] or 'N')
        graphemebreak = GRAPHEME_CLUSTER_NAMES.index(unicode.grapheme_breaks[char] or 'Other')
        extpict = unicode.ext_picts[char]
+        bidirectional = BIDIRECTIONAL_NAMES.index(unicode.bidi_classes[char])
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record.general_category)
            combining = int(record.canonical_combining_class)
-            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
            mirrored = record.bidi_mirrored == "Y"
            normalizationquickcheck = record.quick_check
            incb = INDIC_CONJUNCT_BREAK_NAMES.index(record.incb)
@@ -181,12 +210,12 @@ def makeunicodedata(unicode, trace):
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck, graphemebreak, incb, extpict,
                )
-        elif eastasianwidth or graphemebreak or extpict:
-            # an unassigned but reserved character, with a known
-            # east_asian_width or grapheme_break or ext_pict
-            item = (0, 0, 0, 0, eastasianwidth, 0, graphemebreak, 0, extpict)
        else:
-            continue
+            if eastasianwidth or graphemebreak or extpict or bidirectional:
+                item = (0, 0, bidirectional, 0, eastasianwidth,
+                        0, graphemebreak, 0, extpict)
+            else:
+                continue

        # add entry to index and item tables
        i = cache.get(item)
@@ -457,7 +486,7 @@ def makeunicodetype(unicode, trace):
        if record:
            # extract database properties
            category = record.general_category
-            bidirectional = record.bidi_class
+            bidirectional = unicode.bidi_classes[char]
            properties = record.binary_properties
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
@@ -770,6 +799,8 @@ def merge_old_version(version, new, old):
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
+        if old.bidi_classes[i] != new.bidi_classes[i]:
+            bidir_changes[i] = BIDIRECTIONAL_NAMES.index(old.bidi_classes[i])
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k, field in enumerate(dataclasses.fields(UcdRecord)):
@@ -783,7 +814,8 @@ def merge_old_version(version, new, old):
                    elif k == 2:
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
-                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
+                        # bidi_class changes handled via bidi_classes
+                        pass
                    elif k == 5:
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
@@ -1042,6 +1074,28 @@ class UnicodeData:
                table[i].east_asian_width = widths[i]
        self.widths = widths

+        # Read DerivedBidiClass.txt for bidi classes
+        # see https://www.unicode.org/reports/tr44/#Missing_Conventions
+        bidi_classes = [None] * 0x110000
+        for i in range(0, 0x110000):
+            if table[i] is not None:
+                bidi_classes[i] = table[i].bidi_class
+        if version != '3.2.0':
+            missing_re = re.compile(
+                r'# @missing: ([\dA-F]+\.\.[\dA-F]+); (\w+)'
+            )
+            with open_data(DERIVED_BIDI_CLASS, version) as f:
+                for l in f:
+                    m = missing_re.match(l)
+                    if not m:
+                        continue
+                    name = BIDI_LONG_NAMES[m[2]]
+                    for i in expand_range(m[1]):
+                        bidi_classes[i] = name
+            for char, (bidi,) in UcdFile(DERIVED_BIDI_CLASS, version).expanded():
+                bidi_classes[char] = bidi
+        self.bidi_classes = bidi_classes
+
        for char, (propname, *propinfo) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
            if not propinfo:
                # binary property