- rework Oracle to no longer do its own unicode conversion; this has been observed to be very slow. This now has the effect of producing "conditional" unicode conversion for the Oracle backend, as it still returns NVARCHAR etc. as unicode [ticket:2911]
- add new "conditional" functionality to unicode processors; the C-level function now uses PyUnicode_Check() as a fast alternative to the isinstance() check in Python
2026-05-27 02:52:53 -04:00 · 2014-01-17 17:36:43 -05:00
parent 4765895d10
commit 882f615c68
5 changed files with 104 additions and 17 deletions
@@ -14,6 +14,43 @@
 .. changelog::
    :version: 0.9.2

+    .. change::
+        :tags: bug, oracle
+        :tickets: 2911
+
+        It's been observed that the usage of a cx_Oracle "outputtypehandler"
+        in Python 2.xx in order to coerce string values to Unicode is inordinately
+        expensive; even though cx_Oracle is written in C, when you pass the
+        Python ``unicode`` primitive to cursor.var() and associate with an output
+        handler, the library counts every conversion as a Python function call
+        with all the requisite overhead being recorded; this *despite* the fact
+        that when running in Python 3, all strings are also unconditionally coerced
+        to unicode but it does *not* incur this overhead,
+        meaning that cx_Oracle is failing to use performant techniques in Py2K.
+        As SQLAlchemy cannot easily select for this style of type handler on a
+        per-column basis, the handler was assembled unconditionally thereby
+        adding the overhead to all string access.
+
+        So this logic has been replaced with SQLAlchemy's own unicode
+        conversion system, which now
+        only takes effect in Py2K for columns that are requested as unicode.
+        When C extensions are used, SQLAlchemy's system appears to be 2-3x faster than
+        cx_Oracle's.  Additionally, SQLAlchemy's unicode conversion has been
+        enhanced such that when the "conditional" converter is required
+        (now needed for the Oracle backend), the check for "already unicode" is now
+        performed in C and no longer introduces significant overhead.
+
+        This change has two impacts on the cx_Oracle backend.  One is that
+        string values in Py2K which aren't specifically requested with the
+        Unicode type or convert_unicode=True will now come back as ``str``,
+        not ``unicode`` - this behavior is similar to that of a backend such as
+        MySQL.  Additionally, when unicode values are requested with the cx_Oracle
+        backend, if the C extensions are *not* used, there is now an additional
+        overhead of an isinstance() check per column.  This tradeoff has been
+        made as it can be worked around and no longer places a performance burden
+        on the likely majority of Oracle result columns that are non-unicode
+        strings.
+
    .. change::
        :tags: bug, orm
        :tickets: 2908
@@ -409,6 +409,45 @@ UnicodeResultProcessor_process(UnicodeResultProcessor *self, PyObject *value)
    return PyUnicode_Decode(str, len, encoding, errors);
 }

+static PyObject *
+UnicodeResultProcessor_conditional_process(UnicodeResultProcessor *self, PyObject *value)
+{
+    const char *encoding, *errors;
+    char *str;
+    Py_ssize_t len;
+
+    if (value == Py_None)
+        Py_RETURN_NONE;
+
+#if PY_MAJOR_VERSION >= 3
+    if (PyUnicode_Check(value) == 1) {
+        Py_INCREF(value);
+        return value;
+    }
+
+    if (PyBytes_AsStringAndSize(value, &str, &len))
+        return NULL;
+
+    encoding = PyBytes_AS_STRING(self->encoding);
+    errors = PyBytes_AS_STRING(self->errors);
+#else
+
+    if (PyUnicode_Check(value) == 1) {
+        Py_INCREF(value);
+        return value;
+    }
+
+    if (PyString_AsStringAndSize(value, &str, &len))
+        return NULL;
+
+
+    encoding = PyString_AS_STRING(self->encoding);
+    errors = PyString_AS_STRING(self->errors);
+#endif
+
+    return PyUnicode_Decode(str, len, encoding, errors);
+}
+
 static void
 UnicodeResultProcessor_dealloc(UnicodeResultProcessor *self)
 {
@@ -424,6 +463,8 @@ UnicodeResultProcessor_dealloc(UnicodeResultProcessor *self)
 static PyMethodDef UnicodeResultProcessor_methods[] = {
    {"process", (PyCFunction)UnicodeResultProcessor_process, METH_O,
     "The value processor itself."},
+    {"conditional_process", (PyCFunction)UnicodeResultProcessor_conditional_process, METH_O,
+     "Conditional version of the value processor."},
    {NULL}  /* Sentinel */
 };

@@ -748,9 +748,6 @@ class OracleDialect_cx_oracle(OracleDialect):
                            255,
                            outconverter=self._detect_decimal,
                            arraysize=cursor.arraysize)
-            # allow all strings to come back natively as Unicode
-            elif defaultType in (cx_Oracle.STRING, cx_Oracle.FIXED_CHAR):
-                return cursor.var(util.text_type, size, cursor.arraysize)

        def on_connect(conn):
            conn.outputtypehandler = output_type_handler
@@ -15,6 +15,7 @@ They all share one common characteristic: None is passed through unchanged.
 import codecs
 import re
 import datetime
+from . import util


 def str_to_datetime_processor_factory(regexp, type_):
@@ -66,6 +67,21 @@ def py_fallback():
                return decoder(value, errors)[0]
        return process

+    def to_conditional_unicode_processor_factory(encoding, errors=None):
+        decoder = codecs.getdecoder(encoding)
+
+        def process(value):
+            if value is None:
+                return None
+            elif isinstance(value, util.text_type):
+                return value
+            else:
+                # decoder returns a tuple: (value, len). Simply dropping the
+                # len part is safe: it is done that way in the normal
+                # 'xx'.decode(encoding) code path.
+                return decoder(value, errors)[0]
+        return process
+
    def to_decimal_processor_factory(target_class, scale):
        fstring = "%%.%df" % scale

@@ -113,12 +129,17 @@ try:
                                       str_to_date

    def to_unicode_processor_factory(encoding, errors=None):
-        # this is cumbersome but it would be even more so on the C side
        if errors is not None:
            return UnicodeResultProcessor(encoding, errors).process
        else:
            return UnicodeResultProcessor(encoding).process

+    def to_conditional_unicode_processor_factory(encoding, errors=None):
+        if errors is not None:
+            return UnicodeResultProcessor(encoding, errors).conditional_process
+        else:
+            return UnicodeResultProcessor(encoding).conditional_process
+
    def to_decimal_processor_factory(target_class, scale):
        # Note that the scale argument is not taken into account for integer
        # values in the C implementation while it is in the Python one.
@@ -204,20 +204,11 @@ class String(Concatenable, TypeEngine):
                                    dialect.encoding, self.unicode_error)

            if needs_isinstance:
-                # we wouldn't be here unless convert_unicode='force'
-                # was specified, or the driver has erratic unicode-returning
-                # habits.  since we will be getting back unicode
-                # in most cases, we check for it (decode will fail).
-                def process(value):
-                    if isinstance(value, util.text_type):
-                        return value
-                    else:
-                        return to_unicode(value)
-                return process
+                return processors.to_conditional_unicode_processor_factory(
+                                    dialect.encoding, self.unicode_error)
            else:
-                # here, we assume that the object is not unicode,
-                # avoiding expensive isinstance() check.
-                return to_unicode
+                return processors.to_unicode_processor_factory(
+                                    dialect.encoding, self.unicode_error)
        else:
            return None