- rework Oracle to no longer do its own unicode conversion; this has been observed
to be very slow.  This now has the effect of producing "conditional" unicode
conversion for the Oracle backend, as it still returns NVARCHAR etc. as unicode
[ticket:2911]
- add new "conditional" functionality to unicode processors; the C-level
function now uses PyUnicode_Check() as a fast alternative to the isinstance()
check in Python
This commit is contained in:
Mike Bayer
2014-01-17 17:36:43 -05:00
parent 4765895d10
commit 882f615c68
5 changed files with 104 additions and 17 deletions
+37
View File
@@ -14,6 +14,43 @@
.. changelog::
:version: 0.9.2
.. change::
:tags: bug, oracle
:tickets: 2911
It's been observed that the usage of a cx_Oracle "outputtypehandler"
in Python 2.xx in order to coerce string values to Unicode is inordinately
expensive; even though cx_Oracle is written in C, when you pass the
Python ``unicode`` primitive to cursor.var() and associate with an output
handler, the library counts every conversion as a Python function call
with all the requisite overhead being recorded; this *despite* the fact
that when running in Python 3, all strings are also unconditionally coerced
to unicode but it does *not* incur this overhead,
meaning that cx_Oracle is failing to use performant techniques in Py2K.
As SQLAlchemy cannot easily select for this style of type handler on a
per-column basis, the handler was assembled unconditionally thereby
adding the overhead to all string access.
So this logic has been replaced with SQLAlchemy's own unicode
conversion system, which now
only takes effect in Py2K for columns that are requested as unicode.
When C extensions are used, SQLAlchemy's system appears to be 2-3x faster than
cx_Oracle's. Additionally, SQLAlchemy's unicode conversion has been
enhanced such that when the "conditional" converter is required
(now needed for the Oracle backend), the check for "already unicode" is now
performed in C and no longer introduces significant overhead.
This change has two impacts on the cx_Oracle backend. One is that
string values in Py2K which aren't specifically requested with the
Unicode type or convert_unicode=True will now come back as ``str``,
not ``unicode`` - this behavior is similar to a backend such as
MySQL. Additionally, when unicode values are requested with the cx_Oracle
backend, if the C extensions are *not* used, there is now an additional
overhead of an isinstance() check per column. This tradeoff has been
made as it can be worked around and no longer places a performance burden
on the likely majority of Oracle result columns that are non-unicode
strings.
.. change::
:tags: bug, orm
:tickets: 2908
+41
View File
@@ -409,6 +409,45 @@ UnicodeResultProcessor_process(UnicodeResultProcessor *self, PyObject *value)
return PyUnicode_Decode(str, len, encoding, errors);
}
/*
 * Conditional variant of UnicodeResultProcessor_process.
 *
 *   - None is passed through unchanged.
 *   - A value that is already a unicode object is returned as-is
 *     (as a new reference), without any decoding.
 *   - Any other value is read as a byte string and decoded using the
 *     encoding / errors stored on the processor.
 *
 * Returns a new reference, or NULL with an exception set when the value
 * cannot be read as a byte string or the decode itself fails.
 */
static PyObject *
UnicodeResultProcessor_conditional_process(UnicodeResultProcessor *self, PyObject *value)
{
    const char *encoding, *errors;
    char *str;
    Py_ssize_t len;

    if (value == Py_None)
        Py_RETURN_NONE;

    /* The "already unicode" test is the same on Python 2 and 3, so it is
     * done once here rather than in each preprocessor arm.  The macro is
     * tested for truth instead of being compared against a specific
     * integer. */
    if (PyUnicode_Check(value)) {
        Py_INCREF(value);
        return value;
    }

#if PY_MAJOR_VERSION >= 3
    if (PyBytes_AsStringAndSize(value, &str, &len))
        return NULL;

    encoding = PyBytes_AS_STRING(self->encoding);
    errors = PyBytes_AS_STRING(self->errors);
#else
    if (PyString_AsStringAndSize(value, &str, &len))
        return NULL;

    encoding = PyString_AS_STRING(self->encoding);
    errors = PyString_AS_STRING(self->errors);
#endif

    return PyUnicode_Decode(str, len, encoding, errors);
}
static void
UnicodeResultProcessor_dealloc(UnicodeResultProcessor *self)
{
@@ -424,6 +463,8 @@ UnicodeResultProcessor_dealloc(UnicodeResultProcessor *self)
/* Method table for UnicodeResultProcessor.  Every entry is declared with
 * METH_O, i.e. the C function receives exactly one object argument (the
 * value to process) in addition to self. */
static PyMethodDef UnicodeResultProcessor_methods[] = {
/* Plain processor entry point. */
{"process", (PyCFunction)UnicodeResultProcessor_process, METH_O,
"The value processor itself."},
/* Conditional entry point, bound to
 * UnicodeResultProcessor_conditional_process. */
{"conditional_process", (PyCFunction)UnicodeResultProcessor_conditional_process, METH_O,
"Conditional version of the value processor."},
{NULL} /* Sentinel */
};
@@ -748,9 +748,6 @@ class OracleDialect_cx_oracle(OracleDialect):
255,
outconverter=self._detect_decimal,
arraysize=cursor.arraysize)
# allow all strings to come back natively as Unicode
elif defaultType in (cx_Oracle.STRING, cx_Oracle.FIXED_CHAR):
return cursor.var(util.text_type, size, cursor.arraysize)
def on_connect(conn):
conn.outputtypehandler = output_type_handler
+22 -1
View File
@@ -15,6 +15,7 @@ They all share one common characteristic: None is passed through unchanged.
import codecs
import re
import datetime
from . import util
def str_to_datetime_processor_factory(regexp, type_):
@@ -66,6 +67,21 @@ def py_fallback():
return decoder(value, errors)[0]
return process
def to_conditional_unicode_processor_factory(encoding, errors=None):
    """Build a result processor that decodes only when needed.

    The returned callable passes ``None`` and values that are already
    ``util.text_type`` through unchanged; any other value is decoded
    with the codec looked up for *encoding*, using *errors* as the
    error-handling argument.
    """
    decode = codecs.getdecoder(encoding)

    def process(value):
        # Nothing to do for None or for text that is already decoded.
        if value is None or isinstance(value, util.text_type):
            return value
        # The codec hands back (decoded_text, length_consumed); only the
        # text is wanted, mirroring what 'xx'.decode(encoding) does.
        text, _consumed = decode(value, errors)
        return text

    return process
def to_decimal_processor_factory(target_class, scale):
fstring = "%%.%df" % scale
@@ -113,12 +129,17 @@ try:
str_to_date
def to_unicode_processor_factory(encoding, errors=None):
    """Return the ``process`` method of a ``UnicodeResultProcessor``.

    *errors* is forwarded to the constructor only when it is not None.
    """
    # Leaving the argument out entirely (instead of passing None) is
    # cumbersome here, but it would be even more so on the C side.
    ctor_args = (encoding,) if errors is None else (encoding, errors)
    return UnicodeResultProcessor(*ctor_args).process
def to_conditional_unicode_processor_factory(encoding, errors=None):
    """Return the ``conditional_process`` method of a
    ``UnicodeResultProcessor``.

    *errors* is forwarded to the constructor only when it is not None.
    """
    ctor_args = (encoding,) if errors is None else (encoding, errors)
    return UnicodeResultProcessor(*ctor_args).conditional_process
def to_decimal_processor_factory(target_class, scale):
# Note that the scale argument is not taken into account for integer
# values in the C implementation while it is in the Python one.
+4 -13
View File
@@ -204,20 +204,11 @@ class String(Concatenable, TypeEngine):
dialect.encoding, self.unicode_error)
if needs_isinstance:
# we wouldn't be here unless convert_unicode='force'
# was specified, or the driver has erratic unicode-returning
# habits. since we will be getting back unicode
# in most cases, we check for it (decode will fail).
def process(value):
if isinstance(value, util.text_type):
return value
else:
return to_unicode(value)
return process
return processors.to_conditional_unicode_processor_factory(
dialect.encoding, self.unicode_error)
else:
# here, we assume that the object is not unicode,
# avoiding expensive isinstance() check.
return to_unicode
return processors.to_unicode_processor_factory(
dialect.encoding, self.unicode_error)
else:
return None