mirror of
https://github.com/sqlalchemy/sqlalchemy.git
synced 2026-05-27 02:52:53 -04:00
- rework Oracle to no longer do its own unicode conversion, as this has been observed
to be very slow. This now has the effect of producing "conditional" unicode conversion for the Oracle backend, as it still returns NVARCHAR etc. as unicode [ticket:2911]
- add new "conditional" functionality to unicode processors; the C-level function now uses PyUnicode_Check() as a fast alternative to the isinstance() check in Python
This commit is contained in:
Vendored
+37
@@ -14,6 +14,43 @@
|
||||
.. changelog::
|
||||
:version: 0.9.2
|
||||
|
||||
.. change::
|
||||
:tags: bug, oracle
|
||||
:tickets: 2911
|
||||
|
||||
It's been observed that the usage of a cx_Oracle "outputtypehandler"
|
||||
in Python 2.xx in order to coerce string values to Unicode is inordinately
|
||||
expensive; even though cx_Oracle is written in C, when you pass the
|
||||
Python ``unicode`` primitive to cursor.var() and associate with an output
|
||||
handler, the library counts every conversion as a Python function call
|
||||
with all the requisite overhead being recorded; this *despite* the fact that
|
||||
when running in Python 3, all strings are also unconditionally coerced
|
||||
to unicode but it does *not* incur this overhead,
|
||||
meaning that cx_Oracle is failing to use performant techniques in Py2K.
|
||||
As SQLAlchemy cannot easily select for this style of type handler on a
|
||||
per-column basis, the handler was assembled unconditionally, thereby
|
||||
adding the overhead to all string access.
|
||||
|
||||
So this logic has been replaced with SQLAlchemy's own unicode
|
||||
conversion system, which now
|
||||
only takes effect in Py2K for columns that are requested as unicode.
|
||||
When C extensions are used, SQLAlchemy's system appears to be 2-3x faster than
|
||||
cx_Oracle's. Additionally, SQLAlchemy's unicode conversion has been
|
||||
enhanced such that when the "conditional" converter is required
|
||||
(now needed for the Oracle backend), the check for "already unicode" is now
|
||||
performed in C and no longer introduces significant overhead.
|
||||
|
||||
This change has two impacts on the cx_Oracle backend. One is that
|
||||
string values in Py2K which aren't specifically requested with the
|
||||
Unicode type or convert_unicode=True will now come back as ``str``,
|
||||
not ``unicode`` - this behavior is similar to a backend such as
|
||||
MySQL. Additionally, when unicode values are requested with the cx_Oracle
|
||||
backend, if the C extensions are *not* used, there is now an additional
|
||||
overhead of an isinstance() check per column. This tradeoff has been
|
||||
made as it can be worked around and no longer places a performance burden
|
||||
on the likely majority of Oracle result columns that are non-unicode
|
||||
strings.
|
||||
|
||||
.. change::
|
||||
:tags: bug, orm
|
||||
:tickets: 2908
|
||||
|
||||
@@ -409,6 +409,45 @@ UnicodeResultProcessor_process(UnicodeResultProcessor *self, PyObject *value)
|
||||
return PyUnicode_Decode(str, len, encoding, errors);
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
UnicodeResultProcessor_conditional_process(UnicodeResultProcessor *self, PyObject *value)
|
||||
{
|
||||
const char *encoding, *errors;
|
||||
char *str;
|
||||
Py_ssize_t len;
|
||||
|
||||
if (value == Py_None)
|
||||
Py_RETURN_NONE;
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
if (PyUnicode_Check(value) == 1) {
|
||||
Py_INCREF(value);
|
||||
return value;
|
||||
}
|
||||
|
||||
if (PyBytes_AsStringAndSize(value, &str, &len))
|
||||
return NULL;
|
||||
|
||||
encoding = PyBytes_AS_STRING(self->encoding);
|
||||
errors = PyBytes_AS_STRING(self->errors);
|
||||
#else
|
||||
|
||||
if (PyUnicode_Check(value) == 1) {
|
||||
Py_INCREF(value);
|
||||
return value;
|
||||
}
|
||||
|
||||
if (PyString_AsStringAndSize(value, &str, &len))
|
||||
return NULL;
|
||||
|
||||
|
||||
encoding = PyString_AS_STRING(self->encoding);
|
||||
errors = PyString_AS_STRING(self->errors);
|
||||
#endif
|
||||
|
||||
return PyUnicode_Decode(str, len, encoding, errors);
|
||||
}
|
||||
|
||||
static void
|
||||
UnicodeResultProcessor_dealloc(UnicodeResultProcessor *self)
|
||||
{
|
||||
@@ -424,6 +463,8 @@ UnicodeResultProcessor_dealloc(UnicodeResultProcessor *self)
|
||||
static PyMethodDef UnicodeResultProcessor_methods[] = {
|
||||
{"process", (PyCFunction)UnicodeResultProcessor_process, METH_O,
|
||||
"The value processor itself."},
|
||||
{"conditional_process", (PyCFunction)UnicodeResultProcessor_conditional_process, METH_O,
|
||||
"Conditional version of the value processor."},
|
||||
{NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
|
||||
@@ -748,9 +748,6 @@ class OracleDialect_cx_oracle(OracleDialect):
|
||||
255,
|
||||
outconverter=self._detect_decimal,
|
||||
arraysize=cursor.arraysize)
|
||||
# allow all strings to come back natively as Unicode
|
||||
elif defaultType in (cx_Oracle.STRING, cx_Oracle.FIXED_CHAR):
|
||||
return cursor.var(util.text_type, size, cursor.arraysize)
|
||||
|
||||
def on_connect(conn):
|
||||
conn.outputtypehandler = output_type_handler
|
||||
|
||||
@@ -15,6 +15,7 @@ They all share one common characteristic: None is passed through unchanged.
|
||||
import codecs
|
||||
import re
|
||||
import datetime
|
||||
from . import util
|
||||
|
||||
|
||||
def str_to_datetime_processor_factory(regexp, type_):
|
||||
@@ -66,6 +67,21 @@ def py_fallback():
|
||||
return decoder(value, errors)[0]
|
||||
return process
|
||||
|
||||
def to_conditional_unicode_processor_factory(encoding, errors=None):
    """Build a result processor that decodes only non-unicode values.

    The returned callable passes ``None`` and values that are already
    ``util.text_type`` through unchanged; any other value is decoded using
    the codec for ``encoding`` with the given ``errors`` mode.
    """
    decode = codecs.getdecoder(encoding)

    def process(value):
        # None and already-unicode values need no conversion.
        if value is None or isinstance(value, util.text_type):
            return value
        # The codec hands back (decoded_text, length); only the text is
        # needed, which matches what 'xx'.decode(encoding) does.
        text, _ = decode(value, errors)
        return text

    return process
|
||||
|
||||
def to_decimal_processor_factory(target_class, scale):
|
||||
fstring = "%%.%df" % scale
|
||||
|
||||
@@ -113,12 +129,17 @@ try:
|
||||
str_to_date
|
||||
|
||||
def to_unicode_processor_factory(encoding, errors=None):
    """Return the C-implemented unconditional unicode decoding processor.

    ``errors`` is only forwarded to ``UnicodeResultProcessor`` when it was
    actually supplied.
    """
    # this is cumbersome but it would be even more so on the C side
    ctor_args = (encoding,) if errors is None else (encoding, errors)
    return UnicodeResultProcessor(*ctor_args).process
|
||||
|
||||
def to_conditional_unicode_processor_factory(encoding, errors=None):
    """Return the C-implemented conditional unicode decoding processor.

    ``errors`` is only forwarded to ``UnicodeResultProcessor`` when it was
    actually supplied; the bound ``conditional_process`` method is what
    callers receive.
    """
    ctor_args = (encoding,) if errors is None else (encoding, errors)
    return UnicodeResultProcessor(*ctor_args).conditional_process
|
||||
|
||||
def to_decimal_processor_factory(target_class, scale):
|
||||
# Note that the scale argument is not taken into account for integer
|
||||
# values in the C implementation while it is in the Python one.
|
||||
|
||||
@@ -204,20 +204,11 @@ class String(Concatenable, TypeEngine):
|
||||
dialect.encoding, self.unicode_error)
|
||||
|
||||
if needs_isinstance:
|
||||
# we wouldn't be here unless convert_unicode='force'
|
||||
# was specified, or the driver has erratic unicode-returning
|
||||
# habits. since we will be getting back unicode
|
||||
# in most cases, we check for it (decode will fail).
|
||||
def process(value):
|
||||
if isinstance(value, util.text_type):
|
||||
return value
|
||||
else:
|
||||
return to_unicode(value)
|
||||
return process
|
||||
return processors.to_conditional_unicode_processor_factory(
|
||||
dialect.encoding, self.unicode_error)
|
||||
else:
|
||||
# here, we assume that the object is not unicode,
|
||||
# avoiding expensive isinstance() check.
|
||||
return to_unicode
|
||||
return processors.to_unicode_processor_factory(
|
||||
dialect.encoding, self.unicode_error)
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user