import collections
import datetime
import heapq
import json
import math
import os
import random
import re
import struct
import sys
import threading
import zlib
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
SQLITE_DATETIME_FORMATS = (
'%Y-%m-%d %H:%M:%S.%f',
'%Y-%m-%d %H:%M:%S.%f%z',
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%d %H:%M:%S%z',
'%Y-%m-%d',
'%H:%M:%S',
'%H:%M:%S.%f',
'%H:%M')
from peewee import format_date_time
def format_date_time_sqlite(date_value):
return format_date_time(date_value, SQLITE_DATETIME_FORMATS)
try:
from playhouse import _sqlite_udf as cython_udf
except ImportError:
cython_udf = None
# Group UDFs by category.
CONTROL_FLOW = 'control_flow'
DATE = 'date'
FILE = 'file'
HELPER = 'helpers'
JSON = 'json'
MATH = 'math'
RANK = 'rank'
STRING = 'string'
AGGREGATE_COLLECTION = {}
UDF_COLLECTION = {}
class synchronized_dict(dict):
    """Dict whose item access is guarded by a lock, so stateful UDFs like
    toggle() and setting() behave predictably across threads."""
    def __init__(self, *args, **kwargs):
        super(synchronized_dict, self).__init__(*args, **kwargs)
        self._lock = threading.Lock()
def __getitem__(self, key):
with self._lock:
return super(synchronized_dict, self).__getitem__(key)
def __setitem__(self, key, value):
with self._lock:
return super(synchronized_dict, self).__setitem__(key, value)
def __delitem__(self, key):
with self._lock:
return super(synchronized_dict, self).__delitem__(key)
STATE = synchronized_dict()
SETTINGS = synchronized_dict()
# Class and function decorators.
def aggregate(*groups):
def decorator(klass):
for group in groups:
AGGREGATE_COLLECTION.setdefault(group, [])
AGGREGATE_COLLECTION[group].append(klass)
return klass
return decorator
def udf(group, name=None):
def decorator(fn):
UDF_COLLECTION.setdefault(group, [])
UDF_COLLECTION[group].append((fn, name or fn.__name__))
return fn
return decorator
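# Usage sketch for the decorators above -- a hypothetical scalar function,
# not part of this module:
#
#   @udf(STRING)
#   def reverse(s):
#       return s[::-1] if s is not None else None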
# Register aggregates / functions with connection.
def register_aggregate_groups(db, *groups):
seen = set()
for group in groups:
klasses = AGGREGATE_COLLECTION.get(group, ())
for klass in klasses:
name = getattr(klass, 'name', klass.__name__)
if name not in seen:
seen.add(name)
db.register_aggregate(klass, name)
def register_udf_groups(db, *groups):
seen = set()
for group in groups:
functions = UDF_COLLECTION.get(group, ())
for function, name in functions:
if name not in seen:
seen.add(name)
db.register_function(function, name)
def register_groups(db, *groups):
register_aggregate_groups(db, *groups)
register_udf_groups(db, *groups)
def register_all(db):
register_aggregate_groups(db, *AGGREGATE_COLLECTION)
register_udf_groups(db, *UDF_COLLECTION)
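# Usage sketch, assuming a peewee SqliteDatabase (which provides the
# register_function() and register_aggregate() methods used above):
#
#   from peewee import SqliteDatabase
#   db = SqliteDatabase(':memory:')
#   register_groups(db, CONTROL_FLOW, DATE)  # Selected groups only...
#   register_all(db)                         # ...or everything at once.
#   db.execute_sql("SELECT if_then_else(1, 'yes', 'no')").fetchone()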
# Begin actual user-defined functions and aggregates.
# Scalar functions.
@udf(CONTROL_FLOW)
def if_then_else(cond, truthy, falsey=None):
if cond:
return truthy
return falsey
@udf(DATE)
def strip_tz(date_str):
date_str = date_str.replace('T', ' ')
tz_idx1 = date_str.find('+')
if tz_idx1 != -1:
return date_str[:tz_idx1]
    # A '-' past index 13 lies beyond the date/time portion, so it must begin
    # a negative timezone offset.
    tz_idx2 = date_str.find('-')
    if tz_idx2 > 13:
        return date_str[:tz_idx2]
return date_str
@udf(DATE)
def human_delta(nseconds, glue=', '):
parts = (
(86400 * 365, 'year'),
(86400 * 30, 'month'),
(86400 * 7, 'week'),
(86400, 'day'),
(3600, 'hour'),
(60, 'minute'),
(1, 'second'),
)
accum = []
for offset, name in parts:
val, nseconds = divmod(nseconds, offset)
if val:
            suffix = 's' if val != 1 else ''
accum.append('%s %s%s' % (val, name, suffix))
if not accum:
return '0 seconds'
return glue.join(accum)
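# E.g. human_delta(90061) -> '1 day, 1 hour, 1 minute, 1 second';
# human_delta(0) -> '0 seconds'.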
@udf(FILE)
def file_ext(filename):
    try:
        res = os.path.splitext(filename)
    except (TypeError, ValueError):
        # E.g. a NULL (None) filename.
        return None
    return res[1]
@udf(FILE)
def file_read(filename):
    try:
        with open(filename) as fh:
            return fh.read()
    except (IOError, OSError, TypeError):
        # Unreadable file or NULL filename.
        return None
@udf(HELPER)
def gzip(data, compression=9):
    # Note: despite the name, this compresses with zlib (deflate), not the
    # gzip container format.
    if isinstance(data, str):
data = bytes(data.encode('raw_unicode_escape'))
return zlib.compress(data, compression)
@udf(HELPER)
def gunzip(data):
return zlib.decompress(data)
@udf(HELPER)
def hostname(url):
parse_result = urlparse(url)
if parse_result:
return parse_result.netloc
@udf(HELPER)
def toggle(key):
key = key.lower()
STATE[key] = ret = not STATE.get(key)
return ret
@udf(HELPER)
def setting(key, value=None):
if value is None:
return SETTINGS.get(key)
else:
SETTINGS[key] = value
return value
@udf(HELPER)
def clear_settings():
SETTINGS.clear()
@udf(HELPER)
def clear_toggles():
STATE.clear()
@udf(MATH)
def randomrange(start, end=None, step=None):
    # Mirror random.randrange() semantics: with a single argument, choose
    # from [0, start). (Previously `elif` left step as None in that case,
    # which raised a TypeError.)
    if end is None:
        start, end = 0, start
    if step is None:
        step = 1
return random.randrange(start, end, step)
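# E.g. randomrange(10) -> an integer in [0, 10); randomrange(5, 50, 5) -> one
# of 5, 10, ..., 45.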
@udf(MATH)
def gauss_distribution(mean, sigma):
try:
return random.gauss(mean, sigma)
except ValueError:
return None
@udf(MATH)
def sqrt(n):
try:
return math.sqrt(n)
except ValueError:
return None
@udf(MATH)
def tonumber(s):
    try:
        return int(s)
    except (TypeError, ValueError):
        try:
            return float(s)
        except (TypeError, ValueError):
            return None
@udf(STRING)
def substr_count(haystack, needle):
if not haystack or not needle:
return 0
return haystack.count(needle)
@udf(STRING)
def strip_chars(haystack, chars):
return haystack.strip(chars)
@udf(JSON)
def json_contains(src_json, obj_json):
stack = []
    try:
        stack.append((json.loads(obj_json), json.loads(src_json)))
    except (TypeError, ValueError):
        # Invalid (or NULL) JSON.
        return False
while stack:
obj, src = stack.pop()
if isinstance(src, dict):
if isinstance(obj, dict):
for key in obj:
if key not in src:
return False
stack.append((obj[key], src[key]))
elif isinstance(obj, list):
for item in obj:
if item not in src:
return False
elif obj not in src:
return False
elif isinstance(src, list):
if isinstance(obj, dict):
return False
elif isinstance(obj, list):
try:
for i in range(len(obj)):
stack.append((obj[i], src[i]))
except IndexError:
return False
elif obj not in src:
return False
elif obj != src:
return False
return True
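# E.g. json_contains('{"a": 1, "b": 2}', '{"a": 1}') -> True, whereas
# json_contains('[1, 2]', '[2, 1]') -> False: nested lists are compared
# positionally.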
# Aggregates.
class _heap_agg(object):
def __init__(self):
self.heap = []
self.ct = 0
def process(self, value):
return value
def step(self, value):
self.ct += 1
heapq.heappush(self.heap, self.process(value))
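# SQLite calls step() once per row and finalize() once at the end of the
# aggregation; pushing onto a heap during step() lets finalize() pop the
# values back in ascending order without sorting the whole set up front.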
class _datetime_heap_agg(_heap_agg):
def process(self, value):
return format_date_time_sqlite(value)
@aggregate(DATE)
class mintdiff(_datetime_heap_agg):
def finalize(self):
dtp = min_diff = None
while self.heap:
if min_diff is None:
if dtp is None:
dtp = heapq.heappop(self.heap)
continue
dt = heapq.heappop(self.heap)
diff = dt - dtp
if min_diff is None or min_diff > diff:
min_diff = diff
dtp = dt
if min_diff is not None:
return min_diff.total_seconds()
@aggregate(DATE)
class avgtdiff(_datetime_heap_agg):
def finalize(self):
if self.ct < 1:
return
elif self.ct == 1:
return 0
total = ct = 0
dtp = None
while self.heap:
if total == 0:
if dtp is None:
dtp = heapq.heappop(self.heap)
continue
dt = heapq.heappop(self.heap)
diff = dt - dtp
ct += 1
total += diff.total_seconds()
dtp = dt
return float(total) / ct
@aggregate(DATE)
class duration(object):
def __init__(self):
self._min = self._max = None
def step(self, value):
dt = format_date_time_sqlite(value)
if self._min is None or dt < self._min:
self._min = dt
if self._max is None or dt > self._max:
self._max = dt
def finalize(self):
if self._min and self._max:
td = (self._max - self._min)
return td.total_seconds()
return None
@aggregate(MATH)
class mode(object):
def __init__(self):
self.items = collections.Counter()
def step(self, *args):
self.items.update(args)
def finalize(self):
if self.items:
return self.items.most_common(1)[0][0]
@aggregate(MATH)
class minrange(_heap_agg):
def finalize(self):
if self.ct == 0:
return
elif self.ct == 1:
return 0
prev = min_diff = None
while self.heap:
if min_diff is None:
if prev is None:
prev = heapq.heappop(self.heap)
continue
curr = heapq.heappop(self.heap)
diff = curr - prev
if min_diff is None or min_diff > diff:
min_diff = diff
prev = curr
return min_diff
@aggregate(MATH)
class avgrange(_heap_agg):
def finalize(self):
if self.ct == 0:
return
elif self.ct == 1:
return 0
total = ct = 0
prev = None
while self.heap:
if total == 0:
if prev is None:
prev = heapq.heappop(self.heap)
continue
curr = heapq.heappop(self.heap)
diff = curr - prev
ct += 1
total += diff
prev = curr
return float(total) / ct
@aggregate(MATH)
class _range(object):
    name = 'range'  # Registered as "range"; the underscore avoids the builtin.
def __init__(self):
self._min = self._max = None
def step(self, value):
if self._min is None or value < self._min:
self._min = value
if self._max is None or value > self._max:
self._max = value
def finalize(self):
if self._min is not None and self._max is not None:
return self._max - self._min
return None
@aggregate(MATH)
class stddev(object):
def __init__(self):
self.n = 0
self.values = []
def step(self, v):
self.n += 1
self.values.append(v)
def finalize(self):
if self.n <= 1:
return 0
mean = sum(self.values) / self.n
return math.sqrt(sum((i - mean) ** 2 for i in self.values) / (self.n - 1))
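# Note: this computes the *sample* standard deviation (n - 1 in the
# denominator), e.g. stddev over (2, 4, 4, 4, 5, 5, 7, 9) = sqrt(32/7) ~= 2.14.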
def _parse_match_info(buf):
# See http://sqlite.org/fts3.html#matchinfo
bufsize = len(buf) # Length in bytes.
return [struct.unpack('@I', buf[i:i+4])[0] for i in range(0, bufsize, 4)]
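# E.g. _parse_match_info(struct.pack('@3I', 1, 2, 5)) -> [1, 2, 5].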
def get_weights(ncol, raw_weights):
if not raw_weights:
return [1] * ncol
else:
weights = [0] * ncol
for i, weight in enumerate(raw_weights):
weights[i] = weight
return weights
# Ranking implementation, which parses matchinfo.
def rank(raw_match_info, *raw_weights):
# Handle match_info called w/default args 'pcx' - based on the example rank
# function http://sqlite.org/fts3.html#appendix_a
match_info = _parse_match_info(raw_match_info)
score = 0.0
p, c = match_info[:2]
weights = get_weights(c, raw_weights)
# matchinfo X value corresponds to, for each phrase in the search query, a
# list of 3 values for each column in the search table.
# So if we have a two-phrase search query and three columns of data, the
# following would be the layout:
# p0 : c0=[0, 1, 2], c1=[3, 4, 5], c2=[6, 7, 8]
# p1 : c0=[9, 10, 11], c1=[12, 13, 14], c2=[15, 16, 17]
for phrase_num in range(p):
phrase_info_idx = 2 + (phrase_num * c * 3)
for col_num in range(c):
weight = weights[col_num]
if not weight:
continue
col_idx = phrase_info_idx + (col_num * 3)
# The idea is that we count the number of times the phrase appears
# in this column of the current row, compared to how many times it
# appears in this column across all rows. The ratio of these values
# provides a rough way to score based on "high value" terms.
row_hits = match_info[col_idx]
all_rows_hits = match_info[col_idx + 1]
if row_hits > 0:
score += weight * (float(row_hits) / all_rows_hits)
return -score
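# Usage sketch, assuming the function is registered as "fts_rank" (as done
# below) against an FTS table "document_tbl":
#
#   SELECT *, fts_rank(matchinfo(document_tbl), 1.0, 0.5) AS score
#   FROM document_tbl WHERE document_tbl MATCH ?
#   ORDER BY score
#
# matchinfo() defaults to the 'pcx' format this function expects; trailing
# arguments are optional per-column weights. The score is negated so an
# ascending ORDER BY lists the best matches first.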
# Okapi BM25 ranking implementation (FTS4 only).
def bm25(raw_match_info, *args):
"""
Usage:
        # Format string *must* be pcnalx.
        # Arguments after the matchinfo blob are optional per-column weights
        # for the table being queried.
        bm25(matchinfo(document_tbl, 'pcnalx'), 1) AS rank
"""
match_info = _parse_match_info(raw_match_info)
    # Okapi BM25 free parameters: K (k1) controls term-frequency saturation,
    # B (b) controls document-length normalization.
    K = 1.2
    B = 0.75
score = 0.0
P_O, C_O, N_O, A_O = range(4) # Offsets into the matchinfo buffer.
    term_count = match_info[P_O]  # Number of phrases in the query ("p").
    col_count = match_info[C_O]   # Number of columns ("c").
    total_docs = match_info[N_O]  # Total rows in the FTS table ("N").
L_O = A_O + col_count
X_O = L_O + col_count
# Worked example of pcnalx for two columns and two phrases, 100 docs total.
# {
# p = 2
# c = 2
# n = 100
# a0 = 4 -- avg number of tokens for col0, e.g. title
# a1 = 40 -- avg number of tokens for col1, e.g. body
# l0 = 5 -- curr doc has 5 tokens in col0
# l1 = 30 -- curr doc has 30 tokens in col1
#
# x000 -- hits this row for phrase0, col0
# x001 -- hits all rows for phrase0, col0
# x002 -- rows with phrase0 in col0 at least once
#
# x010 -- hits this row for phrase0, col1
# x011 -- hits all rows for phrase0, col1
# x012 -- rows with phrase0 in col1 at least once
#
# x100 -- hits this row for phrase1, col0
# x101 -- hits all rows for phrase1, col0
# x102 -- rows with phrase1 in col0 at least once
#
# x110 -- hits this row for phrase1, col1
# x111 -- hits all rows for phrase1, col1
# x112 -- rows with phrase1 in col1 at least once
# }
weights = get_weights(col_count, args)
for i in range(term_count):
for j in range(col_count):
weight = weights[j]
if weight == 0:
continue
x = X_O + (3 * (j + i * col_count))
term_frequency = float(match_info[x]) # f(qi, D)
docs_with_term = float(match_info[x + 2]) # n(qi)
# log( (N - n(qi) + 0.5) / (n(qi) + 0.5) )
idf = math.log(
(total_docs - docs_with_term + 0.5) /
(docs_with_term + 0.5))
if idf <= 0.0:
idf = 1e-6
doc_length = float(match_info[L_O + j]) # |D|
avg_length = float(match_info[A_O + j]) or 1. # avgdl
ratio = doc_length / avg_length
num = term_frequency * (K + 1.0)
b_part = 1.0 - B + (B * ratio)
denom = term_frequency + (K * b_part)
pc_score = idf * (num / denom)
score += (pc_score * weight)
return -score
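# Reference formula, as implemented above (K = k1, B = b):
#
#   score(D, Q) = sum_i IDF(q_i) * f(q_i, D) * (k1 + 1)
#                 / (f(q_i, D) + k1 * (1 - b + b * |D| / avgdl))
#
# Like rank(), bm25() negates the score so ascending sort order ranks the
# best matches first.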
if cython_udf is not None:
rank = udf(RANK, 'fts_rank')(cython_udf.peewee_rank)
lucene = udf(RANK, 'fts_lucene')(cython_udf.peewee_lucene)
bm25 = udf(RANK, 'fts_bm25')(cython_udf.peewee_bm25)
bm25f = udf(RANK, 'fts_bm25f')(cython_udf.peewee_bm25f)
damerau_levenshtein_dist = udf(STRING)(cython_udf.damerau_levenshtein_dist)
levenshtein_dist = udf(STRING)(cython_udf.levenshtein_dist)
str_dist = udf(STRING)(cython_udf.str_dist)
median = aggregate(MATH)(cython_udf.median)
else:
rank = udf(RANK, 'fts_rank')(rank)
bm25 = udf(RANK, 'fts_bm25')(bm25)