sqlalchemy/examples/sharding/attribute_shard.py

"""a basic example of using the SQLAlchemy Sharding API.
Sharding refers to horizontally scaling data across multiple
databases.

In this example, four sqlite databases will store information about
weather data on a database-per-continent basis.

To set up a sharding system, you need:
    1. multiple databases, each assined a 'shard id'
    2. a function which can return a single shard id, given an instance
    to be saved; this is called "shard_chooser"
    3. a function which can return a list of shard ids which apply to a particular
    instance identifier; this is called "id_chooser".  If it returns all shard ids,
    all shards will be searched.
    4. a function which can return a list of shard ids to try, given a particular
    Query ("query_chooser").  If it returns all shard ids, all shards will be
    queried and the results joined together.
"""

# step 1. imports
from sqlalchemy import *
from sqlalchemy.orm import *
from sqlalchemy.orm.shard import ShardedSession
from sqlalchemy.sql import ColumnOperators
import datetime, operator

# step 2. databases
echo = True
db1 = create_engine('sqlite:///shard1.db', echo=echo)
db2 = create_engine('sqlite:///shard2.db', echo=echo)
db3 = create_engine('sqlite:///shard3.db', echo=echo)
db4 = create_engine('sqlite:///shard4.db', echo=echo)


# step 3. create session function.  this binds the shard ids
# to databases within a ShardedSession and returns it.
def create_session():
    s = ShardedSession(shard_chooser, id_chooser, query_chooser)
    s.bind_shard('north_america', db1)
    s.bind_shard('asia', db2)
    s.bind_shard('europe', db3)
    s.bind_shard('south_america', db4)
    return s

# step 4.  table setup.
meta = MetaData()

# we need a way to create identifiers which are unique across all
# databases.  one easy way would be to just use a composite primary key, where one
# value is the shard id.  but here, we'll show something more "generic", an
# id generation function.  we'll use a simplistic "id table" stored in database
# #1.  Any other method will do just as well; UUID, hilo, application-specific, etc.

ids = Table('ids', meta,
    Column('nextid', Integer, nullable=False))

def id_generator(ctx):
    # in reality, might want to use a separate transaction for this.
    c = db1.connect()
    nextid = c.execute(ids.select(for_update=True)).scalar()
    c.execute(ids.update(values={ids.c.nextid : ids.c.nextid + 1}))
    return nextid

# table setup.  we'll store a lead table of continents/cities,
# and a secondary table storing locations.
# a particular row will be placed in the database whose shard id corresponds to the
# 'continent'.  in this setup, secondary rows in 'weather_reports' will
# be placed in the same DB as that of the parent, but this can be changed
# if you're willing to write more complex sharding functions.

weather_locations = Table("weather_locations", meta,
        Column('id', Integer, primary_key=True, default=id_generator),
        Column('continent', String(30), nullable=False),
        Column('city', String(50), nullable=False)
    )

weather_reports = Table("weather_reports", meta,
    Column('id', Integer, primary_key=True),
    Column('location_id', Integer, ForeignKey('weather_locations.id')),
    Column('temperature', Float),
    Column('report_time', DateTime, default=datetime.datetime.now),
)

# create tables
for db in (db1, db2, db3, db4):
    meta.drop_all(db)
    meta.create_all(db)

# establish initial "id" in db1
db1.execute(ids.insert(), nextid=1)


# step 5. define sharding functions.

# we'll use a straight mapping of a particular set of "country"
# attributes to shard id.
shard_lookup = {
    'North America':'north_america',
    'Asia':'asia',
    'Europe':'europe',
    'South America':'south_america'
}

# shard_chooser - looks at the given instance and returns a shard id
# note that we need to define conditions for
# the WeatherLocation class, as well as our secondary Report class which will
# point back to its WeatherLocation via its 'location' attribute.
def shard_chooser(mapper, instance):
    if isinstance(instance, WeatherLocation):
        return shard_lookup[instance.continent]
    else:
        return shard_chooser(mapper, instance.location)

# id_chooser.  given a primary key, returns a list of shards
# to search.  here, we don't have any particular information from a
# pk so we just return all shard ids. often, youd want to do some
# kind of round-robin strategy here so that requests are evenly
# distributed among DBs
def id_chooser(ident):
    return ['north_america', 'asia', 'europe', 'south_america']

# query_chooser.  this also returns a list of shard ids, which can
# just be all of them.  but here we'll search into the Query in order
# to try to narrow down the list of shards to query.
def query_chooser(query):
    ids = []

    # here we will traverse through the query's criterion, searching
    # for SQL constructs.  we'll grab continent names as we find them
    # and convert to shard ids
    class FindContinent(sql.ClauseVisitor):
        def visit_binary(self, binary):
            if binary.left is weather_locations.c.continent:
                if binary.operator == operator.eq:
                    ids.append(shard_lookup[binary.right.value])
                elif binary.operator == ColumnOperators.in_op:
                    for bind in binary.right.clauses:
                        ids.append(shard_lookup[bind.value])

    FindContinent().traverse(query._criterion)
    if len(ids) == 0:
        return ['north_america', 'asia', 'europe', 'south_america']
    else:
        return ids

# step 6.  mapped classes.
class WeatherLocation(object):
    def __init__(self, continent, city):
        self.continent = continent
        self.city = city

class Report(object):
    def __init__(self, temperature):
        self.temperature = temperature

# step 7.  mappers
mapper(WeatherLocation, weather_locations, properties={
    'reports':relation(Report, backref='location')
})

mapper(Report, weather_reports)


# save and load objects!

tokyo = WeatherLocation('Asia', 'Tokyo')
newyork = WeatherLocation('North America', 'New York')
toronto = WeatherLocation('North America', 'Toronto')
london = WeatherLocation('Europe', 'London')
dublin = WeatherLocation('Europe', 'Dublin')
brasilia = WeatherLocation('South America', 'Brasila')
quito = WeatherLocation('South America', 'Quito')

tokyo.reports.append(Report(80.0))
newyork.reports.append(Report(75))
quito.reports.append(Report(85))

sess = create_session()
for c in [tokyo, newyork, toronto, london, dublin, brasilia, quito]:
    sess.save(c)
sess.flush()

sess.clear()

t = sess.query(WeatherLocation).get(tokyo.id)
assert t.city == tokyo.city
assert t.reports[0].temperature == 80.0

north_american_cities = sess.query(WeatherLocation).filter(WeatherLocation.continent == 'North America')
assert [c.city for c in north_american_cities] == ['New York', 'Toronto']

asia_and_europe = sess.query(WeatherLocation).filter(WeatherLocation.continent.in_('Europe', 'Asia'))
assert set([c.city for c in asia_and_europe]) == set(['Tokyo', 'London', 'Dublin'])