cassandra.py 6.5 KB

  1. """celery.backends.cassandra"""
  2. try:
  3. import pycassa
  4. from thrift import Thrift
  5. C = __import__('cassandra').ttypes # FIXME Namespace kludge
  6. except ImportError:
  7. pycassa = None
  8. import itertools
  9. import random
  10. import socket
  11. import time
  12. from datetime import datetime
  13. from celery.backends.base import BaseDictBackend
  14. from celery import conf
  15. from celery.exceptions import ImproperlyConfigured
  16. from celery.loaders import load_settings
  17. from celery.log import setup_logger
  18. from celery.serialization import pickle
  19. from celery import states


class CassandraBackend(BaseDictBackend):
    """Highly fault tolerant Cassandra backend.

    .. attribute:: servers

        List of Cassandra servers with format: "hostname:port".

    :raises celery.exceptions.ImproperlyConfigured: if
        module :mod:`pycassa` is not available.

    """
    servers = []
    keyspace = None
    column_family = None
    _retry_timeout = 300
    _retry_wait = 3
    _index_shards = 64
    _index_keys = ["celery.results.index!%02x" % i
                   for i in range(_index_shards)]

    def __init__(self, servers=None, keyspace=None, column_family=None,
            cassandra_options=None, **kwargs):
        """Initialize Cassandra backend.

        Raises :class:`celery.exceptions.ImproperlyConfigured` if
        the :setting:`CASSANDRA_SERVERS` setting is not set.

        """
        self.logger = setup_logger("celery.backends.cassandra")

        self.result_expires = kwargs.get("result_expires") or \
                                conf.TASK_RESULT_EXPIRES

        if not pycassa:
            raise ImproperlyConfigured(
                "You need to install the pycassa library to use the "
                "Cassandra backend. See http://github.com/vomjom/pycassa")

        settings = load_settings()
        self.servers = servers or \
                getattr(settings, "CASSANDRA_SERVERS", self.servers)
        self.keyspace = keyspace or \
                getattr(settings, "CASSANDRA_KEYSPACE", self.keyspace)
        self.column_family = column_family or \
                getattr(settings, "CASSANDRA_COLUMN_FAMILY",
                        self.column_family)
        self.cassandra_options = dict(cassandra_options or {},
                **getattr(settings, "CASSANDRA_OPTIONS", {}))

        if not self.servers or not self.keyspace or not self.column_family:
            raise ImproperlyConfigured("Cassandra backend not configured.")

        super(CassandraBackend, self).__init__()
        self._column_family = None
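
    # Decorator: retries the wrapped call on transient Cassandra, Thrift and
    # socket errors, sleeping ``_retry_wait`` seconds between attempts and
    # re-raising once ``_retry_timeout`` seconds have elapsed.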
    def _retry_on_error(func):

        def wrapper(*args, **kwargs):
            self = args[0]
            ts = time.time() + self._retry_timeout
            while 1:
                try:
                    return func(*args, **kwargs)
                except (pycassa.InvalidRequestException,
                        pycassa.NoServerAvailable,
                        pycassa.TimedOutException,
                        pycassa.UnavailableException,
                        socket.error,
                        socket.timeout,
                        Thrift.TException), exc:
                    self.logger.warn('Cassandra error: %s. Retrying...' % exc)
                    if time.time() > ts:
                        raise
                    time.sleep(self._retry_wait)

        return wrapper
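
    # The connection and ColumnFamily handle are created lazily and cached
    # on the instance; ``process_cleanup()`` drops the cached handle so the
    # next operation opens a fresh connection.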
    def _get_column_family(self):
        if self._column_family is None:
            conn = pycassa.connect(self.servers,
                                   **self.cassandra_options)
            self._column_family = pycassa.ColumnFamily(conn, self.keyspace,
                    self.column_family,
                    read_consistency_level=pycassa.ConsistencyLevel.DCQUORUM,
                    write_consistency_level=pycassa.ConsistencyLevel.DCQUORUM)
        return self._column_family

    def process_cleanup(self):
        if self._column_family is not None:
            self._column_family = None
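
    # Each result is stored as its own row keyed by task_id.  A second
    # insert records the task under one of ``_index_shards`` index rows,
    # using a column name of the form "<8-char hex timestamp>!<task_id>",
    # which lets ``cleanup()`` find expired results by time range.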
    @_retry_on_error
    def _store_result(self, task_id, result, status, traceback=None):
        """Store return value and status of an executed task."""
        cf = self._get_column_family()
        date_done = datetime.utcnow()
        index_key = 'celery.results.index!%02x' % (
                random.randrange(self._index_shards))
        index_column_name = '%8x!%s' % (time.mktime(date_done.timetuple()),
                                        task_id)
        meta = {"status": status,
                "result": pickle.dumps(result),
                "date_done": date_done.strftime('%Y-%m-%dT%H:%M:%SZ'),
                "traceback": pickle.dumps(traceback)}
        cf.insert(task_id, meta)
        cf.insert(index_key, {index_column_name: status})

    @_retry_on_error
    def _get_task_meta_for(self, task_id):
        """Get task metadata for a task by id."""
        cf = self._get_column_family()
        try:
            obj = cf.get(task_id)
            meta = {
                "task_id": task_id,
                "status": obj["status"],
                "result": pickle.loads(str(obj["result"])),
                "date_done": obj["date_done"],
                "traceback": pickle.loads(str(obj["traceback"])),
            }
        except (KeyError, pycassa.NotFoundException):
            meta = {"status": states.PENDING, "result": None}
        return meta
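
    # Expiry: slice every index shard for columns whose timestamp prefix
    # sorts before the cut-off, remove those index columns, then remove the
    # task rows they point to (the task_id follows the 9-character
    # "<timestamp>!" prefix of each index column name).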
    def cleanup(self):
        """Delete expired metadata."""
        self.logger.debug('Running cleanup...')
        expires = datetime.utcnow() - self.result_expires
        end_column = '%8x"' % (time.mktime(expires.timetuple()))

        cf = self._get_column_family()
        column_parent = C.ColumnParent(cf.column_family)
        slice_pred = C.SlicePredicate(
                slice_range=C.SliceRange('', end_column, count=2 ** 30))
        columns = cf.client.multiget_slice(cf.keyspace, self._index_keys,
                                           column_parent, slice_pred,
                                           pycassa.ConsistencyLevel.DCQUORUM)

        index_cols = [c.column.name
                        for c in itertools.chain(*columns.values())]
        for k in self._index_keys:
            cf.remove(k, index_cols)

        task_ids = [c[9:] for c in index_cols]
        for k in task_ids:
            cf.remove(k)

        self.logger.debug('Cleaned %i expired results' % len(task_ids))
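

# Example configuration (a sketch; the setting names come from __init__
# above, while the values are placeholders, not project defaults):
#
#     CASSANDRA_SERVERS = ["localhost:9160"]
#     CASSANDRA_KEYSPACE = "celery"
#     CASSANDRA_COLUMN_FAMILY = "task_results"
#     CASSANDRA_OPTIONS = {}    # extra keyword arguments for pycassa.connect()
#
# The backend can also be constructed directly, e.g.
# CassandraBackend(servers=["localhost:9160"], keyspace="celery",
#                  column_family="task_results").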