"""celery.backends.cassandra"""
import functools
import itertools
import random
import socket
import time

from datetime import datetime

try:
    import pycassa
    from thrift import Thrift
    C = pycassa.cassandra.ttypes
except ImportError:
    pycassa = None

from celery import states
from celery.backends.base import BaseDictBackend
from celery.exceptions import ImproperlyConfigured
from celery.utils.serialization import pickle
from celery.utils.timeutils import maybe_timedelta


  18. class CassandraBackend(BaseDictBackend):
  19. """Highly fault tolerant Cassandra backend.
  20. .. attribute:: servers
  21. List of Cassandra servers with format: "hostname:port".
  22. :raises celery.exceptions.ImproperlyConfigured: if
  23. module :mod:`pycassa` is not available.
  24. """
  25. servers = []
  26. keyspace = None
  27. column_family = None
  28. _retry_timeout = 300
  29. _retry_wait = 3
  30. _index_shards = 64
  31. _index_keys = ["celery.results.index!%02x" % i
  32. for i in range(_index_shards)]
  33. def __init__(self, servers=None, keyspace=None, column_family=None,
  34. cassandra_options=None, **kwargs):
  35. """Initialize Cassandra backend.
  36. Raises :class:`celery.exceptions.ImproperlyConfigured` if
  37. the :setting:`CASSANDRA_SERVERS` setting is not set.
  38. """
  39. super(CassandraBackend, self).__init__(**kwargs)
  40. self.logger = self.app.log.setup_logger(
  41. name="celery.backends.cassandra")
  42. self.result_expires = kwargs.get("result_expires") or \
  43. maybe_timedelta(
  44. self.app.conf.CELERY_TASK_RESULT_EXPIRES)
  45. if not pycassa:
  46. raise ImproperlyConfigured(
  47. "You need to install the pycassa library to use the "
  48. "Cassandra backend. See https://github.com/pycassa/pycassa")
  49. self.servers = servers or \
  50. self.app.conf.get("CASSANDRA_SERVERS", self.servers)
  51. self.keyspace = keyspace or \
  52. self.app.conf.get("CASSANDRA_KEYSPACE",
  53. self.keyspace)
  54. self.column_family = column_family or \
  55. self.app.conf.get("CASSANDRA_COLUMN_FAMILY",
  56. self.column_family)
  57. self.cassandra_options = dict(cassandra_options or {},
  58. **self.app.conf.get("CASSANDRA_OPTIONS",
  59. {}))
  60. read_cons = self.app.conf.get("CASSANDRA_READ_CONSISTENCY",
  61. "LOCAL_QUORUM")
  62. write_cons = self.app.conf.get("CASSANDRA_WRITE_CONSISTENCY",
  63. "LOCAL_QUORUM")
  64. try:
  65. self.read_consistency = getattr(pycassa.ConsistencyLevel,
  66. read_cons)
  67. except AttributeError:
  68. self.read_consistency = pycassa.ConsistencyLevel.LOCAL_QUORUM
  69. try:
  70. self.write_consistency = getattr(pycassa.ConsistencyLevel,
  71. write_cons)
  72. except AttributeError:
  73. self.write_consistency = pycassa.ConsistencyLevel.LOCAL_QUORUM
  74. if not self.servers or not self.keyspace or not self.column_family:
  75. raise ImproperlyConfigured(
  76. "Cassandra backend not configured.")
  77. self._column_family = None
  78. def _retry_on_error(self, func):
  79. def wrapper(*args, **kwargs):
  80. self = args[0]
  81. ts = time.time() + self._retry_timeout
  82. while 1:
  83. try:
  84. return func(*args, **kwargs)
  85. except (pycassa.InvalidRequestException,
  86. pycassa.TimedOutException,
  87. pycassa.UnavailableException,
  88. socket.error,
  89. socket.timeout,
  90. Thrift.TException), exc:
  91. self.logger.warn('Cassandra error: %s. Retrying...' % exc)
  92. if time.time() > ts:
  93. raise
  94. time.sleep(self._retry_wait)
  95. return wrapper
  96. def _get_column_family(self):
  97. if self._column_family is None:
  98. conn = pycassa.connect(self.keyspace, servers=self.servers,
  99. **self.cassandra_options)
  100. self._column_family = \
  101. pycassa.ColumnFamily(conn, self.column_family,
  102. read_consistency_level=self.read_consistency,
  103. write_consistency_level=self.write_consistency)
  104. return self._column_family
  105. def process_cleanup(self):
  106. if self._column_family is not None:
  107. self._column_family = None
  108. @_retry_on_error
  109. def _store_result(self, task_id, result, status, traceback=None):
  110. """Store return value and status of an executed task."""
  111. cf = self._get_column_family()
  112. date_done = datetime.utcnow()
  113. index_key = 'celery.results.index!%02x' % (
  114. random.randrange(self._index_shards))
  115. index_column_name = '%8x!%s' % (time.mktime(date_done.timetuple()),
  116. task_id)
  117. meta = {"status": status,
  118. "result": pickle.dumps(result),
  119. "date_done": date_done.strftime('%Y-%m-%dT%H:%M:%SZ'),
  120. "traceback": pickle.dumps(traceback)}
  121. cf.insert(task_id, meta)
  122. cf.insert(index_key, {index_column_name: status})
  123. @_retry_on_error
  124. def _get_task_meta_for(self, task_id):
  125. """Get task metadata for a task by id."""
  126. cf = self._get_column_family()
  127. try:
  128. obj = cf.get(task_id)
  129. meta = {
  130. "task_id": task_id,
  131. "status": obj["status"],
  132. "result": pickle.loads(str(obj["result"])),
  133. "date_done": obj["date_done"],
  134. "traceback": pickle.loads(str(obj["traceback"])),
  135. }
  136. except (KeyError, pycassa.NotFoundException):
  137. meta = {"status": states.PENDING, "result": None}
  138. return meta
  139. def cleanup(self):
  140. """Delete expired metadata."""
  141. self.logger.debug('Running cleanup...')
  142. expires = datetime.utcnow() - self.result_expires
  143. end_column = '%8x"' % (time.mktime(expires.timetuple()))
  144. cf = self._get_column_family()
  145. column_parent = C.ColumnParent(cf.column_family)
  146. slice_pred = C.SlicePredicate(
  147. slice_range=C.SliceRange('', end_column,
  148. count=2 ** 30))
  149. columns = cf.client.multiget_slice(cf.keyspace, self._index_keys,
  150. column_parent, slice_pred,
  151. self.read_consistency)
  152. index_cols = [c.column.name
  153. for c in itertools.chain(*columns.values())]
  154. for k in self._index_keys:
  155. cf.remove(k, index_cols)
  156. task_ids = [c[9:] for c in index_cols]
  157. for k in task_ids:
  158. cf.remove(k)
  159. self.logger.debug('Cleaned %i expired results' % len(task_ids))