# gossip.py
  1. """Worker <-> Worker communication Bootstep."""
  2. from __future__ import absolute_import, unicode_literals
  3. from collections import defaultdict
  4. from functools import partial
  5. from heapq import heappush
  6. from operator import itemgetter
  7. from kombu import Consumer
  8. from kombu.asynchronous.semaphore import DummyLock
  9. from celery import bootsteps
  10. from celery.five import values
  11. from celery.utils.log import get_logger
  12. from celery.utils.objects import Bunch
  13. from .mingle import Mingle
  14. __all__ = ['Gossip']
  15. logger = get_logger(__name__)
  16. debug, info = logger.debug, logger.info
  17. class Gossip(bootsteps.ConsumerStep):
  18. """Bootstep consuming events from other workers.
  19. This keeps the logical clock value up to date.
  20. """
  21. label = 'Gossip'
  22. requires = (Mingle,)
  23. _cons_stamp_fields = itemgetter(
  24. 'id', 'clock', 'hostname', 'pid', 'topic', 'action', 'cver',
  25. )
  26. compatible_transports = {'amqp', 'redis'}
  27. def __init__(self, c, without_gossip=False,
  28. interval=5.0, heartbeat_interval=2.0, **kwargs):
  29. self.enabled = not without_gossip and self.compatible_transport(c.app)
  30. self.app = c.app
  31. c.gossip = self
  32. self.Receiver = c.app.events.Receiver
  33. self.hostname = c.hostname
  34. self.full_hostname = '.'.join([self.hostname, str(c.pid)])
  35. self.on = Bunch(
  36. node_join=set(),
  37. node_leave=set(),
  38. node_lost=set(),
  39. )
  40. self.timer = c.timer
  41. if self.enabled:
  42. self.state = c.app.events.State(
  43. on_node_join=self.on_node_join,
  44. on_node_leave=self.on_node_leave,
  45. max_tasks_in_memory=1,
  46. )
  47. if c.hub:
  48. c._mutex = DummyLock()
  49. self.update_state = self.state.event
  50. self.interval = interval
  51. self.heartbeat_interval = heartbeat_interval
  52. self._tref = None
  53. self.consensus_requests = defaultdict(list)
  54. self.consensus_replies = {}
  55. self.event_handlers = {
  56. 'worker.elect': self.on_elect,
  57. 'worker.elect.ack': self.on_elect_ack,
  58. }
  59. self.clock = c.app.clock
  60. self.election_handlers = {
  61. 'task': self.call_task
  62. }
  63. super(Gossip, self).__init__(c, **kwargs)
  64. def compatible_transport(self, app):
  65. with app.connection_for_read() as conn:
  66. return conn.transport.driver_type in self.compatible_transports
  67. def election(self, id, topic, action=None):
  68. self.consensus_replies[id] = []
  69. self.dispatcher.send(
  70. 'worker-elect',
  71. id=id, topic=topic, action=action, cver=1,
  72. )
  73. def call_task(self, task):
  74. try:
  75. self.app.signature(task).apply_async()
  76. except Exception as exc: # pylint: disable=broad-except
  77. logger.exception('Could not call task: %r', exc)
  78. def on_elect(self, event):
  79. try:
  80. (id_, clock, hostname, pid,
  81. topic, action, _) = self._cons_stamp_fields(event)
  82. except KeyError as exc:
  83. return logger.exception('election request missing field %s', exc)
  84. heappush(
  85. self.consensus_requests[id_],
  86. (clock, '%s.%s' % (hostname, pid), topic, action),
  87. )
  88. self.dispatcher.send('worker-elect-ack', id=id_)
  89. def start(self, c):
  90. super(Gossip, self).start(c)
  91. self.dispatcher = c.event_dispatcher
  92. def on_elect_ack(self, event):
  93. id = event['id']
  94. try:
  95. replies = self.consensus_replies[id]
  96. except KeyError:
  97. return # not for us
  98. alive_workers = set(self.state.alive_workers())
  99. replies.append(event['hostname'])
  100. if len(replies) >= len(alive_workers):
  101. _, leader, topic, action = self.clock.sort_heap(
  102. self.consensus_requests[id],
  103. )
  104. if leader == self.full_hostname:
  105. info('I won the election %r', id)
  106. try:
  107. handler = self.election_handlers[topic]
  108. except KeyError:
  109. logger.exception('Unknown election topic %r', topic)
  110. else:
  111. handler(action)
  112. else:
  113. info('node %s elected for %r', leader, id)
  114. self.consensus_requests.pop(id, None)
  115. self.consensus_replies.pop(id, None)
  116. def on_node_join(self, worker):
  117. debug('%s joined the party', worker.hostname)
  118. self._call_handlers(self.on.node_join, worker)
  119. def on_node_leave(self, worker):
  120. debug('%s left', worker.hostname)
  121. self._call_handlers(self.on.node_leave, worker)
  122. def on_node_lost(self, worker):
  123. info('missed heartbeat from %s', worker.hostname)
  124. self._call_handlers(self.on.node_lost, worker)
  125. def _call_handlers(self, handlers, *args, **kwargs):
  126. for handler in handlers:
  127. try:
  128. handler(*args, **kwargs)
  129. except Exception as exc: # pylint: disable=broad-except
  130. logger.exception(
  131. 'Ignored error from handler %r: %r', handler, exc)
  132. def register_timer(self):
  133. if self._tref is not None:
  134. self._tref.cancel()
  135. self._tref = self.timer.call_repeatedly(self.interval, self.periodic)
  136. def periodic(self):
  137. workers = self.state.workers
  138. dirty = set()
  139. for worker in values(workers):
  140. if not worker.alive:
  141. dirty.add(worker)
  142. self.on_node_lost(worker)
  143. for worker in dirty:
  144. workers.pop(worker.hostname, None)
  145. def get_consumers(self, channel):
  146. self.register_timer()
  147. ev = self.Receiver(channel, routing_key='worker.#',
  148. queue_ttl=self.heartbeat_interval)
  149. return [Consumer(
  150. channel,
  151. queues=[ev.queue],
  152. on_message=partial(self.on_message, ev.event_from_message),
  153. no_ack=True
  154. )]
  155. def on_message(self, prepare, message):
  156. _type = message.delivery_info['routing_key']
  157. # For redis when `fanout_patterns=False` (See Issue #1882)
  158. if _type.split('.', 1)[0] == 'task':
  159. return
  160. try:
  161. handler = self.event_handlers[_type]
  162. except KeyError:
  163. pass
  164. else:
  165. return handler(message.payload)
  166. # proto2: hostname in header; proto1: in body
  167. hostname = (message.headers.get('hostname') or
  168. message.payload['hostname'])
  169. if hostname != self.hostname:
  170. _, event = prepare(message.payload)
  171. self.update_state(event)
  172. else:
  173. self.clock.forward()