webcrawler.py 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. """Recursive webcrawler example.
  2. For asynchronous DNS lookups install the `dnspython` package:
  3. $ pip install dnspython
  4. Requires the `pybloom` module for the bloom filter which is used
  5. to ensure a lower chance of recrawling a URL previously seen.
  6. Since the bloom filter is not shared, but only passed as an argument
  7. to each subtask, it would be much better to have this as a centralized
  8. service. Redis sets could also be a practical solution.
  9. A BloomFilter with a capacity of 100_000 members and an error rate
  10. of 0.001 is 2.8MB pickled, but if compressed with zlib it only takes
  11. up 2.9kB(!).
  12. We don't have to do the compression manually: just set the task's compression
  13. to "zlib" and the serializer to "pickle".
  14. """
  15. from __future__ import absolute_import, print_function, unicode_literals
  16. import re
  17. import requests
  18. from celery import task, group
  19. from eventlet import Timeout
  20. from pybloom import BloomFilter
  21. try:
  22. from urllib.parse import urlsplit
  23. except ImportError:
  24. from urlparse import urlsplit # noqa
  25. # http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
  26. url_regex = re.compile(
  27. r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
  28. def domain(url):
  29. """Return the domain part of a URL."""
  30. return urlsplit(url)[1].split(':')[0]
  31. @task(ignore_result=True, serializer='pickle', compression='zlib')
  32. def crawl(url, seen=None):
  33. print('crawling: {0}'.format(url))
  34. if not seen:
  35. seen = BloomFilter(capacity=50000, error_rate=0.0001)
  36. with Timeout(5, False):
  37. try:
  38. response = requests.get(url)
  39. except requests.exception.RequestError:
  40. return
  41. location = domain(url)
  42. wanted_urls = []
  43. for url_match in url_regex.finditer(response.text):
  44. url = url_match.group(0)
  45. # To not destroy the internet, we only fetch URLs on the same domain.
  46. if url not in seen and location in domain(url):
  47. wanted_urls.append(url)
  48. seen.add(url)
  49. subtasks = group(crawl.s(url, seen) for url in wanted_urls)
  50. subtasks.delay()