webcrawler.py

  1. """Recursive webcrawler example.
  2. One problem with this solution is that it does not remember
  3. urls it has already seen.
  4. For asynchronous DNS lookups install the `dnspython` package:
  5. $ pip install dnspython
  6. If the `pybloom` module is installed it will use a Bloom Filter
  7. to ensure a lower chance of recrawling an URL it has already seen.
  8. Since the bloom filter is not shared, but only passed as an argument
  9. to each subtask, it would be much better to have this as a centralized
  10. service.
  11. A BloomFilter with a capacity of 100_000 members and an error rate
  12. of 0.001 is 2.8MB pickled, but if compressed with zlib it only takes
  13. up 2.9kB(!).
  14. We don't have to do compression manually, just set the tasks compression
  15. to "zlib", and the serializer to "pickle".
  16. """
from __future__ import with_statement

import re
import time
import urlparse

from celery.decorators import task
from celery.task.sets import TaskSet

from eventlet import Timeout
from eventlet.green import urllib2
try:
    from pybloom import BloomFilter
except ImportError:

    # Dummy object used if pybloom is not installed.
    class BloomFilter(object):

        def __init__(self, **kwargs):
            pass

        def add(self, member):
            pass

        def __contains__(self, member):
            return False
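

# A small illustration (not part of the original example) of the Bloom
# filter semantics the docstring relies on: a URL that has been added is
# always reported as seen (no false negatives), while an occasional false
# positive only means an unseen URL gets skipped.  With the dummy fallback
# above, membership tests are always False and every URL looks unseen.
def _bloom_filter_demo():
    seen = BloomFilter(capacity=1000, error_rate=0.001)
    seen.add("http://example.com/")
    # True with pybloom installed, False with the dummy fallback.
    return "http://example.com/" in seen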


# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
url_regex = re.compile(
    r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')


def domain(url):
    """Return the domain part of a URL."""
    return urlparse.urlsplit(url)[1].split(":")[0]
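

# The docstring's size claim can be sanity-checked with a sketch like the
# one below.  It is illustrative only: the helper name is made up for this
# example and the exact sizes depend on the installed pybloom version (or
# on the dummy fallback, which pickles to almost nothing).
def bloom_payload_sizes(capacity=100000, error_rate=0.001):
    """Return (pickled, pickled + zlib-compressed) sizes in bytes."""
    import pickle
    import zlib
    pickled = pickle.dumps(BloomFilter(capacity=capacity,
                                       error_rate=error_rate),
                           pickle.HIGHEST_PROTOCOL)
    return len(pickled), len(zlib.compress(pickled))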


@task(ignore_result=True, serializer="pickle", compression="zlib")
def crawl(url, seen=None):
    print("crawling: %r" % (url, ))
    if not seen:
        seen = BloomFilter(capacity=50000, error_rate=0.0001)

    data = None
    with Timeout(5, False):
        try:
            data = urllib2.urlopen(url).read()
        except (urllib2.HTTPError, IOError):
            return
    if data is None:
        # The request timed out; Timeout(5, False) swallows the exception,
        # so bail out explicitly instead of failing on `data` below.
        return

    location = domain(url)
    wanted_urls = []
    for url_match in url_regex.finditer(data):
        url = url_match.group(0)
        # To not destroy the internet, we only fetch URLs on the same domain.
        if url not in seen and location in domain(url):
            wanted_urls.append(url)
            seen.add(url)

    subtasks = TaskSet(crawl.subtask((url, seen)) for url in wanted_urls)
    subtasks.apply_async()
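

if __name__ == "__main__":
    # Hedged usage sketch, not part of the original example: enqueue the
    # first crawl.  A broker and an eventlet-based worker are assumed to be
    # running already (for this Celery version something along the lines of
    # ``celeryd --pool=eventlet --concurrency=500``).
    crawl.delay("http://example.com/")  # example.com is a placeholder URL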