webcrawler.py

  1. """Recursive webcrawler example.
  2. One problem with this solution is that it does not remember
  3. urls it has already seen.
  4. To add support for this a bloom filter or redis sets can be used.
  5. """
  6. from __future__ import with_statement
  7. import re
  8. import time
  9. import urlparse
  10. from celery.decorators import task
  11. from eventlet import Timeout
  12. from eventlet.green import urllib2
  13. # http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
  14. url_regex = re.compile(
  15. r'\b(([\w-]+://?|www[.])[^\s()<>]+ (?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
  16. def domain(url):
  17. return urlparse.urlsplit(url)[1].split(":")[0]
  18. @task
  19. def crawl(url):
  20. print("crawling: %r" % (url, ))
  21. location = domain(url)
  22. data = ''
  23. with Timeout(5, False):
  24. data = urllib2.urlopen(url).read()
  25. for url_match in url_regex.finditer(data):
  26. new_url = url_match.group(0)
  27. # Don't destroy the internet
  28. if location in domain(new_url):
  29. crawl.delay(new_url)
  30. time.sleep(0.3)
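
As the docstring notes, the crawler can revisit URLs it has already seen.
Below is a minimal sketch of the Redis-set approach, assuming the redis-py
client is installed and a Redis server is reachable on localhost; the key
name "webcrawler:seen" and the helper maybe_crawl() are illustrative names,
not part of the example above.

import redis

seen = redis.Redis()


def maybe_crawl(url):
    # SADD returns 1 only the first time a member is added, so a single
    # round trip both records the URL and tells us whether it is new.
    if seen.sadd("webcrawler:seen", url):
        crawl.delay(url)

With this in place, the loop in crawl() would call maybe_crawl(new_url)
instead of crawl.delay(new_url), so already-seen URLs are skipped.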