webcrawler.py

  1. """Recursive webcrawler example.
  2. One problem with this solution is that it does not remember
  3. urls it has already seen.
  4. To add support for this a bloom filter or redis sets can be used.
  5. """
from __future__ import with_statement

import re
import time
import urlparse

from celery.decorators import task
from eventlet import Timeout
from eventlet.green import urllib2
# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
url_regex = re.compile(
    r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
def domain(url):
    """Return the host part of a URL, without the port."""
    return urlparse.urlsplit(url)[1].split(":")[0]
@task
def crawl(url):
    print("crawling: %r" % (url, ))
    location = domain(url)
    data = ''
    with Timeout(5, False):
        # If the request takes longer than 5 seconds, give up silently
        # and leave `data` empty (exception=False means no error is raised).
        data = urllib2.urlopen(url).read()
    for url_match in url_regex.finditer(data):
        new_url = url_match.group(0)
        # Don't destroy the internet: only follow links on the same
        # domain, and pause between dispatches.
        if location in domain(new_url):
            crawl.delay(new_url)
            time.sleep(0.3)
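
# Below is a minimal sketch of the deduplication idea mentioned in the
# module docstring, using a Redis set.  It assumes a Redis server on
# localhost and the `redis` client package; the key name
# "webcrawler:seen" and the helper `already_seen` are made up for
# illustration and are not part of the original example.

import redis

seen = redis.Redis()


def already_seen(url):
    """Return True if `url` was dispatched before, marking it as seen."""
    # SADD returns the number of members actually added to the set, so it
    # is 1 the first time a URL is seen and 0 on every later attempt.
    return seen.sadd("webcrawler:seen", url) == 0

# `crawl` could then check `already_seen(new_url)` before calling
# `crawl.delay(new_url)` and skip URLs that have already been dispatched;
# because the set lives in Redis, it is shared by all workers.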