- """Recursive webcrawler example.
- One problem with this solution is that it does not remember
- urls it has already seen.
- For asynchronous DNS lookups install the `dnspython` package:
- $ pip install dnspython
- If the `pybloom` module is installed it will use a Bloom Filter
- to ensure a lower chance of recrawling an URL it has already seen.
- Since the bloom filter is not shared, but only passed as an argument
- to each subtask, it would be much better to have this as a centralized
- service.
- A BloomFilter with a capacity of 100_000 members and an error rate
- of 0.001 is 2.8MB pickled, but if compressed with zlib it only takes
- up 2.9kB(!).
- We don't have to do compression manually, just set the tasks compression
- to "zlib", and the serializer to "pickle".
- """
from __future__ import with_statement

import re
import time
import urlparse

from celery.decorators import task
from celery.task.sets import TaskSet

from eventlet import Timeout
from eventlet.green import urllib2

try:
    from pybloom import BloomFilter
except ImportError:
    # Dummy object used if pybloom is not installed.
    class BloomFilter(object):

        def __init__(self, **kwargs):
            pass

        def add(self, member):
            pass

        def __contains__(self, member):
            return False

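
# Illustrative sketch, not part of the original example: one way to check the
# pickled vs. zlib-compressed sizes quoted in the module docstring.  The
# helper name `_bloom_filter_sizes` is our own addition; it assumes `pybloom`
# is installed (with the dummy class above it only measures the stub), and
# the exact numbers vary with the pybloom version and pickle protocol.
def _bloom_filter_sizes(capacity=100000, error_rate=0.001):
    import pickle
    import zlib

    bfilter = BloomFilter(capacity=capacity, error_rate=error_rate)
    pickled = pickle.dumps(bfilter, pickle.HIGHEST_PROTOCOL)
    return len(pickled), len(zlib.compress(pickled))
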
# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
url_regex = re.compile(
    r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')

def domain(url):
    """Return the domain part of a URL."""
    return urlparse.urlsplit(url)[1].split(":")[0]

@task(ignore_result=True, serializer="pickle", compression="zlib")
def crawl(url, seen=None):
    print("crawling: %r" % (url, ))
    if not seen:
        seen = BloomFilter(capacity=50000, error_rate=0.0001)

    data = None
    with Timeout(5, False):
        try:
            data = urllib2.urlopen(url).read()
        except (urllib2.HTTPError, IOError):
            return
    if data is None:
        # The request timed out before a response arrived
        # (Timeout(5, False) silently exits the block).
        return

    location = domain(url)
    wanted_urls = []
    for url_match in url_regex.finditer(data):
        url = url_match.group(0)
        # To not destroy the internet, we only fetch URLs on the same domain.
        if url not in seen and location in domain(url):
            wanted_urls.append(url)
            seen.add(url)

    subtasks = TaskSet(crawl.subtask((url, seen)) for url in wanted_urls)
    subtasks.apply_async()
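
# A minimal usage sketch, assuming a configured Celery worker is running and
# can import this module (the module name and URL below are placeholders):
#
#     >>> from webcrawler import crawl
#     >>> crawl.delay("http://example.com/")
#
# Each crawled page then dispatches a TaskSet of new `crawl` subtasks, one
# per same-domain URL found, passing the Bloom filter along.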