							- """Recursive webcrawler example.
 
- For asynchronous DNS lookups install the `dnspython` package:
 
-     $ pip install dnspython
 
- Requires the `pybloom` module for the bloom filter which is used
 
- to ensure a lower chance of recrawling an URL previously seen.
 
- Since the bloom filter is not shared, but only passed as an argument
 
- to each subtask, it would be much better to have this as a centralized
 
- service.  Redis sets could also be a practical solution.
 
- A BloomFilter with a capacity of 100_000 members and an error rate
 
- of 0.001 is 2.8MB pickled, but if compressed with zlib it only takes
 
- up 2.9kB(!).
 
- We don't have to do compression manually, just set the tasks compression
 
- to "zlib", and the serializer to "pickle".
 
- """
 
import re
import time
import urlparse

from celery import task, group
from eventlet import Timeout
from eventlet.green import urllib2

from pybloom import BloomFilter

# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
url_regex = re.compile(
    r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
 

def domain(url):
    """Return the domain part of a URL."""
    return urlparse.urlsplit(url)[1].split(':')[0]
 

@task(ignore_result=True, serializer='pickle', compression='zlib')
def crawl(url, seen=None):
    print('crawling: {0}'.format(url))
    if not seen:
        seen = BloomFilter(capacity=50000, error_rate=0.0001)

    # `Timeout(5, False)` suppresses the timeout exception, so `data` stays
    # None if the request takes longer than five seconds.
    data = None
    with Timeout(5, False):
        try:
            data = urllib2.urlopen(url).read()
        except (urllib2.HTTPError, IOError):
            return
    if data is None:
        return

    location = domain(url)
    wanted_urls = []
    for url_match in url_regex.finditer(data):
        url = url_match.group(0)
        # To not destroy the internet, we only fetch URLs on the same domain.
        if url not in seen and location in domain(url):
            wanted_urls.append(url)
            seen.add(url)

    subtasks = group(crawl.s(url, seen) for url in wanted_urls)
    subtasks.apply_async()
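
# Example of kicking off a crawl from another process (illustrative only;
# it assumes this file is importable as `webcrawler` and that a broker and
# an eventlet-enabled worker are already running):
#
#     >>> from webcrawler import crawl
#     >>> crawl.delay('http://example.com/')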
 
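
# Illustrative sketch, not part of the original example: the "centralized
# service" idea from the module docstring, keeping the set of seen URLs in
# a shared Redis set instead of a per-task bloom filter.  It assumes the
# `redis` package is installed and a Redis server is reachable with default
# settings; the task name `crawl_shared`, the `redis_client` variable, and
# the key 'webcrawler.seen' are made up for this sketch.
import redis

redis_client = redis.StrictRedis()


@task(ignore_result=True)
def crawl_shared(url):
    data = None
    with Timeout(5, False):
        try:
            data = urllib2.urlopen(url).read()
        except (urllib2.HTTPError, IOError):
            return
    if data is None:
        return

    location = domain(url)
    for url_match in url_regex.finditer(data):
        found = url_match.group(0)
        if location not in domain(found):
            continue
        # SADD returns 1 only when the member was not already in the set,
        # so it doubles as an atomic "have we seen this URL?" check.
        if redis_client.sadd('webcrawler.seen', found):
            crawl_shared.delay(found)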
 
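
# Rough, optional check of the size figures quoted in the module docstring:
# pickle an empty bloom filter and compare the raw and zlib-compressed
# sizes.  Exact numbers depend on the pybloom version, so treat the figures
# above as ballpark values.
if __name__ == '__main__':
    import pickle
    import zlib

    bf = BloomFilter(capacity=100000, error_rate=0.001)
    pickled = pickle.dumps(bf, protocol=pickle.HIGHEST_PROTOCOL)
    print('pickled: {0} bytes, zlib-compressed: {1} bytes'.format(
        len(pickled), len(zlib.compress(pickled))))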