							- """Recursive webcrawler example.
 
- For asynchronous DNS lookups install the `dnspython` package:
 
-     $ pip install dnspython
 
- Requires the `pybloom` module for the bloom filter which is used
 
- to ensure a lower chance of recrawling an URL previously seen.
 
- Since the bloom filter is not shared, but only passed as an argument
 
- to each subtask, it would be much better to have this as a centralized
 
- service.  Redis sets could also be a practical solution.
 
- A BloomFilter with a capacity of 100_000 members and an error rate
 
- of 0.001 is 2.8MB pickled, but if compressed with zlib it only takes
 
- up 2.9kB(!).
 
- We don't have to do compression manually, just set the tasks compression
 
- to "zlib", and the serializer to "pickle".
 
- """
 
import re

try:
    from urllib.parse import urlsplit
except ImportError:
    from urlparse import urlsplit  # noqa

import requests

from celery import task, group
from eventlet import Timeout
from pybloom import BloomFilter
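

# As the module docstring notes, a pickled BloomFilter shrinks dramatically
# once compressed with zlib.  The helper below is a rough, illustrative way
# to check that for yourself; it is not used by the tasks in this module,
# and the name `pickled_sizes` is made up for this example.
def pickled_sizes(bloom_filter):
    """Return the (raw, zlib-compressed) byte sizes of a pickled filter."""
    import pickle
    import zlib
    raw = pickle.dumps(bloom_filter)
    return len(raw), len(zlib.compress(raw))
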
 
# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
url_regex = re.compile(
    r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')


def domain(url):
    """Return the domain part of a URL."""
    return urlsplit(url)[1].split(':')[0]
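

# The docstring suggests keeping the "seen" set in a centralized service
# (for example a Redis set) instead of shipping the bloom filter to every
# subtask.  A minimal sketch of that idea, assuming the `redis` package and
# a Redis server on localhost; `mark_seen` and the key name are made up for
# this example and are not used by `crawl` below.
def mark_seen(url, key='webcrawler:seen'):
    """Return True if `url` was not seen before, marking it as seen."""
    import redis
    client = redis.Redis()  # a real crawler would reuse a single client
    # SADD returns the number of members actually added: 1 means new.
    return bool(client.sadd(key, url))
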
 
@task(ignore_result=True, serializer='pickle', compression='zlib')
def crawl(url, seen=None):
    print('crawling: {0}'.format(url))
    if not seen:
        seen = BloomFilter(capacity=50000, error_rate=0.0001)

    response = None
    with Timeout(5, False):
        try:
            response = requests.get(url)
        except Exception:
            return
    if response is None:
        # The request timed out before completing, so there is nothing
        # to parse (the silent Timeout simply exits the block).
        return

    location = domain(url)
    wanted_urls = []
    for url_match in url_regex.finditer(response.text):
        url = url_match.group(0)
        # To not destroy the internet, we only fetch URLs on the same domain.
        if url not in seen and location in domain(url):
            wanted_urls.append(url)
            seen.add(url)

    subtasks = group(crawl.s(url, seen) for url in wanted_urls)
    subtasks()
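
# To start crawling, send the first URL to a worker, for example from a
# shell or script (assuming a worker is consuming the default queue):
#
#     crawl.delay('http://example.com')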
 
 