"""Recursive webcrawler example.

For asynchronous DNS lookups install the `dnspython` package:

    $ pip install dnspython

Requires the `pybloom` module for the bloom filter which is used
to ensure a lower chance of recrawling a URL previously seen.

Since the bloom filter is not shared, but only passed as an argument
to each subtask, it would be much better to have this as a centralized
service.  Redis sets could also be a practical solution.

A BloomFilter with a capacity of 100_000 members and an error rate
of 0.001 is 2.8MB pickled, but if compressed with zlib it only takes
up 2.9kB(!).

We don't have to do compression manually, just set the tasks compression
to "zlib", and the serializer to "pickle".
"""
import re

try:
    from urllib.parse import urlsplit
except ImportError:
    from urlparse import urlsplit  # noqa

import requests
from celery import task, group
from eventlet import Timeout
from pybloom import BloomFilter

# ASCII punctuation expressed as character-class ranges.  Python's `re`
# has no POSIX `[:punct:]` class, so it has to be spelled out.
_PUNCT = r'!-/:-@\[-`{-~'

# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
url_regex = re.compile(
    r'\b(([\w-]+://?|www[.])[^\s()<>]+'
    r'(?:\([\w\d]+\)|([^\s' + _PUNCT + r']|/)))'
)


def domain(url):
    """Return the domain part of a URL.

    The host name is returned lower-cased, without any port or
    ``user:password@`` prefix.  An empty string is returned when the
    URL has no network location (for example when it lacks a scheme).

    :raises ValueError: if `urlsplit` rejects the URL as malformed.
    """
    return urlsplit(url).hostname or ''


def _same_site(host, location):
    """Return True if *host* is *location* itself or a subdomain of it."""
    if not host or not location:
        return False
    # A plain substring test would also accept unrelated hosts such as
    # "notexample.com" or "example.com.evil.org".
    return host == location or host.endswith('.' + location)


# NOTE: the pickle serializer is needed to ship the bloom filter along
# with the task, so the broker must only carry trusted messages.
@task(ignore_result=True, serializer='pickle', compression='zlib')
def crawl(url, seen=None):
    """Fetch *url* and schedule a crawl of every unseen same-site link.

    :param url: Address of the page to fetch.
    :param seen: Bloom filter of the URLs already scheduled.  A new one
        is created when omitted; the filter is handed to every subtask.

    Returns nothing.  The task gives up quietly if the page cannot be
    fetched or the fetch does not finish within five seconds.
    """
    print('crawling: {0}'.format(url))
    if seen is None:
        seen = BloomFilter(capacity=50000, error_rate=0.0001)

    response = None
    # Timeout(5, False) is silent: on expiry the `with` block is simply
    # left, which is why `response` must be checked afterwards.
    with Timeout(5, False):
        try:
            response = requests.get(url)
        except Exception:
            return
    if response is None:
        return

    location = domain(url)
    wanted_urls = []
    for match in url_regex.finditer(response.text):
        found = match.group(0)
        if found in seen:
            continue
        try:
            host = domain(found)
        except ValueError:
            # Malformed link scraped from the page; skip it rather than
            # failing the whole task.
            continue
        # To not destroy the internet, we only fetch URLs on the same domain.
        if _same_site(host, location):
            wanted_urls.append(found)
            seen.add(found)

    subtasks = group(crawl.s(link, seen) for link in wanted_urls)
    subtasks()