webcrawler.py

  1. """Recursive webcrawler example.
  2. One problem with this solution is that it does not remember
  3. urls it has already seen.
  4. To add support for this a bloom filter or redis sets can be used.
  5. """
  6. from __future__ import with_statement
  7. import re
  8. import time
  9. import urlparse
  10. from celery.decorators import task
  11. from eventlet import Timeout
  12. from eventlet.green import urllib2
  13. # http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
  14. url_regex = re.compile(
  15. r'\b(([\w-]+://?|www[.])[^\s()<>]+ (?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
  16. def domain(url):
  17. return urlparse.urlsplit(url)[1].split(":")[0]
  18. @task
  19. def crawl(url):
  20. print("crawling: %r" % (url, ))
  21. location = domain(url)
  22. data = ''
  23. with Timeout(5, False):
  24. data = urllib2.urlopen(url).read()
  25. for url_match in url_regex.finditer(data):
  26. new_url = url_match.group(0)
  27. # Don't destroy the internet
  28. if location in domain(new_url):
  29. crawl.delay(new_url)
  30. time.sleep(0.3)
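
As the docstring notes, the crawler can revisit URLs it has already seen.
Below is a minimal sketch of the Redis-set approach, assuming the redis-py
client is installed and a Redis server is reachable on localhost; the key
name "webcrawler:seen" and the helper maybe_crawl() are illustrative names,
not part of the example above.

import redis

seen = redis.Redis()


def maybe_crawl(url):
    # SADD returns 1 only the first time a member is added, so a single
    # round trip both records the URL and tells us whether it is new.
    if seen.sadd("webcrawler:seen", url):
        crawl.delay(url)

With this in place, the loop in crawl() would call maybe_crawl(new_url)
instead of crawl.delay(new_url), so already-seen URLs are skipped.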