@@ -0,0 +1,38 @@
+"""Recursive webcrawler example.
+
+One problem with this solution is that it does not remember
+URLs it has already seen.
+
+To add support for this, a Bloom filter or Redis sets can be used.
+
+"""
+
+from __future__ import with_statement
+
+import re
+import time
+import urlparse
+
+from celery.decorators import task
+from eventlet import Timeout
+from eventlet.green import urllib2
+
+# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
+url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
+
+def domain(url):
+ return urlparse.urlsplit(url)[1].split(":")[0]
+
+@task
+def crawl(url):
+ print("crawling: %r" % (url, ))
+ location = domain(url)
+ data = ''
+ with Timeout(5, False):
+ data = urllib2.urlopen(url).read()
+ for url_match in url_regex.finditer(data):
+ new_url = url_match.group(0)
+ # Don't destroy the internet
+ if location in domain(new_url):
+ crawl.delay(new_url)
+ time.sleep(0.3)
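The docstring above notes that the crawler does not remember URLs it has already seen and suggests a Bloom filter or Redis sets as a remedy. A minimal sketch of the Redis-set variant, assuming the redis-py client with default connection settings and an illustrative key name, could look like this:

    import redis

    # Assumed local Redis server (localhost:6379); adjust as needed.
    seen = redis.Redis()

    def already_seen(url):
        # SADD returns 1 when the member is new and 0 when it is already
        # in the set, so a zero result means the URL was crawled before.
        return seen.sadd("webcrawler:seen_urls", url) == 0

    # Inside crawl(), the task could then bail out before fetching:
    #
    #     if already_seen(url):
    #         return

The same check could also gate crawl.delay(new_url), so duplicate tasks are never queued in the first place.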