@@ -0,0 +1,38 @@
+"""Recursive webcrawler example.
+
+One problem with this solution is that it does not remember
+URLs it has already seen.
+
+To add support for this, a Bloom filter or Redis sets can be used.
+
+"""
+
+from __future__ import with_statement
+
+import re
+import time
+import urlparse
+
+from celery.decorators import task
+from eventlet import Timeout
+from eventlet.green import urllib2
+
+# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
+url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
+
+def domain(url):
+ return urlparse.urlsplit(url)[1].split(":")[0]
+
+@task
+def crawl(url):
+ print("crawling: %r" % (url, ))
+ location = domain(url)
+ data = ''
+ with Timeout(5, False):
+ data = urllib2.urlopen(url).read()
+ for url_match in url_regex.finditer(data):
+ new_url = url_match.group(0)
+ # Don't destroy the internet
+ if location in domain(new_url):
+ crawl.delay(new_url)
+ time.sleep(0.3)
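The docstring above notes that the crawler does not remember URLs it has already seen and suggests a Bloom filter or Redis sets as a remedy. A minimal sketch of the Redis-set variant, assuming the redis-py client with default connection settings and an illustrative key name, could look like this:

    import redis

    # Assumed local Redis server (localhost:6379); adjust as needed.
    seen = redis.Redis()

    def already_seen(url):
        # SADD returns 1 when the member is new and 0 when it is already
        # in the set, so a zero result means the URL was crawled before.
        return seen.sadd("webcrawler:seen_urls", url) == 0

    # Inside crawl(), the task could then bail out before fetching:
    #
    #     if already_seen(url):
    #         return

The same check could also gate crawl.delay(new_url), so duplicate tasks are never queued in the first place.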