Eventlet webcrawler example now uses a bloom filter to lower the chance of recrawling a URL it has already crawled

Ask Solem 14 years ago
commit edf8efa652
1 changed file with 56 additions and 12 deletions

+ 56 - 12
examples/eventlet/webcrawler.py

@@ -3,7 +3,24 @@
 One problem with this solution is that it does not remember
 urls it has already seen.
 
-To add support for this a bloom filter or redis sets can be used.
+For asynchronous DNS lookups, install the `dnspython` package:
+
+    $ pip install dnspython
+
+If the `pybloom` module is installed, it will use a Bloom filter
+to reduce the chance of recrawling a URL it has already seen.
+
+Since the Bloom filter is not shared, but only passed as an argument
+to each subtask, it would be much better to have this as a centralized
+service, e.g. using Redis sets (a sketch of that follows the diff below).
+
+A BloomFilter with a capacity of 100,000 members and an error rate
+of 0.001 is 2.8MB pickled, but if compressed with zlib it only takes
+up 2.9kB(!); a sketch verifying this follows the diff below.
+
+We don't have to do the compression manually: just set the task's
+compression attribute to "zlib" and its serializer to "pickle".
+
 
 """
 
@@ -14,28 +31,55 @@ import time
 import urlparse
 
 from celery.decorators import task
+from celery.task.sets import TaskSet
 from eventlet import Timeout
 from eventlet.green import urllib2
 
+try:
+    from pybloom import BloomFilter
+except ImportError:
+    # Dummy object used if pybloom is not installed.
+    class BloomFilter(object):
+
+        def __init__(self, **kwargs):
+            pass
+
+        def add(self, member):
+            pass
+
+        def __contains__(self, member):
+            return False
+
 # http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
 url_regex = re.compile(
-    r'\b(([\w-]+://?|www[.])[^\s()<>]+ (?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
+    r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
 
 
 def domain(url):
+    """Returns the domain part of an URL."""
     return urlparse.urlsplit(url)[1].split(":")[0]
 
 
-@task
-def crawl(url):
+@task(ignore_result=True, serializer="pickle", compression="zlib")
+def crawl(url, seen=None):
     print("crawling: %r" % (url, ))
-    location = domain(url)
-    data = ''
+    if seen is None:
+        # `if not seen` would discard a passed-in filter, since
+        # pybloom filters with no members test false via len().
+        seen = BloomFilter(capacity=50000, error_rate=0.0001)
+
+    data = None
     with Timeout(5, False):
-        data = urllib2.urlopen(url).read()
+        try:
+            data = urllib2.urlopen(url).read()
+        except (urllib2.HTTPError, IOError):
+            return
+
+    if data is None:
+        # Timeout(5, False) exits the block silently when the five
+        # seconds are up, so `data` may never have been assigned.
+        return
+
+    location = domain(url)
+    wanted_urls = []
     for url_match in url_regex.finditer(data):
-        new_url = url_match.group(0)
-        # Don't destroy the internet
-        if location in domain(new_url):
-            crawl.delay(new_url)
-            time.sleep(0.3)
+        new_url = url_match.group(0)
+        # So as not to destroy the internet, we only follow URLs on
+        # the same domain, and skip any URL we have already seen.
+        if new_url not in seen and location in domain(new_url):
+            wanted_urls.append(new_url)
+            seen.add(new_url)
+
+    subtasks = TaskSet(crawl.subtask((new_url, seen))
+                            for new_url in wanted_urls)
+    subtasks.apply_async()
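
The 2.8MB/2.9kB figures in the new docstring are easy to check. Below
is a minimal sketch, assuming `pybloom` is installed and that its
filters pickle cleanly (something the task's "pickle" serializer
already depends on); exact numbers will vary with how full the
filter is:

    import pickle
    import zlib

    from pybloom import BloomFilter

    # A fresh filter's bit array is almost all zeros, which is
    # exactly why zlib compresses it so dramatically.
    bfilter = BloomFilter(capacity=100000, error_rate=0.001)

    pickled = pickle.dumps(bfilter, protocol=pickle.HIGHEST_PROTOCOL)
    compressed = zlib.compress(pickled)

    print("pickled:    %d bytes" % (len(pickled), ))
    print("compressed: %d bytes" % (len(compressed), ))

A filter near its capacity has a much denser bit array and compresses
far less, so 2.9kB should be read as the best case.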
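
The docstring also suggests centralizing the seen-set instead of
shipping a filter with every subtask; the line it replaces mentioned
Redis sets as one option. A minimal sketch of that variant, assuming
a local Redis server and the redis-py package (the key name
"webcrawler:seen" is made up for this example):

    import redis

    connection = redis.Redis()  # assumes redis://localhost:6379/0

    def first_time_seen(url):
        """Marks `url` as seen and returns True only the first time.

        SADD reports the number of members actually added, so it
        returns 1 for a new URL and 0 for one already in the set.
        """
        return connection.sadd("webcrawler:seen", url) == 1

With this, `crawl` would call `first_time_seen(new_url)` instead of
consulting the `seen` argument, and the pickle/zlib transport settings
would no longer be needed, at the cost of one Redis round-trip per
candidate URL.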
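
Finally, starting a crawl only requires sending off the first task: no
`seen` argument is passed, so the worker creates a fresh Bloom filter
for the root URL (this assumes the example module is importable as
`webcrawler` on the worker):

    >>> from webcrawler import crawl
    >>> crawl.delay("http://example.com/")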