1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
| #coding:utf-8 #!/usr/bin/env python
from bitarray import bitarray # 3rd party import mmh3 import scrapy from BeautifulSoup import BeautifulSoup as BS import os ls = os.linesep
class BloomFilter(set):
def __init__(self, size, hash_count): super(BloomFilter, self).__init__() self.bit_array = bitarray(size) self.bit_array.setall(0) self.size = size self.hash_count = hash_count
def __len__(self): return self.size
def __iter__(self): return iter(self.bit_array)
def add(self, item): for ii in range(self.hash_count): index = mmh3.hash(item, ii) % self.size self.bit_array[index] = 1
return self
def __contains__(self, item): out = True for ii in range(self.hash_count): index = mmh3.hash(item, ii) % self.size if self.bit_array[index] == 0: out = False
return out
class DmozSpider(scrapy.Spider): name = "baidu" allowed_domains = ["baidu.com"] start_urls = [ "http://baike.baidu.com/item/%E7%BA%B3%E5%85%B0%E6%98%8E%E7%8F%A0" ]
def parse(self, response):
# fname = "/media/common/娱乐/Electronic_Design/Coding/Python/Scrapy/tutorial/tutorial/spiders/temp" # # html = response.xpath('//html').extract()[0] # fobj = open(fname, 'w') # fobj.writelines(html.encode('utf-8')) # fobj.close()
bloom = BloomFilter(1000, 10) animals = ['dog', 'cat', 'giraffe', 'fly', 'mosquito', 'horse', 'eagle', 'bird', 'bison', 'boar', 'butterfly', 'ant', 'anaconda', 'bear', 'chicken', 'dolphin', 'donkey', 'crow', 'crocodile'] # First insertion of animals into the bloom filter for animal in animals: bloom.add(animal)
# Membership existence for already inserted animals # There should not be any false negatives for animal in animals: if animal in bloom: print('{} is in bloom filter as expected'.format(animal)) else: print('Something is terribly went wrong for {}'.format(animal)) print('FALSE NEGATIVE!')
# Membership existence for not inserted animals # There could be false positives other_animals = ['badger', 'cow', 'pig', 'sheep', 'bee', 'wolf', 'fox', 'whale', 'shark', 'fish', 'turkey', 'duck', 'dove', 'deer', 'elephant', 'frog', 'falcon', 'goat', 'gorilla', 'hawk'] for other_animal in other_animals: if other_animal in bloom: print('{} is not in the bloom, but a false positive'.format(other_animal)) else: print('{} is not in the bloom filter as expected'.format(other_animal))
|