import sys

from pyquery import PyQuery as pq

# Project-local helpers; these module names follow the companion proxy-pool
# project and are assumptions -- adjust them to your own layout.
from utils import get_page
from db import RedisClient
from setting import POOL_UPPER_THRESHOLD


class ProxyMetaclass(type):
    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        for k, v in attrs.items():
            # register every attribute whose name contains 'crawl_'
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)
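Any class built with this metaclass ends up with a __CrawlFunc__ list holding the names of its crawl_ methods and a __CrawlFuncCount__ total. A small illustrative check, not part of the original listing:

class _Demo(metaclass=ProxyMetaclass):
    def crawl_example(self):
        yield '127.0.0.1:8080'

print(_Demo.__CrawlFunc__)       # ['crawl_example']
print(_Demo.__CrawlFuncCount__)  # 1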
class Crawler(object, metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        proxies = []
        # call the crawl method whose name was passed in as a string
        for proxy in eval("self.{}()".format(callback)):
            print('Got proxy', proxy)
            proxies.append(proxy)
        return proxies
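    # Note: the eval call above simply invokes self.<callback>(). An
    # equivalent dispatch that avoids eval (an alternative, not the
    # original listing) would be:
    #     for proxy in getattr(self, callback)():
    #         ...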
                                    
    def crawl_daili66(self, page_count=4):
        """
        Get proxies from the Daili66 site.
        :param page_count: number of pages to crawl
        :return: proxies
        """
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = pq(html)
                # skip the header row, then read IP and port from each row
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ':'.join([ip, port])
    def crawl_proxy360(self):
        """
        Get proxies from Proxy360.
        :return: proxies
        """
        start_url = 'http://www.proxy360.cn/Region/China'
        print('Crawling', start_url)
        html = get_page(start_url)
        if html:
            doc = pq(html)
            lines = doc('div[name="list_proxy_ip"]').items()
            for line in lines:
                ip = line.find('.tbBottomLine:nth-child(1)').text()
                port = line.find('.tbBottomLine:nth-child(2)').text()
                yield ':'.join([ip, port])
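Because the metaclass registers every method whose name contains 'crawl_', adding a new proxy source only means writing one more generator method inside Crawler; nothing else has to change. The sketch below uses a placeholder URL and CSS selectors, not a real source:

    def crawl_example_site(self):
        # hypothetical source: URL and selectors are illustrative only
        start_url = 'http://example.com/free-proxy-list'
        html = get_page(start_url)
        if html:
            doc = pq(html)
            for row in doc('table tr:gt(0)').items():
                ip = row.find('td:nth-child(1)').text()
                port = row.find('td:nth-child(2)').text()
                yield ':'.join([ip, port])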
 
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()
    def is_over_threshold(self):
        """
        Check whether the proxy pool has reached its upper limit.
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False
    def run(self):
        print('Getter is running')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                # run one registered crawl method and store its proxies
                proxies = self.crawler.get_proxies(callback)
                sys.stdout.flush()
                for proxy in proxies:
                    self.redis.add(proxy)
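The RedisClient used by Getter only needs two methods here: count() to report the pool size and add() to store a proxy. A minimal sketch of that interface, assuming proxies live in a Redis sorted set named 'proxies' (the key name, initial score, and connection settings are assumptions, not the project's exact implementation):

import redis

class RedisClient(object):
    def __init__(self, host='localhost', port=6379):
        self.db = redis.StrictRedis(host=host, port=port, decode_responses=True)

    def add(self, proxy, score=10):
        # store a new proxy with an initial score; skip duplicates
        if not self.db.zscore('proxies', proxy):
            return self.db.zadd('proxies', {proxy: score})

    def count(self):
        # current number of proxies in the pool
        return self.db.zcard('proxies')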
 
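To keep the pool topped up, Getter.run() has to be invoked periodically. A minimal sketch of such a loop; the 20-second interval and the __main__ guard are illustrative only:

import time

if __name__ == '__main__':
    getter = Getter()
    while True:
        getter.run()
        time.sleep(20)  # pause between crawl rounds; interval is arbitrary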