scrapy基于CrawlSpider实现爬取西刺代理并验证是否能用

逃离我推掉我的手 2022-05-31 00:42 249阅读 0赞

基于scrapy框架的CrawlSpider类实现跟进爬取并利用xpath匹配出每一页的代理ip与端口号保存在txt文档中。  
因为西刺代理页面过多，并且靠后的基本没用，所以我们只爬取前9页。  
spider文件如下：

# -*- coding: utf-8 -*-
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from proIP.items import ProipItem
    import requests
    class CrawlipSpider(CrawlSpider):
        """Crawl xicidaili proxy list pages and yield the proxies that work.

        Starting from ``/nn/1``, the rule follows links to ``/nn/`` and
        ``/nn/1`` .. ``/nn/9``. Every row of the ``ip_list`` table is turned
        into a candidate proxy, which is checked by fetching Baidu through it;
        only candidates answering with HTTP 200 are yielded as ``ProipItem``.
        """

        name = 'crawlIP'
        allowed_domains = ['xicidaili.com']
        start_urls = ['http://www.xicidaili.com/nn/1']
        # Restrict followed links with a regex; dots are escaped so they only
        # match a literal '.' in the host name.
        rules = (
            Rule(LinkExtractor(allow=r'^http://www\.xicidaili\.com/nn/[1-9]?$'), callback='parse_item', follow=True),
        )

        def parse_item(self, response):
            """Extract proxies from one list page and yield the usable ones.

            Args:
                response: Scrapy response for a proxy list page.

            Yields:
                ProipItem: one item per working proxy, with ``url`` set to
                ``<type>://<ip>:<port>`` (type exactly as shown on the page).
            """
            rows = '//table[@id="ip_list"]//tr[@class="odd" or @class=""]'
            # Column 2: IP address, column 3: port, column 6: protocol type.
            ip_list = response.xpath(rows + '/td[2]/text()').extract()
            port_list = response.xpath(rows + '/td[3]/text()').extract()
            type_list = response.xpath(rows + '/td[6]/text()').extract()
            print(response.url)
            for ip, port, proto in zip(ip_list, port_list, type_list):
                # requests looks proxies up by lowercase URL scheme and needs
                # a full '<scheme>://host:port' proxy URL. Registering the
                # candidate for both schemes guarantees the probe below is
                # really sent through it.
                proxy_url = 'http://' + ip + ':' + port
                proxies = {'http': proxy_url, 'https': proxy_url}
                try:
                    # Fetch Baidu through the proxy; a 200 status means the
                    # proxy is usable.
                    probe = requests.get('http://www.baidu.com', proxies=proxies, timeout=2)
                except requests.RequestException:
                    # Only network/HTTP failures count as a dead proxy; the
                    # yield stays outside the try so GeneratorExit and
                    # KeyboardInterrupt are never swallowed.
                    print('fail %s' % ip)
                    continue
                if probe.status_code == 200:
                    print('success %s' % ip)
                    item = ProipItem()
                    item['url'] = proto + '://' + ip + ':' + port
                    # Hand the working proxy over to the item pipeline.
                    yield item

item文件如下：

# -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class ProipItem(scrapy.Item):
        """Container for one scraped proxy, exposing a single ``url`` field."""

        # Proxy address string stored by the spider and written by the pipeline.
        url = scrapy.Field()

值得注意的是对请求头（尤其是User-Agent）的设置，在settings文件中将默认请求头修改如下：

# Default headers for the crawl, written to resemble a desktop Chrome browser
# visiting www.xicidaili.com.
DEFAULT_REQUEST_HEADERS = {
        'Host': 'www.xicidaili.com',
        # Browser-like User-Agent string (Chrome 60 on Windows 10).
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Referer': 'http://www.xicidaili.com/',
        # Session cookie is intentionally left disabled; NOTE(review): presumably
        # only needed if the site starts requiring a session - confirm before enabling.
        # 'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJTdjNDI5MzI0MjI2NzZhOTI3MmI5ZmRiYzQxMWRjNjZkBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUNYK3dac04xaXExRXhwYXlSZ251NHpTYmFuV011OEhHbjVBM09DOFl1WGs9BjsARg%3D%3D--106494228e4547863ecf9d88d308c67aecd3a08b',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    # Encoding used by Scrapy feed exports (UTF-8 keeps non-ASCII text readable).
    FEED_EXPORT_ENCODING = 'utf-8'

Pipeline文件如下：

# -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    class ProipPipeline(object):
        """Item pipeline that appends every item's ``url`` to ``ip02.txt``.

        The output file is opened in append mode (UTF-8) when the pipeline is
        constructed and closed when the spider finishes.
        """

        def __init__(self):
            """Open ``ip02.txt`` for appending and keep the handle on ``self.file``."""
            self.file = open('ip02.txt', mode='a', encoding='utf-8')

        def process_item(self, item, spider):
            """Write ``item['url']`` followed by a newline and pass the item on.

            Args:
                item: mapping-like item providing a string ``url`` entry.
                spider: the spider that produced the item (unused).

            Returns:
                The same ``item`` object, unchanged.
            """
            line = ''.join((item['url'], '\n'))
            self.file.write(line)
            return item

        def close_spider(self, spider):
            """Close the output file once the spider is done."""
            self.file.close()