Python+Scrapy Crawler Framework: The Middleware File Explained

<code># -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# ===========================Spider Middleware============================
# Definition: a framework of hooks sitting between the Scrapy engine and the
# spiders; its job is to process the responses fed into the spiders and the
# requests/items coming out of them.
# Scrapy already ships with several ready-to-use spider middlewares, defined
# in SPIDER_MIDDLEWARES_BASE:
# {
#     'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
#     'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
#     'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
#     'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
#     'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
# }
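
# To enable the custom spider middleware defined below, merge it into
# SPIDER_MIDDLEWARES in settings.py; these priorities are merged with
# SPIDER_MIDDLEWARES_BASE, and setting a built-in entry to None disables it.
# A minimal sketch, assuming the project module is named maoyan:
#
# SPIDER_MIDDLEWARES = {
#     'maoyan.middlewares.MaoyanSpiderMiddleware': 543,
#     'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,  # disable a built-in
# }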

# =================The SpiderMiddleware class==================
class MaoyanSpiderMiddleware(object):
    # Class method; through the crawler argument you can read the global
    # settings, e.g. crawler.settings.get(name)
    @classmethod
    def from_crawler(cls, crawler):
        """
        :param crawler: gives access to the global settings via crawler.settings.get(name)
        """
        s = cls()
        # Connect the spider_opened handler to the spider_opened signal. This
        # signal is typically used to allocate resources when the spider starts.
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)

        # Connecting spider_closed the same way would release the resources
        # allocated in spider_opened; the hook is left disabled here.
        # crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
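        # A sketch of reading a global setting here; USER_AGENT is a standard
        # Scrapy setting, but storing it on the middleware instance is only
        # this sketch's convention:
        # s.user_agent = crawler.settings.get('USER_AGENT')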
        return s

    # Called for each response that passes through the spider middleware on
    # its way into the spider
    def process_spider_input(self, response, spider):
        """
        :param response: the response being processed
        :param spider: the spider this response is intended for
        """
        # Should return None (to continue processing) or raise an exception
        return None

    # Called with the results the spider returns after processing a response
    def process_spider_output(self, response, result, spider):
        """
        :param response: the response the spider has just processed
        :param result: an iterable of Request and/or item objects returned by the spider
        :param spider: the spider whose results are being processed
        """
        # Must return an iterable of Request and/or item objects
        for i in result:
            yield i
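
        # Instead of passing everything through, this hook can filter the
        # spider's output; a sketch that drops requests to a hypothetical
        # blocked host (requires: from scrapy import Request):
        # for i in result:
        #     if isinstance(i, Request) and 'blocked.example.com' in i.url:
        #         continue
        #     yield i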

    # Called when a spider callback, or the process_spider_input() method of
    # another spider middleware, raises an exception
    def process_spider_exception(self, response, exception, spider):
        """
        :param response: the response being processed when the exception was raised
        :param exception: the exception that was raised
        :param spider: the spider that raised the exception
        """
        # Should return either None or an iterable of Request and/or item objects
        pass

    # Called with the spider's start requests; works like
    # process_spider_output, except that it has no associated response
    def process_start_requests(self, start_requests, spider):
        """
        :param start_requests: an iterable of the spider's start requests
        :param spider: the spider the start requests belong to
        """
        # Must yield only Request objects (never items)
        for r in start_requests:
            yield r
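
        # A sketch of a common variation: tag every start request so later
        # callbacks can recognise it (the meta key 'is_start' is hypothetical):
        # for r in start_requests:
        #     r.meta['is_start'] = True
        #     yield r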


    # Called when the spider is opened, i.e. crawling begins; a good place to
    # allocate the spider's resources
    def spider_opened(self, spider):
        """
        :param spider: the spider that was opened
        """
        spider.logger.info('Spider opened: %s' % spider.name)


    # # Called when the spider is closed; releases the resources allocated
    # # in spider_opened.
    # def spider_closed(self, spider):
    #     """
    #     :param spider: the spider that was closed
    #     """
    #     spider.logger.info('Spider closed: %s' % spider.name)




# ======================Downloader Middleware========================
# Definition: a framework of hooks between the Scrapy engine and the
# downloader that processes the requests and responses passing between them
# (see the Scrapy architecture diagram).
# Typical uses: rotating the User-Agent, handling redirects, setting proxies,
# retrying failed requests, setting cookies, and so on.
# Scrapy already ships with several ready-to-use downloader middlewares,
# defined in DOWNLOADER_MIDDLEWARES_BASE:
# {
#     'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
#     'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
#     'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
#     'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 400,
#     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 500,
#     'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
#     'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
#     'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
#     'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
#     'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
#     'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
#     'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
#     'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
# }
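
# As with spider middleware, the custom downloader middleware below is
# enabled through DOWNLOADER_MIDDLEWARES in settings.py; a minimal sketch,
# again assuming the project module is named maoyan:
#
# DOWNLOADER_MIDDLEWARES = {
#     'maoyan.middlewares.MaoyanDownloaderMiddleware': 543,
# }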


# ===============The DownloaderMiddleware class=================
class MaoyanDownloaderMiddleware(object):
    # Class method; through the crawler argument you can read the global
    # settings, e.g. crawler.settings.get(name)
    @classmethod
    def from_crawler(cls, crawler):
        """
        :param crawler: gives access to the global settings via crawler.settings.get(name)
        """
        s = cls()
        # Connect the spider_opened handler to the spider_opened signal. This
        # signal is typically used to allocate resources when the spider starts.
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)

        # Connecting spider_closed the same way would release the resources
        # allocated in spider_opened; the hook is left disabled here.
        # crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    # Called for each request pulled from the scheduler, before it is handed
    # to the downloader
    def process_request(self, request, spider):
        """
        :param request: the request scheduled for download
        :param spider: the spider that issued this request
        """
        # Must return None (to continue processing), a Response object,
        # a Request object, or raise IgnoreRequest
        return None
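
    # A sketch of the typical uses named above: rotating the User-Agent and
    # routing through a proxy. The user_agents list and the proxy address are
    # assumptions, not part of the original file (needs `import random`):
    # def process_request(self, request, spider):
    #     request.headers['User-Agent'] = random.choice(self.user_agents)
    #     request.meta['proxy'] = 'http://127.0.0.1:8888'
    #     return None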

    # Called with the response returned by the downloader, before it is passed
    # on to the spider
    def process_response(self, request, response, spider):
        """
        :param request: the request that produced this response
        :param response: the response returned by the downloader
        :param spider: the spider this response is intended for
        """
        # Must return a Response object, a Request object, or raise IgnoreRequest
        return response
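
    # A sketch of using this hook to retry blocked pages: returning a Request
    # re-schedules it instead of sending the response to the spider (treating
    # HTTP 403 as a block is an assumption about the target site):
    # def process_response(self, request, response, spider):
    #     if response.status == 403:
    #         return request.replace(dont_filter=True)  # retry, bypassing the dupe filter
    #     return response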

    # Called when a download handler, or the process_request() method of
    # another downloader middleware, raises an exception
    def process_exception(self, request, exception, spider):
        """
        :param request: the request that caused the exception
        :param exception: the exception that was raised
        :param spider: the spider whose request caused the exception
        """
        # Should return None, a Response object, or a Request object
        pass
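
    # A sketch of recovering from a download error by switching to a backup
    # proxy and retrying; the proxy address is hypothetical:
    # def process_exception(self, request, exception, spider):
    #     from twisted.internet.error import TimeoutError
    #     if isinstance(exception, TimeoutError):
    #         request.meta['proxy'] = 'http://backup-proxy:8888'
    #         return request  # re-schedule the request with the new proxy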

    # Called when the spider is opened, i.e. crawling begins; a good place to
    # allocate the spider's resources
    def spider_opened(self, spider):
        """
        :param spider: the spider that was opened
        """
        spider.logger.info('Spider opened: %s' % spider.name)


    # # Called when the spider is closed; releases the resources allocated
    # # in spider_opened.
    # def spider_closed(self, spider):
    #     """
    #     :param spider: the spider that was closed
    #     """
    #     spider.logger.info('Spider closed: %s' % spider.name)</code>

