# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


# =========================== Spider Middleware ============================
# Sits between the Scrapy engine and the spider; it processes the responses
# fed into the spider and the requests/items the spider produces.
#
# Scrapy already ships several ready-to-use spider middlewares, declared in
# SPIDER_MIDDLEWARES_BASE:
# {
#     'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
#     'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
#     'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
#     'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
#     'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
# }
class MaoyanSpiderMiddleware(object):
    """Spider middleware: hooks into spider input/output processing."""

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook.

        :param crawler: gives access to project settings via
            ``crawler.settings.get(name)`` and to the signal bus.
        """
        s = cls()
        # Connect spider_opened so we are notified when crawling starts;
        # this signal is typically used to allocate per-spider resources.
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        # Connect spider_closed to release resources acquired in spider_opened.
        # crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def process_spider_input(self, response, spider):
        """Called for each response going through the middleware into the spider.

        :param response: the response being processed
        :param spider: the spider this response is intended for
        :return: ``None`` to continue processing this response
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Called with the results returned from the spider after it has
        processed the response.

        :param response: the response that produced ``result``
        :param result: iterable of Request and/or Item objects
        :param spider: the spider that produced ``result``
        :return: must yield Request and/or Item objects
        """
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Called when a spider or ``process_spider_input`` raises an exception.

        :param response: the response being processed when the exception was raised
        :param exception: the exception that was raised
        :param spider: the spider that raised the exception
        """
        pass

    def process_start_requests(self, start_requests, spider):
        """Called with the spider's start requests; works like
        ``process_spider_output`` except it has no associated response.

        :param start_requests: iterable of the spider's start Requests
        :param spider: the spider that produced the start requests
        :return: must yield only Request objects
        """
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Signal handler: the spider has been opened and crawling begins.

        :param spider: the spider that was opened
        """
        spider.logger.info('Spider opened: %s' % spider.name)

    # # Signal handler: the spider was closed; release resources acquired
    # # in spider_opened.
    # def spider_closed(self, spider):
    #     """
    #     :param spider: the spider that was closed
    #     """
    #     spider.logger.info('Spider closed: %s' % spider.name)


# ====================== Downloader Middleware ========================
# Sits between the Scrapy engine and the downloader; it processes requests
# on their way to the downloader and responses on their way back.
# Typical uses: rotating User-Agent, handling redirects, setting proxies,
# retrying failures, managing cookies, etc.
#
# Scrapy already ships several ready-to-use downloader middlewares, declared
# in DOWNLOADER_MIDDLEWARES_BASE:
# {
#     'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
#     'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
#     'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
#     'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
#     'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
#     'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
#     'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
#     'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
#     'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
#     'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
#     'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
#     'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
#     'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
#     'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
# }
class MaoyanDownloaderMiddleware(object):
    """Downloader middleware: hooks into request/response processing."""

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook.

        :param crawler: gives access to project settings via
            ``crawler.settings.get(name)`` and to the signal bus.
        """
        s = cls()
        # Connect spider_opened so we are notified when crawling starts;
        # this signal is typically used to allocate per-spider resources.
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        # Connect spider_closed to release resources acquired in spider_opened.
        # crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def process_request(self, request, spider):
        """Called for each request scheduled through this middleware on its
        way to the downloader.

        :param request: the request dispatched by the scheduler
        :param spider: the spider that generated the request
        :return: ``None`` to continue processing this request
        """
        return None

    def process_response(self, request, response, spider):
        """Called with the response produced by the downloader for ``request``.

        :param request: the request that originated the response
        :param response: the response returned by the downloader
        :param spider: the spider the response is intended for
        :return: the (possibly modified) response object
        """
        return response

    def process_exception(self, request, exception, spider):
        """Called when the downloader or ``process_request`` raises an exception.

        :param request: the request being processed when the exception was raised
        :param exception: the exception that was raised
        :param spider: the spider whose request raised the exception
        """
        pass

    def spider_opened(self, spider):
        """Signal handler: the spider has been opened and crawling begins.

        :param spider: the spider that was opened
        """
        spider.logger.info('Spider opened: %s' % spider.name)

    # # Signal handler: the spider was closed; release resources acquired
    # # in spider_opened.
    # def spider_closed(self, spider):
    #     """
    #     :param spider: the spider that was closed
    #     """
    #     spider.logger.info('Spider closed: %s' % spider.name)