現在越來越多的人在工作中使用到爬蟲,各個網站的反爬蟲機制也越來越嚴格,下面就自己構建一個代理ip池。
無私分享全套Python爬蟲乾貨,如果你也想學習Python,@ 私信小編獲取。
一.手動更新ip池
1.1 在settings.py配置文件中新增ip池
# Static pool of proxies, each entry a dict with a single "ipaddr" key holding
# "host:port". The typographic quotes of the original listing are replaced
# with plain ASCII quotes so the literal is valid Python.
IPPOOL = [
    {"ipaddr": "61.129.70.131:8080"},
    {"ipaddr": "61.152.81.193:9100"},
    {"ipaddr": "120.204.85.29:3128"},
    {"ipaddr": "219.228.126.86:8123"},
    # Same address as the second entry; kept as in the original listing,
    # which makes it twice as likely to be drawn by a uniform random pick.
    {"ipaddr": "61.152.81.193:9100"},
    {"ipaddr": "218.82.33.225:53853"},
    {"ipaddr": "223.167.190.17:42789"},
]
1.2修改middlewares.py文件
<code>import random
from scrapy import signals
from youx.settings import IPPOOL
class MyproxiesSpiderMiddleware(object):
    """Downloader middleware that assigns a random proxy from IPPOOL to each request."""

    def __init__(self, ip=''):
        """Store *ip* on the instance.

        The value is not read anywhere else in this class.
        """
        self.ip = ip

    def process_request(self, request, spider):
        """Pick a random IPPOOL entry and set it as the request's proxy.

        Writes ``request.meta["proxy"]`` as ``"http://<host:port>"`` and prints
        the chosen address. Returns ``None`` implicitly.
        """
        thisip = random.choice(IPPOOL)
        print("this is ip:" + thisip["ipaddr"])
        request.meta["proxy"] = "http://" + thisip["ipaddr"]
</code>
1.3 在settings.py裡面配置DOWNLOADER_MIDDLEWARES
# Downloader middleware configuration: maps a dotted class path to its order
# value, or to None to switch that middleware off.
DOWNLOADER_MIDDLEWARES = {
    # 'youx.middlewares.MyCustomDownloaderMiddleware': 543,
    # Disable Scrapy's built-in proxy middleware. Scrapy >= 1.0 ships it under
    # ``scrapy.downloadermiddlewares``; the pre-1.0 ``scrapy.contrib...`` path
    # no longer matches anything on current releases.
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    'youx.middlewares.MyproxiesSpiderMiddleware': 125,
}
二.直接在middlewares.py文件裡面添加ip池
2.1middlewares文件裡面代碼
import base64
import random
from scrapy import signals
# Proxy pool used by the middleware below. Each entry has:
#   'ip_port'   - proxy address; the listed values carry no ":port" suffix
#                 (NOTE(review): confirm whether a port should be appended)
#   'user_pass' - "user:password" credentials, or '' when none are needed
# Typographic quotes from the original listing are replaced with ASCII quotes.
PROXIES = [
    {'ip_port': '61.160.233.8', 'user_pass': ''},
    {'ip_port': '125.93.149.186', 'user_pass': ''},
    {'ip_port': '58.38.86.181', 'user_pass': ''},
    {'ip_port': '119.142.86.110', 'user_pass': ''},
    {'ip_port': '124.161.16.89', 'user_pass': ''},
    # Same address as the first entry; kept as in the original listing.
    {'ip_port': '61.160.233.8', 'user_pass': ''},
    {'ip_port': '101.94.131.237', 'user_pass': ''},
    {'ip_port': '219.157.162.97', 'user_pass': ''},
    {'ip_port': '61.152.89.18', 'user_pass': ''},
    {'ip_port': '139.224.132.192', 'user_pass': ''},
]
class ProxyMiddleware(object):
    """Downloader middleware that assigns a random proxy from PROXIES to each request."""

    @staticmethod
    def _basic_auth(user_pass):
        """Return the ``Proxy-Authorization`` value for a ``user:password`` string."""
        # b64encode works on bytes and adds no trailing newline, unlike the
        # removed base64.encodestring.
        token = base64.b64encode(user_pass.encode('utf-8')).decode('ascii')
        return 'Basic ' + token

    def process_request(self, request, spider):
        """Pick a random PROXIES entry and apply it to *request*.

        Always sets ``request.meta['proxy']``. The ``Proxy-Authorization``
        header is added only when the entry has non-empty credentials.
        Returns ``None`` implicitly.
        """
        proxy = random.choice(PROXIES)
        request.meta['proxy'] = "http://%s" % proxy['ip_port']
        # Truthiness test: '' (and None) mean "no credentials". The previous
        # ``is not None`` check was true for '' and sent an empty auth header.
        if proxy['user_pass']:
            request.headers['Proxy-Authorization'] = self._basic_auth(proxy['user_pass'])
2.2 settings.py裡面代碼
# Downloader middleware configuration: maps a dotted class path to its order
# value, or to None to switch that middleware off.
DOWNLOADER_MIDDLEWARES = {
    # 'youx.middlewares.MyCustomDownloaderMiddleware': 543,
    'youx.middlewares.ProxyMiddleware': 700,
    # Disable Scrapy's built-in proxy middleware. Scrapy >= 1.0 ships it under
    # ``scrapy.downloadermiddlewares``; the pre-1.0 ``scrapy.contrib...`` path
    # no longer matches anything on current releases.
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
}
三.自動更新IP池
3.1這裡寫個自動獲取IP的類proxies.py,執行一下把獲取的IP保存到txt文件中去:
<code># *-* coding:utf-8 *-*
import requests
from bs4 import BeautifulSoup
import lxml
from multiprocessing import Process, Queue
import random
import json
import time
import requests
class Proxies(object):
    """Collect proxy addresses from xicidaili.com listings and verify them.

    Constructing an instance immediately scrapes the ``nt`` and ``nn``
    listings (network I/O) and fills ``self.proxies`` with strings of the form
    ``"<protocol>://<ip>:<port>"``. ``verify_proxies`` then replaces that list
    with the subset that answered a test request.
    """

    def __init__(self, page=3):
        """Scrape *page* consecutive listing pages from each of the two sections."""
        self.proxies = []
        # Not read anywhere in this class.
        self.verify_pro = []
        self.page = page
        self.headers = {
            'Accept': '*/*',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }
        self.get_proxies()
        self.get_proxies_nn()

    def _scrape_listing(self, section):
        """Append proxies from ``self.page`` consecutive pages of *section* to ``self.proxies``.

        The first page number is drawn at random from 1..10.
        """
        first_page = random.randint(1, 10)
        for page in range(first_page, first_page + self.page):
            url = 'http://www.xicidaili.com/%s/%d' % (section, page)
            # A timeout keeps one stalled connection from hanging the constructor.
            html = requests.get(url, headers=self.headers, timeout=10).content
            soup = BeautifulSoup(html, 'lxml')
            ip_list = soup.find(id='ip_list')
            if ip_list is None:
                # No listing table on this page (e.g. an error or block page).
                continue
            # Only rows carrying the 'odd' class are read, as in the original.
            for odd in ip_list.find_all(class_='odd'):
                cells = odd.find_all('td')
                if len(cells) < 6:
                    continue
                # Cell 5 holds the protocol name, cells 1 and 2 the ip and port.
                protocol = cells[5].get_text().lower() + '://'
                self.proxies.append(protocol + ':'.join(x.get_text() for x in cells[1:3]))

    def get_proxies(self):
        """Scrape the ``nt`` listing into ``self.proxies``."""
        self._scrape_listing('nt')

    def get_proxies_nn(self):
        """Scrape the ``nn`` listing into ``self.proxies``."""
        self._scrape_listing('nn')

    def verify_proxies(self):
        """Check every collected proxy in 15 worker processes.

        On return ``self.proxies`` holds only the proxies that passed
        ``verify_one_proxy``.
        """
        # Function-scope import: multiprocessing queues raise queue.Empty.
        from queue import Empty

        # Proxies waiting to be verified.
        old_queue = Queue()
        # Proxies that passed verification, plus one None marker per worker.
        new_queue = Queue()
        print('verify proxy........')
        works = [
            Process(target=self._verify_worker, args=(old_queue, new_queue))
            for _ in range(15)
        ]
        for work in works:
            work.start()
        for proxy in self.proxies:
            old_queue.put(proxy)
        # One stop marker (0) per worker.
        for _ in works:
            old_queue.put(0)

        # Drain results BEFORE joining: a child that still has buffered queue
        # items does not exit until they are consumed, so joining first can
        # deadlock.
        verified = []
        pending = len(works)
        while pending:
            try:
                item = new_queue.get(timeout=1)
            except Empty:
                # A worker that died without sending its marker must not hang us.
                if not any(work.is_alive() for work in works):
                    break
                continue
            if item is None:
                pending -= 1
            else:
                verified.append(item)
        for work in works:
            work.join()
        self.proxies = verified
        print('verify_proxies done!')

    def _verify_worker(self, old_queue, new_queue):
        """Process entry point: run ``verify_one_proxy``, then signal completion with None."""
        try:
            self.verify_one_proxy(old_queue, new_queue)
        finally:
            new_queue.put(None)

    def verify_one_proxy(self, old_queue, new_queue):
        """Consume proxies from *old_queue* until the stop marker ``0`` is read.

        Each proxy that returns HTTP 200 for a test request within 2 seconds
        is put on *new_queue*.
        """
        while True:
            proxy = old_queue.get()
            if proxy == 0:
                break
            protocol = 'https' if proxy.startswith('https') else 'http'
            proxies = {protocol: proxy}
            # requests selects a proxy by the scheme of the requested URL, so
            # the test URL must use the same scheme or the proxy is bypassed.
            test_url = '%s://www.baidu.com' % protocol
            try:
                if requests.get(test_url, proxies=proxies, timeout=2).status_code == 200:
                    print('success %s' % proxy)
                    new_queue.put(proxy)
            except Exception:
                # Best effort: any failure marks the proxy as unusable.
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                print('fail %s' % proxy)
# Script entry point: scrape, verify, then append the working proxies to
# proxies.txt, one per line.
if __name__ == '__main__':
    a = Proxies()
    a.verify_proxies()
    print(a.proxies)
    proxie = a.proxies
    with open('proxies.txt', 'a') as f:
        for proxy in proxie:
            # A real newline; the listing's '\\n' wrote a literal backslash-n.
            f.write(proxy + '\n')
這些IP就會保存到proxies.txt文件中去。
為了幫助大家更輕鬆的學好Python,我給大家分享一套Python學習資料,希望對正在學習的你有所幫助!
獲取方式:關注並私信小編 “ 學習 ”,即可免費獲取!
閱讀更多 極光代理 的文章