Build your own proxy IP pool: scrape data without fearing anti-scraping blocks.

A proxy is really just a proxy server: it fetches network resources on behalf of a network user, acting as a relay station for network traffic.
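To make that concrete, here is a minimal sketch of routing a request through a proxy with requests; the address 127.0.0.1:8888 below is just a placeholder, not a live proxy:

import requests

# Placeholder proxy address; substitute one from your pool
proxies = {
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888'
}
resp = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=5)
print(resp.text)  # prints the IP address the target site sees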

Having looked at the four main problems a proxy pool must solve, we can design a proxy pool framework around them, split into four modules: a fetcher module, a checker module, a storage module, and an API module. This division not only makes the pool easier to maintain but also lets each requirement be handled more efficiently.
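As a rough sketch of how the four modules might cooperate, assuming the Downloader, valid_many, MongoDB, and run names defined in the code modules below, and with an arbitrary 10-minute refresh interval, a simple scheduler could look like this:

import time
from threading import Thread

def schedule(interval=600):
    # Serve the API in the background while the pool keeps refreshing itself
    Thread(target=run, daemon=True).start()
    while True:
        # Fetch fresh candidates; only the ones that pass the check are stored
        candidates = Downloader().download('https://www.kuaidaili.com/free/inha/1/')
        if candidates:
            valid_many(candidates, 'insert')
        # Re-check everything already stored, dropping dead proxies
        valid_many(MongoDB().get(MongoDB().get_count()), 'check')
        time.sleep(interval)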

Code Modules

Fetcher Module

import requests
import chardet
import traceback
from lxml import etree


class Downloader(object):

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }

    def download(self, url):
        print('Downloading page: {}'.format(url))
        try:
            resp = requests.get(url, headers=self.headers)
            resp.encoding = chardet.detect(resp.content)['encoding']
            if resp.status_code == 200:
                return self.xpath_parse(resp.text)
            else:
                raise ConnectionError
        except Exception:
            print('Error downloading page: {}'.format(url))
            traceback.print_exc()

    def xpath_parse(self, resp):
        try:
            page = etree.HTML(resp)
            # Each row of the free-proxy table holds one IP/port pair
            trs = page.xpath('//div[@id="list"]/table/tbody/tr')
            proxy_list = []
            for tr in trs:
                ip = tr.xpath('./td[1]/text()')[0]
                port = tr.xpath('./td[2]/text()')[0]
                proxy = {
                    'proxy': ip + ':' + port
                }
                proxy_list.append(proxy)
            return proxy_list
        except Exception:
            print('Error parsing IP addresses')
            traceback.print_exc()


if __name__ == '__main__':
    print(Downloader().download('https://www.kuaidaili.com/free/inha/1/'))
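The demo above pulls only the first page of the kuaidaili free list. Assuming the remaining pages follow the same URL pattern and table layout (worth verifying against the live site), a small extension can walk several pages and merge the results:

def crawl_pages(page_count=5):
    # Walk the first few pages of the free-proxy list and merge the results
    downloader = Downloader()
    proxies = []
    for page in range(1, page_count + 1):
        result = downloader.download('https://www.kuaidaili.com/free/inha/{}/'.format(page))
        if result:
            proxies.extend(result)
    return proxies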

Storage Module

import pymongo
from pymongo.errors import DuplicateKeyError


class MongoDB(object):

    def __init__(self):
        self.client = pymongo.MongoClient()
        self.db = self.client['proxypool3']
        self.proxies = self.db['proxies']
        # A unique index on 'proxy' makes duplicate inserts raise DuplicateKeyError
        self.proxies.create_index('proxy', unique=True)

    def insert(self, proxy):
        try:
            self.proxies.insert_one(proxy)
            print('Inserted: {}'.format(proxy))
        except DuplicateKeyError:
            # The proxy is already in the pool; nothing to do
            pass

    def delete(self, conditions):
        self.proxies.delete_many(conditions)
        print('Deleted: {}'.format(conditions))

    def update(self, conditions, values):
        self.proxies.update_one(conditions, {'$set': values})
        print('Updated: {}, {}'.format(conditions, values))

    def get(self, count, conditions=None):
        conditions = conditions if conditions else {}
        count = int(count)
        # Return the fastest proxies first (lowest recorded delay)
        items = self.proxies.find(conditions, limit=count).sort('delay', pymongo.ASCENDING)
        return list(items)

    def get_count(self):
        return self.proxies.count_documents({})


if __name__ == '__main__':
    m = MongoDB()
    print(m.get(3))

Checker Module

import requests
import time
import traceback
from requests.exceptions import ProxyError, ConnectionError
from db.mongo_db import MongoDB
from multiprocessing.pool import ThreadPool


def valid_many(proxy_list, method):
    # Check proxies concurrently on a pool of 16 worker threads
    pool = ThreadPool(16)
    for proxy in proxy_list:
        pool.apply_async(valid_one, args=(proxy, method))
    pool.close()
    pool.join()


def valid_one(proxy, method, url='https://www.baidu.com'):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    proxies = {
        'http': 'http://' + proxy['proxy'],
        'https': 'http://' + proxy['proxy']
    }
    try:
        start_time = time.time()
        # verify=False skips certificate checks; silence the resulting warnings
        requests.packages.urllib3.disable_warnings()
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=5, verify=False)
        delay = round(time.time() - start_time, 2)
        if resp.status_code == 200:
            # Record the response delay so the API can serve the fastest proxies first
            proxy['delay'] = delay
            if method == 'insert':
                MongoDB().insert(proxy)
            elif method == 'check':
                MongoDB().update({'proxy': proxy['proxy']}, {'delay': proxy['delay']})
        else:
            if method == 'check':
                MongoDB().delete({'proxy': proxy['proxy']})
    except (ProxyError, ConnectionError):
        # The proxy is unreachable; drop it from the pool during a re-check
        if method == 'check':
            MongoDB().delete({'proxy': proxy['proxy']})
    except Exception:
        traceback.print_exc()
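Wiring the fetcher and the checker together gives one plausible way to populate the pool; this assumes the Downloader class above lives in a module that can be imported here (the module path below is hypothetical):

from downloader import Downloader  # hypothetical module path for the fetcher

if __name__ == '__main__':
    proxy_list = Downloader().download('https://www.kuaidaili.com/free/inha/1/')
    if proxy_list:
        # 'insert' mode stores only the proxies that answer within the timeout
        valid_many(proxy_list, 'insert')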

API Module

import flask
import json
from db.mongo_db import MongoDB

app = flask.Flask(__name__)


@app.route('/one')
def get_one():
    proxies = MongoDB().get(1)
    result = [proxy['proxy'] for proxy in proxies]
    return json.dumps(result)


@app.route('/many')
def get_many():
    args = flask.request.args
    proxies = MongoDB().get(args['count'])
    result = [proxy['proxy'] for proxy in proxies]
    return json.dumps(result)


def run():
    app.run()
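With the app running at Flask's default address (127.0.0.1:5000), a scraper can ask the pool for its fastest proxy and route a request through it. A minimal client sketch:

import requests

# /one returns a JSON list holding the lowest-delay proxy, e.g. ["1.2.3.4:8080"]
proxy = requests.get('http://127.0.0.1:5000/one').json()[0]
proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
resp = requests.get('https://www.baidu.com', proxies=proxies, timeout=5)
print(resp.status_code)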
