Scraping 200,000 company records in one day with Python: the result of 20 threads working together

Crawler environment

Python 3.7 + PyCharm
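Besides the standard library, the script relies on a few third-party packages that appear in its imports — requests, BeautifulSoup (bs4, used with the lxml parser) and aiohttp. Assuming a standard pip setup, something like pip install requests beautifulsoup4 lxml aiohttp will pull them in.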

I recently came across a site called Sooshong (首商網), which lists well over a million companies and has no anti-scraping measures whatsoever — a treat for anyone who enjoys writing crawlers. I put a script together, ran three batches concurrently with 20 threads each, and after five or six hours of crawling had collected 200,000 records. Very satisfying!
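To give a feel for that setup (a rough sketch of my own, not taken from the script below): the 1,650 list pages can be carved into three chunks, one per concurrent run, with each run stepping through its chunk 50 pages at a time.

# Hypothetical illustration of splitting the 1,650 list pages across three runs;
# the script below simply hard-codes one such range per run.
chunks = [(1, 551), (551, 1101), (1101, 1651)]   # three runs, 550 pages each
for start, stop in chunks:
    print('this run would loop over: for k in range({}, {}, 50)'.format(start, stop))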

As usual, the full code follows; all the comments and explanations are inline, and it can be run directly. The script works in three stages: collect the list-page links, follow each one to the company's detail-page link, then scrape the detail pages and write the results to CSV.

# -*- coding: utf-8 -*-
# The original crawler script, written straight through rather than wrapped into functions.
import asyncio
import aiohttp
import time
from bs4 import BeautifulSoup
import csv
import requests
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

# Work through the 1,650 list pages in batches of 50 per iteration.
for k in range(1, 1651, 50):
    # Step 1: fetch the list pages concurrently and collect the sub-page links.
    ####################################################################################################################
    pro = 'zhaoshuang:LINA5201314@14.215.44.251:28803'
    # Proxy settings (defined here but not passed to requests.get below; add proxies=proxies if needed).
    proxies = {'http': 'http://' + pro,
               'https': 'https://' + pro}
    # Request headers
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit'
                             '/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
    wzs = []

    def parser(url):
        print(url)
        try:
            response = requests.get(url, headers=headers)
            soup1 = BeautifulSoup(response.text, "lxml")
            # body > div.list_contain > div.left > div.list_li > ul > li:nth-child(1) > table > tbody > tr > td:nth-child(3) > div.title > a
            wz = soup1.select('div.title')
            for i in wz:
                wzs.append(i.contents[0].get("href"))
            time.sleep(1)
        except Exception:
            print('Company is still under review')

    urls = ['http://www.sooshong.com/c-3p{}'.format(num) for num in range(k, k + 50)]
    # Speed up the crawl with a thread pool. Each batch holds 50 list pages, but the pool is
    # capped at 10 workers: too many threads overwhelm the site and data gets lost.
    executor = ThreadPoolExecutor(max_workers=10)
    # submit() takes the function first, followed by its arguments (more than one is allowed).
    future_tasks = [executor.submit(parser, url) for url in urls]
    # Wait for every thread to finish before moving on.
    wait(future_tasks, return_when=ALL_COMPLETED)
    print('Sub-page links collected!')
    ####################################################################################################################
    # Step 2: visit each sub-page concurrently and extract the detail-page link.
    wzs1 = []

    def parser(url):
        # Parse the sub-page with BeautifulSoup and pull out the company detail-page link.
        try:
            res = requests.get(url, headers=headers)
            # Parse the response body
            soup = BeautifulSoup(res.text, "lxml")
            # The selector comes from: right-click the element > Inspect > Copy selector.
            lianjie = soup.select('#main > div.main > div.intro > div.intros > div.text > p > a')
            lianjie = lianjie[0].get('href')
            wzs1.append(lianjie)
            print(lianjie)
        except Exception:
            print('Failed to parse sub-page')

    # Same thread-pool trick as above, again capped at 10 workers.
    executor = ThreadPoolExecutor(max_workers=10)
    # submit() takes the function first, followed by its arguments.
    future_tasks = [executor.submit(parser, url) for url in wzs]
    # Wait for every thread to finish before moving on.
    wait(future_tasks, return_when=ALL_COMPLETED)
    print('Detail-page links collected!')
 """
 # 使用異步法抓取子頁面的鏈接
 ########################################################################################################################
 async def get_html(sess, ur):
 try:
 proxy_auth = aiohttp.BasicAuth('zhaoshuang', 'LINA5201314')
 html = await sess.get(ur,
 headers=headers) # , proxy='http://'+'14.116.200.33:28803', proxy_auth=proxy_auth)
 r = await html.text()
 return r
 except:
 print("error")
 # f = requests.get('http://211775.sooshong.com', headers=headers)
 wzs1 = []
 # 解析網頁
 async def parser(respo):
 # 利用正則表達式解析網頁
 try:
 # 對響應體進行解析
 soup = BeautifulSoup(respo, "lxml")
 # 找到頁面子鏈接,進入子頁面,對子頁面進行抓取
 # 用select函數抽取需要的內容,單擊需要的內容》檢查》copy select
 lianjie = soup.select('#main > div.main > div.intro > div.intros > div.text > p > a')
 lianjie = lianjie[0].get('href')
 wzs1.append(lianjie)
 print(lianjie)
 company = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(1) > strong") # 標題
 company = company[0].text
 # 匹配電話號碼
 dianhua = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(3)") # 地址
 dianhua = dianhua[0].text.split(":")[1]
 # 匹配手機號碼
 phone = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(4)") # 日租價格
 phone = phone[0].text.split(":")[1]
 # 匹配傳真
 chuanzhen = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(5)") # 月租價格
 chuanzhen = chuanzhen[0].text.split(":")[1]
 # 經營模式
 jingying = soup.select("#main > div.aside > div.info > div.info_c > p:nth-child(8)") # 面積大小
 jingying = jingying[0].text.split(":")[1]
 # 公司地址
 address = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(9)') # 抽取建造年份
 address = address[0].text.split(":")[1]
 # 公司簡介
 # introduction = soup.select("#main > div.main > div.intro > div.intros > div.text > p") # 樓層屬性
 # introduction = introduction[0].text.strip()
 data = [company, address, dianhua, phone, chuanzhen, jingying]
 print(data)
 with open('首富網企業7.csv', 'a+', newline='', encoding='GB2312', errors='ignore') as csvfile:
 w1 = csv.writer(csvfile)
 w1.writerow(data, [1])
 
 except:
 print("出錯!")
 async def main(loop):
 async with aiohttp.ClientSession() as sess:
 tasks = []
 for ii in wzs:
 ur = ii
 try:
 tasks.append(loop.create_task(get_html(sess, ur)))
 except:
 print('error')
 # 設置0.1的網絡延遲增加爬取效率
 await asyncio.sleep(0.1)
 finished, unfinised = await asyncio.wait(tasks)
 for i1 in finished:
 await parser(i1.result())
 if __name__ == '__main__':
 t1 = time.time()
 loop = asyncio.get_event_loop()
 loop.run_until_complete(main(loop))
 print("花費時間", time.time() - t1)
 print('詳細頁鏈接抓取完畢!')
 """
    ####################################################################################################################
    # Step 3: fetch each detail page concurrently and extract the company information.
    ####################################################################################################################
    def parser(url):
        global data
        try:
            res = requests.get(url, headers=headers)
            # Parse the response body
            soup = BeautifulSoup(res.text, 'lxml')
            # The selectors come from: right-click the element > Inspect > Copy selector.
            company = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(1) > strong')
            company = company[0].text
            name = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(2)')
            name = name[0].text
            dianhua = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(3)')
            dianhua = dianhua[0].text.split(':')[1]
            shouji = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(4)')
            shouji = shouji[0].text.split(':')[1]
            chuanzhen = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(5)')
            chuanzhen = chuanzhen[0].text.split(':')[1]
            product = soup.select('tr:nth-child(1) > td:nth-child(2)')
            product = product[0].text
            company_type = soup.select('tr:nth-child(2) > td:nth-child(2) > span')
            company_type = company_type[0].text.strip()
            legal_person = soup.select('tr:nth-child(3) > td:nth-child(2)')
            legal_person = legal_person[0].text
            main_address = soup.select('tr:nth-child(5) > td:nth-child(2) > span')
            main_address = main_address[0].text
            brand = soup.select('tr:nth-child(6) > td:nth-child(2) > span')
            brand = brand[0].text
            area = soup.select('tr:nth-child(9) > td:nth-child(2) > span')
            area = area[0].text
            industry = soup.select('tr:nth-child(1) > td:nth-child(4)')
            industry = industry[0].text.strip()
            address = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(9)')
            address = address[0].text.split(':')[1]
            jingying = soup.select('#main > div.aside > div.info > div.info_c > p:nth-child(8)')
            jingying = jingying[0].text.split(':')[1]
            date = soup.select('tr:nth-child(5) > td:nth-child(4) > span')
            date = date[0].text
            wangzhi = soup.select('tr:nth-child(12) > td:nth-child(4) > p > span > a')
            wangzhi = wangzhi[0].text
            # Put everything into one row and echo it to the console.
            data = [company, date, name, legal_person, shouji, dianhua, chuanzhen, company_type, jingying,
                    industry, product, wangzhi, area, brand, address, main_address]
            print(data)
            with open('服裝1.csv', 'a', newline='', encoding='GB2312') as csvfile:
                w1 = csv.writer(csvfile)
                w1.writerow(data)
        except Exception:
            # Fallback: write the current row to a UTF-8 file when GB2312 cannot encode it.
            with open('服裝2.csv', 'a', newline='', encoding='utf-8-sig') as csvfile:
                w1 = csv.writer(csvfile)
                w1.writerow(data)
            print('Row written with UTF-8 fallback')

    # Same thread-pool trick as before, capped at 10 workers.
    executor = ThreadPoolExecutor(max_workers=10)
    # submit() takes the function first, followed by its arguments.
    future_tasks = [executor.submit(parser, url) for url in wzs1]
    # Wait for every thread to finish before moving on.
    wait(future_tasks, return_when=ALL_COMPLETED)
    print('All information collected')
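One caveat: in the script above several worker threads append to the same CSV file at the same time, and the csv module makes no thread-safety promise, so rows from different threads can interleave. A minimal sketch of serializing the writes with a lock (write_row and csv_lock are my own names, not part of the original script):

import csv
import threading

csv_lock = threading.Lock()          # shared by all worker threads

def write_row(filename, row):
    # Only one thread at a time is allowed to append to the file.
    with csv_lock:
        with open(filename, 'a', newline='', encoding='GB2312', errors='ignore') as f:
            csv.writer(f).writerow(row)

Each parser would then call write_row('服裝1.csv', data) instead of opening the file itself.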

 

