Python Crawler Tutorial: Scraping All of a Kuaishou User's Public Works, Including Image Sets and Videos!

Preface

  • What the code does, as the title says: given a Kuaishou user's id, crawl all of that user's public works, both image sets and videos.
  • How it works: using the devtools built into any Chromium-based browser, go through the page's requests to find the one that carries the work links, reproduce that request in code to get the data, then download each work from its URL. The same trick extends to things like automated registration, login, and other site interactions; anyone who has written a crawler before will recognize the approach. A minimal sketch of replaying a captured request follows.
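To make "reproduce that request in code" concrete, here is a minimal sketch of replaying a POST captured in the Network panel with the requests library. The URL, headers, and payload below are placeholders you would copy out of devtools, not the project's actual values:

<code>import requests

# Placeholder values -- copy the real ones from the devtools Network panel.
DATA_URL = "https://example.com/graphql"  # hypothetical endpoint
HEADERS = {
    "User-Agent": "Mozilla/5.0",    # mimic a real browser
    "Cookie": "did=...; didv=...",  # session cookies captured in devtools
    "Content-Type": "application/json",
}
payload = {"operationName": "...", "variables": {}, "query": "..."}

res = requests.post(DATA_URL, headers=HEADERS, json=payload)
res.raise_for_status()
print(res.json())  # inspect the structure before writing any parsing code</code>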

Core Code

  • Without further ado, here is the core code:
<code>import json
import os
import re
import time

import requests


# (Method of the crawler class; the imports above are what these snippets need.)
def __crawl_user(self, uid):
    # Numeric ids have to be converted to the "principalId" form first.
    if uid.isdigit():
        uid = self.__switch_id(uid)

    # The payload below is copied from the GraphQL request found in devtools.
    payload = {
        "operationName": "privateFeedsQuery",
        "variables": {"principalId": uid, "pcursor": "", "count": 999},
        "query": "query privateFeedsQuery($principalId: String, $pcursor: String, $count: Int) {\n privateFeeds(principalId: $principalId, pcursor: $pcursor, count: $count) {\n pcursor\n list {\n id\n thumbnailUrl\n poster\n workType\n type\n useVideoPlayer\n imgUrls\n imgSizes\n magicFace\n musicName\n caption\n location\n liked\n onlyFollowerCanComment\n relativeHeight\n timestamp\n width\n height\n counts {\n displayView\n displayLike\n displayComment\n __typename\n }\n user {\n id\n eid\n name\n avatar\n __typename\n }\n expTag\n __typename\n }\n __typename\n }\n}\n",
    }
    res = requests.post(self.__data_url, headers=self.__headers, json=payload)
    works = json.loads(res.content.decode(encoding='utf-8', errors='strict'))['data']['privateFeeds']['list']

    if not os.path.exists("data"):
        os.makedirs("data")
    # These two lines write the response to a JSON file for analysis:
    # with open("data/" + uid + ".json", "w") as fp:
    #     fp.write(json.dumps(works, indent=2))

    # If the user is live streaming, the first item in the feed is the live
    # stream itself and its id is None; drop it to avoid a NoneType error.
    if works[0]['id'] is None:
        works.pop(0)

    # Strip characters that are illegal in file names before building the path.
    name = re.sub(r'[\\/:*?"<>|\r\n]+', "", works[0]['user']['name'])
    dir = "data/" + name + "(" + uid + ")/"
    if not os.path.exists(dir):
        os.makedirs(dir)

    print("Crawling user " + name + ", saving to directory " + dir)
    print("  " + str(len(works)) + " works in total")
    for j in range(len(works)):
        self.__crawl_work(uid, dir, works[j], j + 1)
        time.sleep(1)  # be polite: pause between works
    print("User " + name + " finished!")
    print()
    time.sleep(1)</code>
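Note that the request above asks for up to 999 works in a single page (count: 999 with an empty pcursor), which covers most accounts. The query does select a pcursor field, though, so a fuller crawler could page through larger accounts. A sketch of that loop, under the assumption that the server marks the last page with an empty or "no_more" cursor (verify against real responses before relying on this):

<code>def fetch_all_works(fetch_page):
    """Collect every work by following the pcursor returned with each page.

    `fetch_page` is a callable taking a pcursor string and returning the
    parsed `privateFeeds` dict, i.e. {"pcursor": ..., "list": [...]}.
    """
    works, pcursor = [], ""
    while True:
        feed = fetch_page(pcursor)
        works.extend(feed["list"])
        pcursor = feed["pcursor"]
        # Assumption: an empty or "no_more" cursor marks the final page.
        if not pcursor or pcursor == "no_more":
            break
    return works</code>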

Kuaishou works come in five types, exposed through the workType attribute on each work (a compact sketch of the resulting dispatch follows this list):

  • Two image-set types: vertical and multiple, i.e. stitched long images and multi-image posts; all image links are in imgUrls.
  • Single image: single; the image link is also in imgUrls.
  • Karaoke: ksong; the image links work the same way, and crawling the audio is not covered here...
  • Video: video; the video link has to be parsed out of the HTML.
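Put together, the dispatch on workType is straightforward: the four image-like types read imgUrls directly, while video needs one extra request to the share page. A compact sketch of that mapping (the media_urls helper and the resolve_video callable are illustrative, not part of the project's code):

<code>IMAGE_TYPES = {"vertical", "multiple", "single", "ksong"}

def media_urls(work, resolve_video):
    """Map one work dict to its downloadable URLs.

    `resolve_video` is a callable taking a work id and returning a list of
    video URLs (e.g. by fetching and regex-parsing the share page).
    """
    if work["workType"] in IMAGE_TYPES:
        return work["imgUrls"]            # image links come straight from the feed
    if work["workType"] == "video":
        return resolve_video(work["id"])  # videos need the extra share-page request
    return []                             # anything else: nothing to download

# Example with a dummy resolver:
print(media_urls({"workType": "single", "imgUrls": ["https://example.com/a.jpg"]},
                 resolve_video=lambda wid: []))</code>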




<code>def __crawl_work(self, uid, dir, work, wdx):
    w_type = work['workType']
    w_caption = re.sub(r"\s+", " ", work['caption'])
    # Strip characters that are illegal in file names and cap the length at 24.
    w_name = re.sub(r'[\/:*?"<>|\r\n]+', "", w_caption)[0:24]
    w_time = time.strftime('%Y-%m-%d', time.localtime(work['timestamp'] / 1000))

    if w_type in ('vertical', 'multiple', 'single', 'ksong'):
        # All four image-like types carry their image links in imgUrls.
        w_urls = work['imgUrls']
        l = len(w_urls)
        print("  " + str(wdx) + ") Image-set work: " + w_caption + ", " + str(l) + " images in total")
        for i in range(l):
            p_name = w_time + "_" + w_name + "_" + str(i + 1) + ".jpg"
            pic = dir + p_name
            if not os.path.exists(pic):
                r = requests.get(w_urls[i])
                r.raise_for_status()
                with open(pic, "wb") as f:
                    f.write(r.content)
                print("    " + str(i + 1) + "/" + str(l) + " image " + p_name + " downloaded √")
            else:
                print("    " + str(i + 1) + "/" + str(l) + " image " + p_name + " already exists √")
    elif w_type == 'video':
        # Videos take a second request: fetch the mobile share page and pull
        # the unwatermarked source URL out of the embedded page state.
        w_url = self.__work_url + work['id']
        res = requests.get(w_url, headers=self.__headers_mobile,
                           params={"fid": 1841409882, "cc": "share_copylink", "shareId": "143108986354"})
        html = res.text
        waitreplace = work['id'] + '".*?"srcNoMark":"(.*?)"'
        v_url = re.findall(waitreplace, html)
        try:
            # Printing the caption can fail on consoles that cannot encode it.
            print("  " + str(wdx) + ") Video work: " + w_caption)
        except:
            print("  Something went slightly wrong here, skipped")
        v_name = w_time + "_" + w_name + ".mp4"
        video = dir + v_name
        if v_url:
            if not os.path.exists(video):
                r = requests.get(v_url[0])
                r.raise_for_status()
                with open(video, "wb") as f:
                    f.write(r.content)
                print("    Video " + v_name + " downloaded √")
            else:
                print("    Video " + v_name + " already exists √")
        else:
            print("  Video not found")
    else:
        print("Unrecognized work type")</code>
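The regex in the video branch deserves a standalone look: it anchors on the work id and captures the srcNoMark value from the JSON state embedded in the share-page HTML. A self-contained demo against a made-up fragment (the HTML below is illustrative, not a real response):

<code>import re

work_id = "3xabc123"
# Illustrative fragment imitating the JSON embedded in the share-page HTML.
html = '{"photoId":"3xabc123","caption":"demo","srcNoMark":"https://example.com/demo.mp4"}'

pattern = work_id + '".*?"srcNoMark":"(.*?)"'
matches = re.findall(pattern, html)
print(matches[0] if matches else "Video not found")
# -> https://example.com/demo.mp4</code>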
  • The payload is simply the POST body; you can find it under the request details in devtools.
  • The rest is plain JSON parsing: the response carries the image URLs and the video ids. The two commented-out lines in __crawl_user save the full JSON; uncomment them to dump and inspect it yourself (a minimal version of that dump is sketched below).
  • Everything else should be easy to follow from the source.
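For the record, the idea behind those commented-out lines is simply to pretty-print the parsed JSON to disk so you can eyeball the fields before writing any parsing code. A minimal standalone version (the dump_response name is mine, not the project's):

<code>import json
import os

def dump_response(uid, works, out_dir="data"):
    """Pretty-print the parsed works list for offline inspection."""
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, uid + ".json"), "w", encoding="utf-8") as fp:
        json.dump(works, fp, indent=2, ensure_ascii=False)  # keep Chinese captions readable</code>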



    Notes:

    • A batch-download feature with a selectable list is not planned.
    • Reasonable feature requests can be filed as issues and will be considered when seen.
    • If you have custom needs, feel free to take the code and modify it yourself; if you like it, leave a star and a follow.
    • This code is for learning purposes only. Do not crawl videos in violation of the law, and do not misappropriate or repost them; you bear the consequences yourself.

    Project source code: https://github.com/oGsLP/kuaishou-crawler

