数据获取的渠道有:企业内部系统、机台、网站等。本文使用 R 的 rvest 包爬取网页数据。
code:
library(rvest)
library(stringr)
library(tidyr)
library(dplyr)
# Scrape job listings (position, area, salary / education / experience) from
# a set of listing pages and combine them into one data frame, `work_info`.
#
# NOTE(review): this script was reconstructed from a copy in which every
# assignment (`<- ...`) had been stripped. The CSS selectors and the overall
# flow (first page, then a 10-iteration page loop, three condition columns)
# come from the surviving text; everything marked TODO below must be confirmed.

# TODO: the original URL literals were lost. Fill in the listing pages to
# scrape; the original read one initial page and then 10 more in a loop.
listing_urls <- character(0)

# Common ancestor of every field that is extracted from a job entry.
job_info_css <- "ul.sojob-list div.sojob-item-main div.job-info"

# Return the trimmed text of every node under `job_info_css` matching `css`.
extract_text <- function(page, css) {
  page %>%
    html_nodes(paste(job_info_css, css)) %>%
    html_text(trim = TRUE)
}

# Download one listing page and return one row per job entry.
#
# @param url URL of a single listing page.
# @return A data frame with character columns `position`, `area`, `salary`,
#   `education` and `experience`.
#   NOTE(review): the original column names were lost; these follow the
#   original comments ("get position", "get area",
#   "salary education experience") -- confirm.
scrape_listing_page <- function(url) {
  page <- read_html(url)

  position <- extract_text(page, "h3")
  area <- extract_text(page, "a.area")
  # The condition spans arrive as one flat vector; the original de-interleaved
  # it into three columns by stepping through it per column.
  condition <- extract_text(page, "p.condition span")

  # Fail loudly instead of silently misaligning rows when an entry lacks one
  # of the expected nodes.
  if (length(area) != length(position) ||
      length(condition) != 3L * length(position)) {
    stop(
      "Unexpected page structure at ", url, ": ",
      length(position), " positions, ",
      length(area), " areas, ",
      length(condition), " condition fields.",
      call. = FALSE
    )
  }

  # Row-wise fill turns (s1, e1, x1, s2, e2, x2, ...) into one row per job.
  fields <- matrix(condition, ncol = 3L, byrow = TRUE)

  data.frame(
    position = position,
    area = area,
    salary = fields[, 1L],
    education = fields[, 2L],
    experience = fields[, 3L],
    stringsAsFactors = FALSE
  )
}

if (length(listing_urls) == 0L) {
  stop("Set `listing_urls` to the listing pages to scrape.", call. = FALSE)
}

# Scrape every page, then bind once rather than growing data frames in a loop.
work_info <- bind_rows(lapply(listing_urls, scrape_listing_page))

# Data visualisation: omitted in the original.
閱讀更多 阿國 的文章