爬取BOSS直聘职位信息:selenium + CSS 选择器(附总结)
来源:51CTO 博客,发布于 2023-04-09
关键词:selenium、CSS 选择器、BOSS直聘爬虫
1、
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
# Open the CSV output file and write the header row.
# NOTE(review): the file is opened at module level and never explicitly
# closed, so buffered rows can be lost if the script crashes — consider a
# `with` block or an explicit f.close() in a finally clause.
f = open(r'D:\Pyprogram\venv\从零开始学python网络爬虫\爬取BOOS直聘.csv',
         'wt', newline='', encoding='utf-8')
writer = csv.writer(f)
writer.writerow(('岗位', '地区', '薪资', '工作年限', '学历', '公司', '工作待遇'))

# Launch Chrome and open the BOSS Zhipin home page.
driver = webdriver.Chrome()
driver.get('https://www.zhipin.com/?sid=sem_pz_bdpc_dasou_title')
driver.implicitly_wait(10)  # poll up to 10 s for elements to appear

# Type the keyword into the search box and submit the search.
# Short CSS selectors are used rather than the full browser-generated
# paths, which are brittle against page-layout changes.
driver.find_element(By.CSS_SELECTOR, 'div.search-form-con > p > input').send_keys('python')
time.sleep(1)
driver.find_element(By.CSS_SELECTOR, 'div.search-form > form > button').click()
def get_info():
    """Scrape one page of job listings, printing each record and appending
    it to the CSV, then click "next page" and switch to the newest window.

    Reads the module-level ``driver`` (Selenium WebDriver) and ``writer``
    (csv.writer) objects; returns nothing.
    """
    # Collect the parallel per-column element lists for every job card.
    job_names = driver.find_elements(By.CSS_SELECTOR, 'div.job-title.clearfix > span.job-name')
    areas = driver.find_elements(By.CSS_SELECTOR, 'div.job-title.clearfix > span.job-area-wrapper > span')
    salarys = driver.find_elements(By.CSS_SELECTOR, 'div.job-card-body.clearfix > a > div.job-info.clearfix > span')
    experiences = driver.find_elements(By.CSS_SELECTOR, 'div.job-card-body.clearfix > a > div.job-info.clearfix > ul > li:nth-child(1)')
    educations = driver.find_elements(By.CSS_SELECTOR, 'div.job-info.clearfix > ul > li:nth-child(2)')
    companys = driver.find_elements(By.CSS_SELECTOR, 'div.job-card-body.clearfix > div > div.company-info > h3 > a')
    descs = driver.find_elements(By.CSS_SELECTOR, 'ul > li > div.job-card-footer.clearfix > div')

    for job_name, area, salary, experience, education, company, desc in zip(
            job_names, areas, salarys, experiences, educations, companys, descs):
        # Read each element's text exactly once: every `.text` access is a
        # round trip to the browser, so the original's second set of `.text`
        # calls in writerow() doubled the remote calls per row.
        row = (job_name.text, area.text, salary.text, experience.text,
               education.text, company.text, desc.text)
        data = dict(zip(('job_name', 'area', 'salary', 'experience',
                         'education', 'company', 'desc'), row))
        print(data)
        writer.writerow(row)

    # Click the "next page" arrow in the pager.
    driver.find_element(By.CSS_SELECTOR, '#wrap > div.page-job-wrapper > div.page-job-inner > div > div.job-list-wrapper > div.search-job-result > div > div > div > a:nth-child(10) > i').click()

    # The click may open a new window/tab; walk every handle so the driver
    # ends up focused on the most recently opened one (this was the fix for
    # the post-pagination element-lookup errors described below).
    current_handle = driver.current_window_handle
    handles = driver.window_handles
    for handle in handles:
        driver.switch_to.window(handle)
        time.sleep(2)
if __name__ == '__main__':
    # Scrape pages 1-2; the long sleeps leave time for the page to load and
    # for any manual login / CAPTCHA check before each page is parsed.
    for page in range(1, 3):
        time.sleep(10)
        print(f'爬取第{page}页')
        time.sleep(15)
        get_info()
        # Bug fix: the original called `time(1)`, which raises
        # TypeError ('module' object is not callable); sleep was intended.
        time.sleep(1)
    driver.quit()
2、总结:翻页后查找元素一直报错,调试了很久——试过强制等待(time.sleep)和增加判断条件都不行。后来查阅了很多网友的经验,发现点击下一页可能会打开新窗口,最终通过遍历窗口句柄并切换到新窗口解决:
current_handle = driver.current_window_handle
handles = driver.window_handles
for handle in handles:
driver.switch_to.window(handle)
time.sleep(2)
好博客就要一起分享哦!分享海报
此处可发布评论
评论(0)展开评论
暂无评论,快来写一下吧
展开评论