Common anti-crawler countermeasures:
Today, while scraping the Douban Top 250 movie list, I got back a 418 status code ("I'm a teapot"), which Douban uses to turn away obvious bots. A little digging confirmed that the site has an anti-crawler mechanism: the default User-Agent of the requests library gives the script away.
The fix is simple: send a browser-like headers dict so the site believes the request comes from a real user.
Solution:
import requests

pages_index = range(0, 250, 25)   # the Top 250 is paginated, 25 movies per page
index_list = list(pages_index)
htmls = []
# A browser-like User-Agent so Douban treats the request as a normal visit
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
for idx in index_list:
    url = f"https://movie.douban.com/top250?start={idx}&filter="
    print("crawl html:", url)
    r = requests.get(url, headers=headers)
    print(r.status_code)
    if r.status_code != 200:
        raise Exception("error")
    htmls.append(r.text)
And now the status code comes back as 200!
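To see exactly what the header changes, compare the same request with and without it; the 418 below is what Douban sent back for my bare request:

import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
url = "https://movie.douban.com/top250"
print(requests.get(url).status_code)                   # 418: rejected as a bot
print(requests.get(url, headers=headers).status_code)  # 200: accepted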
Crawler example
Scraping the movie information from the Douban Top 250
# Import the packages we need
import requests
from bs4 import BeautifulSoup
import pprint

pages_index = range(0, 250, 25)
index_list = list(pages_index)
print(index_list)
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
def download_all_html():
    """
    Download all the HTML pages for later parsing.
    :return: list of raw HTML strings
    """
    htmls = []
    for idx in index_list:
        url = f"https://movie.douban.com/top250?start={idx}&filter="
        print("crawl html:", url)
        r = requests.get(url, headers=headers)
        print(r.status_code)
        if r.status_code != 200:
            raise Exception("error")
        htmls.append(r.text)
    return htmls
# Run the crawl
htmls = download_all_html()
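An optional refinement of my own (not part of the original run): Douban can still throttle rapid-fire requests, so pausing briefly between pages is cheap insurance. This sketch reuses index_list and headers from above; the 1-second default is an arbitrary choice:

import time

def download_all_html_politely(delay_seconds=1):
    """Same crawl as download_all_html, but sleeps between pages."""
    htmls = []
    for idx in index_list:
        url = f"https://movie.douban.com/top250?start={idx}&filter="
        r = requests.get(url, headers=headers)
        if r.status_code != 200:
            raise Exception(f"unexpected status {r.status_code} for {url}")
        htmls.append(r.text)
        time.sleep(delay_seconds)  # be polite between page fetches
    return htmls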
def parse_single_html(html):
    """
    Parse one HTML page and extract its movie data.
    :param html: raw HTML string
    :return: list of dicts, one per movie
    """
    soup = BeautifulSoup(html, "html.parser")
    article_items = (
        soup.find("div", class_="article")
        .find("ol", class_="grid_view")
        .find_all("div", class_="item")
    )
    datas = []
    for article_item in article_items:
        rank = article_item.find("div", class_="pic").find("em").get_text()
        info = article_item.find("div", class_="info")
        title = info.find("span", class_="title").get_text()
        stars = (
            info.find("div", class_="bd")
            .find("div", class_="star")
            .find_all("span")
        )
        rating_star = stars[0]["class"][0]  # class like "rating45-t" = 4.5 stars
        rating_num = stars[1].get_text()    # numeric score, e.g. "9.7"
        # The review count ("xxx人评价") sits in the last span; indexing
        # stars[2] can hit an empty placeholder span instead, which is why
        # the 'comments' fields in the sample output below came back blank
        comments = stars[-1].get_text()
        datas.append({
            "rank": rank,
            "title": title,
            # "rating45-t" -> "45", i.e. 4.5 stars
            "rating_star": rating_star.replace("rating", "").replace("-t", ""),
            # keep only the number from "xxx人评价" ("xxx people rated")
            "comments": comments.replace("人评价", ""),
            "rating_num": rating_num
        })
    return datas
pprint.pprint(parse_single_html(htmls[0]))
[{'comments': '',
'rank': '1',
'rating_num': '9.7',
'rating_star': '5',
'title': '肖申克的救赎'},
{'comments': '',
'rank': '2',
'rating_num': '9.6',
'rating_star': '5',
'title': '霸王别姬'},
{'comments': '',
'rank': '3',
'rating_num': '9.5',
'rating_star': '5',
'title': '阿甘正传'},
{'comments': '',
'rank': '4',
'rating_num': '9.5',
'rating_star': '5',
'title': '泰坦尼克号'},
{'comments': '',
'rank': '5',
'rating_num': '9.4',
'rating_star': '45',
'title': '这个杀手不太冷'},
{'comments': '',
'rank': '6',
'rating_num': '9.6',
'rating_star': '5',
'title': '美丽人生'},
{'comments': '',
'rank': '7',
'rating_num': '9.4',
'rating_star': '45',
'title': '千与千寻'},
{'comments': '',
'rank': '8',
'rating_num': '9.6',
'rating_star': '5',
'title': '辛德勒的名单'},
{'comments': '',
'rank': '9',
'rating_num': '9.4',
'rating_star': '45',
'title': '星际穿越'},
{'comments': '',
'rank': '10',
'rating_num': '9.4',
'rating_star': '45',
'title': '盗梦空间'},
{'comments': '',
'rank': '11',
'rating_num': '9.4',
'rating_star': '45',
'title': '楚门的世界'},
{'comments': '',
'rank': '12',
'rating_num': '9.4',
'rating_star': '45',
'title': '忠犬八公的故事'},
{'comments': '',
'rank': '13',
'rating_num': '9.3',
'rating_star': '45',
'title': '海上钢琴师'},
{'comments': '',
'rank': '14',
'rating_num': '9.2',
'rating_star': '45',
'title': '三傻大闹宝莱坞'},
{'comments': '',
'rank': '15',
'rating_num': '9.3',
'rating_star': '45',
'title': '放牛班的春天'},
{'comments': '',
'rank': '16',
'rating_num': '9.3',
'rating_star': '45',
'title': '机器人总动员'},
{'comments': '',
'rank': '17',
'rating_num': '9.3',
'rating_star': '45',
'title': '无间道'},
{'comments': '',
'rank': '18',
'rating_num': '9.2',
'rating_star': '45',
'title': '疯狂动物城'},
{'comments': '',
'rank': '19',
'rating_num': '9.6',
'rating_star': '5',
'title': '控方证人'},
{'comments': '',
'rank': '20',
'rating_num': '9.2',
'rating_star': '45',
'title': '大话西游之大圣娶亲'},
{'comments': '',
'rank': '21',
'rating_num': '9.4',
'rating_star': '45',
'title': '熔炉'},
{'comments': '',
'rank': '22',
'rating_num': '9.3',
'rating_star': '45',
'title': '教父'},
{'comments': '',
'rank': '23',
'rating_num': '9.2',
'rating_star': '45',
'title': '当幸福来敲门'},
{'comments': '',
'rank': '24',
'rating_num': '9.3',
'rating_star': '45',
'title': '触不可及'},
{'comments': '',
'rank': '25',
'rating_num': '9.1',
'rating_star': '45',
'title': '怦然心动'}]
To export the results to Excel you need two extra packages, pandas and openpyxl (pip install pandas openpyxl).
import pandas as pd

all_datas = []
for html in htmls:
    all_datas.extend(parse_single_html(html))
pf = pd.DataFrame(all_datas)
pf.to_excel("豆瓣top250电影排名.xlsx")  # pass index=False to drop the extra index column
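As an optional sanity check (my addition, not in the original), you can read the file straight back with pandas to confirm all 250 rows made it:

import pandas as pd

df = pd.read_excel("豆瓣top250电影排名.xlsx")
print(len(df))    # should be 250
print(df.head())  # first few movies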
All done!!!