第一次写出 Python 爬虫小项目，挺开心的，也算是对自己这段时间学习成果的认可。主要功能是爬取 PC 端虎牙直播 LOL 分类下的主播名称和主播人气，然后按人气对数据进行排序。
项目虽小，但也算是在 Python 学习中迈出的一大步。
爬虫代码如下
import re
from urllib import request
class Spider():
    """Scrape streamer names and popularity from Huya's LoL category page.

    The pipeline, driven by :meth:`go`, is: download the page, pull out one
    HTML fragment per streamer with regular expressions, reduce each fragment
    to a ``{'name': str, 'number': str}`` record, sort the records by
    popularity (highest first) and print the ranking.
    """

    url = 'https://www.huya.com/g/lol'

    # Crawler frameworks that could replace the regex approach:
    # BeautifulSoup, Scrapy.
    #
    # Markup being matched (one list item per streamer), abbreviated:
    #   <span class="txt">
    #     <span class="avatar fl">
    #       <img ... alt="虎丶牙莎莉" title="虎丶牙莎莉">
    #       <i class="nick" title="虎丶牙莎莉">虎丶牙莎莉</i>
    #     </span>
    #     <span class="num"><i class="num-icon"></i><i class="js-num">360.0万</i></span>
    #   </span>
    #
    # `[\s\S]*?` matches any character (newlines included); the trailing `?`
    # makes it non-greedy so each match stops at the first closing tag.
    # Raw strings keep `\s` / `\S` from being read as string escapes.
    root_pattern = r'<span class="txt">([\s\S]*?)</li>'
    name_pattern = r'<i class="nick" title="[\s\S]*?">([\s\S]*?)</i>'
    number_pattern = r'<i class="js-num">([\s\S]*?)</i>'

    def __fetch_content(self):
        """Download ``Spider.url`` and return the body decoded as UTF-8 text."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(Spider.url) as response:
            raw = response.read()  # bytes
        return raw.decode('utf-8')

    def __analysis(self, htmls):
        """Extract raw name/number matches for every streamer fragment.

        Args:
            htmls: Full page source as a string.

        Returns:
            A list of ``{'name': [...], 'number': [...]}`` dicts, one per
            fragment matched by ``root_pattern``. Each value is the list
            returned by ``re.findall`` and may be empty.
        """
        return [
            {
                'name': re.findall(Spider.name_pattern, fragment),
                'number': re.findall(Spider.number_pattern, fragment),
            }
            for fragment in re.findall(Spider.root_pattern, htmls)
        ]

    def __refine(self, anchors):
        """Collapse the match lists from ``__analysis`` to single strings.

        Fragments in which either the name or the number was not found are
        skipped instead of raising ``IndexError``.

        Args:
            anchors: Iterable of dicts as produced by ``__analysis``.

        Returns:
            A list of ``{'name': str, 'number': str}`` dicts.
        """
        return [
            {'name': anchor['name'][0], 'number': anchor['number'][0]}
            for anchor in anchors
            if anchor['name'] and anchor['number']
        ]

    def __sort(self, anchors):
        """Return ``anchors`` as a new list ordered by popularity, descending."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Sort key: convert the displayed popularity text to a float.

        The first decimal number in ``anchor['number']`` is parsed; when the
        text contains '万' (ten thousand) the value is multiplied by 10000.
        Text without any digits yields ``0.0``.
        """
        text = anchor['number']
        # `\d+` (rather than `[1-9]\d*`) lets values such as "0.5万" or "0"
        # parse correctly instead of dropping the leading "0." or failing.
        match = re.search(r'\d+(?:\.\d+)?', text)
        if match is None:
            return 0.0
        number = float(match.group())
        if '万' in text:
            number *= 10000
        return number

    def __show(self, anchors):
        """Print one line per streamer with rank, name and popularity."""
        for rank, anchor in enumerate(anchors, start=1):
            print('排名' + str(rank) + ' : ' + anchor['name'] + '---------' + '人气' + anchor['number'] + '人')

    def go(self):
        """Entry point: fetch, parse, refine, sort and print the ranking."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)
# Run the crawl only when this file is executed as a script, so that importing
# the module (e.g. to reuse Spider) does not trigger a network request.
if __name__ == '__main__':
    spider = Spider()
    spider.go()
加油阿飞!