python3爬虫-网易云排行榜，网易云歌手及作品

import requests, re, json, os, time

from fake_useragent import UserAgent

from lxml import etree

from urllib import parse

class MyError(Exception):
    """Crawler-specific error carrying a numeric status code and a message.

    Attributes:
        status: Numeric code identifying the failure category.
        msg: Human-readable description of the failure.
    """

    def __init__(self, status, msg):
        """Store the status code and message on the exception instance."""
        self.status, self.msg = status, msg

class WyRinking():
    """Crawler for the NetEase Cloud Music top-list (ranking) pages.

    ``collectRanking`` downloads the top-list index page, follows every
    ranking link found on it and writes one JSON-lines file per ranking.
    The output directory is read from the module-level global
    ``Rink_BASE_PATH``, which the script defines in its ``__main__`` block.
    """

    # Number of attempts made for one URL before a timeout is propagated.
    MAX_RETRIES = 3

    def __init__(self):
        """Set up request options and the (initially empty) ranking lists."""
        ua = UserAgent()
        self.stratUrl = "https://music.163.com/discover/toplist"
        self.headers = {
            "User-Agent": ua.random
        }
        self.timeout = 10
        self.allow_redirects = False
        self.nameList = []
        self.urlList = []

    def __getRinkNameUrl(self, response):
        """Collect the display name and href of every ranking on the index page.

        The results are stored in ``self.nameList`` and ``self.urlList``.
        """
        html_selector = self.__etreeSelector(response)
        anchor = "//div[contains(@class,'item') and contains(@class,'f-cb')]/p[@class='name']/a"
        self.nameList = html_selector.xpath(anchor + "/text()") or []
        self.urlList = html_selector.xpath(anchor + "/@href") or []

    def __getPageHtml(self, url):
        """Request ``url`` and return the ``requests`` response.

        A timed-out request is retried up to ``MAX_RETRIES`` times in total.

        Raises:
            requests.exceptions.Timeout: if the last attempt also times out.
        """
        for attempt in range(1, self.MAX_RETRIES + 1):
            try:
                return requests.get(url, headers=self.headers, timeout=self.timeout,
                                    allow_redirects=self.allow_redirects)
            except requests.exceptions.Timeout as e:
                print("Timeout Error>>:", e)
                if attempt == self.MAX_RETRIES:
                    # Give up instead of retrying forever.
                    raise

    def __getRankHtml(self):
        """Fetch every ranking page and hand it to ``__getRankInfo``.

        Raises:
            MyError: 10000 if no rankings were collected, 10001 if the name
                and url lists have different lengths.
        """
        if not self.nameList and not self.urlList:
            raise MyError(10000, "{},{} 数据不能为空".format(self.nameList, self.urlList))
        if len(self.nameList) != len(self.urlList):
            raise MyError(10001, "nameList,urlList数据不能一一对应")
        for rankName, rankHref in zip(self.nameList, self.urlList):
            url = parse.urljoin(self.stratUrl, rankHref)
            response = self.__getPageHtml(url=url)
            # The ranking name travels with the response so that
            # __getRankInfo can build the output file name from it.
            response.customizeName = rankName
            self.__getRankInfo(response)

    def __getRankInfo(self, response):
        """Extract the JSON song list embedded in a ranking page and save it.

        Raises:
            MyError: 10003 if the page lacks the song data or the update
                time, 10005 if ``Rink_BASE_PATH`` is not configured.
        """
        html_selector = self.__etreeSelector(response)
        rawJson = html_selector.xpath("//*[@id='song-list-pre-data']/text()")
        updateTime = html_selector.xpath(
            "//span[contains(@class,'sep') and contains(@class,'s-fc3')]/text()")
        if not rawJson or not updateTime:
            raise MyError(10003, "排行榜页面缺少歌曲数据或更新时间")
        data = self._parseSongJson(rawJson[0])
        fileName = response.customizeName + '--' + updateTime[0] + ".json"
        # Looked up through globals() so that a missing setting produces the
        # intended MyError rather than a NameError when this module is
        # imported instead of being run as a script.
        basePath = globals().get("Rink_BASE_PATH")
        if not basePath:
            raise MyError(10005, "需要在全局中配置该参数Rink_BASE_PATH，用于文件存放地址")
        os.makedirs(basePath, exist_ok=True)
        path = os.path.join(basePath, fileName)
        self.__writeToFile(path, data)

    @staticmethod
    def _parseSongJson(rawJson):
        """Decode the embedded song-list JSON text.

        If the text does not decode as-is, a second attempt is made with the
        closing sequence ``"}}]`` appended.
        NOTE(review): presumably the page sometimes truncates the payload
        inside the last string value - confirm against real responses.

        Raises:
            json.JSONDecodeError: if the repaired text is still invalid.
        """
        try:
            return json.loads(rawJson)
        except json.JSONDecodeError:
            return json.loads(rawJson + '"}}]')

    def __writeToFile(self, path, data):
        """Write one JSON object per line to ``path``, one line per song.

        Each line holds the 1-based rank plus the song's id, name, artists
        and comment thread id as found in ``data``.
        """
        # `path` already ends with ".json", so it is printed unchanged.
        print('正在写入文件{}'.format(path))
        with open(path, "w", encoding="utf-8") as f:
            for index, data_dic in enumerate(data, start=1):
                dic = {
                    "rankNum": index,
                    "songId": data_dic.get("id"),
                    "songName": data_dic.get("name"),
                    "artistsInfo": data_dic.get("artists"),
                    "commentThreadId": data_dic.get("commentThreadId"),
                }
                f.write(json.dumps(dic, ensure_ascii=False) + "\n")

    def __reSongId(self, songurl: str):
        """Return the numeric song id contained in ``songurl``.

        :param songurl: a link such as ``/song?id=1336871144``
        :raises MyError: 10002 if the link contains no ``id=<digits>`` part
        """
        match = re.search(r"id=(\d+)", songurl)
        if match is None:
            raise MyError(10002, "歌曲id获取失败")
        return match.group(1)

    def collectRanking(self):
        """Download all NetEase Cloud Music rankings and write them to disk."""
        response = self.__getPageHtml(url=self.stratUrl)
        self.__getRinkNameUrl(response)
        self.__getRankHtml()

    def __etreeSelector(self, response):
        """Parse the response body with ``etree.HTML`` for XPath queries."""
        return etree.HTML(response.text)

class WySinger():
    """Crawler for NetEase Cloud Music singers and their listed songs.

    ``collectSinger`` walks every singer category, every initial-letter page
    inside it and every singer linked from those pages, then writes one text
    file per singer. The output directory is read from the module-level
    global ``Singer_BASE_PATH``, which the script defines in ``__main__``.
    """

    # Number of attempts made for one URL before a timeout is propagated.
    MAX_RETRIES = 3

    # The initial-letter ids are read from the first category page that is
    # requested; they only need to be fetched once.
    __isFirstStatus = True

    def __init__(self):
        """Set up request options and the (initially empty) category lists."""
        ua = UserAgent()
        self.stratUrl = "https://music.163.com/discover/artist"
        self.headers = {
            "User-Agent": ua.random
        }
        self.timeout = 10
        self.allow_redirects = False
        self.sCategoryNameList = []
        self.sCategoryIdList = []
        self.sCategoryUrlList = []
        self.initialIdList = []
        self.markList = []

    def __getPageHtml(self, url):
        """Request ``url`` and return the ``requests`` response.

        A timed-out request is retried up to ``MAX_RETRIES`` times in total.

        Raises:
            requests.exceptions.Timeout: if the last attempt also times out.
        """
        for attempt in range(1, self.MAX_RETRIES + 1):
            try:
                return requests.get(url, headers=self.headers, timeout=self.timeout,
                                    allow_redirects=self.allow_redirects)
            except requests.exceptions.Timeout as e:
                print("Timeout Error>>:", e)
                if attempt == self.MAX_RETRIES:
                    # Give up instead of retrying forever.
                    raise

    def __getSingerCategory(self, response):
        """Read category names, ids and absolute urls from the artist index page.

        The three lists are stored on the instance only when the page yields
        at least one url and all three lists have the same length.
        """
        htmlSelector = self.__etreeSelector(response)
        anchor = "//*[@id='singer-cat-nav']/div[@class='blk']//li/a[@class='cat-flag']"
        names = htmlSelector.xpath(anchor + "/text()")
        ids = htmlSelector.xpath(anchor + "/@data-cat")
        urls = htmlSelector.xpath(anchor + "/@href")
        if urls and len(names) == len(ids) == len(urls):
            self.sCategoryNameList = names
            self.sCategoryIdList = ids
            self.sCategoryUrlList = [parse.urljoin(self.stratUrl, url) for url in urls]

    def __getSingerListPage(self):
        """Yield the response of every (category, initial) singer list page.

        Raises:
            MyError: 10000 if no categories were collected, 10001 if the
                name and url lists have different lengths.
        """
        if not self.sCategoryNameList and not self.sCategoryUrlList:
            raise MyError(10000, "{},{} 数据不能为空".format(self.sCategoryNameList, self.sCategoryUrlList))
        if len(self.sCategoryNameList) != len(self.sCategoryUrlList):
            raise MyError(10001, "nameList,urlList数据不能一一对应")
        for sCategoryUrl in self.sCategoryUrlList:
            if self.__isFirstStatus:
                # The bare category page is requested only while the initial
                # ids are still unknown; it is not used for anything else.
                self.__getInitialId(self.__getPageHtml(sCategoryUrl))
                # Try again on the next category if nothing could be parsed.
                self.__isFirstStatus = not self.initialIdList
            for initialId in self.initialIdList:
                if initialId == "-1":
                    # -1 is the "hot" tab; its singers show up again under
                    # the other initials, so it is skipped to avoid duplicates.
                    continue
                yield self.__getPageHtml(sCategoryUrl + "&initial=" + initialId)

    def __getSingerIdUrl(self, response):
        """Yield a single list of ``(singerUrl, singerName)`` tuples for one page.

        The generator always produces exactly one item; the list is empty
        when the page links to no singers. Links that lack an ``href`` or a
        ``title`` attribute are skipped.
        """
        htmlSelector = self.__etreeSelector(response)
        anchors = htmlSelector.xpath(
            "//*[@id='m-artist-box']//a[@class='msk'] | //*[@id='m-artist-box']/li[@class='sml']/a[1]")
        singers = []
        for anchor in anchors:
            hrefs = anchor.xpath("@href")
            titles = anchor.xpath("@title")
            if not hrefs or not titles:
                continue
            # The title attribute carries the suffix "的音乐"; strip it to
            # keep only the singer's name.
            singers.append((parse.urljoin(self.stratUrl, hrefs[0]),
                            titles[0].replace("的音乐", "")))
        yield singers

    def __getInitialId(self, response):
        """Read the initial ids and their labels from the initial selector.

        Results are stored in ``self.initialIdList`` / ``self.markList`` only
        when both lists have the same length.
        """
        htmlSelector = self.__etreeSelector(response)
        urlList = htmlSelector.xpath("//*[@id='initial-selector']/li/a/@href")
        initialIdList = [self.__reInitialId(url) for url in urlList]
        markList = htmlSelector.xpath("//*[@id='initial-selector']/li/a/text()")
        if len(initialIdList) == len(markList):
            self.initialIdList = initialIdList
            self.markList = markList

    def __reInitialId(self, url):
        """Return everything that follows ``initial=`` in ``url``.

        Example url: ``/discover/artist/cat?id=1001&initial=-1``

        Raises:
            MyError: 10006 if ``url`` has no ``initial=`` parameter.
        """
        match = re.search(r"initial=(.*)", url, re.S)
        if match is None:
            raise MyError(10006, "initialId获取失败")
        return match.group(1)

    def __getSingerDetails(self, response):
        """Return ``(songList, description)`` parsed from a singer detail page.

        Returns ``(None, None)`` when either piece cannot be extracted.
        """
        htmlSelector = self.__etreeSelector(response)
        try:
            data_json = htmlSelector.xpath("//*[@id='song-list-pre-data']/text()")[0]
            data_list = json.loads(data_json, strict=False)
            singerDetails_json = htmlSelector.xpath("//script[@type='application/ld+json']/text()")[0]
            singerDetails_dict = json.loads(singerDetails_json, strict=False)
            singerDetails_content = singerDetails_dict.get("description")
            return data_list, singerDetails_content
        except Exception as e:
            # Deliberate best effort: some artists have no songs, so the
            # [0] lookups above fail. The error is printed and the singer
            # is skipped by the caller.
            print(e)
            return None, None

    @staticmethod
    def _safeFileName(name):
        """Replace characters that are not allowed in file names with ``_``.

        Covers the characters Windows rejects (``\\ / : * ? " < > |`` and
        control characters); ``/`` would also be read as a directory
        separator on other systems. Returns ``_`` for an empty result.
        """
        cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name).strip()
        return cleaned or "_"

    def __writeToFile(self, datalist, singerDetails_content, singerName):
        """Write the singer's description and song list to ``<singerName>.txt``.

        Raises:
            MyError: 10005 if ``Singer_BASE_PATH`` is not configured.
        """
        # Looked up through globals() so that a missing setting raises a
        # MyError rather than a NameError when this module is imported
        # instead of being run as a script.
        basePath = globals().get("Singer_BASE_PATH")
        if not basePath:
            raise MyError(10005, "需要在全局中配置该参数Singer_BASE_PATH，用于文件存放地址")
        os.makedirs(basePath, exist_ok=True)
        path = os.path.join(basePath, self._safeFileName(singerName))
        print("正在写入{}".format(singerName))
        with open(path + ".txt", 'w', encoding="utf-8") as f:
            f.write("歌手简介：{}".format(singerDetails_content) + "\n")
            for data in datalist or []:
                # Either sub-dict may be missing; fall back to an empty one.
                privilege = data.get("privilege") or {}
                album = data.get("album") or {}
                f.write("-" * 50 + "\n")
                f.write("歌曲名：{}".format(data.get("name")) + "\n")
                f.write("歌曲ID：{}".format(privilege.get("id")) + "\n")
                f.write("歌曲专辑：{}".format(album.get("name")) + "\n")
                f.write("歌曲别号：{}".format(data.get("alias") or "无") + "\n")

    def __etreeSelector(self, response):
        """Parse the response body with ``etree.HTML`` for XPath queries."""
        return etree.HTML(response.text)

    def collectSinger(self):
        """Crawl every singer and write one text file per singer."""
        response = self.__getPageHtml(url=self.stratUrl)
        self.__getSingerCategory(response)
        for listPage in self.__getSingerListPage():
            time.sleep(1)  # pause between list pages
            # __getSingerIdUrl is a generator producing one list of
            # (singerUrl, singerName) tuples for the page.
            for singers in self.__getSingerIdUrl(listPage):
                for singerUrl, singerName in singers:
                    singerResponse = self.__getPageHtml(singerUrl)
                    datalist, singerDetails_content = self.__getSingerDetails(singerResponse)
                    if not datalist and not singerDetails_content:
                        continue
                    self.__writeToFile(datalist, singerDetails_content, singerName)

if __name__ == '__main__':
    # Output directories; the crawler classes read these module-level names.
    Rink_BASE_PATH = r"D:\spidersData\Rinking"
    Singer_BASE_PATH = r"D:\spidersData\SingerInfo"

    # Step 1: download the NetEase Cloud Music rankings.
    rankingCrawler = WyRinking()
    rankingCrawler.collectRanking()

    # Step 2: download all NetEase Cloud Music singers and their songs.
    singerCrawler = WySinger()
    singerCrawler.collectSinger()

python3爬虫-网易云排行榜，网易云歌手及作品的更多相关文章

python3爬虫应用--爬取网易云音乐（两种办法）
一.需求好久没有碰爬虫了,竟不知道从何入手.偶然看到一篇知乎的评论(https://www.zhihu.com/question/20799742/answer/99491808),一时兴起就也照葫 ...
Python爬虫——request实例：爬取网易云音乐华语男歌手top10歌曲
requests是python的一个HTTP客户端库,跟urllib,urllib2类似,但比那两个要简洁的多,至于request库的用法, 推荐一篇不错的博文:https://cuiqingcai. ...
爬虫综合大作业——网易云音乐爬虫 & 数据可视化分析
作业要求来自于https://edu.cnblogs.com/campus/gzcc/GZCC-16SE2/homework/3075 爬虫综合大作业选择一个热点或者你感兴趣的主题. 选择爬取的对象 ...
在Ubuntu18.04.2LTS上使用wine安装qq，微信，迅雷，百度网盘，网易云音乐等软件
在Ubuntu18.04.2LTS上使用wine安装qq,微信,迅雷,百度网盘,网易云音乐等软件一.前言在Linux上办公有一点一直是大家的痛,那就是这些系统上没有我们常用的一些软件,比如QQ,微 ...
Python实现简单的爬虫获取某刀网的更新数据
昨天晚上无聊时,想着练习一下Python所以写了一个小爬虫获取小刀娱乐网里的更新数据 #!/usr/bin/python # coding: utf-8 import urllib.request i ...
Python爬虫爬取全书网小说，程序源码+程序详细分析
Python爬虫爬取全书网小说教程第一步:打开谷歌浏览器,搜索全书网,然后再点击你想下载的小说,进入图一页面后点击F12选择Network,如果没有内容按F5刷新一下点击Network之后出现如下 ...
python3爬虫系列19之反爬随机 User-Agent 和 ip代理池的使用
站长资讯平台:python3爬虫系列19之随机User-Agent 和ip代理池的使用我们前面几篇讲了爬虫增速多进程,进程池的用法之类的,爬虫速度加快呢,也会带来一些坏事. 1. 前言比如随着我们爬虫 ...
[原]openstack-kilo--issue(七)：虚拟机怎么通外网，外网怎么ping通虚拟机
=====问题======= 虚拟机可以ping通外网,外网能ping通虚拟机但是收不到reply 这个问题本人遇到有两种情况: 1.安装完整openstack-kilo后,在route中和虚拟机中抓 ...
C#获取内网和外网IP
写了个小客户端,里面用到了获取内网和外网的IP地址,代码如下: // InnerIP var ipHost = Dns.Resolve(Dns.GetHostName()); ]; innerIP = ...

随机推荐

前端面试经典题目合集（HTML+CSS）一
1.说说你对HTML语义化的理解? (1)什么是HTML语义化? 根据内容的结构化(内容语义化),选择合适的标签(代码语义化)便于开发者阅读和写出更优雅的代码的同时让浏览器的爬虫和机器很好地解析. ( ...
windows server 2008远程桌面最大连接数设置
1. 运行gpedit.msc: 2. 选择计算机配置-->管理模板-->Windows组件-->远程桌面服务-->远程桌面会话主机-->连接: 3. 双击“限制连接的数 ...
基础架构之Docker私有库
由于项目要容器化,所以搭建自己的镜像库也是很有必要的,不然发到直接使用官方的镜像库,速度绝对能让你头疼,这篇文章就介绍搭建自己的镜像私有库. (一) 环境要求 Centos 7.5.1804 Doc ...
icon-font 字体图标的引用
1.font-class引用 font-class是unicode使用方式的一种变种,主要是解决unicode书写不直观,语意不明确的问题. 与unicode使用方式相比,具有如下特点: 兼容性良好, ...
微信小程序scroll-view隐藏滚动条方法
在wxss里加入以下代码: ::-webkit-scrollbar{ width: 0; height: 0; color: transparent; } 源链接:https://blog.csd ...
如何为Android平台编译 opencv3 和 opencv_contrib (Linux)
编译出来的opencv库有问题,正在调试中 ...... 本文以编译opencv 3.3.0 和 opencv_contrib 3.3.0为例,系统为 Linux x64 (Fedora 21),具体 ...
Math类中常用方法
public static int abs(int a) , public static long abs(long a), public static float abs(float a), pu ...
MongoDB学习笔记——概述
概述 MongoDB是一个跨平台,面向文档的数据库.MongoDB创建的数据库可以实现高可用性,高性能,并且能够轻松拓展. MongoDB的运行方式主要基于两个概念:集合(collection)和 ...
【NLP_Stanford课堂】语言模型2
一.如何评价语言模型的好坏标准:比起语法不通的.不太可能出现的句子,是否为“真实”或“比较可能出现的”句子分配更高的概率过程:先在训练数据集上训练模型的参数,然后在测试数据集上测试模型的 ...
Qt::FocusPolicy的使用
http://blog.csdn.net/imxiangzi/article/details/50742813

python3爬虫-网易云排行榜，网易云歌手及作品

python3爬虫-网易云排行榜，网易云歌手及作品的更多相关文章

随机推荐

热门专题