A quick note, since I haven't had time to write this up properly yet: the crawl covers 8,000+ singers in total. At an average of about thirty songs per singer, that works out to roughly 240,000 songs (8,000 × 30).

run.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Zqf'

from kugoumusic.spiders.kugou import KugouSpiders
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings  # loads this project's settings.py

settings = get_project_settings()
process = CrawlerProcess(settings=settings)  # a CrawlerProcess can run multiple spiders
process.crawl(KugouSpiders)
process.start()  # starts crawling; blocks until the crawl finishes
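Because the spider registers itself under name = 'kugou', the same crawl can also be launched without run.py by executing scrapy crawl kugou from the project root; run.py just makes it convenient to start from an IDE.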

kugou.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Zqf'

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from kugoumusic.items import KugoumusicItem


class KugouSpiders(CrawlSpider):
    name = 'kugou'
    start_urls = ['http://www.kugou.com/']

    rules = (
        # follow the singer index pages (the per-letter / "null" indexes)
        Rule(LinkExtractor(allow=[r'http://www.kugou.com/yy/html/singer.html',
                                  r'http://www.kugou.com/yy/singer/index/\d-([a-z]|null)-1.html'])),
        # singer detail pages are handed to parse_item
        Rule(LinkExtractor(allow=[r'http://www.kugou.com/yy/singer/home/\d+.html']),
             callback='parse_item')
    )

    def parse_item(self, response):
        singer = response.xpath('//div/div[@class="clear_fix"]/strong/text()').extract_first()
        print(singer)
        songs = response.xpath('//ul[@id="song_container"]/li//span[@class="text"]/i/text()').extract()
        print(songs)

        item = KugoumusicItem()
        item['singer'] = singer
        item['songs'] = songs
        yield item
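The two XPath expressions are worth sanity-checking interactively before trusting the crawl to them. A quick check in the Scrapy shell might look like the sketch below; the singer id in the URL is made up for illustration.

# Launch with:  scrapy shell "http://www.kugou.com/yy/singer/home/1234.html"
# then, at the shell prompt:
response.xpath('//div/div[@class="clear_fix"]/strong/text()').extract_first()   # singer name
response.xpath('//ul[@id="song_container"]/li//span[@class="text"]/i/text()').extract()  # song titles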

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class KugoumusicItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    singer = scrapy.Field()
    songs = scrapy.Field()
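One document per singer is the intended shape here: the singer name in one field, every song title collected into a single list field. A scrapy.Item otherwise behaves like a dict, which is what the pipeline later relies on via dict(item). A minimal illustration with made-up values:

from kugoumusic.items import KugoumusicItem

item = KugoumusicItem(singer='Some Singer', songs=['Song A', 'Song B'])
print(dict(item))  # {'singer': 'Some Singer', 'songs': ['Song A', 'Song B']}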

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from pymongo import MongoClient


class KugoumusicPipeline(object):

    def open_spider(self, spider):
        # mongo_config = spider.settings['MONGO_CONFIG']
        self.client = MongoClient(host='127.0.0.1', port=27017)
        self.coll = self.client['student_db']['kugou']
        self.li = []  # buffer; flushed to MongoDB in batches

    def close_spider(self, spider):
        if self.li:  # flush whatever is left in the buffer
            self.insert()
        self.client.close()

    def insert(self):
        self.coll.insert_many(self.li)

    def process_item(self, item, spider):
        # append first, then flush once 100 items have accumulated,
        # so no item is dropped on the batch boundary
        self.li.append(dict(item))
        if len(self.li) >= 100:
            self.insert()
            self.li = []
            print("Successfully inserted a batch of 100 documents -----------------")
        return item
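After a run, a short pymongo session verifies what actually landed in the collection; a minimal sketch, assuming a local MongoDB and pymongo 3.7+ (for count_documents):

from pymongo import MongoClient

client = MongoClient(host='127.0.0.1', port=27017)
coll = client['student_db']['kugou']
print(coll.count_documents({}))        # number of singer documents stored
print(coll.find_one({}, {'_id': 0}))   # peek at one {singer, songs} document
client.close()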

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for kugoumusic project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'kugoumusic'

SPIDER_MODULES = ['kugoumusic.spiders']
NEWSPIDER_MODULE = 'kugoumusic.spiders'

# MONGO_CONFIG = ['192.168.62.35:1806, '
#                 '192.168.62.240:1806, '
#                 '192.168.62.23:1806, '
#                 '192.168.62.32:1806, '
#                 '192.168.62.25:1806, '
#                 '192.168.62.28:1806, '
#                 '192.168.62.241:1806']

# MONGO_CONFIG = {
#     'host': '127.0.0.1',
#     'port': 27017,
#     'user': 'root',
#     'password': '123456',
#     'db': 's1806',
#     'charset': 'utf8'
# }

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'kugoumusic (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'kugoumusic.middlewares.KugoumusicSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'kugoumusic.middlewares.KugoumusicDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'kugoumusic.pipelines.KugoumusicPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
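The commented-out MONGO_CONFIG blocks tie back to the commented spider.settings['MONGO_CONFIG'] line in the pipeline. If the dict form were enabled, open_spider could read the connection details from settings instead of hard-coding them; a minimal sketch under that assumption:

from pymongo import MongoClient

class KugoumusicPipeline(object):
    def open_spider(self, spider):
        cfg = spider.settings.get('MONGO_CONFIG', {})  # falls back to local defaults if unset
        self.client = MongoClient(host=cfg.get('host', '127.0.0.1'),
                                  port=cfg.get('port', 27017))
        self.coll = self.client['student_db']['kugou']
        self.li = []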
