Notes still to be written. The crawl covered 8,000+ singers; at an average of roughly 30 songs per singer, that works out to over 200,000 songs.

run.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Zqf'
from kugoumusic.spiders.kugou import KugouSpiders
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings  # loads the project's settings.py

settings = get_project_settings()
process = CrawlerProcess(settings=settings)  # more than one spider can be queued on the same process
process.crawl(KugouSpiders)
process.start()  # starts the crawl and blocks until it finishes
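The comment above notes that several spiders can share one CrawlerProcess. A minimal sketch of what that looks like; the second spider, AnotherSpider, is hypothetical:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from kugoumusic.spiders.kugou import KugouSpiders
# from kugoumusic.spiders.another import AnotherSpider  # hypothetical second spider

process = CrawlerProcess(settings=get_project_settings())
process.crawl(KugouSpiders)
# process.crawl(AnotherSpider)  # each crawl() call queues one more spider
process.start()  # runs all queued spiders and blocks until they finish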

kugou.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Zqf'
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

from kugoumusic.items import KugoumusicItem


class KugouSpiders(scrapy.spiders.CrawlSpider):
    name = 'kugou'
    start_urls = ['http://www.kugou.com/']

    rules = (
        # follow the singer index page and the per-letter singer list pages
        Rule(LinkExtractor(allow=[r'http://www.kugou.com/yy/html/singer.html',
                                  r'http://www.kugou.com/yy/singer/index/\d-([a-z]|null)-1.html'])),
        # singer detail pages: extract the singer name and song list
        Rule(LinkExtractor(allow=[r'http://www.kugou.com/yy/singer/home/\d+.html']),
             callback='parse_item'),
    )

    def parse_item(self, response):
        singer = response.xpath('//div/div[@class="clear_fix"]/strong/text()').extract_first()
        print(singer)
        songs = response.xpath('//ul[@id="song_container"]/li//span[@class="text"]/i/text()').extract()
        print(songs)

        item = KugoumusicItem()
        item['singer'] = singer
        item['songs'] = songs
        yield item
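The two XPath expressions are the fragile part of the spider, so it is worth checking them interactively before a long crawl. A quick sketch using scrapy shell; the numeric singer ID in the URL is only a placeholder:

$ scrapy shell "http://www.kugou.com/yy/singer/home/3.html"
>>> response.xpath('//div/div[@class="clear_fix"]/strong/text()').extract_first()
>>> response.xpath('//ul[@id="song_container"]/li//span[@class="text"]/i/text()').extract()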

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class KugoumusicItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    singer = scrapy.Field()
    songs = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient


class KugoumusicPipeline(object):
    def open_spider(self, spider):
        # mongo_config = spider.settings['MONGO_CONFIG']
        self.client = MongoClient(host='127.0.0.1', port=27017)
        self.coll = self.client['student_db']['kugou']
        self.li = []  # buffer items and insert them in batches

    def close_spider(self, spider):
        # flush whatever is still buffered, then close the connection
        self.insert()
        self.client.close()

    def insert(self):
        if self.li:  # insert_many raises InvalidOperation on an empty list
            self.coll.insert_many(self.li)

    def process_item(self, item, spider):
        self.li.append(dict(item))
        if len(self.li) >= 100:
            self.insert()
            self.li = []
            print("Inserted a batch of 100 documents -------------------------------------")
        return item
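After a run, the stored data can be sanity-checked directly with pymongo. A minimal sketch, using the same database and collection names as the pipeline above:

from pymongo import MongoClient

client = MongoClient(host='127.0.0.1', port=27017)
coll = client['student_db']['kugou']

print(coll.count_documents({}))              # one document per singer
doc = coll.find_one()
if doc:
    print(doc['singer'], len(doc['songs']))  # a singer and their song count
client.close()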

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for kugoumusic project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'kugoumusic'

SPIDER_MODULES = ['kugoumusic.spiders']
NEWSPIDER_MODULE = 'kugoumusic.spiders'

# MONGO_CONFIG = ['192.168.62.35:1806',
#                 '192.168.62.240:1806',
#                 '192.168.62.23:1806',
#                 '192.168.62.32:1806',
#                 '192.168.62.25:1806',
#                 '192.168.62.28:1806',
#                 '192.168.62.241:1806']

# MONGO_CONFIG = {
#     'host': '127.0.0.1',
#     'port': 27017,
#     'user': 'root',
#     'password': '123456',
#     'db': 's1806',
#     'charset': 'utf8'
# }

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'kugoumusic (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'kugoumusic.middlewares.KugoumusicSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'kugoumusic.middlewares.KugoumusicDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'kugoumusic.pipelines.KugoumusicPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
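The commented-out MONGO_CONFIG blocks suggest moving the connection details out of the pipeline and into settings. A sketch of how open_spider could read them, assuming the dict form above (with 'host' and 'port' keys) is uncommented:

# pipelines.py, reading MONGO_CONFIG from settings instead of hard-coding
# the address; the key names follow the commented dict in settings.py.
from pymongo import MongoClient

class KugoumusicPipeline(object):
    def open_spider(self, spider):
        cfg = spider.settings.get('MONGO_CONFIG', {})
        self.client = MongoClient(host=cfg.get('host', '127.0.0.1'),
                                  port=cfg.get('port', 27017))
        self.coll = self.client['student_db']['kugou']
        self.li = []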
